How to use the strategy_import_error method in pandera

Best Python code snippets using pandera
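`strategy_import_error` is a decorator defined in pandera's `strategies` module (imported as `st` in the source files below). Pandera's data-synthesis features depend on the optional `hypothesis` package, so the decorator wraps each strategy-related method and raises an informative `ImportError` at call time if `hypothesis` is not installed. Below is a minimal sketch of that guard pattern; the flag name `HAS_HYPOTHESIS` and the message text are approximations of pandera's actual internals:

```python
# Minimal sketch of the import-guard pattern behind strategy_import_error.
# HAS_HYPOTHESIS and the error message approximate pandera's internals.
from functools import wraps

try:
    import hypothesis  # noqa: F401

    HAS_HYPOTHESIS = True
except ImportError:
    HAS_HYPOTHESIS = False


def strategy_import_error(fn):
    """Raise an informative error if ``hypothesis`` is not installed."""

    @wraps(fn)
    def _wrapper(*args, **kwargs):
        if not HAS_HYPOTHESIS:
            raise ImportError(
                "pandera data synthesis strategies require hypothesis. "
                "Install the extra with: pip install pandera[strategies]"
            )
        return fn(*args, **kwargs)

    return _wrapper
```

The two source files below show where pandera applies this decorator: the `strategy` and `strategy_component` methods of the `Column`, `Index`, and `MultiIndex` schema components, and the `strategy` and `example` classmethods of the class-based `SchemaModel` API.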

schema_components.py

Source: schema_components.py (GitHub)


1"""Components used in pandera schemas."""2import warnings3from copy import copy, deepcopy4from typing import Any, Dict, List, Optional, Tuple, Union5import numpy as np6import pandas as pd7from . import check_utils, errors8from . import strategies as st9from .deprecations import deprecate_pandas_dtype10from .error_handlers import SchemaErrorHandler11from .schemas import (12 CheckList,13 DataFrameSchema,14 PandasDtypeInputTypes,15 SeriesSchemaBase,16)17def _is_valid_multiindex_tuple_str(x: Tuple[Any, ...]) -> bool:18 """Check that a multi-index tuple key has all string elements"""19 return isinstance(x, tuple) and all(isinstance(i, str) for i in x)20class Column(SeriesSchemaBase):21 """Validate types and properties of DataFrame columns."""22 @deprecate_pandas_dtype23 def __init__(24 self,25 dtype: PandasDtypeInputTypes = None,26 checks: CheckList = None,27 nullable: bool = False,28 unique: bool = False,29 allow_duplicates: Optional[bool] = None,30 coerce: bool = False,31 required: bool = True,32 name: Union[str, Tuple[str, ...], None] = None,33 regex: bool = False,34 pandas_dtype: PandasDtypeInputTypes = None,35 title: Optional[str] = None,36 description: Optional[str] = None,37 ) -> None:38 """Create column validator object.39 :param dtype: datatype of the column. A ``PandasDtype`` for40 type-checking dataframe. If a string is specified, then assumes41 one of the valid pandas string values:42 http://pandas.pydata.org/pandas-docs/stable/basics.html#dtypes43 :param checks: checks to verify validity of the column44 :param nullable: Whether or not column can contain null values.45 :param unique: whether column values should be unique46 :param allow_duplicates: Whether or not column can contain duplicate47 values.48 .. warning::49 This option will be deprecated in 0.8.0. Use the ``unique``50 argument instead.51 :param coerce: If True, when schema.validate is called the column will52 be coerced into the specified dtype. This has no effect on columns53 where ``pandas_dtype=None``.54 :param required: Whether or not column is allowed to be missing55 :param name: column name in dataframe to validate.56 :param regex: whether the ``name`` attribute should be treated as a57 regex pattern to apply to multiple columns in a dataframe.58 :param pandas_dtype: alias of ``dtype`` for backwards compatibility.59 .. warning:: This option will be deprecated in 0.8.060 :param title: A human-readable label for the column.61 :param description: An arbitrary textual description of the column.62 :raises SchemaInitError: if impossible to build schema from parameters63 :example:64 >>> import pandas as pd65 >>> import pandera as pa66 >>>67 >>>68 >>> schema = pa.DataFrameSchema({69 ... "column": pa.Column(str)70 ... 
})71 >>>72 >>> schema.validate(pd.DataFrame({"column": ["foo", "bar"]}))73 column74 0 foo75 1 bar76 See :ref:`here<column>` for more usage details.77 """78 super().__init__(79 dtype,80 checks,81 nullable,82 unique,83 allow_duplicates,84 coerce,85 name,86 pandas_dtype,87 title,88 description,89 )90 if (91 name is not None92 and not isinstance(name, str)93 and not _is_valid_multiindex_tuple_str(name)94 and regex95 ):96 raise ValueError(97 "You cannot specify a non-string name when setting regex=True"98 )99 self.required = required100 self._name = name101 self._regex = regex102 @property103 def regex(self) -> bool:104 """True if ``name`` attribute should be treated as a regex pattern."""105 return self._regex106 @property107 def _allow_groupby(self) -> bool:108 """Whether the schema or schema component allows groupby operations."""109 return True110 @property111 def properties(self) -> Dict[str, Any]:112 """Get column properties."""113 return {114 "dtype": self.dtype,115 "checks": self._checks,116 "nullable": self._nullable,117 "unique": self._unique,118 "coerce": self._coerce,119 "required": self.required,120 "name": self._name,121 "regex": self._regex,122 "title": self.title,123 "description": self.description,124 }125 def set_name(self, name: str):126 """Used to set or modify the name of a column object.127 :param str name: the name of the column object128 """129 self._name = name130 return self131 def coerce_dtype(self, obj: Union[pd.DataFrame, pd.Series, pd.Index]):132 """Coerce dtype of a column, handling duplicate column names."""133 # pylint: disable=super-with-arguments134 if check_utils.is_field(obj) or check_utils.is_index(obj):135 return super(Column, self).coerce_dtype(obj)136 return obj.apply(137 lambda x: super(Column, self).coerce_dtype(x), axis="columns"138 )139 def validate(140 self,141 check_obj: pd.DataFrame,142 head: Optional[int] = None,143 tail: Optional[int] = None,144 sample: Optional[int] = None,145 random_state: Optional[int] = None,146 lazy: bool = False,147 inplace: bool = False,148 ) -> pd.DataFrame:149 """Validate a Column in a DataFrame object.150 :param check_obj: pandas DataFrame to validate.151 :param head: validate the first n rows. Rows overlapping with `tail` or152 `sample` are de-duplicated.153 :param tail: validate the last n rows. Rows overlapping with `head` or154 `sample` are de-duplicated.155 :param sample: validate a random sample of n rows. Rows overlapping156 with `head` or `tail` are de-duplicated.157 :param random_state: random seed for the ``sample`` argument.158 :param lazy: if True, lazily evaluates dataframe against all validation159 checks and raises a ``SchemaErrors``. Otherwise, raise160 ``SchemaError`` as soon as one occurs.161 :param inplace: if True, applies coercion to the object of validation,162 otherwise creates a copy of the data.163 :returns: validated DataFrame.164 """165 if not inplace:166 check_obj = check_obj.copy()167 if self._name is None:168 raise errors.SchemaError(169 self,170 check_obj,171 "column name is set to None. 
Pass the ``name` argument when "172 "initializing a Column object, or use the ``set_name`` "173 "method.",174 )175 def validate_column(check_obj, column_name):176 super(Column, copy(self).set_name(column_name)).validate(177 check_obj,178 head,179 tail,180 sample,181 random_state,182 lazy,183 inplace=inplace,184 )185 column_keys_to_check = (186 self.get_regex_columns(check_obj.columns)187 if self._regex188 else [self._name]189 )190 for column_name in column_keys_to_check:191 if self.coerce:192 check_obj[column_name] = self.coerce_dtype(193 check_obj[column_name]194 )195 if check_utils.is_table(check_obj[column_name]):196 for i in range(check_obj[column_name].shape[1]):197 validate_column(198 check_obj[column_name].iloc[:, [i]], column_name199 )200 else:201 validate_column(check_obj, column_name)202 return check_obj203 def get_regex_columns(204 self, columns: Union[pd.Index, pd.MultiIndex]205 ) -> Union[pd.Index, pd.MultiIndex]:206 """Get matching column names based on regex column name pattern.207 :param columns: columns to regex pattern match208 :returns: matchin columns209 """210 if isinstance(self.name, tuple):211 # handle MultiIndex case212 if len(self.name) != columns.nlevels:213 raise IndexError(214 f"Column regex name='{self.name}' is a tuple, expected a "215 f"MultiIndex columns with {len(self.name)} number of "216 f"levels, found {columns.nlevels} level(s)"217 )218 matches = np.ones(len(columns)).astype(bool)219 for i, name in enumerate(self.name):220 matched = pd.Index(221 columns.get_level_values(i).astype(str).str.match(name)222 ).fillna(False)223 matches = matches & np.array(matched.tolist())224 column_keys_to_check = columns[matches]225 else:226 if check_utils.is_multiindex(columns):227 raise IndexError(228 f"Column regex name {self.name} is a string, expected a "229 "dataframe where the index is a pd.Index object, not a "230 "pd.MultiIndex object"231 )232 column_keys_to_check = columns[233 # str.match will return nan values when the index value is234 # not a string.235 pd.Index(columns.astype(str).str.match(self.name))236 .fillna(False)237 .tolist()238 ]239 if column_keys_to_check.shape[0] == 0:240 raise errors.SchemaError(241 self,242 columns,243 f"Column regex name='{self.name}' did not match any columns "244 "in the dataframe. 
Update the regex pattern so that it "245 f"matches at least one column:\n{columns.tolist()}",246 )247 # drop duplicates to account for potential duplicated columns in the248 # dataframe.249 return column_keys_to_check.drop_duplicates()250 @st.strategy_import_error251 def strategy(self, *, size=None):252 """Create a ``hypothesis`` strategy for generating a Column.253 :param size: number of elements to generate254 :returns: a dataframe strategy for a single column.255 """256 return super().strategy(size=size).map(lambda x: x.to_frame())257 @st.strategy_import_error258 def strategy_component(self):259 """Generate column data object for use by DataFrame strategy."""260 return st.column_strategy(261 self.dtype,262 checks=self.checks,263 unique=self.unique,264 name=self.name,265 )266 def example(self, size=None) -> pd.DataFrame:267 """Generate an example of a particular size.268 :param size: number of elements in the generated Index.269 :returns: pandas DataFrame object.270 """271 # pylint: disable=import-outside-toplevel,cyclic-import,import-error272 import hypothesis273 with warnings.catch_warnings():274 warnings.simplefilter(275 "ignore",276 category=hypothesis.errors.NonInteractiveExampleWarning,277 )278 return (279 super()280 .strategy(size=size)281 .example()282 .rename(self.name)283 .to_frame()284 )285 def __eq__(self, other):286 if not isinstance(other, self.__class__):287 return NotImplemented288 def _compare_dict(obj):289 return {290 k: v if k != "_checks" else set(v)291 for k, v in obj.__dict__.items()292 }293 return _compare_dict(self) == _compare_dict(other)294class Index(SeriesSchemaBase):295 """Validate types and properties of a DataFrame Index."""296 @property297 def names(self):298 """Get index names in the Index schema component."""299 return [self.name]300 @property301 def _allow_groupby(self) -> bool:302 """Whether the schema or schema component allows groupby operations."""303 return False304 def validate(305 self,306 check_obj: Union[pd.DataFrame, pd.Series],307 head: Optional[int] = None,308 tail: Optional[int] = None,309 sample: Optional[int] = None,310 random_state: Optional[int] = None,311 lazy: bool = False,312 inplace: bool = False,313 ) -> Union[pd.DataFrame, pd.Series]:314 """Validate DataFrameSchema or SeriesSchema Index.315 :check_obj: pandas DataFrame of Series containing index to validate.316 :param head: validate the first n rows. Rows overlapping with `tail` or317 `sample` are de-duplicated.318 :param tail: validate the last n rows. Rows overlapping with `head` or319 `sample` are de-duplicated.320 :param sample: validate a random sample of n rows. Rows overlapping321 with `head` or `tail` are de-duplicated.322 :param random_state: random seed for the ``sample`` argument.323 :param lazy: if True, lazily evaluates dataframe against all validation324 checks and raises a ``SchemaErrors``. 
Otherwise, raise325 ``SchemaError`` as soon as one occurs.326 :param inplace: if True, applies coercion to the object of validation,327 otherwise creates a copy of the data.328 :returns: validated DataFrame or Series.329 """330 if check_utils.is_multiindex(check_obj.index):331 raise errors.SchemaError(332 self, check_obj, "Attempting to validate mismatch index"333 )334 series_cls = pd.Series335 # NOTE: this is a hack to get koalas working, this needs a more336 # principled implementation337 if type(check_obj).__module__ == "databricks.koalas.frame":338 # pylint: disable=import-outside-toplevel339 import databricks.koalas as ks340 series_cls = ks.Series341 if self.coerce:342 check_obj.index = self.coerce_dtype(check_obj.index)343 # handles case where pandas native string type is not supported344 # by index.345 obj_to_validate = self.dtype.coerce(346 series_cls(347 check_obj.index.to_numpy(), name=check_obj.index.name348 )349 )350 else:351 obj_to_validate = series_cls(352 check_obj.index.to_numpy(), name=check_obj.index.name353 )354 assert check_utils.is_field(355 super().validate(356 obj_to_validate,357 head,358 tail,359 sample,360 random_state,361 lazy,362 inplace,363 ),364 )365 return check_obj366 @st.strategy_import_error367 def strategy(self, *, size: int = None):368 """Create a ``hypothesis`` strategy for generating an Index.369 :param size: number of elements to generate.370 :returns: index strategy.371 """372 return st.index_strategy(373 self.dtype, # type: ignore374 checks=self.checks,375 nullable=self.nullable,376 unique=self.unique,377 name=self.name,378 size=size,379 )380 @st.strategy_import_error381 def strategy_component(self):382 """Generate column data object for use by MultiIndex strategy."""383 return st.column_strategy(384 self.dtype,385 checks=self.checks,386 unique=self.unique,387 name=self.name,388 )389 def example(self, size: int = None) -> pd.Index:390 """Generate an example of a particular size.391 :param size: number of elements in the generated Index.392 :returns: pandas Index object.393 """394 # pylint: disable=import-outside-toplevel,cyclic-import,import-error395 import hypothesis396 with warnings.catch_warnings():397 warnings.simplefilter(398 "ignore",399 category=hypothesis.errors.NonInteractiveExampleWarning,400 )401 return self.strategy(size=size).example()402 def __eq__(self, other):403 return self.__dict__ == other.__dict__404class MultiIndex(DataFrameSchema):405 """Validate types and properties of a DataFrame MultiIndex.406 This class inherits from :class:`~pandera.schemas.DataFrameSchema` to407 leverage its validation logic.408 """409 def __init__(410 self,411 indexes: List[Index],412 coerce: bool = False,413 strict: bool = False,414 name: str = None,415 ordered: bool = True,416 unique: Optional[Union[str, List[str]]] = None,417 ) -> None:418 """Create MultiIndex validator.419 :param indexes: list of Index validators for each level of the420 MultiIndex index.421 :param coerce: Whether or not to coerce the MultiIndex to the422 specified dtypes before validation423 :param strict: whether or not to accept columns in the MultiIndex that424 aren't defined in the ``indexes`` argument.425 :param name: name of schema component426 :param ordered: whether or not to validate the indexes order.427 :param unique: a list of index names that should be jointly unique.428 :example:429 >>> import pandas as pd430 >>> import pandera as pa431 >>>432 >>>433 >>> schema = pa.DataFrameSchema(434 ... columns={"column": pa.Column(int)},435 ... index=pa.MultiIndex([436 ... 
pa.Index(str,437 ... pa.Check(lambda s: s.isin(["foo", "bar"])),438 ... name="index0"),439 ... pa.Index(int, name="index1"),440 ... ])441 ... )442 >>>443 >>> df = pd.DataFrame(444 ... data={"column": [1, 2, 3]},445 ... index=pd.MultiIndex.from_arrays(446 ... [["foo", "bar", "foo"], [0, 1, 2]],447 ... names=["index0", "index1"],448 ... )449 ... )450 >>>451 >>> schema.validate(df)452 column453 index0 index1454 foo 0 1455 bar 1 2456 foo 2 3457 See :ref:`here<multiindex>` for more usage details.458 """459 if any(not isinstance(i, Index) for i in indexes):460 raise errors.SchemaInitError(461 f"expected a list of Index objects, found {indexes} "462 f"of type {[type(x) for x in indexes]}"463 )464 self.indexes = indexes465 columns = {}466 for i, index in enumerate(indexes):467 if not ordered and index.name is None:468 # if the MultiIndex is not ordered, there's no way of469 # determining how to get the index level without an explicit470 # index name471 raise errors.SchemaInitError(472 "You must specify index names if MultiIndex schema "473 "component is not ordered."474 )475 columns[i if index.name is None else index.name] = Column(476 dtype=index._dtype,477 checks=index.checks,478 nullable=index._nullable,479 unique=index._unique,480 )481 super().__init__(482 columns=columns,483 coerce=coerce,484 strict=strict,485 name=name,486 ordered=ordered,487 unique=unique,488 )489 @property490 def names(self):491 """Get index names in the MultiIndex schema component."""492 return [index.name for index in self.indexes]493 @property494 def coerce(self):495 """Whether or not to coerce data types."""496 return self._coerce or any(index.coerce for index in self.indexes)497 @coerce.setter498 def coerce(self, value: bool) -> None:499 """Set coerce attribute."""500 self._coerce = value501 def coerce_dtype(self, obj: pd.MultiIndex) -> pd.MultiIndex:502 """Coerce type of a pd.Series by type specified in dtype.503 :param obj: multi-index to coerce.504 :returns: ``MultiIndex`` with coerced data type505 """506 error_handler = SchemaErrorHandler(lazy=True)507 # construct MultiIndex with coerced data types508 coerced_multi_index = {}509 for i, index in enumerate(self.indexes):510 if all(x is None for x in self.names):511 index_levels = [i]512 else:513 index_levels = [514 i for i, name in enumerate(obj.names) if name == index.name515 ]516 for index_level in index_levels:517 index_array = obj.get_level_values(index_level)518 if index.coerce or self._coerce:519 try:520 index_array = index.coerce_dtype(index_array)521 except errors.SchemaError as err:522 error_handler.collect_error(523 "dtype_coercion_error", err524 )525 coerced_multi_index[index_level] = index_array526 if error_handler.collected_errors:527 raise errors.SchemaErrors(error_handler.collected_errors, obj)528 multiindex_cls = pd.MultiIndex529 # NOTE: this is a hack to support koalas530 if type(obj).__module__.startswith("databricks.koalas"):531 # pylint: disable=import-outside-toplevel532 import databricks.koalas as ks533 multiindex_cls = ks.MultiIndex534 return multiindex_cls.from_arrays(535 [536 v.to_numpy()537 for k, v in sorted(538 coerced_multi_index.items(), key=lambda x: x[0]539 )540 ],541 names=obj.names,542 )543 def validate(544 self,545 check_obj: Union[pd.DataFrame, pd.Series],546 head: Optional[int] = None,547 tail: Optional[int] = None,548 sample: Optional[int] = None,549 random_state: Optional[int] = None,550 lazy: bool = False,551 inplace: bool = False,552 ) -> Union[pd.DataFrame, pd.Series]:553 """Validate DataFrame or Series MultiIndex.554 :param 
check_obj: pandas DataFrame of Series to validate.555 :param head: validate the first n rows. Rows overlapping with `tail` or556 `sample` are de-duplicated.557 :param tail: validate the last n rows. Rows overlapping with `head` or558 `sample` are de-duplicated.559 :param sample: validate a random sample of n rows. Rows overlapping560 with `head` or `tail` are de-duplicated.561 :param random_state: random seed for the ``sample`` argument.562 :param lazy: if True, lazily evaluates dataframe against all validation563 checks and raises a ``SchemaErrors``. Otherwise, raise564 ``SchemaError`` as soon as one occurs.565 :param inplace: if True, applies coercion to the object of validation,566 otherwise creates a copy of the data.567 :returns: validated DataFrame or Series.568 """569 # pylint: disable=too-many-locals570 if self.coerce:571 try:572 check_obj.index = self.coerce_dtype(check_obj.index)573 except errors.SchemaErrors as err:574 if lazy:575 raise576 raise err.schema_errors[0]["error"] from err577 # Prevent data type coercion when the validate method is called because578 # it leads to some weird behavior when calling coerce_dtype within the579 # DataFrameSchema.validate call. Need to fix this by having MultiIndex580 # not inherit from DataFrameSchema.581 self_copy = deepcopy(self)582 self_copy.coerce = False583 for index in self_copy.indexes:584 index.coerce = False585 # rename integer-based column names in case of duplicate index names,586 # with at least one named index.587 if (588 not all(x is None for x in check_obj.index.names)589 and len(set(check_obj.index.names)) != check_obj.index.nlevels590 ):591 index_names = []592 for i, name in enumerate(check_obj.index.names):593 name = i if name is None else name594 if name not in index_names:595 index_names.append(name)596 columns = {}597 for name, (_, column) in zip(598 index_names, self_copy.columns.items()599 ):600 columns[name] = column.set_name(name)601 self_copy.columns = columns602 def to_dataframe(multiindex):603 """604 Emulate the behavior of pandas.MultiIndex.to_frame, but preserve605 duplicate index names if they exist.606 """607 # NOTE: this is a hack to support koalas608 if type(multiindex).__module__.startswith("databricks.koalas"):609 df = multiindex.to_frame()610 else:611 df = pd.DataFrame(612 {613 i: multiindex.get_level_values(i)614 for i in range(multiindex.nlevels)615 }616 )617 df.columns = [618 i if name is None else name619 for i, name in enumerate(multiindex.names)620 ]621 df.index = multiindex622 return df623 try:624 validation_result = super(MultiIndex, self_copy).validate(625 to_dataframe(check_obj.index),626 head,627 tail,628 sample,629 random_state,630 lazy,631 inplace,632 )633 except errors.SchemaErrors as err:634 # This is a hack to re-raise the SchemaErrors exception and change635 # the schema context to MultiIndex. 
This should be fixed by with636 # a more principled schema class hierarchy.637 schema_error_dicts = []638 for schema_error_dict in err.schema_errors:639 error = schema_error_dict["error"]640 error = errors.SchemaError(641 self,642 check_obj,643 error.args[0],644 error.failure_cases.assign(column=error.schema.name),645 error.check,646 error.check_index,647 )648 schema_error_dict["error"] = error649 schema_error_dicts.append(schema_error_dict)650 raise errors.SchemaErrors(schema_error_dicts, check_obj)651 assert check_utils.is_table(validation_result)652 return check_obj653 @st.strategy_import_error654 # NOTE: remove these ignore statements as part of655 # https://github.com/pandera-dev/pandera/issues/403656 # pylint: disable=arguments-differ657 def strategy(self, *, size=None): # type: ignore658 return st.multiindex_strategy(indexes=self.indexes, size=size)659 # NOTE: remove these ignore statements as part of660 # https://github.com/pandera-dev/pandera/issues/403661 # pylint: disable=arguments-differ662 def example(self, size=None) -> pd.MultiIndex: # type: ignore663 # pylint: disable=import-outside-toplevel,cyclic-import,import-error664 import hypothesis665 with warnings.catch_warnings():666 warnings.simplefilter(667 "ignore",668 category=hypothesis.errors.NonInteractiveExampleWarning,669 )670 return self.strategy(size=size).example()671 def __repr__(self):672 return (673 f"<Schema {self.__class__.__name__}("674 f"indexes={self.indexes}, "675 f"coerce={self.coerce}, "676 f"strict={self.strict}, "677 f"name={self.name}, "678 f"ordered={self.ordered}"679 ")>"680 )681 def __str__(self):682 indent = " " * 4683 indexes_str = "[\n"684 for index in self.indexes:685 indexes_str += f"{indent * 2}{index}\n"686 indexes_str += f"{indent}]"687 return (688 f"<Schema {self.__class__.__name__}(\n"689 f"{indent}indexes={indexes_str}\n"690 f"{indent}coerce={self.coerce},\n"691 f"{indent}strict={self.strict},\n"692 f"{indent}name={self.name},\n"693 f"{indent}ordered={self.ordered}\n"694 ")>"695 )696 def __eq__(self, other):...
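In the file above, every method decorated with `@st.strategy_import_error` is an entry point into hypothesis-based data generation: `Column.strategy` returns a strategy for a one-column DataFrame, `Index.strategy` returns an index strategy, and the `example` methods draw concrete objects from those strategies. A short usage sketch, assuming pandera is installed with the `strategies` extra (`pip install pandera[strategies]`):

```python
# Usage sketch: these calls succeed only when hypothesis is available;
# otherwise the strategy_import_error decorator raises ImportError.
import pandera as pa

column = pa.Column(int, checks=pa.Check.ge(0), name="col")
print(column.strategy(size=3))  # hypothesis strategy for a one-column DataFrame
print(column.example(size=3))   # concrete DataFrame drawn from that strategy

index = pa.Index(str, name="idx")
print(index.example(size=2))    # concrete pandas Index
```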

model.py

Source: model.py (GitHub)


1"""Class-based api"""2import inspect3import os4import re5import sys6import typing7from typing import (8 Any,9 Callable,10 Dict,11 Iterable,12 List,13 Optional,14 Set,15 Tuple,16 Type,17 TypeVar,18 Union,19 cast,20)21import pandas as pd22from . import schema_components23from . import strategies as st24from .checks import Check25from .errors import SchemaInitError26from .json_schema import to_json_schema27from .model_components import (28 CHECK_KEY,29 DATAFRAME_CHECK_KEY,30 CheckInfo,31 Field,32 FieldCheckInfo,33 FieldInfo,34)35from .schemas import DataFrameSchema36from .typing import INDEX_TYPES, SERIES_TYPES, AnnotationInfo37from .typing.common import DataFrameBase38from .typing.config import BaseConfig39if sys.version_info[:2] < (3, 9):40 from typing_extensions import get_type_hints41else:42 from typing import get_type_hints43try:44 from pydantic.fields import ModelField # pylint:disable=unused-import45 HAS_PYDANTIC = True46except ImportError:47 HAS_PYDANTIC = False48SchemaIndex = Union[schema_components.Index, schema_components.MultiIndex]49_CONFIG_KEY = "Config"50MODEL_CACHE: Dict[Type["SchemaModel"], DataFrameSchema] = {}51F = TypeVar("F", bound=Callable)52TSchemaModel = TypeVar("TSchemaModel", bound="SchemaModel")53def docstring_substitution(*args: Any, **kwargs: Any) -> Callable[[F], F]:54 """Typed wrapper around pd.util.Substitution."""55 def decorator(func: F) -> F:56 return cast(F, pd.util.Substitution(*args, **kwargs)(func))57 return decorator58def _is_field(name: str) -> bool:59 """Ignore private and reserved keywords."""60 return not name.startswith("_") and name != _CONFIG_KEY61_config_options = [attr for attr in vars(BaseConfig) if _is_field(attr)]62def _extract_config_options_and_extras(63 config: Any,64) -> Tuple[Dict[str, Any], Dict[str, Any]]:65 config_options, extras = {}, {}66 for name, value in vars(config).items():67 if name in _config_options:68 config_options[name] = value69 elif _is_field(name):70 extras[name] = value71 # drop private/reserved keywords72 return config_options, extras73def _convert_extras_to_checks(extras: Dict[str, Any]) -> List[Check]:74 """75 New in GH#383.76 Any key not in BaseConfig keys is interpreted as defining a dataframe check. 
This function77 defines this conversion as follows:78 - Look up the key name in Check79 - If value is80 - tuple: interpret as args81 - dict: interpret as kwargs82 - anything else: interpret as the only argument to pass to Check83 """84 checks = []85 for name, value in extras.items():86 if isinstance(value, tuple):87 args, kwargs = value, {}88 elif isinstance(value, dict):89 args, kwargs = (), value90 else:91 args, kwargs = (value,), {}92 # dispatch directly to getattr to raise the correct exception93 checks.append(Check.__getattr__(name)(*args, **kwargs))94 return checks95class _MetaSchema(type):96 """Add string representations, mainly for pydantic."""97 def __repr__(cls):98 return str(cls)99 def __str__(cls):100 return cls.__name__101class SchemaModel(metaclass=_MetaSchema):102 """Definition of a :class:`~pandera.DataFrameSchema`.103 *new in 0.5.0*104 See the :ref:`User Guide <schema_models>` for more.105 """106 Config: Type[BaseConfig] = BaseConfig107 __extras__: Optional[Dict[str, Any]] = None108 __schema__: Optional[DataFrameSchema] = None109 __config__: Optional[Type[BaseConfig]] = None110 #: Key according to `FieldInfo.name`111 __fields__: Dict[str, Tuple[AnnotationInfo, FieldInfo]] = {}112 __checks__: Dict[str, List[Check]] = {}113 __dataframe_checks__: List[Check] = []114 # This is syntantic sugar that delegates to the validate method115 @docstring_substitution(validate_doc=DataFrameSchema.validate.__doc__)116 def __new__(cls, *args, **kwargs) -> DataFrameBase[TSchemaModel]: # type: ignore [misc]117 """%(validate_doc)s"""118 return cast(DataFrameBase[TSchemaModel], cls.validate(*args, **kwargs))119 def __init_subclass__(cls, **kwargs):120 """Ensure :class:`~pandera.model_components.FieldInfo` instances."""121 super().__init_subclass__(**kwargs)122 # pylint:disable=no-member123 subclass_annotations = cls.__dict__.get("__annotations__", {})124 for field_name in subclass_annotations.keys():125 if _is_field(field_name) and field_name not in cls.__dict__:126 # Field omitted127 field = Field()128 field.__set_name__(cls, field_name)129 setattr(cls, field_name, field)130 cls.__config__, cls.__extras__ = cls._collect_config_and_extras()131 @classmethod132 def to_schema(cls) -> DataFrameSchema:133 """Create :class:`~pandera.DataFrameSchema` from the :class:`.SchemaModel`."""134 if cls in MODEL_CACHE:135 return MODEL_CACHE[cls]136 mi_kwargs = {137 name[len("multiindex_") :]: value138 for name, value in vars(cls.__config__).items()139 if name.startswith("multiindex_")140 }141 cls.__fields__ = cls._collect_fields()142 check_infos = typing.cast(143 List[FieldCheckInfo], cls._collect_check_infos(CHECK_KEY)144 )145 cls.__checks__ = cls._extract_checks(146 check_infos, field_names=list(cls.__fields__.keys())147 )148 df_check_infos = cls._collect_check_infos(DATAFRAME_CHECK_KEY)149 df_custom_checks = cls._extract_df_checks(df_check_infos)150 df_registered_checks = _convert_extras_to_checks(151 {} if cls.__extras__ is None else cls.__extras__152 )153 cls.__dataframe_checks__ = df_custom_checks + df_registered_checks154 columns, index = cls._build_columns_index(155 cls.__fields__, cls.__checks__, **mi_kwargs156 )157 kwargs = {}158 if cls.__config__ is not None:159 kwargs = {160 "coerce": cls.__config__.coerce,161 "strict": cls.__config__.strict,162 "name": cls.__config__.name,163 "ordered": cls.__config__.ordered,164 "unique": cls.__config__.unique,165 "title": cls.__config__.title,166 "description": cls.__config__.description or cls.__doc__,167 }168 cls.__schema__ = DataFrameSchema(169 columns,170 
index=index,171 checks=cls.__dataframe_checks__, # type: ignore172 **kwargs,173 )174 if cls not in MODEL_CACHE:175 MODEL_CACHE[cls] = cls.__schema__ # type: ignore176 return cls.__schema__ # type: ignore177 @classmethod178 def to_yaml(cls, stream: Optional[os.PathLike] = None):179 """180 Convert `Schema` to yaml using `io.to_yaml`.181 """182 return cls.to_schema().to_yaml(stream)183 @classmethod184 @docstring_substitution(validate_doc=DataFrameSchema.validate.__doc__)185 def validate(186 cls: Type[TSchemaModel],187 check_obj: pd.DataFrame,188 head: Optional[int] = None,189 tail: Optional[int] = None,190 sample: Optional[int] = None,191 random_state: Optional[int] = None,192 lazy: bool = False,193 inplace: bool = False,194 ) -> DataFrameBase[TSchemaModel]:195 """%(validate_doc)s"""196 return cast(197 DataFrameBase[TSchemaModel],198 cls.to_schema().validate(199 check_obj, head, tail, sample, random_state, lazy, inplace200 ),201 )202 @classmethod203 @docstring_substitution(strategy_doc=DataFrameSchema.strategy.__doc__)204 @st.strategy_import_error205 def strategy(cls: Type[TSchemaModel], *, size: Optional[int] = None):206 """%(strategy_doc)s"""207 return cls.to_schema().strategy(size=size)208 @classmethod209 @docstring_substitution(example_doc=DataFrameSchema.strategy.__doc__)210 @st.strategy_import_error211 def example(212 cls: Type[TSchemaModel], *, size: Optional[int] = None213 ) -> DataFrameBase[TSchemaModel]:214 """%(example_doc)s"""215 return cast(216 DataFrameBase[TSchemaModel], cls.to_schema().example(size=size)217 )218 @classmethod219 def _build_columns_index( # pylint:disable=too-many-locals220 cls,221 fields: Dict[str, Tuple[AnnotationInfo, FieldInfo]],222 checks: Dict[str, List[Check]],223 **multiindex_kwargs: Any,224 ) -> Tuple[225 Dict[str, schema_components.Column],226 Optional[Union[schema_components.Index, schema_components.MultiIndex]],227 ]:228 index_count = sum(229 annotation.origin in INDEX_TYPES230 for annotation, _ in fields.values()231 )232 columns: Dict[str, schema_components.Column] = {}233 indices: List[schema_components.Index] = []234 for field_name, (annotation, field) in fields.items():235 field_checks = checks.get(field_name, [])236 field_name = field.name237 check_name = getattr(field, "check_name", None)238 if annotation.metadata:239 if field.dtype_kwargs:240 raise TypeError(241 "Cannot specify redundant 'dtype_kwargs' "242 + f"for {annotation.raw_annotation}."243 + "\n Usage Tip: Drop 'typing.Annotated'."244 )245 dtype_kwargs = _get_dtype_kwargs(annotation)246 dtype = annotation.arg(**dtype_kwargs) # type: ignore247 elif annotation.default_dtype:248 dtype = annotation.default_dtype249 else:250 dtype = annotation.arg251 dtype = None if dtype is Any else dtype252 if (253 annotation.origin in SERIES_TYPES254 or annotation.raw_annotation in SERIES_TYPES255 ):256 col_constructor = (257 field.to_column if field else schema_components.Column258 )259 if check_name is False:260 raise SchemaInitError(261 f"'check_name' is not supported for {field_name}."262 )263 columns[field_name] = col_constructor( # type: ignore264 dtype,265 required=not annotation.optional,266 checks=field_checks,267 name=field_name,268 )269 elif (270 annotation.origin in INDEX_TYPES271 or annotation.raw_annotation in INDEX_TYPES272 ):273 if annotation.optional:274 raise SchemaInitError(275 f"Index '{field_name}' cannot be Optional."276 )277 if check_name is False or (278 # default single index279 check_name is None280 and index_count == 1281 ):282 field_name = None # type:ignore283 
index_constructor = (284 field.to_index if field else schema_components.Index285 )286 index = index_constructor( # type: ignore287 dtype, checks=field_checks, name=field_name288 )289 indices.append(index)290 else:291 raise SchemaInitError(292 f"Invalid annotation '{field_name}: "293 f"{annotation.raw_annotation}'"294 )295 return columns, _build_schema_index(indices, **multiindex_kwargs)296 @classmethod297 def _get_model_attrs(cls) -> Dict[str, Any]:298 """Return all attributes.299 Similar to inspect.get_members but bypass descriptors __get__.300 """301 bases = inspect.getmro(cls)[:-1] # bases -> SchemaModel -> object302 attrs = {}303 for base in reversed(bases):304 attrs.update(base.__dict__)305 return attrs306 @classmethod307 def _collect_fields(cls) -> Dict[str, Tuple[AnnotationInfo, FieldInfo]]:308 """Centralize publicly named fields and their corresponding annotations."""309 annotations = get_type_hints( # pylint:disable=unexpected-keyword-arg310 cls, include_extras=True311 )312 attrs = cls._get_model_attrs()313 missing = []314 for name, attr in attrs.items():315 if inspect.isroutine(attr):316 continue317 if not _is_field(name):318 annotations.pop(name, None)319 elif name not in annotations:320 missing.append(name)321 if missing:322 raise SchemaInitError(f"Found missing annotations: {missing}")323 fields = {}324 for field_name, annotation in annotations.items():325 field = attrs[field_name] # __init_subclass__ guarantees existence326 if not isinstance(field, FieldInfo):327 raise SchemaInitError(328 f"'{field_name}' can only be assigned a 'Field', "329 + f"not a '{type(field)}.'"330 )331 fields[field.name] = (AnnotationInfo(annotation), field)332 return fields333 @classmethod334 def _collect_config_and_extras(335 cls,336 ) -> Tuple[Type[BaseConfig], Dict[str, Any]]:337 """Collect config options from bases, splitting off unknown options."""338 bases = inspect.getmro(cls)[:-1]339 bases = typing.cast(Tuple[Type[SchemaModel]], bases)340 root_model, *models = reversed(bases)341 options, extras = _extract_config_options_and_extras(root_model.Config)342 for model in models:343 config = getattr(model, _CONFIG_KEY, {})344 base_options, base_extras = _extract_config_options_and_extras(345 config346 )347 options.update(base_options)348 extras.update(base_extras)349 return type("Config", (BaseConfig,), options), extras350 @classmethod351 def _collect_check_infos(cls, key: str) -> List[CheckInfo]:352 """Collect inherited check metadata from bases.353 Inherited classmethods are not in cls.__dict__, that's why we need to354 walk the inheritance tree.355 """356 bases = inspect.getmro(cls)[:-2] # bases -> SchemaModel -> object357 bases = typing.cast(Tuple[Type[SchemaModel]], bases)358 method_names = set()359 check_infos = []360 for base in bases:361 for attr_name, attr_value in vars(base).items():362 check_info = getattr(attr_value, key, None)363 if not isinstance(check_info, CheckInfo):364 continue365 if attr_name in method_names: # check overridden by subclass366 continue367 method_names.add(attr_name)368 check_infos.append(check_info)369 return check_infos370 @classmethod371 def _extract_checks(372 cls, check_infos: List[FieldCheckInfo], field_names: List[str]373 ) -> Dict[str, List[Check]]:374 """Collect field annotations from bases in mro reverse order."""375 checks: Dict[str, List[Check]] = {}376 for check_info in check_infos:377 check_info_fields = {378 field.name if isinstance(field, FieldInfo) else field379 for field in check_info.fields380 }381 if check_info.regex:382 matched = 
_regex_filter(field_names, check_info_fields)383 else:384 matched = check_info_fields385 check_ = check_info.to_check(cls)386 for field in matched:387 if field not in field_names:388 raise SchemaInitError(389 f"Check {check_.name} is assigned to a non-existing field '{field}'."390 )391 if field not in checks:392 checks[field] = []393 checks[field].append(check_)394 return checks395 @classmethod396 def _extract_df_checks(cls, check_infos: List[CheckInfo]) -> List[Check]:397 """Collect field annotations from bases in mro reverse order."""398 return [check_info.to_check(cls) for check_info in check_infos]399 @classmethod400 def __get_validators__(cls):401 yield cls._pydantic_validate402 @classmethod403 def _pydantic_validate(cls, schema_model: Any) -> "SchemaModel":404 """Verify that the input is a compatible schema model."""405 if not inspect.isclass(schema_model): # type: ignore406 raise TypeError(f"{schema_model} is not a pandera.SchemaModel")407 if not issubclass(schema_model, cls): # type: ignore408 raise TypeError(f"{schema_model} does not inherit {cls}.")409 try:410 schema_model.to_schema()411 except SchemaInitError as exc:412 raise ValueError(413 f"Cannot use {cls} as a pydantic type as its "414 "SchemaModel cannot be converted to a DataFrameSchema.\n"415 f"Please revisit the model to address the following errors:"416 f"\n{exc}"417 ) from exc418 return cast("SchemaModel", schema_model)419 @classmethod420 def __modify_schema__(cls, field_schema):421 """Update pydantic field schema."""422 field_schema.update(to_json_schema(cls.to_schema()))423def _build_schema_index(424 indices: List[schema_components.Index], **multiindex_kwargs: Any425) -> Optional[SchemaIndex]:426 index: Optional[SchemaIndex] = None427 if indices:428 if len(indices) == 1:429 index = indices[0]430 else:431 index = schema_components.MultiIndex(indices, **multiindex_kwargs)432 return index433def _regex_filter(seq: Iterable, regexps: Iterable[str]) -> Set[str]:434 """Filter items matching at least one of the regexes."""435 matched: Set[str] = set()436 for regex in regexps:437 pattern = re.compile(regex)438 matched.update(filter(pattern.match, seq))439 return matched440def _get_dtype_kwargs(annotation: AnnotationInfo) -> Dict[str, Any]:441 sig = inspect.signature(annotation.arg) # type: ignore442 dtype_arg_names = list(sig.parameters.keys())443 if len(annotation.metadata) != len(dtype_arg_names): # type: ignore444 raise TypeError(445 f"Annotation '{annotation.arg.__name__}' requires " # type: ignore446 + f"all positional arguments {dtype_arg_names}."447 )...
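`SchemaModel.strategy` and `SchemaModel.example` delegate to the `DataFrameSchema` produced by `to_schema()`, so the same requirement applies: both raise `ImportError` via `strategy_import_error` unless `hypothesis` is installed. A usage sketch, again assuming the `strategies` extra:

```python
# Usage sketch for the class-based API (assumes pandera[strategies]).
import pandera as pa
from pandera.typing import Series


class Schema(pa.SchemaModel):
    col1: Series[int] = pa.Field(ge=0)
    col2: Series[str]


print(Schema.strategy(size=5))  # hypothesis strategy built from the model's schema
print(Schema.example(size=5))   # synthesized DataFrame that validates against Schema
```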


