How to use get_regex_columns method in pandera

Best Python code snippets using pandera. The excerpts below show how get_regex_columns is implemented in pandera, how the library uses it internally for dtype coercion and validation, and how it is tested.

schemas.py

Source: schemas.py (pandera, GitHub)


...
            if column.regex:
                regex_dtype.update(
                    {
                        c: column.dtype
                        for c in column.get_regex_columns(dataframe.columns)
                    }
                )
        return {
            **{n: c.dtype for n, c in self.columns.items() if not c.regex},
            **regex_dtype,
        }

    @property
    def dtype(
        self,
    ) -> DataType:
        """Get the dtype property."""
        return self._dtype  # type: ignore

    @dtype.setter
    def dtype(self, value: PandasDtypeInputTypes) -> None:
        """Set the pandas dtype property."""
        self._dtype = pandas_engine.Engine.dtype(value) if value else None

    def _coerce_dtype(self, obj: pd.DataFrame) -> pd.DataFrame:
        if self.dtype is None:
            raise ValueError(
                "dtype argument is None. Must specify this argument "
                "to coerce dtype"
            )
        try:
            return self.dtype.try_coerce(obj)
        except errors.ParserError as exc:
            raise errors.SchemaError(
                self,
                obj,
                (
                    f"Error while coercing '{self.name}' to type "
                    f"{self.dtype}: {exc}"
                ),
                failure_cases=exc.failure_cases,
                check=f"coerce_dtype('{self.dtype}')",
            ) from exc

    def coerce_dtype(self, obj: pd.DataFrame) -> pd.DataFrame:
        """Coerce dataframe to the type specified in dtype.

        :param obj: dataframe to coerce.
        :returns: dataframe with coerced dtypes
        """
        error_handler = SchemaErrorHandler(lazy=True)

        def _try_coercion(coerce_fn, obj):
            try:
                return coerce_fn(obj)
            except errors.SchemaError as exc:
                error_handler.collect_error("dtype_coercion_error", exc)
                return obj

        for colname, col_schema in self.columns.items():
            if col_schema.regex:
                try:
                    matched_columns = col_schema.get_regex_columns(obj.columns)
                except errors.SchemaError:
                    matched_columns = pd.Index([])

                for matched_colname in matched_columns:
                    if col_schema.coerce or self.coerce:
                        obj[matched_colname] = _try_coercion(
                            col_schema.coerce_dtype, obj[matched_colname]
                        )
            elif (
                (col_schema.coerce or self.coerce)
                and self.dtype is None
                and colname in obj
            ):
                obj[colname] = _try_coercion(
                    col_schema.coerce_dtype, obj[colname]
                )

        if self.dtype is not None:
            obj = _try_coercion(self._coerce_dtype, obj)
        if self.index is not None and (self.index.coerce or self.coerce):
            index_schema = copy.deepcopy(self.index)
            if self.coerce:
                # coercing at the dataframe-level should apply index coercion
                # for both single- and multi-indexes.
                index_schema._coerce = True
            coerced_index = _try_coercion(index_schema.coerce_dtype, obj.index)
            if coerced_index is not None:
                obj.index = coerced_index

        if error_handler.collected_errors:
            raise errors.SchemaErrors(error_handler.collected_errors, obj)

        return obj

    def validate(
        self,
        check_obj: pd.DataFrame,
        head: Optional[int] = None,
        tail: Optional[int] = None,
        sample: Optional[int] = None,
        random_state: Optional[int] = None,
        lazy: bool = False,
        inplace: bool = False,
    ) -> pd.DataFrame:
        """Check if all columns in a dataframe have a column in the Schema.

        :param pd.DataFrame check_obj: the dataframe to be validated.
        :param head: validate the first n rows. Rows overlapping with `tail` or
            `sample` are de-duplicated.
        :param tail: validate the last n rows. Rows overlapping with `head` or
            `sample` are de-duplicated.
        :param sample: validate a random sample of n rows. Rows overlapping
            with `head` or `tail` are de-duplicated.
        :param random_state: random seed for the ``sample`` argument.
        :param lazy: if True, lazily evaluates dataframe against all validation
            checks and raises a ``SchemaErrors``. Otherwise, raise
            ``SchemaError`` as soon as one occurs.
        :param inplace: if True, applies coercion to the object of validation,
            otherwise creates a copy of the data.
        :returns: validated ``DataFrame``
        :raises SchemaError: when ``DataFrame`` violates built-in or custom
            checks.

        :example:

        Calling ``schema.validate`` returns the dataframe.

        >>> import pandas as pd
        >>> import pandera as pa
        >>>
        >>> df = pd.DataFrame({
        ...     "probability": [0.1, 0.4, 0.52, 0.23, 0.8, 0.76],
        ...     "category": ["dog", "dog", "cat", "duck", "dog", "dog"]
        ... })
        >>>
        >>> schema_withchecks = pa.DataFrameSchema({
        ...     "probability": pa.Column(
        ...         float, pa.Check(lambda s: (s >= 0) & (s <= 1))),
        ...
        ...     # check that the "category" column contains a few discrete
        ...     # values, and the majority of the entries are dogs.
        ...     "category": pa.Column(
        ...         str, [
        ...             pa.Check(lambda s: s.isin(["dog", "cat", "duck"])),
        ...             pa.Check(lambda s: (s == "dog").mean() > 0.5),
        ...         ]),
        ... })
        >>>
        >>> schema_withchecks.validate(df)[["probability", "category"]]
           probability category
        0         0.10      dog
        1         0.40      dog
        2         0.52      cat
        3         0.23     duck
        4         0.80      dog
        5         0.76      dog
        """
        if not check_utils.is_table(check_obj):
            raise TypeError(f"expected pd.DataFrame, got {type(check_obj)}")

        if hasattr(check_obj, "dask"):
            # special case for dask dataframes
            if inplace:
                check_obj = check_obj.pandera.add_schema(self)
            else:
                check_obj = check_obj.copy()

            check_obj = check_obj.map_partitions(
                self._validate,
                head=head,
                tail=tail,
                sample=sample,
                random_state=random_state,
                lazy=lazy,
                inplace=inplace,
                meta=check_obj,
            )
            return check_obj.pandera.add_schema(self)

        return self._validate(
            check_obj=check_obj,
            head=head,
            tail=tail,
            sample=sample,
            random_state=random_state,
            lazy=lazy,
            inplace=inplace,
        )

    def _validate(
        self,
        check_obj: pd.DataFrame,
        head: Optional[int] = None,
        tail: Optional[int] = None,
        sample: Optional[int] = None,
        random_state: Optional[int] = None,
        lazy: bool = False,
        inplace: bool = False,
    ) -> pd.DataFrame:
        # pylint: disable=too-many-locals,too-many-branches,too-many-statements
        if self._is_inferred:
            warnings.warn(
                f"This {type(self)} is an inferred schema that hasn't been "
                "modified. It's recommended that you refine the schema "
                "by calling `add_columns`, `remove_columns`, or "
                "`update_columns` before using it to validate data.",
                UserWarning,
            )

        error_handler = SchemaErrorHandler(lazy)

        if not inplace:
            check_obj = check_obj.copy()

        if hasattr(check_obj, "pandera"):
            check_obj = check_obj.pandera.add_schema(self)

        # dataframe strictness check makes sure all columns in the dataframe
        # are specified in the dataframe schema
        if self.strict or self.ordered:
            column_names: List[Any] = []
            for col_name, col_schema in self.columns.items():
                if col_schema.regex:
                    try:
                        column_names.extend(
                            col_schema.get_regex_columns(check_obj.columns)
                        )
                    except errors.SchemaError:
                        pass
                elif col_name in check_obj.columns:
                    column_names.append(col_name)

            # ordered "set" of columns
            sorted_column_names = iter(dict.fromkeys(column_names))
            expanded_column_names = frozenset(column_names)

            # drop adjacent duplicated column names
            if check_obj.columns.has_duplicates:
                columns = [k for k, _ in itertools.groupby(check_obj.columns)]
            else:
                columns = check_obj.columns

            for column in columns:
...
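
To see this coercion path from the user's side, here is a minimal sketch (the column names and data are illustrative, not from the pandera source): a DataFrameSchema keyed by a regex pattern with coerce=True. During coerce_dtype, get_regex_columns expands the pattern against the dataframe's columns, and each matched column is coerced individually, as in the loop above.

import pandas as pd
import pandera as pa

# Illustrative data: two "num_*" columns stored as strings.
df = pd.DataFrame({
    "num_1": ["1", "2"],
    "num_2": ["3", "4"],
    "other": ["a", "b"],
})

# The "num_.+" key is treated as a regex pattern because regex=True;
# coerce=True routes every matched column through Column.coerce_dtype.
schema = pa.DataFrameSchema({
    "num_.+": pa.Column(int, regex=True, coerce=True),
    "other": pa.Column(str),
})

validated = schema.validate(df)
print(validated.dtypes)  # num_1 and num_2 are coerced to int64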


schema_components.py

Source: schema_components.py (pandera, GitHub)


...
                lazy,
                inplace=inplace,
            )

        column_keys_to_check = (
            self.get_regex_columns(check_obj.columns)
            if self._regex
            else [self._name]
        )

        for column_name in column_keys_to_check:
            if self.coerce:
                check_obj[column_name] = self.coerce_dtype(
                    check_obj[column_name]
                )
            if check_utils.is_table(check_obj[column_name]):
                for i in range(check_obj[column_name].shape[1]):
                    validate_column(
                        check_obj[column_name].iloc[:, [i]], column_name
                    )
            else:
                validate_column(check_obj, column_name)

        return check_obj

    def get_regex_columns(
        self, columns: Union[pd.Index, pd.MultiIndex]
    ) -> Union[pd.Index, pd.MultiIndex]:
        """Get matching column names based on regex column name pattern.

        :param columns: columns to regex pattern match
        :returns: matching columns
        """
        if isinstance(self.name, tuple):
            # handle MultiIndex case
            if len(self.name) != columns.nlevels:
                raise IndexError(
                    f"Column regex name='{self.name}' is a tuple, expected a "
                    f"MultiIndex columns with {len(self.name)} number of "
                    f"levels, found {columns.nlevels} level(s)"
                )
...
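
Calling get_regex_columns directly is handy for checking which columns a pattern will capture before validating. Below is a minimal sketch against the signature shown above, which takes the dataframe's columns index (the pattern and column names are illustrative, and newer pandera releases may expose a different signature):

import pandas as pd
import pandera as pa

columns = pd.Index(["foo_1", "foo_2", "bar_1"])

# With regex=True, the column name is interpreted as a regex pattern.
column_schema = pa.Column(int, name=r"foo_\d+", regex=True)

matched = column_schema.get_regex_columns(columns)
print(matched.tolist())  # ['foo_1', 'foo_2']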


test_schema_components.py

Source: test_schema_components.py (pandera, GitHub)


...
        regex=True,
    )
    if error is not None:
        with pytest.raises(error):
            column_schema.get_regex_columns(columns)
    else:
        matched_columns = column_schema.get_regex_columns(columns)
        assert expected_matches == matched_columns.tolist()


INT_REGEX = r"-?\d+$"
FLOAT_REGEX = r"-?\d+\.\d+$"
DATETIME_REGEX = r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}"


@pytest.mark.parametrize(
    "column_name_regex, expected_matches",
    [
        # match all
        [".+", [1, 2.2, 3.1415, -1, -3.6, pd.Timestamp("2018/01/01")]],
        # match integers
        [INT_REGEX, [1, -1]],
        # match floats
        [FLOAT_REGEX, [2.2, 3.1415, -3.6]],
        # match datetimes
        [DATETIME_REGEX, [pd.Timestamp("2018/01/01")]],
    ],
)
def test_column_regex_matching_non_str_types(
    column_name_regex: str, expected_matches: List
) -> None:
    """Non-string column names should be cast into str for regex matching."""
    columns = pd.Index([1, 2.2, 3.1415, -1, -3.6, pd.Timestamp("2018/01/01")])
    column_schema = Column(name=column_name_regex, regex=True)
    matched_columns = column_schema.get_regex_columns(columns)
    assert expected_matches == matched_columns.tolist()


@pytest.mark.parametrize(
    "column_name_regex, expected_matches",
    [
        # match all
        [
            (".+", ".+"),
            [
                ("foo", 1),
                ("foo", pd.Timestamp("2018/01/01")),
                (1, 2.2),
                (3.14, -1),
            ],
        ],
        # match (str, int)
        [("foo", INT_REGEX), [("foo", 1)]],
        # match (str, pd.Timestamp)
        [("foo", DATETIME_REGEX), [("foo", pd.Timestamp("2018/01/01"))]],
        # match (int, float)
        [(INT_REGEX, FLOAT_REGEX), [(1, 2.2)]],
        # match (float, int)
        [(FLOAT_REGEX, INT_REGEX), [(3.14, -1)]],
    ],
)
def test_column_regex_matching_non_str_types_multiindex(
    column_name_regex: Tuple[str, str], expected_matches: List[Tuple[Any, Any]]
) -> None:
    """
    Non-string column names should be cast into str for regex matching in
    MultiIndex column case.
    """
    columns = pd.MultiIndex.from_tuples(
        (
            ("foo", 1),
            ("foo", pd.Timestamp("2018/01/01")),
            (1, 2.2),
            (3.14, -1),
        )
    )
    column_schema = Column(name=column_name_regex, regex=True)
    matched_columns = column_schema.get_regex_columns(columns)
    assert expected_matches == matched_columns.tolist()


def test_column_regex_strict() -> None:
    """Test that Column regex patterns are correctly parsed in DataFrameSchema."""
    data = pd.DataFrame(
        {
            "foo_1": [1, 2, 3],
            "foo_2": [1, 2, 3],
            "foo_3": [1, 2, 3],
        }
    )
    schema = DataFrameSchema(
        columns={"foo_*": Column(Int, regex=True)}, strict=True
    )
    assert isinstance(schema.validate(data), pd.DataFrame)
...
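
The MultiIndex tests above translate into a standalone sketch like this (the labels are illustrative): when the Column name is a tuple of patterns, each element is matched against the corresponding column level, and non-string labels are cast to str before matching.

import pandas as pd
import pandera as pa

columns = pd.MultiIndex.from_tuples([
    ("foo", 1),
    ("foo", 2),
    ("bar", 1),
])

# A tuple name must supply one regex per MultiIndex level.
column_schema = pa.Column(int, name=("foo", r"-?\d+$"), regex=True)

matched = column_schema.get_regex_columns(columns)
print(matched.tolist())  # [('foo', 1), ('foo', 2)]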


