Best Python code snippets using the pandera library.
schemas.py
Source: schemas.py
...1571                failure_cases=exc.failure_cases,1572                check=f"coerce_dtype('{self.dtype}')",1573            ) from exc1574    @property1575    def _allow_groupby(self):1576        """Whether the schema or schema component allows groupby operations."""1577        raise NotImplementedError(  # pragma: no cover1578            "The _allow_groupby property must be implemented by subclasses "1579            "of SeriesSchemaBase"1580        )1581    def validate(1582        self,1583        check_obj: Union[pd.DataFrame, pd.Series],1584        head: Optional[int] = None,1585        tail: Optional[int] = None,1586        sample: Optional[int] = None,1587        random_state: Optional[int] = None,1588        lazy: bool = False,1589        inplace: bool = False,1590    ) -> Union[pd.DataFrame, pd.Series]:1591        # pylint: disable=too-many-locals,too-many-branches,too-many-statements1592        """Validate a series or specific column in dataframe.1593        :check_obj: pandas DataFrame or Series to validate.1594        :param head: validate the first n rows. Rows overlapping with `tail` or1595            `sample` are de-duplicated.1596        :param tail: validate the last n rows. Rows overlapping with `head` or1597            `sample` are de-duplicated.1598        :param sample: validate a random sample of n rows. Rows overlapping1599            with `head` or `tail` are de-duplicated.1600        :param random_state: random seed for the ``sample`` argument.1601        :param lazy: if True, lazily evaluates dataframe against all validation1602            checks and raises a ``SchemaErrors``. 
Otherwise, raise1603            ``SchemaError`` as soon as one occurs.1604        :param inplace: if True, applies coercion to the object of validation,1605            otherwise creates a copy of the data.1606        :returns: validated DataFrame or Series.1607        """1608        if self._is_inferred:1609            warnings.warn(1610                f"This {type(self)} is an inferred schema that hasn't been "1611                "modified. It's recommended that you refine the schema "1612                "by calling `set_checks` before using it to validate data.",1613                UserWarning,1614            )1615        error_handler = SchemaErrorHandler(lazy)1616        if not inplace:1617            check_obj = check_obj.copy()1618        series = (1619            check_obj1620            if check_utils.is_field(check_obj)1621            else check_obj[self.name]1622        )1623        series = _pandas_obj_to_validate(1624            series, head, tail, sample, random_state1625        )1626        check_obj = _pandas_obj_to_validate(1627            check_obj, head, tail, sample, random_state1628        )1629        if self.name is not None and series.name != self._name:1630            msg = (1631                f"Expected {type(self)} to have name '{self._name}', found "1632                f"'{series.name}'"1633            )1634            error_handler.collect_error(1635                "wrong_field_name",1636                errors.SchemaError(1637                    self,1638                    check_obj,1639                    msg,1640                    failure_cases=scalar_failure_case(series.name),1641                    check=f"field_name('{self._name}')",1642                ),1643            )1644        if not self._nullable:1645            nulls = series.isna()1646            if nulls.sum() > 0:1647                failed = series[nulls]1648                msg = (1649                    f"non-nullable series '{series.name}' contains null "1650        
            f"values:\n{failed}"1651                )1652                error_handler.collect_error(1653                    "series_contains_nulls",1654                    errors.SchemaError(1655                        self,1656                        check_obj,1657                        msg,1658                        failure_cases=reshape_failure_cases(1659                            series[nulls], ignore_na=False1660                        ),1661                        check="not_nullable",1662                    ),1663                )1664        # Check if the series contains duplicate values1665        if self._unique:1666            if type(series).__module__.startswith("databricks.koalas"):1667                duplicates = (1668                    series.to_frame().duplicated().reindex(series.index)1669                )1670                # pylint: disable=import-outside-toplevel1671                import databricks.koalas as ks1672                with ks.option_context("compute.ops_on_diff_frames", True):1673                    failed = series[duplicates]1674            else:1675                duplicates = series.duplicated()1676                failed = series[duplicates]1677            if duplicates.any():1678                msg = (1679                    f"series '{series.name}' contains duplicate values:\n"1680                    f"{failed}"1681                )1682                error_handler.collect_error(1683                    "series_contains_duplicates",1684                    errors.SchemaError(1685                        self,1686                        check_obj,1687                        msg,1688                        failure_cases=reshape_failure_cases(failed),1689                        check="field_uniqueness",1690                    ),1691                )1692        if self._dtype is not None and (1693            not self._dtype.check(pandas_engine.Engine.dtype(series.dtype))1694        ):1695            msg = (1696                
f"expected series '{series.name}' to have type {self._dtype}, "1697                + f"got {series.dtype}"1698            )1699            error_handler.collect_error(1700                "wrong_dtype",1701                errors.SchemaError(1702                    self,1703                    check_obj,1704                    msg,1705                    failure_cases=scalar_failure_case(str(series.dtype)),1706                    check=f"dtype('{self.dtype}')",1707                ),1708            )1709        check_results = []1710        if check_utils.is_field(check_obj):1711            check_obj, check_args = series, [None]1712        else:1713            check_args = [self.name]  # type: ignore1714        for check_index, check in enumerate(self.checks):1715            try:1716                check_results.append(1717                    _handle_check_results(1718                        self, check_index, check, check_obj, *check_args1719                    )1720                )1721            except errors.SchemaError as err:1722                error_handler.collect_error("dataframe_check", err)1723            except Exception as err:  # pylint: disable=broad-except1724                # catch other exceptions that may occur when executing the1725                # Check1726                err_msg = f'"{err.args[0]}"' if len(err.args) > 0 else ""1727                err_str = f"{err.__class__.__name__}({ err_msg})"1728                msg = (1729                    f"Error while executing check function: {err_str}\n"1730                    + traceback.format_exc()1731                )1732                error_handler.collect_error(1733                    "check_error",1734                    errors.SchemaError(1735                        self,1736                        check_obj,1737                        msg,1738                        failure_cases=scalar_failure_case(err_str),1739                        check=check,1740                        
check_index=check_index,1741                    ),1742                    original_exc=err,1743                )1744        if lazy and error_handler.collected_errors:1745            raise errors.SchemaErrors(1746                error_handler.collected_errors, check_obj1747            )1748        assert all(check_results)1749        return check_obj1750    def __call__(1751        self,1752        check_obj: Union[pd.DataFrame, pd.Series],1753        head: Optional[int] = None,1754        tail: Optional[int] = None,1755        sample: Optional[int] = None,1756        random_state: Optional[int] = None,1757        lazy: bool = False,1758        inplace: bool = False,1759    ) -> Union[pd.DataFrame, pd.Series]:1760        """Alias for ``validate`` method."""1761        return self.validate(1762            check_obj, head, tail, sample, random_state, lazy, inplace1763        )1764    def __eq__(self, other):1765        return self.__dict__ == other.__dict__1766    @st.strategy_import_error1767    def strategy(self, *, size=None):1768        """Create a ``hypothesis`` strategy for generating a Series.1769        :param size: number of elements to generate1770        :returns: a strategy that generates pandas Series objects.1771        """1772        return st.series_strategy(1773            self.dtype,1774            checks=self.checks,1775            nullable=self.nullable,1776            unique=self.unique,1777            name=self.name,1778            size=size,1779        )1780    def example(self, size=None) -> pd.Series:1781        """Generate an example of a particular size.1782        :param size: number of elements in the generated Series.1783        :returns: pandas Series object.1784        """1785        # pylint: disable=import-outside-toplevel,cyclic-import,import-error1786        import hypothesis1787        with warnings.catch_warnings():1788            warnings.simplefilter(1789                "ignore",1790                
category=hypothesis.errors.NonInteractiveExampleWarning,1791            )1792            return self.strategy(size=size).example()1793    def __repr__(self):1794        return (1795            f"<Schema {self.__class__.__name__}"1796            f"(name={self._name}, type={self.dtype!r})>"1797        )1798    @classmethod1799    def __get_validators__(cls):1800        yield cls._pydantic_validate1801    @classmethod1802    def _pydantic_validate(  # type: ignore1803        cls: TSeriesSchemaBase, schema: Any1804    ) -> TSeriesSchemaBase:1805        """Verify that the input is a compatible DataFrameSchema."""1806        if not isinstance(schema, cls):  # type: ignore1807            raise TypeError(f"{schema} is not a {cls}.")1808        return cast(TSeriesSchemaBase, schema)1809class SeriesSchema(SeriesSchemaBase):1810    """Series validator."""1811    @deprecate_pandas_dtype1812    def __init__(1813        self,1814        dtype: PandasDtypeInputTypes = None,1815        checks: CheckList = None,1816        index=None,1817        nullable: bool = False,1818        unique: bool = False,1819        allow_duplicates: Optional[bool] = None,1820        coerce: bool = False,1821        name: str = None,1822        pandas_dtype: PandasDtypeInputTypes = None,1823        title: Optional[str] = None,1824        description: Optional[str] = None,1825    ) -> None:1826        """Initialize series schema base object.1827        :param dtype: datatype of the column. If a string is specified,1828            then assumes one of the valid pandas string values:1829            http://pandas.pydata.org/pandas-docs/stable/basics.html#dtypes1830        :param checks: If element_wise is True, then callable signature should1831            be:1832            ``Callable[Any, bool]`` where the ``Any`` input is a scalar element1833            in the column. 
Otherwise, the input is assumed to be a1834            pandas.Series object.1835        :param index: specify the datatypes and properties of the index.1836        :param nullable: Whether or not column can contain null values.1837        :param unique: Whether or not column can contain duplicate1838            values.1839        :param allow_duplicates: Whether or not column can contain duplicate1840            values.1841        .. warning::1842            This option will be deprecated in 0.8.0. Use the ``unique``1843            argument instead.1844        :param coerce: If True, when schema.validate is called the column will1845            be coerced into the specified dtype. This has no effect on columns1846            where ``pandas_dtype=None``.1847        :param name: series name.1848        :param pandas_dtype: alias of ``dtype`` for backwards compatibility.1849        :param title: A human-readable label for the series.1850        :param description: An arbitrary textual description of the series.1851            .. 
warning:: This option will be deprecated in 0.8.01852        """1853        super().__init__(1854            dtype,1855            checks,1856            nullable,1857            unique,1858            allow_duplicates,1859            coerce,1860            name,1861            pandas_dtype,1862            title,1863            description,1864        )1865        self.index = index1866    @property1867    def _allow_groupby(self) -> bool:1868        """Whether the schema or schema component allows groupby operations."""1869        return False1870    def validate(1871        self,1872        check_obj: pd.Series,1873        head: Optional[int] = None,1874        tail: Optional[int] = None,1875        sample: Optional[int] = None,1876        random_state: Optional[int] = None,1877        lazy: bool = False,1878        inplace: bool = False,1879    ) -> pd.Series:1880        """Validate a Series object.1881        :param check_obj: One-dimensional ndarray with axis labels...schema_components.py
Source: schema_components.py
...103    def regex(self) -> bool:104        """True if ``name`` attribute should be treated as a regex pattern."""105        return self._regex106    @property107    def _allow_groupby(self) -> bool:108        """Whether the schema or schema component allows groupby operations."""109        return True110    @property111    def properties(self) -> Dict[str, Any]:112        """Get column properties."""113        return {114            "dtype": self.dtype,115            "checks": self._checks,116            "nullable": self._nullable,117            "unique": self._unique,118            "coerce": self._coerce,119            "required": self.required,120            "name": self._name,121            "regex": self._regex,122            "title": self.title,123            "description": self.description,124        }125    def set_name(self, name: str):126        """Used to set or modify the name of a column object.127        :param str name: the name of the column object128        """129        self._name = name130        return self131    def coerce_dtype(self, obj: Union[pd.DataFrame, pd.Series, pd.Index]):132        """Coerce dtype of a column, handling duplicate column names."""133        # pylint: disable=super-with-arguments134        if check_utils.is_field(obj) or check_utils.is_index(obj):135            return super(Column, self).coerce_dtype(obj)136        return obj.apply(137            lambda x: super(Column, self).coerce_dtype(x), axis="columns"138        )139    def validate(140        self,141        check_obj: pd.DataFrame,142        head: Optional[int] = None,143        tail: Optional[int] = None,144        sample: Optional[int] = None,145        random_state: Optional[int] = None,146        lazy: bool = False,147        inplace: bool = False,148    ) -> pd.DataFrame:149        """Validate a Column in a DataFrame object.150        :param check_obj: pandas DataFrame to validate.151        :param head: validate the first n rows. 
Rows overlapping with `tail` or152            `sample` are de-duplicated.153        :param tail: validate the last n rows. Rows overlapping with `head` or154            `sample` are de-duplicated.155        :param sample: validate a random sample of n rows. Rows overlapping156            with `head` or `tail` are de-duplicated.157        :param random_state: random seed for the ``sample`` argument.158        :param lazy: if True, lazily evaluates dataframe against all validation159            checks and raises a ``SchemaErrors``. Otherwise, raise160            ``SchemaError`` as soon as one occurs.161        :param inplace: if True, applies coercion to the object of validation,162            otherwise creates a copy of the data.163        :returns: validated DataFrame.164        """165        if not inplace:166            check_obj = check_obj.copy()167        if self._name is None:168            raise errors.SchemaError(169                self,170                check_obj,171                "column name is set to None. 
Pass the ``name` argument when "172                "initializing a Column object, or use the ``set_name`` "173                "method.",174            )175        def validate_column(check_obj, column_name):176            super(Column, copy(self).set_name(column_name)).validate(177                check_obj,178                head,179                tail,180                sample,181                random_state,182                lazy,183                inplace=inplace,184            )185        column_keys_to_check = (186            self.get_regex_columns(check_obj.columns)187            if self._regex188            else [self._name]189        )190        for column_name in column_keys_to_check:191            if self.coerce:192                check_obj[column_name] = self.coerce_dtype(193                    check_obj[column_name]194                )195            if check_utils.is_table(check_obj[column_name]):196                for i in range(check_obj[column_name].shape[1]):197                    validate_column(198                        check_obj[column_name].iloc[:, [i]], column_name199                    )200            else:201                validate_column(check_obj, column_name)202        return check_obj203    def get_regex_columns(204        self, columns: Union[pd.Index, pd.MultiIndex]205    ) -> Union[pd.Index, pd.MultiIndex]:206        """Get matching column names based on regex column name pattern.207        :param columns: columns to regex pattern match208        :returns: matchin columns209        """210        if isinstance(self.name, tuple):211            # handle MultiIndex case212            if len(self.name) != columns.nlevels:213                raise IndexError(214                    f"Column regex name='{self.name}' is a tuple, expected a "215                    f"MultiIndex columns with {len(self.name)} number of "216                    f"levels, found {columns.nlevels} level(s)"217                )218            matches = 
np.ones(len(columns)).astype(bool)219            for i, name in enumerate(self.name):220                matched = pd.Index(221                    columns.get_level_values(i).astype(str).str.match(name)222                ).fillna(False)223                matches = matches & np.array(matched.tolist())224            column_keys_to_check = columns[matches]225        else:226            if check_utils.is_multiindex(columns):227                raise IndexError(228                    f"Column regex name {self.name} is a string, expected a "229                    "dataframe where the index is a pd.Index object, not a "230                    "pd.MultiIndex object"231                )232            column_keys_to_check = columns[233                # str.match will return nan values when the index value is234                # not a string.235                pd.Index(columns.astype(str).str.match(self.name))236                .fillna(False)237                .tolist()238            ]239        if column_keys_to_check.shape[0] == 0:240            raise errors.SchemaError(241                self,242                columns,243                f"Column regex name='{self.name}' did not match any columns "244                "in the dataframe. 
Update the regex pattern so that it "245                f"matches at least one column:\n{columns.tolist()}",246            )247        # drop duplicates to account for potential duplicated columns in the248        # dataframe.249        return column_keys_to_check.drop_duplicates()250    @st.strategy_import_error251    def strategy(self, *, size=None):252        """Create a ``hypothesis`` strategy for generating a Column.253        :param size: number of elements to generate254        :returns: a dataframe strategy for a single column.255        """256        return super().strategy(size=size).map(lambda x: x.to_frame())257    @st.strategy_import_error258    def strategy_component(self):259        """Generate column data object for use by DataFrame strategy."""260        return st.column_strategy(261            self.dtype,262            checks=self.checks,263            unique=self.unique,264            name=self.name,265        )266    def example(self, size=None) -> pd.DataFrame:267        """Generate an example of a particular size.268        :param size: number of elements in the generated Index.269        :returns: pandas DataFrame object.270        """271        # pylint: disable=import-outside-toplevel,cyclic-import,import-error272        import hypothesis273        with warnings.catch_warnings():274            warnings.simplefilter(275                "ignore",276                category=hypothesis.errors.NonInteractiveExampleWarning,277            )278            return (279                super()280                .strategy(size=size)281                .example()282                .rename(self.name)283                .to_frame()284            )285    def __eq__(self, other):286        if not isinstance(other, self.__class__):287            return NotImplemented288        def _compare_dict(obj):289            return {290                k: v if k != "_checks" else set(v)291                for k, v in obj.__dict__.items()292            }293        return 
_compare_dict(self) == _compare_dict(other)294class Index(SeriesSchemaBase):295    """Validate types and properties of a DataFrame Index."""296    @property297    def names(self):298        """Get index names in the Index schema component."""299        return [self.name]300    @property301    def _allow_groupby(self) -> bool:302        """Whether the schema or schema component allows groupby operations."""303        return False304    def validate(305        self,306        check_obj: Union[pd.DataFrame, pd.Series],307        head: Optional[int] = None,308        tail: Optional[int] = None,309        sample: Optional[int] = None,310        random_state: Optional[int] = None,311        lazy: bool = False,312        inplace: bool = False,313    ) -> Union[pd.DataFrame, pd.Series]:314        """Validate DataFrameSchema or SeriesSchema Index.315        :check_obj: pandas DataFrame of Series containing index to validate....Learn to execute automation testing from scratch with LambdaTest Learning Hub. Right from setting up the prerequisites to run your first automation test, to following best practices and diving deeper into advanced test scenarios. LambdaTest Learning Hubs compile a list of step-by-step guides to help you be proficient with different test automation frameworks i.e. Selenium, Cypress, TestNG etc.
You can also refer to the video tutorials on the LambdaTest YouTube channel for step-by-step demonstrations from industry experts.
Get 100 minutes of automation testing for free!
