Unlock 30% off on Manual Testing Annual Plans this Holiday Season.

Copied to Clipboard!

How to use the get_dataframe_schema_statistics method in pandera

Best Python code snippet using pandera_python

io.py

Source: io.py

# ... (excerpt begins mid-file; the end of the preceding definition is omitted)


def _serialize_schema(dataframe_schema):
    """Serialize dataframe schema into json/yaml-compatible format.

    :param dataframe_schema: schema whose statistics are serialized.
    :returns: dict with the keys ``schema_type``, ``version``, ``columns``,
        ``checks``, ``index``, ``coerce``, ``strict`` and ``unique``.
    """
    from pandera import __version__  # pylint: disable=import-outside-toplevel

    statistics = get_dataframe_schema_statistics(dataframe_schema)

    # Each section stays None when the schema has no statistics for it.
    columns, index, checks = None, None, None
    if statistics["columns"] is not None:
        columns = {
            col_name: _serialize_component_stats(column_stats)
            for col_name, column_stats in statistics["columns"].items()
        }

    if statistics["index"] is not None:
        index = [
            _serialize_component_stats(index_stats)
            for index_stats in statistics["index"]
        ]

    if statistics["checks"] is not None:
        checks = _serialize_dataframe_stats(statistics["checks"])

    return {
        "schema_type": "dataframe",
        "version": __version__,
        "columns": columns,
        "checks": checks,
        "index": index,
        "coerce": dataframe_schema.coerce,
        "strict": dataframe_schema.strict,
        "unique": dataframe_schema.unique,
    }


def _deserialize_check_stats(check, serialized_check_stats, dtype=None):
    """Call ``check`` with deserialized check statistics.

    :param check: callable that builds the check from its statistics.
    :param serialized_check_stats: either a dict mapping check argument
        names to values, or a single value for a unary check.
    :param dtype: dtype of the component the check belongs to; used to
        decide whether statistics are converted to datetimes/timedeltas.
    :returns: whatever ``check`` returns.
    """

    def handle_stat_dtype(stat):
        # Convert the raw statistic for datetime/timedelta dtypes. If the
        # conversion fails, fall back to the raw statistic unchanged.
        try:
            if pandas_engine.Engine.dtype(dtypes.DateTime).check(dtype):
                return pd.to_datetime(stat, format=DATETIME_FORMAT)
            if pandas_engine.Engine.dtype(dtypes.Timedelta).check(dtype):
                # serialize to int in nanoseconds
                return pd.to_timedelta(stat, unit="ns")
        except (TypeError, ValueError):
            return stat
        return stat

    if isinstance(serialized_check_stats, dict):
        # handle case where serialized check stats are in the form of a
        # dictionary mapping Check arg names to values.
        check_stats = {
            arg: handle_stat_dtype(stat)
            for arg, stat in serialized_check_stats.items()
        }
        return check(**check_stats)

    # otherwise assume unary check function signature
    return check(handle_stat_dtype(serialized_check_stats))


def _deserialize_component_stats(serialized_component_stats):
    """Turn serialized column/index statistics into constructor kwargs.

    :param serialized_component_stats: mapping of serialized properties.
    :returns: dict with ``dtype`` and ``checks`` plus every optional
        property listed below that is present in the input.
    """
    dtype = serialized_component_stats.get("dtype")
    if dtype:
        dtype = pandas_engine.Engine.dtype(dtype)

    checks = serialized_component_stats.get("checks")
    if checks is not None:
        checks = [
            _deserialize_check_stats(
                getattr(Check, check_name), check_stats, dtype
            )
            for check_name, check_stats in checks.items()
        ]

    return {
        "dtype": dtype,
        "checks": checks,
        **{
            key: serialized_component_stats.get(key)
            for key in [
                "name",
                "nullable",
                "unique",
                # deserialize allow_duplicates property for backwards
                # compatibility. Remove this for 0.8.0 release
                "allow_duplicates",
                "coerce",
                "required",
                "regex",
            ]
            if key in serialized_component_stats
        },
    }


def _deserialize_schema(serialized_schema):
    """Build a ``DataFrameSchema`` from its serialized representation.

    :param serialized_schema: mapping as produced by ``_serialize_schema``;
        a falsy value is treated as an empty mapping.
    :returns: dataframe schema.
    :raises pandera.errors.SchemaDefinitionError: if the representation is
        not a mapping.
    """
    # pylint: disable=import-outside-toplevel
    from pandera import Index, MultiIndex

    # GH#475
    serialized_schema = serialized_schema if serialized_schema else {}

    if not isinstance(serialized_schema, Mapping):
        raise pandera.errors.SchemaDefinitionError(
            "Schema representation must be a mapping."
        )

    columns = serialized_schema.get("columns")
    index = serialized_schema.get("index")
    checks = serialized_schema.get("checks")

    if columns is not None:
        columns = {
            col_name: Column(**_deserialize_component_stats(column_stats))
            for col_name, column_stats in columns.items()
        }

    if index is not None:
        index = [
            _deserialize_component_stats(index_component)
            for index_component in index
        ]

    if checks is not None:
        # handles unregistered checks by raising AttributeErrors from getattr
        checks = [
            _deserialize_check_stats(getattr(Check, check_name), check_stats)
            for check_name, check_stats in checks.items()
        ]

    # A single index component becomes an Index, several a MultiIndex.
    if index is not None:
        if len(index) == 1:
            index = Index(**index[0])
        else:
            index = MultiIndex(
                indexes=[
                    Index(**index_properties) for index_properties in index
                ]
            )

    return DataFrameSchema(
        columns=columns,
        checks=checks,
        index=index,
        coerce=serialized_schema.get("coerce", False),
        strict=serialized_schema.get("strict", False),
        unique=serialized_schema.get("unique", None),
    )


def from_yaml(yaml_schema):
    """Create :class:`~pandera.schemas.DataFrameSchema` from yaml file.

    :param yaml_schema: str or Path to yaml schema, or serialized yaml string.
    :returns: dataframe schema.
    """
    try:
        with Path(yaml_schema).open("r", encoding="utf-8") as f:
            serialized_schema = yaml.safe_load(f)
    except (TypeError, OSError):
        # Not an openable path: treat the argument as yaml content instead.
        serialized_schema = yaml.safe_load(yaml_schema)
    return _deserialize_schema(serialized_schema)


def to_yaml(dataframe_schema, stream=None):
    """Write :class:`~pandera.schemas.DataFrameSchema` to yaml file.

    :param dataframe_schema: schema to write to file or dump to string.
    :param stream: file stream to write to. If None, dumps to string.
    :returns: yaml string if stream is None, otherwise returns None.
    """
    statistics = _serialize_schema(dataframe_schema)

    def _write_yaml(obj, stream):
        return yaml.safe_dump(obj, stream=stream, sort_keys=False)

    try:
        with Path(stream).open("w", encoding="utf-8") as f:
            _write_yaml(statistics, f)
    except (TypeError, OSError):
        # ``stream`` is not an openable path: hand it to yaml directly.
        return _write_yaml(statistics, stream)


SCRIPT_TEMPLATE = """
from pandera import (
    DataFrameSchema, Column, Check, Index, MultiIndex
)
schema = DataFrameSchema(
    columns={{{columns}}},
    index={index},
    coerce={coerce},
    strict={strict},
    name={name},
)
"""

COLUMN_TEMPLATE = """
Column(
    dtype={dtype},
    checks={checks},
    nullable={nullable},
    unique={unique},
    coerce={coerce},
    required={required},
    regex={regex},
)
"""

INDEX_TEMPLATE = (
    "Index(dtype={dtype},checks={checks},"
    "nullable={nullable},coerce={coerce},name={name})"
)

MULTIINDEX_TEMPLATE = """
MultiIndex(indexes=[{indexes}])
"""


def _format_checks(checks_dict):
    """Render check statistics as source code for a list of checks.

    :param checks_dict: mapping of check name to a dict of keyword
        arguments, or None.
    :returns: the string ``"None"`` if ``checks_dict`` is None, otherwise a
        bracketed, comma-separated list of ``Check.<name>(<kwargs>)`` calls.
        Checks whose kwargs are None are skipped with a warning.
    """
    if checks_dict is None:
        return "None"

    checks = []
    for check_name, check_kwargs in checks_dict.items():
        if check_kwargs is None:
            warnings.warn(
                f"Check {check_name} cannot be serialized. "
                "This check will be ignored"
            )
        else:
            args = ", ".join(f"{k}={v!r}" for k, v in check_kwargs.items())
            checks.append(f"Check.{check_name}({args})")
    return f"[{', '.join(checks)}]"


def _format_index(index_statistics):
    """Render index statistics as ``Index``/``MultiIndex`` source code.

    :param index_statistics: iterable of dicts with the keys ``checks``,
        ``nullable``, ``coerce`` and ``name`` and an optional ``dtype``.
    :returns: code for a single ``Index`` when there is exactly one
        component, otherwise code for a ``MultiIndex`` wrapping them all.
    """
    index = []
    for properties in index_statistics:
        dtype = properties.get("dtype")
        index_code = INDEX_TEMPLATE.format(
            # A missing dtype is rendered as None, matching how column
            # dtypes are handled in to_script.
            dtype=(
                None if dtype is None else _get_qualified_name(dtype.__class__)
            ),
            # _format_checks already renders None as "None".
            checks=_format_checks(properties["checks"]),
            nullable=properties["nullable"],
            coerce=properties["coerce"],
            name=(
                "None"
                if properties["name"] is None
                else f"\"{properties['name']}\""
            ),
        )
        index.append(index_code.strip())

    if len(index) == 1:
        return index[0]

    return MULTIINDEX_TEMPLATE.format(indexes=",".join(index)).strip()


def _format_script(script):
    """Format generated source code with black, using 80-character lines."""
    return black.format_str(script, mode=black.FileMode(line_length=80))


def to_script(dataframe_schema, path_or_buf=None):
    """Write :class:`~pandera.schemas.DataFrameSchema` to a python script.

    :param dataframe_schema: schema to write to file or dump to string.
    :param path_or_buf: filepath or buf stream to write to. If None, outputs
        string representation of the script.
    :returns: script string if path_or_buf is None, otherwise returns None.
    """
    statistics = get_dataframe_schema_statistics(dataframe_schema)

    columns = {}
    for colname, properties in statistics["columns"].items():
        dtype = properties.get("dtype")
        column_code = COLUMN_TEMPLATE.format(
            dtype=(
                None if dtype is None else _get_qualified_name(dtype.__class__)
            ),
            checks=_format_checks(properties["checks"]),
            nullable=properties["nullable"],
            unique=properties["unique"],
            coerce=properties["coerce"],
            required=properties["required"],
            regex=properties["regex"],
        )
        # ... (excerpt ends here; the remainder of to_script is not shown)

schema_statistics.py

Source: schema_statistics.py

...66        except TypeError:67            # if stats cannot be unpacked as key-word args, assume unary check.68            checks.append(check(stats))69    return checks if checks else None70def get_dataframe_schema_statistics(dataframe_schema):71    """Get statistical properties from dataframe schema."""72    statistics = {73        "columns": {74            col_name: {75                "dtype": column.dtype,76                "nullable": column.nullable,77                "coerce": column.coerce,78                "required": column.required,79                "regex": column.regex,80                "checks": parse_checks(column.checks),81                "unique": column.unique,82            }83            for col_name, column in dataframe_schema.columns.items()84        },...

Automation Testing Tutorials

Learn to execute automation testing from scratch with LambdaTest Learning Hub. It takes you from setting up the prerequisites and running your first automation test to following best practices and diving deeper into advanced test scenarios. LambdaTest Learning Hub compiles step-by-step guides to help you become proficient with different test automation frameworks, e.g. Selenium, Cypress, TestNG, etc.