How to use get_dataframe_schema_statistics method in pandera

Best Python code snippet using pandera_python

io.py

Source:io.py Github

copy

Full Screen

...94 }95def _serialize_schema(dataframe_schema):96 """Serialize dataframe schema into into json/yaml-compatible format."""97 from pandera import __version__ # pylint: disable=import-outside-toplevel98 statistics = get_dataframe_schema_statistics(dataframe_schema)99 columns, index, checks = None, None, None100 if statistics["columns"] is not None:101 columns = {102 col_name: _serialize_component_stats(column_stats)103 for col_name, column_stats in statistics["columns"].items()104 }105 if statistics["index"] is not None:106 index = [107 _serialize_component_stats(index_stats)108 for index_stats in statistics["index"]109 ]110 if statistics["checks"] is not None:111 checks = _serialize_dataframe_stats(statistics["checks"])112 return {113 "schema_type": "dataframe",114 "version": __version__,115 "columns": columns,116 "checks": checks,117 "index": index,118 "coerce": dataframe_schema.coerce,119 "strict": dataframe_schema.strict,120 "unique": dataframe_schema.unique,121 }122def _deserialize_check_stats(check, serialized_check_stats, dtype=None):123 def handle_stat_dtype(stat):124 try:125 if pandas_engine.Engine.dtype(dtypes.DateTime).check(dtype):126 return pd.to_datetime(stat, format=DATETIME_FORMAT)127 elif pandas_engine.Engine.dtype(dtypes.Timedelta).check(dtype):128 # serialize to int in nanoseconds129 return pd.to_timedelta(stat, unit="ns")130 except (TypeError, ValueError):131 return stat132 return stat133 if isinstance(serialized_check_stats, dict):134 # handle case where serialized check stats are in the form of a135 # dictionary mapping Check arg names to values.136 check_stats = {}137 for arg, stat in serialized_check_stats.items():138 check_stats[arg] = handle_stat_dtype(stat)139 return check(**check_stats)140 # otherwise assume unary check function signature141 return check(handle_stat_dtype(serialized_check_stats))142def _deserialize_component_stats(serialized_component_stats):143 dtype = serialized_component_stats.get("dtype")144 if dtype:145 dtype = 
pandas_engine.Engine.dtype(dtype)146 checks = serialized_component_stats.get("checks")147 if checks is not None:148 checks = [149 _deserialize_check_stats(150 getattr(Check, check_name), check_stats, dtype151 )152 for check_name, check_stats in checks.items()153 ]154 return {155 "dtype": dtype,156 "checks": checks,157 **{158 key: serialized_component_stats.get(key)159 for key in [160 "name",161 "nullable",162 "unique",163 # deserialize allow_duplicates property for backwards164 # compatibility. Remove this for 0.8.0 release165 "allow_duplicates",166 "coerce",167 "required",168 "regex",169 ]170 if key in serialized_component_stats171 },172 }173def _deserialize_schema(serialized_schema):174 # pylint: disable=import-outside-toplevel175 from pandera import Index, MultiIndex176 # GH#475177 serialized_schema = serialized_schema if serialized_schema else {}178 if not isinstance(serialized_schema, Mapping):179 raise pandera.errors.SchemaDefinitionError(180 "Schema representation must be a mapping."181 )182 columns = serialized_schema.get("columns")183 index = serialized_schema.get("index")184 checks = serialized_schema.get("checks")185 if columns is not None:186 columns = {187 col_name: Column(**_deserialize_component_stats(column_stats))188 for col_name, column_stats in columns.items()189 }190 if index is not None:191 index = [192 _deserialize_component_stats(index_component)193 for index_component in index194 ]195 if checks is not None:196 # handles unregistered checks by raising AttributeErrors from getattr197 checks = [198 _deserialize_check_stats(getattr(Check, check_name), check_stats)199 for check_name, check_stats in checks.items()200 ]201 if index is None:202 pass203 elif len(index) == 1:204 index = Index(**index[0])205 else:206 index = MultiIndex(207 indexes=[Index(**index_properties) for index_properties in index]208 )209 return DataFrameSchema(210 columns=columns,211 checks=checks,212 index=index,213 coerce=serialized_schema.get("coerce", False),214 
strict=serialized_schema.get("strict", False),215 unique=serialized_schema.get("unique", None),216 )217def from_yaml(yaml_schema):218 """Create :class:`~pandera.schemas.DataFrameSchema` from yaml file.219 :param yaml_schema: str or Path to yaml schema, or serialized yaml string.220 :returns: dataframe schema.221 """222 try:223 with Path(yaml_schema).open("r", encoding="utf-8") as f:224 serialized_schema = yaml.safe_load(f)225 except (TypeError, OSError):226 serialized_schema = yaml.safe_load(yaml_schema)227 return _deserialize_schema(serialized_schema)228def to_yaml(dataframe_schema, stream=None):229 """Write :class:`~pandera.schemas.DataFrameSchema` to yaml file.230 :param dataframe_schema: schema to write to file or dump to string.231 :param stream: file stream to write to. If None, dumps to string.232 :returns: yaml string if stream is None, otherwise returns None.233 """234 statistics = _serialize_schema(dataframe_schema)235 def _write_yaml(obj, stream):236 return yaml.safe_dump(obj, stream=stream, sort_keys=False)237 try:238 with Path(stream).open("w", encoding="utf-8") as f:239 _write_yaml(statistics, f)240 except (TypeError, OSError):241 return _write_yaml(statistics, stream)242SCRIPT_TEMPLATE = """243from pandera import (244 DataFrameSchema, Column, Check, Index, MultiIndex245)246schema = DataFrameSchema(247 columns={{{columns}}},248 index={index},249 coerce={coerce},250 strict={strict},251 name={name},252)253"""254COLUMN_TEMPLATE = """255Column(256 dtype={dtype},257 checks={checks},258 nullable={nullable},259 unique={unique},260 coerce={coerce},261 required={required},262 regex={regex},263)264"""265INDEX_TEMPLATE = (266 "Index(dtype={dtype},checks={checks},"267 "nullable={nullable},coerce={coerce},name={name})"268)269MULTIINDEX_TEMPLATE = """270MultiIndex(indexes=[{indexes}])271"""272def _format_checks(checks_dict):273 if checks_dict is None:274 return "None"275 checks = []276 for check_name, check_kwargs in checks_dict.items():277 if check_kwargs is 
None:278 warnings.warn(279 f"Check {check_name} cannot be serialized. "280 "This check will be ignored"281 )282 else:283 args = ", ".join(284 f"{k}={v.__repr__()}" for k, v in check_kwargs.items()285 )286 checks.append(f"Check.{check_name}({args})")287 return f"[{', '.join(checks)}]"288def _format_index(index_statistics):289 index = []290 for properties in index_statistics:291 dtype = properties.get("dtype")292 index_code = INDEX_TEMPLATE.format(293 dtype=f"{_get_qualified_name(dtype.__class__)}",294 checks=(295 "None"296 if properties["checks"] is None297 else _format_checks(properties["checks"])298 ),299 nullable=properties["nullable"],300 coerce=properties["coerce"],301 name=(302 "None"303 if properties["name"] is None304 else f"\"{properties['name']}\""305 ),306 )307 index.append(index_code.strip())308 if len(index) == 1:309 return index[0]310 return MULTIINDEX_TEMPLATE.format(indexes=",".join(index)).strip()311def _format_script(script):312 formatter = partial(black.format_str, mode=black.FileMode(line_length=80))313 return formatter(script)314def to_script(dataframe_schema, path_or_buf=None):315 """Write :class:`~pandera.schemas.DataFrameSchema` to a python script.316 :param dataframe_schema: schema to write to file or dump to string.317 :param path_or_buf: filepath or buf stream to write to. If None, outputs318 string representation of the script.319 :returns: yaml string if stream is None, otherwise returns None.320 """321 statistics = get_dataframe_schema_statistics(dataframe_schema)322 columns = {}323 for colname, properties in statistics["columns"].items():324 dtype = properties.get("dtype")325 column_code = COLUMN_TEMPLATE.format(326 dtype=(327 None if dtype is None else _get_qualified_name(dtype.__class__)328 ),329 checks=_format_checks(properties["checks"]),330 nullable=properties["nullable"],331 unique=properties["unique"],332 coerce=properties["coerce"],333 required=properties["required"],334 regex=properties["regex"],335 )...

Full Screen

Full Screen

schema_statistics.py

Source:schema_statistics.py Github

copy

Full Screen

...66 except TypeError:67 # if stats cannot be unpacked as key-word args, assume unary check.68 checks.append(check(stats))69 return checks if checks else None70def get_dataframe_schema_statistics(dataframe_schema):71 """Get statistical properties from dataframe schema."""72 statistics = {73 "columns": {74 col_name: {75 "dtype": column.dtype,76 "nullable": column.nullable,77 "coerce": column.coerce,78 "required": column.required,79 "regex": column.regex,80 "checks": parse_checks(column.checks),81 "unique": column.unique,82 }83 for col_name, column in dataframe_schema.columns.items()84 },...

Full Screen

Full Screen

Automation Testing Tutorials

Learn to execute automation testing from scratch with the LambdaTest Learning Hub: from setting up the prerequisites and running your first automation test, to following best practices and diving deeper into advanced test scenarios. The LambdaTest Learning Hubs compile step-by-step guides to help you become proficient with different test automation frameworks, e.g. Selenium, Cypress, TestNG, etc.

LambdaTest Learning Hubs:

YouTube

You can also refer to the video tutorials on the LambdaTest YouTube channel for step-by-step demonstrations from industry experts.

Run pandera automation tests on LambdaTest cloud grid

Perform automation testing on 3000+ real desktop and mobile devices online.

Try LambdaTest Now !!

Get 100 automation test minutes FREE!!

Next-Gen App & Browser Testing Cloud

Was this article helpful?

Helpful

Not Helpful