How to use parse_check_statistics method in pandera

Best Python code snippet using pandera_python

test_schema_statistics.py

Source:test_schema_statistics.py Github

copy

Full Screen

# ... (excerpt begins mid-file, inside the @pytest.mark.parametrize list that
# feeds test_parse_check_statistics; only the tail of that decorator is
# visible in this excerpt:)
#
#         ],
#         [{}, None],
#     ],
# )
def test_parse_check_statistics(check_stats, expectation) -> None:
    """Test that Checks are correctly parsed from check statistics."""
    if expectation is None:
        expectation = []
    checks = schema_statistics.parse_check_statistics(check_stats)
    if checks is None:
        checks = []
    assert set(checks) == set(expectation)


def _test_statistics(statistics, expectations):
    """Compare inferred statistics against expected statistics.

    A single (non-list) value on either side is wrapped into a one-element
    list so that series-level and index-level statistics share this helper.
    The ``"dtype"`` entry is compared through ``expectation_dtype.check(...)``
    while every other key is compared with plain equality.

    :param statistics: statistics dict, or list of statistics dicts.
    :param expectations: expected dict, or list of expected dicts.
    """
    if not isinstance(statistics, list):
        statistics = [statistics]
    if not isinstance(expectations, list):
        expectations = [expectations]
    # NOTE(review): zip() stops at the shorter input, so a length mismatch
    # between the two lists is not detected here.
    for stats, expectation in zip(statistics, expectations):
        # pop() removes "dtype" from both dicts in place so that the
        # equality assertion below only covers the remaining keys.
        stat_dtype = stats.pop("dtype")
        expectation_dtype = expectation.pop("dtype")
        assert stats == expectation
        assert expectation_dtype.check(stat_dtype)


@pytest.mark.parametrize(
    "series, expectation",
    [
        *[
            [
                pd.Series(
                    [1, 2, 3], dtype=str(pandas_engine.Engine.dtype(data_type))
                ),
                {
                    "dtype": pandas_engine.Engine.dtype(data_type),
                    "nullable": False,
                    "checks": {
                        "greater_than_or_equal_to": 1,
                        "less_than_or_equal_to": 3,
                    },
                    "name": None,
                },
            ]
            for data_type in NUMERIC_TYPES
        ],
        [
            pd.Series(["a", "b", "c", "a"], dtype="category"),
            {
                "dtype": pandas_engine.Engine.dtype(pa.Category),
                "nullable": False,
                "checks": {"isin": ["a", "b", "c"]},
                "name": None,
            },
        ],
        [
            pd.Series(["a", "b", "c", "a"], dtype="string", name="str_series"),
            {
                "dtype": pandas_engine.Engine.dtype("string"),
                "nullable": False,
                "checks": None,
                "name": "str_series",
            },
        ],
        [
            pd.Series(pd.to_datetime(["20180101", "20180102", "20180103"])),
            {
                "dtype": pandas_engine.Engine.dtype(pa.DateTime),
                "nullable": False,
                "checks": {
                    "greater_than_or_equal_to": pd.Timestamp("20180101"),
                    "less_than_or_equal_to": pd.Timestamp("20180103"),
                },
                "name": None,
            },
        ],
    ],
)
def test_infer_series_schema_statistics(series, expectation) -> None:
    """Test series statistics are correctly inferred."""
    statistics = schema_statistics.infer_series_statistics(series)
    _test_statistics(statistics, expectation)


@pytest.mark.parametrize(
    "null_index, series, expectation",
    [
        *[
            [
                0,
                pd.Series([1, 2, 3], dtype=str(data_type)),
                {
                    # introducing nans to integer arrays upcasts to float
                    "dtype": DEFAULT_FLOAT,
                    "nullable": True,
                    "checks": {
                        "greater_than_or_equal_to": 2,
                        "less_than_or_equal_to": 3,
                    },
                    "name": None,
                },
            ]
            for data_type in INTEGER_TYPES
        ],
        [
            # introducing nans to bool arrays upcasts to float except
            # for pandas >= 1.3.0
            0,
            pd.Series([True, False, True, False]),
            {
                "dtype": (
                    pandas_engine.Engine.dtype(pa.BOOL)
                    if pa.PANDAS_1_3_0_PLUS
                    else DEFAULT_FLOAT
                ),
                "nullable": True,
                "checks": (
                    None
                    if pa.PANDAS_1_3_0_PLUS
                    else {
                        "greater_than_or_equal_to": 0,
                        "less_than_or_equal_to": 1,
                    }
                ),
                "name": None,
            },
        ],
        [
            0,
            pd.Series(["a", "b", "c", "a"], dtype="category"),
            {
                "dtype": pandas_engine.Engine.dtype(pa.Category),
                "nullable": True,
                "checks": {"isin": ["a", "b", "c"]},
                "name": None,
            },
        ],
        [
            0,
            pd.Series(["a", "b", "c", "a"], name="str_series"),
            {
                "dtype": pandas_engine.Engine.dtype(str),
                "nullable": True,
                "checks": None,
                "name": "str_series",
            },
        ],
        [
            2,
            pd.Series(pd.to_datetime(["20180101", "20180102", "20180103"])),
            {
                "dtype": pandas_engine.Engine.dtype(pa.DateTime),
                "nullable": True,
                "checks": {
                    "greater_than_or_equal_to": pd.Timestamp("20180101"),
                    "less_than_or_equal_to": pd.Timestamp("20180102"),
                },
                "name": None,
            },
        ],
    ],
)
def test_infer_nullable_series_schema_statistics(
    null_index, series, expectation
):
    """Test nullable series statistics are correctly inferred."""
    # Overwrite one position with a null before inferring statistics.
    series.iloc[null_index] = None
    statistics = schema_statistics.infer_series_statistics(series)
    _test_statistics(statistics, expectation)


@pytest.mark.parametrize(
    "index, expectation",
    [
        [
            pd.RangeIndex(20),
            [
                {
                    "name": None,
                    "dtype": DEFAULT_INT,
                    "nullable": False,
                    "checks": {
                        "greater_than_or_equal_to": 0,
                        "less_than_or_equal_to": 19,
                    },
                }
            ],
        ],
        [
            pd.Index([1, 2, 3], name="int_index"),
            [
                {
                    "name": "int_index",
                    "dtype": DEFAULT_INT,
                    "nullable": False,
                    "checks": {
                        "greater_than_or_equal_to": 1,
                        "less_than_or_equal_to": 3,
                    },
                }
            ],
        ],
        [
            pd.Index(["foo", "bar", "baz"], name="str_index"),
            [
                {
                    "name": "str_index",
                    "dtype": pandas_engine.Engine.dtype("object"),
                    "nullable": False,
                    "checks": None,
                },
            ],
        ],
        [
            pd.MultiIndex.from_arrays(
                [[10, 11, 12], pd.Series(["a", "b", "c"], dtype="category")],
                names=["int_index", "str_index"],
            ),
            [
                {
                    "name": "int_index",
                    "dtype": DEFAULT_INT,
                    "nullable": False,
                    "checks": {
                        "greater_than_or_equal_to": 10,
                        "less_than_or_equal_to": 12,
                    },
                },
                {
                    "name": "str_index",
                    "dtype": pandas_engine.Engine.dtype(pa.Category),
                    "nullable": False,
                    "checks": {"isin": ["a", "b", "c"]},
                },
            ],
        ],
        # UserWarning cases
        [1, UserWarning],
        ["foo", UserWarning],
        [{"foo": "bar"}, UserWarning],
        [["foo", "bar"], UserWarning],
        [pd.Series(["foo", "bar"]), UserWarning],
        [pd.DataFrame({"column": ["foo", "bar"]}), UserWarning],
    ],
)
def test_infer_index_statistics(index, expectation):
    """Test that index statistics are correctly inferred."""
    if expectation is UserWarning:
        with pytest.warns(UserWarning, match="^index type .+ not recognized"):
            schema_statistics.infer_index_statistics(index)
    else:
        _test_statistics(
            schema_statistics.infer_index_statistics(index), expectation
        )


def test_get_dataframe_schema_statistics():
    """Test that dataframe schema statistics logic is correct."""
    schema = pa.DataFrameSchema(
        columns={
            "int": pa.Column(
                int,
                checks=[
                    pa.Check.greater_than_or_equal_to(0),
                    pa.Check.less_than_or_equal_to(100),
                ],
                nullable=True,
            ),
            "float": pa.Column(
                float,
                checks=[
                    pa.Check.greater_than_or_equal_to(50),
                    pa.Check.less_than_or_equal_to(100),
                ],
            ),
            "str": pa.Column(
                str,
                checks=[pa.Check.isin(["foo", "bar", "baz"])],
            ),
        },
        index=pa.Index(
            int,
            checks=pa.Check.greater_than_or_equal_to(0),
            nullable=False,
            name="int_index",
        ),
    )
    expectation = {
        "checks": None,
        "columns": {
            "int": {
                "dtype": DEFAULT_INT,
                "checks": {
                    "greater_than_or_equal_to": {"min_value": 0},
                    "less_than_or_equal_to": {"max_value": 100},
                },
                "nullable": True,
                "unique": False,
                "coerce": False,
                "required": True,
                "regex": False,
            },
            "float": {
                "dtype": DEFAULT_FLOAT,
                "checks": {
                    "greater_than_or_equal_to": {"min_value": 50},
                    "less_than_or_equal_to": {"max_value": 100},
                },
                "nullable": False,
                "unique": False,
                "coerce": False,
                "required": True,
                "regex": False,
            },
            "str": {
                "dtype": pandas_engine.Engine.dtype(str),
                "checks": {"isin": {"allowed_values": ["foo", "bar", "baz"]}},
                "nullable": False,
                "unique": False,
                "coerce": False,
                "required": True,
                "regex": False,
            },
        },
        "index": [
            {
                "dtype": DEFAULT_INT,
                "checks": {"greater_than_or_equal_to": {"min_value": 0}},
                "nullable": False,
                "coerce": False,
                "name": "int_index",
            }
        ],
        "coerce": False,
    }
    statistics = schema_statistics.get_dataframe_schema_statistics(schema)
    assert statistics == expectation


def test_get_series_schema_statistics():
    """Test that series schema statistics logic is correct."""
    schema = pa.SeriesSchema(
        int,
        nullable=False,
        checks=[
            pa.Check.greater_than_or_equal_to(0),
            pa.Check.less_than_or_equal_to(100),
        ],
    )
    statistics = schema_statistics.get_series_schema_statistics(schema)
    assert statistics == {
        "dtype": pandas_engine.Engine.dtype(int),
        "nullable": False,
        "checks": {
            "greater_than_or_equal_to": {"min_value": 0},
            "less_than_or_equal_to": {"max_value": 100},
        },
        "name": None,
        "coerce": False,
    }


@pytest.mark.parametrize(
    "index_schema_component, expectation",
    [
        [
            pa.Index(
                int,
                checks=[
                    pa.Check.greater_than_or_equal_to(10),
                    pa.Check.less_than_or_equal_to(20),
                ],
                nullable=False,
                name="int_index",
            ),
            [
                {
                    "dtype": pandas_engine.Engine.dtype(int),
                    "nullable": False,
                    "checks": {
                        "greater_than_or_equal_to": {"min_value": 10},
                        "less_than_or_equal_to": {"max_value": 20},
                    },
                    "name": "int_index",
                    "coerce": False,
                }
            ],
        ]
    ],
)
def test_get_index_schema_statistics(index_schema_component, expectation):
    """Test that index schema statistics logic is correct."""
    statistics = schema_statistics.get_index_schema_statistics(
        index_schema_component
    )
    _test_statistics(statistics, expectation)


@pytest.mark.parametrize(
    "checks, expectation",
    [
        *[
            [[check], {check.name: check.statistics}]
            for check in [
                pa.Check.greater_than(1),
                pa.Check.less_than(1),
                pa.Check.in_range(1, 3),
                pa.Check.equal_to(1),
                pa.Check.not_equal_to(1),
                pa.Check.notin([1, 2, 3]),
                pa.Check.str_matches("foobar"),
                pa.Check.str_contains("foobar"),
                pa.Check.str_startswith("foobar"),
                pa.Check.str_endswith("foobar"),
                pa.Check.str_length(5, 10),
            ]
        ],
        # multiple checks at once
        [
            [
                pa.Check.greater_than_or_equal_to(10),
                pa.Check.less_than_or_equal_to(50),
                pa.Check.isin([10, 20, 30, 40, 50]),
            ],
            {
                "greater_than_or_equal_to": {"min_value": 10},
                "less_than_or_equal_to": {"max_value": 50},
                "isin": {"allowed_values": [10, 20, 30, 40, 50]},
            },
        ],
        # incompatible checks
        *[
            [
                [
                    pa.Check.greater_than_or_equal_to(min_value),
                    pa.Check.less_than_or_equal_to(max_value),
                ],
                ValueError,
            ]
            for min_value, max_value in [
                (5, 1),
                (10, 1),
                (100, 10),
                (1000, 100),
            ]
        ],
    ],
)
def test_parse_checks_and_statistics_roundtrip(checks, expectation):
    """
    Test that parse checks correctly obtain statistics from checks and
    vice-versa.
    """
    if expectation is ValueError:
        with pytest.raises(ValueError):
            schema_statistics.parse_checks(checks)
        return
    assert schema_statistics.parse_checks(checks) == expectation
    # Round trip: statistics -> checks must give back the original checks.
    check_statistics = {check.name: check.statistics for check in checks}
    check_list = schema_statistics.parse_check_statistics(check_statistics)
    assert set(check_list) == set(checks)


# pylint: disable=unused-argument
def test_parse_checks_and_statistics_no_param(extra_registered_checks):
    """
    Ensure that an edge case where a check does not have parameters is
    appropriately handled.
    """
    checks = [pa.Check.no_param_check()]
    expectation = {"no_param_check": {}}
    assert schema_statistics.parse_checks(checks) == expectation
    check_statistics = {check.name: check.statistics for check in checks}
    check_list = schema_statistics.parse_check_statistics(check_statistics)
    assert set(check_list) == set(checks)


# ... (excerpt ends here; the rest of the file is not shown)

Full Screen

Full Screen

schema_inference.py

Source:schema_inference.py Github

copy

Full Screen

# ... (excerpt begins here; the module's imports are not shown)
def _create_index(index_statistics):
    """Build an index schema component from a list of index statistics.

    :param index_statistics: iterable of dicts, each providing the
        ``"dtype"``, ``"checks"``, ``"nullable"`` and ``"name"`` keys.
    :returns: a single ``Index`` when exactly one entry is given, otherwise
        a ``MultiIndex`` wrapping all created ``Index`` objects.
    """
    index = [
        Index(
            properties["dtype"],
            checks=parse_check_statistics(properties["checks"]),
            nullable=properties["nullable"],
            name=properties["name"],
        )
        for properties in index_statistics
    ]
    if len(index) == 1:
        index = index[0]  # type: ignore
    else:
        index = MultiIndex(index)  # type: ignore
    return index


def infer_dataframe_schema(df: pd.DataFrame) -> DataFrameSchema:
    """Infer a DataFrameSchema from a pandas DataFrame.

    :param df: DataFrame object to infer.
    :returns: DataFrameSchema
    """
    df_statistics = infer_dataframe_statistics(df)
    schema = DataFrameSchema(
        columns={
            colname: Column(
                properties["dtype"],
                checks=parse_check_statistics(properties["checks"]),
                nullable=properties["nullable"],
            )
            for colname, properties in df_statistics["columns"].items()
        },
        index=_create_index(df_statistics["index"]),
        coerce=True,
    )
    # Flag the schema as produced by inference rather than written by hand.
    schema._is_inferred = True
    return schema


def infer_series_schema(series) -> SeriesSchema:
    """Infer a SeriesSchema from a pandas Series.

    :param series: Series object to infer.
    :returns: SeriesSchema
    """
    series_statistics = infer_series_statistics(series)
    schema = SeriesSchema(
        dtype=series_statistics["dtype"],
        checks=parse_check_statistics(series_statistics["checks"]),
        nullable=series_statistics["nullable"],
        name=series_statistics["name"],
        coerce=True,
    )
    schema._is_inferred = True
    # ... (excerpt truncated here; the remainder of this function --
    # presumably ``return schema``, TODO confirm -- is not shown)

Full Screen

Full Screen

Automation Testing Tutorials

Learn to execute automation testing from scratch with LambdaTest Learning Hub, right from setting up the prerequisites and running your first automation test to following best practices and diving deeper into advanced test scenarios. LambdaTest Learning Hubs compile step-by-step guides to help you become proficient with different test automation frameworks, e.g. Selenium, Cypress, TestNG, etc.

LambdaTest Learning Hubs:

YouTube

You can also refer to the video tutorials on the LambdaTest YouTube channel for step-by-step demonstrations from industry experts.

Run pandera automation tests on LambdaTest cloud grid

Perform automation testing on 3000+ real desktop and mobile devices online.

Try LambdaTest Now !!

Get 100 minutes of automation testing FREE!!

Next-Gen App & Browser Testing Cloud

Was this article helpful?

Helpful

Not Helpful