How to use infer_schema method in pandera

Best Python code snippet using pandera_python

pipeline_utils.py

Source:pipeline_utils.py Github

copy

Full Screen

1import os2from enum import Enum3PipelineStep = Enum("PipelineStep", "DATA_VALIDATION PREPROCESSING MODELING DEPLOY")4def create_pipeline_file(step, project, train_steps=1000, eval_steps=500, columns_for_slicing=None):5 6 base_file = """7import os8import logging9import datetime10from tfx.orchestration.airflow.airflow_runner import AirflowDAGRunner11from tfx.orchestration.pipeline import PipelineDecorator12from tfx.utils.dsl_utils import csv_input13from tfx.proto import trainer_pb2, evaluator_pb2, pusher_pb214from tfx.components.example_gen.csv_example_gen.component import CsvExampleGen15from tfx.components.statistics_gen.component import StatisticsGen16from tfx.components.schema_gen.component import SchemaGen17from tfx.components.example_validator.component import ExampleValidator18from tfx.components.transform.component import Transform19from tfx.components.trainer.component import Trainer20from tfx.components.evaluator.component import Evaluator21from tfx.components.model_validator.component import ModelValidator22from tfx.components.pusher.component import Pusher23data_dir = os.path.join(os.environ['DATA_DIR'], '{project}')24log_dir = os.path.join(os.environ['TFX_DIR'], 'logs')25serving_model_dir = os.path.join(os.environ['SERVING_DIR'], 'serving_model', '{project}')26project_preprocessing_file = os.path.join(os.environ['DAGS_DIR'], '{project}_preprocessing.py')27project_training_file = os.path.join(os.environ['DAGS_DIR'], '{project}_modeling.py')28logger_overrides = dict([29 ('log_root', log_dir),30 ('log_level', logging.INFO)31])32airflow_config = dict([33 ('schedule_interval', None),34 ('start_date', datetime.datetime(2019, 1, 1))35])36@PipelineDecorator(37 pipeline_name='{project}',38 enable_cache=True,39 metadata_db_root=os.environ['METADATA_DB_DIR'],40 additional_pipeline_args=dict([('logger_args', logger_overrides)]),41 pipeline_root=os.environ['PIPELINE_DIR']42)43def create_pipeline():44 45 pipeline = []46 47 {components}48 49 return pipeline50pipeline = AirflowDAGRunner(airflow_config).run(create_pipeline())51"""52 53 pipeline_file = None54 if step == PipelineStep.DATA_VALIDATION:55 56 pipeline_file = base_file.format(57 project=project,58 components=""" 59 examples = csv_input(data_dir)60 # Brings data into the pipeline61 example_gen = CsvExampleGen(input_base=examples)62 pipeline.append(example_gen)63 64 # Computes statistics over data for visualization and example validation.65 statistics_gen = StatisticsGen(input_data=example_gen.outputs.examples)66 pipeline.append(statistics_gen)67 # Generates schema based on statistics files.68 infer_schema = SchemaGen(stats=statistics_gen.outputs.output)69 pipeline.append(infer_schema)70 # Performs anomaly detection based on statistics and data schema.71 validate_stats = ExampleValidator(72 stats=statistics_gen.outputs.output,73 schema=infer_schema.outputs.output74 )75 pipeline.append(validate_stats)76 """77 )78 elif step == PipelineStep.PREPROCESSING:79 80 pipeline_file = base_file.format(81 project=project,82 components=""" 83 examples = csv_input(data_dir)84 # Brings data into the pipeline85 example_gen = CsvExampleGen(input_base=examples)86 pipeline.append(example_gen)87 88 # Computes statistics over data for visualization and example validation.89 statistics_gen = StatisticsGen(input_data=example_gen.outputs.examples)90 pipeline.append(statistics_gen)91 # Generates schema based on statistics files.92 infer_schema = SchemaGen(stats=statistics_gen.outputs.output)93 pipeline.append(infer_schema)94 # Performs anomaly detection based on statistics and data schema.95 validate_stats = ExampleValidator(96 stats=statistics_gen.outputs.output,97 schema=infer_schema.outputs.output98 )99 pipeline.append(validate_stats)100 # Performs transformations and feature engineering in training and serving.101 transform = Transform(102 input_data=example_gen.outputs.examples,103 schema=infer_schema.outputs.output,104 module_file=project_preprocessing_file105 )106 pipeline.append(transform)107 """108 )109 elif step == PipelineStep.MODELING:110 111 pipeline_file = base_file.format(112 project=project,113 components=""" 114 examples = csv_input(data_dir)115 # Brings data into the pipeline116 example_gen = CsvExampleGen(input_base=examples)117 pipeline.append(example_gen)118 119 # Computes statistics over data for visualization and example validation.120 statistics_gen = StatisticsGen(input_data=example_gen.outputs.examples)121 pipeline.append(statistics_gen)122 # Generates schema based on statistics files.123 infer_schema = SchemaGen(stats=statistics_gen.outputs.output)124 pipeline.append(infer_schema)125 # Performs anomaly detection based on statistics and data schema.126 validate_stats = ExampleValidator(127 stats=statistics_gen.outputs.output,128 schema=infer_schema.outputs.output129 )130 pipeline.append(validate_stats)131 # Performs transformations and feature engineering in training and serving.132 transform = Transform(133 input_data=example_gen.outputs.examples,134 schema=infer_schema.outputs.output,135 module_file=project_preprocessing_file136 )137 pipeline.append(transform)138 # Uses user-provided Python function that implements a model.139 trainer = Trainer(140 module_file=project_training_file,141 transformed_examples=transform.outputs.transformed_examples,142 schema=infer_schema.outputs.output,143 transform_output=transform.outputs.transform_output,144 train_args=trainer_pb2.TrainArgs(num_steps={train_steps}),145 eval_args=trainer_pb2.EvalArgs(num_steps={eval_steps})146 )147 pipeline.append(trainer)148 # Uses TFMA to compute a evaluation statistics over features of a model.149 model_analyzer = Evaluator(150 examples=example_gen.outputs.examples,151 model_exports=trainer.outputs.output,152 feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[{specs}])153 )154 pipeline.append(model_analyzer)155 """.format(156 train_steps=train_steps,157 eval_steps=eval_steps,158 specs=",".join(["evaluator_pb2.SingleSlicingSpec(column_for_slicing=['%s'])" % col for col in columns_for_slicing])159 ))160 elif step == PipelineStep.DEPLOY:161 162 pipeline_file = base_file.format(163 project=project,164 components=""" 165 examples = csv_input(data_dir)166 # Brings data into the pipeline167 example_gen = CsvExampleGen(input_base=examples)168 pipeline.append(example_gen)169 170 # Computes statistics over data for visualization and example validation.171 statistics_gen = StatisticsGen(input_data=example_gen.outputs.examples)172 pipeline.append(statistics_gen)173 # Generates schema based on statistics files.174 infer_schema = SchemaGen(stats=statistics_gen.outputs.output)175 pipeline.append(infer_schema)176 # Performs anomaly detection based on statistics and data schema.177 validate_stats = ExampleValidator(178 stats=statistics_gen.outputs.output,179 schema=infer_schema.outputs.output180 )181 pipeline.append(validate_stats)182 # Performs transformations and feature engineering in training and serving.183 transform = Transform(184 input_data=example_gen.outputs.examples,185 schema=infer_schema.outputs.output,186 module_file=project_preprocessing_file187 )188 pipeline.append(transform)189 # Uses user-provided Python function that implements a model.190 trainer = Trainer(191 module_file=project_training_file,192 transformed_examples=transform.outputs.transformed_examples,193 schema=infer_schema.outputs.output,194 transform_output=transform.outputs.transform_output,195 train_args=trainer_pb2.TrainArgs(num_steps={train_steps}),196 eval_args=trainer_pb2.EvalArgs(num_steps={eval_steps})197 )198 pipeline.append(trainer)199 # Uses TFMA to compute a evaluation statistics over features of a model.200 model_analyzer = Evaluator(201 examples=example_gen.outputs.examples,202 model_exports=trainer.outputs.output,203 feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[{specs}])204 )205 pipeline.append(model_analyzer)206 # Performs quality validation of a candidate model (compared to a baseline).207 model_validator = ModelValidator(208 examples=example_gen.outputs.examples, model=trainer.outputs.output209 )210 pipeline.append(model_validator)211 # Checks whether the model passed the validation steps and pushes the model to a file destination if check passed.212 pusher = Pusher(213 model_export=trainer.outputs.output,214 model_blessing=model_validator.outputs.blessing,215 push_destination=pusher_pb2.PushDestination(216 filesystem=pusher_pb2.PushDestination.Filesystem(base_directory=serving_model_dir)217 )218 )219 pipeline.append(pusher)220 """.format(221 train_steps=train_steps,222 eval_steps=eval_steps,223 specs=",".join(["evaluator_pb2.SingleSlicingSpec(column_for_slicing=['%s'])" % col for col in columns_for_slicing])224 ))225 226 227 return pipeline_file228 229def write_pipeline_to_dags(pipeline_file, name):230 with open(os.path.join(os.environ['DAGS_DIR'], name + ".py"), "w") as f:231 f.write(pipeline_file)...

Full Screen

Full Screen

test_infer_schema.py

Source:test_infer_schema.py Github

copy

Full Screen

...6 col1 = pd.Series(["1", "4", "9", "20", "8"], dtype="string", name="col1")7 col2 = pd.Series(["1", "4", "9", "20", "8"], name="col2")8 col3 = pd.Series(["1", "4", "9", "20", "8"], dtype="category", name="col3")9 col4 = pd.Series([True, False, False, True], name="col4")10 assert "col1" in infer_schema(col1.to_frame()).categorical_columns11 assert "col2" in infer_schema(col2.to_frame()).categorical_columns12 assert "col3" in infer_schema(col3.to_frame()).categorical_columns13 assert "col4" in infer_schema(col4.to_frame()).categorical_columns14def test_valid_numerical_column():15 col1 = pd.Series([1, 2, 3, 4], name="col1")16 assert "col1" in infer_schema(col1.to_frame()).numerical_columns17def test_valid_text_column():18 col1 = pd.Series([f"{x}" for x in range(21)], dtype="string", name="col1")19 col2 = pd.Series([f"{x}" for x in range(21)], name="col2")20 assert "col1" in infer_schema(col1.to_frame()).text_columns21 assert "col2" in infer_schema(col2.to_frame()).text_columns22def test_valid_time_column():23 col1 = pd.to_datetime(24 pd.Series(["3/11/2000", "3/12/2000", "3/13/2000"], name="col1")25 )26 col2 = pd.Series(27 [datetime(2022, 9, 13, 9), pd.Timestamp(1513393355.5, unit="s")], name="col2"28 )29 assert "col1" in infer_schema(col1.to_frame()).time_columns30 assert "col2" in infer_schema(col2.to_frame()).time_columns31def test_non_supported_mixed_column():32 col1 = pd.Series([1, 2, 3, 4, "a"], name="col1")33 col2 = pd.Series([1, 2, 3, 4, datetime(2013, 1, 1)], name="col1")34 with pytest.raises(35 Exception, match=r"Mixed data type mixed-integer is not supported"36 ):37 infer_schema(col1.to_frame())38 infer_schema(col2.to_frame())39def test_non_supported_other_column():40 col1 = pd.Series(pd.Timedelta(timedelta(days=1, seconds=1)))41 col2 = pd.Series(pd.Interval(left=0, right=5))42 col3 = pd.Series(pd.Period("2017-01-01"))43 with pytest.raises(44 Exception, match=r"Data type timedelta64\[ns\] is not supported"45 ):46 infer_schema(col1.to_frame())47 with pytest.raises(Exception, match=r"Data type interval.* is not supported"):48 infer_schema(col2.to_frame())49 with pytest.raises(Exception, match=r"Data type period.* is not supported"):...

Full Screen

Full Screen

python-dict-list-to-dataframe.py

Source:python-dict-list-to-dataframe.py Github

copy

Full Screen

...15 {"Category": 'Category C', 'ItemID': 3, 'Amount': 100.01},16 {"Category": 'Category A', 'ItemID': 4, 'Amount': 110.01},17 {"Category": 'Category B', 'ItemID': 5, 'Amount': 70.85}18 ]19def infer_schema():20 # Create data frame21 df = spark.createDataFrame(data)22 print(df.schema)23 df.show()24def explicit_schema():25 # Create a schema for the dataframe26 schema = StructType([27 StructField('Category', StringType(), False),28 StructField('ItemID', IntegerType(), False),29 StructField('Amount', FloatType(), True)30 ])31 # Create data frame32 df = spark.createDataFrame(data, schema)33 print(df.schema)34 df.show()35if __name__ == "__main__":36 infer_schema()...

Full Screen

Full Screen

Automation Testing Tutorials

Learn to execute automation testing from scratch with LambdaTest Learning Hub. Right from setting up the prerequisites to run your first automation test, to following best practices and diving deeper into advanced test scenarios. LambdaTest Learning Hubs compile a list of step-by-step guides to help you be proficient with different test automation frameworks i.e. Selenium, Cypress, TestNG etc.

LambdaTest Learning Hubs:

YouTube

You could also refer to video tutorials over LambdaTest YouTube channel to get step by step demonstration from industry experts.

Run pandera automation tests on LambdaTest cloud grid

Perform automation testing on 3000+ real desktop and mobile devices online.

Try LambdaTest Now !!

Get 100 minutes of automation test minutes FREE!!

Next-Gen App & Browser Testing Cloud

Was this article helpful?

Helpful

NotHelpful