How to use return_variable method in pyresttest

Best Python code snippet using pyresttest_python

etl.py

Source:etl.py Github

copy

Full Screen

# -*- coding: utf-8 -*-
# MLToolkit (mltoolkit)
"""
MLToolkit - a versatile helping library for machine learning
============================================================
'MLToolkit' is a Python package providing a set of user-friendly functions to
help building machine learning models in data science research or production
focused projects. It is compatible with and interoperates with popular data
analysis, manipulation and machine learning libraries Pandas, Scikit-learn,
Tensorflow, Statsmodels, Catboost, XGBoost, etc.

Main Features
-------------
- Data Extraction (SQL, Flatfiles, etc.)
- Exploratory data analysis (statistical summary, univariate analysis, etc.)
- Feature Extraction and Engineering
- Model performance analysis, Explain Predictions and comparison between models
- Cross Validation and Hyper parameter tuning
- JSON input script for executing model building and scoring tasks.
- Model Building UI
- Auto ML (automated machine learning)
- Model Deployment and Serving via RESTful API

Author
------
- Sumudu Tennakoon

Links
-----
Website: http://sumudu.tennakoon.net/projects/MLToolkit
Github: https://mltoolkit.github.io/MLToolKit

License
-------
Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
"""
# TODO: if a query has DROP TABLE, TRUNCATE TABLE, DELETE, UPDATE or CREATE, set a control flag on (safety).
# TODO: modify output timing to report execute-query time + row count.
from datetime import datetime
import gc
import traceback
import os
from timeit import default_timer as timer
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import urllib
import urllib.parse  # "import urllib" alone does not guarantee the parse submodule is loaded
import sqlalchemy
import csv
try:
    import pyodbc
except ImportError:
    print('pyodbc not found! Data base query fufnctions disabled.')
import warnings
warnings.filterwarnings("ignore")
from mltk.string import *
from mltk.explore import *

# ODBC driver used for the SQLAlchemy based MSSQL connections.
# Download: https://docs.microsoft.com/en-us/sql/connect/odbc/download-odbc-driver-for-sql-server
_MSSQL_ODBC_DRIVER = 'ODBC Driver 13 for SQL Server'

# Maps the string quoting options accepted by the CSV helpers to csv module constants.
_CSV_QUOTING_MODES = {
    'ALL': csv.QUOTE_ALL,
    'MINIMAL': csv.QUOTE_MINIMAL,
    'NONNUMERIC': csv.QUOTE_NONNUMERIC,
    'NONE': csv.QUOTE_NONE,
}


def number_unit_example():
    """Print an example list of bin edges written with number-unit suffixes."""
    edges_std = ['0', '1p', '1n', '1u', '1m', '1c', '1', '100', '500',
                 '1K', '2K', '5K', '10K', '20K', '50K', '100K', '500K',
                 '1M', '2M', '5M', '10M', '100M', '200M', '500M',
                 '1G', '2G', '5G', '10G', '100G', '200G', '500G',
                 '1T', '2T', '5T', '10T', '100T', '200T', '500T',
                 '1P', '2P', '5P', '10P', '100P', '200P', '500P',
                 '1E']
    print(edges_std)


def get_number_units():
    """
    Build, print and return the table of number-unit suffixes and multipliers.

    Returns
    -------
    units : pandas.DataFrame
        Two columns, 'unit' and 'multiplier'.
    """
    units = {'p': 0.000000000001,
             'n': 0.000000001,
             'u': 0.000001,
             'm': 0.001,
             'c': 0.01,
             'd': 0.1,
             '': 1,
             'D': 10,
             'H': 100,
             'K': 1000,
             'M': 1000000,
             'G': 1000000000,
             'T': 1000000000000,
             'P': 1000000000000000,
             'E': 1000000000000000000,
             'INF': np.inf
             }
    # list(...) because not every pandas version accepts a dict_items view as data.
    units = pd.DataFrame(data=list(units.items()), columns=['unit', 'multiplier'])
    print(units)
    return units


###############################################################################
##[ I/O FUNCTIONS]#############################################################
###############################################################################
def _build_odbc_connect_string(server, database, auth, driver, autocommit=None):
    """
    Assemble an (unquoted) ODBC connection string for SQL Server.

    Parameters
    ----------
    server : str
    database : str
    auth : dict
        {'type':'machine'} for trusted connection or
        {'type':'user', 'uid':..., 'pwd':...} for username/password.
    driver : str
        ODBC driver name (without braces).
    autocommit : str or bool, optional
        When given, an 'autocommit=<value>' entry is appended.

    Returns
    -------
    connect_string : str

    Raises
    ------
    ValueError
        If auth['type'] is neither 'machine' nor 'user'.
    """
    parts = ['Driver={' + driver + '}', 'SERVER=' + server, 'DATABASE=' + database]
    auth_type = auth['type']
    if auth_type == 'machine':
        parts.append('TRUSTED_CONNECTION=yes')
    elif auth_type == 'user':
        parts.append('UID=' + auth['uid'])
        parts.append('PWD=' + auth['pwd'])
    else:
        raise ValueError('No db server authentication method provided!')
    if autocommit is not None:
        parts.append('autocommit=' + str(autocommit))
    return ';'.join(parts) + ';'


def read_data(connector, params=None):
    """
    Placeholder for a generic connector-based reader (not implemented).

    The connector layouts below only illustrate the intended input format;
    the function ignores its arguments and always returns None.
    """
    connector = {
        "method": "sql",  # "pickle", "csv", "excel"
        "source": {"dbms": "mssql", "server": "SQLSERVER1", "database": "SampleDB", "schema": None},
        "auth": {'type': 'user', 'user': 'user1', 'pwd': 'password123'},
        "params": {}
    }

    connector2 = {
        "method": "sql",  # "pickle", "csv", "excel"
        "source": {"dbms": "snowflake", "server": "SQLSERVER1", "database": "SampleDB", "schema": None},  # account (server)
        "auth": {'type': 'user', 'user': 'user1', 'password': 'password123', "role": None},
        "params": {}
    }

    return None


def read_data_csv(file, separator=',', quoting='MINIMAL', compression='infer', encoding='utf-8'):
    """
    Load a delimited text file into a DataFrame.

    https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html

    Parameters
    ----------
    file : str
    separator : str
    quoting : {'ALL', 'MINIMAL', 'NONNUMERIC', 'NONE'}, default 'MINIMAL'
    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
    encoding : {'utf-8', 'utf-16'}, default 'utf-8'

    Returns
    -------
    DataFrame : pandas.DataFrame
        Empty DataFrame if the file could not be read (the traceback is printed).
    """
    # Unknown values are passed through unchanged so pandas reports the problem.
    quoting = _CSV_QUOTING_MODES.get(quoting, quoting)

    try:
        start_time = timer()
        DataFrame = pd.read_csv(filepath_or_buffer=file, sep=separator, quoting=quoting,
                                compression=compression, encoding=encoding)
        execute_time = timer() - start_time
    except Exception:
        execute_time = 0
        DataFrame = pd.DataFrame()
        print(traceback.format_exc())

    print('{:,d} records were loaded. execute time = {} s'.format(len(DataFrame.index), execute_time))

    return DataFrame


def write_data_csv(DataFrame, file, separator=',', index=False, quoting='ALL', encoding='utf-8', compression='infer', chunksize=None):
    """
    Write a DataFrame to a delimited text file.

    https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_csv.html

    Parameters
    ----------
    DataFrame : pandas.DataFrame
    file : str
    separator : str
    index : bool
    quoting : {'ALL', 'MINIMAL', 'NONNUMERIC', 'NONE'}, default 'ALL'
    encoding : {'utf-8', 'utf-16'}, default 'utf-8'
    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
    chunksize : int, default None

    Returns
    -------
    None
    """
    quoting = _CSV_QUOTING_MODES.get(quoting, quoting)

    try:
        start_time = timer()
        DataFrame.to_csv(path_or_buf=file, sep=separator, encoding=encoding, index=index,
                         quoting=quoting, compression=compression, chunksize=chunksize)
        execute_time = timer() - start_time
        written = len(DataFrame.index)
    except Exception:
        execute_time = 0
        written = 0  # nothing can be claimed as written after a failure
        print(traceback.format_exc())

    print('{:,d} records were written. execute time = {} s'.format(written, execute_time))

    return None


def read_data_pickle(file, compression='infer'):
    """
    Load a pickled DataFrame.

    https://docs.python.org/3/library/pickle.html
    "Warning The pickle module is not secure against erroneous or maliciously constructed data.
    Never unpickle data received from an untrusted or unauthenticated source."

    Parameters
    ----------
    file : str
    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'

    Returns
    -------
    DataFrame : pandas.DataFrame
        Empty DataFrame if the file could not be read (the traceback is printed).
    """
    try:
        start_time = timer()
        # Path is positional: the keyword was renamed from 'path' to
        # 'filepath_or_buffer' across pandas versions.
        DataFrame = pd.read_pickle(file, compression=compression)
        execute_time = timer() - start_time
    except Exception:
        execute_time = 0
        print(traceback.format_exc())
        DataFrame = pd.DataFrame()

    print('{:,d} records were loaded. execute time = {} s'.format(len(DataFrame.index), execute_time))

    return DataFrame


def write_data_pickle(DataFrame, file, compression='infer', protocol=3):
    """
    Pickle a DataFrame to a file.

    https://docs.python.org/3/library/pickle.html
    "Warning The pickle module is not secure against erroneous or maliciously constructed data.
    Never unpickle data received from an untrusted or unauthenticated source."

    Parameters
    ----------
    DataFrame : pandas.DataFrame
    file : str
    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
    protocol : int {0, 1, 2, 3, 4}, default 3
        0 is human-readable/backwards compatible with earlier versions of Python
        read more at https://docs.python.org/3/library/pickle.html

    Returns
    -------
    None
    """
    try:
        start_time = timer()
        DataFrame.to_pickle(file, compression=compression, protocol=protocol)
        execute_time = timer() - start_time
        written = len(DataFrame.index)
    except Exception:
        execute_time = 0
        written = 0  # nothing can be claimed as written after a failure
        print(traceback.format_exc())

    print('{:,d} records were written. execute time = {} s'.format(written, execute_time))


def create_sql_connect_string(server=None, database=None, auth=None, dbms='mssql', autocommit='True'):
    """
    Create a URL-quoted ODBC connection string for the given DBMS.

    Parameters
    ----------
    server : str
    database : str
    auth : dict
        e.g. auth = {'type':'user', 'uid':'user', 'pwd':'password'} for username password authentication
        auth = {'type':'machine', 'uid':None, 'pwd':None} for machine authentication
    dbms : {'mssql', 'mysql', 'snowflake'}, default 'mssql'
        Only 'mssql' is implemented; 'mysql' and 'snowflake' return None.
    autocommit : str, default 'True'

    Returns
    -------
    connect_string : str or None

    Raises
    ------
    Exception
        If dbms is not one of the accepted values, or (as ValueError) if
        auth['type'] is not 'machine' or 'user'.
    """
    if dbms == 'mssql':
        connect_string = _build_odbc_connect_string(server, database, auth,
                                                    driver=_MSSQL_ODBC_DRIVER, autocommit=autocommit)
        connect_string = urllib.parse.quote_plus(connect_string)
    elif dbms == 'mysql':
        connect_string = None
    elif dbms == 'snowflake':
        connect_string = None
    else:
        raise Exception("Parameter dbms not provided. Accepted values are {'mssql', 'mysql', 'snowflake'}")
    return connect_string


def read_data_sql(query=None, server=None, database=None, auth=None, dbms='mssql', params=None):
    """
    Run a SELECT query on SQL Server through pyodbc and return the result.

    Parameters
    ----------
    query : str
        SQL SELECT query
    server : str
        Database Server
    database : str
        Database
    auth : dict
        e.g. auth = {'type':'user', 'uid':'user', 'pwd':'password'} for username password authentication
        auth = {'type':'machine', 'uid':None, 'pwd':None} for machine authentication
    dbms : str
        Not used; the connection is always made with the 'SQL Server' ODBC driver.
    params : dict
        Not used.

    Returns
    -------
    DataFrame : pandas.DataFrame
        Empty DataFrame if inputs are missing or the query failed.
    """
    execute_time = 0
    DataFrame = pd.DataFrame()

    if query is None or server is None or auth is None:
        print('No Query provided !')
    else:
        connection = None
        try:
            connect_string = _build_odbc_connect_string(server, database, auth, driver='SQL Server')
            connection = pyodbc.connect(connect_string)

            start_time = timer()
            DataFrame = pd.read_sql_query(sql=query, con=connection, coerce_float=True,
                                          index_col=None, parse_dates=None)
            execute_time = timer() - start_time
        except Exception:
            print('Database Query Fialed!:\n{}\n'.format(traceback.format_exc()))
            DataFrame = pd.DataFrame()
        finally:
            # Release the connection even when the query itself failed.
            if connection is not None:
                try:
                    connection.close()
                except Exception:
                    print('Failed to close connection !')

    print('{:,d} records were loaded. execute time = {} s'.format(len(DataFrame.index), execute_time))

    return DataFrame


def write_data_sql(DataFrame, server=None, database=None, schema=None, table=None, index=False, dtypes=None, if_exists='fail', auth=None, dbms='mssql', params=None):
    """
    Write a DataFrame to a SQL Server table through SQLAlchemy (mssql+pyodbc).

    https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_sql.html

    Parameters
    ----------
    DataFrame : pandas.DataFrame
        DataFrame
    server : str
        Database Server
    database : str
        Database
    schema : str
        Database Schema
    table : str
        Table name
    index : bool, default False
        Write the DataFrame index as a column.
    dtypes : dict, optional
        Column data types, passed to DataFrame.to_sql as ``dtype``.
    if_exists : {'fail', 'replace', 'append'}, default 'fail'
        Action if the table already exists.
    auth : dict
        e.g. auth = {'type':'user', 'uid':'user', 'pwd':'password'} for username password authentication
        auth = {'type':'machine', 'uid':None, 'pwd':None} for machine authentication
    dbms : str
        Not used.
    params : dict
        Not used.

    Returns
    -------
    rowcount : int
        Number of rows written (0 on failure or missing inputs).
    """
    execute_time = 0
    rowcount = 0

    if any(value is None for value in (server, database, schema, table, auth)):
        print('Check the destiniation table path (server, database, schema, table, auth) !')
    else:
        engine = None
        try:
            connect_string = _build_odbc_connect_string(server, database, auth,
                                                        driver=_MSSQL_ODBC_DRIVER, autocommit='True')
            connect_string = urllib.parse.quote_plus(connect_string)
            engine = sqlalchemy.create_engine("mssql+pyodbc:///?odbc_connect=" + connect_string,
                                              fast_executemany=True)

            start_time = timer()
            # dtype=None is the to_sql default, so it is safe to always pass it.
            DataFrame.to_sql(name=table, con=engine, schema=schema, index=index,
                             dtype=dtypes, if_exists=if_exists)
            execute_time = timer() - start_time
            rowcount = len(DataFrame.index)
        except Exception:
            print('Database Query Failed! Check If ODBC driver installed. \nIf not, Download ODBC Driver from https://docs.microsoft.com/en-us/sql/connect/odbc/download-odbc-driver-for-sql-server:\n{}\n'.format(traceback.format_exc()))
            rowcount = 0
        finally:
            # Dispose the engine on success and on failure.
            if engine is not None:
                engine.dispose()

    print('{:,d} records were written. execute time = {} s'.format(rowcount, execute_time))

    return rowcount


def execute_sql_query(query=None, server=None, database=None, auth=None, params=None, dbms='mssql', on_error='ignore'):
    """
    Execute a SQL statement on SQL Server inside a transaction and commit it.

    Parameters
    ----------
    query : str
        SQL statement
    server : str
        Database Server
    database : str
        Database
    auth : dict
        e.g. auth = {'type':'user', 'uid':'user', 'pwd':'password'} for username password authentication
        auth = {'type':'machine', 'uid':None, 'pwd':None} for machine authentication
    params : dict
        extra parameters (not implemented)
    dbms : str
        Not used.
    on_error : str
        Not used.

    Returns
    -------
    rowcount : int
        Affected row count reported by the driver; -1 if it could not be
        fetched; 0 if inputs are missing or the execution failed.
    """
    rowcount = 0

    if any(value is None for value in (server, database, query, auth)):
        print('Check the query and connection details (server, database, query, auth) !')
        return rowcount

    engine = None
    connection = None
    result = None
    try:
        connect_string = _build_odbc_connect_string(server, database, auth,
                                                    driver=_MSSQL_ODBC_DRIVER, autocommit='True')
        connect_string = urllib.parse.quote_plus(connect_string)
        engine = sqlalchemy.create_engine("mssql+pyodbc:///?odbc_connect=" + connect_string,
                                          fast_executemany=True)

        connection = engine.connect()
        trans = connection.begin()

        # NOTE(review): SQLAlchemy 2.x no longer accepts a plain string here and
        # requires sqlalchemy.text(query); confirm the supported version before changing.
        start_time = timer()
        result = connection.execute(query)
        execute_time = timer() - start_time

        try:
            rowcount = result.rowcount
            print('{} rows affected. execute time = {} s'.format(rowcount, execute_time))
        except Exception:
            rowcount = -1
            print('ERROR in fetching affected rows count. execute time = {} s'.format(execute_time))

        trans.commit()
    except Exception:
        print('ERROR: Check If ODBC driver installed. \nIf not, Download ODBC Driver from https://docs.microsoft.com/en-us/sql/connect/odbc/download-odbc-driver-for-sql-server:\n{}\n'.format(traceback.format_exc()))
        rowcount = 0
    finally:
        # Close only what was actually opened: result set first, then the
        # connection, then the engine.
        if result is not None:
            try:
                result.close()
            except Exception:
                print('Failed to close results !')
        if connection is not None:
            try:
                connection.close()
            except Exception:
                print('Failed to close connection !')
        if engine is not None:
            try:
                engine.dispose()
            except Exception:
                print('Failed to dispose engine !')

    return rowcount


def sql_server_database_list(server, auth=None, user_database_only=True, dbms='mssql'):
    """
    List the databases of a SQL Server instance.

    Reference: https://docs.microsoft.com/en-us/sql/relational-databases/system-compatibility-views/sys-sysdatabases-transact-sql?view=sql-server-2017

    Parameters
    ----------
    server : str
    auth : dict
        See read_data_sql.
    user_database_only : bool, default True
        If True, the system databases master, tempdb, model and msdb are excluded.
    dbms : str
        Not used.

    Returns
    -------
    DBList : pandas.DataFrame
    """
    query = """
            SELECT 
            @@SERVERNAME AS [ServerName],
            NAME AS [DBName],
            STATUS AS [Status],
            CRDATE AS [CreateDate]
            FROM master.dbo.sysdatabases (NOLOCK)
            """
    if user_database_only:
        query += "WHERE Name NOT IN ( 'master','tempdb','model' ,'msdb')\n"

    DBList = read_data_sql(query=query, server=server, database='master', auth=auth, params=None)

    return DBList


def sql_server_database_usage_report(server, database, auth=None, schema=None, table=None, user_tables_only=True, dbms='mssql', unit='KB'):
    """
    Report row counts and space usage of the tables in a SQL Server database.

    Reference: https://docs.microsoft.com/en-us/sql/relational-databases/system-catalog-views/sys-tables-transact-sql?view=sql-server-2017

    Parameters
    ----------
    server : str
    database : str
    auth : dict
        See read_data_sql.
    schema : str, optional
        Restrict the report to this schema.
    table : str, optional
        Restrict the report to this table name.
    user_tables_only : bool, default True
        Exclude objects shipped by Microsoft (is_ms_shipped = 1).
    dbms : str, default 'mssql'
        Only 'mssql' is supported; anything else returns an empty DataFrame.
    unit : {'KB', 'MB', 'GB', 'TB'}, default 'KB'
        Unit of the reported space columns.

    Returns
    -------
    DBUsageReport : pandas.DataFrame
        Empty DataFrame if the query failed or dbms is not supported.

    Raises
    ------
    ValueError
        If unit is not one of the accepted values (mssql only).
    """
    if dbms != 'mssql':
        print('This function currently supported for MSSQL server only')
        return pd.DataFrame()

    # The query returns sizes in KB (8 KB pages * 8); multipliers convert from KB.
    multipliers = {
        'KB': 1.0,
        'MB': 1.0 / 1024.0,
        'GB': 1.0 / (1024.0 * 1024.0),
        'TB': 1.0 / (1024.0 * 1024.0 * 1024.0),
    }
    if unit not in multipliers:
        raise ValueError("Parameter unit not valid. Accepted values are {'KB', 'MB', 'GB', 'TB'}")
    multiplier = multipliers[unit]

    # 'table' and 'schema' are reserved words in T-SQL, so the aliases must be
    # bracketed. Single quotes are doubled because the values are embedded as
    # string literals (read_data_sql offers no query parameters).
    conditions = []
    if user_tables_only:
        # is_ms_shipped = 1 (object shipped or created by Microsoft), 0 (created by a user)
        conditions.append("AND [table].is_ms_shipped = 0")
    if table is not None:
        conditions.append("AND [table].name = '{}'".format(str(table).replace("'", "''")))
    if schema is not None:
        conditions.append("AND [schema].name = '{}'".format(str(schema).replace("'", "''")))

    query = """
            SELECT
                @@SERVERNAME AS [Server],
                DB_NAME() AS [DB],
                [schema].name AS [Schema],
                [table].name AS [Table],
                [table].create_date AS [CreateDate],
                [table].modify_date AS [ModifyDate],
                [part].rows AS [Rows],
                SUM([alloc].total_pages) * 8 AS [TotalSpaceKBx],
                SUM([alloc].used_pages) * 8 AS [UsedSpaceKBx]
            FROM
                sys.tables AS [table] WITH (NOLOCK)
            INNER JOIN
                sys.indexes AS [ix] WITH (NOLOCK) ON ([table].object_id = [ix].object_id)
            INNER JOIN
                sys.partitions AS [part] WITH (NOLOCK) ON ([ix].object_id = [part].object_id AND [ix].index_id = [part].index_id)
            INNER JOIN
                sys.allocation_units AS [alloc] WITH (NOLOCK) ON ([part].partition_id = [alloc].container_id)
            LEFT OUTER JOIN
                sys.schemas AS [schema] WITH (NOLOCK) ON ([table].schema_id = [schema].schema_id)
            WHERE
                [table].name IS NOT NULL
                {conditions}
            GROUP BY
                [table].name,
                [table].create_date,
                [table].modify_date,
                [schema].name,
                [part].rows
            """.format(conditions='\n                '.join(conditions))

    DBUsageReport = read_data_sql(query=query, server=server, database=database, auth=auth, params=None)

    # read_data_sql returns a column-less DataFrame when the query fails.
    if 'TotalSpaceKBx' not in DBUsageReport.columns or 'UsedSpaceKBx' not in DBUsageReport.columns:
        return DBUsageReport

    DBUsageReport['TotalSpaceKBx'] = DBUsageReport['TotalSpaceKBx'].fillna(0)
    DBUsageReport['UsedSpaceKBx'] = DBUsageReport['UsedSpaceKBx'].fillna(0)
    DBUsageReport['AvaiableSpaceKBx'] = DBUsageReport['TotalSpaceKBx'] - DBUsageReport['UsedSpaceKBx']

    for prefix in ('TotalSpace', 'UsedSpace', 'AvaiableSpace'):
        DBUsageReport['{}{}'.format(prefix, unit)] = DBUsageReport['{}KBx'.format(prefix)] * multiplier

    DBUsageReport = DBUsageReport.drop(columns=['TotalSpaceKBx', 'UsedSpaceKBx', 'AvaiableSpaceKBx'])

    return DBUsageReport


###############################################################################
##[ VALIDATE FIELDS]###########################################################
###############################################################################

def add_identity_column(DataFrame, id_label='ID', start=1, increment=1):
    """
    Insert a sequential identity column as the first column.

    The index is reset (in place) before numbering. If a column named
    id_label already exists the DataFrame is returned unchanged.

    Parameters
    ----------
    DataFrame : pandas.DataFrame
    id_label : str, default 'ID'
    start : int, default 1
        Value assigned to the first row.
    increment : int, default 1
        Step between consecutive rows.

    Returns
    -------
    DataFrame : pandas.DataFrame
    """
    if id_label in DataFrame.columns:
        print('Column {} exists in the DataFrame'.format(id_label))
        return DataFrame

    DataFrame.reset_index(drop=True, inplace=True)
    DataFrame.insert(0, id_label, start + increment * np.arange(len(DataFrame.index)))
    return DataFrame


def remove_special_characters(str_val, replace=''):
    """Replace every run of non-word characters in str_val with replace."""
    return re.sub(r'\W+', replace, str_val)


def remove_special_characters_list(str_list, replace=''):
    """Apply remove_special_characters to every item of str_list."""
    return [remove_special_characters(str_val, replace=replace) for str_val in str_list]


def clean_column_names(DataFrame, replace=''):  # Remove special characters from column names
    """
    Remove special characters from the column names.

    The new names are applied only if they are still unique; otherwise the
    original names are kept and a message is printed.

    Parameters
    ----------
    DataFrame : pandas.DataFrame
        DataFrame
    replace : str, default ''
        Character to replace special characters with.

    Returns
    -------
    DataFrame : pandas.DataFrame
    """
    try:
        columns = remove_special_characters_list(DataFrame.columns, replace=replace)
        if check_list_values_unique(columns):
            DataFrame.columns = columns
        else:
            print('Duplicates values excists the column names after removing special characters!. Column names were rolled-back to initial values.')
    except Exception:
        print('Error in removing special characters from column names:\n{}\n'.format(traceback.format_exc()))
    return DataFrame


def check_list_values_unique(values_list):
    """Return True if values_list contains no repeated (hashable) values."""
    return len(values_list) == len(set(values_list))


def handle_duplicate_columns(DataFrame, action='rename'):  # 'drop'
    """
    Rename or drop columns whose name repeats an earlier column.

    Parameters
    ----------
    DataFrame : pandas.DataFrame
        DataFrame
    action : {'rename', 'drop'}, default 'rename'
        Action to be taken on duplicate columns. 'rename' appends '_' to the
        repeated name until it no longer clashes with any other column.

    Returns
    -------
    DataFrame : pandas.DataFrame
    """
    is_duplicate = DataFrame.columns.duplicated()
    if action == 'rename':
        columns = list(DataFrame.columns)
        taken = set(columns)  # names that a renamed column must not collide with
        for i, name in enumerate(columns):
            if is_duplicate[i]:
                candidate = '{}_'.format(name)
                while candidate in taken:
                    candidate += '_'
                taken.add(candidate)
                columns[i] = candidate
        DataFrame.columns = columns
    elif action == 'drop':
        DataFrame = DataFrame.loc[:, ~is_duplicate]
    else:
        print('No valid action (rename or drop) provided!')
    return DataFrame


def add_missing_feature_columns(DataFrame, expected_features, fill_value=0):
    """
    Add a constant column for every expected feature missing from the DataFrame.

    Parameters
    ----------
    DataFrame : pandas.DataFrame
    expected_features : list of str
    fill_value : scalar, default 0

    Returns
    -------
    DataFrame : pandas.DataFrame
    """
    # Keep the order of expected_features so the result is deterministic.
    for f in expected_features:
        if f not in DataFrame.columns:
            DataFrame[f] = fill_value
            print('Column [{}] does not exist in the dataset. Created new column and set to {}...'.format(f, fill_value))
    return DataFrame


def exclude_records(DataFrame, exclude_condition=None, action='flag', exclude_label='_EXCLUDE_'):
    """
    Drop or flag the records that satisfy a condition.

    Parameters
    ----------
    DataFrame : pandas.DataFrame
    exclude_condition : str
        Expression understood by DataFrame.query / DataFrame.eval.
    action : {'flag', 'drop'}, default 'flag'
        'drop' removes matching rows; 'flag' adds an int8 column
        (exclude_label) set to 1 for matching rows.
    exclude_label : str, default '_EXCLUDE_'

    Returns
    -------
    DataFrame : pandas.DataFrame
    """
    N0 = len(DataFrame.index)
    if exclude_condition is None:
        print('No exclude condition...')
        return DataFrame

    try:
        if action == 'drop':  # Drop excludes
            DataFrame = DataFrame.query('not ({})'.format(exclude_condition))
        elif action == 'flag':  # Create new flagged column
            DataFrame[exclude_label] = DataFrame.eval(exclude_condition).astype('int8')
            print('Records {} -> {}=1'.format(exclude_condition, exclude_label))
    except Exception:
        print('Error in excluding records {}:\n{}\n'.format(exclude_condition, traceback.format_exc()))
    N1 = len(DataFrame.index)
    print('{} records were excluded'.format(N0 - N1))
    return DataFrame


###############################################################################
##[ CREATING FEATURES - TARGET ]###############################################
###############################################################################

def set_binary_target(DataFrame, to_variable='_TARGET_', condition_str=None, default=0, null=0, return_variable=False, return_script=False):
    """
    Create a binary target column from a condition string.

    Delegates to create_binary_variable and generate_create_variable_task_script,
    which are not defined in this part of the file.

    Parameters
    ----------
    DataFrame : pandas.DataFrame
    to_variable : str, default '_TARGET_'
    condition_str : str
        If None, the DataFrame alone is returned unchanged, regardless of the
        return_variable / return_script flags.
    default, null :
        Passed through to create_binary_variable.
    return_variable : bool, default False
    return_script : bool, default False

    Returns
    -------
    DataFrame, or (DataFrame, to_variable), or (DataFrame, script_dict), or
    (DataFrame, to_variable, script_dict) depending on the return flags.
    """
    if condition_str is None:
        return DataFrame

    DataFrame, to_variable = create_binary_variable(DataFrame, to_variable, condition_str, default=default, null=null, return_variable=True)
    parameters = {
        'condition_str': condition_str,
        'default': default,
        'null': null
    }
    script_dict = generate_create_variable_task_script(type='target', out_type='bin', include=False, operation='condition', source=None, destination=to_variable, parameters=parameters)

    if return_script and return_variable:
        return DataFrame, to_variable, script_dict
    elif return_script:
        return DataFrame, script_dict
    elif return_variable:
        return DataFrame, to_variable
    else:
        return DataFrame

687###############################################################################688##[ CREATING FEATURES - TRANSFORMATIONS]####################################### 689############################################################################### 690def create_normalized_variable(DataFrame, variable, method='maxscale', parameters=None, to_variable=None, return_variable=False, return_script=False):691 if to_variable==None:692 to_variable = variable693 694 if method=='minscale': #scale=max695 try:696 min_ = parameters["min"]697 except:698 min_ = DataFrame[variable].min()699 parameters["min"] = min_700 DataFrame[to_variable] = DataFrame[variable]/min_701 if method=='maxscale': #scale=max702 try:703 max_ = parameters["max"]704 except:705 max_ = DataFrame[variable].max()706 parameters["max"] = max_707 DataFrame[to_variable] = DataFrame[variable]/max_708 if method=='range': # range = abs(max-min)709 try:710 min_ = parameters["min"]711 max_ = parameters["max"]712 except:713 min_ = DataFrame[variable].min()714 max_ = DataFrame[variable].max() 715 parameters["min"] = min_716 parameters["max"] = max_717 min_max = abs(min_-max_)718 DataFrame[to_variable] = DataFrame[variable]/min_max719 if method=='minmaxfs': # range = (value-min)/(max-min)720 try:721 min_ = parameters["min"]722 max_ = parameters["max"]723 except:724 min_ = DataFrame[variable].min()725 max_ = DataFrame[variable].max() 726 parameters["min"] = min_727 parameters["max"] = max_728 min_max = abs(max_-min_)729 DataFrame[to_variable] = (DataFrame[variable]-min_)/min_max730 if method=='minmaxfs_m': # range = (value-min)/(max-min)731 try:732 min_ = parameters["min"]733 max_ = parameters["max"]734 mean_ = parameters["mean"]735 except: 736 min_=DataFrame[variable].min()737 max_=DataFrame[variable].max()738 mean_ = DataFrame[variable].mean()739 parameters["min"] = min_740 parameters["max"] = max_741 parameters["mean"] = mean_742 min_max = abs(max_-min_)743 DataFrame[to_variable] = (DataFrame[variable]-mean_)/min_max744 
if method=='mean':745 try:746 mean_ = parameters["mean"]747 except: 748 mean_ = DataFrame[variable].mean()749 parameters["mean"] = mean_750 DataFrame[to_variable] = DataFrame[variable]/mean_751 if method=='median':752 try:753 median_ = parameters["median"]754 except: 755 median_ = DataFrame[variable].median()756 parameters["median"] = median_757 DataFrame[to_variable] = DataFrame[variable]/median_758 if method=='zscore': 759 try:760 std_ = parameters["std"]761 mean_ = parameters["mean"]762 except: 763 std_ = DataFrame[variable].std()764 mean_ = DataFrame[variable].mean()765 parameters["mean"] = mean_766 parameters["std"] = std_767 DataFrame[to_variable] = (DataFrame[variable] - mean_)/std_ 768 769 script_dict = generate_create_variable_task_script(type='transform', out_type='cnt', 770 include=False, operation='normalize', 771 source=variable, destination=to_variable, 772 parameters=parameters)773 774 if return_script and return_variable:775 return DataFrame, to_variable, script_dict776 elif return_script:777 return DataFrame, script_dict778 elif return_variable:779 return DataFrame, to_variable780 else:781 return DataFrame 782def create_datepart_variable(DataFrame, variable, to_variable=None, part='date', return_variable=False, return_script=False):783 if to_variable==None:784 to_variable = '{}{}'.format(variable,part)785 786 try:787 DataFrame[variable] = pd.to_datetime(DataFrame[variable])788 if part=='date':789 DataFrame[to_variable] = DataFrame[variable].dt.date790 elif part=='year':791 DataFrame[to_variable] = DataFrame[variable].dt.year792 elif part=='quarter':793 DataFrame[to_variable] = DataFrame[variable].dt.quarter794 elif part=='month':795 DataFrame[to_variable] = DataFrame[variable].dt.month796 elif part=='week':797 DataFrame[to_variable] = DataFrame[variable].dt.week798 elif part=='day':799 DataFrame[to_variable] = DataFrame[variable].dt.day 800 elif part=='dayofweek':801 DataFrame[to_variable] = DataFrame[variable].dt.dayofweek802 elif 
part=='dayofyear':803 DataFrame[to_variable] = DataFrame[variable].dt.dayofyear804 elif part=='time':805 DataFrame[to_variable] = DataFrame[variable].dt.time806 elif part=='hour':807 DataFrame[to_variable] = DataFrame[variable].dt.hour808 elif part=='minute':809 DataFrame[to_variable] = DataFrame[variable].dt.minute810 elif part=='second':811 DataFrame[to_variable] = DataFrame[variable].dt.second812 elif part=='microsecond':813 DataFrame[to_variable] = DataFrame[variable].dt.microsecond814 elif part=='nanosecond':815 DataFrame[to_variable] = DataFrame[variable].dt.nanosecond816 else:817 DataFrame[to_variable] = variable818 except:819 DataFrame[to_variable] = variable820 parameters = {'part':part} 821 script_dict = generate_create_variable_task_script(type='transform', out_type='dat', 822 include=False, operation='datepart', 823 source=variable, destination=to_variable, 824 parameters=parameters)825 826 if return_script and return_variable:827 return DataFrame, to_variable, script_dict828 elif return_script:829 return DataFrame, script_dict830 elif return_variable:831 return DataFrame, to_variable832 else:833 return DataFrame 834def create_dateadd_variable(DataFrame, variable, to_variable=None, unit='years', value=0, return_variable=False, return_script=False):835 if to_variable==None:836 to_variable = '{}{}{}'.format(variable, value, unit)837 838 try:839 DataFrame[variable] = pd.to_datetime(DataFrame[variable])840 if part=='years':841 DataFrame[to_variable] = DataFrame[variable] + pd.DateOffset(year=value)842 elif part=='months':843 DataFrame[to_variable] = DataFrame[variable] + pd.DateOffset(months=value)844 elif part=='weeks':845 DataFrame[to_variable] = DataFrame[variable] + pd.DateOffset(weeks=value)846 elif part=='days':847 DataFrame[to_variable] = DataFrame[variable] + pd.DateOffset(days=value)848 elif part=='hours':849 DataFrame[to_variable] = DataFrame[variable] + pd.DateOffset(hours=value)850 elif part=='minutes':851 DataFrame[to_variable] = 
DataFrame[variable] + pd.DateOffset(minutes=value)852 elif part=='seconds':853 DataFrame[to_variable] = DataFrame[variable] + pd.DateOffset(seconds=value)854 elif part=='microseconds':855 DataFrame[to_variable] = DataFrame[variable] + pd.DateOffset(microseconds=value)856 elif part=='nanoseconds':857 DataFrame[to_variable] = DataFrame[variable] + pd.DateOffset(nanoseconds=value) 858 except:859 DataFrame[to_variable] = variable860 parameters = {861 'unit':unit, 862 'value':value863 } 864 script_dict = generate_create_variable_task_script(type='transform', out_type='dat', 865 include=False, operation='dateadd', 866 source=variable, destination=to_variable, 867 parameters=parameters)868 869 if return_script and return_variable:870 return DataFrame, to_variable, script_dict871 elif return_script:872 return DataFrame, script_dict873 elif return_variable:874 return DataFrame, to_variable875 else:876 return DataFrame 877def create_log_variable(DataFrame, variable, base='e', to_variable=None, return_variable=False, return_script=False):878 if to_variable==None:879 to_variable = 'LOG{}'.format(variable)880 881 if base=='e':882 DataFrame[to_variable] = np.log(DataFrame[variable])883 elif base=='10':884 DataFrame[to_variable] = np.log10(DataFrame[variable])885 elif base=='2':886 DataFrame[to_variable] = np.log2(DataFrame[variable])887 parameters = { 'base':base }888 script_dict = generate_create_variable_task_script(type='transform', out_type='cnt', 889 include=False, operation='log', 890 source=variable, destination=to_variable, 891 parameters=parameters) 892 893 if return_script and return_variable:894 return DataFrame, to_variable, script_dict895 elif return_script:896 return DataFrame, script_dict897 elif return_variable:898 return DataFrame, to_variable899 else:900 return DataFrame 901 902def create_exponent_variable(DataFrame, variable, base='e', to_variable=None, return_variable=False, return_script=False):903 if to_variable==None:904 to_variable = 
'EXP{}'.format(variable)905 906 if base=='e':907 DataFrame[to_variable] = np.e**DataFrame[variable]908 elif base=='10':909 DataFrame[to_variable] = 10**DataFrame[variable]910 elif base=='2':911 DataFrame[to_variable] = 2**DataFrame[variable]912 parameters = { 'base':base }913 script_dict = generate_create_variable_task_script(type='transform', out_type='cnt', 914 include=False, operation='exponent', 915 source=variable, destination=to_variable, 916 parameters=parameters) 917 918 if return_script and return_variable:919 return DataFrame, to_variable, script_dict920 elif return_script:921 return DataFrame, script_dict922 elif return_variable:923 return DataFrame, to_variable924 else:925 return DataFrame 926def create_segmented_variable(DataFrame, variable, a=None, b=None, to_variable=None, return_variable=False, return_script=False):927 if to_variable==None:928 to_variable = 'SEG{}'.format(variable) 929 930 if a == None:931 a = -np.inf932 933 if b == None:934 b = np.inf935 936 DataFrame[to_variable] = DataFrame[variable]937 DataFrame.loc[DataFrame[to_variable]<a, to_variable] = a938 DataFrame.loc[DataFrame[to_variable]>b, to_variable] = b939 parameters = { 'a':a, 'b':b }940 script_dict = generate_create_variable_task_script(type='transform', out_type='cnt', 941 include=False, operation='segment', 942 source=variable, destination=to_variable, 943 parameters=parameters) 944 if return_script and return_variable:945 return DataFrame, to_variable, script_dict946 elif return_script:947 return DataFrame, script_dict948 elif return_variable:949 return DataFrame, to_variable950 else:951 return DataFrame 952###############################################################################953##[ CREATING FEATURES - STR TRANSFORM ]######################################## 954############################################################################### 955 956def create_str_count_variable(DataFrame, variable, pattern='*', case_sensitive=True, to_variable=None, 
return_variable=False, return_script=False):957 if to_variable==None:958 to_variable = '{}CNT{}'.format(variable, remove_special_characters(pattern, replace=''))959 try:960 if pattern=='*':961 DataFrame[to_variable] = DataFrame[variable].str.len()962 else:963 DataFrame[to_variable] = DataFrame[variable].str.count(pattern) 964 except:965 print('ERROR in create_str_count_variable:\n{}'.format(traceback.format_exc()))966 DataFrame[to_variable] = DataFrame[variable]967 parameters = { 'pattern':pattern, 'case_sensitive':case_sensitive }968 script_dict = generate_create_variable_task_script(type='transform_str', out_type='cnt', 969 include=False, operation='strcount', 970 source=variable, destination=to_variable, 971 parameters=parameters) 972 973 if return_script and return_variable:974 return DataFrame, to_variable, script_dict975 elif return_script:976 return DataFrame, script_dict977 elif return_variable:978 return DataFrame, to_variable979 else:980 return DataFrame 981 982def create_str_normalized_variable(DataFrame, variable, to_case='lower', chars='keep', numbers='remove', spchar='remove', space='remove', to_variable=None, return_variable=False, return_script=False):983 if to_variable==None:984 to_variable = '{}'.format(variable)985 986 try: 987 DataFrame[to_variable] = DataFrame[variable]988 989 if to_case=='lower':990 DataFrame[to_variable] = DataFrame[variable].str.lower()991 if to_case=='upper':992 DataFrame[to_variable] = DataFrame[variable].str.upper()993 if numbers=='remove':994 DataFrame[to_variable] = DataFrame[variable].str.replace('\d','') 995 if spchar=='remove':996 DataFrame[to_variable] = DataFrame[variable].str.replace('\W','') 997 if space=='remove':998 DataFrame[to_variable] = DataFrame[variable].str.replace('\s','') 999 if chars=='remove':1000 DataFrame[to_variable] = DataFrame[variable].str.replace('\w','') 1001 except:1002 print('ERROR in create_str_normalized_variable:\n{}'.format(traceback.format_exc()))1003 DataFrame[to_variable] = 
DataFrame[variable]1004 parameters = { 1005 'to_case':to_case, 1006 'chars':chars,1007 'numbers':numbers,1008 'spchar':spchar, 1009 'space':space1010 }1011 script_dict = generate_create_variable_task_script(type='transform_str', out_type='str', 1012 include=False, operation='normalize', 1013 source=variable, destination=to_variable, 1014 parameters=parameters) 1015 1016 if return_script and return_variable:1017 return DataFrame, to_variable, script_dict1018 elif return_script:1019 return DataFrame, script_dict1020 elif return_variable:1021 return DataFrame, to_variable1022 else:1023 return DataFrame 1024def create_str_extract_variable(DataFrame, variable, pattern='\w+', case_sensitive=True, to_variable=None, return_variable=False, return_script=False): 1025 if to_variable==None:1026 to_variable = 'variableEXT'.format(variable)1027 try:1028 if case_sensitive: 1029 DataFrame[to_variable] = DataFrame[variable].str.extract('({})'.format(pattern))1030 else:1031 DataFrame[to_variable] = DataFrame[variable].str.extract('({})'.format(pattern), flags=re.IGNORECASE)1032 except:1033 print('ERROR in create_str_extract_variable:\n{}'.format(traceback.format_exc()))1034 DataFrame[to_variable] = DataFrame[variable]1035 parameters = { 1036 'pattern':pattern, 1037 'case_sensitive':case_sensitive1038 }1039 script_dict = generate_create_variable_task_script(type='transform_str', out_type='str', 1040 include=False, operation='extract', 1041 source=variable, destination=to_variable, 1042 parameters=parameters) 1043 1044 if return_script and return_variable:1045 return DataFrame, to_variable, script_dict1046 elif return_script:1047 return DataFrame, script_dict1048 elif return_variable:1049 return DataFrame, to_variable1050 else:1051 return DataFrame 1052###############################################################################1053##[ CREATING FEATURES - MULTI VARIABLE ]####################################### 
###############################################################################
def _pack_feature_result(DataFrame, to_variable, script_dict, return_variable=False, return_script=False):
    """
    Build the return value shared by the feature-creation functions below.

    Parameters
    ----------
    DataFrame : pandas.DataFrame
        Frame that received the new column.
    to_variable : str
        Name of the column that was (or should have been) created.
    script_dict : object
        Task script describing the operation.
    return_variable : bool, default False
        Include ``to_variable`` in the result.
    return_script : bool, default False
        Include ``script_dict`` in the result.

    Returns
    -------
    DataFrame
        when both flags are False
    (DataFrame, to_variable)
        when only ``return_variable`` is True
    (DataFrame, script_dict)
        when only ``return_script`` is True
    (DataFrame, to_variable, script_dict)
        when both flags are True
    """
    if return_script and return_variable:
        return DataFrame, to_variable, script_dict
    if return_script:
        return DataFrame, script_dict
    if return_variable:
        return DataFrame, to_variable
    return DataFrame


# Friendly unit names accepted by create_date_difference_variable, mapped to
# the unit codes understood by numpy.timedelta64. Only fixed-length units are
# listed; any other value is handed to numpy unchanged.
_TIMEDELTA_UNIT_ALIASES = {
    'week': 'W', 'weeks': 'W',
    'day': 'D', 'days': 'D',
    'hour': 'h', 'hours': 'h',
    'minute': 'm', 'minutes': 'm',
    'second': 's', 'seconds': 's',
    'millisecond': 'ms', 'milliseconds': 'ms',
    'microsecond': 'us', 'microseconds': 'us',
    'nanosecond': 'ns', 'nanoseconds': 'ns',
}


def create_operation_mult_variable(DataFrame, expression_str='0', to_variable=None, return_variable=False, return_script=False):
    """
    Create a column by evaluating an expression over several columns.

    Parameters
    ----------
    DataFrame : pandas.DataFrame
        Frame to modify in place.
    expression_str : str, default '0'
        Expression passed to ``DataFrame.eval``.
    to_variable : str, optional
        Name of the output column. Defaults to the expression text itself.
    return_variable : bool, default False
        Also return the output column name.
    return_script : bool, default False
        Also return the generated task script.

    Returns
    -------
    DataFrame, optionally followed by ``to_variable`` and/or the task script
    (see ``return_variable`` / ``return_script``).

    Notes
    -----
    If the evaluation fails the error is printed and the output column is
    NOT created; the task script is still generated and returned.
    """
    if to_variable is None:
        to_variable = '{}'.format(expression_str)

    try:
        # NOTE(review): expression_str is evaluated by pandas; do not feed it
        # untrusted input without validating it first.
        DataFrame[to_variable] = DataFrame.eval(expression_str)
    except Exception:
        print('ERROR in create_operation_mult_variable:\n{}'.format(traceback.format_exc()))

    parameters = {'expression_str': expression_str}
    script_dict = generate_create_variable_task_script(type='operation_mult', out_type='cnt',
                                                       include=False, operation='expression',
                                                       source=None, destination=to_variable,
                                                       parameters=parameters)
    return _pack_feature_result(DataFrame, to_variable, script_dict, return_variable, return_script)


###############################################################################
##[ CREATING FEATURES - SEQUENCE ORDER ]#######################################
###############################################################################
def create_sequence_order_variable(DataFrame, variable1a, variable2a, variable1b, variable2b, output='binary', to_variable=None, return_variable=False, return_script=False):
    """
    Placeholder for a sequence-order feature built from two pairs of columns.

    Parameters
    ----------
    DataFrame : pandas.DataFrame
        Frame to modify in place.
    variable1a, variable2a, variable1b, variable2b : str
        Source column names; they are recorded in the task script.
    output : str, default 'binary'
        Recorded in the task script parameters.
    to_variable : str, optional
        Name of the output column. Defaults to
        ``'{variable1a}{variable2a}SEQ{variable1b}{variable2b}'``.
    return_variable : bool, default False
        Also return the output column name.
    return_script : bool, default False
        Also return the generated task script.

    Returns
    -------
    DataFrame, optionally followed by ``to_variable`` and/or the task script.

    Notes
    -----
    The computation itself has not been written yet. The previous body read
    an undefined name and therefore raised ``NameError`` on every call; the
    output column is now filled with ``None`` and a message is printed, the
    same fallback the other comparison features in this file use on error.
    """
    if to_variable is None:
        to_variable = '{}{}SEQ{}{}'.format(variable1a, variable2a, variable1b, variable2b)

    # TODO: implement the sequence-order computation (NEED UPDATE).
    print('ERROR in create_sequence_order_variable:\n'
          'sequence order computation is not implemented; {} set to None'.format(to_variable))
    DataFrame[to_variable] = None

    parameters = {'output': output}
    script_dict = generate_create_variable_task_script(type='sequence', out_type='cnt',
                                                       include=False, operation='seqorder',
                                                       source=[variable1a, variable2a, variable1b, variable2b],
                                                       destination=to_variable,
                                                       parameters=parameters)
    return _pack_feature_result(DataFrame, to_variable, script_dict, return_variable, return_script)


###############################################################################
##[ CREATING FEATURES - DIFFERENCES ]##########################################
###############################################################################
def create_numeric_difference_variable(DataFrame, variable1, variable2, multiplier=1, onerror=None, to_variable=None, return_variable=False, return_script=False):
    """
    Create ``multiplier * (variable1 - variable2)`` as a new column.

    Parameters
    ----------
    DataFrame : pandas.DataFrame
        Frame to modify in place.
    variable1, variable2 : str
        Source column names. Both columns are converted in place with
        ``pd.to_numeric(errors='coerce')``, so non-numeric entries become NaN
        in the source columns too.
    multiplier : numeric, default 1
        Factor applied to the difference.
    onerror : object, optional
        Not used by the computation; only recorded in the task script.
    to_variable : str, optional
        Name of the output column. Defaults to ``'{variable1}DIFF{variable2}'``.
    return_variable : bool, default False
        Also return the output column name.
    return_script : bool, default False
        Also return the generated task script.

    Returns
    -------
    DataFrame, optionally followed by ``to_variable`` and/or the task script.
    On failure the output column is set to ``None`` and the error is printed.
    """
    if to_variable is None:
        to_variable = '{}DIFF{}'.format(variable1, variable2)

    try:
        DataFrame[variable1] = pd.to_numeric(DataFrame[variable1], errors='coerce')
        DataFrame[variable2] = pd.to_numeric(DataFrame[variable2], errors='coerce')
        DataFrame[to_variable] = multiplier*(DataFrame[variable1] - DataFrame[variable2])
    except Exception:
        DataFrame[to_variable] = None
        print('Data Type Error in {}, {} : {} '.format(variable1, variable2, traceback.format_exc()))

    parameters = {
        'multiplier': multiplier,
        'onerror': onerror
    }
    script_dict = generate_create_variable_task_script(type='comparison', out_type='cnt',
                                                       include=False, operation='numdiff',
                                                       source=[variable1, variable2],
                                                       destination=to_variable,
                                                       parameters=parameters)
    return _pack_feature_result(DataFrame, to_variable, script_dict, return_variable, return_script)


def create_numeric_ratio_variable(DataFrame, variable1, variable2, multiplier=1, onerror=None, to_variable=None, return_variable=False, return_script=False):
    """
    Create ``multiplier * (variable1 / variable2)`` as a new column.

    Parameters
    ----------
    DataFrame : pandas.DataFrame
        Frame to modify in place.
    variable1, variable2 : str
        Numerator and denominator column names. Both columns are converted in
        place with ``pd.to_numeric(errors='coerce')``.
    multiplier : numeric, default 1
        Factor applied to the ratio.
    onerror : object, optional
        Not used by the computation; only recorded in the task script.
    to_variable : str, optional
        Name of the output column. Defaults to ``'{variable1}DIV{variable2}'``.
    return_variable : bool, default False
        Also return the output column name.
    return_script : bool, default False
        Also return the generated task script.

    Returns
    -------
    DataFrame, optionally followed by ``to_variable`` and/or the task script.
    On failure the output column is set to ``None`` and the error is printed.
    """
    if to_variable is None:
        to_variable = '{}DIV{}'.format(variable1, variable2)

    try:
        DataFrame[variable1] = pd.to_numeric(DataFrame[variable1], errors='coerce')
        DataFrame[variable2] = pd.to_numeric(DataFrame[variable2], errors='coerce')
        DataFrame[to_variable] = multiplier*(DataFrame[variable1]/DataFrame[variable2])
    except Exception:
        DataFrame[to_variable] = None
        print('Data Type Error in {}, {} : {} '.format(variable1, variable2, traceback.format_exc()))

    parameters = {
        'multiplier': multiplier,
        'onerror': onerror
    }
    script_dict = generate_create_variable_task_script(type='comparison', out_type='cnt',
                                                       include=False, operation='ratio',
                                                       source=[variable1, variable2],
                                                       destination=to_variable,
                                                       parameters=parameters)
    return _pack_feature_result(DataFrame, to_variable, script_dict, return_variable, return_script)


def create_date_difference_variable(DataFrame, variable1, variable2, to_variable=None, unit='day', onerror=None, return_variable=False, return_script=False):
    """
    Create ``variable2 - variable1`` expressed in ``unit`` as a new column.

    Parameters
    ----------
    DataFrame : pandas.DataFrame
        Frame to modify in place.
    variable1, variable2 : str
        Source column names. Both columns are converted in place with
        ``pd.to_datetime``.
    to_variable : str, optional
        Name of the output column. Defaults to ``'{variable1}DIFF{variable2}'``.
    unit : str, default 'day'
        Unit of the result. Either a friendly name ('week', 'day', 'hour',
        'minute', 'second', 'millisecond', 'microsecond', 'nanosecond',
        singular or plural, any case) or a ``numpy.timedelta64`` unit code
        such as 'D' or 'h', which is passed through unchanged.
    onerror : object, optional
        Not used by the computation; only recorded in the task script.
    return_variable : bool, default False
        Also return the output column name.
    return_script : bool, default False
        Also return the generated task script.

    Returns
    -------
    DataFrame, optionally followed by ``to_variable`` and/or the task script.
    On failure the output column is set to ``None`` and the error is printed.
    """
    if to_variable is None:
        to_variable = '{}DIFF{}'.format(variable1, variable2)

    # 'day' (the default) is not a numpy unit code; translate friendly names
    # so the default no longer always ends in the error branch.
    if isinstance(unit, str):
        numpy_unit = _TIMEDELTA_UNIT_ALIASES.get(unit.lower(), unit)
    else:
        numpy_unit = unit

    try:
        DataFrame[variable1] = pd.to_datetime(DataFrame[variable1])
        DataFrame[variable2] = pd.to_datetime(DataFrame[variable2])
        DataFrame[to_variable] = DataFrame[variable2] - DataFrame[variable1]
        DataFrame[to_variable] = DataFrame[to_variable]/np.timedelta64(1, numpy_unit)
    except Exception:
        DataFrame[to_variable] = None
        print('Date Type Error in {}, {} : {} '.format(variable1, variable2, traceback.format_exc()))

    # The unit is recorded exactly as the caller supplied it.
    parameters = {
        'unit': unit,
        'onerror': onerror
    }
    script_dict = generate_create_variable_task_script(type='comparison', out_type='cnt',
                                                       include=False, operation='datediff',
                                                       source=[variable1, variable2],
                                                       destination=to_variable,
                                                       parameters=parameters)
    return _pack_feature_result(DataFrame, to_variable, script_dict, return_variable, return_script)


def create_row_min_variable(DataFrame, variable1, variable2, to_variable=None, return_variable=False, return_script=False):
    """
    Create the row-wise minimum of two columns as a new column.

    Parameters
    ----------
    DataFrame : pandas.DataFrame
        Frame to modify in place.
    variable1, variable2 : str
        Source column names.
    to_variable : str, optional
        Name of the output column. Defaults to ``'{variable1}MIN{variable2}'``.
    return_variable : bool, default False
        Also return the output column name.
    return_script : bool, default False
        Also return the generated task script.

    Returns
    -------
    DataFrame, optionally followed by ``to_variable`` and/or the task script.
    On failure the output column is set to ``None`` and the error is printed.
    """
    if to_variable is None:
        to_variable = '{}MIN{}'.format(variable1, variable2)

    try:
        DataFrame[to_variable] = DataFrame[[variable1, variable2]].min(axis=1)
    except Exception:
        DataFrame[to_variable] = None
        print('Row min({}, {}) Error: {}'.format(variable1, variable2, traceback.format_exc()))

    parameters = {}
    script_dict = generate_create_variable_task_script(type='comparison', out_type='cnt',
                                                       include=False, operation='rowmin',
                                                       source=[variable1, variable2],
                                                       destination=to_variable,
                                                       parameters=parameters)
    return _pack_feature_result(DataFrame, to_variable, script_dict, return_variable, return_script)


def create_row_max_variable(DataFrame, variable1, variable2, to_variable=None, return_variable=False, return_script=False):
    """
    Create the row-wise maximum of two columns as a new column.

    Parameters
    ----------
    DataFrame : pandas.DataFrame
        Frame to modify in place.
    variable1, variable2 : str
        Source column names.
    to_variable : str, optional
        Name of the output column. Defaults to ``'{variable1}MAX{variable2}'``.
    return_variable : bool, default False
        Also return the output column name.
    return_script : bool, default False
        Also return the generated task script.

    Returns
    -------
    DataFrame, optionally followed by ``to_variable`` and/or the task script.
    On failure the output column is set to ``None`` and the error is printed.
    """
    if to_variable is None:
        to_variable = '{}MAX{}'.format(variable1, variable2)

    try:
        DataFrame[to_variable] = DataFrame[[variable1, variable2]].max(axis=1)
    except Exception:
        DataFrame[to_variable] = None
        print('Row max({}, {}) Error : {}'.format(variable1, variable2, traceback.format_exc()))

    parameters = {}
    script_dict = generate_create_variable_task_script(type='comparison', out_type='cnt',
                                                       include=False, operation='rowmax',
                                                       source=[variable1, variable2],
                                                       destination=to_variable,
                                                       parameters=parameters)
    return _pack_feature_result(DataFrame, to_variable, script_dict, return_variable, return_script)


###############################################################################
##[ CREATING FEATURES - STR COMPARISON ]#######################################
###############################################################################
def create_str_comparison_variable(DataFrame, variable1, variable2, to_variable=None, operation='levenshtein', parameters=None, return_variable=False, return_script=False):
    """
    Create a string-similarity column from two text columns.

    Parameters
    ----------
    DataFrame : pandas.DataFrame
        Frame to modify in place.
    variable1, variable2 : str
        Source column names.
    to_variable : str, optional
        Name of the output column. Defaults to ``'{variable1}SIM{variable2}'``.
    operation : {'levenshtein', 'jaccard'}, default 'levenshtein'
        Comparison to apply element-wise. 'levenshtein' calls
        ``damerau_levenshtein_distance`` and 'jaccard' calls ``jaccard_index``
        (both presumably provided by ``mltk.string`` - confirm).
    parameters : dict, optional
        Options for the comparison. Missing keys fall back to:
        'case_sensitive' True; for 'levenshtein': 'normalize' False;
        for 'jaccard': 'method' 'substring', 'min_length' 1,
        'max_length' ``np.inf``. Defaults to an empty dict.
    return_variable : bool, default False
        Also return the output column name.
    return_script : bool, default False
        Also return the generated task script.

    Returns
    -------
    DataFrame, optionally followed by ``to_variable`` and/or the task script.

    Notes
    -----
    For any other ``operation`` no column is created; a message is printed
    and the task script is still generated.
    """
    if to_variable is None:
        to_variable = '{}SIM{}'.format(variable1, variable2)

    # A fresh dict per call; a shared mutable default would leak between calls.
    if parameters is None:
        parameters = {}

    case_sensitive = parameters.get('case_sensitive', True)

    if operation == 'levenshtein':
        normalize = parameters.get('normalize', False)
        DataFrame[to_variable] = np.vectorize(damerau_levenshtein_distance)(
            DataFrame[variable1], DataFrame[variable2], case_sensitive, normalize)
    elif operation == 'jaccard':
        method = parameters.get('method', 'substring')
        min_length = parameters.get('min_length', 1)
        max_length = parameters.get('max_length', np.inf)
        DataFrame[to_variable] = np.vectorize(jaccard_index)(
            DataFrame[variable1], DataFrame[variable2], method, case_sensitive, min_length, max_length)
    else:
        print('ERROR in create_str_comparison_variable:\n'
              'unknown operation {}; {} not created'.format(operation, to_variable))

    script_dict = generate_create_variable_task_script(type='comparison_str', out_type='cnt',
                                                       include=False, operation=operation,
                                                       source=[variable1, variable2],
                                                       destination=to_variable,
                                                       parameters=parameters)
    return _pack_feature_result(DataFrame, to_variable, script_dict, return_variable, return_script)
###############################################################################
##[ CREATING FEATURES - BINARY 
VARIABLES]###################################### 1298############################################################################### 1299 1300def create_binary_variable(DataFrame, to_variable, condition_str, default=0, null=0, return_variable=False, return_script=False):1301 1302 if to_variable==None:1303 to_variable = '{}'.format(condition_str)1304 try: 1305 DataFrame[to_variable] = DataFrame.eval(condition_str).astype('int8').fillna(null)1306 DataFrame.loc[DataFrame[to_variable].isna(), to_variable] = default1307 except:1308 print('Error in creating the binary variable {}:\n{}\n'.format(condition_str, traceback.format_exc()))1309 print('Check variable rule set !')1310 parameters = { 1311 'condition_str':condition_str,1312 'default': default,1313 'null': null1314 }1315 script_dict = generate_create_variable_task_script(type='condition', out_type='bin', 1316 include=False, operation='condition', 1317 source=None, 1318 destination=to_variable, 1319 parameters=parameters) 1320 1321 if return_script and return_variable:1322 return DataFrame, to_variable, script_dict1323 elif return_script:1324 return DataFrame, script_dict1325 elif return_variable:1326 return DataFrame, to_variable1327 else:1328 return DataFrame 1329 1330###############################################################################1331##[ CREATING FEATURES - CATEGORY LABELS]####################################### 1332############################################################################### 1333 1334def num_label_to_value(num_label):1335 units = {'p':0.000000000001,1336 'n':0.000000001,1337 'u':0.000001,1338 'm':0.001,1339 'c':0.01,1340 'd':0.1,1341 '':1,1342 'D':10,1343 'H':100,1344 'K':1000,1345 'M':1000000,1346 'G':1000000000,1347 'T':1000000000000,1348 'P':1000000000000000,1349 'E':1000000000000000000,1350 'INF':np.inf 1351 }1352 try:1353 sign, inf, num, unit = re.findall('^([-]?)((\d+)([pnumcdDHKMGTPE]?)|INF)$', num_label.rstrip().lstrip())[0]1354 if inf=='INF':1355 value = 
int('{}1'.format(sign))*np.inf1356 else:1357 value = int('{}1'.format(sign))*float(num)*units[unit]1358 except:1359 print('vnum_label_value failed !\n{}'.format(traceback.format_exc()))1360 value = None1361 return value1362def edge_labels_to_values(edge_labels, left_inclusive=False, right_inclusive=False):1363 """1364 Parameters1365 ----------1366 edge_labels : str []1367 Edge labels with number unit as postfix1368 'p':0.000000000001,1369 'n':0.000000001,1370 'u':0.000001,1371 'm':0.001,1372 'c':0.01,1373 'd':0.1,1374 '':1,1375 'D':10,1376 'H':100,1377 'K':1000,1378 'M':1000000,1379 'G':1000000000,1380 'T':1000000000000,1381 'P':1000000000000000,1382 'INF':np.inf 1383 left_inclusive : bool, default False1384 Include left edge1385 right_inclusive : bool, default False1386 Include right edge1387 1388 Returns1389 -------1390 edge_values : numeric []1391 bin_labels : str []1392 """ 1393 edge_values = []1394 bin_labels = []1395 n_bins = len(edge_labels)-11396 i=01397 for i in range(n_bins): 1398 l_bracket = '(' if (i==0 and edge_labels[i]=='-INF') or (not left_inclusive) else '['1399 r_bracket = ')' if (i==n_bins-1 and edge_labels[i+1]=='INF') or (not right_inclusive) else ']'1400 edge_values.append(num_label_to_value(edge_labels[i]))1401 bin_labels.append('{}_{}{},{}{}'.format(i+1, l_bracket, edge_labels[i], edge_labels[i+1], r_bracket))1402 edge_values.append(num_label_to_value(edge_labels[n_bins]))1403 return edge_values,bin_labels1404###############################################################################1405##[ CREATING FEATURES - CATEGORY]############################################## 1406############################################################################### 1407 1408def create_categorical_variable(DataFrame, variable, to_variable, labels_str, right_inclusive=True, default='OTHER', null='NA', return_variable=False, return_script=False):1409 1410 if to_variable==None:1411 to_variable = '{}GRP'.format(variable)1412 1413 try:1414 default_ = 
'0_{}'.format(default)1415 null_ = '0_{}'.format(null)1416 except:1417 default_ = '0_Other'1418 null_ = '0_NA'1419 edge_values, bin_labels = edge_labels_to_values(labels_str, left_inclusive=not right_inclusive, right_inclusive=right_inclusive)1420 1421 try: 1422 DataFrame[to_variable] = pd.cut(DataFrame[variable], bins=edge_values, labels=bin_labels, right=right_inclusive, include_lowest=True).astype('object')1423 except:1424 DataFrame[to_variable] = null_1425 DataFrame.loc[DataFrame[variable].isna(), to_variable] = null_1426 DataFrame.loc[DataFrame[to_variable].isna(), to_variable] = default_1427 parameters = { 1428 'labels_str':labels_str,1429 'right_inclusive': right_inclusive,1430 'default': default,1431 'null': null1432 }1433 script_dict = generate_create_variable_task_script(type='category', out_type='cat', 1434 include=False, operation='bucket', 1435 source=variable, 1436 destination=to_variable, 1437 parameters=parameters) 1438 1439 if return_script and return_variable:1440 return DataFrame, to_variable, script_dict1441 elif return_script:1442 return DataFrame, script_dict1443 elif return_variable:1444 return DataFrame, to_variable1445 else:1446 return DataFrame 1447def merge_categories(DataFrame, variable, to_variable, values, group_value, return_variable=False, return_script=False):1448 if to_variable==None:1449 to_variable = variable1450 1451 try: 1452 DataFrame[to_variable] = DataFrame[variable].replace(to_replace=values, value=group_value)1453 except:1454 print('ERROR in creating the categorical variable merge {}:\n{}\n'.format(variable, traceback.format_exc()))1455 print('Check variable rule set !')1456 1457 parameters = { 1458 'group_value':group_value,1459 'values': values1460 }1461 script_dict = generate_create_variable_task_script(type='category_merge', out_type='cat', 1462 include=False, operation='catmerge', 1463 source=variable, 1464 destination=to_variable, 1465 parameters=parameters) 1466 if return_script and return_variable:1467 return 
DataFrame, to_variable, script_dict1468 elif return_script:1469 return DataFrame, script_dict1470 elif return_variable:1471 return DataFrame, to_variable1472 else:1473 return DataFrame 1474 1475###############################################################################1476##[ CREATING FEATURES - ENTITY (DICTIONARY) ]################################## 1477###############################################################################1478def create_entity_variable(DataFrame, variable, to_variable, dictionary, match_type=None, default='OTHER', null='NA', return_variable=False, return_script=False):1479 if to_variable==None:1480 to_variable = '{}GRP'.format(variable)1481 1482 if to_variable != variable:1483 DataFrame[to_variable] = None1484 1485 for entity in reversed(dictionary): 1486 try:1487 case=entity['case']1488 except:1489 case=True1490 1491 if (match_type=='values') or ('values' in entity.keys()):1492 if case==True:1493 DataFrame.loc[DataFrame[variable].isin(entity['values']), to_variable] = entity['entity']1494 else:1495 values = [x.lower() for x in entity['values']] 1496 DataFrame.loc[DataFrame[variable].str.lower().isin(values), to_variable] = entity['entity']1497 elif (match_type=='pattern') or ('pattern' in entity.keys()):1498 DataFrame.loc[DataFrame[variable].fillna('').str.contains(pat=entity['pattern'], case=case), to_variable] = entity['entity']1499 else:1500 print('Entity {} not created !'.format(entity))1501 1502 DataFrame.loc[DataFrame[variable].isna(), to_variable] = null1503 DataFrame.loc[DataFrame[to_variable].isna(), to_variable] = default1504 parameters = { 1505 'match_type':match_type,1506 'dictionary': dictionary,1507 'default': default,1508 'null': null1509 }1510 script_dict = generate_create_variable_task_script(type='entity', out_type='cat', 1511 include=False, operation='dictionary', 1512 source=variable, 1513 destination=to_variable, 1514 parameters=parameters) 1515 1516 if return_script and return_variable:1517 return DataFrame, 
to_variable, script_dict1518 elif return_script:1519 return DataFrame, script_dict1520 elif return_variable:1521 return DataFrame, to_variable1522 else:1523 return DataFrame 1524def create_value_pair_variable(DataFrame, variable1, variable2, to_variable, dictionary, match_type=None, default='OTHER', null='NA', return_variable=False, return_script=False):1525 if to_variable==None:1526 to_variable = '{}GRP{}'.format(variable1, variable2)1527 1528 if to_variable != variable1 and to_variable != variable2 :1529 DataFrame[to_variable] = None1530 1531 for entity in reversed(dictionary): 1532 try:1533 case=entity['case']1534 except:1535 case=True1536 1537 try:1538 opperator = entity['opperator']1539 except:1540 opperator='AND'1541 1542 if (match_type=='values') or ('values' in entity.keys()):1543 if case==True:1544 if opperator=='AND':1545 DataFrame.loc[(DataFrame[variable1]==entity['values'][0]) & (DataFrame[variable2]==entity['values'][1]), to_variable] = entity['entity']1546 elif opperator=='OR':1547 DataFrame.loc[(DataFrame[variable1]==entity['values'][0]) | (DataFrame[variable2]==entity['values'][1]), to_variable] = entity['entity']1548 elif opperator=='NOT':1549 DataFrame.loc[(DataFrame[variable1]==entity['values'][0]) & (DataFrame[variable2]!=entity['values'][1]), to_variable] = entity['entity']1550 elif opperator=='^NOT':1551 DataFrame.loc[(DataFrame[variable1]!=entity['values'][0]) & (DataFrame[variable2]==entity['values'][1]), to_variable] = entity['entity']1552 else:1553 values = [x.lower() for x in entity['values']] 1554 if opperator=='AND':1555 DataFrame.loc[(DataFrame[variable1].str.lower()==values[0]) & (DataFrame[variable2].str.lower()==values[1]), to_variable] = entity['entity']1556 elif opperator=='OR':1557 DataFrame.loc[(DataFrame[variable1].str.lower()==values[0]) | (DataFrame[variable2].str.lower()==values[1]), to_variable] = entity['entity']1558 elif opperator=='NOT':1559 DataFrame.loc[(DataFrame[variable1].str.lower()==values[0]) & 
(DataFrame[variable2].str.lower()!=values[1]), to_variable] = entity['entity']1560 elif opperator=='^NOT':1561 DataFrame.loc[(DataFrame[variable1].str.lower()!=values[0]) & (DataFrame[variable2].str.lower()==values[1]), to_variable] = entity['entity']1562 1563 elif (match_type=='pattern') or ('pattern' in entity.keys()):1564 if opperator=='AND':1565 DataFrame.loc[(DataFrame[variable1].fillna('').str.contains(pat=entity['values'][0], case=case)) & (DataFrame[variable2].fillna('').str.contains(pat=entity['values'][1], case=case)), to_variable] = entity['entity']1566 elif opperator=='OR':1567 DataFrame.loc[(DataFrame[variable1].fillna('').str.contains(pat=entity['values'][0], case=case)) | (DataFrame[variable2].fillna('').str.contains(pat=entity['values'][1], case=case)), to_variable] = entity['entity'] 1568 else:1569 print('Entity {} not created !'.format(entity))1570 1571 DataFrame.loc[(DataFrame[variable1].isna()) & (DataFrame[variable2].isna()), to_variable] = null1572 DataFrame.loc[DataFrame[to_variable].isna(), to_variable] = default1573 parameters = { 1574 'match_type':match_type,1575 'dictionary': dictionary,1576 'default': default,1577 'null': null1578 }1579 script_dict = generate_create_variable_task_script(type='entity', out_type='cat', 1580 include=False, operation='valuepairs', 1581 source=[variable1, variable2],1582 destination=to_variable, 1583 parameters=parameters) 1584 1585 if return_script and return_variable:1586 return DataFrame, to_variable, script_dict1587 elif return_script:1588 return DataFrame, script_dict1589 elif return_variable:1590 return DataFrame, to_variable1591 else:1592 return DataFrame 1593 1594###############################################################################1595##[ CREATING FEATURES - PAIR EQUALITY ]######################################## 1596###############################################################################1597def create_pair_equality_variable(DataFrame, variable1, variable2, to_variable, 
magnitude=False, case=True, return_variable=False, return_script=False):1598 if to_variable==None:1599 to_variable = '{}CMP{}'.format(variable1,variable2)1600 1601 DataFrame.loc[(DataFrame[variable1]==DataFrame[variable2]), to_variable] = 'EQ'1602 DataFrame.loc[(DataFrame[variable1]!=DataFrame[variable2]), to_variable] = 'DF'1603 DataFrame.loc[(DataFrame[variable1].isna()) | (DataFrame[variable2].isna()), to_variable] = 'ON'1604 DataFrame.loc[(DataFrame[variable1].isna()) & (DataFrame[variable2].isna()), to_variable] = 'BN'1605 parameters = { 1606 'magnitude':magnitude,1607 'case': case1608 }1609 script_dict = generate_create_variable_task_script(type='pair_equality', out_type='cat', 1610 include=False, operation='pairequality', 1611 source=[variable1, variable2], 1612 destination=to_variable, 1613 parameters=parameters) 1614 1615 if return_script and return_variable:1616 return DataFrame, to_variable, script_dict1617 elif return_script:1618 return DataFrame, script_dict1619 elif return_variable:1620 return DataFrame, to_variable1621 else:1622 return DataFrame 1623###############################################################################1624 1625###############################################################################1626##[ CREATING FEATURES TASK - TARGET ]########################################## 1627###############################################################################1628def create_target_variable_task(DataFrame, rule_set, return_variable=False, return_script=False):1629 to_variable = rule_set['variables']['destination']1630 operation = rule_set['operation'] 1631 parameters = rule_set['parameters']1632 1633 target_condition_str = parameters['condition_str']1634 default = parameters['default']1635 null = parameters['null']1636 1637 DataFrame, to_variable, script_dict = set_binary_target(DataFrame, condition_str=target_condition_str, 1638 to_variable=to_variable, default=default, null=null, return_variable=True, return_script=True)1639 if 
return_script and return_variable:1640 return DataFrame, to_variable, script_dict1641 elif return_script:1642 return DataFrame, script_dict1643 elif return_variable:1644 return DataFrame, to_variable1645 else:1646 return DataFrame 1647 1648###############################################################################1649##[ CREATING FEATURES TASK - TRANSFORM ]####################################### 1650###############################################################################1651def create_transformed_variable_task(DataFrame, rule_set, return_variable=False, return_script=False):1652 variable = rule_set['variables']['source']1653 to_variable = rule_set['variables']['destination']1654 operation = rule_set['operation'] 1655 parameters = rule_set['parameters']1656 1657 if operation=='normalize':1658 method = rule_set['parameters']['method'] 1659 DataFrame, to_variable, script_dict = create_normalized_variable(DataFrame, variable, method=method, parameters=parameters, to_variable=to_variable, return_variable=True, return_script=True)1660 elif operation=='datepart':1661 part = rule_set['parameters']['part'] 1662 DataFrame, to_variable, script_dict = create_datepart_variable(DataFrame, variable, part=part, to_variable=to_variable, return_variable=True, return_script=True)1663 elif operation=='dateadd':1664 unit = rule_set['parameters']['unit'] 1665 value = rule_set['parameters']['value'] 1666 DataFrame, to_variable, script_dict = create_dateadd_variable(DataFrame, variable, unit=unit, value=value, to_variable=to_variable, return_variable=True, return_script=True)1667 elif operation=='log':1668 base = rule_set['parameters']['base'] 1669 DataFrame, to_variable, script_dict = create_log_variable(DataFrame, variable, base=base, to_variable=to_variable, return_variable=True, return_script=True)1670 elif operation=='exponent':1671 base = rule_set['parameters']['base'] 1672 DataFrame, to_variable, script_dict = create_exponent_variable(DataFrame, variable, base=base, 
to_variable=to_variable, return_variable=True, return_script=True)1673 elif operation=='exponent':1674 a = rule_set['parameters']['a'] 1675 b = rule_set['parameters']['b'] 1676 DataFrame, to_variable, script_dict = create_segmented_variable(DataFrame, variable, a=a, b=b, to_variable=to_variable, return_variable=True, return_script=True)1677 else:1678 pass # other transformations to be implemented1679 1680 if return_script and return_variable:1681 return DataFrame, to_variable, script_dict1682 elif return_script:1683 return DataFrame, script_dict1684 elif return_variable:1685 return DataFrame, to_variable1686 else:1687 return DataFrame 1688def create_str_transformed_variable_task(DataFrame, rule_set, return_variable=False, return_script=False):1689 variable = rule_set['variables']['source']1690 to_variable = rule_set['variables']['destination']1691 operation = rule_set['operation'] 1692 parameters = rule_set['parameters']1693 if operation=='strcount':1694 pattern = parameters['pattern']1695 case_sensitive = parameters['case_sensitive']1696 DataFrame, to_variable, script_dict = create_str_count_variable(DataFrame, variable, pattern=pattern, case_sensitive=case_sensitive, to_variable=to_variable, return_variable=True, return_script=True) 1697 elif operation=='normalize':1698 to_case = parameters['to_case']1699 chars = parameters['chars']1700 numbers = parameters['numbers'] 1701 spchar = parameters['spchar']1702 space = parameters['space'] 1703 DataFrame, to_variable, script_dict = create_str_normalized_variable(DataFrame, variable, 1704 to_case=to_case, 1705 chars=chars, 1706 numbers=numbers, 1707 spchar=spchar, 1708 space=space, 1709 to_variable=None, return_variable=False, return_script=True)1710 elif operation=='extract':1711 pattern = parameters['pattern']1712 case_sensitive = parameters['case_sensitive']1713 DataFrame, to_variable, script_dict = create_str_extract_variable(DataFrame, variable, pattern=pattern, 1714 case_sensitive=case_sensitive, 1715 
to_variable=to_variable, return_variable=True, return_script=True) 1716 if return_script and return_variable:1717 return DataFrame, to_variable, script_dict1718 elif return_script:1719 return DataFrame, script_dict1720 elif return_variable:1721 return DataFrame, to_variable1722 else:1723 return DataFrame 1724###############################################################################1725##[ CREATING FEATURES TASK - MLTI VARIAVLE ]################################### 1726###############################################################################1727 1728def create_operation_mult_variable_task(DataFrame, rule_set, return_variable=False, return_script=False):1729 variable = rule_set['variables']['source']1730 to_variable = rule_set['variables']['destination']1731 operation = rule_set['operation'] 1732 parameters = rule_set['parameters'] 1733 expression_str = parameters['expression_str']1734 1735 DataFrame, to_variable, script_dict = create_operation_mult_variable(DataFrame, expression_str=expression_str, 1736 to_variable=to_variable, return_variable=True, return_script=True)1737 1738 if return_script and return_variable:1739 return DataFrame, to_variable, script_dict1740 elif return_script:1741 return DataFrame, script_dict1742 elif return_variable:1743 return DataFrame, to_variable1744 else:1745 return DataFrame 1746###############################################################################1747##[ CREATING FEATURES TASK - SEQUENCE ORDER ]################################## 1748###############################################################################1749def create_sequence_order_variable_task(DataFrame, rule_set, return_variable=False, return_script=False):1750 variable1a = rule_set['variables']['source1a']1751 variable2a = rule_set['variables']['source2a']1752 variable1b = rule_set['variables']['source1b']1753 variable2b = rule_set['variables']['source2b']1754 to_variable = rule_set['variables']['destination']1755 1756 DataFrame, to_variable, 
script_dict = create_sequence_order_variable(DataFrame, variable1a, variable2a, variable1b, variable2b, output='binary', 1757 to_variable=to_variable, return_variable=True, return_script=True)1758 1759 if return_script and return_variable:1760 return DataFrame, to_variable, script_dict1761 elif return_script:1762 return DataFrame, script_dict1763 elif return_variable:1764 return DataFrame, to_variable1765 else:1766 return DataFrame 1767 1768###############################################################################1769##[ CREATING FEATURES TASK - COMPARISON ]###################################### 1770###############################################################################1771def create_comparison_variable_task(DataFrame, rule_set, return_variable=False, return_script=False):1772 variable1 = rule_set['variables']['source1']1773 variable2 = rule_set['variables']['source2']1774 to_variable = rule_set['variables']['destination']1775 operation = rule_set['operation']1776 parameters = rule_set['parameters']1777 1778 try:1779 multiplier = parameters['multiplier']1780 except:1781 multiplier=1 1782 1783 try:1784 unit = parameters['unit']1785 except:1786 unit = 'D'1787 onerror = None # parameters['onerror']1788 1789 if operation=='numdiff': 1790 DataFrame, to_variable, script_dict = create_numeric_difference_variable(DataFrame, variable1, variable2, multiplier=multiplier, onerror=onerror, to_variable=to_variable, return_variable=True, return_script=True)1791 elif operation=='datediff':1792 DataFrame, to_variable, script_dict = create_date_difference_variable(DataFrame, variable1, variable2, unit=unit, onerror=onerror, to_variable=to_variable, return_variable=True, return_script=True)1793 elif operation=='rowmin':1794 DataFrame, to_variable, script_dict = create_row_min_variable(DataFrame, variable1, variable2, to_variable=to_variable, return_variable=True, return_script=True)1795 elif operation=='rowmax':1796 DataFrame, to_variable, script_dict = 
create_row_max_variable(DataFrame, variable1, variable2, to_variable=to_variable, return_variable=True, return_script=True) 1797 1798 if return_script and return_variable:1799 return DataFrame, to_variable, script_dict1800 elif return_script:1801 return DataFrame, script_dict1802 elif return_variable:1803 return DataFrame, to_variable1804 else:1805 return DataFrame 1806 1807def create_str_comparison_variable_task(DataFrame, rule_set, return_variable=False, return_script=False):1808 variable1 = rule_set['variables']['source1']1809 variable2 = rule_set['variables']['source2']1810 to_variable = rule_set['variables']['destination']1811 operation = rule_set['operation'] 1812 parameters = rule_set['parameters']1813 1814 DataFrame, to_variable, script_dict = create_str_comparison_variable(DataFrame, variable1=variable1, variable2=variable2, to_variable=to_variable, operation=operation, parameters=parameters, 1815 return_variable=True, return_script=True)1816 1817 if return_script and return_variable:1818 return DataFrame, to_variable, script_dict1819 elif return_script:1820 return DataFrame, script_dict1821 elif return_variable:1822 return DataFrame, to_variable1823 else:1824 return DataFrame 1825###############################################################################1826##[ CREATING FEATURES TASK - BINARY VARIABLE ]################################# 1827###############################################################################1828def create_binary_variable_task(DataFrame, rule_set, return_variable=False, return_script=False):1829 #variable = rule_set['variables']['source']1830 to_variable = rule_set['variables']['destination']1831 parameters = rule_set['parameters']1832 condition_str = parameters['condition_str']1833 default = parameters['default']1834 null = parameters['null']1835 1836 DataFrame, to_variable, script_dict = create_binary_variable(DataFrame, to_variable, condition_str, default, null, 1837 return_variable=True, return_script=True)1838 1839 if 
return_script and return_variable:1840 return DataFrame, to_variable, script_dict1841 elif return_script:1842 return DataFrame, script_dict1843 elif return_variable:1844 return DataFrame, to_variable1845 else:1846 return DataFrame 1847###############################################################################1848##[ CREATING FEATURES TASK - CATEGORY VARIABLE ]############################### 1849############################################################################### 1850def create_categorical_variable_task(DataFrame, rule_set, return_variable=False, return_script=False): 1851 variable = rule_set['variables']['source']1852 to_variable = rule_set['variables']['destination']1853 operation = rule_set['operation'] 1854 parameters = rule_set['parameters']1855 labels_str = parameters['labels_str']1856 right_inclusive = parameters['right_inclusive'] 1857 default = parameters['default']1858 null = parameters['null']1859 1860 DataFrame, to_variable, script_dict = create_categorical_variable(DataFrame, variable, to_variable, labels_str, right_inclusive, default, null, 1861 return_variable=True, return_script=True)1862 1863 if return_script and return_variable:1864 return DataFrame, to_variable, script_dict1865 elif return_script:1866 return DataFrame, script_dict1867 elif return_variable:1868 return DataFrame, to_variable1869 else:1870 return DataFrame 1871###############################################################################1872##[ CREATING FEATURES TASK - ENTITY VARIABLE ]################################# 1873############################################################################### 1874def create_entity_variable_task(DataFrame, rule_set, return_variable=False, return_script=False):1875 1876 to_variable = rule_set['variables']['destination']1877 parameters = rule_set['parameters']1878 match_type = parameters['match_type']1879 dictionary = parameters['dictionary'] 1880 default = parameters['default']1881 null = parameters['null']1882 operation = 
rule_set['operation']1883 if operation == 'dictionary':1884 variable = rule_set['variables']['source'] 1885 DataFrame, to_variable, script_dict = create_entity_variable(DataFrame, variable=variable, to_variable=to_variable, 1886 dictionary=dictionary, match_type=match_type, default=default, null=null, 1887 return_variable=True, return_script=True)1888 elif operation == 'valuepairs':1889 variable1 = rule_set['variables']['source1'] 1890 variable2 = rule_set['variables']['source2'] 1891 DataFrame, to_variable, script_dict = create_value_pair_variable(DataFrame, variable1, variable2, to_variable, 1892 dictionary, match_type=None, default='OTHER', null='NA', 1893 return_variable=True, return_script=True)1894 if return_script and return_variable:1895 return DataFrame, to_variable, script_dict1896 elif return_script:1897 return DataFrame, script_dict1898 elif return_variable:1899 return DataFrame, to_variable1900 else:1901 return DataFrame 1902###############################################################################1903##[ CREATING FEATURES TASK - PAIR EQUALITY ]################################### 1904############################################################################### 1905def create_pair_equality_variable_task(DataFrame, rule_set, return_variable=False, return_script=False): 1906 variable1 = rule_set['variables']['source1']1907 variable2 = rule_set['variables']['source2']1908 to_variable = rule_set['variables']['destination']1909 parameters = rule_set['parameters']1910 try:1911 magnitude = parameters['magnitude']1912 except:1913 magnitude = 11914 case = parameters['case']1915 1916 DataFrame, to_variable, script_dict = create_pair_equality_variable(DataFrame, variable1=variable1, variable2=variable2, to_variable=to_variable, magnitude=magnitude, case=case, 1917 return_variable=True, return_script=True)1918 1919 if return_script and return_variable:1920 return DataFrame, to_variable, script_dict1921 elif return_script:1922 return DataFrame, 
script_dict1923 elif return_variable:1924 return DataFrame, to_variable1925 else:1926 return DataFrame 1927###############################################################################1928##[ CREATING FEATURES TASK - MERGE CATEGORY ]################################## 1929###############################################################################1930def merge_categories_task(DataFrame, rule_set, return_variable=False, return_script=False):1931 variable = rule_set['variables']['source']1932 to_variable = rule_set['variables']['destination']1933 values = rule_set['parameters']['values']1934 group_value = rule_set['parameters']['group_value']1935 1936 DataFrame, to_variable, script_dict = merge_categories(DataFrame, variable=variable, to_variable=to_variable, values=values, group_value=group_value, 1937 return_variable=True, return_script=True) 1938 if return_script and return_variable:1939 return DataFrame, to_variable, script_dict1940 elif return_script:1941 return DataFrame, script_dict1942 elif return_variable:1943 return DataFrame, to_variable1944 else:1945 return DataFrame 1946###############################################################################1947 1948###############################################################################1949##[ ENCODER ]################################################################## 1950############################################################################### 1951def to_one_hot_encode(DataFrame, category_variables=[], binary_variables=[], target_variable='target', target_type='binary'):1952 # TO DO: If target type is 'multi' apply one hot encoding to target1953 feature_variables = []1954 try:1955 VariablesDummies = pd.get_dummies(DataFrame[category_variables]).astype('int8')1956 dummy_variables = list(VariablesDummies.columns.values)1957 DataFrame[dummy_variables] = VariablesDummies1958 except:1959 print('Category columns {} does not specified nor exists'.format(category_variables))1960 1961 try:1962 
DataFrame[binary_variables] = DataFrame[binary_variables].astype('int8')1963 except:1964 print('Binary columns {} does not specified nor exists'.format(binary_variables))1965 1966 try: 1967 feature_variables = binary_variables+dummy_variables1968 except:1969 print('Error in creating feature variables.')1970 1971 return DataFrame, feature_variables, target_variable1972###############################################################################1973##[ ML MODEL DRIVER ]########################################################## 1974############################################################################### 1975def load_data_task(load_data_dict, return_name=False):1976 """1977 Parameters1978 ----------1979 load_data_dict: dict1980 e.g.: {1981 "type": "csv",1982 "location": "local",1983 "workclass": "Private",1984 "source": {"path":"C:/Projects/Data/incomedata.csv", "separator":",", "encoding":null},1985 "auth": None,1986 "query": None,1987 "limit": None1988 }1989 1990 Returns1991 -------1992 DataFrame: pandas.DataFrame1993 data_name: str1994 """ 1995 import json1996 if type(load_data_dict)==dict:1997 pass1998 else:1999 try:2000 load_data_dict = json.loads(load_data_dict) 2001 except:2002 print('ERROR in loading data:{}\n {}'.format(load_data_dict, traceback.format_exc())) 2003 2004 data_name = load_data_dict['data_name']2005 2006 if load_data_dict['type']=='csv':2007 DataFrame = read_data_csv(2008 file=load_data_dict['source']['path'], 2009 separator=load_data_dict['source']['separator'], 2010 encoding=load_data_dict['source']['encoding']2011 )2012 elif load_data_dict['type']=='pickle':2013 DataFrame = read_data_pickle(2014 file=load_data_dict['source']['path'], 2015 compression =load_data_dict['source']['compression']2016 ) 2017 elif load_data_dict['type']=='sql':2018 DataFrame = read_data_sql(2019 query=load_data_dict['query'], 2020 server=load_data_dict['source']['server'], 2021 database=load_data_dict['source']['database'],2022 auth=load_data_dict['auth']2023 
) 2024 else:2025 print("No valid data source provided!")2026 DataFrame = pd.DataFrame() 2027 # Add ID column2028 DataFrame = add_identity_column(DataFrame, id_label='ID', start=1, increment=1)2029 # Clean column names2030 DataFrame = clean_column_names(DataFrame, replace='')2031 2032 if return_name: 2033 return DataFrame, data_name2034 else:2035 return DataFrame2036###############################################################################2037def create_variable_task(DataFrame, create_variable_task_dict=None, return_extra=False, return_script=False):2038 """2039 Interface function for single variable operation2040 Parameters2041 ----------2042 DataFrame: pandas.DataFrame2043 create_variable_task_dict : dict or JSON2044 return_extra : bool, default False2045 Returns variable_class and include if True2046 2047 Returns2048 -------2049 DataFrame: pandas.DataFrame2050 data_name: str2051 variable_class : str, optional2052 include: bool, optional2053 """ 2054 import json2055 if type(create_variable_task_dict)==dict:2056 pass2057 else:2058 try:2059 create_variable_task_dict = json.loads(create_variable_task_dict) 2060 except:2061 print('ERROR in creating variable:{}\n {}'.format(create_variable_task_dict, traceback.format_exc())) 2062 2063 rule_set = {2064 'operation':create_variable_task_dict['operation'],2065 'variables':create_variable_task_dict['variables'],2066 'parameters':create_variable_task_dict['parameters']2067 }2068 out_type = create_variable_task_dict['out_type']2069 include = create_variable_task_dict['include'] 2070 try:2071 if create_variable_task_dict['type']=='target':2072 DataFrame, output_variable, script_dict = create_target_variable_task(DataFrame, rule_set, return_variable=True, return_script=True) 2073 if create_variable_task_dict['type']=='transform':2074 DataFrame, output_variable, script_dict = create_transformed_variable_task(DataFrame, rule_set, return_variable=True, return_script=True) 2075 elif 
create_variable_task_dict['type']=='str_transform':2076 DataFrame, output_variable, script_dict = create_str_transformed_variable_task(DataFrame, rule_set, return_variable=True, return_script=True) 2077 elif create_variable_task_dict['type']=='operation_mult':2078 DataFrame, output_variable, script_dict = create_operation_mult_variable_task(DataFrame, rule_set, return_variable=True, return_script=True) 2079 elif create_variable_task_dict['type']=='seq_order':2080 DataFrame, output_variable, script_dict = create_sequence_order_variable_task(DataFrame, rule_set, return_variable=True, return_script=True)2081 elif create_variable_task_dict['type']=='comparison':2082 DataFrame, output_variable, script_dict = create_comparison_variable_task(DataFrame, rule_set, return_variable=True, return_script=True) 2083 elif create_variable_task_dict['type']=='str_comparison':2084 DataFrame, output_variable, script_dict = create_str_comparison_variable_task(DataFrame, rule_set, return_variable=True, return_script=True) 2085 elif create_variable_task_dict['type']=='condition':2086 DataFrame, output_variable, script_dict = create_binary_variable_task(DataFrame, rule_set, return_variable=True, return_script=True) 2087 elif create_variable_task_dict['type']=='category':2088 DataFrame, output_variable, script_dict = create_categorical_variable_task(DataFrame, rule_set, return_variable=True, return_script=True) 2089 elif create_variable_task_dict['type']=='entity':2090 DataFrame, output_variable, script_dict = create_entity_variable_task(DataFrame, rule_set, return_variable=True, return_script=True) 2091 elif create_variable_task_dict['type']=='pair_equality':2092 DataFrame, output_variable, script_dict = create_pair_equality_variable_task(DataFrame, rule_set, return_variable=True, return_script=True) 2093 elif create_variable_task_dict['type']=='category_merge':2094 DataFrame, output_variable, script_dict = merge_categories_task(DataFrame, rule_set, return_variable=True, 
return_script=True) 2095 else:2096 output_variable= None 2097 out_type = None2098 include = False2099 script_dict= {2100 "type": "",2101 "out_type":"",2102 "include": False,2103 "operation": "",2104 "variables": {2105 "source": "",2106 "destination": None2107 },2108 "parameters": { 2109 }2110 }2111 except:2112 output_variable= None 2113 out_type = None2114 include = False2115 script_dict= {2116 "type": "",2117 "out_type":"",2118 "include": False,2119 "operation": "",2120 "variables": {2121 "source": "",2122 "destination": None2123 },2124 "parameters": { 2125 }2126 } 2127 2128 if return_script and return_extra:2129 return DataFrame, output_variable, out_type, include, script_dict2130 if return_script:2131 return DataFrame, script_dict2132 if return_extra: 2133 return DataFrame, output_variable, out_type, include 2134 else:2135 return DataFrame, output_variable2136def setup_variables_task(DataFrame, variables_setup_dict, return_script=False):2137 """2138 Parameters2139 ----------2140 DataFrame: pandas.DataFrame2141 variables_setup_dict: json or dict2142 2143 2144 Returns2145 -------2146 DataFrame: pandas.DataFrame2147 category_variables: list(str)2148 binary_variables: list(str)2149 target_variable: list(str)2150 """2151 2152 import re2153 import json2154 if type(variables_setup_dict)==dict:2155 pass2156 else:2157 try:2158 variables_setup_dict = json.loads(variables_setup_dict) 2159 except:2160 print('ERROR in creating variables:{}\n {}'.format(variables_setup_dict, traceback.format_exc())) 2161 2162 # Setting = {'model', 'score'} 2163 setting = variables_setup_dict['setting']2164 2165 # verify if variables exists2166 category_variables = variables_setup_dict['variables']['category_variables']2167 binary_variables = variables_setup_dict['variables']['binary_variables'] 2168 target_variable = variables_setup_dict['variables']['target_variable'] 2169 2170 #Create variables sets2171 category_variables = set(category_variables) & set(DataFrame.columns)2172 
binary_variables = set(binary_variables) & set(DataFrame.columns)2173 2174 # Create placeholder for variable creation scripts2175 script_dict = []2176 2177 # Check if target variable exists (fill the column with None in scoring)2178 if not target_variable in DataFrame.columns:2179 DataFrame[target_variable]=None 2180 2181 # Run variable creation task list2182 for preprocess_task in variables_setup_dict['preprocess_tasks']:2183 task_type = preprocess_task['type'] #re.sub('[\W\d]', '', task_type) 2184 if task_type in ['target', 'transform', 'condition', 'category', 'entity', 'category_merge', 'pair_equality', 'str_transform', 2185 'str_comparison', 'operation_mult', 'comparison', 'seq_order']:2186 #print(task_type)2187 2188 DataFrame, variable_, variable_class_, include_, script_dict_ = create_variable_task(DataFrame, create_variable_task_dict=preprocess_task, return_extra=True, return_script=True) 2189 2190 if include_:2191 script_dict_['include'] = True2192 script_dict.append(script_dict_)2193 if variable_class_=='bin':2194 binary_variables.add(variable_)2195 elif variable_class_=='cat':2196 category_variables.add(variable_)2197 #Finalize variables lists2198 category_variables=list(category_variables)2199 binary_variables=list(binary_variables)2200 target_variable = target_variable2201 2202 if return_script:2203 return DataFrame, category_variables, binary_variables, target_variable, script_dict2204 else:2205 return DataFrame, category_variables, binary_variables, target_variable2206###############################################################################2207# Generate Script2208###############################################################################2209def generate_variables_script(source, destination): 2210 if type(source)==list:2211 if len(source)==2:2212 variables = {2213 'source1': source[0],2214 'source2': source[1],2215 'destination': destination2216 } 2217 elif len(source)==4:2218 variables = {2219 'source1a': source[0],2220 'source2a': 
source[1],2221 'source1b': source[2],2222 'source2b': source[3],2223 'destination': destination2224 }2225 else:2226 variables = {2227 'source': source,2228 'destination': destination2229 }2230 return variables2231 2232def generate_create_variable_task_script(type='', out_type='', include=False, operation='', source=None, destination=None, parameters={}):2233 variable_task_script = {2234 'type': type,2235 'out_type':out_type,2236 'include': include,2237 'operation': operation,2238 'variables': generate_variables_script(source, destination),2239 'parameters': parameters2240 }2241 return variable_task_script2242###############################################################################2243# EZ User Functions2244###############################################################################2245def create_category_ez(DataFrame, variable, labels_str, default='OTHER', null='NA', to_variable=None, target_variable=None, show_plot=True):2246 rule_set = { 2247 'operation':'bucket',2248 'variables': {2249 'source':variable, 2250 'destination':to_variable2251 },2252 'parameters': {2253 'labels_str': labels_str,2254 'right_inclusive':True,2255 'default':default,2256 'null':null2257 }2258 }2259 DataFrame, category_variable = mltk.create_categorical_variable_task(DataFrame, rule_set, return_variable=True)2260 print(variable_response(DataFrame=DataFrame, variable=category_variable, target_variable=target_variable, show_plot=show_plot))2261 return DataFrame, category_variable2262def create_binary_ez(DataFrame, condition_str, default=0, null=0, to_variable=None, target_variable=None, show_plot=True):2263 rule_set = {2264 'operation':'condition', 2265 'variables': {2266 'source': None, 2267 'destination':to_variable2268 },2269 'parameters': {2270 'condition_str':condition_str,2271 'default':default,2272 'null':null,2273 }2274 } 2275 2276 DataFrame, binary_variable = create_binary_variable_task(DataFrame, rule_set, return_variables=True) 2277 
print(variable_response(DataFrame=DataFrame, variable=binary_variable, target_variable=target_variable, show_plot=show_plot))2278 return DataFrame, binary_variable 2279def create_entity_ez(DataFrame, variable, dictionary, default='OTHER', null='NA', to_variable=None, target_variable=None, show_plot=True):2280 rule_set = {2281 'operation':'dictionary', 2282 'variables': {2283 'source': variable, 2284 'destination':to_variable2285 },2286 'parameters': {2287 'match_type': None,2288 'dictionary':dictionary,2289 'default':default,2290 'null':null,2291 }2292 } 2293 2294 DataFrame, entity_variable = create_entity_variable_task(DataFrame, rule_set, return_variables=True) 2295 print(variable_response(DataFrame=DataFrame, variable=entity_variable, target_variable=target_variable, show_plot=show_plot))2296 return DataFrame, entity_variable 2297def create_entity_ez(DataFrame, variable, dictionary, default='OTHER', null='NA', to_variable=None, target_variable=None, show_plot=True):2298 rule_set = {2299 'operation':'dictionary', 2300 'variables': {2301 'source': variable, 2302 'destination':to_variable2303 },2304 'parameters': {2305 'match_type': None,2306 'dictionary':dictionary,2307 'default':default,2308 'null':null,2309 }2310 } 2311 2312 DataFrame, entity_variable = create_entity_variable_task(DataFrame, rule_set, return_variables=True) 2313 print(variable_response(DataFrame=DataFrame, variable=entity_variable, target_variable=target_variable, show_plot=show_plot))...

Full Screen

Full Screen

put_in_function.py

Source:put_in_function.py Github

copy

Full Screen

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Feb 8 19:55:06 2017
@author: lauragustafson, knjohnso, carlosh
"""

# Marker line in the student's file. Everything below it is copied into the
# generated function; it should not contain any input statements.
_START_LINE = "## Initialize other variables you need (if any) for your program below ##"


def _wrap_in_function(function_header, student_file_name, return_variable,
                      new_file_name, start_line=_START_LINE):
    """Wrap the tail of a student's script in a function and write it out.

    Parameters
    ----------
    function_header : str
        Complete ``def name(args):`` line of the generated function.
    student_file_name : str
        File the student's code is read from.
    return_variable : str
        Name(s) of the variable(s) the generated function returns.
    new_file_name : str
        File the generated function is written to (overwritten if present).
    start_line : str
        Prefix of the marker line; only the lines after the first line that
        starts with it are copied.

    Raises
    ------
    ValueError
        If no line of the student's file starts with ``start_line``.
    """
    # Close the input file deterministically instead of leaking the handle.
    with open(student_file_name) as student_file:
        lines = [line.rstrip('\n') for line in student_file]

    # Locate the first marker line; the function body is everything after it.
    for index, line in enumerate(lines):
        if line.startswith(start_line):
            body_lines = lines[index + 1:]
            break
    else:
        raise ValueError('start line %r not found in %s'
                         % (start_line, student_file_name))

    new_lines = [function_header]
    # Indent the student's code one level so it becomes the function body.
    new_lines.extend('\t' + line for line in body_lines)
    new_lines.append('\treturn %s' % (return_variable))

    with open(new_file_name, 'w') as new_file:
        new_file.write('\n'.join(new_lines))


def put_in_functions_a():
    """Wrap the code in ps1a.py in ``part_a`` and write ps1a_in_function.py."""
    # change FUNCTION_NAME to be the name of the function that you want their
    # code to be wrapped in
    FUNCTION_NAME = 'part_a'
    FUNCTION_HEADER = 'def %s(annual_salary, portion_saved, total_cost):' % (FUNCTION_NAME)
    # change STUDENT_FILE_NAME to be the name of the file that their code will be in
    STUDENT_FILE_NAME = 'ps1a.py'
    # change RETURN_VARIABLE to be the name of the variable you want the
    # function to return
    RETURN_VARIABLE = 'months'
    # change NEW_FILE_NAME to be the name of the output file
    NEW_FILE_NAME = 'ps1a_in_function.py'
    _wrap_in_function(FUNCTION_HEADER, STUDENT_FILE_NAME, RETURN_VARIABLE, NEW_FILE_NAME)


def put_in_functions_b():
    """Wrap the code in ps1b.py in ``part_b`` and write ps1b_in_function.py."""
    # change FUNCTION_NAME to be the name of the function that you want their
    # code to be wrapped in
    FUNCTION_NAME = 'part_b'
    FUNCTION_HEADER = 'def %s(annual_salary, portion_saved, total_cost, semi_annual_raise):' % (FUNCTION_NAME)
    # change STUDENT_FILE_NAME to be the name of the file that their code will be in
    STUDENT_FILE_NAME = 'ps1b.py'
    # change RETURN_VARIABLE to be the name of the variable you want the
    # function to return
    RETURN_VARIABLE = 'months'
    # change NEW_FILE_NAME to be the name of the output file
    NEW_FILE_NAME = 'ps1b_in_function.py'
    _wrap_in_function(FUNCTION_HEADER, STUDENT_FILE_NAME, RETURN_VARIABLE, NEW_FILE_NAME)


def put_in_functions_c():
    """Wrap the code in ps1c.py in ``part_c`` and write ps1c_in_function.py."""
    # change FUNCTION_NAME to be the name of the function that you want their
    # code to be wrapped in
    FUNCTION_NAME = 'part_c'
    FUNCTION_HEADER = 'def %s(initial_deposit):' % (FUNCTION_NAME)
    # change STUDENT_FILE_NAME to be the name of the file that their code will be in
    STUDENT_FILE_NAME = 'ps1c.py'
    # change RETURN_VARIABLE to be the name of the variable you want the
    # function to return (here two values, returned as a tuple)
    RETURN_VARIABLE = 'r, steps'
    # change NEW_FILE_NAME to be the name of the output file
    NEW_FILE_NAME = 'ps1c_in_function.py'
    _wrap_in_function(FUNCTION_HEADER, STUDENT_FILE_NAME, RETURN_VARIABLE, NEW_FILE_NAME)


put_in_functions_a()
put_in_functions_b()
# ... (the listing is truncated at this point in the source snippet)

Full Screen

Full Screen

python-code-runner.py

Source:python-code-runner.py Github

copy

Full Screen

from bottle import run, request, route


@route('/runCode', method='POST')
def runCode():
    """Execute a submitted function definition plus call and return the result.

    Request parameters read:
        functionDefinition -- Python source that defines the function(s).
        functionCall       -- expression to evaluate after the definition.

    Returns
    -------
    str
        ``str()`` of the value of ``functionCall``, or ``str()`` of the
        exception raised while executing the submitted code.
    """
    name_of_return_variable = 'return_variable'
    function_definition = request.params['functionDefinition']
    function_call = request.params['functionCall']
    # Append an assignment so the call's value can be read back by name.
    testRunCode = function_definition + '\n' + name_of_return_variable + ' = ' + function_call

    # SECURITY NOTE(review): this exec()s text taken straight from the HTTP
    # request, i.e. arbitrary code execution for anyone who can reach the
    # endpoint. Only expose it inside a sandbox / trusted network.
    #
    # Run in ONE namespace. With separate globals and locals the code is
    # executed as if it were a class body, so a submitted function cannot see
    # itself or sibling helpers (recursion fails with NameError). A copy of
    # the module globals keeps the same names visible without letting the
    # submitted code rebind this module's real globals.
    namespace = dict(globals())
    try:
        exec(testRunCode, namespace)
    except Exception as exception:
        # Report the failure to the caller as text, as before.
        return str(exception)
    return str(namespace[name_of_return_variable])
# ... (the listing is truncated at this point in the source snippet)

Full Screen

Full Screen

Automation Testing Tutorials

Learn to execute automation testing from scratch with the LambdaTest Learning Hub, from setting up the prerequisites and running your first automation test to following best practices and diving deeper into advanced test scenarios. The LambdaTest Learning Hubs compile step-by-step guides to help you become proficient with different test automation frameworks, e.g. Selenium, Cypress, and TestNG.

LambdaTest Learning Hubs:

YouTube

You can also refer to the video tutorials on the LambdaTest YouTube channel for step-by-step demonstrations from industry experts.

Run pyresttest automation tests on LambdaTest cloud grid

Perform automation testing on 3000+ real desktop and mobile devices online.

Try LambdaTest Now !!

Get 100 minutes of automation testing FREE!!

Next-Gen App & Browser Testing Cloud

Was this article helpful?

Helpful

Not Helpful