Best Python code snippet using pyresttest_python
etl.py
Source:etl.py  
1# -*- coding: utf-8 -*-2# MLToolkit (mltoolkit)3"""4MLToolkit - a verstile helping library for machine learning5===========================================================6'MLToolkit' is a Python package providing a set of user-friendly functions to 7help building machine learning models in data science research or production 8focused projects. It is compatible with and interoperate with popular data 9analysis, manipulation and machine learning libraries Pandas, Sci-kit Learn, 10Tensorflow, Statmodels, Catboost, XGboost, etc.11Main Features12-------------13- Data Extraction (SQL, Flatfiles, etc.)14- Exploratory data analysis (statistical summary, univariate analysis, etc.)15- Feature Extraction and Engineering16- Model performance analysis, Explain Predictions and comparison between models17- Cross Validation and Hyper parameter tuning18- JSON input script for executing model building and scoring tasks.19- Model Building UI20- Auto ML (automated machine learning)21- Model Deploymet and Serving via RESTful  API22Author23------24- Sumudu Tennakoon25Links26-----27Website: http://sumudu.tennakoon.net/projects/MLToolkit28Github: https://mltoolkit.github.io/MLToolKit29License30-------31Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)32"""33# IF QUERY HAS DROP TABLE, TRUNCATE TABLE, DELETE, UPDATE, CREATE, set a control flag on (safety)34# Modify output timing to execute query + row count35from datetime import datetime36import gc37import traceback38import gc39import os40from timeit import default_timer as timer41import numpy as np42import pandas as pd43import matplotlib.pyplot as plt44import re45import urllib46import sqlalchemy 47import csv48try:49    import pyodbc50except:51    print('pyodbc not found! Data base query fufnctions disabled.')52import warnings53warnings.filterwarnings("ignore")54from mltk.string import *55from mltk.explore import *56def number_unit_example():57    edges_std = ['0', '1p', '1n', '1u', '1m', '1c', '1', '100', '500', 58                 '1K', '2K', '5K', '10K', '20K', '50K', '100K', '500K', 59                 '1M', '2M', '5M', '10M', '100M', '200M', '500M', 60                 '1G', '2G', '5G', '10G', '100G', '200G', '500G',61                 '1T', '2T', '5T', '10T', '100T', '200T', '500T',62                 '1P', '2P', '5P', '10P', '100P', '200P', '500P',63                 '1E']64    print(edges_std)65    66def get_number_units():    67    units = {'p':0.000000000001,68        'n':0.000000001,69        'u':0.000001,70        'm':0.001,71        'c':0.01,72        'd':0.1,73        '':1,74        'D':10,75        'H':100,76        'K':1000,77        'M':1000000,78        'G':1000000000,79        'T':1000000000000,80        'P':1000000000000000,81        'E':1000000000000000000,82        'INF':np.inf        83        }84    units = pd.DataFrame(data=units.items(), columns=['unit', 'multiplier'])85    print(units)86    return units87###############################################################################88##[ I/O FUNCTIONS]#############################################################      89###############################################################################90def read_data(connector, params=None):91    connector = {92            "method":"sql", #"pickle", "csv", "excel"93            "source":{"dbms":"mssql", "server":"SQLSERVER1", "database":"SampleDB", "schema":None},94            "auth":{'type':'user', 'user':'user1', 'pwd':'password123'},95            "params":{}96            }97    98    connector2 = {99            "method":"sql", #"pickle", "csv", "excel"100            "source":{"dbms":"snowflake", "server":"SQLSERVER1", "database":"SampleDB", "schema":None}, # account (server)101            "auth":{'type':'user', 'user':'user1', 'password':'password123', "role": None}, 102            "params":{}103            }104    105    return None    106    107def read_data_csv(file, separator=',', quoting= 'MINIMAL', compression='infer', encoding='utf-8'):108    """109    https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_csv.html110    111    Parameters112    ----------    113    file : str114    separator : str115    index : bool116    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'117    quoting : {'ALL', MINIMAL', 'NONNUMERIC', 'NONE'}, default 'MINIMAL'118    encoding : {'utf-8', 'utf-16'}, default 'utf-8'119     120    Returns121    -------122    DataFrame : pandas.DataFrame123    """124    if quoting=='ALL':125        quoting = csv.QUOTE_ALL126    elif quoting=='MINIMAL':127        quoting = csv.QUOTE_MINIMAL        128    elif quoting=='NONNUMERIC':129        quoting = csv.QUOTE_NONNUMERIC        130    elif quoting=='NONE':131        quoting = csv.QUOTE_NONE   132    133    try:134        start_time = timer() 135        DataFrame = pd.read_csv(filepath_or_buffer=file, sep=separator, quoting=quoting, 136                                compression=compression, encoding=encoding)  137        execute_time = timer() - start_time138    except:139        execute_time = 0140        DataFrame = pd.DataFrame()141        print(traceback.format_exc())142        143    144    145    print('{:,d} records were loaded. execute time = {} s'.format(len(DataFrame.index), execute_time))146    147    return DataFrame148def write_data_csv(DataFrame, file, separator=',', index=False, quoting='ALL', encoding='utf-8', compression='infer', chunksize=None):149    """150    https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_csv.html151    152    Parameters153    ----------    154    DataFrame : pandas.DataFrame155    file : str156    separator : str157    index : bool158    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'159    quoting : {'ALL', MINIMAL', 'NONNUMERIC', 'NONE'}, default 'MINIMAL'160    encoding : {'utf-8', 'utf-16'}, default 'utf-8'161    chunksize : int, default None162    163    Returns164    -------165    None166    """167    168    if quoting=='ALL':169        quoting = csv.QUOTE_ALL170    elif quoting=='MINIMAL':171        quoting = csv.QUOTE_MINIMAL        172    elif quoting=='NONNUMERIC':173        quoting = csv.QUOTE_NONNUMERIC        174    elif quoting=='NONE':175        quoting = csv.QUOTE_NONE        176    try:177        start_time = timer()     178        DataFrame.to_csv(path_or_buf=file, sep=separator, encoding=encoding, index=index, 179                         quoting=quoting, compression=compression, chunksize=chunksize)180        execute_time = timer() - start_time181    except:182        execute_time = 0183        print(traceback.format_exc())184        185    print('{:,d} records were written. execute time = {} s'.format(len(DataFrame.index), execute_time))186    187    return None188def read_data_pickle(file, compression='infer'):189    """190    https://docs.python.org/3/library/pickle.html191    "Warning The pickle module is not secure against erroneous or maliciously constructed data. 192    Never unpickle data received from an untrusted or unauthenticated source."193    194    Parameters195    ----------    196    file : str197    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'198    199    Returns200    -------201    DataFrame : pandas.DataFrame202    """203    try:204        start_time = timer() 205        DataFrame = pd.read_pickle(path=file, compression=compression)206        execute_time = timer() - start_time207    except:208        execute_time = 0209        print(traceback.format_exc())210        DataFrame = pd.DataFrame()211        212        213    print('{:,d} records were loaded. execute time = {} s'.format(len(DataFrame.index), execute_time))214    215    return DataFrame216def write_data_pickle(DataFrame, file, compression='infer', protocol=3):217    """218    https://docs.python.org/3/library/pickle.html219    "Warning The pickle module is not secure against erroneous or maliciously constructed data. 220    Never unpickle data received from an untrusted or unauthenticated source."221    222    Parameters223    ----------    224    DataFrame : pandas.DataFrame225    file : str226    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'227    protocol : int {1, 2, 3, 4}228        0 is human-readable/backwards compatible with earlier versions of Python229        read more at https://docs.python.org/3/library/pickle.html230    Returns231    -------232    None    233    """234    try:235        start_time = timer() 236        DataFrame.to_pickle(path=file, compression=compression, protocol=protocol)237        execute_time = timer() - start_time238    except:239        execute_time = 0240        print(traceback.format_exc())241        242    print('{:,d} records were written. execute time = {} s'.format(len(DataFrame.index), execute_time))243def create_sql_connect_string(server=None, database=None, auth=None, dbms='mssql', autocommit = 'True'):    244    if dbms=='mssql':245        # Download ODBC Driver https://docs.microsoft.com/en-us/sql/connect/odbc/download-odbc-driver-for-sql-server246        driver = 'ODBC Driver 13 for SQL Server' # 'SQL Server' # 247        if auth['type']=='machine':248            connect_string = r'Driver={'+driver+'};SERVER='+server+';DATABASE='+database+';TRUSTED_CONNECTION=yes;autocommit='+autocommit+';'249            connect_string = urllib.parse.quote_plus(connect_string)250        elif auth['type']=='user':251            uid =  auth['uid'] 252            pwd =  auth['pwd'] 253            connect_string = r'Driver={'+driver+'};SERVER='+server+';DATABASE='+database+';UID='+uid+'r;PWD='+pwd+'; autocommit='+autocommit+';'254            connect_string = urllib.parse.quote_plus(connect_string)255    elif dbms=='mysql':256        connect_string = None257    elif dbms=='snowflake':258        connect_string = None259    else:260        raise Exception("Parameter dbms not provided. Accepted values are {'mssql', 'mysql', 'snowflake'}")261    return connect_string262def read_data_sql(query=None, server=None, database=None, auth=None, dbms='mssql', params=None):263    """264    Parameters265    ----------266    query : str267        SQL SELECT query268    server : str269        Database Server270    database : str271        Database272    auth :  dict273        e.g. auth = {'type':'user', 'uid':'user', 'pwd':'password'} for username password authentication274             auth = {'type':'machine', 'uid':None, 'pwd':None} for machine authentication275    276    Returns277    -------278    DataFrame : pandas.DataFrame279    """   280    execute_time = 0281    282    if query!=None and server!=None and auth!=None:        283        coerce_float=True284        index_col=None285        parse_dates=None286        287        try:288            if auth['type']=='machine':289                connect_string = r'Driver={SQL Server};SERVER='+server+';DATABASE='+database+';TRUSTED_CONNECTION=yes;'290            elif auth['type']=='user':291                uid =  auth['uid'] 292                pwd =  auth['pwd'] 293                connect_string = r'Driver={SQL Server};SERVER='+server+';DATABASE='+database+';UID='+uid+'r;PWD='+pwd+'}'294            else:295                raise Exception('No db server authentication method provided!')296            connection = pyodbc.connect(connect_string)   297            298            start_time = timer() 299            DataFrame = pd.read_sql_query(sql=query, con=connection, coerce_float=coerce_float, index_col=index_col, parse_dates=parse_dates)300            execute_time = timer() - start_time301            302            connection.close()        303        except:304            print('Database Query Fialed!:\n{}\n'.format(traceback.format_exc()))305            DataFrame=pd.DataFrame()306    else:307        print('No Query provided !')308        DataFrame=pd.DataFrame()309        310    print('{:,d} records were loaded. execute time = {} s'.format(len(DataFrame.index), execute_time))311   312    return DataFrame313def write_data_sql(DataFrame, server=None, database=None, schema=None, table=None, index=False, dtypes=None, if_exists='fail', auth=None, dbms='mssql', params=None):314    """315    https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_sql.html316    317    Parameters318    ----------319    DataFrame : pandas.DataFrame320        DataFrame321    server : str322        Database Server323    database : str324        Database325    schema : str326        Database Schema327    table : str328        Table name329    if_exists : {'fail', 'replace', 'append'}, default 'fail'330        Action if the table already exists.331    auth :  dict332        e.g. auth = {'type':'user', 'uid':'user', 'pwd':'password'} for username password authentication333             auth = {'type':'machine', 'uid':None, 'pwd':None} for machine authentication334    335    Returns336    -------337    None338    """ 339    340    # Download ODBC Driver https://docs.microsoft.com/en-us/sql/connect/odbc/download-odbc-driver-for-sql-server341    driver = 'ODBC Driver 13 for SQL Server' # 'SQL Server' # 342    autocommit = 'True'343    fast_executemany = True344    execute_time = 0345    346    if server!=None and  database!=None and schema!=None and table!=None and auth!=None : 347        try:348            if auth['type']=='machine':349                #connect_string = r'Driver={SQL Server};SERVER='+server+';DATABASE='+database+';TRUSTED_CONNECTION=yes;' #ODBC (slow)350                connect_string = r'Driver={'+driver+'};SERVER='+server+';DATABASE='+database+';TRUSTED_CONNECTION=yes;autocommit='+autocommit+';'351                connect_string = urllib.parse.quote_plus(connect_string)352            elif auth['type']=='user':353                uid =  auth['uid'] 354                pwd =  auth['pwd'] 355                #connect_string = r'Driver={SQL Server};SERVER='+server+';DATABASE='+database+';UID='+uid+'r;PWD='+pwd+'}' #ODBC (slow)356                connect_string = r'Driver={'+driver+'};SERVER='+server+';DATABASE='+database+';UID='+uid+'r;PWD='+pwd+'; autocommit='+autocommit+';'357                connect_string = urllib.parse.quote_plus(connect_string)358            else:359                raise Exception('No db server authentication method provided !') 360                361            #connection = pyodbc.connect(connect_string) #ODBC (slow)362            engine = sqlalchemy.create_engine("mssql+pyodbc:///?odbc_connect="+connect_string, fast_executemany=fast_executemany)363            connection = engine364            365            start_time = timer() 366            if dtypes==None:367                DataFrame.to_sql(name=table, con=connection, schema=schema, index= index, if_exists=if_exists)368            else:369                DataFrame.to_sql(name=table, con=connection, schema=schema, index= index, dtype=dtypes, if_exists=if_exists)370            execute_time = timer() - start_time371            372            #connection.close() 373            engine.dispose()374            rowcount = len(DataFrame.index)375        except:376            print('Database Query Failed! Check If ODBC driver installed. \nIf not, Download ODBC Driver from https://docs.microsoft.com/en-us/sql/connect/odbc/download-odbc-driver-for-sql-serve.:\n{}\n'.format(traceback.format_exc()))377            rowcount = 0378    else:379        print('Check the destiniation table path (server, database, schema, table, auth) !')380        rowcount = 0381    382    print('{:,d} records were written. execute time = {} s'.format(rowcount, execute_time))383    384    return rowcount385def execute_sql_query(query=None, server=None, database=None, auth=None, params=None, dbms='mssql', on_error='ignore'):386    """387    Parameters388    ----------389    query : str390        SQL SELECT query391    server : str392        Database Server393    database : str394        Database395    auth :  dict396        e.g. auth = {'type':'user', 'uid':'user', 'pwd':'password'} for username password authentication397             auth = {'type':'machine', 'uid':None, 'pwd':None} for machine authentication398    params : dict399        extra parameters (not implemented)400        401    Returns402    -------403    DataFrame : pandas.DataFrame404    """        405    # Download ODBC Driver https://docs.microsoft.com/en-us/sql/connect/odbc/download-odbc-driver-for-sql-server406    driver = 'ODBC Driver 13 for SQL Server' # 'SQL Server' # 407    autocommit = 'True'408    fast_executemany = True409    410    if server!=None and  database!=None and query!=None and auth!=None :411        try:412            if auth['type']=='machine':413                connect_string = r'Driver={'+driver+'};SERVER='+server+';DATABASE='+database+';TRUSTED_CONNECTION=yes;autocommit='+autocommit+';'414                connect_string = urllib.parse.quote_plus(connect_string)415                416            elif auth['type']=='user':417                uid =  auth['uid'] 418                pwd =  auth['pwd'] 419                connect_string = r'Driver={'+driver+'};SERVER='+server+';DATABASE='+database+';UID='+uid+'r;PWD='+pwd+'; autocommit='+autocommit+';'420                connect_string = urllib.parse.quote_plus(connect_string)421            else:422                raise Exception('No db server authentication method provided !')423            424            engine = sqlalchemy.create_engine("mssql+pyodbc:///?odbc_connect="+connect_string, fast_executemany=fast_executemany)425            426            # connection427            connection = engine.connect()428            429            #transaction430            trans = connection.begin()431        432            # execute433            start_time = timer() 434            result = connection.execute(query)435            execute_time = timer() - start_time436            437            try:438                rowcount = result.rowcount439                print('{} rows affected. execute time = {} s'.format(rowcount,execute_time))440            except:441                rowcount = -1442                print('ERROR in fetching affected rows count. execute time = {} s'.format(execute_time))443                444            # commit445            trans.commit()446        447            # close connections, results set and dispose engine (moved to finally)448            #connection.close()449            #result.close()450            #engine.dispose()451        except:452            print(r'ERROR: Check If ODBC driver installed. \nIf not, Download ODBC Driver from https://docs.microsoft.com/en-us/sql/connect/odbc/download-odbc-driver-for-sql-server:\n{}\n'.format(traceback.format_exc()))453            rowcount = 0454        finally:455            # close connections, results set and dispose engine456            try:457                connection.close()458            except:459                print('Failed to close connection !')460            try:461                result.close()462            except:463                print('Failed to close results !')464            try:465                engine.dispose()466            except:467                print('Failed to dispose engine !')468            469        return rowcount      470def sql_server_database_list(server, auth=None, user_database_only=True, dbms='mssql'):471    """472    Reference: https://docs.microsoft.com/en-us/sql/relational-databases/system-compatibility-views/sys-sysdatabases-transact-sql?view=sql-server-2017473    """474    475    query = """476    SELECT 477        @@SERVERNAME AS [ServerName],478        NAME AS [DBName],479        STATUS AS [Status],480        CRDATE AS [CreateDate]481    FROM master.dbo.sysdatabases (NOLOCK)482    WHERE Name NOT IN ( 'master','tempdb','model' ,'msdb')483    """    484    DBList = read_data_sql(query=query, server=server, database='master', auth=auth, params=None)485    486    return DBList487    488def sql_server_database_usage_report(server, database, auth=None, schema=None, table=None, user_tables_only=True, dbms='mssql', unit='KB'):489    """490    Reference: https://docs.microsoft.com/en-us/sql/relational-databases/system-catalog-views/sys-tables-transact-sql?view=sql-server-2017491    """492    493    if user_tables_only:494        user_tables_only_condition = "AND table.is_ms_shipped = 0 " # is_ms_shipped = 1 (indicates this object was shipped or created by Microsoft), 0 (indicates this object was created by a user)495    else:496        user_tables_only_condition = ""497        498    if schema != None:499        schema_condition = "AND schema.NAME = '{}'".format(schema)500    else:501        schema_condition = ""502    if table != None:503        table_condition = "AND table.NAME = '{}'".format(table)504    else:505        table_condition = ""506    #Unit conversion507    if unit == 'KB':508        multiplier = 1.0509    if unit  == 'MB':510        multiplier = 1.0/1024.0511    if unit  == 'GB':512        multiplier = 1.0/(1024.0*1024.0)  513    if unit  == 'TB':514        multiplier = 1.0/(1024.0*1024.0*1024.0) 515    516    if dbms == 'mssql':517        query = """518        SELECT519            @@SERVERNAME AS [Server],520            DB_Name() AS [DB],521            [schema].NAME AS [Schema],522            [table].NAME AS [Table],523            [table].CREATE_DATE AS [CreateDate],524            [table].MODIFY_DATE AS [ModifyDate],		525            [part].ROWS AS [Rows],526            SUM(alloc.total_pages) * 8 AS [TotalSpaceKBx],527            SUM(alloc.used_pages) * 8 AS [UsedSpaceKBx],528        FROM529            sys.tables [table] (NOLOCK)530        INNER JOIN     531            sys.indexes (NOLOCK) [ix] ON ([table].OBJECT_ID = [ix].OBJECT_ID)532        INNER JOIN533            sys.partitions (NOLOCK) [part] ON ([ix].OBJECT_ID = [part].OBJECT_ID AND ix.index_id = [part].index_id)534        INNER JOIN535            sys.allocation_units (NOLOCK) [alloc] ON ([part].PARTITION_ID = [alloc].container_id)536        LEFT OUTER JOIN537            sys.schemas [schema] (NOLOCK) ON ([table].SCHEMA_ID = [schema].SCHEMA_ID)538        WHERE539            [table].NAME IS NOT NULL540            {user_tables_only_condition}541            {table_condition}542            {schema_condition}543        GROUP BY544            [table].NAME, 545            [table].CREATE_DATE, 546            [table].MODIFY_DATE, 547            [schema].NAME, part.ROWS548        """.format(schema_condition=schema_condition, table_condition=table_condition, user_tables_only_condition=user_tables_only_condition)549        550        DBUsageReport = read_data_sql(query=query, server=server, database=database, auth=auth, params=None)551        552        DBUsageReport['TotalSpaceKBx'] = DBUsageReport['TotalSpaceKBx'].fillna(0)553        DBUsageReport['UsedSpaceKBx'] = DBUsageReport['UsedSpaceKBx'].fillna(0)554        DBUsageReport['AvaiableSpaceKBx'] = DBUsageReport['TotalSpaceKBx'] - DBUsageReport['UsedSpaceKBx']555        556        DBUsageReport['TotalSpace{}'.format(unit)] = DBUsageReport['TotalSpaceKBx'] * multiplier557        DBUsageReport['UsedSpace{}'.format(unit)] = DBUsageReport['UsedSpaceKBx'] * multiplier558        DBUsageReport['AvaiableSpace{}'.format(unit)] = DBUsageReport['AvaiableSpaceKBx'] * multiplier559        560        DBUsageReport = DBUsageReport.drop(columns=['TotalSpaceKBx', 'UsedSpaceKBx', 'AvaiableSpaceKBx'])561    else:562        DBUsageReport = pd.DataFrame()563        print('This function currently supported for MSSQL server only')564    565    return DBUsageReport566###############################################################################567##[ VALIDATE FIELDS]##########################################################      568###############################################################################569        570def add_identity_column(DataFrame, id_label='ID', start=1, increment=1):571    if id_label in DataFrame.columns:572        print('Column {} exists in the DataFrame'.format(id_label))573        return DataFrame574    else:575        DataFrame.reset_index(drop=True, inplace=True)576        DataFrame.insert(0, id_label, start+DataFrame.index)577        return DataFrame578    579def remove_special_characters(str_val, replace=''):580    return re.sub('\W+',replace, str_val)581def remove_special_characters_list(str_list, replace=''):582    return [remove_special_characters(str_val, replace=replace) for str_val in str_list]583    584def clean_column_names(DataFrame, replace=''): # Remove special charcters from column names585    """586    Parameters587    ----------588    DataFrame : pandas.DataFrame589        DataFrame590    replace : str, dafault ''591        Character to replace special charaters with.    592    593    Returns594    -------595    DataFrame : pandas.DataFrame596    """597    try:598        columns = DataFrame.columns599        columns = remove_special_characters_list(columns, replace=replace)600        if check_list_values_unique(columns):601            DataFrame.columns = columns602        else:603            print('Duplicates values excists the column names after removing special characters!. Column names were rolled-back to initial values.')        604    except:605        print('Error in removing special characters from column names:\n{}\n'.format(traceback.format_exc()))606    return DataFrame607def check_list_values_unique(values_list):608    if len(values_list) == len(set(values_list)):609        return True610    else:611        return False612    613def handle_duplicate_columns(DataFrame, action='rename'): #'drop'614    """615    Parameters616    ----------617    DataFrame : pandas.DataFrame618        DataFrame619    action : {'rename', 'drop'}, dafault 'rename'620        Action to be taken on duplicate columns    621    622    Returns623    -------624    DataFrame : pandas.DataFrame625    """626    is_duplicate = DataFrame.columns.duplicated()627    columns = list(DataFrame.columns)628    if action=='rename':629        for i in range(len(columns)):630            if is_duplicate[i]:631               columns[i]=columns[i]+'_' 632        DataFrame.columns = columns633    elif action=='drop':634        DataFrame = DataFrame.loc[:,~is_duplicate]635    else:636        print('No valid action (rename or drop) provided!')637    return DataFrame638def add_missing_feature_columns(DataFrame, expected_features, fill_value=0):639    # Blanck columns for non-existance variables640    feature_variables_to_add = list(set(expected_features) - set(DataFrame.columns)) # Find columns not found in the dataset641    for f in feature_variables_to_add:642        DataFrame[f]=fill_value643        print('Column [{}] does not exist in the dataset. Created new column and set to {}...'.format(f,fill_value))644    return DataFrame645def exclude_records(DataFrame, exclude_condition=None, action = 'flag', exclude_label='_EXCLUDE_'):646    N0 = len(DataFrame.index)647    if exclude_condition==None:648        print('No exclude condition...')649        return DataFrame650    651    try:652        if action=='drop': #Drop Excludes        653            DataFrame = DataFrame.query('not ({})'.format(exclude_condition))654        elif action=='flag': #Create new flagged column655            DataFrame[exclude_label] = DataFrame.eval(exclude_condition).astype('int8')656            print('Records {} -> {}=1'.format(exclude_condition, exclude_label))657    except:658        print('Error in excluding records {}:\n{}\n'.format(exclude_condition, traceback.format_exc()))659    N1 = len(DataFrame.index)    660    print('{} records were excluded'.format(N1-N0))661    return DataFrame662###############################################################################663##[ CREATING FEATURES - TARGET ]###############################################      664############################################################################### 665    666def set_binary_target(DataFrame, to_variable='_TARGET_', condition_str=None, default=0, null=0, return_variable=False, return_script=False):667    if condition_str==None: 668        return DataFrame669    670    DataFrame, to_variable = create_binary_variable(DataFrame, to_variable, condition_str, default=default, null=null, return_variable=True)671    parameters = {672            'condition_str':condition_str,673            'default':default,674            'null':null675            }    676    script_dict = generate_create_variable_task_script(type='target', out_type='bin', include=False, operation='condition', source=None, destination=to_variable, parameters=parameters)677    678    if return_script and return_variable:679        return DataFrame, to_variable, script_dict680    elif return_script:681        return DataFrame, script_dict682    elif return_variable:683        return DataFrame, to_variable684    else:685        return DataFrame 686    687###############################################################################688##[ CREATING FEATURES - TRANSFORMATIONS]#######################################      689############################################################################### 690def create_normalized_variable(DataFrame, variable, method='maxscale', parameters=None, to_variable=None, return_variable=False, return_script=False):691    if to_variable==None:692        to_variable = variable693        694    if method=='minscale': #scale=max695        try:696            min_ = parameters["min"]697        except:698            min_ = DataFrame[variable].min()699            parameters["min"] = min_700        DataFrame[to_variable] = DataFrame[variable]/min_701    if method=='maxscale': #scale=max702        try:703            max_ = parameters["max"]704        except:705            max_ = DataFrame[variable].max()706            parameters["max"] = max_707        DataFrame[to_variable] = DataFrame[variable]/max_708    if method=='range': # range = abs(max-min)709        try:710            min_ = parameters["min"]711            max_ = parameters["max"]712        except:713            min_ = DataFrame[variable].min()714            max_ = DataFrame[variable].max()    715            parameters["min"] = min_716            parameters["max"] = max_717        min_max = abs(min_-max_)718        DataFrame[to_variable] = DataFrame[variable]/min_max719    if method=='minmaxfs': # range = (value-min)/(max-min)720        try:721            min_ = parameters["min"]722            max_ = parameters["max"]723        except:724            min_ = DataFrame[variable].min()725            max_ = DataFrame[variable].max() 726            parameters["min"] = min_727            parameters["max"] = max_728        min_max = abs(max_-min_)729        DataFrame[to_variable] = (DataFrame[variable]-min_)/min_max730    if method=='minmaxfs_m': # range = (value-min)/(max-min)731        try:732            min_ = parameters["min"]733            max_ = parameters["max"]734            mean_ = parameters["mean"]735        except:  736            min_=DataFrame[variable].min()737            max_=DataFrame[variable].max()738            mean_ = DataFrame[variable].mean()739            parameters["min"] = min_740            parameters["max"] = max_741            parameters["mean"] = mean_742        min_max = abs(max_-min_)743        DataFrame[to_variable] = (DataFrame[variable]-mean_)/min_max744    if method=='mean':745        try:746            mean_ = parameters["mean"]747        except:  748            mean_ = DataFrame[variable].mean()749            parameters["mean"] = mean_750        DataFrame[to_variable] = DataFrame[variable]/mean_751    if method=='median':752        try:753            median_ = parameters["median"]754        except:  755            median_ = DataFrame[variable].median()756            parameters["median"] = median_757        DataFrame[to_variable] = DataFrame[variable]/median_758    if method=='zscore':  759        try:760            std_ = parameters["std"]761            mean_ = parameters["mean"]762        except: 763            std_ = DataFrame[variable].std()764            mean_ = DataFrame[variable].mean()765            parameters["mean"] = mean_766            parameters["std"] = std_767        DataFrame[to_variable] = (DataFrame[variable] - mean_)/std_  768 769    script_dict = generate_create_variable_task_script(type='transform', out_type='cnt', 770                                                       include=False, operation='normalize', 771                                                       source=variable, destination=to_variable, 772                                                       parameters=parameters)773     774    if return_script and return_variable:775        return DataFrame, to_variable, script_dict776    elif return_script:777        return DataFrame, script_dict778    elif return_variable:779        return DataFrame, to_variable780    else:781        return DataFrame 782def create_datepart_variable(DataFrame, variable, to_variable=None, part='date', return_variable=False, return_script=False):783    if to_variable==None:784        to_variable = '{}{}'.format(variable,part)785        786    try:787        DataFrame[variable] = pd.to_datetime(DataFrame[variable])788        if part=='date':789            DataFrame[to_variable] = DataFrame[variable].dt.date790        elif part=='year':791            DataFrame[to_variable] = DataFrame[variable].dt.year792        elif part=='quarter':793            DataFrame[to_variable] = DataFrame[variable].dt.quarter794        elif part=='month':795            DataFrame[to_variable] = DataFrame[variable].dt.month796        elif part=='week':797            DataFrame[to_variable] = DataFrame[variable].dt.week798        elif part=='day':799            DataFrame[to_variable] = DataFrame[variable].dt.day  800        elif part=='dayofweek':801            DataFrame[to_variable] = DataFrame[variable].dt.dayofweek802        elif part=='dayofyear':803            DataFrame[to_variable] = DataFrame[variable].dt.dayofyear804        elif part=='time':805            DataFrame[to_variable] = DataFrame[variable].dt.time806        elif part=='hour':807            DataFrame[to_variable] = DataFrame[variable].dt.hour808        elif part=='minute':809            DataFrame[to_variable] = DataFrame[variable].dt.minute810        elif part=='second':811            DataFrame[to_variable] = DataFrame[variable].dt.second812        elif part=='microsecond':813            DataFrame[to_variable] = DataFrame[variable].dt.microsecond814        elif part=='nanosecond':815            DataFrame[to_variable] = DataFrame[variable].dt.nanosecond816        else:817            DataFrame[to_variable] = variable818    except:819        DataFrame[to_variable] = variable820    parameters = {'part':part}    821    script_dict = generate_create_variable_task_script(type='transform', out_type='dat', 822                                                       include=False, operation='datepart', 823                                                       source=variable, destination=to_variable, 824                                                       parameters=parameters)825   826    if return_script and return_variable:827        return DataFrame, to_variable, script_dict828    elif return_script:829        return DataFrame, script_dict830    elif return_variable:831        return DataFrame, to_variable832    else:833        return DataFrame 834def create_dateadd_variable(DataFrame, variable, to_variable=None, unit='years', value=0, return_variable=False, return_script=False):835    if to_variable==None:836        to_variable = '{}{}{}'.format(variable, value, unit)837        838    try:839        DataFrame[variable] = pd.to_datetime(DataFrame[variable])840        if part=='years':841            DataFrame[to_variable] = DataFrame[variable] + pd.DateOffset(year=value)842        elif part=='months':843            DataFrame[to_variable] = DataFrame[variable] + pd.DateOffset(months=value)844        elif part=='weeks':845            DataFrame[to_variable] = DataFrame[variable] + pd.DateOffset(weeks=value)846        elif part=='days':847            DataFrame[to_variable] = DataFrame[variable] + pd.DateOffset(days=value)848        elif part=='hours':849            DataFrame[to_variable] = DataFrame[variable] + pd.DateOffset(hours=value)850        elif part=='minutes':851            DataFrame[to_variable] = DataFrame[variable] + pd.DateOffset(minutes=value)852        elif part=='seconds':853            DataFrame[to_variable] = DataFrame[variable] + pd.DateOffset(seconds=value)854        elif part=='microseconds':855            DataFrame[to_variable] = DataFrame[variable] + pd.DateOffset(microseconds=value)856        elif part=='nanoseconds':857            DataFrame[to_variable] = DataFrame[variable] + pd.DateOffset(nanoseconds=value)        858    except:859        DataFrame[to_variable] = variable860    parameters = {861        'unit':unit, 862        'value':value863        }    864    script_dict = generate_create_variable_task_script(type='transform', out_type='dat', 865                                                       include=False, operation='dateadd', 866                                                       source=variable, destination=to_variable, 867                                                       parameters=parameters)868        869    if return_script and return_variable:870        return DataFrame, to_variable, script_dict871    elif return_script:872        return DataFrame, script_dict873    elif return_variable:874        return DataFrame, to_variable875    else:876        return DataFrame   877def create_log_variable(DataFrame, variable, base='e', to_variable=None, return_variable=False, return_script=False):878    if to_variable==None:879        to_variable = 'LOG{}'.format(variable)880        881    if base=='e':882        DataFrame[to_variable] = np.log(DataFrame[variable])883    elif base=='10':884        DataFrame[to_variable] = np.log10(DataFrame[variable])885    elif base=='2':886        DataFrame[to_variable] = np.log2(DataFrame[variable])887    parameters = { 'base':base }888    script_dict = generate_create_variable_task_script(type='transform', out_type='cnt', 889                                                       include=False, operation='log', 890                                                       source=variable, destination=to_variable, 891                                                       parameters=parameters) 892        893    if return_script and return_variable:894        return DataFrame, to_variable, script_dict895    elif return_script:896        return DataFrame, script_dict897    elif return_variable:898        return DataFrame, to_variable899    else:900        return DataFrame     901    902def create_exponent_variable(DataFrame, variable, base='e', to_variable=None, return_variable=False, return_script=False):903    if to_variable==None:904        to_variable = 'EXP{}'.format(variable)905        906    if base=='e':907        DataFrame[to_variable] = np.e**DataFrame[variable]908    elif base=='10':909        DataFrame[to_variable] = 10**DataFrame[variable]910    elif base=='2':911        DataFrame[to_variable] = 2**DataFrame[variable]912    parameters = { 'base':base }913    script_dict = generate_create_variable_task_script(type='transform', out_type='cnt', 914                                                       include=False, operation='exponent', 915                                                       source=variable, destination=to_variable, 916                                                       parameters=parameters) 917        918    if return_script and return_variable:919        return DataFrame, to_variable, script_dict920    elif return_script:921        return DataFrame, script_dict922    elif return_variable:923        return DataFrame, to_variable924    else:925        return DataFrame 926def create_segmented_variable(DataFrame, variable, a=None, b=None, to_variable=None, return_variable=False, return_script=False):927    if to_variable==None:928        to_variable = 'SEG{}'.format(variable)  929    930    if a == None:931        a = -np.inf932    933    if b == None:934        b = np.inf935        936    DataFrame[to_variable] = DataFrame[variable]937    DataFrame.loc[DataFrame[to_variable]<a, to_variable] = a938    DataFrame.loc[DataFrame[to_variable]>b, to_variable] = b939    parameters = { 'a':a, 'b':b }940    script_dict = generate_create_variable_task_script(type='transform', out_type='cnt', 941                                                       include=False, operation='segment', 942                                                       source=variable, destination=to_variable, 943                                                       parameters=parameters)     944    if return_script and return_variable:945        return DataFrame, to_variable, script_dict946    elif return_script:947        return DataFrame, script_dict948    elif return_variable:949        return DataFrame, to_variable950    else:951        return DataFrame         952###############################################################################953##[ CREATING FEATURES - STR TRANSFORM ]########################################      954############################################################################### 955        956def create_str_count_variable(DataFrame, variable, pattern='*', case_sensitive=True, to_variable=None, return_variable=False, return_script=False):957    if to_variable==None:958        to_variable = '{}CNT{}'.format(variable, remove_special_characters(pattern, replace=''))959    try:960        if pattern=='*':961            DataFrame[to_variable] = DataFrame[variable].str.len()962        else:963            DataFrame[to_variable] = DataFrame[variable].str.count(pattern) 964    except:965        print('ERROR in create_str_count_variable:\n{}'.format(traceback.format_exc()))966        DataFrame[to_variable] = DataFrame[variable]967    parameters = { 'pattern':pattern, 'case_sensitive':case_sensitive }968    script_dict = generate_create_variable_task_script(type='transform_str', out_type='cnt', 969                                                       include=False, operation='strcount', 970                                                       source=variable, destination=to_variable, 971                                                       parameters=parameters) 972            973    if return_script and return_variable:974        return DataFrame, to_variable, script_dict975    elif return_script:976        return DataFrame, script_dict977    elif return_variable:978        return DataFrame, to_variable979    else:980        return DataFrame 981 982def create_str_normalized_variable(DataFrame, variable, to_case='lower', chars='keep', numbers='remove', spchar='remove', space='remove', to_variable=None, return_variable=False, return_script=False):983    if to_variable==None:984        to_variable = '{}'.format(variable)985    986    try:    987        DataFrame[to_variable] = DataFrame[variable]988        989        if to_case=='lower':990            DataFrame[to_variable] = DataFrame[variable].str.lower()991        if to_case=='upper':992            DataFrame[to_variable] = DataFrame[variable].str.upper()993        if numbers=='remove':994            DataFrame[to_variable] = DataFrame[variable].str.replace('\d','')    995        if spchar=='remove':996            DataFrame[to_variable] = DataFrame[variable].str.replace('\W','')   997        if space=='remove':998            DataFrame[to_variable] = DataFrame[variable].str.replace('\s','')       999        if chars=='remove':1000            DataFrame[to_variable] = DataFrame[variable].str.replace('\w','') 1001    except:1002        print('ERROR in create_str_normalized_variable:\n{}'.format(traceback.format_exc()))1003        DataFrame[to_variable] = DataFrame[variable]1004    parameters = { 1005        'to_case':to_case, 1006        'chars':chars,1007        'numbers':numbers,1008        'spchar':spchar, 1009        'space':space1010    }1011    script_dict = generate_create_variable_task_script(type='transform_str', out_type='str', 1012                                                       include=False, operation='normalize', 1013                                                       source=variable, destination=to_variable, 1014                                                       parameters=parameters) 1015        1016    if return_script and return_variable:1017        return DataFrame, to_variable, script_dict1018    elif return_script:1019        return DataFrame, script_dict1020    elif return_variable:1021        return DataFrame, to_variable1022    else:1023        return DataFrame  1024def create_str_extract_variable(DataFrame, variable, pattern='\w+', case_sensitive=True, to_variable=None, return_variable=False, return_script=False): 1025    if to_variable==None:1026        to_variable = 'variableEXT'.format(variable)1027    try:1028        if case_sensitive:    1029            DataFrame[to_variable] = DataFrame[variable].str.extract('({})'.format(pattern))1030        else:1031            DataFrame[to_variable] = DataFrame[variable].str.extract('({})'.format(pattern), flags=re.IGNORECASE)1032    except:1033        print('ERROR in create_str_extract_variable:\n{}'.format(traceback.format_exc()))1034        DataFrame[to_variable] = DataFrame[variable]1035    parameters = { 1036        'pattern':pattern, 1037        'case_sensitive':case_sensitive1038    }1039    script_dict = generate_create_variable_task_script(type='transform_str', out_type='str', 1040                                                       include=False, operation='extract', 1041                                                       source=variable, destination=to_variable, 1042                                                       parameters=parameters) 1043        1044    if return_script and return_variable:1045        return DataFrame, to_variable, script_dict1046    elif return_script:1047        return DataFrame, script_dict1048    elif return_variable:1049        return DataFrame, to_variable1050    else:1051        return DataFrame 1052###############################################################################1053##[ CREATING FEATURES - MULTI VARIABLE ]#######################################      1054###############################################################################    1055def create_operation_mult_variable(DataFrame, expression_str='0', to_variable=None, return_variable=False, return_script=False):1056    if to_variable==None:1057        to_variable = '{}'.format(expression_str)1058    1059    try:1060        DataFrame[to_variable] = DataFrame.eval(expression_str)1061    except:1062        print('ERROR in create_operation_mult_variable:\n{}'.format(traceback.format_exc()))1063    parameters = { 'expression_str':expression_str}1064    script_dict = generate_create_variable_task_script(type='operation_mult', out_type='cnt', 1065                                                       include=False, operation='expression', 1066                                                       source=None, destination=to_variable, 1067                                                       parameters=parameters) 1068    if return_script and return_variable:1069        return DataFrame, to_variable, script_dict1070    elif return_script:1071        return DataFrame, script_dict1072    elif return_variable:1073        return DataFrame, to_variable1074    else:1075        return DataFrame  1076###############################################################################1077##[ CREATING FEATURES - SEQUENCE ORDER ]#######################################      1078###############################################################################1079def create_sequence_order_variable(DataFrame, variable1a, variable2a, variable1b, variable2b, output='binary', to_variable=None, return_variable=False, return_script=False):1080    if to_variable==None:1081        to_variable = '{}{}SEQ{}{}'.format(variable1a, variable2a, variable1b, variable2b)1082        1083    try:1084        DataFrame[to_variable] = DataFrame[variable] ########### NEED UPDATE !!!!1085    except:1086        print('ERROR in create_sequence_order_variable:\n{}'.format(traceback.format_exc()))1087        DataFrame[to_variable] = DataFrame[variable]1088    parameters = { 'output':output }1089    script_dict = generate_create_variable_task_script(type='sequence', out_type='cnt', 1090                                                       include=False, operation='seqorder', 1091                                                       source=[variable1a, variable2a, variable1b, variable2b], 1092                                                       destination=to_variable, 1093                                                       parameters=parameters) 1094        1095    if return_script and return_variable:1096        return DataFrame, to_variable, script_dict1097    elif return_script:1098        return DataFrame, script_dict1099    elif return_variable:1100        return DataFrame, to_variable1101    else:1102        return DataFrame  1103    1104###############################################################################1105##[ CREATING FEATURES - DIFFERENCES ]##########################################      1106############################################################################### 1107def create_numeric_difference_variable(DataFrame, variable1, variable2, multiplier=1, onerror=None, to_variable=None, return_variable=False, return_script=False):1108    if to_variable==None:1109        to_variable = '{}DIFF{}'.format(variable1, variable2)1110    1111    try:1112        DataFrame[variable1] = pd.to_numeric(DataFrame[variable1], errors='coerce')1113        DataFrame[variable2] = pd.to_numeric(DataFrame[variable2], errors='coerce')        1114        DataFrame[to_variable] = multiplier*(DataFrame[variable1] - DataFrame[variable2])1115    except:1116        DataFrame[to_variable] = None1117        print('Data Type Error in {}, {} : {} '.format(variable1, variable2, traceback.format_exc()))  1118    parameters = { 1119                    'multiplier':multiplier,1120                    'onerror': onerror1121    }1122    script_dict = generate_create_variable_task_script(type='comparison', out_type='cnt', 1123                                                       include=False, operation='numdiff', 1124                                                       source=[variable1, variable2], 1125                                                       destination=to_variable, 1126                                                       parameters=parameters) 1127    1128    if return_script and return_variable:1129        return DataFrame, to_variable, script_dict1130    elif return_script:1131        return DataFrame, script_dict1132    elif return_variable:1133        return DataFrame, to_variable1134    else:1135        return DataFrame 1136def create_numeric_ratio_variable(DataFrame, variable1, variable2, multiplier=1, onerror=None, to_variable=None, return_variable=False, return_script=False):1137    if to_variable==None:1138        to_variable = '{}DIV{}'.format(variable1, variable2)1139    1140    try:1141        DataFrame[variable1] = pd.to_numeric(DataFrame[variable1], errors='coerce')1142        DataFrame[variable2] = pd.to_numeric(DataFrame[variable2], errors='coerce')        1143        DataFrame[to_variable] = multiplier*(DataFrame[variable1]/DataFrame[variable2])1144    except:1145        DataFrame[to_variable] = None1146        print('Data Type Error in {}, {} : {} '.format(variable1, variable2, traceback.format_exc()))  1147    parameters = { 1148                    'multiplier':multiplier,1149                    'onerror': onerror1150    }1151    script_dict = generate_create_variable_task_script(type='comparison', out_type='cnt', 1152                                                       include=False, operation='ratio', 1153                                                       source=[variable1, variable2], 1154                                                       destination=to_variable, 1155                                                       parameters=parameters) 1156    1157    if return_script and return_variable:1158        return DataFrame, to_variable, script_dict1159    elif return_script:1160        return DataFrame, script_dict1161    elif return_variable:1162        return DataFrame, to_variable1163    else:1164        return DataFrame 1165    1166def create_date_difference_variable(DataFrame, variable1, variable2, to_variable=None, unit='day', onerror=None, return_variable=False, return_script=False):1167    if to_variable==None:1168        to_variable = '{}DIFF{}'.format(variable1,variable2)1169    1170    try:1171        DataFrame[variable1] = pd.to_datetime(DataFrame[variable1])1172        DataFrame[variable2] = pd.to_datetime(DataFrame[variable2])        1173        DataFrame[to_variable] = DataFrame[variable2] - DataFrame[variable1]1174        DataFrame[to_variable]=DataFrame[to_variable]/np.timedelta64(1,unit)1175    except:1176        DataFrame[to_variable] = None1177        print('Date Type Error in {}, {} : {} '.format(variable1, variable2, traceback.format_exc()))  1178    parameters = { 1179                    'unit':unit,1180                    'onerror': onerror1181    }1182    script_dict = generate_create_variable_task_script(type='comparison', out_type='cnt', 1183                                                       include=False, operation='datediff', 1184                                                       source=[variable1, variable2], 1185                                                       destination=to_variable, 1186                                                       parameters=parameters) 1187    1188    if return_script and return_variable:1189        return DataFrame, to_variable, script_dict1190    elif return_script:1191        return DataFrame, script_dict1192    elif return_variable:1193        return DataFrame, to_variable1194    else:1195        return DataFrame    1196def create_row_min_variable(DataFrame, variable1, variable2, to_variable=None, return_variable=False, return_script=False):1197    1198    if to_variable==None:1199        to_variable = '{}MIN{}'.format(variable1,variable2)1200    1201    try:1202        DataFrame[to_variable] = DataFrame[[variable1,variable2]].min(axis=1)1203    except:1204        DataFrame[to_variable] = None1205        print('Row min({}, {}) Error: {}'.format(variable1, variable2, traceback.format_exc()))  1206    parameters = {  }1207    script_dict = generate_create_variable_task_script(type='comparison', out_type='cnt', 1208                                                       include=False, operation='rowmin', 1209                                                       source=[variable1, variable2], 1210                                                       destination=to_variable, 1211                                                       parameters=parameters) 1212    1213    if return_script and return_variable:1214        return DataFrame, to_variable, script_dict1215    elif return_script:1216        return DataFrame, script_dict1217    elif return_variable:1218        return DataFrame, to_variable1219    else:1220        return DataFrame   1221    1222    1223def create_row_max_variable(DataFrame, variable1, variable2, to_variable=None, return_variable=False, return_script=False):1224    1225    if to_variable==None:1226        to_variable = '{}MAX{}'.format(variable1,variable2)1227    1228    try:1229        DataFrame[to_variable] = DataFrame[[variable1,variable2]].max(axis=1)1230    except:1231        DataFrame[to_variable] = None1232        print('Row max({}, {}) Error : {}'.format(variable1, variable2, traceback.format_exc()))  1233    parameters = {  }1234    script_dict = generate_create_variable_task_script(type='comparison', out_type='cnt', 1235                                                       include=False, operation='rowmax', 1236                                                       source=[variable1, variable2], 1237                                                       destination=to_variable, 1238                                                       parameters=parameters) 1239    1240    if return_script and return_variable:1241        return DataFrame, to_variable, script_dict1242    elif return_script:1243        return DataFrame, script_dict1244    elif return_variable:1245        return DataFrame, to_variable1246    else:1247        return DataFrame  1248    1249###############################################################################1250##[ CREATING FEATURES - STR COMPARISON ]#######################################      1251############################################################################### 1252def create_str_comparison_variable(DataFrame, variable1, variable2, to_variable=None, operation='levenshtein', parameters={}, return_variable=False, return_script=False):    1253    if to_variable==None:1254        to_variable = '{}SIM{}'.format(variable1,variable2)1255        1256    try:1257        case_sensitive = parameters['case_sensitive']1258    except:1259        case_sensitive = True1260        1261    if operation=='levenshtein':1262        try:1263            normalize = parameters['normalize']1264        except:1265            normalize = False1266        DataFrame[to_variable] = np.vectorize(damerau_levenshtein_distance)(DataFrame[variable1], DataFrame[variable2], case_sensitive, normalize)1267    elif operation=='jaccard':1268        try:1269            method=parameters['method']1270        except:1271            method='substring'1272        try:1273            min_length=parameters['min_length']1274        except:1275            min_length=11276        try:1277            max_length=parameters['max_length']1278        except:    1279            max_length=np.inf1280            1281        DataFrame[to_variable] = np.vectorize(jaccard_index)(DataFrame[variable1], DataFrame[variable2], method, case_sensitive, min_length, max_length)1282    script_dict = generate_create_variable_task_script(type='comparison_str', out_type='cnt', 1283                                                       include=False, operation=operation, 1284                                                       source=[variable1, variable2], 1285                                                       destination=to_variable, 1286                                                       parameters=parameters)1287    1288    if return_script and return_variable:1289        return DataFrame, to_variable, script_dict1290    elif return_script:1291        return DataFrame, script_dict1292    elif return_variable:1293        return DataFrame, to_variable1294    else:1295        return DataFrame      1296###############################################################################1297##[ CREATING FEATURES - BINARY VARIABLES]######################################      1298############################################################################### 1299        1300def create_binary_variable(DataFrame, to_variable, condition_str, default=0, null=0, return_variable=False, return_script=False):1301    1302    if to_variable==None:1303        to_variable = '{}'.format(condition_str)1304    try:    1305        DataFrame[to_variable] = DataFrame.eval(condition_str).astype('int8').fillna(null)1306        DataFrame.loc[DataFrame[to_variable].isna(), to_variable] = default1307    except:1308        print('Error in creating the binary variable {}:\n{}\n'.format(condition_str, traceback.format_exc()))1309        print('Check variable rule set !')1310    parameters = { 1311                    'condition_str':condition_str,1312                    'default': default,1313                    'null': null1314    }1315    script_dict = generate_create_variable_task_script(type='condition', out_type='bin', 1316                                                       include=False, operation='condition', 1317                                                       source=None, 1318                                                       destination=to_variable, 1319                                                       parameters=parameters) 1320            1321    if return_script and return_variable:1322        return DataFrame, to_variable, script_dict1323    elif return_script:1324        return DataFrame, script_dict1325    elif return_variable:1326        return DataFrame, to_variable1327    else:1328        return DataFrame 1329    1330###############################################################################1331##[ CREATING FEATURES - CATEGORY LABELS]#######################################      1332############################################################################### 1333  1334def num_label_to_value(num_label):1335    units = {'p':0.000000000001,1336        'n':0.000000001,1337        'u':0.000001,1338        'm':0.001,1339        'c':0.01,1340        'd':0.1,1341        '':1,1342        'D':10,1343        'H':100,1344        'K':1000,1345        'M':1000000,1346        'G':1000000000,1347        'T':1000000000000,1348        'P':1000000000000000,1349        'E':1000000000000000000,1350        'INF':np.inf        1351        }1352    try:1353        sign, inf, num, unit = re.findall('^([-]?)((\d+)([pnumcdDHKMGTPE]?)|INF)$', num_label.rstrip().lstrip())[0]1354        if inf=='INF':1355            value = int('{}1'.format(sign))*np.inf1356        else:1357            value = int('{}1'.format(sign))*float(num)*units[unit]1358    except:1359        print('vnum_label_value failed !\n{}'.format(traceback.format_exc()))1360        value = None1361    return value1362def edge_labels_to_values(edge_labels, left_inclusive=False, right_inclusive=False):1363    """1364    Parameters1365    ----------1366    edge_labels : str []1367        Edge labels with number unit as postfix1368            'p':0.000000000001,1369            'n':0.000000001,1370            'u':0.000001,1371            'm':0.001,1372            'c':0.01,1373            'd':0.1,1374            '':1,1375            'D':10,1376            'H':100,1377            'K':1000,1378            'M':1000000,1379            'G':1000000000,1380            'T':1000000000000,1381            'P':1000000000000000,1382            'INF':np.inf        1383    left_inclusive : bool, default False1384        Include left edge1385    right_inclusive : bool, default False1386        Include right edge1387    1388    Returns1389    -------1390    edge_values : numeric []1391    bin_labels : str []1392    """ 1393    edge_values = []1394    bin_labels = []1395    n_bins = len(edge_labels)-11396    i=01397    for i in range(n_bins):        1398        l_bracket = '(' if (i==0 and edge_labels[i]=='-INF') or (not left_inclusive) else '['1399        r_bracket = ')' if (i==n_bins-1 and edge_labels[i+1]=='INF') or (not right_inclusive) else ']'1400        edge_values.append(num_label_to_value(edge_labels[i]))1401        bin_labels.append('{}_{}{},{}{}'.format(i+1, l_bracket, edge_labels[i], edge_labels[i+1], r_bracket))1402    edge_values.append(num_label_to_value(edge_labels[n_bins]))1403    return edge_values,bin_labels1404###############################################################################1405##[ CREATING FEATURES - CATEGORY]##############################################      1406###############################################################################     1407    1408def create_categorical_variable(DataFrame, variable, to_variable, labels_str, right_inclusive=True, default='OTHER', null='NA', return_variable=False, return_script=False):1409    1410    if to_variable==None:1411        to_variable = '{}GRP'.format(variable)1412        1413    try:1414        default_ = '0_{}'.format(default)1415        null_ = '0_{}'.format(null)1416    except:1417        default_ = '0_Other'1418        null_ = '0_NA'1419    edge_values, bin_labels = edge_labels_to_values(labels_str, left_inclusive=not right_inclusive, right_inclusive=right_inclusive)1420    1421    try:    1422        DataFrame[to_variable] = pd.cut(DataFrame[variable], bins=edge_values, labels=bin_labels, right=right_inclusive, include_lowest=True).astype('object')1423    except:1424        DataFrame[to_variable] = null_1425    DataFrame.loc[DataFrame[variable].isna(), to_variable] = null_1426    DataFrame.loc[DataFrame[to_variable].isna(), to_variable] = default_1427    parameters = { 1428                    'labels_str':labels_str,1429                    'right_inclusive': right_inclusive,1430                    'default': default,1431                    'null': null1432    }1433    script_dict = generate_create_variable_task_script(type='category', out_type='cat', 1434                                                       include=False, operation='bucket', 1435                                                       source=variable, 1436                                                       destination=to_variable, 1437                                                       parameters=parameters) 1438        1439    if return_script and return_variable:1440        return DataFrame, to_variable, script_dict1441    elif return_script:1442        return DataFrame, script_dict1443    elif return_variable:1444        return DataFrame, to_variable1445    else:1446        return DataFrame 1447def merge_categories(DataFrame, variable, to_variable, values, group_value, return_variable=False, return_script=False):1448    if to_variable==None:1449        to_variable = variable1450    1451    try:    1452        DataFrame[to_variable] = DataFrame[variable].replace(to_replace=values, value=group_value)1453    except:1454        print('ERROR in creating the categorical variable merge {}:\n{}\n'.format(variable, traceback.format_exc()))1455        print('Check variable rule set !')1456        1457    parameters = { 1458                    'group_value':group_value,1459                    'values': values1460    }1461    script_dict = generate_create_variable_task_script(type='category_merge', out_type='cat', 1462                                                       include=False, operation='catmerge', 1463                                                       source=variable, 1464                                                       destination=to_variable, 1465                                                       parameters=parameters)     1466    if return_script and return_variable:1467        return DataFrame, to_variable, script_dict1468    elif return_script:1469        return DataFrame, script_dict1470    elif return_variable:1471        return DataFrame, to_variable1472    else:1473        return DataFrame  1474    1475###############################################################################1476##[ CREATING FEATURES - ENTITY (DICTIONARY) ]##################################      1477###############################################################################1478def create_entity_variable(DataFrame, variable, to_variable, dictionary, match_type=None, default='OTHER', null='NA', return_variable=False, return_script=False):1479    if to_variable==None:1480        to_variable = '{}GRP'.format(variable)1481              1482    if to_variable != variable:1483        DataFrame[to_variable] = None1484    1485    for entity in reversed(dictionary): 1486        try:1487            case=entity['case']1488        except:1489            case=True1490                1491        if (match_type=='values') or ('values' in  entity.keys()):1492            if case==True:1493                DataFrame.loc[DataFrame[variable].isin(entity['values']), to_variable] = entity['entity']1494            else:1495                values = [x.lower() for x in entity['values']] 1496                DataFrame.loc[DataFrame[variable].str.lower().isin(values), to_variable] = entity['entity']1497        elif (match_type=='pattern') or ('pattern' in entity.keys()):1498            DataFrame.loc[DataFrame[variable].fillna('').str.contains(pat=entity['pattern'], case=case), to_variable] = entity['entity']1499        else:1500            print('Entity {} not created !'.format(entity))1501            1502    DataFrame.loc[DataFrame[variable].isna(), to_variable] = null1503    DataFrame.loc[DataFrame[to_variable].isna(), to_variable] = default1504    parameters = { 1505                    'match_type':match_type,1506                    'dictionary': dictionary,1507                    'default': default,1508                    'null': null1509    }1510    script_dict = generate_create_variable_task_script(type='entity', out_type='cat', 1511                                                       include=False, operation='dictionary', 1512                                                       source=variable, 1513                                                       destination=to_variable, 1514                                                       parameters=parameters)  1515    1516    if return_script and return_variable:1517        return DataFrame, to_variable, script_dict1518    elif return_script:1519        return DataFrame, script_dict1520    elif return_variable:1521        return DataFrame, to_variable1522    else:1523        return DataFrame 1524def create_value_pair_variable(DataFrame, variable1, variable2, to_variable, dictionary, match_type=None, default='OTHER', null='NA', return_variable=False, return_script=False):1525    if to_variable==None:1526        to_variable = '{}GRP{}'.format(variable1, variable2)1527              1528    if to_variable != variable1 and to_variable != variable2 :1529        DataFrame[to_variable] = None1530    1531    for entity in reversed(dictionary): 1532        try:1533            case=entity['case']1534        except:1535            case=True1536        1537        try:1538            opperator = entity['opperator']1539        except:1540            opperator='AND'1541            1542        if (match_type=='values') or ('values' in  entity.keys()):1543            if case==True:1544                if opperator=='AND':1545                    DataFrame.loc[(DataFrame[variable1]==entity['values'][0]) & (DataFrame[variable2]==entity['values'][1]), to_variable] = entity['entity']1546                elif opperator=='OR':1547                    DataFrame.loc[(DataFrame[variable1]==entity['values'][0]) | (DataFrame[variable2]==entity['values'][1]), to_variable] = entity['entity']1548                elif opperator=='NOT':1549                    DataFrame.loc[(DataFrame[variable1]==entity['values'][0]) & (DataFrame[variable2]!=entity['values'][1]), to_variable] = entity['entity']1550                elif opperator=='^NOT':1551                    DataFrame.loc[(DataFrame[variable1]!=entity['values'][0]) & (DataFrame[variable2]==entity['values'][1]), to_variable] = entity['entity']1552            else:1553                values = [x.lower() for x in entity['values']] 1554                if opperator=='AND':1555                    DataFrame.loc[(DataFrame[variable1].str.lower()==values[0]) & (DataFrame[variable2].str.lower()==values[1]), to_variable] = entity['entity']1556                elif opperator=='OR':1557                    DataFrame.loc[(DataFrame[variable1].str.lower()==values[0]) | (DataFrame[variable2].str.lower()==values[1]), to_variable] = entity['entity']1558                elif opperator=='NOT':1559                    DataFrame.loc[(DataFrame[variable1].str.lower()==values[0]) & (DataFrame[variable2].str.lower()!=values[1]), to_variable] = entity['entity']1560                elif opperator=='^NOT':1561                    DataFrame.loc[(DataFrame[variable1].str.lower()!=values[0]) & (DataFrame[variable2].str.lower()==values[1]), to_variable] = entity['entity']1562                                  1563        elif (match_type=='pattern') or ('pattern' in entity.keys()):1564            if opperator=='AND':1565                DataFrame.loc[(DataFrame[variable1].fillna('').str.contains(pat=entity['values'][0], case=case)) & (DataFrame[variable2].fillna('').str.contains(pat=entity['values'][1], case=case)), to_variable] = entity['entity']1566            elif opperator=='OR':1567                DataFrame.loc[(DataFrame[variable1].fillna('').str.contains(pat=entity['values'][0], case=case)) | (DataFrame[variable2].fillna('').str.contains(pat=entity['values'][1], case=case)), to_variable] = entity['entity']                                  1568        else:1569            print('Entity {} not created !'.format(entity))1570            1571    DataFrame.loc[(DataFrame[variable1].isna()) & (DataFrame[variable2].isna()), to_variable] = null1572    DataFrame.loc[DataFrame[to_variable].isna(), to_variable] = default1573    parameters = { 1574                    'match_type':match_type,1575                    'dictionary': dictionary,1576                    'default': default,1577                    'null': null1578    }1579    script_dict = generate_create_variable_task_script(type='entity', out_type='cat', 1580                                                       include=False, operation='valuepairs', 1581                                                       source=[variable1, variable2],1582                                                       destination=to_variable, 1583                                                       parameters=parameters)  1584    1585    if return_script and return_variable:1586        return DataFrame, to_variable, script_dict1587    elif return_script:1588        return DataFrame, script_dict1589    elif return_variable:1590        return DataFrame, to_variable1591    else:1592        return DataFrame 1593    1594###############################################################################1595##[ CREATING FEATURES - PAIR EQUALITY ]########################################      1596###############################################################################1597def create_pair_equality_variable(DataFrame, variable1, variable2, to_variable, magnitude=False, case=True, return_variable=False, return_script=False):1598    if to_variable==None:1599        to_variable = '{}CMP{}'.format(variable1,variable2)1600        1601    DataFrame.loc[(DataFrame[variable1]==DataFrame[variable2]), to_variable] = 'EQ'1602    DataFrame.loc[(DataFrame[variable1]!=DataFrame[variable2]), to_variable] = 'DF'1603    DataFrame.loc[(DataFrame[variable1].isna()) | (DataFrame[variable2].isna()), to_variable] = 'ON'1604    DataFrame.loc[(DataFrame[variable1].isna()) & (DataFrame[variable2].isna()), to_variable] = 'BN'1605    parameters = { 1606                    'magnitude':magnitude,1607                    'case': case1608    }1609    script_dict = generate_create_variable_task_script(type='pair_equality', out_type='cat', 1610                                                       include=False, operation='pairequality', 1611                                                       source=[variable1, variable2], 1612                                                       destination=to_variable, 1613                                                       parameters=parameters)  1614    1615    if return_script and return_variable:1616        return DataFrame, to_variable, script_dict1617    elif return_script:1618        return DataFrame, script_dict1619    elif return_variable:1620        return DataFrame, to_variable1621    else:1622        return DataFrame 1623###############################################################################1624        1625###############################################################################1626##[ CREATING FEATURES TASK - TARGET ]##########################################      1627###############################################################################1628def create_target_variable_task(DataFrame, rule_set, return_variable=False, return_script=False):1629    to_variable = rule_set['variables']['destination']1630    operation = rule_set['operation']    1631    parameters = rule_set['parameters']1632    1633    target_condition_str = parameters['condition_str']1634    default = parameters['default']1635    null = parameters['null']1636    1637    DataFrame, to_variable, script_dict = set_binary_target(DataFrame, condition_str=target_condition_str, 1638                                               to_variable=to_variable, default=default, null=null, return_variable=True, return_script=True)1639    if return_script and return_variable:1640        return DataFrame, to_variable, script_dict1641    elif return_script:1642        return DataFrame, script_dict1643    elif return_variable:1644        return DataFrame, to_variable1645    else:1646        return DataFrame 1647        1648###############################################################################1649##[ CREATING FEATURES TASK - TRANSFORM ]#######################################      1650###############################################################################1651def create_transformed_variable_task(DataFrame, rule_set, return_variable=False, return_script=False):1652    variable = rule_set['variables']['source']1653    to_variable = rule_set['variables']['destination']1654    operation = rule_set['operation']    1655    parameters = rule_set['parameters']1656    1657    if operation=='normalize':1658        method = rule_set['parameters']['method']        1659        DataFrame, to_variable, script_dict = create_normalized_variable(DataFrame, variable, method=method, parameters=parameters, to_variable=to_variable, return_variable=True, return_script=True)1660    elif operation=='datepart':1661        part = rule_set['parameters']['part']  1662        DataFrame, to_variable, script_dict = create_datepart_variable(DataFrame, variable, part=part, to_variable=to_variable, return_variable=True, return_script=True)1663    elif operation=='dateadd':1664        unit = rule_set['parameters']['unit']  1665        value = rule_set['parameters']['value']  1666        DataFrame, to_variable, script_dict = create_dateadd_variable(DataFrame, variable, unit=unit, value=value, to_variable=to_variable, return_variable=True, return_script=True)1667    elif operation=='log':1668        base = rule_set['parameters']['base']  1669        DataFrame, to_variable, script_dict = create_log_variable(DataFrame, variable, base=base, to_variable=to_variable, return_variable=True, return_script=True)1670    elif operation=='exponent':1671        base = rule_set['parameters']['base']  1672        DataFrame, to_variable, script_dict = create_exponent_variable(DataFrame, variable, base=base, to_variable=to_variable, return_variable=True, return_script=True)1673    elif operation=='exponent':1674        a = rule_set['parameters']['a']  1675        b = rule_set['parameters']['b'] 1676        DataFrame, to_variable, script_dict = create_segmented_variable(DataFrame, variable, a=a, b=b, to_variable=to_variable, return_variable=True, return_script=True)1677    else:1678        pass # other transformations to be implemented1679        1680    if return_script and return_variable:1681        return DataFrame, to_variable, script_dict1682    elif return_script:1683        return DataFrame, script_dict1684    elif return_variable:1685        return DataFrame, to_variable1686    else:1687        return DataFrame  1688def create_str_transformed_variable_task(DataFrame, rule_set, return_variable=False, return_script=False):1689    variable = rule_set['variables']['source']1690    to_variable = rule_set['variables']['destination']1691    operation = rule_set['operation']    1692    parameters = rule_set['parameters']1693    if operation=='strcount':1694        pattern = parameters['pattern']1695        case_sensitive = parameters['case_sensitive']1696        DataFrame, to_variable, script_dict = create_str_count_variable(DataFrame, variable, pattern=pattern, case_sensitive=case_sensitive, to_variable=to_variable, return_variable=True, return_script=True)        1697    elif operation=='normalize':1698        to_case = parameters['to_case']1699        chars = parameters['chars']1700        numbers = parameters['numbers'] 1701        spchar = parameters['spchar']1702        space = parameters['space']        1703        DataFrame, to_variable, script_dict = create_str_normalized_variable(DataFrame, variable, 1704                                                                to_case=to_case, 1705                                                                chars=chars, 1706                                                                numbers=numbers, 1707                                                                spchar=spchar, 1708                                                                space=space, 1709                                                                to_variable=None, return_variable=False, return_script=True)1710    elif operation=='extract':1711        pattern = parameters['pattern']1712        case_sensitive = parameters['case_sensitive']1713        DataFrame, to_variable, script_dict = create_str_extract_variable(DataFrame, variable, pattern=pattern, 1714                                                             case_sensitive=case_sensitive, 1715                                                             to_variable=to_variable, return_variable=True, return_script=True)   1716    if return_script and return_variable:1717        return DataFrame, to_variable, script_dict1718    elif return_script:1719        return DataFrame, script_dict1720    elif return_variable:1721        return DataFrame, to_variable1722    else:1723        return DataFrame      1724###############################################################################1725##[ CREATING FEATURES TASK - MLTI VARIAVLE ]###################################      1726###############################################################################1727        1728def create_operation_mult_variable_task(DataFrame, rule_set, return_variable=False, return_script=False):1729    variable = rule_set['variables']['source']1730    to_variable = rule_set['variables']['destination']1731    operation = rule_set['operation']    1732    parameters = rule_set['parameters']    1733    expression_str = parameters['expression_str']1734    1735    DataFrame, to_variable, script_dict = create_operation_mult_variable(DataFrame, expression_str=expression_str, 1736                                                            to_variable=to_variable, return_variable=True, return_script=True)1737    1738    if return_script and return_variable:1739        return DataFrame, to_variable, script_dict1740    elif return_script:1741        return DataFrame, script_dict1742    elif return_variable:1743        return DataFrame, to_variable1744    else:1745        return DataFrame 1746###############################################################################1747##[ CREATING FEATURES TASK - SEQUENCE ORDER ]##################################      1748###############################################################################1749def create_sequence_order_variable_task(DataFrame, rule_set, return_variable=False, return_script=False):1750    variable1a = rule_set['variables']['source1a']1751    variable2a = rule_set['variables']['source2a']1752    variable1b = rule_set['variables']['source1b']1753    variable2b = rule_set['variables']['source2b']1754    to_variable = rule_set['variables']['destination']1755    1756    DataFrame, to_variable, script_dict = create_sequence_order_variable(DataFrame, variable1a, variable2a, variable1b, variable2b, output='binary', 1757                                                            to_variable=to_variable, return_variable=True, return_script=True)1758        1759    if return_script and return_variable:1760        return DataFrame, to_variable, script_dict1761    elif return_script:1762        return DataFrame, script_dict1763    elif return_variable:1764        return DataFrame, to_variable1765    else:1766        return DataFrame       1767    1768###############################################################################1769##[ CREATING FEATURES TASK - COMPARISON ]######################################      1770###############################################################################1771def create_comparison_variable_task(DataFrame, rule_set, return_variable=False, return_script=False):1772    variable1 = rule_set['variables']['source1']1773    variable2 = rule_set['variables']['source2']1774    to_variable = rule_set['variables']['destination']1775    operation = rule_set['operation']1776    parameters = rule_set['parameters']1777    1778    try:1779        multiplier = parameters['multiplier']1780    except:1781        multiplier=1        1782        1783    try:1784        unit = parameters['unit']1785    except:1786        unit = 'D'1787    onerror = None # parameters['onerror']1788    1789    if operation=='numdiff':        1790        DataFrame, to_variable, script_dict = create_numeric_difference_variable(DataFrame, variable1, variable2, multiplier=multiplier, onerror=onerror, to_variable=to_variable, return_variable=True, return_script=True)1791    elif operation=='datediff':1792        DataFrame, to_variable, script_dict = create_date_difference_variable(DataFrame, variable1, variable2, unit=unit, onerror=onerror, to_variable=to_variable, return_variable=True, return_script=True)1793    elif operation=='rowmin':1794        DataFrame, to_variable, script_dict = create_row_min_variable(DataFrame, variable1, variable2, to_variable=to_variable, return_variable=True, return_script=True)1795    elif operation=='rowmax':1796        DataFrame, to_variable, script_dict = create_row_max_variable(DataFrame, variable1, variable2, to_variable=to_variable, return_variable=True, return_script=True)    1797    1798    if return_script and return_variable:1799        return DataFrame, to_variable, script_dict1800    elif return_script:1801        return DataFrame, script_dict1802    elif return_variable:1803        return DataFrame, to_variable1804    else:1805        return DataFrame       1806    1807def create_str_comparison_variable_task(DataFrame, rule_set, return_variable=False, return_script=False):1808    variable1 = rule_set['variables']['source1']1809    variable2 = rule_set['variables']['source2']1810    to_variable = rule_set['variables']['destination']1811    operation = rule_set['operation']    1812    parameters = rule_set['parameters']1813                1814    DataFrame, to_variable, script_dict = create_str_comparison_variable(DataFrame, variable1=variable1, variable2=variable2, to_variable=to_variable, operation=operation, parameters=parameters, 1815                                                                         return_variable=True, return_script=True)1816    1817    if return_script and return_variable:1818        return DataFrame, to_variable, script_dict1819    elif return_script:1820        return DataFrame, script_dict1821    elif return_variable:1822        return DataFrame, to_variable1823    else:1824        return DataFrame 1825###############################################################################1826##[ CREATING FEATURES TASK - BINARY VARIABLE ]#################################      1827###############################################################################1828def create_binary_variable_task(DataFrame, rule_set, return_variable=False, return_script=False):1829    #variable = rule_set['variables']['source']1830    to_variable = rule_set['variables']['destination']1831    parameters = rule_set['parameters']1832    condition_str = parameters['condition_str']1833    default = parameters['default']1834    null = parameters['null']1835    1836    DataFrame, to_variable, script_dict = create_binary_variable(DataFrame, to_variable, condition_str, default, null, 1837                                                                 return_variable=True, return_script=True)1838    1839    if return_script and return_variable:1840        return DataFrame, to_variable, script_dict1841    elif return_script:1842        return DataFrame, script_dict1843    elif return_variable:1844        return DataFrame, to_variable1845    else:1846        return DataFrame 1847###############################################################################1848##[ CREATING FEATURES TASK - CATEGORY VARIABLE ]###############################      1849###############################################################################  1850def create_categorical_variable_task(DataFrame, rule_set, return_variable=False, return_script=False):  1851    variable = rule_set['variables']['source']1852    to_variable = rule_set['variables']['destination']1853    operation = rule_set['operation']    1854    parameters = rule_set['parameters']1855    labels_str = parameters['labels_str']1856    right_inclusive = parameters['right_inclusive'] 1857    default = parameters['default']1858    null = parameters['null']1859    1860    DataFrame, to_variable, script_dict = create_categorical_variable(DataFrame, variable, to_variable, labels_str, right_inclusive, default, null, 1861                                                                      return_variable=True, return_script=True)1862    1863    if return_script and return_variable:1864        return DataFrame, to_variable, script_dict1865    elif return_script:1866        return DataFrame, script_dict1867    elif return_variable:1868        return DataFrame, to_variable1869    else:1870        return DataFrame 1871###############################################################################1872##[ CREATING FEATURES TASK - ENTITY VARIABLE ]#################################      1873###############################################################################         1874def create_entity_variable_task(DataFrame, rule_set, return_variable=False, return_script=False):1875    1876    to_variable = rule_set['variables']['destination']1877    parameters = rule_set['parameters']1878    match_type = parameters['match_type']1879    dictionary = parameters['dictionary'] 1880    default = parameters['default']1881    null = parameters['null']1882    operation = rule_set['operation']1883    if operation == 'dictionary':1884        variable = rule_set['variables']['source']                                                               1885        DataFrame, to_variable, script_dict = create_entity_variable(DataFrame, variable=variable, to_variable=to_variable, 1886                                                                     dictionary=dictionary, match_type=match_type, default=default, null=null, 1887                                                                     return_variable=True, return_script=True)1888    elif operation == 'valuepairs':1889        variable1 = rule_set['variables']['source1'] 1890        variable2 = rule_set['variables']['source2'] 1891        DataFrame, to_variable, script_dict = create_value_pair_variable(DataFrame, variable1, variable2, to_variable, 1892                                                                     dictionary, match_type=None, default='OTHER', null='NA', 1893                                                                     return_variable=True, return_script=True)1894    if return_script and return_variable:1895        return DataFrame, to_variable, script_dict1896    elif return_script:1897        return DataFrame, script_dict1898    elif return_variable:1899        return DataFrame, to_variable1900    else:1901        return DataFrame       1902###############################################################################1903##[ CREATING FEATURES TASK - PAIR EQUALITY ]###################################      1904############################################################################### 1905def create_pair_equality_variable_task(DataFrame, rule_set, return_variable=False, return_script=False): 1906    variable1 = rule_set['variables']['source1']1907    variable2 = rule_set['variables']['source2']1908    to_variable = rule_set['variables']['destination']1909    parameters = rule_set['parameters']1910    try:1911        magnitude = parameters['magnitude']1912    except:1913        magnitude = 11914    case = parameters['case']1915    1916    DataFrame, to_variable, script_dict = create_pair_equality_variable(DataFrame, variable1=variable1, variable2=variable2, to_variable=to_variable, magnitude=magnitude, case=case, 1917                                                                        return_variable=True, return_script=True)1918    1919    if return_script and return_variable:1920        return DataFrame, to_variable, script_dict1921    elif return_script:1922        return DataFrame, script_dict1923    elif return_variable:1924        return DataFrame, to_variable1925    else:1926        return DataFrame   1927###############################################################################1928##[ CREATING FEATURES TASK - MERGE CATEGORY ]##################################      1929###############################################################################1930def merge_categories_task(DataFrame, rule_set, return_variable=False, return_script=False):1931    variable = rule_set['variables']['source']1932    to_variable = rule_set['variables']['destination']1933    values = rule_set['parameters']['values']1934    group_value = rule_set['parameters']['group_value']1935    1936    DataFrame, to_variable, script_dict =  merge_categories(DataFrame, variable=variable, to_variable=to_variable, values=values, group_value=group_value, 1937                                                            return_variable=True, return_script=True)   1938    if return_script and return_variable:1939        return DataFrame, to_variable, script_dict1940    elif return_script:1941        return DataFrame, script_dict1942    elif return_variable:1943        return DataFrame, to_variable1944    else:1945        return DataFrame 1946###############################################################################1947        1948###############################################################################1949##[ ENCODER ]##################################################################      1950############################################################################### 1951def to_one_hot_encode(DataFrame, category_variables=[], binary_variables=[], target_variable='target', target_type='binary'):1952    # TO DO: If target type is 'multi' apply one hot encoding to target1953    feature_variables = []1954    try:1955        VariablesDummies = pd.get_dummies(DataFrame[category_variables]).astype('int8')1956        dummy_variables = list(VariablesDummies.columns.values)1957        DataFrame[dummy_variables] = VariablesDummies1958    except:1959        print('Category columns {} does not specified nor exists'.format(category_variables))1960        1961    try:1962        DataFrame[binary_variables] = DataFrame[binary_variables].astype('int8')1963    except:1964        print('Binary columns {} does not specified nor exists'.format(binary_variables))1965    1966    try:          1967        feature_variables = binary_variables+dummy_variables1968    except:1969        print('Error in creating feature variables.')1970        1971    return DataFrame, feature_variables, target_variable1972###############################################################################1973##[ ML MODEL DRIVER ]##########################################################      1974###############################################################################     1975def load_data_task(load_data_dict, return_name=False):1976    """1977    Parameters1978    ----------1979    load_data_dict: dict1980    e.g.:   {1981        	  "type": "csv",1982        	  "location": "local",1983        	  "workclass": "Private",1984        	  "source": {"path":"C:/Projects/Data/incomedata.csv", "separator":",", "encoding":null},1985        	  "auth": None,1986        	  "query": None,1987        	  "limit": None1988            }1989    1990    Returns1991    -------1992    DataFrame: pandas.DataFrame1993    data_name: str1994    """    1995    import json1996    if type(load_data_dict)==dict:1997        pass1998    else:1999        try:2000            load_data_dict = json.loads(load_data_dict) 2001        except:2002            print('ERROR in loading data:{}\n {}'.format(load_data_dict, traceback.format_exc()))  2003    2004    data_name = load_data_dict['data_name']2005        2006    if load_data_dict['type']=='csv':2007        DataFrame = read_data_csv(2008                file=load_data_dict['source']['path'], 2009                separator=load_data_dict['source']['separator'], 2010                encoding=load_data_dict['source']['encoding']2011                )2012    elif load_data_dict['type']=='pickle':2013        DataFrame = read_data_pickle(2014                file=load_data_dict['source']['path'], 2015                compression =load_data_dict['source']['compression']2016                )    2017    elif load_data_dict['type']=='sql':2018        DataFrame = read_data_sql(2019                query=load_data_dict['query'], 2020                server=load_data_dict['source']['server'], 2021                database=load_data_dict['source']['database'],2022                auth=load_data_dict['auth']2023                )    2024    else:2025        print("No valid data source provided!")2026        DataFrame = pd.DataFrame()	2027    # Add ID column2028    DataFrame = add_identity_column(DataFrame, id_label='ID', start=1, increment=1)2029    # Clean column names2030    DataFrame = clean_column_names(DataFrame, replace='')2031        2032    if return_name:  2033        return DataFrame, data_name2034    else:2035        return DataFrame2036###############################################################################2037def create_variable_task(DataFrame, create_variable_task_dict=None, return_extra=False, return_script=False):2038    """2039    Interface function for single variable operation2040    Parameters2041    ----------2042    DataFrame: pandas.DataFrame2043    create_variable_task_dict : dict or JSON2044    return_extra : bool, default False2045        Returns variable_class and include if True2046    2047    Returns2048    -------2049    DataFrame: pandas.DataFrame2050    data_name: str2051    variable_class : str, optional2052    include: bool, optional2053    """    2054    import json2055    if type(create_variable_task_dict)==dict:2056        pass2057    else:2058        try:2059            create_variable_task_dict = json.loads(create_variable_task_dict) 2060        except:2061            print('ERROR in creating variable:{}\n {}'.format(create_variable_task_dict, traceback.format_exc()))  2062            2063    rule_set = {2064        'operation':create_variable_task_dict['operation'],2065        'variables':create_variable_task_dict['variables'],2066        'parameters':create_variable_task_dict['parameters']2067    }2068    out_type = create_variable_task_dict['out_type']2069    include = create_variable_task_dict['include']    2070    try:2071        if create_variable_task_dict['type']=='target':2072            DataFrame, output_variable, script_dict  = create_target_variable_task(DataFrame, rule_set, return_variable=True, return_script=True)      2073        if create_variable_task_dict['type']=='transform':2074            DataFrame, output_variable, script_dict  = create_transformed_variable_task(DataFrame, rule_set, return_variable=True, return_script=True)  2075        elif create_variable_task_dict['type']=='str_transform':2076            DataFrame, output_variable, script_dict  = create_str_transformed_variable_task(DataFrame, rule_set, return_variable=True, return_script=True)          2077        elif create_variable_task_dict['type']=='operation_mult':2078            DataFrame, output_variable, script_dict  = create_operation_mult_variable_task(DataFrame, rule_set, return_variable=True, return_script=True)           2079        elif create_variable_task_dict['type']=='seq_order':2080            DataFrame, output_variable, script_dict  = create_sequence_order_variable_task(DataFrame, rule_set, return_variable=True, return_script=True)2081        elif create_variable_task_dict['type']=='comparison':2082            DataFrame, output_variable, script_dict  = create_comparison_variable_task(DataFrame, rule_set, return_variable=True, return_script=True)     2083        elif create_variable_task_dict['type']=='str_comparison':2084            DataFrame, output_variable, script_dict  = create_str_comparison_variable_task(DataFrame, rule_set, return_variable=True, return_script=True)    2085        elif create_variable_task_dict['type']=='condition':2086            DataFrame, output_variable, script_dict  = create_binary_variable_task(DataFrame, rule_set, return_variable=True, return_script=True)     2087        elif create_variable_task_dict['type']=='category':2088            DataFrame, output_variable, script_dict  = create_categorical_variable_task(DataFrame, rule_set, return_variable=True, return_script=True)    2089        elif create_variable_task_dict['type']=='entity':2090            DataFrame, output_variable, script_dict  = create_entity_variable_task(DataFrame, rule_set, return_variable=True, return_script=True)    2091        elif create_variable_task_dict['type']=='pair_equality':2092            DataFrame, output_variable, script_dict  = create_pair_equality_variable_task(DataFrame, rule_set, return_variable=True, return_script=True)   2093        elif create_variable_task_dict['type']=='category_merge':2094            DataFrame, output_variable, script_dict  = merge_categories_task(DataFrame, rule_set, return_variable=True, return_script=True)   2095        else:2096            output_variable= None    2097            out_type = None2098            include = False2099            script_dict= {2100                    "type": "",2101                    "out_type":"",2102                    "include": False,2103                    "operation": "",2104                    "variables": {2105                        "source": "",2106                        "destination": None2107                    },2108                    "parameters": {                2109                    }2110            }2111    except:2112        output_variable= None    2113        out_type = None2114        include = False2115        script_dict= {2116                "type": "",2117                "out_type":"",2118                "include": False,2119                "operation": "",2120                "variables": {2121                    "source": "",2122                    "destination": None2123                },2124                "parameters": {                2125                }2126        }        2127    2128    if return_script and return_extra:2129        return DataFrame, output_variable, out_type, include, script_dict2130    if return_script:2131        return DataFrame, script_dict2132    if return_extra:    2133        return DataFrame, output_variable, out_type, include   2134    else:2135        return DataFrame, output_variable2136def setup_variables_task(DataFrame, variables_setup_dict, return_script=False):2137    """2138    Parameters2139    ----------2140    DataFrame: pandas.DataFrame2141    variables_setup_dict: json or dict2142   2143    2144    Returns2145    -------2146    DataFrame: pandas.DataFrame2147    category_variables: list(str)2148    binary_variables: list(str)2149    target_variable: list(str)2150    """2151    2152    import re2153    import json2154    if type(variables_setup_dict)==dict:2155        pass2156    else:2157        try:2158            variables_setup_dict = json.loads(variables_setup_dict) 2159        except:2160            print('ERROR in creating variables:{}\n {}'.format(variables_setup_dict, traceback.format_exc()))  2161            2162    # Setting = {'model', 'score'}     2163    setting = variables_setup_dict['setting']2164    2165    # verify if variables exists2166    category_variables = variables_setup_dict['variables']['category_variables']2167    binary_variables = variables_setup_dict['variables']['binary_variables']  2168    target_variable = variables_setup_dict['variables']['target_variable'] 2169    2170    #Create variables sets2171    category_variables =  set(category_variables) & set(DataFrame.columns)2172    binary_variables  = set(binary_variables) & set(DataFrame.columns)2173    2174    # Create placeholder for variable creation scripts2175    script_dict = []2176    2177    # Check if target variable exists (fill the column with None in scoring)2178    if not target_variable in DataFrame.columns:2179        DataFrame[target_variable]=None    2180    2181    # Run variable creation task list2182    for preprocess_task in variables_setup_dict['preprocess_tasks']:2183        task_type = preprocess_task['type'] #re.sub('[\W\d]', '', task_type)         2184        if task_type in ['target', 'transform', 'condition', 'category', 'entity', 'category_merge', 'pair_equality', 'str_transform', 2185                 'str_comparison', 'operation_mult', 'comparison', 'seq_order']:2186            #print(task_type)2187            2188            DataFrame, variable_, variable_class_, include_, script_dict_ = create_variable_task(DataFrame, create_variable_task_dict=preprocess_task, return_extra=True, return_script=True)                    2189   2190            if include_:2191                script_dict_['include'] = True2192                script_dict.append(script_dict_)2193                if variable_class_=='bin':2194                    binary_variables.add(variable_)2195                elif variable_class_=='cat':2196                    category_variables.add(variable_)2197    #Finalize variables lists2198    category_variables=list(category_variables)2199    binary_variables=list(binary_variables)2200    target_variable = target_variable2201    2202    if return_script:2203        return DataFrame, category_variables, binary_variables, target_variable, script_dict2204    else:2205        return DataFrame, category_variables, binary_variables, target_variable2206###############################################################################2207# Generate Script2208###############################################################################2209def generate_variables_script(source, destination):    2210    if type(source)==list:2211        if len(source)==2:2212            variables = {2213                'source1': source[0],2214                'source2': source[1],2215                'destination': destination2216            }            2217        elif len(source)==4:2218            variables = {2219                'source1a': source[0],2220                'source2a': source[1],2221                'source1b': source[2],2222                'source2b': source[3],2223                'destination': destination2224            }2225    else:2226        variables = {2227            'source': source,2228            'destination': destination2229        }2230    return variables2231    2232def generate_create_variable_task_script(type='', out_type='', include=False, operation='', source=None, destination=None, parameters={}):2233    variable_task_script = {2234        'type': type,2235        'out_type':out_type,2236        'include': include,2237        'operation': operation,2238        'variables': generate_variables_script(source, destination),2239        'parameters': parameters2240    }2241    return variable_task_script2242###############################################################################2243# EZ User Functions2244###############################################################################2245def create_category_ez(DataFrame, variable, labels_str, default='OTHER', null='NA', to_variable=None, target_variable=None, show_plot=True):2246    rule_set = {   2247        'operation':'bucket',2248        'variables': {2249            'source':variable, 2250            'destination':to_variable2251        },2252        'parameters': {2253            'labels_str': labels_str,2254            'right_inclusive':True,2255            'default':default,2256            'null':null2257        }2258    }2259    DataFrame, category_variable = mltk.create_categorical_variable_task(DataFrame, rule_set, return_variable=True)2260    print(variable_response(DataFrame=DataFrame, variable=category_variable, target_variable=target_variable, show_plot=show_plot))2261    return DataFrame, category_variable2262def create_binary_ez(DataFrame, condition_str, default=0, null=0, to_variable=None, target_variable=None, show_plot=True):2263    rule_set = {2264        'operation':'condition',  2265        'variables': {2266            'source': None, 2267            'destination':to_variable2268        },2269        'parameters': {2270            'condition_str':condition_str,2271            'default':default,2272            'null':null,2273        }2274    } 2275    2276    DataFrame, binary_variable = create_binary_variable_task(DataFrame, rule_set, return_variables=True)    2277    print(variable_response(DataFrame=DataFrame, variable=binary_variable, target_variable=target_variable, show_plot=show_plot))2278    return DataFrame, binary_variable  2279def create_entity_ez(DataFrame, variable, dictionary, default='OTHER', null='NA', to_variable=None, target_variable=None, show_plot=True):2280    rule_set = {2281        'operation':'dictionary',  2282        'variables': {2283            'source': variable, 2284            'destination':to_variable2285        },2286        'parameters': {2287            'match_type': None,2288            'dictionary':dictionary,2289            'default':default,2290            'null':null,2291        }2292    } 2293    2294    DataFrame, entity_variable = create_entity_variable_task(DataFrame, rule_set, return_variables=True)    2295    print(variable_response(DataFrame=DataFrame, variable=entity_variable, target_variable=target_variable, show_plot=show_plot))2296    return DataFrame, entity_variable  2297def create_entity_ez(DataFrame, variable, dictionary, default='OTHER', null='NA', to_variable=None, target_variable=None, show_plot=True):2298    rule_set = {2299        'operation':'dictionary',  2300        'variables': {2301            'source': variable, 2302            'destination':to_variable2303        },2304        'parameters': {2305            'match_type': None,2306            'dictionary':dictionary,2307            'default':default,2308            'null':null,2309        }2310    } 2311    2312    DataFrame, entity_variable = create_entity_variable_task(DataFrame, rule_set, return_variables=True)    2313    print(variable_response(DataFrame=DataFrame, variable=entity_variable, target_variable=target_variable, show_plot=show_plot))...put_in_function.py
Source:put_in_function.py  
1#!/usr/bin/env python32# -*- coding: utf-8 -*-3"""4Created on Wed Feb  8 19:55:06 20175@author: lauragustafson, knjohnso, carlosh6"""7def put_in_functions_a():8    FUNCTION_NAME = 'part_a'9    #change FUNCTION_NAME to be the name of the function that you want their code10    #to be wrapped in11    FUNCTION_HEADER = 'def %s(annual_salary, portion_saved, total_cost):' % (FUNCTION_NAME)12    STUDENT_FILE_NAME = 'ps1a.py'13    #change STUDENT_FILE_NAME to be the name of the file that their code will be in14    RETURN_VARIABLE = 'months'15    #change RETURN_VARIABLE to be the name of the variable you want the fucntion16    #to return17    RETURN_STATEMENT = '\treturn %s' % (RETURN_VARIABLE)18    NEW_FILE_NAME = 'ps1a_in_function.py'19    #change NEW_FILE_NAME to be the name of the output file20    START_LINE = "## Initialize other variables you need (if any) for your program below ##" 21    #The start line to grab for the student's function, everything else below this should be fine to copy and not include any input statements.22    new_lines = []23    lines = [line.rstrip('\n') for line in open('ps1a.py')]24    # look for the start line and find its index in lines 25    START_INDEX = [line.startswith(START_LINE) for line in lines].index(True)26    lines = lines[START_INDEX+1:]27 28    new_lines.append(FUNCTION_HEADER)29    for line in lines:30        new_lines.append('\t'+line)31    new_lines.append(RETURN_STATEMENT)32    with open(NEW_FILE_NAME, 'w') as new_file:33        new_file.write('\n'.join(new_lines))34def put_in_functions_b():35    FUNCTION_NAME = 'part_b'36    #change FUNCTION_NAME to be the name of the function that you want their code37    #to be wrapped in38    FUNCTION_HEADER = 'def %s(annual_salary, portion_saved, total_cost, semi_annual_raise):' % (FUNCTION_NAME)39    STUDENT_FILE_NAME = 'ps1b.py'40    #change STUDENT_FILE_NAME to be the name of the file that their code will be in41    RETURN_VARIABLE = 'months'42    #change RETURN_VARIABLE to be the name of the variable you want the fucntion43    #to return44    RETURN_STATEMENT = '\treturn %s' % (RETURN_VARIABLE)45    NEW_FILE_NAME = 'ps1b_in_function.py'46    #change NEW_FILE_NAME to be the name of the output file47    START_LINE = "## Initialize other variables you need (if any) for your program below ##" 48    #The start line to grab for the student's function, everything else below this should be fine to copy andd not include any input statements.49    new_lines = []50    lines = [line.rstrip('\n') for line in open('ps1b.py')]51    # look for the start line and find its index in lines52    START_INDEX = [line.startswith(START_LINE) for line in lines].index(True)53    lines = lines[START_INDEX+1:]54    55    new_lines.append(FUNCTION_HEADER)56    for line in lines:57        new_lines.append('\t'+line)58    new_lines.append(RETURN_STATEMENT)59    with open(NEW_FILE_NAME, 'w') as new_file:60        new_file.write('\n'.join(new_lines))61def put_in_functions_c():62    FUNCTION_NAME = 'part_c'63    #change FUNCTION_NAME to be the name of the function that you want their code64    #to be wrapped in65    FUNCTION_HEADER = 'def %s(initial_deposit):' % (FUNCTION_NAME)66    STUDENT_FILE_NAME = 'ps1c.py'67    #change STUDENT_FILE_NAME to be the name of the file that their code will be in68    RETURN_VARIABLE = 'r, steps'69    #change RETURN_VARIABLE to be the name of the variable you want the fucntion70    #to return71    RETURN_STATEMENT = '\treturn %s' % (RETURN_VARIABLE)72    NEW_FILE_NAME = 'ps1c_in_function.py'73    #change NEW_FILE_NAME to be the name of the output file74    START_LINE = "## Initialize other variables you need (if any) for your program below ##" 75    #The start line to grab for the student's function, everything else below this should be fine to copy andd not include any input statements.76    new_lines = []77    lines = [line.rstrip('\n') for line in open('ps1c.py')]78    # look for the start line and find its index in lines79    START_INDEX = [line.startswith(START_LINE) for line in lines].index(True)80    lines = lines[START_INDEX+1:]81    new_lines.append(FUNCTION_HEADER)82    for line in lines:83        new_lines.append('\t'+line)84    new_lines.append(RETURN_STATEMENT)85    with open(NEW_FILE_NAME, 'w') as new_file:86        new_file.write('\n'.join(new_lines))87put_in_functions_a()88put_in_functions_b()...python-code-runner.py
Source:python-code-runner.py  
1from bottle import run, request, route2@route('/runCode', method='POST')3def runCode():4	name_of_return_variable = 'return_variable'5	function_definition = request.params['functionDefinition']6	function_call = request.params['functionCall']7	outputMap = {}8	error_output = ''9	error_occured = False10	testRunCode = function_definition + '\n' + name_of_return_variable + ' = ' + function_call11	12	try:13		exec(testRunCode, globals(), outputMap)14	except Exception as exception:15		error_output = exception16		error_occured = True17	if error_occured:18		return_variable = error_output19	else:20		return_variable = outputMap[name_of_return_variable]21	return str(return_variable)...Learn to execute automation testing from scratch with LambdaTest Learning Hub. Right from setting up the prerequisites to run your first automation test, to following best practices and diving deeper into advanced test scenarios. LambdaTest Learning Hubs compile a list of step-by-step guides to help you be proficient with different test automation frameworks i.e. Selenium, Cypress, TestNG etc.
You could also refer to video tutorials over LambdaTest YouTube channel to get step by step demonstration from industry experts.
Get 100 minutes of automation test minutes FREE!!
