PD Model - Data Preparation (py).py
#!/usr/bin/env python
# coding: utf-8

# ### GOAL
# To build a statistical model for estimating EL (Expected Loss)
#
# EL = PD * EAD * LGD

# In[2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# ### Import Data
# In[3]:
loan_data_backup = pd.read_csv("loan_data_2007_2014.csv")

# In[6]:
loan_data = loan_data_backup.copy()

# ### Data Exploration
# In[7]:
loan_data.head(10)

# In[8]:
pd.options.display.max_columns = 10

# #### Displaying all columns in the data
# In[10]:
loan_data.columns.values

# #### Column Data Types
# In[11]:
loan_data.info()

# ### Preprocessing for Continuous Variables
# In[9]:
# Converting the emp_length and term columns to numeric values
# In[12]:
loan_data.emp_length.unique()

# In[13]:
loan_data['emp_length_int'] = loan_data['emp_length'].str.replace(r'\+ years', '')
loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace(' years', '')
loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace(' year', '')
loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace('< 1', str(0))
loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace('n/a', str(0))

# In[14]:
# To numeric
loan_data['emp_length_int'] = pd.to_numeric(loan_data['emp_length_int'])

# In[15]:
type(loan_data['emp_length_int'][0])

# In[16]:
loan_data.term.unique()

# In[17]:
loan_data['term_int'] = loan_data['term'].str.replace(' 36 months', str(36))
loan_data['term_int'] = loan_data['term_int'].str.replace(' 60 months', str(60))

# In[18]:
loan_data['term_int'] = pd.to_numeric(loan_data['term_int'])

# In[19]:
type(loan_data['term_int'][0])

# In[20]:
loan_data.head(10)

# In[21]:
loan_data['earliest_cr_line'].head(12)

# In[28]:
# Convert earliest_cr_line from object (text string) to a datetime column
loan_data['earliest_cr_line_date'] = pd.to_datetime(loan_data['earliest_cr_line'], format='%b-%y')

# In[29]:
type(loan_data['earliest_cr_line_date'][0])

# In[30]:
# Checking how many days before the reference date (2017-12-01) the earliest credit line was opened
df = pd.to_datetime('2017-12-01') - loan_data['earliest_cr_line_date']

# In[31]:
# Converting days into months (using timedelta)
loan_data['mths_since_earliest_cr_line'] = round(pd.to_numeric(df / np.timedelta64(1, 'M')))

# In[32]:
loan_data['mths_since_earliest_cr_line'].describe()

# In[33]:
loan_data.loc[:, ['earliest_cr_line', 'earliest_cr_line_date', 'mths_since_earliest_cr_line']][loan_data['mths_since_earliest_cr_line'] < 0]

# In[34]:
# The difference between the reference date and a past date should never be negative.
# It is negative here because, when parsing the two-digit years, pd.to_datetime placed many dates in 2060 instead of 1960.
# In[35]:
# Correcting the century row by row in earliest_cr_line_date is not straightforward (the problem comes from the
# parser's two-digit-year pivot), so instead we impute directly.
# In[36]:
loan_data.loc[loan_data['mths_since_earliest_cr_line'] < 0, 'mths_since_earliest_cr_line'] = loan_data['mths_since_earliest_cr_line'].max()
# (Using .loc avoids the chained-assignment warning of loan_data['col'][mask] = value.)

# In[37]:
min(loan_data['mths_since_earliest_cr_line'])

# In[38]:
# Homework: do the same as above for the 'term' and 'issue_d' variables
# In[39]:
loan_data['term_int'].describe()

# In[40]:
loan_data['issue_d']

# In[41]:
loan_data['issue_d_dateTime'] = pd.to_datetime(loan_data['issue_d'], format='%b-%y')
type(loan_data['issue_d_dateTime'][0])
df1 = pd.to_datetime('2017-12-01') - loan_data['issue_d_dateTime']
loan_data['mths_since_issue'] = round(pd.to_numeric(df1 / np.timedelta64(1, 'M')))
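# (Added sketch, not part of the original notebook.) Instead of imputing the negative month counts with the maximum,
# one could also repair the mis-parsed century directly. This is a minimal sketch under the assumption that any date
# parsed as later than the reference date really belongs to the 1900s; the helper name is illustrative only.
def fix_century(dates, reference=pd.Timestamp('2017-12-01')):
    # shift dates that landed in the "future" back by 100 years, leave the rest unchanged
    return dates.where(dates <= reference, dates - pd.DateOffset(years=100))

# Example usage (commented out so the notebook's own imputation above stays in effect):
# loan_data['earliest_cr_line_date'] = fix_century(loan_data['earliest_cr_line_date'])
# loan_data['issue_d_dateTime'] = fix_century(loan_data['issue_d_dateTime'])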
# In[42]:
loan_data['mths_since_issue'].describe()

# In[43]:
loan_data.loc[:, ['issue_d', 'issue_d_dateTime', 'mths_since_issue']]

# ### Discrete / categorical preprocessing
# In[44]:
loan_data.head(5)

# ### Create dummy variables for discrete variables
# ### Create a new data frame for the dummy variables, then concat it to loan_data
# In[46]:
pd.get_dummies(loan_data['grade'], prefix='grade', prefix_sep=':')

# In[47]:
loan_data.columns

# In[48]:
loan_data_dummies = [pd.get_dummies(loan_data['grade'], prefix='grade', prefix_sep=':'),
                     pd.get_dummies(loan_data['sub_grade'], prefix='sub_grade', prefix_sep=':'),
                     pd.get_dummies(loan_data['home_ownership'], prefix='home_ownership', prefix_sep=':'),
                     pd.get_dummies(loan_data['verification_status'], prefix='verification_status', prefix_sep=':'),
                     pd.get_dummies(loan_data['loan_status'], prefix='loan_status', prefix_sep=':'),
                     pd.get_dummies(loan_data['purpose'], prefix='purpose', prefix_sep=':'),
                     pd.get_dummies(loan_data['addr_state'], prefix='addr_state', prefix_sep=':'),
                     pd.get_dummies(loan_data['initial_list_status'], prefix='initial_list_status', prefix_sep=':')]

# In[49]:
type(loan_data_dummies)

# In[50]:
loan_data_dummies = pd.concat(loan_data_dummies, axis=1)

# In[51]:
type(loan_data_dummies)

# In[52]:
loan_data_dummies.head(10)

# In[53]:
loan_data.head(10)

# In[54]:
loan_data = pd.concat([loan_data, loan_data_dummies], axis=1)

# In[55]:
loan_data.columns.values

# ### Dealing with missing values
# In[56]:
loan_data.isna().sum()

# In[57]:
# pd.options.display.max_rows = 100
loan_data['total_rev_hi_lim'].fillna(loan_data['funded_amnt'], inplace=True)

# In[58]:
loan_data['total_rev_hi_lim'].isna().sum()

# In[59]:
loan_data['annual_inc'].fillna(loan_data['annual_inc'].mean(), inplace=True)

# In[60]:
loan_data['annual_inc'].isna().sum()

# In[61]:
loan_data['mths_since_earliest_cr_line'].fillna(0, inplace=True)

# In[62]:
loan_data['mths_since_earliest_cr_line'].isna().sum()

# In[63]:
loan_data['acc_now_delinq'].fillna(0, inplace=True)
loan_data['total_acc'].fillna(0, inplace=True)
loan_data['pub_rec'].fillna(0, inplace=True)
loan_data['open_acc'].fillna(0, inplace=True)
loan_data['inq_last_6mths'].fillna(0, inplace=True)
loan_data['delinq_2yrs'].fillna(0, inplace=True)
loan_data['emp_length_int'].fillna(0, inplace=True)

# In[64]:
# pd.options.display.max_rows = None
loan_data.isna().sum()

# In[65]:
loan_data.head(10)

# # PD Model
# ### Data Preparation
# ##### Dependent Variables
# In[66]:
loan_data['loan_status'].unique()

# In[67]:
loan_data['loan_status'].value_counts()

# In[68]:
loan_data['good/bad'] = np.where(loan_data['loan_status'].isin(['Charged Off', 'Default', 'Late (31-120 days)',
                                                                'Late (16-30 days)',
                                                                'Does not meet the credit policy. Status:Charged Off']), 0, 1)

# In[69]:
loan_data['good/bad'].head()

# ### Independent Variables
# In the end, the scorecard should indicate whether an applicant should get a loan or not, i.e. 1 or 0.
# Discrete independent variables such as home ownership, age, etc. can be converted directly into dummy variables.
# Categorizing continuous variables is harder: first, fine classing is done, which is an initial binning of the data into
# roughly 20 to 50 fine, granular bins.
# Coarse classing is where a binning process is applied to the fine granular bins to merge those with similar risk and
# create fewer bins, usually up to ten. The purpose is to achieve simplicity by creating fewer bins, each with a distinctively
# different risk profile, while minimizing information loss. However, to create a robust model that is resilient to
# overfitting, each bin should contain a sufficient share of the total observations (5% is the minimum recommended by most
# practitioners).
#
# Starting from the initial fine classing, coarse classing is done based on the weight of evidence.

# ###### Splitting the data
# In[70]:
from sklearn.model_selection import train_test_split

# In[71]:
train_test_split(loan_data.drop('good/bad', axis=1), loan_data['good/bad'])

# In[72]:
loan_data_inputs_train, loan_data_inputs_test, loan_data_outputs_train, loan_data_outputs_test = train_test_split(loan_data.drop('good/bad', axis=1), loan_data['good/bad'], test_size=0.2, random_state=42)

# In[73]:
loan_data_inputs_train.shape

# In[74]:
loan_data_inputs_test.shape

# In[75]:
loan_data_outputs_train.shape

# In[76]:
loan_data_outputs_test.shape

# In[77]:
df_input_prep = loan_data_inputs_train
df_output_prep = loan_data_outputs_train

# In[78]:
df_input_prep.head(10)

# In[79]:
##### Discrete Data Preprocessing
##### Discrete variables are already categorical, so there is no need to create dummy variables using fine and coarse classing.
##### We only calculate WOE and Information Value to assess whether the variable helps predict the dependent variable.

# In[80]:
df_input_prep['grade'].unique()

# In[81]:
df1 = pd.concat([df_input_prep['grade'], df_output_prep], axis=1)

# In[82]:
df1.head(10)

# In[83]:
df1.tail(10)

# ## Weight of evidence of the discrete variable Grade
#
# In[84]:
df1.groupby(df1.columns.values[0], as_index=False)[df1.columns.values[1]].count()

# In[85]:
df1.groupby(df1.columns.values[0], as_index=False)[df1.columns.values[1]].mean()

# In[86]:
df1 = pd.concat([df1.groupby(df1.columns.values[0], as_index=False)[df1.columns.values[1]].count(),
                 df1.groupby(df1.columns.values[0], as_index=False)[df1.columns.values[1]].mean()], axis=1)

# In[87]:
df1.head(10)

# In[88]:
df1 = df1.iloc[:, [0, 1, 3]]

# In[89]:
df1.head(5)

# In[90]:
df1.columns = [df1.columns.values[0], 'n_obs', 'prop_good']

# In[91]:
df1.head(5)

# In[92]:
df1['no_good'] = df1['prop_good'] * df1['n_obs']
df1['no_bad'] = (1 - df1['prop_good']) * df1['n_obs']

# In[93]:
df1.head(5)

# In[94]:
df1['Final_good'] = df1['no_good'] / df1['no_good'].sum()
df1['Final_bad'] = df1['no_bad'] / df1['no_bad'].sum()

# In[95]:
df1.head(5)

# In[96]:
df1['WOE'] = np.log(df1['Final_good'] / df1['Final_bad'])

# In[97]:
df1.head(5)

# In[98]:
df1 = df1.sort_values(['WOE'])

# In[99]:
df1

# In[100]:
df1.reset_index(drop=True)

# In[101]:
df1['IV'] = (df1['Final_good'] - df1['Final_bad']) * df1['WOE']

# In[102]:
df1['IV'] = df1['IV'].sum()

# In[103]:
df1

# In[104]:
### The Information Value of grade is about 0.29, which falls in the 0.1-0.3 bracket.
### That means medium predictive power for the output variable.
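# (Added illustration, not part of the original notebook.) A toy example of the WOE / IV arithmetic used above,
# with made-up counts for two categories, to show how the formulas behave:
#   WOE_i = ln( (good_i / total_good) / (bad_i / total_bad) )
#   IV    = sum_i ( good_i / total_good - bad_i / total_bad ) * WOE_i
# Common rule-of-thumb IV brackets: < 0.02 useless, 0.02-0.1 weak, 0.1-0.3 medium, 0.3-0.5 strong.
toy = pd.DataFrame({'n_good': [900, 400], 'n_bad': [100, 200]}, index=['cat_A', 'cat_B'])
toy['prop_good'] = toy['n_good'] / toy['n_good'].sum()
toy['prop_bad'] = toy['n_bad'] / toy['n_bad'].sum()
toy['WOE'] = np.log(toy['prop_good'] / toy['prop_bad'])
toy_IV = ((toy['prop_good'] - toy['prop_bad']) * toy['WOE']).sum()
toy, toy_IV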
# #### Function to calculate WOE
# In[105]:
def woe_discrete(df, independent_variable, dependent_variable):
    df = pd.concat([df[independent_variable], dependent_variable], axis=1)
    df = pd.concat([df.groupby(df.columns.values[0], as_index=False)[df.columns.values[1]].count(),
                    df.groupby(df.columns.values[0], as_index=False)[df.columns.values[1]].mean()], axis=1)
    df = df.iloc[:, [0, 1, 3]]
    df.columns = [df.columns.values[0], 'n_obs', 'prop_good']
    df['no_good'] = df['prop_good'] * df['n_obs']
    df['no_bad'] = (1 - df['prop_good']) * df['n_obs']
    df['Final_good'] = df['no_good'] / df['no_good'].sum()
    df['Final_bad'] = df['no_bad'] / df['no_bad'].sum()
    df['WOE'] = np.log(df['Final_good'] / df['Final_bad'])
    df = df.sort_values(['WOE'])
    df = df.reset_index(drop=True)
    df['IV'] = (df['Final_good'] - df['Final_bad']) * df['WOE']
    df['IV'] = df['IV'].sum()
    return df

# In[106]:
df_temp = woe_discrete(df_input_prep, 'grade', df_output_prep)

# In[107]:
df_temp

# #### Visualizing WOE for discrete variables to interpret it
# In[108]:
sns.set()

# In[109]:
def plot_by_woe(df_woe, rotation_of_x_labels=0):
    x = np.array(df_woe.iloc[:, 0].apply(str))  # matplotlib works better with arrays than with dataframes
    y = df_woe['WOE']
    plt.figure(figsize=(18, 6))
    plt.plot(x, y, marker='o', linestyle='--', color='k')
    plt.xlabel(df_woe.columns[0])
    plt.ylabel('Weight of evidence')
    plt.title(str('Weight of evidence by ' + df_woe.columns[0]))
    plt.xticks(rotation=rotation_of_x_labels)

# In[110]:
plot_by_woe(df_temp)

# In[111]:
### Keeping the dummy variable G (grade) as the reference category
### All the others go into the regression model

# ##### Home Ownership Variable
# In[112]:
df_input_prep.head()

# In[113]:
df_home_owner = woe_discrete(df_input_prep, 'home_ownership', df_output_prep)

# In[114]:
df_home_owner.head()

# In[115]:
df_home_owner.tail()

# In[116]:
plot_by_woe(df_home_owner)

# In[117]:
df_home_owner

# In the 2nd column (n_obs) it is clearly visible that OTHER, NONE and ANY have very few observations in the dataset, so
# their WOE is of little use for predicting loan default. Still, it is not good to delete those categories, as they are
# among the riskiest values; it is better to combine them to retain a reasonable amount of information.
#
# For RENT the WOE is also very low, so we can combine it with OTHER, NONE and ANY.
# In[119]:
df_input_prep['home_ownership:RENT_OTHER_NONE_ANY'] = sum([df_input_prep['home_ownership:OTHER'], df_input_prep['home_ownership:RENT'],
                                                            df_input_prep['home_ownership:NONE'], df_input_prep['home_ownership:ANY']])

# From the set of categorical variables that represent one original independent variable, we make the category with the
# lowest WOE value the reference category.

# #### address state discrete variable
# In[120]:
df_input_prep['addr_state'].unique()

# In[121]:
df_addr_state = woe_discrete(df_input_prep, 'addr_state', df_output_prep)

# In[122]:
df_addr_state.head()

# In[123]:
plot_by_woe(df_addr_state)

# In[124]:
if 'addr_state:ND' in df_input_prep.columns.values:
    pass
else:
    df_input_prep['addr_state:ND'] = 0

# In[125]:
plot_by_woe(df_addr_state.iloc[2:-2, :])

# Earlier the first two and last two states made it look as if all states from NV to DC were roughly similar, but ideally
# that is not the case.
# Combine NE, IA, NV, FL, AL, HI based on WOE and number of observations; all of these contain the worst borrowers, and their WOE is the lowest.
# Being conservative, add ND (North Dakota, which was not in the list earlier) to this category as well.
# The last ones, WV, NH, WY, DC and ME, ID, have good borrowers - combine them.
# In[127]:
plot_by_woe(df_addr_state.iloc[6:-6, :])

# VA, NM, NY, TN, MO, LA, OK, NC, MD, CA have similar WOE.
# However, NY and CA have many borrowers, so each of them will be a separate dummy variable.
# The final categories from VA to CA will be:
# 1. VA, NM
# 2. NY
# 3. TN, MO, LA, OK, NC, MD
# 4. CA
# In[128]:
### Then UT, NJ, AZ, KY
# ####
# In[129]:
# We create the following categories:
# 'ND' 'NE' 'IA' 'NV' 'FL' 'HI' 'AL'
# 'NM' 'VA'
# 'NY'
# 'OK' 'TN' 'MO' 'LA' 'MD' 'NC'
# 'CA'
# 'UT' 'KY' 'AZ' 'NJ'
# 'AR' 'MI' 'PA' 'OH' 'MN'
# 'RI' 'MA' 'DE' 'SD' 'IN'
# 'GA' 'WA' 'OR'
# 'WI' 'MT'
# 'TX'
# 'IL' 'CT'
# 'KS' 'SC' 'CO' 'VT' 'AK' 'MS'
# 'WV' 'NH' 'WY' 'DC' 'ME' 'ID'
# 'addr_state:ND_NE_IA_NV_FL_HI_AL' will be the reference category.
df_inputs_prepr = df_input_prep.copy()
df_inputs_prepr['addr_state:ND_NE_IA_NV_FL_HI_AL'] = sum([df_inputs_prepr['addr_state:ND'], df_inputs_prepr['addr_state:NE'],
                                                          df_inputs_prepr['addr_state:IA'], df_inputs_prepr['addr_state:NV'],
                                                          df_inputs_prepr['addr_state:FL'], df_inputs_prepr['addr_state:HI'],
                                                          df_inputs_prepr['addr_state:AL']])
df_inputs_prepr['addr_state:NM_VA'] = sum([df_inputs_prepr['addr_state:NM'], df_inputs_prepr['addr_state:VA']])
df_inputs_prepr['addr_state:OK_TN_MO_LA_MD_NC'] = sum([df_inputs_prepr['addr_state:OK'], df_inputs_prepr['addr_state:TN'],
                                                       df_inputs_prepr['addr_state:MO'], df_inputs_prepr['addr_state:LA'],
                                                       df_inputs_prepr['addr_state:MD'], df_inputs_prepr['addr_state:NC']])
df_inputs_prepr['addr_state:UT_KY_AZ_NJ'] = sum([df_inputs_prepr['addr_state:UT'], df_inputs_prepr['addr_state:KY'],
                                                 df_inputs_prepr['addr_state:AZ'], df_inputs_prepr['addr_state:NJ']])
df_inputs_prepr['addr_state:AR_MI_PA_OH_MN'] = sum([df_inputs_prepr['addr_state:AR'], df_inputs_prepr['addr_state:MI'],
                                                    df_inputs_prepr['addr_state:PA'], df_inputs_prepr['addr_state:OH'],
                                                    df_inputs_prepr['addr_state:MN']])
df_inputs_prepr['addr_state:RI_MA_DE_SD_IN'] = sum([df_inputs_prepr['addr_state:RI'], df_inputs_prepr['addr_state:MA'],
                                                    df_inputs_prepr['addr_state:DE'], df_inputs_prepr['addr_state:SD'],
                                                    df_inputs_prepr['addr_state:IN']])
df_inputs_prepr['addr_state:GA_WA_OR'] = sum([df_inputs_prepr['addr_state:GA'], df_inputs_prepr['addr_state:WA'],
                                              df_inputs_prepr['addr_state:OR']])
df_inputs_prepr['addr_state:WI_MT'] = sum([df_inputs_prepr['addr_state:WI'], df_inputs_prepr['addr_state:MT']])
df_inputs_prepr['addr_state:IL_CT'] = sum([df_inputs_prepr['addr_state:IL'], df_inputs_prepr['addr_state:CT']])
df_inputs_prepr['addr_state:KS_SC_CO_VT_AK_MS'] = sum([df_inputs_prepr['addr_state:KS'], df_inputs_prepr['addr_state:SC'],
                                                       df_inputs_prepr['addr_state:CO'], df_inputs_prepr['addr_state:VT'],
                                                       df_inputs_prepr['addr_state:AK'], df_inputs_prepr['addr_state:MS']])
df_inputs_prepr['addr_state:WV_NH_WY_DC_ME_ID'] = sum([df_inputs_prepr['addr_state:WV'], df_inputs_prepr['addr_state:NH'],
                                                       df_inputs_prepr['addr_state:WY'], df_inputs_prepr['addr_state:DC'],
                                                       df_inputs_prepr['addr_state:ME'], df_inputs_prepr['addr_state:ID']])

# In[130]:
df_inputs_prepr.head()

# #### verification status discrete variable
# In[131]:
df_inputs_prepr['verification_status'].unique()

# In[132]:
df_verification_status = woe_discrete(df_input_prep, 'verification_status', df_output_prep)

# In[133]:
df_verification_status.head()

# In[134]:
plot_by_woe(df_verification_status)

# #### purpose discrete variable
# In[135]:
df_inputs_prepr['purpose'].unique()

# In[136]:
df_purpose = woe_discrete(df_input_prep, 'purpose', df_output_prep)

# In[137]:
df_purpose.head()

# In[138]:
plot_by_woe(df_purpose)

# In[189]:
# We combine 'educational', 'small_business', 'wedding', 'renewable_energy', 'moving', 'house' in one category: 'educ__sm_b__wedd__ren_en__mov__house'.
# We combine 'other', 'medical', 'vacation' in one category: 'oth__med__vacation'.
# We combine 'major_purchase', 'car', 'home_improvement' in one category: 'major_purch__car__home_impr'.
# We leave 'debt_consolidation' in a separate category.
# We leave 'credit_card' in a separate category.
# 'educ__sm_b__wedd__ren_en__mov__house' will be the reference category.
df_inputs_prepr['purpose:educ__sm_b__wedd__ren_en__mov__house'] = sum([df_inputs_prepr['purpose:educational'], df_inputs_prepr['purpose:small_business'],
                                                                       df_inputs_prepr['purpose:wedding'], df_inputs_prepr['purpose:renewable_energy'],
                                                                       df_inputs_prepr['purpose:moving'], df_inputs_prepr['purpose:house']])
df_inputs_prepr['purpose:oth__med__vacation'] = sum([df_inputs_prepr['purpose:other'], df_inputs_prepr['purpose:medical'],
                                                     df_inputs_prepr['purpose:vacation']])
df_inputs_prepr['purpose:major_purch__car__home_impr'] = sum([df_inputs_prepr['purpose:major_purchase'], df_inputs_prepr['purpose:car'],
                                                              df_inputs_prepr['purpose:home_improvement']])

# In[190]:
# 'initial_list_status'
df_initial_list_status = woe_discrete(df_inputs_prepr, 'initial_list_status', df_output_prep)
df_initial_list_status

# In[191]:
plot_by_woe(df_initial_list_status)
# We plot the weight of evidence values.

# ### Preprocessing Continuous Variables: Automating Calculations and Visualizing Results
# When we calculate and plot the weights of evidence of the categories of a continuous variable, we sort them by the
# values of the variable itself, in ascending order, rather than by WoE.
# In[ ]:
# In[193]:
# WoE function for ordered discrete and continuous variables
def woe_ordered_continuous(df, discrete_variable_name, good_bad_variable_df):
    df = pd.concat([df[discrete_variable_name], good_bad_variable_df], axis=1)
    df = pd.concat([df.groupby(df.columns.values[0], as_index=False)[df.columns.values[1]].count(),
                    df.groupby(df.columns.values[0], as_index=False)[df.columns.values[1]].mean()], axis=1)
    df = df.iloc[:, [0, 1, 3]]
    df.columns = [df.columns.values[0], 'n_obs', 'prop_good']
    df['prop_n_obs'] = df['n_obs'] / df['n_obs'].sum()
    df['n_good'] = df['prop_good'] * df['n_obs']
    df['n_bad'] = (1 - df['prop_good']) * df['n_obs']
    df['prop_n_good'] = df['n_good'] / df['n_good'].sum()
    df['prop_n_bad'] = df['n_bad'] / df['n_bad'].sum()
    df['WoE'] = np.log(df['prop_n_good'] / df['prop_n_bad'])
    #df = df.sort_values(['WoE'])
    #df = df.reset_index(drop = True)
    df['diff_prop_good'] = df['prop_good'].diff().abs()
    df['diff_WoE'] = df['WoE'].diff().abs()
    df['IV'] = (df['prop_n_good'] - df['prop_n_bad']) * df['WoE']
    df['IV'] = df['IV'].sum()
    return df

# Here we define a function similar to the one above, ...
# ... with one slight difference: we order the results by the values of a different column.
# The function takes 3 arguments: a dataframe, a string, and a dataframe. The function returns a dataframe as a result.

# In[206]:
def plot_by_woe(df_woe, rotation_of_x_labels=0):
    x = np.array(df_woe.iloc[:, 0].apply(str))  # matplotlib works better with arrays than with dataframes
    y = df_woe['WoE']
    plt.figure(figsize=(18, 6))
    plt.plot(x, y, marker='o', linestyle='--', color='k')
    plt.xlabel(df_woe.columns[0])
    plt.ylabel('Weight of evidence')
    plt.title(str('Weight of evidence by ' + df_woe.columns[0]))
    plt.xticks(rotation=rotation_of_x_labels)

# ### Preprocessing Continuous Variables: Creating Dummy Variables, Part 1
# In[207]:
# term
df_inputs_prepr['term_int'].unique()
# There are only two unique values, 36 and 60.

# In[208]:
df_term_int = woe_ordered_continuous(df_inputs_prepr, 'term_int', df_output_prep)
# We calculate weight of evidence.
df_term_int

# In[209]:
plot_by_woe(df_term_int)
# We plot the weight of evidence values.

# #### emp_length_int
# In[211]:
# Leave as is.
# '60' will be the reference category.
df_inputs_prepr['term:36'] = np.where((df_inputs_prepr['term_int'] == 36), 1, 0)
df_inputs_prepr['term:60'] = np.where((df_inputs_prepr['term_int'] == 60), 1, 0)

# In[212]:
# emp_length_int
df_inputs_prepr['emp_length_int'].unique()
# Has only 11 levels: from 0 to 10. Hence, we turn it into a factor with 11 levels.
# In[213]:
df_temp = woe_ordered_continuous(df_inputs_prepr, 'emp_length_int', df_output_prep)
# We calculate weight of evidence.
df_temp

# In[214]:
plot_by_woe(df_temp)

# In[215]:
# We create the following categories: '0', '1', '2 - 4', '5 - 6', '7 - 9', '10'
# '0' will be the reference category
df_inputs_prepr['emp_length:0'] = np.where(df_inputs_prepr['emp_length_int'].isin([0]), 1, 0)
df_inputs_prepr['emp_length:1'] = np.where(df_inputs_prepr['emp_length_int'].isin([1]), 1, 0)
df_inputs_prepr['emp_length:2-4'] = np.where(df_inputs_prepr['emp_length_int'].isin(range(2, 5)), 1, 0)
df_inputs_prepr['emp_length:5-6'] = np.where(df_inputs_prepr['emp_length_int'].isin(range(5, 7)), 1, 0)
df_inputs_prepr['emp_length:7-9'] = np.where(df_inputs_prepr['emp_length_int'].isin(range(7, 10)), 1, 0)
df_inputs_prepr['emp_length:10'] = np.where(df_inputs_prepr['emp_length_int'].isin([10]), 1, 0)

# #### months since issue
# In[217]:
df_inputs_prepr.head(5)

# In[218]:
df_inputs_prepr.mths_since_issue.unique()
# Fine classing of a continuous or high-cardinality ordered variable
# In[220]:
# We store the fine-classed bins in a separate factor column and keep the original integer column,
# so that the .isin(range(...)) dummies further below still work on the integer values.
df_inputs_prepr['mths_since_issue_factor'] = pd.cut(df_inputs_prepr['mths_since_issue'], 50)
# Here we do fine-classing: using the 'cut' method, we split the variable into 50 categories by its values.

# In[221]:
df_inputs_prepr.mths_since_issue_factor.unique()

# In[222]:
df_inputs_prepr['mths_since_issue_factor']

# In[223]:
# mths_since_issue_d
df_mnths_since_issue = woe_ordered_continuous(df_inputs_prepr, 'mths_since_issue_factor', df_output_prep)
# We calculate weight of evidence.
df_mnths_since_issue.head(10)

# In[224]:
plot_by_woe(df_mnths_since_issue)

# In[225]:
plot_by_woe(df_mnths_since_issue, rotation_of_x_labels=90)

# In[226]:
plot_by_woe(df_mnths_since_issue.iloc[3:, :], 90)
# We plot the weight of evidence values.

# In[227]:
# We create the following categories:
# < 38, 38 - 39, 40 - 41, 42 - 48, 49 - 52, 53 - 64, 65 - 84, > 84.
df_inputs_prepr['mths_since_issue_d:<38'] = np.where(df_inputs_prepr['mths_since_issue'].isin(range(38)), 1, 0)
df_inputs_prepr['mths_since_issue_d:38-39'] = np.where(df_inputs_prepr['mths_since_issue'].isin(range(38, 40)), 1, 0)
df_inputs_prepr['mths_since_issue_d:40-41'] = np.where(df_inputs_prepr['mths_since_issue'].isin(range(40, 42)), 1, 0)
df_inputs_prepr['mths_since_issue_d:42-48'] = np.where(df_inputs_prepr['mths_since_issue'].isin(range(42, 49)), 1, 0)
df_inputs_prepr['mths_since_issue_d:49-52'] = np.where(df_inputs_prepr['mths_since_issue'].isin(range(49, 53)), 1, 0)
df_inputs_prepr['mths_since_issue_d:53-64'] = np.where(df_inputs_prepr['mths_since_issue'].isin(range(53, 65)), 1, 0)
df_inputs_prepr['mths_since_issue_d:65-84'] = np.where(df_inputs_prepr['mths_since_issue'].isin(range(65, 85)), 1, 0)
df_inputs_prepr['mths_since_issue_d:>84'] = np.where(df_inputs_prepr['mths_since_issue'].isin(range(85, 127)), 1, 0)

# ### Fine classing
# In[229]:
# int_rate
df_inputs_prepr['int_rate_factor'] = pd.cut(df_inputs_prepr['int_rate'], 50)
# Here we do fine-classing: using the 'cut' method, we split the variable into 50 categories by its values.

# In[230]:
df_inputs_prepr['int_rate_factor'].unique()

# In[231]:
df_inputs_prepr['int_rate_factor']

# In[232]:
df_temp = woe_ordered_continuous(df_inputs_prepr, 'int_rate_factor', df_output_prep)
# We calculate weight of evidence.
df_temp.head(10)
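# (Added sketch, not part of the original notebook.) The diff_prop_good / diff_WoE columns produced by
# woe_ordered_continuous can support coarse classing: adjacent fine bins whose WoE barely changes carry similar risk
# and are candidates for merging. The 0.1 threshold below is an arbitrary illustrative assumption.
merge_candidates = df_temp[df_temp['diff_WoE'] < 0.1]
merge_candidates[[df_temp.columns[0], 'n_obs', 'prop_n_obs', 'WoE', 'diff_WoE']]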
# In[233]:
plot_by_woe(df_temp, rotation_of_x_labels=90)
# The greater the interest rate, the lower the WOE and the higher the probability of default (riskier).

# In[235]:
# '< 9.548', '9.548 - 12.025', '12.025 - 15.74', '15.74 - 20.281', '> 20.281'
# In[236]:
df_inputs_prepr['int_rate:<9.548'] = np.where((df_inputs_prepr['int_rate'] <= 9.548), 1, 0)
df_inputs_prepr['int_rate:9.548-12.025'] = np.where((df_inputs_prepr['int_rate'] > 9.548) & (df_inputs_prepr['int_rate'] <= 12.025), 1, 0)
df_inputs_prepr['int_rate:12.025-15.74'] = np.where((df_inputs_prepr['int_rate'] > 12.025) & (df_inputs_prepr['int_rate'] <= 15.74), 1, 0)
df_inputs_prepr['int_rate:15.74-20.281'] = np.where((df_inputs_prepr['int_rate'] > 15.74) & (df_inputs_prepr['int_rate'] <= 20.281), 1, 0)
df_inputs_prepr['int_rate:>20.281'] = np.where((df_inputs_prepr['int_rate'] > 20.281), 1, 0)

# In[ ]:
# In[237]:
df_inputs_prepr.head(3)

# In[238]:
df_inputs_prepr['funded_amnt'].unique()

# In[239]:
# funded_amnt
df_inputs_prepr['funded_amnt_factor'] = pd.cut(df_inputs_prepr['funded_amnt'], 50)
# Here we do fine-classing: using the 'cut' method, we split the variable into 50 categories by its values.
df_temp = woe_ordered_continuous(df_inputs_prepr, 'funded_amnt_factor', df_output_prep)
# We calculate weight of evidence.
df_temp.head(5)

# In[240]:
plot_by_woe(df_temp, rotation_of_x_labels=90)

# In[241]:
### No need to include the funded amount in the PD model, as the WOE shows no clear relationship with the loan amount.

# ### Data Preparation: Continuous Variables, Part 1 and 2
# In[242]:
# mths_since_earliest_cr_line
df_inputs_prepr['mths_since_earliest_cr_line_factor'] = pd.cut(df_inputs_prepr['mths_since_earliest_cr_line'], 50)
# Here we do fine-classing: using the 'cut' method, we split the variable into 50 categories by its values.
df_temp = woe_ordered_continuous(df_inputs_prepr, 'mths_since_earliest_cr_line_factor', df_output_prep)
# We calculate weight of evidence.
df_temp.head(5)

# In[243]:
plot_by_woe(df_temp, 90)
# We plot the weight of evidence values.

# In[244]:
plot_by_woe(df_temp.iloc[6:, :], 90)
# We plot the weight of evidence values.

# In[245]:
# We create the following categories:
# < 140, 141 - 164, 165 - 247, 248 - 270, 271 - 352, > 352
df_inputs_prepr['mths_since_earliest_cr_line:<140'] = np.where(df_inputs_prepr['mths_since_earliest_cr_line'].isin(range(140)), 1, 0)
df_inputs_prepr['mths_since_earliest_cr_line:141-164'] = np.where(df_inputs_prepr['mths_since_earliest_cr_line'].isin(range(140, 165)), 1, 0)
df_inputs_prepr['mths_since_earliest_cr_line:165-247'] = np.where(df_inputs_prepr['mths_since_earliest_cr_line'].isin(range(165, 248)), 1, 0)
df_inputs_prepr['mths_since_earliest_cr_line:248-270'] = np.where(df_inputs_prepr['mths_since_earliest_cr_line'].isin(range(248, 271)), 1, 0)
df_inputs_prepr['mths_since_earliest_cr_line:271-352'] = np.where(df_inputs_prepr['mths_since_earliest_cr_line'].isin(range(271, 353)), 1, 0)
df_inputs_prepr['mths_since_earliest_cr_line:>352'] = np.where(df_inputs_prepr['mths_since_earliest_cr_line'].isin(range(353, int(df_inputs_prepr['mths_since_earliest_cr_line'].max()) + 1)), 1, 0)
# (The + 1 ensures the maximum value itself falls into the last category, since range() excludes its end point.)

# In[246]:
# delinq_2yrs
df_temp = woe_ordered_continuous(df_inputs_prepr, 'delinq_2yrs', df_output_prep)
# We calculate weight of evidence.
df_temp.head(5)

# In[247]:
plot_by_woe(df_temp)
# We plot the weight of evidence values.

# In[248]:
# Categories: 0, 1-3, >=4
df_inputs_prepr['delinq_2yrs:0'] = np.where((df_inputs_prepr['delinq_2yrs'] == 0), 1, 0)
df_inputs_prepr['delinq_2yrs:1-3'] = np.where((df_inputs_prepr['delinq_2yrs'] >= 1) & (df_inputs_prepr['delinq_2yrs'] <= 3), 1, 0)
df_inputs_prepr['delinq_2yrs:>=4'] = np.where((df_inputs_prepr['delinq_2yrs'] >= 4), 1, 0)

# In[250]:
# inq_last_6mths
df_temp = woe_ordered_continuous(df_inputs_prepr, 'inq_last_6mths', df_output_prep)
# We calculate weight of evidence.
df_temp.head()

# In[251]:
plot_by_woe(df_temp)
# We plot the weight of evidence values.

# In[252]:
# Categories: 0, 1 - 2, 3 - 6, > 6
df_inputs_prepr['inq_last_6mths:0'] = np.where((df_inputs_prepr['inq_last_6mths'] == 0), 1, 0)
df_inputs_prepr['inq_last_6mths:1-2'] = np.where((df_inputs_prepr['inq_last_6mths'] >= 1) & (df_inputs_prepr['inq_last_6mths'] <= 2), 1, 0)
df_inputs_prepr['inq_last_6mths:3-6'] = np.where((df_inputs_prepr['inq_last_6mths'] >= 3) & (df_inputs_prepr['inq_last_6mths'] <= 6), 1, 0)
df_inputs_prepr['inq_last_6mths:>6'] = np.where((df_inputs_prepr['inq_last_6mths'] > 6), 1, 0)

# In[253]:
# open_acc
df_temp = woe_ordered_continuous(df_inputs_prepr, 'open_acc', df_output_prep)
# We calculate weight of evidence.
df_temp.head()

# In[254]:
plot_by_woe(df_temp, 90)
# We plot the weight of evidence values.

# In[255]:
plot_by_woe(df_temp.iloc[:40, :], 90)
# We plot the weight of evidence values.

# In[256]:
# Categories: '0', '1-3', '4-12', '13-17', '18-22', '23-25', '26-30', '>=31'
df_inputs_prepr['open_acc:0'] = np.where((df_inputs_prepr['open_acc'] == 0), 1, 0)
df_inputs_prepr['open_acc:1-3'] = np.where((df_inputs_prepr['open_acc'] >= 1) & (df_inputs_prepr['open_acc'] <= 3), 1, 0)
df_inputs_prepr['open_acc:4-12'] = np.where((df_inputs_prepr['open_acc'] >= 4) & (df_inputs_prepr['open_acc'] <= 12), 1, 0)
df_inputs_prepr['open_acc:13-17'] = np.where((df_inputs_prepr['open_acc'] >= 13) & (df_inputs_prepr['open_acc'] <= 17), 1, 0)
df_inputs_prepr['open_acc:18-22'] = np.where((df_inputs_prepr['open_acc'] >= 18) & (df_inputs_prepr['open_acc'] <= 22), 1, 0)
df_inputs_prepr['open_acc:23-25'] = np.where((df_inputs_prepr['open_acc'] >= 23) & (df_inputs_prepr['open_acc'] <= 25), 1, 0)
df_inputs_prepr['open_acc:26-30'] = np.where((df_inputs_prepr['open_acc'] >= 26) & (df_inputs_prepr['open_acc'] <= 30), 1, 0)
df_inputs_prepr['open_acc:>=31'] = np.where((df_inputs_prepr['open_acc'] >= 31), 1, 0)

# In[258]:
# pub_rec
df_temp = woe_ordered_continuous(df_inputs_prepr, 'pub_rec', df_output_prep)
# We calculate weight of evidence.
df_temp.head()

# In[259]:
plot_by_woe(df_temp, 90)
# We plot the weight of evidence values.

# In[260]:
# Categories '0-2', '3-4', '>=5'
df_inputs_prepr['pub_rec:0-2'] = np.where((df_inputs_prepr['pub_rec'] >= 0) & (df_inputs_prepr['pub_rec'] <= 2), 1, 0)
df_inputs_prepr['pub_rec:3-4'] = np.where((df_inputs_prepr['pub_rec'] >= 3) & (df_inputs_prepr['pub_rec'] <= 4), 1, 0)
df_inputs_prepr['pub_rec:>=5'] = np.where((df_inputs_prepr['pub_rec'] >= 5), 1, 0)

# In[261]:
# total_acc
df_inputs_prepr['total_acc_factor'] = pd.cut(df_inputs_prepr['total_acc'], 50)
# Here we do fine-classing: using the 'cut' method, we split the variable into 50 categories by its values.
df_temp = woe_ordered_continuous(df_inputs_prepr, 'total_acc_factor', df_output_prep)
# We calculate weight of evidence.
df_temp.head()

# In[262]:
plot_by_woe(df_temp, 90)
# We plot the weight of evidence values.

# In[263]:
# Categories: '<=27', '28-51', '>=52'
df_inputs_prepr['total_acc:<=27'] = np.where((df_inputs_prepr['total_acc'] <= 27), 1, 0)
df_inputs_prepr['total_acc:28-51'] = np.where((df_inputs_prepr['total_acc'] >= 28) & (df_inputs_prepr['total_acc'] <= 51), 1, 0)
df_inputs_prepr['total_acc:>=52'] = np.where((df_inputs_prepr['total_acc'] >= 52), 1, 0)

# In[264]:
# acc_now_delinq
df_temp = woe_ordered_continuous(df_inputs_prepr, 'acc_now_delinq', df_output_prep)
# We calculate weight of evidence.
df_temp.head()

# In[265]:
plot_by_woe(df_temp)
# We plot the weight of evidence values.

# In[266]:
# Categories: '0', '>=1'
df_inputs_prepr['acc_now_delinq:0'] = np.where((df_inputs_prepr['acc_now_delinq'] == 0), 1, 0)
df_inputs_prepr['acc_now_delinq:>=1'] = np.where((df_inputs_prepr['acc_now_delinq'] >= 1), 1, 0)

# In[267]:
# total_rev_hi_lim
df_inputs_prepr['total_rev_hi_lim_factor'] = pd.cut(df_inputs_prepr['total_rev_hi_lim'], 2000)
# Here we do fine-classing: using the 'cut' method, we split the variable into 2000 categories by its values.
df_temp = woe_ordered_continuous(df_inputs_prepr, 'total_rev_hi_lim_factor', df_output_prep)
# We calculate weight of evidence.
df_temp.head()

# In[268]:
plot_by_woe(df_temp.iloc[:50, :], 90)
# We plot the weight of evidence values.

# In[269]:
# Categories
# '<=5K', '5K-10K', '10K-20K', '20K-30K', '30K-40K', '40K-55K', '55K-95K', '>95K'
df_inputs_prepr['total_rev_hi_lim:<=5K'] = np.where((df_inputs_prepr['total_rev_hi_lim'] <= 5000), 1, 0)
df_inputs_prepr['total_rev_hi_lim:5K-10K'] = np.where((df_inputs_prepr['total_rev_hi_lim'] > 5000) & (df_inputs_prepr['total_rev_hi_lim'] <= 10000), 1, 0)
df_inputs_prepr['total_rev_hi_lim:10K-20K'] = np.where((df_inputs_prepr['total_rev_hi_lim'] > 10000) & (df_inputs_prepr['total_rev_hi_lim'] <= 20000), 1, 0)
df_inputs_prepr['total_rev_hi_lim:20K-30K'] = np.where((df_inputs_prepr['total_rev_hi_lim'] > 20000) & (df_inputs_prepr['total_rev_hi_lim'] <= 30000), 1, 0)
df_inputs_prepr['total_rev_hi_lim:30K-40K'] = np.where((df_inputs_prepr['total_rev_hi_lim'] > 30000) & (df_inputs_prepr['total_rev_hi_lim'] <= 40000), 1, 0)
df_inputs_prepr['total_rev_hi_lim:40K-55K'] = np.where((df_inputs_prepr['total_rev_hi_lim'] > 40000) & (df_inputs_prepr['total_rev_hi_lim'] <= 55000), 1, 0)
df_inputs_prepr['total_rev_hi_lim:55K-95K'] = np.where((df_inputs_prepr['total_rev_hi_lim'] > 55000) & (df_inputs_prepr['total_rev_hi_lim'] <= 95000), 1, 0)
df_inputs_prepr['total_rev_hi_lim:>95K'] = np.where((df_inputs_prepr['total_rev_hi_lim'] > 95000), 1, 0)

# In[271]:
# installment
df_inputs_prepr['installment_factor'] = pd.cut(df_inputs_prepr['installment'], 50)
# Here we do fine-classing: using the 'cut' method, we split the variable into 50 categories by its values.
df_temp = woe_ordered_continuous(df_inputs_prepr, 'installment_factor', df_output_prep)
# We calculate weight of evidence.
df_temp.head()

# In[272]:
plot_by_woe(df_temp, 90)
# We plot the weight of evidence values.

# ### Preprocessing Continuous Variables: Creating Dummy Variables, Part 3
# In[273]:
# annual_inc
df_inputs_prepr['annual_inc_factor'] = pd.cut(df_inputs_prepr['annual_inc'], 50)
# Here we do fine-classing: using the 'cut' method, we split the variable into 50 categories by its values.
df_temp = woe_ordered_continuous(df_inputs_prepr, 'annual_inc_factor', df_output_prep)
# We calculate weight of evidence.
df_temp.head()

# In[274]:
df_inputs_prepr['annual_inc_factor'] = pd.cut(df_inputs_prepr['annual_inc'], 100)
# Here we do fine-classing: using the 'cut' method, we split the variable into 100 categories by its values.
df_temp = woe_ordered_continuous(df_inputs_prepr, 'annual_inc_factor', df_output_prep)
# We calculate weight of evidence.
df_temp.head()

# In[276]:
# Initial examination shows that there are too few individuals with a large income and too many with a small income.
# Hence, we keep one category for incomes above 140K and apply our usual approach to determine
# the categories for everyone with 140K or less.
df_inputs_prepr_temp = df_inputs_prepr.loc[df_inputs_prepr['annual_inc'] <= 140000, :]
#loan_data_temp = loan_data_temp.reset_index(drop = True)
#df_inputs_prepr_temp

# In[278]:
df_inputs_prepr_temp["annual_inc_factor"] = pd.cut(df_inputs_prepr_temp['annual_inc'], 50)
# Here we do fine-classing: using the 'cut' method, we split the variable into 50 categories by its values.
df_temp = woe_ordered_continuous(df_inputs_prepr_temp, 'annual_inc_factor', df_output_prep[df_inputs_prepr_temp.index])
# We calculate weight of evidence.
df_temp.head()

# In[279]:
plot_by_woe(df_temp, 90)
# We plot the weight of evidence values.

# In[280]:
# WoE changes roughly monotonically with income, so we split income into categories of roughly equal width
# (10K steps up to 100K, then 20K steps, with open-ended bins at the extremes).
df_inputs_prepr['annual_inc:<20K'] = np.where((df_inputs_prepr['annual_inc'] <= 20000), 1, 0)
df_inputs_prepr['annual_inc:20K-30K'] = np.where((df_inputs_prepr['annual_inc'] > 20000) & (df_inputs_prepr['annual_inc'] <= 30000), 1, 0)
df_inputs_prepr['annual_inc:30K-40K'] = np.where((df_inputs_prepr['annual_inc'] > 30000) & (df_inputs_prepr['annual_inc'] <= 40000), 1, 0)
df_inputs_prepr['annual_inc:40K-50K'] = np.where((df_inputs_prepr['annual_inc'] > 40000) & (df_inputs_prepr['annual_inc'] <= 50000), 1, 0)
df_inputs_prepr['annual_inc:50K-60K'] = np.where((df_inputs_prepr['annual_inc'] > 50000) & (df_inputs_prepr['annual_inc'] <= 60000), 1, 0)
df_inputs_prepr['annual_inc:60K-70K'] = np.where((df_inputs_prepr['annual_inc'] > 60000) & (df_inputs_prepr['annual_inc'] <= 70000), 1, 0)
df_inputs_prepr['annual_inc:70K-80K'] = np.where((df_inputs_prepr['annual_inc'] > 70000) & (df_inputs_prepr['annual_inc'] <= 80000), 1, 0)
df_inputs_prepr['annual_inc:80K-90K'] = np.where((df_inputs_prepr['annual_inc'] > 80000) & (df_inputs_prepr['annual_inc'] <= 90000), 1, 0)
df_inputs_prepr['annual_inc:90K-100K'] = np.where((df_inputs_prepr['annual_inc'] > 90000) & (df_inputs_prepr['annual_inc'] <= 100000), 1, 0)
df_inputs_prepr['annual_inc:100K-120K'] = np.where((df_inputs_prepr['annual_inc'] > 100000) & (df_inputs_prepr['annual_inc'] <= 120000), 1, 0)
df_inputs_prepr['annual_inc:120K-140K'] = np.where((df_inputs_prepr['annual_inc'] > 120000) & (df_inputs_prepr['annual_inc'] <= 140000), 1, 0)
df_inputs_prepr['annual_inc:>140K'] = np.where((df_inputs_prepr['annual_inc'] > 140000), 1, 0)

# In[281]:
# mths_since_last_delinq
# We have to create one category for missing values and do fine and coarse classing for the rest.
df_inputs_prepr_temp = df_inputs_prepr[pd.notnull(df_inputs_prepr['mths_since_last_delinq'])]
df_inputs_prepr_temp['mths_since_last_delinq_factor'] = pd.cut(df_inputs_prepr_temp['mths_since_last_delinq'], 50)
df_temp = woe_ordered_continuous(df_inputs_prepr_temp, 'mths_since_last_delinq_factor', df_output_prep[df_inputs_prepr_temp.index])
# We calculate weight of evidence.
df_temp.head()
# In[282]:
plot_by_woe(df_temp, 90)
# We plot the weight of evidence values.

# In[283]:
# Categories: Missing, 0-3, 4-30, 31-56, >=57
df_inputs_prepr['mths_since_last_delinq:Missing'] = np.where((df_inputs_prepr['mths_since_last_delinq'].isnull()), 1, 0)
df_inputs_prepr['mths_since_last_delinq:0-3'] = np.where((df_inputs_prepr['mths_since_last_delinq'] >= 0) & (df_inputs_prepr['mths_since_last_delinq'] <= 3), 1, 0)
df_inputs_prepr['mths_since_last_delinq:4-30'] = np.where((df_inputs_prepr['mths_since_last_delinq'] >= 4) & (df_inputs_prepr['mths_since_last_delinq'] <= 30), 1, 0)
df_inputs_prepr['mths_since_last_delinq:31-56'] = np.where((df_inputs_prepr['mths_since_last_delinq'] >= 31) & (df_inputs_prepr['mths_since_last_delinq'] <= 56), 1, 0)
df_inputs_prepr['mths_since_last_delinq:>=57'] = np.where((df_inputs_prepr['mths_since_last_delinq'] >= 57), 1, 0)

# ### Preprocessing Continuous Variables: Creating Dummy Variables, Part 3
#
# In[284]:
# annual_inc
df_inputs_prepr['annual_inc_factor'] = pd.cut(df_inputs_prepr['annual_inc'], 50)
# Here we do fine-classing: using the 'cut' method, we split the variable into 50 categories by its values.
df_temp = woe_ordered_continuous(df_inputs_prepr, 'annual_inc_factor', df_output_prep)
# We calculate weight of evidence.
df_temp.head()
# 50 classes are not enough to fine-class annual income, as more than 94% of the observations fall in the first class.

# In[286]:
df_inputs_prepr['annual_inc_factor'] = pd.cut(df_inputs_prepr['annual_inc'], 100)
# Here we do fine-classing: using the 'cut' method, we split the variable into 100 categories by its values.
df_temp = woe_ordered_continuous(df_inputs_prepr, 'annual_inc_factor', df_output_prep)
# We calculate weight of evidence.
df_temp.head()

# In[287]:
# Initial examination shows that there are too few individuals with a large income and too many with a small income.
# Hence, we keep one category for incomes above 140K and apply our usual approach to determine
# the categories for everyone with 140K or less.
df_inputs_prepr_temp = df_inputs_prepr.loc[df_inputs_prepr['annual_inc'] <= 140000, :]
#loan_data_temp = loan_data_temp.reset_index(drop = True)
#df_inputs_prepr_temp

# In[288]:
df_inputs_prepr_temp["annual_inc_factor"] = pd.cut(df_inputs_prepr_temp['annual_inc'], 50)
# Here we do fine-classing: using the 'cut' method, we split the variable into 50 categories by its values.
df_temp = woe_ordered_continuous(df_inputs_prepr_temp, 'annual_inc_factor', df_output_prep[df_inputs_prepr_temp.index])
# We calculate weight of evidence.
df_temp.head()

# In[289]:
plot_by_woe(df_temp, 90)
# We plot the weight of evidence values.
# ######
# In[290]:
# WoE changes roughly monotonically with income, so we split income into categories of roughly equal width.
df_inputs_prepr['annual_inc:<20K'] = np.where((df_inputs_prepr['annual_inc'] <= 20000), 1, 0)
df_inputs_prepr['annual_inc:20K-30K'] = np.where((df_inputs_prepr['annual_inc'] > 20000) & (df_inputs_prepr['annual_inc'] <= 30000), 1, 0)
df_inputs_prepr['annual_inc:30K-40K'] = np.where((df_inputs_prepr['annual_inc'] > 30000) & (df_inputs_prepr['annual_inc'] <= 40000), 1, 0)
df_inputs_prepr['annual_inc:40K-50K'] = np.where((df_inputs_prepr['annual_inc'] > 40000) & (df_inputs_prepr['annual_inc'] <= 50000), 1, 0)
df_inputs_prepr['annual_inc:50K-60K'] = np.where((df_inputs_prepr['annual_inc'] > 50000) & (df_inputs_prepr['annual_inc'] <= 60000), 1, 0)
df_inputs_prepr['annual_inc:60K-70K'] = np.where((df_inputs_prepr['annual_inc'] > 60000) & (df_inputs_prepr['annual_inc'] <= 70000), 1, 0)
df_inputs_prepr['annual_inc:70K-80K'] = np.where((df_inputs_prepr['annual_inc'] > 70000) & (df_inputs_prepr['annual_inc'] <= 80000), 1, 0)
df_inputs_prepr['annual_inc:80K-90K'] = np.where((df_inputs_prepr['annual_inc'] > 80000) & (df_inputs_prepr['annual_inc'] <= 90000), 1, 0)
df_inputs_prepr['annual_inc:90K-100K'] = np.where((df_inputs_prepr['annual_inc'] > 90000) & (df_inputs_prepr['annual_inc'] <= 100000), 1, 0)
df_inputs_prepr['annual_inc:100K-120K'] = np.where((df_inputs_prepr['annual_inc'] > 100000) & (df_inputs_prepr['annual_inc'] <= 120000), 1, 0)
df_inputs_prepr['annual_inc:120K-140K'] = np.where((df_inputs_prepr['annual_inc'] > 120000) & (df_inputs_prepr['annual_inc'] <= 140000), 1, 0)
df_inputs_prepr['annual_inc:>140K'] = np.where((df_inputs_prepr['annual_inc'] > 140000), 1, 0)

# In[291]:
# mths_since_last_delinq
# We have to create one category for missing values and do fine and coarse classing for the rest.
df_inputs_prepr_temp = df_inputs_prepr[pd.notnull(df_inputs_prepr['mths_since_last_delinq'])]
df_inputs_prepr_temp['mths_since_last_delinq_factor'] = pd.cut(df_inputs_prepr_temp['mths_since_last_delinq'], 50)
df_temp = woe_ordered_continuous(df_inputs_prepr_temp, 'mths_since_last_delinq_factor', df_output_prep[df_inputs_prepr_temp.index])
# We calculate weight of evidence.
df_temp.head()

# In[292]:
plot_by_woe(df_temp, 90)
# We plot the weight of evidence values.

# In[293]:
# Categories: Missing, 0-3, 4-30, 31-56, >=57
df_inputs_prepr['mths_since_last_delinq:Missing'] = np.where((df_inputs_prepr['mths_since_last_delinq'].isnull()), 1, 0)
df_inputs_prepr['mths_since_last_delinq:0-3'] = np.where((df_inputs_prepr['mths_since_last_delinq'] >= 0) & (df_inputs_prepr['mths_since_last_delinq'] <= 3), 1, 0)
df_inputs_prepr['mths_since_last_delinq:4-30'] = np.where((df_inputs_prepr['mths_since_last_delinq'] >= 4) & (df_inputs_prepr['mths_since_last_delinq'] <= 30), 1, 0)
df_inputs_prepr['mths_since_last_delinq:31-56'] = np.where((df_inputs_prepr['mths_since_last_delinq'] >= 31) & (df_inputs_prepr['mths_since_last_delinq'] <= 56), 1, 0)
df_inputs_prepr['mths_since_last_delinq:>=57'] = np.where((df_inputs_prepr['mths_since_last_delinq'] >= 57), 1, 0)

# ### Preprocessing Continuous Variables: Creating Dummy Variables, Part 3
# In[294]:
# dti
df_inputs_prepr['dti_factor'] = pd.cut(df_inputs_prepr['dti'], 100)
# Here we do fine-classing: using the 'cut' method, we split the variable into 100 categories by its values.
df_temp = woe_ordered_continuous(df_inputs_prepr, 'dti_factor', df_output_prep)
# We calculate weight of evidence.
df_temp.head()

# In[295]:
plot_by_woe(df_temp, 90)
# We plot the weight of evidence values.

# In[296]:
# Similarly to income, initial examination shows that most dti values are below 35.
# Hence, we keep one category for dti greater than 35 and apply our usual approach to determine
# the categories for everyone with a dti of 35 or less.
df_inputs_prepr_temp = df_inputs_prepr.loc[df_inputs_prepr['dti'] <= 35, :]

# In[297]:
df_inputs_prepr_temp['dti_factor'] = pd.cut(df_inputs_prepr_temp['dti'], 50)
# Here we do fine-classing: using the 'cut' method, we split the variable into 50 categories by its values.
df_temp = woe_ordered_continuous(df_inputs_prepr_temp, 'dti_factor', df_output_prep[df_inputs_prepr_temp.index])
# We calculate weight of evidence.
df_temp.head()

# In[298]:
plot_by_woe(df_temp, 90)
# We plot the weight of evidence values.

# In[299]:
# Categories:
df_inputs_prepr['dti:<=1.4'] = np.where((df_inputs_prepr['dti'] <= 1.4), 1, 0)
df_inputs_prepr['dti:1.4-3.5'] = np.where((df_inputs_prepr['dti'] > 1.4) & (df_inputs_prepr['dti'] <= 3.5), 1, 0)
df_inputs_prepr['dti:3.5-7.7'] = np.where((df_inputs_prepr['dti'] > 3.5) & (df_inputs_prepr['dti'] <= 7.7), 1, 0)
df_inputs_prepr['dti:7.7-10.5'] = np.where((df_inputs_prepr['dti'] > 7.7) & (df_inputs_prepr['dti'] <= 10.5), 1, 0)
df_inputs_prepr['dti:10.5-16.1'] = np.where((df_inputs_prepr['dti'] > 10.5) & (df_inputs_prepr['dti'] <= 16.1), 1, 0)
df_inputs_prepr['dti:16.1-20.3'] = np.where((df_inputs_prepr['dti'] > 16.1) & (df_inputs_prepr['dti'] <= 20.3), 1, 0)
df_inputs_prepr['dti:20.3-21.7'] = np.where((df_inputs_prepr['dti'] > 20.3) & (df_inputs_prepr['dti'] <= 21.7), 1, 0)
df_inputs_prepr['dti:21.7-22.4'] = np.where((df_inputs_prepr['dti'] > 21.7) & (df_inputs_prepr['dti'] <= 22.4), 1, 0)
df_inputs_prepr['dti:22.4-35'] = np.where((df_inputs_prepr['dti'] > 22.4) & (df_inputs_prepr['dti'] <= 35), 1, 0)
df_inputs_prepr['dti:>35'] = np.where((df_inputs_prepr['dti'] > 35), 1, 0)

# In[300]:
# mths_since_last_record
# We have to create one category for missing values and do fine and coarse classing for the rest.
df_inputs_prepr_temp = df_inputs_prepr[pd.notnull(df_inputs_prepr['mths_since_last_record'])]
#sum(loan_data_temp['mths_since_last_record'].isnull())
df_inputs_prepr_temp['mths_since_last_record_factor'] = pd.cut(df_inputs_prepr_temp['mths_since_last_record'], 50)
# Here we do fine-classing: using the 'cut' method, we split the variable into 50 categories by its values.
df_temp = woe_ordered_continuous(df_inputs_prepr_temp, 'mths_since_last_record_factor', df_output_prep[df_inputs_prepr_temp.index])
# We calculate weight of evidence.
df_temp.head()

# In[301]:
plot_by_woe(df_temp, 90)
# We plot the weight of evidence values.

# In[302]:
# Categories: 'Missing', '0-2', '3-20', '21-31', '32-80', '81-86', '>86'
df_inputs_prepr['mths_since_last_record:Missing'] = np.where((df_inputs_prepr['mths_since_last_record'].isnull()), 1, 0)
df_inputs_prepr['mths_since_last_record:0-2'] = np.where((df_inputs_prepr['mths_since_last_record'] >= 0) & (df_inputs_prepr['mths_since_last_record'] <= 2), 1, 0)
df_inputs_prepr['mths_since_last_record:3-20'] = np.where((df_inputs_prepr['mths_since_last_record'] >= 3) & (df_inputs_prepr['mths_since_last_record'] <= 20), 1, 0)
df_inputs_prepr['mths_since_last_record:21-31'] = np.where((df_inputs_prepr['mths_since_last_record'] >= 21) & (df_inputs_prepr['mths_since_last_record'] <= 31), 1, 0)
df_inputs_prepr['mths_since_last_record:32-80'] = np.where((df_inputs_prepr['mths_since_last_record'] >= 32) & (df_inputs_prepr['mths_since_last_record'] <= 80), 1, 0)
df_inputs_prepr['mths_since_last_record:81-86'] = np.where((df_inputs_prepr['mths_since_last_record'] >= 81) & (df_inputs_prepr['mths_since_last_record'] <= 86), 1, 0)
df_inputs_prepr['mths_since_last_record:>86'] = np.where((df_inputs_prepr['mths_since_last_record'] > 86), 1, 0)

# In[303]:
loan_data_inputs_train = df_inputs_prepr.copy()

# In[304]:
loan_data_inputs_train.describe()

# ## Preprocessing the Test Dataset
# In[305]:
df_input_prep = loan_data_inputs_test
df_output_prep = loan_data_outputs_test

# In[306]:
df_input_prep.head()

# Discrete Data Preprocessing
# Discrete variables are already categorical, so there is no need to create dummy variables using fine and coarse classing.
# We only calculate WOE and Information Value to assess whether the variable helps predict the dependent variable.
# In[308]:
df_input_prep['grade'].unique()

# In[309]:
df1 = pd.concat([df_input_prep['grade'], df_output_prep], axis=1)

# In[310]:
df1.head()

# In[311]:
df1.tail()

# ## Weight of evidence of the discrete variable Grade
#
# In[312]:
df1.groupby(df1.columns.values[0], as_index=False)[df1.columns.values[1]].count()

# In[313]:
df1.groupby(df1.columns.values[0], as_index=False)[df1.columns.values[1]].mean()

# In[314]:
df1 = pd.concat([df1.groupby(df1.columns.values[0], as_index=False)[df1.columns.values[1]].count(),
                 df1.groupby(df1.columns.values[0], as_index=False)[df1.columns.values[1]].mean()], axis=1)

# In[315]:
df1.head()

# In[316]:
df1 = df1.iloc[:, [0, 1, 3]]

# In[317]:
df1.columns = [df1.columns.values[0], 'n_obs', 'prop_good']

# In[319]:
df1['no_good'] = df1['prop_good'] * df1['n_obs']
df1['no_bad'] = (1 - df1['prop_good']) * df1['n_obs']

# In[320]:
df1['Final_good'] = df1['no_good'] / df1['no_good'].sum()
df1['Final_bad'] = df1['no_bad'] / df1['no_bad'].sum()

# In[321]:
df1['WOE'] = np.log(df1['Final_good'] / df1['Final_bad'])

# In[322]:
df1

# In[323]:
df1 = df1.sort_values(['WOE'])

# In[324]:
df1.head()

# In[ ]:
df1.reset_index(drop=True)

# In[326]:
df1['IV'] = (df1['Final_good'] - df1['Final_bad']) * df1['WOE']

# In[327]:
df1['IV'] = df1['IV'].sum()

# In[328]:
df1.head()

# The Information Value of grade is about 0.29, which falls in the 0.1-0.3 bracket.
# That means medium predictive power for the output variable.

# #### Function to calculate WOE
# In[330]:
def woe_discrete(df, independent_variable, dependent_variable):
    df = pd.concat([df[independent_variable], dependent_variable], axis=1)
    df = pd.concat([df.groupby(df.columns.values[0], as_index=False)[df.columns.values[1]].count(),
                    df.groupby(df.columns.values[0], as_index=False)[df.columns.values[1]].mean()], axis=1)
    df = df.iloc[:, [0, 1, 3]]
    df.columns = [df.columns.values[0], 'n_obs', 'prop_good']
    df['no_good'] = df['prop_good'] * df['n_obs']
    df['no_bad'] = (1 - df['prop_good']) * df['n_obs']
    df['Final_good'] = df['no_good'] / df['no_good'].sum()
    df['Final_bad'] = df['no_bad'] / df['no_bad'].sum()
    df['WOE'] = np.log(df['Final_good'] / df['Final_bad'])
    df = df.sort_values(['WOE'])
    df = df.reset_index(drop=True)
    df['IV'] = (df['Final_good'] - df['Final_bad']) * df['WOE']
    df['IV'] = df['IV'].sum()
    return df

# In[331]:
df_temp = woe_discrete(df_input_prep, 'grade', df_output_prep)

# In[332]:
df_temp.head()

# #### Visualizing WOE for discrete variables to interpret it
# In[333]:
sns.set()

# In[334]:
def plot_by_woe(df_woe, rotation_of_x_labels=0):
    x = np.array(df_woe.iloc[:, 0].apply(str))  # matplotlib works better with arrays than with dataframes
    y = df_woe['WOE']
    plt.figure(figsize=(18, 6))
    plt.plot(x, y, marker='o', linestyle='--', color='k')
    plt.xlabel(df_woe.columns[0])
    plt.ylabel('Weight of evidence')
    plt.title(str('Weight of evidence by ' + df_woe.columns[0]))
    plt.xticks(rotation=rotation_of_x_labels)

# In[335]:
plot_by_woe(df_temp)
# Keeping the dummy variable G (grade) as the reference category
# All the others go into the regression model

# ##### Home Ownership Variable
# In[337]:
df_input_prep.head()

# In[338]:
df_home_owner = woe_discrete(df_input_prep, 'home_ownership', df_output_prep)

# In[339]:
df_home_owner.head()

# In[340]:
df_home_owner.tail()

# In[341]:
plot_by_woe(df_home_owner)

# In[342]:
df_home_owner.head()

# In the 2nd column (n_obs) it is clearly visible that OTHER, NONE and ANY have very few observations in the dataset, so
# their WOE is of little use for predicting loan default. Still, it is not good to delete those categories, as they are
# among the riskiest values; it is better to combine them to retain a reasonable amount of information.
#
# For RENT the WOE is also very low, so we can combine it with OTHER, NONE and ANY.
# In[344]:
df_input_prep['home_ownership:RENT_OTHER_NONE_ANY'] = sum([df_input_prep['home_ownership:OTHER'], df_input_prep['home_ownership:RENT'],
                                                            df_input_prep['home_ownership:NONE'], df_input_prep['home_ownership:ANY']])

# From the set of categorical variables that represent one original independent variable, we make the category with the
# lowest WOE value the reference category.

# #### address state discrete variable
# In[346]:
df_input_prep['addr_state'].unique()

# In[347]:
df_addr_state = woe_discrete(df_input_prep, 'addr_state', df_output_prep)

# In[348]:
df_addr_state.head()

# In[349]:
plot_by_woe(df_addr_state)

# In[350]:
if 'addr_state:ND' in df_input_prep.columns.values:
    pass
else:
    df_input_prep['addr_state:ND'] = 0

# In[351]:
plot_by_woe(df_addr_state.iloc[2:-2, :])

# Earlier the first two and last two states made it look as if all states from NV to DC were roughly similar, but ideally
# that is not the case.
# Combine NE, IA, NV, FL, AL, HI based on WOE and number of observations; all of these contain the worst borrowers, and their WOE is the lowest.
# Being conservative, add ND (North Dakota, which was not in the list earlier) to this category as well.
# The last ones, WV, NH, WY, DC and ME, ID, have good borrowers - combine them.
# In[355]:
plot_by_woe(df_addr_state.iloc[6:-6, :])

# VA, NM, NY, TN, MO, LA, OK, NC, MD, CA have similar WOE.
# However, NY and CA have many borrowers, so each of them will be a separate dummy variable.
# The final categories from VA to CA will be:
# 1. VA, NM
# 2. NY
# 3. TN, MO, LA, OK, NC, MD
# 4. CA
CA1074# In[358]:1075### THEN UT,NJ,AZ,KY1076# #### 1077# In[359]:1078# We create the following categories:1079# 'ND' 'NE' 'IA' NV' 'FL' 'HI' 'AL'1080# 'NM' 'VA'1081# 'NY'1082# 'OK' 'TN' 'MO' 'LA' 'MD' 'NC'1083# 'CA'1084# 'UT' 'KY' 'AZ' 'NJ'1085# 'AR' 'MI' 'PA' 'OH' 'MN'1086# 'RI' 'MA' 'DE' 'SD' 'IN'1087# 'GA' 'WA' 'OR'1088# 'WI' 'MT'1089# 'TX'1090# 'IL' 'CT'1091# 'KS' 'SC' 'CO' 'VT' 'AK' 'MS'1092# 'WV' 'NH' 'WY' 'DC' 'ME' 'ID'1093# 'IA_NV_HI_ID_AL_FL' will be the reference category.1094df_inputs_prepr = df_input_prep.copy()1095df_inputs_prepr['addr_state:ND_NE_IA_NV_FL_HI_AL'] = sum([df_inputs_prepr['addr_state:ND'], df_inputs_prepr['addr_state:NE'],1096                                              df_inputs_prepr['addr_state:IA'], df_inputs_prepr['addr_state:NV'],1097                                              df_inputs_prepr['addr_state:FL'], df_inputs_prepr['addr_state:HI'],1098                                                          df_inputs_prepr['addr_state:AL']])1099df_inputs_prepr['addr_state:NM_VA'] = sum([df_inputs_prepr['addr_state:NM'], df_inputs_prepr['addr_state:VA']])1100df_inputs_prepr['addr_state:OK_TN_MO_LA_MD_NC'] = sum([df_inputs_prepr['addr_state:OK'], df_inputs_prepr['addr_state:TN'],1101                                              df_inputs_prepr['addr_state:MO'], df_inputs_prepr['addr_state:LA'],1102                                              df_inputs_prepr['addr_state:MD'], df_inputs_prepr['addr_state:NC']])1103df_inputs_prepr['addr_state:UT_KY_AZ_NJ'] = sum([df_inputs_prepr['addr_state:UT'], df_inputs_prepr['addr_state:KY'],1104                                              df_inputs_prepr['addr_state:AZ'], df_inputs_prepr['addr_state:NJ']])1105df_inputs_prepr['addr_state:AR_MI_PA_OH_MN'] = sum([df_inputs_prepr['addr_state:AR'], df_inputs_prepr['addr_state:MI'],1106                                              df_inputs_prepr['addr_state:PA'], df_inputs_prepr['addr_state:OH'],1107                                              df_inputs_prepr['addr_state:MN']])1108df_inputs_prepr['addr_state:RI_MA_DE_SD_IN'] = sum([df_inputs_prepr['addr_state:RI'], df_inputs_prepr['addr_state:MA'],1109                                              df_inputs_prepr['addr_state:DE'], df_inputs_prepr['addr_state:SD'],1110                                              df_inputs_prepr['addr_state:IN']])1111df_inputs_prepr['addr_state:GA_WA_OR'] = sum([df_inputs_prepr['addr_state:GA'], df_inputs_prepr['addr_state:WA'],1112                                              df_inputs_prepr['addr_state:OR']])1113df_inputs_prepr['addr_state:WI_MT'] = sum([df_inputs_prepr['addr_state:WI'], df_inputs_prepr['addr_state:MT']])1114df_inputs_prepr['addr_state:IL_CT'] = sum([df_inputs_prepr['addr_state:IL'], df_inputs_prepr['addr_state:CT']])1115df_inputs_prepr['addr_state:KS_SC_CO_VT_AK_MS'] = sum([df_inputs_prepr['addr_state:KS'], df_inputs_prepr['addr_state:SC'],1116                                              df_inputs_prepr['addr_state:CO'], df_inputs_prepr['addr_state:VT'],1117                                              df_inputs_prepr['addr_state:AK'], df_inputs_prepr['addr_state:MS']])1118df_inputs_prepr['addr_state:WV_NH_WY_DC_ME_ID'] = sum([df_inputs_prepr['addr_state:WV'], df_inputs_prepr['addr_state:NH'],1119                                              df_inputs_prepr['addr_state:WY'], df_inputs_prepr['addr_state:DC'],1120                                              df_inputs_prepr['addr_state:ME'], df_inputs_prepr['addr_state:ID']])1121# In[361]:1122df_inputs_prepr.head()1123# #### 
verification status discrete variable1124# In[362]:1125df_inputs_prepr['verification_status'].unique()1126# In[363]:1127df_verification_status=woe_discrete(df_input_prep,'verification_status',df_output_prep)1128# In[364]:1129df_verification_status.head()1130# In[365]:1131plot_by_woe(df_verification_status)1132# #### purpose discrete variable1133# In[366]:1134df_inputs_prepr['purpose'].unique()1135# In[367]:1136df_purpose=woe_discrete(df_input_prep,'purpose',df_output_prep)1137# In[368]:1138df_purpose.head()1139# In[369]:1140plot_by_woe(df_purpose)1141# In[370]:1142# We combine 'educational', 'small_business', 'wedding', 'renewable_energy', 'moving', 'house' in one category: 'educ__sm_b__wedd__ren_en__mov__house'.1143# We combine 'other', 'medical', 'vacation' in one category: 'oth__med__vacation'.1144# We combine 'major_purchase', 'car', 'home_improvement' in one category: 'major_purch__car__home_impr'.1145# We leave 'debt_consolidation' in a separate category.1146# We leave 'credit_card' in a separate category.1147# 'educ__sm_b__wedd__ren_en__mov__house' will be the reference category.1148df_inputs_prepr['purpose:educ__sm_b__wedd__ren_en__mov__house'] = sum([df_inputs_prepr['purpose:educational'], df_inputs_prepr['purpose:small_business'],1149                                                                 df_inputs_prepr['purpose:wedding'], df_inputs_prepr['purpose:renewable_energy'],1150                                                                 df_inputs_prepr['purpose:moving'], df_inputs_prepr['purpose:house']])1151df_inputs_prepr['purpose:oth__med__vacation'] = sum([df_inputs_prepr['purpose:other'], df_inputs_prepr['purpose:medical'],1152                                             df_inputs_prepr['purpose:vacation']])1153df_inputs_prepr['purpose:major_purch__car__home_impr'] = sum([df_inputs_prepr['purpose:major_purchase'], df_inputs_prepr['purpose:car'],1154                                                        df_inputs_prepr['purpose:home_improvement']])1155# In[371]:1156# 'initial_list_status'1157df_initial_list_status = woe_discrete(df_inputs_prepr, 'initial_list_status', df_output_prep)1158df_initial_list_status1159# In[372]:1160plot_by_woe(df_initial_list_status)1161# We plot the weight of evidence values.1162# ### Preprocessing Continuous Variables: Automating Calculations and Visualizing Results1163# When we calculate and plot the weights of evidence of the categories of a continuous variable, we sort them by the variable's own1164# values in ascending order, not by WoE1165# In[139]:1166# WoE function for ordered discrete and continuous variables1167def woe_ordered_continuous(df, discrete_variable_name, good_bad_variable_df):1168    df = pd.concat([df[discrete_variable_name], good_bad_variable_df], axis = 1)1169    df = pd.concat([df.groupby(df.columns.values[0], as_index = False)[df.columns.values[1]].count(),1170                    df.groupby(df.columns.values[0], as_index = False)[df.columns.values[1]].mean()], axis = 1)1171    df = df.iloc[:, [0, 1, 3]]1172    df.columns = [df.columns.values[0], 'n_obs', 'prop_good']1173    df['prop_n_obs'] = df['n_obs'] / df['n_obs'].sum()1174    df['n_good'] = df['prop_good'] * df['n_obs']1175    df['n_bad'] = (1 - df['prop_good']) * df['n_obs']1176    df['prop_n_good'] = df['n_good'] / df['n_good'].sum()1177    df['prop_n_bad'] = df['n_bad'] / df['n_bad'].sum()1178    df['WoE'] = np.log(df['prop_n_good'] / df['prop_n_bad'])1179    #df = df.sort_values(['WoE'])1180    #df = df.reset_index(drop = True)1181    df['diff_prop_good'] = df['prop_good'].diff().abs()1182    df['diff_WoE'] = df['WoE'].diff().abs()1183    df['IV'] = (df['prop_n_good'] - df['prop_n_bad']) * df['WoE']1184    df['IV'] = df['IV'].sum()1185    return df1186# Here we define a function similar to the one above, ...1187# ... with one slight difference: we keep the results in the order of the variable's own values instead of sorting by WoE.1188# The function takes 3 arguments: a dataframe, a string, and a dataframe. The function returns a dataframe as a result.1189
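# --- Editor's note: the purpose categories above are merged by summing existing dummy columns.
# --- An equivalent formulation (sketched here for comparison only, not part of the original
# --- notebook) builds the combined dummy straight from the raw 'purpose' column; it gives the
# --- same 0/1 result because each loan has exactly one purpose value.
# df_inputs_prepr['purpose:oth__med__vacation'] = np.where(
#     df_inputs_prepr['purpose'].isin(['other', 'medical', 'vacation']), 1, 0)
# ---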
# ### Preprocessing Continuous Variables: Creating Dummy Variables, Part 11190# In[140]:1191# term1192df_inputs_prepr['term_int'].unique()1193# There are only two unique values, 36 and 60.1194# In[141]:1195df_term_int = woe_ordered_continuous(df_inputs_prepr, 'term_int', df_output_prep)1196# We calculate weight of evidence.1197df_term_int1198# In[142]:1199def plot_by_woe(df_woe,rotation_of_x_labels=0):1200    x = np.array(df_woe.iloc[:,0].apply(str)) ## matplotlib works better with arrays than dataframes1201    y = df_woe['WoE']1202    plt.figure(figsize=(18,6))1203    plt.plot(x,y,marker='o',linestyle='--',color='k')1204    plt.xlabel(df_woe.columns[0])1205    plt.ylabel('Weight of evidence')1206    plt.title(str('Weight of evidence by ' + df_woe.columns[0]))1207    plt.xticks(rotation = rotation_of_x_labels)1208# In[143]:1209plot_by_woe(df_term_int)1210# We plot the weight of evidence values.1211# In[144]:1212##emp_length_int1213# In[145]:1214# Leave as is.1215# '60' will be the reference category.1216df_inputs_prepr['term:36'] = np.where((df_inputs_prepr['term_int'] == 36), 1, 0)1217df_inputs_prepr['term:60'] = np.where((df_inputs_prepr['term_int'] == 60), 1, 0)1218# In[146]:1219# emp_length_int1220df_inputs_prepr['emp_length_int'].unique()1221# Has only 11 levels: from 0 to 10. Hence, we turn it into a factor with 11 levels.1222# In[147]:1223df_temp = woe_ordered_continuous(df_inputs_prepr, 'emp_length_int', df_output_prep)1224# We calculate weight of evidence.1225df_temp.head()1226# In[148]:1227plot_by_woe(df_temp)1228# In[149]:1229# We create the following categories: '0', '1', '2 - 4', '5 - 6', '7 - 9', '10'1230# '0' will be the reference category1231df_inputs_prepr['emp_length:0'] = np.where(df_inputs_prepr['emp_length_int'].isin([0]), 1, 0)1232df_inputs_prepr['emp_length:1'] = np.where(df_inputs_prepr['emp_length_int'].isin([1]), 1, 0)1233df_inputs_prepr['emp_length:2-4'] = np.where(df_inputs_prepr['emp_length_int'].isin(range(2, 5)), 1, 0)1234df_inputs_prepr['emp_length:5-6'] = np.where(df_inputs_prepr['emp_length_int'].isin(range(5, 7)), 1, 0)1235df_inputs_prepr['emp_length:7-9'] = np.where(df_inputs_prepr['emp_length_int'].isin(range(7, 10)), 1, 0)1236df_inputs_prepr['emp_length:10'] = np.where(df_inputs_prepr['emp_length_int'].isin([10]), 1, 0)1237# In[150]:1238## months since issue1239# In[151]:1240df_inputs_prepr.head(5)1241# In[152]:1242df_inputs_prepr.mths_since_issue.unique()1243# In[153]:1244### Fine classing of a continuous or high-cardinality ordered variable1245# In[154]:1246df_inputs_prepr['mths_since_issue_factor'] = pd.cut(df_inputs_prepr['mths_since_issue'], 50)1247# Here we do fine-classing: using the 'cut' method, we split the variable into 50 categories by its values.1248# We keep the bins in a separate 'factor' column so that the numeric 'mths_since_issue' stays available for the coarse-classing dummies below.# In[155]:1249df_inputs_prepr.mths_since_issue_factor.unique()1250# In[156]:1251df_inputs_prepr['mths_since_issue_factor']1252# In[157]:1253# mths_since_issue_d1254df_mnths_since_issue = woe_ordered_continuous(df_inputs_prepr, 'mths_since_issue_factor', df_output_prep)1255# We calculate weight of evidence.1256df_mnths_since_issue.head()1257
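# --- Editor's note: a small, self-contained illustration (on simulated numbers, not the loan data)
# --- of what the fine-classing step above produces: pd.cut returns ordered Interval categories, and
# --- value_counts() on the factor shows how observations spread over the bins before coarse classes are chosen.
demo_values = pd.Series(np.random.RandomState(0).randint(30, 127, size=1000), name='demo_mths')
demo_factor = pd.cut(demo_values, 10)                    # 10 equal-width bins (50 is used above)
# demo_factor.dtype                                      # ordered categorical of Interval objects
# demo_factor.value_counts().sort_index()                # observations per bin, in bin order
# ---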
# In[158]:1258plot_by_woe(df_mnths_since_issue)1259# In[159]:1260plot_by_woe(df_mnths_since_issue,rotation_of_x_labels=90)1261# In[160]:1262plot_by_woe(df_mnths_since_issue.iloc[3: , : ], 90)1263# We plot the weight of evidence values.1264# In[161]:1265# We create the following categories:1266# < 38, 38 - 39, 40 - 41, 42 - 48, 49 - 52, 53 - 64, 65 - 84, > 84.1267df_inputs_prepr['mths_since_issue_d:<38'] = np.where(df_inputs_prepr['mths_since_issue'].isin(range(38)), 1, 0)1268df_inputs_prepr['mths_since_issue_d:38-39'] = np.where(df_inputs_prepr['mths_since_issue'].isin(range(38, 40)), 1, 0)1269df_inputs_prepr['mths_since_issue_d:40-41'] = np.where(df_inputs_prepr['mths_since_issue'].isin(range(40, 42)), 1, 0)1270df_inputs_prepr['mths_since_issue_d:42-48'] = np.where(df_inputs_prepr['mths_since_issue'].isin(range(42, 49)), 1, 0)1271df_inputs_prepr['mths_since_issue_d:49-52'] = np.where(df_inputs_prepr['mths_since_issue'].isin(range(49, 53)), 1, 0)1272df_inputs_prepr['mths_since_issue_d:53-64'] = np.where(df_inputs_prepr['mths_since_issue'].isin(range(53, 65)), 1, 0)1273df_inputs_prepr['mths_since_issue_d:65-84'] = np.where(df_inputs_prepr['mths_since_issue'].isin(range(65, 85)), 1, 0)1274df_inputs_prepr['mths_since_issue_d:>84'] = np.where(df_inputs_prepr['mths_since_issue'].isin(range(85, 127)), 1, 0)1275# In[162]:1276df_inputs_prepr['int_rate'].unique()1277# ### Fine classing1278# In[163]:1279# int_rate1280df_inputs_prepr['int_rate_factor'] = pd.cut(df_inputs_prepr['int_rate'], 50)1281# Here we do fine-classing: using the 'cut' method, we split the variable into 50 categories by its values.1282# In[164]:1283df_inputs_prepr['int_rate_factor'].unique()1284# In[165]:1285df_inputs_prepr['int_rate_factor'].head()1286# In[166]:1287df_temp = woe_ordered_continuous(df_inputs_prepr, 'int_rate_factor', df_output_prep)1288# We calculate weight of evidence.1289df_temp.head()1290# In[167]:1291plot_by_woe(df_temp,rotation_of_x_labels=90)1292# In[168]:1293#### The greater the interest rate, the lower the WOE and the higher the probability of default (riskier borrowers)1294# In[169]:1295# '< 9.548', '9.548 - 12.025', '12.025 - 15.74', '15.74 - 20.281', '> 20.281'1296# In[170]:1297df_inputs_prepr['int_rate:<9.548'] = np.where((df_inputs_prepr['int_rate'] <= 9.548), 1, 0)1298df_inputs_prepr['int_rate:9.548-12.025'] = np.where((df_inputs_prepr['int_rate'] > 9.548) & (df_inputs_prepr['int_rate'] <= 12.025), 1, 0)1299df_inputs_prepr['int_rate:12.025-15.74'] = np.where((df_inputs_prepr['int_rate'] > 12.025) & (df_inputs_prepr['int_rate'] <= 15.74), 1, 0)1300df_inputs_prepr['int_rate:15.74-20.281'] = np.where((df_inputs_prepr['int_rate'] > 15.74) & (df_inputs_prepr['int_rate'] <= 20.281), 1, 0)1301df_inputs_prepr['int_rate:>20.281'] = np.where((df_inputs_prepr['int_rate'] > 20.281), 1, 0)1302# In[ ]:1303# In[171]:1304df_inputs_prepr.head(3)1305# In[172]:1306df_inputs_prepr['funded_amnt'].unique()1307# In[173]:1308# funded_amnt1309df_inputs_prepr['funded_amnt_factor'] = pd.cut(df_inputs_prepr['funded_amnt'], 50)1310# Here we do fine-classing: using the 'cut' method, we split the variable into 50 categories by its values.1311df_temp = woe_ordered_continuous(df_inputs_prepr, 'funded_amnt_factor', df_output_prep)1312# We calculate weight of evidence.1313df_temp.head()1314# In[174]:1315plot_by_woe(df_temp,rotation_of_x_labels=90)1316# In[175]:1317### No need to include funded amount in the PD model, as its WOE is nearly flat, i.e. WOE is practically independent of the funded amount1318
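# --- Editor's note: the dummy-creation blocks above and below all repeat the same np.where pattern.
# --- A small helper along these lines (a sketch, not part of the original notebook; the function
# --- name and signature are made up) would build the same 0/1 band columns from a list of upper
# --- cut-off values:
def make_band_dummies(df, column, upper_bounds, prefix=None):
    # upper_bounds: ascending list of inclusive upper edges, e.g. [9.548, 12.025, 15.74, 20.281]
    prefix = prefix if prefix is not None else column
    lower = None
    for upper in upper_bounds:
        if lower is None:
            # first band: everything up to and including the first cut-off
            df['{}:<={}'.format(prefix, upper)] = np.where(df[column] <= upper, 1, 0)
        else:
            # middle bands: (lower, upper]
            df['{}:{}-{}'.format(prefix, lower, upper)] = np.where((df[column] > lower) & (df[column] <= upper), 1, 0)
        lower = upper
    # last band: everything above the final cut-off
    df['{}:>{}'.format(prefix, lower)] = np.where(df[column] > lower, 1, 0)
    return df
# Example call, reproducing the int_rate bands created above:
# make_band_dummies(df_inputs_prepr, 'int_rate', [9.548, 12.025, 15.74, 20.281])
# ---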
# ### Data Preparation: Continuous Variables, Part 1 and 21319# In[176]:1320# mths_since_earliest_cr_line1321df_inputs_prepr['mths_since_earliest_cr_line_factor'] = pd.cut(df_inputs_prepr['mths_since_earliest_cr_line'], 50)1322# Here we do fine-classing: using the 'cut' method, we split the variable into 50 categories by its values.1323df_temp = woe_ordered_continuous(df_inputs_prepr, 'mths_since_earliest_cr_line_factor', df_output_prep)1324# We calculate weight of evidence.1325df_temp.head()1326# In[177]:1327plot_by_woe(df_temp, 90)1328# We plot the weight of evidence values.1329# In[178]:1330plot_by_woe(df_temp.iloc[6: , : ], 90)1331# We plot the weight of evidence values.1332# In[179]:1333# We create the following categories:1334# < 140, # 141 - 164, # 165 - 247, # 248 - 270, # 271 - 352, # > 3521335df_inputs_prepr['mths_since_earliest_cr_line:<140'] = np.where(df_inputs_prepr['mths_since_earliest_cr_line'].isin(range(140)), 1, 0)1336df_inputs_prepr['mths_since_earliest_cr_line:141-164'] = np.where(df_inputs_prepr['mths_since_earliest_cr_line'].isin(range(140, 165)), 1, 0)1337df_inputs_prepr['mths_since_earliest_cr_line:165-247'] = np.where(df_inputs_prepr['mths_since_earliest_cr_line'].isin(range(165, 248)), 1, 0)1338df_inputs_prepr['mths_since_earliest_cr_line:248-270'] = np.where(df_inputs_prepr['mths_since_earliest_cr_line'].isin(range(248, 271)), 1, 0)1339df_inputs_prepr['mths_since_earliest_cr_line:271-352'] = np.where(df_inputs_prepr['mths_since_earliest_cr_line'].isin(range(271, 353)), 1, 0)1340df_inputs_prepr['mths_since_earliest_cr_line:>352'] = np.where(df_inputs_prepr['mths_since_earliest_cr_line'].isin(range(353, int(df_inputs_prepr['mths_since_earliest_cr_line'].max()) + 1)), 1, 0)  # +1 so the maximum value itself is included1341# In[180]:1342# delinq_2yrs1343df_temp = woe_ordered_continuous(df_inputs_prepr, 'delinq_2yrs', df_output_prep)1344# We calculate weight of evidence.1345df_temp.head()1346# In[181]:1347plot_by_woe(df_temp)1348# We plot the weight of evidence values.1349# In[182]:1350# Categories: 0, 1-3, >=41351df_inputs_prepr['delinq_2yrs:0'] = np.where((df_inputs_prepr['delinq_2yrs'] == 0), 1, 0)1352df_inputs_prepr['delinq_2yrs:1-3'] = np.where((df_inputs_prepr['delinq_2yrs'] >= 1) & (df_inputs_prepr['delinq_2yrs'] <= 3), 1, 0)1353df_inputs_prepr['delinq_2yrs:>=4'] = np.where((df_inputs_prepr['delinq_2yrs'] >= 4), 1, 0)1354# In[183]:1355# inq_last_6mths1356df_temp = woe_ordered_continuous(df_inputs_prepr, 'inq_last_6mths', df_output_prep)1357# We calculate weight of evidence.1358df_temp.head()1359# In[184]:1360plot_by_woe(df_temp)1361# We plot the weight of evidence values.1362# In[185]:1363# Categories: 0, 1 - 2, 3 - 6, > 61364df_inputs_prepr['inq_last_6mths:0'] = np.where((df_inputs_prepr['inq_last_6mths'] == 0), 1, 0)1365df_inputs_prepr['inq_last_6mths:1-2'] = np.where((df_inputs_prepr['inq_last_6mths'] >= 1) & (df_inputs_prepr['inq_last_6mths'] <= 2), 1, 0)1366df_inputs_prepr['inq_last_6mths:3-6'] = np.where((df_inputs_prepr['inq_last_6mths'] >= 3) & (df_inputs_prepr['inq_last_6mths'] <= 6), 1, 0)1367df_inputs_prepr['inq_last_6mths:>6'] = np.where((df_inputs_prepr['inq_last_6mths'] > 6), 1, 0)1368# In[186]:1369# open_acc1370df_temp = woe_ordered_continuous(df_inputs_prepr, 'open_acc', df_output_prep)1371# We calculate weight of evidence.1372df_temp.head()1373# In[187]:1374plot_by_woe(df_temp, 90)1375# We plot the weight of evidence values.1376# In[188]:1377plot_by_woe(df_temp.iloc[ : 40, :], 90)1378# We plot the weight of evidence values.1379# In[189]:1380# Categories: '0', '1-3', '4-12', '13-17', '18-22', '23-25', '26-30', '>=31'1381df_inputs_prepr['open_acc:0'] = 
np.where((df_inputs_prepr['open_acc'] == 0), 1, 0)1382df_inputs_prepr['open_acc:1-3'] = np.where((df_inputs_prepr['open_acc'] >= 1) & (df_inputs_prepr['open_acc'] <= 3), 1, 0)1383df_inputs_prepr['open_acc:4-12'] = np.where((df_inputs_prepr['open_acc'] >= 4) & (df_inputs_prepr['open_acc'] <= 12), 1, 0)1384df_inputs_prepr['open_acc:13-17'] = np.where((df_inputs_prepr['open_acc'] >= 13) & (df_inputs_prepr['open_acc'] <= 17), 1, 0)1385df_inputs_prepr['open_acc:18-22'] = np.where((df_inputs_prepr['open_acc'] >= 18) & (df_inputs_prepr['open_acc'] <= 22), 1, 0)1386df_inputs_prepr['open_acc:23-25'] = np.where((df_inputs_prepr['open_acc'] >= 23) & (df_inputs_prepr['open_acc'] <= 25), 1, 0)1387df_inputs_prepr['open_acc:26-30'] = np.where((df_inputs_prepr['open_acc'] >= 26) & (df_inputs_prepr['open_acc'] <= 30), 1, 0)1388df_inputs_prepr['open_acc:>=31'] = np.where((df_inputs_prepr['open_acc'] >= 31), 1, 0)1389# In[190]:1390# pub_rec1391df_temp = woe_ordered_continuous(df_inputs_prepr, 'pub_rec', df_output_prep)1392# We calculate weight of evidence.1393df_temp.head()1394# In[191]:1395plot_by_woe(df_temp, 90)1396# We plot the weight of evidence values.1397# In[192]:1398# Categories '0-2', '3-4', '>=5'1399df_inputs_prepr['pub_rec:0-2'] = np.where((df_inputs_prepr['pub_rec'] >= 0) & (df_inputs_prepr['pub_rec'] <= 2), 1, 0)1400df_inputs_prepr['pub_rec:3-4'] = np.where((df_inputs_prepr['pub_rec'] >= 3) & (df_inputs_prepr['pub_rec'] <= 4), 1, 0)1401df_inputs_prepr['pub_rec:>=5'] = np.where((df_inputs_prepr['pub_rec'] >= 5), 1, 0)1402# In[193]:1403# total_acc1404df_inputs_prepr['total_acc_factor'] = pd.cut(df_inputs_prepr['total_acc'], 50)1405# Here we do fine-classing: using the 'cut' method, we split the variable into 50 categories by its values.1406df_temp = woe_ordered_continuous(df_inputs_prepr, 'total_acc_factor', df_output_prep)1407# We calculate weight of evidence.1408df_temp.head()1409# In[194]:1410plot_by_woe(df_temp, 90)1411# We plot the weight of evidence values.1412# In[195]:1413# Categories: '<=27', '28-51', '>51'1414df_inputs_prepr['total_acc:<=27'] = np.where((df_inputs_prepr['total_acc'] <= 27), 1, 0)1415df_inputs_prepr['total_acc:28-51'] = np.where((df_inputs_prepr['total_acc'] >= 28) & (df_inputs_prepr['total_acc'] <= 51), 1, 0)1416df_inputs_prepr['total_acc:>=52'] = np.where((df_inputs_prepr['total_acc'] >= 52), 1, 0)1417# In[196]:1418# acc_now_delinq1419df_temp = woe_ordered_continuous(df_inputs_prepr, 'acc_now_delinq', df_output_prep)1420df_temp.head()1421# In[197]:1422plot_by_woe(df_temp)1423# We plot the weight of evidence values.1424# In[198]:1425# Categories: '0', '>=1'1426df_inputs_prepr['acc_now_delinq:0'] = np.where((df_inputs_prepr['acc_now_delinq'] == 0), 1, 0)1427df_inputs_prepr['acc_now_delinq:>=1'] = np.where((df_inputs_prepr['acc_now_delinq'] >= 1), 1, 0)1428# In[199]:1429# total_rev_hi_lim1430df_inputs_prepr['total_rev_hi_lim_factor'] = pd.cut(df_inputs_prepr['total_rev_hi_lim'], 2000)1431# Here we do fine-classing: using the 'cut' method, we split the variable into 2000 categories by its values.1432df_temp = woe_ordered_continuous(df_inputs_prepr, 'total_rev_hi_lim_factor', df_output_prep)1433# We calculate weight of evidence.1434df_temp.head()1435# In[200]:1436plot_by_woe(df_temp.iloc[: 50, : ], 90)1437# We plot the weight of evidence values.1438# In[201]:1439# Categories1440# '<=5K', '5K-10K', '10K-20K', '20K-30K', '30K-40K', '40K-55K', '55K-95K', '>95K'1441df_inputs_prepr['total_rev_hi_lim:<=5K'] = np.where((df_inputs_prepr['total_rev_hi_lim'] <= 5000), 1, 
0)1442df_inputs_prepr['total_rev_hi_lim:5K-10K'] = np.where((df_inputs_prepr['total_rev_hi_lim'] > 5000) & (df_inputs_prepr['total_rev_hi_lim'] <= 10000), 1, 0)1443df_inputs_prepr['total_rev_hi_lim:10K-20K'] = np.where((df_inputs_prepr['total_rev_hi_lim'] > 10000) & (df_inputs_prepr['total_rev_hi_lim'] <= 20000), 1, 0)1444df_inputs_prepr['total_rev_hi_lim:20K-30K'] = np.where((df_inputs_prepr['total_rev_hi_lim'] > 20000) & (df_inputs_prepr['total_rev_hi_lim'] <= 30000), 1, 0)1445df_inputs_prepr['total_rev_hi_lim:30K-40K'] = np.where((df_inputs_prepr['total_rev_hi_lim'] > 30000) & (df_inputs_prepr['total_rev_hi_lim'] <= 40000), 1, 0)1446df_inputs_prepr['total_rev_hi_lim:40K-55K'] = np.where((df_inputs_prepr['total_rev_hi_lim'] > 40000) & (df_inputs_prepr['total_rev_hi_lim'] <= 55000), 1, 0)1447df_inputs_prepr['total_rev_hi_lim:55K-95K'] = np.where((df_inputs_prepr['total_rev_hi_lim'] > 55000) & (df_inputs_prepr['total_rev_hi_lim'] <= 95000), 1, 0)1448df_inputs_prepr['total_rev_hi_lim:>95K'] = np.where((df_inputs_prepr['total_rev_hi_lim'] > 95000), 1, 0)1449# In[202]:1450# installment1451df_inputs_prepr['installment_factor'] = pd.cut(df_inputs_prepr['installment'], 50)1452# Here we do fine-classing: using the 'cut' method, we split the variable into 50 categories by its values.1453df_temp = woe_ordered_continuous(df_inputs_prepr, 'installment_factor', df_output_prep)1454# We calculate weight of evidence.1455df_temp.head()1456# In[203]:1457plot_by_woe(df_temp, 90)1458# We plot the weight of evidence values.1459# ### Preprocessing Continuous Variables: Creating Dummy Variables, Part 31460# In[204]:1461# annual_inc1462df_inputs_prepr['annual_inc_factor'] = pd.cut(df_inputs_prepr['annual_inc'], 50)1463# Here we do fine-classing: using the 'cut' method, we split the variable into 50 categories by its values.1464df_temp = woe_ordered_continuous(df_inputs_prepr, 'annual_inc_factor', df_output_prep)1465# We calculate weight of evidence.1466df_temp.head()1467# In[205]:1468df_inputs_prepr['annual_inc_factor'] = pd.cut(df_inputs_prepr['annual_inc'], 100)1469# Here we do fine-classing: using the 'cut' method, we split the variable into 100 categories by its values.1470df_temp = woe_ordered_continuous(df_inputs_prepr, 'annual_inc_factor', df_output_prep)1471# We calculate weight of evidence.1472df_temp.head()1473# In[206]:1474# Initial examination shows that there are too few individuals with large income and too many with small income.1475# Hence, we are going to have one category for more than 150K, and we are going to apply our approach to determine1476# the categories of everyone with 140k or less.1477df_inputs_prepr_temp = df_inputs_prepr.loc[df_inputs_prepr['annual_inc'] <= 140000, : ]1478#loan_data_temp = loan_data_temp.reset_index(drop = True)1479#df_inputs_prepr_temp1480# In[207]:1481df_inputs_prepr_temp["annual_inc_factor"] = pd.cut(df_inputs_prepr_temp['annual_inc'], 50)1482# Here we do fine-classing: using the 'cut' method, we split the variable into 50 categories by its values.1483df_temp = woe_ordered_continuous(df_inputs_prepr_temp, 'annual_inc_factor', df_output_prep[df_inputs_prepr_temp.index])1484# We calculate weight of evidence.1485df_temp.head()1486# In[208]:1487plot_by_woe(df_temp, 90)1488# We plot the weight of evidence values.1489# In[209]:1490# WoE is monotonically decreasing with income, so we split income in 10 equal categories, each with width of 15k.1491df_inputs_prepr['annual_inc:<20K'] = np.where((df_inputs_prepr['annual_inc'] <= 20000), 1, 
0)1492df_inputs_prepr['annual_inc:20K-30K'] = np.where((df_inputs_prepr['annual_inc'] > 20000) & (df_inputs_prepr['annual_inc'] <= 30000), 1, 0)1493df_inputs_prepr['annual_inc:30K-40K'] = np.where((df_inputs_prepr['annual_inc'] > 30000) & (df_inputs_prepr['annual_inc'] <= 40000), 1, 0)1494df_inputs_prepr['annual_inc:40K-50K'] = np.where((df_inputs_prepr['annual_inc'] > 40000) & (df_inputs_prepr['annual_inc'] <= 50000), 1, 0)1495df_inputs_prepr['annual_inc:50K-60K'] = np.where((df_inputs_prepr['annual_inc'] > 50000) & (df_inputs_prepr['annual_inc'] <= 60000), 1, 0)1496df_inputs_prepr['annual_inc:60K-70K'] = np.where((df_inputs_prepr['annual_inc'] > 60000) & (df_inputs_prepr['annual_inc'] <= 70000), 1, 0)1497df_inputs_prepr['annual_inc:70K-80K'] = np.where((df_inputs_prepr['annual_inc'] > 70000) & (df_inputs_prepr['annual_inc'] <= 80000), 1, 0)1498df_inputs_prepr['annual_inc:80K-90K'] = np.where((df_inputs_prepr['annual_inc'] > 80000) & (df_inputs_prepr['annual_inc'] <= 90000), 1, 0)1499df_inputs_prepr['annual_inc:90K-100K'] = np.where((df_inputs_prepr['annual_inc'] > 90000) & (df_inputs_prepr['annual_inc'] <= 100000), 1, 0)1500df_inputs_prepr['annual_inc:100K-120K'] = np.where((df_inputs_prepr['annual_inc'] > 100000) & (df_inputs_prepr['annual_inc'] <= 120000), 1, 0)1501df_inputs_prepr['annual_inc:120K-140K'] = np.where((df_inputs_prepr['annual_inc'] > 120000) & (df_inputs_prepr['annual_inc'] <= 140000), 1, 0)1502df_inputs_prepr['annual_inc:>140K'] = np.where((df_inputs_prepr['annual_inc'] > 140000), 1, 0)1503# In[210]:1504# mths_since_last_delinq1505# We have to create one category for missing values and do fine and coarse classing for the rest.1506df_inputs_prepr_temp = df_inputs_prepr[pd.notnull(df_inputs_prepr['mths_since_last_delinq'])]1507df_inputs_prepr_temp['mths_since_last_delinq_factor'] = pd.cut(df_inputs_prepr_temp['mths_since_last_delinq'], 50)1508df_temp = woe_ordered_continuous(df_inputs_prepr_temp, 'mths_since_last_delinq_factor', df_output_prep[df_inputs_prepr_temp.index])1509# We calculate weight of evidence.1510df_temp.head()1511# In[211]:1512plot_by_woe(df_temp, 90)1513# We plot the weight of evidence values.1514# In[212]:1515# Categories: Missing, 0-3, 4-30, 31-56, >=571516df_inputs_prepr['mths_since_last_delinq:Missing'] = np.where((df_inputs_prepr['mths_since_last_delinq'].isnull()), 1, 0)1517df_inputs_prepr['mths_since_last_delinq:0-3'] = np.where((df_inputs_prepr['mths_since_last_delinq'] >= 0) & (df_inputs_prepr['mths_since_last_delinq'] <= 3), 1, 0)1518df_inputs_prepr['mths_since_last_delinq:4-30'] = np.where((df_inputs_prepr['mths_since_last_delinq'] >= 4) & (df_inputs_prepr['mths_since_last_delinq'] <= 30), 1, 0)1519df_inputs_prepr['mths_since_last_delinq:31-56'] = np.where((df_inputs_prepr['mths_since_last_delinq'] >= 31) & (df_inputs_prepr['mths_since_last_delinq'] <= 56), 1, 0)1520df_inputs_prepr['mths_since_last_delinq:>=57'] = np.where((df_inputs_prepr['mths_since_last_delinq'] >= 57), 1, 0)1521# ### Preprocessing Continuous Variables: Creating Dummy Variables, Part 31522# 1523# In[213]:1524# annual_inc1525df_inputs_prepr['annual_inc_factor'] = pd.cut(df_inputs_prepr['annual_inc'], 50)1526# Here we do fine-classing: using the 'cut' method, we split the variable into 50 categories by its values.1527df_temp = woe_ordered_continuous(df_inputs_prepr, 'annual_inc_factor', df_output_prep)1528# We calculate weight of evidence.1529df_temp.head()1530# In[214]:1531### 50 classes are not enough to fine class annual income as more than 94% lies in first class1532# 
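# --- Editor's note: for variables with missing values, such as mths_since_last_delinq handled above,
# --- the 'Missing' dummy plus the value bands should cover every row exactly once. A quick sanity
# --- check along these lines (a sketch, not part of the original notebook) can confirm that the
# --- categories are mutually exclusive and exhaustive:
# delinq_cols = [c for c in df_inputs_prepr.columns if c.startswith('mths_since_last_delinq:')]
# assert (df_inputs_prepr[delinq_cols].sum(axis=1) == 1).all(), 'categories overlap or leave gaps'
# ---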
In[215]:1533df_inputs_prepr['annual_inc_factor'] = pd.cut(df_inputs_prepr['annual_inc'], 100)1534# Here we do fine-classing: using the 'cut' method, we split the variable into 100 categories by its values.1535df_temp = woe_ordered_continuous(df_inputs_prepr, 'annual_inc_factor', df_output_prep)1536# We calculate weight of evidence.1537df_temp.head()1538# In[216]:1539# Initial examination shows that there are too few individuals with large income and too many with small income.1540# Hence, we are going to have one category for more than 150K, and we are going to apply our approach to determine1541# the categories of everyone with 140k or less.1542df_inputs_prepr_temp = df_inputs_prepr.loc[df_inputs_prepr['annual_inc'] <= 140000, : ]1543#loan_data_temp = loan_data_temp.reset_index(drop = True)1544#df_inputs_prepr_temp1545# In[217]:1546df_inputs_prepr_temp["annual_inc_factor"] = pd.cut(df_inputs_prepr_temp['annual_inc'], 50)1547# Here we do fine-classing: using the 'cut' method, we split the variable into 50 categories by its values.1548df_temp = woe_ordered_continuous(df_inputs_prepr_temp, 'annual_inc_factor', df_output_prep[df_inputs_prepr_temp.index])1549# We calculate weight of evidence.1550df_temp.head()1551# In[218]:1552plot_by_woe(df_temp, 90)1553# We plot the weight of evidence values.1554# ######1555# In[219]:1556# WoE is monotonically decreasing with income, so we split income in 10 equal categories, each with width of 15k.1557df_inputs_prepr['annual_inc:<20K'] = np.where((df_inputs_prepr['annual_inc'] <= 20000), 1, 0)1558df_inputs_prepr['annual_inc:20K-30K'] = np.where((df_inputs_prepr['annual_inc'] > 20000) & (df_inputs_prepr['annual_inc'] <= 30000), 1, 0)1559df_inputs_prepr['annual_inc:30K-40K'] = np.where((df_inputs_prepr['annual_inc'] > 30000) & (df_inputs_prepr['annual_inc'] <= 40000), 1, 0)1560df_inputs_prepr['annual_inc:40K-50K'] = np.where((df_inputs_prepr['annual_inc'] > 40000) & (df_inputs_prepr['annual_inc'] <= 50000), 1, 0)1561df_inputs_prepr['annual_inc:50K-60K'] = np.where((df_inputs_prepr['annual_inc'] > 50000) & (df_inputs_prepr['annual_inc'] <= 60000), 1, 0)1562df_inputs_prepr['annual_inc:60K-70K'] = np.where((df_inputs_prepr['annual_inc'] > 60000) & (df_inputs_prepr['annual_inc'] <= 70000), 1, 0)1563df_inputs_prepr['annual_inc:70K-80K'] = np.where((df_inputs_prepr['annual_inc'] > 70000) & (df_inputs_prepr['annual_inc'] <= 80000), 1, 0)1564df_inputs_prepr['annual_inc:80K-90K'] = np.where((df_inputs_prepr['annual_inc'] > 80000) & (df_inputs_prepr['annual_inc'] <= 90000), 1, 0)1565df_inputs_prepr['annual_inc:90K-100K'] = np.where((df_inputs_prepr['annual_inc'] > 90000) & (df_inputs_prepr['annual_inc'] <= 100000), 1, 0)1566df_inputs_prepr['annual_inc:100K-120K'] = np.where((df_inputs_prepr['annual_inc'] > 100000) & (df_inputs_prepr['annual_inc'] <= 120000), 1, 0)1567df_inputs_prepr['annual_inc:120K-140K'] = np.where((df_inputs_prepr['annual_inc'] > 120000) & (df_inputs_prepr['annual_inc'] <= 140000), 1, 0)1568df_inputs_prepr['annual_inc:>140K'] = np.where((df_inputs_prepr['annual_inc'] > 140000), 1, 0)1569# In[220]:1570# mths_since_last_delinq1571# We have to create one category for missing values and do fine and coarse classing for the rest.1572df_inputs_prepr_temp = df_inputs_prepr[pd.notnull(df_inputs_prepr['mths_since_last_delinq'])]1573df_inputs_prepr_temp['mths_since_last_delinq_factor'] = pd.cut(df_inputs_prepr_temp['mths_since_last_delinq'], 50)1574df_temp = woe_ordered_continuous(df_inputs_prepr_temp, 'mths_since_last_delinq_factor', 
df_output_prep[df_inputs_prepr_temp.index])1575# We calculate weight of evidence.1576df_temp.head()1577# In[221]:1578plot_by_woe(df_temp, 90)1579# We plot the weight of evidence values.1580# In[222]:1581# Categories: Missing, 0-3, 4-30, 31-56, >=571582df_inputs_prepr['mths_since_last_delinq:Missing'] = np.where((df_inputs_prepr['mths_since_last_delinq'].isnull()), 1, 0)1583df_inputs_prepr['mths_since_last_delinq:0-3'] = np.where((df_inputs_prepr['mths_since_last_delinq'] >= 0) & (df_inputs_prepr['mths_since_last_delinq'] <= 3), 1, 0)1584df_inputs_prepr['mths_since_last_delinq:4-30'] = np.where((df_inputs_prepr['mths_since_last_delinq'] >= 4) & (df_inputs_prepr['mths_since_last_delinq'] <= 30), 1, 0)1585df_inputs_prepr['mths_since_last_delinq:31-56'] = np.where((df_inputs_prepr['mths_since_last_delinq'] >= 31) & (df_inputs_prepr['mths_since_last_delinq'] <= 56), 1, 0)1586df_inputs_prepr['mths_since_last_delinq:>=57'] = np.where((df_inputs_prepr['mths_since_last_delinq'] >= 57), 1, 0)1587# ### Preprocessing Continuous Variables: Creating Dummy Variables, Part 31588# In[223]:1589# dti1590df_inputs_prepr['dti_factor'] = pd.cut(df_inputs_prepr['dti'], 100)1591# Here we do fine-classing: using the 'cut' method, we split the variable into 100 categories by its values.1592df_temp = woe_ordered_continuous(df_inputs_prepr, 'dti_factor', df_output_prep)1593# We calculate weight of evidence.1594df_temp.head()1595# In[224]:1596plot_by_woe(df_temp, 90)1597# We plot the weight of evidence values.1598# In[225]:1599# Similarly to income, initial examination shows that most values are lower than 200.1600# Hence, we are going to have one category for more than 35, and we are going to apply our approach to determine1601# the categories of everyone with 150k or less.1602df_inputs_prepr_temp = df_inputs_prepr.loc[df_inputs_prepr['dti'] <= 35, : ]1603# In[226]:1604df_inputs_prepr_temp['dti_factor'] = pd.cut(df_inputs_prepr_temp['dti'], 50)1605# Here we do fine-classing: using the 'cut' method, we split the variable into 50 categories by its values.1606df_temp = woe_ordered_continuous(df_inputs_prepr_temp, 'dti_factor', df_output_prep[df_inputs_prepr_temp.index])1607# We calculate weight of evidence.1608df_temp.head()1609# In[227]:1610plot_by_woe(df_temp, 90)1611# We plot the weight of evidence values.1612# In[228]:1613# Categories:1614df_inputs_prepr['dti:<=1.4'] = np.where((df_inputs_prepr['dti'] <= 1.4), 1, 0)1615df_inputs_prepr['dti:1.4-3.5'] = np.where((df_inputs_prepr['dti'] > 1.4) & (df_inputs_prepr['dti'] <= 3.5), 1, 0)1616df_inputs_prepr['dti:3.5-7.7'] = np.where((df_inputs_prepr['dti'] > 3.5) & (df_inputs_prepr['dti'] <= 7.7), 1, 0)1617df_inputs_prepr['dti:7.7-10.5'] = np.where((df_inputs_prepr['dti'] > 7.7) & (df_inputs_prepr['dti'] <= 10.5), 1, 0)1618df_inputs_prepr['dti:10.5-16.1'] = np.where((df_inputs_prepr['dti'] > 10.5) & (df_inputs_prepr['dti'] <= 16.1), 1, 0)1619df_inputs_prepr['dti:16.1-20.3'] = np.where((df_inputs_prepr['dti'] > 16.1) & (df_inputs_prepr['dti'] <= 20.3), 1, 0)1620df_inputs_prepr['dti:20.3-21.7'] = np.where((df_inputs_prepr['dti'] > 20.3) & (df_inputs_prepr['dti'] <= 21.7), 1, 0)1621df_inputs_prepr['dti:21.7-22.4'] = np.where((df_inputs_prepr['dti'] > 21.7) & (df_inputs_prepr['dti'] <= 22.4), 1, 0)1622df_inputs_prepr['dti:22.4-35'] = np.where((df_inputs_prepr['dti'] > 22.4) & (df_inputs_prepr['dti'] <= 35), 1, 0)1623df_inputs_prepr['dti:>35'] = np.where((df_inputs_prepr['dti'] > 35), 1, 0)1624# In[229]:1625# mths_since_last_record1626# We have to create one category for missing 
values and do fine and coarse classing for the rest.1627df_inputs_prepr_temp = df_inputs_prepr[pd.notnull(df_inputs_prepr['mths_since_last_record'])]1628#sum(loan_data_temp['mths_since_last_record'].isnull())1629df_inputs_prepr_temp['mths_since_last_record_factor'] = pd.cut(df_inputs_prepr_temp['mths_since_last_record'], 50)1630# Here we do fine-classing: using the 'cut' method, we split the variable into 50 categories by its values.1631df_temp = woe_ordered_continuous(df_inputs_prepr_temp, 'mths_since_last_record_factor', df_output_prep[df_inputs_prepr_temp.index])1632# We calculate weight of evidence.1633df_temp.head()1634# In[230]:1635plot_by_woe(df_temp, 90)1636# We plot the weight of evidence values.1637# In[231]:1638# Categories: 'Missing', '0-2', '3-20', '21-31', '32-80', '81-86', '>86'1639df_inputs_prepr['mths_since_last_record:Missing'] = np.where((df_inputs_prepr['mths_since_last_record'].isnull()), 1, 0)1640df_inputs_prepr['mths_since_last_record:0-2'] = np.where((df_inputs_prepr['mths_since_last_record'] >= 0) & (df_inputs_prepr['mths_since_last_record'] <= 2), 1, 0)1641df_inputs_prepr['mths_since_last_record:3-20'] = np.where((df_inputs_prepr['mths_since_last_record'] >= 3) & (df_inputs_prepr['mths_since_last_record'] <= 20), 1, 0)1642df_inputs_prepr['mths_since_last_record:21-31'] = np.where((df_inputs_prepr['mths_since_last_record'] >= 21) & (df_inputs_prepr['mths_since_last_record'] <= 31), 1, 0)1643df_inputs_prepr['mths_since_last_record:32-80'] = np.where((df_inputs_prepr['mths_since_last_record'] >= 32) & (df_inputs_prepr['mths_since_last_record'] <= 80), 1, 0)1644df_inputs_prepr['mths_since_last_record:81-86'] = np.where((df_inputs_prepr['mths_since_last_record'] >= 81) & (df_inputs_prepr['mths_since_last_record'] <= 86), 1, 0)1645df_inputs_prepr['mths_since_last_record:>86'] = np.where((df_inputs_prepr['mths_since_last_record'] > 86), 1, 0)1646# In[232]:1647df_inputs_prepr.describe()1648# In[233]:1649loan_data_inputs_test = df_inputs_prepr.copy()1650# In[ ]:1651# In[234]:1652# loan_data_inputs_train.to_csv('loan_data_inputs_train.csv')1653# loan_data_outputs_train.to_csv('loan_data_targets_train.csv')1654# loan_data_inputs_test.to_csv('loan_data_inputs_test.csv')1655# loan_data_outputs_test.to_csv('loan_data_targets_test.csv')...Preprocessing.py
Source:Preprocessing.py  
1# -*- coding: utf-8 -*-2"""3Created on Fri Oct 2 16:52:29 202045@author: Abhimanyu Trakroo6"""789import numpy as np10import pandas as pd1112loan_data_backup = pd.read_csv('......\\loan_data_2007_2014.csv')13loan_data = loan_data_backup.copy()1415## Explore Data16#loan_data17pd.options.display.max_columns = None # used to display all data at once.. no truncating18#note that above code .. can be modified to display certain number of columns19#loan_data2021loan_data.head()22loan_data.tail()23loan_data.columns.values #displaying all the column heads24loan_data['emp_length'].isnull().sum()25#When we imported the data, we got a message that a column contains mixed value types. Thus, just 26# to know27loan_data.info() 28loan_data.describe()29    # Displays column names, complete (non-missing) cases per column, and datatype per column.30#Please note that the describe method still shows the mean if there are missing values3132#PREPROCESSING STARTS33## General Preprocessing343536### Preprocessing few CONTINUOUS VARIABLES /start37#SPOT THE OBJECT categories which should be Float/Integers38loan_data['emp_length'].unique()39# Displays unique values of a column.40#Notice the years at the end of numbers and the data type = object 41#CONVERSION into numbers/continuous variables4243#BE VERY VIGILANT ABOUT THE SEQUENCE OF REPLACEMENT44loan_data['emp_length_int'] = loan_data['emp_length'].str.replace('\+ years', '')45#'\' above indicates that there is a number before it which we do not want to replace46#Now, note the unique values47#loan_data['emp_length_int'].unique() #10 is still an object but do not worry. Will convert that later4849loan_data['emp_length_int'] = loan_data['emp_length_int'].replace('< 1 year', int(0))50#notice that the value you are replacing with is '0'51#str(0)52#<1 year is classified as 0 as we already have 1 year in the dataset5354loan_data['emp_length_int'] = loan_data['emp_length_int'].replace('n/a',  int(0))55#Also, NOTE THAT 'NAN' VALUES ARE PROCESSED AS 'n/a' 56#'n/a' is classified as 0 as we assume that it is 0<<<------ IMPORTANT to note when to replace57#with 0 and when to use no value as ''585960loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace(' years', '')61loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace(' year', '')62# We store the preprocessed 'employment length' variable in a new variable called 'employment length int'.63# We assign the new 'employment length int' to be equal to the 'employment length' variable with the string '+ years'64# replaced with nothing. Next, we replace the whole string 'less than 1 year' with the string '0'.65# Then, we replace the 'n/a' string with the string '0'. Then, we replace the string 'space years' with nothing.66# Finally, we replace the string 'space year' with nothing. 
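# --- Editor's note: an equivalent, more compact way to obtain the same numeric employment-length
# --- column, sketched here for comparison only (not part of the original script). A single regex
# --- extracts the leading digits; '< 1 year' is forced to 0 explicitly, and 'n/a' / NaN fall
# --- through to 0 via fillna.
# loan_data['emp_length_int'] = (
#     loan_data['emp_length']
#     .str.extract(r'(\d+)', expand=False)              # '10+ years' -> '10', '3 years' -> '3', 'n/a' -> NaN
#     .astype(float)
#     .where(loan_data['emp_length'] != '< 1 year', 0)  # treat '< 1 year' explicitly as 0
#     .fillna(0)
# )
# ---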
6768#type(loan_data['emp_length_int'][0])69# Checks the datatype of a single element of a column.7071loan_data['emp_length_int'] = pd.to_numeric(loan_data['emp_length_int'])72# Transforms the values to numeric.7374#type(loan_data['emp_length_int'][0])75# Checks the datatype of a single element of a column.76#pd.DataFrame(loan_data['emp_length_int']).info()77loan_data['emp_length_int'].unique()7879####################emp_length_int has been converted to integer########################8081#converting TERM into integer82loan_data['term'].unique()83#There is only 'space months' as extra8485loan_data['term_int']=loan_data['term'].str.replace(' months','')86loan_data['term_int'].unique()87loan_data['term_int']=pd.to_numeric(loan_data['term_int'])88loan_data['term_int'].unique()89#type(loan_data['term_int'][0]) #important to mention index to pick any value and check it's type9091##### Preprocessing Date variables #######92loan_data['earliest_cr_line'].unique()93# Displays a column.94loan_data['earliest_cr_line'].isnull().sum()9596loan_data['earliest_cr_line_date'] = pd.to_datetime(loan_data['earliest_cr_line'], format='%b-%y')97#loan_data['earliest_cr_line_date'][loan_data['earliest_cr_line_date']=='Jan-1985']9899# Extracts the date and the time from a string variable that is in a given format.100pd.DataFrame(loan_data['earliest_cr_line_date']).info() #CHECK THE CHANGE ----->> IT IS TIMESTAMP101#type(loan_data['earliest_cr_line_date'][0])102# Checks the datatype of a single element of a column.103#pd.to_datetime('2017-12-01') - loan_data['earliest_cr_line_date']104# Calculates the difference between two dates and times.105# Assume we are now in December 2017106loan_data['mths_since_earliest_cr_line'] = round(pd.to_numeric((pd.to_datetime('2017-12-01') - loan_data['earliest_cr_line_date']) / np.timedelta64(1, 'M')))107#pd.to_numeric was not required here but may be to make the code reusable, this is done108# We calculate the difference between two dates in months, turn it to numeric datatype and round it.109# We save the result in a new variable.110111#loan_data['mths_since_earliest_cr_line'].describe()112# Shows some descriptive statisics for the values of a column.113# Dates from 1969 and before are not being converted well, i.e., they have become 2069 and similar,114# and negative differences are being calculated.115loan_data['mths_since_earliest_cr_line'].max()116loan_data['mths_since_earliest_cr_line'].min()117#loan_data.loc[: , ['earliest_cr_line', 'earliest_cr_line_date', 'mths_since_earliest_cr_line']][loan_data['mths_since_earliest_cr_line'] < 0]118# We take three columns from the dataframe. 
Then, we display them only for the rows where a variable has negative value.119# There are 2303 strange negative values.120121loan_data['mths_since_earliest_cr_line'][loan_data['mths_since_earliest_cr_line'] < 0] = loan_data['mths_since_earliest_cr_line'].max()122# We set the rows that had negative differences to the maximum value.123124#min(loan_data['mths_since_earliest_cr_line'])125# Calculates and shows the minimum value of a column.126127################## earliiest credit line done##################128129##########preprocessing - issue_d##########################130131#loan_data['issue_d']132loan_data['issue_d_date'] = pd.to_datetime(loan_data['issue_d'],format='%b-%y')133loan_data['mths_since_issue_d'] = round(pd.to_numeric((pd.to_datetime('2017-12-01')-loan_data['issue_d_date'])/np.timedelta64(1,'M')))134#loan_data['mths_since_issue_d'].describe()135##################136137#########Just for fun - processing one more --> last_pymnt_d, This is not needed as it's a payment date not a variable##########################138loan_data['last_pymnt_d']139loan_data['mths_since_last_pymnt_d'] = round((pd.to_datetime('2017-12-01')-pd.to_datetime(loan_data['last_pymnt_d'],format='%b-%y'))/np.timedelta64(1,'M'))140#loan_data['mths_since_last_pymnt_d'].describe()141###################################This was not useful, but good practice########################################142143### Preprocessing few CONTINUOUS VARIBALES /ends144145### Preprocessing few CATEGORICAL VARIBALES /starts######################################146#loan_data.info() #SPOT THE OBJECT categories147###################### trying to pull only object categories148#list_objects = list() ## trying to pull only object categories149#for i in range(len(loan_data.iloc[1:2,:])) ## trying to pull only object categories150 #   if type(loan_data.iloc[:1,i])==object## trying to pull only object categories151  #     list_objects=list_objects.append(list(loan_data.iloc[0,i]))## trying to pull only object categories152######################        153154# Starting with Grade variable155#pd.get_dummies(loan_data['grade']) #created the dummy variables156#pd.get_dummies(loan_data['grade'], prefix = 'Grade', prefix_sep=':') #created dummy variables with separators157158#We want to create a new dataframe consisting all the dummy variables and then append it to the original dataframe as we 159#run regressions on the original dataset160#### FOR NOW THE DUMMIES ARE JUST BEING CREATED FOR DISCRETE VARIABLES 161### lATER WE WOULD USE np.where METHOD WITH isin(range()) TO COMPARE THE values of continuous variables162###... 
and assign them 0 or 1 and save them in a class ##Dummies for continouos variables 163loan_data_dummies = [pd.get_dummies(loan_data['grade'], prefix = 'grade', prefix_sep = ':'),164                     pd.get_dummies(loan_data['sub_grade'], prefix = 'sub_grade', prefix_sep = ':'),165                     pd.get_dummies(loan_data['home_ownership'], prefix = 'home_ownership', prefix_sep = ':'),166                     pd.get_dummies(loan_data['verification_status'], prefix = 'verification_status', prefix_sep = ':'),167                     pd.get_dummies(loan_data['loan_status'], prefix = 'loan_status', prefix_sep = ':'),168                     pd.get_dummies(loan_data['purpose'], prefix = 'purpose', prefix_sep = ':'),169                     pd.get_dummies(loan_data['addr_state'], prefix = 'addr_state', prefix_sep = ':'),170                     pd.get_dummies(loan_data['initial_list_status'], prefix = 'initial_list_status', prefix_sep = ':')]171# We create dummy variables from all 8 original independent variables, and save them into a list.172# Note that we are using a particular naming convention for all variables: original variable name, colon, category name.173174#pd.get_dummies(loan_data['addr_state'], prefix = 'addr_state', prefix_sep = ':').to_csv("C:\\Users\\Abhimanyu Trakroo\\Downloads\\Udemy-Credit_risk_in_python\\test for add_state.csv")175176loan_data_dummies = pd.concat(loan_data_dummies, axis = 1)177# We concatenate the dummy variables and this turns them into a dataframe.178#type(loan_data_dummies)179# Returns the type of the variable.180181loan_data = pd.concat([loan_data, loan_data_dummies], axis = 1)182# Concatenates two dataframes.183# Here we concatenate the dataframe with original data with the dataframe with dummy variables, along the columns. 184185#loan_data.columns.values186# Displays all column names. 
and check if all the dummy variables are concatenated187188189190191########################## Check for missing values and clean ###########################192loan_data.isnull()193# It returns 'False' if a value is not missing and 'True' if a value is missing, for each value in a dataframe.194#pd.options.display.max_rows = None195# Sets the pandas dataframe options to display all columns/ rows.196loan_data.isnull().sum()197#pd.options.display.max_rows = 100198# Sets the pandas dataframe options to display 100 columns/ rows.199# 'Total revolving high credit/ credit limit', so it makes sense that the missing values are equal to funded_amnt.200loan_data['total_rev_hi_lim'].fillna(loan_data['funded_amnt'], inplace=True)201# We fill the missing values with the values of another variable.202203#loan_data['total_rev_hi_lim'].isnull().sum()204205206############      TREATMENT OF MISSING VARIABLES FOR THE: (âannual_incâ) (MEAN REPLACEMENT), 207#ZERO REPLACEMENT:âmths_since_earliest_cr_lineâ, âacc_now_delinqâ, âtotal_accâ, âpub_recâ, âopen_accâ,208#âinq_last_6mthsâ209#âdelinq_2yrsâ210#âemp_length_intâ  ###############211212#loan_data['annual_inc'].isnull().sum()213loan_data['annual_inc'].fillna(loan_data['annual_inc'].mean(),inplace=True)214#loan_data['annual_inc'].isnull().sum()215######216217#loan_data['mths_since_earliest_cr_line'].isnull().sum()218loan_data['mths_since_earliest_cr_line'].fillna(int(0),inplace=True)219#loan_data['mths_since_earliest_cr_line'].isnull().sum()220221########222#Remember this thing223#type(int(0))224#type(str(0))225226#loan_data['acc_now_delinq'].isnull().sum()227loan_data['acc_now_delinq'].fillna(int(0),inplace=True)228#loan_data['acc_now_delinq'].isnull().sum()229#loan_data['acc_now_delinq'][loan_data['acc_now_delinq']==str(0)]=int(0)230########231232#loan_data['total_acc'].isnull().sum()233loan_data['total_acc'].fillna(int(0),inplace=True)234#loan_data['total_acc'].isnull().sum()235#loan_data['total_acc'][loan_data['total_acc']==str(0)]=int(0)236###############237238#loan_data['pub_rec'].isnull().sum()239loan_data['pub_rec'].fillna(int(0),inplace=True)240#loan_data['pub_rec'].isnull().sum()241#loan_data['pub_rec'][loan_data['pub_rec']==str(0)]=int(0)242###########243244#loan_data['open_acc'].isnull().sum()245loan_data['open_acc'].fillna(int(0),inplace=True)246#loan_data['open_acc'].isnull().sum()247#loan_data['open_acc'][loan_data['open_acc']==str(0)]=int(0)248249##################250251#loan_data['inq_last_6mths'].isnull().sum()252loan_data['inq_last_6mths'].fillna(int(0),inplace=True)253#loan_data['inq_last_6mths'].isnull().sum()254#loan_data['inq_last_6mths'][loan_data['inq_last_6mths']==str(0)]=int(0)255256#the following statements helped me find the replaced str(0) characters257#loan_data['inq_last_6mths'].value_counts() #finding out replaced str(0)258#loan_data['inq_last_6mths'][loan_data['inq_last_6mths']==int(0)]259#and now they will be 
replaced260#loan_data['inq_last_6mths'][loan_data['inq_last_6mths']==str(0)]=int(0)261#loan_data['inq_last_6mths'].isnull().sum()262##########263264#loan_data['delinq_2yrs'].isnull().sum()265loan_data['delinq_2yrs'].fillna(int(0),inplace=True)266#loan_data['delinq_2yrs'].isnull().sum()267#loan_data['delinq_2yrs'][loan_data['delinq_2yrs']==str(0)]=int(0)268269################270271#loan_data['emp_length_int'].isnull().sum()272loan_data['emp_length_int'].fillna(int(0),inplace=True)273#loan_data['emp_length_int'].isnull().sum()274#loan_data['emp_length_int'][loan_data['emp_length_int']==str(0)]=int(0)275276###############################################################################################277278279################# PD MODEL BEGINS ######################################280################# PD MODEL BEGINS ######################################281################# PD MODEL BEGINS ######################################282################# PD MODEL BEGINS ######################################283################# PD MODEL BEGINS ######################################284285#STEP 1: DEFINING GOOD / BAD: 286#Dependent Variable. Good/ Bad (Default) Definition. Default and Non-default Accounts. 287288#loan_data['loan_status'].unique()289# Displays unique values of a column. ALLOWS US TO DECIDE WHICH LOANS TO PUT AS DEFAULT AND WHICH NOT290291#loan_data['loan_status'].value_counts()292# Calculates the number of observations for each unique value of a variable.293294#loan_data['loan_status'].value_counts() / loan_data['loan_status'].count()295# We divide the number of observations for each unique value of a variable by the total number of observations.296# Thus, we get the proportion of observations for each unique value of a variable.297298# Good/ Bad Definition MOST IMPORTANT: PLEASE NOTE THAT WE HAVE USED 1 FOR NON-DEFAULT I.E. GOOD LOANS299300loan_data['good_bad'] = np.where(loan_data['loan_status'].isin(['Charged Off', 'Default','Does not meet the credit policy. Status:Charged Off','Late (31-120 days)']), 0, 1)301#MOST IMPORTANT DECISIVE FACTOR # We create a new variable that has the value of '0' if a condition is met, and the value of '1' if it is not met.302303#loan_data['good_bad'] #LOANS CLASSIFIED AS DEFAULT OR NON-DEFAULT304########################################################################################305################# PART OF THE PD MODEL ######################################306307#SPLITTING THE DATA INTO TRAINING AND TESTING308from sklearn.model_selection import train_test_split309310#NOTICE THAT THE FOLLOWING CODE MAKES US DROP A VARIABLE (DEPENDENT VARIABLE).311312#train_test_split(loan_data.drop('good_bad', axis = 1), loan_data['good_bad'])313# Takes a set of inputs and a set of targets as arguments. 
Splits the inputs and the targets into four dataframes:314# Inputs - Train, Inputs - Test, Targets - Train, Targets - Test.315316#loan_data_inputs_train, loan_data_inputs_test, loan_data_targets_train, loan_data_targets_test = train_test_split(loan_data.drop('good_bad', axis = 1), loan_data['good_bad'])317# We split two dataframes with inputs and targets, each into a train and test dataframe, and store them in variables.318319#loan_data_inputs_train.shape320# Displays the size of the dataframe.321322#loan_data_targets_train.shape323# Displays the size of the dataframe.324325#loan_data_inputs_test.shape326# Displays the size of the dataframe.327328#loan_data_targets_test.shape329# Displays the size of the dataframe.330331loan_data_inputs_train, loan_data_inputs_test, loan_data_targets_train, loan_data_targets_test = train_test_split(loan_data.drop('good_bad', axis = 1), loan_data['good_bad'], test_size = 0.2, random_state = 42)332333# We split two dataframes with inputs and targets, each into a train and test dataframe, and store them in variables.334# This time we set the size of the test dataset to be 20%.335# Respectively, the size of the train dataset becomes 80%.336# We also set a specific random state.337# This would allow us to perform the exact same split multimple times.338# This means, to assign the exact same observations to the train and test datasets.339340loan_data_inputs_train.shape341# Displays the size of the dataframe.342343loan_data_targets_train.shape344# Displays the size of the dataframe.345346loan_data_inputs_test.shape347# Displays the size of the dataframe.348349loan_data_targets_test.shape350# Displays the size of the dataframe.351352353########################## Data Preparation OF THE TRAINING DATASET ##################################354##########################This would include automating the process of calculating WoE and IV for several 355#variables356################# PART OF THE PD MODEL ######################################357#IF THE FOLLOWING TWO CODES ARE HASHED OUT MEANS THAT WE HAVE ALREADY PREPROCESSED TRAINING VARIABLES358# AND NOW ARE GONNA PREPROCESS TEST VARIABLES359#Otherwise hash them when preprocessing test variables360361362df_inputs_prepr = loan_data_inputs_train363df_targets_prepr = loan_data_targets_train364365366367368##### CREATED WORKING DATAFRAMES JUST LIKE A ROUGH SHEET, OPERATIONS ON WHICH WOULD NOT IMPACT MAIN DATASET369370#IMPORTANT: Note that the folloowing two lines of codes would be unhashed when we complete the preprocessing of training dataset371# When we complete the preprocessing the training dataset, we would need to save the results contained in 'df_inputs_prepr' in 372#.. in 'loan_data_inputs_train (i.e. reverse the above code)373# This is becasue the automated version of the code that we create in the following, is gonna basically run on 'df_input_prepr' 374#....as an input. Thus we need to save the dataframe, once the preprocessing of the inputs and targets is done 375#..for each, training and test data376377378379380#    381#df_inputs_prepr = loan_data_inputs_test382#df_targets_prepr = loan_data_targets_test383#... 
AND RUN THE FOLLOWING CODE AFTER UNHASHING THE ABOVE TWO LINES384385386387388######Preprocessing using different variables starting with 'grade': Calculation of WoE in predicting good/bad split389df_inputs_prepr['grade'].unique()390# Displays unique values of a column.391#JUST LIKE A REGRESSION MODEL WHERE ONE VARIABLE OF INDEPENDENT IS MATCHED WITH INDEPENDENT FOR THE MODEL TO LEARN 392df1 = pd.concat([df_inputs_prepr['grade'], df_targets_prepr], axis = 1)393# Concatenates two dataframes along the columns.394#df1.head()395396######Part of data preparation: Calculation of weight of index397#we want to split and count the data for good and bad398#Thus, we need to group it399#IMPORTANT: Since we want this code to be reusable thus using indexes i.e. df1.columns.values[0] because we know 400#whichever variable we use it will always be in the first column which is 0th index401#df1.groupby(df1.columns.values[0], as_index = False)[df1.columns.values[1]].count()402#df1.groupby(df1.columns.values[0])[df1.columns.values[1]].count() #Notice the difference in result when run 403#without index=False404#as_index=False lets the values in the dataframe be the part of dataset and not as an index405# Groups the data according to a criterion contained in one column.406# Does not turn the names of the values of the criterion (sub category of a variable) as indexes.407# Aggregates the data in another column, using a selected function.408# In this specific case, we group by the column with index 0 and we aggregate the values of the column with index 1.409# More specifically, we count them.410# In other words, we count the values in the column with index 1 for each value of the column with index 0.411412####### REMINDER THAT WE ARE WORKING WITH THE Train DATA and we will run the similar code for the test data as well413df1.groupby(df1.columns.values[0], as_index = False)[df1.columns.values[1]].mean()414#the statement works as to calculate the propostion of good borrowers as bad borrowers would anyway have the value415#of '0' 416# Groups the data according to a criterion contained in one column.417# Does not turn the names of the values of the criterion as indexes.418# Aggregates the data in another column, using a selected function.419# Here we calculate the mean of the values in the column with index 1 for each value of the column with index 0.420421#Merging the dataframes with the count of each grade and the mean (i.e. 
df1 = pd.concat([df1.groupby(df1.columns.values[0], as_index = False)[df1.columns.values[1]].count(),
                df1.groupby(df1.columns.values[0], as_index = False)[df1.columns.values[1]].mean()], axis = 1)
# Concatenates two dataframes along the columns.
df1

# Keeping just one column of grades.
df1 = df1.iloc[:, [0, 1, 3]]
# Selects only the columns with specific indexes.
df1

df1.columns = [df1.columns.values[0], 'n_obs', 'prop_good']  # df1.columns and df1.columns.values are the same here
# Changes the names of the columns of the dataframe.
df1

df1['prop_n_obs'] = df1['n_obs'] / df1['n_obs'].sum()
# We divide the values of one column by the values of another column and save the result in a new variable.
df1

df1['n_good'] = df1['prop_good'] * df1['n_obs']
# We multiply the values of one column by the values of another column and save the result in a new variable.
df1['n_bad'] = (1 - df1['prop_good']) * df1['n_obs']
df1

#### REMINDER: we are still working with the TRAIN data.
df1['prop_n_good'] = df1['n_good'] / df1['n_good'].sum()
df1['prop_n_bad'] = df1['n_bad'] / df1['n_bad'].sum()
df1

df1['WoE'] = np.log(df1['prop_n_good'] / df1['prop_n_bad'])
# We take the natural logarithm of the ratio and save the result in a new variable.
df1

df1 = df1.sort_values(['WoE'])
# Sorts the dataframe by the values of a given column.

# After sorting by WoE the original index is out of order, so we reset it for readability.
df1 = df1.reset_index(drop = True)
# We reset the index of the dataframe and overwrite it.
df1

# The following differences are not strictly needed at this point; they become useful later, when deciding
# which neighbouring categories to combine during coarse classing.
# .diff() subtracts each value of a column from the value in the following row.
df1['diff_prop_good'] = df1['prop_good'].diff().abs()
# We take the difference between two subsequent values of the column and then its absolute value,
# which makes the size of the differences easier to compare.

df1['diff_WoE'] = df1['WoE'].diff().abs()
# The same for the WoE column: the difference between two subsequent values, in absolute terms.
df1

df1['IV'] = (df1['prop_n_good'] - df1['prop_n_bad']) * df1['WoE']
# Calculates the IV contribution of each category; strictly speaking, IV belongs to the variable as a whole,
# not to an individual category.

df1['IV'] = df1['IV'].sum()
# We sum all values of the column, so the same total IV is assigned to each category.

df1
################# PART OF THE PD MODEL ######################################
############################ AUTOMATING THE CALCULATION OF WoE FOR ALL DISCRETE VARIABLES #################################

def woe_discrete(df, discrete_variable_name, good_bad_variable_df):
    df = pd.concat([df[discrete_variable_name], good_bad_variable_df], axis = 1)
    # Before this function is invoked, 'df' is still the dataframe that came out of the train/test split.
    # We pass the TRAINING inputs as 'df' and the TRAINING targets as 'good_bad_variable_df',
    # i.e. the data after we have already:
    #   - converted date and number formats for the continuous variables;
    #   - created dummy variables for the discrete variables (except good/bad);
    #   - checked for missing values and imputed them;
    #   - analysed the loan_status values and assigned 0/1 according to the business logic;
    #   - split the data into training and testing (the inputs dataframe drops the dependent variable);
    #   - calculated WoE and IV manually for one variable ('grade') to set the basis for this code.
    # Inside the function we rebuild the working dataframe 'df' from the selected variable and the targets.
    # Remember that good_bad_variable_df contains only the dependent variable.

    # Groups by the variable and provides the count (total borrowers) and the mean (% good) for each category.
    df = pd.concat([df.groupby(df.columns.values[0], as_index = False)[df.columns.values[1]].count(),
                    df.groupby(df.columns.values[0], as_index = False)[df.columns.values[1]].mean()], axis = 1)
    # This creates a dataframe with one row per class of the specified variable, holding the number of
    # borrowers and the proportion of good borrowers in each category.

    df = df.iloc[:, [0, 1, 3]]  # drops the duplicated variable column produced by the concat above
    df.columns = [df.columns.values[0], 'n_obs', 'prop_good']  # rename the columns to something intuitive
    df['prop_n_obs'] = df['n_obs'] / df['n_obs'].sum()  # proportion of observations in each category
    df['n_good'] = df['prop_good'] * df['n_obs']  # proportion of good converted into a number of good borrowers

    # prop_good is the mean WITHIN each category. To relate a category to the whole good population we go:
    # prop_good -> number of goods in the category -> total number of goods across categories ->
    # prop_n_good, the share of all good borrowers that falls into the category.
    # We could have used .sum() instead of .mean() and avoided this detour,
    # BUT we need prop_n_good later when combining categories.

    df['n_bad'] = (1 - df['prop_good']) * df['n_obs']  # number of bad borrowers in each category
    df['prop_n_good'] = df['n_good'] / df['n_good'].sum()  # share of all good borrowers in each category
    df['prop_n_bad'] = df['n_bad'] / df['n_bad'].sum()  # share of all bad borrowers in each category
    df['WoE'] = np.log(df['prop_n_good'] / df['prop_n_bad'])  # weight of evidence
    df = df.sort_values(['WoE'])  # sort the categories by WoE
    df = df.reset_index(drop = True)  # reset the index to avoid confusion
    df['diff_prop_good'] = df['prop_good'].diff().abs()  # row-wise difference in % good borrowers
    df['diff_WoE'] = df['WoE'].diff().abs()  # row-wise difference in WoE
    df['IV'] = (df['prop_n_good'] - df['prop_n_bad']) * df['WoE']  # IV contribution of each category
    df['IV'] = df['IV'].sum()  # assign the total IV of the variable to the whole column
    return df  # df now contains everything we want

df_temp = woe_discrete(df_inputs_prepr, 'grade', df_targets_prepr)  # testing whether the function works
df_temp

df_temp2 = woe_discrete(df_inputs_prepr, 'emp_length', df_targets_prepr)  # calling it on emp_length to check the implementation
df_temp2
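# [Illustrative check, not part of the original script] A tiny, self-contained sanity check of the WoE/IV
# arithmetic implemented in woe_discrete(): per category WoE = ln(prop_n_good / prop_n_bad), and
# IV = sum((prop_n_good - prop_n_bad) * WoE) over all categories. The toy data below is made up; uncomment to run.
#
# toy_inputs = pd.DataFrame({'grade': ['A'] * 100 + ['B'] * 100})
# toy_targets = pd.Series([1] * 90 + [0] * 10 + [1] * 50 + [0] * 50, name = 'good_bad')
# woe_discrete(toy_inputs, 'grade', toy_targets)
# # Category 'A' (90% good) should get a positive WoE, category 'B' (50% good) a negative one,
# # and the 'IV' column should hold one identical total for both rows.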
################# PART OF THE PD MODEL ######################################
################### Preprocessing Discrete Variables: Automating Visualization of Results ############################

import matplotlib.pyplot as plt
import seaborn as sns
# Imports the libraries we need.
sns.set()
# We set the default style of the graphs to the seaborn style.

# Below we define a function that takes 2 arguments: a dataframe and a number.
# The number parameter has a default value of 0.
# IMPORTANT: setting a default value means that if we call the function and omit that parameter, it runs with the value 0.
# The function displays a graph.
def plot_by_woe(df_WoE, rotation_of_x_axis_labels = 0):
    x = np.array(df_WoE.iloc[:, 0].apply(str))
    # Turns the values of the column with index 0 into strings and makes a numpy array from them.
    # apply(str) makes sure we have usable text on the x-axis;
    # np.array is used because matplotlib works more smoothly with numpy arrays than with dataframes.
    y = df_WoE['WoE']
    # Selects the column with label 'WoE' and passes it to variable y.
    plt.figure(figsize = (18, 6))
    # Sets the graph size to width 18 by height 6 inches.
    plt.plot(x, y, marker = 'o', linestyle = '--', color = 'k')
    # Plots the datapoints with x and y coordinates, circle markers, a dashed line, and black color.
    plt.xlabel(df_WoE.columns[0])
    # Names the x-axis after the column with index 0.
    plt.ylabel('Weight of Evidence')
    # Names the y-axis 'Weight of Evidence'.
    plt.title(str('Weight of Evidence by ' + df_WoE.columns[0]))
    # Names the graph 'Weight of Evidence by ' plus the name of the column with index 0.
    plt.xticks(rotation = rotation_of_x_axis_labels)
    # IMPORTANT: rotates the labels of the x-axis by the given number of degrees.

plot_by_woe(df_temp)  # calling this to check the implementation
# We execute the function with the necessary argument, a dataframe,
# and omit the number argument, so the function uses its default value, 0.

################# PART OF THE PD MODEL ######################################
################## Preprocessing Discrete Variables: Creating Dummy Variables, Part 1

pd.options.display.max_columns = None
###################### 'home_ownership'
df_temp = woe_discrete(df_inputs_prepr, 'home_ownership', df_targets_prepr)
# We calculate weight of evidence.
df_temp

plot_by_woe(df_temp)
# We plot the weight of evidence values.

# From observing the WoE table:
    # There are many categories with very few observations and many categories with very different "good" %.
    # Therefore, we create a new discrete variable where we combine some of the categories.
    # 'OTHER' and 'NONE' are the riskiest but have very few loans. 'RENT' is the next riskiest.
    # 'ANY' is the least risky but has too few loans. Conceptually, they belong to the same category.
    # Also, their inclusion would not change anything.
    # We combine them into one category, 'RENT_OTHER_NONE_ANY'.
    # We end up with 3 categories: 'RENT_OTHER_NONE_ANY', 'OWN', 'MORTGAGE'.

# Column-wise sum of the 0/1 dummies to create the new combined category.
df_inputs_prepr['home_ownership:RENT_OTHER_NONE_ANY'] = sum([df_inputs_prepr['home_ownership:RENT'], df_inputs_prepr['home_ownership:OTHER'],
                                                             df_inputs_prepr['home_ownership:NONE'], df_inputs_prepr['home_ownership:ANY']])
# 'RENT_OTHER_NONE_ANY' will be the reference category.
# Whatever the values of the initial 'home_ownership' dummies were, they are now added up and assigned
# to the 'home_ownership:RENT_OTHER_NONE_ANY' category.

# Alternatively, the same dummies could be built directly from the original column:
# df_inputs_prepr['home_ownership:RENT_OTHER_NONE_ANY'] = np.where(df_inputs_prepr['home_ownership'].isin(['RENT', 'OTHER', 'NONE', 'ANY']), 1, 0)
# df_inputs_prepr['home_ownership:OWN'] = np.where(df_inputs_prepr['home_ownership'] == 'OWN', 1, 0)
# df_inputs_prepr['home_ownership:MORTGAGE'] = np.where(df_inputs_prepr['home_ownership'] == 'MORTGAGE', 1, 0)
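# [Illustrative note, not from the original course code] sum([...]) over a list of pandas Series adds them
# element-wise. Because the combined dummies are mutually exclusive 0/1 columns, the sum is 1 exactly when the
# borrower falls into any of the combined categories, i.e. it behaves like an element-wise OR.
# A small hypothetical check (uncomment to run):
#
# combined = df_inputs_prepr[['home_ownership:RENT', 'home_ownership:OTHER',
#                             'home_ownership:NONE', 'home_ownership:ANY']].max(axis = 1)
# assert (combined == df_inputs_prepr['home_ownership:RENT_OTHER_NONE_ANY']).all()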
######## Preprocessing Discrete Variables: Creating Dummy Variables, Part 2 #########

#### 'addr_state'
df_inputs_prepr['addr_state'].unique()
df_temp = woe_discrete(df_inputs_prepr, 'addr_state', df_targets_prepr)
# We calculate weight of evidence.
df_temp

plot_by_woe(df_temp)
# We plot the weight of evidence values.

# The following code handles the case where no borrowers from North Dakota are in the current split
# (and guards against such borrowers coming in later).
if 'addr_state:ND' in df_inputs_prepr.columns.values:  # if the dummy already exists we do nothing
    pass
else:
    df_inputs_prepr['addr_state:ND'] = 0  # if not, we create it and set all values to 0

# This also shows that we should keep in mind that a value not included in the model may pop up later,
# and we should make arrangements for that.

# Note that the WoE of NE and IA is very low and their proportion of observations is very small.
    # The WoE of Maine and Idaho (ME and ID) is exceptionally high, and they also have few observations.
    # We will include them in the first (the worst) and the last (the best) categories, as per WoE.

# Merging them with other categories will let us analyse the nuances among the remaining states.

# Analysing the values excluding those 4 states:
plot_by_woe(df_temp.iloc[2: -2, : ])
# We plot the weight of evidence values.
# After running the above we can see that:
    # the chart looks more like what we expect;
    # NV and FL are different, but can be clubbed together as their proportion of observations is small;
    # further, due to the low number of observations, we should combine the first 6 and the last 6 states;
    # the unknown state North Dakota (ND) should be included in the riskiest category (see below).

plot_by_woe(df_temp.iloc[6: -6, : ])
# We plot the weight of evidence values.
# CA and NY can be separate groups because of their larger number of observations.
# COARSE CLASSING can begin by combining states with similar WoE (and/or few observations), but keep in mind
# that if such states are separated by a category with many observations, separate groups of states have to be created:
# a group with many observations cannot be clubbed with the low-observation groups on both its left and its right.
# North Dakota is included in the riskiest category because we have no information about it.

# We create the following categories:
# 'ND' 'NE' 'IA' 'NV' 'FL' 'HI' 'AL'
# 'NM' 'VA'
# 'NY'
# 'OK' 'TN' 'MO' 'LA' 'MD' 'NC'
# 'CA'
# 'UT' 'KY' 'AZ' 'NJ'
# 'AR' 'MI' 'PA' 'OH' 'MN'
# 'RI' 'MA' 'DE' 'SD' 'IN'
# 'GA' 'WA' 'OR'
# 'WI' 'MT'
# 'TX'
# 'IL' 'CT'
# 'KS' 'SC' 'CO' 'VT' 'AK' 'MS'
# 'WV' 'NH' 'WY' 'DC' 'ME' 'ID'

# 'ND_NE_IA_NV_FL_HI_AL' will be the reference category.

df_inputs_prepr['addr_state:ND_NE_IA_NV_FL_HI_AL'] = sum([df_inputs_prepr['addr_state:ND'], df_inputs_prepr['addr_state:NE'],
                                              df_inputs_prepr['addr_state:IA'], df_inputs_prepr['addr_state:NV'],
                                              df_inputs_prepr['addr_state:FL'], df_inputs_prepr['addr_state:HI'],
                                              df_inputs_prepr['addr_state:AL']])

df_inputs_prepr['addr_state:NM_VA'] = sum([df_inputs_prepr['addr_state:NM'], df_inputs_prepr['addr_state:VA']])

df_inputs_prepr['addr_state:OK_TN_MO_LA_MD_NC'] = sum([df_inputs_prepr['addr_state:OK'], df_inputs_prepr['addr_state:TN'],
                                              df_inputs_prepr['addr_state:MO'], df_inputs_prepr['addr_state:LA'],
                                              df_inputs_prepr['addr_state:MD'], df_inputs_prepr['addr_state:NC']])

df_inputs_prepr['addr_state:UT_KY_AZ_NJ'] = sum([df_inputs_prepr['addr_state:UT'], df_inputs_prepr['addr_state:KY'],
                                              df_inputs_prepr['addr_state:AZ'], df_inputs_prepr['addr_state:NJ']])

df_inputs_prepr['addr_state:AR_MI_PA_OH_MN'] = sum([df_inputs_prepr['addr_state:AR'], df_inputs_prepr['addr_state:MI'],
                                              df_inputs_prepr['addr_state:PA'], df_inputs_prepr['addr_state:OH'],
                                              df_inputs_prepr['addr_state:MN']])

df_inputs_prepr['addr_state:RI_MA_DE_SD_IN'] = sum([df_inputs_prepr['addr_state:RI'], df_inputs_prepr['addr_state:MA'],
                                              df_inputs_prepr['addr_state:DE'], df_inputs_prepr['addr_state:SD'],
                                              df_inputs_prepr['addr_state:IN']])

df_inputs_prepr['addr_state:GA_WA_OR'] = sum([df_inputs_prepr['addr_state:GA'], df_inputs_prepr['addr_state:WA'],
                                              df_inputs_prepr['addr_state:OR']])

df_inputs_prepr['addr_state:WI_MT'] = sum([df_inputs_prepr['addr_state:WI'], df_inputs_prepr['addr_state:MT']])

df_inputs_prepr['addr_state:IL_CT'] = sum([df_inputs_prepr['addr_state:IL'], df_inputs_prepr['addr_state:CT']])

df_inputs_prepr['addr_state:KS_SC_CO_VT_AK_MS'] = sum([df_inputs_prepr['addr_state:KS'], df_inputs_prepr['addr_state:SC'],
                                              df_inputs_prepr['addr_state:CO'], df_inputs_prepr['addr_state:VT'],
                                              df_inputs_prepr['addr_state:AK'], df_inputs_prepr['addr_state:MS']])
df_inputs_prepr['addr_state:WV_NH_WY_DC_ME_ID'] = sum([df_inputs_prepr['addr_state:WV'], df_inputs_prepr['addr_state:NH'],
                                              df_inputs_prepr['addr_state:WY'], df_inputs_prepr['addr_state:DC'],
                                              df_inputs_prepr['addr_state:ME'], df_inputs_prepr['addr_state:ID']])
# What does the sum function do here? It sums the listed dummy columns element-wise across the rows.

############################# RUNNING THE WoE FUNCTION ON OTHER VARIABLES

df_inputs_prepr['verification_status'].unique()
df_temp = woe_discrete(df_inputs_prepr, 'verification_status', df_targets_prepr)
# We calculate weight of evidence.
df_temp

plot_by_woe(df_temp)

##################

df_inputs_prepr['purpose'].unique()
df_temp = woe_discrete(df_inputs_prepr, 'purpose', df_targets_prepr)
# We calculate weight of evidence.
df_temp

plot_by_woe(df_temp, 90)

# We combine 'educational', 'small_business', 'wedding', 'renewable_energy', 'moving', 'house' in one category: 'educ__sm_b__wedd__ren_en__mov__house'.
# We combine 'other', 'medical', 'vacation' in one category: 'oth__med__vacation'.
# We combine 'major_purchase', 'car', 'home_improvement' in one category: 'major_purch__car__home_impr'.
# We leave 'debt_consolidation' in a separate category.
# We leave 'credit_card' in a separate category.
# 'educ__sm_b__wedd__ren_en__mov__house' will be the reference category.
df_inputs_prepr['purpose:educ__sm_b__wedd__ren_en__mov__house'] = sum([df_inputs_prepr['purpose:educational'], df_inputs_prepr['purpose:small_business'],
                                                                 df_inputs_prepr['purpose:wedding'], df_inputs_prepr['purpose:renewable_energy'],
                                                                 df_inputs_prepr['purpose:moving'], df_inputs_prepr['purpose:house']])
df_inputs_prepr['purpose:oth__med__vacation'] = sum([df_inputs_prepr['purpose:other'], df_inputs_prepr['purpose:medical'],
                                             df_inputs_prepr['purpose:vacation']])
df_inputs_prepr['purpose:major_purch__car__home_impr'] = sum([df_inputs_prepr['purpose:major_purchase'], df_inputs_prepr['purpose:car'],
                                                        df_inputs_prepr['purpose:home_improvement']])

###################################

    # 'initial_list_status'
df_temp = woe_discrete(df_inputs_prepr, 'initial_list_status', df_targets_prepr)
df_temp

plot_by_woe(df_temp)
# We plot the weight of evidence values.
###################### Preprocessing for the discrete variables is done ############


###################### Preprocessing for continuous variables begins: automating the calculations ######################

# First the fine classing, then the coarse classing.
# We ordered the discrete variables by WoE because their categories show no quantitative differences by themselves,
# but the categories of continuous variables do differ quantitatively, so we keep them in their natural order,
# by category value rather than by WoE.
# Preprocessing Continuous Variables: Automating Calculations and Visualizing Results
# WoE function for ordered discrete and continuous variables
def woe_ordered_continuous(df, discrete_variable_name, good_bad_variable_df):
    df = pd.concat([df[discrete_variable_name], good_bad_variable_df], axis = 1)
    df = pd.concat([df.groupby(df.columns.values[0], as_index = False)[df.columns.values[1]].count(),
                    df.groupby(df.columns.values[0], as_index = False)[df.columns.values[1]].mean()], axis = 1)
    df = df.iloc[:, [0, 1, 3]]
    df.columns = [df.columns.values[0], 'n_obs', 'prop_good']
    df['prop_n_obs'] = df['n_obs'] / df['n_obs'].sum()
    df['n_good'] = df['prop_good'] * df['n_obs']
    df['n_bad'] = (1 - df['prop_good']) * df['n_obs']
    df['prop_n_good'] = df['n_good'] / df['n_good'].sum()
    df['prop_n_bad'] = df['n_bad'] / df['n_bad'].sum()
    df['WoE'] = np.log(df['prop_n_good'] / df['prop_n_bad'])
    # df = df.sort_values(['WoE'])
    # df = df.reset_index(drop = True)
    # This function is the same as the discrete one, except that we do not sort the categories by WoE:
    # we retain their natural order.
    df['diff_prop_good'] = df['prop_good'].diff().abs()
    df['diff_WoE'] = df['WoE'].diff().abs()
    df['IV'] = (df['prop_n_good'] - df['prop_n_bad']) * df['WoE']
    df['IV'] = df['IV'].sum()
    return df
# The function takes 3 arguments: a dataframe, a string, and a dataframe, and returns a dataframe as a result.

### Preprocessing Continuous Variables: Creating Dummy Variables, Part 1

# term
df_inputs_prepr['term_int'].unique()
# There are only two unique values, 36 and 60.

df_temp = woe_ordered_continuous(df_inputs_prepr, 'term_int', df_targets_prepr)
# We calculate weight of evidence.
df_temp

plot_by_woe(df_temp)
# We plot the weight of evidence values.

# Leave as is.
# '60' will be the reference category.
df_inputs_prepr['term:36'] = np.where((df_inputs_prepr['term_int'] == 36), 1, 0)
df_inputs_prepr['term:60'] = np.where((df_inputs_prepr['term_int'] == 60), 1, 0)

# emp_length_int
df_inputs_prepr['emp_length_int'].unique()
# Has only 11 levels, from 0 to 10, hence we turn it into a factor with 11 levels.

df_temp = woe_ordered_continuous(df_inputs_prepr, 'emp_length_int', df_targets_prepr)
# We calculate weight of evidence.
df_temp

plot_by_woe(df_temp)
# We plot the weight of evidence values.

# We create the following categories: '0', '1', '2 - 4', '5 - 6', '7 - 9', '10'
# '0' will be the reference category.
df_inputs_prepr['emp_length:0'] = np.where(df_inputs_prepr['emp_length_int'].isin([0]), 1, 0)
df_inputs_prepr['emp_length:1'] = np.where(df_inputs_prepr['emp_length_int'].isin([1]), 1, 0)
df_inputs_prepr['emp_length:2-4'] = np.where(df_inputs_prepr['emp_length_int'].isin(range(2, 5)), 1, 0)
df_inputs_prepr['emp_length:5-6'] = np.where(df_inputs_prepr['emp_length_int'].isin(range(5, 7)), 1, 0)
df_inputs_prepr['emp_length:7-9'] = np.where(df_inputs_prepr['emp_length_int'].isin(range(7, 10)), 1, 0)
df_inputs_prepr['emp_length:10'] = np.where(df_inputs_prepr['emp_length_int'].isin([10]), 1, 0)

# isin() works like putting a filter on a column in Excel and viewing only the rows that match the filter.
# We use isin() here because it combines nicely with range().
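# [Illustrative sketch, not from the original course code] The blocks of np.where(...) dummy definitions used
# above and below are very repetitive. A small hypothetical helper like the one sketched here (its name and
# signature are my own) could generate one 0/1 column per (label, lower-exclusive, upper-inclusive] band;
# it is left hashed out so the original step-by-step code remains the reference.
#
# def add_band_dummies(df, source_col, bands, prefix):
#     # bands: list of (label, lower_exclusive, upper_inclusive); use None for an open end.
#     for label, lo, hi in bands:
#         mask = pd.Series(True, index = df.index)
#         if lo is not None:
#             mask &= df[source_col] > lo
#         if hi is not None:
#             mask &= df[source_col] <= hi
#         df[prefix + ':' + label] = np.where(mask, 1, 0)
#
# add_band_dummies(df_inputs_prepr, 'emp_length_int',
#                  [('0', None, 0), ('1', 0, 1), ('2-4', 1, 4), ('5-6', 4, 6), ('7-9', 6, 9), ('10', 9, None)],
#                  'emp_length')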
### Preprocessing Continuous Variables: Creating Dummy Variables, Part 2 #########################

# Remember that, since a continuous variable has numerical values, the limits we assign during
# fine classing have to be values of the variable - fairly obvious.
# Whenever the WoE oscillates too much, that is a red flag: check the number of observations.
# We should in any case keep an eye on the number of observations; even if the WoE oscillates a lot,
# when the number of observations is small we can club the categories together, or split them in two.

# mths_since_issue_d
df_inputs_prepr['mths_since_issue_d'].unique()
df_inputs_prepr['mths_since_issue_d_factor'] = pd.cut(df_inputs_prepr['mths_since_issue_d'], 50)
# Here we do fine classing: using the 'cut' method, we split the variable into 50 categories by its values.
df_inputs_prepr['mths_since_issue_d_factor']

df_temp = woe_ordered_continuous(df_inputs_prepr, 'mths_since_issue_d_factor', df_targets_prepr)
# We calculate weight of evidence.
df_temp

plot_by_woe(df_temp)
# We plot the weight of evidence values.
# We have to rotate the labels because we cannot read them otherwise.

plot_by_woe(df_temp, 90)
# We plot the weight of evidence values, rotating the labels 90 degrees.

plot_by_woe(df_temp.iloc[3: , : ], 90)
# We plot the weight of evidence values.

# We create the following categories:
# < 38, 38 - 39, 40 - 41, 42 - 48, 49 - 52, 53 - 64, 65 - 84, > 84.
df_inputs_prepr['mths_since_issue_d:<38'] = np.where(df_inputs_prepr['mths_since_issue_d'].isin(range(38)), 1, 0)
df_inputs_prepr['mths_since_issue_d:38-39'] = np.where(df_inputs_prepr['mths_since_issue_d'].isin(range(38, 40)), 1, 0)
df_inputs_prepr['mths_since_issue_d:40-41'] = np.where(df_inputs_prepr['mths_since_issue_d'].isin(range(40, 42)), 1, 0)
df_inputs_prepr['mths_since_issue_d:42-48'] = np.where(df_inputs_prepr['mths_since_issue_d'].isin(range(42, 49)), 1, 0)
df_inputs_prepr['mths_since_issue_d:49-52'] = np.where(df_inputs_prepr['mths_since_issue_d'].isin(range(49, 53)), 1, 0)
df_inputs_prepr['mths_since_issue_d:53-64'] = np.where(df_inputs_prepr['mths_since_issue_d'].isin(range(53, 65)), 1, 0)
df_inputs_prepr['mths_since_issue_d:65-84'] = np.where(df_inputs_prepr['mths_since_issue_d'].isin(range(65, 85)), 1, 0)
df_inputs_prepr['mths_since_issue_d:>84'] = np.where(df_inputs_prepr['mths_since_issue_d'].isin(range(85, int(df_inputs_prepr['mths_since_issue_d'].max()) + 1)), 1, 0)
# The + 1 makes sure the maximum value itself falls into the last category (range() excludes its upper bound).

# int_rate
df_inputs_prepr['int_rate_factor'] = pd.cut(df_inputs_prepr['int_rate'], 50)
# Here we do fine classing: using the 'cut' method, we split the variable into 50 categories by its values.
df_inputs_prepr['int_rate_factor']

df_temp = woe_ordered_continuous(df_inputs_prepr, 'int_rate_factor', df_targets_prepr)
# We calculate weight of evidence.
df_temp

plot_by_woe(df_temp, 90)
# We plot the weight of evidence values.

# Categories: '< 9.548', '9.548 - 12.025', '12.025 - 15.74', '15.74 - 20.281', '> 20.281'
# There is a bit of stability after 9.548, so it is a cut-off.
# Also note that we used rounded-off (integer) boundaries when creating the dummies for 'mths_since_issue_d',
# but we keep the exact decimal boundaries for 'int_rate'.
df_inputs_prepr['int_rate:<9.548'] = np.where((df_inputs_prepr['int_rate'] <= 9.548), 1, 0)
df_inputs_prepr['int_rate:9.548-12.025'] = np.where((df_inputs_prepr['int_rate'] > 9.548) & (df_inputs_prepr['int_rate'] <= 12.025), 1, 0)
df_inputs_prepr['int_rate:12.025-15.74'] = np.where((df_inputs_prepr['int_rate'] > 12.025) & (df_inputs_prepr['int_rate'] <= 15.74), 1, 0)
df_inputs_prepr['int_rate:15.74-20.281'] = np.where((df_inputs_prepr['int_rate'] > 15.74) & (df_inputs_prepr['int_rate'] <= 20.281), 1, 0)
df_inputs_prepr['int_rate:>20.281'] = np.where((df_inputs_prepr['int_rate'] > 20.281), 1, 0)

#### Note that everything is being stored in df_inputs_prepr, the working copy of whichever inputs dataframe
# (train or test) is currently assigned to it.

#### After the first round of preprocessing the inputs, we will hash out df_inputs_prepr = loan_data_inputs_train
# and df_targets_prepr = loan_data_targets_train.
# But before that we save df_inputs_prepr in loan_data_inputs_train and df_targets_prepr in loan_data_targets_train
# (the reverse of what we did right after splitting the data).
#### At that stage we hash in df_inputs_prepr = loan_data_inputs_test and df_targets_prepr = loan_data_targets_test.

# funded_amnt
df_inputs_prepr['funded_amnt_factor'] = pd.cut(df_inputs_prepr['funded_amnt'], 50)
# Here we do fine classing: using the 'cut' method, we split the variable into 50 categories by its values.
df_temp = woe_ordered_continuous(df_inputs_prepr, 'funded_amnt_factor', df_targets_prepr)
# We calculate weight of evidence.
df_temp

plot_by_woe(df_temp, 90)
# We plot the weight of evidence values.

# WoE does not seem to have any relation with the funded amount whatsoever: the chart is all zig-zag.
# Thus, we will NOT use this variable in the model.

# mths_since_earliest_cr_line
df_inputs_prepr['mths_since_earliest_cr_line_factor'] = pd.cut(df_inputs_prepr['mths_since_earliest_cr_line'], 50)
# Here we do fine classing: using the 'cut' method, we split the variable into 50 categories by its values.
df_temp = woe_ordered_continuous(df_inputs_prepr, 'mths_since_earliest_cr_line_factor', df_targets_prepr)
# We calculate weight of evidence.
df_temp

plot_by_woe(df_temp, 90)
# We plot the weight of evidence values.

plot_by_woe(df_temp.iloc[6: , : ], 90)
# We plot the weight of evidence values.

# We create the following categories:
# < 140, 141 - 164, 165 - 247, 248 - 270, 271 - 352, > 352
df_inputs_prepr['mths_since_earliest_cr_line:<140'] = np.where(df_inputs_prepr['mths_since_earliest_cr_line'].isin(range(0, 140)), 1, 0)
df_inputs_prepr['mths_since_earliest_cr_line:141-164'] = np.where(df_inputs_prepr['mths_since_earliest_cr_line'].isin(range(140, 165)), 1, 0)
df_inputs_prepr['mths_since_earliest_cr_line:165-247'] = np.where(df_inputs_prepr['mths_since_earliest_cr_line'].isin(range(165, 248)), 1, 0)
df_inputs_prepr['mths_since_earliest_cr_line:248-270'] = np.where(df_inputs_prepr['mths_since_earliest_cr_line'].isin(range(248, 271)), 1, 0)
df_inputs_prepr['mths_since_earliest_cr_line:271-352'] = np.where(df_inputs_prepr['mths_since_earliest_cr_line'].isin(range(271, 353)), 1, 0)
df_inputs_prepr['mths_since_earliest_cr_line:>352'] = np.where(df_inputs_prepr['mths_since_earliest_cr_line'].isin(range(353, int(df_inputs_prepr['mths_since_earliest_cr_line'].max()) + 1)), 1, 0)
# Again, the + 1 makes sure the maximum value itself falls into the last category.
# delinq_2yrs
df_temp = woe_ordered_continuous(df_inputs_prepr, 'delinq_2yrs', df_targets_prepr)
# We calculate weight of evidence.
df_temp

plot_by_woe(df_temp)
# We plot the weight of evidence values.

# Categories: 0, 1-3, >=4
df_inputs_prepr['delinq_2yrs:0'] = np.where((df_inputs_prepr['delinq_2yrs'] == 0), 1, 0)
df_inputs_prepr['delinq_2yrs:1-3'] = np.where((df_inputs_prepr['delinq_2yrs'] >= 1) & (df_inputs_prepr['delinq_2yrs'] <= 3), 1, 0)
df_inputs_prepr['delinq_2yrs:>=4'] = np.where((df_inputs_prepr['delinq_2yrs'] >= 4), 1, 0)

# inq_last_6mths
df_temp = woe_ordered_continuous(df_inputs_prepr, 'inq_last_6mths', df_targets_prepr)
# We calculate weight of evidence.
df_temp

plot_by_woe(df_temp)
# We plot the weight of evidence values.

df_inputs_prepr['inq_last_6mths'].unique()
# Categories: 0, 1 - 2, 3 - 6, > 6
df_inputs_prepr['inq_last_6mths:0'] = np.where((df_inputs_prepr['inq_last_6mths'] == 0), 1, 0)
df_inputs_prepr['inq_last_6mths:1-2'] = np.where((df_inputs_prepr['inq_last_6mths'] >= 1) & (df_inputs_prepr['inq_last_6mths'] <= 2), 1, 0)
df_inputs_prepr['inq_last_6mths:3-6'] = np.where((df_inputs_prepr['inq_last_6mths'] >= 3) & (df_inputs_prepr['inq_last_6mths'] <= 6), 1, 0)
df_inputs_prepr['inq_last_6mths:>6'] = np.where((df_inputs_prepr['inq_last_6mths'] > 6), 1, 0)

# open_acc
df_temp = woe_ordered_continuous(df_inputs_prepr, 'open_acc', df_targets_prepr)
# We calculate weight of evidence.
df_temp

plot_by_woe(df_temp, 90)
# We plot the weight of evidence values.

plot_by_woe(df_temp.iloc[ : 40, :], 90)
# We plot the weight of evidence values.

# Categories: '0', '1-3', '4-12', '13-17', '18-22', '23-25', '26-30', '>30'
df_inputs_prepr['open_acc:0'] = np.where((df_inputs_prepr['open_acc'] == 0), 1, 0)
df_inputs_prepr['open_acc:1-3'] = np.where((df_inputs_prepr['open_acc'] >= 1) & (df_inputs_prepr['open_acc'] <= 3), 1, 0)
df_inputs_prepr['open_acc:4-12'] = np.where((df_inputs_prepr['open_acc'] >= 4) & (df_inputs_prepr['open_acc'] <= 12), 1, 0)
df_inputs_prepr['open_acc:13-17'] = np.where((df_inputs_prepr['open_acc'] >= 13) & (df_inputs_prepr['open_acc'] <= 17), 1, 0)
df_inputs_prepr['open_acc:18-22'] = np.where((df_inputs_prepr['open_acc'] >= 18) & (df_inputs_prepr['open_acc'] <= 22), 1, 0)
df_inputs_prepr['open_acc:23-25'] = np.where((df_inputs_prepr['open_acc'] >= 23) & (df_inputs_prepr['open_acc'] <= 25), 1, 0)
df_inputs_prepr['open_acc:26-30'] = np.where((df_inputs_prepr['open_acc'] >= 26) & (df_inputs_prepr['open_acc'] <= 30), 1, 0)
df_inputs_prepr['open_acc:>=31'] = np.where((df_inputs_prepr['open_acc'] >= 31), 1, 0)

# pub_rec
df_temp = woe_ordered_continuous(df_inputs_prepr, 'pub_rec', df_targets_prepr)
# We calculate weight of evidence.
df_temp

plot_by_woe(df_temp, 90)
# We plot the weight of evidence values.

# Categories: '0-2', '3-4', '>=5'
df_inputs_prepr['pub_rec:0-2'] = np.where((df_inputs_prepr['pub_rec'] >= 0) & (df_inputs_prepr['pub_rec'] <= 2), 1, 0)
df_inputs_prepr['pub_rec:3-4'] = np.where((df_inputs_prepr['pub_rec'] >= 3) & (df_inputs_prepr['pub_rec'] <= 4), 1, 0)
df_inputs_prepr['pub_rec:>=5'] = np.where((df_inputs_prepr['pub_rec'] >= 5), 1, 0)

# total_acc
df_inputs_prepr['total_acc_factor'] = pd.cut(df_inputs_prepr['total_acc'], 50)
# Here we do fine classing: using the 'cut' method, we split the variable into 50 categories by its values.
df_temp = woe_ordered_continuous(df_inputs_prepr, 'total_acc_factor', df_targets_prepr)
# We calculate weight of evidence.
df_temp

plot_by_woe(df_temp, 90)
# We plot the weight of evidence values.
# Categories: '<=27', '28-51', '>51'
df_inputs_prepr['total_acc:<=27'] = np.where((df_inputs_prepr['total_acc'] <= 27), 1, 0)
df_inputs_prepr['total_acc:28-51'] = np.where((df_inputs_prepr['total_acc'] >= 28) & (df_inputs_prepr['total_acc'] <= 51), 1, 0)
df_inputs_prepr['total_acc:>=52'] = np.where((df_inputs_prepr['total_acc'] >= 52), 1, 0)

# acc_now_delinq
df_temp = woe_ordered_continuous(df_inputs_prepr, 'acc_now_delinq', df_targets_prepr)
# We calculate weight of evidence.
df_temp

plot_by_woe(df_temp)
# We plot the weight of evidence values.

# Categories: '0', '>=1'
df_inputs_prepr['acc_now_delinq:0'] = np.where((df_inputs_prepr['acc_now_delinq'] == 0), 1, 0)
df_inputs_prepr['acc_now_delinq:>=1'] = np.where((df_inputs_prepr['acc_now_delinq'] >= 1), 1, 0)

# total_rev_hi_lim
# df_inputs_prepr['total_rev_hi_lim'].describe()  # IMPORTANT: analyze the quartiles to make suitable cuts.
# Analyze the numbers, see which value covers, for instance, 95% of the data, and put everything above it into one category;
# also analyze the share of observations per category.
# df_inputs_prepr['total_rev_hi_lim'].to_excel('.....\\total_rev_hi_lim_prepr.xlsx')
df_inputs_prepr['total_rev_hi_lim_factor'] = pd.cut(df_inputs_prepr['total_rev_hi_lim'], 2000)
# Here we do fine classing: using the 'cut' method, we split the variable into 2000 categories by its values.
df_temp = woe_ordered_continuous(df_inputs_prepr, 'total_rev_hi_lim_factor', df_targets_prepr)
# We calculate weight of evidence.
# I analysed the share of observations per category and made the highest category out of everything above that point.
# df_temp.to_excel('............\\total_rev_hi_lim.xlsx')
# pd.options.display.max_columns = None
df_temp

plot_by_woe(df_temp.iloc[: 50, : ], 90)
# We plot the weight of evidence values.
plot_by_woe(df_temp.iloc[: 25, : ], 90)
# Categories:
# '<=5K', '5K-10K', '10K-20K', '20K-30K', '30K-40K', '40K-55K', '55K-95K', '>95K'
df_inputs_prepr['total_rev_hi_lim:<=5K'] = np.where((df_inputs_prepr['total_rev_hi_lim'] <= 5000), 1, 0)
df_inputs_prepr['total_rev_hi_lim:5K-10K'] = np.where((df_inputs_prepr['total_rev_hi_lim'] > 5000) & (df_inputs_prepr['total_rev_hi_lim'] <= 10000), 1, 0)
df_inputs_prepr['total_rev_hi_lim:10K-20K'] = np.where((df_inputs_prepr['total_rev_hi_lim'] > 10000) & (df_inputs_prepr['total_rev_hi_lim'] <= 20000), 1, 0)
df_inputs_prepr['total_rev_hi_lim:20K-30K'] = np.where((df_inputs_prepr['total_rev_hi_lim'] > 20000) & (df_inputs_prepr['total_rev_hi_lim'] <= 30000), 1, 0)
df_inputs_prepr['total_rev_hi_lim:30K-40K'] = np.where((df_inputs_prepr['total_rev_hi_lim'] > 30000) & (df_inputs_prepr['total_rev_hi_lim'] <= 40000), 1, 0)
df_inputs_prepr['total_rev_hi_lim:40K-55K'] = np.where((df_inputs_prepr['total_rev_hi_lim'] > 40000) & (df_inputs_prepr['total_rev_hi_lim'] <= 55000), 1, 0)
df_inputs_prepr['total_rev_hi_lim:55K-95K'] = np.where((df_inputs_prepr['total_rev_hi_lim'] > 55000) & (df_inputs_prepr['total_rev_hi_lim'] <= 95000), 1, 0)
df_inputs_prepr['total_rev_hi_lim:>95K'] = np.where((df_inputs_prepr['total_rev_hi_lim'] > 95000), 1, 0)
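# [Illustrative sketch, not part of the original script] The 95K upper band above was chosen by eyeballing the
# WoE table; the choice could be cross-checked against the distribution itself (uncomment to run):
#
# df_inputs_prepr['total_rev_hi_lim'].describe()
# df_inputs_prepr['total_rev_hi_lim'].quantile([0.90, 0.95, 0.99])
# # If, for example, the 95th percentile sits near 95K, putting everything above it into a single '>95K'
# # category keeps that long right tail from producing sparse, noisy WoE bins.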
# installment
df_inputs_prepr['installment_factor'] = pd.cut(df_inputs_prepr['installment'], 50)
# Here we do fine classing: using the 'cut' method, we split the variable into 50 categories by its values.
df_temp = woe_ordered_continuous(df_inputs_prepr, 'installment_factor', df_targets_prepr)
# We calculate weight of evidence.
df_temp
# Maybe we will not consider this factor.
plot_by_woe(df_temp, 90)
# We plot the weight of evidence values.


### Preprocessing Continuous Variables: Creating Dummy Variables, Part 3
# This is a somewhat indirect variable to split:
# analyse the data and look at what covers most of the values, say about 99%;
# split the categories into reasonable gaps;
# check whether any category is pulling too much weight or is too light (in which case the split has to be adjusted);
# the head and tail may be light, so club a couple of categories there and split the rest into roughly equal 10K ranges.

# annual_inc
df_inputs_prepr['annual_inc_factor'] = pd.cut(df_inputs_prepr['annual_inc'], 50)
# Here we do fine classing: using the 'cut' method, we split the variable into 50 categories by its values.
df_temp = woe_ordered_continuous(df_inputs_prepr, 'annual_inc_factor', df_targets_prepr)
# We calculate weight of evidence.
df_temp

# We observed that the first interval carries 94% of the total observations, so we split again, this time into 100 categories.

df_inputs_prepr['annual_inc_factor'] = pd.cut(df_inputs_prepr['annual_inc'], 100)
# Here we do fine classing: using the 'cut' method, we split the variable into 100 categories by its values.
df_temp = woe_ordered_continuous(df_inputs_prepr, 'annual_inc_factor', df_targets_prepr)
# We calculate weight of evidence.
df_temp
# Dividing into 100 categories gave us a fair split between the classes.

plot_by_woe(df_temp, 90)

# Initial examination shows that there are too few individuals with large income and too many with small income.
# Hence, we are going to have one category for more than 140K, and we are going to apply our approach to determine
# the categories of everyone with 140K or less.
df_inputs_prepr_temp = df_inputs_prepr.loc[df_inputs_prepr['annual_inc'] <= 140000, : ]
# loan_data_temp = loan_data_temp.reset_index(drop = True)
# df_inputs_prepr_temp

df_inputs_prepr_temp['annual_inc_factor'] = pd.cut(df_inputs_prepr_temp['annual_inc'], 50)
# Here we do fine classing: using the 'cut' method, we split the variable into 50 categories by its values.
df_temp = woe_ordered_continuous(df_inputs_prepr_temp, 'annual_inc_factor', df_targets_prepr[df_inputs_prepr_temp.index])
# We calculate weight of evidence; note that the targets are subset by the same index as the filtered inputs.
df_temp

plot_by_woe(df_temp, 90)
# We plot the weight of evidence values.
# WoE is monotonically decreasing with income, so we split income into categories of roughly 10K width.
df_inputs_prepr['annual_inc:<20K'] = np.where((df_inputs_prepr['annual_inc'] <= 20000), 1, 0)
df_inputs_prepr['annual_inc:20K-30K'] = np.where((df_inputs_prepr['annual_inc'] > 20000) & (df_inputs_prepr['annual_inc'] <= 30000), 1, 0)
df_inputs_prepr['annual_inc:30K-40K'] = np.where((df_inputs_prepr['annual_inc'] > 30000) & (df_inputs_prepr['annual_inc'] <= 40000), 1, 0)
df_inputs_prepr['annual_inc:40K-50K'] = np.where((df_inputs_prepr['annual_inc'] > 40000) & (df_inputs_prepr['annual_inc'] <= 50000), 1, 0)
df_inputs_prepr['annual_inc:50K-60K'] = np.where((df_inputs_prepr['annual_inc'] > 50000) & (df_inputs_prepr['annual_inc'] <= 60000), 1, 0)
df_inputs_prepr['annual_inc:60K-70K'] = np.where((df_inputs_prepr['annual_inc'] > 60000) & (df_inputs_prepr['annual_inc'] <= 70000), 1, 0)
df_inputs_prepr['annual_inc:70K-80K'] = np.where((df_inputs_prepr['annual_inc'] > 70000) & (df_inputs_prepr['annual_inc'] <= 80000), 1, 0)
df_inputs_prepr['annual_inc:80K-90K'] = np.where((df_inputs_prepr['annual_inc'] > 80000) & (df_inputs_prepr['annual_inc'] <= 90000), 1, 0)
df_inputs_prepr['annual_inc:90K-100K'] = np.where((df_inputs_prepr['annual_inc'] > 90000) & (df_inputs_prepr['annual_inc'] <= 100000), 1, 0)
df_inputs_prepr['annual_inc:100K-120K'] = np.where((df_inputs_prepr['annual_inc'] > 100000) & (df_inputs_prepr['annual_inc'] <= 120000), 1, 0)
df_inputs_prepr['annual_inc:120K-140K'] = np.where((df_inputs_prepr['annual_inc'] > 120000) & (df_inputs_prepr['annual_inc'] <= 140000), 1, 0)
df_inputs_prepr['annual_inc:>140K'] = np.where((df_inputs_prepr['annual_inc'] > 140000), 1, 0)

# mths_since_last_delinq
# We have to create one category for missing values and do fine and coarse classing for the rest.
df_inputs_prepr['mths_since_last_delinq']
df_inputs_prepr['mths_since_last_delinq'].unique()
df_inputs_prepr_temp = df_inputs_prepr[pd.notnull(df_inputs_prepr['mths_since_last_delinq'])]
df_inputs_prepr_temp['mths_since_last_delinq_factor'] = pd.cut(df_inputs_prepr_temp['mths_since_last_delinq'], 50)
df_temp = woe_ordered_continuous(df_inputs_prepr_temp, 'mths_since_last_delinq_factor', df_targets_prepr[df_inputs_prepr_temp.index])
# We calculate weight of evidence.
df_temp

plot_by_woe(df_temp, 90)
# We plot the weight of evidence values.
plot_by_woe(df_temp.iloc[:16, :], 90)

# Categories: Missing, 0-3, 4-30, 31-56, >=57
df_inputs_prepr['mths_since_last_delinq:Missing'] = np.where((df_inputs_prepr['mths_since_last_delinq'].isnull()), 1, 0)
df_inputs_prepr['mths_since_last_delinq:0-3'] = np.where((df_inputs_prepr['mths_since_last_delinq'] >= 0) & (df_inputs_prepr['mths_since_last_delinq'] <= 3), 1, 0)
df_inputs_prepr['mths_since_last_delinq:4-30'] = np.where((df_inputs_prepr['mths_since_last_delinq'] >= 4) & (df_inputs_prepr['mths_since_last_delinq'] <= 30), 1, 0)
df_inputs_prepr['mths_since_last_delinq:31-56'] = np.where((df_inputs_prepr['mths_since_last_delinq'] >= 31) & (df_inputs_prepr['mths_since_last_delinq'] <= 56), 1, 0)
df_inputs_prepr['mths_since_last_delinq:>=57'] = np.where((df_inputs_prepr['mths_since_last_delinq'] >= 57), 1, 0)
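# [Illustrative sanity check, not in the original] For each coarse-classed variable, the band dummies should be
# mutually exclusive and cover every observation, i.e. they should sum to exactly 1 per row. A hypothetical check
# for the 'mths_since_last_delinq' dummies created above (uncomment to run):
#
# delinq_dummy_cols = [col for col in df_inputs_prepr.columns if col.startswith('mths_since_last_delinq:')]
# assert (df_inputs_prepr[delinq_dummy_cols].sum(axis = 1) == 1).all()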
### Preprocessing Continuous Variables: Creating Dummy Variables, Part 3: Homework

# dti
df_inputs_prepr['dti_factor'] = pd.cut(df_inputs_prepr['dti'], 100)
# Here we do fine classing: using the 'cut' method, we split the variable into 100 categories by its values.
df_temp = woe_ordered_continuous(df_inputs_prepr, 'dti_factor', df_targets_prepr)
# We calculate weight of evidence.
df_temp
# df_temp.to_excel('.....//dti_factor.xlsx')
plot_by_woe(df_temp, 90)
# We plot the weight of evidence values.

# Similarly to income, initial examination shows that most dti values are small.
# Hence, we create one category for dti greater than 35, and we apply our approach to determine
# the categories for everyone with dti of 35 or less.
df_inputs_prepr_temp = df_inputs_prepr.loc[df_inputs_prepr['dti'] <= 35, : ]
df_inputs_prepr_temp['dti_factor'] = pd.cut(df_inputs_prepr_temp['dti'], 50)
# Here we do fine classing: using the 'cut' method, we split the variable into 50 categories by its values.
df_temp = woe_ordered_continuous(df_inputs_prepr_temp, 'dti_factor', df_targets_prepr[df_inputs_prepr_temp.index])
# We calculate weight of evidence.
df_temp

plot_by_woe(df_temp, 90)
# We plot the weight of evidence values.

# Categories:
df_inputs_prepr['dti:<=1.4'] = np.where((df_inputs_prepr['dti'] <= 1.4), 1, 0)
df_inputs_prepr['dti:1.4-3.5'] = np.where((df_inputs_prepr['dti'] > 1.4) & (df_inputs_prepr['dti'] <= 3.5), 1, 0)
df_inputs_prepr['dti:3.5-7.7'] = np.where((df_inputs_prepr['dti'] > 3.5) & (df_inputs_prepr['dti'] <= 7.7), 1, 0)
df_inputs_prepr['dti:7.7-10.5'] = np.where((df_inputs_prepr['dti'] > 7.7) & (df_inputs_prepr['dti'] <= 10.5), 1, 0)
df_inputs_prepr['dti:10.5-16.1'] = np.where((df_inputs_prepr['dti'] > 10.5) & (df_inputs_prepr['dti'] <= 16.1), 1, 0)
df_inputs_prepr['dti:16.1-20.3'] = np.where((df_inputs_prepr['dti'] > 16.1) & (df_inputs_prepr['dti'] <= 20.3), 1, 0)
df_inputs_prepr['dti:20.3-21.7'] = np.where((df_inputs_prepr['dti'] > 20.3) & (df_inputs_prepr['dti'] <= 21.7), 1, 0)
df_inputs_prepr['dti:21.7-22.4'] = np.where((df_inputs_prepr['dti'] > 21.7) & (df_inputs_prepr['dti'] <= 22.4), 1, 0)
df_inputs_prepr['dti:22.4-35'] = np.where((df_inputs_prepr['dti'] > 22.4) & (df_inputs_prepr['dti'] <= 35), 1, 0)
df_inputs_prepr['dti:>35'] = np.where((df_inputs_prepr['dti'] > 35), 1, 0)

# mths_since_last_record
# We have to create one category for missing values and do fine and coarse classing for the rest.
df_inputs_prepr_temp = df_inputs_prepr[pd.notnull(df_inputs_prepr['mths_since_last_record'])]
# sum(loan_data_temp['mths_since_last_record'].isnull())
df_inputs_prepr_temp['mths_since_last_record_factor'] = pd.cut(df_inputs_prepr_temp['mths_since_last_record'], 50)
# Here we do fine classing: using the 'cut' method, we split the variable into 50 categories by its values.
df_temp = woe_ordered_continuous(df_inputs_prepr_temp, 'mths_since_last_record_factor', df_targets_prepr[df_inputs_prepr_temp.index])
# We calculate weight of evidence.
df_temp

plot_by_woe(df_temp, 90)
# We plot the weight of evidence values.

# Categories: 'Missing', '0-2', '3-20', '21-31', '32-80', '81-86', '>86'
df_inputs_prepr['mths_since_last_record:Missing'] = np.where((df_inputs_prepr['mths_since_last_record'].isnull()), 1, 0)
df_inputs_prepr['mths_since_last_record:0-2'] = np.where((df_inputs_prepr['mths_since_last_record'] >= 0) & (df_inputs_prepr['mths_since_last_record'] <= 2), 1, 0)
df_inputs_prepr['mths_since_last_record:3-20'] = np.where((df_inputs_prepr['mths_since_last_record'] >= 3) & (df_inputs_prepr['mths_since_last_record'] <= 20), 1, 0)
df_inputs_prepr['mths_since_last_record:21-31'] = np.where((df_inputs_prepr['mths_since_last_record'] >= 21) & (df_inputs_prepr['mths_since_last_record'] <= 31), 1, 0)
df_inputs_prepr['mths_since_last_record:32-80'] = np.where((df_inputs_prepr['mths_since_last_record'] >= 32) & (df_inputs_prepr['mths_since_last_record'] <= 80), 1, 0)
df_inputs_prepr['mths_since_last_record:81-86'] = np.where((df_inputs_prepr['mths_since_last_record'] >= 81) & (df_inputs_prepr['mths_since_last_record'] <= 86), 1, 0)
df_inputs_prepr['mths_since_last_record:>86'] = np.where((df_inputs_prepr['mths_since_last_record'] > 86), 1, 0)
## MOST IMPORTANT STEP OF THE PREPROCESSING CODE, AFTER COMPLETION
df_inputs_prepr.isnull().sum()
df_inputs_prepr.columns.values
df_inputs_prepr.shape
pd.options.display.max_rows = None

# Unhash the following two lines after preprocessing the TRAINING data, to save the results:

# loan_data_inputs_train = df_inputs_prepr
# loan_data_targets_train = df_targets_prepr


####################################### Preprocessing the Test Dataset #####################################

##### First read this:
# The test data needs to be preprocessed exactly like the train data, so we reuse the same code as above.
# We do not care about the WoE values themselves on the test data; we are going to ignore them.
# The functions created above all work on 'df_inputs_prepr'.
# Since we are moving on to the test data, we first need to save the preprocessed training data:
# we created the variable loan_data_inputs_train at the split, and it is now time to use it.

############################ The following is a part where the order of the steps is extremely important ####

# loan_data_inputs_train = df_inputs_prepr  ### Move step by step: first execute this line to save the
# preprocessing results for the training data, then move on to the following:
    # go right below the train_test_split, where we set the working dataframes equal to the training data,
    # hash out df_inputs_prepr = loan_data_inputs_train (and the corresponding targets line),
    # unhash df_inputs_prepr = loan_data_inputs_test (and the corresponding targets line),
    # run the code again down to this point, BUT hashing out 'loan_data_inputs_train = df_inputs_prepr',
    # as you now want to save the results in 'loan_data_inputs_test',
    # then save the result:
#####
loan_data_inputs_test = df_inputs_prepr
loan_data_targets_test = df_targets_prepr
#################################################################

# loan_data_inputs_train.to_csv('........Resources\\Train_Test dataset after running the code\\loan_data_inputs_train.csv')
# loan_data_targets_train.to_csv('.....Resources\\Train_Test dataset after running the code\\loan_data_targets_train.csv')
loan_data_inputs_test.to_csv('.......Resources\\Train_Test dataset after running the code\\loan_data_inputs_test.csv')
loan_data_targets_test.to_csv('.......Resources\\Train_Test dataset after running the code\\loan_data_targets_test.csv')


# This section is complete. It included:
#    1. preprocessing dates, times, integers, floats and objects
#    2. creating dummies
#    3. splitting the data into training and testing
# On the training data:
#    4. creating the WoE and IV functions, visualising and automating them
#    5. fine classing and coarse classing
#    6. creating dummies for the classes
#    7. saving the df_inputs_prepr dataframe into the inputs dataframe
#    8. hashing out the df_inputs_prepr = loan_data_inputs_train assignment
# On the test data:
#    4. assigning the test data to df_inputs_prepr and running the automated WoE and IV code
#    5. fine classing and coarse classing
#    6. creating dummies for the classes
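# [Illustrative sketch, not from the original course code] Before modelling, it is worth confirming that the
# preprocessed train and test inputs ended up with exactly the same dummy columns, since the coarse classes were
# designed on the training split. A minimal hypothetical check (uncomment to run once both splits are saved):
#
# missing_in_test = set(loan_data_inputs_train.columns) - set(loan_data_inputs_test.columns)
# extra_in_test = set(loan_data_inputs_test.columns) - set(loan_data_inputs_train.columns)
# print(missing_in_test, extra_in_test)
# # Any missing dummy (e.g. a state that never occurs in the test split) could be added as an all-zero column:
# # for col in missing_in_test:
# #     loan_data_inputs_test[col] = 0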
4-PD Model Monitoring.py
Source:4-PD Model Monitoring.py  
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 29 16:04:33 2020
@author: barry
"""
# Import Libraries
import numpy as np
import pandas as pd
# Import Data
# Import the Train and Test Data.
loan_data_inputs_train = pd.read_csv('loan_data_inputs_train.csv', index_col = 0)
loan_data_targets_train = pd.read_csv('loan_data_targets_train.csv', index_col = 0, header = None)
loan_data_inputs_test = pd.read_csv('loan_data_inputs_test.csv', index_col = 0)
loan_data_targets_test = pd.read_csv('loan_data_targets_test.csv', index_col = 0, header = None)
# Here we import the new data.
loan_data_backup = pd.read_csv('loan_data_2015.csv')

### Explore Data
loan_data = loan_data_backup.copy()
pd.options.display.max_columns = None
# pd.options.display.max_rows = None
# Sets the pandas dataframe options to display all columns/rows.
loan_data.head()
loan_data.info()

# Population Stability Index: Preprocessing
### >>> The code from here to the other line starting with '>>>' is copied from the Data Preparation notebook,
# with minor adjustments. We have to perform the exact same data preprocessing, fine classing, and coarse classing
# on the new data, in order to calculate statistics for the exact same variables as the ones we used for training
# and testing the PD model.

### Preprocessing a few continuous variables
## General Preprocessing
loan_data['emp_length'].unique()
loan_data['emp_length_int'] = loan_data['emp_length'].str.replace('\+ years', '')
loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace('< 1 year', str(0))
loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace('n/a', str(0))
loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace(' years', '')
loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace(' year', '')
type(loan_data['emp_length_int'][0])
loan_data['emp_length_int'] = pd.to_numeric(loan_data['emp_length_int'])
type(loan_data['emp_length_int'][0])

# Earliest credit line
loan_data['earliest_cr_line']
loan_data['earliest_cr_line_date'] = pd.to_datetime(loan_data['earliest_cr_line'], format = '%b-%y')
type(loan_data['earliest_cr_line_date'][0])
pd.to_datetime('2018-12-01') - loan_data['earliest_cr_line_date']
# Assume we are now in December 2018.
loan_data['mths_since_earliest_cr_line'] = round(pd.to_numeric((pd.to_datetime('2018-12-01') - loan_data['earliest_cr_line_date']) / np.timedelta64(1, 'M')))
loan_data['mths_since_earliest_cr_line'].describe()
# Dates from 1969 and before are not being converted well, i.e. they have become 2069 and similar,
# and negative differences are being calculated. There are 2303 such values.
loan_data.loc[: , ['earliest_cr_line', 'earliest_cr_line_date', 'mths_since_earliest_cr_line']][loan_data['mths_since_earliest_cr_line'] < 0]
# We set all these negative differences to the maximum.
loan_data['mths_since_earliest_cr_line'][loan_data['mths_since_earliest_cr_line'] < 0] = loan_data['mths_since_earliest_cr_line'].max()
min(loan_data['mths_since_earliest_cr_line'])

# Term
loan_data['term']
loan_data['term'].describe()
loan_data['term_int'] = loan_data['term'].str.replace(' months', '')
loan_data['term_int']
type(loan_data['term_int'])
type(loan_data['term_int'][25])
loan_data['term_int'] = pd.to_numeric(loan_data['term'].str.replace(' months', ''))
loan_data['term_int']
type(loan_data['term_int'][0])

# Time since the loan was funded
loan_data['issue_d']
# Assume we are now in December 2018.
loan_data['issue_d_date'] = pd.to_datetime(loan_data['issue_d'], format = '%b-%y')
loan_data['mths_since_issue_d'] = round(pd.to_numeric((pd.to_datetime('2018-12-01') - loan_data['issue_d_date']) / np.timedelta64(1, 'M')))
loan_data['mths_since_issue_d'].describe()

# Data preparation: preprocessing discrete variables
loan_data.info()
# loan_data['grade_factor'] = loan_data['grade'].astype('category')
# grade, sub_grade, home_ownership, verification_status, loan_status, purpose, addr_state, initial_list_status
pd.get_dummies(loan_data['grade'], prefix = 'grade', prefix_sep = ':')
loan_data_dummies = [pd.get_dummies(loan_data['grade'], prefix = 'grade', prefix_sep = ':'),
                     pd.get_dummies(loan_data['sub_grade'], prefix = 'sub_grade', prefix_sep = ':'),
                     pd.get_dummies(loan_data['home_ownership'], prefix = 'home_ownership', prefix_sep = ':'),
                     pd.get_dummies(loan_data['verification_status'], prefix = 'verification_status', prefix_sep = ':'),
                     pd.get_dummies(loan_data['loan_status'], prefix = 'loan_status', prefix_sep = ':'),
                     pd.get_dummies(loan_data['purpose'], prefix = 'purpose', prefix_sep = ':'),
                     pd.get_dummies(loan_data['addr_state'], prefix = 'addr_state', prefix_sep = ':'),
                     pd.get_dummies(loan_data['initial_list_status'], prefix = 'initial_list_status', prefix_sep = ':')]
loan_data_dummies = pd.concat(loan_data_dummies, axis = 1)
type(loan_data_dummies)
loan_data_dummies.shape
loan_data_dummies.info()
loan_data = pd.concat([loan_data, loan_data_dummies], axis = 1)
loan_data.columns.values

# Data preparation: check for missing values and clean
loan_data.isnull()
pd.options.display.max_rows = None
loan_data.isnull().sum()
pd.options.display.max_rows = 100
# loan_data['total_rev_hi_lim'] - there are 70276 missing values here.
# 'Total revolving high credit/credit limit', so it makes sense to set the missing values equal to funded_amnt.
# loan_data['acc_now_delinq'], loan_data['total_acc'], loan_data['pub_rec'], loan_data['open_acc'],
# loan_data['inq_last_6mths'], loan_data['delinq_2yrs'], loan_data['mths_since_earliest_cr_line']
# - there are 29 missing values in each of these columns; they are likely the same observations.
# An eyeballing examination of the dataset confirms that.
# All of them have loan_status 'Does not meet the credit policy. Status:Fully Paid'.
Status:Fully Paid'.111# We impute these values.112# loan_data$annual_inc113# - There are 4 missing values in all of these columns.114# loan_data$mths_since_last_record115# loan_data$mths_since_last_delinq116# 'Total revolving high credit/credit limit', so it makes sense that the missing values are equal to funded_amnt.117loan_data['total_rev_hi_lim'].fillna(loan_data['funded_amnt'], inplace=True)118loan_data['mths_since_earliest_cr_line'].fillna(0, inplace=True)119loan_data['acc_now_delinq'].fillna(0, inplace=True)120loan_data['total_acc'].fillna(0, inplace=True)121loan_data['pub_rec'].fillna(0, inplace=True)122loan_data['open_acc'].fillna(0, inplace=True)123loan_data['inq_last_6mths'].fillna(0, inplace=True)124loan_data['delinq_2yrs'].fillna(0, inplace=True)125loan_data['emp_length_int'].fillna(0, inplace=True)126loan_data['annual_inc'].fillna(loan_data['annual_inc'].mean(), inplace=True)127# PD model: Data preparation: Good/ Bad (DV for the PD model)128loan_data['loan_status'].unique()129loan_data['loan_status'].value_counts()130loan_data['loan_status'].value_counts() / loan_data['loan_status'].count()131# Good/ Bad Definition132loan_data['good_bad'] = np.where(loan_data['loan_status'].isin(['Charged Off', 'Default',133                                                       'Does not meet the credit policy. Status:Charged Off',134                                                       'Late (31-120 days)']), 0, 1)135#loan_data['good_bad'].sum()/loan_data['loan_status'].count()136loan_data['good_bad']137# PD model: Data Preparation: Splitting Data138# loan_data_inputs_train, loan_data_inputs_test, loan_data_targets_train, loan_data_targets_test139from sklearn.model_selection import train_test_split140# Here we don't split data into training and test141#train_test_split(loan_data.drop('good_bad', axis = 1), loan_data['good_bad'])142#loan_data_inputs_train, loan_data_inputs_test, loan_data_targets_train, loan_data_targets_test = train_test_split(loan_data.drop('good_bad', axis = 1), loan_data['good_bad'])143#loan_data_inputs_train.shape144#loan_data_targets_train.shape145#loan_data_inputs_test.shape146#loan_data_targets_test.shape147#loan_data_inputs_train, loan_data_inputs_test, loan_data_targets_train, loan_data_targets_test = train_test_split(loan_data.drop('good_bad', axis = 1), loan_data['good_bad'], test_size = 0.2, random_state = 42)148#loan_data_inputs_train.shape149#loan_data_targets_train.shape150#loan_data_inputs_test.shape151#loan_data_targets_test.shape152# PD model: Data Preparation: Discrete Variables153loan_data.drop('good_bad', axis = 1)154loan_data['good_bad']155#####156df_inputs_prepr = loan_data.drop('good_bad', axis = 1)157df_targets_prepr = loan_data['good_bad']158#####159#df_inputs_prepr = loan_data_inputs_test160##df_targets_prepr = loan_data_targets_test161df_inputs_prepr['grade'].unique()162df1 = pd.concat([df_inputs_prepr['grade'], df_targets_prepr], axis = 1)163df1.head()164df1.groupby(df1.columns.values[0], as_index = False)[df1.columns.values[1]].count()165df1.groupby(df1.columns.values[0], as_index = False)[df1.columns.values[1]].mean()166df1 = pd.concat([df1.groupby(df1.columns.values[0], as_index = False)[df1.columns.values[1]].count(),167                df1.groupby(df1.columns.values[0], as_index = False)[df1.columns.values[1]].mean()], axis = 1)168df1169df1 = df1.iloc[:, [0, 1, 3]]170df1171df1.columns = [df1.columns.values[0], 'n_obs', 'prop_good']172df1173df1['prop_n_obs'] = df1['n_obs'] / df1['n_obs'].sum()174df1175df1['n_good'] = df1['prop_good'] * 
df1['n_obs']176df1['n_bad'] = (1 - df1['prop_good']) * df1['n_obs']177df1178df1['prop_n_good'] = df1['n_good'] / df1['n_good'].sum()179df1['prop_n_bad'] = df1['n_bad'] / df1['n_bad'].sum()180df1181df1['WoE'] = np.log(df1['prop_n_good'] / df1['prop_n_bad'])182df1183df1 = df1.sort_values(['WoE'])184df1 = df1.reset_index(drop = True)185df1186df1['diff_prop_good'] = df1['prop_good'].diff().abs()187df1['diff_WoE'] = df1['WoE'].diff().abs()188df1189df1['IV'] = (df1['prop_n_good'] - df1['prop_n_bad']) * df1['WoE']190df1['IV'] = df1['IV'].sum()191df1192# WoE function for discrete unordered variables193def woe_discrete(df, discrete_variabe_name, good_bad_variable_df):194    df = pd.concat([df[discrete_variabe_name], good_bad_variable_df], axis = 1)195    df = pd.concat([df.groupby(df.columns.values[0], as_index = False)[df.columns.values[1]].count(),196                    df.groupby(df.columns.values[0], as_index = False)[df.columns.values[1]].mean()], axis = 1)197    df = df.iloc[:, [0, 1, 3]]198    df.columns = [df.columns.values[0], 'n_obs', 'prop_good']199    df['prop_n_obs'] = df['n_obs'] / df['n_obs'].sum()200    df['n_good'] = df['prop_good'] * df['n_obs']201    df['n_bad'] = (1 - df['prop_good']) * df['n_obs']202    df['prop_n_good'] = df['n_good'] / df['n_good'].sum()203    df['prop_n_bad'] = df['n_bad'] / df['n_bad'].sum()204    df['WoE'] = np.log(df['prop_n_good'] / df['prop_n_bad'])205    df = df.sort_values(['WoE'])206    df = df.reset_index(drop = True)207    df['diff_prop_good'] = df['prop_good'].diff().abs()208    df['diff_WoE'] = df['WoE'].diff().abs()209    df['IV'] = (df['prop_n_good'] - df['prop_n_bad']) * df['WoE']210    df['IV'] = df['IV'].sum()211    return df212# 'grade', 'home_ownership', 'verification_status',213# 'purpose', 'addr_state', 'initial_list_status'214# 'grade'215df_temp = woe_discrete(df_inputs_prepr, 'grade', df_targets_prepr)216df_temp217import matplotlib.pyplot as plt218import seaborn as sns219sns.set()220def plot_by_woe(df_WoE, rotation_of_x_axis_labels = 0):221    #x = df_WoE.iloc[:, 0]222    x = np.array(df_WoE.iloc[:, 0].apply(str))223    y = df_WoE['WoE']224    plt.figure(figsize=(18, 6))225    plt.plot(x, y, marker = 'o', linestyle = '--', color = 'k')226    plt.xlabel(df_WoE.columns[0])227    plt.ylabel('Weight of Evidence')228    plt.title(str('Weight of Evidence by ' + df_WoE.columns[0]))229    plt.xticks(rotation = rotation_of_x_axis_labels)230plot_by_woe(df_temp)231# Leave as is.232# 'G' will be the reference category.233# 'home_ownership'234df_temp = woe_discrete(df_inputs_prepr, 'home_ownership', df_targets_prepr)235df_temp236plot_by_woe(df_temp)237# There are many categories with very few observations and many categories with very different "good" %.238# Therefore, we create a new discrete variable where we combine some of the categories.239# 'OTHERS' and 'NONE' are riskiest but are very few. 'RENT' is the next riskiest.240# 'ANY' are least risky but are too few. Conceptually, they belong to the same category. 
# Also, their inclusion would not change anything.
# We combine them in one category, 'RENT_OTHER_NONE_ANY'.
# We end up with 3 categories: 'RENT_OTHER_NONE_ANY', 'OWN', 'MORTGAGE'.
df_inputs_prepr['home_ownership:RENT_OTHER_NONE_ANY'] = sum([df_inputs_prepr['home_ownership:RENT'], df_inputs_prepr['home_ownership:OTHER'],
                                                             df_inputs_prepr['home_ownership:NONE'], df_inputs_prepr['home_ownership:ANY']])
# 'RENT_OTHER_NONE_ANY' will be the reference category.
# Alternatively:
#loan_data.loc['home_ownership' in ['RENT', 'OTHER', 'NONE', 'ANY'], 'home_ownership:RENT_OTHER_NONE_ANY'] = 1
#loan_data.loc['home_ownership' not in ['RENT', 'OTHER', 'NONE', 'ANY'], 'home_ownership:RENT_OTHER_NONE_ANY'] = 0
#loan_data.loc['loan_status' not in ['OWN'], 'home_ownership:OWN'] = 1
#loan_data.loc['loan_status' not in ['OWN'], 'home_ownership:OWN'] = 0
#loan_data.loc['loan_status' not in ['MORTGAGE'], 'home_ownership:MORTGAGE'] = 1
#loan_data.loc['loan_status' not in ['MORTGAGE'], 'home_ownership:MORTGAGE'] = 0
loan_data['home_ownership'].unique()
# Note: the line below redefines 'home_ownership:RENT_OTHER_NONE_ANY' using only the 'RENT' and 'ANY' dummies.
df_inputs_prepr['home_ownership:RENT_OTHER_NONE_ANY'] = sum([df_inputs_prepr['home_ownership:RENT'], df_inputs_prepr['home_ownership:ANY']])
# 'addr_state'
df_inputs_prepr['addr_state'].unique()
#df_inputs_prepr['addr_state:ND'] = 0
# Some states may be absent from the new data; if their dummies are missing, we create them as columns of zeros.
if 'addr_state:ND' in df_inputs_prepr.columns.values:
    pass
else:
    df_inputs_prepr['addr_state:ND'] = 0
if 'addr_state:ID' in df_inputs_prepr.columns.values:
    pass
else:
    df_inputs_prepr['addr_state:ID'] = 0
if 'addr_state:IA' in df_inputs_prepr.columns.values:
    pass
else:
    df_inputs_prepr['addr_state:IA'] = 0
df_temp = woe_discrete(df_inputs_prepr, 'addr_state', df_targets_prepr)
df_temp
plot_by_woe(df_temp)
plot_by_woe(df_temp.iloc[2: -2, : ])
plot_by_woe(df_temp.iloc[6: -6, : ])
df_inputs_prepr.columns.values
# We create the following categories:
# 'ND' 'NE' 'IA' 'NV' 'FL' 'HI' 'AL'
# 'NM' 'VA'
# 'NY'
# 'OK' 'TN' 'MO' 'LA' 'MD' 'NC'
# 'CA'
# 'UT' 'KY' 'AZ' 'NJ'
# 'AR' 'MI' 'PA' 'OH' 'MN'
# 'RI' 'MA' 'DE' 'SD' 'IN'
# 'GA' 'WA' 'OR'
# 'WI' 'MT'
# 'TX'
# 'IL' 'CT'
# 'KS' 'SC' 'CO' 'VT' 'AK' 'MS'
# 'WV' 'NH' 'WY' 'DC' 'ME' 'ID'
# 'ND_NE_IA_NV_FL_HI_AL' will be the reference category.
df_inputs_prepr['addr_state:ND_NE_IA_NV_FL_HI_AL'] = sum([df_inputs_prepr['addr_state:ND'], df_inputs_prepr['addr_state:NE'],
                                                          df_inputs_prepr['addr_state:IA'], df_inputs_prepr['addr_state:NV'],
                                                          df_inputs_prepr['addr_state:FL'], df_inputs_prepr['addr_state:HI'],
                                                          df_inputs_prepr['addr_state:AL']])
df_inputs_prepr['addr_state:NM_VA'] = sum([df_inputs_prepr['addr_state:NM'], df_inputs_prepr['addr_state:VA']])
df_inputs_prepr['addr_state:OK_TN_MO_LA_MD_NC'] = sum([df_inputs_prepr['addr_state:OK'], df_inputs_prepr['addr_state:TN'],
                                                       df_inputs_prepr['addr_state:MO'], df_inputs_prepr['addr_state:LA'],
                                                       df_inputs_prepr['addr_state:MD'], df_inputs_prepr['addr_state:NC']])
df_inputs_prepr['addr_state:UT_KY_AZ_NJ'] = sum([df_inputs_prepr['addr_state:UT'], df_inputs_prepr['addr_state:KY'],
                                                 df_inputs_prepr['addr_state:AZ'], 
df_inputs_prepr['addr_state:NJ']])302df_inputs_prepr['addr_state:AR_MI_PA_OH_MN'] = sum([df_inputs_prepr['addr_state:AR'], df_inputs_prepr['addr_state:MI'],303                                              df_inputs_prepr['addr_state:PA'], df_inputs_prepr['addr_state:OH'],304                                              df_inputs_prepr['addr_state:MN']])305df_inputs_prepr['addr_state:RI_MA_DE_SD_IN'] = sum([df_inputs_prepr['addr_state:RI'], df_inputs_prepr['addr_state:MA'],306                                              df_inputs_prepr['addr_state:DE'], df_inputs_prepr['addr_state:SD'],307                                              df_inputs_prepr['addr_state:IN']])308df_inputs_prepr['addr_state:GA_WA_OR'] = sum([df_inputs_prepr['addr_state:GA'], df_inputs_prepr['addr_state:WA'],309                                              df_inputs_prepr['addr_state:OR']])310df_inputs_prepr['addr_state:WI_MT'] = sum([df_inputs_prepr['addr_state:WI'], df_inputs_prepr['addr_state:MT']])311df_inputs_prepr['addr_state:IL_CT'] = sum([df_inputs_prepr['addr_state:IL'], df_inputs_prepr['addr_state:CT']])312df_inputs_prepr['addr_state:KS_SC_CO_VT_AK_MS'] = sum([df_inputs_prepr['addr_state:KS'], df_inputs_prepr['addr_state:SC'],313                                              df_inputs_prepr['addr_state:CO'], df_inputs_prepr['addr_state:VT'],314                                              df_inputs_prepr['addr_state:AK'], df_inputs_prepr['addr_state:MS']])315df_inputs_prepr['addr_state:WV_NH_WY_DC_ME_ID'] = sum([df_inputs_prepr['addr_state:WV'], df_inputs_prepr['addr_state:NH'],316                                              df_inputs_prepr['addr_state:WY'], df_inputs_prepr['addr_state:DC'],317                                              df_inputs_prepr['addr_state:ME'], df_inputs_prepr['addr_state:ID']])318# 'verification_status'319df_temp = woe_discrete(df_inputs_prepr, 'verification_status', df_targets_prepr)320df_temp321plot_by_woe(df_temp)322# Leave as is.323# 'Verified' will be the reference category.324# 'purpose'325df_temp = woe_discrete(df_inputs_prepr, 'purpose', df_targets_prepr)326df_temp327#plt.figure(figsize=(15, 5))328#sns.pointplot(x = 'purpose', y = 'WoE', data = df_temp, figsize = (5, 15))329plot_by_woe(df_temp, 90)330# We combine 'educational', 'small_business', 'wedding', 'renewable_energy', 'moving', 'house' in one category: 'educ__sm_b__wedd__ren_en__mov__house'.331# We combine 'other', 'medical', 'vacation' in one category: 'oth__med__vacation'.332# We combine 'major_purchase', 'car', 'home_improvement' in one category: 'major_purch__car__home_impr'.333# We leave 'debt_consolidtion' in a separate category.334# We leave 'credit_card' in a separate category.335# 'educ__sm_b__wedd__ren_en__mov__house' will be the reference category.336df_inputs_prepr['purpose:educ__sm_b__wedd__ren_en__mov__house'] = sum([df_inputs_prepr['purpose:educational'], df_inputs_prepr['purpose:small_business'],337                                                                 df_inputs_prepr['purpose:wedding'], df_inputs_prepr['purpose:renewable_energy'],338                                                                 df_inputs_prepr['purpose:moving'], df_inputs_prepr['purpose:house']])339df_inputs_prepr['purpose:oth__med__vacation'] = sum([df_inputs_prepr['purpose:other'], df_inputs_prepr['purpose:medical'],340                                             df_inputs_prepr['purpose:vacation']])341df_inputs_prepr['purpose:major_purch__car__home_impr'] = sum([df_inputs_prepr['purpose:major_purchase'], 
df_inputs_prepr['purpose:car'],342                                                        df_inputs_prepr['purpose:home_improvement']])343# 'initial_list_status'344df_temp = woe_discrete(df_inputs_prepr, 'initial_list_status', df_targets_prepr)345df_temp346plot_by_woe(df_temp)347# Leave as is.348# 'f' will be the reference category.349# PD model: Data Preparation: Continuous Variables, Part 1350# WoE function for ordered discrete and continuous variables351def woe_ordered_continuous(df, discrete_variabe_name, good_bad_variable_df):352    df = pd.concat([df[discrete_variabe_name], good_bad_variable_df], axis = 1)353    df = pd.concat([df.groupby(df.columns.values[0], as_index = False)[df.columns.values[1]].count(),354                    df.groupby(df.columns.values[0], as_index = False)[df.columns.values[1]].mean()], axis = 1)355    df = df.iloc[:, [0, 1, 3]]356    df.columns = [df.columns.values[0], 'n_obs', 'prop_good']357    df['prop_n_obs'] = df['n_obs'] / df['n_obs'].sum()358    df['n_good'] = df['prop_good'] * df['n_obs']359    df['n_bad'] = (1 - df['prop_good']) * df['n_obs']360    df['prop_n_good'] = df['n_good'] / df['n_good'].sum()361    df['prop_n_bad'] = df['n_bad'] / df['n_bad'].sum()362    df['WoE'] = np.log(df['prop_n_good'] / df['prop_n_bad'])363    #df = df.sort_values(['WoE'])364    #df = df.reset_index(drop = True)365    df['diff_prop_good'] = df['prop_good'].diff().abs()366    df['diff_WoE'] = df['WoE'].diff().abs()367    df['IV'] = (df['prop_n_good'] - df['prop_n_bad']) * df['WoE']368    df['IV'] = df['IV'].sum()369    return df370# term371df_inputs_prepr['term_int'].unique()372# There are only two unique values, 36 and 60.373df_temp = woe_ordered_continuous(df_inputs_prepr, 'term_int', df_targets_prepr)374df_temp375plot_by_woe(df_temp)376# Leave as is.377# '60' will be the reference category.378df_inputs_prepr['term:36'] = np.where((df_inputs_prepr['term_int'] == 36), 1, 0)379df_inputs_prepr['term:60'] = np.where((df_inputs_prepr['term_int'] == 60), 1, 0)380# emp_length_int381df_inputs_prepr['emp_length_int'].unique()382# Has only 11 levels: from 0 to 10. 
Hence, we turn it into a factor with 11 levels.383df_temp = woe_ordered_continuous(df_inputs_prepr, 'emp_length_int', df_targets_prepr)384df_temp385plot_by_woe(df_temp)386# We create the following categories: '0', '1', '2 - 4', '5 - 6', '7 - 9', '10'387# '0' will be the reference category388df_inputs_prepr['emp_length:0'] = np.where(df_inputs_prepr['emp_length_int'].isin([0]), 1, 0)389df_inputs_prepr['emp_length:1'] = np.where(df_inputs_prepr['emp_length_int'].isin([1]), 1, 0)390df_inputs_prepr['emp_length:2-4'] = np.where(df_inputs_prepr['emp_length_int'].isin(range(2, 5)), 1, 0)391df_inputs_prepr['emp_length:5-6'] = np.where(df_inputs_prepr['emp_length_int'].isin(range(5, 7)), 1, 0)392df_inputs_prepr['emp_length:7-9'] = np.where(df_inputs_prepr['emp_length_int'].isin(range(7, 10)), 1, 0)393df_inputs_prepr['emp_length:10'] = np.where(df_inputs_prepr['emp_length_int'].isin([10]), 1, 0)394df_inputs_prepr['mths_since_issue_d'].unique()395df_inputs_prepr['mths_since_issue_d_factor'] = pd.cut(df_inputs_prepr['mths_since_issue_d'], 50)396df_inputs_prepr['mths_since_issue_d_factor']397# mths_since_issue_d398df_temp = woe_ordered_continuous(df_inputs_prepr, 'mths_since_issue_d_factor', df_targets_prepr)399df_temp400# !!!!!!!!!401#df_temp['mths_since_issue_d_factor'] = np.array(df_temp.mths_since_issue_d_factor.apply(str))402#df_temp['mths_since_issue_d_factor'] = list(df_temp.mths_since_issue_d_factor.apply(str))403#df_temp['mths_since_issue_d_factor'] = tuple(df_temp.mths_since_issue_d_factor.apply(str))404plot_by_woe(df_temp)405plot_by_woe(df_temp, 90)406plot_by_woe(df_temp.iloc[3: , : ], 90)407# We create the following categories:408# < 38, 38 - 39, 40 - 41, 42 - 48, 49 - 52, 53 - 64, 65 - 84, > 84.409df_inputs_prepr['mths_since_issue_d:<38'] = np.where(df_inputs_prepr['mths_since_issue_d'].isin(range(38)), 1, 0)410df_inputs_prepr['mths_since_issue_d:38-39'] = np.where(df_inputs_prepr['mths_since_issue_d'].isin(range(38, 40)), 1, 0)411df_inputs_prepr['mths_since_issue_d:40-41'] = np.where(df_inputs_prepr['mths_since_issue_d'].isin(range(40, 42)), 1, 0)412df_inputs_prepr['mths_since_issue_d:42-48'] = np.where(df_inputs_prepr['mths_since_issue_d'].isin(range(42, 49)), 1, 0)413df_inputs_prepr['mths_since_issue_d:49-52'] = np.where(df_inputs_prepr['mths_since_issue_d'].isin(range(49, 53)), 1, 0)414df_inputs_prepr['mths_since_issue_d:53-64'] = np.where(df_inputs_prepr['mths_since_issue_d'].isin(range(53, 65)), 1, 0)415df_inputs_prepr['mths_since_issue_d:65-84'] = np.where(df_inputs_prepr['mths_since_issue_d'].isin(range(65, 85)), 1, 0)416df_inputs_prepr['mths_since_issue_d:>84'] = np.where(df_inputs_prepr['mths_since_issue_d'].isin(range(85, int(df_inputs_prepr['mths_since_issue_d'].max()))), 1, 0)417# int_rate418df_inputs_prepr['int_rate_factor'] = pd.cut(df_inputs_prepr['int_rate'], 50)419df_temp = woe_ordered_continuous(df_inputs_prepr, 'int_rate_factor', df_targets_prepr)420df_temp421plot_by_woe(df_temp, 90)422plot_by_woe(df_temp, 90)423# '< 9.548', '9.548 - 12.025', '12.025 - 15.74', '15.74 - 20.281', '> 20.281'424#loan_data.loc[loan_data['int_rate'] < 5.8, 'int_rate:<5.8'] = 1425#(loan_data['int_rate'] > 5.8) & (loan_data['int_rate'] <= 8.64)426#loan_data['int_rate:<5.8'] = np.where(loan_data['int_rate'] < 5.8, 1, 0)427#loan_data[(loan_data['int_rate'] > 5.8) & (loan_data['int_rate'] <= 8.64)]428#loan_data['int_rate'][(np.where((loan_data['int_rate'] > 5.8) & (loan_data['int_rate'] <= 8.64)))]429#loan_data.loc[(loan_data['int_rate'] > 5.8) & (loan_data['int_rate'] <= 8.64), 'int_rate:<5.8'] = 
1
df_inputs_prepr['int_rate:<9.548'] = np.where((df_inputs_prepr['int_rate'] <= 9.548), 1, 0)
df_inputs_prepr['int_rate:9.548-12.025'] = np.where((df_inputs_prepr['int_rate'] > 9.548) & (df_inputs_prepr['int_rate'] <= 12.025), 1, 0)
df_inputs_prepr['int_rate:12.025-15.74'] = np.where((df_inputs_prepr['int_rate'] > 12.025) & (df_inputs_prepr['int_rate'] <= 15.74), 1, 0)
df_inputs_prepr['int_rate:15.74-20.281'] = np.where((df_inputs_prepr['int_rate'] > 15.74) & (df_inputs_prepr['int_rate'] <= 20.281), 1, 0)
df_inputs_prepr['int_rate:>20.281'] = np.where((df_inputs_prepr['int_rate'] > 20.281), 1, 0)
### PD model: Data Preparation: Continuous Variables, Part 1: Homework
# mths_since_earliest_cr_line
df_inputs_prepr['mths_since_earliest_cr_line_factor'] = pd.cut(df_inputs_prepr['mths_since_earliest_cr_line'], 50)
df_temp = woe_ordered_continuous(df_inputs_prepr, 'mths_since_earliest_cr_line_factor', df_targets_prepr)
df_temp
plot_by_woe(df_temp, 90)
plot_by_woe(df_temp.iloc[6: , : ], 90)
# We create the following categories:
# < 140, 141 - 164, 165 - 247, 248 - 270, 271 - 352, > 352
df_inputs_prepr['mths_since_earliest_cr_line:<140'] = np.where(df_inputs_prepr['mths_since_earliest_cr_line'].isin(range(140)), 1, 0)
df_inputs_prepr['mths_since_earliest_cr_line:141-164'] = np.where(df_inputs_prepr['mths_since_earliest_cr_line'].isin(range(140, 165)), 1, 0)
df_inputs_prepr['mths_since_earliest_cr_line:165-247'] = np.where(df_inputs_prepr['mths_since_earliest_cr_line'].isin(range(165, 248)), 1, 0)
df_inputs_prepr['mths_since_earliest_cr_line:248-270'] = np.where(df_inputs_prepr['mths_since_earliest_cr_line'].isin(range(248, 271)), 1, 0)
df_inputs_prepr['mths_since_earliest_cr_line:271-352'] = np.where(df_inputs_prepr['mths_since_earliest_cr_line'].isin(range(271, 353)), 1, 0)
df_inputs_prepr['mths_since_earliest_cr_line:>352'] = np.where(df_inputs_prepr['mths_since_earliest_cr_line'].isin(range(353, int(df_inputs_prepr['mths_since_earliest_cr_line'].max()))), 1, 0)
# REFERENCE CATEGORY!!!
# delinq_2yrs
df_temp = woe_ordered_continuous(df_inputs_prepr, 'delinq_2yrs', df_targets_prepr)
df_temp
plot_by_woe(df_temp)
# Categories: 0, 1-3, >=4
df_inputs_prepr['delinq_2yrs:0'] = np.where((df_inputs_prepr['delinq_2yrs'] == 0), 1, 0)
df_inputs_prepr['delinq_2yrs:1-3'] = np.where((df_inputs_prepr['delinq_2yrs'] >= 1) & (df_inputs_prepr['delinq_2yrs'] <= 3), 1, 0)
df_inputs_prepr['delinq_2yrs:>=4'] = np.where((df_inputs_prepr['delinq_2yrs'] >= 4), 1, 0)
# inq_last_6mths
df_temp = woe_ordered_continuous(df_inputs_prepr, 'inq_last_6mths', df_targets_prepr)
df_temp
plot_by_woe(df_temp)
# Categories: 0, 1 - 2, 3 - 6, > 6
df_inputs_prepr['inq_last_6mths:0'] = np.where((df_inputs_prepr['inq_last_6mths'] == 0), 1, 0)
df_inputs_prepr['inq_last_6mths:1-2'] = np.where((df_inputs_prepr['inq_last_6mths'] >= 1) & (df_inputs_prepr['inq_last_6mths'] <= 2), 1, 0)
df_inputs_prepr['inq_last_6mths:3-6'] = np.where((df_inputs_prepr['inq_last_6mths'] >= 3) & (df_inputs_prepr['inq_last_6mths'] <= 6), 1, 0)
df_inputs_prepr['inq_last_6mths:>6'] = np.where((df_inputs_prepr['inq_last_6mths'] > 6), 1, 0)
# open_acc
df_temp = woe_ordered_continuous(df_inputs_prepr, 'open_acc', df_targets_prepr)
df_temp
plot_by_woe(df_temp, 90)
plot_by_woe(df_temp.iloc[ : 40, :], 90)
# Categories: '0', '1-3', '4-12', '13-17', '18-22', '23-25', '26-30', '>30'
df_inputs_prepr['open_acc:0'] = np.where((df_inputs_prepr['open_acc'] == 0), 1, 
0)475df_inputs_prepr['open_acc:1-3'] = np.where((df_inputs_prepr['open_acc'] >= 1) & (df_inputs_prepr['open_acc'] <= 3), 1, 0)476df_inputs_prepr['open_acc:4-12'] = np.where((df_inputs_prepr['open_acc'] >= 4) & (df_inputs_prepr['open_acc'] <= 12), 1, 0)477df_inputs_prepr['open_acc:13-17'] = np.where((df_inputs_prepr['open_acc'] >= 13) & (df_inputs_prepr['open_acc'] <= 17), 1, 0)478df_inputs_prepr['open_acc:18-22'] = np.where((df_inputs_prepr['open_acc'] >= 18) & (df_inputs_prepr['open_acc'] <= 22), 1, 0)479df_inputs_prepr['open_acc:23-25'] = np.where((df_inputs_prepr['open_acc'] >= 23) & (df_inputs_prepr['open_acc'] <= 25), 1, 0)480df_inputs_prepr['open_acc:26-30'] = np.where((df_inputs_prepr['open_acc'] >= 26) & (df_inputs_prepr['open_acc'] <= 30), 1, 0)481df_inputs_prepr['open_acc:>=31'] = np.where((df_inputs_prepr['open_acc'] >= 31), 1, 0)482# pub_rec483df_temp = woe_ordered_continuous(df_inputs_prepr, 'pub_rec', df_targets_prepr)484df_temp485plot_by_woe(df_temp, 90)486# Categories '0-2', '3-4', '>=5'487df_inputs_prepr['pub_rec:0-2'] = np.where((df_inputs_prepr['pub_rec'] >= 0) & (df_inputs_prepr['pub_rec'] <= 2), 1, 0)488df_inputs_prepr['pub_rec:3-4'] = np.where((df_inputs_prepr['pub_rec'] >= 3) & (df_inputs_prepr['pub_rec'] <= 4), 1, 0)489df_inputs_prepr['pub_rec:>=5'] = np.where((df_inputs_prepr['pub_rec'] >= 5), 1, 0)490# total_acc491df_inputs_prepr['total_acc_factor'] = pd.cut(df_inputs_prepr['total_acc'], 50)492df_temp = woe_ordered_continuous(df_inputs_prepr, 'total_acc_factor', df_targets_prepr)493df_temp494plot_by_woe(df_temp, 90)495# Categories: '<=27', '28-51', '>51'496df_inputs_prepr['total_acc:<=27'] = np.where((df_inputs_prepr['total_acc'] <= 27), 1, 0)497df_inputs_prepr['total_acc:28-51'] = np.where((df_inputs_prepr['total_acc'] >= 28) & (df_inputs_prepr['total_acc'] <= 51), 1, 0)498df_inputs_prepr['total_acc:>=52'] = np.where((df_inputs_prepr['total_acc'] >= 52), 1, 0)499# acc_now_delinq500df_temp = woe_ordered_continuous(df_inputs_prepr, 'acc_now_delinq', df_targets_prepr)501df_temp502plot_by_woe(df_temp)503# Categories: '0', '>=1'504df_inputs_prepr['acc_now_delinq:0'] = np.where((df_inputs_prepr['acc_now_delinq'] == 0), 1, 0)505df_inputs_prepr['acc_now_delinq:>=1'] = np.where((df_inputs_prepr['acc_now_delinq'] >= 1), 1, 0)506# total_rev_hi_lim507df_inputs_prepr['total_rev_hi_lim_factor'] = pd.cut(df_inputs_prepr['total_rev_hi_lim'], 2000)508df_temp = woe_ordered_continuous(df_inputs_prepr, 'total_rev_hi_lim_factor', df_targets_prepr)509df_temp510plot_by_woe(df_temp.iloc[: 50, : ], 90)511# Categories512# '<=5K', '5K-10K', '10K-20K', '20K-30K', '30K-40K', '40K-55K', '55K-95K', '>95K'513df_inputs_prepr['total_rev_hi_lim:<=5K'] = np.where((df_inputs_prepr['total_rev_hi_lim'] <= 5000), 1, 0)514df_inputs_prepr['total_rev_hi_lim:5K-10K'] = np.where((df_inputs_prepr['total_rev_hi_lim'] > 5000) & (df_inputs_prepr['total_rev_hi_lim'] <= 10000), 1, 0)515df_inputs_prepr['total_rev_hi_lim:10K-20K'] = np.where((df_inputs_prepr['total_rev_hi_lim'] > 10000) & (df_inputs_prepr['total_rev_hi_lim'] <= 20000), 1, 0)516df_inputs_prepr['total_rev_hi_lim:20K-30K'] = np.where((df_inputs_prepr['total_rev_hi_lim'] > 20000) & (df_inputs_prepr['total_rev_hi_lim'] <= 30000), 1, 0)517df_inputs_prepr['total_rev_hi_lim:30K-40K'] = np.where((df_inputs_prepr['total_rev_hi_lim'] > 30000) & (df_inputs_prepr['total_rev_hi_lim'] <= 40000), 1, 0)518df_inputs_prepr['total_rev_hi_lim:40K-55K'] = np.where((df_inputs_prepr['total_rev_hi_lim'] > 40000) & (df_inputs_prepr['total_rev_hi_lim'] <= 55000), 1, 
0)
df_inputs_prepr['total_rev_hi_lim:55K-95K'] = np.where((df_inputs_prepr['total_rev_hi_lim'] > 55000) & (df_inputs_prepr['total_rev_hi_lim'] <= 95000), 1, 0)
df_inputs_prepr['total_rev_hi_lim:>95K'] = np.where((df_inputs_prepr['total_rev_hi_lim'] > 95000), 1, 0)
# PD model: Data Preparation: Continuous Variables, Part 2
# annual_inc
df_inputs_prepr['annual_inc_factor'] = pd.cut(df_inputs_prepr['annual_inc'], 50)
df_temp = woe_ordered_continuous(df_inputs_prepr, 'annual_inc_factor', df_targets_prepr)
df_temp
df_inputs_prepr['annual_inc_factor'] = pd.cut(df_inputs_prepr['annual_inc'], 100)
df_temp = woe_ordered_continuous(df_inputs_prepr, 'annual_inc_factor', df_targets_prepr)
df_temp
# Initial examination shows that there are too few individuals with large income and too many with small income.
# Hence, we are going to have one category for incomes above 140K, and we are going to apply our approach to determine
# the categories for everyone with an income of 140K or less.
df_inputs_prepr_temp = df_inputs_prepr.loc[df_inputs_prepr['annual_inc'] <= 140000, : ]
#loan_data_temp = loan_data_temp.reset_index(drop = True)
#df_inputs_prepr_temp
#pd.options.mode.chained_assignment = None
df_inputs_prepr_temp["annual_inc_factor"] = pd.cut(df_inputs_prepr_temp['annual_inc'], 50)
df_temp = woe_ordered_continuous(df_inputs_prepr_temp, 'annual_inc_factor', df_targets_prepr[df_inputs_prepr_temp.index])
df_temp
plot_by_woe(df_temp, 90)
# WoE is monotonically decreasing with income, so we split income into 10K-20K wide bands, plus one open-ended band above 140K.
df_inputs_prepr['annual_inc:<20K'] = np.where((df_inputs_prepr['annual_inc'] <= 20000), 1, 0)
df_inputs_prepr['annual_inc:20K-30K'] = np.where((df_inputs_prepr['annual_inc'] > 20000) & (df_inputs_prepr['annual_inc'] <= 30000), 1, 0)
df_inputs_prepr['annual_inc:30K-40K'] = np.where((df_inputs_prepr['annual_inc'] > 30000) & (df_inputs_prepr['annual_inc'] <= 40000), 1, 0)
df_inputs_prepr['annual_inc:40K-50K'] = np.where((df_inputs_prepr['annual_inc'] > 40000) & (df_inputs_prepr['annual_inc'] <= 50000), 1, 0)
df_inputs_prepr['annual_inc:50K-60K'] = np.where((df_inputs_prepr['annual_inc'] > 50000) & (df_inputs_prepr['annual_inc'] <= 60000), 1, 0)
df_inputs_prepr['annual_inc:60K-70K'] = np.where((df_inputs_prepr['annual_inc'] > 60000) & (df_inputs_prepr['annual_inc'] <= 70000), 1, 0)
df_inputs_prepr['annual_inc:70K-80K'] = np.where((df_inputs_prepr['annual_inc'] > 70000) & (df_inputs_prepr['annual_inc'] <= 80000), 1, 0)
df_inputs_prepr['annual_inc:80K-90K'] = np.where((df_inputs_prepr['annual_inc'] > 80000) & (df_inputs_prepr['annual_inc'] <= 90000), 1, 0)
df_inputs_prepr['annual_inc:90K-100K'] = np.where((df_inputs_prepr['annual_inc'] > 90000) & (df_inputs_prepr['annual_inc'] <= 100000), 1, 0)
df_inputs_prepr['annual_inc:100K-120K'] = np.where((df_inputs_prepr['annual_inc'] > 100000) & (df_inputs_prepr['annual_inc'] <= 120000), 1, 0)
df_inputs_prepr['annual_inc:120K-140K'] = np.where((df_inputs_prepr['annual_inc'] > 120000) & (df_inputs_prepr['annual_inc'] <= 140000), 1, 0)
df_inputs_prepr['annual_inc:>140K'] = np.where((df_inputs_prepr['annual_inc'] > 140000), 1, 0)
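# Aside: the interval dummies above (and for the other continuous variables) are written out one
# np.where(...) line per band. A small helper along the following lines could generate the same kind of
# dummies from a list of band edges. This is only an illustrative sketch, not part of the original
# notebook: the helper name and its label format are made up here, and the explicit np.where(...) lines
# remain the ones actually used to build the model inputs.
def add_interval_dummies(df, variable, edges):
    # edges is a sorted list of upper bounds, e.g. [20000, 30000, 40000].
    # The first dummy covers values up to edges[0], the last one values above edges[-1].
    lower = None
    for upper in edges:
        if lower is None:
            df[variable + ':<=' + str(upper)] = np.where(df[variable] <= upper, 1, 0)
        else:
            df[variable + ':' + str(lower) + '-' + str(upper)] = np.where((df[variable] > lower) & (df[variable] <= upper), 1, 0)
        lower = upper
    df[variable + ':>' + str(lower)] = np.where(df[variable] > lower, 1, 0)
    return df
# Hypothetical usage: add_interval_dummies(df_inputs_prepr, 'annual_inc', [20000, 30000, 40000, 50000])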
# dti
df_inputs_prepr['dti_factor'] = pd.cut(df_inputs_prepr['dti'], 100)
df_temp = woe_ordered_continuous(df_inputs_prepr, 'dti_factor', df_targets_prepr)
df_temp
plot_by_woe(df_temp, 90)
# Similarly to income, initial examination shows that most values are lower than 200.
# Hence, we are going to have one category for dti above 35, and we are going to apply our approach to determine
# the categories for everyone with a dti of 35 or less.
df_inputs_prepr_temp = df_inputs_prepr.loc[df_inputs_prepr['dti'] <= 35, : ]
df_inputs_prepr_temp['dti_factor'] = pd.cut(df_inputs_prepr_temp['dti'], 50)
df_temp = woe_ordered_continuous(df_inputs_prepr_temp, 'dti_factor', df_targets_prepr[df_inputs_prepr_temp.index])
df_temp
plot_by_woe(df_temp, 90)
# Categories:
df_inputs_prepr['dti:<=1.4'] = np.where((df_inputs_prepr['dti'] <= 1.4), 1, 0)
df_inputs_prepr['dti:1.4-3.5'] = np.where((df_inputs_prepr['dti'] > 1.4) & (df_inputs_prepr['dti'] <= 3.5), 1, 0)
df_inputs_prepr['dti:3.5-7.7'] = np.where((df_inputs_prepr['dti'] > 3.5) & (df_inputs_prepr['dti'] <= 7.7), 1, 0)
df_inputs_prepr['dti:7.7-10.5'] = np.where((df_inputs_prepr['dti'] > 7.7) & (df_inputs_prepr['dti'] <= 10.5), 1, 0)
df_inputs_prepr['dti:10.5-16.1'] = np.where((df_inputs_prepr['dti'] > 10.5) & (df_inputs_prepr['dti'] <= 16.1), 1, 0)
df_inputs_prepr['dti:16.1-20.3'] = np.where((df_inputs_prepr['dti'] > 16.1) & (df_inputs_prepr['dti'] <= 20.3), 1, 0)
df_inputs_prepr['dti:20.3-21.7'] = np.where((df_inputs_prepr['dti'] > 20.3) & (df_inputs_prepr['dti'] <= 21.7), 1, 0)
df_inputs_prepr['dti:21.7-22.4'] = np.where((df_inputs_prepr['dti'] > 21.7) & (df_inputs_prepr['dti'] <= 22.4), 1, 0)
df_inputs_prepr['dti:22.4-35'] = np.where((df_inputs_prepr['dti'] > 22.4) & (df_inputs_prepr['dti'] <= 35), 1, 0)
df_inputs_prepr['dti:>35'] = np.where((df_inputs_prepr['dti'] > 35), 1, 0)
# mths_since_last_delinq
# We have to create one category for missing values and do fine and coarse classing for the rest.
#loan_data_temp = loan_data[np.isfinite(loan_data['mths_since_last_delinq'])]
df_inputs_prepr_temp = df_inputs_prepr[pd.notnull(df_inputs_prepr['mths_since_last_delinq'])]
#sum(loan_data_temp['mths_since_last_delinq'].isnull())
df_inputs_prepr_temp['mths_since_last_delinq_factor'] = pd.cut(df_inputs_prepr_temp['mths_since_last_delinq'], 50)
df_temp = woe_ordered_continuous(df_inputs_prepr_temp, 'mths_since_last_delinq_factor', df_targets_prepr[df_inputs_prepr_temp.index])
df_temp
plot_by_woe(df_temp, 90)
# Categories: Missing, 0-3, 4-30, 31-56, >=57
df_inputs_prepr['mths_since_last_delinq:Missing'] = np.where((df_inputs_prepr['mths_since_last_delinq'].isnull()), 1, 0)
df_inputs_prepr['mths_since_last_delinq:0-3'] = np.where((df_inputs_prepr['mths_since_last_delinq'] >= 0) & (df_inputs_prepr['mths_since_last_delinq'] <= 3), 1, 0)
df_inputs_prepr['mths_since_last_delinq:4-30'] = np.where((df_inputs_prepr['mths_since_last_delinq'] >= 4) & (df_inputs_prepr['mths_since_last_delinq'] <= 30), 1, 0)
df_inputs_prepr['mths_since_last_delinq:31-56'] = np.where((df_inputs_prepr['mths_since_last_delinq'] >= 31) & (df_inputs_prepr['mths_since_last_delinq'] <= 56), 1, 0)
df_inputs_prepr['mths_since_last_delinq:>=57'] = np.where((df_inputs_prepr['mths_since_last_delinq'] >= 57), 1, 0)
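# Aside: the same fine-classing pattern (pd.cut into equal-width bins, WoE table, WoE plot) is repeated
# for every continuous variable in this notebook. A small wrapper such as the one sketched below could
# cut down on the repetition; it is only an illustration built on the woe_ordered_continuous and
# plot_by_woe functions already defined above, and the wrapper name itself is made up here.
def fine_class_and_plot(inputs_df, targets_series, variable, n_bins = 50, rotation = 90):
    # Fine classing: split the variable into n_bins equal-width intervals.
    factor_name = variable + '_factor'
    inputs_df[factor_name] = pd.cut(inputs_df[variable], n_bins)
    # WoE table for the binned variable, followed by the usual WoE plot.
    df_woe = woe_ordered_continuous(inputs_df, factor_name, targets_series)
    plot_by_woe(df_woe, rotation)
    return df_woe
# Hypothetical usage: df_temp = fine_class_and_plot(df_inputs_prepr, df_targets_prepr, 'dti', 100)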
# mths_since_last_record
# We have to create one category for missing values and do fine and coarse classing for the rest.
df_inputs_prepr_temp = df_inputs_prepr[pd.notnull(df_inputs_prepr['mths_since_last_record'])]
#sum(loan_data_temp['mths_since_last_record'].isnull())
df_inputs_prepr_temp['mths_since_last_record_factor'] = pd.cut(df_inputs_prepr_temp['mths_since_last_record'], 50)
df_temp = woe_ordered_continuous(df_inputs_prepr_temp, 'mths_since_last_record_factor', df_targets_prepr[df_inputs_prepr_temp.index])
df_temp
plot_by_woe(df_temp, 90)
# Categories: 'Missing', '0-2', '3-20', '21-31', '32-80', '81-86', '>=86'
# Note: the last two intervals overlap at 86; observations with a value of exactly 86 get a 1 in both dummies.
df_inputs_prepr['mths_since_last_record:Missing'] = np.where((df_inputs_prepr['mths_since_last_record'].isnull()), 1, 0)
df_inputs_prepr['mths_since_last_record:0-2'] = np.where((df_inputs_prepr['mths_since_last_record'] >= 0) & (df_inputs_prepr['mths_since_last_record'] <= 2), 1, 0)
df_inputs_prepr['mths_since_last_record:3-20'] = np.where((df_inputs_prepr['mths_since_last_record'] >= 3) & (df_inputs_prepr['mths_since_last_record'] <= 20), 1, 0)
df_inputs_prepr['mths_since_last_record:21-31'] = np.where((df_inputs_prepr['mths_since_last_record'] >= 21) & (df_inputs_prepr['mths_since_last_record'] <= 31), 1, 0)
df_inputs_prepr['mths_since_last_record:32-80'] = np.where((df_inputs_prepr['mths_since_last_record'] >= 32) & (df_inputs_prepr['mths_since_last_record'] <= 80), 1, 0)
df_inputs_prepr['mths_since_last_record:81-86'] = np.where((df_inputs_prepr['mths_since_last_record'] >= 81) & (df_inputs_prepr['mths_since_last_record'] <= 86), 1, 0)
df_inputs_prepr['mths_since_last_record:>=86'] = np.where((df_inputs_prepr['mths_since_last_record'] >= 86), 1, 0)
df_inputs_prepr['mths_since_last_delinq:Missing'].sum()
# display inputs_train, inputs_test
# funded_amnt
df_inputs_prepr['funded_amnt_factor'] = pd.cut(df_inputs_prepr['funded_amnt'], 50)
df_temp = woe_ordered_continuous(df_inputs_prepr, 'funded_amnt_factor', df_targets_prepr)
df_temp
plot_by_woe(df_temp, 90)
# WON'T USE because there is no clear trend, even if segments of the whole range are considered.
# installment
df_inputs_prepr['installment_factor'] = pd.cut(df_inputs_prepr['installment'], 50)
df_temp = woe_ordered_continuous(df_inputs_prepr, 'installment_factor', df_targets_prepr)
df_temp
plot_by_woe(df_temp, 90)
# WON'T USE because there is no clear trend, even if segments of the whole range are considered.
### Storing the preprocessed 2015 data
#####
#loan_data_inputs_train = df_inputs_prepr
#####
#loan_data_inputs_test = df_inputs_prepr
######
loan_data_inputs_2015 = df_inputs_prepr
loan_data_targets_2015 = df_targets_prepr
#loan_data_inputs_train.columns.values
#loan_data_inputs_test.columns.values
#loan_data_inputs_train.shape
#loan_data_targets_train.shape
#loan_data_inputs_test.shape
#loan_data_targets_test.shape
loan_data_inputs_2015.columns.values
loan_data_inputs_2015.shape
loan_data_targets_2015.shape
#loan_data_inputs_train.to_csv('loan_data_inputs_train.csv')
#loan_data_targets_train.to_csv('loan_data_targets_train.csv')
#loan_data_inputs_test.to_csv('loan_data_inputs_test.csv')
#loan_data_targets_test.to_csv('loan_data_targets_test.csv')
loan_data_inputs_2015.to_csv('loan_data_inputs_2015.csv')
loan_data_targets_2015.to_csv('loan_data_targets_2015.csv')
### >>> The code up to here, from the other line starting with '>>>', is copied from the Data Preparation notebook, with minor adjustments.
inputs_train_with_ref_cat = pd.read_csv('inputs_train_with_ref_cat.csv', index_col = 0)
# We import the dataset with old data, i.e. 
"expected" data.648# From the dataframe with new, "actual" data, we keep only the relevant columns.649inputs_2015_with_ref_cat = loan_data_inputs_2015.loc[: , ['grade:A',650'grade:B',651'grade:C',652'grade:D',653'grade:E',654'grade:F',655'grade:G',656'home_ownership:RENT_OTHER_NONE_ANY',657'home_ownership:OWN',658'home_ownership:MORTGAGE',659'addr_state:ND_NE_IA_NV_FL_HI_AL',660'addr_state:NM_VA',661'addr_state:NY',662'addr_state:OK_TN_MO_LA_MD_NC',663'addr_state:CA',664'addr_state:UT_KY_AZ_NJ',665'addr_state:AR_MI_PA_OH_MN',666'addr_state:RI_MA_DE_SD_IN',667'addr_state:GA_WA_OR',668'addr_state:WI_MT',669'addr_state:TX',670'addr_state:IL_CT',671'addr_state:KS_SC_CO_VT_AK_MS',672'addr_state:WV_NH_WY_DC_ME_ID',673'verification_status:Not Verified',674'verification_status:Source Verified',675'verification_status:Verified',676'purpose:educ__sm_b__wedd__ren_en__mov__house',677'purpose:credit_card',678'purpose:debt_consolidation',679'purpose:oth__med__vacation',680'purpose:major_purch__car__home_impr',681'initial_list_status:f',682'initial_list_status:w',683'term:36',684'term:60',685'emp_length:0',686'emp_length:1',687'emp_length:2-4',688'emp_length:5-6',689'emp_length:7-9',690'emp_length:10',691'mths_since_issue_d:<38',692'mths_since_issue_d:38-39',693'mths_since_issue_d:40-41',694'mths_since_issue_d:42-48',695'mths_since_issue_d:49-52',696'mths_since_issue_d:53-64',697'mths_since_issue_d:65-84',698'mths_since_issue_d:>84',699'int_rate:<9.548',700'int_rate:9.548-12.025',701'int_rate:12.025-15.74',702'int_rate:15.74-20.281',703'int_rate:>20.281',704'mths_since_earliest_cr_line:<140',705'mths_since_earliest_cr_line:141-164',706'mths_since_earliest_cr_line:165-247',707'mths_since_earliest_cr_line:248-270',708'mths_since_earliest_cr_line:271-352',709'mths_since_earliest_cr_line:>352',710'inq_last_6mths:0',711'inq_last_6mths:1-2',712'inq_last_6mths:3-6',713'inq_last_6mths:>6',714'acc_now_delinq:0',715'acc_now_delinq:>=1',716'annual_inc:<20K',717'annual_inc:20K-30K',718'annual_inc:30K-40K',719'annual_inc:40K-50K',720'annual_inc:50K-60K',721'annual_inc:60K-70K',722'annual_inc:70K-80K',723'annual_inc:80K-90K',724'annual_inc:90K-100K',725'annual_inc:100K-120K',726'annual_inc:120K-140K',727'annual_inc:>140K',728'dti:<=1.4',729'dti:1.4-3.5',730'dti:3.5-7.7',731'dti:7.7-10.5',732'dti:10.5-16.1',733'dti:16.1-20.3',734'dti:20.3-21.7',735'dti:21.7-22.4',736'dti:22.4-35',737'dti:>35',738'mths_since_last_delinq:Missing',739'mths_since_last_delinq:0-3',740'mths_since_last_delinq:4-30',741'mths_since_last_delinq:31-56',742'mths_since_last_delinq:>=57',743'mths_since_last_record:Missing',744'mths_since_last_record:0-2',745'mths_since_last_record:3-20',746'mths_since_last_record:21-31',747'mths_since_last_record:32-80',748'mths_since_last_record:81-86',749'mths_since_last_record:>=86',750]]751inputs_train_with_ref_cat.shape752inputs_2015_with_ref_cat.shape753df_scorecard = pd.read_csv('df_scorecard.csv', index_col = 0)754# We import the scorecard.755df_scorecard756inputs_train_with_ref_cat_w_intercept = inputs_train_with_ref_cat757inputs_train_with_ref_cat_w_intercept.insert(0, 'Intercept', 1)758# We insert a column in the dataframe, with an index of 0, that is, in the beginning of the dataframe.759# The name of that column is 'Intercept', and its values are 1s.760inputs_train_with_ref_cat_w_intercept = inputs_train_with_ref_cat_w_intercept[df_scorecard['Feature name'].values]761# Here, from the 'inputs_train_with_ref_cat_w_intercept' dataframe, we keep only the columns with column names,762# exactly equal to the 
row values of the 'Feature name' column from the 'df_scorecard' dataframe.763inputs_train_with_ref_cat_w_intercept.head()764inputs_2015_with_ref_cat_w_intercept = inputs_2015_with_ref_cat765inputs_2015_with_ref_cat_w_intercept.insert(0, 'Intercept', 1)766# We insert a column in the dataframe, with an index of 0, that is, in the beginning of the dataframe.767# The name of that column is 'Intercept', and its values are 1s.768inputs_2015_with_ref_cat_w_intercept = inputs_2015_with_ref_cat_w_intercept[df_scorecard['Feature name'].values]769# Here, from the 'inputs_train_with_ref_cat_w_intercept' dataframe, we keep only the columns with column names,770# exactly equal to the row values of the 'Feature name' column from the 'df_scorecard' dataframe.771inputs_2015_with_ref_cat_w_intercept.head()772scorecard_scores = df_scorecard['Score - Final']773scorecard_scores = scorecard_scores.values.reshape(102, 1)774y_scores_train = inputs_train_with_ref_cat_w_intercept.dot(scorecard_scores)775# Here we multiply the values of each row of the dataframe by the values of each column of the variable,776# which is an argument of the 'dot' method, and sum them. It's essentially the sum of the products.777y_scores_train.head()778y_scores_2015 = inputs_2015_with_ref_cat_w_intercept.dot(scorecard_scores)779# Here we multiply the values of each row of the dataframe by the values of each column of the variable,780# which is an argument of the 'dot' method, and sum them. It's essentially the sum of the products.781y_scores_2015.head()782inputs_train_with_ref_cat_w_intercept = pd.concat([inputs_train_with_ref_cat_w_intercept, y_scores_train], axis = 1)783inputs_2015_with_ref_cat_w_intercept = pd.concat([inputs_2015_with_ref_cat_w_intercept, y_scores_2015], axis = 1)784# Here we concatenate the scores we calculated with the rest of the variables in the two dataframes:785# the one with old ("expected") data and the one with new ("actual") data.786inputs_train_with_ref_cat_w_intercept.columns.values[inputs_train_with_ref_cat_w_intercept.shape[1] - 1] = 'Score'787inputs_2015_with_ref_cat_w_intercept.columns.values[inputs_2015_with_ref_cat_w_intercept.shape[1] - 1] = 'Score'788# Here we rename the columns containing scores to "Score" in both dataframes.789inputs_2015_with_ref_cat_w_intercept.head()790inputs_train_with_ref_cat_w_intercept['Score:300-350'] = np.where((inputs_train_with_ref_cat_w_intercept['Score'] >= 300) & (inputs_train_with_ref_cat_w_intercept['Score'] < 350), 1, 0)791inputs_train_with_ref_cat_w_intercept['Score:350-400'] = np.where((inputs_train_with_ref_cat_w_intercept['Score'] >= 350) & (inputs_train_with_ref_cat_w_intercept['Score'] < 400), 1, 0)792inputs_train_with_ref_cat_w_intercept['Score:400-450'] = np.where((inputs_train_with_ref_cat_w_intercept['Score'] >= 400) & (inputs_train_with_ref_cat_w_intercept['Score'] < 450), 1, 0)793inputs_train_with_ref_cat_w_intercept['Score:450-500'] = np.where((inputs_train_with_ref_cat_w_intercept['Score'] >= 450) & (inputs_train_with_ref_cat_w_intercept['Score'] < 500), 1, 0)794inputs_train_with_ref_cat_w_intercept['Score:500-550'] = np.where((inputs_train_with_ref_cat_w_intercept['Score'] >= 500) & (inputs_train_with_ref_cat_w_intercept['Score'] < 550), 1, 0)795inputs_train_with_ref_cat_w_intercept['Score:550-600'] = np.where((inputs_train_with_ref_cat_w_intercept['Score'] >= 550) & (inputs_train_with_ref_cat_w_intercept['Score'] < 600), 1, 0)796inputs_train_with_ref_cat_w_intercept['Score:600-650'] = np.where((inputs_train_with_ref_cat_w_intercept['Score'] >= 
600) & (inputs_train_with_ref_cat_w_intercept['Score'] < 650), 1, 0)
inputs_train_with_ref_cat_w_intercept['Score:650-700'] = np.where((inputs_train_with_ref_cat_w_intercept['Score'] >= 650) & (inputs_train_with_ref_cat_w_intercept['Score'] < 700), 1, 0)
inputs_train_with_ref_cat_w_intercept['Score:700-750'] = np.where((inputs_train_with_ref_cat_w_intercept['Score'] >= 700) & (inputs_train_with_ref_cat_w_intercept['Score'] < 750), 1, 0)
inputs_train_with_ref_cat_w_intercept['Score:750-800'] = np.where((inputs_train_with_ref_cat_w_intercept['Score'] >= 750) & (inputs_train_with_ref_cat_w_intercept['Score'] < 800), 1, 0)
inputs_train_with_ref_cat_w_intercept['Score:800-850'] = np.where((inputs_train_with_ref_cat_w_intercept['Score'] >= 800) & (inputs_train_with_ref_cat_w_intercept['Score'] <= 850), 1, 0)
# We create dummy variables for score intervals in the dataframe with the old ("expected") data.
inputs_2015_with_ref_cat_w_intercept['Score:300-350'] = np.where((inputs_2015_with_ref_cat_w_intercept['Score'] >= 300) & (inputs_2015_with_ref_cat_w_intercept['Score'] < 350), 1, 0)
inputs_2015_with_ref_cat_w_intercept['Score:350-400'] = np.where((inputs_2015_with_ref_cat_w_intercept['Score'] >= 350) & (inputs_2015_with_ref_cat_w_intercept['Score'] < 400), 1, 0)
inputs_2015_with_ref_cat_w_intercept['Score:400-450'] = np.where((inputs_2015_with_ref_cat_w_intercept['Score'] >= 400) & (inputs_2015_with_ref_cat_w_intercept['Score'] < 450), 1, 0)
inputs_2015_with_ref_cat_w_intercept['Score:450-500'] = np.where((inputs_2015_with_ref_cat_w_intercept['Score'] >= 450) & (inputs_2015_with_ref_cat_w_intercept['Score'] < 500), 1, 0)
inputs_2015_with_ref_cat_w_intercept['Score:500-550'] = np.where((inputs_2015_with_ref_cat_w_intercept['Score'] >= 500) & (inputs_2015_with_ref_cat_w_intercept['Score'] < 550), 1, 0)
inputs_2015_with_ref_cat_w_intercept['Score:550-600'] = np.where((inputs_2015_with_ref_cat_w_intercept['Score'] >= 550) & (inputs_2015_with_ref_cat_w_intercept['Score'] < 600), 1, 0)
inputs_2015_with_ref_cat_w_intercept['Score:600-650'] = np.where((inputs_2015_with_ref_cat_w_intercept['Score'] >= 600) & (inputs_2015_with_ref_cat_w_intercept['Score'] < 650), 1, 0)
inputs_2015_with_ref_cat_w_intercept['Score:650-700'] = np.where((inputs_2015_with_ref_cat_w_intercept['Score'] >= 650) & (inputs_2015_with_ref_cat_w_intercept['Score'] < 700), 1, 0)
inputs_2015_with_ref_cat_w_intercept['Score:700-750'] = np.where((inputs_2015_with_ref_cat_w_intercept['Score'] >= 700) & (inputs_2015_with_ref_cat_w_intercept['Score'] < 750), 1, 0)
inputs_2015_with_ref_cat_w_intercept['Score:750-800'] = np.where((inputs_2015_with_ref_cat_w_intercept['Score'] >= 750) & (inputs_2015_with_ref_cat_w_intercept['Score'] < 800), 1, 0)
inputs_2015_with_ref_cat_w_intercept['Score:800-850'] = np.where((inputs_2015_with_ref_cat_w_intercept['Score'] >= 800) & (inputs_2015_with_ref_cat_w_intercept['Score'] <= 850), 1, 0)
# We create dummy variables for score intervals in the dataframe with the new ("actual") data.
# Population Stability Index: Calculation and Interpretation
PSI_calc_train = inputs_train_with_ref_cat_w_intercept.sum() / inputs_train_with_ref_cat_w_intercept.shape[0]
# We create a series with the proportion of observations for each dummy variable in the old ("expected") data.
PSI_calc_2015 = inputs_2015_with_ref_cat_w_intercept.sum() / inputs_2015_with_ref_cat_w_intercept.shape[0]
# We create a series with the proportion of observations for each dummy variable in the new ("actual") data.
PSI_calc = pd.concat([PSI_calc_train, PSI_calc_2015], axis = 1)
# We concatenate the two dataframes along the columns.
PSI_calc = PSI_calc.reset_index()
# We reset the index of the dataframe. The index becomes from 0 to the total number of rows less one.
# The old index, which is the dummy variable name, becomes a column, named 'index'.
PSI_calc['Original feature name'] = PSI_calc['index'].str.split(':').str[0]
# We create a new column, called 'Original feature name', which contains the part of the dummy variable name
# up to the colon, i.e. the name of the original variable the dummy comes from.
PSI_calc.columns = ['index', 'Proportions_Train', 'Proportions_New', 'Original feature name']
# We change the names of the columns of the dataframe.
PSI_calc = PSI_calc[np.array(['index', 'Original feature name', 'Proportions_Train', 'Proportions_New'])]
PSI_calc
PSI_calc = PSI_calc[(PSI_calc['index'] != 'Intercept') & (PSI_calc['index'] != 'Score')]
# We remove the rows with values in the 'index' column 'Intercept' and 'Score'.
PSI_calc['Contribution'] = np.where((PSI_calc['Proportions_Train'] == 0) | (PSI_calc['Proportions_New'] == 0), 0, (PSI_calc['Proportions_New'] - PSI_calc['Proportions_Train']) * np.log(PSI_calc['Proportions_New'] / PSI_calc['Proportions_Train']))
# We calculate the contribution of each dummy variable to the PSI of the original variable it comes from.
# If either the proportion in the old data or the proportion in the new data is 0, the contribution is set to 0.
# Otherwise, we apply the PSI formula for each contribution.
PSI_calc
PSI_calc.groupby('Original feature name')['Contribution'].sum()
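# The groupby above yields one PSI value per original variable. As a reading aid, a commonly quoted rule
# of thumb is: PSI below roughly 0.1 suggests little or no shift in the population, values between roughly
# 0.1 and 0.25 suggest a moderate shift, and values above roughly 0.25 suggest a significant shift that
# would normally prompt a closer look at the variable (or at the model as a whole). The helper below is
# only an illustrative sketch of the same formula used in the 'Contribution' column; the function name is
# made up here and it is not part of the original notebook.
def psi_from_proportions(expected_prop, actual_prop):
    # Both arguments are array-likes of proportions over the same set of categories (e.g. dummy variables).
    expected_prop = np.asarray(expected_prop, dtype = float)
    actual_prop = np.asarray(actual_prop, dtype = float)
    # As in the notebook, categories where either proportion is 0 contribute 0 to the PSI.
    with np.errstate(divide = 'ignore', invalid = 'ignore'):
        contributions = np.where((expected_prop == 0) | (actual_prop == 0), 0,
                                 (actual_prop - expected_prop) * np.log(actual_prop / expected_prop))
    return contributions.sum()
# Example on made-up proportions for a three-category variable:
psi_from_proportions([0.5, 0.3, 0.2], [0.45, 0.3, 0.25])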
Source:2-PD Model Data Preparation.py  
1#!/usr/bin/env python32# -*- coding: utf-8 -*-3"""4Created on Fri Apr 24 17:04:12 20205@author: barry6"""7# PD model8## Data preparation9### Dependent Variable. Good/ Bad (Default) Definition. Default and Non-default Accounts.10loan_data['loan_status'].unique()11# Displays unique values of a column.12loan_data['loan_status'].value_counts()13# Calculates the number of observations for each unique value of a variable.14loan_data['loan_status'].value_counts() / loan_data['loan_status'].count()15# We divide the number of observations for each unique value of a variable by the total number of observations.16# Thus, we get the proportion of observations for each unique value of a variable.17# Good/ Bad Definition18loan_data['good_bad'] = np.where(loan_data['loan_status'].isin(['Charged Off', 'Default',19                                                       'Does not meet the credit policy. Status:Charged Off',20                                                       'Late (31-120 days)']), 0, 1)21# We create a new variable that has the value of '0' if a condition is met, and the value of '1' if it is not met.22loan_data['good_bad']23### Splitting Data24from sklearn.model_selection import train_test_split25# Imports the libraries we need.26train_test_split(loan_data.drop('good_bad', axis = 1), loan_data['good_bad'])27# Takes a set of inputs and a set of targets as arguments. Splits the inputs and the targets into four dataframes:28#  input and output29# Inputs - Train, Inputs - Test, Targets - Train, Targets - Test.30loan_data_inputs_train, loan_data_inputs_test, loan_data_targets_train, loan_data_targets_test = train_test_split(loan_data.drop('good_bad', axis = 1), loan_data['good_bad'])31# We split two dataframes with inputs and targets, each into a train and test dataframe, and store them in variables.32loan_data_inputs_train.shape33# Displays the size of the dataframe.34loan_data_targets_train.shape35# Displays the size of the dataframe.36loan_data_inputs_test.shape37# Displays the size of the dataframe.38loan_data_targets_test.shape39# Displays the size of the dataframe.40loan_data_inputs_train, loan_data_inputs_test, loan_data_targets_train, loan_data_targets_test = train_test_split(loan_data.drop('good_bad', axis = 1), loan_data['good_bad'], test_size = 0.2, random_state = 42)41# We split two dataframes with inputs and targets, each into a train and test dataframe, and store them in variables.42# This time we set the size of the test dataset to be 20%.43# Respectively, the size of the train dataset becomes 80%.44# We also set a specific random state.45# This would allow us to perform the exact same split multimple times.46# This means, to assign the exact same observations to the train and test datasets.47loan_data_inputs_train.shape48# Displays the size of the dataframe.49loan_data_targets_train.shape50# Displays the size of the dataframe.51loan_data_inputs_test.shape52# Displays the size of the dataframe.53loan_data_targets_test.shape54# Displays the size of the dataframe.55### Data Preparation: An Example56#####57df_inputs_prepr = loan_data_inputs_train58df_targets_prepr = loan_data_targets_train59#####60#df_inputs_prepr = loan_data_inputs_test61#df_targets_prepr = loan_data_targets_test62df_inputs_prepr['grade'].unique()63# Displays unique values of a column.64df1 = pd.concat([df_inputs_prepr['grade'], df_targets_prepr], axis = 1)65# Concatenates two dataframes along the columns.66df1.head()67df1.groupby(df1.columns.values[0], as_index = False)[df1.columns.values[1]].count()68# 
df.groupby(series1)[series2].method69# group series2 from dataframe df by series1 from dataframe df and aggregate series270# Groups the data according to a criterion contained in one column.71# Does not turn the names of the values of the criterion as indexes.72# Aggregates the data in another column, using a selected function.73# In this specific case, we group by the column with index 0 and we aggregate the values of the column with index 1.74# More specifically, we count them.75# In other words, we count the values in the column with index 1 for each value of the column with index 0.76df1.groupby(df1.columns.values[0], as_index = False)[df1.columns.values[1]].mean()77# Groups the data according to a criterion contained in one column.78# Does not turn the names of the values of the criterion as indexes.79# Aggregates the data in another column, using a selected function.80# Here we calculate the mean of the values in the column with index 1 for each value of the column with index 0.81df1 = pd.concat([df1.groupby(df1.columns.values[0], as_index = False)[df1.columns.values[1]].count(),82                df1.groupby(df1.columns.values[0], as_index = False)[df1.columns.values[1]].mean()], axis = 1)83# Concatenates two dataframes along the columns.84df185df1 = df1.iloc[:, [0, 1, 3]]86# Selects only columns with specific indexes.87df188df1.columns = [df1.columns.values[0], 'n_obs', 'prop_good']89# Changes the names of the columns of a dataframe.90df191df1['prop_n_obs'] = df1['n_obs'] / df1['n_obs'].sum()92# We divide the values of one column by he values of another column and save the result in a new variable.93df194df1['n_good'] = df1['prop_good'] * df1['n_obs']95# We multiply the values of one column by he values of another column and save the result in a new variable.96df1['n_bad'] = (1 - df1['prop_good']) * df1['n_obs']97df198df1['prop_n_good'] = df1['n_good'] / df1['n_good'].sum()99df1['prop_n_bad'] = df1['n_bad'] / df1['n_bad'].sum()100df1101df1['WoE'] = np.log(df1['prop_n_good'] / df1['prop_n_bad'])102# We take the natural logarithm of a variable and save the result in a nex variable.103df1104df1['WoE'] = np.log(df1['prop_n_good'] / df1['prop_n_bad'])105# We take the natural logarithm of a variable and save the result in a nex variable.106df1107df1 = df1.sort_values(['WoE'])108# Sorts a dataframe by the values of a given column.109df1 = df1.reset_index(drop = True)110# We reset the index of a dataframe and overwrite it.111df1112df1['diff_prop_good'] = df1['prop_good'].diff().abs()113# We take the difference between two subsequent values of a column. Then, we take the absolute value of the result.114df1['diff_WoE'] = df1['WoE'].diff().abs()115# We take the difference between two subsequent values of a column. 
# Then, we take the absolute value of the result.
df1
df1['IV'] = (df1['prop_n_good'] - df1['prop_n_bad']) * df1['WoE']
df1['IV'] = df1['IV'].sum()
# We sum all values of a given column.
df1
### Preprocessing Discrete Variables: Automating Calculations
# WoE function for discrete unordered variables
def woe_discrete(df, discrete_variable_name, good_bad_variable_df):
    df = pd.concat([df[discrete_variable_name], good_bad_variable_df], axis = 1)
    df = pd.concat([df.groupby(df.columns.values[0], as_index = False)[df.columns.values[1]].count(),
                    df.groupby(df.columns.values[0], as_index = False)[df.columns.values[1]].mean()], axis = 1)
    df = df.iloc[:, [0, 1, 3]]
    df.columns = [df.columns.values[0], 'n_obs', 'prop_good']
    df['prop_n_obs'] = df['n_obs'] / df['n_obs'].sum()
    df['n_good'] = df['prop_good'] * df['n_obs']
    df['n_bad'] = (1 - df['prop_good']) * df['n_obs']
    df['prop_n_good'] = df['n_good'] / df['n_good'].sum()
    df['prop_n_bad'] = df['n_bad'] / df['n_bad'].sum()
    df['WoE'] = np.log(df['prop_n_good'] / df['prop_n_bad'])
    df = df.sort_values(['WoE'])
    df = df.reset_index(drop = True)
    df['diff_prop_good'] = df['prop_good'].diff().abs()
    df['diff_WoE'] = df['WoE'].diff().abs()
    df['IV'] = (df['prop_n_good'] - df['prop_n_bad']) * df['WoE']
    df['IV'] = df['IV'].sum()
    return df
# Here we combine all of the operations above in a function.
# The function takes 3 arguments: a dataframe, a string, and a dataframe. The function returns a dataframe as a result.
# 'grade'
df_temp = woe_discrete(df_inputs_prepr, 'grade', df_targets_prepr)
# We execute the function we defined with the necessary arguments: a dataframe, a string, and a dataframe.
# We store the result in a dataframe.
df_temp
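# To see what woe_discrete produces, it can help to run it on a tiny hand-made example. The toy data
# below are made up purely for illustration (they are not part of the course dataset): ten observations
# of a two-level variable together with a 0/1 good_bad target.
toy_inputs = pd.DataFrame({'grade_toy': ['A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B', 'B']})
toy_targets = pd.Series([1, 1, 1, 0, 1, 1, 1, 0, 0, 0], name = 'good_bad')
woe_discrete(toy_inputs, 'grade_toy', toy_targets)
# For 'A', 3 of 4 observations are good; for 'B', 3 of 6 are good, so 'A' gets the higher (positive) WoE,
# and the IV column repeats the total information value of the toy variable in every row.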
### Preprocessing Discrete Variables: Visualizing Results
import matplotlib.pyplot as plt
import seaborn as sns
# Imports the libraries we need.
sns.set()
# We set the default style of the graphs to the seaborn style.
# Below we define a function that takes 2 arguments: a dataframe and a number.
# The number parameter has a default value of 0.
# This means that if we call the function and omit the number parameter, it will be executed with it having a value of 0.
# The function displays a graph.
def plot_by_woe(df_WoE, rotation_of_x_axis_labels = 0):
    x = np.array(df_WoE.iloc[:, 0].apply(str))
    # Turns the values of the column with index 0 to strings, makes an array from these strings, and passes it to variable x.
    y = df_WoE['WoE']
    # Selects the column with label 'WoE' and passes it to variable y.
    plt.figure(figsize=(18, 6))
    # Sets the graph size to width 18 x height 6.
    plt.plot(x, y, marker = 'o', linestyle = '--', color = 'k')
    # Plots the datapoints with coordinates variable x on the x-axis and variable y on the y-axis.
    # Sets the marker for each datapoint to a circle, the line style between the points to dashed, and the color to black.
    plt.xlabel(df_WoE.columns[0])
    # Names the x-axis with the name of the column with index 0.
    plt.ylabel('Weight of Evidence')
    # Names the y-axis 'Weight of Evidence'.
    plt.title(str('Weight of Evidence by ' + df_WoE.columns[0]))
    # Names the graph 'Weight of Evidence by ' plus the name of the column with index 0.
    plt.xticks(rotation = rotation_of_x_axis_labels)
    # Rotates the labels of the x-axis a predefined number of degrees.
plot_by_woe(df_temp)
# We execute the function we defined with the necessary argument: a dataframe.
# We omit the number argument, which means the function will use its default value, 0.
### Preprocessing Discrete Variables: Creating Dummy Variables, Part 1
# 'home_ownership'
df_temp = woe_discrete(df_inputs_prepr, 'home_ownership', df_targets_prepr)
# We calculate weight of evidence.
df_temp
plot_by_woe(df_temp)
# We plot the weight of evidence values.
# There are many categories with very few observations and many categories with very different "good" %.
# Therefore, we create a new discrete variable where we combine some of the categories.
# 'OTHER' and 'NONE' are riskiest but are very few. 'RENT' is the next riskiest.
# 'ANY' are least risky but are too few. Conceptually, they belong to the same category. Also, their inclusion would not change anything.
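# Note (illustrative, not from the original section): the 'home_ownership:RENT'-style 0/1 columns
# combined below are assumed to have been created earlier in the notebook from the raw categorical
# column, typically with pd.get_dummies and a ':' separator. A tiny standalone sketch of that
# pattern on made-up data:
_toy = pd.DataFrame({'home_ownership': ['RENT', 'OWN', 'MORTGAGE', 'RENT']})
pd.concat([_toy, pd.get_dummies(_toy['home_ownership'], prefix = 'home_ownership', prefix_sep = ':')], axis = 1)
# This yields indicator columns 'home_ownership:MORTGAGE', 'home_ownership:OWN', 'home_ownership:RENT',
# which is the format the combination code below relies on.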
# We combine them in one category, 'RENT_OTHER_NONE_ANY'.
# We end up with 3 categories: 'RENT_OTHER_NONE_ANY', 'OWN', 'MORTGAGE'.
df_inputs_prepr['home_ownership:RENT_OTHER_NONE_ANY'] = sum([df_inputs_prepr['home_ownership:RENT'], df_inputs_prepr['home_ownership:OTHER'],
                                                             df_inputs_prepr['home_ownership:NONE'], df_inputs_prepr['home_ownership:ANY']])
# 'RENT_OTHER_NONE_ANY' will be the reference category.
# Alternatively:
# df_inputs_prepr.loc[df_inputs_prepr['home_ownership'].isin(['RENT', 'OTHER', 'NONE', 'ANY']), 'home_ownership:RENT_OTHER_NONE_ANY'] = 1
# df_inputs_prepr.loc[~df_inputs_prepr['home_ownership'].isin(['RENT', 'OTHER', 'NONE', 'ANY']), 'home_ownership:RENT_OTHER_NONE_ANY'] = 0
# df_inputs_prepr.loc[df_inputs_prepr['home_ownership'] == 'OWN', 'home_ownership:OWN'] = 1
# df_inputs_prepr.loc[df_inputs_prepr['home_ownership'] != 'OWN', 'home_ownership:OWN'] = 0
# df_inputs_prepr.loc[df_inputs_prepr['home_ownership'] == 'MORTGAGE', 'home_ownership:MORTGAGE'] = 1
# df_inputs_prepr.loc[df_inputs_prepr['home_ownership'] != 'MORTGAGE', 'home_ownership:MORTGAGE'] = 0
### Preprocessing Discrete Variables: Creating Dummy Variables, Part 2
# 'addr_state'
df_inputs_prepr['addr_state'].unique()
df_temp = woe_discrete(df_inputs_prepr, 'addr_state', df_targets_prepr)
# We calculate weight of evidence.
df_temp
plot_by_woe(df_temp)
# We plot the weight of evidence values.
if 'addr_state:ND' in df_inputs_prepr.columns.values:
    pass
else:
    df_inputs_prepr['addr_state:ND'] = 0
# If there are no borrowers from North Dakota in this split, no 'addr_state:ND' dummy exists, so we add an all-zero one.
plot_by_woe(df_temp.iloc[2: -2, : ])
# We plot the weight of evidence values, leaving out the two extreme categories at each end.
plot_by_woe(df_temp.iloc[6: -6, : ])
# We plot the weight of evidence values, leaving out the six extreme categories at each end.
# We create the following categories:
# 'ND' 'NE' 'IA' 'NV' 'FL' 'HI' 'AL'
# 'NM' 'VA'
# 'NY'
# 'OK' 'TN' 'MO' 'LA' 'MD' 'NC'
# 'CA'
# 'UT' 'KY' 'AZ' 'NJ'
# 'AR' 'MI' 'PA' 'OH' 'MN'
# 'RI' 'MA' 'DE' 'SD' 'IN'
# 'GA' 'WA' 'OR'
# 'WI' 'MT'
# 'TX'
# 'IL' 'CT'
# 'KS' 'SC' 'CO' 'VT' 'AK' 'MS'
# 'WV' 'NH' 'WY' 'DC' 'ME' 'ID'
# 'ND_NE_IA_NV_FL_HI_AL' will be the reference category.
df_inputs_prepr['addr_state:ND_NE_IA_NV_FL_HI_AL'] = sum([df_inputs_prepr['addr_state:ND'], df_inputs_prepr['addr_state:NE'],
                                              df_inputs_prepr['addr_state:IA'], df_inputs_prepr['addr_state:NV'],
                                              df_inputs_prepr['addr_state:FL'], df_inputs_prepr['addr_state:HI'],
                                              df_inputs_prepr['addr_state:AL']])
df_inputs_prepr['addr_state:NM_VA'] = sum([df_inputs_prepr['addr_state:NM'], df_inputs_prepr['addr_state:VA']])
df_inputs_prepr['addr_state:OK_TN_MO_LA_MD_NC'] = sum([df_inputs_prepr['addr_state:OK'], df_inputs_prepr['addr_state:TN'],
                                              df_inputs_prepr['addr_state:MO'], df_inputs_prepr['addr_state:LA'],
                                              df_inputs_prepr['addr_state:MD'], df_inputs_prepr['addr_state:NC']])
df_inputs_prepr['addr_state:UT_KY_AZ_NJ'] = sum([df_inputs_prepr['addr_state:UT'], df_inputs_prepr['addr_state:KY'],
                                              df_inputs_prepr['addr_state:AZ'], df_inputs_prepr['addr_state:NJ']])
df_inputs_prepr['addr_state:AR_MI_PA_OH_MN'] = sum([df_inputs_prepr['addr_state:AR'], df_inputs_prepr['addr_state:MI'],
                                              df_inputs_prepr['addr_state:PA'], df_inputs_prepr['addr_state:OH'],
                                              df_inputs_prepr['addr_state:MN']])
df_inputs_prepr['addr_state:RI_MA_DE_SD_IN'] = sum([df_inputs_prepr['addr_state:RI'], df_inputs_prepr['addr_state:MA'],
                                              df_inputs_prepr['addr_state:DE'], df_inputs_prepr['addr_state:SD'],
                                              df_inputs_prepr['addr_state:IN']])
df_inputs_prepr['addr_state:GA_WA_OR'] = sum([df_inputs_prepr['addr_state:GA'], df_inputs_prepr['addr_state:WA'],
                                              df_inputs_prepr['addr_state:OR']])
df_inputs_prepr['addr_state:WI_MT'] = sum([df_inputs_prepr['addr_state:WI'], df_inputs_prepr['addr_state:MT']])
df_inputs_prepr['addr_state:IL_CT'] = sum([df_inputs_prepr['addr_state:IL'], df_inputs_prepr['addr_state:CT']])
df_inputs_prepr['addr_state:KS_SC_CO_VT_AK_MS'] = sum([df_inputs_prepr['addr_state:KS'], df_inputs_prepr['addr_state:SC'],
                                              df_inputs_prepr['addr_state:CO'], df_inputs_prepr['addr_state:VT'],
                                              df_inputs_prepr['addr_state:AK'], df_inputs_prepr['addr_state:MS']])
df_inputs_prepr['addr_state:WV_NH_WY_DC_ME_ID'] = sum([df_inputs_prepr['addr_state:WV'], df_inputs_prepr['addr_state:NH'],
                                              df_inputs_prepr['addr_state:WY'], df_inputs_prepr['addr_state:DC'],
                                              df_inputs_prepr['addr_state:ME'], df_inputs_prepr['addr_state:ID']])
### Preprocessing Discrete Variables: Homework
# 'verification_status'
df_temp = woe_discrete(df_inputs_prepr, 'verification_status', df_targets_prepr)
# We calculate weight of evidence.
df_temp
plot_by_woe(df_temp)
# We plot the weight of evidence values.
# 'purpose'
df_temp = woe_discrete(df_inputs_prepr, 'purpose', df_targets_prepr)
# We calculate weight of evidence.
df_temp
plot_by_woe(df_temp, 90)
# We plot the weight of evidence values.
# We combine 'educational', 'small_business', 'wedding', 'renewable_energy', 'moving', 'house' in one category: 'educ__sm_b__wedd__ren_en__mov__house'.
# We combine 'other', 'medical', 'vacation' in one category: 'oth__med__vacation'.
# We combine 'major_purchase', 'car', 'home_improvement' in one category: 'major_purch__car__home_impr'.
# We leave 'debt_consolidation' in a separate category.
# We leave 'credit_card' in a separate category.
# 'educ__sm_b__wedd__ren_en__mov__house' will be the reference category.
df_inputs_prepr['purpose:educ__sm_b__wedd__ren_en__mov__house'] = sum([df_inputs_prepr['purpose:educational'], df_inputs_prepr['purpose:small_business'],
                                                                 df_inputs_prepr['purpose:wedding'], df_inputs_prepr['purpose:renewable_energy'],
                                                                 df_inputs_prepr['purpose:moving'], df_inputs_prepr['purpose:house']])
df_inputs_prepr['purpose:oth__med__vacation'] = sum([df_inputs_prepr['purpose:other'], df_inputs_prepr['purpose:medical'],
                                             df_inputs_prepr['purpose:vacation']])
df_inputs_prepr['purpose:major_purch__car__home_impr'] = sum([df_inputs_prepr['purpose:major_purchase'], df_inputs_prepr['purpose:car'],
                                                        df_inputs_prepr['purpose:home_improvement']])
# 'initial_list_status'
df_temp = woe_discrete(df_inputs_prepr, 'initial_list_status', df_targets_prepr)
df_temp
plot_by_woe(df_temp)
# We plot the weight of evidence values.
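# Side note (illustrative, not part of the original flow): summing existing 0/1 dummy columns, as done
# above, is equivalent to building the combined dummy directly from the raw column with isin().
# Re-creating one of the 'purpose' groupings that way gives the same 0/1 column:
df_inputs_prepr['purpose:oth__med__vacation'] = np.where(
    df_inputs_prepr['purpose'].isin(['other', 'medical', 'vacation']), 1, 0)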
### Preprocessing Continuous Variables: Automating Calculations and Visualizing Results
# WoE function for ordered discrete and continuous variables
def woe_ordered_continuous(df, discrete_variabe_name, good_bad_variable_df):
    df = pd.concat([df[discrete_variabe_name], good_bad_variable_df], axis = 1)
    df = pd.concat([df.groupby(df.columns.values[0], as_index = False)[df.columns.values[1]].count(),
                    df.groupby(df.columns.values[0], as_index = False)[df.columns.values[1]].mean()], axis = 1)
    df = df.iloc[:, [0, 1, 3]]
    df.columns = [df.columns.values[0], 'n_obs', 'prop_good']
    df['prop_n_obs'] = df['n_obs'] / df['n_obs'].sum()
    df['n_good'] = df['prop_good'] * df['n_obs']
    df['n_bad'] = (1 - df['prop_good']) * df['n_obs']
    df['prop_n_good'] = df['n_good'] / df['n_good'].sum()
    df['prop_n_bad'] = df['n_bad'] / df['n_bad'].sum()
    df['WoE'] = np.log(df['prop_n_good'] / df['prop_n_bad'])
    #df = df.sort_values(['WoE'])
    #df = df.reset_index(drop = True)
    df['diff_prop_good'] = df['prop_good'].diff().abs()
    df['diff_WoE'] = df['WoE'].diff().abs()
    df['IV'] = (df['prop_n_good'] - df['prop_n_bad']) * df['WoE']
    df['IV'] = df['IV'].sum()
    return df
# Here we define a function similar to the one above, with one slight difference:
# we do not sort the rows by WoE, so the categories keep the natural order of the ordered variable.
# The function takes 3 arguments: a dataframe, a string, and a dataframe. The function returns a dataframe as a result.
### Preprocessing Continuous Variables: Creating Dummy Variables, Part 1
# term
df_inputs_prepr['term_int'].unique()
# There are only two unique values, 36 and 60.
df_temp = woe_ordered_continuous(df_inputs_prepr, 'term_int', df_targets_prepr)
# We calculate weight of evidence.
df_temp
plot_by_woe(df_temp)
# We plot the weight of evidence values.
# Leave as is.
# '60' will be the reference category.
# df_inputs_prepr['term:36'] = np.where((df_inputs_prepr['term_int'] == 36), 1, 0)
df_inputs_prepr['term:60'] = np.where((df_inputs_prepr['term_int'] == 60), 1, 0)
# emp_length_int
df_inputs_prepr['emp_length_int'].unique()
# Has only 11 levels: from 0 to 10. Hence, we turn it into a factor with 11 levels.
df_temp = woe_ordered_continuous(df_inputs_prepr, 'emp_length_int', df_targets_prepr)
# We calculate weight of evidence.
df_temp
plot_by_woe(df_temp)
# We plot the weight of evidence values.
# We create the following categories: '0', '1', '2 - 4', '5 - 6', '7 - 9', '10'
# '0' will be the reference category.
df_inputs_prepr['emp_length:0'] = np.where(df_inputs_prepr['emp_length_int'].isin([0]), 1, 0)
df_inputs_prepr['emp_length:1'] = np.where(df_inputs_prepr['emp_length_int'].isin([1]), 1, 0)
df_inputs_prepr['emp_length:2-4'] = np.where(df_inputs_prepr['emp_length_int'].isin(range(2, 5)), 1, 0)
df_inputs_prepr['emp_length:5-6'] = np.where(df_inputs_prepr['emp_length_int'].isin(range(5, 7)), 1, 0)
df_inputs_prepr['emp_length:7-9'] = np.where(df_inputs_prepr['emp_length_int'].isin(range(7, 10)), 1, 0)
df_inputs_prepr['emp_length:10'] = np.where(df_inputs_prepr['emp_length_int'].isin([10]), 1, 0)
### Preprocessing Continuous Variables: Creating Dummy Variables, Part 2
# mths_since_issue_d
df_inputs_prepr['mths_since_issue_d'].unique()
df_inputs_prepr['mths_since_issue_d_factor'] = pd.cut(df_inputs_prepr['mths_since_issue_d'], 50)
# pd.cut(series, number of categories)
# Here we do fine-classing: using the 'cut' method, we split the variable into 50 categories by its values.
df_inputs_prepr['mths_since_issue_d_factor']
df_temp = woe_ordered_continuous(df_inputs_prepr, 'mths_since_issue_d_factor', df_targets_prepr)
# We calculate weight of evidence.
df_temp
plot_by_woe(df_temp)
# We plot the weight of evidence values.
# We have to rotate the labels because we cannot read them otherwise.
plot_by_woe(df_temp, 90)
# We plot the weight of evidence values, rotating the labels 90 degrees.
plot_by_woe(df_temp.iloc[3: , : ], 90)
# We plot the weight of evidence values, leaving out the first three categories.
# We create the following categories:
# < 38, 38 - 39, 40 - 41, 42 - 48, 49 - 52, 53 - 64, 65 - 84, > 84.
df_inputs_prepr['mths_since_issue_d:<38'] = np.where(df_inputs_prepr['mths_since_issue_d'].isin(range(38)), 1, 0)
df_inputs_prepr['mths_since_issue_d:38-39'] = np.where(df_inputs_prepr['mths_since_issue_d'].isin(range(38, 40)), 1, 0)
df_inputs_prepr['mths_since_issue_d:40-41'] = np.where(df_inputs_prepr['mths_since_issue_d'].isin(range(40, 42)), 1, 0)
df_inputs_prepr['mths_since_issue_d:42-48'] = np.where(df_inputs_prepr['mths_since_issue_d'].isin(range(42, 49)), 1, 0)
df_inputs_prepr['mths_since_issue_d:49-52'] = np.where(df_inputs_prepr['mths_since_issue_d'].isin(range(49, 53)), 1, 0)
df_inputs_prepr['mths_since_issue_d:53-64'] = np.where(df_inputs_prepr['mths_since_issue_d'].isin(range(53, 65)), 1, 0)
df_inputs_prepr['mths_since_issue_d:65-84'] = np.where(df_inputs_prepr['mths_since_issue_d'].isin(range(65, 85)), 1, 0)
df_inputs_prepr['mths_since_issue_d:>84'] = np.where(df_inputs_prepr['mths_since_issue_d'] > 84, 1, 0)
# The '>84' dummy uses a direct comparison so that the maximum value is included as well.
# int_rate
df_inputs_prepr['int_rate_factor'] = pd.cut(df_inputs_prepr['int_rate'], 50)
# Here we do fine-classing: using the 'cut' method, we split the variable into 50 categories by its values.
df_temp = woe_ordered_continuous(df_inputs_prepr, 'int_rate_factor', df_targets_prepr)
# We calculate weight of evidence.
df_temp
plot_by_woe(df_temp, 90)
# We plot the weight of evidence values.
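# Small illustrative sketch (made-up numbers, not from the original): what pd.cut does in the
# fine-classing steps above. It splits a numeric series into equal-width intervals and returns
# a categorical column with one interval label per row.
_toy_rates = pd.Series([5.0, 7.5, 10.0, 12.5, 15.0, 17.5, 20.0])
pd.cut(_toy_rates, 4)
# Produces the intervals (4.985, 8.75], (8.75, 12.5], (12.5, 16.25], (16.25, 20.0];
# each original value is mapped to the interval that contains it, and WoE is then computed per interval.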
### Data Preparation: Continuous Variables, Part 1 and 2: Homework
# mths_since_earliest_cr_line
df_inputs_prepr['mths_since_earliest_cr_line_factor'] = pd.cut(df_inputs_prepr['mths_since_earliest_cr_line'], 50)
# Here we do fine-classing: using the 'cut' method, we split the variable into 50 categories by its values.
df_temp = woe_ordered_continuous(df_inputs_prepr, 'mths_since_earliest_cr_line_factor', df_targets_prepr)
# We calculate weight of evidence.
df_temp
plot_by_woe(df_temp, 90)
# We plot the weight of evidence values.
plot_by_woe(df_temp.iloc[6: , : ], 90)
# We plot the weight of evidence values, leaving out the first six categories.
# We create the following categories:
# < 140, 141 - 164, 165 - 247, 248 - 270, 271 - 352, > 352
df_inputs_prepr['mths_since_earliest_cr_line:<140'] = np.where(df_inputs_prepr['mths_since_earliest_cr_line'].isin(range(140)), 1, 0)
df_inputs_prepr['mths_since_earliest_cr_line:141-164'] = np.where(df_inputs_prepr['mths_since_earliest_cr_line'].isin(range(140, 165)), 1, 0)
df_inputs_prepr['mths_since_earliest_cr_line:165-247'] = np.where(df_inputs_prepr['mths_since_earliest_cr_line'].isin(range(165, 248)), 1, 0)
df_inputs_prepr['mths_since_earliest_cr_line:248-270'] = np.where(df_inputs_prepr['mths_since_earliest_cr_line'].isin(range(248, 271)), 1, 0)
df_inputs_prepr['mths_since_earliest_cr_line:271-352'] = np.where(df_inputs_prepr['mths_since_earliest_cr_line'].isin(range(271, 353)), 1, 0)
df_inputs_prepr['mths_since_earliest_cr_line:>352'] = np.where(df_inputs_prepr['mths_since_earliest_cr_line'] > 352, 1, 0)
# The '>352' dummy uses a direct comparison so that the maximum value is included as well.
# delinq_2yrs
df_temp = woe_ordered_continuous(df_inputs_prepr, 'delinq_2yrs', df_targets_prepr)
# We calculate weight of evidence.
df_temp
plot_by_woe(df_temp)
# We plot the weight of evidence values.
# Categories: 0, 1-3, >=4
df_inputs_prepr['delinq_2yrs:0'] = np.where((df_inputs_prepr['delinq_2yrs'] == 0), 1, 0)
df_inputs_prepr['delinq_2yrs:1-3'] = np.where((df_inputs_prepr['delinq_2yrs'] >= 1) & (df_inputs_prepr['delinq_2yrs'] <= 3), 1, 0)
df_inputs_prepr['delinq_2yrs:>=4'] = np.where((df_inputs_prepr['delinq_2yrs'] >= 4), 1, 0)
# inq_last_6mths
df_temp = woe_ordered_continuous(df_inputs_prepr, 'inq_last_6mths', df_targets_prepr)
# We calculate weight of evidence.
df_temp
plot_by_woe(df_temp)
# We plot the weight of evidence values.
# Categories: 0, 1 - 2, 3 - 6, > 6
df_inputs_prepr['inq_last_6mths:0'] = np.where((df_inputs_prepr['inq_last_6mths'] == 0), 1, 0)
df_inputs_prepr['inq_last_6mths:1-2'] = np.where((df_inputs_prepr['inq_last_6mths'] >= 1) & (df_inputs_prepr['inq_last_6mths'] <= 2), 1, 0)
df_inputs_prepr['inq_last_6mths:3-6'] = np.where((df_inputs_prepr['inq_last_6mths'] >= 3) & (df_inputs_prepr['inq_last_6mths'] <= 6), 1, 0)
df_inputs_prepr['inq_last_6mths:>6'] = np.where((df_inputs_prepr['inq_last_6mths'] > 6), 1, 0)
# open_acc
df_temp = woe_ordered_continuous(df_inputs_prepr, 'open_acc', df_targets_prepr)
# We calculate weight of evidence.
df_temp
plot_by_woe(df_temp, 90)
# We plot the weight of evidence values.
plot_by_woe(df_temp.iloc[ : 40, :], 90)
# We plot the weight of evidence values for the first 40 categories.
# Categories: '0', '1-3', '4-12', '13-17', '18-22', '23-25', '26-30', '>30'
df_inputs_prepr['open_acc:0'] = np.where((df_inputs_prepr['open_acc'] == 0), 1, 0)
df_inputs_prepr['open_acc:1-3'] = np.where((df_inputs_prepr['open_acc'] >= 1) & (df_inputs_prepr['open_acc'] <= 3), 1, 0)
df_inputs_prepr['open_acc:4-12'] = np.where((df_inputs_prepr['open_acc'] >= 4) & (df_inputs_prepr['open_acc'] <= 12), 1, 0)
df_inputs_prepr['open_acc:13-17'] = np.where((df_inputs_prepr['open_acc'] >= 13) & (df_inputs_prepr['open_acc'] <= 17), 1, 0)
df_inputs_prepr['open_acc:18-22'] = np.where((df_inputs_prepr['open_acc'] >= 18) & (df_inputs_prepr['open_acc'] <= 22), 1, 0)
df_inputs_prepr['open_acc:23-25'] = np.where((df_inputs_prepr['open_acc'] >= 23) & (df_inputs_prepr['open_acc'] <= 25), 1, 0)
df_inputs_prepr['open_acc:26-30'] = np.where((df_inputs_prepr['open_acc'] >= 26) & (df_inputs_prepr['open_acc'] <= 30), 1, 0)
df_inputs_prepr['open_acc:>=31'] = np.where((df_inputs_prepr['open_acc'] >= 31), 1, 0)
# pub_rec
df_temp = woe_ordered_continuous(df_inputs_prepr, 'pub_rec', df_targets_prepr)
# We calculate weight of evidence.
df_temp
plot_by_woe(df_temp, 90)
# We plot the weight of evidence values.
# Categories: '0-2', '3-4', '>=5'
df_inputs_prepr['pub_rec:0-2'] = np.where((df_inputs_prepr['pub_rec'] >= 0) & (df_inputs_prepr['pub_rec'] <= 2), 1, 0)
df_inputs_prepr['pub_rec:3-4'] = np.where((df_inputs_prepr['pub_rec'] >= 3) & (df_inputs_prepr['pub_rec'] <= 4), 1, 0)
df_inputs_prepr['pub_rec:>=5'] = np.where((df_inputs_prepr['pub_rec'] >= 5), 1, 0)
# total_acc
df_inputs_prepr['total_acc_factor'] = pd.cut(df_inputs_prepr['total_acc'], 50)
# Here we do fine-classing: using the 'cut' method, we split the variable into 50 categories by its values.
df_temp = woe_ordered_continuous(df_inputs_prepr, 'total_acc_factor', df_targets_prepr)
# We calculate weight of evidence.
df_temp
plot_by_woe(df_temp, 90)
# We plot the weight of evidence values.
# Categories: '<=27', '28-51', '>51'
df_inputs_prepr['total_acc:<=27'] = np.where((df_inputs_prepr['total_acc'] <= 27), 1, 0)
df_inputs_prepr['total_acc:28-51'] = np.where((df_inputs_prepr['total_acc'] >= 28) & (df_inputs_prepr['total_acc'] <= 51), 1, 0)
df_inputs_prepr['total_acc:>=52'] = np.where((df_inputs_prepr['total_acc'] >= 52), 1, 0)
# acc_now_delinq
df_temp = woe_ordered_continuous(df_inputs_prepr, 'acc_now_delinq', df_targets_prepr)
# We calculate weight of evidence.
df_temp
plot_by_woe(df_temp)
# We plot the weight of evidence values.
# Categories: '0', '>=1'
df_inputs_prepr['acc_now_delinq:0'] = np.where((df_inputs_prepr['acc_now_delinq'] == 0), 1, 0)
df_inputs_prepr['acc_now_delinq:>=1'] = np.where((df_inputs_prepr['acc_now_delinq'] >= 1), 1, 0)
# total_rev_hi_lim
df_inputs_prepr['total_rev_hi_lim_factor'] = pd.cut(df_inputs_prepr['total_rev_hi_lim'], 2000)
# Here we do fine-classing: using the 'cut' method, we split the variable into 2000 categories by its values.
df_temp = woe_ordered_continuous(df_inputs_prepr, 'total_rev_hi_lim_factor', df_targets_prepr)
# We calculate weight of evidence.
df_temp
plot_by_woe(df_temp.iloc[: 50, : ], 90)
# We plot the weight of evidence values for the first 50 categories.
# Categories:
# '<=5K', '5K-10K', '10K-20K', '20K-30K', '30K-40K', '40K-55K', '55K-95K', '>95K'
df_inputs_prepr['total_rev_hi_lim:<=5K'] = np.where((df_inputs_prepr['total_rev_hi_lim'] <= 5000), 1, 0)
df_inputs_prepr['total_rev_hi_lim:5K-10K'] = np.where((df_inputs_prepr['total_rev_hi_lim'] > 5000) & (df_inputs_prepr['total_rev_hi_lim'] <= 10000), 1, 0)
df_inputs_prepr['total_rev_hi_lim:10K-20K'] = np.where((df_inputs_prepr['total_rev_hi_lim'] > 10000) & (df_inputs_prepr['total_rev_hi_lim'] <= 20000), 1, 0)
df_inputs_prepr['total_rev_hi_lim:20K-30K'] = np.where((df_inputs_prepr['total_rev_hi_lim'] > 20000) & (df_inputs_prepr['total_rev_hi_lim'] <= 30000), 1, 0)
df_inputs_prepr['total_rev_hi_lim:30K-40K'] = np.where((df_inputs_prepr['total_rev_hi_lim'] > 30000) & (df_inputs_prepr['total_rev_hi_lim'] <= 40000), 1, 0)
df_inputs_prepr['total_rev_hi_lim:40K-55K'] = np.where((df_inputs_prepr['total_rev_hi_lim'] > 40000) & (df_inputs_prepr['total_rev_hi_lim'] <= 55000), 1, 0)
df_inputs_prepr['total_rev_hi_lim:55K-95K'] = np.where((df_inputs_prepr['total_rev_hi_lim'] > 55000) & (df_inputs_prepr['total_rev_hi_lim'] <= 95000), 1, 0)
df_inputs_prepr['total_rev_hi_lim:>95K'] = np.where((df_inputs_prepr['total_rev_hi_lim'] > 95000), 1, 0)
# installment
df_inputs_prepr['installment_factor'] = pd.cut(df_inputs_prepr['installment'], 50)
# Here we do fine-classing: using the 'cut' method, we split the variable into 50 categories by its values.
df_temp = woe_ordered_continuous(df_inputs_prepr, 'installment_factor', df_targets_prepr)
# We calculate weight of evidence.
df_temp
plot_by_woe(df_temp, 90)
# We plot the weight of evidence values.
### Preprocessing Continuous Variables: Creating Dummy Variables, Part 3
# annual_inc
df_inputs_prepr['annual_inc_factor'] = pd.cut(df_inputs_prepr['annual_inc'], 50)
# Here we do fine-classing: using the 'cut' method, we split the variable into 50 categories by its values.
df_temp = woe_ordered_continuous(df_inputs_prepr, 'annual_inc_factor', df_targets_prepr)
# We calculate weight of evidence.
df_temp
df_inputs_prepr['annual_inc_factor'] = pd.cut(df_inputs_prepr['annual_inc'], 100)
# Here we do fine-classing: using the 'cut' method, we split the variable into 100 categories by its values.
df_temp = woe_ordered_continuous(df_inputs_prepr, 'annual_inc_factor', df_targets_prepr)
# We calculate weight of evidence.
df_temp
# Initial examination shows that there are too few individuals with large income and too many with small income.
# Hence, we are going to have one category for incomes above 140K, and we are going to apply our approach to determine
# the categories for everyone with an income of 140K or less.
df_inputs_prepr_temp = df_inputs_prepr.loc[df_inputs_prepr['annual_inc'] <= 140000, : ]
#loan_data_temp = loan_data_temp.reset_index(drop = True)
#df_inputs_prepr_temp
df_inputs_prepr_temp['annual_inc_factor'] = pd.cut(df_inputs_prepr_temp['annual_inc'], 50)
# Here we do fine-classing: using the 'cut' method, we split the variable into 50 categories by its values.
df_temp = woe_ordered_continuous(df_inputs_prepr_temp, 'annual_inc_factor', df_targets_prepr[df_inputs_prepr_temp.index])
# We calculate weight of evidence.
df_temp
plot_by_woe(df_temp, 90)
# We plot the weight of evidence values.
# WoE changes monotonically with income, so we coarse-classify income into bands of 10K-20K
# up to 140K, plus the open-ended '>140K' category.
df_inputs_prepr['annual_inc:<20K'] = np.where((df_inputs_prepr['annual_inc'] <= 20000), 1, 0)
df_inputs_prepr['annual_inc:20K-30K'] = np.where((df_inputs_prepr['annual_inc'] > 20000) & (df_inputs_prepr['annual_inc'] <= 30000), 1, 0)
df_inputs_prepr['annual_inc:30K-40K'] = np.where((df_inputs_prepr['annual_inc'] > 30000) & (df_inputs_prepr['annual_inc'] <= 40000), 1, 0)
df_inputs_prepr['annual_inc:40K-50K'] = np.where((df_inputs_prepr['annual_inc'] > 40000) & (df_inputs_prepr['annual_inc'] <= 50000), 1, 0)
df_inputs_prepr['annual_inc:50K-60K'] = np.where((df_inputs_prepr['annual_inc'] > 50000) & (df_inputs_prepr['annual_inc'] <= 60000), 1, 0)
df_inputs_prepr['annual_inc:60K-70K'] = np.where((df_inputs_prepr['annual_inc'] > 60000) & (df_inputs_prepr['annual_inc'] <= 70000), 1, 0)
df_inputs_prepr['annual_inc:70K-80K'] = np.where((df_inputs_prepr['annual_inc'] > 70000) & (df_inputs_prepr['annual_inc'] <= 80000), 1, 0)
df_inputs_prepr['annual_inc:80K-90K'] = np.where((df_inputs_prepr['annual_inc'] > 80000) & (df_inputs_prepr['annual_inc'] <= 90000), 1, 0)
df_inputs_prepr['annual_inc:90K-100K'] = np.where((df_inputs_prepr['annual_inc'] > 90000) & (df_inputs_prepr['annual_inc'] <= 100000), 1, 0)
df_inputs_prepr['annual_inc:100K-120K'] = np.where((df_inputs_prepr['annual_inc'] > 100000) & (df_inputs_prepr['annual_inc'] <= 120000), 1, 0)
df_inputs_prepr['annual_inc:120K-140K'] = np.where((df_inputs_prepr['annual_inc'] > 120000) & (df_inputs_prepr['annual_inc'] <= 140000), 1, 0)
df_inputs_prepr['annual_inc:>140K'] = np.where((df_inputs_prepr['annual_inc'] > 140000), 1, 0)
# mths_since_last_delinq
# We have to create one category for missing values and do fine and coarse classing for the rest.
df_inputs_prepr_temp = df_inputs_prepr[pd.notnull(df_inputs_prepr['mths_since_last_delinq'])]
# pd.notnull(dataframe or series) ~ selects the non-missing values from a dataframe or series.
df_inputs_prepr_temp['mths_since_last_delinq_factor'] = pd.cut(df_inputs_prepr_temp['mths_since_last_delinq'], 50)
df_temp = woe_ordered_continuous(df_inputs_prepr_temp, 'mths_since_last_delinq_factor', df_targets_prepr[df_inputs_prepr_temp.index])
# We calculate weight of evidence.
df_temp
plot_by_woe(df_temp, 90)
# We plot the weight of evidence values.
# Categories: Missing, 0-3, 4-30, 31-56, >=57
df_inputs_prepr['mths_since_last_delinq:Missing'] = np.where((df_inputs_prepr['mths_since_last_delinq'].isnull()), 1, 0)
df_inputs_prepr['mths_since_last_delinq:0-3'] = np.where((df_inputs_prepr['mths_since_last_delinq'] >= 0) & (df_inputs_prepr['mths_since_last_delinq'] <= 3), 1, 0)
df_inputs_prepr['mths_since_last_delinq:4-30'] = np.where((df_inputs_prepr['mths_since_last_delinq'] >= 4) & (df_inputs_prepr['mths_since_last_delinq'] <= 30), 1, 0)
df_inputs_prepr['mths_since_last_delinq:31-56'] = np.where((df_inputs_prepr['mths_since_last_delinq'] >= 31) & (df_inputs_prepr['mths_since_last_delinq'] <= 56), 1, 0)
df_inputs_prepr['mths_since_last_delinq:>=57'] = np.where((df_inputs_prepr['mths_since_last_delinq'] >= 57), 1, 0)
### Preprocessing Continuous Variables: Creating Dummy Variables, Part 3: Homework
# dti
df_inputs_prepr['dti_factor'] = pd.cut(df_inputs_prepr['dti'], 100)
# Here we do fine-classing: using the 'cut' method, we split the variable into 100 categories by its values.
df_temp = woe_ordered_continuous(df_inputs_prepr, 'dti_factor', df_targets_prepr)
# We calculate weight of evidence.
df_temp
plot_by_woe(df_temp, 90)
# We plot the weight of evidence values.
# Similarly to income, initial examination shows that most values are below 35.
# Hence, we are going to have one category for dti greater than 35, and we are going to apply our approach
# to determine the categories for everyone with a dti of 35 or less.
df_inputs_prepr_temp = df_inputs_prepr.loc[df_inputs_prepr['dti'] <= 35, : ]
df_inputs_prepr_temp['dti_factor'] = pd.cut(df_inputs_prepr_temp['dti'], 50)
# Here we do fine-classing: using the 'cut' method, we split the variable into 50 categories by its values.
df_temp = woe_ordered_continuous(df_inputs_prepr_temp, 'dti_factor', df_targets_prepr[df_inputs_prepr_temp.index])
# We calculate weight of evidence.
df_temp
plot_by_woe(df_temp, 90)
# We plot the weight of evidence values.
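# Note on the pattern used above for 'annual_inc', 'mths_since_last_delinq' and 'dti' (explanatory
# sketch, not from the original): when we fine-class on a filtered copy such as df_inputs_prepr_temp,
# the targets passed to woe_ordered_continuous() are selected with df_targets_prepr[df_inputs_prepr_temp.index].
# Because the filtered copy keeps the original row labels, indexing the targets with those labels keeps
# inputs and targets aligned row by row. A tiny made-up illustration:
_toy_inputs = pd.DataFrame({'dti': [1.0, 50.0, 3.0, 40.0]}, index = [10, 11, 12, 13])
_toy_targets = pd.Series([1, 0, 1, 0], index = [10, 11, 12, 13])
_toy_subset = _toy_inputs[_toy_inputs['dti'] <= 35]   # keeps the rows labelled 10 and 12
_toy_targets[_toy_subset.index]                       # selects the matching targets, still labelled 10 and 12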
# Categories: '<=1.4', '1.4-3.5', '3.5-7.7', '7.7-10.5', '10.5-16.1', '16.1-20.3', '20.3-21.7', '21.7-22.4', '22.4-35', '>35'
df_inputs_prepr['dti:<=1.4'] = np.where((df_inputs_prepr['dti'] <= 1.4), 1, 0)
df_inputs_prepr['dti:1.4-3.5'] = np.where((df_inputs_prepr['dti'] > 1.4) & (df_inputs_prepr['dti'] <= 3.5), 1, 0)
df_inputs_prepr['dti:3.5-7.7'] = np.where((df_inputs_prepr['dti'] > 3.5) & (df_inputs_prepr['dti'] <= 7.7), 1, 0)
df_inputs_prepr['dti:7.7-10.5'] = np.where((df_inputs_prepr['dti'] > 7.7) & (df_inputs_prepr['dti'] <= 10.5), 1, 0)
df_inputs_prepr['dti:10.5-16.1'] = np.where((df_inputs_prepr['dti'] > 10.5) & (df_inputs_prepr['dti'] <= 16.1), 1, 0)
df_inputs_prepr['dti:16.1-20.3'] = np.where((df_inputs_prepr['dti'] > 16.1) & (df_inputs_prepr['dti'] <= 20.3), 1, 0)
df_inputs_prepr['dti:20.3-21.7'] = np.where((df_inputs_prepr['dti'] > 20.3) & (df_inputs_prepr['dti'] <= 21.7), 1, 0)
df_inputs_prepr['dti:21.7-22.4'] = np.where((df_inputs_prepr['dti'] > 21.7) & (df_inputs_prepr['dti'] <= 22.4), 1, 0)
df_inputs_prepr['dti:22.4-35'] = np.where((df_inputs_prepr['dti'] > 22.4) & (df_inputs_prepr['dti'] <= 35), 1, 0)
df_inputs_prepr['dti:>35'] = np.where((df_inputs_prepr['dti'] > 35), 1, 0)
# mths_since_last_record
# We have to create one category for missing values and do fine and coarse classing for the rest.
df_inputs_prepr_temp = df_inputs_prepr[pd.notnull(df_inputs_prepr['mths_since_last_record'])]
#sum(loan_data_temp['mths_since_last_record'].isnull())
df_inputs_prepr_temp['mths_since_last_record_factor'] = pd.cut(df_inputs_prepr_temp['mths_since_last_record'], 50)
# Here we do fine-classing: using the 'cut' method, we split the variable into 50 categories by its values.
df_temp = woe_ordered_continuous(df_inputs_prepr_temp, 'mths_since_last_record_factor', df_targets_prepr[df_inputs_prepr_temp.index])
# We calculate weight of evidence.
df_temp
plot_by_woe(df_temp, 90)
# We plot the weight of evidence values.
# Categories: 'Missing', '0-2', '3-20', '21-31', '32-80', '81-86', '>86'
df_inputs_prepr['mths_since_last_record:Missing'] = np.where((df_inputs_prepr['mths_since_last_record'].isnull()), 1, 0)
df_inputs_prepr['mths_since_last_record:0-2'] = np.where((df_inputs_prepr['mths_since_last_record'] >= 0) & (df_inputs_prepr['mths_since_last_record'] <= 2), 1, 0)
df_inputs_prepr['mths_since_last_record:3-20'] = np.where((df_inputs_prepr['mths_since_last_record'] >= 3) & (df_inputs_prepr['mths_since_last_record'] <= 20), 1, 0)
df_inputs_prepr['mths_since_last_record:21-31'] = np.where((df_inputs_prepr['mths_since_last_record'] >= 21) & (df_inputs_prepr['mths_since_last_record'] <= 31), 1, 0)
df_inputs_prepr['mths_since_last_record:32-80'] = np.where((df_inputs_prepr['mths_since_last_record'] >= 32) & (df_inputs_prepr['mths_since_last_record'] <= 80), 1, 0)
df_inputs_prepr['mths_since_last_record:81-86'] = np.where((df_inputs_prepr['mths_since_last_record'] >= 81) & (df_inputs_prepr['mths_since_last_record'] <= 86), 1, 0)
df_inputs_prepr['mths_since_last_record:>86'] = np.where((df_inputs_prepr['mths_since_last_record'] > 86), 1, 0)
df_inputs_prepr['mths_since_last_delinq:Missing'].sum()
# display inputs_train, inputs_test
# funded_amnt
df_inputs_prepr['funded_amnt_factor'] = pd.cut(df_inputs_prepr['funded_amnt'], 50)
df_temp = woe_ordered_continuous(df_inputs_prepr, 'funded_amnt_factor', df_targets_prepr)
df_temp
plot_by_woe(df_temp, 90)
# WON'T USE because there is no clear trend, even if segments of the whole range are considered.
# installment
df_inputs_prepr['installment_factor'] = pd.cut(df_inputs_prepr['installment'], 50)
df_temp = woe_ordered_continuous(df_inputs_prepr, 'installment_factor', df_targets_prepr)
df_temp
plot_by_woe(df_temp, 90)
# WON'T USE because there is no clear trend, even if segments of the whole range are considered.
### Preprocessing the Test Dataset
#####
#loan_data_inputs_train = df_inputs_prepr
#####
loan_data_inputs_test = df_inputs_prepr
loan_data_inputs_train.to_csv('loan_data_inputs_train.csv')
loan_data_targets_train.to_csv('loan_data_targets_train.csv')
loan_data_inputs_test.to_csv('loan_data_inputs_test.csv')
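# Workflow note (interpretation, not code from the original): the pair of lines above
# ('#loan_data_inputs_train = df_inputs_prepr' commented out, 'loan_data_inputs_test = df_inputs_prepr'
# active) suggests the preprocessing in this script is meant to be run twice: once with
# df_inputs_prepr / df_targets_prepr pointing at the training split (uncommenting the first line to
# store the result), and once with them pointing at the test split, before saving the CSVs.
# A hedged sketch of that toggle, with 'preprocessing_for' as a hypothetical switch variable:
# preprocessing_for = 'test'   # or 'train'
# if preprocessing_for == 'train':
#     loan_data_inputs_train = df_inputs_prepr
# else:
#     loan_data_inputs_test = df_inputs_prepr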
