location_predict.py
Source: location_predict.py
# coding: utf-8
# NOTE: written against the pre-0.20 scikit-learn API (sklearn.cross_validation).

# In[133]:

import numpy as np
import pandas as pd
import re
import xgboost as xgb
from sklearn import svm
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import StratifiedKFold

# Read the train/test label data and concatenate them
train_data = pd.read_csv('./data/train/train_labels.txt', sep=u'|', header=None).dropna(1)
train_data.columns = ['uid', 'sex', 'age', 'location']
test_data = pd.read_csv('./data/valid/valid_nolabel.txt', sep=u'|', header=None).dropna(1)
test_data.columns = ['uid']
total_data = pd.concat([train_data, test_data], axis=0)

# Read the train/test info data and concatenate them
train_data_info = pd.read_csv('./data/train/train_info.txt', sep=u'|', header=None).dropna(1)
train_data_info.columns = ['uid', 'name', 'image']
train_data_info = train_data_info.drop_duplicates()
test_data_info = pd.read_csv('./data/valid/valid_info.txt', sep=u'|', header=None).dropna(1)
test_data_info.columns = ['uid', 'name', 'image']
test_data_info = test_data_info.drop_duplicates()
total_data_info = pd.concat([train_data_info, test_data_info], axis=0)
total_data_info = total_data_info.drop_duplicates('uid')

# Read the train/test links data and concatenate them
links = []
for i, line in enumerate(open('./data/train/train_links.txt')):
    line = line.split()
    row = {'uid': int(line[0]), 'sum_fans': len(line) - 1, 'fans': ' '.join(line[1:])}
    links.append(row)
train_data_links = pd.DataFrame(links)
train_data_links = train_data_links.drop_duplicates()
links = []
for i, line in enumerate(open('./data/valid/valid_links.txt')):
    line = line.split()
    row = {'uid': int(line[0]), 'sum_fans': len(line) - 1, 'fans': ' '.join(line[1:])}
    links.append(row)
test_data_links = pd.DataFrame(links)
test_data_links = test_data_links.drop_duplicates()
total_data_links = pd.concat([train_data_links, test_data_links], axis=0)

# Read the train/test status data and concatenate them
status = []
for i, line in enumerate(open('./data/train/train_status.txt')):
    l = re.search(',', line).span()[0]
    r = re.search(',', line).span()[1]
    row = {'uid': int(line[:l]), 'sta': line[r:]}
    status.append(row)
train_data_status = pd.DataFrame(status)
status = []
for i, line in enumerate(open('./data/valid/valid_status.txt')):
    l = re.search(',', line).span()[0]
    r = re.search(',', line).span()[1]
    row = {'uid': int(line[:l]), 'sta': line[r:]}
    status.append(row)
test_data_status = pd.DataFrame(status)
total_data_status = pd.concat([train_data_status, test_data_status], axis=0)

# Merge the tables provided by the task
merge_data = pd.merge(total_data, total_data_info, on='uid', how='left')
merge_data = pd.merge(merge_data, total_data_links, on='uid', how='left')
merge_data.index = range(len(merge_data))

##################################################################################
# Split each status record into retweet count, review count, source, time and content
total_data_status['retweet'] = total_data_status.sta.map(lambda s: int(s.split(',')[0]))
total_data_status['review'] = total_data_status.sta.map(lambda s: int(s.split(',')[1]))
total_data_status['source'] = total_data_status.sta.map(lambda s: s.split(',')[2])
total_data_status['time'] = total_data_status.sta.map(lambda s: s.split(',')[3])
total_data_status['content'] = total_data_status.sta.map(lambda s: ','.join(s.split(',')[4:]))
contents = total_data_status.groupby('uid')['content'].agg(lambda lst: ' '.join(lst))
merge_data['contents'] = merge_data.uid.map(contents)
merge_data['sum_content'] = merge_data.uid.map(total_data_status.groupby('uid').size())

# Statistical features per user
merge_data['max_retweet'] = merge_data.uid.map(total_data_status.groupby('uid')['retweet'].agg('max'))
merge_data['max_review'] = merge_data.uid.map(total_data_status.groupby('uid')['review'].agg('max'))
merge_data['min_retweet'] = merge_data.uid.map(total_data_status.groupby('uid')['retweet'].agg('min'))
merge_data['min_review'] = merge_data.uid.map(total_data_status.groupby('uid')['review'].agg('min'))
merge_data['median_retweet'] = merge_data.uid.map(total_data_status.groupby('uid')['retweet'].agg('median'))
merge_data['median_review'] = merge_data.uid.map(total_data_status.groupby('uid')['review'].agg('median'))
merge_data['mean_retweet'] = merge_data.uid.map(total_data_status.groupby('uid')['retweet'].agg('mean'))
merge_data['mean_review'] = merge_data.uid.map(total_data_status.groupby('uid')['review'].agg('mean'))
merge_data['std_retweet'] = merge_data.uid.map(total_data_status.groupby('uid')['retweet'].agg('std'))
merge_data['std_review'] = merge_data.uid.map(total_data_status.groupby('uid')['review'].agg('std'))

# Mapping table from location (city/province) to region; the Chinese string
# literals are garbled by an encoding error in this copy
d = {'ç³å®¶åº': 'åå', 'åéµåº': 'åä¸', 'æ·±å³': 'åå', '广å·': 'åå',
     'å®å®': 'åå', 'ååº': 'åä¸', 'æ²å¸': 'åä¸', 'æ¦æ±': 'åä¸',
     'è¥é³': 'åä¸', 'å®é': 'åä¸', 'èé¨': 'åä¸', '西å®': '西å',
     'é¶å·': '西å', 'æé½': '西å', '绵é³': '西å', '䏿µ·': 'åä¸',
     'äºå': '西å', 'åèå¤': 'åå', 'å京': 'åå', 'å°æ¹¾': 'åä¸',
     'åæ': 'ä¸å', 'åå·': '西å', '天津': 'åå', 'å®å¤': '西å',
     'å®å¾½': 'åä¸', 'å±±ä¸': 'åä¸', '山西': 'åå', 'è¾½å®': 'ä¸å',
     'éåº': '西å', 'é西': '西å', 'éæµ·': '西å', '馿¸¯': 'åå',
     'é»é¾æ±': 'ä¸å', 'é¿ç½': 'ä¸å', '丹ä¸': 'ä¸å', '大庸桥': 'ä¸å',
     'æ²é³': 'ä¸å', '大è¿': 'ä¸å', 'æé¡º': 'ä¸å', 'ç³å®¶åº': 'åå',
     'æé³': 'åå', '广ä¸': 'åå', '广西': 'åå', 'æ°ç': '西å',
     'æ±è': 'åä¸', 'æ±è¥¿': 'åä¸', 'æ²³å': 'åå', 'æ²³å': 'åä¸',
     'æµæ±': 'åä¸', 'æµ·å': 'åå', 'æ¹å': 'åä¸', 'æ¹å': 'åä¸',
     'æ¾³é¨': 'åå', 'çè': '西å', 'ç¦å»º': 'åä¸', '西è': '西å',
     'è´µå·': '西å'}

# Map location and age onto the ranges required for submission
def trans_loc(s):
    if pd.isnull(s):
        return s
    s = s.split(' ')[0]
    if s == 'None':
        return 'åå'
    if s == 'æµ·å¤':
        return 'å¢å¤'
    return d[s]

def trans_age(age):
    if pd.isnull(age):
        return age
    if age <= 1979:
        return "-1979"
    elif age <= 1989:
        return "1980-1989"
    else:
        return "1990+"

merge_data['location2'] = merge_data['location'].map(trans_loc)
merge_data['age2'] = merge_data['age'].map(trans_age)

# Count region keywords mentioned in each user's statuses and sources
src_lst = total_data_status.groupby('uid')['source'].agg(lambda lst: ' '.join(lst))
merge_data['source_content'] = merge_data['uid'].map(src_lst)
keys = '|'.join(d.keys())
merge_data['source_province'] = merge_data['source_content'].map(lambda s: ' '.join(re.findall(keys, s)))
merge_data['num_province'] = merge_data['contents'].map(lambda s: ' '.join(re.findall(keys, s)))
d = defaultdict(lambda: '空', d)
tokenizer = lambda line: [d[w] for w in line.split(' ')]
tfv = TfidfVectorizer(tokenizer=tokenizer, norm=False, use_idf=False, smooth_idf=False, sublinear_tf=False)
X_all_sp = tfv.fit_transform(merge_data['num_province'])
sum_province = X_all_sp.toarray()
for i in range(sum_province.shape[1]):
    merge_data['sum_province_%d' % i] = sum_province[:, i]

# Content-length features
length = total_data_status.groupby('uid')['content'].agg(lambda lst: np.max([len(s.split(' ')) for s in lst]))
merge_data['max_content_len'] = merge_data['uid'].map(length)
length = total_data_status.groupby('uid')['content'].agg(lambda lst: np.min([len(s.split(' ')) for s in lst]))
merge_data['min_content_len'] = merge_data['uid'].map(length)
length = total_data_status.groupby('uid')['content'].agg(lambda lst: np.mean([len(s.split(' ')) for s in lst]))
merge_data['mean_content_len'] = merge_data['uid'].map(length)

merge_data['name_len'] = merge_data.name.map(lambda s: s if pd.isnull(s) else len(re.sub(r'[\u4e00-\u9fff]+', '', s)))

def num_missing(x):
    return sum(x.isnull())

merge_data['num_missing'] = merge_data.apply(num_missing, axis=1)

# Rank features
merge_data['rank_sum_content'] = merge_data['sum_content'].rank(method='max')
merge_data['rank_sum_fans'] = merge_data['sum_fans'].rank(method='max')
merge_data['rank_mean_retweet'] = merge_data['mean_retweet'].rank(method='max')
merge_data['rank_mean_review'] = merge_data['mean_review'].rank(method='max')
merge_data['rank_num_missing'] = merge_data['num_missing'].rank(method='max')

# In[134]:

# Load predictions of the models trained on tf-idf features (stacking: the
# predictions are fed back into the model as new features)
tfidf_stacking = pd.read_csv('./data/newfeat/stack_new.csv')
merge_data = pd.concat([merge_data, tfidf_stacking], axis=1)
feat_time_3hour = pd.read_csv('./data/newfeat/feat_time_3hour.csv')
merge_data = pd.merge(merge_data, feat_time_3hour, on='uid', how='left')

# Load predictions of the models trained on word2vec features
w2v_stacking = pd.read_csv('./data/newfeat/w2v_prob1.csv')
merge_data = pd.merge(merge_data, w2v_stacking, on='uid', how='left')
newmerge_feat1 = pd.read_csv('./data/newfeat/newmerge_feat.csv')
merge_data = pd.merge(merge_data, newmerge_feat1, on='uid', how='left')
feat_area1 = pd.read_csv('./data/newfeat/feat_area.csv')
merge_data = pd.merge(merge_data, feat_area1, on='uid', how='left')

# In[135]:

#########################################################################################
cols = '|'.join(['twts_len', 'name_len', 'sum_province', 'sum_fans',
                 'age_', 'sex_', 'loc_',
                 'mean_retweet', 'sum_content', 'mean_review', 'num_missing',
                 'w2v_f_prob', 'w2v_m_prob', 'w2v_young_prob', 'w2v_old_prob', 'w2v_mid_prob',
                 'max_retweet', 'min_retweet', 'max_review', 'min_review',
                 'rank_sum_content', 'rank_sum_fans', 'rank_mean_retweet', 'rank_mean_review', 'rank_num_missing',
                 'timePeriod_3hour_0', 'timePeriod_3hour_1', 'timePeriod_3hour_2', 'timePeriod_3hour_3',
                 'timePeriod_3hour_4', 'timePeriod_3hour_5', 'timePeriod_3hour_6', 'timePeriod_3hour_7',
                 'name_isnull', 'image_isnull', 'fans_isnull', 'retweet_isnull', 'review_isnull',
                 'area_0', 'area_1', 'area_2', 'area_3', 'area_4', 'area_5', 'area_6', 'area_7'
                 ])
cols = [c for c in merge_data.columns if re.match(cols, c)]

age_le = LabelEncoder()
ys = {}
ys['age'] = age_le.fit_transform(merge_data.iloc[:3200]['age2'])
loc_le = LabelEncoder()
ys['loc'] = loc_le.fit_transform(merge_data.iloc[:3200]['location2'])
sex_le = LabelEncoder()
ys['sex'] = sex_le.fit_transform(merge_data.iloc[:3200]['sex'])
merge_data = merge_data.fillna(0)

# In[136]:

task = ['tr']
TR = 3200   # number of labelled (training) rows
TE = 1240   # number of unlabelled (test) rows
X_all = merge_data[cols]
X = X_all[:TR]
prds = []

# In[137]:

from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(2)
X_poly = poly.fit_transform(X_all)
from sklearn.feature_selection import VarianceThreshold
vt = VarianceThreshold(0.001)
X_poly = vt.fit_transform(X_poly)
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
X_poly = ss.fit_transform(X_poly)

# In[138]:

X_poly = pd.DataFrame(X_poly)
X_poly.columns = 'Poly_' + X_poly.columns.astype(str)

# In[144]:

X_train = X_poly[:TR]
X_test = X_poly[TR:]

# In[145]:

#label = pd.read_csv('newlabel.csv', header=None, index_col=0)
#label.columns = ['uid', 'age', 'gender', 'province']

# In[146]:

merge_data.iloc[:3200]['location2'].value_counts()

# In[416]:

########################
# Region ("location") prediction
label = 'loc'
print('=' * 20)
print(label)
print('=' * 20)
y = ys[label]
dtrain = xgb.DMatrix(X_train, y)
dtest = xgb.DMatrix(X_test)

# In[424]:

params = {
    "objective": "multi:softprob",
    "booster": "gbtree",
    "eval_metric": "merror",
    "num_class": 8,
    'max_depth': 4,
    #'min_child_weight': 2.5,
    'subsample': 0.65,
    'colsample_bytree': 1.0,
    'gamma': 2.5,
    "eta": 0.006,
    #"lambda": 1,
    #'alpha': 0,
    "silent": 1,
    #'seed': 1123
}
xgb1 = xgb.train(params, dtrain, num_boost_round=25)

# In[425]:

pre = xgb1.predict(dtest, ntree_limit=25)
pre_loc = [loc_le.classes_[idx] for idx in pre.argmax(1)]
sub = pd.DataFrame()
sub['uid'] = merge_data.iloc[TR:]['uid']
sub['province'] = pre_loc
sub.to_csv('./data/location_sub.csv', index=False)

# In[426]:

#loc_pro2 = pd.DataFrame(pre, columns=loc_le.classes_, index=test_data.uid)

# In[427]:
...

smp_merge_data.py
Source: smp_merge_data.py
import numpy as np
import pandas as pd
import re
import xgboost as xgb
from sklearn import svm
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.preprocessing import LabelEncoder

path = './data/'

# Read the train/test label data and concatenate them
train_data = pd.read_csv(path + 'train/train_labels.txt', sep=u'|', header=None).dropna(1)
train_data.columns = ['uid', 'sex', 'age', 'location']
test_data = pd.read_csv(path + 'valid/valid_nolabel.txt', sep=u'|', header=None).dropna(1)
test_data.columns = ['uid']
total_data = pd.concat([train_data, test_data], axis=0)

# Read the train/test info data and concatenate them
train_data_info = pd.read_csv(path + 'train/train_info.txt', sep=u'|', header=None).dropna(1)
train_data_info.columns = ['uid', 'name', 'image']
train_data_info = train_data_info.drop_duplicates('uid')
test_data_info = pd.read_csv(path + 'valid/valid_info.txt', sep=u'|', header=None).dropna(1)
test_data_info.columns = ['uid', 'name', 'image']
test_data_info = test_data_info.drop_duplicates('uid')
total_data_info = pd.concat([train_data_info, test_data_info], axis=0)

# Read the train/test links data and concatenate them
links = []
for i, line in enumerate(open(path + 'train/train_links.txt')):
    line = line.split()
    row = {'uid': int(line[0]), 'sum_fans': len(line) - 1, 'fans': ' '.join(line[1:])}
    links.append(row)
train_data_links = pd.DataFrame(links)
train_data_links = train_data_links.drop_duplicates()
links = []
for i, line in enumerate(open(path + 'valid/valid_links.txt')):
    line = line.split()
    row = {'uid': int(line[0]), 'sum_fans': len(line) - 1, 'fans': ' '.join(line[1:])}
    links.append(row)
test_data_links = pd.DataFrame(links)
test_data_links = test_data_links.drop_duplicates()
total_data_links = pd.concat([train_data_links, test_data_links], axis=0)

# Read the train/test status data and concatenate them
status = []
for i, line in enumerate(open(path + 'train/train_status.txt')):
    l = re.search(',', line).span()[0]
    r = re.search(',', line).span()[1]
    row = {'uid': int(line[:l]), 'sta': line[r:]}
    status.append(row)
train_data_status = pd.DataFrame(status)
status = []
for i, line in enumerate(open(path + 'valid/valid_status.txt')):
    l = re.search(',', line).span()[0]
    r = re.search(',', line).span()[1]
    row = {'uid': int(line[:l]), 'sta': line[r:]}
    status.append(row)
test_data_status = pd.DataFrame(status)
total_data_status = pd.concat([train_data_status, test_data_status], axis=0)

# Merge the tables provided by the task
merge_data = pd.merge(total_data, total_data_info, on='uid', how='left')
merge_data = pd.merge(merge_data, total_data_links, on='uid', how='left')
merge_data.index = range(len(merge_data))

##################################################################################
# Split each status record into retweet count, review count, source, time and content
total_data_status['retweet'] = total_data_status.sta.map(lambda s: int(s.split(',')[0]))
total_data_status['review'] = total_data_status.sta.map(lambda s: int(s.split(',')[1]))
total_data_status['source'] = total_data_status.sta.map(lambda s: s.split(',')[2])
total_data_status['time'] = total_data_status.sta.map(lambda s: s.split(',')[3])
total_data_status['content'] = total_data_status.sta.map(lambda s: ','.join(s.split(',')[4:]))
contents = total_data_status.groupby('uid')['content'].agg(lambda lst: ' '.join(lst))
merge_data['contents'] = merge_data.uid.map(contents)
merge_data['sum_content'] = merge_data.uid.map(total_data_status.groupby('uid').size())

# Statistical features per user
merge_data['max_retweet'] = merge_data.uid.map(total_data_status.groupby('uid')['retweet'].agg('max'))
merge_data['max_review'] = merge_data.uid.map(total_data_status.groupby('uid')['review'].agg('max'))
merge_data['min_retweet'] = merge_data.uid.map(total_data_status.groupby('uid')['retweet'].agg('min'))
merge_data['min_review'] = merge_data.uid.map(total_data_status.groupby('uid')['review'].agg('min'))
merge_data['median_retweet'] = merge_data.uid.map(total_data_status.groupby('uid')['retweet'].agg('median'))
merge_data['median_review'] = merge_data.uid.map(total_data_status.groupby('uid')['review'].agg('median'))
merge_data['mean_retweet'] = merge_data.uid.map(total_data_status.groupby('uid')['retweet'].agg('mean'))
merge_data['mean_review'] = merge_data.uid.map(total_data_status.groupby('uid')['review'].agg('mean'))
merge_data['std_retweet'] = merge_data.uid.map(total_data_status.groupby('uid')['retweet'].agg('std'))
merge_data['std_review'] = merge_data.uid.map(total_data_status.groupby('uid')['review'].agg('std'))

# Mapping table from location (city/province) to region; the Chinese string
# literals are unreadably garbled by an encoding error in this copy
d = {' ÃºââÃ': 'ªâ¢Â±Â±', 'ââ¡ÃâÃ': 'ªâ¢â´', 'â¦Ãâ¬â': 'Âªâ¢ÆÅ', 'Ïâ÷âº': 'Âªâ¢ÆÅ',
     '±¶ââ¤': 'Âªâ¢ÆÅ', '¡ıâÃ': 'ªâ¢Ã·â', 'â¦â¥Â â': 'ªâ¢Ã·â', 'Åâ°â«â«': 'ªâ¢Ã·â',
     'ÅÃâÃ': 'ªâ¢Ã·â', 'ââ¤Â¬Î©': 'ªâ¢Ã·â', 'æ£ââ': 'ªâ¢Ã·â', 'ÅËââ¤': 'Å˱±',
     '⯥®': 'Å˱±', 'â¥â¦âº': 'ÅËÆÅ', 'ââ¡âÃ': 'ÅËÆÅ', 'â¦Åâ«Â£': 'ªâ¢â´',
     'ââÆÅ': 'ÅËÆÅ', 'Æâââ¦Ïâ': 'ªâ¢Â±Â±', '±±æ©': 'ªâ¢Â±Â±', 'îÃÃ': 'ªâ¢â´',
     'ºâ¢Â¡Ã·': 'â´±±', 'ÃÆÂ¥Â®': 'ÅËÆÅ', 'ÃÃΩÃ': 'ªâ¢Â±Â±', 'ÆËÅÆ': 'Å˱±',
     'ââ¤Âªâ': 'ªâ¢â´', 'â¦Î©â´': 'ªâ¢â´', 'â¦Î©ÅË': 'ªâ¢Â±Â±', 'Â¡â¦ÆË': 'â´±±',
     '÷ÿ«Ã': 'ÅËÆÅ', 'â¦Â¬ÅË': 'Å˱±', '«â¡â«Â£': 'Å˱±', 'Åâââ¬': 'Âªâ¢ÆÅ',
     'â«â¡ËΩâ ': 'â´±±', 'â¥Â§ââ': 'â´±±', 'µ§â´': 'â´±±', 'Â¥ÃâÏ«â': 'â´±±',
     'â¦ÃâÃ': 'â´±±', '¥á¨': 'â´±±', 'âÃÃâ¥': 'â´±±', ' ÃºââÃ': 'ªâ¢Â±Â±',
     'â¥ÃâÃ': 'ªâ¢Â±Â±', 'Ïââ´': 'Âªâ¢ÆÅ', 'ÏâÅË': 'Âªâ¢ÆÅ', 'â¬ΩÃ': 'Å˱±',
     'Ωâ Ãâ': 'ªâ¢â´', 'Ωâ ÅË': 'ªâ¢â´', 'â«â±±': 'ªâ¢Â±Â±', 'â«âÆÅ': 'ªâ¢Ã·â',
     'ââΩâ ': 'ªâ¢â´', 'â«Â£ÆÅ': 'Âªâ¢ÆÅ', 'â«Ë±±': 'ªâ¢Ã·â', 'â«ËÆÅ': 'ªâ¢Ã·â',
     'âÆââ': 'Âªâ¢ÆÅ', 'â Ãâ¡': 'Å˱±', 'â£Ω®': 'ªâ¢â´', 'ÅËâ¤Ã¿': 'ÅËÆÅ',
     'ÏÃ÷âº': 'ÅËÆÅ'}

# Map location and age onto the ranges required for submission
def trans_loc(s):
    if pd.isnull(s):
        return s
    s = s.split(' ')[0]
    if s == 'None':
        return 'ªâ¢Â±Â±'
    if s == 'â«Â£Ãâ':
        return s
    return d[s]

def trans_age(age):
    if pd.isnull(age):
        return age
    if age <= 1979:
        return "-1979"
    elif age <= 1989:
        return "1980-1989"
    else:
        return "1990+"

merge_data['location2'] = merge_data['location'].map(trans_loc)
merge_data['age2'] = merge_data['age'].map(trans_age)

# Count region keywords mentioned in each user's statuses and sources
src_lst = total_data_status.groupby('uid')['source'].agg(lambda lst: ' '.join(lst))
merge_data['source_content'] = merge_data['uid'].map(src_lst)
keys = '|'.join(d.keys())
merge_data['source_province'] = merge_data['source_content'].map(lambda s: ' '.join(re.findall(keys, s)))
merge_data['num_province'] = merge_data['contents'].map(lambda s: ' '.join(re.findall(keys, s)))
d = defaultdict(lambda: 'øâ', d)
tokenizer = lambda line: [d[w] for w in line.split(' ')]
tfv = TfidfVectorizer(tokenizer=tokenizer, norm=False, use_idf=False, smooth_idf=False, sublinear_tf=False)
X_all_sp = tfv.fit_transform(merge_data['num_province'])
sum_province = X_all_sp.toarray()
for i in range(sum_province.shape[1]):
    merge_data['sum_province_%d' % i] = sum_province[:, i]

# Content-length features
length = total_data_status.groupby('uid')['content'].agg(lambda lst: np.max([len(s.split(' ')) for s in lst]))
merge_data['max_content_len'] = merge_data['uid'].map(length)
length = total_data_status.groupby('uid')['content'].agg(lambda lst: np.min([len(s.split(' ')) for s in lst]))
merge_data['min_content_len'] = merge_data['uid'].map(length)
length = total_data_status.groupby('uid')['content'].agg(lambda lst: np.mean([len(s.split(' ')) for s in lst]))
merge_data['mean_content_len'] = merge_data['uid'].map(length)

merge_data['name_len'] = merge_data.name.map(lambda s: s if pd.isnull(s) else len(re.sub(r'[\u4e00-\u9fff]+', '', s)))

def num_missing(x):
    return sum(x.isnull())

merge_data['num_missing'] = merge_data.apply(num_missing, axis=1)

# Rank features
merge_data['rank_sum_content'] = merge_data['sum_content'].rank(method='max')
merge_data['rank_sum_fans'] = merge_data['sum_fans'].rank(method='max')
merge_data['rank_mean_retweet'] = merge_data['mean_retweet'].rank(method='max')
merge_data['rank_mean_review'] = merge_data['mean_review'].rank(method='max')
merge_data['rank_num_missing'] = merge_data['num_missing'].rank(method='max')
...
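The excerpt of smp_merge_data.py shown here builds the merged feature table but never writes it anywhere. If merge_data is meant to be reused by the modelling script, a persistence step along these lines would work; the pickle path is an assumed example rather than a path from the original project:

# Hypothetical persistence step; './data/merge_data.pkl' is an assumed path.
merge_data.to_pickle('./data/merge_data.pkl')
# A downstream script (e.g. location_predict.py) could then reload it with:
# merge_data = pd.read_pickle('./data/merge_data.pkl')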

prepare_data.py
Source: prepare_data.py
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
import sys

# Input/output file paths are taken from the command line
args = sys.argv
order_file_name = args[1]
group_file_name = args[2]
airline_file_name = args[3]
train_set_file_name = args[4]
total_dataset = args[5]


order = pd.read_csv(order_file_name)
group = pd.read_csv(group_file_name)
airline = pd.read_csv(airline_file_name)

group['product_name_price_min'] = group['price'].groupby(group['product_name']).transform('min')
merge_data = order.merge(group, on=['group_id'], how='left')

# cp value: price / number of days
merge_data["cp"] = merge_data["price"] / merge_data['days']

# source: one-hot encode the two source columns
source_1_dummy = pd.get_dummies(merge_data["source_1"])
source_2_dummy = pd.get_dummies(merge_data["source_2"])
merge_data = pd.concat([merge_data, source_1_dummy], axis=1)
merge_data = pd.concat([merge_data, source_2_dummy], axis=1)


# Whether this is the lowest price among itineraries of the same product
merge_data['price-min'] = merge_data['price'] - merge_data['product_name_price_min']
merge_data['is-min-price'] = 0
merge_data.loc[merge_data['price-min'] == 0, ['is-min-price']] = 1

# How many orders there are for the same itinerary (group)
merge_data['num_same_group'] = merge_data[['order_id']].groupby(merge_data['group_id']).transform('count')


# Total number of people on the same itinerary
merge_data['total_people_amount'] = merge_data[['people_amount']].groupby(merge_data['group_id']).transform('sum')


# Itinerary promotions: flag products whose name contains promotion-related
# keywords (the keyword literals are garbled by an encoding error in this copy)
merge_data["discount"] = 0
merge_data.loc[merge_data["product_name"].str.contains("ç") == True, ["discount"]] = 1
merge_data.loc[merge_data["product_name"].str.contains("æ") == True, ["discount"]] = 1
merge_data.loc[merge_data["product_name"].str.contains("è´") == True, ["discount"]] = 1
merge_data.loc[merge_data["product_name"].str.contains("é") == True, ["discount"]] = 1
merge_data.loc[merge_data["product_name"].str.contains("æ¸") == True, ["discount"]] = 1
merge_data.loc[merge_data["product_name"].str.contains("åªæ ") == True, ["discount"]] = 1
merge_data.drop(['product_name'], axis=1, inplace=True)


# Time format conversion and time-derived features
merge_data['begin_date'] = pd.to_datetime(merge_data['begin_date'])
merge_data['order_date'] = pd.to_datetime(merge_data['order_date'])
merge_data['begin_date_month'] = merge_data["begin_date"].dt.month
merge_data['order_date_month'] = merge_data["order_date"].dt.month
merge_data['order_date_dayofweek'] = merge_data['order_date'].dt.dayofweek
merge_data['begin_date_dayofweek'] = merge_data['begin_date'].dt.dayofweek
merge_data['order_date_isweekend'] = 0
merge_data['begin_date_isweekend'] = 0
merge_data.loc[merge_data['order_date_dayofweek'] == 5, ['order_date_isweekend']] = 1
merge_data.loc[merge_data['order_date_dayofweek'] == 6, ['order_date_isweekend']] = 1
merge_data.loc[merge_data['begin_date_dayofweek'] == 5, ['begin_date_isweekend']] = 1
merge_data.loc[merge_data['begin_date_dayofweek'] == 6, ['begin_date_isweekend']] = 1


# Flight-related features
# Earliest departure time and latest arrival time per group
go_fly = airline[["group_id", "fly_time", "arrive_time"]]
go_fly['fly_time'] = airline['fly_time'].groupby(airline['group_id']).transform('min')
go_fly['fly_time'] = pd.to_datetime(go_fly['fly_time'])
go_fly['arrive_time'] = airline['arrive_time'].groupby(airline['group_id']).transform('max')
go_fly['arrive_time'] = pd.to_datetime(go_fly['arrive_time'])
go_fly = go_fly.drop_duplicates()
merge_data = merge_data.merge(go_fly, on=['group_id'], how='left')

# How many flights the whole itinerary takes
count = airline.groupby(['group_id']).size().to_frame("fly_count")
merge_data = merge_data.merge(count, on=['group_id'], how='left')

# Drop columns that are not used
merge_data.drop(['source_1'], axis=1, inplace=True)
merge_data.drop(['source_2'], axis=1, inplace=True)
merge_data.drop(['unit'], axis=1, inplace=True)
merge_data.drop(['area'], axis=1, inplace=True)
merge_data.drop(['sub_line'], axis=1, inplace=True)
merge_data.drop(['promotion_prog'], axis=1, inplace=True)


training_set = pd.read_csv(train_set_file_name)
merge_data = merge_data.merge(training_set, on=['order_id'], how='left')
merge_data = merge_data.dropna()   # drop rows with missing values

print(merge_data.info())
merge_data.to_csv(total_dataset, index=False)
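prepare_data.py reads its five file paths positionally from sys.argv, so a missing argument fails with a bare IndexError. A small guard at the top of the script makes the expected invocation explicit; this is an illustrative sketch, and the CSV names in the comment are placeholders rather than files from the original project:

import sys

EXPECTED_ARGS = ['order_file', 'group_file', 'airline_file', 'train_set_file', 'total_dataset']
if len(sys.argv) != len(EXPECTED_ARGS) + 1:
    # Example invocation (placeholder file names):
    #   python prepare_data.py order.csv group.csv airline.csv train_set.csv total_dataset.csv
    sys.exit('usage: python prepare_data.py ' + ' '.join(EXPECTED_ARGS))
order_file_name, group_file_name, airline_file_name, train_set_file_name, total_dataset = sys.argv[1:6]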
