How to use merge_data method in autotest

Best Python code snippets using autotest_python
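All three snippets below construct a DataFrame called merge_data by left-joining auxiliary tables (user info, links, statuses, flight data, and so on) onto a base table with pandas. As a quick orientation, here is a minimal, self-contained sketch of that pattern; the tiny inline tables and column values are invented for illustration and are not taken from the snippets' data files.

import pandas as pd

# Hypothetical base table (one row per uid) and an auxiliary info table.
labels = pd.DataFrame({'uid': [1, 2, 3], 'sex': ['m', 'f', 'm']})
info = pd.DataFrame({'uid': [1, 2], 'name': ['alice', 'bob']})

# A left join keeps every uid from the base table; rows without info get NaN.
merge_data = pd.merge(labels, info, on='uid', how='left')
print(merge_data)

The same call also exists as a DataFrame method (labels.merge(info, on='uid', how='left')), which is the form prepare_data.py uses.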

location_predict.py

Source: location_predict.py (GitHub)


# coding: utf-8

# In[133]:

import numpy as np
import pandas as pd
import re
import xgboost as xgb
from sklearn import svm
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
# sklearn.cross_validation is the legacy module name; current scikit-learn ships these classes in sklearn.model_selection
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import StratifiedKFold

# Read the train/test label data and concatenate them
train_data = pd.read_csv('./data/train/train_labels.txt', sep=u'|', header=None).dropna(axis=1)
train_data.columns = ['uid', 'sex', 'age', 'location']
test_data = pd.read_csv('./data/valid/valid_nolabel.txt', sep=u'|', header=None).dropna(axis=1)
test_data.columns = ['uid']
total_data = pd.concat([train_data, test_data], axis=0)

# Read the train/test info data and concatenate them
train_data_info = pd.read_csv('./data/train/train_info.txt', sep=u'|', header=None).dropna(axis=1)
train_data_info.columns = ['uid', 'name', 'image']
train_data_info = train_data_info.drop_duplicates()
test_data_info = pd.read_csv('./data/valid/valid_info.txt', sep=u'|', header=None).dropna(axis=1)
test_data_info.columns = ['uid', 'name', 'image']
test_data_info = test_data_info.drop_duplicates()
total_data_info = pd.concat([train_data_info, test_data_info], axis=0)
total_data_info = total_data_info.drop_duplicates('uid')

# Read the train/test links data and concatenate them
links = []
for i, line in enumerate(open('./data/train/train_links.txt')):
    line = line.split()
    row = {'uid': int(line[0]), 'sum_fans': len(line) - 1, 'fans': ' '.join(line[1:])}
    links.append(row)
train_data_links = pd.DataFrame(links)
train_data_links = train_data_links.drop_duplicates()
links = []
for i, line in enumerate(open('./data/valid/valid_links.txt')):
    line = line.split()
    row = {'uid': int(line[0]), 'sum_fans': len(line) - 1, 'fans': ' '.join(line[1:])}
    links.append(row)
test_data_links = pd.DataFrame(links)
test_data_links = test_data_links.drop_duplicates()
total_data_links = pd.concat([train_data_links, test_data_links], axis=0)

# Read the train/test status data and concatenate them
status = []
for i, line in enumerate(open('./data/train/train_status.txt')):
    l = re.search(',', line).span()[0]
    r = re.search(',', line).span()[1]
    row = {'uid': int(line[:l]), 'sta': line[r:]}
    status.append(row)
train_data_status = pd.DataFrame(status)
status = []
for i, line in enumerate(open('./data/valid/valid_status.txt')):
    l = re.search(',', line).span()[0]
    r = re.search(',', line).span()[1]
    row = {'uid': int(line[:l]), 'sta': line[r:]}
    status.append(row)
test_data_status = pd.DataFrame(status)
total_data_status = pd.concat([train_data_status, test_data_status], axis=0)

# Merge the tables provided by the task
merge_data = pd.merge(total_data, total_data_info, on='uid', how='left')
merge_data = pd.merge(merge_data, total_data_links, on='uid', how='left')
merge_data.index = range(len(merge_data))

##################################################################################
total_data_status['retweet'] = total_data_status.sta.map(lambda s: int(s.split(',')[0]))
total_data_status['review'] = total_data_status.sta.map(lambda s: int(s.split(',')[1]))
total_data_status['source'] = total_data_status.sta.map(lambda s: s.split(',')[2])
total_data_status['time'] = total_data_status.sta.map(lambda s: s.split(',')[3])
total_data_status['content'] = total_data_status.sta.map(lambda s: ','.join(s.split(',')[4:]))
contents = total_data_status.groupby('uid')['content'].agg(lambda lst: ' '.join(lst))
merge_data['contents'] = merge_data.uid.map(contents)
merge_data['sum_content'] = merge_data.uid.map(total_data_status.groupby('uid').size())

# Statistical features
merge_data['max_retweet'] = merge_data.uid.map(total_data_status.groupby('uid')['retweet'].agg('max'))
merge_data['max_review'] = merge_data.uid.map(total_data_status.groupby('uid')['review'].agg('max'))
merge_data['min_retweet'] = merge_data.uid.map(total_data_status.groupby('uid')['retweet'].agg('min'))
merge_data['min_review'] = merge_data.uid.map(total_data_status.groupby('uid')['review'].agg('min'))
merge_data['median_retweet'] = merge_data.uid.map(total_data_status.groupby('uid')['retweet'].agg('median'))
merge_data['median_review'] = merge_data.uid.map(total_data_status.groupby('uid')['review'].agg('median'))
merge_data['mean_retweet'] = merge_data.uid.map(total_data_status.groupby('uid')['retweet'].agg('mean'))
merge_data['mean_review'] = merge_data.uid.map(total_data_status.groupby('uid')['review'].agg('mean'))
merge_data['std_retweet'] = merge_data.uid.map(total_data_status.groupby('uid')['retweet'].agg('std'))
merge_data['std_review'] = merge_data.uid.map(total_data_status.groupby('uid')['review'].agg('std'))

# location-to-region mapping table
d = {'石家庄': '华北',
     '子陵庙': '华东',
     '深圳': '华南',
     '广州': '华南',
     '宝安': '华南',
     '刘庄': '华中',
     '沙市': '华中',
     '武汉': '华中',
     '襄阳': '华中',
     '安陆': '华中',
     '荆门': '华中',
     '西安': '西北',
     '银川': '西北',
     '成都': '西南',
     '绵阳': '西南',
     '上海': '华东',
     '云南': '西南',
     '内蒙古': '华北',
     '北京': '华北',
     '台湾': '华东',
     '吉林': '东北',
     '四川': '西南',
     '天津': '华北',
     '宁夏': '西北',
     '安徽': '华东',
     '山东': '华东',
     '山西': '华北',
     '辽宁': '东北',
     '重庆': '西南',
     '陕西': '西北',
     '青海': '西北',
     '香港': '华南',
     '黑龙江': '东北',
     '长白': '东北',
     '丹东': '东北',
     '大庸桥': '东北',
     '沈阳': '东北',
     '大连': '东北',
     '抚顺': '东北',
     '石家庄': '华北',  # note: duplicate key; the later entry wins
     '朝阳': '华北',
     '广东': '华南',
     '广西': '华南',
     '新疆': '西北',
     '江苏': '华东',
     '江西': '华东',
     '河北': '华北',
     '河南': '华中',
     '浙江': '华东',
     '海南': '华南',
     '湖北': '华中',
     '湖南': '华中',
     '澳门': '华南',
     '甘肃': '西北',
     '福建': '华东',
     '西藏': '西南',
     '贵州': '西南',
     }

# Map location and age to the ranges required for submission
def trans_loc(s):
    if pd.isnull(s):
        return s
    s = s.split(' ')[0]
    if s == 'None':
        return '华北'
    if s == '海外':   # 'overseas'
        return '境外'  # 'outside mainland China'
    return d[s]

def trans_age(age):
    if pd.isnull(age):
        return age
    if age <= 1979:
        return "-1979"
    elif age <= 1989:
        return "1980-1989"
    else:
        return "1990+"

merge_data['location2'] = merge_data['location'].map(trans_loc)
merge_data['age2'] = merge_data['age'].map(trans_age)
src_lst = total_data_status.groupby('uid')['source'].agg(lambda lst: ' '.join(lst))
merge_data['source_content'] = merge_data['uid'].map(src_lst)
keys = '|'.join(d.keys())
merge_data['source_province'] = merge_data['source_content'].map(lambda s: ' '.join(re.findall(keys, s)))
merge_data['num_province'] = merge_data['contents'].map(lambda s: ' '.join(re.findall(keys, s)))
d = defaultdict(lambda: '空', d)  # unknown tokens map to '空' ("empty")
tokenizer = lambda line: [d[w] for w in line.split(' ')]
tfv = TfidfVectorizer(tokenizer=tokenizer, norm=False, use_idf=False, smooth_idf=False, sublinear_tf=False)
X_all_sp = tfv.fit_transform(merge_data['num_province'])
sum_province = X_all_sp.toarray()
for i in range(sum_province.shape[1]):
    merge_data['sum_province_%d' % i] = sum_province[:, i]

# Content-length features
# note: 'max_content_len' is computed with np.mean and 'mean_content_len' with np.max (the labels appear swapped)
length = total_data_status.groupby('uid')['content'].agg(lambda lst: np.mean([len(s.split(' ')) for s in lst]))
merge_data['max_content_len'] = merge_data['uid'].map(length)
length = total_data_status.groupby('uid')['content'].agg(lambda lst: np.min([len(s.split(' ')) for s in lst]))
merge_data['min_content_len'] = merge_data['uid'].map(length)
length = total_data_status.groupby('uid')['content'].agg(lambda lst: np.max([len(s.split(' ')) for s in lst]))
merge_data['mean_content_len'] = merge_data['uid'].map(length)
# name length after stripping CJK characters
merge_data['name_len'] = merge_data.name.map(lambda s: s if pd.isnull(s) else len(re.sub(r'[\u4e00-\u9fff]+', '', s)))

def num_missing(x):
    return sum(x.isnull())

merge_data['num_missing'] = merge_data.apply(num_missing, axis=1)

# Rank features
merge_data['rank_sum_content'] = merge_data['sum_content'].rank(method='max')
merge_data['rank_sum_fans'] = merge_data['sum_fans'].rank(method='max')
merge_data['rank_mean_retweet'] = merge_data['mean_retweet'].rank(method='max')
merge_data['rank_mean_review'] = merge_data['mean_review'].rank(method='max')
merge_data['rank_num_missing'] = merge_data['num_missing'].rank(method='max')

# In[134]:

# Load the predictions of a model trained on tf-idf features
# (stacking: the predictions are added back to the model as new features)
tfidf_stacking = pd.read_csv('./data/newfeat/stack_new.csv')
merge_data = pd.concat([merge_data, tfidf_stacking], axis=1)
feat_time_3hour = pd.read_csv('./data/newfeat/feat_time_3hour.csv')
merge_data = pd.merge(merge_data, feat_time_3hour, on='uid', how='left')
# Load the predictions of a model trained on word2vec features
w2v_stacking = pd.read_csv('./data/newfeat/w2v_prob1.csv')
merge_data = pd.merge(merge_data, w2v_stacking, on='uid', how='left')
newmerge_feat1 = pd.read_csv('./data/newfeat/newmerge_feat.csv')
merge_data = pd.merge(merge_data, newmerge_feat1, on='uid', how='left')
feat_area1 = pd.read_csv('./data/newfeat/feat_area.csv')
merge_data = pd.merge(merge_data, feat_area1, on='uid', how='left')

# In[135]:

#########################################################################################
cols = '|'.join(['twts_len', 'name_len', 'sum_province', 'sum_fans',
                 'age_', 'sex_', 'loc_',
                 'mean_retweet', 'sum_content', 'mean_review', 'num_missing',
                 'w2v_f_prob', 'w2v_m_prob', 'w2v_young_prob', 'w2v_old_prob', 'w2v_mid_prob',
                 'max_retweet', 'min_retweet', 'max_review', 'min_review',
                 'rank_sum_content', 'rank_sum_fans', 'rank_mean_retweet', 'rank_mean_review', 'rank_num_missing',
                 'timePeriod_3hour_0', 'timePeriod_3hour_1', 'timePeriod_3hour_2', 'timePeriod_3hour_3',
                 'timePeriod_3hour_4', 'timePeriod_3hour_5', 'timePeriod_3hour_6', 'timePeriod_3hour_7',
                 'name_isnull', 'image_isnull', 'fans_isnull', 'retweet_isnull', 'review_isnull',
                 'area_0', 'area_1', 'area_2', 'area_3', 'area_4', 'area_5', 'area_6', 'area_7'
                 ])
cols = [c for c in merge_data.columns if re.match(cols, c)]
age_le = LabelEncoder()
ys = {}
ys['age'] = age_le.fit_transform(merge_data.iloc[:3200]['age2'])
loc_le = LabelEncoder()
ys['loc'] = loc_le.fit_transform(merge_data.iloc[:3200]['location2'])
sex_le = LabelEncoder()
ys['sex'] = sex_le.fit_transform(merge_data.iloc[:3200]['sex'])
merge_data = merge_data.fillna(0)

# In[136]:

task = ['tr']
TR = 3200
TE = 1240
X_all = merge_data[cols]
X = X_all[:TR]
prds = []

# In[137]:

from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(2)
X_poly = poly.fit_transform(X_all)
from sklearn.feature_selection import VarianceThreshold
vt = VarianceThreshold(0.001)
X_poly = vt.fit_transform(X_poly)
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
X_poly = ss.fit_transform(X_poly)

# In[138]:

X_poly = pd.DataFrame(X_poly)
X_poly.columns = 'Poly_' + X_poly.columns.astype(str)

# In[144]:

X_train = X_poly[:TR]
X_test = X_poly[TR:]

# In[145]:

#label = pd.read_csv('newlabel.csv', header=None, index_col=0)
#label.columns = ['uid', 'age', 'gender', 'province']

# In[146]:

merge_data.iloc[:3200]['location2'].value_counts()

# In[416]:

########################
# Region prediction
label = 'loc'
print('=' * 20)
print(label)
print('=' * 20)
y = ys[label]
dtrain = xgb.DMatrix(X_train, y)
dtest = xgb.DMatrix(X_test)

# In[424]:

params = {
    "objective": "multi:softprob",
    "booster": "gbtree",
    "eval_metric": "merror",
    "num_class": 8,
    'max_depth': 4,
    #'min_child_weight': 2.5,
    'subsample': 0.65,
    'colsample_bytree': 1.0,
    'gamma': 2.5,
    "eta": 0.006,
    #"lambda": 1,
    #'alpha': 0,
    "silent": 1,
    #'seed': 1123
}
xgb1 = xgb.train(params, dtrain, num_boost_round=25)

# In[425]:

pre = xgb1.predict(dtest, ntree_limit=25)
pre_loc = [loc_le.classes_[idx] for idx in pre.argmax(1)]
sub = pd.DataFrame()
sub['uid'] = merge_data.iloc[TR:]['uid']
sub['province'] = pre_loc
sub.to_csv('./data/location_sub.csv', index=False)

# In[426]:

#loc_pro2 = pd.DataFrame(pre, columns=loc_le.classes_, index=test_data.uid)

# In[427]:
...
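One pattern the script above relies on heavily is groupby(...).agg(...) followed by Series.map: per-uid statistics are computed on the status table and then aligned back onto merge_data by uid. The toy frames below are invented purely to illustrate that step on a small scale.

import pandas as pd

# Toy stand-ins for total_data_status and merge_data.
status = pd.DataFrame({'uid': [1, 1, 2], 'retweet': [3, 5, 0]})
merge_data = pd.DataFrame({'uid': [1, 2, 3]})

# Aggregate per uid, then map the resulting Series back onto merge_data by uid.
max_retweet = status.groupby('uid')['retweet'].agg('max')
merge_data['max_retweet'] = merge_data['uid'].map(max_retweet)
print(merge_data)  # uid 3 has no statuses, so its max_retweet is NaN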


smp_merge_data.py

Source: smp_merge_data.py (GitHub)


import numpy as np
import pandas as pd
import re
import xgboost as xgb
from sklearn import svm
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
# sklearn.cross_validation is the legacy module name; current scikit-learn ships this class in sklearn.model_selection
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.preprocessing import LabelEncoder

path = './data/'

# Read the train/test label data and concatenate them
train_data = pd.read_csv(path + 'train/train_labels.txt', sep=u'|', header=None).dropna(axis=1)
train_data.columns = ['uid', 'sex', 'age', 'location']
test_data = pd.read_csv(path + 'valid/valid_nolabel.txt', sep=u'|', header=None).dropna(axis=1)
test_data.columns = ['uid']
total_data = pd.concat([train_data, test_data], axis=0)

# Read the train/test info data and concatenate them
train_data_info = pd.read_csv(path + 'train/train_info.txt', sep=u'|', header=None).dropna(axis=1)
train_data_info.columns = ['uid', 'name', 'image']
train_data_info = train_data_info.drop_duplicates('uid')
test_data_info = pd.read_csv(path + 'valid/valid_info.txt', sep=u'|', header=None).dropna(axis=1)
test_data_info.columns = ['uid', 'name', 'image']
test_data_info = test_data_info.drop_duplicates('uid')
total_data_info = pd.concat([train_data_info, test_data_info], axis=0)

# Read the train/test links data and concatenate them
links = []
for i, line in enumerate(open(path + 'train/train_links.txt')):
    line = line.split()
    row = {'uid': int(line[0]), 'sum_fans': len(line) - 1, 'fans': ' '.join(line[1:])}
    links.append(row)
train_data_links = pd.DataFrame(links)
train_data_links = train_data_links.drop_duplicates()
links = []
for i, line in enumerate(open(path + 'valid/valid_links.txt')):
    line = line.split()
    row = {'uid': int(line[0]), 'sum_fans': len(line) - 1, 'fans': ' '.join(line[1:])}
    links.append(row)
test_data_links = pd.DataFrame(links)
test_data_links = test_data_links.drop_duplicates()
total_data_links = pd.concat([train_data_links, test_data_links], axis=0)

# Read the train/test status data and concatenate them
status = []
for i, line in enumerate(open(path + 'train/train_status.txt')):
    l = re.search(',', line).span()[0]
    r = re.search(',', line).span()[1]
    row = {'uid': int(line[:l]), 'sta': line[r:]}
    status.append(row)
train_data_status = pd.DataFrame(status)
status = []
for i, line in enumerate(open(path + 'valid/valid_status.txt')):
    l = re.search(',', line).span()[0]
    r = re.search(',', line).span()[1]
    row = {'uid': int(line[:l]), 'sta': line[r:]}
    status.append(row)
test_data_status = pd.DataFrame(status)
total_data_status = pd.concat([train_data_status, test_data_status], axis=0)

# Merge the tables provided by the task
merge_data = pd.merge(total_data, total_data_info, on='uid', how='left')
merge_data = pd.merge(merge_data, total_data_links, on='uid', how='left')
merge_data.index = range(len(merge_data))

##################################################################################
total_data_status['retweet'] = total_data_status.sta.map(lambda s: int(s.split(',')[0]))
total_data_status['review'] = total_data_status.sta.map(lambda s: int(s.split(',')[1]))
total_data_status['source'] = total_data_status.sta.map(lambda s: s.split(',')[2])
total_data_status['time'] = total_data_status.sta.map(lambda s: s.split(',')[3])
total_data_status['content'] = total_data_status.sta.map(lambda s: ','.join(s.split(',')[4:]))
contents = total_data_status.groupby('uid')['content'].agg(lambda lst: ' '.join(lst))
merge_data['contents'] = merge_data.uid.map(contents)
merge_data['sum_content'] = merge_data.uid.map(total_data_status.groupby('uid').size())

# Statistical features
merge_data['max_retweet'] = merge_data.uid.map(total_data_status.groupby('uid')['retweet'].agg('max'))
merge_data['max_review'] = merge_data.uid.map(total_data_status.groupby('uid')['review'].agg('max'))
merge_data['min_retweet'] = merge_data.uid.map(total_data_status.groupby('uid')['retweet'].agg('min'))
merge_data['min_review'] = merge_data.uid.map(total_data_status.groupby('uid')['review'].agg('min'))
merge_data['median_retweet'] = merge_data.uid.map(total_data_status.groupby('uid')['retweet'].agg('median'))
merge_data['median_review'] = merge_data.uid.map(total_data_status.groupby('uid')['review'].agg('median'))
merge_data['mean_retweet'] = merge_data.uid.map(total_data_status.groupby('uid')['retweet'].agg('mean'))
merge_data['mean_review'] = merge_data.uid.map(total_data_status.groupby('uid')['review'].agg('mean'))
merge_data['std_retweet'] = merge_data.uid.map(total_data_status.groupby('uid')['retweet'].agg('std'))
merge_data['std_review'] = merge_data.uid.map(total_data_status.groupby('uid')['review'].agg('std'))

# location-to-region mapping table
d = {'石家庄': '华北',
     '子陵庙': '华东',
     '深圳': '华南',
     '广州': '华南',
     '宝安': '华南',
     '刘庄': '华中',
     '沙市': '华中',
     '武汉': '华中',
     '襄阳': '华中',
     '安陆': '华中',
     '荆门': '华中',
     '西安': '西北',
     '银川': '西北',
     '成都': '西南',
     '绵阳': '西南',
     '上海': '华东',
     '云南': '西南',
     '内蒙古': '华北',
     '北京': '华北',
     '台湾': '华东',
     '吉林': '东北',
     '四川': '西南',
     '天津': '华北',
     '宁夏': '西北',
     '安徽': '华东',
     '山东': '华东',
     '山西': '华北',
     '辽宁': '东北',
     '重庆': '西南',
     '陕西': '西北',
     '青海': '西北',
     '香港': '华南',
     '黑龙江': '东北',
     '长白': '东北',
     '丹东': '东北',
     '大庸桥': '东北',
     '沈阳': '东北',
     '大连': '东北',
     '抚顺': '东北',
     '石家庄': '华北',  # note: duplicate key; the later entry wins
     '朝阳': '华北',
     '广东': '华南',
     '广西': '华南',
     '新疆': '西北',
     '江苏': '华东',
     '江西': '华东',
     '河北': '华北',
     '河南': '华中',
     '浙江': '华东',
     '海南': '华南',
     '湖北': '华中',
     '湖南': '华中',
     '澳门': '华南',
     '甘肃': '西北',
     '福建': '华东',
     '西藏': '西南',
     '贵州': '西南',
     }

# Map location and age to the ranges required for submission
def trans_loc(s):
    if pd.isnull(s):
        return s
    s = s.split(' ')[0]
    if s == 'None':
        return '华北'
    if s == '海外':  # 'overseas' is returned unchanged here
        return s
    return d[s]

def trans_age(age):
    if pd.isnull(age):
        return age
    if age <= 1979:
        return "-1979"
    elif age <= 1989:
        return "1980-1989"
    else:
        return "1990+"

merge_data['location2'] = merge_data['location'].map(trans_loc)
merge_data['age2'] = merge_data['age'].map(trans_age)
src_lst = total_data_status.groupby('uid')['source'].agg(lambda lst: ' '.join(lst))
merge_data['source_content'] = merge_data['uid'].map(src_lst)
keys = '|'.join(d.keys())
merge_data['source_province'] = merge_data['source_content'].map(lambda s: ' '.join(re.findall(keys, s)))
merge_data['num_province'] = merge_data['contents'].map(lambda s: ' '.join(re.findall(keys, s)))
d = defaultdict(lambda: '空', d)  # unknown tokens map to '空' ("empty")
tokenizer = lambda line: [d[w] for w in line.split(' ')]
tfv = TfidfVectorizer(tokenizer=tokenizer, norm=False, use_idf=False, smooth_idf=False, sublinear_tf=False)
X_all_sp = tfv.fit_transform(merge_data['num_province'])
sum_province = X_all_sp.toarray()
for i in range(sum_province.shape[1]):
    merge_data['sum_province_%d' % i] = sum_province[:, i]

# Content-length features
# note: 'max_content_len' is computed with np.mean and 'mean_content_len' with np.max (the labels appear swapped)
length = total_data_status.groupby('uid')['content'].agg(lambda lst: np.mean([len(s.split(' ')) for s in lst]))
merge_data['max_content_len'] = merge_data['uid'].map(length)
length = total_data_status.groupby('uid')['content'].agg(lambda lst: np.min([len(s.split(' ')) for s in lst]))
merge_data['min_content_len'] = merge_data['uid'].map(length)
length = total_data_status.groupby('uid')['content'].agg(lambda lst: np.max([len(s.split(' ')) for s in lst]))
merge_data['mean_content_len'] = merge_data['uid'].map(length)
# name length after stripping CJK characters
merge_data['name_len'] = merge_data.name.map(lambda s: s if pd.isnull(s) else len(re.sub(r'[\u4e00-\u9fff]+', '', s)))

def num_missing(x):
    return sum(x.isnull())

merge_data['num_missing'] = merge_data.apply(num_missing, axis=1)

# Rank features
merge_data['rank_sum_content'] = merge_data['sum_content'].rank(method='max')
merge_data['rank_sum_fans'] = merge_data['sum_fans'].rank(method='max')
merge_data['rank_mean_retweet'] = merge_data['mean_retweet'].rank(method='max')
merge_data['rank_mean_review'] = merge_data['mean_review'].rank(method='max')
merge_data['rank_num_missing'] = merge_data['num_missing'].rank(method='max')
...
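Both scripts count province mentions with a TfidfVectorizer whose idf weighting and normalisation are switched off, which effectively turns it into a per-document term counter over a custom tokenizer. The sketch below reproduces that trick with a two-entry mapping dictionary invented for illustration; it passes norm=None, since recent scikit-learn releases may reject the norm=False used in the original snippets.

from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer

# Tiny stand-in for the province-to-region dictionary d.
d = defaultdict(lambda: 'other', {'beijing': 'north', 'shanghai': 'east'})

# With use_idf=False and norm=None the vectorizer simply counts tokens.
tokenizer = lambda line: [d[w] for w in line.split(' ')]
tfv = TfidfVectorizer(tokenizer=tokenizer, norm=None, use_idf=False,
                      smooth_idf=False, sublinear_tf=False)
counts = tfv.fit_transform(['beijing beijing shanghai', 'shanghai'])
print(tfv.vocabulary_)    # region token -> column index
print(counts.toarray())   # per-document region counts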


prepare_data.py

Source: prepare_data.py (GitHub)


import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
import sys

args = sys.argv
order_file_name = args[1]
group_file_name = args[2]
airline_file_name = args[3]
train_set_file_name = args[4]
total_dataset = args[5]

order = pd.read_csv(order_file_name)
group = pd.read_csv(group_file_name)
airline = pd.read_csv(airline_file_name)

group['product_name_price_min'] = group['price'].groupby(group['product_name']).transform('min')
merge_data = order.merge(group, on=['group_id'], how='left')

# cp value: price / number of days
merge_data["cp"] = merge_data["price"] / merge_data['days']

# source
source_1_dummy = pd.get_dummies(merge_data["source_1"])
source_2_dummy = pd.get_dummies(merge_data["source_2"])
merge_data = pd.concat([merge_data, source_1_dummy], axis=1)
merge_data = pd.concat([merge_data, source_2_dummy], axis=1)

# Whether this is the lowest price for the same itinerary
merge_data['price-min'] = merge_data['price'] - merge_data['product_name_price_min']
merge_data['is-min-price'] = 0
merge_data.loc[merge_data['price-min'] == 0, ['is-min-price']] = 1

# How many orders the same itinerary has
merge_data['num_same_group'] = merge_data[['order_id']].groupby(merge_data['group_id']).transform('count')

# Total number of people on the same itinerary
merge_data['total_people_amount'] = merge_data[['people_amount']].groupby(merge_data['group_id']).transform('sum')

# Itinerary discounts (flag product names containing promotion keywords)
merge_data["discount"] = 0
merge_data.loc[merge_data["product_name"].str.contains("省") == True, ["discount"]] = 1
merge_data.loc[merge_data["product_name"].str.contains("折") == True, ["discount"]] = 1
merge_data.loc[merge_data["product_name"].str.contains("贈") == True, ["discount"]] = 1
merge_data.loc[merge_data["product_name"].str.contains("送") == True, ["discount"]] = 1
merge_data.loc[merge_data["product_name"].str.contains("減") == True, ["discount"]] = 1
merge_data.loc[merge_data["product_name"].str.contains("優惠") == True, ["discount"]] = 1
merge_data.drop(['product_name'], axis=1, inplace=True)

# Convert datetime columns and derive time features
merge_data['begin_date'] = pd.to_datetime(merge_data['begin_date'])
merge_data['order_date'] = pd.to_datetime(merge_data['order_date'])
merge_data['begin_date_month'] = merge_data["begin_date"].dt.month
merge_data['order_date_month'] = merge_data["order_date"].dt.month
merge_data['order_date_dayofweek'] = merge_data['order_date'].dt.dayofweek
merge_data['begin_date_dayofweek'] = merge_data['begin_date'].dt.dayofweek
merge_data['order_date_isweekend'] = 0
merge_data['begin_date_isweekend'] = 0
merge_data.loc[merge_data['order_date_dayofweek'] == 5, ['order_date_isweekend']] = 1
merge_data.loc[merge_data['order_date_dayofweek'] == 6, ['order_date_isweekend']] = 1
# note: the next two lines also set 'order_date_isweekend' rather than 'begin_date_isweekend'
merge_data.loc[merge_data['begin_date_dayofweek'] == 5, ['order_date_isweekend']] = 1
merge_data.loc[merge_data['begin_date_dayofweek'] == 6, ['order_date_isweekend']] = 1

# Flight handling
# outbound departure time and return arrival time
go_fly = airline[["group_id", "fly_time", "arrive_time"]]
go_fly['fly_time'] = airline['fly_time'].groupby(airline['group_id']).transform('min')
go_fly['fly_time'] = pd.to_datetime(go_fly['fly_time'])
go_fly['arrive_time'] = airline['arrive_time'].groupby(airline['group_id']).transform('max')
go_fly['arrive_time'] = pd.to_datetime(go_fly['arrive_time'])
go_fly = go_fly.drop_duplicates()
merge_data = merge_data.merge(go_fly, on=['group_id'], how='left')

# Number of flights in the whole itinerary
count = airline.groupby(['group_id']).size().to_frame("fly_count")
merge_data = merge_data.merge(count, on=['group_id'], how='left')

# Drop unused columns
merge_data.drop(['source_1'], axis=1, inplace=True)
merge_data.drop(['source_2'], axis=1, inplace=True)
merge_data.drop(['unit'], axis=1, inplace=True)
merge_data.drop(['area'], axis=1, inplace=True)
merge_data.drop(['sub_line'], axis=1, inplace=True)
merge_data.drop(['promotion_prog'], axis=1, inplace=True)

training_set = pd.read_csv(train_set_file_name)
merge_data = merge_data.merge(training_set, on=['order_id'], how='left')
merge_data = merge_data.dropna()  # drop rows with missing values

print(merge_data.info())
merge_data.to_csv(total_dataset, index=False)
...
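prepare_data.py is meant to be run as a script: it reads five positional command-line arguments (the order, group, airline and training-set CSV paths, plus the output path), so a hypothetical invocation would look like python prepare_data.py order.csv group.csv airline.csv train.csv total.csv, where the file names are placeholders. Its first derived feature uses groupby(...).transform('min') to attach the cheapest price per product to every row; the toy table below is invented to show just that step.

import pandas as pd

# Invented toy 'group' table: two prices for tour_a, one for tour_b.
group = pd.DataFrame({'product_name': ['tour_a', 'tour_a', 'tour_b'],
                      'price': [120, 100, 300]})

# transform('min') returns a value for every row, aligned to the original index.
group['product_name_price_min'] = (group['price']
                                   .groupby(group['product_name'])
                                   .transform('min'))
print(group)  # both tour_a rows get product_name_price_min == 100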


Automation Testing Tutorials

Learn to execute automation testing from scratch with the LambdaTest Learning Hub: from setting up the prerequisites to run your first automation test, through following best practices, to diving deeper into advanced test scenarios. The LambdaTest Learning Hub compiles step-by-step guides to help you become proficient with different test automation frameworks such as Selenium, Cypress, and TestNG.

LambdaTest Learning Hubs:

YouTube

You can also refer to the video tutorials on the LambdaTest YouTube channel for step-by-step demonstrations from industry experts.

