How to use test_save method in autotest

Best Python code snippets using autotest_python. Note that in the examples below, test_save is an ordinary variable name (a pandas DataFrame slice in the first snippet, an output folder name in the second) rather than an autotest framework method; the snippets show how it is created and written out.
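Across both snippets, test_save is simply the held-out slice of a dataset that gets written to disk. A minimal, self-contained sketch of that pattern (the DataFrame contents and test_id value below are invented stand-ins, not values from the source):

import pandas as pd

# stand-in for the combined train+test frame built in the first snippet
save = pd.DataFrame({'click': [0, 1, 0, 1, 1], 'C1': [1005, 1002, 1005, 1010, 1002]})
test_id = 2  # number of test rows appended at the end of the frame

test_save = save[(-test_id):]                  # the last test_id rows form the test set
test_save.to_csv('test_rows.csv', index=False)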

data_preprocessing.py

Source: data_preprocessing.py (GitHub)

# -*- coding: utf-8 -*-
"""
Created on Mon Jun 11 17:45:28 2018
@author: huang
"""
import sys
import csv
import time
import collections
import numpy as np
import pandas as pd
import logging
#sys.path.append('..')
#sys.path.append('D:/GitHub/jhye_tool')
from flags import FLAGS, unparsed
sys.path.append(FLAGS.tool_ml_dir)
from ml.ml_utils import *
from joblib import dump, load, Parallel, delayed
from sklearn.model_selection import train_test_split
import random
from sklearn.utils import shuffle
from sklearn.preprocessing import OneHotEncoder
import xgboost as xgb
from sklearn.decomposition import PCA
import gc
import lightgbm as lgb

logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s', level=logging.DEBUG)

train_set_path = FLAGS.train_set_path
output = FLAGS.output_dir


def def_user(src_data):
    src_data['uid'] = src_data['device_id'].map(str).values + src_data['device_ip'].map(str).values + '-' + src_data['device_model'].map(str).values


def def_user_one_day_hour(src_data):
    src_data['uid_time'] = src_data['uid'].values + '-' + src_data['one_day_hour'].map(str).values


# hour-based time features
def anly_hour(src_data):
    src_data['date'] = pd.to_datetime((src_data['hour'] / 100).map(int) + 20000000, format='%Y%m%d')
    logging.debug(src_data['date'].unique())
    src_data['one_day'] = src_data['date'].dt.day
    logging.debug(src_data['one_day'].unique())
    src_data['one_day_hour'] = src_data['date'].dt.hour
    src_data['week_day'] = src_data['date'].dt.dayofweek
    src_data['day_hour_prev'] = src_data['one_day_hour'] - 1
    src_data['day_hour_next'] = src_data['one_day_hour'] + 1
    src_data['is_work_day'] = src_data['week_day'].apply(lambda x: 1 if x in [0, 1, 2, 3, 4] else 0)
#    src_data[src_data['is_work_day']==0]
    src_data.drop(['date'], axis=1, inplace=True)
    src_data.drop(['week_day'], axis=1, inplace=True)

    date_list = ['one_day', 'one_day_hour', 'day_hour_prev', 'day_hour_next', 'is_work_day']

    src_data[date_list].to_csv(FLAGS.tmp_data_path + 'date_list.csv', index=False)


def drop_limit_10(train, col_name):
    return dict((key, -1) if value < 10 else (key, value) for key, value in dict(train[col_name].value_counts()).items())


# categorical features that can be counted directly
def cat_features_cnt(src_data):
    id_cnt = collections.defaultdict(int)
    ip_cnt = collections.defaultdict(int)
    user_cnt = collections.defaultdict(int)
    user_hour_cnt = collections.defaultdict(int)
    id_cnt = drop_limit_10(src_data, 'device_id')
    logging.debug(len(id_cnt))
    ip_cnt = drop_limit_10(src_data, 'device_ip')
    logging.debug(len(ip_cnt))
    def_user(src_data)
    user_cnt = drop_limit_10(src_data, 'uid')
    logging.debug(len(user_cnt))
    def_user_one_day_hour(src_data)
    user_hour_cnt = drop_limit_10(src_data, 'uid_time')
    logging.debug(len(user_hour_cnt))

    return id_cnt, ip_cnt, user_cnt, user_hour_cnt


def col_one_hot(train):
    for _col in train.columns.values.tolist():
        logging.debug(_col)
        if train[_col].dtypes == 'object':
            ont = train[_col].astype('category').values.codes
            logging.debug(ont)
            train[_col] = ont
    return


FIELDS = ['C1', 'click', 'app_id', 'site_id', 'banner_pos', 'device_id', 'device_ip', 'device_model', 'device_conn_type', 'C14', 'C17', 'C20', 'C21']
#DATE_FIELDS=['one_day','date_time','day_hour_prev','one_day_hour','app_or_web','day_hour_next','app_site_id']
#NEW_FIELDS = FIELDS+DATE_FIELDS+['pub_id','pub_domain','pub_category','device_id_count','device_ip_count','user_count','smooth_user_hour_count','user_click_histroy']
#exptv_vn_list=['device_id','device_ip','C14','C17','C21',
#               'app_domain','site_domain','site_id','app_id','device_model','hour']
category_list = ['app_or_web', 'device_ip', 'app_site_id', 'device_model', 'app_site_model', 'C1', 'C14', 'C17', 'C21',
                 'device_type', 'device_conn_type', 'app_site_model_aw', 'dev_ip_app_site']

#exptv_vn_list=['C14','C17','C21','site_domain','device_model']


def add_col_cnt(src_data, col_name, cnt):
    vn = col_name + '_cnt'
    src_data[vn] = np.zeros(src_data.shape[0])
    func = lambda x: cnt[x]
    src_data[vn] = src_data[col_name].apply(func)
    logging.debug(src_data[vn].head())


# categorical features that can be derived from a single record
def one_line_data_preprocessing(x=25, is_train=True):
    src_data = pd.read_csv(FLAGS.tmp_data_path + 'train' + str(x) + '/' + 'train_test.csv')
    anly_hour(src_data)
    logging.debug(src_data.shape)
    id_cnt, ip_cnt, user_cnt, user_hour_cnt = cat_features_cnt(src_data)

    add_col_cnt(src_data, 'device_id', id_cnt)
    add_col_cnt(src_data, 'device_ip', ip_cnt)
    add_col_cnt(src_data, 'uid', user_cnt)
    add_col_cnt(src_data, 'uid_time', user_hour_cnt)
    col_one_hot(src_data)
    procdess_col(src_data, 'app_id')
    procdess_col(src_data, 'site_id')
    procdess_col(src_data, 'app_domain')
    procdess_col(src_data, 'app_category')
#    procdess_col(src_data, 'site_id')
    num_writeheader_list = []
    cat_writeheader_list = []
    date_list = []
    for col in src_data.columns.values.tolist():
        if col in category_list:
            cat_writeheader_list.append(col)
        elif 'day' in col:
#            pass
            date_list.append(col)
        else:
            num_writeheader_list.append(col)
    src_data[cat_writeheader_list].to_csv(FLAGS.tmp_data_path + 'train' + str(x) + '/' + 'cat_features.csv', index=False)
    src_data[date_list].to_csv(FLAGS.tmp_data_path + 'train' + str(x) + '/date_list.csv', index=False)
    src_data[num_writeheader_list].to_csv(FLAGS.tmp_data_path + 'train' + str(x) + '/' + 'num_features.csv', index=False)
    del src_data
    return 'cat_features.csv', 'date_list.csv', 'num_features.csv'


def two_features_data_preprocessing(x, is_train=True):

    logging.debug("to add some basic features ...")

    # join categorical features pairwise
    calc_exptv(x, category_list, add_count=True)
    new_expvn = calc_exptv_cnt(x)
    return new_expvn


# compute the weight of each feature
def new_features_w(is_train=True):

    data = pd.read_csv(FLAGS.tmp_data_path + 'num_features.csv')
    new_expvn = ['C15', 'C16', 'C18', 'C19', 'C20', ]
    src_data = data[new_expvn]
    src_data['click'] = data['click'].values
    del data
    src_data = data_concat(src_data, FLAGS.tmp_data_path + 'date_list.csv')
    # prior strength in the posterior-mean encoding, assigned at random
    n_ks = {}
    for x in new_expvn:
        n_ks[x] = np.random.uniform(1, 500, 1)
    # initialisation
    exp2_dict = {}
    for vn in new_expvn:
        exp2_dict[vn] = np.zeros(src_data.shape[0])
    days_npa = src_data.one_day.values

    for day_v in range(22, 32):
        # split the training data into the days before day_v and day_v itself
        day_v_before = src_data.loc[src_data.one_day.values < day_v, :].copy()

        # records of the current day form the validation set
        day_v_now = src_data.loc[src_data.one_day.values == day_v, :]
        logging.debug("Validation day: %s, train data shape: %s, validation data shape: %s",
                      day_v, day_v_before.shape, day_v_now.shape)

        # initialise each sample's prior on y to the mean click rate
        pred_prev = day_v_before.click.values.mean() * np.ones(day_v_before.shape[0])

        for vn in new_expvn:
            if 'exp2_' + vn in day_v_before.columns.values.tolist():  # already present: drop and recompute
                day_v_before.drop('exp2_' + vn, inplace=True, axis=1)

        for i in range(3):
            for vn in new_expvn:
                # probability of the feature column given y
                p1 = calcLeaveOneOut2(day_v_before, vn, 'click', n_ks[vn], 0, 0.25, mean0=pred_prev)
                pred = pred_prev * p1
                logging.debug("%s %s %s change = %s", day_v, i, vn, ((pred - pred_prev) ** 2).mean())
                pred_prev = pred

        # prior on y
        pred1 = day_v_before.click.values.mean()
        for vn in new_expvn:
            logging.debug("%s merge %s %s", "=" * 20, day_v, vn)
            diff1 = mergeLeaveOneOut2(day_v_before, day_v_now, vn)
            pred1 *= diff1
            exp2_dict[vn][days_npa == day_v] = diff1

        pred1 *= day_v_before.click.values.mean() / pred1.mean()
        logging.debug("logloss = %s", logloss(pred1, day_v_now.click.values))
    t1 = pd.DataFrame(np.zeros(src_data.shape[0]), columns=['click', ])
    for vn in new_expvn:
        t1['exp2_' + vn] = exp2_dict[vn]
    t1.drop('click', axis=1, inplace=True)
    t1.to_csv(FLAGS.tmp_data_path + 'new_features_w.csv', index=False)
    return


def data_concat(src_data, dst_data_path, nrows=0, usecols=None, is_train=True):
    if usecols is not None:
        Reader_ = pd.read_csv(dst_data_path, usecols=usecols)
    elif nrows != 0:
        Reader_ = pd.read_csv(dst_data_path, nrows=nrows)
    else:
        Reader_ = pd.read_csv(dst_data_path)
    Reader_.drop('id', axis=1, inplace=True, errors='ignore')

    logging.debug('data1.shape:' + str(src_data.shape))
    logging.debug('data2.shape:' + str(Reader_.shape))
    start = time.time()
    src_data = pd.concat([src_data, Reader_], axis=1)
    logging.debug('result.shape:' + str(src_data.shape))
    logging.debug('elapsed: ' + str(time.time() - start))
#    return NEW_FIELDS
    return src_data


def concat_train_test(src_path, test_path):
    train = pd.read_csv(src_path, dtype={'id': object, })
    t5 = pd.DataFrame(train['id'].map(int), columns=['id', ])
#    logging.debug(t5.head(5))
#    t5.to_csv(FLAGS.tmp_data_path+'train_id.csv', index=False)
    col_cnts = {}
    col_cnts['train'] = (t5.shape[0])
    logging.debug(train.shape)
    del t5

    # shuffle and downsample the training set
#    train = shuffle(train)
#    train = train.sample(frac=0.05).reset_index(drop=True)

    test = pd.read_csv(test_path, dtype={'id': object, })
    test['click'] = 0  # add a click column to the test samples, initialised to 0
    t6 = pd.DataFrame(test['id'].map(str), columns=['id', ])
    logging.debug(test['id'].map(str).head(5))
    logging.debug(t6.head(5))
    t6.to_csv(FLAGS.tmp_data_path + 'test_id.csv', index=False)
    col_cnts['test'] = (t6.shape[0])
    logging.debug(col_cnts)
    ret = dump(col_cnts, FLAGS.tmp_data_path + 'test_index.joblib_dat')
    del t6

    logging.debug(test.shape)

#    train.drop('id', axis=1, inplace=True, errors='ignore')
#    test.drop('id', axis=1, inplace=True, errors='ignore')
    # concatenate the train and test samples so feature engineering runs on both at once
    train = pd.concat([train, test])

    train['app_or_web'] = '0'
    # if app_id == 'ecad2386', set app_or_web = 1
    train.loc[train.app_id.values == 'ecad2386', 'app_or_web'] = '1'
    train['app_site_id'] = np.add(train.app_id.values, train.site_id.values)
    train['app_site_model'] = np.add(train.device_model.values, train.app_site_id.values)
    train['app_site_model_aw'] = np.add(train.app_site_model.values, train.app_or_web.values)
    train['dev_ip_app_site'] = np.add(train.device_ip.values, train.app_site_id.values)
    logging.debug(train.shape)

    train.to_csv(FLAGS.tmp_data_path + 'train_test.csv', index=False)
    return 0


def features_by_chick(x):

    train_save = pd.read_csv(FLAGS.tmp_data_path + 'train' + str(x) + '/' + 'cat_features.csv')
    train_save = data_concat(train_save, FLAGS.tmp_data_path + 'train' + str(x) + '/' + 'date_list.csv')
    train_save = data_concat(train_save, FLAGS.tmp_data_path + 'train' + str(x) + '/' + 'click.csv')
    train_save = data_concat(train_save, FLAGS.tmp_data_path + 'train' + str(x) + '/' + 'two_col_join.csv')

    logging.debug(train_save['one_day'].unique())
    vns = [vn for vn in train_save.columns.values if 'day' not in vn]
    # prior strength in the posterior-mean encoding
    n_ks = {'app_or_web': 100, 'app_site_id': 100, 'device_ip': 10, 'C14': 50, 'app_site_model': 50, 'device_id': 50,
            'C17': 100, 'C21': 100, 'C1': 100, 'device_type': 100, 'device_conn_type': 100, 'banner_pos': 100,
            'app_site_model_aw': 100, 'one_day': 100, 'dev_ip_app_site': 10, 'device_model': 500, 'click': 1}

#    vns = list(n_ks.keys())
    logging.debug(vns)
    logging.debug(train_save.one_day.unique())

    # train & test days
    train_save = train_save.loc[np.logical_and(train_save.one_day.values >= 21, train_save.one_day.values < 32), :]
    # concatenate two features into a new one
    train_save['app_site_model'] = np.add(train_save.device_model.values, train_save.app_site_id.values)
    train_save['app_site_model_aw'] = np.add(train_save.app_site_model.values, train_save.app_or_web.values)
    train_save['dev_ip_app_site'] = np.add(train_save.device_ip.values, train_save.app_site_id.values)

    logging.debug(train_save.shape)
#    logging.debug(train_save.one_day.values)

    for vn in vns:
        if vn in n_ks:
            pass
        else:
            n_ks[vn] = 100
            logging.debug(vn)

    # initialisation
    exp2_dict = {}
    for vn in vns:
        exp2_dict[vn] = np.zeros(train_save.shape[0])

    days_npa = train_save.one_day.values
    logging.debug(days_npa)

    for day_v in range(22, 32):
        # days before day_v (hence starting from 22) form the training set
        logging.debug(train_save['one_day'])
        df1 = train_save.loc[np.logical_and(train_save.one_day.values < day_v, True), :].copy()
        logging.debug(df1.shape)
        # records of the current day form the validation set
        df2 = train_save.loc[train_save.one_day.values == day_v, :]
        logging.debug(df2.shape)
        print("Validation day:", day_v, ", train data shape:", df1.shape, ", validation data shape:", df2.shape)

        # each sample's prior on y equals the mean click rate
        pred_prev = df1.click.values.mean() * np.ones(df1.shape[0])

        for vn in vns:
            if 'exp2_' + vn in df1.columns:  # already present: drop and recompute
                df1.drop('exp2_' + vn, inplace=True, axis=1)

        for i in range(3):
            for vn in vns:
                p1 = calcLeaveOneOut2(df1, vn, 'click', n_ks[vn], 0, 0.25, mean0=pred_prev)
                pred = pred_prev * p1
                print(day_v, i, vn, "change = ", ((pred - pred_prev) ** 2).mean())
                pred_prev = pred

        # prior on y
        pred1 = df1.click.values.mean()
        for vn in vns:
            print("=" * 20, "merge", day_v, vn)
            diff1 = mergeLeaveOneOut2(df1, df2, vn)
            pred1 *= diff1
            exp2_dict[vn][days_npa == day_v] = diff1

        pred1 *= df1.click.values.mean() / pred1.mean()
        print("logloss = ", logloss(pred1, df2.click.values))
        #print my_lift(pred1, None, df2.click.values, None, 20, fig_size=(10, 5))
        #plt.show()
    exp_list = []
    for vn in vns:
        train_save['exp2_' + vn] = exp2_dict[vn]
        exp_list.append('exp2_' + vn)
    train_save[exp_list].to_csv(FLAGS.tmp_data_path + 'train' + str(x) + '/' + 'exp_features.csv', index=False)
    del train_save


def ouwenzhang():
    train_save = pd.read_csv(FLAGS.tmp_data_path + 'cat_features.csv')
    train_save = data_concat(train_save, FLAGS.tmp_data_path + 'date_list.csv')
    train_save = data_concat(train_save, FLAGS.tmp_data_path + 'click.csv')

    ori_col = train_save.columns.values
    print("to count prev/current/next hour by ip ...")
    cntDualKey(train_save, 'device_ip', None, 'one_day', 'day_hour_prev', fill_na=0)
    cntDualKey(train_save, 'device_ip', None, 'one_day', 'one_day', fill_na=0)
    cntDualKey(train_save, 'device_ip', None, 'one_day', 'day_hour_next', fill_na=0)

    print("to create day diffs")
    train_save['pday'] = train_save.one_day - 1
    calcDualKey(train_save, 'device_ip', None, 'one_day', 'pday', 'click', 10, None, True, True)
#    train_save['cnt_diff_device_ip_day_pday'] = train_save.cnt_device_ip_day.values - train_save.cnt_device_ip_pday.values
    train_save['hour1_web'] = train_save.one_day_hour.values
    train_save.loc[train_save.app_or_web.values == 0, 'hour1_web'] = -1
#    train_save['app_cnt_by_dev_ip'] = my_grp_cnt(train_save.device_ip.values, train_save.app_id.values)

    train_save['hour1'] = np.round(train_save.one_day_hour.values % 100)
#    train_save['cnt_diff_device_ip_day_pday'] = train_save.cnt_device_ip_day.values - train_save.cnt_device_ip_pday.values

#    train_save['rank_dev_ip'] = my_grp_idx(train_save.device_ip.values, train_save.id.values)
    train_save['rank_day_dev_ip'] = my_grp_idx(np.add(train_save.device_ip.values, train_save.day.values), train_save.id.values)
#    train_save['rank_app_dev_ip'] = my_grp_idx(np.add(train_save.device_ip.values, train_save.app_id.values), train_save.id.values)

    train_save['cnt_dev_ip'] = get_agg(train_save.device_ip.values, train_save.id, np.size)
    train_save['cnt_dev_id'] = get_agg(train_save.device_id.values, train_save.id, np.size)

    train_save['dev_id_cnt2'] = np.minimum(train_save.cnt_dev_id.astype('int32').values, 300)
    train_save['dev_ip_cnt2'] = np.minimum(train_save.cnt_dev_ip.astype('int32').values, 300)

    train_save['dev_id2plus'] = train_save.device_id.values
    train_save.loc[train_save.cnt_dev_id.values == 1, 'dev_id2plus'] = '___only1'
    train_save['dev_ip2plus'] = train_save.device_ip.values
    train_save.loc[train_save.cnt_dev_ip.values == 1, 'dev_ip2plus'] = '___only1'

    train_save['diff_cnt_dev_ip_hour_phour_aw2_prev'] = (train_save.cnt_device_ip_day_hour.values - train_save.cnt_device_ip_day_hour_prev.values) * (train_save.app_or_web * 2 - 1)
    train_save['diff_cnt_dev_ip_hour_phour_aw2_next'] = (train_save.cnt_device_ip_day_hour.values - train_save.cnt_device_ip_day_hour_next.values) * (train_save.app_or_web * 2 - 1)

    now_col = train_save.columns.values
    new_col = [x for x in now_col if x not in ori_col]
    print("to save train_save ...")

    train_save[new_col].to_csv(FLAGS.tmp_data_path + 'idx_features.csv', index=False)
    del train_save


def click_to_csv():
    num_features = pd.read_csv(FLAGS.tmp_data_path + 'num_features.csv')
    t4 = pd.DataFrame(num_features['click'].values, columns=['click', ])
    t4.to_csv(FLAGS.tmp_data_path + 'click.csv', index=False)
    del t4
    return True


def get_train_split():
#    click = pd.read_csv(FLAGS.tmp_data_path+'click.csv')
    test_index = load(FLAGS.tmp_data_path + 'test_index.joblib_dat')
    test_id = test_index['test']
    train_id = test_index['train']
#    train_click = click[:train_id]
#    filter_1 = np.logical_and(train_click.click.values > 0, True)
#    filter_0 = np.logical_and(train_click.click.values == 0, True)
#    files_name = ['click.csv', 'cat_features.csv', 'date_list.csv', 'num_features.csv', 'two_col_join_cnt.csv', 'two_col_join.csv']
    files_name = ['train_test.csv']

    logging.debug(files_name)
    for file in files_name:
        save = pd.read_csv(FLAGS.tmp_data_path + file)
        test_save = save[(-test_id):]
        test_save.to_csv(FLAGS.tmp_data_path + 'test/' + file, index=False)
        logging.debug(test_save.shape)
        train_save = save[:train_id]
        for x in [100, 299, 799, 1537]:
            np.random.seed(x)
            r1 = np.random.uniform(0, 1, train_save.shape[0])  # one uniform draw per training row
            train_ = train_save.loc[r1 < 0.13, :]
            logging.debug("testing with small sample of training data, {}".format(train_.shape))
#            train_0 = train_.loc[filter_0, :]
#            train_1 = train_.loc[filter_1, :]
#            prc = train_1.shape[0] / train_0.shape[0]
#            train_1 = train_1.sample(frac=0.5).reset_index(drop=True)
#            logging.debug(train_1.shape)
            logging.debug(file)
            logging.debug(x)
#            sampler = np.random.randint(0, train_0.shape[0], size=int(int(train_1.shape[0]) / prc))
#            train_0 = train_0.take(sampler)
            train = pd.concat([train_, test_save])
#            train = shuffle(train)
            train = train.sample(frac=1).reset_index(drop=True)
            logging.debug(train.shape)
            train.to_csv(FLAGS.tmp_data_path + 'train' + str(x) + '/' + file, index=False)
            del train
            del train_
#            del train_1
#            del sampler


def get_train_test_split():
    test_index = load(FLAGS.tmp_data_path + 'test_index.joblib_dat')
    train_id = test_index['train']
    test_id = test_index['test']
    files_name = ['click.csv', 'cat_features.csv', 'date_list.csv', 'num_features.csv', 'two_col_join_cnt.csv', 'two_col_join.csv']

    logging.debug(files_name)
    for file in files_name:
        save = pd.read_csv(FLAGS.tmp_data_path + file)
        test_save = save[(-test_id):]
        test_save.to_csv(FLAGS.tmp_data_path + 'test/' + file, index=False)
        logging.debug(test_save.shape)
        train_save = save[:-1 * test_id]
#        train_save = train_save.sample(frac=0.005).reset_index(drop=True)
        logging.debug(train_save.shape)
        train_save.to_csv(FLAGS.tmp_data_path + 'train25' + '/' + file, index=False)
        del train_save
        del save


def gdbt_data_get_train(seed=25):
    train_save = pd.read_csv(FLAGS.tmp_data_path + 'train' + str(seed) + '/cat_features.csv')
    train_save = data_concat(train_save, FLAGS.tmp_data_path + 'train' + str(seed) + '/date_list.csv')
#    train_save = data_concat(train_save, FLAGS.tmp_data_path + 'train' + str(seed) + '/num_features.csv')
    train_save = data_concat(train_save, FLAGS.tmp_data_path + 'train' + str(seed) + '/click.csv')
    train_save = data_concat(train_save, FLAGS.tmp_data_path + 'train' + str(seed) + '/two_col_join.csv')
#    train_save = data_concat(train_save, FLAGS.tmp_data_path + 'train' + str(seed) + '/two_col_join_cnt.csv')
    train_save = data_concat(train_save, FLAGS.tmp_data_path + 'train' + str(seed) + '/xgb_new_features.csv')
    logging.debug(train_save.columns)
    logging.debug(train_save.shape)
    train_save.drop('id', axis=1, inplace=True, errors='ignore')

    return train_save


def gdbt_data_get_test():
    test_save = pd.read_csv(FLAGS.tmp_data_path + 'test/cat_features.csv')
    test_save = data_concat(test_save, FLAGS.tmp_data_path + 'test/date_list.csv')
#    test_save = data_concat(test_save, FLAGS.tmp_data_path + 'test/num_features.csv')
    test_save = data_concat(test_save, FLAGS.tmp_data_path + 'test/click.csv')
    test_save = data_concat(test_save, FLAGS.tmp_data_path + 'test/two_col_join.csv')
#    test_save = data_concat(test_save, FLAGS.tmp_data_path + 'test/two_col_join_cnt.csv')
    test_save = data_concat(test_save, FLAGS.tmp_data_path + 'test/xgb_new_features.csv')
    logging.debug(test_save.shape)
    test_save.drop('id', axis=1, inplace=True, errors='ignore')

    test_save.drop('click', axis=1, inplace=True)
    return test_save


def lr_data_get(test_path):
    train_save = pd.read_csv(FLAGS.tmp_data_path + 'train1537/cat_features.csv')
    train_save = data_concat(train_save, FLAGS.tmp_data_path + 'train1537/date_list.csv')
    train_save = data_concat(train_save, FLAGS.tmp_data_path + 'train1537/num_features.csv')
#    train_save = data_concat(train_save, FLAGS.tmp_data_path + 'train100/click.csv')
    train_save = data_concat(train_save, FLAGS.tmp_data_path + 'train1537/two_col_join.csv')
    train_save = data_concat(train_save, FLAGS.tmp_data_path + 'train1537/two_col_join_cnt.csv')
    logging.debug(train_save.columns)
#    logging.debug(train_save['id'])
    test_save = pd.read_csv(FLAGS.tmp_data_path + 'test/cat_features.csv')
    test_save = data_concat(test_save, FLAGS.tmp_data_path + 'test/date_list.csv')
    test_save = data_concat(test_save, FLAGS.tmp_data_path + 'test/num_features.csv')
#    test_save = data_concat(test_save, FLAGS.tmp_data_path + 'test/click.csv')
    test_save = data_concat(test_save, FLAGS.tmp_data_path + 'test/two_col_join.csv')
    test_save = data_concat(test_save, FLAGS.tmp_data_path + 'test/two_col_join_cnt.csv')
    logging.debug(train_save.shape)
    logging.debug(test_save.shape)
    train_save.drop('id', axis=1, inplace=True, errors='ignore')
    test_save.drop('id', axis=1, inplace=True, errors='ignore')

    test_save.drop('click', axis=1, inplace=True)
    return train_save, test_save


def lightgbm_data_get(test_path):
    train_save = pd.read_csv(FLAGS.tmp_data_path + 'train100/cat_features.csv')
    train_save = data_concat(train_save, FLAGS.tmp_data_path + 'train100/date_list.csv')
    train_save = data_concat(train_save, FLAGS.tmp_data_path + 'train100/num_features.csv')
#    train_save = data_concat(train_save, FLAGS.tmp_data_path + 'train100/click.csv')
    train_save = data_concat(train_save, FLAGS.tmp_data_path + 'train100/two_col_join.csv')
    train_save = data_concat(train_save, FLAGS.tmp_data_path + 'train100/two_col_join_cnt.csv')
    logging.debug(train_save.columns)
#    logging.debug(train_save['id'])
    test_save = pd.read_csv(FLAGS.tmp_data_path + 'test/cat_features.csv')
    test_save = data_concat(test_save, FLAGS.tmp_data_path + 'test/date_list.csv')
    test_save = data_concat(test_save, FLAGS.tmp_data_path + 'test/num_features.csv')
#    test_save = data_concat(test_save, FLAGS.tmp_data_path + 'test/click.csv')
    test_save = data_concat(test_save, FLAGS.tmp_data_path + 'test/two_col_join.csv')
    test_save = data_concat(test_save, FLAGS.tmp_data_path + 'test/two_col_join_cnt.csv')
#    logging.debug(test_save.shape)
    logging.debug(train_save.shape)
    logging.debug(test_save.shape)
    train_save.drop('id', axis=1, inplace=True, errors='ignore')
    test_save.drop('id', axis=1, inplace=True, errors='ignore')

    print(train_save.shape)
    y_train = train_save['click']
    train_save.drop('click', axis=1, inplace=True)
    X_train = train_save

    test_save.drop('click', axis=1, inplace=True)
    X_test = test_save

    X_train_part, X_val, y_train_part, y_val = train_test_split(X_train, y_train, train_size=0.9, random_state=0)
    logging.debug(X_train_part.head(1))
    logging.debug(y_train_part.head(1))
    ### dataset conversion
    lgb_train = lgb.Dataset(X_train_part, y_train_part, free_raw_data=False)
    lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train, free_raw_data=False)
    return lgb_train, lgb_eval, X_test, X_val, y_val


def tiny_lightgbm_data_get_train(seed=25):
    train_save = pd.read_csv(FLAGS.tmp_data_path + 'train' + str(seed) + '/cat_features.csv')
    train_save = data_concat(train_save, FLAGS.tmp_data_path + 'train' + str(seed) + '/date_list.csv')
    train_save = data_concat(train_save, FLAGS.tmp_data_path + 'train' + str(seed) + '/num_features.csv')
#    train_save = data_concat(FLAGS.tmp_data_path + 'train' + str(seed) + '/click.csv')
    train_save = data_concat(train_save, FLAGS.tmp_data_path + 'train' + str(seed) + '/two_col_join.csv')
    train_save = data_concat(train_save, FLAGS.tmp_data_path + 'train' + str(seed) + '/two_col_join_cnt.csv')
    train_save = data_concat(train_save, FLAGS.tmp_data_path + 'train' + str(seed) + '/xgb_new_features.csv')
    logging.debug(train_save.columns)
#    logging.debug(train_save['id'])
#    logging.debug(test_save.shape)
    logging.debug(train_save.shape)
    train_save.drop('id', axis=1, inplace=True, errors='ignore')

    print(train_save.shape)
    y_train = train_save['click']
    train_save.drop('click', axis=1, inplace=True)
    X_train = train_save

    X_train_part, X_val, y_train_part, y_val = train_test_split(X_train, y_train, train_size=0.9, random_state=0)
    logging.debug(X_train_part.head(1))
    logging.debug(y_train_part.head(1))
    ### dataset conversion
    lgb_train = lgb.Dataset(X_train_part, y_train_part, free_raw_data=False)
    lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train, free_raw_data=False)
    return lgb_train, lgb_eval, X_val, y_val


def tiny_lightgbm_data_get_test():
    test_save = pd.read_csv(FLAGS.test_data_path + 'cat_features.csv')
    test_save = data_concat(test_save, FLAGS.test_data_path + 'date_list.csv')
    test_save = data_concat(test_save, FLAGS.test_data_path + 'num_features.csv')
#    test_save = data_concat(FLAGS.tmp_data_path + 'test/click.csv')
    test_save = data_concat(test_save, FLAGS.test_data_path + 'two_col_join.csv')
    test_save = data_concat(test_save, FLAGS.test_data_path + 'two_col_join_cnt.csv')
    test_save = data_concat(test_save, FLAGS.test_data_path + 'xgb_new_features.csv')
    logging.debug(test_save.shape)
    test_save.drop('id', axis=1, inplace=True, errors='ignore')

    test_save.drop('click', axis=1, inplace=True)

    return test_save


def pandas_onehot(df, col):
    df = pd.get_dummies(df, columns=col)
    return df


def sklearn_onehoot(df, col):
    enc = OneHotEncoder()
    enc.fit(df)
    data = enc.transform(df).toarray()
    return data


columns_all = ['C14', 'C17', 'C21', 'device_model', 'site_domain',
               'C1', 'C15', 'C16', 'C18', 'C19', 'C20', 'app_category',
               'app_domain', 'app_id', 'banner_pos', 'device_conn_type',
               'device_id', 'device_ip', 'device_type', 'hour', 'site_category',
               'site_id', 'uid', 'uid_time', 'device_id_cnt', 'device_ip_cnt',
               'uid_cnt', 'uid_time_cnt', 'C14C17', '_key1', 'C14device_model',
               'C14C21', 'C14site_domain', 'C17device_model', 'C17C21', 'C17site_domain',
               'C21device_model', 'C21site_domain', 'site_domaindevice_model', 'cnttv_C14C17',
               'cnttv_C14device_model', 'cnttv_C14C21', 'cnttv_C14site_domain', 'cnttv_C17device_model',
               'cnttv_C17C21', 'cnttv_C17site_domain', 'cnttv_C21device_model', 'cnttv_C21site_domain',
               'cnttv_site_domaindevice_model']
columns_top = ['site_id', 'hour', 'app_id', 'C19', 'device_ip_cnt', 'C20', 'site_category',
               'uid_cnt', 'app_domain', 'device_id_cnt', 'device_ip', 'C18', 'uid_time_cnt',
               'device_model', 'app_category', 'site_domain', 'C21device_model',
               'exptv_site_domaindevice_model', 'banner_pos', 'C14', 'exptv_C21device_model',
               'cnttv_C21device_model', 'cnttv_site_domaindevice_model', '_key1', 'cnttv_C14device_model',
               'exptv_C14device_model', 'C16', 'cnttv_C17device_model', 'exptv_C17device_model',
               'device_conn_type', 'device_id', 'cnttv_C14site_domain', 'cnttv_C17C21',
               'exptv_C21site_domain', 'cnttv_C21site_domain', 'C17', 'C21', 'uid', 'C17device_model',
               'cnttv_C17site_domain', 'exptv_C14site_domain', 'site_domaindevice_model', 'C21site_domain',
               'exptv_C17site_domain', 'top_1_site_id', 'cnttv_C14C17', 'exptv_C17C21', 'C1',
               'C14device_model', 'top_2_site_id', 'uid_time', 'top_5_site_id', 'C15', 'exptv_C14C17',
               'C14site_domain', 'top_1_app_id', 'cnttv_C14C21', 'C17site_domain', 'top_2_app_id',
               'device_type', 'top_10_site_id', 'exptv_C14C21']
columns_100002w = ['device_id', 'device_ip', 'device_id_cnt', 'device_ip_cnt', '_key1',
                   'C14device_model', 'C17device_model', 'site_domaindevice_model',
                   'cnttv_C14C17', 'cnttv_C14C21', 'cnttv_C14site_domain', 'cnttv_C17device_model',
                   'cnttv_C17C21', 'cnttv_C17site_domain', 'cnttv_C21device_model', 'cnttv_C21site_domain']
columns = [item for item in columns_top if item not in columns_100002w]


def col_one_hot2(train, one_field):
#    enc = OneHotEncoder()
#    enc.fit(train)

    logging.debug(train.head(2))
    logging.debug(one_field)
    now = time.time()
    logging.debug('Format Converting begin in time:...')
    logging.debug(now)
    columns = train.columns.values
    d = len(columns)
    feature_index = [i for i in range(d)]
    field_index = [0] * d
    field = []
    for col in columns:
        field.append(col)
    index = -1
    for i in range(d):
        if i == 0 or field[i] != field[i - 1]:
            index += 1
        field_index[i] = index
    fp = FLAGS.tmp_data_path + one_field + '-ont_hot_train.libffm.txt'
    with open(fp, 'w') as f:
        for row_no, row in enumerate(train.values):
            line = str(row_no)
#            row = enc.transform(row).toarray()
            logging.debug(row)
            for i in range(1, len(row)):
                if row[i] != 0:
                    # libffm-style field:feature:value triple
                    line += ' ' + "%s:%d:%d" % (one_field, i, 1) + ' '
            line += '\n'
            f.write(line)
    logging.debug('finish convert, the cost time is ')
    logging.debug(time.time() - now)
    logging.debug('[Done]')
#    return pd.DataFrame(train)


def features_index():
    pass


def train_data_ont_hot(seed=25):
    train_save = pd.read_csv(FLAGS.tmp_data_path + 'train' + str(seed) + '/num_features.csv')
#    train_save = data_concat(train_save, FLAGS.tmp_data_path + 'train' + str(seed) + '/date_list.csv')
#    train_save = data_concat(train_save, FLAGS.tmp_data_path + 'train' + str(seed) + '/num_features.csv')
#    train_save = data_concat(train_save, FLAGS.tmp_data_path + 'train' + str(seed) + '/cat_features.csv')
#    train_save = data_concat(train_save, FLAGS.tmp_data_path + 'train' + str(seed) + '/two_col_join.csv')
#    train_save = data_concat(train_save, FLAGS.tmp_data_path + 'train' + str(seed) + '/two_col_join_cnt.csv')
    train_save = data_concat(train_save, FLAGS.tmp_data_path + 'train' + str(seed) + '/xgb_new_features.csv')
    logging.debug(train_save.columns)
#    logging.debug(train_save['id'])
#    logging.debug(test_save.shape)
    logging.debug(train_save.shape)
    train_save.drop('id', axis=1, inplace=True, errors='ignore')

    logging.debug(train_save.shape)
    try:
        y_train = train_save['click']
        train_save.drop('click', axis=1, inplace=True)
    except KeyError:
        pass
    columns = train_save.columns.values
    train_save = train_save[columns]
    features = list(train_save.columns)
    for feature_index, feature in enumerate(features):
        def set_field_feature_value(row):
            # libffm-style field:feature:value triple
            return "%d:%d:%d" % (feature_index, row, 1)
        logging.debug(feature + ':cnt:' + str(train_save[feature].max()))
        now = time.time()
        logging.debug(feature + ' Format Converting begin in time:...')
        logging.debug(now)
        max_ = train_save[feature].max()
        train_save[feature] = (train_save[feature] - max_) * (-1)
        train_save[feature] = train_save[feature].apply(set_field_feature_value)
#        train_save['label'] = y_train
        logging.debug(feature + ' finish convert, the cost time is ')
        logging.debug(time.time() - now)
#        one_col = pandas_onehot(train_save.loc[:, feature], feature)
#        logging.debug(one_col.shape)
#        col_one_hot(one_col, feature)
#        del one_col
    fp = FLAGS.tmp_data_path + 'ont_hot_train.libffm.csv'
    now = time.time()
    train_save = pd.concat([y_train, train_save], axis=1)
    logging.debug(time.time() - now)
#    with open(fp, 'w') as f:
#        for y, row in zip(y_train.values, train_save.values):
#            logging.debug(row)
#            row = [str(x) for x in row]
#            line = str(y) + ' ' + ' '.join(row) + '\n'
#            f.write(line)
    train_save.to_csv(fp, sep=' ', header=False, index=False)
    logging.debug('finish convert, the cost time is ')
    logging.debug(time.time() - now)
    logging.debug('[Done]')

    logging.debug(train_save.head(2))
    logging.debug(train_save.shape)
    del train_save


def vali_data_ont_hot(seed=799):
    train_save = pd.read_csv(FLAGS.tmp_data_path + 'train' + str(seed) + '/num_features.csv')
#    train_save = data_concat(train_save, FLAGS.tmp_data_path + 'train' + str(seed) + '/date_list.csv')
#    train_save = data_concat(train_save, FLAGS.tmp_data_path + 'train' + str(seed) + '/num_features.csv')
#    train_save = data_concat(train_save, FLAGS.tmp_data_path + 'train' + str(seed) + '/cat_features.csv')
#    train_save = data_concat(train_save, FLAGS.tmp_data_path + 'train' + str(seed) + '/two_col_join.csv')
#    train_save = data_concat(train_save, FLAGS.tmp_data_path + 'train' + str(seed) + '/two_col_join_cnt.csv')
    train_save = data_concat(train_save, FLAGS.tmp_data_path + 'train' + str(seed) + '/xgb_new_features.csv')
    logging.debug(train_save.columns)
#    logging.debug(train_save['id'])
#    logging.debug(test_save.shape)
    logging.debug(train_save.shape)
    train_save.drop('id', axis=1, inplace=True, errors='ignore')

    logging.debug(train_save.shape)
    try:
        y_train = train_save['click']
        train_save.drop('click', axis=1, inplace=True)
    except KeyError:
        pass
    columns = train_save.columns.values
    train_save = train_save[columns]
    features = list(train_save.columns)
    for feature_index, feature in enumerate(features):
        def set_field_feature_value(row):
            # libffm-style field:feature:value triple
            return "%d:%d:%d" % (feature_index, row, 1)
        now = time.time()
        logging.debug(feature + ' Format Converting begin in time:...')
        logging.debug(now)
        max_ = train_save[feature].max()
        train_save[feature] = (train_save[feature] - max_) * (-1)
        train_save[feature] = train_save[feature].apply(set_field_feature_value)
#        train_save['label'] = y_train
        logging.debug(feature + ' finish convert, the cost time is ')
        logging.debug(time.time() - now)
#        one_col = pandas_onehot(train_save.loc[:, feature], feature)
#        logging.debug(one_col.shape)
#        col_one_hot(one_col, feature)
#        del one_col
    fp = FLAGS.tmp_data_path + 'ont_hot_vali.libffm.csv'
    train_save = pd.concat([y_train, train_save], axis=1)
#    with open(fp, 'w') as f:
#        for y, row in zip(y_train.values, train_save.values):
#            logging.debug(row)
#            row = [str(x) for x in row]
#            line = str(y) + ' ' + ' '.join(row) + '\n'
#            f.write(line)
    train_save.to_csv(fp, sep=' ', header=False, index=False)
    logging.debug('finish convert, the cost time is ')
    logging.debug(time.time() - now)
    logging.debug('[Done]')

    logging.debug(train_save.head(2))
    logging.debug(train_save.shape)
    del train_save


def test_data_ont_hot():
    test_save = pd.read_csv(FLAGS.tmp_data_path + 'test/num_features.csv')
#    test_save = data_concat(test_save, FLAGS.tmp_data_path + 'test/date_list.csv')
#    test_save = data_concat(test_save, FLAGS.tmp_data_path + 'test/num_features.csv')
#    test_save = data_concat(test_save, FLAGS.tmp_data_path + 'test/cat_features.csv')
#    test_save = data_concat(test_save, FLAGS.tmp_data_path + 'test/two_col_join.csv')
#    test_save = data_concat(test_save, FLAGS.tmp_data_path + 'test/two_col_join_cnt.csv')
    test_save = data_concat(test_save, FLAGS.tmp_data_path + 'test/xgb_new_features.csv')
    logging.debug(test_save.shape)
    test_save.drop('id', axis=1, inplace=True, errors='ignore')
    logging.debug(test_save.columns)
#    logging.debug(train_save['id'])

    logging.debug(test_save.shape)
    try:
        y_train = test_save['click']
        test_save.drop('click', axis=1, inplace=True)
    except KeyError:
        pass
    columns = test_save.columns.values
    test_save = test_save[columns]
    features = list(test_save.columns)
    for feature_index, feature in enumerate(features):
        def set_field_feature_value(row):
            # libffm-style field:feature:value triple
            return "%d:%d:%d" % (feature_index, row, 1)
        now = time.time()
        logging.debug(feature + ' Format Converting begin in time:...')
        logging.debug(now)
        max_ = test_save[feature].max()
        test_save[feature] = (test_save[feature] - max_) * (-1)
        test_save[feature] = test_save[feature].apply(set_field_feature_value)
#        train_save['label'] = y_train
        logging.debug(feature + ' finish convert, the cost time is ')
        logging.debug(time.time() - now)
#        one_col = pandas_onehot(train_save.loc[:, feature], feature)
#        logging.debug(one_col.shape)
#        col_one_hot(one_col, feature)
#        del one_col
    fp = FLAGS.tmp_data_path + 'ont_hot_test.libffm.csv'
    train_save = pd.concat([y_train, test_save], axis=1)
#    with open(fp, 'w') as f:
#        for y, row in zip(y_train.values, train_save.values):
#            logging.debug(row)
#            row = [str(x) for x in row]
#            line = str(y) + ' ' + ' '.join(row) + '\n'
#            f.write(line)
    train_save.to_csv(fp, sep=' ', header=False, index=False)
    logging.debug('finish convert, the cost time is ')
    logging.debug(time.time() - now)
    logging.debug('[Done]')

    logging.debug(train_save.head(2))
    logging.debug(train_save.shape)
    del train_save
    del test_save


def gdbt_DM_get_train(seed=25):
    train_save = pd.read_csv(FLAGS.tmp_data_path + 'train' + str(seed) + '/cat_features.csv')
    train_save = data_concat(train_save, FLAGS.tmp_data_path + 'train' + str(seed) + '/date_list.csv')
    train_save = data_concat(train_save, FLAGS.tmp_data_path + 'train' + str(seed) + '/num_features.csv')
#    train_save = data_concat(train_save, FLAGS.tmp_data_path + 'train100/click.csv')
    train_save = data_concat(train_save, FLAGS.tmp_data_path + 'train' + str(seed) + '/two_col_join.csv')
#    train_save = data_concat(train_save, FLAGS.tmp_data_path + 'train' + str(seed) + '/two_col_join_cnt.csv')
    logging.debug(train_save.columns)
    logging.debug(train_save.shape)

    train_save.drop('id', axis=1, inplace=True, errors='ignore')

    y_train = train_save['click']
    train_save.drop('click', axis=1, inplace=True)
    pca = PCA(n_components=1)

    X_train_part, X_val, y_train_part, y_val = train_test_split(train_save, y_train, train_size=0.6, random_state=7)
    pca.fit(X_train_part[:200000])
    X_train_part = pca.transform(X_train_part)
    dtrain = xgb.DMatrix(X_train_part, label=y_train_part)
    dtrain.save_binary(FLAGS.tmp_data_path + 'train' + str(seed) + '/xgboost.new_features.dtrain.joblib_dat')
    del dtrain, X_train_part, y_train_part
    gc.collect()
    X_val = pca.transform(X_val)
    dvalid = xgb.DMatrix(X_val, label=y_val)
    dvalid.save_binary(FLAGS.tmp_data_path + 'train' + str(seed) + '/xgboost.new_features.dvalid.joblib_dat')
    del dvalid, X_val, y_val
    gc.collect()
    train_save = pca.transform(train_save)
    dtv = xgb.DMatrix(train_save)
    dtv.save_binary(FLAGS.tmp_data_path + 'train' + str(seed) + '/xgboost.new_features.dtv.joblib_dat')
    del dtv, train_save
    gc.collect()
    dump(pca, FLAGS.tmp_data_path + 'pca' + '.model.joblib_dat')
    return 0


def gdbt_DM_get_test():
    test_save = pd.read_csv(FLAGS.tmp_data_path + 'test/cat_features.csv')
    test_save = data_concat(test_save, FLAGS.tmp_data_path + 'test/date_list.csv')
    test_save = data_concat(test_save, FLAGS.tmp_data_path + 'test/num_features.csv')
#    test_save = data_concat(test_save, FLAGS.tmp_data_path + 'test/click.csv')
    test_save = data_concat(test_save, FLAGS.tmp_data_path + 'test/two_col_join.csv')
#    test_save = data_concat(test_save, FLAGS.tmp_data_path + 'test/two_col_join_cnt.csv')
    logging.debug(test_save.shape)
    pca = load(FLAGS.tmp_data_path + 'pca' + '.model.joblib_dat')

    test_save.drop('id', axis=1, inplace=True, errors='ignore')

    test_save.drop('click', axis=1, inplace=True)
    test_save = pca.transform(test_save)
    dtv = xgb.DMatrix(test_save)
    dtv.save_binary(FLAGS.tmp_data_path + 'test/xgboost.new_features.test.joblib_dat')
    del dtv, test_save
    gc.collect()
    return 0


def get_PCA_train_data(seed=25):
    train_save = pd.read_csv(FLAGS.tmp_data_path + 'train' + str(seed) + '/cat_features.csv', nrows=200000)
    train_save = data_concat(train_save, FLAGS.tmp_data_path + 'train' + str(seed) + '/date_list.csv', nrows=200000)
    train_save = data_concat(train_save, FLAGS.tmp_data_path + 'train' + str(seed) + '/num_features.csv', nrows=200000)
#    train_save = data_concat(train_save, FLAGS.tmp_data_path + 'train100/click.csv', nrows=200000)
    train_save = data_concat(train_save, FLAGS.tmp_data_path + 'train' + str(seed) + '/two_col_join.csv', nrows=200000)
#    train_save = data_concat(train_save, FLAGS.tmp_data_path + 'train' + str(seed) + '/two_col_join_cnt.csv', nrows=200000)
    logging.debug(train_save.columns)
    logging.debug(train_save.shape)
    train_save.drop('id', axis=1, inplace=True, errors='ignore')

    gc.collect()
    ...
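For orientation, here is one way the loading helpers above might be driven once the intermediate CSVs exist. This is a hypothetical sketch, not code from the repository: it assumes the train25/ and test/ folders already contain the files consumed by gdbt_data_get_train and gdbt_data_get_test (click.csv, two_col_join.csv, xgb_new_features.csv, and so on), and that FLAGS.tmp_data_path points at the working directory.

# Hypothetical driver; both functions are defined in data_preprocessing.py above.
from data_preprocessing import gdbt_data_get_train, gdbt_data_get_test

train_save = gdbt_data_get_train(seed=25)   # feature frame, still contains 'click'
test_save = gdbt_data_get_test()            # same features, 'click' column dropped

y = train_save.pop('click')                 # separate the label from the features
print(train_save.shape, y.shape, test_save.shape)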

split_dataset.py

Source: split_dataset.py (GitHub)

import os
import random
import shutil

# replace this with the folder you wanna split
input_folder = 'data_rgb'

train_save = 'training_data'
test_save = 'testing_data'

test_bike_size = 0.3
test_boat_size = 0.3
test_canoe_size = 0.3
test_car_size = 0.3
test_human_size = 0.3
test_noise_size = 0.3
test_pickup_size = 0.3
test_truck_size = 0.3
test_van_size = 0.3

bike = input_folder + '/bike'
boat = input_folder + '/boat'
canoe = input_folder + '/canoe'
car = input_folder + '/car'
human = input_folder + '/human'
noise = input_folder + '/noise'
pickup = input_folder + '/pickup'
truck = input_folder + '/truck'
van = input_folder + '/van'

bike_images = []
boat_images = []
canoe_images = []
car_images = []
human_images = []
noise_images = []
pickup_images = []
truck_images = []
van_images = []

for di in os.listdir(bike):
    bike_images.append(bike + '/' + di)

for di in os.listdir(boat):
    boat_images.append(boat + '/' + di)

for di in os.listdir(canoe):
    canoe_images.append(canoe + '/' + di)

for di in os.listdir(car):
    car_images.append(car + '/' + di)

for di in os.listdir(human):
    human_images.append(human + '/' + di)

for di in os.listdir(noise):
    noise_images.append(noise + '/' + di)

for di in os.listdir(pickup):
    pickup_images.append(pickup + '/' + di)

for di in os.listdir(truck):
    truck_images.append(truck + '/' + di)

for di in os.listdir(van):
    van_images.append(van + '/' + di)

# SHUFFLE
random.shuffle(bike_images)
random.shuffle(boat_images)
random.shuffle(canoe_images)
random.shuffle(car_images)
random.shuffle(human_images)
random.shuffle(noise_images)
random.shuffle(pickup_images)
random.shuffle(truck_images)
random.shuffle(van_images)


def splittraintest(images, size):
    # hold out the last `size` fraction of the shuffled list for testing
    split_at = len(images) - int(len(images) * size)
    train_images = images[:split_at]
    test_images = images[split_at:]
    return train_images, test_images


# SPLIT
train_bike_images, test_bike_images = splittraintest(bike_images, test_bike_size)
train_boat_images, test_boat_images = splittraintest(boat_images, test_boat_size)
train_canoe_images, test_canoe_images = splittraintest(canoe_images, test_canoe_size)
train_car_images, test_car_images = splittraintest(car_images, test_car_size)
train_human_images, test_human_images = splittraintest(human_images, test_human_size)
train_noise_images, test_noise_images = splittraintest(noise_images, test_noise_size)
train_pickup_images, test_pickup_images = splittraintest(pickup_images, test_pickup_size)
train_truck_images, test_truck_images = splittraintest(truck_images, test_truck_size)
train_van_images, test_van_images = splittraintest(van_images, test_van_size)

print('SAVING IMAGES')


def crFolder(path):
    if not os.path.exists(path):
        os.mkdir(path)


crFolder(train_save)
crFolder(test_save)

crFolder(train_save + '/bike')
crFolder(train_save + '/boat')
crFolder(train_save + '/canoe')
crFolder(train_save + '/car')
crFolder(train_save + '/human')
crFolder(train_save + '/noise')
crFolder(train_save + '/pickup')
crFolder(train_save + '/truck')
crFolder(train_save + '/van')

crFolder(test_save + '/bike')
crFolder(test_save + '/boat')
crFolder(test_save + '/canoe')
crFolder(test_save + '/car')
crFolder(test_save + '/human')
crFolder(test_save + '/noise')
crFolder(test_save + '/pickup')
crFolder(test_save + '/truck')
crFolder(test_save + '/van')


def saveImages(images, savepath):
    for img_path in images:
        img_name = img_path.split('/')[-1]
        shutil.copy(img_path, savepath + '/' + img_name)


saveImages(train_bike_images, train_save + '/bike')
saveImages(train_boat_images, train_save + '/boat')
saveImages(train_canoe_images, train_save + '/canoe')
saveImages(train_car_images, train_save + '/car')
saveImages(train_human_images, train_save + '/human')
saveImages(train_noise_images, train_save + '/noise')
saveImages(train_truck_images, train_save + '/truck')
saveImages(train_pickup_images, train_save + '/pickup')
saveImages(train_van_images, train_save + '/van')

saveImages(test_bike_images, test_save + '/bike')
saveImages(test_boat_images, test_save + '/boat')
saveImages(test_canoe_images, test_save + '/canoe')
saveImages(test_car_images, test_save + '/car')
saveImages(test_human_images, test_save + '/human')
saveImages(test_noise_images, test_save + '/noise')
saveImages(test_truck_images, test_save + '/truck')
saveImages(test_pickup_images, test_save + '/pickup')
...
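Because all nine classes go through an identical list/shuffle/split/copy sequence, the script can be collapsed into a loop over class names. A compact sketch under the same data_rgb folder layout (this refactor is ours, not the original author's):

import os
import random
import shutil

input_folder = 'data_rgb'
train_save = 'training_data'
test_save = 'testing_data'
classes = ['bike', 'boat', 'canoe', 'car', 'human', 'noise', 'pickup', 'truck', 'van']
test_size = 0.3  # same 30% hold-out for every class

for cls in classes:
    src = os.path.join(input_folder, cls)
    images = [os.path.join(src, name) for name in os.listdir(src)]
    random.shuffle(images)
    split_at = len(images) - int(len(images) * test_size)  # start of the test slice
    for subset, dest in ((images[:split_at], train_save), (images[split_at:], test_save)):
        out_dir = os.path.join(dest, cls)
        os.makedirs(out_dir, exist_ok=True)  # create training_data/<cls> etc. as needed
        for img_path in subset:
            shutil.copy(img_path, out_dir)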

Automation Testing Tutorials

Learn to execute automation testing from scratch with the LambdaTest Learning Hub. Right from setting up the prerequisites and running your first automation test to following best practices and diving deeper into advanced test scenarios, the LambdaTest Learning Hub compiles step-by-step guides to help you become proficient with different test automation frameworks, e.g. Selenium, Cypress, and TestNG.

LambdaTest Learning Hubs:

YouTube

You can also refer to the video tutorials on the LambdaTest YouTube channel for step-by-step demonstrations from industry experts.

Run autotest automation tests on the LambdaTest cloud grid

Perform automation testing on 3000+ real desktop and mobile devices online.

Try LambdaTest Now !!

Get 100 minutes of automation testing FREE!!
