How to use the copy_data method in autotest

Best Python code snippets using autotest_python

_1_encode_cat_features.py

Source: _1_encode_cat_features.py (GitHub)

# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import scipy as sc
import scipy.sparse as sp
from sklearn.utils import check_random_state
import pylab
import sys
import time
# sys.path.append('/home/zzhang/Downloads/xgboost/wrapper')
import xgboost as xgb
from joblib import dump, load, Parallel, delayed
import utils
from utils import *

raw_data_path = utils.raw_data_path
tmp_data_path = utils.tmp_data_path

train_data = pd.read_csv(raw_data_path + "train.csv", nrows=1e5)
test_data = pd.read_csv(raw_data_path + "test.csv", nrows=1e5)
# train_data = test_data.copy()

# Randomly sample utils.sample_pct of the training rows; 1.0 means use everything
if utils.sample_pct < 1.0:
    np.random.seed(999)
    r1 = np.random.uniform(0, 1, train_data.shape[0])
    train_data = train_data.loc[r1 < utils.sample_pct, :]
    print("testing with small sample of training data, ", train_data.shape)

# The test set lacks the click label that the training set has
test_data['click'] = 0

# Concatenate the training and test sets so feature engineering is applied uniformly
all_data = pd.concat([train_data, test_data])
print("finished loading raw data, ", all_data.shape)

print("to add some basic features ...")
# Derive day, hour1, day_hour, day_hour_prev and day_hour_next from the hour field
all_data['day'] = np.round(all_data.hour % 10000 / 100)
all_data['hour1'] = np.round(all_data.hour % 100)
all_data['day_hour'] = (all_data.day.values - 21) * 24 + all_data.hour1.values
all_data['day_hour_prev'] = all_data['day_hour'] - 1
all_data['day_hour_next'] = all_data['day_hour'] + 1
all_data['app_or_web'] = 0
all_data.loc[all_data.app_id.values == 'ecad2386', 'app_or_web'] = 1

copy_data = all_data
copy_data['app_site_id'] = np.add(copy_data.app_id.values, copy_data.site_id.values)

print("to encode categorical features using mean responses from earlier days -- univariate")
sys.stdout.flush()

calc_exptv(copy_data, ['app_or_web'])
exptv_vn_list = ['app_site_id', 'as_domain', 'C14', 'C17', 'C21', 'device_model', 'device_ip',
                 'device_id', 'dev_ip_aw', 'app_site_model', 'site_model', 'app_model',
                 'dev_id_ip', 'C14_aw', 'C17_aw', 'C21_aw']
calc_exptv(copy_data, exptv_vn_list)
calc_exptv(copy_data, ['app_site_id'], add_count=True)

print("to encode categorical features using mean responses from earlier days -- multivariate")
vns = ['app_or_web', 'device_ip', 'app_site_id', 'device_model', 'app_site_model', 'C1', 'C14',
       'C17', 'C21', 'device_type', 'device_conn_type', 'app_site_model_aw', 'dev_ip_app_site']

dftv = copy_data.loc[np.logical_and(copy_data.day.values >= 21, copy_data.day.values < 32),
                     ['click', 'day', 'id'] + vns].copy()
dftv['app_site_model'] = np.add(dftv.device_model.values, dftv.app_site_id.values)
dftv['app_site_model_aw'] = np.add(dftv.app_site_model.values, dftv.app_or_web.astype(str).values)
dftv['dev_ip_app_site'] = np.add(dftv.device_ip.values, dftv.app_site_id.values)
for vn in vns:
    dftv[vn] = dftv[vn].astype('category')
    print(vn)

# Smoothing pseudo-counts per variable
n_ks = {'app_or_web': 100, 'app_site_id': 100, 'device_ip': 10, 'C14': 50, 'app_site_model': 50,
        'device_model': 500, 'device_id': 50, 'C17': 100, 'C21': 100, 'C1': 100,
        'device_type': 100, 'device_conn_type': 100, 'banner_pos': 100,
        'app_site_model_aw': 100, 'dev_ip_app_site': 10}

exp2_dict = {}
for vn in vns:
    exp2_dict[vn] = np.zeros(dftv.shape[0])
days_npa = dftv.day.values

for day_v in range(22, 32):
    # Train on all earlier days, validate on day_v
    df1 = dftv.loc[np.logical_and(dftv.day.values < day_v, dftv.day.values < 31), :].copy()
    df2 = dftv.loc[dftv.day.values == day_v, :]
    print("Validation day:", day_v, ", train data shape:", df1.shape,
          ", validation data shape:", df2.shape)

    pred_prev = df1.click.values.mean() * np.ones(df1.shape[0])
    for vn in vns:
        if 'exp2_' + vn in df1.columns:
            df1.drop('exp2_' + vn, inplace=True, axis=1)
    for i in range(3):
        for vn in vns:
            p1 = calcLeaveOneOut2(df1, vn, 'click', n_ks[vn], 0, 0.25, mean0=pred_prev)
            pred = pred_prev * p1
            print(day_v, i, vn, "change = ", ((pred - pred_prev) ** 2).mean())
            pred_prev = pred

    pred1 = df1.click.values.mean()
    for vn in vns:
        print("=" * 20, "merge", day_v, vn)
        diff1 = mergeLeaveOneOut2(df1, df2, vn)
        pred1 *= diff1
        exp2_dict[vn][days_npa == day_v] = diff1

    pred1 *= df1.click.values.mean() / pred1.mean()
    print("logloss = ", logloss(pred1, df2.click.values))
    # print(my_lift(pred1, None, df2.click.values, None, 20, fig_size=(10, 5)))
    # plt.show()

for vn in vns:
    copy_data['exp2_' + vn] = exp2_dict[vn]

print("to count prev/current/next hour by ip ...")
cntDualKey(copy_data, 'device_ip', None, 'day_hour', 'day_hour_prev', fill_na=0)
cntDualKey(copy_data, 'device_ip', None, 'day_hour', 'day_hour', fill_na=0)
cntDualKey(copy_data, 'device_ip', None, 'day_hour', 'day_hour_next', fill_na=0)

print("to create day diffs")
copy_data['pday'] = copy_data.day - 1
calcDualKey(copy_data, 'device_ip', None, 'day', 'pday', 'click', 10, None, True, True)
copy_data['cnt_diff_device_ip_day_pday'] = copy_data.cnt_device_ip_day.values - copy_data.cnt_device_ip_pday.values
copy_data['hour1_web'] = copy_data.hour1.values
copy_data.loc[copy_data.app_or_web.values == 0, 'hour1_web'] = -1
copy_data['app_cnt_by_dev_ip'] = my_grp_cnt(copy_data.device_ip.values.astype(str),
                                            copy_data.app_id.values.astype(str))
copy_data['hour1'] = np.round(copy_data.hour.values % 100)
copy_data['rank_dev_ip'] = my_grp_idx(copy_data.device_ip.values.astype(str),
                                      copy_data.id.values.astype(str))
copy_data['rank_day_dev_ip'] = my_grp_idx(np.add(copy_data.device_ip.values,
                                                 copy_data.day.astype(str).values).astype(str),
                                          copy_data.id.values.astype(str))
copy_data['rank_app_dev_ip'] = my_grp_idx(np.add(copy_data.device_ip.values,
                                                 copy_data.app_id.values).astype(str),
                                          copy_data.id.values.astype(str))
copy_data['cnt_dev_ip'] = get_agg(copy_data.device_ip.values, copy_data.id, np.size)
copy_data['cnt_dev_id'] = get_agg(copy_data.device_id.values, copy_data.id, np.size)
copy_data['dev_id_cnt2'] = np.minimum(copy_data.cnt_dev_id.astype('int32').values, 300)
copy_data['dev_ip_cnt2'] = np.minimum(copy_data.cnt_dev_ip.astype('int32').values, 300)
copy_data['dev_id2plus'] = copy_data.device_id.values
copy_data.loc[copy_data.cnt_dev_id.values == 1, 'dev_id2plus'] = '___only1'
copy_data['dev_ip2plus'] = copy_data.device_ip.values
copy_data.loc[copy_data.cnt_dev_ip.values == 1, 'dev_ip2plus'] = '___only1'
copy_data['diff_cnt_dev_ip_hour_phour_aw2_prev'] = (copy_data.cnt_device_ip_day_hour.values -
                                                    copy_data.cnt_device_ip_day_hour_prev.values) * (copy_data.app_or_web * 2 - 1)
copy_data['diff_cnt_dev_ip_hour_phour_aw2_next'] = (copy_data.cnt_device_ip_day_hour.values -
                                                    copy_data.cnt_device_ip_day_hour_next.values) * (copy_data.app_or_web * 2 - 1)

print("to save copy_data ...")
dump(copy_data, tmp_data_path + 'copy_data.joblib_dat')

print("to generate copy_datatv_mx ...")
app_or_web = None
_start_day = 22
list_param = ['C1', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21',
              'banner_pos', 'device_type', 'device_conn_type']
feature_list_dict = {}
feature_list_name = 'tvexp3'
feature_list_dict[feature_list_name] = list_param + \
    ['exptv_' + vn for vn in ['app_site_id', 'as_domain',
                              'C14', 'C17', 'C21', 'device_model', 'device_ip', 'device_id',
                              'dev_ip_aw', 'dev_id_ip', 'C14_aw', 'C17_aw', 'C21_aw']] + \
    ['cnt_diff_device_ip_day_pday',
     'app_cnt_by_dev_ip', 'cnt_device_ip_day_hour', 'app_or_web',
     'rank_dev_ip', 'rank_day_dev_ip', 'rank_app_dev_ip',
     'diff_cnt_dev_ip_hour_phour_aw2_prev', 'diff_cnt_dev_ip_hour_phour_aw2_next',
     'exp2_device_ip', 'exp2_app_site_id', 'exp2_device_model', 'exp2_app_site_model',
     'exp2_app_site_model_aw', 'exp2_dev_ip_app_site',
     'cnt_dev_ip', 'cnt_dev_id', 'hour1_web']

filter_tv = np.logical_and(copy_data.day.values >= _start_day, copy_data.day.values < 31)
filter_t1 = np.logical_and(copy_data.day.values < 30, filter_tv)
filter_v1 = np.logical_and(~filter_t1, filter_tv)

print(filter_tv.sum())
for vn in feature_list_dict[feature_list_name]:
    if vn not in copy_data.columns:
        print("=" * 60 + vn)

yv = copy_data.click.values[filter_v1]
copy_datatv_mx = copy_data[feature_list_dict[feature_list_name]].to_numpy()
print(copy_datatv_mx.shape)

print("to save copy_datatv_mx ...")
copy_datatv_mx_save = {}
copy_datatv_mx_save['copy_datatv_mx'] = copy_datatv_mx
copy_datatv_mx_save['click'] = copy_data.click.values
copy_datatv_mx_save['day'] = copy_data.day.values
copy_datatv_mx_save['site_id'] = copy_data.site_id.values
...

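The heavy lifting above happens in helpers such as calc_exptv, calcLeaveOneOut2 and mergeLeaveOneOut2, which live in the author's utils module and are not shown on this page. As a rough sketch of what "encoding a categorical feature by its mean response" typically involves, here is a minimal, self-contained leave-one-out encoder; the name loo_mean_encode and the pseudo-count k (playing the role of the n_ks values above) are hypothetical, not the actual utils API:

import pandas as pd

def loo_mean_encode(df, cat_col, target_col, k=100):
    # Encode each row by the mean target of all *other* rows in its
    # category, shrunk toward the global mean by a pseudo-count k.
    grp = df.groupby(cat_col)[target_col]
    cnt = grp.transform('count')
    loo_sum = grp.transform('sum') - df[target_col]  # leave the row's own label out
    global_mean = df[target_col].mean()
    return (loo_sum + k * global_mean) / (cnt - 1 + k)

# Toy usage
toy = pd.DataFrame({'device_ip': ['a', 'a', 'b', 'b', 'b'],
                    'click':     [1, 0, 0, 0, 1]})
toy['exp_device_ip'] = loo_mean_encode(toy, 'device_ip', 'click', k=10)

Leaving each row's own label out of its own encoding is what keeps this family of features from leaking the target into the training data.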
feuture_get.py

Source: feuture_get.py (GitHub)

import numpy as np
import pandas as pd
from tqdm import tqdm_notebook


def lag(data, index_cols, lag_cols, mode="day"):

    shift_range = [1, 2, 3, 4, 5, 12]

    for shift in tqdm_notebook(shift_range):
        shifted_data = data[index_cols + lag_cols].copy()

        # use pd.DateOffset(seconds=shift) for second-level lags instead
        # "-" pulls data from the future relative to each row
        shifted_data['date'] -= pd.DateOffset(days=shift)

        foo = lambda x: '{}_{}lag_{}'.format(x, mode, shift) if x in lag_cols else x
        shifted_data = shifted_data.rename(columns=foo)

        data = pd.merge(data, shifted_data, on=index_cols, how='left').fillna(0)  # or another NaN fill value

    return data


def first_extremum(data, delta_list, value_column, mode='max'):
    # data must have a "date" column and a default RangeIndex
    copy_data = data.copy()

    for delta in delta_list:
        for value_label in value_column:

            window = 1 + 2 * delta
            rolling = copy_data[value_label].rolling(window, center=True)
            if mode == 'max':
                extremum_mask = rolling.max() == copy_data[value_label]
            else:
                extremum_mask = rolling.min() == copy_data[value_label]
            indexes = np.where(extremum_mask)[0]

            # NaN marks rows with no extremum after them
            indexes_with_nan = np.concatenate([indexes, [np.nan]])

            # f{mode}i{window}: index of the first extremum after each row
            idx_col = '{}_f{}i{}'.format(value_label, mode, window)
            copy_data[idx_col] = indexes_with_nan[np.searchsorted(indexes, data.index, side='right')]

            # f{mode}r{window}: number of rows until that extremum
            rng_col = '{}_f{}r{}'.format(value_label, mode, window)
            copy_data[rng_col] = copy_data[idx_col] - copy_data.index

            # {mode}{window}: the extremum value itself, attached at the extremum rows ...
            val_col = '{}_{}{}'.format(value_label, mode, window)
            extremum_val = copy_data[extremum_mask][[value_label]]
            extremum_val = extremum_val.rename(columns=lambda x: "{}_{}{}".format(x, mode, window))
            copy_data = copy_data.join(extremum_val, how='left')

            # A row that is itself the extremum is at range 0
            copy_data.loc[copy_data[value_label] == copy_data[val_col], rng_col] = 0

            copy_data.drop([idx_col], inplace=True, axis=1)
            # ... then back-filled onto the rows that precede it
            copy_data[val_col] = copy_data[val_col].bfill()

    return copy_data


# Example: lag(data, ["date"], ["brent_close", "brent_open"]) adds the columns
#   brent_close_daylag_1, brent_open_daylag_1, ..., brent_close_daylag_12, brent_open_daylag_12
#
# Example: first_extremum(data, [2, 3], ['brent_close', 'brent_open'], 'min') adds the columns
#   brent_close_fminr5, brent_close_min5, brent_open_fminr5, brent_open_min5,
#   brent_close_fminr7, brent_close_min7, brent_open_fminr7, brent_open_min7
...
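For reference, a toy invocation of lag; the frame below is made up (real data would carry actual Brent quotes), and mode only affects the generated column names:

import numpy as np
import pandas as pd

data = pd.DataFrame({
    'date': pd.date_range('2002-07-01', periods=20, freq='D'),
    'brent_close': np.arange(20.0),
    'brent_open': np.arange(20.0) + 0.5,
})
lagged = lag(data, ['date'], ['brent_close', 'brent_open'])
print(lagged.filter(like='daylag').columns.tolist())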

modify.py

Source: modify.py (GitHub)

import sys


def modify_instance(name):
    # Replace '<', '>' and ',' with spaces so the instance file
    # becomes plain whitespace-separated data, then write it out.
    with open(name, "r") as f:
        data = f.readlines()
    copy_data = data.copy()
    for i, line in enumerate(data):
        copy_data[i] = line.replace("<", " ").replace(">", " ").replace(",", " ")

    with open("modify.dat", "w") as out:
        out.writelines(copy_data)


if __name__ == "__main__":
    ...
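The __main__ block is truncated in the source, so the real entry point is unknown. A hypothetical round trip, just to show the effect:

# Hypothetical demo: write a tiny instance file, strip its delimiters, inspect the result.
with open("instance.txt", "w") as f:
    f.write("node<1>,node<2>\n")

modify_instance("instance.txt")

with open("modify.dat") as f:
    print(repr(f.read()))  # 'node 1  node 2 \n'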

Automation Testing Tutorials

Learn to execute automation testing from scratch with the LambdaTest Learning Hub: from setting up the prerequisites and running your first automation test to following best practices and diving deeper into advanced test scenarios. The LambdaTest Learning Hub compiles step-by-step guides to help you become proficient with different test automation frameworks such as Selenium, Cypress, and TestNG.

YouTube

You can also refer to the video tutorials on the LambdaTest YouTube channel to get step-by-step demonstrations from industry experts.

Run autotest automation tests on the LambdaTest cloud grid

Perform automation testing on 3000+ real desktop and mobile devices online.

Try LambdaTest Now!

Get 100 minutes of automation testing FREE!

Next-Gen App & Browser Testing Cloud
