How to use the end_test method in Avocado

Best Python code snippet using avocado_python
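The snippets below are matched on the identifier end_test and come from unrelated projects (CNV calling, weather forecasting, SARIMAX modelling, batch generation). In the Avocado framework itself, end_test is a hook on result-event plugins that fires after each test finishes. A minimal sketch, assuming the ResultEvents interface from avocado.core.plugin_interfaces and a dict-like test state (both can differ between Avocado versions, so treat the names and signatures here as assumptions to check against your installed release):

# Sketch of a result-events plugin that reacts to end_test.
# Assumption: avocado.core.plugin_interfaces.ResultEvents exposes
# pre_tests/post_tests/start_test/end_test/test_progress hooks.
from avocado.core.plugin_interfaces import ResultEvents


class StatusLogger(ResultEvents):
    """Hypothetical plugin that logs every test as it ends."""

    name = "status_logger"
    description = "Logs the status of every finished test"

    def pre_tests(self, job):
        pass

    def post_tests(self, job):
        pass

    def start_test(self, result, state):
        pass

    def test_progress(self, progress=False):
        pass

    def end_test(self, result, state):
        # 'state' is the per-test state passed by the runner; 'name' and
        # 'status' are the fields Avocado's own result plugins commonly read.
        print("%s -> %s" % (state.get("name"), state.get("status")))

Registering such a plugin goes through Avocado's plugin entry points; consult the documentation of your installed version for the exact mechanism.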

possion_FixPossionModel_siteBYsite_optimal_GCmodify.py

Source: possion_FixPossionModel_siteBYsite_optimal_GCmodify.py (GitHub)


#! /usr/bin/env python
# -*- coding: utf-8 -*-
#########################################################################################################
# CNVar: accurate knowledge-based copy number variation prediction using max likelihood method
#
# Copyright (C) 2019 Enming He(emhe@wegene.com)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#########################################################################################################
import os
import sys
import re
import numpy as np
import scipy.stats as stats
import math
import subprocess
import argparse
import read_depth_method_matrix_mem_optimize_GCmodify_GCcorrect


def safe_open(infile):
    if infile.endswith('.gz'):
        import gzip
        return gzip.open(infile)
    else:
        return open(infile)


def read_GCprofile(infile):
    print('reading:GCprofile %s ...' % infile)
    GCprofile = {}
    with open(infile) as f:
        line = f.readline()
        headers = line.strip().split()
        line = f.readline()
        gc_values = line.strip().split()

        for i in range(1, len(headers)):
            GCprofile[int(headers[i])] = float(gc_values[i])
    print('finish reading:GCprofile')
    return GCprofile


def possion(cov):
    if cov <= 60:
        rate = 60
    else:
        rate = int(cov)
    Dict = {}
    n = np.arange((rate + 1) * 2)
    y = stats.poisson.pmf(n, cov)
    for i in n:
        Dict[i] = y[i]
    return Dict


def reverse_possion(cov):
    rate = int(cov)
    Dict = {}
    n = np.arange(rate)
    y = stats.poisson.pmf(n, rate)
    y1 = y.tolist()
    y1.reverse()
    for i in n:
        Dict[i] = y1[i]
    return Dict


def log(Dict, x, logmin):
    if x > max(Dict.keys()) or x < min(Dict.keys()):
        return math.log(logmin)
    else:
        if Dict[x] < logmin:
            return math.log(logmin)
        else:
            return math.log(Dict[x])


def possibility(D, DR, cov, N, CHROM_TEST, START_TEST, END_TEST, MISMATCH, logmin):
    lgP = math.log(1)
    CN_models = {}

    def possionCN(CN):
        if (CN != 0):
            CN_model = possion(cov * CN / 2)
        else:
            CN_model = possion(MISMATCH * cov)
        return CN_model

    for i in range(START_TEST, END_TEST):
        depth = D[(CHROM_TEST, i)]
        CN = round(DR[(CHROM_TEST, i)] / N, 2)

        if CN not in CN_models:
            CN_models[CN] = possionCN(CN)

        y = log(CN_models[CN], depth, logmin)

        lgP += y
    return lgP


def read_ref(reffile, CHROM_TEST, START_TEST, END_TEST):
    print('reading ref file: %s ' % reffile)
    depth_sum = 0
    count = 0
    D = {}
    with safe_open(reffile) as f:
        for line in f:
            chrom, pos, depth = [int(x) for x in line.strip().split()[0:3]]
            if chrom == CHROM_TEST and pos <= END_TEST and pos >= START_TEST:
                D[(chrom, pos)] = depth
            elif chrom == CHROM_TEST and pos > END_TEST:
                break
    return D


def read_sample(infile, GCprofile, CHROM_TEST, START_TEST, END_TEST):
    print('reading sample file: %s ...' % infile)
    with safe_open(infile) as f:
        D = {}
        depth_sum = 0
        count = 0
        for line in f:
            chrom, pos, depth_raw = [int(x) for x in line.strip().split()[0:3]]
            if chrom == CHROM_TEST and pos <= END_TEST and pos >= START_TEST:
                depth = int(depth_raw * GCprofile[pos - pos % 50])
                D[(chrom, pos)] = depth
            elif chrom == CHROM_TEST and pos > END_TEST:  # if assume background region behinds testing region
                break
    return D


def calculator(LIST, sample, reflist, binsize, slide, gc_config, CHROM_TEST, START_TEST, END_TEST, AVERAGE_DEPTH, MISMATCH, logmin):
    min_start = START_TEST
    max_end = END_TEST

    read_depth_method_matrix_mem_optimize_GCmodify_GCcorrect.readDepthMethodMatrixMemOptimizeGCmodifyGCcorrect(
        sample, binsize, sample, slide, '%s:%s-%s' % (CHROM_TEST, START_TEST - 150, END_TEST + 150), gc_config)  # 150 , end should be wilder
    GCprofile = read_GCprofile('%s.GCprofile.txt' % sample)
    D = read_sample(sample, GCprofile, CHROM_TEST, START_TEST, END_TEST)
    cov = AVERAGE_DEPTH
    for ref_info in reflist:
        ref, Depth_Ref = ref_info.strip().split()
        N = float(Depth_Ref) / 2
        DR = read_ref(ref, CHROM_TEST, START_TEST, END_TEST)
        lgP = possibility(D, DR, cov, N, CHROM_TEST, START_TEST, END_TEST, MISMATCH, logmin)
        LIST.append((ref, lgP))


def possionFixPossionModelSiteBYSiteOptionGCmodify(reference_list, sample_list, outfile, threads, detect_region,
                                                   average_depth, gc_config, binsize, slide_window, logmin, mismatch):
    logmin = float(logmin)
    m1 = re.search(r'(\d+):(\d+)-(\d+)', detect_region)
    CHROM_TEST = int(m1.group(1))
    START_TEST = int(m1.group(2))
    END_TEST = int(m1.group(3))
    AVERAGE_DEPTH = float(average_depth)
    MISMATCH = float(mismatch)

    print('''
    CHROM_TEST = %s
    START_TEST = %s
    END_TEST = %s
    ''' % (CHROM_TEST, START_TEST, END_TEST)
    )

    reflist = []
    samplelist = []

    with open(reference_list) as f:
        for line in f:
            reflist.append(line.strip())
    with open(sample_list) as f:
        for line in f:
            samplelist.append(line.strip())

    sample_Dict = {}

    for sample in samplelist:
        sample_Dict[sample] = []
        calculator(sample_Dict[sample], sample, reflist, binsize, slide_window,
                   gc_config, CHROM_TEST, START_TEST, END_TEST, AVERAGE_DEPTH, MISMATCH, logmin)
    with open(outfile, 'w') as w:
        for sample in sample_Dict:
            mylist = sorted(sample_Dict[sample], key=lambda x: x[1], reverse=True)
            rank = 1
            for i in mylist:
                w.write('%s\t%s\t%s\t%s\n' % (sample, i[0], i[1], rank))
                rank += 1
    return 0


if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('-rl', '--reference_list', help='reference list', required=True)
    parser.add_argument('-sl', '--sample_list', help='sample list', required=True)
    parser.add_argument('-o', '--outfile', help='outfile', required=True)
    parser.add_argument('-t', '--threads', help='threads', default=16)
    parser.add_argument('-r', '--detect_region', help='detect region, chr:start-end ', required=True)
    parser.add_argument('-d', '--average_depth', help='average depth', required=True)
    parser.add_argument('-c', '--gc_config', help='GC config', required=True)
    parser.add_argument('-bz', '--binsize', help='binsize', required=True)
    parser.add_argument('-sw', '--slide_window', help='slide_window', required=True)
    parser.add_argument('-l', '--logmin', help='logmin', default=1e-7)
    parser.add_argument('-m', '--mismatch', help='assume mismatch', default=1e-4)
    args = parser.parse_args()

    reference_list = args.reference_list
    sample_list = args.sample_list
    outfile = args.outfile
    threads = args.threads
    detect_region = args.detect_region  # original had args.args.detect_region, which raises AttributeError
    average_depth = args.average_depth
    gc_config = args.gc_config
    binsize = args.binsize
    slide_window = args.slide_window
    logmin = args.logmin
    mismatch = args.mismatch
    ...
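The prediction above is driven by the -r/--detect_region argument, which a regular expression splits into chromosome, start and end coordinates (CHROM_TEST, START_TEST, END_TEST). A small self-contained illustration of that parsing, using a made-up region string:

# How detect_region becomes CHROM_TEST/START_TEST/END_TEST in the snippet above.
# The region string below is a made-up example.
import re

detect_region = '2:89160000-89200000'
m1 = re.search(r'(\d+):(\d+)-(\d+)', detect_region)
CHROM_TEST, START_TEST, END_TEST = (int(m1.group(i)) for i in (1, 2, 3))
print(CHROM_TEST, START_TEST, END_TEST)  # 2 89160000 89200000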


WeatherClassifier.py

Source: WeatherClassifier.py (GitHub)


import numpy as np
import pandas
import argparse
import matplotlib.pyplot as plt
import datetime
from keras.models import Sequential
from keras.layers import Dense, Dropout, advanced_activations


def main():
    # Get input args
    args = parse_arguments()
    # Init random seed for reproducibility
    np.random.seed(0)
    # load the dataset
    dataframe = pandas.read_csv(args["data_path"], engine='python', parse_dates=['DATE'],
                                date_parser=lambda x: pandas.to_datetime(x, infer_datetime_format=True))
    # Define the training set using the input begin and end dates
    train_df = dataframe[(dataframe['DATE'] >= datetime.datetime(args["begin_train"], 1, 1)) &
                         (dataframe['DATE'] <= datetime.datetime(args["end_train"], 12, 31))]
    # Define the testing set using the input begin and end dates
    test_df = dataframe[(dataframe['DATE'] >= datetime.datetime(args["begin_test"], 1, 1)) &
                        (dataframe['DATE'] <= datetime.datetime(args["end_test"], 12, 31))]
    # Remove null and other invalid entries in the data
    train_data = np.nan_to_num(train_df['TAVG'].values.astype('float32'))
    test_data = np.nan_to_num(test_df['TAVG'].values.astype('float32'))
    # Combine the data to one array
    combined_data = np.append(train_data, test_data)
    # reshape dataset to window matrix
    look_back = 12  # This is the size of the window
    trainX, trainY = create_dataset(train_data, look_back)
    testX, testY = create_dataset(test_data, look_back)
    # Define and fit the model
    model = create_model(look_back=look_back)
    model.fit(trainX, trainY, epochs=500, batch_size=12, verbose=2)
    # Estimate model performance
    trainScore = model.evaluate(trainX, trainY, verbose=0)
    print('Train Score: %.2f MAE' % (trainScore))
    testScore = model.evaluate(testX, testY, verbose=0)
    print('Test Score: %.2f MAE' % (testScore))
    # generate predictions for training
    trainPredict = model.predict(trainX)
    testPredict = model.predict(testX)
    # shift train predictions for plotting
    trainPredictPlot = np.empty((len(combined_data), 1))
    trainPredictPlot[:] = np.nan
    trainPredictPlot[look_back:len(trainPredict) + look_back] = trainPredict
    # shift test predictions for plotting
    testPredictPlot = np.empty((len(combined_data), 1))
    testPredictPlot[:] = np.nan
    testPredictPlot[len(trainPredict) + (look_back * 2) + 1:len(combined_data) - 1] = testPredict
    # Combine the results
    combined_df = train_df.append(test_df)
    combined_dates = combined_df['DATE']
    # plot baseline and predictions
    plt.plot(combined_dates, combined_data)
    plt.plot(combined_dates, trainPredictPlot)
    plt.plot(combined_dates, testPredictPlot)
    plt.minorticks_on()
    plt.show()


# Standard Model Creation
def create_model(look_back):
    # create and fit Multilayer Perceptron model
    model = Sequential()
    model.add(Dense(100, input_dim=look_back, activation='relu'))
    # model.add(Dense(50, activation='relu'))
    # model.add(Dense(25, activation='relu'))
    # model.add(Dense(10, activation='relu'))
    # model.add(Dense(5, activation='relu'))
    model.add(Dense(1))
    model.compile(loss='mean_absolute_error', optimizer='nadam')
    return model


# convert an array of values into a dataset matrix
def create_dataset(dataset, look_back=1):
    dataX, dataY = [], []
    for i in range(len(dataset) - look_back - 1):
        a = dataset[i:(i + look_back)]
        dataX.append(a)
        dataY.append(dataset[i + look_back])
    return np.array(dataX), np.array(dataY)


# Command Line Arguments are parsed here
def parse_arguments():
    parser = argparse.ArgumentParser()
    parser.add_argument("-dp", "--data_path", help="Data File Path")
    parser.add_argument("-ad", "--adverse", help="Turns on Adversarial Learning")
    parser.add_argument("-m", "--mode", help="Choose mode: full, grid")
    parser.add_argument("-e", "--epochs", help="Number of Epochs", type=int, nargs="*")
    parser.add_argument("-tr", "--train_ratio", nargs="*", type=int,
                        help="Set Train Ratios. Enter as a percent (20,40,60,80). Can be a list space delimited")
    parser.add_argument("-bs", "--batch_size", nargs="*", type=int,
                        help="Batch size. Can be a list space delimited")
    parser.add_argument("-n", "--neurons", nargs="*", type=int,
                        help="Number of Neurons. Can be a list space delimited")
    parser.add_argument("-o", "--optimizer", nargs="*",
                        help="Optimizers. Can be a list space delimited")
    parser.add_argument("-w", "--weight_constraint", nargs="*", type=int,
                        help="Weight Constraint. Can be a list space delimited")
    parser.add_argument("-d", "--dropout", nargs="*", type=int,
                        help="Dropout. Enter as percent (10,20,30,40...). Can be a list space delimited.")
    parser.add_argument("-model", "--model", help="Select which model to run: all, one_layer, four_decr, four_same")
    parser.add_argument("-s", "--splits", help="Number of Splits for SSS", type=int)
    parser.add_argument("-btr", "--begin_train", help="Year to begin training (1940-2016)", type=int)
    parser.add_argument("-etr", "--end_train", help="Year to end training. Should be higher than begin & <=2017", type=int)
    parser.add_argument("-bts", "--begin_test", help="Year to begin testing (1940-2017)", type=int)
    parser.add_argument("-ets", "--end_test", help="Year to end testing. Should be higher than begin test.", type=int)
    args = parser.parse_args()
    arguments = {}
    if args.data_path:
        arguments["data_path"] = args.data_path
    else:
        print("Default Data Path: ../Data/BWIMonthly1939.csv")
        arguments["data_path"] = "../Data/BWIMonthly1939.csv"
    if args.adverse:
        adverse = True
    else:
        adverse = False
    arguments["adverse"] = adverse
    if args.mode == "grid":
        mode = "grid"
        print("Mode is %s" % mode)
    else:
        mode = "full"
        print("Mode is %s" % mode)
    arguments["mode"] = mode
    if args.model == "all":
        model = ["oneLayer", "fourDecr", "fourSame"]
    elif args.model in ["oneLayer", "fourDecr", "fourSame"]:
        model = [args.model]
    else:
        print("Defaulting to All models")
        model = ["oneLayer", "fourDecr", "fourSame"]
    arguments["model"] = model
    if args.epochs:
        epochs = args.epochs
    else:
        print("Defaulting to 16 epochs")
        epochs = 16
    arguments["epochs"] = epochs
    if args.train_ratio:
        train_ratio = args.train_ratio
    else:
        print("Defaulting to testing all ratios")
        train_ratio = [20, 40, 60, 80]
    arguments["train_ratio"] = train_ratio
    if args.batch_size:
        batch_size = args.batch_size
    else:
        print("Defaulting to Batch Size 10")
        batch_size = 10
    arguments["batch_size"] = batch_size
    if args.neurons:
        neurons = args.neurons
    else:
        print("Defaulting to 45 Neurons")
        neurons = 45
    arguments["neurons"] = neurons
    if args.optimizer:
        optimizer = args.optimizer
    else:
        print("Defaulting to NADAM Optimizer")
        optimizer = "Nadam"
    arguments["optimizer"] = optimizer
    if args.weight_constraint:
        weight_constraint = args.weight_constraint
    else:
        print("Defaulting to weight constraint 5")
        weight_constraint = 5
    arguments["weight_constraint"] = weight_constraint
    if args.dropout:
        dropout = args.dropout
    else:
        print("Defaulting to dropout of 10%")
        dropout = 10
    arguments["dropout"] = dropout
    if args.splits:
        splits = args.splits
    else:
        print("Defaulting to 1 SSS Split")
        splits = 1
    arguments["splits"] = splits
    if args.begin_train:
        begin_train = args.begin_train
    else:
        print("Default begin training is 1940")
        begin_train = 1940
    arguments["begin_train"] = begin_train
    if args.end_train:
        end_train = args.end_train
    else:
        print("Default end training is 1980")
        end_train = 1980
    if end_train < begin_train:
        print("End_Train should be bigger than Begin_Train")
        exit(1)
    arguments["end_train"] = end_train
    if args.begin_test:
        begin_test = args.begin_test
    else:
        print("Default begin test is 1981")
        begin_test = 1981
    arguments["begin_test"] = begin_test
    if args.end_test:
        end_test = args.end_test
    else:
        print("Default end test is 2017")
        end_test = 2017
    if end_test < begin_test:
        print("End_Test should be bigger than Begin_Test")
        exit(1)
    arguments["end_test"] = end_test
    return arguments


if __name__ == "__main__":
    ...


sarimax_script.py

Source: sarimax_script.py (GitHub)


import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import itertools
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller
from datetime import datetime, timedelta
from dateutil.relativedelta import *
import statsmodels.tsa.api as smt
import seaborn as sns
from sklearn.metrics import mean_squared_error
import pickle
from script import *


# function to create training and testing set
def create_train_test(data, start_train, end_train, start_test, end_test, test_length=24):
    df_train = data.loc[start_train:end_train, :]
    df_test = data.loc[start_test:end_test, :]
    start = datetime.strptime(start_test, '%Y-%m-%d %H:%M:%S')
    date_list = [start + relativedelta(hours=x) for x in range(0, test_length)]  # test set will always have 24 hours
    future = pd.DataFrame(index=date_list, columns=df_train.columns)
    df_train = pd.concat([df_train, future])
    return df_train, df_test


# function to add all exogenous variables
def add_exog(data, weather, start_time, end_time):
    # add dummy variables for precipitation
    precip = pd.get_dummies(weather.precip_type)
    data = data.join(precip)
    data['Day_of_Week'] = data.index.dayofweek
    data['Weekend'] = data.apply(is_weekend, axis=1)
    data['Temperature'] = weather.loc[start_time:end_time, 'temperature']
    data['Humidity'] = weather.loc[start_time:end_time, 'humidity']
    data['Precip_Intensity'] = weather.loc[start_time:end_time, 'precip_intensity']
    data.rename(columns={'rain': 'Rain', 'sleet': 'Sleet', 'snow': 'Snow'}, inplace=True)
    # fill missing values with mean
    data['Temperature'] = data.Temperature.fillna(np.mean(data['Temperature']))
    data['Humidity'] = data.Humidity.fillna(np.mean(data['Humidity']))
    data['Precip_Intensity'] = data.Precip_Intensity.fillna(np.mean(data['Precip_Intensity']))
    return data


# function to start/end dates for train and test
def find_dates(building_id, length=1, total_length=30, final_date=None):
    start_train, end_test = find_egauge_dates(building_id, total_length)
    time_delta_1 = timedelta(days=length)
    time_delta_2 = timedelta(hours=1)
    end_train = end_test - time_delta_1
    start_test = end_train + time_delta_2
    start_train = str(start_train)
    end_train = str(end_train)
    start_test = str(start_test)
    end_test = str(end_test)
    return start_train, end_train, start_test, end_test


def fit_exog_arima(data, weather, building_id, length=1, total_length=30, test_length=24):
    start_train, end_train, start_test, end_test = find_dates(building_id, length=length, total_length=total_length)
    df_train, df_test = create_train_test(data, start_train, end_train, start_test, end_test, test_length)
    df_exog = add_exog(df_train, weather, start_train, end_test)
    exogenous = df_exog.loc[start_train:, ['Weekend', 'Temperature', 'Humidity', 'car1']].astype(float)
    endogenous = df_exog.loc[:, 'Hourly_Usage'].astype(float)
#    low_aic = gridsearch_arima(endogenous, exogenous)
#    arima_model = sm.tsa.statespace.SARIMAX(endog=endogenous,
#                                            exog=exogenous,
#                                            trend=None,
#                                            order=low_aic[0],
#                                            seasonal_order=low_aic[1],
#                                            enforce_stationarity=False,
#                                            enforce_invertibility=False)
    arima_model = sm.tsa.statespace.SARIMAX(endog=endogenous,
                                            exog=exogenous,
                                            trend=None,
                                            order=(1, 0, 1),
                                            seasonal_order=(0, 1, 1, 24),
                                            enforce_stationarity=False,
                                            enforce_invertibility=False)
    results = arima_model.fit()
    return df_exog, results


def plot_exog_arima(data, data_exog, model, building_id, length=1, total_length=30, test_length=24):
    start_train, end_train, start_test, end_test = find_dates(building_id, length=length, total_length=total_length)
    df_train, df_test = create_train_test(data, start_train, end_train, start_test, end_test, test_length=test_length)
    df_exog_train, df_exog_test = create_train_test(data_exog, start_train, end_train, start_test, end_test, test_length=test_length)
    mse, rmse = add_forecast(model, df_test, df_exog_train, start_test, end_test)
    plot_forecast(df_exog_train, 500)
    return mse, rmse, df_exog_train


# function to find optimal parameters and resulting AIC score
def gridsearch_arima(y, exog=None):
    p = d = q = range(0, 2)
    pdq = list(itertools.product(p, d, q))
    seasonal_pdq = [(x[0], x[1], x[2], 24) for x in list(itertools.product(p, d, q))]
    low_aic = [0, 0, 50000]
    for param in pdq:
        for param_seasonal in seasonal_pdq:
            try:
                model = sm.tsa.statespace.SARIMAX(y,
                                                  exog=exog,
                                                  order=param,
                                                  seasonal_order=param_seasonal,
                                                  enforce_stationarity=False,
                                                  enforce_invertibility=False)
                results = model.fit()
                if results.aic < low_aic[2]:
                    low_aic = [param, param_seasonal, results.aic]
#                print('ARIMA{}x{}24 - AIC:{}'.format(param, param_seasonal, results.aic))
            except:
                continue
    return low_aic


# function to forecast with fitted model, returns MSE and RMSE
def add_forecast(model, test, train, start_time, end_time):
    train['forecast'] = model.predict(start=start_time, end=end_time)
    y_true = test.loc[start_time:end_time, 'Hourly_Usage']
    y_pred = train.loc[start_time:end_time, 'forecast']
    train.loc[start_time:end_time, 'Hourly_Usage'] = test.loc[start_time:end_time, 'Hourly_Usage']
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    return mse, rmse


def plot_forecast(data, datapoints):
    fig = plt.figure(figsize=(16, 8))
    plt.plot(data['Hourly_Usage'][datapoints:])
    plt.plot(data['forecast'])
    plt.legend()


# function to find mean car charge
def mean_car_charge(data, start, end):
    car_charge = {}
    for index in data.Time_Index.unique():
        car_charge[index] = np.mean(data[data.Time_Index == index].car1)
    return car_charge


# function to add all exogenous variables
def create_exog_endo(data, weather, building_id, length=1, total_length=30):
    start_train, end_train, start_test, end_test = find_dates(building_id, length, total_length)
    df_train, df_test = create_train_test(data, start_train, end_train, start_test, end_test, 24 * length)
    car_charge = mean_car_charge(data, start_train, end_train)
    df_train['Time_Index'] = df_train.index.weekday_name + df_train.index.hour.astype(str)
    df_train['Temperature'] = weather.loc[start_train:end_test, 'temperature']
    df_train['Humidity'] = weather.loc[start_train:end_test, 'humidity']
    for time in df_train.loc[start_test:end_test, :].index:
        df_train.loc[time, 'car1'] = car_charge[df_train.loc[time, 'Time_Index']]
    # fill missing values with mean
    df_train['Temperature'] = df_train.Temperature.fillna(np.mean(df_train['Temperature']))
    df_train['Humidity'] = df_train.Humidity.fillna(np.mean(df_train['Humidity']))
    exogenous = df_train.loc[start_train:, ['Temperature', 'Humidity', 'car1']].astype(float)
    endogenous = df_train.loc[:, 'Hourly_Usage'].astype(float)
    return df_train, exogenous, endogenous


# function to fit SARIMAX model with create_exog_endo
def fit_exog_arima_new(exogenous, endogenous):
    low_aic = gridsearch_arima(endogenous, exogenous)
    arima_model = sm.tsa.statespace.SARIMAX(endog=endogenous,
                                            exog=exogenous,
                                            trend=None,
                                            order=low_aic[0],
                                            seasonal_order=low_aic[1],
                                            enforce_stationarity=False,
                                            enforce_invertibility=False)
    arima_exog_results = arima_model.fit()
    ...
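find_dates above derives end_train, start_test and end_test from a single final timestamp: the last `length` days become the test window and testing starts one hour after training ends. A self-contained illustration of that arithmetic; find_egauge_dates comes from the project's own script module, so its result is stubbed with a made-up timestamp:

# The date arithmetic behind find_dates, with find_egauge_dates stubbed out.
from datetime import datetime, timedelta

end_test = datetime(2018, 6, 30, 23, 0, 0)   # stand-in for find_egauge_dates()
length = 1                                    # days held out for testing
end_train = end_test - timedelta(days=length)
start_test = end_train + timedelta(hours=1)
print(end_train, '->', start_test, '->', end_test)
# 2018-06-29 23:00:00 -> 2018-06-30 00:00:00 -> 2018-06-30 23:00:00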


data_parse.py

Source: data_parse.py (GitHub)


import re
import os
import csv
import sys
import numpy as np
import pickle as pkl
from random import sample
from DataProcess.config import ConfigParam


class DataParse:
    def random_embedding(self, embedding_dim, word_num):
        """
        Randomly generate word embeddings. With a large enough corpus, word2vec could be
        used to train the vectors instead, which would give larger distinctions between words.
        :param embedding_dim: dimension of the word vectors.
        :return: numpy format array. shape is : (vocab, embedding_dim)
        """
        # Generate an embedding_dim-dimensional vector for each of the word_num words in the vocabulary.
        # The original comment mentions a truncated normal distribution (assuming normally
        # distributed data), but the code actually samples from a uniform distribution.
        embedding_mat = np.random.uniform(-0.25, 0.25, (word_num, embedding_dim))
        embedding_mat = np.float32(embedding_mat)
        return embedding_mat


class Data_Inter:
    """
    Generate training data
    """
    def __init__(self, vocab_au, vocab_key):
        self.config = ConfigParam()
        self.vocab_au = vocab_au  # vocabulary path
        self.vocab_key = vocab_key  # vocabulary path
        self.index = 0
        self.index_test = 0
        self.reload_num = 10
        # train set
        self.task_sentence = pkl.load(open('DataProcess/data_manage_key_mapping.pkl', mode='rb'))
        self.au_all, self.ke_all = self.get_author_art()
        print(self.au_all.shape, self.ke_all.shape)
        # TRAIN
        self.get_data_mapping()

    def get_data_mapping(self):
        self.shuffle_all = sample(range(0, self.au_all.shape[0], 1), self.au_all.shape[0])
        self.all_sam = len(self.shuffle_all)
        self.au = self.au_all[self.shuffle_all[: int(self.all_sam * 0.7)]]  # 0.8
        self.ke = self.ke_all[self.shuffle_all[: int(self.all_sam * 0.7)]]
        self.au_test = self.au_all[self.shuffle_all[int(self.all_sam * 0.7):]]
        self.ke_test = self.ke_all[self.shuffle_all[int(self.all_sam * 0.7):]]
        self.shuffle = sample(range(0, self.au.shape[0], 1), self.au.shape[0])
        self.shuffle_test = sample(range(0, self.au_test.shape[0], 1), self.au_test.shape[0])
        self.end = self.au.shape[0]
        self.end_test = self.au_test.shape[0]

    def get_author_art(self):
        auth = []
        keys = []
        for j in self.task_sentence:
            for k in j:
                auth.append(k[: 2])
                keys.append(k[2][: 7])
        return np.array(auth), np.array(keys)

    def next(self):
        # fetch a batch
        sentence = []
        task_ = []
        if self.index + self.config.batch_size < self.end:  # not yet at the end
            it_data = self.shuffle[self.index: self.index + self.config.batch_size]
            self.index += self.config.batch_size  # the batch index advances with each batch
        elif self.index + self.config.batch_size == self.end:  # exactly at the end
            it_data = self.shuffle[self.index + self.config.batch_size: self.end]
            self.shuffle = sample(range(0, self.end, 1), self.end)
            self.index = 0
        else:
            it_data = self.shuffle[self.index: self.end]  # take what is left
            self.shuffle = sample(range(0, self.end, 1), self.end)
            remain = self.shuffle[: self.index + self.config.batch_size - self.end]  # remainder from the new shuffle
            it_data = np.concatenate((it_data, remain), axis=0)
            self.index = 0
            if self.reload_num > 0:
                self.get_data_mapping()
                self.reload_num -= 1
        # print('it_data:', it_data)
        sentences_au = self.au[it_data]
        sentences_key = self.ke[it_data]
        for cur_sentences, cur_task in zip(sentences_au, sentences_key):
            task_.append(self.sentence2index(cur_sentences, self.vocab_au))  # author mapping
            sentence.append(self.sentence2index(cur_task, self.vocab_key))  # keys mapping
        return np.array(sentence), np.array(task_)

    def next_test(self):
        # fetch a batch
        sentence = []
        task_ = []
        if self.index_test + self.config.batch_size <= self.end_test:  # not yet at the end
            # take a block of batch_size items from task_sentence
            it_data = self.shuffle_test[self.index_test: self.index_test + self.config.batch_size]  # batch data
            self.index_test += self.config.batch_size  # the batch index advances with each batch
        elif self.index_test + self.config.batch_size == self.end_test:  # exactly at the end
            it_data = self.shuffle_test[self.index_test + self.config.batch_size: self.end_test]
            self.shuffle_test = sample(range(0, self.end_test, 1), self.end_test)
            self.index_test = 0  # reset the index; the remaining index math is simple and not commented further
        else:
            it_data = self.shuffle_test[self.index_test: self.end_test]  # take what is left
            self.shuffle_test = sample(range(0, self.end_test, 1), self.end_test)
            remain = self.shuffle_test[: self.index_test + self.config.batch_size - self.end_test]  # remainder from the new shuffle
            it_data = np.concatenate((it_data, remain), axis=0)
            self.index_test = 0
        sentences_au = self.au_test[it_data]
        sentences_key = self.ke_test[it_data]
        for cur_sentences, cur_task in zip(sentences_au, sentences_key):
            task_.append(self.sentence2index(cur_sentences, self.vocab_au))  # author mapping
            sentence.append(self.sentence2index(cur_task, self.vocab_key))  # keys mapping
        return np.array(sentence), np.array(task_)

    def sentence2index(self, sen, vocab):
        sen2id = []
        for cur_sen in sen:
            sen2id.append(vocab.get(cur_sen, 0))  # fall back to 0 when a token is not in the vocabulary
        return sen2id

    def task2index(self, cur_tasks, mapping):
        assert isinstance(cur_tasks, list) and len(cur_tasks) > 0 and hasattr(cur_tasks, '__len__')
        assert isinstance(mapping, dict) and len(mapping) > 0
        cur_task2index_mapping = []
        for cur_task in cur_tasks:
            cur_task2index_mapping.append(mapping[cur_task])
        return cur_task2index_mapping


if __name__ == '__main__':
    from DataProcess.dataset_info import info
    au_vocab, key_vocab, au_len, key = info('DataProcess/data_manage_key_mapping.pkl')
    ...
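next() and next_test() wrap around once the running index plus the batch size passes end_test, topping the batch up from a fresh shuffle. The same indexing with plain numbers; all values are made up for illustration:

# The wrap-around branch of next_test() in isolation.
from random import sample
import numpy as np

end_test = 10        # number of test samples
batch_size = 4
shuffle_test = sample(range(0, end_test, 1), end_test)
index_test = 8       # only 2 samples left in this pass

it_data = shuffle_test[index_test:end_test]                  # what is left
shuffle_test = sample(range(0, end_test, 1), end_test)       # reshuffle
remain = shuffle_test[: index_test + batch_size - end_test]  # top up to batch_size
it_data = np.concatenate((it_data, remain), axis=0)
print(len(it_data))  # 4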


Automation Testing Tutorials

Learn to execute automation testing from scratch with the LambdaTest Learning Hub, from setting up the prerequisites and running your first automation test to following best practices and diving into advanced test scenarios. LambdaTest Learning Hubs compile step-by-step guides to help you become proficient with different test automation frameworks, e.g. Selenium, Cypress, and TestNG.


YouTube

You can also refer to the video tutorials on the LambdaTest YouTube channel for step-by-step demonstrations from industry experts.

Run Avocado automation tests on the LambdaTest cloud grid

Perform automation testing on 3000+ real desktop and mobile devices online.

Try LambdaTest Now!

Get 100 automation test minutes FREE!

