Best Python code snippet using avocado_python
possion_FixPossionModel_siteBYsite_optimal_GCmodify.py
Source: possion_FixPossionModel_siteBYsite_optimal_GCmodify.py
#! /usr/bin/env python
# -*- coding: utf-8 -*-
#########################################################################################################
#    CNVar: accurate knowledge-based copy number variation prediction using max likelihood method       #
#                                                                                                       #
#    Copyright (C) 2019 Enming He(emhe@wegene.com)                                                      #
#                                                                                                       #
#    This program is free software: you can redistribute it and/or modify                               #
#    it under the terms of the GNU General Public License as published by                               #
#    the Free Software Foundation, either version 3 of the License, or                                  #
#    (at your option) any later version.                                                                #
#                                                                                                       #
#    This program is distributed in the hope that it will be useful,                                    #
#    but WITHOUT ANY WARRANTY; without even the implied warranty of                                     #
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the                                      #
#    GNU General Public License for more details.                                                       #
#                                                                                                       #
#    You should have received a copy of the GNU General Public License                                  #
#    along with this program.  If not, see <https://www.gnu.org/licenses/>.                             #
#########################################################################################################
import os
import sys
import re
import numpy as np
import scipy.stats as stats
import math
import subprocess
import argparse
import read_depth_method_matrix_mem_optimize_GCmodify_GCcorrect


def safe_open(infile):
    if infile.endswith('.gz'):
        import gzip
        return gzip.open(infile)
    else:
        return open(infile)


def read_GCprofile(infile):
    print('reading GCprofile: %s ...' % infile)
    GCprofile = {}
    with open(infile) as f:
        line = f.readline()
        headers = line.strip().split()
        line = f.readline()
        gc_values = line.strip().split()

    for i in range(1, len(headers)):
        GCprofile[int(headers[i])] = float(gc_values[i])
    print('finished reading GCprofile')
    return GCprofile


def possion(cov):
    # Poisson PMF lookup table for an expected depth of cov
    if cov <= 60:
        rate = 60
    else:
        rate = int(cov)
    Dict = {}
    n = np.arange((rate + 1) * 2)
    y = stats.poisson.pmf(n, cov)
    for i in n:
        Dict[i] = y[i]
    return Dict


def reverse_possion(cov):
    rate = int(cov)
    Dict = {}
    n = np.arange(rate)
    y = stats.poisson.pmf(n, rate)
    y1 = y.tolist()
    y1.reverse()
    for i in n:
        Dict[i] = y1[i]
    return Dict


def log(Dict, x, logmin):
    # log-probability of x under the PMF table, floored at logmin
    if x > max(Dict.keys()) or x < min(Dict.keys()):
        return math.log(logmin)
    else:
        if Dict[x] < logmin:
            return math.log(logmin)
        else:
            return math.log(Dict[x])


def possibility(D, DR, cov, N, CHROM_TEST, START_TEST, END_TEST, MISMATCH, logmin):
    lgP = math.log(1)
    CN_models = {}

    def possionCN(CN):
        if CN != 0:
            CN_model = possion(cov * CN / 2)
        else:
            CN_model = possion(MISMATCH * cov)
        return CN_model

    for i in range(START_TEST, END_TEST):
        depth = D[(CHROM_TEST, i)]
        CN = round(DR[(CHROM_TEST, i)] / N, 2)
        if CN not in CN_models:
            CN_models[CN] = possionCN(CN)
        y = log(CN_models[CN], depth, logmin)
        lgP += y
    return lgP


def read_ref(reffile, CHROM_TEST, START_TEST, END_TEST):
    print('reading ref file: %s ' % reffile)
    depth_sum = 0
    count = 0
    D = {}
    with safe_open(reffile) as f:
        for line in f:
            chrom, pos, depth = [int(x) for x in line.strip().split()[0:3]]
            if chrom == CHROM_TEST and pos <= END_TEST and pos >= START_TEST:
                D[(chrom, pos)] = depth
            elif chrom == CHROM_TEST and pos > END_TEST:
                break
    return D


def read_sample(infile, GCprofile, CHROM_TEST, START_TEST, END_TEST):
    print('reading sample file: %s ...' % infile)
    with safe_open(infile) as f:
        D = {}
        depth_sum = 0
        count = 0
        for line in f:
            chrom, pos, depth_raw = [int(x) for x in line.strip().split()[0:3]]
            if chrom == CHROM_TEST and pos <= END_TEST and pos >= START_TEST:
                depth = int(depth_raw * GCprofile[pos - pos % 50])
                D[(chrom, pos)] = depth
            elif chrom == CHROM_TEST and pos > END_TEST:  # assumes the background region lies after the testing region
                break
    return D


def calculator(LIST, sample, reflist, binsize, slide, gc_config, CHROM_TEST, START_TEST, END_TEST, AVERAGE_DEPTH, MISMATCH, logmin):
    min_start = START_TEST
    max_end = END_TEST
    # pad the tested region by 150 bp on each side
    read_depth_method_matrix_mem_optimize_GCmodify_GCcorrect.readDepthMethodMatrixMemOptimizeGCmodifyGCcorrect(
        sample, binsize, sample, slide, '%s:%s-%s' % (CHROM_TEST, START_TEST - 150, END_TEST + 150), gc_config)
    GCprofile = read_GCprofile('%s.GCprofile.txt' % sample)
    D = read_sample(sample, GCprofile, CHROM_TEST, START_TEST, END_TEST)
    cov = AVERAGE_DEPTH
    for ref_info in reflist:
        ref, Depth_Ref = ref_info.strip().split()
        N = float(Depth_Ref) / 2
        DR = read_ref(ref, CHROM_TEST, START_TEST, END_TEST)
        lgP = possibility(D, DR, cov, N, CHROM_TEST, START_TEST, END_TEST, MISMATCH, logmin)
        LIST.append((ref, lgP))


def possionFixPossionModelSiteBYSiteOptionGCmodify(reference_list, sample_list, outfile, threads, detect_region, average_depth, gc_config, binsize, slide_window, logmin, mismatch):
    logmin = float(logmin)
    m1 = re.search(r'(\d+):(\d+)-(\d+)', detect_region)
    CHROM_TEST = int(m1.group(1))
    START_TEST = int(m1.group(2))
    END_TEST = int(m1.group(3))
    AVERAGE_DEPTH = float(average_depth)
    MISMATCH = float(mismatch)

    print('''
        CHROM_TEST = %s
        START_TEST = %s
        END_TEST = %s
        ''' % (CHROM_TEST, START_TEST, END_TEST))

    reflist = []
    samplelist = []

    with open(reference_list) as f:
        for line in f:
            reflist.append(line.strip())
    with open(sample_list) as f:
        for line in f:
            samplelist.append(line.strip())

    sample_Dict = {}
    for sample in samplelist:
        sample_Dict[sample] = []
        calculator(sample_Dict[sample], sample, reflist, binsize, slide_window,
                   gc_config, CHROM_TEST, START_TEST, END_TEST, AVERAGE_DEPTH, MISMATCH, logmin)

    with open(outfile, 'w') as w:
        for sample in sample_Dict:
            mylist = sorted(sample_Dict[sample], key=lambda x: x[1], reverse=True)
            rank = 1
            for i in mylist:
                w.write('%s\t%s\t%s\t%s\n' % (sample, i[0], i[1], rank))
                rank += 1
    return 0


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-rl', '--reference_list', help='reference list', required=True)
    parser.add_argument('-sl', '--sample_list', help='sample list', required=True)
    parser.add_argument('-o', '--outfile', help='outfile', required=True)
    parser.add_argument('-t', '--threads', help='threads', default=16)
    parser.add_argument('-r', '--detect_region', help='detect region, chr:start-end', required=True)
    parser.add_argument('-d', '--average_depth', help='average depth', required=True)
    parser.add_argument('-c', '--gc_config', help='GC config', required=True)
    parser.add_argument('-bz', '--binsize', help='binsize', required=True)
    parser.add_argument('-sw', '--slide_window', help='slide_window', required=True)
    parser.add_argument('-l', '--logmin', help='logmin', default=1e-7)
    parser.add_argument('-m', '--mismatch', help='assume mismatch', default=1e-4)
    args = parser.parse_args()

    reference_list = args.reference_list
    sample_list = args.sample_list
    outfile = args.outfile
    threads = args.threads
    detect_region = args.detect_region
    average_depth = args.average_depth
    gc_config = args.gc_config
    binsize = args.binsize
    slide_window = args.slide_window
    logmin = args.logmin
    mismatch = args.mismatch
    ...
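The per-site scoring above boils down to building a Poisson PMF table for the expected depth of a candidate copy number (cov*CN/2, as in possion()) and summing floored log-probabilities of the observed depths (as in possibility() and log()). Below is a minimal, self-contained sketch of that step; cov, CN, logmin and observed_depths are made-up illustrative values, not data from the source.

# toy re-creation of the per-site Poisson scoring; values are made up for illustration
import math
import numpy as np
import scipy.stats as stats

cov = 30.0                 # assumed genome-wide average depth
CN = 3                     # candidate copy number suggested by a reference profile
logmin = 1e-7              # probability floor, mirroring the --logmin default

# PMF lookup table exactly as possion() builds it: Poisson with mean cov*CN/2
mean_depth = cov * CN / 2
rate = 60 if mean_depth <= 60 else int(mean_depth)
n = np.arange((rate + 1) * 2)
pmf = dict(zip(n.tolist(), stats.poisson.pmf(n, mean_depth)))

# accumulate the log-likelihood over observed depths, as possibility() does
observed_depths = [42, 45, 47]
lgP = math.log(1)
for depth in observed_depths:
    p = pmf.get(depth, 0.0)
    lgP += math.log(max(p, logmin))
print('log-likelihood under the CN=%d model: %.3f' % (CN, lgP))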
WeatherClassifier.py
Source: WeatherClassifier.py
import numpy as np
import pandas
import argparse
import matplotlib.pyplot as plt
import datetime
from keras.models import Sequential
from keras.layers import Dense, Dropout, advanced_activations


def main():
    # Get input args
    args = parse_arguments()
    # Init random seed for reproducibility
    np.random.seed(0)
    # load the dataset
    dataframe = pandas.read_csv(args["data_path"], engine='python', parse_dates=['DATE'],
                                date_parser=lambda x: pandas.to_datetime(x, infer_datetime_format=True))
    # Define the training set using the input begin and end dates
    train_df = dataframe[(dataframe['DATE'] >= datetime.datetime(args["begin_train"], 1, 1)) &
                         (dataframe['DATE'] <= datetime.datetime(args["end_train"], 12, 31))]
    # Define the testing set using the input begin and end dates
    test_df = dataframe[(dataframe['DATE'] >= datetime.datetime(args["begin_test"], 1, 1)) &
                        (dataframe['DATE'] <= datetime.datetime(args["end_test"], 12, 31))]
    # Remove null and other invalid entries in the data
    train_data = np.nan_to_num(train_df['TAVG'].values.astype('float32'))
    test_data = np.nan_to_num(test_df['TAVG'].values.astype('float32'))
    # Combine the data to one array
    combined_data = np.append(train_data, test_data)
    # reshape dataset to window matrix
    look_back = 12  # This is the size of the window
    trainX, trainY = create_dataset(train_data, look_back)
    testX, testY = create_dataset(test_data, look_back)
    # Define and fit the model
    model = create_model(look_back=look_back)
    model.fit(trainX, trainY, epochs=500, batch_size=12, verbose=2)
    # Estimate model performance
    trainScore = model.evaluate(trainX, trainY, verbose=0)
    print('Train Score: %.2f MAE' % (trainScore))
    testScore = model.evaluate(testX, testY, verbose=0)
    print('Test Score: %.2f MAE' % (testScore))
    # generate predictions for training
    trainPredict = model.predict(trainX)
    testPredict = model.predict(testX)
    # shift train predictions for plotting
    trainPredictPlot = np.empty((len(combined_data), 1))
    trainPredictPlot[:] = np.nan
    trainPredictPlot[look_back:len(trainPredict) + look_back] = trainPredict
    # shift test predictions for plotting
    testPredictPlot = np.empty((len(combined_data), 1))
    testPredictPlot[:] = np.nan
    testPredictPlot[len(trainPredict) + (look_back * 2) + 1:len(combined_data) - 1] = testPredict
    # Combine the results
    combined_df = train_df.append(test_df)
    combined_dates = combined_df['DATE']
    # plot baseline and predictions
    plt.plot(combined_dates, combined_data)
    plt.plot(combined_dates, trainPredictPlot)
    plt.plot(combined_dates, testPredictPlot)
    plt.minorticks_on()
    plt.show()


# Standard Model Creation
def create_model(look_back):
    # create and fit Multilayer Perceptron model
    model = Sequential()
    model.add(Dense(100, input_dim=look_back, activation='relu'))
    # model.add(Dense(50, activation='relu'))
    # model.add(Dense(25, activation='relu'))
    # model.add(Dense(10, activation='relu'))
    # model.add(Dense(5, activation='relu'))
    model.add(Dense(1))
    model.compile(loss='mean_absolute_error', optimizer='nadam')
    return model


# convert an array of values into a dataset matrix
def create_dataset(dataset, look_back=1):
    dataX, dataY = [], []
    for i in range(len(dataset) - look_back - 1):
        a = dataset[i:(i + look_back)]
        dataX.append(a)
        dataY.append(dataset[i + look_back])
    return np.array(dataX), np.array(dataY)


# Command Line Arguments are parsed here
def parse_arguments():
    parser = argparse.ArgumentParser()
    parser.add_argument("-dp", "--data_path", help="Data File Path")
    parser.add_argument("-ad", "--adverse", help="Turns on Adversarial Learning")
    parser.add_argument("-m", "--mode", help="Choose mode: full, grid")
    parser.add_argument("-e", "--epochs", help="Number of Epochs", type=int, nargs="*")
    parser.add_argument("-tr", "--train_ratio", nargs="*", type=int,
                        help="Set Train Ratios. Enter as a percent (20,40,60,80). Can be a list space delimited")
    parser.add_argument("-bs", "--batch_size", nargs="*", type=int,
                        help="Batch size. Can be a list space delimited")
    parser.add_argument("-n", "--neurons", nargs="*", type=int,
                        help="Number of Neurons. Can be a list space delimited")
    parser.add_argument("-o", "--optimizer", nargs="*",
                        help="Optimizers. Can be a list space delimited")
    parser.add_argument("-w", "--weight_constraint", nargs="*", type=int,
                        help="Weight Constraint. Can be a list space delimited")
    parser.add_argument("-d", "--dropout", nargs="*", type=int,
                        help="Dropout. Enter as percent (10,20,30,40...). Can be a list space delimited.")
    parser.add_argument("-model", "--model", help="Select which model to run: all, one_layer, four_decr, four_same")
    parser.add_argument("-s", "--splits", help="Number of Splits for SSS", type=int)
    parser.add_argument("-btr", "--begin_train", help="Year to begin training (1940-2016)", type=int)
    parser.add_argument("-etr", "--end_train", help="Year to end training. Should be higher than begin & <=2017", type=int)
    parser.add_argument("-bts", "--begin_test", help="Year to begin testing (1940-2017)", type=int)
    parser.add_argument("-ets", "--end_test", help="Year to end testing. Should be higher than begin test.", type=int)
    args = parser.parse_args()
    arguments = {}
    if args.data_path:
        arguments["data_path"] = args.data_path
    else:
        print("Default Data Path: ../Data/BWIMonthly1939.csv")
        arguments["data_path"] = "../Data/BWIMonthly1939.csv"
    if args.adverse:
        adverse = True
    else:
        adverse = False
    arguments["adverse"] = adverse
    if args.mode == "grid":
        mode = "grid"
        print("Mode is %s" % mode)
    else:
        mode = "full"
        print("Mode is %s" % mode)
    arguments["mode"] = mode
    if args.model == "all":
        model = ["oneLayer", "fourDecr", "fourSame"]
    elif args.model in ["oneLayer", "fourDecr", "fourSame"]:
        model = [args.model]
    else:
        print("Defaulting to All models")
        model = ["oneLayer", "fourDecr", "fourSame"]
    arguments["model"] = model
    if args.epochs:
        epochs = args.epochs
    else:
        print("Defaulting to 16 epochs")
        epochs = 16
    arguments["epochs"] = epochs
    if args.train_ratio:
        train_ratio = args.train_ratio
    else:
        print("Defaulting to testing all ratios")
        train_ratio = [20, 40, 60, 80]
    arguments["train_ratio"] = train_ratio
    if args.batch_size:
        batch_size = args.batch_size
    else:
        print("Defaulting to Batch Size 10")
        batch_size = 10
    arguments["batch_size"] = batch_size
    if args.neurons:
        neurons = args.neurons
    else:
        print("Defaulting to 45 Neurons")
        neurons = 45
    arguments["neurons"] = neurons
    if args.optimizer:
        optimizer = args.optimizer
    else:
        print("Defaulting to NADAM Optimizer")
        optimizer = "Nadam"
    arguments["optimizer"] = optimizer
    if args.weight_constraint:
        weight_constraint = args.weight_constraint
    else:
        print("Defaulting to weight constraint 5")
        weight_constraint = 5
    arguments["weight_constraint"] = weight_constraint
    if args.dropout:
        dropout = args.dropout
    else:
        print("Defaulting to dropout of 10%")
        dropout = 10
    arguments["dropout"] = dropout
    if args.splits:
        splits = args.splits
    else:
        print("Defaulting to 1 SSS Split")
        splits = 1
    arguments["splits"] = splits
    if args.begin_train:
        begin_train = args.begin_train
    else:
        print("Default begin training is 1940")
        begin_train = 1940
    arguments["begin_train"] = begin_train
    if args.end_train:
        end_train = args.end_train
    else:
        print("Default end training is 1980")
        end_train = 1980
    if end_train < begin_train:
        print("End_Train should be bigger than Begin_Train")
        exit(1)
    arguments["end_train"] = end_train
    if args.begin_test:
        begin_test = args.begin_test
    else:
        print("Default begin test is 1981")
        begin_test = 1981
    arguments["begin_test"] = begin_test
    if args.end_test:
        end_test = args.end_test
    else:
        print("Default end test is 2017")
        end_test = 2017
    if end_test < begin_test:
        print("End_Test should be bigger than Begin_Test")
        exit(1)
    arguments["end_test"] = end_test
    return arguments


if __name__ == "__main__":
    ...
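The key data-preparation step above is the sliding-window reshaping done by create_dataset(). The short sketch below re-uses that function on a synthetic series (not the BWI weather data) to show the shapes it produces:

# demonstrate the create_dataset() windowing on a synthetic series
import numpy as np

def create_dataset(dataset, look_back=1):
    # same windowing as in WeatherClassifier.py: each row of X holds look_back
    # consecutive values, y holds the value that follows the window
    dataX, dataY = [], []
    for i in range(len(dataset) - look_back - 1):
        dataX.append(dataset[i:(i + look_back)])
        dataY.append(dataset[i + look_back])
    return np.array(dataX), np.array(dataY)

series = np.arange(10, dtype='float32')     # stand-in for monthly TAVG values
X, y = create_dataset(series, look_back=3)
print(X.shape, y.shape)                     # (6, 3) (6,)
print(X[0], y[0])                           # [0. 1. 2.] 3.0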
sarimax_script.py
Source: sarimax_script.py
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import itertools
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller
from datetime import datetime, timedelta
from dateutil.relativedelta import *
import statsmodels.tsa.api as smt
import seaborn as sns
from sklearn.metrics import mean_squared_error
import pickle
from script import *


# function to create training and testing set
def create_train_test(data, start_train, end_train, start_test, end_test, test_length=24):
    df_train = data.loc[start_train:end_train, :]
    df_test = data.loc[start_test:end_test, :]
    start = datetime.strptime(start_test, '%Y-%m-%d %H:%M:%S')
    date_list = [start + relativedelta(hours=x) for x in range(0, test_length)]  # test set will always have 24 hours
    future = pd.DataFrame(index=date_list, columns=df_train.columns)
    df_train = pd.concat([df_train, future])
    return df_train, df_test


# function to add all exogenous variables
def add_exog(data, weather, start_time, end_time):
    # add dummy variables for precipitation
    precip = pd.get_dummies(weather.precip_type)
    data = data.join(precip)
    data['Day_of_Week'] = data.index.dayofweek
    data['Weekend'] = data.apply(is_weekend, axis=1)
    data['Temperature'] = weather.loc[start_time:end_time, 'temperature']
    data['Humidity'] = weather.loc[start_time:end_time, 'humidity']
    data['Precip_Intensity'] = weather.loc[start_time:end_time, 'precip_intensity']
    data.rename(columns={'rain': 'Rain', 'sleet': 'Sleet', 'snow': 'Snow'}, inplace=True)
    # fill missing values with mean
    data['Temperature'] = data.Temperature.fillna(np.mean(data['Temperature']))
    data['Humidity'] = data.Humidity.fillna(np.mean(data['Humidity']))
    data['Precip_Intensity'] = data.Precip_Intensity.fillna(np.mean(data['Precip_Intensity']))
    return data


# function to find start/end dates for train and test
def find_dates(building_id, length=1, total_length=30, final_date=None):
    start_train, end_test = find_egauge_dates(building_id, total_length)
    time_delta_1 = timedelta(days=length)
    time_delta_2 = timedelta(hours=1)
    end_train = end_test - time_delta_1
    start_test = end_train + time_delta_2
    start_train = str(start_train)
    end_train = str(end_train)
    start_test = str(start_test)
    end_test = str(end_test)
    return start_train, end_train, start_test, end_test


def fit_exog_arima(data, weather, building_id, length=1, total_length=30, test_length=24):
    start_train, end_train, start_test, end_test = find_dates(building_id, length=length, total_length=total_length)
    df_train, df_test = create_train_test(data, start_train, end_train, start_test, end_test, test_length)
    df_exog = add_exog(df_train, weather, start_train, end_test)
    exogenous = df_exog.loc[start_train:, ['Weekend', 'Temperature', 'Humidity', 'car1']].astype(float)
    endogenous = df_exog.loc[:, 'Hourly_Usage'].astype(float)
#    low_aic = gridsearch_arima(endogenous, exogenous)
#    arima_model = sm.tsa.statespace.SARIMAX(endog=endogenous,
#                                            exog=exogenous,
#                                            trend=None,
#                                            order=low_aic[0],
#                                            seasonal_order=low_aic[1],
#                                            enforce_stationarity=False,
#                                            enforce_invertibility=False)
    arima_model = sm.tsa.statespace.SARIMAX(endog=endogenous,
                                            exog=exogenous,
                                            trend=None,
                                            order=(1, 0, 1),
                                            seasonal_order=(0, 1, 1, 24),
                                            enforce_stationarity=False,
                                            enforce_invertibility=False)
    results = arima_model.fit()
    return df_exog, results


def plot_exog_arima(data, data_exog, model, building_id, length=1, total_length=30, test_length=24):
    start_train, end_train, start_test, end_test = find_dates(building_id, length=length, total_length=total_length)
    df_train, df_test = create_train_test(data, start_train, end_train, start_test, end_test, test_length=test_length)
    df_exog_train, df_exog_test = create_train_test(data_exog, start_train, end_train, start_test, end_test, test_length=test_length)
    mse, rmse = add_forecast(model, df_test, df_exog_train, start_test, end_test)
    plot_forecast(df_exog_train, 500)
    return mse, rmse, df_exog_train


# function to find optimal parameters and resulting AIC score
def gridsearch_arima(y, exog=None):
    p = d = q = range(0, 2)
    pdq = list(itertools.product(p, d, q))
    seasonal_pdq = [(x[0], x[1], x[2], 24) for x in list(itertools.product(p, d, q))]
    low_aic = [0, 0, 50000]
    for param in pdq:
        for param_seasonal in seasonal_pdq:
            try:
                model = sm.tsa.statespace.SARIMAX(y,
                                                  exog=exog,
                                                  order=param,
                                                  seasonal_order=param_seasonal,
                                                  enforce_stationarity=False,
                                                  enforce_invertibility=False)
                results = model.fit()
                if results.aic < low_aic[2]:
                    low_aic = [param, param_seasonal, results.aic]
#                 print('ARIMA{}x{}24 - AIC:{}'.format(param, param_seasonal, results.aic))
            except:
                continue
    return low_aic


# function to forecast with fitted model, returns MSE and RMSE
def add_forecast(model, test, train, start_time, end_time):
    train['forecast'] = model.predict(start=start_time, end=end_time)
    y_true = test.loc[start_time:end_time, 'Hourly_Usage']
    y_pred = train.loc[start_time:end_time, 'forecast']
    train.loc[start_time:end_time, 'Hourly_Usage'] = test.loc[start_time:end_time, 'Hourly_Usage']
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    return mse, rmse


def plot_forecast(data, datapoints):
    fig = plt.figure(figsize=(16, 8))
    plt.plot(data['Hourly_Usage'][datapoints:])
    plt.plot(data['forecast'])
    plt.legend()


# function to find mean car charge
def mean_car_charge(data, start, end):
    car_charge = {}
    for index in data.Time_Index.unique():
        car_charge[index] = np.mean(data[data.Time_Index == index].car1)
    return car_charge


# function to add all exogenous variables
def create_exog_endo(data, weather, building_id, length=1, total_length=30):
    start_train, end_train, start_test, end_test = find_dates(building_id, length, total_length)
    df_train, df_test = create_train_test(data, start_train, end_train, start_test, end_test, 24 * length)
    car_charge = mean_car_charge(data, start_train, end_train)
    df_train['Time_Index'] = df_train.index.weekday_name + df_train.index.hour.astype(str)
    df_train['Temperature'] = weather.loc[start_train:end_test, 'temperature']
    df_train['Humidity'] = weather.loc[start_train:end_test, 'humidity']
    for time in df_train.loc[start_test:end_test, :].index:
        df_train.loc[time, 'car1'] = car_charge[df_train.loc[time, 'Time_Index']]
    # fill missing values with mean
    df_train['Temperature'] = df_train.Temperature.fillna(np.mean(df_train['Temperature']))
    df_train['Humidity'] = df_train.Humidity.fillna(np.mean(df_train['Humidity']))
    exogenous = df_train.loc[start_train:, ['Temperature', 'Humidity', 'car1']].astype(float)
    endogenous = df_train.loc[:, 'Hourly_Usage'].astype(float)
    return df_train, exogenous, endogenous


# function to fit SARIMAX model with create_exog_endo
def fit_exog_arima_new(exogenous, endogenous):
    low_aic = gridsearch_arima(endogenous, exogenous)
    arima_model = sm.tsa.statespace.SARIMAX(endog=endogenous,
                                            exog=exogenous,
                                            trend=None,
                                            order=low_aic[0],
                                            seasonal_order=low_aic[1],
                                            enforce_stationarity=False,
                                            enforce_invertibility=False)
    arima_exog_results = arima_model.fit()
    ...
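Both fit_exog_arima() and gridsearch_arima() ultimately call sm.tsa.statespace.SARIMAX with a period-24 seasonal order and compare fits by AIC. Here is a minimal, self-contained version of that call on a synthetic hourly series; the eGauge and weather data are not available here, and the (1, 0, 1)x(0, 1, 1, 24) order is simply the fixed one used in fit_exog_arima().

# fit a single SARIMAX candidate on synthetic hourly data and report its AIC
import numpy as np
import pandas as pd
import statsmodels.api as sm

rng = pd.date_range('2018-01-01', periods=240, freq='H')
usage = 10 + np.sin(np.arange(240) * 2 * np.pi / 24) + np.random.normal(0, 0.1, 240)
y = pd.Series(usage, index=rng)

model = sm.tsa.statespace.SARIMAX(y,
                                  order=(1, 0, 1),
                                  seasonal_order=(0, 1, 1, 24),
                                  enforce_stationarity=False,
                                  enforce_invertibility=False)
results = model.fit(disp=False)
print('AIC: %.1f' % results.aic)                         # the score gridsearch_arima() minimises
forecast = results.predict(start=rng[-24], end=rng[-1])   # forecast of the final day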
data_parse.py
Source: data_parse.py
import re
import os
import csv
import sys
import numpy as np
import pickle as pkl
from random import sample
from DataProcess.config import ConfigParam


class DataParse:
    def random_embedding(self, embedding_dim, word_num):
        """
        Randomly generate word embeddings. If a large enough corpus is available,
        word2vec-trained vectors could be used here instead, which may separate
        words more clearly.
        :param embedding_dim: dimensionality of the word vectors.
        :param word_num: vocabulary size.
        :return: numpy array of shape (word_num, embedding_dim).
        """
        # Give every word in the vocabulary (word_num entries) a vector of
        # embedding_dim dimensions, drawn uniformly from (-0.25, 0.25).
        embedding_mat = np.random.uniform(-0.25, 0.25, (word_num, embedding_dim))
        embedding_mat = np.float32(embedding_mat)
        return embedding_mat


class Data_Inter:
    """
    Generate training data.
    """
    def __init__(self, vocab_au, vocab_key):
        self.config = ConfigParam()
        self.vocab_au = vocab_au  # author vocabulary (token -> index)
        self.vocab_key = vocab_key  # keyword vocabulary (token -> index)
        self.index = 0
        self.index_test = 0
        self.reload_num = 10
        # train set
        self.task_sentence = pkl.load(open('DataProcess/data_manage_key_mapping.pkl', mode='rb'))
        self.au_all, self.ke_all = self.get_author_art()
        print(self.au_all.shape, self.ke_all.shape)
        # TRAIN
        self.get_data_mapping()

    def get_data_mapping(self):
        self.shuffle_all = sample(range(0, self.au_all.shape[0], 1), self.au_all.shape[0])
        self.all_sam = len(self.shuffle_all)
        self.au = self.au_all[self.shuffle_all[: int(self.all_sam * 0.7)]]  # 70 % of the data for training
        self.ke = self.ke_all[self.shuffle_all[: int(self.all_sam * 0.7)]]
        self.au_test = self.au_all[self.shuffle_all[int(self.all_sam * 0.7):]]
        self.ke_test = self.ke_all[self.shuffle_all[int(self.all_sam * 0.7):]]
        self.shuffle = sample(range(0, self.au.shape[0], 1), self.au.shape[0])
        self.shuffle_test = sample(range(0, self.au_test.shape[0], 1), self.au_test.shape[0])
        self.end = self.au.shape[0]
        self.end_test = self.au_test.shape[0]

    def get_author_art(self):
        auth = []
        keys = []
        for j in self.task_sentence:
            for k in j:
                auth.append(k[: 2])
                keys.append(k[2][: 7])
        return np.array(auth), np.array(keys)

    def next(self):
        # fetch one training batch
        sentence = []
        task_ = []
        if self.index + self.config.batch_size < self.end:  # not yet at the end of the data
            it_data = self.shuffle[self.index: self.index + self.config.batch_size]
            self.index += self.config.batch_size  # advance the batch index
        elif self.index + self.config.batch_size == self.end:  # exactly at the end
            it_data = self.shuffle[self.index: self.end]
            self.shuffle = sample(range(0, self.end, 1), self.end)
            self.index = 0
        else:
            it_data = self.shuffle[self.index: self.end]  # take what is left
            self.shuffle = sample(range(0, self.end, 1), self.end)
            remain = self.shuffle[: self.index + self.config.batch_size - self.end]  # top up from the reshuffled data
            it_data = np.concatenate((it_data, remain), axis=0)
            self.index = 0
            if self.reload_num > 0:
                self.get_data_mapping()
                self.reload_num -= 1
        # print('it_data:', it_data)
        sentences_au = self.au[it_data]
        sentences_key = self.ke[it_data]
        for cur_sentences, cur_task in zip(sentences_au, sentences_key):
            task_.append(self.sentence2index(cur_sentences, self.vocab_au))  # author mapping
            sentence.append(self.sentence2index(cur_task, self.vocab_key))  # keys mapping
        return np.array(sentence), np.array(task_)

    def next_test(self):
        # fetch one test batch
        sentence = []
        task_ = []
        if self.index_test + self.config.batch_size <= self.end_test:  # not yet at the end of the data
            # take a batch_size-sized slice of the shuffled test indices
            it_data = self.shuffle_test[self.index_test: self.index_test + self.config.batch_size]
            self.index_test += self.config.batch_size  # advance the batch index
        elif self.index_test + self.config.batch_size == self.end_test:  # exactly at the end
            it_data = self.shuffle_test[self.index_test: self.end_test]
            self.shuffle_test = sample(range(0, self.end_test, 1), self.end_test)
            self.index_test = 0  # reset the index
        else:
            it_data = self.shuffle_test[self.index_test: self.end_test]  # take what is left
            self.shuffle_test = sample(range(0, self.end_test, 1), self.end_test)
            remain = self.shuffle_test[: self.index_test + self.config.batch_size - self.end_test]  # top up from the reshuffled data
            it_data = np.concatenate((it_data, remain), axis=0)
            self.index_test = 0
        sentences_au = self.au_test[it_data]
        sentences_key = self.ke_test[it_data]
        for cur_sentences, cur_task in zip(sentences_au, sentences_key):
            task_.append(self.sentence2index(cur_sentences, self.vocab_au))  # author mapping
            sentence.append(self.sentence2index(cur_task, self.vocab_key))  # keys mapping
        return np.array(sentence), np.array(task_)

    def sentence2index(self, sen, vocab):
        sen2id = []
        for cur_sen in sen:
            sen2id.append(vocab.get(cur_sen, 0))  # unknown tokens map to index 0
        return sen2id

    def task2index(self, cur_tasks, mapping):
        assert isinstance(cur_tasks, list) and len(cur_tasks) > 0 and hasattr(cur_tasks, '__len__')
        assert isinstance(mapping, dict) and len(mapping) > 0
        cur_task2index_mapping = []
        for cur_task in cur_tasks:
            cur_task2index_mapping.append(mapping[cur_task])
        return cur_task2index_mapping


if __name__ == '__main__':
    from DataProcess.dataset_info import info
    au_vocab, key_vocab, au_len, key = info('DataProcess/data_manage_key_mapping.pkl')
    ...
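random_embedding() does not depend on the rest of the pipeline, so it is easy to exercise on its own; the vocabulary below is a made-up example, not from the source data.

# stand-alone version of the uniform(-0.25, 0.25) embedding initialisation above
import numpy as np

def random_embedding(embedding_dim, word_num):
    # one embedding_dim-dimensional vector per vocabulary entry
    embedding_mat = np.random.uniform(-0.25, 0.25, (word_num, embedding_dim))
    return np.float32(embedding_mat)

vocab = {'<PAD>': 0, 'deep': 1, 'learning': 2}    # hypothetical vocabulary
emb = random_embedding(embedding_dim=8, word_num=len(vocab))
print(emb.shape, emb.dtype)                       # (3, 8) float32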