Best Python code snippet using Kiwi_python
LHS_NLP_ML.py
Source:LHS_NLP_ML.py  
#!/usr/bin/env python
# coding: utf-8

# In[14]:
import pandas as pd

df_training_tweet = pd.read_csv("/Users/arjunanandapadmanabhan/Downloads/wn22_data/wn22_PA_training_tweets.txt", sep=",")
df_labels = pd.read_csv("/Users/arjunanandapadmanabhan/Downloads/wn22_data/wn22_PA_training_labels.txt", sep=",")
final_df = pd.merge(df_training_tweet, df_labels, on="TweetID")
final_df

df_testing_tweet = pd.read_csv("/Users/arjunanandapadmanabhan/Downloads/wn22_data/wn22_PA_testing_tweets.txt", sep=",")
df_testing_tweet

import re
import nltk
from nltk.corpus import stopwords

# lowercase the tweets and strip mentions, digits, punctuation, URLs and leading "rt"
def clean_text(df, text_field):
    df[text_field] = df[text_field].str.lower()
    df[text_field] = df[text_field].apply(lambda x: re.sub(r"(@[A-Za-z0-9]+)|(\d+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", x))
    return df

test_clean = clean_text(df_testing_tweet, "Tweet")
train_clean = clean_text(final_df, "Tweet")
train_clean.Tweet[4]

import pandas as pd
import numpy as np
import nltk
import string
import contractions
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

# expand contractions, tokenize, then drop punctuation and stopwords
train_clean['no_contract'] = train_clean['Tweet'].apply(lambda x: [contractions.fix(word) for word in x.split()])
train_clean['Tweet'] = [' '.join(map(str, l)) for l in train_clean['no_contract']]
test_clean['no_contract'] = test_clean['Tweet'].apply(lambda x: [contractions.fix(word) for word in x.split()])
test_clean['Tweet'] = [' '.join(map(str, l)) for l in test_clean['no_contract']]
train_clean['tokenized'] = train_clean['Tweet'].apply(word_tokenize)
train_clean['tokenized'] = train_clean['tokenized'].apply(lambda x: [word.lower() for word in x])
test_clean['tokenized'] = test_clean['Tweet'].apply(word_tokenize)
test_clean['tokenized'] = test_clean['tokenized'].apply(lambda x: [word.lower() for word in x])
punc = string.punctuation
train_clean['no_punc'] = train_clean['tokenized'].apply(lambda x: [word for word in x if word not in punc])
test_clean['no_punc'] = test_clean['tokenized'].apply(lambda x: [word for word in x if word not in punc])
stop_words = set(stopwords.words('english'))
train_clean['tokenized'] = train_clean['no_punc'].apply(lambda x: [word for word in x if word not in stop_words])
test_clean['tokenized'] = test_clean['no_punc'].apply(lambda x: [word for word in x if word not in stop_words])

# POS-tag, map Penn Treebank tags to WordNet tags, then lemmatize
train_clean['pos_tags'] = train_clean['tokenized'].apply(nltk.tag.pos_tag)
test_clean['pos_tags'] = test_clean['tokenized'].apply(nltk.tag.pos_tag)

def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

train_clean['wordnet_pos'] = train_clean['pos_tags'].apply(lambda x: [(word, get_wordnet_pos(pos_tag)) for (word, pos_tag) in x])
test_clean['wordnet_pos'] = test_clean['pos_tags'].apply(lambda x: [(word, get_wordnet_pos(pos_tag)) for (word, pos_tag) in x])
wnl = WordNetLemmatizer()
train_clean['lemmatized'] = train_clean['wordnet_pos'].apply(lambda x: [wnl.lemmatize(word, tag) for word, tag in x])
test_clean['lemmatized'] = test_clean['wordnet_pos'].apply(lambda x: [wnl.lemmatize(word, tag) for word, tag in x])
train_clean['Tweet'] = [' '.join(map(str, l)) for l in train_clean['lemmatized']]
test_clean['Tweet'] = [' '.join(map(str, l)) for l in test_clean['lemmatized']]

# Can be used to check the frequency of a term
# d = {}
# for word in train_clean['Tweet']:
#     for item in word.split():
#         if item in d:
#             d[item] = d[item] + 1
#         else:
#             d[item] = 1
# for i, word in enumerate(train_clean['Tweet']):
#     for item in word.split():
#         if d[item] <= 10:
#             train_clean['Tweet'][i] = train_clean['Tweet'][i].replace(item, '')
#             train_clean['Tweet'][i] = re.sub(' +', ' ', train_clean['Tweet'][i])
# train_clean['Tweet'][0]

# balance the classes by upsampling the minority label
from sklearn.utils import resample
train_majority = train_clean[train_clean.Label==0]
train_minority = train_clean[train_clean.Label==1]
train_minority_upsampled = resample(train_minority,
                                    replace=True,
                                    n_samples=len(train_majority),
                                    random_state=123)
train_upsampled = pd.concat([train_minority_upsampled, train_majority])
train_upsampled['Label'].value_counts()

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
# from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import svm
# from sklearn import tree

# alternative classifiers that were tried, kept for reference
# pipeline_svc = Pipeline([
#     ('vect', CountVectorizer()),
#     ('tfidf', TfidfTransformer()),
#     ('nb', SGDClassifier(learning_rate='constant', eta0=0.96, epsilon=0.0004, max_iter=5000, validation_fraction=0.8, loss='log')),
# ])
# pipeline_svc = Pipeline([
#     ('vect', CountVectorizer()),
#     ('tfidf', TfidfTransformer()),
#     ('nb', MLPClassifier(hidden_layer_sizes=(2,), solver='adam', learning_rate='adaptive', max_iter=1000, epsilon=1e-6, tol=0.0001))
# ])
# pipeline_svc = Pipeline([
#     ('vect', CountVectorizer(ngram_range=(1,3))),
#     ('tfidf', TfidfTransformer()),
#     ('nb', svm.SVC(kernel='poly', C=2.5, degree=2, coef0=0.18, break_ties=True))
# ])
pipeline_svc = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('nb', svm.SVC(gamma='scale'))
])
# pipeline_svc = Pipeline([
#     ('vect', CountVectorizer()),
#     ('tfidf', TfidfTransformer()),
#     ('nb', GradientBoostingClassifier(n_estimators=10000, subsample=1))
# ])

# grid search over SGDClassifier eta0/epsilon, kept for reference
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import f1_score
# from numpy import arange
# ep = [0.001, 0.0002, 0.0003, 0.0004, 0.0007, 0.005, 0.03, 0.01, 0.003, 0.000004, 0.00000006, 0.005]
# max_num = 0.0
# ep_max = 0.0
# et_max = 0.0
# for e in ep:
#     for et in arange(0.01, 1, 0.01):
#         pipeline_svc = Pipeline([
#             ('vect', CountVectorizer()),
#             ('tfidf', TfidfTransformer()),
#             ('nb', SGDClassifier(learning_rate='constant', eta0=et, epsilon=e, max_iter=5000, validation_fraction=0.8, loss='log')),
#         ])
#         X_train, X_test, y_train, y_test = train_test_split(train_upsampled['Tweet'], train_upsampled['Label'], random_state=0)
#         model = pipeline_svc.fit(X_train, y_train)
#         y_predict = model.predict(X_test)
#         x = f1_score(y_test, y_predict)
#         if x > max_num:
#             max_num = x
#             ep_max = e
#             et_max = et
# print(max_num, ep_max, et_max)

# In[124]:
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(train_upsampled['Tweet'], train_upsampled['Label'], random_state=123)

# In[137]:
X_train = train_upsampled['Tweet']
y_train = train_upsampled['Label']

# In[138]:
model = pipeline_svc.fit(X_train, y_train)
# y_predict = model.predict(X_test)
# from sklearn.metrics import f1_score
# f1_score(y_test, y_predict)

# In[140]:
x_valid = df_testing_tweet['Tweet']
y_predict_1 = model.predict(x_valid)
y_predict_1

# In[141]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# tfidf_vec = TfidfVectorizer()
# x_training = tfidf_vec.fit_transform(X_train)
# x_validation = tfidf_vec.transform(X_test)

# In[142]:
# from sklearn import svm
# model_svm = svm.SVC()
# model_svm.fit(x_training, y_train)

# In[143]:
# pred_svm = model_svm.predict(x_validation)

# In[144]:
# from sklearn.metrics import f1_score
# f1_score(y_test, pred_svm)

# In[145]:
# x_valid = df_testing_tweet['Tweet']
# x_test = tfidf_vec.transform(x_valid)
# y_predict_1 = model_svm.predict(x_test)
# y_predict_1

# In[146]:
pred = pd.DataFrame(y_predict_1)
pred.columns = ['Label']
pred
data = [df_testing_tweet['TweetID'], pred['Label']]
headers = ["TweetID", "Label"]
Final = pd.concat(data, axis=1, keys=headers)
Final.to_csv(r'/Users/arjunanandapadmanabhan/Downloads/wn22_data/output_2.txt', header=True, index=None, sep=',', mode='a')

# Confusion Matrix
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import itertools
import numpy as np

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

# requires y_test and y_predict from the commented-out train/test split above
classes = list(set(y_test))
cm = confusion_matrix(y_test, y_predict, labels=classes)
...
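Note that the confusion-matrix call at the end of this snippet depends on y_test from the commented-out train_test_split, so it only runs if that split is restored. Below is a minimal evaluation sketch under that assumption, reusing train_upsampled and pipeline_svc from the snippet; because the minority class is upsampled before splitting, duplicated tweets can land on both sides of the split, so the validation score will be optimistic.

# Evaluation sketch: hold out a validation split from the upsampled training data
# (assumes train_upsampled and pipeline_svc defined in the snippet above).
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix

X_tr, X_val, y_tr, y_val = train_test_split(
    train_upsampled['Tweet'], train_upsampled['Label'],
    test_size=0.25, random_state=123, stratify=train_upsampled['Label'])
eval_model = pipeline_svc.fit(X_tr, y_tr)
y_val_pred = eval_model.predict(X_val)
print("validation F1:", f1_score(y_val, y_val_pred))
print(confusion_matrix(y_val, y_val_pred, labels=[0, 1]))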
Ion XGBoost.py
Source:Ion XGBoost.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon May  4 14:03:53 2020
@author: feichang
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
type(train.shape[0])
train_clean = train
"""
#take a look at the train
plt.figure(figsize=(5,5)); res = 1000
plt.plot(range(0,train.shape[0],res),train.signal[0::res])
for i in range(11): plt.plot([i*500000,i*500000],[-5,12.5],'r')
plt.show()
#take a look at the test
plt.figure(figsize=(10,5)); res = 1000
plt.plot(range(0,test.shape[0],res),test.signal[0::res])
plt.show()
"""
#first, fix the linear drift in segment 2 (rows 500000-600000)
a = 500000
b = 600000
seg_1 = train.loc[train.index[a:b], 'signal'].values
time_1 = train.loc[train.index[a:b], 'time'].values
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(time_1.reshape(-1,1), seg_1)
train_clean.loc[train.index[a:b], 'signal'] = train_clean.signal[a:b].values - regressor.coef_*(train_clean.time.values[a:b] - 50)
"""
plt.figure(figsize=(5,5)); res = 1000
plt.plot(range(0,train_clean.shape[0],res),train.signal[0::res])
for i in range(11): plt.plot([i*500000,i*500000],[-5,12.5],'r')
plt.show()
"""
#then fix the polynomial drift, one 500000-row segment at a time
a = 0
while a < 4500001:
    b = a + 500000
    seg_2 = train.loc[train.index[a:b], 'signal'].values
    time_2 = train.loc[train.index[a:b], 'time'].values

    from sklearn.preprocessing import PolynomialFeatures
    poly_reg = PolynomialFeatures(degree=2)
    time_poly = poly_reg.fit_transform(time_2.reshape(-1,1))
    #define poly regressor
    lin_reg2 = LinearRegression()
    lin_reg2.fit(time_poly, seg_2)
    drift_0 = lin_reg2.predict(time_poly)[0]
    drift = lin_reg2.predict(time_poly) - drift_0
    train_clean.loc[train.index[a:b], 'signal'] = train_clean.signal[a:b].values - drift
    a += 500000
#now the signal data is clean, look:
"""
res = 1000
plt.plot(range(0,train.shape[0],res),train.signal[0::res])
"""
"""
We also need to take the average current of the "phase" into consideration
"""
a = 0
train_clean['Mean'] = 0.
train_clean['stdev'] = 0.
while a < 4500001:
    b = a + 500000
    avg = np.mean(train_clean.signal[a:b].values, dtype='float32')
    std = np.std(train_clean.signal[a:b].values, dtype='float32')
    train_clean.Mean[a:b].values.fill(avg)
    train_clean.stdev[a:b].values.fill(std)
    a += 500000
"""
train_clean.head()
"""
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_clean[['signal','Mean','stdev']], train_clean['open_channels'], test_size=0.25, random_state=0)
"""
from sklearn.tree import DecisionTreeClassifier
classifier_1 = DecisionTreeClassifier(random_state=0, max_depth=11,
                                      min_samples_split=32, min_samples_leaf=5)
classifier_1.fit(X_train, y_train)
"""
from xgboost import XGBClassifier
classifier_1 = XGBClassifier()
classifier_1.fit(X_train, y_train)
prediction = classifier_1.predict(X_test)
from sklearn.metrics import f1_score
F1 = f1_score(y_test, prediction, average='macro')
print('F1 score:', F1)
"""
now it's very close. We are ready to move to the next step
plt.plot(prediction,'red')
plt.plot(train_clean['open_channels'])
"""
"""
now we clean the test data
first take a look
plt.plot(test['signal'])
"""
#every 100000 points is a phase, up to row 1000000
test_clean = test
a = 0
while a < 900001:
    b = a + 100000
    seg_1 = test.loc[test.index[a:b], 'signal'].values
    time_1 = test.loc[test.index[a:b], 'time'].values
    regressor_3 = LinearRegression()
    regressor_3.fit(time_1.reshape(-1,1), seg_1)
    drift_0 = regressor_3.predict(time_1.reshape(-1,1))[0]
    drift = regressor_3.predict(time_1.reshape(-1,1)) - drift_0

    test_clean.loc[test.index[a:b], 'signal'] = test_clean.signal[a:b].values - drift
    a += 100000
"""
take a look
plt.plot(test_clean['signal'])
"""
#rows 1000000 to 1500000 follow a polynomial drift
a = 1000000
b = 1500000
seg_2 = test.loc[test.index[a:b], 'signal'].values
time_2 = test.loc[test.index[a:b], 'time'].values

poly_reg = PolynomialFeatures(degree=2)
time_poly = poly_reg.fit_transform(time_2.reshape(-1,1))
lin_reg2 = LinearRegression()
lin_reg2.fit(time_poly, seg_2)
drift_0 = lin_reg2.predict(time_poly)[0]
drift = lin_reg2.predict(time_poly) - drift_0
test_clean.loc[test_clean.index[a:b], 'signal'] = test_clean.signal[a:b].values - drift + 0.25
"""
take a look
plt.figure(figsize=(20,5)); res = 1000
plt.plot(range(0,test_clean.shape[0],res),test_clean.signal[0::res])
plt.plot(test_clean['signal'])
plt.figure(figsize=(20,5)); res = 1000
plt.plot(pd.read_csv('test.csv')['signal'])
"""
a = 0
test_clean['Mean'] = 0.
test_clean['stdev'] = 0.
while a < 1900001:
    b = a + 100000
    avg = np.mean(test_clean.signal[a:b].values, dtype='float32')
    std = np.std(test_clean.signal[a:b].values, dtype='float32')
    test_clean.Mean[a:b].values.fill(avg)
    test_clean.stdev[a:b].values.fill(std)
    a += 100000
test_clean.head()
prediction = classifier_1.predict(test_clean[['signal','Mean','stdev']])
"""
Take a look at the mean
plt.plot(test_clean['Mean'])
plt.plot(train_clean['Mean'])
"""
"""
output = pd.DataFrame({'time': test_clean['time'], 'open_channels': prediction})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")
"""
samplesubmission = pd.read_csv('sample_submission.csv', dtype={'time': 'str'})
samplesubmission.info()
output = pd.DataFrame({'time': samplesubmission.time, 'open_channels': prediction})
output.to_csv('submission.csv', index=False)
sub = pd.read_csv('submission.csv')
...
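One fragile spot above: train_clean.Mean[a:b].values.fill(avg) writes through .values, which pandas only sometimes exposes as a writable view of the underlying array. A sketch of the same per-segment mean/std features computed with groupby/transform, assuming train_clean from the snippet and contiguous 500,000-row segments:

# Per-segment Mean/stdev via groupby (assumes train_clean from the snippet above,
# with one segment per 500,000 rows).
import numpy as np

seg_id = np.arange(len(train_clean)) // 500000
train_clean['Mean'] = train_clean.groupby(seg_id)['signal'].transform('mean')
# ddof=0 matches np.std used in the loop above (pandas .std() defaults to ddof=1)
train_clean['stdev'] = train_clean.groupby(seg_id)['signal'].transform(lambda s: s.std(ddof=0))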
baseline.py
Source:baseline.py
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

def basic_cleaning(train_feature_path, train_labels_path, test_feature_path, export=False, out_path=None):
    train_features = pd.read_csv(train_feature_path)
    train_labels = pd.read_csv(train_labels_path)
    train_features = pd.merge(train_features, train_labels, on=['city', 'year', 'weekofyear'])
    test_features = pd.read_csv(test_feature_path)
    # convert Kelvin to Celsius
    train_features[['reanalysis_air_temp_k', 'reanalysis_avg_temp_k', 'reanalysis_dew_point_temp_k', 'reanalysis_max_air_temp_k', 'reanalysis_min_air_temp_k', 'reanalysis_tdtr_k']] = train_features[['reanalysis_air_temp_k', 'reanalysis_avg_temp_k', 'reanalysis_dew_point_temp_k', 'reanalysis_max_air_temp_k', 'reanalysis_min_air_temp_k', 'reanalysis_tdtr_k']] - 273.15
    test_features[['reanalysis_air_temp_k', 'reanalysis_avg_temp_k', 'reanalysis_dew_point_temp_k', 'reanalysis_max_air_temp_k', 'reanalysis_min_air_temp_k', 'reanalysis_tdtr_k']] = test_features[['reanalysis_air_temp_k', 'reanalysis_avg_temp_k', 'reanalysis_dew_point_temp_k', 'reanalysis_max_air_temp_k', 'reanalysis_min_air_temp_k', 'reanalysis_tdtr_k']] - 273.15
    # drop rows where no temperature data is available
    train_clean = train_features.dropna(subset=['reanalysis_air_temp_k', 'reanalysis_avg_temp_k', 'station_avg_temp_c'], how='all')
    # imputation for submission
    test_clean = test_features.copy(deep=True)
    # imputation - temperature data
    train_clean['station_avg_temp_c'].fillna(train_clean['reanalysis_avg_temp_k'], inplace=True)
    train_clean['station_diur_temp_rng_c'].fillna(train_clean['reanalysis_tdtr_k'], inplace=True)
    train_clean['station_max_temp_c'].fillna(train_clean['reanalysis_max_air_temp_k'], inplace=True)
    train_clean['station_min_temp_c'].fillna(train_clean['reanalysis_min_air_temp_k'], inplace=True)
    test_clean['station_avg_temp_c'].fillna(test_clean['reanalysis_avg_temp_k'], inplace=True)
    test_clean['station_diur_temp_rng_c'].fillna(test_clean['reanalysis_tdtr_k'], inplace=True)
    test_clean['station_max_temp_c'].fillna(test_clean['reanalysis_max_air_temp_k'], inplace=True)
    test_clean['station_min_temp_c'].fillna(test_clean['reanalysis_min_air_temp_k'], inplace=True)
    # imputation - vegetation index
    for i in ['ndvi_ne', 'ndvi_sw', 'ndvi_nw', 'ndvi_se']:
        train_clean[i] = train_clean[i].interpolate()
        test_clean[i] = test_clean[i].interpolate()
    # imputation - precipitation level
    train_clean['station_precip_mm'].fillna(train_clean['reanalysis_sat_precip_amt_mm'], inplace=True)
    test_clean['station_precip_mm'].fillna(test_clean['reanalysis_sat_precip_amt_mm'], inplace=True)
    # drop duplicate columns
    train_clean.drop(['precipitation_amt_mm', 'reanalysis_sat_precip_amt_mm'], axis=1, inplace=True)
    test_clean.drop(['precipitation_amt_mm', 'reanalysis_sat_precip_amt_mm'], axis=1, inplace=True)
    # drop unused column
    train_clean.drop('week_start_date', axis=1, inplace=True)
    test_clean.drop('week_start_date', axis=1, inplace=True)
    # encode city as a binary variable
    train_clean['city'] = train_clean['city'].map({'sj': 1, 'iq': 0})
    test_clean['city'] = test_clean['city'].map({'sj': 1, 'iq': 0})
    for i in test_clean.columns:
        test_clean[i] = test_clean[i].interpolate()
    if export:
        train_clean.to_csv('train_' + out_path)
        test_clean.to_csv('test_' + out_path)
    return train_clean, test_clean

def export_submission(test_clean, out_path, model):
    test_clean['total_cases'] = model.predict(test_clean)
    test_clean['total_cases'] = test_clean['total_cases'].astype(int)
    submission = test_clean[['city', 'year', 'weekofyear', 'total_cases']]
    submission['city'] = submission['city'].map({1: 'sj', 0: 'iq'})
    submission.to_csv(out_path, index=False)
    return submission

cleaned = basic_cleaning('../data/dengue_features_train.csv', '../data/dengue_labels_train.csv', '../data/dengue_features_test.csv')
print(cleaned[0].head())
X = cleaned[0].drop('total_cases', axis=1)
Y = cleaned[0]['total_cases']
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=42)
dtr = DecisionTreeRegressor(random_state=420)
dtr.fit(X_train, y_train)
rfr = RandomForestRegressor(random_state=420)
rfr.fit(X_train, y_train)
print(mean_absolute_error(y_test, dtr.predict(X_test)))
...
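The script fits both a decision tree and a random forest but only prints the MAE of the tree. A small sketch that scores both fitted baselines on the same held-out split, assuming dtr, rfr, X_test, and y_test from the code above:

# Compare both fitted baselines on the held-out split
# (assumes dtr, rfr, X_test and y_test from the script above).
from sklearn.metrics import mean_absolute_error

for name, model in [("decision tree", dtr), ("random forest", rfr)]:
    mae = mean_absolute_error(y_test, model.predict(X_test))
    print(f"{name}: MAE = {mae:.2f}")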
calculate_bleu_for_bt.py
Source:calculate_bleu_for_bt.py
from nltk.translate.bleu_score import corpus_bleu
from experiments.utils import get_daily_dialog, get_mutual_friends, get_babi_dialog
import os
import argparse
from config import *

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset_clean", type=str, required=True)
    parser.add_argument("--dataset_back", type=str, required=True)
    args = parser.parse_args()
    if "dailydialog" in args.dataset_clean.lower():
        test_clean = get_daily_dialog(os.path.join(args.dataset_clean, "test.json"))
        test_translated = get_daily_dialog(os.path.join(args.dataset_back, "test.json"))
        list_of_hypothesis = []
        list_of_references = []
        assert len(test_clean) == len(test_translated)
        for ground_truth, translation in zip(test_clean, test_translated):
            assert len(ground_truth.dialog) == len(translation.dialog)
            for utterance_ground, utterance_translation in zip(ground_truth.dialog, translation.dialog):
                references = [utterance_ground[TEXT].split(" ")]
                list_of_references.append(references)
                hypothesis = utterance_translation[TEXT].split(" ")
                list_of_hypothesis.append(hypothesis)
        print(corpus_bleu(list_of_references, list_of_hypothesis))
    elif "mutualfriends" in args.dataset_clean.lower():
        test_clean = get_mutual_friends(os.path.join(args.dataset_clean, "test.json"))
        test_translated = get_mutual_friends(os.path.join(args.dataset_back, "test.json"))
        list_of_hypothesis = []
        list_of_references = []
        assert len(test_clean) == len(test_translated)
        for ground_truth, translation in zip(test_clean, test_translated):
            assert ground_truth.uuid == translation.uuid
            assert len(ground_truth.dialog) == len(translation.dialog)
            for utterance_ground, utterance_translation in zip(ground_truth.dialog, translation.dialog):
                if utterance_ground["action"] == "message":
                    references = [utterance_ground["data"].split(" ")]
                    list_of_references.append(references)
                    hypothesis = utterance_translation["data"].split(" ")
                    list_of_hypothesis.append(hypothesis)
                    print(utterance_ground, utterance_translation)
        print(corpus_bleu(list_of_references, list_of_hypothesis))
    elif "babi" in args.dataset_clean.lower():
        test_clean = get_babi_dialog(os.path.join(args.dataset_clean, "dialog-babi-task5-full-dialogs-tst.txt"))
        test_translated = get_babi_dialog(os.path.join(args.dataset_back, "dialog-babi-task5-full-dialogs-tst.txt"))
        list_of_hypothesis = []
        list_of_references = []
        assert len(test_clean) == len(test_translated)
        for ground_truth, translation in zip(test_clean, test_translated):
            assert len(ground_truth.dialog) == len(translation.dialog)
            for utterance_ground, utterance_translation in zip(ground_truth.dialog, translation.dialog):
                key = list(utterance_ground.keys())[0]
                if key in ["User", "Bot"]:
                    references = [utterance_ground[key].split(" ")]
                    list_of_references.append(references)
                    hypothesis = utterance_translation[key].split(" ")
                    list_of_hypothesis.append(hypothesis)
...
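For context, corpus_bleu expects one list of tokenized reference sentences per hypothesis, which is why each ground-truth utterance above is wrapped in an extra list. A toy sketch of those shapes (the smoothing function is an addition here to avoid zero n-gram counts on very short utterances; the original script does not use one):

# Toy example of the reference/hypothesis structure corpus_bleu expects.
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

list_of_references = [
    [["how", "are", "you", "doing", "today"]],   # one list of references per hypothesis
    [["see", "you", "later"]],
]
list_of_hypothesis = [
    ["how", "are", "you", "doing"],
    ["see", "you", "soon"],
]
print(corpus_bleu(list_of_references, list_of_hypothesis,
                  smoothing_function=SmoothingFunction().method1))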
