Source: project_code.py
import csv
import numpy as np
import timeit
import random
import itertools
import math
from random import randint
from dim_reduction import *
from knn import KNN
from mpp import MPP
from sklearn import svm
from sklearn.metrics import roc_curve
from sklearn import tree
from bpnn import Network
from kmeans import KMeans
from kohonen import KMap
from wta import WTA
import matplotlib as mpl
mpl.use('Qt4Agg')  # the backend must be selected before pyplot is imported
import matplotlib.pyplot as plt

def filter_retweets(data):
    # Keep only samples whose retweet flag (column 2) is 'False'.
    no_rt = []
    for sample in data:
        retweet = sample[2]
        if retweet == 'False':
            no_rt.append(sample)
    return no_rt

def extract_features(data):
    # Build a 9 x N matrix of simple punctuation/style counts per tweet.
    features = np.zeros((9,len(data)))
    for i in range(0,len(data)):
        tweet = data[i][3]
        upper = 0
        for word in tweet.split():
            if word.isupper():
                upper += 1
        features[0,i] = tweet.count('!')
        features[1,i] = tweet.lower().count('pic.twitter.com')
        features[2,i] = tweet.count('@')
        features[3,i] = upper
        features[4,i] = tweet.lower().count('http')
        features[5,i] = tweet.count('#')
        features[6,i] = tweet.count('"')
        features[7,i] = tweet.count(',')
        features[8,i] = tweet.count('.')
#        features[7,i] = tweet.lower().count('trump') + tweet.lower().count('donald')
#        features[8,i] = tweet.lower().count('maga') + tweet.lower().count('make america great again') + tweet.lower().count('makeamericagreatagain') + tweet.lower().count('make #americagreatagain') + tweet.lower().count('make america') + tweet.lower().count('great again')
#        features[8,i] = tweet.lower().count('loser')
    return features

def nb_fusion(conf_mat, labels, true_labels):
    # Naive-Bayes decision fusion: build a lookup table of class scores for
    # every possible combination of classifier outputs, then fuse per sample.
#    print(conf_mat.shape)
#    print(labels.shape)
#    print(labels)
    num_classifiers = conf_mat.shape[0]
    comb = []
    for i in range(0,num_classifiers):
        comb.append(list(range(2)))
    comb = list(itertools.product(*comb))
    num_comb = len(comb)
    table = np.zeros((2,len(comb)))
    num_samples = labels.shape[1]
    num1 = np.count_nonzero(true_labels)
    num0 = num_samples - num1
#    print('num0:', num0)
#    print('num1:', num1)
    for i in range(0,num_comb):
        prob0 = (1/math.pow(num0,num_classifiers-1))
        prob1 = (1/math.pow(num1,num_classifiers-1))
        prod = np.ones((2,1))
        for j in range(0,num_classifiers):
            col = comb[i][j]
            prod = np.multiply(prod, conf_mat[j,:,col].reshape((2,1)))
        prod[0] = prod[0] * prob0
        prod[1] = prod[1] * prob1
        table[:,i] = prod[:,0]
    fused = np.zeros((num_samples,1))
    for i in range(0,num_samples):
        combination = []
        for j in range(0,num_classifiers):
            combination.append(labels[j][i])
        combination = tuple(combination)
        entry = table[:,comb.index(combination)]
        if entry[0] > entry[1]:
            fused[i] = 0
        else:
            fused[i] = 1
    return table,comb,fused

def majority_vote(predictions):
    # Fuse binary predictions from several classifiers by simple majority;
    # ties go to class 0.
    num_classifiers = predictions.shape[0]
    num_samples = predictions.shape[1]
    fused = np.zeros(num_samples)
    for i in range(0, num_samples):
        yes = 0
        no = 0
        for j in range(0, num_classifiers):
            if predictions[j,i] == 0.0:
                no += 1
            else:
                yes += 1
        if yes > no:
            fused[i] = 1.0
        else:
            fused[i] = 0.0
    return fused
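# A minimal sanity check (not part of the original script) for majority_vote:
# three classifiers vote on four samples; with three voters there are no ties,
# and a tie would fall to class 0.
def _demo_majority_vote():
    votes = np.array([[1, 0, 1, 0],
                      [1, 1, 0, 0],
                      [0, 1, 1, 0]])
    assert list(majority_vote(votes)) == [1.0, 1.0, 1.0, 0.0]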
def standardize(data, mean, sigma):
    for i in range(0, data.shape[1]):
        x = data[:,i].reshape(mean.shape)
        data[:,i] = ((x-mean)/sigma).reshape(x.shape[0])

def perf_eval(predict, true):
    num_samples = predict.shape[0]
    fp = 0
    fn = 0
    tp = 0
    tn = 0
    for i in range(0, num_samples):
        if predict[i] == 0:
            if predict[i] == true[i]:
                tn += 1
            else:
                fn += 1
        else:
            if predict[i] == true[i]:
                tp += 1
            else:
                fp += 1
    return (tp,tn,fn,fp)

def confusion_matrix(predict, true):
    tp,tn,fn,fp = perf_eval(predict, true)
    conf_mat = np.zeros((2,2))
    conf_mat[0,0] = tp
    conf_mat[0,1] = fp
    conf_mat[1,0] = fn
    conf_mat[1,1] = tn
    return conf_mat

def m_fold_cross_validation(tweets, person, m):
    print(len(tweets[0]))
    print(len(tweets[1]))
    print(len(tweets[2]))
    print(len(tweets[3]))
    print(len(tweets[4]))
    print(len(tweets[5]))
    all_tweets = []
    all_tweets.extend(tweets[0])
    all_tweets.extend(tweets[1])
    all_tweets.extend(tweets[2])
    all_tweets.extend(tweets[3])
    all_tweets.extend(tweets[4])
    all_tweets.extend(tweets[5])
    y = [0]*len(all_tweets)
    start = 0
    end = 0
    for i in range(0,person):
        start += len(tweets[i])
    end = start + len(tweets[person])
    print(start)
    print(end)
    for i in range(start, end):
        y[i] = 1.0
    z = list(zip(all_tweets, y))
    random.shuffle(z)
    all_tweets, all_labels = zip(*z)
    num_per_set = int(len(all_tweets)/m)
    all_tweets = all_tweets[0:num_per_set*m]
    all_labels = all_labels[0:num_per_set*m]
    sets = []
    for i in range(0,m):
        start = i*num_per_set
        end = (i+1)*num_per_set
        train_tweets = all_tweets[0:start] + all_tweets[end:]
        train_labels = all_labels[0:start] + all_labels[end:]
        test_tweets = all_tweets[start:end]
        test_labels = all_labels[start:end]
        train = (train_tweets, train_labels)
        test = (test_tweets, test_labels)
        sets.append((train, test))
    return sets
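# A minimal sanity check (not part of the original script) for perf_eval,
# which returns counts in the order (tp, tn, fn, fp).
def _demo_perf_eval():
    predict = np.array([1, 0, 1, 0, 1])
    true = np.array([1, 0, 0, 1, 1])
    assert perf_eval(predict, true) == (2, 1, 1, 1)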
def create_dataset(tweets, person, num_train_tweets, train_percentages, num_test_tweets, test_percentages):
    for person_tweets in tweets:
        random.shuffle(person_tweets)
    train_data = []
    test_data = []
    # For each of the six accounts, take the first chunk of shuffled tweets
    # for training and the following chunk for testing.
    for p in range(len(tweets)):
        num_train = int(train_percentages[p]*num_train_tweets)
        num_test = int(test_percentages[p]*num_test_tweets)
        train_data.extend(tweets[p][0:num_train])
        test_data.extend(tweets[p][num_train:num_train+num_test])

    train_labels = np.zeros(len(train_data))
    start = int(np.sum(train_percentages[0:person])*num_train_tweets)
    end = int(np.sum(train_percentages[0:person+1])*num_train_tweets)
    for i in range(start, end):
        train_labels[i] = 1

    test_labels = np.zeros(len(test_data))
    start = int(np.sum(test_percentages[0:person])*num_test_tweets)
    end = int(np.sum(test_percentages[0:person+1])*num_test_tweets)
    for i in range(start, end):
        test_labels[i] = 1
    return [(train_data, train_labels), (test_data, test_labels)]

def plot_roc(f_rate, t_rate, label_str):
    plt.plot(f_rate, t_rate, label=label_str)
    plt.plot([0,1],[0,1], linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
#    plt.legend()
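# Sketch of how plot_roc is used further down (scores are positive-class
# probabilities for the corresponding test labels):
#     fper, tper, thresh = roc_curve(labels, scores, pos_label=1)
#     plt.figure()
#     plot_roc(fper, tper, 'classifier name')
#     plt.legend()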
def main():
    dt_tweets = []
    hc_tweets = []
    kk_tweets = []
    ndgt_tweets = []
    rd_tweets = []
    sk_tweets = []
    with open('DonaldTrumpDataSet.csv', 'r', encoding='utf8', errors='ignore') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        for row in reader:
            dt_tweets.append(row)
    with open('HillaryClintonDataSet.csv', 'r', encoding='utf8', errors='ignore') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        for row in reader:
            hc_tweets.append(row)
    with open('KimKardashianDataSet.csv', 'r', encoding='utf8', errors='ignore') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        for row in reader:
            kk_tweets.append(row)
    with open('NeildeGrasseTysonDataSet.csv', 'r', encoding='utf8', errors='ignore') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        for row in reader:
            ndgt_tweets.append(row)
    with open('RichardDawkinsDataSet.csv', 'r', encoding='utf8', errors='ignore') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        for row in reader:
            rd_tweets.append(row)
    with open('ScottKellyDataSet.csv', 'r', encoding='utf8', errors='ignore') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        for row in reader:
            sk_tweets.append(row)
    # Drop the header row from each file.
    dt_tweets.pop(0)
    hc_tweets.pop(0)
    kk_tweets.pop(0)
    ndgt_tweets.pop(0)
    rd_tweets.pop(0)
    sk_tweets.pop(0)
#    print(len(dt_tweets))
#    print(len(hc_tweets))
#    print(len(kk_tweets))
#    print(len(ndgt_tweets))
#    print(len(rd_tweets))
#    print(len(sk_tweets))
#    print(len(dt_tweets) + len(hc_tweets) + len(kk_tweets) + len(ndgt_tweets) + len(rd_tweets) + len(sk_tweets))
    tweets = [dt_tweets, hc_tweets, kk_tweets, ndgt_tweets, rd_tweets, sk_tweets]
    dt_nort_tweets   = filter_retweets(dt_tweets)
    hc_nort_tweets   = filter_retweets(hc_tweets)
    kk_nort_tweets   = filter_retweets(kk_tweets)
    ndgt_nort_tweets = filter_retweets(ndgt_tweets)
    rd_nort_tweets   = filter_retweets(rd_tweets)
    sk_nort_tweets   = filter_retweets(sk_tweets)
#    print(len(dt_nort_tweets) + len(hc_nort_tweets) + len(kk_nort_tweets) + len(ndgt_nort_tweets) + len(rd_nort_tweets) + len(sk_nort_tweets))
    nort_tweets = [dt_nort_tweets, hc_nort_tweets, kk_nort_tweets, ndgt_nort_tweets, rd_nort_tweets, sk_nort_tweets]
#    percentages = [0.43, 0.08, 0.26, 0.06, 0.14, 0.03]
    percentages = [0.17, 0.17, 0.17, 0.17, 0.16, 0.16]
    datasets = create_dataset(tweets, 0, 7000, percentages, 500, percentages)
    nort_datasets = create_dataset(nort_tweets, 0, 7000, percentages, 500, percentages)
    train_set = datasets[0][0]
    train_labels = datasets[0][1]
    test_set = datasets[1][0]
    test_labels = datasets[1][1]
    # The no-retweet split comes from nort_datasets.
    nort_train_set = nort_datasets[0][0]
    nort_train_labels = nort_datasets[0][1]
    nort_test_set = nort_datasets[1][0]
    nort_test_labels = nort_datasets[1][1]
    data = train_set
    true_labels = train_labels
    test_data = test_set
    nort_data = nort_train_set
    nort_true_labels = nort_train_labels
    nort_test_data = nort_test_set

    features = extract_features(data)
    nort_features = extract_features(nort_data)
    test_features = extract_features(test_data)
    # Features of the no-retweet test set; a separate array so each test set
    # is standardized exactly once, with its own statistics.
    test_features2 = extract_features(nort_test_data)
    mean = np.mean(features, axis=1).reshape((features.shape[0],1))
    sigma = np.std(features, axis=1).reshape((features.shape[0],1))
    mean2 = np.mean(nort_features, axis=1).reshape((nort_features.shape[0],1))
    sigma2 = np.std(nort_features, axis=1).reshape((nort_features.shape[0],1))
    standardize(features, mean, sigma)
    standardize(nort_features, mean2, sigma2)
    standardize(test_features, mean, sigma)
    standardize(test_features2, mean2, sigma2)
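    # Test features are standardized with the training mean and sigma so both
    # splits share one scaling; recomputing statistics on the test set would
    # leak information from the test data into the features.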
#    fld = FLD()
#    fld.setup(features, true_labels)
#    features = fld.reduce(features)
#    test_features = fld.reduce(test_features)
#
#    fld2 = FLD()
#    fld2.setup(nort_features, nort_train_labels)
#    nort_features = fld.reduce(nort_features)
#    test_features2 = fld.reduce(test_features2)
#    pca = PCA()
#    pca.setup(features, 0.8)
#    features = pca.reduce(features)
#    test_features = pca.reduce(test_features)
#    print(pca.eigenvalues)
#
#    pca2 = PCA()
#    pca2.setup(nort_features, 0.8)
#    nort_features = pca.reduce(nort_features)
#    test_features2 = pca.reduce(test_features2)
#    print(pca2.eigenvalues)
#    print("Decision Tree")
#    clf = tree.DecisionTreeClassifier()
#    clf.probability = True
#    clf.fit(features.T, true_labels)
#    ymodel = clf.predict(test_features.T)
#    prob = clf.predict_proba(test_features.T)
#    fper, tper, thresh = roc_curve(test_labels, prob[:,1], pos_label=1)
#    plt.figure()
#    plot_roc(fper, tper)
#    tp,tn,fn,fp = perf_eval(ymodel, test_labels)
#    print('Accuracy:     ', (tp+tn)/(tp+tn+fp+fn))
#    print('TP:',tp)
#    print('TN:',tn)
#    print('FP:',fp)
#    print('FN:',fn)
#    print("SVM linear")
#    clf = svm.SVC(kernel='linear', gamma='auto')
#    clf.probability = True
#    clf.fit(features.T, true_labels)
#    ymodel = clf.predict(test_features.T)
#    prob = clf.predict_proba(test_features.T)
#    fper, tper, thresh = roc_curve(test_labels, prob[:,1], pos_label=1)
#    plt.figure()
#    plot_roc(fper, tper)
#    tp,tn,fn,fp = perf_eval(ymodel, test_labels)
#    print('Accuracy:     ', (tp+tn)/(tp+tn+fp+fn))
#    print('TP:',tp)
#    print('TN:',tn)
#    print('FP:',fp)
#    print('FN:',fn)
#    print("SVM poly")
#    clf = svm.SVC(kernel='poly', gamma='auto')
#    clf.probability = True
#    clf.fit(features.T, true_labels)
#    ymodel = clf.predict(test_features.T)
#    prob = clf.predict_proba(test_features.T)
#    fper, tper, thresh = roc_curve(test_labels, prob[:,1], pos_label=1)
#    plt.figure()
#    plot_roc(fper, tper)
#    tp,tn,fn,fp = perf_eval(ymodel, test_labels)
#    print('Accuracy:     ', (tp+tn)/(tp+tn+fp+fn))
#    print('TP:',tp)
#    print('TN:',tn)
#    print('FP:',fp)
#    print('FN:',fn)
#
#    print("SVM rbf")
#    clf = svm.SVC(kernel='rbf', gamma='auto')
#    clf.probability = True
#    clf.fit(features.T, true_labels)
#    ymodel = clf.predict(test_features.T)
#    prob = clf.predict_proba(test_features.T)
#    fper, tper, thresh = roc_curve(test_labels, prob[:,1], pos_label=1)
#    plt.figure()
#    plot_roc(fper, tper)
#    tp,tn,fn,fp = perf_eval(ymodel, test_labels)
#    print('Accuracy:     ', (tp+tn)/(tp+tn+fp+fn))
#    print('TP:',tp)
#    print('TN:',tn)
#    print('FP:',fp)
#    print('FN:',fn)
#
#    print("SVM sigmoid")
#    clf = svm.SVC(kernel='sigmoid', gamma='auto')
#    clf.probability = True
#    clf.fit(features.T, true_labels)
#    ymodel = clf.predict(test_features.T)
#    prob = clf.predict_proba(test_features.T)
#    fper, tper, thresh = roc_curve(test_labels, prob[:,1], pos_label=1)
#    plt.figure()
#    plot_roc(fper, tper)
#    tp,tn,fn,fp = perf_eval(ymodel, test_labels)
#    print('Accuracy:     ', (tp+tn)/(tp+tn+fp+fn))
#    print('TP:',tp)
#    print('TN:',tn)
#    print('FP:',fp)
#    print('FN:',fn)
#    k = 3
#    print("KNN: k =",k)
#    print('2 norm')
#    knn_model = KNN(k)
#    knn_model.fit(features, true_labels)
#    ymodel = knn_model.predict(test_features, norm=2)
#    prob = knn_model.predict_prob(test_features)
#    print(prob)
#    fper, tper, thresh = roc_curve(test_labels, prob[:,1], pos_label=1)
#    plt.figure()
#    plot_roc(fper, tper)
#    tp,tn,fn,fp = perf_eval(ymodel, test_labels)
#    print('Accuracy:     ', (tp+tn)/(tp+tn+fp+fn))
#    print('TP:',tp)
#    print('TN:',tn)
#    print('FP:',fp)
#    print('FN:',fn)
#    knn_model2 = KNN(k)
#    knn_model2.fit(nort_features, nort_train_labels)
#    ymodel = knn_model2.predict(test_features2, norm=2)
#    prob = knn_model2.predict_prob(test_features)
#    print(prob)
#    fper, tper, thresh = roc_curve(test_labels, prob[:,1], pos_label=1)
#    plt.figure()
#    plot_roc(fper, tper)
#    tp,tn,fn,fp = perf_eval(ymodel, test_labels)
#    print('Accuracy:     ', (tp+tn)/(tp+tn+fp+fn))
#    print('TP:',tp)
#    print('TN:',tn)
#    print('FP:',fp)
#    print('FN:',fn)
#    print('inf norm')
#    knn_model = KNN(k)
#    knn_model.fit(features, true_labels)
#    ymodel = knn_model.predict(test_features, norm='inf')
#    prob = knn_model.predict_prob(test_features)
#    print(prob)
#    fper, tper, thresh = roc_curve(test_labels, prob[:,1], pos_label=1)
#    plt.figure()
#    plot_roc(fper, tper)
#    tp,tn,fn,fp = perf_eval(ymodel, test_labels)
#    print('Accuracy:     ', (tp+tn)/(tp+tn+fp+fn))
#    print('TP:',tp)
#    print('TN:',tn)
#    print('FP:',fp)
#    print('FN:',fn)
#
#    knn_model2 = KNN(k)
#    knn_model2.fit(nort_features, nort_train_labels)
#    ymodel = knn_model2.predict(test_features2, norm='inf')
#    prob = knn_model2.predict_prob(test_features)
#    print(prob)
#    fper, tper, thresh = roc_curve(test_labels, prob[:,1], pos_label=1)
#    plt.figure()
#    plot_roc(fper, tper)
#    tp,tn,fn,fp = perf_eval(ymodel, test_labels)
#    print('Accuracy:     ', (tp+tn)/(tp+tn+fp+fn))
#    print('TP:',tp)
#    print('TN:',tn)
#    print('FP:',fp)
#    print('FN:',fn)
#
#    print('1 norm')
#    knn_model = KNN(k)
#    knn_model.fit(features, true_labels)
#    ymodel = knn_model.predict(test_features, norm=1)
#    prob = knn_model.predict_prob(test_features)
#    print(prob)
#    fper, tper, thresh = roc_curve(test_labels, prob[:,1], pos_label=1)
#    plt.figure()
#    plot_roc(fper, tper)
#    tp,tn,fn,fp = perf_eval(ymodel, test_labels)
#    print('Accuracy:     ', (tp+tn)/(tp+tn+fp+fn))
#    print('TP:',tp)
#    print('TN:',tn)
#    print('FP:',fp)
#    print('FN:',fn)
#
#    knn_model2 = KNN(k)
#    knn_model2.fit(nort_features, nort_train_labels)
#    ymodel = knn_model2.predict(test_features2, norm=1)
#    prob = knn_model.predict_prob(test_features)
#    print(prob)
#    fper, tper, thresh = roc_curve(test_labels, prob[:,1], pos_label=1)
#    plt.figure()
#    plot_roc(fper, tper)
#    tp,tn,fn,fp = perf_eval(ymodel, test_labels)
#    print('Accuracy:     ', (tp+tn)/(tp+tn+fp+fn))
#    print('TP:',tp)
#    print('TN:',tn)
#    print('FP:',fp)
#    print('FN:',fn)
#    true = np.count_nonzero(true_labels)/true_labels.shape[0]
#    false = 1-true
#    print("MPP case 1")
#    mpp = MPP(1)
#    mpp.set_prior(false, true)
#    mpp.fit(features, true_labels)
#    mpp_pred1 = mpp.predict(test_features)
#    prob = mpp.predict_prob(test_features)
#    fper, tper, thresh = roc_curve(test_labels, prob[:,1], pos_label=1)
#    plt.figure()
#    plot_roc(fper, tper)
#    tp,tn,fn,fp = perf_eval(mpp_pred1, test_labels)
#    print('Accuracy:     ', (tp+tn)/(tp+tn+fp+fn))
#    print('TP:',tp)
#    print('TN:',tn)
#    print('FP:',fp)
#    print('FN:',fn)
#
#    print("MPP case 2")
#    mpp = MPP(2)
#    mpp.set_prior(false, true)
#    mpp.fit(features, true_labels)
#    mpp_pred2 = mpp.predict(test_features)
#    prob = mpp.predict_prob(test_features)
#    fper, tper, thresh = roc_curve(test_labels, prob[:,1], pos_label=1)
#    plt.figure()
#    plot_roc(fper, tper)
#    tp,tn,fn,fp = perf_eval(mpp_pred2, test_labels)
#    print('Accuracy:     ', (tp+tn)/(tp+tn+fp+fn))
#    print('TP:',tp)
#    print('TN:',tn)
#    print('FP:',fp)
#    print('FN:',fn)
#
#    print("MPP case 3")
#    mpp = MPP(3)
#    mpp.set_prior(false, true)
#    mpp.fit(features, true_labels)
#    mpp_pred3 = mpp.predict(test_features)
#    prob = mpp.predict_prob(test_features)
#    fper, tper, thresh = roc_curve(test_labels, prob[:,1], pos_label=1)
#    plt.figure()
#    plot_roc(fper, tper)
#    tp,tn,fn,fp = perf_eval(mpp_pred3, test_labels)
#    print('Accuracy:     ', (tp+tn)/(tp+tn+fp+fn))
#    print('TP:',tp)
#    print('TN:',tn)
#    print('FP:',fp)
#    print('FN:',fn)
#
#    print("Fused MPP")
#    mpp_predictions = np.zeros((3,mpp_pred1.shape[0]))
#    mpp_predictions[0,:] = mpp_pred1.T
#    mpp_predictions[1,:] = mpp_pred2.T
#    mpp_predictions[2,:] = mpp_pred3.T
#    mpp_fused = majority_vote(mpp_predictions)
#    tp,tn,fn,fp = perf_eval(mpp_fused, test_labels)
#    print('Accuracy:     ', (tp+tn)/(tp+tn+fp+fn))
#    print('TP:',tp)
#    print('TN:',tn)
#    print('FP:',fp)
#    print('FN:',fn)
#    print("BPNN")
#    num_features = 7
#    net = Network([features.shape[0], 10, 2])
#    net.SGD(features, true_labels, 1000, 1, 0.05, test_features, test_labels)
#    prob = net.SGD_prob(features, true_labels, 100, 1, 0.10, test_features, test_labels)
#    fper, tper, thresh = roc_curve(test_labels, prob[:,1], pos_label=1)
#    plt.figure()
#    plot_roc(fper, tper)
#    plt.show()
#    kmeans = KMeans(2)
#    kmeans.predict(train_features, train_labels)
#    kmeans.predict(test_features, test_labels)
#    kmap.predict(test_features, test_labels, e=0.0000001, iters=1000)
#    wta = WTA(2)
#    wta.predict(test_features, test_labels, e=0.01)
#    kmap = KMap(2)
#    kmap.predict(test_features, test_labels, e=0.001, iters=100)
#    kmap.predict(test_features, test_labels, e=0.0000001, iters=1000)
#    m = 5
#    sets = m_fold_cross_validation(tweets, 0, m)
#    print(len(sets))
#    conf_mats = np.zeros((m,2,2))
#    for i in range(0,m):
#        train,test = sets[i]
#        train_tweets,train_labels = train
#        test_tweets,test_labels = test
#        train_features = extract_features(train_tweets)
#        test_features = extract_features(test_tweets)
#        mean = np.mean(train_features, axis=1).reshape((train_features.shape[0],1))
#        sigma = np.std(train_features, axis=1).reshape((train_features.shape[0],1))
#        standardize(train_features, mean, sigma)
#        standardize(test_features, mean, sigma)
#        print("BPNN")
#        net = Network([train_features.shape[0], 10, 2])
#        conf_mats[i,:,:] = net.SGD(train_features, train_labels, 1000, 1, 0.05, test_features, test_labels)
#    kmap.predict(test_features, test_labels, e=0.0000001, iters=1000)
    m = 10
    sets = m_fold_cross_validation(tweets, 0, m)
    print(len(sets))
    num_test = len(sets[0][0][1])
    print(num_test)
    for i in range(0,1):
        print('Set', i)
        train,test = sets[i]
        train_tweets,train_labels = train
        test_tweets,test_labels = test
        percentages = [0.43, 0.08, 0.26, 0.06, 0.14, 0.03]
        datasets = create_dataset(tweets, 0, 10000, percentages, 1000, percentages)
        train_tweets = datasets[0][0]
        train_labels = datasets[0][1]
        test_tweets = datasets[1][0]
        test_labels = datasets[1][1]

        train_features = extract_features(train_tweets)
        test_features = extract_features(test_tweets)
        mean = np.mean(train_features, axis=1).reshape((train_features.shape[0],1))
        sigma = np.std(train_features, axis=1).reshape((train_features.shape[0],1))
        standardize(train_features, mean, sigma)
        standardize(test_features, mean, sigma)
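        # Note: the create_dataset() call above replaces the fold taken from
        # m_fold_cross_validation(), so only the freshly drawn train/test
        # split is actually used from this point on.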
#        print(len(test_labels))
#        fld = FLD()
#        fld.setup(train_features, train_labels)
#        train_features = fld.reduce(train_features)
#        test_features = fld.reduce(test_features)

#        pca = PCA()
#        pca.setup(train_features, 0.8)
#        train_features = pca.reduce(train_features)
#        test_features = pca.reduce(test_features)
#        print(pca.eigenvalues)

#        print(len(test_labels))
#        num_test = len(test_labels)
#        conf_mats = np.zeros((3,2,2))
#        all_labels = np.zeros((3,num_test))
#
#        print("Decision Tree")
#        clf = tree.DecisionTreeClassifier()
#        clf.probability = True
#        clf.fit(train_features.T, train_labels)
#        ymodel = clf.predict(test_features.T)
#        prob = clf.predict_proba(test_features.T)
#        print(prob[0:10])
#        fper, tper, thresh = roc_curve(test_labels, prob[:,1], pos_label=1)
##        plt.figure()
##        plot_roc(fper, tper)
#        tp,tn,fn,fp = perf_eval(ymodel, test_labels)
#        print('Accuracy:     ', (tp+tn)/(tp+tn+fp+fn))
#        print('TP:',tp)
#        print('TN:',tn)
#        print('FP:',fp)
#        print('FN:',fn)
#        conf_mats[0,:,:] = confusion_matrix(ymodel, test_labels).T
#        all_labels[0,:] = ymodel
#        fld = FLD()
#        fld.setup(train_features, train_labels)
#        fld_train_features = fld.reduce(train_features)
#        fld_test_features = fld.reduce(test_features)
#
#        clf.fit(fld_train_features.T, train_labels)
#        ymodel = clf.predict(fld_test_features.T)
#        prob = clf.predict_proba(fld_test_features.T)
#        fper, tper, thresh = roc_curve(test_labels, prob[:,1], pos_label=1)
#        plot_roc(fper, tper, 'FLD')
#        tp,tn,fn,fp = perf_eval(ymodel, test_labels)
#        print('Accuracy:     ', (tp+tn)/(tp+tn+fp+fn))
#        print('TP:',tp)
#        print('TN:',tn)
#        print('FP:',fp)
#        print('FN:',fn)
#        kohonen_sens[0] = tp/(tp+fn)
#        kohonen_spec[0] = tn/(tn+fp)
#        kohonen_pred = kohonen.predict(test_features, test_labels,e=0.01, iters=100, norm=1)
#        kohonen_pred = np.array(kohonen_pred)
#        tp,tn,fn,fp = perf_eval(kohonen_pred, test_labels)
#        print('KMap: INF')
#        print('Accuracy:     ', (tp+tn)/(tp+tn+fp+fn))
#        print('TP:',tp)
#        print('TN:',tn)
#        print('FP:',fp)
#        print('FN:',fn)
#        kohonen_sens[1] = tp/(tp+fn)
#        kohonen_spec[1] = tn/(tn+fp)
#        kohonen_pred = kohonen.predict(test_features, test_labels,e=0.01, iters=100, norm=2)
#        kohonen_pred = np.array(kohonen_pred)
#        tp,tn,fn,fp = perf_eval(kohonen_pred, test_labels)
#        print('KMap: INF')
#
#        pca = PCA()
#        tol = 0.75
#        pca.setup(train_features, tol)
#        pca_train_features = pca.reduce(train_features)
#        pca_test_features = pca.reduce(test_features)
#
#        clf.fit(pca_train_features.T, train_labels)
#        ymodel = clf.predict(pca_test_features.T)
#        prob = clf.predict_proba(pca_test_features.T)
#        fper, tper, thresh = roc_curve(test_labels, prob[:,1], pos_label=1)
#        plot_roc(fper, tper, 'PCA: tol='+str(tol))
#        tp,tn,fn,fp = perf_eval(ymodel, test_labels)
#        print('Accuracy:     ', (tp+tn)/(tp+tn+fp+fn))
#        print('TP:',tp)
#        print('TN:',tn)
#        print('FP:',fp)
#        print('FN:',fn)
#        kohonen_sens[2] = tp/(tp+fn)
#        kohonen_spec[2] = tn/(tn+fp)
#        knn_sens = np.zeros((3,1))
#        knn_spec = np.zeros((3,1))
#        k = 5
#        print("KNN: k =",k)
#        print('2 norm')
#        knn_model = KNN(k)
#        knn_model.fit(train_features, train_labels)
#        ymodel = knn_model.predict(test_features, norm='inf')
#        tp,tn,fn,fp = perf_eval(ymodel, test_labels)
#        print('Accuracy:     ', (tp+tn)/(tp+tn+fp+fn))
#        print('TP:',tp)
#        print('TN:',tn)
#        print('FP:',fp)
#        print('FN:',fn)
#        knn_sens[0] = tp/(tp+fn)
#        knn_spec[0] = tn/(tn+fp)
#        ymodel = knn_model.predict(test_features, norm=1)
#        tp,tn,fn,fp = perf_eval(ymodel, test_labels)
#        print('Accuracy:     ', (tp+tn)/(tp+tn+fp+fn))
#        print('TP:',tp)
#        print('TN:',tn)
#        print('FP:',fp)
#        print('FN:',fn)
#        knn_sens[1] = tp/(tp+fn)
#        knn_spec[1] = tn/(tn+fp)
#        ymodel = knn_model.predict(test_features, norm=2)
#        tp,tn,fn,fp = perf_eval(ymodel, test_labels)
#        print('Accuracy:     ', (tp+tn)/(tp+tn+fp+fn))
#        print('TP:',tp)
#        print('TN:',tn)
#        print('FP:',fp)
#        print('FN:',fn)
#        knn_sens[2] = tp/(tp+fn)
#        knn_spec[2] = tn/(tn+fp)
#        conf_mats[0,:,:] = confusion_matrix(ymodel, test_labels).T
#        all_labels[0,:] = ymodel
## Minkowski distance
#        kmeans_sens = np.zeros((3,1))
#        kmeans_spec = np.zeros((3,1))
#        kmeans = KMeans(2)
#        kpred = kmeans.predict(test_features, test_labels, norm=np.inf)
#        kpred = np.array(kpred)
#        tp,tn,fn,fp = perf_eval(kpred, test_labels)
#        print('KMeans: INF')
#        print('Accuracy:     ', (tp+tn)/(tp+tn+fp+fn))
#        print('TP:',tp)
#        print('TN:',tn)
#        print('FP:',fp)
#        print('FN:',fn)
#        kmeans_sens[0] = tp/(tp+fn)
#        kmeans_spec[0] = tn/(tn+fp)
#        kpred = kmeans.predict(test_features, test_labels,1)
#        kpred = np.array(kpred)
#        tp,tn,fn,fp = perf_eval(kpred, test_labels)
#        print('KMeans: 1')
#        print('Accuracy:     ', (tp+tn)/(tp+tn+fp+fn))
#        print('TP:',tp)
#        print('TN:',tn)
#        print('FP:',fp)
#        print('FN:',fn)
#        kmeans_sens[1] = tp/(tp+fn)
#        kmeans_spec[1] = tn/(tn+fp)
#        kpred = kmeans.predict(test_features, test_labels,2)
#        kpred = np.array(kpred)
#        tp,tn,fn,fp = perf_eval(kpred, test_labels)
#        print('KMeans: 2')
#        print('Accuracy:     ', (tp+tn)/(tp+tn+fp+fn))
#        print('TP:',tp)
#        print('TN:',tn)
#        print('FP:',fp)
#        print('FN:',fn)
#        kmeans_sens[2] = tp/(tp+fn)
#        kmeans_spec[2] = tn/(tn+fp)
#
#        wta_sens = np.zeros((3,1))
#        wta_spec = np.zeros((3,1))
#        wta = WTA(2)
#        wta_pred = wta.predict(test_features, test_labels,e=0.01, norm=np.inf)
#        wta_pred = np.array(wta_pred)
#        tp,tn,fn,fp = perf_eval(wta_pred, test_labels)
#        print('WTA: INF')
#        print('Accuracy:     ', (tp+tn)/(tp+tn+fp+fn))
#        print('TP:',tp)
#        print('TN:',tn)
#        print('FP:',fp)
#        print('FN:',fn)
#        wta_sens[0] = tp/(tp+fn)
#        wta_spec[0] = tn/(tn+fp)
#        wta_pred = wta.predict(test_features, test_labels,e=0.01, norm=1)
#        wta_pred = np.array(wta_pred)
#        tp,tn,fn,fp = perf_eval(wta_pred, test_labels)
#        print('WTA: 1')
#        print('Accuracy:     ', (tp+tn)/(tp+tn+fp+fn))
#        print('TP:',tp)
#        print('TN:',tn)
#        print('FP:',fp)
#        print('FN:',fn)
#        wta_sens[1] = tp/(tp+fn)
#        wta_spec[1] = tn/(tn+fp)
#        wta_pred = wta.predict(test_features, test_labels,e=0.01, norm=2)
#        wta_pred = np.array(wta_pred)
#        tp,tn,fn,fp = perf_eval(wta_pred, test_labels)
#        print('WTA: 2')
#        print('Accuracy:     ', (tp+tn)/(tp+tn+fp+fn))
#        print('TP:',tp)
#        print('TN:',tn)
#        print('FP:',fp)
#        print('FN:',fn)
#        wta_sens[2] = tp/(tp+fn)
#        wta_spec[2] = tn/(tn+fp)
#
#        kohonen_sens = np.zeros((3,1))
#        kohonen_spec = np.zeros((3,1))
#        kohonen = KMap(2)
#        kohonen_pred = kohonen.predict(test_features, test_labels,e=0.01, iters=100, norm=np.inf)
#        kohonen_pred = np.array(kohonen_pred)
#        tp,tn,fn,fp = perf_eval(kohonen_pred, test_labels)
#        print('KMap: INF')
#        print('Accuracy:     ', (tp+tn)/(tp+tn+fp+fn))
#        print('TP:',tp)
#        print('TN:',tn)
#        print('FP:',fp)
#        print('FN:',fn)
#        kohonen_sens[0] = tp/(tp+fn)
#        kohonen_spec[0] = tn/(tn+fp)
#        kohonen_pred = kohonen.predict(test_features, test_labels,e=0.01, iters=100, norm=1)
#        kohonen_pred = np.array(kohonen_pred)
#        tp,tn,fn,fp = perf_eval(kohonen_pred, test_labels)
#        print('KMap: 1')
#        print('Accuracy:     ', (tp+tn)/(tp+tn+fp+fn))
#        print('TP:',tp)
#        print('TN:',tn)
#        print('FP:',fp)
#        print('FN:',fn)
#        kohonen_sens[1] = tp/(tp+fn)
#        kohonen_spec[1] = tn/(tn+fp)
#        kohonen_pred = kohonen.predict(test_features, test_labels,e=0.01, iters=100, norm=2)
#        kohonen_pred = np.array(kohonen_pred)
#        tp,tn,fn,fp = perf_eval(kohonen_pred, test_labels)
#        print('KMap: 2')
#        print('Accuracy:     ', (tp+tn)/(tp+tn+fp+fn))
#        print('TP:',tp)
#        print('TN:',tn)
#        print('FP:',fp)
#        print('FN:',fn)
#        kohonen_sens[2] = tp/(tp+fn)
#        kohonen_spec[2] = tn/(tn+fp)
#
#        knn_sens = np.zeros((3,1))
#        knn_spec = np.zeros((3,1))
#        k = 5
#        print("KNN: k =",k)
#        knn_model = KNN(k)
#        knn_model.fit(train_features, train_labels)
#        ymodel = knn_model.predict(test_features, norm='inf')
#        tp,tn,fn,fp = perf_eval(ymodel, test_labels)
#        print("KNN INF")
#        print('Accuracy:     ', (tp+tn)/(tp+tn+fp+fn))
#        print('TP:',tp)
#        print('TN:',tn)
#        print('FP:',fp)
#        print('FN:',fn)
#        knn_sens[0] = tp/(tp+fn)
#        knn_spec[0] = tn/(tn+fp)
#        ymodel = knn_model.predict(test_features, norm=1)
#        tp,tn,fn,fp = perf_eval(ymodel, test_labels)
#        print("KNN 1")
#        print('Accuracy:     ', (tp+tn)/(tp+tn+fp+fn))
#        print('TP:',tp)
#        print('TN:',tn)
#        print('FP:',fp)
#        print('FN:',fn)
#        knn_sens[1] = tp/(tp+fn)
#        knn_spec[1] = tn/(tn+fp)
#        ymodel = knn_model.predict(test_features, norm=2)
#        tp,tn,fn,fp = perf_eval(ymodel, test_labels)
#        print("KNN 2")
#        print('Accuracy:     ', (tp+tn)/(tp+tn+fp+fn))
#        print('TP:',tp)
#        print('TN:',tn)
#        print('FP:',fp)
#        print('FN:',fn)
#        knn_sens[2] = tp/(tp+fn)
#        knn_spec[2] = tn/(tn+fp)
#
#        width = 0.20
#        plt.figure()
#        plt.xticks([0,1,2], ['INF', '1', '2'])
#        plt.xlabel('Norm')
#        plt.ylabel('Sensitivity')
#        plt.title('Sensitivity with different norms')
#        plt.bar(np.arange(3)-(3/2)*width, knn_sens[:,0], width=width)
#        plt.bar(np.arange(3)-width/2, kmeans_sens[:,0], width=width)
#        plt.bar(np.arange(3)+width/2, wta_sens[:,0], width=width)
#        plt.bar(np.arange(3)+(3/2)*width, kohonen_sens[:,0], width=width)
#        plt.legend(['KNN', 'KMeans', 'WTA', 'Kohonen'])
#        plt.figure()
#        plt.xticks([0,1,2], ['INF', '1', '2'])
#        plt.xlabel('Norm')
#        plt.ylabel('Specificity')
#        plt.title('Specificity with different norms')
#        plt.bar(np.arange(3)-(3/2)*width, knn_spec[:,0], width=width)
#        plt.bar(np.arange(3)-width/2, kmeans_spec[:,0], width=width)
#        plt.bar(np.arange(3)+width/2, wta_spec[:,0], width=width)
#        plt.bar(np.arange(3)+(3/2)*width, kohonen_spec[:,0], width=width)
#        plt.legend(['KNN', 'KMeans', 'WTA', 'Kohonen'])
#        plt.show()
#        conf_mats[1,:,:] = confusion_matrix(kpred, test_labels).T
#        all_labels[1,:] = kpred
#        k = 3
#        print("KNN: k =",k)
#        print('2 norm')
#        knn_model = KNN(k)
#        knn_model.fit(train_features, train_labels)
#        ymodel = knn_model.predict(test_features, norm=2)
#        prob = knn_model.predict_prob(test_features)
#        fper, tper, thresh = roc_curve(test_labels, prob[:,1], pos_label=1)
#        plt.figure()
#        plot_roc(fper, tper)
#        tp,tn,fn,fp = perf_eval(ymodel, test_labels)
#        print('Accuracy:     ', (tp+tn)/(tp+tn+fp+fn))
#        print('TP:',tp)
#        print('TN:',tn)
#        print('FP:',fp)
#        print('FN:',fn)
#        conf_mats[2,:,:] = confusion_matrix(ymodel, test_labels).T
#        all_labels[2,:] = ymodel
#        wta = WTA(2)
#        wta.predict(test_features, test_labels, e=0.01)
#        kmap = KMap(2)
#        kmap.predict(test_features, test_labels, e=0.001, iters=100)
#
#        predictions = np.zeros((2,ymodel.shape[0]))
#        predictions[0,:] = ymodel.T
#        fused = majority_vote(predictions)
#        tp,tn,fn,fp = perf_eval(fused, test_labels)
#        print('Accuracy:     ', (tp+tn)/(tp+tn+fp+fn))
#        print('TP:',tp)
#        print('TN:',tn)
#        print('FP:',fp)
#        print('FN:',fn)
        # Input layer sized to the current (unreduced) training features.
        net = Network([train_features.shape[0], 10, 2])
#        conf, ymodel = net.SGD(train_features, train_labels, 1000, 1, 0.05, test_features, test_labels)
        prob = net.SGD_prob(train_features, train_labels, 100, 1, 0.10, test_features, test_labels)
        fper, tper, thresh = roc_curve(test_labels, prob[:,1], pos_label=1)
        plot_roc(fper, tper, 'Standard')
#        tp,tn,fn,fp = perf_eval(np.array(ymodel), test_labels)
#        print('Accuracy:     ', (tp+tn)/(tp+tn+fp+fn))
#        print('TP:',tp)
#        print('TN:',tn)
#        print('FP:',fp)
#        print('FN:',fn)
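        # The 'Standard' ROC just plotted and the 'FLD'/'PCA' blocks below
        # compare the same one-hidden-layer BPNN (10 hidden units, 2 outputs)
        # on raw, FLD-reduced, and PCA-reduced features. Network and SGD_prob
        # come from the local bpnn module (not shown here); the signature is
        # assumed from usage: (data, labels, epochs, mini-batch size, learning
        # rate, test data, test labels), returning per-class probabilities.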
        fld = FLD()
        fld.setup(train_features, train_labels)
        fld_train_features = fld.reduce(train_features)
        fld_test_features = fld.reduce(test_features)
        net = Network([fld_train_features.shape[0], 10, 2])
#        conf, ymodel = net.SGD(fld_train_features, train_labels, 1000, 1, 0.05, fld_test_features, test_labels)
        prob = net.SGD_prob(fld_train_features, train_labels, 100, 1, 0.10, fld_test_features, test_labels)
        fper, tper, thresh = roc_curve(test_labels, prob[:,1], pos_label=1)
        plot_roc(fper, tper, 'FLD')
#        tp,tn,fn,fp = perf_eval(np.array(ymodel), test_labels)
#        print('Accuracy:     ', (tp+tn)/(tp+tn+fp+fn))
#        print('TP:',tp)
#        print('TN:',tn)
#        print('FP:',fp)
#        print('FN:',fn)
        pca = PCA()
        tol = 0.75
        pca.setup(train_features, tol)
        pca_train_features = pca.reduce(train_features)
        pca_test_features = pca.reduce(test_features)
        net = Network([pca_train_features.shape[0], 10, 2])
#        conf, ymodel = net.SGD(pca_train_features, train_labels, 1000, 1, 0.05, pca_test_features, test_labels)
        prob = net.SGD_prob(pca_train_features, train_labels, 100, 1, 0.10, pca_test_features, test_labels)
        fper, tper, thresh = roc_curve(test_labels, prob[:,1], pos_label=1)
        plot_roc(fper, tper, 'PCA')
#        tp,tn,fn,fp = perf_eval(np.array(ymodel), test_labels)
#        print('Accuracy:     ', (tp+tn)/(tp+tn+fp+fn))
#        print('TP:',tp)
#        print('TN:',tn)
#        print('FP:',fp)
#        print('FN:',fn)
        plt.title('BPNN with different dimension reduction techniques')
        plt.legend()
#        print("SVM rbf")
#        clf = svm.SVC(kernel='rbf', gamma='auto')
#        clf.probability = True
#        clf.fit(train_features.T, train_labels)
#        ymodel = clf.predict(test_features.T)
#        prob = clf.predict_proba(test_features.T)
#        fper, tper, thresh = roc_curve(test_labels, prob[:,1], pos_label=1)
#        plot_roc(fper, tper, 'Standard')
#        tp,tn,fn,fp = perf_eval(ymodel, test_labels)
#        print('Accuracy:     ', (tp+tn)/(tp+tn+fp+fn))
#        print('TP:',tp)
#        print('TN:',tn)
#        print('FP:',fp)
#        print('FN:',fn)
#
#        fld = FLD()
#        fld.setup(train_features, train_labels)
#        fld_train_features = fld.reduce(train_features)
#        fld_test_features = fld.reduce(test_features)
#
#        print("SVM rbf")
#        clf = svm.SVC(kernel='rbf', gamma='auto')
#        clf.probability = True
#        clf.fit(fld_train_features.T, train_labels)
#        ymodel = clf.predict(fld_test_features.T)
#        prob = clf.predict_proba(fld_test_features.T)
#        fper, tper, thresh = roc_curve(test_labels, prob[:,1], pos_label=1)
#        plot_roc(fper, tper, 'FLD')
#        tp,tn,fn,fp = perf_eval(ymodel, test_labels)
#        print('Accuracy:     ', (tp+tn)/(tp+tn+fp+fn))
#        print('TP:',tp)
#        print('TN:',tn)
#        print('FP:',fp)
#        print('FN:',fn)
#
#        pca = PCA()
#        tol = 0.75
#        pca.setup(train_features, tol)
#        pca_train_features = pca.reduce(train_features)
#        pca_test_features = pca.reduce(test_features)
#
#        print("SVM rbf")
#        clf = svm.SVC(kernel='rbf', gamma='auto')
#        clf.probability = True
#        clf.fit(pca_train_features.T, train_labels)
#        ymodel = clf.predict(pca_test_features.T)
#        prob = clf.predict_proba(pca_test_features.T)
#        fper, tper, thresh = roc_curve(test_labels, prob[:,1], pos_label=1)
#        plot_roc(fper, tper, 'PCA')
#        tp,tn,fn,fp = perf_eval(ymodel, test_labels)
#        print('Accuracy:     ', (tp+tn)/(tp+tn+fp+fn))
#        print('TP:',tp)
#        print('TN:',tn)
#        print('FP:',fp)
#        print('FN:',fn)
#        plt.title("SVM with rbf kernel")

#        print("SVM linear")
#        clf = svm.SVC(kernel='linear', gamma='auto')
#        clf.probability = True
#        clf.fit(train_features.T, train_labels)
#        ymodel = clf.predict(test_features.T)
#        prob = clf.predict_proba(test_features.T)
#        fper, tper, thresh = roc_curve(test_labels, prob[:,1], pos_label=1)
#        plot_roc(fper, tper, 'linear')
#        tp,tn,fn,fp = perf_eval(ymodel, test_labels)
#        print('Accuracy:     ', (tp+tn)/(tp+tn+fp+fn))
#        print('TP:',tp)
#        print('TN:',tn)
#        print('FP:',fp)
#        print('FN:',fn)
#
#        print("SVM poly")
#        clf = svm.SVC(kernel='poly', gamma='auto')
#        clf.probability = True
#        clf.fit(train_features.T, train_labels)
#        ymodel = clf.predict(test_features.T)
#        prob = clf.predict_proba(test_features.T)
#        fper, tper, thresh = roc_curve(test_labels, prob[:,1], pos_label=1)
#        plot_roc(fper, tper, 'poly')
#        tp,tn,fn,fp = perf_eval(ymodel, test_labels)
#        print('Accuracy:     ', (tp+tn)/(tp+tn+fp+fn))
#        print('TP:',tp)
#        print('TN:',tn)
#        print('FP:',fp)
#        print('FN:',fn)
#
#        print("SVM rbf")
#        clf = svm.SVC(kernel='rbf', gamma='auto')
#        clf.probability = True
#        clf.fit(train_features.T, train_labels)
#        ymodel = clf.predict(test_features.T)
#        prob = clf.predict_proba(test_features.T)
#        fper, tper, thresh = roc_curve(test_labels, prob[:,1], pos_label=1)
#        plot_roc(fper, tper, 'rbf')
#        tp,tn,fn,fp = perf_eval(ymodel, test_labels)
#        print('Accuracy:     ', (tp+tn)/(tp+tn+fp+fn))
#        print('TP:',tp)
#        print('TN:',tn)
#        print('FP:',fp)
#        print('FN:',fn)
#
#        print("SVM sigmoid")
#        clf = svm.SVC(kernel='sigmoid', gamma='auto')
#        clf.probability = True
#        clf.fit(train_features.T, train_labels)
#        ymodel = clf.predict(test_features.T)
#        prob = clf.predict_proba(test_features.T)
#        fper, tper, thresh = roc_curve(test_labels, prob[:,1], pos_label=1)
#        plot_roc(fper, tper, 'sigmoid')
#        tp,tn,fn,fp = perf_eval(ymodel, test_labels)
#        print('Accuracy:     ', (tp+tn)/(tp+tn+fp+fn))
#        print('TP:',tp)
#        print('TN:',tn)
#        print('FP:',fp)
#        print('FN:',fn)
#
#        plt.title("SVM comparison with different kernels")
#        k = 1
#        print("KNN: k =",k)
#        print('2 norm')
#        knn_model = KNN(k)
#        knn_model.fit(train_features, train_labels)
#        ymodel = knn_model.predict(test_features, norm=2)
#        prob = knn_model.predict_prob(test_features)
#        fper, tper, thresh = roc_curve(test_labels, prob[:,1], pos_label=1)
#        plt.figure()
#        plt.plot(fper, tper, label='k = '+str(k))
#        plt.plot([0,1],[0,1], linestyle='--')
#        plt.xlabel('False Positive Rate')
#        plt.ylabel('True Positive Rate')
#        plt.title('KNN k=1 ROC curve')
#        tp,tn,fn,fp = perf_eval(ymodel, test_labels)
#        print('Accuracy:     ', (tp+tn)/(tp+tn+fp+fn))
#        print('TP:',tp)
#        print('TN:',tn)
#        print('FP:',fp)
#        print('FN:',fn)
#        total_k = int(math.sqrt(len(train_labels)))
#        x = list(range(1,total_k))
#        k_sensitivity = np.zeros((len(x)+1,1))
#        k_specificity = np.zeros((len(x)+1,1))
#        for k in range(1,total_k):
#            k_model = knn_model.predict_k(k)
#            tp,tn,fn,fp = perf_eval(k_model, test_labels)
#            print('Accuracy:     ', (tp+tn)/(tp+tn+fp+fn))
#            print('TP:',tp)
#            print('TN:',tn)
#            print('FP:',fp)
#            print('FN:',fn)
#            k_sensitivity[k] = tp/(tp+fn)
#            k_specificity[k] = tn/(tn+fp)
#
#        plt.figure()
#        plt.plot(x, k_sensitivity[1:], label='Sensitivity')
#        plt.xlabel("k")
#        plt.ylabel("Sensitivity")
#        plt.title("Sensitivity")
#        plt.plot(x, k_specificity[1:], label='Specificity')
#        plt.xlabel("k")
#        plt.ylabel("Specificity")
#        plt.title("Specificity")
#        plt.title("KNN Performance")
#        plt.legend()
#        print("BPNN")
#        net = Network([train_features.shape[0], 10, 10, 2])
#        conf_mats[0,:,:],bpnn_pred = net.SGD(train_features, train_labels, 1000, 1, 0.05, test_features, test_labels)
#        all_labels[0,:] = np.array(bpnn_pred)
#        prob = net.SGD_prob(train_features, train_labels, 100, 1, 0.10, test_features, test_labels)
#        fper, tper, thresh = roc_curve(test_labels, prob[:,1], pos_label=1)
#        plt.figure()
#        plot_roc(fper, tper)
#        plt.show()
#        sensitivity = np.zeros((6,9))
#        specificity = np.zeros((6,9))
#        for j in range(1,10):
#            num_test = len(test_labels)
#            conf_mats = np.zeros((4,2,2))
#            all_labels = np.zeros((4,num_test))
##            plt.figure()
#
#            train_labels = np.array(train_labels)
#            test_labels = np.array(test_labels)
##            true = np.count_nonzero(true_labels)/true_labels.shape[0]
##            false = 1-true
#            true = j*0.1
#            false = 1-true
#            print("Prior 0", false, "Prior 1", true)
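#            (This disabled sweep steps the class-1 prior from 0.1 to 0.9 and
#            records sensitivity/specificity for the three MPP cases, KNN, a
#            majority vote, and naive-Bayes fusion; the plots after the loop
#            visualize the results.)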
#
#            print("MPP case 1")
#            mpp1 = MPP(1)
#            mpp1.set_prior(false, true)
#            mpp1.fit(train_features, train_labels)
#            mpp_pred1 = mpp1.predict(test_features)
#            prob1 = mpp1.predict_prob(test_features)
##            print(prob1[0:10])
##            fper1, tper1, thresh = roc_curve(test_labels, prob1[:,1], pos_label=1)
##            print(fper1)
##            plot_roc(fper1, tper1, 'Case 1')
##            plt.plot(fper1, tper1, label='Case 1')
#            tp,tn,fn,fp = perf_eval(mpp_pred1, test_labels)
#            print('Accuracy:     ', (tp+tn)/(tp+tn+fp+fn))
#            print('TP:',tp)
#            print('TN:',tn)
#            print('FP:',fp)
#            print('FN:',fn)
#            sensitivity[0,j-1] = tp/(tp+fn)
#            specificity[0,j-1] = tn/(tn+fp)
#
#            conf_mats[0,:,:] = confusion_matrix(mpp_pred1, test_labels).T
#            all_labels[0,:] = mpp_pred1.reshape((mpp_pred1.shape[0]))
#
#            print("MPP case 2")
#            mpp2 = MPP(2)
#            mpp2.set_prior(false, true)
#            mpp2.fit(train_features, train_labels)
#            mpp_pred2 = mpp2.predict(test_features)
##            prob2 = mpp2.predict_prob(test_features)
##            fper2, tper2, thresh = roc_curve(test_labels, prob2[:,1], pos_label=1)
##            plot_roc(fper2, tper2, 'Case 2')
##            plt.plot(fper2, tper2, label='Case 2')
#            tp,tn,fn,fp = perf_eval(mpp_pred2, test_labels)
#            print('Accuracy:     ', (tp+tn)/(tp+tn+fp+fn))
#            print('TP:',tp)
#            print('TN:',tn)
#            print('FP:',fp)
#            print('FN:',fn)
#            sensitivity[1,j-1] = tp/(tp+fn)
#            specificity[1,j-1] = tn/(tn+fp)
#
#            conf_mats[1,:,:] = confusion_matrix(mpp_pred2, test_labels).T
#            all_labels[1,:] = mpp_pred2.reshape((mpp_pred2.shape[0]))
#
#            print("MPP case 3")
#            mpp3 = MPP(3)
#            mpp3.set_prior(false, true)
#            mpp3.fit(train_features, train_labels)
#            mpp_pred3 = mpp3.predict(test_features)
##            prob3 = mpp3.predict_prob(test_features)
##            fper3, tper3, thresh = roc_curve(test_labels, prob3[:,1], pos_label=1)
##            plot_roc(fper3, tper3, 'Case 3')
##            plt.plot(fper3, tper3, label='Case 3')
#            tp,tn,fn,fp = perf_eval(mpp_pred3, test_labels)
#            print('Accuracy:     ', (tp+tn)/(tp+tn+fp+fn))
#            print('TP:',tp)
#            print('TN:',tn)
#            print('FP:',fp)
#            print('FN:',fn)
#            sensitivity[2,j-1] = tp/(tp+fn)
#            specificity[2,j-1] = tn/(tn+fp)
#
#            conf_mats[2,:,:] = confusion_matrix(mpp_pred3, test_labels).T
#            all_labels[2,:] = mpp_pred3.reshape((mpp_pred2.shape[0]))
#
#            k = 5
#            print("KNN: k =",k)
#            print('2 norm')
#            knn_model = KNN(k)
#            knn_model.set_prior(false, true)
#            knn_model.fit(train_features, train_labels)
#            ymodel = knn_model.predict(test_features, norm=2)
##            prob = knn_model.predict_prob(test_features)
##            fper, tper, thresh = roc_curve(test_labels, prob[:,1], pos_label=1)
##            plt.figure()
##            plot_roc(fper, tper)
##            plt.plot(fper3, tper3, label='KNN k=3')
#            tp,tn,fn,fp = perf_eval(ymodel, test_labels)
#            print('Accuracy:     ', (tp+tn)/(tp+tn+fp+fn))
#            print('TP:',tp)
#            print('TN:',tn)
#            print('FP:',fp)
#            print('FN:',fn)
#            sensitivity[3,j-1] = tp/(tp+fn)
#            specificity[3,j-1] = tn/(tn+fp)
#
#            conf_mats[3,:,:] = confusion_matrix(ymodel, test_labels).T
#            all_labels[3,:] = ymodel.reshape((mpp_pred2.shape[0]))
#
##            plt.xlabel('False Positive Rate')
##            plt.ylabel('True Positive Rate')
##            plt.legend()
##            plt.title('MPP: Prior 0: ' + str(round(false, 1)) + ' Prior 1: ' + str(round(true, 1)))
#
#            print("Majority Vote Fused MPP")
#            mpp_predictions = np.zeros((6,mpp_pred1.shape[0]))
#            mpp_predictions[0,:] = mpp_pred1.T
#            mpp_predictions[1,:] = mpp_pred2.T
#            mpp_predictions[2,:] = mpp_pred3.T
#            mpp_predictions[3,:] = ymodel.T
#            mpp_fused = majority_vote(mpp_predictions)
#            tp,tn,fn,fp = perf_eval(mpp_fused, test_labels)
#            print('Accuracy:     ', (tp+tn)/(tp+tn+fp+fn))
#            print('TP:',tp)
#            print('TN:',tn)
#            print('FP:',fp)
#            print('FN:',fn)
#            sensitivity[4,j-1] = tp/(tp+fn)
#            specificity[4,j-1] = tn/(tn+fp)
#
##            table,comb = nb_fusion(conf_mats, all_labels, test_labels)
##            print(conf_mats)
##            print('table',table)
#
#            print(conf_mats)
#
#            print("NB Fusion")
#            table,comb,fused = nb_fusion(conf_mats, all_labels, test_labels)
#            tp,tn,fn,fp = perf_eval(fused, test_labels)
#            print('Accuracy:     ', (tp+tn)/(tp+tn+fp+fn))
#            print('TP:',tp)
#            print('TN:',tn)
#            print('FP:',fp)
#            print('FN:',fn)
#            sensitivity[5,j-1] = tp/(tp+fn)
#            specificity[5,j-1] = tn/(tn+fp)
#
#        plt.figure()
#        x = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
#        plt.plot(x, sensitivity[0,:], label="Case 1")
#        plt.plot(x, sensitivity[1,:], label="Case 2")
#        plt.plot(x, sensitivity[2,:], label="Case 3")
#        plt.plot(x, sensitivity[3,:], label="KNN")
#        plt.plot(x, sensitivity[4,:], label="Majority Vote")
#        plt.plot(x, sensitivity[5,:], label="NB Fusion")
#        plt.xlabel("Prior probability of correct classification")
#        plt.ylabel("Sensitivity")
#        plt.title("Sensitivity")
#        plt.figure()
#        plt.plot(x, specificity[0,:], label="Case 1")
#        plt.plot(x, specificity[1,:], label="Case 2")
#        plt.plot(x, specificity[2,:], label="Case 3")
#        plt.plot(x, specificity[3,:], label="KNN")
#        plt.plot(x, specificity[4,:], label="Majority Vote")
#        plt.plot(x, specificity[5,:], label="NB Fusion")
#        plt.xlabel("Prior probability of correct classification")
#        plt.ylabel("Specificity")
#        plt.title("Specificity")
#        plt.legend()
    plt.show()

if __name__ == "__main__":
    main()
Source: learn_models.py
import lime
import sklearn
import numpy as np
import embedding_forest
import sklearn.ensemble
import sklearn.metrics
import sklearn.feature_extraction
# sklearn.svm and sklearn.linear_model are used below but were never imported.
import sklearn.svm
import sklearn.linear_model
import csv
import random
from sklearn.datasets import fetch_20newsgroups
from sklearn.externals import joblib
import itertools
import lstm
import json
import collections

def clean_vectors_wordlist(input_vectors, vectorizer, wordlist):
    ret_vectors = input_vectors.copy()
    words = np.array([vectorizer.vocabulary_[x] for x in set(wordlist) if x in vectorizer.vocabulary_])
    ret_vectors[:, words] = 0
    return ret_vectors

def GetSuggestions(model, test_data, raw_data, test_labels):
    test_labels = np.array(test_labels)
    preds = (model.predict_proba(test_data)[:,1] > .5).astype(int)
    fp = np.where((preds == 1) * (test_labels == 0))[0]
    tp = np.where((preds == 1) * (test_labels == 1))[0]
    fn = np.where((preds == 0) * (test_labels == 1))[0]
    tn = np.where((preds == 0) * (test_labels == 0))[0]
    suggestions = []
    # Check `i is not None` (rather than truthiness) so index 0 is not skipped
    # when zip_longest pads the shorter buckets with None.
    add_suggestion = lambda title, i: suggestions.append({'title' : 'ID %d (%s)' % (i, title), 'text' : raw_data[i], 'true_class' : test_labels[i]}) if i is not None else None
    for a, b, c, d in itertools.zip_longest(fp[:15], tp[:15], fn[:15], tn[:15]):
        add_suggestion('FP', a)
        add_suggestion('TP', b)
        add_suggestion('FN', c)
        add_suggestion('TN', d)
    return suggestions

def GetSuggestionsPair(model1, model2, test_data, raw_data, test_labels, nn=False):
    test_labels = np.array(test_labels)
    preds1 = (model1.predict_proba(test_data)[:,1] > .5).astype(int)
    if nn:
        preds2 = (model2.predict_proba(raw_data)[:,1] > .5).astype(int)
    else:
        preds2 = (model2.predict_proba(test_data)[:,1] > .5).astype(int)
    fp_fp = np.where((preds1 == 1) * (test_labels == 0) * (preds2 == 1))[0]
    fp_tn = np.where((preds1 == 1) * (test_labels == 0) * (preds2 == 0))[0]
    tn_fp = np.where((preds1 == 0) * (test_labels == 0) * (preds2 == 1))[0]
    tn_tn = np.where((preds1 == 0) * (test_labels == 0) * (preds2 == 0))[0]
    fn_fn = np.where((preds1 == 0) * (test_labels == 1) * (preds2 == 0))[0]
    fn_tp = np.where((preds1 == 0) * (test_labels == 1) * (preds2 == 1))[0]
    tp_tp = np.where((preds1 == 1) * (test_labels == 1) * (preds2 == 1))[0]
    tp_fn = np.where((preds1 == 1) * (test_labels == 1) * (preds2 == 0))[0]
    suggestions = []
    add_suggestion = lambda title, i: suggestions.append({'title' : 'ID %d (%s)' % (i, title), 'text' : raw_data[i], 'true_class' : test_labels[i]}) if i is not None else None
    for a, b, c, d, e, f, g, h in itertools.zip_longest(fp_fp[:15], fp_tn[:15], tn_fp[:15], tn_tn[:15], fn_fn[:15], fn_tp[:15], tp_fn[:15], tp_tp[:15]):
        add_suggestion('FP-FP', a)
        add_suggestion('FP-TN', b)
        add_suggestion('TN-FP', c)
        add_suggestion('TN-TN', d)
        add_suggestion('FN-FN', e)
        add_suggestion('FN-TP', f)
        add_suggestion('TP-FN', g)
        add_suggestion('TP-TP', h)
    return suggestions
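# A minimal, hypothetical check (not part of the original file) of the index
# bucketing that GetSuggestions relies on: products of boolean arrays pick out
# the positions of each prediction/label combination.
def _demo_bucketing():
    preds = np.array([1, 1, 0, 0])
    labels = np.array([0, 1, 1, 0])
    fp = np.where((preds == 1) * (labels == 0))[0]  # predicted 1, truly 0
    fn = np.where((preds == 0) * (labels == 1))[0]  # predicted 0, truly 1
    assert list(fp) == [0] and list(fn) == [2]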
random.shuffle(positives)78    random.shuffle(negatives)79    size_test = int(len(negatives) * percent_test)80    size_train = len(negatives) - size_test81    train = positives[:size_train] + negatives[:size_train]82    train_labels = np.hstack((np.ones(size_train), np.zeros(size_train))).astype('int')83    test = positives[size_train:] + negatives[size_train:]84    test_labels = np.hstack((np.ones(size_test), np.zeros(size_test))).astype('int')85    return train, train_labels, test, test_labels86def LearnPoliteness():87    train, train_labels, test, test_labels = LoadPoliteness('data/stanford_politeness/wikipedia.annotated.csv')88    vectorizer = sklearn.feature_extraction.text.CountVectorizer(binary=True, lowercase=False, min_df=10)89    vectorizer.fit(train + test)90    train_vectors = vectorizer.transform(train)91    test_vectors = vectorizer.transform(test)92    svm = sklearn.svm.SVC(probability=True, kernel='rbf', C=10,gamma=0.001)93    svm.fit(train_vectors, train_labels)94    rf = sklearn.ensemble.RandomForestClassifier(n_estimators=500, n_jobs=10)95    rf.fit(train_vectors, train_labels)96    lr = sklearn.linear_model.LogisticRegression()97    lr.fit(train_vectors, train_labels)98    suggestions = {}99    suggestions['lr'] = GetSuggestions(lr, test_vectors, test, test_labels)100    suggestions['rf'] = GetSuggestions(rf, test_vectors, test, test_labels)101    suggestions['svm'] = GetSuggestions(svm, test_vectors, test, test_labels)102    suggestions['lr-rf'] = GetSuggestionsPair(lr, rf, test_vectors, test, test_labels)103    suggestions['lr-svm'] = GetSuggestionsPair(lr, svm, test_vectors, test, test_labels)104    suggestions['rf-svm'] = GetSuggestionsPair(rf, svm, test_vectors, test, test_labels)105    ret = {} 106    ret['svm'] = {}107    ret['svm']['accuracy'] = sklearn.metrics.accuracy_score(test_labels, svm.predict(test_vectors))108    ret['svm']['model'] = svm109    ret['rf'] = {}110    ret['rf']['accuracy'] = sklearn.metrics.accuracy_score(test_labels, rf.predict(test_vectors))111    ret['rf']['model'] = rf112    ret['lr'] = {}113    ret['lr']['accuracy'] = sklearn.metrics.accuracy_score(test_labels, lr.predict(test_vectors))114    ret['lr']['model'] = lr115    ret['vectorizer'] = vectorizer116    ret['class_names'] = ['rude', 'polite']117    return ret, suggestions118def Load20NG():119    cats = ['alt.atheism', 'soc.religion.christian']120    newsgroups_train = fetch_20newsgroups(subset='train', categories=cats)121    newsgroups_test = fetch_20newsgroups(subset='test', categories=cats)122    train, train_labels = newsgroups_train.data, newsgroups_train.target123    test, test_labels = newsgroups_test.data, newsgroups_test.target124    return train, train_labels, test, test_labels125def Learn20NG():126    train, train_labels, test, test_labels = Load20NG()127    vectorizer = sklearn.feature_extraction.text.CountVectorizer(binary=True, lowercase=False)128    vectorizer.fit(train + test)129    train_vectors = vectorizer.transform(train)130    test_vectors = vectorizer.transform(test)131    svm = sklearn.svm.SVC(probability=True, kernel='rbf', C=10,gamma=0.001)132    svm.fit(train_vectors, train_labels)133    rf = sklearn.ensemble.RandomForestClassifier(n_estimators=500, n_jobs=10)134    rf.fit(train_vectors, train_labels)135    lr = sklearn.linear_model.LogisticRegression()136    lr.fit(train_vectors, train_labels)137    # This wordlist achieves 78.02% accuracy on the religion dataset138    wordlist = 
'in,to,Re,In,1993,rutgers,athos,writes,article,12,And,you,on,heart,will,Chuck,not,gvg47,gvg,He,this,may,10,us,When,before,alt,uk,co,mantis,up,post,Distribution,You,Keith,kmr4,Ryan,Bill,pooh,for,the,Host,Posting,NNTP,New,Thanks,anyone,email,has,Newsreader,Nntp,wrote,agree,Sandvik,edu,clh,by,who,thoughts,thing,saturn,wwc,more,EDU,try,wouldn,am,as,world,livesey,Livesey,wpd,solntze,jon,from,it,cc,little,Conner,osrhe,here,VMS,don,than,13,would,also,18,about,University,TIN,FAQ,version,even,PL9,said,being,Yet,so,he,they,interested,geneva,17,athena,May,love,me,whether,St,COM,Inc,newton,TEK,Kent,mean,sandvik,Or,Beaverton,lot,week,need,education,our,Robert,Don,Reply,cs,which,Computer,Organization,rusnews,Jim,bmd,trw,deleted,position,now,isn,whole,mathew,00,05,Michael,subject,CA,Princeton,po,CWRU,okcforum,bil,GMT,Bake,Timmons,timmbake,mcl,sgi,au,Dan,com,Unix'.split(',')139    cleaned_train = clean_vectors_wordlist(train_vectors, vectorizer, wordlist)140    cleansvm = sklearn.svm.SVC(probability=True, kernel='rbf', C=10,gamma=0.001)141    cleansvm.fit(cleaned_train, train_labels)142    rfemb = embedding_forest.EmbeddingForest(vectorizer)143    rfemb.fit(train_vectors, train_labels)144    suggestions = {}145    suggestions['lr'] = GetSuggestions(lr, test_vectors, test, test_labels)146    suggestions['rf'] = GetSuggestions(rf, test_vectors, test, test_labels)147    suggestions['rfemb'] = GetSuggestions(rfemb, test_vectors, test, test_labels)148    suggestions['svm'] = GetSuggestions(svm, test_vectors, test, test_labels)149    suggestions['cleansvm'] = GetSuggestions(cleansvm, test_vectors, test, test_labels)150    suggestions['cleansvm-lr'] = GetSuggestionsPair(cleansvm, lr, test_vectors, test, test_labels)151    suggestions['cleansvm-rf'] = GetSuggestionsPair(cleansvm, rf, test_vectors, test, test_labels)152    suggestions['cleansvm-rfemb'] = GetSuggestionsPair(cleansvm, rfemb, test_vectors, test, test_labels)153    suggestions['cleansvm-svm'] = GetSuggestionsPair(cleansvm, svm, test_vectors, test, test_labels)154    suggestions['lr-rf'] = GetSuggestionsPair(lr, rf, test_vectors, test, test_labels)155    suggestions['lr-rfemb'] = GetSuggestionsPair(lr, rfemb, test_vectors, test, test_labels)156    suggestions['lr-svm'] = GetSuggestionsPair(lr, svm, test_vectors, test, test_labels)157    suggestions['lr-cleansvm'] = GetSuggestionsPair(lr, svm, test_vectors, test, test_labels)158    suggestions['rf-rfemb'] = GetSuggestionsPair(rf, rfemb, test_vectors, test, test_labels)159    suggestions['rf-svm'] = GetSuggestionsPair(rf, svm, test_vectors, test, test_labels)160    suggestions['rfemb-svm'] = GetSuggestionsPair(rfemb, svm, test_vectors, test, test_labels)161    ret = {} 162    ret['svm'] = {}163    ret['svm']['accuracy'] = sklearn.metrics.accuracy_score(test_labels, svm.predict(test_vectors))164    ret['svm']['model'] = svm165    ret['cleansvm'] = {}166    ret['cleansvm']['accuracy'] = sklearn.metrics.accuracy_score(test_labels, cleansvm.predict(test_vectors))167    ret['cleansvm']['model'] = cleansvm168    ret['rf'] = {}169    ret['rf']['accuracy'] = sklearn.metrics.accuracy_score(test_labels, rf.predict(test_vectors))170    ret['rf']['model'] = rf171    ret['rfemb'] = {}172    ret['rfemb']['accuracy'] = sklearn.metrics.accuracy_score(test_labels, rfemb.predict(test_vectors))173    ret['rfemb']['model'] = rfemb174    ret['lr'] = {}175    ret['lr']['accuracy'] = sklearn.metrics.accuracy_score(test_labels, lr.predict(test_vectors))176    ret['lr']['model'] = lr177    ret['vectorizer'] = vectorizer178    
ret['class_names'] = ['Atheism', 'Christian']179    return ret, suggestions180def LoadSentimentFile(path):181    data = []182    labels = []183    for line in open(path):184        x, y = line.decode('utf-8', 'ignore').strip().split('\t')185        data.append(x)186        labels.append(int(y))187    return data, labels188def LoadSentiment():189    train, train_labels = LoadSentimentFile('data/sentiment-train')190    test, test_labels = LoadSentimentFile('data/sentiment-test')191    return train, train_labels, test, test_labels192def LearnSentiment():193    train, train_labels, test, test_labels = LoadSentiment()194    vectorizer = sklearn.feature_extraction.text.CountVectorizer(binary=True, lowercase=False, min_df=10)   195    vectorizer.fit(train + test)                                                          196    train_vectors = vectorizer.transform(train)                                           197    test_vectors = vectorizer.transform(test)                                             198    rf = sklearn.ensemble.RandomForestClassifier(n_estimators=500, n_jobs=10)199    rf.fit(train_vectors, train_labels)                                                   200    lr = sklearn.linear_model.LogisticRegression()                                        201    lr.fit(train_vectors, train_labels)   202    DummyModel = collections.namedtuple('model', ['predict_proba'])203    nn = DummyModel(lstm.GetLSTM())204    suggestions = {}205    suggestions['lr'] = GetSuggestions(lr, test_vectors, test, test_labels)206    suggestions['rf'] = GetSuggestions(rf, test_vectors, test, test_labels)207    suggestions['nn'] = GetSuggestions(nn, test, test, test_labels)208    suggestions['lr-rf'] = GetSuggestionsPair(lr, rf, test_vectors, test, test_labels)209    suggestions['lr-nn'] = GetSuggestionsPair(lr, nn, test_vectors, test, test_labels, nn=True)210    suggestions['rf-nn'] = GetSuggestionsPair(rf, nn, test_vectors, test, test_labels, nn=True)211    ret = {} 212    ret['nn'] = {}213    ret['nn']['accuracy'] = sklearn.metrics.accuracy_score(test_labels, (nn.predict_proba(test)[:,1] > .5).astype(int))214    ret['rf'] = {}215    ret['rf']['accuracy'] = sklearn.metrics.accuracy_score(test_labels, rf.predict(test_vectors))216    ret['rf']['model'] = rf217    ret['lr'] = {}218    ret['lr']['accuracy'] = sklearn.metrics.accuracy_score(test_labels, lr.predict(test_vectors))219    ret['lr']['model'] = lr220    ret['vectorizer'] = vectorizer221    ret['class_names'] = ['Negative', 'Positive']222    return ret, suggestions223def main():224    suggestions = {}225    ret = {}226    ret['politeness'], suggestions['politeness'] = LearnPoliteness()227    ret['20ng'], suggestions['20ng'] = Learn20NG()228    ret['sentiment'], suggestions['sentiment'] = LearnSentiment()229    joblib.dump(ret, 'models/models')230    acc = {}231    for dataset in ret:232        acc[dataset] = {}233        for model in ret[dataset]:234            if model == 'class_names' or model == 'vectorizer':235                continue236            acc[dataset][model] = ret[dataset][model]['accuracy']237    ret_suggestions = {'suggestions' : suggestions, 'accuracy' : acc}238    json.dump(ret_suggestions, open('static/suggestions.json', 'w'))239if __name__ == '__main__':...q8. Multiclass Logistic Regression.py
Source:q8. Multiclass Logistic Regression.py  
1import numpy as np 2import pandas as pd 3import os 4# import matplotlib.pyplot as plt5def sigmoid(X):6    return 1.0/(1+np.exp(-X))7def logisticRegression(features,labels,learning_rate=0.01,epochs=1000,test=False,test_features=None,test_labels=None):8    """9    features: (number_examples,number_features)10    labels: (number_examples,1)11    12    returns: weights,bias13    """14    number_examples = features.shape[0]15    labels = np.reshape(labels,(number_examples,1))16    number_params = features.shape[1]17    W = np.random.randn(number_params,1)18    b = np.random.randn()19    for epoch in range(epochs):20        Z = np.dot(features,W) + b21        A = sigmoid(Z)22        loss = (labels.T).dot(np.log(A)) + (1-labels).T.dot(np.log(1-A))23        delta = (A - labels)24        dW = np.dot(features.T,delta)25        W += -learning_rate*dW/number_examples26        b += -learning_rate*np.sum(delta)/number_examples27        # if epoch % 10 == 0: 28        #     print(-loss)29    if test:30        if test_labels.any() != None and test_features.any() != None and test_features.shape[0] == test_labels.shape[0]:31            Z = np.dot(test_features,W) + b32            A = sigmoid(Z)33            accuracy = 034            A[A > 0.5] = 135            A[A < 0.5] = 0 36            test_size = test_features.shape[0]37            for datapoint in range(test_size):38                if A[datapoint] == test_labels[datapoint]:39                    accuracy += 140            print('accuracy: ',accuracy/test_size*100,'\n')                    41        else:42            raise ValueError            43    return W,b44np.random.seed(1)45cwd = os.getcwd()46file_name = 'data4.xlsx'47file_path = cwd + '\\'+file_name48excel_data = pd.ExcelFile(file_path).parse('Sheet1',header=None)49copy_data = excel_data.values50# print('\n\n\n',i,'\n\n\n')51np.random.shuffle(copy_data)52data = copy_data53data_size = data.shape[0]54train_data_size = int(data_size*0.6)55test_data_size = data_size - train_data_size56train_data = np.copy(data[:train_data_size])57test_data = np.copy(data[train_data_size:])58# print(train_data.shape,test_data.shape)59train_features = train_data[:,:4]60train_labels = train_data[:,4]61test_features = test_data[:,:4]62test_labels = test_data[:,4]63train_features = (train_features - np.mean(train_features,axis=0))/(np.std(train_features,axis=0))64test_features =  (test_features  -  np.mean(test_features,axis=0))/(np.std(test_features,axis=0))65# print(train_labels)66W_ova = {}67b_ova = {}68for i in range(1,4):69    train_labels[train_labels == i] = 170    train_labels[train_labels != 1] = 071    test_labels[test_labels == i] = 172    test_labels[test_labels != 1] = 073    # print(train_labels,'\n',i,'\n')74    _W,_b =  logisticRegression(train_features,train_labels,learning_rate=0.05,test=True,test_features=test_features,test_labels=test_labels)75    W_ova[str(i)] = _W76    b_ova[str(i)] = _b77# print(W_ova,b_ova)    78W_ovo = {}79b_ovo = {}80for i in range(1,4):81    for j in range(i,4):82        if i == j: continue83        # print('\n\n\n\n',i,j)84        data_i = copy_data[np.where(copy_data[:,-1] == i),:]85        data_j = copy_data[np.where(copy_data[:,-1] == j),:]86        # print(data_i,data_j87        number_examples = data_i.shape[1]88        number_features = data_i.shape[2]89        data_i = np.reshape(data_i,(number_examples,number_features))90        data_j = np.reshape(data_j,(number_examples,number_features))        91        dataset = np.vstack((data_i,data_j))92        dataset_size = 
dataset.shape[0]93        # print(data_i,data_j)94        np.random.shuffle(dataset)95        train_data_size = int(0.6*dataset_size)96        test_data_size = dataset_size - train_data_size97        train_data = np.copy(dataset[:train_data_size])98        test_data = np.copy(dataset[train_data_size:])99        # print(train_data.shape,test_data.shape)100        train_features = train_data[:,:4]101        train_labels = train_data[:,4]102        test_features = test_data[:,:4]103        test_labels = test_data[:,4]104        train_features = (train_features - np.mean(train_features,axis=0))/(np.std(train_features,axis=0))105        test_features =  (test_features  -  np.mean(test_features,axis=0))/(np.std(test_features,axis=0))106        107        np.random.shuffle(dataset)108        train_labels[train_labels == i] = 1109        train_labels[train_labels != 1] = 0110        test_labels[test_labels == i] = 1111        test_labels[test_labels != 1] = 0112        print(j,i,'\n')113        _W,_b =  logisticRegression(train_features,train_labels,learning_rate=0.1,epochs=2000,test=True,test_features=test_features,test_labels=test_labels)114        W_ovo[str(i)+str(j)] = _W115        b_ovo[str(i)+str(j)] = _b116# print(W_ovo)...Learn to execute automation testing from scratch with LambdaTest Learning Hub. Right from setting up the prerequisites to run your first automation test, to following best practices and diving deeper into advanced test scenarios. LambdaTest Learning Hubs compile a list of step-by-step guides to help you be proficient with different test automation frameworks i.e. Selenium, Cypress, TestNG etc.
You could also refer to video tutorials over LambdaTest YouTube channel to get step by step demonstration from industry experts.
Get 100 minutes of automation test minutes FREE!!
