How to use the feature_keywords method in Gherkin-python

Best Python code snippets using gherkin-python

BBC_NEWS.py

Source: BBC_NEWS.py (GitHub)



...
            'Keywords_CountV': x['Keywords_CountV'],
            'Content': x['Content']
        })
"""

def feature_keywords(outfile, collection, vectorizer):
    collection.delete_many({})
    data = open(outfile).read()
    labels, texts = [], []
    for i, line in enumerate(data.split("\n")):
        if len(line.split()) > 0:
            content = line.split()
            labels.append(content[0])
            texts.append(" ".join(content[1:]))
    wordslist = texts
    titlelist = labels
    transformer = TfidfTransformer()
    tfidf = vectorizer.fit_transform(wordslist)
    # print(tfidf)
    # print(vectorizer.fit_transform(wordslist))
    words = vectorizer.get_feature_names()  # keywords of all texts
    weight = tfidf.toarray()
    n = 5  # top five
    for (title, w, text) in zip(titlelist, weight, texts):
        wordsdet = ['and', 'of', 'the', 'to', 'in', 'will', 'students', 'project', 'subject', 'assessment', 'hours', 'with', 'on', 'be', 'for', 'you', 'he', 'she', 'her', 'his']
        print(u'{}:'.format(title))
        # sort by descending weight
        loc = np.argsort(-w)
        keywordsList = []
        # Keywords = ''
        i, j = 0, 0
        while j < n:
            if words[loc[i]] in wordsdet:
                i += 1
                continue
            keywordsList.append(words[loc[i]] + ',')
            print(u'-{}: {} {}'.format(str(j + 1), words[loc[i]], w[loc[i]]))
            i += 1
            j += 1
        Keywords = ''.join(keywordsList)
        post = {
            'Label': title,
            'KeyWords': Keywords,
            'Content': text
        }
        collection.insert_one(post)
        print('\n')

def TSNE(outfile):
    data = open(outfile).read()
    labels, texts = [], []
    for i, line in enumerate(data.split("\n")):
        if len(line.split()) > 0:
            content = line.split()
            labels.append(content[0])
            texts.append(" ".join(content[1:]))
    # create a dataframe with columns text and label
    trainDF = pandas.DataFrame()
    trainDF['seriesNum'] = range(0, 2225)
    trainDF['label'] = labels
    trainDF['text'] = texts
    trainDF['category_id'] = trainDF['label'].factorize()[0]
    labels = trainDF['category_id']
    category_id_df = trainDF[['label', 'category_id']].drop_duplicates().sort_values('category_id')
    category_to_id = dict(category_id_df.values)
    id_to_category = dict(category_id_df[['category_id', 'label']].values)
    tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')
    features = tfidf.fit_transform(trainDF['text']).toarray()
    SAMPLE_SIZE = int(len(features) * 0.3)
    np.random.seed(0)
    indices = np.random.choice(range(len(features)), size=SAMPLE_SIZE, replace=False)
    projected_features = manifold.TSNE(n_components=2, random_state=0).fit_transform(features[indices])
    colors = ['pink', 'green', 'midnightblue', 'orange', 'darkgrey']
    for category, category_id in sorted(category_to_id.items()):
        points = projected_features[(labels[indices] == category_id).values]
        plt.scatter(points[:, 0], points[:, 1], s=15, c=colors[category_id], label=category)
    plt.title("tf-idf feature vector for each article, projected on 2 dimensions.",
              fontdict=dict(fontsize=15))
    plt.legend()
    plt.show()

def multiple_classify(outfile, collection, classifier):
    collection.delete_many({})  # clear the collection before re-inserting
    data = open(outfile).read()
    labels, texts = [], []
    for i, line in enumerate(data.split("\n")):
        if len(line.split()) > 0:
            content = line.split()
            labels.append(content[0])
            texts.append(" ".join(content[1:]))
    # create a dataframe with columns text and label
    trainDF = pandas.DataFrame()
    trainDF['seriesNum'] = range(0, 2225)
    trainDF['label'] = labels
    trainDF['text'] = texts
    # print(trainDF)
    pipeline = Pipeline([
        ('tdidf_vectorizer', TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)),
        ('classifier', classifier)
    ])
    k_fold = model_selection.KFold(n_splits=5, shuffle=True)
    scores = []
    confusion = np.zeros((5, 5))
    for train_indices, test_indices in k_fold.split(trainDF):
        sub_seriesNum = trainDF['seriesNum'][test_indices].tolist()
        train_text = trainDF['text'][train_indices]
        train_y = trainDF['label'][train_indices]
        test_text = trainDF['text'][test_indices]
        test_text_list = test_text.tolist()
        test_y = trainDF['label'][test_indices]
        encoder = preprocessing.LabelEncoder()
        train_y = encoder.fit_transform(train_y)
        test_y = encoder.fit_transform(test_y)
        pipeline.fit(train_text, train_y)
        predictions = pipeline.predict(test_text)
        for i in range(len(predictions)):
            post = {
                'Num': int(sub_seriesNum[i]),
                'Predict_Label': int(predictions[i]),
                'Actual_Label': int(test_y[i]),
                'Content': test_text_list[i]
            }
            collection.insert_one(post)
        confusion += confusion_matrix(test_y, predictions)
        score = f1_score(test_y, predictions, average='macro')
        scores.append(score)
    print('Total news classified:', len(trainDF))
    print('Score:', sum(scores) / len(scores))
    print('Confusion matrix:')
    print(confusion)
    print('\n')

def main():
    # Set up database
    myclient = pymongo.MongoClient("mongodb://localhost:27017/")
    db = myclient['BBC_NEWS']
    collection_set = {
        # 'Act': db['Student_Account'],
        # 'Sbj_Info': db['Subject_Info'],
        'BBC': db['BBC_News'],
        'BBC_Result': db['BBC_News_ClassificationResult'],
        'BBC_Biagram': db['BBC_News_FeatureKeyWords_tdidf_Biagram'],
        'BBC_countV': db['BBC_News_FeatureKeyWords_CountVectorizer'],
        'BBC_tdidf': db['BBC_News_FeatureKeyWords_tdidf']
    }
    vectorizer_set = {
        'Tdidf_diagram': TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english'),
        'Count': CountVectorizer(),
        'Tdidf': TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', stop_words='english')
    }
    # Preprocess
    targetfile = "Dataset/bbc-text.csv"
    outfile = database.preprocess(targetfile)
    # TSNE(outfile)
    # Origin CSV insert to database
    Originfile_insert(outfile, collection_set['BBC'])
    # keywords
    # feature_keywords(outfile, collection_set['BBC_Biagram'], vectorizer_set['Tdidf_diagram'])
    # feature_keywords(outfile, collection_set['BBC_countV'], vectorizer_set['Count'])
    # feature_keywords(outfile, collection_set['BBC_tdidf'], vectorizer_set['Tdidf'])
    # Classify
    classifier_set = {
        'NB': MultinomialNB(),
        'SVM': SVC(kernel='linear'),
        'DT': tree.DecisionTreeClassifier()
    }
    multiple_classify(outfile, collection_set['BBC_Result'], classifier_set['NB'])
    multiple_classify(outfile, collection_set['BBC_Result'], classifier_set['SVM'])
    # multiple_classify(outfile, collection_set['BBC_Result'], classifier_set['DT'])
    # Database Input
    database.LabelDecoder(collection_set['BBC_Result'])
    pipeline = [{
        '$lookup': {
            'from': 'BBC_News_FeatureKeyWords_CountVectorizer',
            'localField': 'Content',
            'foreignField': 'Content',
            'as': 'Keywords_CountV'
        }}, {
        '$lookup': {
            'from': 'BBC_News_FeatureKeyWords_tdidf',
            'localField': 'Content',
            'foreignField': 'Content',
            'as': 'Keywords_tdidf'
        }}, {
        '$lookup': {
            'from': 'BBC_News_FeatureKeyWords_tdidf_Biagram',
            'localField': 'Content',
            'foreignField': 'Content',
            'as': 'Keywords_tdidf_Biagram'
        }}, {
        '$project': {
            'Label': {'$arrayElemAt': ['$Keywords_CountV.Label', 0]},
            'Keywords_tdidf': {'$arrayElemAt': ['$Keywords_tdidf.KeyWords', 0]},
            'Keywords_tdidf_Biagram': {'$arrayElemAt': ['$Keywords_tdidf_Biagram.KeyWords', 0]},
            'Keywords_CountV': {'$arrayElemAt': ['$Keywords_CountV.KeyWords', 0]},
            'Content': {'$arrayElemAt': ['$Keywords_CountV.Content', 0]}}}
    ]
    update_result = db['BBC_News'].aggregate(pipeline)
    for x in update_result:
        # print(x)
        db['BBC_News'].update({
            '_id': x['_id']
        }, {
            'Label': x['Label'],
            'Keywords_tdidf': x['Keywords_tdidf'],
            'Keywords_tdidf_Biagram': x['Keywords_tdidf_Biagram'],
            'Keywords_CountV': x['Keywords_CountV'],
            'Content': x['Content']
        })
    query = {
        "$where": "this.Predict_Label != this.Actual_Label"
    }
    answer = collection_set['BBC_Result'].find(query).sort('Num')
    # for x in answer:
    #     print(x)
    # feature_keywords("/Users/frank/PycharmProjects/FYP_classification/RAKE-tutorial/articles/txt/EIE3105.pdf.txt")

if __name__ == '__main__':
...
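
The feature_keywords function above reads a preprocessed file (one document per line, label first), fits the supplied vectorizer, and stores the five highest-weighted non-stopword terms of each article in a MongoDB collection. Below is a minimal driver sketch mirroring the commented-out calls in main(); the connection string, database and collection names, and the input path are placeholders, not part of the original script.

import pymongo
from sklearn.feature_extraction.text import CountVectorizer

# Hypothetical local MongoDB setup, mirroring the names used in main()
client = pymongo.MongoClient("mongodb://localhost:27017/")
keyword_collection = client['BBC_NEWS']['BBC_News_FeatureKeyWords_CountVectorizer']

# "bbc_preprocessed.txt" stands in for the file produced by database.preprocess();
# each line is "<label> <token> <token> ...".
feature_keywords("bbc_preprocessed.txt", keyword_collection, CountVectorizer())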


testing.py

Source: testing.py (GitHub)



# -*- coding: utf-8 -*-
"""
    pygments.lexers.testing
    ~~~~~~~~~~~~~~~~~~~~~~~
    Lexers for testing languages.
    :copyright: Copyright 2006-2015 by the Pygments team, see AUTHORS.
    :license: BSD, see LICENSE for details.
"""
from pygments.lexer import RegexLexer, include, bygroups
from pygments.token import Comment, Keyword, Name, String

__all__ = ['GherkinLexer']

class GherkinLexer(RegexLexer):
    """
    For `Gherkin <http://github.com/aslakhellesoy/gherkin/>` syntax.
    .. versionadded:: 1.2
    """
    name = 'Gherkin'
    aliases = ['cucumber', 'gherkin']
    filenames = ['*.feature']
    mimetypes = ['text/x-gherkin']
    feature_keywords = u'^(기능|機能|功能|フィーチャ|خاصية|תכונה|Функціонал|Функционалност|Функционал|Фича|Особина|Могућност|Özellik|Właściwość|Tính năng|Trajto|Savybė|Požiadavka|Požadavek|Osobina|Ominaisuus|Omadus|OH HAI|Mogućnost|Mogucnost|Jellemző|Fīča|Funzionalità|Funktionalität|Funkcionalnost|Funkcionalitāte|Funcționalitate|Functionaliteit|Functionalitate|Funcionalitat|Funcionalidade|Fonctionnalité|Fitur|Feature|Egenskap|Egenskab|Crikey|Característica|Arwedd)(:)(.*)$'
    feature_element_keywords = u'^(\\s*)(시나리오 개요|시나리오|배경|背景|場景大綱|場景|场景大纲|场景|劇本大綱|劇本|テンプレ|シナリオテンプレート|シナリオテンプレ|シナリオアウトライン|シナリオ|سيناريو مخطط|سيناريو|الخلفية|תרחיש|תבנית תרחיש|רקע|Тарих|Сценарій|Сценарио|Сценарий структураси|Сценарий|Структура сценарію|Структура сценарија|Структура сценария|Скица|Рамка на сценарий|Пример|Предыстория|Предистория|Позадина|Передумова|Основа|Концепт|Контекст|Założenia|Wharrimean is|Tình huống|The thing of it is|Tausta|Taust|Tapausaihio|Tapaus|Szenariogrundriss|Szenario|Szablon scenariusza|Stsenaarium|Struktura scenarija|Skica|Skenario konsep|Skenario|Situācija|Senaryo taslağı|Senaryo|Scénář|Scénario|Schema dello scenario|Scenārijs pēc parauga|Scenārijs|Scenár|Scenaro|Scenariusz|Scenariul de şablon|Scenariul de sablon|Scenariu|Scenario Outline|Scenario Amlinellol|Scenario|Scenarijus|Scenarijaus šablonas|Scenarij|Scenarie|Rerefons|Raamstsenaarium|Primer|Pozadí|Pozadina|Pozadie|Plan du scénario|Plan du Scénario|Osnova scénáře|Osnova|Náčrt Scénáře|Náčrt Scenáru|Mate|MISHUN SRSLY|MISHUN|Kịch bản|Konturo de la scenaro|Kontext|Konteksts|Kontekstas|Kontekst|Koncept|Khung tình huống|Khung kịch bản|Háttér|Grundlage|Geçmiş|Forgatókönyv vázlat|Forgatókönyv|Fono|Esquema do Cenário|Esquema do Cenario|Esquema del escenario|Esquema de l\'escenari|Escenario|Escenari|Dis is what went down|Dasar|Contexto|Contexte|Contesto|Condiţii|Conditii|Cenário|Cenario|Cefndir|Bối cảnh|Blokes|Bakgrunn|Bakgrund|Baggrund|Background|B4|Antecedents|Antecedentes|All y\'all|Achtergrond|Abstrakt Scenario|Abstract Scenario)(:)(.*)$'
    examples_keywords = u'^(\\s*)(예|例子|例|サンプル|امثلة|דוגמאות|Сценарији|Примери|Приклади|Мисоллар|Значения|Örnekler|Voorbeelden|Variantai|Tapaukset|Scenarios|Scenariji|Scenarijai|Příklady|Példák|Príklady|Przykłady|Primjeri|Primeri|Piemēri|Pavyzdžiai|Paraugs|Juhtumid|Exemplos|Exemples|Exemplele|Exempel|Examples|Esempi|Enghreifftiau|Ekzemploj|Eksempler|Ejemplos|EXAMPLZ|Dữ liệu|Contoh|Cobber|Beispiele)(:)(.*)$'
    step_keywords = u'^(\\s*)(하지만|조건|먼저|만일|만약|단|그리고|그러면|那麼|那么|而且|當|当|前提|假設|假如|但是|但し|並且|もし|ならば|ただし|しかし|かつ|و |متى |لكن |عندما |ثم |بفرض |اذاً |כאשר |וגם |בהינתן |אזי |אז |אבל |Якщо |Унда |То |Припустимо, що |Припустимо |Онда |Но |Нехай |Лекин |Когато |Када |Кад |К тому же |И |Задато |Задати |Задате |Если |Допустим |Дадено |Ва |Бирок |Аммо |Али |Але |Агар |А |І |Și |És |Zatati |Zakładając |Zadato |Zadate |Zadano |Zadani |Zadan |Youse know when youse got |Youse know like when |Yna |Ya know how |Ya gotta |Y |Wun |Wtedy |When y\'all |When |Wenn |WEN |Và |Ve |Und |Un |Thì |Then y\'all |Then |Tapi |Tak |Tada |Tad |Så |Stel |Soit |Siis |Si |Sed |Se |Quando |Quand |Quan |Pryd |Pokud |Pokiaľ |Però |Pero |Pak |Oraz |Onda |Ond |Oletetaan |Og |Och |O zaman |Når |När |Niin |Nhưng |N |Mutta |Men |Mas |Maka |Majd |Mais |Maar |Ma |Lorsque |Lorsqu\'|Kun |Kuid |Kui |Khi |Keď |Ketika |Když |Kaj |Kai |Kada |Kad |Jeżeli |Ja |Ir |I CAN HAZ |I |Ha |Givun |Givet |Given y\'all |Given |Gitt |Gegeven |Gegeben sei |Fakat |Eğer ki |Etant donné |Et |Então |Entonces |Entao |En |Eeldades |E |Duota |Dun |Donitaĵo |Donat |Donada |Do |Diyelim ki |Dengan |Den youse gotta |De |Dato |Dar |Dann |Dan |Dado |Dacă |Daca |DEN |Când |Cuando |Cho |Cept |Cand |Cal |But y\'all |But |Buh |Biết |Bet |BUT |Atès |Atunci |Atesa |Anrhegedig a |Angenommen |And y\'all |And |An |Ama |Als |Alors |Allora |Ali |Aleshores |Ale |Akkor |Aber |AN |A také |A |\* )'
    tokens = {
        'comments': [
            (r'^\s*#.*$', Comment),
        ],
        'feature_elements': [
            (step_keywords, Keyword, "step_content_stack"),
            include('comments'),
            (r"(\s|.)", Name.Function),
        ],
        'feature_elements_on_stack': [
            (step_keywords, Keyword, "#pop:2"),
            include('comments'),
            (r"(\s|.)", Name.Function),
        ],
        'examples_table': [
            (r"\s+\|", Keyword, 'examples_table_header'),
            include('comments'),
            (r"(\s|.)", Name.Function),
        ],
        'examples_table_header': [
            (r"\s+\|\s*$", Keyword, "#pop:2"),
            include('comments'),
            (r"\\\|", Name.Variable),
            (r"\s*\|", Keyword),
            (r"[^|]", Name.Variable),
        ],
        'scenario_sections_on_stack': [
            (feature_element_keywords,
             bygroups(Name.Function, Keyword, Keyword, Name.Function),
             "feature_elements_on_stack"),
        ],
        'narrative': [
            include('scenario_sections_on_stack'),
            include('comments'),
            (r"(\s|.)", Name.Function),
        ],
        'table_vars': [
            (r'(<[^>]+>)', Name.Variable),
        ],
        'numbers': [
            (r'(\d+\.?\d*|\d*\.\d+)([eE][+-]?[0-9]+)?', String),
        ],
        'string': [
            include('table_vars'),
            (r'(\s|.)', String),
        ],
        'py_string': [
            (r'"""', Keyword, "#pop"),
            include('string'),
        ],
        'step_content_root': [
            (r"$", Keyword, "#pop"),
            include('step_content'),
        ],
        'step_content_stack': [
            (r"$", Keyword, "#pop:2"),
            include('step_content'),
        ],
        'step_content': [
            (r'"', Name.Function, "double_string"),
            include('table_vars'),
            include('numbers'),
            include('comments'),
            (r'(\s|.)', Name.Function),
        ],
        'table_content': [
            (r"\s+\|\s*$", Keyword, "#pop"),
            include('comments'),
            (r"\\\|", String),
            (r"\s*\|", Keyword),
            include('string'),
        ],
        'double_string': [
            (r'"', Name.Function, "#pop"),
            include('string'),
        ],
        'root': [
            (r'\n', Name.Function),
            include('comments'),
            (r'"""', Keyword, "py_string"),
            (r'\s+\|', Keyword, 'table_content'),
            (r'"', Name.Function, "double_string"),
            include('table_vars'),
            include('numbers'),
            (r'(\s*)(@[^@\r\n\t ]+)', bygroups(Name.Function, Name.Tag)),
            (step_keywords, bygroups(Name.Function, Keyword),
             'step_content_root'),
            (feature_keywords, bygroups(Keyword, Keyword, Name.Function),
             'narrative'),
            (feature_element_keywords,
             bygroups(Name.Function, Keyword, Keyword, Name.Function),
             'feature_elements'),
            (examples_keywords,
             bygroups(Name.Function, Keyword, Keyword, Name.Function),
             'examples_table'),
            (r'(\s|.)', Name.Function),
        ]
...
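
In this snippet, feature_keywords is a class-level regular expression that matches a localized Feature keyword, the colon, and the feature title; the root state pairs it with bygroups(Keyword, Keyword, Name.Function) and then switches to the narrative state. Here is a small illustrative sketch (not part of the lexer) of what the pattern captures, assuming Pygments is installed so the GherkinLexer class above is importable.

import re
from pygments.lexers import GherkinLexer

# Match a plain English feature line against the multilingual pattern
match = re.match(GherkinLexer.feature_keywords, "Feature: Withdraw cash from an ATM")
if match:
    keyword, colon, title = match.groups()
    print(keyword, '->', title.strip())  # Feature -> Withdraw cash from an ATM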


disaster_clf.py

Source: disaster_clf.py (GitHub)



import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from joblib import dump, load
import nltk
import re
from nltk.stem.wordnet import WordNetLemmatizer
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.corpus.reader import wordnet
from nltk.stem import LancasterStemmer, PorterStemmer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import MinMaxScaler
from scipy.sparse import csr_matrix
from sklearn.linear_model import LogisticRegression

# in train set, keyword has NaN (missing value) = 61 rows
# in train set, location has NaN (missing value) = 2533 rows
TRAINING_FILE_NAME = 'dataset/train.csv'
KEYWORD_OHE_PATH = './lib/keyword_ohe.lib'
TEXT_VECTORIZER_PATH = './lib/text_vectorizer.lib'
TRAINING_DATAFRAME_PATH = './lib/training_df.lib'
TRAINING_TARGET_PATH = './lib/training_target.lib'
TESTING_TARGET_PATH = './lib/testing_target.lib'
TESTING_DATAFRAME_PATH = './lib/target_df.lib'
KEYWORD_LBE_PATH = './lib/keyword_lbe.lib'
USE_LABEL_ENCODER = False
SAVE_MODEL = False
USE_LEMMATIZER = False
USE_LANCASTER_STEM = True
TEST_PREDICT_FILE = './dataset/test.csv'
SUBMISSION_FILE = './submission/disaster_clf.csv'
SAMPLE_SUBMISSION_FILE = './dataset/sample_submission.csv'

def download_nltk_package():
    nltk.download('averaged_perceptron_tagger')
    nltk.download('words')
    nltk.download('punkt')
    nltk.download('stopwords')
    nltk.download('wordnet')

def data_info(df):
    print('keyword features')
    print('-------------------------------------')
    feature_keywords = df['keyword'].value_counts()
    print(feature_keywords)
    print('######################################')
    print(f'there are {feature_keywords.count()} unique features')
    print('-------------------------------------')

    print()
    print('location features')
    print('-------------------------------------')
    feature_locations = df['location'].value_counts()
    print(feature_locations)
    print('######################################')
    print(f'there are {feature_locations.count()} unique features')
    print('-------------------------------------')

    print()
    print('label')
    print('-------------------------------------')
    feature_locations = df['target'].value_counts()
    print(feature_locations)

    print()

def read_csv(file_name):
    data = pd.read_csv(file_name)
    # create dataframe
    train_df = pd.DataFrame(data)
    return train_df

# find part of speech of word
def get_wordnet_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.VERB

def pre_process_text(df, use_lemmatizer, use_lancaster_stem):
    words = set(nltk.corpus.words.words())
    lemmatizer = WordNetLemmatizer()
    lancaster_stemmer = LancasterStemmer()
    porter_stemmer = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    texts = []
    for _, row in df.iterrows():
        text = row['text']
        # remove words that are not in the English corpus and transform them to lower case
        text = " ".join(w.lower() for w in nltk.wordpunct_tokenize(text) if w.lower() in words)
        # remove http tag
        text = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
                      '(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)

        # remove numbers
        text = re.sub(r'\d+', '', text)

        # remove punctuation marks
        text = text.translate(str.maketrans('', '', string.punctuation))

        # remove extra white space
        text = text.strip()

        # tokenize word (change to list of terms)
        text_tokenize = word_tokenize(text)

        # lemmatize (or stem, depending on the option) every word
        root_texts = []
        for word in text_tokenize:
            if use_lemmatizer:
                root_texts.append(lemmatizer.lemmatize(word, get_wordnet_pos(word)))
            elif use_lancaster_stem:
                root_texts.append(lancaster_stemmer.stem(word))
            else:
                root_texts.append(porter_stemmer.stem(word))
        # transform list to string
        text = " ".join(root_texts)
        texts.append(text)
    df['text'] = texts
    return df

def pre_processing(df, keyword_ohe_path, keyword_lbe_path, text_vectorizer_path,
                   df_path, save_model, use_label_encoder, use_lemmmatizer,
                   use_lancaster_stem, vectorizer_input):
    preprocess_df = df
    shape = preprocess_df.shape[1]
    if use_label_encoder:
        # encode the keyword column using label encoder
        encoder = LabelEncoder()
        preprocess_df['keyword'] = encoder.fit_transform(preprocess_df['keyword'])
        if save_model:
            dump(encoder, keyword_lbe_path)
    else:
        # encode the keyword column using one hot encoder
        encoder = OneHotEncoder()
        keyword_temp = np.array(preprocess_df['keyword']).reshape(-1, 1)
        keyword_encoder = encoder.fit_transform(keyword_temp).toarray()
        new_keyword = pd.DataFrame(keyword_encoder)
        # dump keyword encoder
        if save_model:
            dump(encoder, keyword_ohe_path)
        # concat encoded keyword back to the dataset
        preprocess_df = pd.concat([preprocess_df.reset_index(drop=True), new_keyword.reset_index(drop=True)], axis=1)
        preprocess_df = pd.DataFrame(preprocess_df)
        preprocess_df.rename(columns=dict(zip(preprocess_df.columns[shape:],
                             np.array(encoder.categories_).ravel())), inplace=True)
    # perform text cleaning
    preprocess_df = pre_process_text(preprocess_df, use_lemmmatizer, use_lancaster_stem)

    vectorizer = None
    if vectorizer_input is None:
        # vectorizer = TfidfVectorizer(stop_words='english', sublinear_tf=True)
        vectorizer = CountVectorizer(stop_words='english')
        text_vector = vectorizer.fit_transform(preprocess_df['text']).toarray()
    else:
        vectorizer = vectorizer_input
        text_vector = vectorizer_input.transform(preprocess_df['text']).toarray()
    # # Truncated svd to reduce dimensionality for sparse data
    # svd = TruncatedSVD(n_components=100, n_iter=10, random_state=42)
    # text_vector_tran = svd.fit_transform(text_vector)
    # new_text = pd.DataFrame(text_vector_tran)

    new_text = pd.DataFrame(text_vector)
    # dump text vectorizer
    if save_model:
        dump(vectorizer, text_vectorizer_path)
    # drop columns keyword and text
    if not use_label_encoder:
        preprocess_df = preprocess_df.drop(columns='keyword')
    preprocess_df = preprocess_df.drop(columns='text')
    shape_2 = preprocess_df.shape[1]
    # concat vector of text to the dataset
    preprocess_df = pd.concat([preprocess_df.reset_index(drop=True), new_text.reset_index(drop=True)], axis=1)
    preprocess_df.rename(columns=dict(zip(preprocess_df.columns[shape_2:],
                         vectorizer.get_feature_names())), inplace=True)

    # dump dataframe
    if save_model:
        dump(preprocess_df, df_path)
    return preprocess_df, vectorizer

# download_nltk_package()
df = read_csv(TRAINING_FILE_NAME)
X = df.drop(columns='location')
# drop all rows where column keyword is NaN
X = X.dropna()
y = X['target']
X = X.drop(columns=['id', 'target'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=True)
X_train, vectorizer = pre_processing(X_train, KEYWORD_OHE_PATH, KEYWORD_LBE_PATH, TEXT_VECTORIZER_PATH,
                                     TRAINING_DATAFRAME_PATH, SAVE_MODEL, USE_LABEL_ENCODER,
                                     USE_LEMMATIZER, USE_LANCASTER_STEM, None)
if SAVE_MODEL:
    dump(y_train, TRAINING_TARGET_PATH)
    dump(y_test, TESTING_TARGET_PATH)
X_test, _ = pre_processing(X_test, KEYWORD_OHE_PATH, KEYWORD_LBE_PATH, TEXT_VECTORIZER_PATH,
                           TESTING_DATAFRAME_PATH, SAVE_MODEL, USE_LABEL_ENCODER,
                           USE_LEMMATIZER, USE_LANCASTER_STEM, vectorizer)
# load model files, in case they are needed
# X_train = load(TRAINING_DATAFRAME_PATH)
# vectorizer = load(TEXT_VECTORIZER_PATH)
# X_test = load(TESTING_DATAFRAME_PATH)
# y_train = load(TRAINING_TARGET_PATH)
# y_test = load(TESTING_TARGET_PATH)
# perform scaling (to fix negative values when training MultinomialNB())
# scaler = MinMaxScaler()
# X_train = scaler.fit_transform(X_train)
# scaler = MinMaxScaler()
# X_test = scaler.fit_transform(X_test)
clf = BernoulliNB()
# clf = MultinomialNB()
# clf = GaussianNB()
# clf = RandomForestClassifier(n_jobs=3, n_estimators=500, verbose=True)
# clf = LinearSVC()
# clf = SVC(kernel='linear')
# clf = LogisticRegression()
clf.fit(X_train, y_train)
print(clf)
y_pred = clf.predict(X_test)
print(accuracy_score(y_pred, y_test))
print(classification_report(y_pred, y_test))
# prepare file for submission to Kaggle
test = read_csv(TEST_PREDICT_FILE)
sample_sub = read_csv(SAMPLE_SUBMISSION_FILE)
test = test.drop(columns=['location', 'id'])
test['keyword'] = test['keyword'].fillna('ablaze')
test, _ = pre_processing(test, KEYWORD_OHE_PATH, KEYWORD_LBE_PATH, TEXT_VECTORIZER_PATH,
                         TRAINING_DATAFRAME_PATH, SAVE_MODEL, USE_LABEL_ENCODER,
                         USE_LEMMATIZER, USE_LANCASTER_STEM, vectorizer)
sample_sub['target'] = clf.predict(test)
...
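
Here feature_keywords is simply the per-value count of the keyword column that data_info() prints for the Kaggle disaster-tweets training set. The toy illustration below shows the same value_counts() summary on a few invented rows rather than on dataset/train.csv.

import pandas as pd

# Invented sample rows standing in for the real training data
toy = pd.DataFrame({'keyword': ['ablaze', 'flood', 'ablaze', 'earthquake'],
                    'target': [1, 1, 0, 1]})
feature_keywords = toy['keyword'].value_counts()
print(feature_keywords)                                         # ablaze 2, flood 1, earthquake 1
print(f'there are {feature_keywords.count()} unique features')  # there are 3 unique features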


experiment_factory.py

Source: experiment_factory.py (GitHub)



1"""Module for running decoding experiments."""2from pathlib import Path3from typing import Optional, Sequence, Union4import numpy as np5import pandas as pd6from joblib import Parallel, delayed7from sklearn.model_selection import BaseCrossValidator8import pte_decode9def run_experiment(10 feature_root: Union[Path, str],11 feature_files: Union[12 Path, str, list[Path], list[str], list[Union[Path, str]]13 ],14 n_jobs: int = 1,15 **kwargs,16) -> list[Optional[pte_decode.Experiment]]:17 """Run prediction experiment with given number of files."""18 if not feature_files:19 raise ValueError("No feature files specified.")20 if not isinstance(feature_files, list):21 feature_files = [feature_files]22 if len(feature_files) == 1 or n_jobs in (0, 1):23 return [24 _run_single_experiment(25 feature_root=feature_root,26 feature_file=feature_file,27 **kwargs,28 )29 for feature_file in feature_files30 ]31 return [32 Parallel(n_jobs=n_jobs)(33 delayed(_run_single_experiment)(34 feature_root=feature_root, feature_file=feature_file, **kwargs35 )36 for feature_file in feature_files37 )38 ] # type: ignore39def _run_single_experiment(40 feature_root: Union[Path, str],41 feature_file: Union[Path, str],42 classifier: str,43 label_channels: Sequence[str],44 target_begin: Union[str, int, float],45 target_end: Union[str, int, float],46 optimize: bool,47 balancing: Optional[str],48 out_root: Union[Path, str],49 use_channels: str,50 feature_keywords: Sequence,51 cross_validation: BaseCrossValidator,52 plot_target_channels: list[str],53 scoring: str = "balanced_accuracy",54 artifact_channels=None,55 bad_epochs_path: Optional[Union[Path, str]] = None,56 pred_mode: str = "classify",57 pred_begin: Union[int, float] = -3.0,58 pred_end: Union[int, float] = 2.0,59 use_times: int = 1,60 dist_onset: Union[int, float] = 2.0,61 dist_end: Union[int, float] = 2.0,62 excep_dist_end: Union[int, float] = 0.5,63 exceptions=None,64 feature_importance=False,65 verbose: bool = True,66) -> Optional[pte_decode.Experiment]:67 """Run experiment with single file."""68 import pte # pylint: disable=import-outside-toplevel69 from py_neuromodulation import (70 nm_analysis,71 ) # pylint: disable=import-outside-toplevel72 print("Using file: ", feature_file)73 # Read features using py_neuromodulation74 nm_reader = nm_analysis.Feature_Reader(75 feature_dir=str(feature_root), feature_file=str(feature_file)76 )77 features = nm_reader.feature_arr78 settings = nm_reader.settings79 sidecar = nm_reader.sidecar80 # Pick label for classification81 try:82 label = _get_column_picks(83 column_picks=label_channels,84 features=features,85 )86 except ValueError as error:87 print(error, "Discarding file: {feature_file}")88 return None89 # Handle bad events file90 bad_epochs_df = pte.filetools.get_bad_epochs(91 bad_epochs_dir=bad_epochs_path, filename=feature_file92 )93 bad_epochs = bad_epochs_df.event_id.to_numpy() * 294 # Pick target for plotting predictions95 target_series = _get_column_picks(96 column_picks=plot_target_channels,97 features=features,98 )99 features_df = get_feature_df(features, feature_keywords, use_times)100 # Pick artifact channel101 if artifact_channels:102 artifacts = _get_column_picks(103 column_picks=artifact_channels,104 features=features,105 ).to_numpy()106 else:107 artifacts = None108 # Generate output file name109 out_path = _generate_outpath(110 out_root,111 feature_file,112 classifier,113 target_begin,114 target_end,115 use_channels,116 optimize,117 use_times,118 )119 dist_end = _handle_exception_files(120 fullpath=out_path,121 
dist_end=dist_end,122 excep_dist_end=excep_dist_end,123 exception_files=exceptions,124 )125 side = "right" if "R_" in str(out_path) else "left"126 decoder = pte_decode.get_decoder(127 classifier=classifier,128 scoring=scoring,129 balancing=balancing,130 optimize=optimize,131 )132 # Initialize Experiment instance133 experiment = pte_decode.Experiment(134 features=features_df,135 plotting_target=target_series,136 pred_label=label,137 ch_names=sidecar["ch_names"],138 decoder=decoder,139 side=side,140 artifacts=artifacts,141 bad_epochs=bad_epochs,142 sfreq=settings["sampling_rate_features"],143 scoring=scoring,144 feature_importance=feature_importance,145 target_begin=target_begin,146 target_end=target_end,147 dist_onset=dist_onset,148 dist_end=dist_end,149 use_channels=use_channels,150 pred_mode=pred_mode,151 pred_begin=pred_begin,152 pred_end=pred_end,153 cv_outer=cross_validation,154 verbose=verbose,155 )156 experiment.run()157 experiment.save_results(path=out_path)158 # experiment.fit_and_save(path=out_path)159 return experiment160def _handle_exception_files(161 fullpath: Union[Path, str],162 dist_end: Union[int, float],163 excep_dist_end: Union[int, float],164 exception_files: Optional[Sequence] = None,165):166 """Check if current file is listed in exception files."""167 if exception_files:168 if any(exc in str(fullpath) for exc in exception_files):169 print("Exception file recognized: ", Path(fullpath).name)170 return excep_dist_end171 return dist_end172def _generate_outpath(173 root: Union[Path, str],174 feature_file: Union[Path, str],175 classifier: str,176 target_begin: Union[str, int, float],177 target_end: Union[str, int, float],178 use_channels: str,179 optimize: bool,180 use_times: int,181) -> Path:182 """Generate file name for output files."""183 if target_begin == 0.0:184 target_begin = "trial_begin"185 if target_end == 0.0:186 target_end = "trial_begin"187 target_str = "_".join(("decode", str(target_begin), str(target_end)))188 clf_str = "_".join(("model", classifier))189 ch_str = "_".join(("chs", use_channels))190 opt_str = "yes_opt" if optimize else "no_opt"191 feat_str = "_".join(("feats", str(use_times * 100), "ms"))192 out_name = "_".join((target_str, clf_str, ch_str, opt_str, feat_str))193 return Path(root, out_name, feature_file, feature_file)194def get_feature_df(195 data: pd.DataFrame, feature_keywords: Sequence, use_times: int = 1196) -> pd.DataFrame:197 """Extract features to use from given DataFrame."""198 column_picks = [199 col200 for col in data.columns201 if any(pick in col for pick in feature_keywords)202 ]203 used_features = data[column_picks]204 # Initialize list of features to use205 features = [206 used_features.rename(207 columns={col: col + "_100_ms" for col in used_features.columns}208 )209 ]210 # Use additional features from previous time points211 # use_times = 1 means no features from previous time points are212 # being used213 for use_time in np.arange(1, use_times):214 features.append(215 used_features.shift(use_time, axis=0).rename(216 columns={217 col: col + "_" + str((use_time + 1) * 100) + "_ms"218 for col in used_features.columns219 }220 )221 )222 # Return final features dataframe223 return pd.concat(features, axis=1).fillna(0.0)224def _get_column_picks(225 column_picks: Sequence[str],226 features: pd.DataFrame,227) -> pd.Series:228 """Return first found column pick from features DataFrame."""229 for pick in column_picks:230 for col in features.columns:231 if pick.lower() in col.lower():232 return pd.Series(data=features[col], name=col)233 
raise ValueError(234 f"No valid column found. `column_picks` given: {column_picks}."...



Run Gherkin-python automation tests on LambdaTest cloud grid

Perform automation testing on 3000+ real desktop and mobile devices online.

Try LambdaTest Now!

Get 100 minutes of automation testing for free!
