BiblioParsingUtils.py
Source:BiblioParsingUtils.py  
__all__ = ['biblio_parser',
           'build_institutions_dic',
           'build_title_keywords',
           'check_and_drop_columns',
           'country_normalization',
           'extend_author_institutions',
           'getting_secondary_inst_list',
           'merge_database',
           'name_normalizer',
           'normalize_journal_names',
           'setting_secondary_inst_filter',
           'upgrade_col_names',
           ]

# Globals used from BiblioAnalysis_Utils.BiblioGeneralGlobals:  ALIAS_UK, CHANGE, COUNTRIES,
# Globals used from BiblioAnalysis_Utils.BiblioSpecificGlobals: BLACKLISTED_WORDS, COL_NAMES,
#                                                               DIC_INST_FILENAME, DIC_LOW_WORDS, DIC_OUTDIR_PARSING,
#                                                               INST_FILTER_LIST, REP_UTILS,
#                                                               NLTK_VALID_TAG_LIST, NOUN_MINIMUM_OCCURRENCES,
#                                                               RE_NUM_CONF, RE_YEAR_JOURNAL,
#                                                               SCOPUS, USECOLS_SCOPUS, WOS
# Functions used from BiblioAnalysis_Utils.BiblioGui: Select_multi_items
# Functions used from BiblioAnalysis_Utils.BiblioParsingScopus: biblio_parser_scopus
# Functions used from BiblioAnalysis_Utils.BiblioParsingWos: biblio_parser_wos

def build_title_keywords(df):
    
    '''Given the dataframe 'df' with one column 'Title':
    
                    Title
            0  Experimental and CFD investigation of inert be...
            1  Impact of Silicon/Graphite Composite Electrode...
            
    the function 'build_title_keywords':
    
       1- Builds the set "keywords_TK" of the tokens appearing at least NOUN_MINIMUM_OCCURRENCES times 
    in all the article titles of the corpus. The tokens are the words of the title with nltk tags 
    belonging to the global list 'NLTK_VALID_TAG_LIST'.
       2- Adds two columns 'title_token' and 'kept_tokens' to the dataframe 'df'. The column 'title_token' 
    contains the list of tokenized and lemmatized (using the nltk WordNetLemmatizer) title words. The column
    'kept_tokens' contains the list of words common to the set "keywords_TK" and to the column 'title_token'.
       3- Builds the list of tuples 'bag_of_words_occurrences'
    [(token_1, # occurrences of token_1), (token_2, # occurrences of token_2), ...] ordered by decreasing 
    number of occurrences.
       4- Removes the words belonging to BLACKLISTED_WORDS from the bag of words.
    
    Args:
       df (dataframe): pub_id | Title 
       
    Returns:
       df (dataframe): pub_id | title_token | kept_tokens where title_token is the list of tokens of the title
         and kept_tokens is the list of tokens with a frequency of occurrence >= NOUN_MINIMUM_OCCURRENCES
       bag_of_words_occurrences (list of tuples): [(word_1, # occurrences_1), (word_2, # occurrences_2), ...]
        
    '''
    # Standard library imports
    import operator
    from collections import Counter
       
    # 3rd party imports
    import nltk
    import numpy as np
    
    # Local imports
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import NLTK_VALID_TAG_LIST
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import NOUN_MINIMUM_OCCURRENCES
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import BLACKLISTED_WORDS
    
    def tokenizer(text):
        
        '''
        Tokenizes and lemmatizes the string 'text'. Only the words with nltk tags in the global
        NLTK_VALID_TAG_LIST are kept.
        
        ex: 'Thermal stability of Mg2Si0.55Sn0.45 for thermoelectric applications' 
        gives the list: ['thermal', 'stability', 'mg2si0.55sn0.45', 'thermoelectric', 'application']
        
        Args:
            text (string): string to tokenize
            
        Returns:
            The list valid_words_lemmatized 
        '''
            
        tokenized = nltk.word_tokenize(text.lower())
        valid_words = [word for (word, pos) in nltk.pos_tag(tokenized) 
                       if pos in NLTK_VALID_TAG_LIST] 
        stemmer = nltk.stem.WordNetLemmatizer()
        valid_words_lemmatized = [stemmer.lemmatize(valid_word) for valid_word in valid_words]
    
        return valid_words_lemmatized        

    df['title_token'] = df['Title'].apply(tokenizer)

    bag_of_words = np.array(df.title_token.sum())
    for remove in BLACKLISTED_WORDS:              # remove the blacklisted words from the bag of words
        bag_of_words = bag_of_words[bag_of_words != remove] 

    bag_of_words_occurrences = list(Counter(bag_of_words).items())
    bag_of_words_occurrences.sort(key=operator.itemgetter(1), reverse=True)

    keywords_TK = set([x for x, y in bag_of_words_occurrences if y >= NOUN_MINIMUM_OCCURRENCES])
    
    df['kept_tokens'] = df['title_token'].apply(lambda x: list(keywords_TK.intersection(set(x))))
   
    return df, bag_of_words_occurrences
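# Illustrative usage sketch (not part of the original module); assumes the required nltk
# resources ('punkt', 'averaged_perceptron_tagger', 'wordnet') and the package globals are available:
#
#     import pandas as pd
#     df_titles = pd.DataFrame({'Title': ['Thermal stability of Mg2Si for thermoelectric applications',
#                                         'Thermal behaviour of silicon composite electrodes']})
#     df_titles, bag = build_title_keywords(df_titles)
#     # df_titles now holds the 'title_token' and 'kept_tokens' columns;
#     # 'bag' is the list of (token, occurrences) tuples sorted by decreasing occurrences.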
def country_normalization(country):
    '''
    Normalizes the country name for consistency between the wos and scopus corpuses.
    '''
    # Local imports
    from BiblioAnalysis_Utils.BiblioGeneralGlobals import ALIAS_UK
    from BiblioAnalysis_Utils.BiblioGeneralGlobals import COUNTRIES
    
    country_clean = country
    if country not in COUNTRIES:
        if country in ALIAS_UK:
            country_clean = 'United Kingdom'
        elif 'USA' in country:
            country_clean = 'United States'
        elif ('china' in country) or ('China' in country):
            country_clean = 'China'
        elif country == 'Russia':    
            country_clean = 'Russian Federation'
        elif country == 'U Arab Emirates':    
            country_clean = 'United Arab Emirates'
        elif country == 'Vietnam':   
            country_clean = 'Viet Nam'
        else:
            country_clean = ''
    return country_clean
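# Illustrative usage sketch (not part of the original module); the returned values assume the
# raw names below are absent from the COUNTRIES global and that 'England' belongs to ALIAS_UK:
#
#     country_normalization('England')     # -> 'United Kingdom'
#     country_normalization('USA')         # -> 'United States'
#     country_normalization('Vietnam')     # -> 'Viet Nam'
#     country_normalization('Atlantis')    # -> '' (unrecognized country)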
def build_institutions_dic(rep_utils = None, dic_inst_filename = None):
    '''
    The `build_institutions_dic` function builds the dict 'inst_dic' 
    giving the normalized names of institutions from a csv file `dic_inst_filename`.
    The name of the csv file is set in the `DIC_INST_FILENAME` global.
    
    Args: 
        rep_utils (str): name of the folder where the csv file is stored
        dic_inst_filename (str): name of the csv file.        
    
    Returns:       
        `dict`: `inst_dic` as {raw_inst: norm_inst} where 
                - raw_inst is a raw institution name 
                - norm_inst is the normalized institution name.
        
    Note:
        The globals `REP_UTILS` and `DIC_INST_FILENAME` are used.
    
    '''
    # Standard library imports
    from pathlib import Path
    
    # 3rd party imports
    import pandas as pd
    
    # Local imports
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import DIC_INST_FILENAME
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import REP_UTILS    
    
    if dic_inst_filename == None: dic_inst_filename = DIC_INST_FILENAME
    if rep_utils == None: rep_utils = REP_UTILS 
    
    # Setting the file path for dic_inst_filename file reading    
    path_dic_inst = Path(__file__).parent / rep_utils / Path(dic_inst_filename)
    
    # Reading and cleaning the dic_inst_filename file
    inst_dic = pd.read_csv(path_dic_inst,sep=':',header=None,encoding='latin1')
    inst_dic.sort_values([0],inplace=True)
    inst_dic[0] = inst_dic[0].str.strip()
    inst_dic[1] = inst_dic[1].str.strip()
    inst_dic = dict(zip(inst_dic[0],inst_dic[1]))
    
    return inst_dic
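# Illustrative sketch of the expected csv content (not part of the original module): one
# colon-separated "raw institution : normalized institution" pair per line, e.g.
#
#     MIT : Massachusetts Institute of Technology
#     Grenoble INP : Institut polytechnique de Grenoble
#
# Typical call, falling back to the REP_UTILS and DIC_INST_FILENAME defaults:
#
#     inst_dic = build_institutions_dic()
#     inst_dic.get('MIT')    # -> 'Massachusetts Institute of Technology'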
def setting_secondary_inst_filter(out_dir_parsing):
    '''The `setting_secondary_inst_filter` function allows building the affiliation filter "inst_filter_list"
    from the institutions list of the corpus using the `Select_multi_items` GUI.
    
    Args:
        out_dir_parsing (path): the corpus parsing path for reading the "DIC_OUTDIR_PARSING['I2']" file.
        
    Returns:
        (list): list of tuples (institution, country) selected by the user.
        
    Notes:
        The globals 'COL_NAMES' and 'DIC_OUTDIR_PARSING' are used.
        The function `Select_multi_items` is used from the `BiblioAnalysis_Utils` package.
        
    '''
    
    # Standard library imports
    from pathlib import Path
    
    # 3rd party imports
    import numpy as np
    import pandas as pd
    
    # Local imports
    from BiblioAnalysis_Utils.BiblioGui import Select_multi_items
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import COL_NAMES
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import DIC_OUTDIR_PARSING
    
    institutions_alias = COL_NAMES['auth_inst'][4]
    country_alias = COL_NAMES['country'][2]    
    
    df_auth_inst = pd.read_csv(Path(out_dir_parsing) / Path(DIC_OUTDIR_PARSING['I2']),
                                sep = '\t')
    raw_institutions_list = []
    for auth_inst in df_auth_inst[institutions_alias]:
        raw_institutions_list.append(auth_inst)
        
    institutions_list = list(np.concatenate([raw_inst.split(';') for raw_inst in raw_institutions_list]))
    institutions_list = sorted(list(set(institutions_list)))

    country_institution_list = [x.split('_')[1] + ':' + x.split('_')[0] for x in institutions_list]
    country_institution_list = sorted(country_institution_list)

    selected_list = Select_multi_items(country_institution_list,
                                       mode='multiple',
                                       fact=2,
                                       win_widthmm=80,
                                       win_heightmm=100,
                                       font_size=16)

    inst_filter_list = [(x.split(':')[1].strip(),x.split(':')[0].strip()) for x in selected_list]
   
    return inst_filter_list

def merge_database(database,filename,in_dir,out_dir):
    
    '''Merges several databases into one database.
    
    Args:
        database (string): database type (scopus or wos)
        filename (str): name of the merged database
        in_dir (str): name of the folder where the databases are stored
        out_dir (str): name of the folder where the merged database will be stored
    
    Notes:
        The USECOLS_SCOPUS global is used.
        
    '''
    # Standard library imports
    import os
    from pathlib import Path
    import sys

    # 3rd party imports
    import pandas as pd
    
    # Local imports
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import SCOPUS
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import USECOLS_SCOPUS
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import WOS

    list_data_base = []
    list_df = []
    if database == WOS:
        for path, _, files in os.walk(in_dir):
            list_data_base.extend(Path(path) / Path(file) for file in files
                                                          if file.endswith(".txt"))
        for file in list_data_base:
            list_df.append(read_database_wos(file))

    elif database == SCOPUS:
        for path, _, files in os.walk(in_dir):
            list_data_base.extend(Path(path) / Path(file) for file in files
                                                          if file.endswith(".csv"))
        for file in list_data_base:
            df = pd.read_csv(file,usecols=USECOLS_SCOPUS) # reads the database
            list_df.append(df)
            
    else:
        raise Exception(f"Sorry, unrecognized database {database} : should be {WOS} or {SCOPUS} ")
        
    result = pd.concat(list_df,ignore_index=True)
    result.to_csv(out_dir / Path(filename),sep='\t')
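# Illustrative usage sketch (not part of the original module); the folder and file names are
# hypothetical, and read_database_wos is assumed to be available for the wos case:
#
#     from pathlib import Path
#     from BiblioAnalysis_Utils.BiblioSpecificGlobals import SCOPUS
#     merge_database(database=SCOPUS,
#                    filename='scopus_merged.csv',
#                    in_dir='scopus_rawdata',
#                    out_dir=Path('scopus_merged'))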
def name_normalizer(text):
    
    '''Normalizes the author name spelling according to the following debatable rules:
            - replacing non-ascii letters by ascii ones
            - capitalizing first name 
            - capitalizing surnames
            - suppressing comma and dot
            
       ex: name_normalizer(" GrÃÅ-biçà-vèLU D'aillön, E-kj. ")
        >>> "Grol-Bica-Velu D'Aillon E-KJ"
        
    Args:
        text (str): text to normalize
    
    Returns:
        The normalized text
        
    Notes:
        The CHANGE global is used.
    '''
    # Standard library imports
    import functools
    import re
    import unicodedata
    
    # Local imports
    from BiblioAnalysis_Utils.BiblioGeneralGlobals import CHANGE

    nfc = functools.partial(unicodedata.normalize,'NFD')
    
    text = text.translate(CHANGE) # Translate special characters using the global CHANGE dict
    text = nfc(text). \
           encode('ascii', 'ignore'). \
           decode('utf-8'). \
           strip()
    
    re_minus = re.compile('(-[a-zA-Z]+)')       # Captures: "cCc-cC-ccc-CCc"
    for text_minus_texts in re.findall(re_minus,text):
        text = text.replace(text_minus_texts,'-' + text_minus_texts[1:].capitalize() )
    
    re_apostrophe = re.compile("('[a-zA-Z]+)")  # Captures: "cCc'cC'ccc'cc'CCc"
    for text_minus_texts in re.findall(re_apostrophe,text):
        text = text.replace(text_minus_texts,"'" + text_minus_texts[1:].capitalize() )
        
    re_minus = re.compile('([a-zA-Z]+-)')       # Captures: "cCc-" 
    for text_minus_texts in re.findall(re_minus,text):
        text = text.replace(text_minus_texts,text_minus_texts[:-1].capitalize() + '-')
        
    re_apostrophe = re.compile("([a-zA-Z]+')")  # Captures: "cCc'"
    for text_minus_texts in re.findall(re_apostrophe,text):
        text = text.replace(text_minus_texts,text_minus_texts[:-1].capitalize() + "'")
        
    re_surname = r"[a-zA-Z]+\s"                 # Captures: "cCccC "
    for text_minus_texts in re.findall(re_surname,text):
        text = text.replace(text_minus_texts,text_minus_texts.capitalize())
        
    re_minus_first_name = r'\s[a-zA-Z]+-[a-zA-Z]+$'     # Captures: "cCc-cC" in the first name
    for x in re.findall(re_minus_first_name,text):
        text = text.replace(x,x.upper())
           
    return text
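# Illustrative usage sketch (not part of the original module), assuming the CHANGE translation
# table leaves plain ascii letters unchanged:
#
#     name_normalizer('jean-claude Dupont')   # -> 'Jean-Claude Dupont'
#     name_normalizer("o'brien K")            # -> "O'Brien K"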
def normalize_journal_names(database,df_corpus):
    '''The `normalize_journal_names` function normalizes the journal names in the journal-specific column 
    of the corpus dataframe by replacing the low words defined in the global 'DIC_LOW_WORDS' 
    and by dropping particular items using the regular expressions defined by the 'RE_NUM_CONF' 
    and 'RE_YEAR_JOURNAL' globals.
    
    Args:
        database (string): type of database among the ones defined by the SCOPUS and WOS globals.
        df_corpus (dataframe): corpus dataframe to be normalized in terms of journal names.
        
    Returns:
        (dataframe): the dataframe with normalized journal names.
        
    Note:
        The globals 'COLUMN_LABEL_WOS', 'COLUMN_LABEL_SCOPUS', 'DIC_LOW_WORDS', 'RE_NUM_CONF', 
        'RE_YEAR_JOURNAL', 'SCOPUS' and 'WOS' are used.
    
    '''
    # Standard library imports
    import re
    
    # Local imports
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import COLUMN_LABEL_WOS 
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import COLUMN_LABEL_SCOPUS
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import DIC_LOW_WORDS
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import RE_NUM_CONF
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import RE_YEAR_JOURNAL
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import SCOPUS
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import WOS
    
    def _normalize_low_words(text): 
        for low_word in DIC_LOW_WORDS.keys():
            text = text.replace(low_word, DIC_LOW_WORDS[low_word]).strip()
        text = " ".join(text.split())
        return text

    def _journal_normalizer(journal):
        journal = ' ' + journal + ' '
        journal = journal.lower()
        journal_list = [" " + x + " " for x in journal.split()]
        new_journal = " ".join(journal_list)
        if RE_YEAR_JOURNAL.findall(journal) or RE_NUM_CONF.findall(journal): 
            to_remove = [x for x in journal_list if (RE_YEAR_JOURNAL.findall(x) or RE_NUM_CONF.findall(x))]
            for x in to_remove: new_journal = new_journal.replace(x,'')
        new_journal = " ".join(new_journal.split())
        new_journal = _normalize_low_words(new_journal) 
        return new_journal
    
    if database == WOS:
        journal_alias = COLUMN_LABEL_WOS['journal']
    elif database == SCOPUS:
        journal_alias = COLUMN_LABEL_SCOPUS['journal']
    else:
        raise Exception(f"Sorry, unrecognized database {database}: should be {WOS} or {SCOPUS} ") 
    
    df_corpus[journal_alias] = df_corpus[journal_alias].apply(_journal_normalizer)
    
    return df_corpus

def biblio_parser(in_dir_parsing, out_dir_parsing, database, expert, rep_utils=None, inst_filter_list=None):
    
    '''Chooses the appropriate parser to parse wos or scopus databases.
    '''
    
    # Local imports
    from BiblioAnalysis_Utils.BiblioParsingScopus import biblio_parser_scopus
    from BiblioAnalysis_Utils.BiblioParsingWos import biblio_parser_wos
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import INST_FILTER_LIST
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import REP_UTILS
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import SCOPUS
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import WOS
    
    if database == WOS:
        biblio_parser_wos(in_dir_parsing, out_dir_parsing, inst_filter_list)
    elif database == SCOPUS:
        if rep_utils == None: rep_utils = REP_UTILS
        biblio_parser_scopus(in_dir_parsing, out_dir_parsing, rep_utils, inst_filter_list)
    else:
        raise Exception(f"Sorry, unrecognized database {database} : should be wos or scopus ")

def check_and_drop_columns(database,df,filename):
    # Local imports
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import COLUMN_LABEL_WOS 
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import COLUMN_LABEL_SCOPUS
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import SCOPUS
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import WOS

    # Check for missing mandatory columns
    if database == WOS:
        cols_mandatory = set([val for val in COLUMN_LABEL_WOS.values() if val])
    elif database == SCOPUS:
        cols_mandatory = set([val for val in COLUMN_LABEL_SCOPUS.values() if val])    
    else:
        raise Exception(f"Sorry, unrecognized database {database} : should be {WOS} or {SCOPUS} ")
        
    cols_available = set(df.columns)
    missing_columns = cols_mandatory.difference(cols_available)
    if missing_columns:
        raise Exception(f'The mandatory columns: {",".join(missing_columns)} are missing from {filename}\nplease correct before proceeding')
    
    # Columns selection and dataframe reformatting
    cols_to_drop = list(cols_available.difference(cols_mandatory))
    df.drop(cols_to_drop,
            axis=1,
            inplace=True)                    # Drops unused columns
    df.index = range(len(df))                # Sets the pub_id in df index
    
    return df
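# Minimal sketch of the mandatory-column check performed above, with hypothetical labels standing
# in for the COLUMN_LABEL_WOS / COLUMN_LABEL_SCOPUS values:
#
#     import pandas as pd
#     cols_mandatory = {'Authors', 'Title', 'Year'}
#     df = pd.DataFrame(columns=['Authors', 'Title', 'Year', 'Notes'])
#     missing_columns = cols_mandatory.difference(df.columns)                  # -> set(), nothing missing
#     df = df.drop(list(set(df.columns).difference(cols_mandatory)), axis=1)   # 'Notes' is dropped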
def upgrade_col_names(corpus_folder):
    
    '''Adds column names to the parsing and filter_<i> files to take into account the
    upgrade of BiblioAnalysis_Utils.
    
    Args:
        corpus_folder (str): folder of the corpus to be adapted
    '''
    # Standard library imports
    import os
    
    # 3rd party imports
    import colorama
    import pandas as pd
    from colorama import Back
    from colorama import Fore
    from colorama import Style
    from pandas.core.groupby.groupby import DataError
    
    # Local imports
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import COL_NAMES
    
    # Beware: the new file authorsinst.dat is not present in the old parsing folders
    dict_filename_conversion = {'addresses.dat':'address',
                                'articles.dat': 'articles',
                                'authors.dat':'authors',
                                'authorsinst.dat':'auth_inst',
                                'authorskeywords.dat':'keywords',
                                'countries.dat':'country',
                                'institutions.dat':'institution',
                                'journalkeywords.dat':'keywords',
                                'references.dat':'references',
                                'subjects.dat': 'subject',
                                'subjects2.dat':'sub_subject',
                                'titlekeywords.dat':'keywords'}

    for dirpath, dirs, files in os.walk(corpus_folder):  
        if ('parsing' in dirpath) | ('filter_' in dirpath):
            for file in [file for file in files
                          if (file.split('.')[1]=='dat') 
                          and (file!='database.dat')      # Not used: this file is no longer generated
                          and (file!='keywords.dat') ]:   # Not used: this file is no longer generated
                try:
                    df = pd.read_csv(os.path.join(dirpath,file),sep='\t',header=None)
                    
                    if df.loc[0].tolist() == COL_NAMES[dict_filename_conversion[file]]:
                        print(f'The file {os.path.join(dirpath,file)} is up to date')
                    else:
                        df.columns = COL_NAMES[dict_filename_conversion[file]]
                        df.to_csv(os.path.join(dirpath,file),sep='\t',index=False)
                        print(Fore.GREEN + f'*** The file {os.path.join(dirpath,file)} has been upgraded ***' + Style.RESET_ALL)
                except pd.errors.EmptyDataError:
                    df = pd.DataFrame(columns=COL_NAMES[dict_filename_conversion[file]])
                    df.to_csv(os.path.join(dirpath,file),sep='\t',index=False)
                    print(Fore.BLUE + f'*** The EMPTY file {os.path.join(dirpath,file)} has been upgraded ***' + Style.RESET_ALL)
                except:
                    print(Fore.WHITE + Back.RED + f'Warning: File {os.path.join(dirpath,file)} not recognized as a parsing file' + Style.RESET_ALL)
def extend_author_institutions(in_dir,inst_filter_list):
    '''The `extend_author_institutions` function extends the .dat file of authors with institutions, 
    initially obtained by the parsing of the corpus, with complementary information about the institutions
    selected by the user.
    
    Args:
        in_dir (path): path to the .dat file of authors with institutions
        inst_filter_list (list): the affiliation filter list of tuples (institution, country)

    Returns:
        None
        
    Notes:
        The globals 'COL_NAMES' and 'DIC_OUTDIR_PARSING' are used
        from the `BiblioAnalysis_Utils` package.
    
    '''
    
    # Standard library imports
    from pathlib import Path
    
    # 3rd party imports
    import pandas as pd
    
    # Local imports
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import COL_NAMES
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import DIC_OUTDIR_PARSING
    
    def _address_inst_list(inst_names_list,institutions):
        secondary_institutions = []
        for inst in inst_names_list:
            if inst in institutions:
                secondary_institutions.append(1)
            else:
                secondary_institutions.append(0)  
             
        return secondary_institutions
    
    institutions_alias = COL_NAMES['auth_inst'][4]
    sec_institutions_alias = COL_NAMES['auth_inst'][5]
    
    # Setting the key for the name of the '.dat' file of authors with institutions 
    # obtained by parsing the corpus
    item = 'I2' 
    
    # Reading the '.dat' file                   
    read_usecols = [COL_NAMES['auth_inst'][x] for x in [0,1,2,3,4]]     
    df_I2 = pd.read_csv(in_dir / Path(DIC_OUTDIR_PARSING[item]),
                        sep='\t',
                        usecols=read_usecols)
    
    # Setting an institution name for each of the institutions indicated in the institutions filter
    inst_names_list = [f'{x[0]}_{x[1]}' for x in inst_filter_list]   
    
    # Building the "sec_institutions_alias" column in the 'df_I2' dataframe using "inst_filter_list"
    df_I2[sec_institutions_alias] = df_I2.apply(lambda row:
                                             _address_inst_list(inst_names_list,row[institutions_alias]),
                                             axis = 1)

    # Distributing in a 'df_inst_split' df the value lists of the 'df_I2[sec_institutions_alias]' column  
    # into columns whose names are in the 'inst_names_list' list     
    df_inst_split = pd.DataFrame(df_I2[sec_institutions_alias].sort_index().to_list(),
                                          columns=inst_names_list)
    
    # Extending the 'df_I2' dataframe with the 'df_inst_split' dataframe
    df_I2 = pd.concat([df_I2, df_inst_split], axis=1)

    # Dropping the 'df_I2[sec_institutions_alias]' column which is no longer useful
    df_I2.drop([sec_institutions_alias], axis=1, inplace=True)
    
    # Saving the extended 'df_I2' dataframe in the same '.dat' file 
    df_I2.to_csv(in_dir / Path(DIC_OUTDIR_PARSING[item]), 
                 index=False,
                 sep='\t') 
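# Illustrative usage sketch (not part of the original module); the institution/country tuples
# and the parsing folder are hypothetical examples of the expected inst_filter_list format:
#
#     from pathlib import Path
#     inst_filter_list = [('LITEN', 'France'), ('INES', 'France')]
#     extend_author_institutions(Path('corpus/parsing'), inst_filter_list)
#     # one 0/1 column per 'institution_country' item is appended to the authors-with-institutions file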
def getting_secondary_inst_list(out_dir_parsing):
    '''The `getting_secondary_inst_list` function provides the list of institutions of the corpus.
   
    Args:
        out_dir_parsing (path): the corpus parsing path for reading the "DIC_OUTDIR_PARSING['I2']" file 
                                that lists the authors with their institutions for each article.
       
    Returns:
        (list): list of strings 'country:institution'
       
    Notes:
        The globals 'COL_NAMES' and 'DIC_OUTDIR_PARSING' are used.       
    '''
   
    # Standard library imports
    from pathlib import Path
   
    # 3rd party imports
    import numpy as np
    import pandas as pd
   
    # Local imports
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import COL_NAMES
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import DIC_OUTDIR_PARSING
   
    institutions_alias = COL_NAMES['auth_inst'][4]
    country_alias = COL_NAMES['country'][2]   
    
    df_auth_inst = pd.read_csv(Path(out_dir_parsing) / Path(DIC_OUTDIR_PARSING['I2']),
                                sep = '\t')
    raw_institutions_list = []
    for auth_inst in df_auth_inst[institutions_alias]:
        raw_institutions_list.append(auth_inst)
       
    institutions_list = list(np.concatenate([raw_inst.split(';') for raw_inst in raw_institutions_list]))
    institutions_list = sorted(list(set(institutions_list)))
 
    country_institution_list = [x.split('_')[1] + ':' + x.split('_')[0] for x in institutions_list]
    country_institution_list = sorted(country_institution_list)
   
    return country_institution_list

BiblioFilter.py
Source:BiblioFilter.py  
__all__ = ['filter_corpus_new',
           'filters_modification',
           'item_filter_modification',
           'item_values_list',
           'read_config_filters',]

# Functions used from BiblioAnalysis_Utils.BiblioGui: Select_multi_items, filter_item_selection
# Globals used from BiblioAnalysis_Utils.BiblioSpecificGlobals: DIC_OUTDIR_PARSING

def filter_corpus_new(in_dir, out_dir, verbose, file_config_filters):
    
    '''Filters the corpus parsing files stored in 'in_dir' according to the filtering
    configuration file 'file_config_filters' and saves the filtered files in 'out_dir'.
    '''
    # Reads the filtering parameters
    combine,exclusion,filter_param = read_config_filters(file_config_filters)
    
    # Builds the set of articles id to keep
    tokeep = _filter_pub_id(combine,exclusion,filter_param,in_dir)
    
    # Stores the filtered files 
    _save_filtered_files(tokeep,in_dir,out_dir)

def read_config_filters(file_config):
    """
    Parses the json configuration file to build the filtering configuration.
    
    Args:
        file_config (Path): absolute path of the configuration file
       
    Returns:
        combine (str): "intersection" or "union"; how the sets of kept pub_id are combined
        exclusion (bool): if True, the complementary set of the combined set is kept
        filter_param (dict): {key: list of keywords}
    """
    # Standard library imports
    import json
    from collections import defaultdict

    filter_param = defaultdict(list)

    with open(file_config, "r") as read_file:
        config_filter = json.load(read_file)

    combine = config_filter["COMBINE"]
    exclusion = config_filter["EXCLUSION"]

    for key, value in config_filter.items():
        if isinstance(value, dict):
            if value['mode']:
                filter_param[key] = value["list"]

    return combine,exclusion,filter_param
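# Illustrative sketch of a filtering configuration file (not part of the original module);
# each item entry carries a boolean "mode" (apply the filter or not) and a "list" of values:
#
#     {
#         "COMBINE": "intersection",
#         "EXCLUSION": false,
#         "CU": {"mode": true,  "list": ["France", "Italy"]},
#         "Y":  {"mode": false, "list": [2018, 2019, 2020]}
#     }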
def _filter_pub_id(combine,exclusion,filter_param,in_dir):
    '''<--------------------- modified AC
    This function finds the set of the identifiers (pub_id) of the publications
    that satisfy the sorting criteria.                

    Args:
       combine (string): "intersection" or "union"; defines how the sets of kept pub_id 
       per item key are combined
       
       exclusion (bool): if true the complementary set of the kept pub_id set
       resulting from the combination is returned

       filter_param (dict): {item key: [list of items to keep]}
           ex {"CU":["France","Italy"]}

    Returns:
        tokeep (set): set of kept publication ids
    '''
    # Standard library imports
    import os
    import re
    from pathlib import Path
    from string import Template

    # 3rd party imports
    import pandas as pd
    
    # Local imports
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import DIC_OUTDIR_PARSING
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import COL_NAMES
    
    filter_on = list(filter_param.keys()) # List of items to be filtered
    
    t = Template('$colname in @$item') # Template for the query

    keepid = {}

    # Builds keepid[Y]={Y}, keepid[J]={J}, keepid[DT]={DT}, keepid[LA]={LA}
    # where {Y}, {J}, {DT} and {LA} are the sets of pub_id of articles with
    # Year in filter_param["Y"], with Journal in filter_param["J"],
    # with doctype in filter_param["DT"] and with Language (LA) in filter_param["LA"]
    #----------------------------------------------------------------------------
   
    for idx, item in enumerate(set(filter_on) & set(["Y","J","DT","LA"])):
        if idx == 0: # On the first round we read the data
            df = pd.read_csv(in_dir / Path(DIC_OUTDIR_PARSING['A']),
                             sep='\t',
                             dtype={x : 'str' for x in COL_NAMES['articles'][1:]})
            
        if item == 'Y': # years selection
            year = [str(x) for x in filter_param['Y']]
            query = t.substitute({'colname':df.columns[2],
                                  'item':'year'})
            keepid[item] = set(df.query(query)[df.columns[0]]) 
            
        elif item == 'J': # journal selection
            journals = filter_param['J']
            query = t.substitute({'colname':df.columns[3],
                                  'item':'journals'})              
            keepid[item] = set(df.query(query)[df.columns[0]]) 
            
        elif item == 'DT': # document type selection
            doctypes = filter_param['DT']
            query = t.substitute({'colname':df.columns[7],
                                  'item':'doctypes'})            
            keepid[item] = set(df.query(query)[df.columns[0]]) 

        elif item == 'LA': # language selection
            languages = filter_param['LA']
            query = t.substitute({'colname':df.columns[8],
                                  'item':'languages'})           
            keepid[item] = set(df.query(query)[df.columns[0]]) 
            
    # Builds keepid[IK]={IK}, keepid[TK]={TK}, keepid[AK]={AK} 
    # where {IK}, {TK}, {AK} are the sets of pub_id of articles with
    # one keyword respectively in filter_param["IK"], filter_param["TK"], filter_param["AK"]
    # ---------------------------------------------------------------
    if "IK" in filter_on:
        df = pd.read_csv(in_dir / Path(DIC_OUTDIR_PARSING["IK"]), sep='\t')
        keywords = filter_param["IK"]
        query = t.substitute({'colname':df.columns[1],
                              'item':'keywords'})
        keepid['IK'] = set(df.query(query)[df.columns[0]])
        
    if "AK" in filter_on:
        df = pd.read_csv(in_dir / Path(DIC_OUTDIR_PARSING["AK"]), sep='\t')
        keywords = filter_param["AK"]
        query = t.substitute({'colname':df.columns[1],
                              'item':'keywords'})
        keepid['AK'] = set(df.query(query)[df.columns[0]])  

    if "TK" in filter_on:
        df = pd.read_csv(in_dir / Path(DIC_OUTDIR_PARSING["TK"]), sep='\t')
        keywords = filter_param["TK"]
        query = t.substitute({'colname':df.columns[1],
                              'item':'keywords'})
        keepid['TK'] = set(df.query(query)[df.columns[0]]) 
        
    # Builds keepid[AU]={AU} where {AU} is the set of pub_id 
    # of articles with at least one coauthor in the list filter_param["AU"]
    # ------------------------------------------------------------
    if "AU" in filter_on:
        df = pd.read_csv(in_dir / Path(DIC_OUTDIR_PARSING["AU"]), sep='\t')
        authors = filter_param["AU"]
        query = t.substitute({'colname':df.columns[2],
                              'item':'authors'})
        keepid['AU'] = set(df.query(query)[df.columns[0]]) 

    # Builds keepid[CU]={CU} where {CU} is the set of pub_id 
    # of articles with at least one coauthor country in the list filter_param["CU"]
    # ------------------------------------------------------------
    if "CU" in filter_on:
        df = pd.read_csv(in_dir / Path(DIC_OUTDIR_PARSING["CU"]), sep='\t')
        countries = filter_param["CU"]
        query = t.substitute({'colname':df.columns[2],
                              'item':'countries'})
        keepid["CU"] = set(df.query(query)[df.columns[0]]) 

    # Builds keepid[I]={I} where {I} is the set of pub_id 
    # of articles with at least one coauthor institution in the list filter_param["I"]
    # ------------------------------------------------------------
    if "I" in filter_on:
        df = pd.read_csv(in_dir / Path(DIC_OUTDIR_PARSING["I"]), sep='\t')
        institutions = filter_param["I"]
        query = t.substitute({'colname':df.columns[2],
                              'item':'institutions'})
        keepid["I"] = set(df.query(query)[df.columns[0]]) 

    # Builds keepid[S]={S} where {S} is the set of pub_id 
    # of articles with subjects in the list filter_param["S"]
    # ------------------------------------------------------------
    if "S" in filter_on:
        df = pd.read_csv(in_dir / Path(DIC_OUTDIR_PARSING["S"]), sep='\t')
        subjects = filter_param["S"]
        query = t.substitute({'colname':df.columns[1],
                              'item':'subjects'})
        keepid["S"] = set(df.query(query)[df.columns[0]]) 

    # Builds keepid[S2]={S2} where {S2} is the set of pub_id 
    # of articles with subsubjects in the list filter_param["S2"]
    # ------------------------------------------------------------
    if "S2" in filter_on:
        df = pd.read_csv(in_dir / Path(DIC_OUTDIR_PARSING["S2"]), sep='\t')
        subsubjects = filter_param["S2"]
        query = t.substitute({'colname':df.columns[1],
                              'item':'subsubjects'})       
        keepid["S2"] = set(df.query(query)[df.columns[0]]) 
    # Builds keepid[R]={R}, keepid[RJ]={RJ}
    # where {R} is the set of pub_id of articles with references 
    #     in the list filter_param["R"]
    # and {RJ} is the set of pub_id of articles with reference journals 
    #     in the list filter_param["RJ"]
    # ------------------------------------------------------------
    if ("R" in filter_on) or ("RJ" in filter_on):
        df = pd.read_csv(in_dir / Path(DIC_OUTDIR_PARSING["R"]), sep='\t').astype(str)
        
        if "R" in filter_on:
            find_0 = re.compile(r',\s?0')
            df['ref'] = df.apply(lambda row:re.sub(find_0,'', ', '.join(row[1:-1]))
                                 ,axis=1)
            references = filter_param["R"]
            query = t.substitute({'colname':'ref',        # query on the rebuilt 'ref' column
                                  'item':'references'})   # (assumed, mirroring the other filters)
            keepid["R"] = set(df.query(query)[df.columns[0]])  
            
        if "RJ" in filter_on:
            refsources = filter_param["RJ"]
            query = t.substitute({'colname':df.columns[3],
                                  'item':'refsources'})
            
            keepid["RJ"] = set(df.query(query)[df.columns[0]]) 
            
    # Combines the filtering conditions union / intersection / exclusion
    # -------------------------------------------------------------------
    tokeep = [value for value in keepid.values()] # list of kept id sets
    if combine == "intersection":
        tokeep = set.intersection(*tokeep)

    if combine == "union":
        tokeep = set.union(*tokeep)

    if exclusion: 
        df = pd.read_csv(in_dir / Path(DIC_OUTDIR_PARSING['A']),sep='\t')
        set_pub_id = set(df[df.columns[0]])    # set of all pub_id
        tokeep = set_pub_id.difference(tokeep)
        
    return tokeep
def _save_filtered_files(tokeep,in_dir,out_dir):
    
    '''Filters all the files with ".dat" extension located in the folder in_dir
    and saves the filtered files in the folder out_dir.
    The set "tokeep" contains the ids (pub_id) of the articles to be kept in the filtered corpus.
    
    Args:
        tokeep (set): set of pub_id to keep
        in_dir (Path): path of the folder containing the files to filter
        out_dir (Path): path of the folder where the filtered files are stored
    '''
    # Standard library imports
    import os
    from pathlib import Path
    from string import Template
    
    # 3rd party imports
    import pandas as pd
    
    # Local imports
    from BiblioAnalysis_Utils.BiblioSpecificGlobals import DIC_OUTDIR_PARSING
    
    t = Template('$colname in @tokeep') # Template for the query

    for file in DIC_OUTDIR_PARSING.values():
        df = pd.read_csv(in_dir / Path(file), sep='\t')
        query = t.substitute({'colname':df.columns[0]}) 
        df.query(query).to_csv(out_dir / Path(file), 
                               index=False,
                               sep="\t")
    
def item_filter_modification(item, item_values, filters_filename):
    '''
    Modifies the values list of an item in the json file of the filtering configuration
    used for corpus filtering.
    
    Args: 
        item (str): item acronym
        item_values (list): list of item values to be put in the json file 
        filters_filename (path): path of the json file 
        
    '''
    
    # Standard library imports
    import json

    with open(filters_filename, "r") as read_file:
        config_filter = json.load(read_file)
    
    config_filter[item]['list'] = item_values
    
    with open(filters_filename, "w") as write_file:
        jsonString = json.dumps(config_filter, indent=4)
        write_file.write(jsonString)
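# Illustrative usage sketch (not part of the original module); the file name is hypothetical:
#
#     item_filter_modification('CU', ['France', 'Germany'], 'filters.json')
#     # rewrites config_filter['CU']['list'] as ['France', 'Germany'] in filters.json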
configuration321    filter_item = filter_item_selection()322    # Setting the folders list for item_values selection list323    folders_list = [x[0] for x in os.walk(config_folder)][1:]324    folders_list = [os.path.split(x)[-1] for x in folders_list]325    folders_list.sort()326    # Selection of the folder of the item_values selection files327    print('Please select the folder of item_values selection file via the tk window')328    myfolder_name = Select_multi_items(folders_list,'single')[0]+'/'329    myfolder = config_folder / Path(myfolder_name)330    # Setting the list of item_values selection files to be put in the filters configuration file331    files_list = os.listdir(myfolder)332    files_list.sort()333    print('\nPlease select the item_values selection file via the tk window')334    myfile = Select_multi_items(files_list,'single')[0]+'/'335    item_values_file = myfolder / Path(myfile)336    item_values_list_select = item_values_list(item_values_file) ...Learn to execute automation testing from scratch with LambdaTest Learning Hub. Right from setting up the prerequisites to run your first automation test, to following best practices and diving deeper into advanced test scenarios. LambdaTest Learning Hubs compile a list of step-by-step guides to help you be proficient with different test automation frameworks i.e. Selenium, Cypress, TestNG etc.