Best Python code snippet using autotest_python
music_score_parser - 副本.py
Source:music_score_parser - 副本.py  
'''
Grammar of the music-score notation (translated from the original Chinese
module docstring):

score = measure*
measure = beat+ '|'
beat = tied_note+
tied_note = pitch [duration]
pitch = [accidental] note [octave_shift]
duration = duration_multiplier_mark+
accidental = '#' | 'b' | 'X' | 'bb' | '&'
note = [1-7-]
octave_shift = octave_up_shift | octave_down_shift
octave_up_shift = octave_up+
octave_up = '^'
octave_down_shift = octave_down+
octave_down = 'v'
'''
import ast
import re

# Self-description of the "rex" grammar language that RexParser parses.
# NOTE(review): this is deliberately NOT a raw string, so '\b' below is a
# backspace character and '\n' splits the NEWLINE rule over two lines.  That
# looks unintended, but the runtime value is kept exactly as in the original.
rex_language = '''
rex_language = line*
line = [idDefine] [comment_str] [NEWLINE]
idDefine = id '=' rex
basic_rex = plain_str | rex_str | id | group | multi_optional
multi = basic_rex | multi_bound | multi_star | multi_plus 
seq = multi+
choices = seq ('|' seq)*
rex = choices
multi_bound = basic_rex '{' num ',' [num] '}'
multi_star = basic_rex '*'
multi_plus = basic_rex '+'
multi_optional = '[' rex ']'
group = '(' rex ')'
str = rrex"'([^'\\]|\\.)*'" | rrex'"([^"\\]|\\.)*"'
plain_str = rrex'[r](?=[\'"])' str
rex_str = rrex'[r]rex(?=[\'"])' str
comment_str = rrex'#.*'
id = rrex'\b(?=[^0-9])\\w+(?=[^\'"])'
num = rrex'\b[1-9])\\d*(?=[^\'"])'
NEWLINE = '\n'
BEGIN = rrex'^'
END = rrex'$'
# _NOSPACES_ predefined
'''


class RexParser:
    """Parse token lists produced by RexTokener against the 'line' rule."""

    def __init__(self):
        """Build the pattern graph and store it by rule name in ``match_fs``."""

        def special(char):
            # One-character punctuation token carrying ``char`` as its data.
            return MatchForRexParserToken('spiecial_char', data=char)

        id_ = MatchForRexParserToken('id')
        num = MatchForRexParserToken('num')
        plain_str = MatchForRexParserToken('plain_str')
        rex_str = MatchForRexParserToken('rex_str')
        comment_str = MatchForRexParserToken('comment_str')
        assign = special('=')
        left_small_bracket = special('(')
        right_small_bracket = special(')')
        left_big_bracket = special('{')
        comma = special(',')
        right_big_bracket = special('}')
        star = special('*')
        plus = special('+')
        left_mid_bracket = special('[')
        right_mid_bracket = special(']')
        choice_char = special('|')

        # 'rex' is recursive: create it empty now, bind its target below.
        rex = MatchEq('rex', None)
        idDefine = MatchSequence('idDefine', [id_, assign, rex])
        line = MatchSequence('line', [MatchOptional('_', idDefine),
                                      MatchOptional('_', comment_str)])
        multi_optional = MatchSequence(
            'multi_optional', [left_mid_bracket, rex, right_mid_bracket])
        group = MatchSequence(
            'group', [left_small_bracket, rex, right_small_bracket])
        basic_rex = MatchChoices(
            'basic_rex', [plain_str, rex_str, id_, group, multi_optional])
        multi_bound = MatchSequence(
            'multi_bound',
            [basic_rex, left_big_bracket, num, comma,
             MatchOptional('_', num), right_big_bracket])
        multi_star = MatchSequence('multi_star', [basic_rex, star])
        multi_plus = MatchSequence('multi_plus', [basic_rex, plus])
        multi = MatchChoices(
            'multi', [basic_rex, multi_bound, multi_star, multi_plus])

        seq = MatchPlus('seq', multi)
        choices = MatchSequence(
            'choices',
            [seq, MatchStar('_', MatchSequence('_', [choice_char, seq]))])
        rex.matchPattern = choices

        # Explicit mapping instead of introspecting locals().
        self.match_fs = {
            'id': id_, 'num': num, 'plain_str': plain_str,
            'rex_str': rex_str, 'comment_str': comment_str,
            'assign': assign,
            'left_small_bracket': left_small_bracket,
            'right_small_bracket': right_small_bracket,
            'left_big_bracket': left_big_bracket,
            'comma': comma,
            'right_big_bracket': right_big_bracket,
            'star': star, 'plus': plus,
            'left_mid_bracket': left_mid_bracket,
            'right_mid_bracket': right_mid_bracket,
            'choice_char': choice_char,
            'rex': rex, 'idDefine': idDefine, 'line': line,
            'multi_optional': multi_optional, 'group': group,
            'basic_rex': basic_rex, 'multi_bound': multi_bound,
            'multi_star': multi_star, 'multi_plus': multi_plus,
            'multi': multi, 'seq': seq, 'choices': choices,
        }

    def idDefine2init_assign(self, idDefineResult):
        # NOTE(review): unfinished in the original.  It reads a ``name``
        # attribute that MatchResult in this listing never sets, calls
        # ``rexResult2MatchPatternConstruct`` which is not defined in the
        # visible code, and returns nothing.  Kept as-is for the author.
        assert idDefineResult.name == 'idDefine'
        assert len(idDefineResult.data) == 3
        id_token = idDefineResult.data[0]
        assert id_token.name == 'token'
        identifier = id_token.data
        assert type(identifier) == str
        rexResult = idDefineResult.data[-1]
        assignRightHand = self.rexResult2MatchPatternConstruct(rexResult)

        choicesResult = rexResult.data

    def parser(self, string):
        """Tokenize ``string`` line by line and match each line.

        Returns the list of per-line MatchResult objects.  Raises Exception
        when the 'line' rule does not match a line at all.
        """
        tokenizer = RexTokener()
        match_line = self.match_fs['line']
        results = []
        for i, tokens in enumerate(tokenizer.tokenize(string)):
            r = match_line(tokens, 0, len(tokens))
            if r is None:
                raise Exception('parser fail at {}th line'.format(i))
            results.append(r)
        # The original had a dangling, bodiless ``for`` here (a syntax
        # error); it is dropped because its intent is not recoverable.
        return results


class RexTokener:
    """Hand-written tokenizer for one line of the rex grammar language."""

    # Token type tags (the 'spiecial_char' spelling is part of the data).
    id = 'id'
    num = 'num'
    plain_str = 'plain_str'
    rex_str = 'rex_str'
    comment_str = 'comment_str'
    spaces = 'spaces'
    spiecial_char = 'spiecial_char'

    def __init__(self):
        self.special_chars = '#=(){,}*+[]|"\''
        self.plain_str_prefixes = ['r"', "r'", '"', "'"]
        self.rex_str_prefixes = ['rex"', 'rrex"', "rex'", "rrex'"]
        self.str_prefixes = self.plain_str_prefixes + self.rex_str_prefixes

    def isSpecialChar(self, c):
        return c in self.special_chars

    def getStrPrefixes(self):
        return self.str_prefixes

    def _is_boundary(self, c):
        # A token may only start after, and ends at, whitespace/punctuation.
        return c.isspace() or self.isSpecialChar(c)

    def matchID(self, string, start, end):
        """Match an identifier at ``start``; None if there is none."""
        if start != 0 and not self._is_boundary(string[start - 1]):
            return None
        if start == end:
            return None
        first = string[start]
        if self._is_boundary(first) or first.isdigit():
            return None
        # Anything that starts like a string literal is not an identifier.
        if string.startswith(tuple(self.getStrPrefixes()), start, end):
            return None

        stop = end
        for i in range(start, end):
            c = string[i]
            if self._is_boundary(c):
                if c in '\'"':
                    # word immediately followed by a quote: unknown prefix
                    return None
                stop = i
                break
        assert 0 <= start < stop <= end, (start, stop, end)
        return MatchResult(string, start, stop,
                           type=self.id, data=string[start:stop])

    def matchNum(self, string, start, end):
        """Match a run of digits at ``start``; data is the int value."""
        if start != 0 and not self._is_boundary(string[start - 1]):
            return None
        if start == end or not string[start].isdigit():
            return None

        stop = end
        for i in range(start, end):
            c = string[i]
            if not c.isdigit():
                if self._is_boundary(c):
                    stop = i
                    break
                return None  # digits glued to letters: not a number
        assert 0 <= start < stop <= end
        return MatchResult(string, start, stop,
                           type=self.num, data=int(string[start:stop]))

    def matchStr(self, string, start, end):
        """Match a (possibly prefixed) quoted string literal at ``start``.

        The prefix length decides the token type ('rex'/'rrex' prefixes give
        rex_str, the others plain_str) and whether the literal is raw.  The
        token data is the decoded Python string value.
        """
        if start != 0 and not self._is_boundary(string[start - 1]):
            return None
        for pre in self.getStrPrefixes():
            if string.startswith(pre, start, end):
                break
        else:
            return None
        close_char = pre[-1]
        name = self.rex_str if len(pre) >= 4 else self.plain_str
        is_raw_str = len(pre) in (2, 5)

        str_start = start + len(pre) - 1  # index of the opening quote
        escaping = False
        for i in range(str_start + 1, end):
            c = string[i]
            if escaping:
                # The escaped character is consumed.  The original never
                # cleared the flag, so one backslash hid the closing quote.
                escaping = False
            elif c == '\\':
                escaping = True
            elif c == close_char:
                break
        else:
            return None  # unterminated literal
        str_end = i + 1
        assert 0 <= str_start < str_end - 1 < str_end <= end
        literal = string[str_start:str_end]
        if is_raw_str:
            literal = 'r' + literal
        # The original used eval() on this text; literal_eval decodes the
        # same string literals without executing arbitrary expressions.
        value = ast.literal_eval(literal)
        return MatchResult(string, start, str_end, type=name, data=value)

    def matchSpecialCharExceptCommentChar(self, string, start, end):
        """Match one punctuation character other than '#'."""
        if start == end or string[start] == '#':
            return None
        c = string[start]
        if not self.isSpecialChar(c):
            return None
        return MatchResult(string, start, start + 1,
                           type=self.spiecial_char, data=c)

    def matchComment(self, string, start, end):
        """Match a '#' comment running to ``end``."""
        if start == end or string[start] != '#':
            return None
        return MatchResult(string, start, end,
                           type=self.comment_str, data=string[start:end])

    def matchSpaces(self, string, start, end):
        """Match a run of whitespace at ``start``."""
        if start == end or not string[start].isspace():
            return None
        stop = end
        for i in range(start, end):
            if not string[i].isspace():
                stop = i
                break
        return MatchResult(string, start, stop, type=self.spaces, data=None)

    def matchOneToken(self, string, start, end):
        """Return the first token any matcher accepts at ``start``, or None."""
        matchers = (self.matchID, self.matchNum, self.matchStr,
                    self.matchSpecialCharExceptCommentChar,
                    self.matchComment, self.matchSpaces)
        for matcher in matchers:
            r = matcher(string, start, end)
            if r is not None:
                return r
        return None

    def tokenize(self, string):
        """Yield one token list per '\\n'-separated line of ``string``."""
        for line in string.split('\n'):
            yield self.tokenizeLine(line)

    def tokenizeLine(self, line):
        """Tokenize one line; whitespace and comment tokens are dropped.

        Raises ValueError when some position cannot be tokenized.
        """
        line = line.strip()

        tokens = []
        start = 0
        end = len(line)
        while True:
            r = self.matchOneToken(line, start, end)
            if r is None:
                if start != end:
                    raise ValueError('tokenizeLine fail')
                break
            if r.start == r.end:
                raise Exception('LogicError')
            if r.type not in (self.spaces, self.comment_str):
                tokens.append(r)
            start = r.end
        return tokens


class MatchResult:
    """Half-open span ``[start, end)`` of ``string`` plus a type tag and data."""

    def __init__(self, string, start, end, type, data):
        assert 0 <= start <= end <= len(string)

        self.string = string
        self.start = start
        self.end = end
        self.type = type
        self.data = data

    def length(self):
        return self.end - self.start

    def __repr__(self):
        tpl = ('MatchResult(string={string}, start={start}, end={end}, '
               'type={type}, data={data})')
        return tpl.format(string=self.string,
                          start=self.start, end=self.end,
                          type=self.type, data=self.data)


def match_str(prefix, string, start, end):
    """Match the literal ``prefix`` at ``start`` within ``string[start:end]``."""
    if not string.startswith(prefix, start, end):
        return None
    return MatchResult(string, start, start + len(prefix),
                       type=match_str, data=prefix)


def match_rex(rex, string, start, end):
    """Match regex ``rex`` anchored at ``start`` within ``string[start:end]``.

    The original body referenced an undefined ``prefix`` (NameError) and
    reported offsets relative to the slice; offsets are now absolute.
    """
    m = re.match(rex, string[start:end])
    if m is None:
        return None
    return MatchResult(string, start, start + m.end(),
                       type=match_rex, data=rex)


class MatchPattern:
    """Base class: ``match`` calls the subclass ``_match`` and tags the result."""

    @staticmethod
    def default_factory():
        return MatchPattern('')

    def __init__(self, name):
        self.name = name

    def match(self, string, start, end):
        r = self._match(string, start, end)
        if r is not None:
            r.type = type(self)
        return r

    def __call__(self, string, start, end):
        return self.match(string, start, end)

    def __repr__(self):
        return '<{type}(name={name})>'.format(type=type(self), name=self.name)


class MatchEq(MatchPattern):
    """Alias for another pattern (used for the recursive 'rex' rule)."""

    @staticmethod
    def default_factory():
        return MatchEq('', None)

    def __init__(self, name, matchPattern):
        super().__init__(name)
        self.matchPattern = matchPattern

    def _match(self, string, start, end):
        return eq_match(self.matchPattern, string, start, end)


class MatchString(MatchPattern):
    """Match a fixed string."""

    @staticmethod
    def default_factory():
        return MatchString('', '')

    def __init__(self, name, string):
        super().__init__(name)
        self.string = string

    def _match(self, string, start, end):
        return match_str(self.string, string, start, end)


class MatchRex(MatchPattern):
    """Match a regular expression anchored at the start position."""

    @staticmethod
    def default_factory():
        return MatchRex('', '')

    def __init__(self, name, rex):
        super().__init__(name)
        self.rex = rex

    def _match(self, string, start, end):
        return match_rex(self.rex, string, start, end)


class MatchMulti(MatchPattern):
    """Match ``matchPattern`` between ``min`` and ``max`` times."""

    @staticmethod
    def default_factory():
        return MatchMulti('', None, 0)

    def __init__(self, name, matchPattern, min, max=float('inf')):
        super().__init__(name)

        assert 0 <= min < min + 1 <= max
        assert isinstance(min, int)

        self.matchPattern = matchPattern
        self.min = min
        self.max = max

    def _match(self, string, start, end):
        return repeat_match(self.matchPattern,
                            string, start, end, self.min, self.max)


class MatchSequence(MatchPattern):
    """Match every pattern of a list, one after another."""

    @staticmethod
    def default_factory():
        return MatchSequence('', [])

    def __init__(self, name, matchPatternList):
        super().__init__(name)
        self.matchPatternList = matchPatternList

    def _match(self, string, start, end):
        return sequence_match(self.matchPatternList, string, start, end)


class MatchChoices(MatchPattern):
    """Match the alternative that consumes the most input."""

    @staticmethod
    def default_factory():
        return MatchChoices('', [])

    def __init__(self, name, matchPatternList):
        super().__init__(name)
        self.matchPatternList = matchPatternList

    def _match(self, string, start, end):
        return choose_match(self.matchPatternList, string, start, end)


class MatchOptional(MatchMulti):
    """Zero or one occurrence."""

    @staticmethod
    def default_factory():
        return MatchOptional('', None)

    def __init__(self, name, matchPattern):
        super().__init__(name, matchPattern, min=0, max=1)


class MatchStar(MatchMulti):
    """Zero or more occurrences."""

    @staticmethod
    def default_factory():
        return MatchStar('', None)

    def __init__(self, name, matchPattern):
        super().__init__(name, matchPattern, min=0)


class MatchPlus(MatchMulti):
    """One or more occurrences."""

    @staticmethod
    def default_factory():
        return MatchPlus('', None)

    def __init__(self, name, matchPattern):
        super().__init__(name, matchPattern, min=1)


class MatchForRexParserToken(MatchPattern):
    """Match one token of a token list by type and, optionally, by data."""

    @staticmethod
    def default_factory():
        return MatchForRexParserToken('')

    def __init__(self, type, data=None):
        super().__init__('token')
        self.type = type
        self.data = data

    def _match(self, tokens, start, end):
        if start == end:
            return None
        token = tokens[start]
        if token.type != self.type:
            return None
        if self.data is not None and self.data != token.data:
            return None
        return MatchResult(tokens, start, start + 1, type=None, data=token)


def match(match_f, string, start, end):
    """Apply ``match_f``, which must be a MatchPattern."""
    assert isinstance(match_f, MatchPattern), (type(match_f), match_f)
    return match_f(string, start, end)


def eq_match(match_f, string, start, end):
    """Wrap the result of ``match_f`` in a result covering the same span."""
    r = match(match_f, string, start, end)
    if r is None:
        return None
    return MatchResult(r.string, r.start, r.end, type=eq_match, data=r)


def repeat_match(match_f, string, start, end, min, max=float('inf')):
    """Match ``match_f`` at least ``min`` and at most ``max`` times.

    Returns None when fewer than ``min`` repetitions match.  Raises
    MemoryError when an unbounded repetition matches the empty string.
    """
    assert min < max

    old_start = start
    ls = []
    for _ in range(min):
        r = match(match_f, string, start, end)
        if r is None:
            return None
        start = r.end
        ls.append(r)
    # '<', not '<=': the original accepted max + 1 repetitions.
    while len(ls) < max:
        r = match(match_f, string, start, end)
        if r is None:
            break
        if max == float('inf') and r.start == r.end:
            raise MemoryError('match empty string infinit times')

        start = r.end
        ls.append(r)

    return MatchResult(string, old_start, start, type=repeat_match, data=ls)


def sequence_match(match_fs, string, start, end):
    """Match all of ``match_fs`` consecutively; None if any one fails."""
    assert len(match_fs)
    old_start = start
    ls = []
    for match_f in match_fs:
        r = match(match_f, string, start, end)
        if r is None:
            return None
        start = r.end
        ls.append(r)

    return MatchResult(string, old_start, start, type=sequence_match, data=ls)


def choose_match(match_fs, string, start, end):
    """Try every alternative and keep the longest (first wins on ties)."""
    assert len(match_fs)

    candidates = []
    for match_f in match_fs:
        r = match(match_f, string, start, end)
        if r is not None:
            candidates.append(r)

    if not candidates:
        return None
    best = max(candidates, key=lambda r: r.length())

    return MatchResult(best.string, best.start, best.end,
                       type=choose_match, data=best)


p = RexParser()
r = p.parser(rex_language)
# ...article_cleaner.py  (truncation marker and next snippet title from the scraped page)
Source:article_cleaner.py  
import re
import time

import numpy as np
import pandas as pd

# One-letter -> three-letter amino acid codes.
amino_acid_dict = {
    'C': 'CYS', 'D': 'ASP', 'S': 'SER', 'Q': 'GLN', 'K': 'LYS',
    'I': 'ILE', 'P': 'PRO', 'T': 'THR', 'F': 'PHE', 'N': 'ASN',
    'G': 'GLY', 'H': 'HIS', 'L': 'LEU', 'R': 'ARG', 'W': 'TRP',
    'A': 'ALA', 'V': 'VAL', 'E': 'GLU', 'Y': 'TYR', 'M': 'MET',
}

_AA_POS_AA = re.compile(r"([a-z]{1,})(\d+)([a-z]{1,})")
# NOTE(review): the literal '?' required before ' fusion' is kept from the
# original pattern; confirm it is intended (an optional marker seems likelier).
_GENE_FUSION = re.compile(r"(\w+)\s?(\?|-)\s?(\w+)\? fusion")
_MUT_POS_DUP = re.compile(r"([a-z]{1,})(\d+)dup")

# (pattern, replacement) pairs applied in order by clean_text.
# NOTE(review): these are unanchored heuristics kept exactly as in the
# original; e.g. "[p|c]\." also removes "|." and sentence-final "p."/"c.".
_CLEANUP_STEPS = (
    (re.compile(r"[\.]{2,}"), "."),
    (re.compile(r"fig[s]?\."), ""),
    (re.compile(r"\d+\.\d+"), ""),
    (re.compile(r"et al\."), ""),
    (re.compile(r"i\.e\."), ""),
    (re.compile(r"inc\."), ""),
    (re.compile(r"[p|c]\."), ""),
)


def amino_three(amino):
    """Return three letter amino acid from one letter"""
    return amino_acid_dict[amino]


def decompose_variation(variation):
    """Split a variation such as "v600e" into [aa1, position, aa2].

    Both amino acids are returned as lower-case three letter codes.  Returns
    False when the variation does not have the single-letter / digits /
    single-letter shape, or when a letter is not a known amino acid code
    (the original raised KeyError in that case).
    """
    found = _AA_POS_AA.search(variation)
    if not found:
        return False
    aa1, position, aa2 = found.group(1), found.group(2), found.group(3)
    if len(aa1) + len(aa2) != 2:
        return False
    amino1 = amino_acid_dict.get(aa1.upper())
    amino2 = amino_acid_dict.get(aa2.upper())
    if amino1 is None or amino2 is None:
        return False
    return [amino1.lower(), position, amino2.lower()]


def decompose_fusion(variation):
    """Return the regex "(gene1|gene2)" for a "fusion" variation, else False."""
    found = _GENE_FUSION.search(variation)
    if not found:
        return False
    return "(" + found.group(1) + "|" + found.group(3) + ")"


def decompose_dup(variation):
    """Return "<letters><position>" for a "...dup" variation, else False."""
    found = _MUT_POS_DUP.search(variation)
    if not found:
        return False
    return found.group(1) + found.group(2)


def clean_text(article):
    """Lower-case the text and strip dots that do not end a sentence
    ("Fig.", "...", "i.e.", decimals) so the sentence regex can rely on "."."""
    clean_article = article.lower()
    for pattern, replacement in _CLEANUP_STEPS:
        clean_article = pattern.sub(replacement, clean_article)
    return clean_article


def join_tuple_string(strings_tuple):
    return ' '.join(strings_tuple)


def find_match(text, word):
    """Return every sentence matching ``word`` plus its neighbours.

    ``word`` is inserted verbatim into a regular expression, so callers
    passing literal text must escape it themselves.  The result is one
    string ("" when nothing matched).
    """
    clean = clean_text(text)
    word = word.lower()
    target_sentence = r"([^.]*{}[^.]*\.)".format(word)
    before_after_target = r"([^.]*\.){0,1}"
    match_exp = re.compile(
        before_after_target + target_sentence + before_after_target)
    match_text = match_exp.findall(clean)
    return "".join(map(join_tuple_string, match_text))


def _candidate_patterns(variation, gene):
    """Yield (regex, quality score) pairs in the order they must be tried."""
    if "hypermethylation" in variation:
        yield "methylat", 2
    if "casp" in variation:
        yield "casp", 1
    if "splice" in variation:
        yield "splice", 2
    if "fs" in variation:
        yield "frameshift", 2
    if "ampli" in variation:
        # was "(amplif|increse)", a typo that could never match "increase"
        yield "(amplif|increas)", 3
    if "dup" in variation:
        dup_key = decompose_dup(variation)
        if dup_key:
            yield dup_key, 2
        yield "dup", 3
    if "*" in variation:  # e.g. w802*
        yield re.escape(variation), 1
        if "fs" in variation:
            yield r"fs\*", 3
        yield "(stop|nonsense)", 2

    # The variation exactly as given.  It is data, not a regex, so it is
    # escaped; the original crashed or mis-matched on regex metacharacters.
    yield re.escape(variation), 1

    if "del" in variation or "ins" in variation:
        yield "(deletion|insertion|delet|insert)", 2
        yield r"(del|ins)(\w|\s){0,}(del|ins)", 3
    if "trunc" in variation:
        yield "trunc", 2
        yield "(shorte|delet)", 4
    if "fusion" in variation:
        fusion_genes = decompose_fusion(variation)
        if fusion_genes:
            yield fusion_genes, 2
        # at least the word "fusion" (poor quality)
        yield "fusion", 4

    aa_pos_aa = decompose_variation(variation)
    if aa_pos_aa and len(aa_pos_aa) == 3:
        amino1, position = aa_pos_aa[0], aa_pos_aa[1]
        yield re.escape(variation[:-1]), 2   # y371
        yield "".join(aa_pos_aa), 1          # tyr371ser
        yield amino1 + position, 2           # tyr371
        yield position, 4                    # 371
        yield "substitu", 3
        yield position[:-1] + "[0-9]", 5     # 370 - 379

    yield re.escape(gene), 5


def extract_match(line):
    """Pick the sentences of ``line["Text"]`` relevant to the variation.

    ``line`` must provide "Text", "Variation" (lower-case) and "Gene".
    Returns ``(text, score)``: texts shorter than 10000 characters are
    returned whole with score 6; otherwise the first pattern that matches
    decides the extract and its score, and the whole text with score 7 is
    returned when nothing matches.
    """
    text = line["Text"]
    variation = line["Variation"]
    gene = line["Gene"].lower()

    if len(text) < 10000:
        return text, 6

    # Two hand-tuned special cases; these return even an empty extract.
    if variation == "r1627":
        return find_match(text, "162[0-9]"), 4
    if variation == "c1385":
        return find_match(text, "p300"), 4

    for pattern, score in _candidate_patterns(variation, gene):
        found = find_match(text, pattern)
        if found:
            return found, score

    return text, 7


def prepare_datas(file_text, file_variant, file_out, is_training):
    """Read the text and variant files, extract matches, write ``file_out``.

    The output is "|||"-separated; the "Class" column is only written when
    ``is_training`` is true.
    """
    print("____________________________Cleaning Datas__________________________")
    print("____________________________________________________________________")
    start_time = time.perf_counter()

    text = pd.read_csv(file_text, sep=r'\|\|', engine='python')
    text.index.name = "ID"
    text.columns = ["Text"]
    variant = pd.read_csv(file_variant)
    variant.set_index("ID", inplace=True)

    concatenate_data = pd.merge(variant, text, on="ID").dropna()
    concatenate_data["Text"] = concatenate_data.apply(
        lambda line: clean_text(line["Text"]), axis=1)
    concatenate_data["Variation"] = variant["Variation"].apply(
        lambda line: line.lower())

    clean_match_data = concatenate_data.apply(
        lambda x: extract_match(x), axis=1)
    clean_match = pd.DataFrame(list(clean_match_data),
                               columns=["Text", "Score"],
                               index=clean_match_data.index)
    clean_match.index.name = "ID"

    new_data = pd.merge(concatenate_data, clean_match, on="ID")
    if is_training:
        columns = ["Gene", "Variation", "Class", "Text", "Score"]
        fmt = "%d|||%s|||%s|||%d|||%s|||%d"
    else:
        columns = ["Gene", "Variation", "Text", "Score"]
        fmt = "%d|||%s|||%s|||%s|||%d"
    # After the merge the extracted text is "Text_y" (the cleaned one is "Text_x").
    source_columns = ["Text_y" if name == "Text" else name for name in columns]
    final_data = new_data.loc[:, source_columns]
    final_data.columns = columns
    dtf = pd.merge(pd.DataFrame(final_data.index), final_data, on="ID")
    np.savetxt(file_out, dtf, fmt=fmt,
               header="|||".join(dtf.columns), comments='')

    stop_time = time.perf_counter()
    print("____________________________________________________________________")
    print("Cleaning datas finished in {} seconds".format(stop_time - start_time))


def main(is_training):
    """Run prepare_datas on the training or the test files under "datas/"."""
    if is_training:
        file_text = "datas/training_text"
        file_variant = "datas/training_variants"
        file_out = "datas/training_clean"
    else:
        file_text = "datas/test_text"
        file_variant = "datas/test_variants"
        file_out = "datas/test_clean"
    prepare_datas(file_text, file_variant, file_out, is_training)


if __name__ == "__main__":
    # NOTE(review): the body of this guard is cut off in the visible listing;
    # running the training configuration is an assumption -- confirm.
    main(is_training=True)
...MatchPattern.py
Source:MatchPattern.py  
import re
import abc
from abc import abstractmethod, ABCMeta
from MatchResult import MatchResult

__all__ = ['MatchChoices', 'MatchEq', 'MatchMulti', 'MatchOptional',
           'MatchPattern', 'MatchPlus', 'MatchResult', 'MatchRex',
           'MatchSequence', 'MatchStar', 'MatchString']


def match_str(prefix, string, start, end):
    """Match the literal ``prefix`` at ``start`` within ``string[start:end]``.

    Returns a MatchResult spanning the prefix, or None.
    """
    if not string.startswith(prefix, start, end):
        return None
    end = start + len(prefix)
    return MatchResult(string, start, end,
                       type=match_str, data=prefix, children=None)


def match_rex(rex, string, start, end):
    """Match regex ``rex`` anchored at ``start`` within ``string[start:end]``.

    Returns a MatchResult spanning the matched text, or None.
    """
    # re.match anchors at the slice start; the original used re.search and
    # then discarded any match not starting at 0, which scans needlessly.
    m = re.match(rex, string[start:end])
    if m is None:
        return None
    end = start + m.end()
    return MatchResult(string, start, end,
                       type=match_rex, data=rex, children=None)


# Default "follow" pattern: any run of whitespace except newlines.
skip_follow_chars_by_rex = r'((?=[^\n])\s)*'


class MatchPattern(metaclass=ABCMeta):
    """Abstract named pattern.

    Subclasses implement ``_match``; ``match`` tags the result with the
    pattern's class and name and records the text matched by ``follow``
    immediately after the result.
    """

    @staticmethod
    @abstractmethod
    def default_factory():
        raise NotImplementedError('default_factory @MatchPattern abstractmethod')

    @abstractmethod
    def _match(self, string, start, end):
        # Signature now mirrors how match() calls it and how every subclass
        # defines it (the original declared only ``self``).
        raise NotImplementedError('_match @MatchPattern abstractmethod')

    def __init__(self, *, name='', follow=skip_follow_chars_by_rex):
        self.name = name
        self.follow = follow

    def match(self, string, start, end):
        """Run ``_match``; return its annotated result, or None."""
        r = self._match(string, start, end)
        if r is not None:
            r.type = type(self)
            assert not r.name
            r.set_name(self.name)

            if self.follow:
                # Only text directly after the match counts as "follow".
                m = re.match(self.follow, r.string[r.end:end])
                if m:
                    assert not r.follow
                    r.append_follow(r.string[r.end:r.end + m.end()])
        return r

    def __call__(self, string, start, end):
        return self.match(string, start, end)

    def __repr__(self):
        return '<{type}(name={name})>'.format(type=type(self), name=self.name)


class MatchEq(MatchPattern):
    """Pattern that delegates to another pattern through ``eq_match``."""

    @staticmethod
    def default_factory():
        return MatchEq(None)

    def __init__(self, matchPattern, *, name='',
                 follow=skip_follow_chars_by_rex):
        super().__init__(name=name, follow=follow)
        self.matchPattern = matchPattern

    def _match(self, string, start, end):
        return eq_match(self.matchPattern, string, start, end)


class MatchString(MatchPattern):
    """Pattern matching a fixed string."""

    @staticmethod
    def default_factory():
        return MatchString('')

    def __init__(self, string, *, name='', follow=skip_follow_chars_by_rex):
        super().__init__(name=name, follow=follow)
        self.string = string

    def _match(self, string, start, end):
        return match_str(self.string, string, start, end)


class MatchRex(MatchPattern):
    """Pattern matching a regular expression at the start position."""

    @staticmethod
    def default_factory():
        return MatchRex('')

    def __init__(self, rex, *, name='', follow=skip_follow_chars_by_rex):
        super().__init__(name=name, follow=follow)
        self.rex = rex

    def _match(self, string, start, end):
        return match_rex(self.rex, string, start, end)


class MatchMulti(MatchPattern):
    """Pattern repeating ``matchPattern`` between ``min`` and ``max`` times
    (delegates to ``repeat_match``)."""

    @staticmethod
    def default_factory():
        return MatchMulti(None, 0)

    def __init__(self, matchPattern, min, max=float('inf'), *, name='',
                 follow=skip_follow_chars_by_rex):
        super().__init__(name=name, follow=follow)

        assert 0 <= min < min + 1 <= max
        assert isinstance(min, int)

        self.matchPattern = matchPattern
        self.min = min
        self.max = max

    def _match(self, string, start, end):
        return repeat_match(self.matchPattern,
                            string, start, end, self.min, self.max)


class MatchSequence(MatchPattern):
    """Pattern built from a list of patterns (delegates to ``sequence_match``)."""

    @staticmethod
    def default_factory():
        return MatchSequence([])

    def __init__(self, matchPatternList, *, name='',
                 follow=skip_follow_chars_by_rex):
        super().__init__(name=name, follow=follow)
        self.matchPatternList = matchPatternList

    def _match(self, string, start, end):
        return sequence_match(self.matchPatternList, string, start, end)


class MatchChoices(MatchPattern):
    """Pattern built from alternatives (delegates to ``choose_match``)."""

    @staticmethod
    def default_factory():
        return MatchChoices([])

    def __init__(self, matchPatternList, *, name='',
                 follow=skip_follow_chars_by_rex):
        super().__init__(name=name, follow=follow)
        self.matchPatternList = matchPatternList

    def _match(self, string, start, end):
        return choose_match(self.matchPatternList, string, start, end)


class MatchOptional(MatchMulti):
    """MatchMulti fixed to ``min=0, max=1``."""

    @staticmethod
    def default_factory():
        return MatchOptional([])

    def __init__(self, matchPattern, *, name='',
                 follow=skip_follow_chars_by_rex):
        super().__init__(matchPattern, min=0, max=1, name=name, follow=follow)
class MatchStar(MatchMulti):
    """Repetition pattern with ``min=0``; ``max`` stays at the base-class default."""

    @staticmethod
    def default_factory():
        """Return a placeholder ``MatchStar`` whose inner pattern is an empty list."""
        return MatchStar([])

    def __init__(self, matchPattern, *, name='', follow=skip_follow_chars_by_rex):
        """Wrap ``matchPattern`` so that it may repeat zero or more times."""
        super().__init__(matchPattern, min=0, name=name, follow=follow)


class MatchPlus(MatchMulti):
    """Repetition pattern with ``min=1``; ``max`` stays at the base-class default."""

    @staticmethod
    def default_factory():
        """Return a placeholder ``MatchPlus`` whose inner pattern is an empty list."""
        return MatchPlus([])

    def __init__(self, matchPattern, *, name='', follow=skip_follow_chars_by_rex):
        """Wrap ``matchPattern`` so that it must occur at least once."""
        super().__init__(matchPattern, min=1, name=name, follow=follow)


def match(match_f, string, start, end):
    """Apply the pattern object ``match_f`` to ``string[start:end]``.

    ``match_f`` must be a ``MatchPattern`` instance; it is invoked as
    ``match_f(string, start, end)`` and its result is returned unchanged.

    Raises:
        AssertionError: if ``match_f`` is not a ``MatchPattern``.
    """
    # The offending object is reported in the assertion message instead of
    # being printed separately just before the assertion fires.
    assert isinstance(match_f, MatchPattern), \
        'expected a MatchPattern, got {!r}: {!r}'.format(type(match_f), match_f)
    return match_f(string, start, end)


def eq_match(match_f, string, start, end):
    """Match ``match_f`` and wrap its result in a single-child ``MatchResult``.

    Returns ``None`` when the inner pattern does not match. Otherwise the
    wrapper spans ``r.start`` to ``r.org_end`` of the inner result ``r`` and
    takes over the inner result's ``follow`` text via ``append_follow``.
    """
    inner = match(match_f, string, start, end)
    if inner is None:
        return None
    wrapper = MatchResult(string, inner.start, inner.org_end,
                          type=eq_match, data=None, children=[inner])
    wrapper.append_follow(inner.follow)
    return wrapper


def repeat_match(match_f, string, start, end, min, max=float('inf')):
    """Match ``match_f`` repeatedly, at least ``min`` and at most ``max`` times.

    Each repetition starts where the previous one ended (``r.end``).

    Returns:
        ``None`` if fewer than ``min`` repetitions match; otherwise a
        ``MatchResult`` from the original ``start`` to the end of the last
        repetition, with the individual results as ``children``.

    Raises:
        AssertionError: if ``min`` is not strictly less than ``max``.
        MemoryError: if ``max`` is unbounded and an optional repetition
            matches the empty string (the loop would never advance).
    """
    assert min < max

    old_start = start
    results = []

    # Mandatory repetitions: any failure fails the whole match.
    for _ in range(min):
        r = match(match_f, string, start, end)
        if r is None:
            return None
        start = r.end
        results.append(r)

    # Optional repetitions. The bound is strict: with ``<=`` the loop would
    # accept ``max + 1`` repetitions (e.g. an optional pattern matching twice).
    while len(results) < max:
        r = match(match_f, string, start, end)
        if r is None:
            break
        if max == float('inf') and r.start == r.end:
            # Diagnostics go into the exception message. ``getattr`` is used
            # because not every pattern object is guaranteed to expose the
            # attributes being reported.
            raise MemoryError(
                'match empty string infinit times: name={!r}, type={!r}, '
                'max={!r}, input={!r}'.format(
                    getattr(match_f, 'name', None), type(match_f),
                    getattr(match_f, 'max', None), string[start:end]))

        start = r.end
        results.append(r)

    return MatchResult(string, old_start, start,
                       type=repeat_match, data=None, children=results)


def sequence_match(match_fs, string, start, end):
    """Match every pattern in ``match_fs`` one after another.

    Each pattern starts where the previous result ended. Returns ``None`` as
    soon as one pattern fails; otherwise a ``MatchResult`` spanning the whole
    sequence with one child per pattern.

    Raises:
        AssertionError: if ``match_fs`` is empty.
    """
    assert len(match_fs)

    old_start = start
    results = []
    for match_f in match_fs:
        r = match(match_f, string, start, end)
        if r is None:
            return None
        start = r.end
        results.append(r)

    return MatchResult(string, old_start, start,
                       type=sequence_match, data=None, children=results)


def choose_match(match_fs, string, start, end):
    """Try every alternative in ``match_fs`` at ``start`` and keep the longest.

    All alternatives are attempted. When more than one matches, a warning
    listing the names of the matching patterns is printed and the result with
    the greatest ``length()`` wins (the earliest alternative on ties).

    Returns:
        ``None`` if no alternative matches; otherwise a ``MatchResult`` whose
        ``data`` is the index of the chosen alternative and whose ``children``
        list has ``None`` in every slot except that index.

    Raises:
        AssertionError: if ``match_fs`` is empty.
    """
    assert len(match_fs)

    candidates = []
    for i, match_f in enumerate(match_fs):
        r = match(match_f, string, start, end)
        if r is not None:
            candidates.append((i, r))

    if not candidates:
        return None
    if len(candidates) > 1:
        print('Warning: choose lengthest one from matchs')
        for i, _ in candidates:
            print('\t', match_fs[i].name)

    # ``max`` here is the builtin; it returns the first maximal candidate.
    i, r = max(candidates, key=lambda pair: pair[-1].length())

    children = [None] * len(match_fs)
    children[i] = r
    return MatchResult(r.string, r.start, r.end,
                       type=choose_match, data=i, children=children)


# Non-code page text that followed the code on the original line, kept as a comment:
# ...Learn to execute automation testing from scratch with LambdaTest Learning Hub.
# Right from setting up the prerequisites to run your first automation test, to
# following best practices and diving deeper into advanced test scenarios.
# LambdaTest Learning Hubs compile a list of step-by-step guides to help you be
# proficient with different test automation frameworks i.e. Selenium, Cypress,
# TestNG etc.
You can also refer to the video tutorials on the LambdaTest YouTube channel for step-by-step demonstrations from industry experts.
Get 100 minutes of automation testing FREE!!
