How to use clean_markup method in yandex-tank

Best Python code snippets using yandex-tank, taken from open-source projects on GitHub.

table.py

Source: table.py (GitHub)

# NOTE: this module targets Python 2 (reload/setdefaultencoding, dict.iteritems, str.decode).
import json
import re
import numpy as np
import sys
reload(sys)
sys.setdefaultencoding('utf8')

class WikiTable:
    def __init__(self):
        self.table_rows = []
        self.columns = []
        self.table_caption = ''
        self.entity = ''
        self.section = ''
        self.label = 'NA'
        self.table_html = ''
        self.table_json = ''
        self.table_markup = ''
        self.label_confidence = 0.0
        self.table_id = -1
        self.column_meta_data = dict()

    def load_data(self, line):
        data_cells = line.split('\t')
        self.entity = data_cells[0]
        self.section = data_cells[1]
        self.table_id = int(data_cells[2])
        self.label = data_cells[3]
        self.label_confidence = float(data_cells[4])
        self.table_html = data_cells[5].strip()
        self.table_json = data_cells[6].strip()
        self.table_markup = data_cells[7].strip()
        self.parse_table_data()

    def load_json(self, json_line, entity, section, table_id, col_meta_parse=False):
        self.table_id = table_id
        self.entity = entity
        self.table_json = json_line
        self.section = section
        self.parse_table_data(col_meta_parse)

    '''
    Process the table HTML into the different sub-parts: (1) table caption, (2) table header (columns), (3) table cells
    '''
    def parse_table_data(self, col_meta_parse=False):
        # first check if the table has a caption
        tbljson = json.loads(self.table_json)
        self.table_caption = tbljson['caption']
        # get the table header and process it into the different rows and columns
        rows = tbljson['rows']
        for row in rows:
            row_values = row['values']
            row_cell_dict = dict()
            for cell in row_values:
                colname = cell['column']
                if colname not in self.columns:
                    self.columns.append(colname)
                row_cell_dict[colname] = cell['value']
            self.table_rows.append(row_cell_dict)
        # load the column meta data
        if col_meta_parse:
            header = tbljson['header']
            header = header[len(header) - 1]['columns']
            for col in header:
                colname = col['name']
                values = col['value_dist']
                self.column_meta_data[colname] = []
                for value in values:
                    self.column_meta_data[colname].append(value['value'])

    '''
    Here we compute simple features w.r.t. the extracted table data from the Wikipedia markup.

    We will look into several features here. We check whether certain values from the markup are
    represented as columns or values in our parsed table data. In this way we may find missing
    columns or values.
    '''
    def compute_features(self, bin_buckets=10):
        lines = [line.strip() for line in self.table_markup.decode('string_escape').split('\n')]
        # keep a dictionary of features here
        features = dict()
        # instead of keeping the line indexes we bucket them into 10 bins
        # so that all tables have the same representation space
        bins = np.linspace(1, len(lines), bin_buckets)
        for idx, line in enumerate(lines):
            idx = np.digitize(idx, bins).item(0) - 1
            # check first if the line is a table definition or if it contains the table caption
            if re.match('{\|\s?class=', line) or line.startswith('|+') or len(line) == 0:
                continue
            '''
            otherwise we assume that here we are dealing either with the table header data or the cell values;
            therefore, we replace all ! or | characters, which are used to delimit the columns
            or cells in Wiki tables
            '''
            tokens = re.sub(r'!+|\|+|-', '\t', line).strip().split('\t')
            for token in tokens:
                if len(token) == 0:
                    continue
                has_token = False
                # check first if this token might be a column name
                if token in self.table_caption or token.startswith(
                        ('colspan', 'rowspan', 'bgcolor', 'style', 'class')):
                    continue
                # check here if the token is any of the values in our table cell data
                elif any((True for x in self.table_rows if token in x)):
                    has_token = True
                # add these missing tokens and their frequency for the respective lines
                if idx not in features.keys():
                    features[idx] = {}
                if token not in features[idx]:
                    features[idx][token] = [0, 0]
                if has_token:
                    features[idx][token][0] += 1
                else:
                    features[idx][token][1] += 1
        # we will aggregate for each row or feature ID the amount of tokens which are covered or uncovered
        aggr_features = {}
        bins = np.linspace(0, 1, bin_buckets)
        for idx in features:
            for token in features[idx]:
                total = sum(features[idx][token])
                covered = features[idx][token][0] / float(total)
                bin_val = np.digitize(covered, bins).item(0) - 1
                key_val = str(idx) + '-' + str(bin_val)
                if key_val not in aggr_features:
                    aggr_features[key_val] = 0
                aggr_features[key_val] += 1
        aggr_features['jacc'] = self.compute_html_markup_sim()
        aggr_features.update(self.column_value_dist(bins=bin_buckets))
        aggr_features['kl'] = self.compute_html_markup_kl()
        aggr_features['num_cols'] = len(self.columns)
        aggr_features['markup_double_exlamanation'] = self.table_markup.count('!!')
        aggr_features['markup_single_exlamanation'] = self.table_markup.count('!')
        return aggr_features

    '''
    Return the word distribution from the columns in this table
    '''
    def column_word_dist(self):
        column_features = {}
        columns = json.loads(self.table_json)['header'][len(json.loads(self.table_json)['header']) - 1]['columns']
        for idx, col in enumerate(columns):
            col_values = col['value_dist']
            for val in col_values:
                value = val['value'].encode('ascii', 'ignore').decode('ascii').decode('unicode-escape')
                wordlist = value.lower().split(' ')
                d = {v: wordlist.count(v) for v in wordlist}
                column_features.update({k: d.get(k, 0) + column_features.get(k, 0) for k in set(d.keys())})
        column_features = {k: v for k, v in column_features.iteritems() if v < 3}
        return column_features

    '''
    Compute features that are related w.r.t. the distribution of column values.
    '''
    def column_value_dist(self, bins=10):
        column_features = {}
        # check the distribution of the column values
        columns = json.loads(self.table_json)['header'][len(json.loads(self.table_json)['header']) - 1]['columns']
        bin_buckets = np.linspace(1, len(columns), bins)
        for idx, col in enumerate(columns):
            col_values = col['value_dist']
            numbers, letters, other = 0, 0, 0
            for val in col_values:
                value = val['value']
                value = value.replace(' ', '').replace('"', '').replace('&', '')
                count = val['count']
                if value.isalpha():
                    letters += count
                elif value.isdigit():
                    numbers += count
                else:
                    other += count
            idx_key = str(np.digitize(idx, bin_buckets))
            total = float(numbers + letters + other)
            total = total == 0 and 1 or total
            if ('col-num-' + idx_key) not in column_features:
                column_features['col-num-' + idx_key] = []
                column_features['col-lt-' + idx_key] = []
                column_features['col-ot-' + idx_key] = []
            column_features['col-num-' + idx_key].append(numbers / total)
            column_features['col-lt-' + idx_key].append(letters / total)
            column_features['col-ot-' + idx_key].append(other / total)
        features = {}
        for key in column_features:
            features[key] = sum(column_features[key]) / len(column_features[key])
        return features

    '''
    Compute the similarity between the table as it appears in Wikipedia and its extracted version
    '''
    def compute_html_markup_sim(self):
        # compute the Jaccard sim
        clean_html = re.sub(r'<[^>]+>', ' ', self.table_html)
        set_tbl_markup = set(self.clean_wiki_markup().split(' '))
        set_tbl_html = set(clean_html.split(' '))
        score = float(len(set_tbl_markup & set_tbl_html)) / len(set_tbl_markup | set_tbl_html)
        return score

    '''
    Compute the KL divergence between the unigram language models
    of the markup and the html representations of the table.
    '''
    def compute_html_markup_kl(self):
        clean_html = re.sub(r'<[^>]+>', ' ', self.table_html)
        html_wd = clean_html.lower().split(' ')
        d_html = {v: html_wd.count(v) for v in html_wd}
        clean_markup = self.clean_wiki_markup()
        html_mp = clean_markup.lower().split(' ')
        d_mp = {v: html_mp.count(v) for v in html_mp}
        keys = set(d_mp.keys()) | set(d_html.keys())
        a, b = np.zeros(len(keys)), np.zeros(len(keys))
        a_total = float(sum(d_html.values()))
        b_total = float(sum(d_mp.values()))
        epsilon = 0.001
        for idx, key in enumerate(keys):
            a[idx] = (key in d_html and d_html[key] or epsilon) / a_total
            b[idx] = (key in d_mp and d_mp[key] or epsilon) / b_total
        return np.sum(np.where(a != 0, a * np.log(a / b), 0))

    '''
    Clean the Wiki markup from the extracted table.
    '''
    def clean_wiki_markup(self):
        clean_markup = re.sub(r'\\\"', '"', self.table_markup.decode('unicode-escape'))
        clean_markup = re.sub(r'style=\"?(.*?)\"', ' ', clean_markup)
        clean_markup = re.sub(r'(\\n)|(class=\"?wikitable\"?)|(colspan=(.*?))|(rowspan=(.*?))|\|+', ' ',
                              clean_markup)
        clean_markup = re.sub('<ref(\s?name=(.*?))?>(.*?)</ref>', ' ', clean_markup)
        clean_markup = re.sub(r'</?span\s*>', '', clean_markup)
        clean_markup = re.sub(r'(bgcolor=\"(.*?)")|(align=\"(.*?)\")', ' ', clean_markup)
        clean_markup = re.sub(r'\]+|\[+|\"+|\'+|!+|\}+|\{+|\n+|\++', ' ', clean_markup).strip()
...
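
For orientation, here is a minimal, hypothetical driver for the WikiTable class above. The module name table and the input file wiki_tables.tsv are assumptions, not part of the original code; load_data() expects one table per line with eight tab-separated fields (entity, section, table_id, label, label_confidence, table_html, table_json, table_markup), exactly as the snippet parses them.

# Hypothetical usage sketch for the WikiTable class above (Python 2, like the snippet).
# The file name and its layout are assumptions made only for illustration.
from table import WikiTable

with open('wiki_tables.tsv') as infile:
    for line in infile:
        tbl = WikiTable()
        tbl.load_data(line)                              # fills entity, label, HTML/JSON/markup fields
        features = tbl.compute_features(bin_buckets=10)  # bucketed coverage + similarity features
        print tbl.entity, tbl.table_id, len(features)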

utils.py

Source: utils.py (GitHub)

...
    ltitle2docs = {}
    for x in title2doc.keys():
        ltitle2docs.setdefault(x.lower(), []).append(title2doc[x])
    return ltitle2docs

def clean_markup(text):
    # strip wiki link brackets and the {{aslinks| template opener
    return text.replace("[[", "").replace("]]", "").replace("{{aslinks|", "")

def parse_item(text):
    items = []
    if text.startswith("# ") and len(text) > 2:
        items.extend([
            clean_markup(x).replace("?", "").replace(";", "").replace("'", "").strip()
            for x in re.split(',|;', text[2:]) if x not in {'-', '?', '—', ''}
        ])
    return items

def parse_translation(trans):
    res = {}
    for line in trans.split('\n'):
        if line.startswith('|'):
            l, r = line.split('=')
            res[l[1:]] = r.replace('[[', '').replace(']]', '')
    return res

def parse_wiktionary(text):
    res = {'hypernym': [], 'synonym': [], 'meaning': []}
    h1 = ""
    texts = []
    for line in text.split("\n"):
        if line.startswith("= ") and line.endswith(" ="):
            h1 = line
        if h1 == '= {{-ru-}} =':
            texts.append(line)
    text = "\n".join(texts)
    for par in text.split("\n\n"):
        # section headings from the Russian Wiktionary:
        # Гиперонимы = hypernyms, Синонимы = synonyms, Значение = meaning, Перевод = translation
        for h, f in [('==== Гиперонимы ====', 'hypernym'), ('==== Синонимы ====', 'synonym')]:
            if h in par:
                res[f] += [w.replace(' ', '_').lower() for line in par.split("\n") for w in parse_item(line)]
        for h, f in [('==== Значение ====', 'meaning')]:
            if h in par:
                for line in par.split('\n'):
                    if line.startswith('# ') and len(line) > 2:
                        res[f] += [clean_markup(line[2:]).lower()]
        #res[f] += [clean_markup(line[2:]).split() for line in par.split("\n") if line.startswith('# ') and len(line) > 2]
        #res[f] = [item for sublist in res[f] for item in sublist]
        #print(res[f])
        #if '=== Перевод ===' in par:
        #    res['translation'] = par.replace('=== Перевод ===\n', '')
    return res

def load_wiktionary(wiktionary_dump_path, vectors):
    title2docs = {key.replace(' ', '_'): val for key, val in get_title2docs(wiktionary_dump_path).items() if key in vectors}
    for title in title2docs:
        docs_info = []
        for doc in title2docs[title]:
            docs_info.append(parse_wiktionary(doc['text']))
        title2docs[title] = docs_info
...
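
Because clean_markup() here is a pure string transformation, it is easy to sanity-check on its own. The wikitext line below is invented for illustration, and the functions are assumed to be importable from the utils.py module shown above.

# Quick check of clean_markup/parse_item in isolation (sample line is made up).
from utils import clean_markup, parse_item

line = "# [[собака]], [[пёс]]"      # an invented Wiktionary definition line
print(clean_markup(line))           # "# собака, пёс"  -- the [[ ]] link brackets are stripped
print(parse_item(line))             # ['собака', 'пёс'] -- split on ',' / ';', cleaned and trimmed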

test_markup.py

Source: test_markup.py (GitHub)

# -*- coding: utf-8 -*-
# Author: Ilya Gusev
# Description: Tests for the markup.
import unittest
from rupo.util.data import MARKUP_EXAMPLE
from rupo.main.markup import Markup
from rupo.stress.predictor import CombinedStressPredictor
from rupo.settings import RU_STRESS_DEFAULT_MODEL, ZALYZNYAK_DICT, CMU_DICT, \
    RU_GRAPHEME_STRESS_PATH, RU_GRAPHEME_STRESS_TRIE_PATH

class TestMarkup(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.stress_predictor = CombinedStressPredictor(
            stress_model_path=RU_STRESS_DEFAULT_MODEL,
            zalyzniak_dict=ZALYZNYAK_DICT,
            cmu_dict=CMU_DICT,
            raw_stress_dict_path=RU_GRAPHEME_STRESS_PATH,
            stress_trie_path=RU_GRAPHEME_STRESS_TRIE_PATH
        )

    @classmethod
    def tearDownClass(cls):
        del cls.stress_predictor

    def test_from_to(self):
        clean_markup = Markup()
        self.assertEqual(MARKUP_EXAMPLE, clean_markup.from_xml(MARKUP_EXAMPLE.to_xml()))
        clean_markup = Markup()
        self.assertEqual(MARKUP_EXAMPLE, clean_markup.from_json(MARKUP_EXAMPLE.to_json()))

    def test_process_text(self):
        # Russian sample text used as test input
        text = "Соломка король себя.\n Пора виться майкой в."
        markup = Markup.process_text(text, self.stress_predictor)
...
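
The test_from_to test shows the round-trip idiom: serialize a Markup, parse it back into a fresh instance, and check that nothing was lost. A stripped-down sketch of the same check, assuming the rupo package and its bundled MARKUP_EXAMPLE fixture are installed, might look like this:

# Minimal round-trip sketch based on test_from_to above; assumes rupo is importable.
from rupo.util.data import MARKUP_EXAMPLE
from rupo.main.markup import Markup

clean_markup = Markup()                                      # empty instance to parse into
restored = clean_markup.from_xml(MARKUP_EXAMPLE.to_xml())    # serialize, then parse back
assert restored == MARKUP_EXAMPLE                            # the round trip must be lossless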
