How to use clean_markup method in yandex-tank

Best Python code snippets using yandex-tank, taken from open-source projects on GitHub.

table.py

Source: table.py (GitHub)

# NOTE: this module targets Python 2 (reload/setdefaultencoding, dict.iteritems, str.decode).
import json
import re
import numpy as np
import sys
reload(sys)
sys.setdefaultencoding('utf8')

class WikiTable:
    def __init__(self):
        self.table_rows = []
        self.columns = []
        self.table_caption = ''
        self.entity = ''
        self.section = ''
        self.label = 'NA'
        self.table_html = ''
        self.table_json = ''
        self.table_markup = ''
        self.label_confidence = 0.0
        self.table_id = -1
        self.column_meta_data = dict()

    def load_data(self, line):
        data_cells = line.split('\t')
        self.entity = data_cells[0]
        self.section = data_cells[1]
        self.table_id = int(data_cells[2])
        self.label = data_cells[3]
        self.label_confidence = float(data_cells[4])
        self.table_html = data_cells[5].strip()
        self.table_json = data_cells[6].strip()
        self.table_markup = data_cells[7].strip()
        self.parse_table_data()

    def load_json(self, json_line, entity, section, table_id, col_meta_parse=False):
        self.table_id = table_id
        self.entity = entity
        self.table_json = json_line
        self.section = section
        self.parse_table_data(col_meta_parse)

    '''
    Process the table HTML into the different sub-parts: (1) table caption, (2) table header (columns), (3) table cells
    '''
    def parse_table_data(self, col_meta_parse=False):
        # first check if the table has a caption
        tbljson = json.loads(self.table_json)
        self.table_caption = tbljson['caption']
        # get the table header and process it into the different rows and columns
        rows = tbljson['rows']
        for row in rows:
            row_values = row['values']
            row_cell_dict = dict()
            for cell in row_values:
                colname = cell['column']
                if colname not in self.columns:
                    self.columns.append(colname)
                row_cell_dict[colname] = cell['value']
            self.table_rows.append(row_cell_dict)
        # load the column meta data
        if col_meta_parse:
            header = tbljson['header']
            header = header[len(header) - 1]['columns']
            for col in header:
                colname = col['name']
                values = col['value_dist']
                self.column_meta_data[colname] = []
                for value in values:
                    self.column_meta_data[colname].append(value['value'])

    '''
    Here we compute simple features w.r.t. the extracted table data from the Wikipedia markup.

    We will look into several features here. We check whether certain values from the markup are
    represented as columns or values in our parsed table data. In this way we may find missing
    columns or values.
    '''
    def compute_features(self, bin_buckets=10):
        lines = [line.strip() for line in self.table_markup.decode('string_escape').split('\n')]
        # keep a dictionary of features here
        features = dict()
        # instead of keeping the line indexes we bucket them into 10 bins
        # so that all tables have the same representation space
        bins = np.linspace(1, len(lines), bin_buckets)
        for idx, line in enumerate(lines):
            idx = np.digitize(idx, bins).item(0) - 1
            # check first if the line is a table definition or if it contains the table caption
            if re.match('{\|\s?class=', line) or line.startswith('|+') or len(line) == 0:
                continue
            '''
            otherwise we assume that here we are dealing either with the table header data or the cell values;
            therefore, we replace all ! or | characters, which are used to delimit the columns
            or cells in Wiki tables
            '''
            tokens = re.sub(r'!+|\|+|-', '\t', line).strip().split('\t')
            for token in tokens:
                if len(token) == 0:
                    continue
                has_token = False
                # check first if this token might be a column name
                if token in self.table_caption or token.startswith(
                        ('colspan', 'rowspan', 'bgcolor', 'style', 'class')):
                    continue
                # check here if the token is any of the values in our table cell data
                elif any((True for x in self.table_rows if token in x)):
                    has_token = True
                # add these missing tokens and their frequency for the respective lines
                if idx not in features.keys():
                    features[idx] = {}
                if token not in features[idx]:
                    features[idx][token] = [0, 0]
                if has_token:
                    features[idx][token][0] += 1
                else:
                    features[idx][token][1] += 1
        # we will aggregate for each row or feature ID the amount of tokens which are covered or uncovered
        aggr_features = {}
        bins = np.linspace(0, 1, bin_buckets)
        for idx in features:
            for token in features[idx]:
                total = sum(features[idx][token])
                covered = features[idx][token][0] / float(total)
                bin_val = np.digitize(covered, bins).item(0) - 1
                key_val = str(idx) + '-' + str(bin_val)
                if key_val not in aggr_features:
                    aggr_features[key_val] = 0
                aggr_features[key_val] += 1
        aggr_features['jacc'] = self.compute_html_markup_sim()
        aggr_features.update(self.column_value_dist(bins=bin_buckets))
        aggr_features['kl'] = self.compute_html_markup_kl()
        aggr_features['num_cols'] = len(self.columns)
        aggr_features['markup_double_exlamanation'] = self.table_markup.count('!!')
        aggr_features['markup_single_exlamanation'] = self.table_markup.count('!')
        return aggr_features

    '''
    Return the word distribution from the columns in this table
    '''
    def column_word_dist(self):
        column_features = {}
        columns = json.loads(self.table_json)['header'][len(json.loads(self.table_json)['header']) - 1]['columns']
        for idx, col in enumerate(columns):
            col_values = col['value_dist']
            for val in col_values:
                value = val['value'].encode('ascii', 'ignore').decode('ascii').decode('unicode-escape')
                wordlist = value.lower().split(' ')
                d = {v: wordlist.count(v) for v in wordlist}
                column_features.update({k: d.get(k, 0) + column_features.get(k, 0) for k in set(d.keys())})
        column_features = {k: v for k, v in column_features.iteritems() if v < 3}
        return column_features

    '''
    Compute features that are related w.r.t. the distribution of column values.
    '''
    def column_value_dist(self, bins=10):
        column_features = {}
        # check the distribution of the column values
        columns = json.loads(self.table_json)['header'][len(json.loads(self.table_json)['header']) - 1]['columns']
        bin_buckets = np.linspace(1, len(columns), bins)
        for idx, col in enumerate(columns):
            col_values = col['value_dist']
            numbers, letters, other = 0, 0, 0
            for val in col_values:
                value = val['value']
                value = value.replace(' ', '').replace('"', '').replace('&', '')
                count = val['count']
                if value.isalpha():
                    letters += count
                elif value.isdigit():
                    numbers += count
                else:
                    other += count
            idx_key = str(np.digitize(idx, bin_buckets))
            total = float(numbers + letters + other)
            total = total == 0 and 1 or total
            if ('col-num-' + idx_key) not in column_features:
                column_features['col-num-' + idx_key] = []
                column_features['col-lt-' + idx_key] = []
                column_features['col-ot-' + idx_key] = []
            column_features['col-num-' + idx_key].append(numbers / total)
            column_features['col-lt-' + idx_key].append(letters / total)
            column_features['col-ot-' + idx_key].append(other / total)
        features = {}
        for key in column_features:
            features[key] = sum(column_features[key]) / len(column_features[key])
        return features

    '''
    Compute the similarity between the table as it appears in Wikipedia and its extracted version
    '''
    def compute_html_markup_sim(self):
        # compute the Jaccard sim
        clean_html = re.sub(r'<[^>]+>', ' ', self.table_html)
        set_tbl_markup = set(self.clean_wiki_markup().split(' '))
        set_tbl_html = set(clean_html.split(' '))
        score = float(len(set_tbl_markup & set_tbl_html)) / len(set_tbl_markup | set_tbl_html)
        return score

    '''
    Compute the KL divergence between the unigram language models
    of the markup and the html representations of the table.
    '''
    def compute_html_markup_kl(self):
        clean_html = re.sub(r'<[^>]+>', ' ', self.table_html)
        html_wd = clean_html.lower().split(' ')
        d_html = {v: html_wd.count(v) for v in html_wd}
        clean_markup = self.clean_wiki_markup()
        html_mp = clean_markup.lower().split(' ')
        d_mp = {v: html_mp.count(v) for v in html_mp}
        keys = set(d_mp.keys()) | set(d_html.keys())
        a, b = np.zeros(len(keys)), np.zeros(len(keys))
        a_total = float(sum(d_html.values()))
        b_total = float(sum(d_mp.values()))
        epsilon = 0.001
        for idx, key in enumerate(keys):
            a[idx] = (key in d_html and d_html[key] or epsilon) / a_total
            b[idx] = (key in d_mp and d_mp[key] or epsilon) / b_total
        return np.sum(np.where(a != 0, a * np.log(a / b), 0))

    '''
    Clean the Wiki markup from the extracted table.
    '''
    def clean_wiki_markup(self):
        clean_markup = re.sub(r'\\\"', '"', self.table_markup.decode('unicode-escape'))
        clean_markup = re.sub(r'style=\"?(.*?)\"', ' ', clean_markup)
        clean_markup = re.sub(r'(\\n)|(class=\"?wikitable\"?)|(colspan=(.*?))|(rowspan=(.*?))|\|+', ' ',
                              clean_markup)
        clean_markup = re.sub('<ref(\s?name=(.*?))?>(.*?)</ref>', ' ', clean_markup)
        clean_markup = re.sub(r'</?span\s*>', '', clean_markup)
        clean_markup = re.sub(r'(bgcolor=\"(.*?)")|(align=\"(.*?)\")', ' ', clean_markup)
        clean_markup = re.sub(r'\]+|\[+|\"+|\'+|!+|\}+|\{+|\n+|\++', ' ', clean_markup).strip()
...
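
For orientation, here is a minimal, hypothetical driver for the WikiTable class above. The module name table and the input file wiki_tables.tsv are assumptions, not part of the original code; load_data() expects one table per line with eight tab-separated fields (entity, section, table_id, label, label_confidence, table_html, table_json, table_markup), exactly as the snippet parses them.

# Hypothetical usage sketch for the WikiTable class above (Python 2, like the snippet).
# The file name and its layout are assumptions made only for illustration.
from table import WikiTable

with open('wiki_tables.tsv') as infile:
    for line in infile:
        tbl = WikiTable()
        tbl.load_data(line)                              # fills entity, label, HTML/JSON/markup fields
        features = tbl.compute_features(bin_buckets=10)  # bucketed coverage + similarity features
        print tbl.entity, tbl.table_id, len(features)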

utils.py

Source: utils.py (GitHub)

...
    ltitle2docs = {}
    for x in title2doc.keys():
        ltitle2docs.setdefault(x.lower(), []).append(title2doc[x])
    return ltitle2docs

def clean_markup(text):
    # strip wiki link brackets and the {{aslinks| template opener
    return text.replace("[[", "").replace("]]", "").replace("{{aslinks|", "")

def parse_item(text):
    items = []
    if text.startswith("# ") and len(text) > 2:
        items.extend([
            clean_markup(x).replace("?", "").replace(";", "").replace("'", "").strip()
            for x in re.split(',|;', text[2:]) if x not in {'-', '?', '—', ''}
        ])
    return items

def parse_translation(trans):
    res = {}
    for line in trans.split('\n'):
        if line.startswith('|'):
            l, r = line.split('=')
            res[l[1:]] = r.replace('[[', '').replace(']]', '')
    return res

def parse_wiktionary(text):
    res = {'hypernym': [], 'synonym': [], 'meaning': []}
    h1 = ""
    texts = []
    for line in text.split("\n"):
        if line.startswith("= ") and line.endswith(" ="):
            h1 = line
        if h1 == '= {{-ru-}} =':
            texts.append(line)
    text = "\n".join(texts)
    for par in text.split("\n\n"):
        # section headings from the Russian Wiktionary:
        # Гиперонимы = hypernyms, Синонимы = synonyms, Значение = meaning, Перевод = translation
        for h, f in [('==== Гиперонимы ====', 'hypernym'), ('==== Синонимы ====', 'synonym')]:
            if h in par:
                res[f] += [w.replace(' ', '_').lower() for line in par.split("\n") for w in parse_item(line)]
        for h, f in [('==== Значение ====', 'meaning')]:
            if h in par:
                for line in par.split('\n'):
                    if line.startswith('# ') and len(line) > 2:
                        res[f] += [clean_markup(line[2:]).lower()]
        #res[f] += [clean_markup(line[2:]).split() for line in par.split("\n") if line.startswith('# ') and len(line) > 2]
        #res[f] = [item for sublist in res[f] for item in sublist]
        #print(res[f])
        #if '=== Перевод ===' in par:
        #    res['translation'] = par.replace('=== Перевод ===\n', '')
    return res

def load_wiktionary(wiktionary_dump_path, vectors):
    title2docs = {key.replace(' ', '_'): val for key, val in get_title2docs(wiktionary_dump_path).items() if key in vectors}
    for title in title2docs:
        docs_info = []
        for doc in title2docs[title]:
            docs_info.append(parse_wiktionary(doc['text']))
        title2docs[title] = docs_info
...
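
Because clean_markup() here is a pure string transformation, it is easy to sanity-check on its own. The wikitext line below is invented for illustration, and the functions are assumed to be importable from the utils.py module shown above.

# Quick check of clean_markup/parse_item in isolation (sample line is made up).
from utils import clean_markup, parse_item

line = "# [[собака]], [[пёс]]"      # an invented Wiktionary definition line
print(clean_markup(line))           # "# собака, пёс"  -- the [[ ]] link brackets are stripped
print(parse_item(line))             # ['собака', 'пёс'] -- split on ',' / ';', cleaned and trimmed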

test_markup.py

Source: test_markup.py (GitHub)

# -*- coding: utf-8 -*-
# Author: Ilya Gusev
# Description: Tests for the markup.
import unittest
from rupo.util.data import MARKUP_EXAMPLE
from rupo.main.markup import Markup
from rupo.stress.predictor import CombinedStressPredictor
from rupo.settings import RU_STRESS_DEFAULT_MODEL, ZALYZNYAK_DICT, CMU_DICT, \
    RU_GRAPHEME_STRESS_PATH, RU_GRAPHEME_STRESS_TRIE_PATH

class TestMarkup(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.stress_predictor = CombinedStressPredictor(
            stress_model_path=RU_STRESS_DEFAULT_MODEL,
            zalyzniak_dict=ZALYZNYAK_DICT,
            cmu_dict=CMU_DICT,
            raw_stress_dict_path=RU_GRAPHEME_STRESS_PATH,
            stress_trie_path=RU_GRAPHEME_STRESS_TRIE_PATH
        )

    @classmethod
    def tearDownClass(cls):
        del cls.stress_predictor

    def test_from_to(self):
        clean_markup = Markup()
        self.assertEqual(MARKUP_EXAMPLE, clean_markup.from_xml(MARKUP_EXAMPLE.to_xml()))
        clean_markup = Markup()
        self.assertEqual(MARKUP_EXAMPLE, clean_markup.from_json(MARKUP_EXAMPLE.to_json()))

    def test_process_text(self):
        # Russian sample text used as test input
        text = "Соломка король себя.\n Пора виться майкой в."
        markup = Markup.process_text(text, self.stress_predictor)
...
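
The test_from_to test shows the round-trip idiom: serialize a Markup, parse it back into a fresh instance, and check that nothing was lost. A stripped-down sketch of the same check, assuming the rupo package and its bundled MARKUP_EXAMPLE fixture are installed, might look like this:

# Minimal round-trip sketch based on test_from_to above; assumes rupo is importable.
from rupo.util.data import MARKUP_EXAMPLE
from rupo.main.markup import Markup

clean_markup = Markup()                                      # empty instance to parse into
restored = clean_markup.from_xml(MARKUP_EXAMPLE.to_xml())    # serialize, then parse back
assert restored == MARKUP_EXAMPLE                            # the round trip must be lossless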
