How to use outer_html method in Selene

Best Python code snippet using selene_python

test_md_helpers.py

Source:test_md_helpers.py Github

copy

Full Screen

# ... (earlier lines of test_md_helpers.py are elided in this excerpt; the
# names PyQuery, fix_unwrapped_text, markdown_to_html, collapse_attrs and
# MarkdownSection are presumably imported there -- confirm against the file)


def _fixed_outer_html(html):
    """Parse *html* with PyQuery, run fix_unwrapped_text, return outer HTML."""
    return fix_unwrapped_text(PyQuery(html)).outer_html()


def test_fix_unwrapped_no_tags():
    """Bare text is expected to come back wrapped in a <p>."""
    expected = PyQuery('<p>test</p>')
    assert _fixed_outer_html('test') == expected.outer_html()


def test_fix_unwrapped_em_tag():
    """A lone <em> element is expected to be returned unchanged."""
    expected = PyQuery('<em>wrapped</em>')
    assert _fixed_outer_html('<em>wrapped</em>') == expected.outer_html()


def test_fix_unwrapped_text_basic():
    """Text on both sides of an inline tag: every piece gets its own <span>."""
    expected = PyQuery(
        '<p><span>1</span><span><b>2</b></span><span>3</span></p>')
    assert _fixed_outer_html('<p>1<b>2</b>3</p>') == expected.outer_html()


def test_fix_unwrapped_text_basic_2():
    """Inline tag followed by trailing text."""
    expected = PyQuery('<p><span><b>2</b></span><span>3</span></p>')
    assert _fixed_outer_html('<p><b>2</b>3</p>') == expected.outer_html()


def test_fix_unwrapped_text_basic_3():
    """Trailing text that starts with a space keeps that space."""
    html = '<p><strong>test</strong> unwrapped</p>'
    # The original literal ended in '</span<</p>' (malformed closing tag) and
    # only produced the intended tree via parser error recovery.
    expected = PyQuery(
        '<p><span><strong>test</strong></span><span> unwrapped'
        '</span></p>')
    assert _fixed_outer_html(html) == expected.outer_html()


def test_fix_unwrapped_text_basic_4():
    """Sibling tags without loose text are each wrapped in a <span>."""
    html = '<p><i>a</i><b>b</b><c>c</c></p>'
    expected = PyQuery(
        '<p><span><i>a</i></span><span><b>b</b></span><span><c>c'
        '</c></span></p>')
    assert _fixed_outer_html(html) == expected.outer_html()


def test_fix_unwrapped_text_deep():
    """Loose text nested below the root element is wrapped as well."""
    html = '<span><strong>12<b>3</b></strong></span>'
    expected = PyQuery(
        '<p><span><strong><span>12</span><span><b>3</b></span>'
        '</strong></span></p>')
    # The result is compared with the *inner* HTML of the <p> wrapper, i.e.
    # the root <span> itself.
    assert _fixed_outer_html(html) == expected.html()


def test_fix_unwrapped_text_attributes():
    """Attributes on a wrapped tag are preserved."""
    html = '<p><strong attr="123">test</strong> unwrapped</p>'
    expected = PyQuery(
        '<p><span><strong attr="123">test</strong></span>'
        '<span> unwrapped</span></p>')
    assert _fixed_outer_html(html) == expected.outer_html()


def test_fix_unwrapped_no_unwrapped_basic():
    """A <span> that only holds text is left as it is."""
    expected = PyQuery('<span>wrapped</span>')
    assert _fixed_outer_html('<span>wrapped</span>') == expected.outer_html()


def test_fix_unwrapped_text_no_unwrapped_basic():
    """A <span> holding a single tag is left as it is."""
    html = '<span><strong>123</strong></span>'
    expected = PyQuery('<p><span><strong>123</strong></span></p>')
    # Compared with the inner HTML of the <p> wrapper (the <span> itself).
    assert _fixed_outer_html(html) == expected.html()


def test_fix_unwrapped_text_no_unwrapped_basic_2():
    """Already-wrapped text inside a <p> is left as it is."""
    expected = PyQuery('<p><span>123</span></p>')
    assert (_fixed_outer_html('<p><span>123</span></p>')
            == expected.outer_html())


def test_fix_unwrapped_text_no_unwrapped_complex():
    """Several already-wrapped children are left as they are."""
    html = '<p><span><i>a</i></span><span><b>b</b></span></p>'
    expected = PyQuery('<p><span><i>a</i></span><span><b>b</b></span></p>')
    assert _fixed_outer_html(html) == expected.outer_html()


def test_fix_unwrapped_text_ul_basic():
    """A simple list is left as it is."""
    expected = PyQuery('<ul><li>123</li></ul>')
    assert (_fixed_outer_html('<ul><li>123</li></ul>')
            == expected.outer_html())


def test_build_dict_ol_with_nesting():
    """Text of a list item that also holds a nested list gets a <span>."""
    markdown_string = '1. parent\n2. child\n\t1. nested'
    html = markdown_to_html(markdown_string).strip()
    expected = PyQuery(
        '<ol><li>parent</li><li><span>child</span><ol><li>nested</li>'
        '</ol></li></ol>')
    assert _fixed_outer_html(html) == expected.outer_html()


def test_fix_unwrapped_text_complex():
    """Mixed loose text and nested inline tags."""
    html = '<p>aaa <em>bbb <i>ccc</i></em> ddd <del>eee</del> fff</p>'
    expected = PyQuery(
        '<p><span>aaa </span><span><em><span>bbb </span><span>'
        '<i>ccc</i></span></em></span><span> ddd </span><span>'
        '<del>eee</del></span><span> fff</span></p>')
    assert _fixed_outer_html(html) == expected.outer_html()


def test_fix_unwrapped_text_complex_2():
    """Loose text three levels deep, around a <q> element."""
    html = ('<p>aaa <em>bbb <i>ccc<q>zzz</q>ddd</i></em> ddd <del>'
            'eee</del> fff</p>')
    expected = PyQuery(
        '<p><span>aaa </span><span><em><span>bbb </span><span>'
        '<i><span>ccc</span><span><q>zzz</q></span><span>ddd'
        '</span></i></span></em></span><span> ddd </span>'
        '<span><del>eee</del></span><span> fff</span></p>')
    assert _fixed_outer_html(html) == expected.outer_html()


def test_fix_unwrapped_text_complex_3():
    """An existing <span><p>..</p></span> in the middle is not re-wrapped."""
    html = ('<p>aaa <em>bbb <i>ccc<span><p>zzz</p></span>ddd</i>'
            '</em> ddd <del>eee</del> fff</p>')
    expected = PyQuery(
        '<p><span>aaa </span><span><em><span>bbb </span><span>'
        '<i><span>ccc</span><span><p>zzz</p></span><span>ddd'
        '</span></i></span></em></span><span> ddd </span>'
        '<span><del>eee</del></span><span> fff</span></p>')
    assert _fixed_outer_html(html) == expected.outer_html()


def test_no_change_fix_unwrapped_text_complex():
    """Fully wrapped input (the expected output above) comes back unchanged."""
    html = ('<p><span>aaa </span><span><em><span>bbb </span><span>'
            '<i><span>ccc</span><span><p>zzz</p></span><span>ddd'
            '</span></i></span></em></span><span> ddd </span><span>'
            '<del>eee</del></span><span> fff</span></p>')
    expected = PyQuery(
        '<p><span>aaa </span><span><em><span>bbb </span><span>'
        '<i><span>ccc</span><span><p>zzz</p></span><span>ddd'
        '</span></i></span></em></span><span> ddd </span>'
        '<span><del>eee</del></span><span> fff</span></p>')
    assert _fixed_outer_html(html) == expected.outer_html()


def test_collapse_attrs_basic():
    """span > strong("test") is compared, via get_dict(), with
    MarkdownSection("span", "test", {}, {}, ["bold"])."""
    input_dict = [{"type": "span", "attrs": [], "layout": {}, "extra": {},
                   "contents": [
                       {"type": "strong", "attrs": [], "layout": {},
                        "extra": {},
                        "contents": "test"}
                   ]}]
    res = collapse_attrs(input_dict)
    expected = [MarkdownSection("span", "test", {}, {}, ["bold"])]
    assert res[0].get_dict() == expected[0].get_dict()


# The excerpt is cut off inside the next test; its visible part is kept
# verbatim here rather than guessed at:
#
# def test_collapse_attrs_nested():
#     input_dict = [{"type": "span", "attrs": [], "layout": {}, "extra": {},
#                    "contents": [
#                        {"type": "strong", "attrs": [], "layout": {},...

Full Screen

Full Screen

elem_processing.py

Source:elem_processing.py Github

copy

Full Screen

import re

from util_core.util.content_analysis import is_more_link_text
from util_core.util.html_util import (
    get_start_tag, get_tag_names, get_inner, get_tag_name, get_attrs,
    get_text, get_normalised_attrs, remove_html_content, get_tags
)
from webextractor.clustering.comparisons.computed_styles import EXPECTED_COMPUTED_STYLE_KEYS
from webextractor.element_descriptions.util import get_spatial_visibility
from webextractor.element_descriptions.descriptions import get_url_data
from webextractor.selenium_wrapper.preloading import (
    preload_element_data, get_multiple_outer__chunk, AncestorPath,
)

# whether to convert feature_set and all_computed_styles__array to integers
CONVERT_TO_NUMS = True

# Shared fragments of the date patterns below. The assembled pattern strings
# are character-for-character the same as the previous hand-written literals.
_DAY_NUM = (
    r'(?P<day_num>01|02|03|04|05|06|07|08|09|10|11|12|13|14|15|16'
    r'|17|18|19|20|21|22|23|24|25|26|27|28|29|30|31|1|2|3|4|5|6|7|8|9)'
)
_MONTH_W = (
    r'(?P<month_W>january|february|march|april|may|june|july|august'
    r'|september|october|november|december'
    r'|jan|feb|mar|apr|jun|jul|aug|sep|sept|oct|nov|dec)'
)

# r'[day_num_optword] [month_W]'
DAY_NUM_OPTWORD_MONTH = _DAY_NUM + r'(st|nd|rd|th)? (of )?' + _MONTH_W
# r'[month_W] [day_num_word]'
MONTH_DAY_NUM_WORD = _MONTH_W + r' ' + _DAY_NUM + r'(st|nd|rd|th)'
# r'[month_W] [year_4]'
MONTH_YEAR = (
    _MONTH_W
    + r' (?P<year>2010|2011|2012|2013|2014|2015|2016|2017|2018|2019'
    r'|2020|2021|2022|2023|2024)'
)
# r'[time]'
TIME = (
    r'((?P<hour_1>00|01|02|03|04|05|06|07|08|09|10|11|12|13|14|15|16|17'
    r'|18|19|20|21|22|23|0|1|2|3|4|5|6|7|8|9)'
    r'(:|\.)(?P<minute>(0|1|2|3|4|5)(0|5))\s?'
    r'(?P<am_pm_1>a\.m\.|p\.m\.|am|pm)?'
    r'|(?P<hour_2>00|01|02|03|04|05|06|07|08|09|10|11|12|1|2|3|4|5|6|7|8|9)'
    r'\s?(?P<am_pm_2>a\.m\.|p\.m\.|am|pm))'
)
# r'[day_name]'
DAY_NAME = (
    r'(?P<day_name>monday|tuesday|wednesday|thursday|friday|saturday|sunday'
    r'|mon|tue|wed|thu|thur|thurs|fri|sat|sun)'
)

# (pattern, label) pairs; has_date() tries them in this order.
TEXT_DATE_REGEXPS = [
    (DAY_NUM_OPTWORD_MONTH, 'DAY_NUM_OPTWORD_MONTH'),
    (MONTH_DAY_NUM_WORD, 'MONTH_DAY_NUM_WORD'),
    (MONTH_YEAR, 'MONTH_YEAR'),
    (TIME, 'TIME'),
    (DAY_NAME, 'DAY_NAME')
]


def has_date(txt):
    """Return the label of the first date/time pattern found in *txt*.

    The patterns in TEXT_DATE_REGEXPS are searched case-insensitively, in
    list order, anywhere in the text (they are not anchored to word
    boundaries). Returns None for empty text or when nothing matches.
    """
    if not txt:
        return None
    for reg, key in TEXT_DATE_REGEXPS:
        if re.search(reg, txt, flags=re.I):
            return key
    return None


def get_img_type(outer_htmlL):
    """Classify the image content of lower-cased outer HTML.

    Returns 'IMG' when the markup contains an ``<img`` tag, 'BACKGROUND_IMG'
    when it contains an inline ``background-image: url(`` style, else None.
    """
    if '<img' in outer_htmlL:
        return 'IMG'
    if 'background-image: url(' in outer_htmlL:
        return 'BACKGROUND_IMG'
    # todo: should we include png vs jpg?
    return None


# NOTE: earlier method-based version of _get_class_attr_str(), kept by the
# original author as an inert string literal.
'''
    def class_str(self):
        class_str = self.attrs.get('class', '').strip()
        if not class_str:
            return ''
        classes = []
        for cls in class_str.split(' '):
            cls = cls.lower().strip()
            if cls == 'active':
                continue
            cls = re.sub(r'\d+', '__N__', cls)
            classes.append(cls)
        return str(sorted(classes))
    @cached_property
    def tag_class_str(self):
        return self.tag_name + '__' + self.class_str
'''


def _get_class_attr_str(html_attrs):
    """Build a normalised key from the ``class`` attribute in *html_attrs*.

    Class names are lower-cased, the class 'active' is dropped, every run of
    digits is replaced by '__N__', and the sorted list is returned in its
    ``str()`` form. Returns '' when there is no (non-blank) class attribute.
    """
    class_str = html_attrs.get('class', '').strip()
    if not class_str:
        return ''
    classes = []
    # split() rather than split(' '): repeated spaces, tabs or newlines between
    # class names must not produce empty or merged tokens, otherwise the same
    # set of classes yields different keys depending on whitespace.
    for cls in class_str.split():
        cls = cls.lower()
        if cls == 'active':
            continue
        classes.append(re.sub(r'\d+', '__N__', cls))
    return str(sorted(classes))


def _get_outer_html_data(outer_html):
    """Derive the text/tag/attribute description fields from one outer HTML.

    Returns a dict; see the keys below. Most values come straight from the
    util_core html_util helpers.
    """
    outer_htmlL = outer_html.lower()
    inner_html = get_inner(outer_html)
    txt = get_text(outer_html, inner_html)
    outer_no_content = remove_html_content(outer_htmlL, l=False)
    tag_name = get_tag_name(outer_html)
    html_attrs = get_attrs(outer_html)
    class_attr_str = _get_class_attr_str(html_attrs)
    return {
        'outer_html': outer_html,
        'outer_htmlL': outer_htmlL,
        'inner_html': inner_html,
        'outer_html_no_content': outer_no_content,
        'outer_html_no_content_rev': ''.join(reversed(outer_no_content)),
        'text': txt,
        'textL': txt.lower(),
        'text_is_digit': txt.isdigit(),
        # Rough tag count: half the number of '<' characters.
        'num_tags': round(outer_html.count('<') / 2),
        'text_has_date': has_date(txt) or '',
        'text_is_more_link': is_more_link_text(txt),
        'start_tag': get_start_tag(outer_html, normalise=True),
        'tag_name': tag_name,
        'tags_key': '__'.join(get_tags(outer_html, l=True, exclude_br=True)),
        'tag_class_str': tag_name + '__' + class_attr_str,
        'html_attrs': html_attrs,
        'img_type': get_img_type(outer_htmlL) or '',
        # 'img_url': get_img_url(outer_htmlL, self.page_url_host) doesn't seem to be used?
    }


def process_outer_html__url(elem_ids, outers, context):
    """Build one partial description per outer HTML string.

    Each description holds the fields of _get_outer_html_data(), the matching
    'node_id' from *elem_ids* (same index), and whatever
    get_url_data(outer_html, context) returns, merged in with dict.update.
    """
    partial_descs = []
    for i, outer_html in enumerate(outers):
        ed = _get_outer_html_data(outer_html)
        ed['node_id'] = elem_ids[i]
        ed.update(get_url_data(outer_html, context))
        partial_descs.append(ed)
    return partial_descs


def process_ancestor_paths(elem_ids, ancestor_paths):
    """Build one partial description per ancestor path.

    Each raw path is wrapped in AncestorPath; the parent fields are read with
    ``path.get(1, ...)`` and fall back to '' when the parent has no outer HTML.
    """
    partial_descs = []
    for i, raw_path in enumerate(ancestor_paths):
        path = AncestorPath(raw_path)
        parent_outer = path.get(1, 'outer_html') or ''
        parent_outerL = parent_outer.lower() if parent_outer else ''
        ed = {
            'node_id': elem_ids[i],
            'ancestor_path': path,
            'parent_tag_name': path.get(1, 'tag_name') or '',
            'parent_outer_html': parent_outer,
            'parent_outer_htmlL': parent_outerL,
            'parent_outer_html_no_content': remove_html_content(parent_outerL) if parent_outer else '',
            'parent_start_tag': get_start_tag(parent_outer, normalise=True) if parent_outer else '',
            'parent_img_type': (get_img_type(parent_outerL) or '') if parent_outer else ''
        }
        partial_descs.append(ed)
    return partial_descs


def process_xpaths(elem_ids, xpaths):
    """Build one partial description per xpath.

    'xpath_suffix' is the xpath with one- or two-digit ``[n]`` indices removed;
    when that string contains five or more '/' only its last four
    '/'-separated segments are kept.
    """
    partial_descs = []
    for i, xpath in enumerate(xpaths):
        ed = {'xpath': xpath, 'node_id': elem_ids[i]}
        xpath_no_nums = re.sub(r'\[\d{1,2}\]', '', xpath)
        if xpath_no_nums.count('/') < 5:
            ed['xpath_suffix'] = xpath_no_nums
        else:
            ed['xpath_suffix'] = '/'.join(xpath_no_nums.split('/')[-4:])
        partial_descs.append(ed)
    return partial_descs


def _create_desc_from_rect(elem_id, rect):
    """Return the rect description of one element.

    x, y, height and width are rounded to ints; 'area' is the rounded product
    of the *unrounded* height and width.
    """
    area = int(round(rect['height'] * rect['width']))
    int_rect = {
        'x': int(round(rect['x'])),
        'y': int(round(rect['y'])),
        'height': int(round(rect['height'])),
        'width': int(round(rect['width'])),
        'area': area
    }
    # todo: rect_int, rect_box?
    return {
        'node_id': elem_id,
        'rect': int_rect,
        'spatial_visibility': get_spatial_visibility(int_rect)
    }


def process_rects(elem_ids, rects):
    """Build one rect description per entry of *rects* (same index as ids)."""
    partial_descs = []
    for i, rect in enumerate(rects):
        partial_descs.append(_create_desc_from_rect(elem_ids[i], rect))
    return partial_descs


def _add_computed_style_integers(partial_descs, context):
    """Add 'all_computed_styles__array_int' to every description, in place.

    Each non-empty style string is mapped to an integer id keyed by
    (position, string). New ids are allocated in, and shared through,
    ``context['computed_style_string_to_num']``, which is mutated here.
    Empty strings are encoded as -1.
    """
    computed_style_string_to_num = context['computed_style_string_to_num']
    for ed in partial_descs:
        # Start with -1 everywhere; only non-empty values are overwritten.
        style_ints = [-1 for _ in EXPECTED_COMPUTED_STYLE_KEYS]
        ed['all_computed_styles__array_int'] = style_ints
        for i, cc in enumerate(ed['all_computed_styles__array']):
            if len(cc) == 0:
                continue
            key = (i, cc)
            if key not in computed_style_string_to_num:
                computed_style_string_to_num[key] = len(computed_style_string_to_num)
            style_ints[i] = computed_style_string_to_num[key]


def process_computed_styles(elem_ids, computed_styles, context):
    """Build one partial description per computed-style record.

    NOTE(review): the excerpt is cut off before the end of this function, so
    any trailing statements (presumably a ``return partial_descs``) are not
    visible and are not reproduced here -- confirm against the full file.
    """
    partial_descs = []
    for i, cs_data in enumerate(computed_styles):
        ed = {
            'node_id': elem_ids[i],
            'font-size': cs_data['font-size'],
            'font-weight': cs_data['font-weight'],
            'font-colour': cs_data['color'],
            'color': cs_data['color'],
            'cssComputed__visibility': cs_data['visibility'].lower(),
            'cssComputed__display': cs_data['display'].lower(),
            'jquery__is_hidden': not cs_data['is_visible_jquery'],
            # NOTE: 'spatial_visibility' is computed in process_rects()
            # and 'driver__is_displayed' is computed in the main thread
            'all_computed_styles': cs_data['all_computed_styles'],  # used by neural net
        }
        # Fixed-order array of the expected style values, each capped at
        # 100 characters; missing keys become ''.
        ed['all_computed_styles__array'] = [
            ed['all_computed_styles'].get(k, '')[:100]
            for k in EXPECTED_COMPUTED_STYLE_KEYS
        ]
        partial_descs.append(ed)
    # NOTE(review): indentation was lost in the excerpt; this block is assumed
    # to run once after the loop -- confirm against the full file.
    if CONVERT_TO_NUMS:
        #with collection.lock:
        _add_computed_style_integers(partial_descs, context)
    # ... (excerpt truncated here)

Full Screen

Full Screen

test_table.py

Source:test_table.py Github

copy

Full Screen

...34 # print '内嵌table'35 # else:36 # table_result.append(table)37 # print '1111'38 # print table.outer_html()39 if table.find('table').length > 0:40 continue41 result += table.outer_html()42 # print result43 table_list = PyQuery(result, parser='html').find('table')44 print table_list.length45 for item in table_list.items():46 print item.text()47 print '###################'48 del table_list[0]49 print table_list.length50 for item in table_list.items():51 print item.text()52 print '###################'53 del table_list[0]54 print table_list.length55 for item in table_list.items():56 print item.text()57 print '###################'58 # for table in table_list.items():59 # # print table.length60 # # print table.is_('table')61 # #62 # # print table.next().length63 # # print table.next().is_('table')64 # #65 # # print table.next().next().length66 # # print table.next().next().is_('table')67 # print table.parent().length68 # print table.parent().parent().length69 # print table.parent().parent().parent().length70 # break71 # table = table_list72 # print table.length73 # print table.is_('table')74 # # print table_list.outer_html()75 # table = table_list.next()76 # print table.length77 # print table.is_('table')78 #79 # # print table_list.next().outer_html()80 # table = table_list.next().next()81 # print table.length82 # print table.is_('table')83 #84 # table = table_list.next().next().next()85 # print table.length86 # print table.is_('table')87 # if table_list.next().next() is None:88 # print 'next next is None'89 # for item in PyQuery(result, parser='html').find('table').items():90 # print '111'91 # print item.outer_html()92 #93 # parent = table.parent()94 # for item in parent.items():95 # print type(item.outer_html())96 # print len(item.outer_html())97 # #print item.outer_html()98 # p = PyQuery(item.outer_html(), parser='html')99 # print p.outer_html()100 # for table in table_list.items():101 # print table.outer_html()102 # print len(table_result)103 # for table in 
table_result:104 # print table.outer_html()105# with open('表格测试4.txt') as p_file:106# lines = p_file.read().decode(encoding='utf-8')107# bs = BeautifulSoup(lines, 'lxml')108# # print bs109# table_list = bs.find_all('table')110# # print len(table_list)111# for item_table in table_list:112# print len(item_table)113# for item in item_table.contents:114# if isinstance(item, NavigableString):115# continue116# # print item117# print type(item)118# if item.name == 'tr':...

Full Screen

Full Screen

Automation Testing Tutorials

Learn to execute automation testing from scratch with the LambdaTest Learning Hub, right from setting up the prerequisites and running your first automation test to following best practices and diving deeper into advanced test scenarios. The LambdaTest Learning Hubs compile step-by-step guides to help you become proficient with different test automation frameworks, e.g. Selenium, Cypress, TestNG, etc.

LambdaTest Learning Hubs:

YouTube

You can also refer to the video tutorials on the LambdaTest YouTube channel for step-by-step demonstrations from industry experts.

Run Selene automation tests on LambdaTest cloud grid

Perform automation testing on 3000+ real desktop and mobile devices online.

Try LambdaTest Now !!

Get 100 automation test minutes FREE!!

Next-Gen App & Browser Testing Cloud

Was this article helpful?

Helpful

Not Helpful