How to use is_full_url method in localstack

Best Python code snippet using localstack_python

auto_scraper.py

Source:auto_scraper.py Github

copy

Full Screen

import json
from collections import defaultdict
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

from autoscraper.utils import unique, get_random_str


class AutoScraper(object):
    """Learn extraction rules from example values and replay them on pages.

    ``build`` locates the elements of a page whose text or attribute values
    equal the wanted samples and records, for each one, the chain of
    ``(tag name, class/style attrs, sibling index)`` entries leading to it
    (a "stack"). The ``get_result_*`` methods walk those stacks over another
    document to pull out the corresponding values.
    """

    # Adjacent string literals are used instead of a backslash continuation
    # inside the literal, which would embed the next line's indentation in
    # the header value.
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36'
    }

    def __init__(self, stack_list=None, url=None):
        """Store an optional list of previously learned stacks and a base URL."""
        self.stack_list = stack_list
        self.url = url

    def save(self, file_path):
        """Write the base URL and the learned stacks to ``file_path`` as JSON."""
        data = dict(url=self.url, stack_list=self.stack_list)
        with open(file_path, 'w') as f:
            json.dump(data, f)

    def load(self, file_path):
        """Load stacks (and the base URL, when present) from a JSON file."""
        with open(file_path, 'r') as f:
            data = json.load(f)

        # for backward compatibility: a bare list holds only the stacks
        if isinstance(data, list):
            self.stack_list = data
            return

        self.stack_list = data['stack_list']
        self.url = data['url']

    @classmethod
    def _get_soup(cls, url=None, html=None, request_args=None):
        """Return a BeautifulSoup tree for ``html``, or for the page at ``url``.

        ``html`` wins when it is non-empty; otherwise ``url`` is fetched with
        ``requests.get``. ``request_args`` are extra keyword arguments for
        that call; a ``'headers'`` entry is merged over the class defaults.
        """
        if html:
            return BeautifulSoup(html, 'lxml')

        # Work on a copy: popping 'headers' from the caller's own dict would
        # make a second call with the same dict lose its custom headers.
        request_args = dict(request_args or {})

        headers = dict(cls.request_headers)
        if url:
            headers['Host'] = urlparse(url).netloc
        headers.update(request_args.pop('headers', None) or {})

        html = requests.get(url, headers=headers, **request_args).text
        return BeautifulSoup(html, 'lxml')

    @staticmethod
    def _get_valid_attrs(item):
        """Return the ``class``/``style`` attrs of ``item``, with ``[]`` mapped to ``''``."""
        return {
            k: v if v != [] else ''
            for k, v in item.attrs.items()
            if k in {'class', 'style'}
        }

    def _child_has_text(self, child, text):
        """Tell whether ``child`` carries ``text`` as its text or in an attribute.

        On a match the element is tagged as a side effect: ``wanted_attr`` is
        ``None`` for a text match or the matching attribute name, and
        ``is_full_url`` is set when an ``href``/``src`` only matched after
        being joined with ``self.url``.
        """
        if text == child.getText().strip():
            child.wanted_attr = None
            return True

        for key, value in child.attrs.items():
            # multi-valued attributes (lists) are not compared
            if not isinstance(value, str):
                continue

            value = value.strip()
            if text == value:
                child.wanted_attr = key
                return True

            if key in {'href', 'src'} and text == urljoin(self.url, value):
                child.wanted_attr = key
                child.is_full_url = True
                return True

        return False

    def _get_children(self, soup, text):
        """Return the descendants of ``soup`` matching ``text``, in reverse document order."""
        text = text.strip()
        return [
            child for child in reversed(soup.findChildren())
            if self._child_has_text(child, text)
        ]

    def build(self, url=None, wanted_list=None, html=None, request_args=None):
        """Learn stacks for every sample in ``wanted_list``.

        Returns the de-duplicated values the learned stacks extract, and
        stores the stacks on ``self.stack_list``, when every wanted sample is
        among those values; otherwise returns ``None`` and leaves
        ``self.stack_list`` untouched.
        """
        self.url = url
        soup = self._get_soup(url=url, html=html, request_args=request_args)

        result_list = []
        stack_list = []
        for wanted in wanted_list:
            for child in self._get_children(soup, wanted):
                result, stack = self._get_result_for_child(child, soup)
                result_list += result
                stack_list.append(stack)

        result_list = unique(result_list)
        if all(w in result_list for w in wanted_list):
            self.stack_list = unique(stack_list)
            return result_list

        return None

    @classmethod
    def _build_stack(cls, child):
        """Build the stack describing the path from the ``html`` element to ``child``.

        ``content`` ends with ``(name, attrs)`` for ``child`` itself; each
        ancestor is prepended as ``(name, attrs, index)`` where ``index`` is
        the position of the next element down the path among that ancestor's
        direct children with the same name and class/style attrs.
        """
        content = [(child.name, cls._get_valid_attrs(child))]

        parent = child
        while True:
            grand_parent = parent.findParent()
            if not grand_parent:
                break

            children = grand_parent.findAll(parent.name, cls._get_valid_attrs(parent),
                                            recursive=False)
            for i, c in enumerate(children):
                if c == parent:
                    content.insert(
                        0, (grand_parent.name, cls._get_valid_attrs(grand_parent), i))
                    break

            if grand_parent.name == 'html':
                break

            parent = grand_parent

        wanted_attr = getattr(child, 'wanted_attr', None)
        is_full_url = getattr(child, 'is_full_url', False)
        stack = dict(content=content, wanted_attr=wanted_attr, is_full_url=is_full_url)
        stack['stack_id'] = 'rule_' + get_random_str(4)
        return stack

    def _get_result_for_child(self, child, soup):
        """Return ``(values, stack)`` for the stack built from ``child``."""
        stack = self._build_stack(child)
        result = self._get_result_with_stack(stack, soup)
        return result, stack

    def _fetch_result_from_child(self, child, wanted_attr, is_full_url):
        """Extract the wanted value from ``child``.

        Returns the stripped text when ``wanted_attr`` is ``None``, ``None``
        when the attribute is absent, and otherwise the attribute value
        (joined with ``self.url`` when ``is_full_url`` is set).
        """
        if wanted_attr is None:
            return child.getText().strip()

        if wanted_attr not in child.attrs:
            return None

        if is_full_url:
            return urljoin(self.url, child.attrs[wanted_attr])

        return child.attrs[wanted_attr]

    def _get_result_with_stack(self, stack, soup):
        """Return the non-empty values of every element matching the stack's
        name/attrs chain; sibling indices are ignored."""
        parents = [soup]
        for item in stack['content']:
            children = []
            for parent in parents:
                children += parent.findAll(item[0], item[1], recursive=False)
            parents = children

        wanted_attr = stack['wanted_attr']
        is_full_url = stack['is_full_url']
        values = (self._fetch_result_from_child(i, wanted_attr, is_full_url) for i in parents)
        return [value for value in values if value]

    def _get_result_with_stack_index_based(self, stack, soup):
        """Follow the stack's sibling indices down to a single element.

        Returns a list holding that element's value, or ``[]`` when the
        document is empty, a level has no matching children, or the value
        itself is empty.
        """
        top_level = soup.findChildren(recursive=False)
        if not top_level:
            # empty document: nothing to walk into
            return []

        p = top_level[0]
        stack_content = stack['content']
        for index, item in enumerate(stack_content[:-1]):
            content = stack_content[index + 1]
            p = p.findAll(content[0], content[1], recursive=False)
            if not p:
                return []
            # clamp the recorded index so a shorter sibling list still resolves
            idx = min(len(p) - 1, item[2])
            p = p[idx]

        value = self._fetch_result_from_child(p, stack['wanted_attr'], stack['is_full_url'])
        return [value] if value else []

    def _get_result_by_func(self, func, url, html, soup, request_args, grouped):
        """Apply ``func(stack, soup)`` to every stored stack and collect the values.

        Returns a dict of ``stack_id -> values`` when ``grouped`` is true,
        otherwise one de-duplicated list.
        """
        if url:
            self.url = url

        if not soup:
            soup = self._get_soup(url=url, html=html, request_args=request_args)

        result_list = []
        grouped_result = defaultdict(list)
        for stack in self.stack_list:
            result = func(stack, soup)

            if not grouped:
                result_list += result
                continue

            stack_id = stack['stack_id']
            grouped_result[stack_id] += result

        return dict(grouped_result) if grouped else unique(result_list)

    def get_result_similar(self, url=None, html=None, soup=None, request_args=None, grouped=False):
        """Return the values of all elements matching the stacks' name/attrs chains."""
        func = self._get_result_with_stack
        return self._get_result_by_func(func, url, html, soup, request_args, grouped)

    def get_result_exact(self, url=None, html=None, soup=None, request_args=None, grouped=False):
        """Return the values found by following the stacks' sibling indices."""
        func = self._get_result_with_stack_index_based
        return self._get_result_by_func(func, url, html, soup, request_args, grouped)

    def get_result(self, url=None, html=None, request_args=None):
        """Return ``(similar, exact)`` results computed from a single parsed document."""
        soup = self._get_soup(url=url, html=html, request_args=request_args)
        similar = self.get_result_similar(soup=soup)
        exact = self.get_result_exact(soup=soup)
        return similar, exact

    def remove_rules(self, rules):
        """Drop every stored stack whose ``stack_id`` is in ``rules``."""
        self.stack_list = [s for s in self.stack_list if s['stack_id'] not in rules]

    def keep_rules(self, rules):
        """Keep only the stored stacks whose ``stack_id`` is in ``rules``."""
        self.stack_list = [s for s in self.stack_list if s['stack_id'] in rules]

    def generate_python_code(self):
        # deprecated...
        # NOTE(review): the listing is cut off at this point; the rest of this
        # method is not visible here and must be restored from the full file.
        ...

Full Screen

Full Screen

code_template.py

Source:code_template.py Github

copy

Full Screen

from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup


class GeneratedAutoScraper(object):
    """Stand-alone scraper that replays a fixed list of stacks on a page.

    Each stack is a dict with a ``content`` chain of
    ``(tag name, attrs, sibling index)`` entries plus ``wanted_attr`` and
    ``is_full_url``; the ``get_result_*`` methods walk those chains over a
    parsed document.
    """

    # Adjacent string literals are used instead of a backslash continuation
    # inside the literal, which would embed the next line's indentation in
    # the header value.
    request_headers = {
        'User-Agent': 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; '
                      'Googlebot/2.1; +http://www.google.com/bot.html) Safari/537.36'
    }

    def __init__(self):
        self.url = ''
        # Template placeholder; keep this literal exactly as written.
        self.stack_list = "{STACK_LIST}"

    @staticmethod
    def _get_soup(url=None, html=None, request_args=None):
        """Return a BeautifulSoup tree for ``html``, or for the page at ``url``.

        ``html`` wins when it is non-empty; otherwise ``url`` is fetched with
        ``requests.get``. ``request_args`` are extra keyword arguments for
        that call; a ``'headers'`` entry is merged over the class defaults.
        """
        if html:
            return BeautifulSoup(html, 'lxml')

        # Copy before popping so the caller's dict is left untouched.
        request_args = dict(request_args) if request_args else {}

        headers = dict(GeneratedAutoScraper.request_headers)
        if url:
            headers['Host'] = urlparse(url).netloc
        # 'headers' must be removed from request_args: passing it both
        # explicitly and through **request_args raises TypeError.
        headers.update(request_args.pop('headers', None) or {})

        html = requests.get(url, headers=headers, **request_args).text
        return BeautifulSoup(html, 'lxml')

    @staticmethod
    def unique(item_list):
        """Return ``item_list`` without duplicates, keeping first-seen order.

        Membership is tested with ``in`` on a list, so unhashable items are
        accepted.
        """
        unique_list = []
        for item in item_list:
            if item not in unique_list:
                unique_list.append(item)
        return unique_list

    def _fetch_result_from_child(self, child, wanted_attr, is_full_url):
        """Extract the wanted value from ``child``.

        Returns the stripped text when ``wanted_attr`` is ``None``, ``None``
        when the attribute is absent, and otherwise the attribute value
        (joined with ``self.url`` when ``is_full_url`` is set).
        """
        if wanted_attr is None:
            return child.getText().strip()

        if wanted_attr not in child.attrs:
            return None

        if is_full_url:
            return urljoin(self.url, child.attrs[wanted_attr])

        return child.attrs[wanted_attr]

    def _get_result_with_stack(self, stack, soup):
        """Return the non-empty values of every element matching the stack's
        name/attrs chain; sibling indices are ignored."""
        parents = [soup]
        for item in stack['content']:
            children = []
            for parent in parents:
                children += parent.findAll(item[0], item[1], recursive=False)
            parents = children

        wanted_attr = stack['wanted_attr']
        is_full_url = stack['is_full_url']
        values = (self._fetch_result_from_child(i, wanted_attr, is_full_url) for i in parents)
        return [value for value in values if value]

    def _get_result_with_stack_index_based(self, stack, soup):
        """Follow the stack's sibling indices down to one element and return its value.

        Children are selected by tag name only at each level. Raises
        ``IndexError`` when a recorded index does not exist in ``soup``.
        """
        p = soup.findChildren(recursive=False)[0]
        stack_content = stack['content']
        for index, item in enumerate(stack_content[:-1]):
            p = p.findAll(stack_content[index + 1][0], recursive=False)[item[2]]
        return self._fetch_result_from_child(p, stack['wanted_attr'], stack['is_full_url'])

    def get_result_similar(self, url=None, html=None, soup=None, request_args=None):
        """Return the de-duplicated values of all elements matching the stacks."""
        if url:
            self.url = url

        if not soup:
            soup = self._get_soup(url=url, html=html, request_args=request_args)

        result = []
        for stack in self.stack_list:
            result += self._get_result_with_stack(stack, soup)

        return self.unique(result)

    def get_result_exact(self, url=None, html=None, soup=None, request_args=None):
        """Return the de-duplicated values found by following sibling indices.

        Stacks whose indices do not resolve in this document are skipped, and
        empty or missing values are dropped, as in ``get_result_similar``.
        """
        if url:
            self.url = url

        if not soup:
            soup = self._get_soup(url=url, html=html, request_args=request_args)

        result = []
        for stack in self.stack_list:
            try:
                value = self._get_result_with_stack_index_based(stack, soup)
            except IndexError:
                # this stack's path does not exist in the document
                continue
            if value:
                result.append(value)

        return self.unique(result)

    def get_result(self, url=None, html=None, request_args=None):
        """Compute the similar and exact results from a single parsed document."""
        soup = self._get_soup(url=url, html=html, request_args=request_args)
        similar = self.get_result_similar(soup=soup)
        exact = self.get_result_exact(soup=soup)
        # NOTE(review): the listing is cut off after the line above; returning
        # both results is presumed - confirm against the full file.
        return similar, exact

Full Screen

Full Screen

Automation Testing Tutorials

Learn to execute automation testing from scratch with LambdaTest Learning Hub, from setting up the prerequisites and running your first automation test to following best practices and diving deeper into advanced test scenarios. LambdaTest Learning Hubs compile step-by-step guides to help you become proficient with different test automation frameworks, e.g. Selenium, Cypress, and TestNG.

LambdaTest Learning Hubs:

YouTube

You can also refer to the video tutorials on the LambdaTest YouTube channel for step-by-step demonstrations from industry experts.

Run localstack automation tests on LambdaTest cloud grid

Perform automation testing on 3000+ real desktop and mobile devices online.

Try LambdaTest Now !!

Get 100 automation test minutes FREE!!

Next-Gen App & Browser Testing Cloud

Was this article helpful?

Helpful

Not Helpful