How to use is_full_url method in localstack

Best Python code snippet using localstack_python

auto_scraper.py

Source:auto_scraper.py

import json
from collections import defaultdict
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

from autoscraper.utils import unique, get_random_str


class AutoScraper(object):
    """Build tag-path rules ("stacks") from example values and replay them.

    ``build`` locates the wanted values in a page and records, for each match,
    the chain of (tag name, class/style attrs, sibling index) from the root
    down to the matching tag. The ``get_result_*`` methods apply the stored
    stacks to a page and return the values they select.
    """

    # Default headers sent with every request; callers may override or extend
    # them through ``request_args['headers']``.
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 \
            (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36'
    }

    def __init__(self, stack_list=None, url=None):
        """Store the initial rule list and base URL (both may be None)."""
        self.stack_list = stack_list
        self.url = url

    def save(self, file_path):
        """Write ``url`` and ``stack_list`` to ``file_path`` as a JSON object."""
        data = dict(url=self.url, stack_list=self.stack_list)
        with open(file_path, 'w') as f:
            json.dump(data, f)

    def load(self, file_path):
        """Load rules from a JSON file written by ``save``.

        A top-level JSON list is accepted for backward compatibility and is
        taken as the stack list itself; ``self.url`` is left unchanged then.
        """
        with open(file_path, 'r') as f:
            data = json.load(f)

        # for backward compatibility
        if isinstance(data, list):
            self.stack_list = data
            return

        self.stack_list = data['stack_list']
        self.url = data['url']

    @classmethod
    def _get_soup(cls, url=None, html=None, request_args=None):
        """Return a BeautifulSoup tree parsed with 'lxml'.

        If ``html`` is truthy it is parsed directly and no request is made.
        Otherwise ``url`` is fetched with ``requests.get``; the class default
        headers plus a ``Host`` header derived from ``url`` are sent, updated
        with any ``request_args['headers']``. The remaining ``request_args``
        entries are passed to ``requests.get`` as keyword arguments.
        """
        if html:
            return BeautifulSoup(html, 'lxml')

        # Copy before popping so the caller's dict keeps its 'headers' entry
        # and can be reused for later calls.
        request_args = dict(request_args) if request_args else {}

        headers = dict(cls.request_headers)
        if url:
            headers['Host'] = urlparse(url).netloc
        headers.update(request_args.pop('headers', None) or {})

        html = requests.get(url, headers=headers, **request_args).text
        return BeautifulSoup(html, 'lxml')

    @staticmethod
    def _get_valid_attrs(item):
        """Return the tag's 'class' and 'style' attrs, mapping ``[]`` to ``''``."""
        return {
            k: v if v != [] else ''
            for k, v in item.attrs.items()
            if k in {'class', 'style'}
        }

    def _child_has_text(self, child, text):
        """Return True if ``text`` equals the tag's text or one of its attributes.

        On a match the tag is annotated for later stack building:
        ``wanted_attr`` is None for a text match or the attribute name for an
        attribute match, and ``is_full_url`` is True only when the match was on
        an ``href``/``src`` value resolved against ``self.url``.
        """
        child_text = child.getText().strip()
        if text == child_text:
            child.wanted_attr = None
            # Clear a flag possibly left by an earlier match on this same tag.
            child.is_full_url = False
            return True

        for key, value in child.attrs.items():
            # Multi-valued attributes (e.g. class lists) are not compared.
            if not isinstance(value, str):
                continue

            value = value.strip()
            if text == value:
                child.wanted_attr = key
                child.is_full_url = False
                return True

            if key in {'href', 'src'}:
                full_url = urljoin(self.url, value)
                if text == full_url:
                    child.wanted_attr = key
                    child.is_full_url = True
                    return True

        return False

    def _get_children(self, soup, text):
        """Return all descendants of ``soup`` matching ``text``, in reversed order."""
        text = text.strip()
        children = reversed(soup.findChildren())
        return [child for child in children if self._child_has_text(child, text)]

    def build(self, url=None, wanted_list=None, html=None, request_args=None):
        """Learn rules that reproduce every value in ``wanted_list``.

        Sets ``self.url`` to ``url``, parses the page, and builds one stack per
        tag matching any wanted value. ``wanted_list`` must be iterable; the
        None default is not handled and fails on iteration.

        Returns the de-duplicated list of values selected by the new stacks if
        it contains every wanted value (and stores the stacks on
        ``self.stack_list``); otherwise returns None and leaves
        ``self.stack_list`` unchanged.
        """
        self.url = url
        soup = self._get_soup(url=url, html=html, request_args=request_args)

        result_list = []
        stack_list = []

        for wanted in wanted_list:
            children = self._get_children(soup, wanted)

            for child in children:
                result, stack = self._get_result_for_child(child, soup)
                result_list += result
                stack_list.append(stack)

        result_list = unique(result_list)

        if all(w in result_list for w in wanted_list):
            self.stack_list = unique(stack_list)
            return result_list

        return None

    @classmethod
    def _build_stack(cls, child):
        """Build the rule dict describing the path from the root to ``child``.

        ``content`` is a list ordered root-first. Each ancestor entry is
        ``(name, attrs, index)`` where ``index`` is the position of the next
        tag on the path among that ancestor's direct children having the same
        name and class/style attrs; the final entry is ``(name, attrs)`` for
        ``child`` itself. Walking stops after the ``html`` tag or when no
        parent exists.
        """
        content = [(child.name, cls._get_valid_attrs(child))]

        parent = child
        while True:
            grand_parent = parent.findParent()
            if not grand_parent:
                break

            children = grand_parent.findAll(parent.name, cls._get_valid_attrs(parent),
                                            recursive=False)
            for i, c in enumerate(children):
                if c == parent:
                    content.insert(
                        0, (grand_parent.name, cls._get_valid_attrs(grand_parent), i))
                    break

            if grand_parent.name == 'html':
                break

            parent = grand_parent

        wanted_attr = getattr(child, 'wanted_attr', None)
        is_full_url = getattr(child, 'is_full_url', False)
        stack = dict(content=content, wanted_attr=wanted_attr, is_full_url=is_full_url)
        stack['stack_id'] = 'rule_' + get_random_str(4)
        return stack

    def _get_result_for_child(self, child, soup):
        """Return ``(result, stack)``: the stack for ``child`` and what it selects."""
        stack = self._build_stack(child)
        result = self._get_result_with_stack(stack, soup)
        return result, stack

    def _fetch_result_from_child(self, child, wanted_attr, is_full_url):
        """Extract the wanted value from one tag.

        Returns the stripped text when ``wanted_attr`` is None, None when the
        attribute is absent, the attribute joined onto ``self.url`` when
        ``is_full_url`` is truthy, and the raw attribute value otherwise.
        """
        if wanted_attr is None:
            return child.getText().strip()

        if wanted_attr not in child.attrs:
            return None

        if is_full_url:
            return urljoin(self.url, child.attrs[wanted_attr])

        return child.attrs[wanted_attr]

    def _get_result_with_stack(self, stack, soup):
        """Return values from every tag whose name/attrs path matches ``stack``.

        Sibling indexes stored in the stack are ignored; falsy values are
        dropped from the result.
        """
        parents = [soup]
        for item in stack['content']:
            children = []
            for parent in parents:
                children += parent.findAll(item[0], item[1], recursive=False)
            parents = children

        wanted_attr = stack['wanted_attr']
        is_full_url = stack['is_full_url']
        result = [self._fetch_result_from_child(i, wanted_attr, is_full_url) for i in parents]
        return [value for value in result if value]

    def _get_result_with_stack_index_based(self, stack, soup):
        """Return the value from the single tag reached by following stored indexes.

        Starts at the first direct child of ``soup``. At each level the stored
        index is clamped to the last available match; an empty list is returned
        as soon as a level has no match. The result holds at most one value.
        """
        p = soup.findChildren(recursive=False)[0]
        stack_content = stack['content']
        for index, item in enumerate(stack_content[:-1]):
            content = stack_content[index + 1]
            p = p.findAll(content[0], content[1], recursive=False)
            if not p:
                return []
            idx = min(len(p) - 1, item[2])
            p = p[idx]

        result = [self._fetch_result_from_child(p, stack['wanted_attr'], stack['is_full_url'])]
        return [value for value in result if value]

    def _get_result_by_func(self, func, url, html, soup, request_args, grouped):
        """Apply ``func(stack, soup)`` to every stored stack and collect results.

        ``self.url`` is updated when ``url`` is truthy; the page is only
        fetched/parsed when ``soup`` is falsy. Returns a dict keyed by
        ``stack_id`` when ``grouped`` is truthy, else a de-duplicated list.
        """
        if url:
            self.url = url

        if not soup:
            soup = self._get_soup(url=url, html=html, request_args=request_args)

        result_list = []
        grouped_result = defaultdict(list)
        for stack in self.stack_list:
            result = func(stack, soup)

            if not grouped:
                result_list += result
                continue

            stack_id = stack['stack_id']
            grouped_result[stack_id] += result

        return dict(grouped_result) if grouped else unique(result_list)

    def get_result_similar(self, url=None, html=None, soup=None, request_args=None, grouped=False):
        """Return values from all tags matching the stored paths (indexes ignored)."""
        func = self._get_result_with_stack
        return self._get_result_by_func(func, url, html, soup, request_args, grouped)

    def get_result_exact(self, url=None, html=None, soup=None, request_args=None, grouped=False):
        """Return values from the tags found by following the stored sibling indexes."""
        func = self._get_result_with_stack_index_based
        return self._get_result_by_func(func, url, html, soup, request_args, grouped)

    def get_result(self, url=None, html=None, request_args=None):
        """Return ``(similar, exact)`` results computed from one parsed page."""
        soup = self._get_soup(url=url, html=html, request_args=request_args)
        # Forward ``url`` so self.url is updated and relative href/src values
        # are resolved against this page; the ready soup prevents a refetch.
        similar = self.get_result_similar(url=url, soup=soup)
        exact = self.get_result_exact(url=url, soup=soup)
        return similar, exact

    def remove_rules(self, rules):
        """Drop every stored stack whose ``stack_id`` is in ``rules``."""
        self.stack_list = [s for s in self.stack_list if s['stack_id'] not in rules]

    def keep_rules(self, rules):
        """Keep only the stored stacks whose ``stack_id`` is in ``rules``."""
        self.stack_list = [s for s in self.stack_list if s['stack_id'] in rules]

    def generate_python_code(self):
        # deprecated
        # NOTE(review): the listing is cut off here in the source page ("...");
        # the rest of this method is not visible, so only the truncation marker
        # is kept as a placeholder body.
        ...

code_template.py

Source:code_template.py

from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup


class GeneratedAutoScraper(object):
    """Template for a standalone scraper that replays a fixed list of stacks.

    ``"{STACK_LIST}"`` in ``__init__`` is a placeholder string; the methods
    below index into each stack as a dict with ``content``, ``wanted_attr``
    and ``is_full_url`` keys, so it is presumably substituted with a real list
    before the class is used.
    """

    # Default headers sent with every request; callers may override or extend
    # them through ``request_args['headers']``.
    request_headers = {
        'User-Agent': 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; \
            Googlebot/2.1; +http://www.google.com/bot.html) Safari/537.36'
    }

    def __init__(self):
        """Start with an empty base URL and the stack-list placeholder."""
        self.url = ''
        self.stack_list = "{STACK_LIST}"

    @staticmethod
    def _get_soup(url=None, html=None, request_args=None):
        """Return a BeautifulSoup tree parsed with 'lxml'.

        If ``html`` is truthy it is parsed directly and no request is made.
        Otherwise ``url`` is fetched with ``requests.get`` using the class
        default headers plus a ``Host`` header derived from ``url``, updated
        with any ``request_args['headers']``; remaining ``request_args``
        entries are passed through as keyword arguments.
        """
        if html:
            return BeautifulSoup(html, 'lxml')

        # Copy, then pop 'headers': leaving it inside request_args would pass
        # ``headers`` to requests.get twice (explicitly and via **request_args)
        # and raise TypeError. The copy keeps the caller's dict intact.
        request_args = dict(request_args) if request_args else {}

        headers = dict(GeneratedAutoScraper.request_headers)
        if url:
            headers['Host'] = urlparse(url).netloc
        headers.update(request_args.pop('headers', None) or {})

        html = requests.get(url, headers=headers, **request_args).text
        return BeautifulSoup(html, 'lxml')

    @staticmethod
    def unique(item_list):
        """Return the items of ``item_list`` without duplicates, order preserved.

        Uses ``in`` on a list rather than a set so unhashable items work.
        """
        unique_list = []
        for item in item_list:
            if item not in unique_list:
                unique_list.append(item)
        return unique_list

    def _fetch_result_from_child(self, child, wanted_attr, is_full_url):
        """Extract the wanted value from one tag.

        Returns the stripped text when ``wanted_attr`` is None, None when the
        attribute is absent, the attribute joined onto ``self.url`` when
        ``is_full_url`` is truthy, and the raw attribute value otherwise.
        """
        if wanted_attr is None:
            return child.getText().strip()

        if wanted_attr not in child.attrs:
            return None

        if is_full_url:
            return urljoin(self.url, child.attrs[wanted_attr])

        return child.attrs[wanted_attr]

    def _get_result_with_stack(self, stack, soup):
        """Return values from every tag whose name/attrs path matches ``stack``.

        Sibling indexes stored in the stack are ignored; falsy values are
        dropped from the result.
        """
        parents = [soup]
        for item in stack['content']:
            children = []
            for parent in parents:
                children += parent.findAll(item[0], item[1], recursive=False)
            parents = children

        wanted_attr = stack['wanted_attr']
        is_full_url = stack['is_full_url']
        result = [self._fetch_result_from_child(i, wanted_attr, is_full_url) for i in parents]
        return [value for value in result if value]

    def _get_result_with_stack_index_based(self, stack, soup):
        """Return the value from the tag reached by following stored indexes.

        Starts at the first direct child of ``soup``. Raises IndexError when a
        stored index is out of range at any level; may return None when the
        wanted attribute is absent on the final tag.
        """
        p = soup.findChildren(recursive=False)[0]
        stack_content = stack['content']
        for index, item in enumerate(stack_content[:-1]):
            # NOTE(review): children are matched by tag name only here, while
            # _get_result_with_stack also filters by attrs (item[1]) — confirm
            # the stored indexes were computed the same way.
            p = p.findAll(stack_content[index + 1][0], recursive=False)[item[2]]

        return self._fetch_result_from_child(p, stack['wanted_attr'], stack['is_full_url'])

    def get_result_similar(self, url=None, html=None, soup=None, request_args=None):
        """Return de-duplicated values from all tags matching the stored paths."""
        if url:
            self.url = url

        if not soup:
            soup = self._get_soup(url=url, html=html, request_args=request_args)

        result = []
        for stack in self.stack_list:
            result += self._get_result_with_stack(stack, soup)

        return self.unique(result)

    def get_result_exact(self, url=None, html=None, soup=None, request_args=None):
        """Return de-duplicated values found by following the stored indexes.

        Stacks whose indexes do not exist in this page (IndexError) are skipped.
        """
        if url:
            self.url = url

        if not soup:
            soup = self._get_soup(url=url, html=html, request_args=request_args)

        result = []
        for stack in self.stack_list:
            try:
                result.append(self._get_result_with_stack_index_based(stack, soup))
            except IndexError:
                continue

        return self.unique(result)

    def get_result(self, url=None, html=None, request_args=None):
        """Compute both the similar and the exact results from one parsed page."""
        soup = self._get_soup(url=url, html=html, request_args=request_args)
        # Forward ``url`` so self.url is updated and relative href/src values
        # are resolved against this page; the ready soup prevents a refetch.
        similar = self.get_result_similar(url=url, soup=soup)
        exact = self.get_result_exact(url=url, soup=soup)
        # NOTE(review): the listing is cut off here in the source page ("...");
        # the remainder of this method (presumably the return) is not visible,
        # so only the truncation marker is kept as a placeholder.
        ...

Automation Testing Tutorials

Learn to execute automation testing from scratch with LambdaTest Learning Hub, from setting up the prerequisites and running your first automation test to following best practices and diving deeper into advanced test scenarios. LambdaTest Learning Hubs compile step-by-step guides to help you become proficient with different test automation frameworks, e.g. Selenium, Cypress, TestNG, etc.