Unlock 30% off on Manual Testing Annual Plans this Holiday Season.

Copied to Clipboard!

How to use the `_normalize_link` method in Lemoncheesecake

Best Python code snippet using lemoncheesecake

crawler.py

Source:crawler.py

# ... (excerpt: the earlier lines of crawler.py are not shown in this listing)
from urllib3.exceptions import LocationParseError
from urllib.parse import urlparse, urljoin


class UrlUtilsMixin:
    """Stateless helpers for normalizing and validating URLs."""

    # Compiled once when the class is created rather than on every
    # `_is_valid_url` call.
    _URL_REGEX = re.compile(
        r'^(?:http|ftp)s?://'
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'
        r'(?::\d+)?'
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)

    @staticmethod
    def _normalize_link(link, root_url: str) -> Optional[str]:
        """
        Turn `link` into an absolute URL, resolving it against `root_url`.

        Links that already carry a scheme are returned unchanged. Relative
        and protocol-relative (``//host/path``) links are resolved with
        `urljoin`, which keeps the query string and inherits the scheme of
        `root_url`.

        Returns None when `link` cannot be parsed.
        """
        try:
            parsed_url = urlparse(link)
        except ValueError:
            return None
        if not parsed_url.scheme:
            return urljoin(root_url, link)
        return link

    @staticmethod
    def _is_valid_url(url: str) -> bool:
        """Return True when `url` matches the http(s)/ftp(s) URL pattern."""
        return UrlUtilsMixin._URL_REGEX.match(url) is not None


class Crawler(UrlUtilsMixin):
    """
    Crawler that browses random pages from a given set of links and stores them into the
    `cache` folder.
    """

    _links: Set[str]
    _blacklist: Set[str]

    def __init__(self, links: int, parallel: int, cache: bool) -> None:
        """
        Store the crawl settings and, when `cache` is truthy, make sure the
        `cache` directory next to this file exists.
        """
        self._links = set()
        self._blacklist = set()
        self._total_links = links
        self._parallel = parallel
        self._cache = cache
        if self._cache:
            dir_path = os.path.dirname(os.path.realpath(__file__))
            self._cache_dir = str(Path(dir_path) / 'cache')
            # Create the directory without going through a shell, so paths
            # containing spaces or shell metacharacters are handled safely.
            os.makedirs(self._cache_dir, exist_ok=True)

    async def _request(self, url: str) -> Optional[str]:
        """
        Fetch `url` and return the response body as text.

        Best effort: any ordinary error is logged at debug level and None is
        returned instead.
        """
        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(url, timeout=5) as response:
                    return await response.text()
        except Exception:
            # `Exception` rather than a bare `except`, so task cancellation
            # and KeyboardInterrupt still propagate.
            logging.debug("Exception on URL: %s", url)
            return None

    def _is_blacklisted(self, url: str) -> bool:
        """Return True when `url` is in the blacklist."""
        return url in self._blacklist

    def _should_accept_url(self, url: str) -> bool:
        """Return True for a non-empty, valid URL that is not blacklisted."""
        return bool(url) and self._is_valid_url(url) and not self._is_blacklisted(url)

    def _extract_urls(self, body: str, root_url: str) -> List[str]:
        """
        Collect the `href` targets found in `body` (skipping those that start
        with ``#``), normalize them against `root_url` and keep only the
        ones accepted by `_should_accept_url`.
        """
        pattern = r"href=[\"'](?!#)(.*?)[\"'].*?"
        candidates = (
            self._normalize_link(href, root_url)
            for href in re.findall(pattern, str(body))
        )
        return [url for url in candidates if self._should_accept_url(url)]

    def load_config_file(self, file_path: str) -> None:
        """
        Load a configuration file with blacklisted urls and root urls for starting the
        crawler.

        The file is JSON with a `blacklisted_urls` list and a `root_urls`
        list; a missing key raises KeyError.
        """
        with open(file_path, 'r', encoding='utf-8') as config_file:
            config = json.load(config_file)
        self._blacklist.update(config['blacklisted_urls'])
        self._links.update(config["root_urls"])

    async def browse(self) -> None:
        # The listing is truncated here; the body of `browse` is not shown.
        ...

test_crawler_blocking.py

Source:test_crawler_blocking.py

...64        )65    )66    filter_result.sort()67    assert filter_result == checkup_html[result]68def test_crawler_normalize_link():69    cr = crawler.Crawler("https://google.com", "")70    assert (71        cr._normalize_link("https://google.com/help", "https://google.com/")72        == "https://google.com/help"73    )74    assert (75        cr._normalize_link("/help", "https://google.com/")76        == "https://google.com/help"77    )78    assert (79        cr._normalize_link(80            "https://mail.google.com/help#fragment", "https://google.com/"81        )82        == "https://mail.google.com/help"83    )84def test_remove_query():85    cr = crawler.Crawler("https://google.com", "")86    assert (87        cr._remove_query("https://google.com/search?page=42")88        == "https://google.com/search"...

Automation Testing Tutorials

Learn to execute automation testing from scratch with LambdaTest Learning Hub, from setting up the prerequisites and running your first automation test to following best practices and diving deeper into advanced test scenarios. LambdaTest Learning Hubs compile step-by-step guides to help you become proficient with different test automation frameworks, e.g. Selenium, Cypress, and TestNG.