How to use _normalize_link method in Lemoncheesecake

Best Python code snippet using lemoncheesecake

crawler.py

Source:crawler.py Github

copy

Full Screen

# ... (lines 1-15 of crawler.py are not part of this excerpt; the names re, os,
# json, logging, aiohttp, Path, Set, List and Optional used below are
# presumably imported there)
from urllib3.exceptions import LocationParseError
from urllib.parse import urlparse, urljoin


class UrlUtilsMixin:
    """Static helpers for turning raw links into absolute URLs and validating them."""

    @staticmethod
    def _normalize_link(link, root_url: str) -> Optional[str]:
        """Return ``link`` as an absolute URL, resolved against ``root_url``.

        Args:
            link: Raw link text (absolute, relative or protocol-relative).
            root_url: URL of the page the link was found on.

        Returns:
            ``link`` unchanged when it already carries a scheme, the link
            resolved against ``root_url`` otherwise, or ``None`` when ``link``
            cannot be parsed.
        """
        try:
            parsed_url = urlparse(link)
        except ValueError:
            return None
        if parsed_url.scheme:
            return link
        # urljoin resolves both relative ("/help") and protocol-relative
        # ("//host/path?q=1") links, and keeps the query string intact instead
        # of rebuilding the URL from scheme + netloc + path only.
        return urljoin(root_url, link)

    @staticmethod
    def _is_valid_url(url: str) -> bool:
        """Return True when ``url`` is an http(s)/ftp(s) URL with a host name or IPv4 host."""
        regex = re.compile(
            r'^(?:http|ftp)s?://'
            r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'
            r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'
            r'(?::\d+)?'
            r'(?:/?|[/?]\S+)$', re.IGNORECASE)
        return regex.match(url) is not None


class Crawler(UrlUtilsMixin):
    """
    Crawler that browses random pages from a given set of links and stores them into the
    `cache` folder.
    """
    _links: Set[str]
    _blacklist: Set[str]

    def __init__(self, links: int, parallel: int, cache: bool) -> None:
        """Set up empty link/blacklist sets and, when ``cache`` is true, the cache folder.

        Args:
            links: Stored as ``_total_links`` (presumably the number of links
                to visit -- confirm against ``browse``).
            parallel: Stored as ``_parallel`` (presumably the concurrency
                level -- confirm against ``browse``).
            cache: When true, a ``cache`` directory is created next to this
                module.
        """
        self._links = set()
        self._blacklist = set()
        self._total_links = links
        self._parallel = parallel
        self._cache = cache
        if self._cache:
            dir_path = os.path.dirname(os.path.realpath(__file__))
            self._cache_dir = str(Path(dir_path) / 'cache')
            # Create the directory through the OS API rather than a shell
            # command: no quoting problems with unusual paths, and portable.
            os.makedirs(self._cache_dir, exist_ok=True)

    async def _request(self, url: str) -> Optional[str]:
        """Fetch ``url`` and return the response body, or ``None`` on any error."""
        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(url, timeout=5) as response:
                    return await response.text()
        except Exception:
            # Deliberately best-effort, but do not swallow BaseException
            # (task cancellation, KeyboardInterrupt) the way a bare except does.
            logging.debug("Exception on URL: %s", url)
            return None

    def _is_blacklisted(self, url: str) -> bool:
        """Return True when ``url`` is in the blacklist."""
        return url in self._blacklist

    def _should_accept_url(self, url: str) -> bool:
        """Return True when ``url`` is non-empty, valid and not blacklisted."""
        return bool(url) and self._is_valid_url(url) and not self._is_blacklisted(url)

    def _extract_urls(self, body: str, root_url: str) -> List[str]:
        """Return the acceptable absolute URLs found in ``href`` attributes of ``body``.

        Links that start with ``#`` are ignored; every other link is resolved
        against ``root_url`` and kept only when ``_should_accept_url`` agrees.
        """
        pattern = r"href=[\"'](?!#)(.*?)[\"'].*?"
        urls = re.findall(pattern, str(body))
        normalize_urls = [self._normalize_link(url, root_url) for url in urls]
        return [url for url in normalize_urls if self._should_accept_url(url)]

    def load_config_file(self, file_path: str) -> None:
        """
        Load a configuration file with blacklisted urls and root urls for starting the
        crawler.
        """
        with open(file_path, 'r', encoding='utf-8') as config_file:
            config = json.load(config_file)
        self._blacklist.update(config['blacklisted_urls'])
        self._links.update(config["root_urls"])

    async def browse(self) -> None:
        ...  # body truncated in this excerpt

Full Screen

Full Screen

test_crawler_blocking.py

Source:test_crawler_blocking.py Github

copy

Full Screen

...64 )65 )66 filter_result.sort()67 assert filter_result == checkup_html[result]68def test_crawler_normalize_link():69 cr = crawler.Crawler("https://google.com", "")70 assert (71 cr._normalize_link("https://google.com/help", "https://google.com/")72 == "https://google.com/help"73 )74 assert (75 cr._normalize_link("/help", "https://google.com/")76 == "https://google.com/help"77 )78 assert (79 cr._normalize_link(80 "https://mail.google.com/help#fragment", "https://google.com/"81 )82 == "https://mail.google.com/help"83 )84def test_remove_query():85 cr = crawler.Crawler("https://google.com", "")86 assert (87 cr._remove_query("https://google.com/search?page=42")88 == "https://google.com/search"...

Full Screen

Full Screen

Automation Testing Tutorials

Learn to execute automation testing from scratch with the LambdaTest Learning Hub, from setting up the prerequisites and running your first automation test to following best practices and diving deeper into advanced test scenarios. The LambdaTest Learning Hubs compile step-by-step guides to help you become proficient with different test automation frameworks, e.g. Selenium, Cypress, and TestNG.

LambdaTest Learning Hubs:

YouTube

You can also refer to the video tutorials on the LambdaTest YouTube channel for step-by-step demonstrations from industry experts.

Run Lemoncheesecake automation tests on LambdaTest cloud grid

Perform automation testing on 3000+ real desktop and mobile devices online.

Try LambdaTest Now !!

Get 100 automation test minutes FREE!!

Next-Gen App & Browser Testing Cloud

Was this article helpful?

Helpful

Not Helpful