How to use the trim_url method in Playwright Python

Best Python code snippets using playwright-python

hiv_org2.py

Source: hiv_org2.py (GitHub)


...
from w3lib.html import remove_tags, remove_tags_with_content

def get_domain(url):
    # 'scheme://host/...' -> 'host'
    matched = re.match('^(?:http[s]?://)+[^/]*', url).group(0)
    return matched.split('://')[-1]

def trim_url(url):
    # Keep only the scheme and host of a URL
    matched = re.match('^(?:http[s]?://)+[^/]*', url).group(0)
    return matched

class OrgWebsite(scrapy.Item):
    link = scrapy.Field()
    domain = scrapy.Field()
    referer = scrapy.Field()

class HIVBootstraper(scrapy.Spider):
    # TODO: Change custom setting when not debugging
    name = 'hiv_bootstraper'
    custom_settings = {
        'ITEM_PIPELINES': {'hiv_scraping.pipelines.HivBootstrapScrapingPipeline': 300},
        'CLOSESPIDER_ITEMCOUNT': 100
    }
    saved_domains = []
    dead_ends = {}
    restricted_sections = []

    def __init__(self, **kw):
        super(HIVBootstraper, self).__init__(**kw)
        # self.start_urls = self.__getattribute__()
        # self.allowed_domains = [get_domain(self.start_urls[0])]
        logging.info('Starting Bootstrap Spider with : %s', ', '.join(self.start_urls))

    def parse(self, response):
        # Collect external links, yielding one item per newly seen domain
        links = LinkExtractor(allow=(), deny=self.allowed_domains + self.saved_domains).extract_links(response)
        for link in links:
            if get_domain(link.url) not in self.saved_domains:
                self.saved_domains.append(get_domain(link.url))
                orgwebsite = OrgWebsite(link=link.url, domain=trim_url(link.url),
                                        referer=trim_url(response.request.url))
                yield orgwebsite
        next_links = LinkExtractor(allow=self.allowed_domains, deny=self.restricted_sections).extract_links(response)
        if len(links) == 0:
            # Count dead ends per URL; sections that keep yielding nothing get restricted
            try:
                self.dead_ends[response.request.url] += 1
            except KeyError:
                self.dead_ends[response.request.url] = 1
            self._update_restrictions()
        else:
            for link in next_links:
                yield scrapy.Request(link.url, callback=self.parse)

    def _update_restrictions(self):
        self.restricted_sections = [k for k in self.dead_ends.keys() if self.dead_ends[k] > 3]

class HIVChecker(scrapy.Spider):
    name = 'hiv_checker'
    start_urls = []
    custom_settings = {
        'ITEM_PIPELINES': {'hiv_scraping.pipelines.ClfHIVPipeline': 300}  # CheckHIVPipeline
    }

    def start_requests(self):
        return [scrapy.Request(dom, callback=self.hiv_check) for dom in self._load_domains_to_check()]

    def hiv_check(self, response):  # parse method
        sel = Selector(response=response)
        raw_dump = sel.xpath('//body/descendant-or-self::*[not(self::script)]/text()').extract()
        word_dump = ' '.join([txt for txt in raw_dump if self._has_content(txt)])
        yield {'domain': trim_url(response.request.url),
               'text_dump': word_dump}

    def _has_content(self, txt):
        for t in txt:
            if t not in ['\n', '\t', ' ', '\r']:
                return True

    def _load_domains_to_check(self):
        doms = pd.read_csv('domains.csv')
        doms = doms[doms['to_crawl'].isnull()].sort_values(by='references')['domain'].tolist()
        logging.info("%s new domains to be checked for HIV" % str(len(doms)))
        return doms

class HIVSatellite(scrapy.Spider):
    # TODO: update the dead-end mechanism to also check whether the new pages are relevant
    name = 'hiv_satellite'
    custom_settings = {'ITEM_PIPELINES': {'hiv_scraping.pipelines.HivSatScrapingPipeline': 300},
                       'CLOSESPIDER_PAGECOUNT': 500}
    saved_domains = []
    dead_ends = {}
    restricted_sections = []

    def __init__(self, **kw):
        super(HIVSatellite, self).__init__(**kw)
        self.start_urls, self.allowed_domains = self._get_starting_state()
        if len(self.start_urls) == 1:
            logging.info('New satellite spider : %s', self.start_urls[0])

    def parse(self, response):
        # TODO: Find a way to have the exact same logic as the HIVBootstrap spider (maybe just have the exact same type?)
        links = LinkExtractor(allow=(), deny=self.allowed_domains + self.saved_domains).extract_links(response)
        for link in links:
            if get_domain(link.url) not in self.saved_domains:
                self.saved_domains.append(get_domain(link.url))
                orgwebsite = OrgWebsite(link=link.url, domain=trim_url(link.url),
                                        referer=trim_url(response.request.url))
                yield orgwebsite
        next_links = LinkExtractor(allow=self.allowed_domains, deny=self.restricted_sections).extract_links(response)
        if len(links) == 0:
            try:
                self.dead_ends[response.request.url] += 1
            except KeyError:
                self.dead_ends[response.request.url] = 1
            self._update_restrictions()
        else:
            for link in next_links:
                yield scrapy.Request(link.url, callback=self.parse)

    def _update_restrictions(self):
        self.restricted_sections = [k for k in self.dead_ends.keys() if self.dead_ends[k] > 3]

    def _get_starting_state(self):
        doms = pd.read_csv('domains.csv')
        eligible_doms = doms[np.logical_and(doms['to_crawl'] == 1, doms['crawled'] == 0)]['domain'].tolist()
        if len(eligible_doms) > 0:
            # take first result
            chosen_dom = eligible_doms[0]
            # update file
            doms.loc[doms['domain'] == chosen_dom, 'crawled'] = 1
            doms.to_csv('domains.csv', index=False)
            return [chosen_dom], [get_domain(chosen_dom)]
        else:
            return [], []

class DataSetBuilder(scrapy.Spider):
    name = 'dataset_builder'
    start_urls = []
    custom_settings = {'ITEM_PIPELINES': {'hiv_scraping.pipelines.DataSetPipeline': 300}}
    dom_lbl = pd.read_csv('dataset/dom_lbl.csv')

    def start_requests(self):
        return [scrapy.Request(dom, callback=self.parse) for dom in self._load_domains()]

    def parse(self, response):
        sel = Selector(response=response)
        raw_dump = sel.xpath('//body/descendant-or-self::*[not(self::script)]/text()').extract()
        word_dump = ' '.join([txt for txt in raw_dump if self._has_content(txt)])
        # Look up the label row for this domain once, then read each label column
        labels = self.dom_lbl[self.dom_lbl['domain'] == trim_url(response.request.url)]
        yield {'domain': trim_url(response.request.url),
               'text_dump': word_dump,
               'hiv': labels['hiv'].values[0],
               'research': labels['research'].values[0],
               'gov': labels['gov'].values[0],
               'uni': labels['uni'].values[0],
               'ngo': labels['ngo'].values[0],
               'association': labels['association'].values[0]}

    def _load_domains(self):
        doms = pd.read_csv('dataset/dom_lbl.csv')
        dom_list = doms[doms['hiv'].notnull()]['domain'].tolist()
        return dom_list

    def _has_content(self, txt):
        for t in txt:
            if t not in ['\n', '\t', ' ', '\r']:
                return True
        return False

class DataSetEnricher(scrapy.Spider):
    # TODO: Change custom setting when not debugging
    name = 'dataset_enricher'
    start_urls = []
    custom_settings = {'ITEM_PIPELINES': {'hiv_scraping.pipelines.EnrichPipeline': 300}}
    dom_lbl = pd.read_csv('dataset/dom_lbl.csv')

    def start_requests(self):
        return [scrapy.Request(dom, callback=self.parse) for dom in self._load_domains()]

    def parse(self, response):
        sel = Selector(response=response)
        raw_dump = sel.xpath('//body/descendant-or-self::*[not(self::script)]/text()').extract()
        word_dump = ' '.join([txt for txt in raw_dump if self._has_content(txt)])
        yield {'domain': trim_url(response.request.url),
               'about_dump': word_dump}

    def _load_domains(self):
        doms = pd.read_csv('dataset/dom_lbl.csv')
        dom_list = doms[doms['hiv'].notnull()]['domain'].tolist()
        about_list = [d + "/about" for d in dom_list] + [d + "/about-us" for d in dom_list]
        return about_list

    def _has_content(self, txt):
        for t in txt:
            if t not in ['\n', '\t', ' ', '\r']:
                return True
...
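The spider above defines trim_url itself: the regex keeps only the scheme and host of a URL, and get_domain additionally drops the scheme. A minimal sketch of how these two helpers behave (the example URL is hypothetical):

import re

def trim_url(url):
    # Keep only 'scheme://host', dropping any path or query
    return re.match('^(?:http[s]?://)+[^/]*', url).group(0)

# Hypothetical URL, for illustration only
print(trim_url('https://example.org/pages/contact'))  # https://example.org
print(trim_url('https://example.org/pages/contact').split('://')[-1])  # example.org, i.e. what get_domain returns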


test_tools_web.py

Source: test_tools_web.py (GitHub)


...
ENCLOSING_PAIRS = [('(', ')'), ('[', ']'), ('{', '}'), ('<', '>')]

@pytest.mark.parametrize('trailing_char', TRAILING_CHARS)
def test_trim_url_remove_trailing_char(trailing_char):
    test_url = 'http://example.com/'
    assert trim_url(test_url + trailing_char) == test_url
    # assert trailing_char removed only if it is trailing
    test_url = 'http://example.com/' + trailing_char + 'content'
    assert trim_url(test_url) == test_url

@pytest.mark.parametrize('left, right', ENCLOSING_PAIRS)
def test_trim_url_remove_trailing_enclosing(left, right):
    # right without left => right is removed
    test_url = 'http://example.com/'
    assert test_url == trim_url(test_url + right)
    # right after path without left => right is removed
    test_url = 'http://example.com/a'
    assert test_url == trim_url(test_url + right)
    # trailing left without right => left is kept
    test_url = 'http://example.com/a' + left
    assert test_url == trim_url(test_url)
    # left before content without right => left is kept
    test_url = 'http://example.com/a' + left + 'something'
    assert test_url == trim_url(test_url)
    # left + content + right => right is kept
    assert test_url + right == trim_url(test_url + right)

@pytest.mark.parametrize('trailing_char', TRAILING_CHARS)
@pytest.mark.parametrize('left, right', ENCLOSING_PAIRS)
def test_trim_url_trailing_char_and_enclosing(trailing_char, left, right):
    test_url = 'http://example.com/'
    assert test_url == trim_url(test_url + right + trailing_char)
    # assert the trailing char is kept if there is something else
    test_url = 'http://example.com/' + trailing_char
...
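The implementation under test is not shown in this excerpt, but the assertions pin down the contract: trailing punctuation is stripped, and a trailing closing bracket is removed only when no matching opening bracket appears in the URL. A minimal sketch that satisfies the assertions above (TRAILING_CHARS is truncated out of the excerpt, so its value here is an assumption, as is the function body):

# Assumed value; the original TRAILING_CHARS is not visible in the excerpt
TRAILING_CHARS = ['.', ',', ':', ';', '!', '?']
ENCLOSING_PAIRS = [('(', ')'), ('[', ']'), ('{', '}'), ('<', '>')]

def trim_url(url):
    # Sketch only: repeatedly strip trailing punctuation and
    # unbalanced closing brackets from the end of the URL
    while url:
        if url[-1] in TRAILING_CHARS:
            url = url[:-1]
            continue
        for left, right in ENCLOSING_PAIRS:
            # Drop a trailing closing bracket only if its opener is absent
            if url[-1] == right and left not in url:
                url = url[:-1]
                break
        else:
            return url
    return url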


html.py

Source: html.py (GitHub)


...
        lead, middle, trail = match.groups()
        if middle.startswith('www.') or ('@' not in middle and not middle.startswith('http://') and \
                len(middle) > 0 and middle[0] in string.letters + string.digits and \
                (middle.endswith('.org') or middle.endswith('.net') or middle.endswith('.com'))):
            middle = '<a href="http://%s"%s>%s</a>' % (middle, nofollow_attr, trim_url(middle))
        if middle.startswith('http://') or middle.startswith('https://'):
            middle = '<a href="%s"%s>%s</a>' % (middle, nofollow_attr, trim_url(middle))
        if '@' in middle and not middle.startswith('www.') and not ':' in middle \
                and simple_email_re.match(middle):
            middle = '<a href="mailto:%s">%s</a>' % (middle, middle)
        if lead + middle + trail != word:
            words[i] = lead + middle + trail
    return ''.join(words)

def clean_html(text):
    """
    Cleans the given HTML. Specifically, it does the following:
        * Converts <b> and <i> to <strong> and <em>.
        * Encodes all ampersands correctly.
        * Removes all "target" attributes from <a> tags.
        * Removes extraneous HTML, such as presentational tags that open and
          immediately close and <br clear="all">.
...
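In this (older, Python 2-era Django) urlize helper, trim_url plays a different role from the previous snippets: it shortens the visible link text to a maximum length rather than normalising the URL itself. A rough sketch of that behaviour, assuming a trim_url(x, limit) helper along the lines of Django's legacy implementation:

def trim_url(x, limit=None):
    # If the text exceeds the limit, truncate and append an ellipsis;
    # with no limit, return it unchanged (assumed behaviour)
    if limit is None or len(x) <= limit:
        return x
    return '%s...' % x[:max(0, limit - 3)]

print(trim_url('http://example.com/some/long/path', limit=20))  # http://example.co...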


test_utils.py

Source: test_utils.py (GitHub)


...
            err.get_error()["error"] == data["error"],
            err.get_error()["errcode"] == data["errcode"],
        ]
    )

def test_trim_url():
    """Test trim_url"""
    url = "https://example.com"
    assert trim_url(url) == url
    assert trim_url(f"{url}/") == url
    path = "/foo/bar"
    assert trim_url(path) == path
    assert trim_url(f"{path}/") == path

def get_nodeinfo_index(base: str):
    resp = {
        "links": [
            {
                "href": f"{base}/.well-known/nodeinfo/2.0.json",
                "rel": "http://nodeinfo.diaspora.software/ns/schema/2.0",
            }
        ]
    }
    return (f"{base}/.well-known/nodeinfo", resp)

def get_nodeinfo_resp():
    nodeinfo = {
        "version": "2.0",
        "software": {
...
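Here the contract is simpler still: trim_url normalises away a trailing slash, for absolute URLs and bare paths alike. A one-line sketch that would satisfy these assertions (an assumption, not this project's actual implementation):

def trim_url(url: str) -> str:
    # Drop trailing slashes so 'https://example.com/' and
    # 'https://example.com' map to the same key
    return url.rstrip("/")

assert trim_url("https://example.com/") == "https://example.com"
assert trim_url("/foo/bar/") == "/foo/bar"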


Playwright tutorial

LambdaTest’s Playwright tutorial will give you a broader idea of the Playwright automation framework, its unique features, and its use cases, with examples to deepen your understanding of Playwright testing. This tutorial offers A-to-Z guidance, from installing the Playwright framework to best practices and advanced concepts.

Chapters:

  1. What is Playwright: Playwright is comparatively new but has quickly gained popularity. Get to know some of Playwright's history, along with a few interesting facts about it.
  2. How To Install Playwright: Learn in detail what basic configuration and dependencies are required to install Playwright and run a test. Get step-by-step directions for installing the Playwright automation framework.
  3. Playwright Futuristic Features: Launched in 2020, Playwright quickly gained huge popularity because of compelling features such as the Playwright Test Generator and Inspector, the Playwright Reporter, and the Playwright auto-waiting mechanism. Read up on those features to master Playwright testing.
  4. What is Component Testing: Component testing in Playwright is a unique feature that allows a tester to test a single component of a web application without integrating it with other elements. Learn how to perform component testing in the Playwright automation framework.
  5. Inputs And Buttons In Playwright: Every website has input boxes and buttons; learn about testing inputs and buttons across different scenarios, with examples.
  6. Functions and Selectors in Playwright: Learn how to launch the Chromium browser with Playwright. Also gain a better understanding of some important functions, such as “BrowserContext,” which lets you run multiple isolated browser sessions, and “newPage,” which gives you a page to interact with (see the minimal Python sketch after this list).
  7. Handling Alerts and Dropdowns in Playwright: Playwright interacts with different types of alerts and pop-ups (simple, confirmation, and prompt) and different types of dropdowns (single-selector and multi-selector). Get hands-on with handling alerts and dropdowns in Playwright testing.
  8. Playwright vs Puppeteer: Get to know the differences between the two testing frameworks: how they differ from one another, which browsers they support, and what features they provide.
  9. Run Playwright Tests on LambdaTest: Playwright testing with LambdaTest takes test performance to the next level. You can run multiple Playwright tests in parallel on the LambdaTest cloud. Get a step-by-step guide to running your Playwright tests on the LambdaTest platform.
  10. Playwright Python Tutorial: The Playwright automation framework supports all major languages, including Python, JavaScript, TypeScript, and .NET. Python end-to-end testing with Playwright has particular advantages because of Python's versatility. Get the hang of Playwright Python testing with this chapter.
  11. Playwright End To End Testing Tutorial: Get hands-on with Playwright end-to-end testing and learn to use some exciting features such as Trace Viewer, debugging, networking, component testing, visual testing, and many more.
  12. Playwright Video Tutorial: Watch video tutorials on Playwright testing from experts and get an in-depth, step-by-step explanation of Playwright automation testing.
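As a taste of the Python API touched on in chapters 6 and 10, here is a minimal, self-contained Playwright Python script: it launches Chromium, opens an isolated browser context and a page, navigates, and prints the page title. (In Python, the JavaScript-style browserContext and newPage become the snake_case new_context() and new_page(); the target URL is just an example.)

from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    # Launch Chromium; set headless=False to watch the browser
    browser = p.chromium.launch(headless=True)
    # A browser context is an isolated, incognito-like session
    context = browser.new_context()
    # A page is a single tab within that context
    page = context.new_page()
    page.goto("https://example.com")  # example URL
    print(page.title())
    browser.close()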

Run Playwright Python automation tests on LambdaTest cloud grid

Perform automation testing on 3000+ real desktop and mobile devices online.

