Best Python code snippet using playwright-python
Source:comments.py  
import scrapy
from scrapy.loader import ItemLoader
from scrapy.exceptions import CloseSpider
from fbcrawl.spiders.fbcrawl import FacebookSpider
from fbcrawl.items import CommentsItem, parse_date, parse_date2
from datetime import datetime
class CommentsSpider(FacebookSpider):
    """
    Parse FB comments, given a post (needs credentials)
    """
    name = "comments"
    custom_settings = {
        'FEED_EXPORT_FIELDS': ['source','reply_to','date','reactions','text', \
                               'source_url','url'],
        'DUPEFILTER_CLASS' : 'scrapy.dupefilters.BaseDupeFilter',
        'CONCURRENT_REQUESTS' : 1
    }
    def __init__(self, *args, **kwargs):
        if 'post' in kwargs and 'page' in kwargs:
            raise AttributeError('You need to specify only one between post and page')
        elif 'post' in kwargs:
            self.page = kwargs['post']
            self.type = 'post'
        elif 'page' in kwargs:
            self.type = 'page'

        super().__init__(*args,**kwargs)
    def parse_page(self, response):
        '''
        '''
        if self.type == 'post':
            yield scrapy.Request(url=response.url,
                                 callback=self.parse_post,
                                 priority=10,
                                 meta={'index':1})
        elif self.type == 'page':
            #select all posts
            for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"):
                many_features = post.xpath('./@data-ft').get()
                date = []
                date.append(many_features)
                date = parse_date(date,{'lang':self.lang})
                current_date = datetime.strptime(date,'%Y-%m-%d %H:%M:%S') if date is not None else date

                if current_date is None:
                    date_string = post.xpath('.//abbr/text()').get()
                    date = parse_date2([date_string],{'lang':self.lang})
                    current_date = datetime(date.year,date.month,date.day) if date is not None else date
                    date = str(date)
                if abs(self.count) + 1 > self.max:
                    raise CloseSpider('Reached max num of post: {}. Crawling finished'.format(abs(self.count)))
                self.logger.info('Parsing post n = {}, post_date = {}'.format(abs(self.count)+1,date))
                #returns full post-link in a list
                post = post.xpath(".//a[contains(@href,'footer')]/@href").extract()
                temp_post = response.urljoin(post[0])
                self.count -= 1
                yield scrapy.Request(temp_post,
                                     self.parse_post,
                                     priority = self.count,
                                     meta={'index':1})

            #load following page, try to click on "more"
            #after a few pages have been scraped, the "more" link might disappear
            #if not present, look for the highest year not parsed yet
            #click once on the year and go back to clicking "more"
            #new_page is different for groups
            if self.group == 1:
                new_page = response.xpath("//div[contains(@id,'stories_container')]/div[2]/a/@href").extract()
            else:
                new_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract()
                #this is why lang is needed
            if not new_page:
                self.logger.info('[!] "more" link not found, will look for a "year" link')
                #self.k is the year link that we look for
                if response.meta['flag'] == self.k and self.k >= self.year:
                    xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href"
                    new_page = response.xpath(xpath).extract()
                    if new_page:
                        new_page = response.urljoin(new_page[0])
                        self.k -= 1
                        self.logger.info('Found a link for year "{}", new_page = {}'.format(self.k,new_page))
                        yield scrapy.Request(new_page,
                                             callback=self.parse_page,
                                             priority = -1000,
                                             meta={'flag':self.k})
                    else:
                        while not new_page: #sometimes the years are skipped, this handles small year gaps
                            self.logger.info('Link not found for year {}, trying with previous year {}'.format(self.k,self.k-1))
                            self.k -= 1
                            if self.k < self.year:
                                raise CloseSpider('Reached date: {}. Crawling finished'.format(self.date))
                            xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href"
                            new_page = response.xpath(xpath).extract()
                        self.logger.info('Found a link for year "{}", new_page = {}'.format(self.k,new_page))
                        new_page = response.urljoin(new_page[0])
                        self.k -= 1
                        yield scrapy.Request(new_page,
                                             callback=self.parse_page,
                                             priority = -1000,
                                             meta={'flag':self.k})
                else:
                    self.logger.info('Crawling has finished with no errors!')
            else:
                new_page = response.urljoin(new_page[0])
                if 'flag' in response.meta:
                    self.logger.info('Page scraped, clicking on "more"! new_page = {}'.format(new_page))
                    yield scrapy.Request(new_page,
                                         callback=self.parse_page,
                                         priority = -1000,
                                         meta={'flag':response.meta['flag']})
                else:
                    self.logger.info('First page scraped, clicking on "more"! new_page = {}'.format(new_page))
                    yield scrapy.Request(new_page,
                                         callback=self.parse_page,
                                         priority = -1000,
                                         meta={'flag':self.k})
    def parse_post(self, response):
        '''
        parse post does multiple things:
            1) loads replied-to-comments page one-by-one (for DFS)
            2) call parse_reply on the nested comments
            3) adds simple (not-replied-to) comments
            4) follows to new comment page
        '''
        #load replied-to comments pages
        #select nested comment one-by-one matching with the index: response.meta['index']
        path = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and .//div[contains(@id,"comment_replies")]]' + '[' + str(response.meta['index']) + ']'
        group_flag = response.meta['group'] if 'group' in response.meta else None
        for reply in response.xpath(path):
            source = reply.xpath('.//h3/a/text()').extract()
            answer = reply.xpath('.//a[contains(@href,"repl")]/@href').extract()
            ans = response.urljoin(answer[::-1][0])
            self.logger.info('{} nested comment'.format(str(response.meta['index'])))
            yield scrapy.Request(ans,
                                 callback=self.parse_reply,
                                 priority=1000,
                                 meta={'reply_to':source,
                                       'url':response.url,
                                       'index':response.meta['index'],
                                       'flag':'init',
                                       'group':group_flag})
        #load regular comments
        if not response.xpath(path): #prevents from exec
            path2 = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and not(.//div[contains(@id,"comment_replies")])]'
            for i,reply in enumerate(response.xpath(path2)):
                self.logger.info('{} regular comment'.format(i+1))
                new = ItemLoader(item=CommentsItem(),selector=reply)
                new.context['lang'] = self.lang
                new.add_xpath('source','.//h3/a/text()')
                new.add_xpath('source_url','.//h3/a/@href')
                new.add_xpath('text','.//div[h3]/div[1]//text()')
                new.add_xpath('date','.//abbr/text()')
                new.add_xpath('reactions','.//a[contains(@href,"reaction/profile")]//text()')
                new.add_value('url',response.url)
                yield new.load_item()

        #new comment page
        if not response.xpath(path):
            #for groups
            next_xpath = './/div[contains(@id,"see_next")]'
            prev_xpath = './/div[contains(@id,"see_prev")]'
            if not response.xpath(next_xpath) or group_flag == 1:
                for next_page in response.xpath(prev_xpath):
                    new_page = next_page.xpath('.//@href').extract()
                    new_page = response.urljoin(new_page[0])
                    self.logger.info('New page to be crawled {}'.format(new_page))
                    yield scrapy.Request(new_page,
                                         callback=self.parse_post,
                                         meta={'index':1,
                                               'group':1})
            else:
                for next_page in response.xpath(next_xpath):
                    new_page = next_page.xpath('.//@href').extract()
                    new_page = response.urljoin(new_page[0])
                    self.logger.info('New page to be crawled {}'.format(new_page))
                    yield scrapy.Request(new_page,
                                         callback=self.parse_post,
                                         meta={'index':1,
                                               'group':group_flag})

    def parse_reply(self,response):
        '''
        parse reply to comments, root comment is added if flag
        '''
#        from scrapy.utils.response import open_in_browser
#        open_in_browser(response)

        if response.meta['flag'] == 'init':
            #parse root comment
            for root in response.xpath('//div[contains(@id,"root")]/div/div/div[count(@id)!=1 and contains("0123456789", substring(@id,1,1))]'):
                new = ItemLoader(item=CommentsItem(),selector=root)
                new.context['lang'] = self.lang
                new.add_xpath('source','.//h3/a/text()')
                new.add_xpath('source_url','.//h3/a/@href')
                new.add_value('reply_to','ROOT')
                new.add_xpath('text','.//div[1]//text()')
                new.add_xpath('date','.//abbr/text()')
                new.add_xpath('reactions','.//a[contains(@href,"reaction/profile")]//text()')
                new.add_value('url',response.url)
                yield new.load_item()
            #parse all replies in the page
            for reply in response.xpath('//div[contains(@id,"root")]/div/div/div[count(@id)=1 and contains("0123456789", substring(@id,1,1))]'):
                new = ItemLoader(item=CommentsItem(),selector=reply)
                new.context['lang'] = self.lang
                new.add_xpath('source','.//h3/a/text()')
                new.add_xpath('source_url','.//h3/a/@href')
                new.add_value('reply_to',response.meta['reply_to'])
                new.add_xpath('text','.//div[h3]/div[1]//text()')
                new.add_xpath('date','.//abbr/text()')
                new.add_xpath('reactions','.//a[contains(@href,"reaction/profile")]//text()')
                new.add_value('url',response.url)
                yield new.load_item()

            back = response.xpath('//div[contains(@id,"comment_replies_more_1")]/a/@href').extract()
            if back:
                self.logger.info('Back found, more nested comments')
                back_page = response.urljoin(back[0])
                yield scrapy.Request(back_page,
                                     callback=self.parse_reply,
                                     priority = 1000,
                                     meta={'reply_to':response.meta['reply_to'],
                                           'flag':'back',
                                           'url':response.meta['url'],
                                           'index':response.meta['index'],
                                           'group':response.meta['group']})
            else:
                next_reply = response.meta['url']
                self.logger.info('Nested comments crawl finished, heading to proper page: {}'.format(response.meta['url']))
                yield scrapy.Request(next_reply,
                                     callback=self.parse_post,
                                     meta={'index':response.meta['index']+1,
                                           'group':response.meta['group']})

        elif response.meta['flag'] == 'back':
            #parse all comments
            for reply in response.xpath('//div[contains(@id,"root")]/div/div/div[count(@id)=1 and contains("0123456789", substring(@id,1,1))]'):
                new = ItemLoader(item=CommentsItem(),selector=reply)
                new.context['lang'] = self.lang
                new.add_xpath('source','.//h3/a/text()')
                new.add_xpath('source_url','.//h3/a/@href')
                new.add_value('reply_to',response.meta['reply_to'])
                new.add_xpath('text','.//div[h3]/div[1]//text()')
                new.add_xpath('date','.//abbr/text()')
                new.add_xpath('reactions','.//a[contains(@href,"reaction/profile")]//text()')
                new.add_value('url',response.url)
                yield new.load_item()
            #keep going backwards
            back = response.xpath('//div[contains(@id,"comment_replies_more_1")]/a/@href').extract()
            self.logger.info('Back found, more nested comments')
            if back:
                back_page = response.urljoin(back[0])
                yield scrapy.Request(back_page,
                                     callback=self.parse_reply,
                                     priority=1000,
                                     meta={'reply_to':response.meta['reply_to'],
                                           'flag':'back',
                                           'url':response.meta['url'],
                                           'index':response.meta['index'],
                                           'group':response.meta['group']})
            else:
                next_reply = response.meta['url']
                self.logger.info('Nested comments crawl finished, heading to home page: {}'.format(response.meta['url']))
                yield scrapy.Request(next_reply,
                                     callback=self.parse_post,
                                     meta={'index':response.meta['index']+1,
                                           'group':response.meta['group']})

# =============================================================================
# CRAWL REACTIONS
# =============================================================================
#    def parse_reactions(self,response):
#        new = ItemLoader(item=CommentsItem(),response=response, parent=response.meta['item'])
#        new.context['lang'] = self.lang
#        new.add_xpath('likes',"//a[contains(@href,'reaction_type=1')]/span/text()")
#        new.add_xpath('ahah',"//a[contains(@href,'reaction_type=4')]/span/text()")
#        new.add_xpath('love',"//a[contains(@href,'reaction_type=2')]/span/text()")
#        new.add_xpath('wow',"//a[contains(@href,'reaction_type=3')]/span/text()")
#        new.add_xpath('sigh',"//a[contains(@href,'reaction_type=7')]/span/text()")
#        new.add_xpath('grrr',"//a[contains(@href,'reaction_type=8')]/span/text()")
#        yield new.load_item()
#
#    #substitute
#    yield new.load_item()
#    ‾‾‾‾‾‾‾‾‾|‾‾‾‾‾‾‾‾‾‾‾
#    _________v___
#    #response --> reply/root
#    reactions = response.xpath(".//a[contains(@href,'reaction/profile')]/@href")
#    reactions = response.urljoin(reactions[0].extract())
#    if reactions:
#        yield scrapy.Request(reactions, callback=self.parse_reactions, meta={'item':new})
#    else:
...
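The spider above is normally launched through Scrapy's crawling machinery. Below is a minimal, hypothetical driver script showing one way CommentsSpider could be run programmatically; the import path and the email, password and lang keyword arguments are assumptions about the parent FacebookSpider (which is not part of this snippet), and the post URL is a placeholder.

# Hypothetical driver for the spider above; a sketch, not part of fbcrawl.
from scrapy.crawler import CrawlerProcess
from fbcrawl.spiders.comments import CommentsSpider  # assumed module path

process = CrawlerProcess(settings={
    'FEED_FORMAT': 'csv',        # legacy feed settings; recent Scrapy releases use FEEDS instead
    'FEED_URI': 'comments.csv',
})
process.crawl(CommentsSpider,
              email='user@example.com',   # login credential, assumed to be handled by FacebookSpider
              password='secret',          # login credential, assumed to be handled by FacebookSpider
              lang='en',                  # interface language, assumed to be handled by FacebookSpider
              post='https://mbasic.facebook.com/story.php?story_fbid=1234567890&id=100000000')  # placeholder URL
process.start()

The same arguments can equally be passed on the command line, e.g. scrapy crawl comments -a post=... -a email=... -a password=....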
Source:profiles.py  
import scrapy
from scrapy.loader import ItemLoader
from scrapy.exceptions import CloseSpider
from fbcrawl.spiders.fbcrawl import FacebookSpider
from fbcrawl.items import ProfileItem, parse_date, parse_date2
from datetime import datetime
class ProfileSpider(FacebookSpider):
    """
    Parse FB profiles
    """
    name = "profiles"
    custom_settings = {
        'FEED_EXPORT_FIELDS': ['name','gender','birthday','current_city',
                               'hometown','work','education','interested_in',
                               'page'],
        'DUPEFILTER_CLASS' : 'scrapy.dupefilters.BaseDupeFilter',
        'CONCURRENT_REQUESTS' : 1
    }
    def __init__(self, *args, **kwargs):
        if 'post' in kwargs and 'page' in kwargs:
            raise AttributeError('You need to specify only one between post and page')
        elif 'post' in kwargs:
            self.page = kwargs['post']
            self.type = 'post'
        elif 'page' in kwargs:
            self.type = 'page'

        super().__init__(*args,**kwargs)
    def parse_page(self, response):
        '''
        '''
        if self.type == 'post':
            yield scrapy.Request(url=response.url,
                                 callback=self.parse_post,
                                 priority=10,
                                 meta={'index':1})
        elif self.type == 'page':
            #select all posts
            for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"):
                many_features = post.xpath('./@data-ft').get()
                date = []
                date.append(many_features)
                date = parse_date(date,{'lang':self.lang})
                current_date = datetime.strptime(date,'%Y-%m-%d %H:%M:%S') if date is not None else date

                if current_date is None:
                    date_string = post.xpath('.//abbr/text()').get()
                    date = parse_date2([date_string],{'lang':self.lang})
                    current_date = datetime(date.year,date.month,date.day) if date is not None else date
                    date = str(date)
                if abs(self.count) + 1 > self.max:
                    raise CloseSpider('Reached max num of post: {}. Crawling finished'.format(abs(self.count)))
                self.logger.info('Parsing post n = {}, post_date = {}'.format(abs(self.count)+1,date))
                #returns full post-link in a list
                post = post.xpath(".//a[contains(@href,'footer')]/@href").extract()
                temp_post = response.urljoin(post[0])
                self.count -= 1
                yield scrapy.Request(temp_post,
                                     self.parse_post,
                                     priority = self.count,
                                     meta={'index':1})

            #load following page, try to click on "more"
            #after a few pages have been scraped, the "more" link might disappear
            #if not present, look for the highest year not parsed yet
            #click once on the year and go back to clicking "more"
            #new_page is different for groups
            if self.group == 1:
                new_page = response.xpath("//div[contains(@id,'stories_container')]/div[2]/a/@href").extract()
            else:
                new_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract()
                #this is why lang is needed
            if not new_page:
                self.logger.info('[!] "more" link not found, will look for a "year" link')
                #self.k is the year link that we look for
                if response.meta['flag'] == self.k and self.k >= self.year:
                    xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href"
                    new_page = response.xpath(xpath).extract()
                    if new_page:
                        new_page = response.urljoin(new_page[0])
                        self.k -= 1
                        self.logger.info('Found a link for year "{}", new_page = {}'.format(self.k,new_page))
                        yield scrapy.Request(new_page,
                                             callback=self.parse_page,
                                             priority = -1000,
                                             meta={'flag':self.k})
                    else:
                        while not new_page: #sometimes the years are skipped, this handles small year gaps
                            self.logger.info('Link not found for year {}, trying with previous year {}'.format(self.k,self.k-1))
                            self.k -= 1
                            if self.k < self.year:
                                raise CloseSpider('Reached date: {}. Crawling finished'.format(self.date))
                            xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href"
                            new_page = response.xpath(xpath).extract()
                        self.logger.info('Found a link for year "{}", new_page = {}'.format(self.k,new_page))
                        new_page = response.urljoin(new_page[0])
                        self.k -= 1
                        yield scrapy.Request(new_page,
                                             callback=self.parse_page,
                                             priority = -1000,
                                             meta={'flag':self.k})
                else:
                    self.logger.info('Crawling has finished with no errors!')
            else:
                new_page = response.urljoin(new_page[0])
                if 'flag' in response.meta:
                    self.logger.info('Page scraped, clicking on "more"! new_page = {}'.format(new_page))
                    yield scrapy.Request(new_page,
                                         callback=self.parse_page,
                                         priority = -1000,
                                         meta={'flag':response.meta['flag']})
                else:
                    self.logger.info('First page scraped, clicking on "more"! new_page = {}'.format(new_page))
                    yield scrapy.Request(new_page,
                                         callback=self.parse_page,
                                         priority = -1000,
                                         meta={'flag':self.k})
    def parse_post(self, response):
        '''
        parse post does multiple things:
            1) loads replied-to-comments page one-by-one (for DFS)
            2) call parse_reply on the nested comments
            3) adds simple (not-replied-to) comments
            4) follows to new comment page
        '''
        #load replied-to comments pages
        #select nested comment one-by-one matching with the index: response.meta['index']
        path = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and .//div[contains(@id,"comment_replies")]]' + '[' + str(response.meta['index']) + ']'
        group_flag = response.meta['group'] if 'group' in response.meta else None
        for reply in response.xpath(path):
            rep = reply.xpath('.//h3/a/@href').get()
            profile = 'https://mbasic.facebook.com' + rep[:rep.find('?rc')] + '/about'
            yield scrapy.Request(profile,
                                 callback=self.parse_profile,
                                 priority=1000,
                                 meta={'url':response.url,
                                       'index':response.meta['index'],
                                       'flag':'init',
                                       'group':group_flag})
        #load regular comments
        if not response.xpath(path): #prevents from exec
            path2 = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and not(.//div[contains(@id,"comment_replies")])]'
            for i,reply in enumerate(response.xpath(path2)):
                self.logger.info('{} regular comment'.format(i+1))
                rep = reply.xpath('.//h3/a/@href').get()
                profile = 'https://mbasic.facebook.com' + rep[:rep.find('?rc')] + '/about'
                yield scrapy.Request(profile,
                                     callback=self.parse_profile,
                                     priority=1000,
                                     meta={'url':response.url,
                                           'index':response.meta['index'],
                                           'flag':'init',
                                           'group':group_flag})

        #new comment page
        if not response.xpath(path):
            #for groups
            next_xpath = './/div[contains(@id,"see_next")]'
            prev_xpath = './/div[contains(@id,"see_prev")]'
            if not response.xpath(next_xpath) or group_flag == 1:
                for next_page in response.xpath(prev_xpath):
                    new_page = next_page.xpath('.//@href').extract()
                    new_page = response.urljoin(new_page[0])
                    self.logger.info('New page to be crawled {}'.format(new_page))
                    yield scrapy.Request(new_page,
                                         callback=self.parse_post,
                                         meta={'index':1,
                                               'group':1})
            else:
                for next_page in response.xpath(next_xpath):
                    new_page = next_page.xpath('.//@href').extract()
                    new_page = response.urljoin(new_page[0])
                    self.logger.info('New page to be crawled {}'.format(new_page))
                    yield scrapy.Request(new_page,
                                         callback=self.parse_post,
                                         meta={'index':1,
                                               'group':group_flag})

    def parse_reply(self,response):
        '''
        parse reply to comments, root comment is added if flag
        '''
#        from scrapy.utils.response import open_in_browser
#        open_in_browser(response)

        if response.meta['flag'] == 'init':
            #parse root comment
            for root in response.xpath('//div[contains(@id,"root")]/div/div/div[count(@id)!=1 and contains("0123456789", substring(@id,1,1))]'):
                rep = root.xpath('.//h3/a/@href').get()
                profile = 'https://mbasic.facebook.com' + rep[:rep.find('?rc')] + '/about'
                yield scrapy.Request(profile,
                                     callback=self.parse_profile,
                                     priority=1000,
                                     meta={'url':response.url,
                                           'index':response.meta['index'],
                                           'flag':'init',
                                           'group':response.meta['group']})
            #parse all replies in the page
            for reply in response.xpath('//div[contains(@id,"root")]/div/div/div[count(@id)=1 and contains("0123456789", substring(@id,1,1))]'):
                rep = reply.xpath('.//h3/a/@href').get()
                profile = 'https://mbasic.facebook.com' + rep[:rep.find('?rc')] + '/about'
                yield scrapy.Request(profile,
                                     callback=self.parse_profile,
                                     priority=1000,
                                     meta={'url':response.url,
                                           'index':response.meta['index'],
                                           'flag':'init',
                                           'group':response.meta['group']})

            back = response.xpath('//div[contains(@id,"comment_replies_more_1")]/a/@href').extract()
            if back:
                self.logger.info('Back found, more nested comments')
                back_page = response.urljoin(back[0])
                yield scrapy.Request(back_page,
                                     callback=self.parse_reply,
                                     priority = 1000,
                                     meta={'reply_to':response.meta['reply_to'],
                                           'flag':'back',
                                           'url':response.meta['url'],
                                           'index':response.meta['index'],
                                           'group':response.meta['group']})
            else:
                next_reply = response.meta['url']
                self.logger.info('Nested comments crawl finished, heading to proper page: {}'.format(response.meta['url']))
                yield scrapy.Request(next_reply,
                                     callback=self.parse_post,
                                     meta={'index':response.meta['index']+1,
                                           'group':response.meta['group']})

        elif response.meta['flag'] == 'back':
            #parse all comments
            for reply in response.xpath('//div[contains(@id,"root")]/div/div/div[count(@id)=1 and contains("0123456789", substring(@id,1,1))]'):
                rep = reply.xpath('.//h3/a/@href').extract()[0]
                profile = 'https://mbasic.facebook.com' + rep[:rep.find('?rc')] + '/about'
                yield scrapy.Request(profile,
                                     callback=self.parse_profile,
                                     priority=1000,
                                     meta={'url':response.url,
                                           'index':response.meta['index'],
                                           'flag':'init',
                                           'group':response.meta['group']})
            #keep going backwards
            back = response.xpath('//div[contains(@id,"comment_replies_more_1")]/a/@href').extract()
            self.logger.info('Back found, more nested comments')
            if back:
                back_page = response.urljoin(back[0])
                yield scrapy.Request(back_page,
                                     callback=self.parse_reply,
                                     priority=1000,
                                     meta={'reply_to':response.meta['reply_to'],
                                           'flag':'back',
                                           'url':response.meta['url'],
                                           'index':response.meta['index'],
                                           'group':response.meta['group']})
            else:
                next_reply = response.meta['url']
                self.logger.info('Nested comments crawl finished, heading to home page: {}'.format(response.meta['url']))
                yield scrapy.Request(next_reply,
                                     callback=self.parse_post,
                                     meta={'index':response.meta['index']+1,
                                           'group':response.meta['group']})

    def parse_profile(self,response):
        new = ItemLoader(item=ProfileItem(),response=response)
        self.logger.info('Crawling profile info')
        new.add_xpath('name','//span/div/span/strong/text()')
        new.add_xpath('gender',"//div[@id='basic-info']//div[@title='Gender']//div/text()")
        new.add_xpath('birthday',"//div[@id='basic-info']//div[@title='Birthday']//div/text()")
        new.add_xpath('current_city',"//div[@id='living']//div[@title='Current City']//a/text()")
        new.add_xpath('hometown',"//div[@id='living']//div[@title='Hometown']//a/text()")
        new.add_xpath('work',"//div[@id='work']//a/text()")
        new.add_xpath('education',"//div[@id='education']//a/text()")
        new.add_xpath('interested_in',"//div[@id='interested-in']//div[not(contains(text(),'Interested In'))]/text()")
        new.add_xpath('page',"//div[@id='contact-info']//div[@title='Facebook']//div/text()")
...
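The href trimming used throughout parse_post and parse_reply above is easier to see in isolation. The snippet below is a small sketch; the sample href value is invented.

# Sketch of the profile-URL construction used by ProfileSpider above.
rep = '/john.doe.735?rc=p&__tn__=R'  # hypothetical href of a comment author
profile = 'https://mbasic.facebook.com' + rep[:rep.find('?rc')] + '/about'
print(profile)  # https://mbasic.facebook.com/john.doe.735/about

Note that str.find returns -1 when '?rc' is missing, in which case the slice silently drops the last character of the href; a hardened version would check for that case before slicing.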
Source:helpers.py  
...
from v1.models.sublanding_page import SublandingPage
def save_page(page):
    page.save()
    return page.save_revision()
def save_new_page(child, root=None):
    if not root:
        root = HomePage.objects.get(title="CFGov")
    root.add_child(instance=child)
    return save_page(page=child)
def publish_page(child):
    revision = save_new_page(child=child)
    revision.publish()
def publish_changes(child):
    revision = save_page(page=child)
    revision.publish()
def get_parent_route(site, parent_path=None):
    # return list of route paths
    root = site.root_page
    # since parent was not provided, make root
    parent = root
    # if a parent path is provided, use that as parent
    if parent_path:
        path_components = [
            component for component in parent_path.split("/") if component
        ]
...
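A brief, hypothetical example of how the publish helpers above might be used in a test. SublandingPage is the model already imported by the module; the title and slug values are invented.

# Hypothetical test usage of the helpers above.
page = SublandingPage(title="Test sublanding page", slug="test-sublanding-page")
publish_page(child=page)       # adds the page under the "CFGov" home page and publishes the revision
page.title = "Updated title"
publish_changes(child=page)    # saves a new revision of the existing page and publishes it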
Source:test_infobase.py  
from infogami.infobase import server
import web
import unittest
import urllib, urllib2
import simplejson
def browser():
    if web.config.get('test_url'):
        b = web.browser.Browser()
        b.open('http://0.0.0.0:8080')
        return b
    else:
        return server.app.browser()
b = browser()
def request(path, method="GET", data=None, headers={}):
    if method == 'GET' and data is not None:
        path = path + '?' + urllib.urlencode(data)
        data = None
    if isinstance(data, dict):
        data = simplejson.dumps(data)
    url = urllib.basejoin(b.url, path)
    req = urllib2.Request(url, data, headers)
    req.get_method = lambda: method
    b.do_request(req)
    if b.status == 200:
        return b.data and simplejson.loads(b.data)
    else:
        return None
def get(key):
    d = request('/test/get?key=' + key)
    return d
def echo(msg):
    request('/_echo', method='POST', data=msg)
def save(query):
    return request('/test/save' + query['key'], method='POST', data=query)
def save_many(query, comment=''):
    return request('/test/save_many', method='POST', data=urllib.urlencode({'query': simplejson.dumps(query), 'comment': comment}))

class DatabaseTest(unittest.TestCase):
    pass

class InfobaseTestCase(unittest.TestCase):
    def clear_threadlocal(self):
        import threading
        t = threading.currentThread()
        if hasattr(t, '_d'):
            del t._d
    def setUp(self):
        self.clear_threadlocal()
        global b
        b = browser()
        try:
            # create new database with name "test"
            self.assertEquals2(request("/test", method="PUT"), {"ok": True})
        except Exception:
            self.tearDown()
            raise
        # reset browser cookies
        b.reset()
    def tearDown(self):
        self.clear_threadlocal()
        # delete test database
        request('/test', method="DELETE")
    def assertEquals2(self, a, b):
        """Asserts two objects are same.
        """
        # special case to say don't worry about this value.
        if b == '*':
            return True
        elif isinstance(a, dict):
            self.assertTrue(isinstance(b, dict))
            # key '*' means skip additional keys.
            skip_additional = b.pop('*', False)
            if not skip_additional:
                self.assertEquals(a.keys(), b.keys())
            for k in b.keys():
                self.assertEquals2(a[k], b[k])
        elif isinstance(a, list):
            self.assertEquals(len(a), len(b))
            for x, y in zip(a, b):
                self.assertEquals2(x, y)
        else:
            self.assertEquals(a, b)
class DocumentTest(InfobaseTestCase):
    def test_simple(self):
        self.assertEquals2(request('/'), {'infobase': 'welcome', 'version': '*'})
        self.assertEquals2(request('/test'), {'name': 'test'})
        self.assertEquals2(request('/test/get?key=/type/type'), {'key': '/type/type', 'type': {'key': '/type/type'}, '*': True})

        request('/test/get?key=/not-there')
        self.assertEquals(b.status, 404)

    def test_save(self):
        x = {'key': '/new_page', 'type': {'key': '/type/object'}, 'x': 1, 's': 'hello'}
        d = request('/test/save/new_page', method="POST", data=x)
        self.assertEquals(b.status, 200)
        self.assertEquals(d, {'key': '/new_page', 'revision': 1})

        # verify data
        d = request('/test/get?key=/new_page')
        expected = dict({'latest_revision': 1, 'revision': 1, '*': True}, **d)
        self.assertEquals2(d, expected)
        # nothing should be modified when saved with the same data.
        d = request('/test/save/new_page', method="POST", data=x)
        self.assertEquals(b.status, 200)
        self.assertEquals(d, {})
    def test_versions(self):
        x = {'key': '/new_page', 'type': {'key': '/type/object'}, 'x': 1, 's': 'hello'}
        d = request('/test/save/new_page', method="POST", data=x)
        # verify revisions
        q = {'key': '/new_page'}
        d = request('/test/versions', method='GET', data={'query': simplejson.dumps({'key': '/new_page'})})
        self.assertEquals2(d, [{'key': '/new_page', 'revision': 1, '*': True}])
        d = request('/test/versions', method='GET', data={'query': simplejson.dumps({'limit': 1})})
        self.assertEquals2(d, [{'key': '/new_page', 'revision': 1, '*': True}])

        # try a failed save and make sure new revisions are not created
        request('/test/save/new_page', method='POST', data={'key': '/new_page', 'type': '/type/no-such-type'})
        self.assertNotEquals(b.status, 200)
        q = {'key': '/new_page'}
        d = request('/test/versions', method='GET', data={'query': simplejson.dumps({'key': '/new_page'})})
        self.assertEquals2(d, [{'key': '/new_page', 'revision': 1, '*': True}])
        d = request('/test/versions', method='GET', data={'query': simplejson.dumps({'limit': 1})})
        self.assertEquals2(d, [{'key': '/new_page', 'revision': 1, '*': True}])
        # save the page and make sure new revision is created.
        d = request('/test/save/new_page', method='POST', data=dict(x, title='foo'))
        self.assertEquals(d, {'key': '/new_page', 'revision': 2})
        d = request('/test/versions', method='GET', data={'query': simplejson.dumps({'key': '/new_page'})})
        self.assertEquals2(d, [{'key': '/new_page', 'revision': 2, '*': True}, {'key': '/new_page', 'revision': 1, '*': True}])
    def test_save_many(self):
        q = [
            {'key': '/one', 'type': {'key': '/type/object'}, 'n': 1},
            {'key': '/two', 'type': {'key': '/type/object'}, 'n': 2}
        ]
        d = request('/test/save_many', method='POST', data=urllib.urlencode({'query': simplejson.dumps(q)}))
        self.assertEquals(d, [{'key': '/one', 'revision': 1}, {'key': '/two', 'revision': 1}])
        self.assertEquals2(get('/one'), {'key': '/one', 'type': {'key': '/type/object'}, 'n': 1, 'revision': 1, '*': True})
        self.assertEquals2(get('/two'), {'key': '/two', 'type': {'key': '/type/object'}, 'n': 2, 'revision': 1, '*': True})
        # saving with same data should not create new revisions
        d = request('/test/save_many', method='POST', data=urllib.urlencode({'query': simplejson.dumps(q)}))
        self.assertEquals(d, [])
        # try bad query
        q = [
            {'key': '/zero', 'type': {'key': '/type/object'}, 'n': 0},
            {'key': '/one', 'type': {'key': '/type/object'}, 'n': 11},
            {'key': '/two', 'type': {'key': '/type/no-such-type'}, 'n': 2}
        ]
        d = request('/test/save_many', method='POST', data=urllib.urlencode({'query': simplejson.dumps(q)}))
        self.assertNotEquals(b.status, 200)
        d = get('/zero')
        self.assertEquals(b.status, 404)
# create author, book and collection types to test validations
types = [{
    "key": "/type/author",
    "type": "/type/type",
    "kind": "regular",
    "properties": [{
        "name": "name",
        "expected_type": {"key": "/type/string"},
        "unique": True
    }, {
        "name": "bio",
        "expected_type": {"key": "/type/text"},
        "unique": True
    }]
}, {
    "key": "/type/book",
    "type": "/type/type",
    "kind": "regular",
    "properties": [{
        "name": "title",
        "expected_type": {"key": "/type/string"},
        "unique": True
    }, {
        "name": "authors",
        "expected_type": {"key": "/type/author"},
        "unique": False
    }, {
        "name": "publisher",
        "expected_type": {"key": "/type/string"},
        "unique": True
    }, {
        "name": "description",
        "expected_type": {"key": "/type/text"},
        "unique": True
    }]
}, {
    "key": "/type/collection",
    "type": "/type/type",
    "kind": "regular",
    "properties": [{
        "name": "name",
        "expected_type": {"key": "/type/string"},
        "unique": True
    }, {
        "name": "books",
        "expected_type": {"key": "/type/book"},
        "unique": False
    }]
}]
class MoreDocumentTest(DocumentTest):
    def setUp(self):
        DocumentTest.setUp(self)
        save_many(types)
    def test_save_validation(self):
        # ok: name is string
        d = save({'key': '/author/x', 'type': '/type/author', 'name': 'x'})
        self.assertEquals(b.status, 200)
        self.assertEquals(d, {"key": "/author/x", "revision": 1})

        # error: name is int instead of string
        d = save({'key': '/author/x', 'type': '/type/author', 'name': 42})
        self.assertEquals(b.status, 400)
        # error: name is list instead of single value
        d = save({'key': '/author/x', 'type': '/type/author', 'name': ['x', 'y']})
        self.assertEquals(b.status, 400)
    def test_validation_when_type_changes(self):
        # create an author and a book
        save({'key': '/author/x', 'type': '/type/author', 'name': 'x'})
        save({'key': '/book/x', 'type': '/type/book', 'title': 'x', 'authors': [{'key': '/author/x'}], 'publisher': 'publisher_x'})
        # change schema of "/type/book" and make expected_type of "publisher" as "/type/publisher"
        save({
            "key": "/type/publisher",
            "type": "/type/type",
            "kind": "regular",
            "properties": [{
                "name": "name",
                "expected_type": "/type/string",
                "unique": True
             }]
        })
        d = get('/type/book')
        assert d['properties'][2]['name'] == "publisher"
        d['properties'][2]['expected_type'] = {"key": "/type/publisher"}
        save(d)
        # now changing just the title of the book should not fail.
        d = get('/book/x')
        d['title'] = 'xx'
        save(d)
        self.assertEquals(b.status, 200)
if __name__ == "__main__":
...
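As a quick illustration of the assertEquals2 wildcard convention used by these tests (the actual and expected values below are invented):

# '*' as an expected value means "accept anything here"; a '*': True key in an
# expected dict means "ignore keys that are not listed".
actual = {'key': '/new_page', 'revision': 1, 'latest_revision': 1, 'type': {'key': '/type/object'}}
expected = {'key': '/new_page', 'revision': 1, '*': True}
# Inside an InfobaseTestCase, self.assertEquals2(actual, expected) passes:
# 'key' and 'revision' are compared, the extra keys are skipped.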
