Unlock 30% off on Manual Testing Annual Plans this Holiday Season.

Copied to Clipboard!

How to use the repeat method of the extractor class

Best Atoum code snippet using extractor.repeat

ContentExtractorTest.php

Source: ContentExtractorTest.php

...288    public function dataForStrip(): array289    {290        return [291            // strip nav element and keep only the p292            ['//nav', '<html><body><nav id="high">hello !hello !hello !hello !hello !hello !hello !hello !hello !</nav><p>' . str_repeat('this is the best part of the show', 10) . '</p></body></html>', 'hello !'],293            // strip p element and keep the nav294            ['//p', '<html><body><nav id="high">' . str_repeat('hello !', 20) . '</nav><p>' . str_repeat('this is the best part of the show', 10) . '</p></body></html>', 'this is the best part of the show'],295        ];296    }297    /**298     * @dataProvider dataForStrip299     */300    public function testApplyStrip(string $pattern, string $html, string $removedContent): void301    {302        $contentExtractor = new ContentExtractor(self::$contentExtractorConfig);303        $config = new SiteConfig();304        $config->strip = [$pattern];305        $contentExtractor->process(306            $html,307            'https://lemonde.io/35941909',308            $config309        );310        $this->assertStringNotContainsString($removedContent, $this->getReadabilityContent($contentExtractor));311    }312    public function dataForStripIdOrClass(): array313    {314        return [315            ['commentlist', '<html><body><nav id="commentlist">hello !hello !hello !hello !hello !hello !hello !hello !hello !</nav><p>' . str_repeat('this is the best part of the show', 10) . '</p></body></html>', 'hello !'],316            ['related_post', '<html><body><nav id="high">' . str_repeat('hello !', 20) . '</nav><p class="related_post">' . str_repeat('this is the best part of the show', 10) . '</p></body></html>', 'this is the best part of the show'],317            ['related', '<html><body><nav id="high">' . str_repeat('lorem ipsum dolor', 20) . '</nav><p class="related_post">' . str_repeat('this is the best part of the show', 10) . 
'</p></body></html>', null, 'class="related_post"'],318        ];319    }320    /**321     * @dataProvider dataForStripIdOrClass322     */323    public function testApplyStripIdOrClass(string $pattern, string $html, ?string $removedContent, string $matchContent = null): void324    {325        $contentExtractor = new ContentExtractor(self::$contentExtractorConfig);326        $config = new SiteConfig();327        $config->strip_id_or_class = [$pattern];328        $contentExtractor->process(329            $html,330            'https://lemonde.io/35941909',331            $config332        );333        $content = $this->getReadabilityContent($contentExtractor);334        if (null === $removedContent) {335            $this->assertStringContainsString((string) $matchContent, $content);336        } else {337            $this->assertStringNotContainsString($removedContent, $content);338        }339    }340    public function dataForStripImageSrc(): array341    {342        return [343            ['doubleclick.net', '<html><body><img src="https://www.doubleclick.net/pub.jpg"/></nav><p>' . str_repeat('this is the best part of the show', 10) . 
'</p></body></html>', 'https://www.doubleclick.net/pub.jpg'],344            // array('related_post', '<html><body><nav id="high">'.str_repeat('hello !', 20).'</nav><p class="related_post">'.str_repeat('this is the best part of the show', 10).'</p></body></html>', 'this is the best part of the show'),345        ];346    }347    /**348     * @dataProvider dataForStripImageSrc349     */350    public function testApplyStripImageSrc(string $pattern, string $html, string $removedContent): void351    {352        $contentExtractor = new ContentExtractor(self::$contentExtractorConfig);353        $config = new SiteConfig();354        $config->strip_image_src = [$pattern];355        $res = $contentExtractor->process(356            $html,357            'https://lemonde.io/35941909',358            $config359        );360        $this->assertTrue($res, 'Extraction went well');361        $this->assertStringNotContainsString($removedContent, $this->getReadabilityContent($contentExtractor));362    }363    public function dataForStripDisplayNoneAndInstapaper(): array364    {365        return [366            // remove element with class "instapaper_ignore"367            ['<html><body><p class="instapaper_ignore">hello !hello !hello !hello !hello !hello !hello !hello !hello !</p><p>' . str_repeat('this is the best part of the show', 10) . '</p></body></html>', 'hello !'],368            // remove element with class "entry-unrelated"369            ['<html><body><p class="entry-unrelated">hello !hello !hello !hello !hello !hello !hello !hello !hello !</p><p>' . str_repeat('this is the best part of the show', 10) . 
'</p></body></html>', 'hello !'],370        ];371    }372    /**373     * @dataProvider dataForStripDisplayNoneAndInstapaper374     */375    public function testApplyStripDisplayNoneAndInstapaper(string $html, string $removedContent): void376    {377        $contentExtractor = new ContentExtractor(self::$contentExtractorConfig);378        $config = new SiteConfig();379        $res = $contentExtractor->process(380            $html,381            'https://lemonde.io/35941909',382            $config383        );384        $this->assertTrue($res, 'Extraction went well');385        $this->assertStringNotContainsString($removedContent, $this->getReadabilityContent($contentExtractor));386    }387    public function dataForStripAttr(): array388    {389        return [390            [['//*/@class'], '<html><body><div class="hello world"><i class="class">bar</i>class="foo"' . str_repeat('this is the best part of the show', 10) . ' <a class="hc" href="void">link</a></div></body></html>', [391                    'removedContent' => ['class="class"', 'class="hello world"', 'class="hc"'],392                    'keptContent' => ['class="foo"', '<a href="void"', '<em>bar'],393                ],394            ],395            [['//img/@class', '//p/@class'], '<html><body><img class="bar-class" src="void" /><a class="hello" href="void">link</a> <p class="yes">' . str_repeat('this is the best part of the show', 10) . 
'</p></body></html>', [396                    'removedContent' => ['class="bar-class"', 'class="yes"'],397                    'keptContent' => ['class="hello"'],398                ],399            ],400        ];401    }402    /**403     * @dataProvider dataForStripAttr404     */405    public function testApplyStripAttr(array $patterns, string $html, array $assertions): void406    {407        $contentExtractor = new ContentExtractor(self::$contentExtractorConfig);408        $config = new SiteConfig();409        $config->strip = $patterns;410        $res = $contentExtractor->process(411            $html,412            'https://lemonde.io/35941909',413            $config414        );415        $content = $this->getReadabilityContent($contentExtractor);416        foreach ($assertions['removedContent'] as $removedContent) {417            $this->assertStringNotContainsString($removedContent, $content);418        }419        foreach ($assertions['keptContent'] as $keptContent) {420            $this->assertStringContainsString($keptContent, $content);421        }422    }423    public function dataForExtractBody(): array424    {425        return [426            // extract one element427            [428                "//p[@class='content']",429                '<html><body><p class="content">hello !hello !hello !hello !hello !hello !hello !hello !hello !</p><p>' . str_repeat('this is the best part of the show', 10) . '</p></body></html>',430                '<p class="content">hello !hello !hello !hello !hello !hello !hello !hello !hello !</p>',431            ],432            // extract multiple element433            [434                "//p[@class='content_wrapper']",435                '<html><body><p class="content_wrapper">hello !hello !hello !hello !hello !hello !hello !hello !hello !</p><p class="content_wrapper">' . str_repeat('this is the best part of the show', 5) . 
'</p></body></html>',436                '<div><p class="content_wrapper">hello !hello !hello !hello !hello !hello !hello !hello !hello !</p><p class="content_wrapper">' . str_repeat('this is the best part of the show', 5) . '</p></div>',437            ],438        ];439    }440    /**441     * @dataProvider dataForExtractBody442     */443    public function testExtractBody(string $pattern, string $html, string $expectedContent): void444    {445        $contentExtractor = new ContentExtractor(self::$contentExtractorConfig);446        $config = new SiteConfig();447        $config->body = [$pattern];448        $res = $contentExtractor->process(449            $html,450            'https://lemonde.io/35941909',451            $config452        );453        $this->assertTrue($res, 'Extraction went well');454        $this->assertSame($expectedContent, $this->getXmlContent($contentExtractor));455    }456    public function dataForExtractHNews(): array457    {458        return [459            // the all hNews tested460            [461                '<html><body><div class="hentry"><p class="entry-title">hello !</p><time pubdate="2015-01-01">2015-01-01</time><a class="vcard author">hello !</a>hello !hello !hello !hello !hello !hello !hello !<p class="entry-content">' . str_repeat('this is the best part of the show', 10) . '</p></div></body></html>',462                '<p class="entry-content">' . str_repeat('this is the best part of the show', 10) . '</p>',463                [464                    'title' => 'hello !',465                    'date' => '2015-01-01T00:00:00+01:00',466                    'authors' => ['hello !'],467                ],468            ],469            // hNews with bad date470            [471                '<html><body><div class="hentry"><time pubdate="2015-01-01">aweomse!</time>hello !hello !hello !hello !hello !hello !hello !<p class="entry-content">' . str_repeat('this is the best part of the show', 10) . 
'</p></div></body></html>',472                '<p class="entry-content">' . str_repeat('this is the best part of the show', 10) . '</p>',473                [474                    'date' => null,475                ],476            ],477            // hNews with many authors478            [479                '<html><body><div class="hentry"><p class="vcard author"><a class="fn">first boy</a><a class="fn">first girl</a></p>hello !hello !hello !hello !hello !hello !hello !<p class="entry-content">' . str_repeat('this is the best part of the show', 10) . '</p></div></body></html>',480                '<p class="entry-content">' . str_repeat('this is the best part of the show', 10) . '</p>',481                [482                    'authors' => ['first boy', 'first girl'],483                ],484            ],485            // hNews with many content486            [487                '<html><body><div class="hentry"><p class="entry-content">hello !hello !hello !hello !hello !hello !hello !</p><p class="entry-content">' . str_repeat('this is the best part of the show', 10) . '</p></div></body></html>',488                '<div><p class="entry-content">hello !hello !hello !hello !hello !hello !hello !</p><p class="entry-content">' . str_repeat('this is the best part of the show', 10) . 
'</p></div>',489                [],490            ],491        ];492    }493    /**494     * @dataProvider dataForExtractHNews495     */496    public function testExtractHNews(string $html, string $expectedContent, array $expectedElements): void497    {498        $contentExtractor = new ContentExtractor(self::$contentExtractorConfig);499        $config = new SiteConfig();500        $res = $contentExtractor->process(501            $html,502            'https://lemonde.io/35941909',503            $config504        );505        $this->assertTrue($res, 'Extraction went well');506        $this->assertSame($expectedContent, $this->getXmlContent($contentExtractor));507        foreach ($expectedElements as $key => $value) {508            $this->assertSame($contentExtractor->{'get' . ucfirst($key)}(), $value);509        }510    }511    /**512     * Extract content from instapaper class.513     */514    public function testExtractInstapaper(): void515    {516        $contentExtractor = new ContentExtractor(self::$contentExtractorConfig);517        $config = new SiteConfig();518        $res = $contentExtractor->process(519            '<html><body><div><p class="instapaper_title">hello !</p>hello !hello !hello !hello !hello !hello !hello !<p class="instapaper_body">' . str_repeat('this is the best part of the show', 10) . '</p></div></body></html>',520            'https://lemonde.io/35941909',521            $config522        );523        $this->assertTrue($res, 'Extraction went well');524        $this->assertSame('<p class="instapaper_body">' . str_repeat('this is the best part of the show', 10) . '</p>', $this->getXmlContent($contentExtractor));525        $this->assertSame($contentExtractor->getTitle(), 'hello !');526    }527    public function dataForExtractSchemaOrg(): array528    {529        return [530            // articleBody on one element531            [532                '<html><body><div>hello !hello !hello !hello !hello !hello !hello !<p itemprop="articleBody">' . 
str_repeat('this is the best part of the show', 10) . '</p></div></body></html>',533                '<p itemprop="articleBody">' . str_repeat('this is the best part of the show', 10) . '</p>',534            ],535            // articleBody on two elements536            [537                '<html><body><div><p itemprop="articleBody">hello !hello !hello !hello !hello !hello !hello !</p><p itemprop="articleBody">' . str_repeat('this is the best part of the show', 10) . '</p></div></body></html>',538                '<div><p itemprop="articleBody">hello !hello !hello !hello !hello !hello !hello !</p><p itemprop="articleBody">' . str_repeat('this is the best part of the show', 10) . '</p></div>',539            ],540            // articleBody on img element541            [542                '<html><body><div><p itemprop="articleBody"><img src="http://0.0.0.0/image.jpg" /></p></div></body></html>',543                '<p itemprop="articleBody"><img src="http://0.0.0.0/image.jpg"/></p>',544            ],545        ];546    }547    /**548     * @dataProvider dataForExtractSchemaOrg549     */550    public function testExtractSchemaOrg(string $html, string $expectedContent): void551    {552        $contentExtractor = new ContentExtractor(self::$contentExtractorConfig);553        $config = new SiteConfig();554        $res = $contentExtractor->process(555            $html,556            'https://lemonde.io/35941909',557            $config558        );559        $this->assertTrue($res, 'Extraction went well');560        $this->assertSame($expectedContent, $this->getXmlContent($contentExtractor));561    }562    /**563     * Test that if the first h* found in the body is the same as the extracted title, it'll be removed.564     */565    public function testRemoveHFromBody(): void566    {567        $contentExtractor = new ContentExtractor(self::$contentExtractorConfig);568        $config = new SiteConfig();569        $config->body = ['//div'];570        $config->title = 
['//title'];571        $res = $contentExtractor->process(572            '<html><head><title>My Title</title></head><body><div><h3>My Title</h3>' . str_repeat('this is the best part of the show', 10) . '</div></body></html>',573            'https://lemonde.io/35941909',574            $config575        );576        $this->assertTrue($res, 'Extraction went well');577        $this->assertStringNotContainsString('My Title', $this->getXmlContent($contentExtractor));578        $this->assertSame('My Title', $contentExtractor->getTitle());579    }580    public function dataForlazyLoad(): array581    {582        return [583            // test with img attribute data-src584            [585                '<div>' . str_repeat('this is the best part of the show', 10) . '<img data-src="http://0.0.0.0/big_image.jpg" src="data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEKAAEALAAAAAABAAEAAAICTAEAOw=="></div>',586                '<img src="http://0.0.0.0/big_image.jpg"',587            ],588            // test with img attribute data-lazy-src589            [590                '<div>' . str_repeat('this is the best part of the show', 10) . '<img data-lazy-src="http://0.0.0.0/big_image.jpg" src="data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEKAAEALAAAAAABAAEAAAICTAEAOw=="></div>',591                '<img src="http://0.0.0.0/big_image.jpg"',592            ],593            // test with img attribute data-src and image in noscript594            [595                '<div>' . str_repeat('this is the best part of the show', 10) . '<img data-lazy-src="http://0.0.0.0/big_image.jpg" src="data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEKAAEALAAAAAABAAEAAAICTAEAOw=="><noscript><img src="http://0.0.0.0/big_image_noscript.jpg"></noscript></div>',596                '<img src="http://0.0.0.0/big_image_noscript.jpg"',597            ],598            // test with img attribute data-original599            [600                '<div>' . str_repeat('this is the best part of the show', 10) . 
'<img src="data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==" data-original="http://0.0.0.0/big_image.jpg" class="lazy"/></div>',601                '<img src="http://0.0.0.0/big_image.jpg"',602            ],603            // test with img attribute data-sources604            [605                '<div>' . str_repeat('this is the best part of the show', 10) . '<img src="data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==" data-sources="http://0.0.0.0/big_image.jpg"/></div>',606                '<img src="http://0.0.0.0/big_image.jpg"',607            ],608            // test with img attribute from site config609            [610                '<div>' . str_repeat('this is the best part of the show', 10) . '<img src="data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==" data-toto-src="http://0.0.0.0/big_image.jpg"/></div>',611                '<img src="http://0.0.0.0/big_image.jpg"',612            ],613            // test with img attribute data-srcset614            [615                '<div>' . str_repeat('this is the best part of the show', 10) . '<img data-src="http://0.0.0.0/src.jpg" data-srcset="http://0.0.0.0/srcset1 680w, http://0.0.0.0/srcset2 1536w"/></div>',616                '<img src="http://0.0.0.0/src.jpg" srcset="http://0.0.0.0/srcset1 680w, http://0.0.0.0/srcset2 1536w"/>',617            ],618            // test with img attribute data-srcset empty619            [620                '<div>' . str_repeat('this is the best part of the show', 10) . 
'<img data-src="http://0.0.0.0/src.jpg" data-srcset=""/></div>',621                '<img src="http://0.0.0.0/src.jpg"/>',622            ],623        ];624    }625    /**626     * Test that lazy-load image attributes (data-src, data-srcset, etc.) are converted into regular src/srcset attributes.627     *628     * @dataProvider dataForlazyLoad629     */630    public function testConvertLazyLoadImages(string $html, string $htmlExpected): void631    {632        $contentExtractor = new ContentExtractor(self::$contentExtractorConfig);633        $config = new SiteConfig();634        $config->body = ['//div'];635        $config->src_lazy_load_attr = 'data-toto-src';636        $res = $contentExtractor->process(637            $html,638            'https://lemonde.io/35941909',639            $config640        );641        $this->assertTrue($res, 'Extraction went well');642        $this->assertStringContainsString($htmlExpected, $this->getXmlContent($contentExtractor));643    }644    public function testIframeEmbeddedContent(): void645    {646        $contentExtractor = new ContentExtractor(self::$contentExtractorConfig);647        $config = new SiteConfig();648        // '//header' is a bad pattern, and it will jump to the next one649        $config->body = ['//header', '//div'];650        // obviously a bad parser which will be converted to use the default one651        $config->parser = 'toto';652        $res = $contentExtractor->process(653            '<div>' . str_repeat('this is the best part of the show', 10) . 
'</div><div class="video_player"><iframe src="http://www.dailymotion.com/embed/video/x2kjh59" frameborder="0" width="534" height="320"></iframe></div>',654            'https://lemonde.io/35941909',655            $config656        );657        $this->assertTrue($res, 'Extraction went well');658        $this->assertStringContainsString('<iframe src="http://www.dailymotion.com/embed/video/x2kjh59" frameborder="0" width="534" height="320">[embedded content]</iframe>', $this->getXmlContent($contentExtractor));659    }660    public function testLogMessage(): void661    {662        $logger = new Logger('foo');663        $handler = new TestHandler($level = Logger::INFO);664        $logger->pushHandler($handler);665        $contentExtractor = new ContentExtractor(self::$contentExtractorConfig);666        $contentExtractor->setLogger($logger);667        $config = new SiteConfig();668        $contentExtractor->process(669            '<html>&lt;iframe &gt;&lt;/iframe&gt;</html>',670            'https://vimeo.com/35941909',671            $config672        );673        $records = $handler->getRecords();674        $this->assertGreaterThanOrEqual(6, $records);675        $this->assertSame('Attempting to parse HTML with {parser}', $records[0]['message']);676        $this->assertSame('libxml', $records[0]['context']['parser']);677        $this->assertSame('Opengraph "og:" data: {ogData}', $records[2]['message']);678        $this->assertSame('Opengraph "article:" data: {ogData}', $records[3]['message']);679        $this->assertSame('Trying {pattern} for language', $records[4]['message']);680        $this->assertSame('Trying {pattern} for language', $records[5]['message']);681        $this->assertSame('Using Readability', $records[6]['message']);682        $this->assertSame('Date is bad (wrong year): {date}', $records[7]['message']);683        $this->assertSame('Attempting to parse HTML with {parser}', $records[9]['message']);684    }685    public function 
testWithCustomFiltersForReadability(): void686    {687        $contentExtractor = new ContentExtractor(688            self::$contentExtractorConfig689            + ['readability' => [690                'post_filters' => ['!<head[^>]*>(.*?)</head>!is' => ''],691                'pre_filters' => ['!</?noscript>!is' => ''],692            ]]693        );694        $config = new SiteConfig();695        $res = $contentExtractor->process(696            '<!DOCTYPE html>697<html lang="fr" dir="ltr">698<head>699<base href="http://www.lhc-france.fr/" />700<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />701<meta name="generator" content="SPIP 3.0.17 [21515]" />702<link rel="shortcut icon" href="squelettes/favicon.ico" />703<script type=\'text/javascript\'>704document.createElement("header");document.createElement("footer");document.createElement("section");document.createElement("aside");document.createElement("nav");document.createElement("article");document.createElement("time");705</script>706<!--[if lt IE 9]>707        <meta http-equiv="X-UA-Compatible" content="IE=edge" />708        <script type="text/javascript" src="http://www.lhc-france.fr/squelettes/js/ie.js"></script>709<![endif]-->710<script type="text/javascript" src="http://www.lhc-france.fr/squelettes/js/modernizr.js"></script>711<script type="text/javascript">712function handleError(){return true;}713window.onerror = handleError;714dossier_squelettes = \'squelettes\';715secteurid=6;articleid=907;article_jour=19;article_mois=12;article_annee=2016;716</script>717<link rel="alternate" type="application/rss+xml" title="Actualitï¿½ï¿½s du LHC" href="http://feeds.feedburner.com/lhcfranceactus?format=xml" />718<link rel="alternate" type="application/rss+xml" title="La BD du LHC" href="http://www.lhc-france.fr/?page=backend&id_rubrique=65" />719<link rel="stylesheet" href="http://www.lhc-france.fr/local/cache-css/styles-urlabs-b1fc-urlabs-b1fc-minify-3f10.css" type="text/css" media="all" />720<link 
rel="stylesheet" href="http://www.lhc-france.fr/local/cache-css/milkbox-urlabs-fe01-urlabs-fe01-minify-1d16.css" media="screen" />721<link rel="stylesheet" href="http://www.lhc-france.fr/local/cache-css/styles.print-urlabs-2157-urlabs-2157-minify-d3e7.css" type="text/css" media="print" />722<link rel="stylesheet" href="http://www.lhc-france.fr/squelettes/styles.rouge.css" type="text/css" media="all" />723<script type="text/javascript" src="http://www.lhc-france.fr/local/cache-js/AC_RunActiveContent-minify-d850.js"></script>724<title>Novembre 2016 - Je voudrais de la matiï¿½ï¿½re noire ï¿½ï¿½ Noï¿½ï¿½l... | LHC France</title>725<meta name="robots" content="index, follow, all" />726<meta name="description" content="La contribution du CNRS et du CEA au LHC, un instrument international de physique des particules situï¿½ï¿½ au Cern. Avec toute l\'actualitï¿½ï¿½ du projet et la BD du LHC." />727<meta name="keywords" content="LHC,Higgs,Atlas,CMS,Alice,LHCb,accï¿½ï¿½lï¿½ï¿½rateur,particule,Cern,grille,dï¿½ï¿½tecteur,expï¿½ï¿½riences,boson de higgs" />728<meta name="verify-v1" content="WWk3UJy6FdmEUs2ZATuUi6+OQnIL3Sci3WmPHmaWQWs=" />729<meta name="verify-v1" content="VAs7L6UxdHUoi699A76rt8aDBfL4c6hBE3vJw2SRbh4=" />730<meta property="og:image" content="http://www.lhc-france.fr/IMG/arton907.jpg" />731<meta property="fb:admins" content="thomas.diluccio,proyoledegieux"/>732</head>733<body class="rouge "><p>' . str_repeat('This is important. ', 20) . 
'</p></body></html>',734            'https://lemonde.io/35941909',735            $config736        );737        $this->assertTrue($res, 'Extraction went well');738        $this->assertStringNotContainsString('<head>', $this->getXmlContent($contentExtractor));739        $this->assertStringNotContainsString('<base>', $this->getXmlContent($contentExtractor));740    }741    public function testNativeAd(): void742    {743        $contentExtractor = new ContentExtractor(self::$contentExtractorConfig);744        $res = $contentExtractor->process(745            ' <meta property="og:url" content="https://nativead.io/sponsored/woops"/><p>hihi</p>',746            'https://nativead.io/woops!'747        );748        $this->assertTrue($res, 'Extraction went well');749        $this->assertTrue($contentExtractor->isNativeAd());750        $this->assertStringContainsString('<p>hihi</p>', $this->getXmlContent($contentExtractor));751    }752    public function testJsonLd(): void753    {754        $contentExtractor = new ContentExtractor(self::$contentExtractorConfig);755        $res = $contentExtractor->process(756            ' <script type="application/ld+json">{ "@context": "https:\/\/schema.org", "@type": "NewsArticle", "headline": "title !!", "mainEntityOfPage": "http:\/\/jsonld.io\/toto", "datePublished": "2017-10-23T16:05:38+02:00", "dateModified": "2017-10-23T16:06:28+02:00", "description": "it is describe", "articlebody": " my body", "relatedLink": "", "image": { "@type": "ImageObject", "url": "https:\/\/static.jsonld.io\/medias.jpg", "height": "830", "width": "532" }, "author": { "@type": "Person", "name": "bob", "sameAs": ["https:\/\/twitter.com\/bob"] }, "keywords": ["syndicat", "usine", "licenciement", "Emmanuel Macron", "creuse", "plan social", "Automobile"] }</script><p>hihi</p>',757            'https://nativead.io/jsonld'758        );759        $this->assertTrue($res, 'Extraction went well');760        $this->assertSame('title !!', $contentExtractor->getTitle());761     
   $this->assertSame('2017-10-23T16:05:38+02:00', $contentExtractor->getDate());762        $this->assertStringContainsString('bob', (string) ((array) $contentExtractor->getAuthors())[0]);763        $this->assertSame('https://static.jsonld.io/medias.jpg', $contentExtractor->getImage());764        $this->assertStringContainsString('<p>hihi</p>', $this->getXmlContent($contentExtractor));765    }766    public function testJsonLdWithMultipleAuthors(): void767    {768        $contentExtractor = new ContentExtractor(self::$contentExtractorConfig);769        $res = $contentExtractor->process(770            '<script type="application/ld+json">{"@context":"https://schema.org","@type":"NewsArticle","author":[{"@type":"Person","name":"Elisa Thevenet"},{"@type":"Person","name":"Humphrey Bogart"}]}</script>',771            'https://nativead.io/jsonld'772        );773        /** @var \DOMNode */774        $contentBlock = $contentExtractor->getContent();775        $this->assertSame([776            'Elisa Thevenet',777            'Humphrey Bogart',778        ], $contentExtractor->getAuthors());779    }780    public function testNoDefinedHtml(): void781    {782        $contentExtractor = new ContentExtractor(self::$contentExtractorConfig);783        $res = $contentExtractor->process('', 'https://nativead.io/jsonld');784        $this->assertFalse($res);785        $this->assertEmpty($contentExtractor->getImage());786    }787    public function testOpenGraph(): void788    {789        $contentExtractor = new ContentExtractor(self::$contentExtractorConfig);790        $res = $contentExtractor->process(791            ' <meta property="og:title" content="title !!"/>792            <meta property="og:site_name" content="opengraph.io" />793            <meta property="og:type" content="article"/>794            <meta property="og:locale" content="fr_FR"/>795            <meta property="og:url" content="//opengraph.io/1954872.html"/>796            <meta property="article:published_time" 
content="2017-10-23T17:04:21Z-09:00"/>797            <meta property="article:modified_time" content="2017-10-23T17:04:17Z-09:00"/>798            <meta property="og:image" content="http://static.opengraph.io/medias_11570.jpg"/>799            <meta property="og:image:url" content="http://static.opengraph.io/medias_11570.jpg"/>800            <meta property="og:image:secure_url" content="https://static.opengraph.io/medias_11570.jpg"/>801            <p>hihi</p>',802            'https://nativead.io/opengraph'803        );804        $this->assertTrue($res);805        $this->assertSame('title !!', $contentExtractor->getTitle());806        $this->assertSame('2017-10-23T17:04:21+00:00', $contentExtractor->getDate());807        $this->assertSame('fr_FR', $contentExtractor->getLanguage());808        $this->assertSame('https://static.opengraph.io/medias_11570.jpg', $contentExtractor->getImage());809        $this->assertStringContainsString('<p>hihi</p>', $this->getXmlContent($contentExtractor));810    }811    public function testAvoidDataUriImageInOpenGraph(): void812    {813        $contentExtractor = new ContentExtractor(self::$contentExtractorConfig);814        $res = $contentExtractor->process(815            ' <html><meta content="data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==" property="og:image" /><meta content="http://www.io.lol" property="og:url"/><p>hihi</p></html>',816            'https://nativead.io/opengraph'817        );818        $this->assertTrue($res);819        $this->assertEmpty($contentExtractor->getImage());820        $this->assertStringContainsString('<p>hihi</p>', $this->getXmlContent($contentExtractor));821    }822    public function testJsonLdIgnoreList(): void823    {824        $contentExtractor = new ContentExtractor(self::$contentExtractorConfig);825        $res = $contentExtractor->process(826            '<html><body><script type="application/ld+json">{ "@context": "http:\/\/schema.org", "@type": "NewsArticle", 
"publisher": { "@type": "Organization", "name": "Foobar Company" }, "description": "A method for fooling tools", "mainEntityOfPage": { "@type": "WebPage", "@id": "https:\/\/www.example.com/foobar" }, "headline": "The Foobar Company is launching globally", "datePublished": "2019-01-14T16:02:00.000+00:00", "dateModified": "2019-01-14T13:25:09.980+00:00", "author": { "@type": "Person", "name": "Foobar CEO" } }</script> <script type="application/ld+json">{ "@context": "http:\/\/schema.org", "@type": "Organization", "name": "Foobar Company", "url": "https:\/\/www.example.com" }</script><p>' . str_repeat('this is the best part of the show', 10) . '</p></body></html>',827            'https://example.com/jsonld'828        );829        $this->assertTrue($res, 'Extraction went well');830        $this->assertSame('The Foobar Company is launching globally', $contentExtractor->getTitle());831        $this->assertStringContainsString('Foobar CEO', (string) ((array) $contentExtractor->getAuthors())[0]);832    }833    public function testJsonLdIgnoreListWithPeriodical(): void834    {835        $contentExtractor = new ContentExtractor(self::$contentExtractorConfig);836        $res = $contentExtractor->process(837            '<html><body><script type="application/ld+json">{ "@context": "http:\/\/schema.org", "@type": "Periodical", "publisher": { "@type": "Organization", "name": "Foobar Company" }, "description": "A method for fooling tools", "mainEntityOfPage": { "@type": "WebPage", "@id": "https:\/\/www.example.com/foobar" }, "name": "Foobar Company", "datePublished": "2019-01-14T16:02:00.000+00:00", "dateModified": "2019-01-14T13:25:09.980+00:00", "author": { "@type": "Person", "name": "Foobar CEO" } }</script> <script type="application/ld+json">{ "@context": "http:\/\/schema.org", "@type": "Organization", "name": "Foobar Company", "url": "https:\/\/www.example.com" }</script><h1>Hello world, this is title</h1><p>' . str_repeat('this is the best part of the show', 10) . 
'</p></body></html>',838            'https://example.com/jsonld'839        );840        $this->assertTrue($res, 'Extraction went well');841        $this->assertSame('Hello world, this is title', $contentExtractor->getTitle());842    }843    public function testJsonLdSkipper(): void844    {845        $contentExtractor = new ContentExtractor(self::$contentExtractorConfig);846        $config = new SiteConfig();847        $config->skip_json_ld = true;848        $res = $contentExtractor->process(849            '<html><script type="application/ld+json">{ "@context": "https:\/\/schema.org", "@type": "NewsArticle", "headline": "title !!", "mainEntityOfPage": "http:\/\/jsonld.io\/toto", "datePublished": "2017-10-23T16:05:38+02:00", "dateModified": "2017-10-23T16:06:28+02:00", "description": "it is describe", "articlebody": " my body", "relatedLink": "", "image": { "@type": "ImageObject", "url": "https:\/\/static.jsonld.io\/medias.jpg", "height": "830", "width": "532" }, "author": { "@type": "Person", "name": "bob", "sameAs": ["https:\/\/twitter.com\/bob"] }, "keywords": ["syndicat", "usine", "licenciement", "Emmanuel Macron", "creuse", "plan social", "Automobile"] }</script><body><div>hello !hello !hello !hello !hello !hello !hello !<p itemprop="articleBody">' . str_repeat('this is the best part of the show', 10) . 
'</p></div></body></html>',850            'https://skipjsonld.io/jsonld',851            $config852        );853        $this->assertTrue($res, 'Extraction went well');854        $this->assertEmpty($contentExtractor->getTitle());855        $this->assertNull($contentExtractor->getDate());856        $this->assertEmpty($contentExtractor->getAuthors());857        $this->assertStringContainsString('this is the best part of the show', $this->getXmlContent($contentExtractor));858    }859    public function testJsonLdName(): void860    {861        $contentExtractor = new ContentExtractor(self::$contentExtractorConfig);862        $res = $contentExtractor->process(863            ' <script type="application/ld+json">{ "@context": "https:\/\/schema.org", "@type": "NewsArticle", "headline": "title !!", "name": "name !!", "mainEntityOfPage": "http:\/\/jsonld.io\/toto", "datePublished": "2017-10-23T16:05:38+02:00", "dateModified": "2017-10-23T16:06:28+02:00", "description": "it is describe", "articlebody": " my body", "relatedLink": "", "image": { "@type": "ImageObject", "url": "https:\/\/static.jsonld.io\/medias.jpg", "height": "830", "width": "532" }, "author": { "@type": "Person", "name": "bob", "sameAs": ["https:\/\/twitter.com\/bob"] }, "keywords": ["syndicat", "usine", "licenciement", "Emmanuel Macron", "creuse", "plan social", "Automobile"] }</script><p>hihi</p>',864            'https://nativead.io/jsonld'865        );866        $this->assertSame('name !!', $contentExtractor->getTitle());867    }868    public function testJsonLdDateArray(): void869    {870        $contentExtractor = new ContentExtractor(self::$contentExtractorConfig);871        $res = $contentExtractor->process(872            ' <script type="application/ld+json">{ "@context": "http://schema.org", "@type": "NewsArticle", "description": "Smoke rises from the 998-tonne fuel tanker Shoko Maru after it exploded off the coast of Himeji, western Japan, in this photo taken and released May 29, 2014.  
REUTERS/5th Regional Coast Guard Headqua", "headline": "Editor&#039;s choice", "url": "https://www.reuters.com/news/picture/editors-choice-idUSRTR3RD95", "thumbnailUrl": "https://s3.reutersmedia.net/resources/r/?m=02&d=20140529&t=2&i=901254582&w=&fh=810&fw=545&ll=&pl=&sq=&r=2014-05-29T132753Z_2_GM1EA5T1BTD01_RTRMADP_0_JAPAN", "dateCreated": "2014-05-29T13:27:53+0000", "dateModified": "2014-05-29T13:27:53+0000", "articleSection": "RCOMUS_24", "creator": ["JaShong King"], "keywords": ["24 HOURS IN PICTURES", "Slideshow"], "about": "Slideshow", "author": ["JaShong King"], "datePublished": ["05/29/2014"] }</script><p>hihi</p>',873            'https://nativead.io/jsonld'874        );875        $this->assertSame('2014-05-29T00:00:00+02:00', $contentExtractor->getDate());876    }877    public function testJsonLdImageUrlArray(): void878    {879        $contentExtractor = new ContentExtractor(self::$contentExtractorConfig);880        $res = $contentExtractor->process(881            ' <script type="application/ld+json">{ "@context": "http://schema.org", "@type": "NewsArticle", "description": "Smoke rises from the 998-tonne fuel tanker Shoko Maru after it exploded off the coast of Himeji, western Japan, in this photo taken and released May 29, 2014.  
REUTERS/5th Regional Coast Guard Headqua", "headline": "Editor&#039;s choice", "url": "https://www.reuters.com/news/picture/editors-choice-idUSRTR3RD95", "thumbnailUrl": "https://s3.reutersmedia.net/resources/r/?m=02&d=20140529&t=2&i=901254582&w=&fh=810&fw=545&ll=&pl=&sq=&r=2014-05-29T132753Z_2_GM1EA5T1BTD01_RTRMADP_0_JAPAN", "dateCreated": "2014-05-29T13:27:53+0000", "dateModified": "2014-05-29T13:27:53+0000", "articleSection": "RCOMUS_24", "creator": ["JaShong King"], "keywords": ["24 HOURS IN PICTURES", "Slideshow"], "about": "Slideshow", "author": ["JaShong King"], "datePublished": ["05/29/2014"], "image": { "@type": "ImageObject", "url": [ "https://statics.estadao.com.br/s2016/portal/img/json-ld/estadao_1x1.png", "https://statics.estadao.com.br/s2016/portal/img/json-ld/estadao_4x3.png", "https://statics.estadao.com.br/s2016/portal/img/json-ld/estadao_16x9.png" ]} }</script><p>hihi</p>',882            'https://nativead.io/jsonld'883        );884        $this->assertSame('https://statics.estadao.com.br/s2016/portal/img/json-ld/estadao_1x1.png', $contentExtractor->getImage());885    }886    public function testUniqueAuthors(): void887    {888        $url = 'https://www.lemonde.fr/pixels/article/2018/05/30/bloodstained-curse-of-the-moon-delicieux-jeu-de-vampires-a-la-mode-des-annees-1980_5307173_4408996.html';889        $html = '<script type="application/ld+json">{"author":{"@type":"Person","name":"William Audureau"}}</script><a class="auteur" target="_blank" href="/journaliste/william-audureau/">William Audureau</a>';890        $contentExtractor = new ContentExtractor(self::$contentExtractorConfig);891        $siteConfig = $contentExtractor->buildSiteConfig($url);892        $contentExtractor->process(893            $html,894            $url,895            $siteConfig896        );897        $authors = (array) $contentExtractor->getAuthors();898        $authorsUnique = array_unique($authors);899        $this->assertTrue(\count($authors) === \count($authorsUnique), 
'There is no duplicate authors');900    }901    public function testBodyAsDomAttribute(): void902    {903        $contentExtractor = new ContentExtractor(self::$contentExtractorConfig);904        $config = new SiteConfig();905        // a xpath retrieving a dom attribute906        $config->body = ['//iframe/@src'];907        $res = $contentExtractor->process(908            '   <iframe src="blog_0x34.md.html" frameborder="0" style="overflow:hidden; display:block; position: absolute; height: 80%; width:100%;"></iframe>',909            'https://domattr.io/woops!',910            $config911        );912        $this->assertFalse($res, 'Extraction failed');913    }914    public function testBadDate(): void915    {916        $contentExtractor = new ContentExtractor(self::$contentExtractorConfig);917        $res = $contentExtractor->process(918            '   <meta property="article:published_time" content="-0001-11-30T00:00:00+00:00" /> <p>' . str_repeat('this is the best part of the show', 10) . '</p> ',919            'https://domattr.io/woops!'920        );921        $this->assertTrue($res, 'Extraction went fine');922        $this->assertNull($contentExtractor->getDate(), 'Date got vanish because it was wrong');923    }924    public function dataForProcessWrapIn(): array925    {926        return [927            // blockquote with a nested div928            [929                [930                    'blockquote' => "//div[@class='cond1']",931                ],932                "//blockquote/div[@class='cond1']/p",...

extractor.php

Source:extractor.php

...283			->array(annotations\extractor::toArray($value = uniqid()))->isEqualTo(array($value))284			->array(annotations\extractor::toArray(($value = uniqid()) . ' ' . ($otherValue = uniqid())))->isEqualTo(array($value, $otherValue))285		;286	}287	protected static function repeat($char, $max, $min = 1)288	{289		return str_repeat($char, rand($min, rand($min, $max)));290	}291	protected static function space($max = 10, $min = 1)292	{293		return self::repeat(' ', $max, $min);294	}295	protected static function star($max = 10, $min = 2)296	{297		return self::repeat('*', $max, $min);298	}299}...

repeat

Using AI Code Generation

1$extractor = new Extractor();2$extractor->repeat("Hello World", 5);3$extractor = new Extractor();4$extractor->repeat("Hello World", 5);5$extractor = new Extractor();6$extractor->repeat("Hello World", 5);7$extractor = new Extractor();8$extractor->repeat("Hello World", 5);9$extractor = new Extractor();10$extractor->repeat("Hello World", 5);11$extractor = new Extractor();12$extractor->repeat("Hello World", 5);13$extractor = new Extractor();14$extractor->repeat("Hello World", 5);15$extractor = new Extractor();16$extractor->repeat("Hello World", 5);17$extractor = new Extractor();18$extractor->repeat("Hello World", 5);19$extractor = new Extractor();20$extractor->repeat("Hello World", 5);21$extractor = new Extractor();22$extractor->repeat("Hello World", 5);23$extractor = new Extractor();24$extractor->repeat("Hello World", 5);25$extractor = new Extractor();26$extractor->repeat("Hello World", 5);27$extractor = new Extractor();28$extractor->repeat("Hello World",

repeat

Using AI Code Generation

1$extractor = new Extractor();2$extractor->repeat('Hello',3);3$extractor = new Extractor();4$extractor->repeat('Hello',3);5To solve this problem, we can use the include_once() function, which includes a file only if it has not already been included during the current request. If 1.php uses include_once(), the Extractor class is loaded once, and if 2.php also uses include_once(), the class is not loaded a second time. This prevents the class from being declared twice; each script still makes its own call to the repeat method. The code will look like this:6include_once 'Extractor.php';7$extractor = new Extractor();8$extractor->repeat('Hello',3);9include_once 'Extractor.php';10$extractor = new Extractor();11$extractor->repeat('Hello',3);

repeat

Using AI Code Generation

1$extractor = new Extractor();2$extractor->repeat(2, 2);3$extractor->repeat(3, 3);4$extractor = new Extractor();5$extractor->repeat(2, 2);6$extractor->repeat(3, 3);7$extractor = new Extractor();8$extractor->repeat(2, 2);9$extractor->repeat(3, 3);10$extractor = new Extractor();11$extractor->repeat(2, 2);12$extractor->repeat(3, 3);13$extractor = new Extractor();14$extractor->repeat(2, 2);15$extractor->repeat(3, 3);16$extractor = new Extractor();17$extractor->repeat(2, 2);18$extractor->repeat(3, 3);19$extractor = new Extractor();20$extractor->repeat(2, 2);21$extractor->repeat(3, 3);22$extractor = new Extractor();23$extractor->repeat(2, 2);24$extractor->repeat(3, 3);25$extractor = new Extractor();26$extractor->repeat(2, 2);27$extractor->repeat(3, 3);28$extractor = new Extractor();29$extractor->repeat(2, 2);30$extractor->repeat(3, 3);31$extractor = new Extractor();32$extractor->repeat(2, 2);33$extractor->repeat(3,

repeat

Using AI Code Generation

1require_once("extractor.class.php");2$extractor = new extractor();3$extractor->repeat(10);4require_once("extractor.class.php");5$extractor = new extractor();6$extractor->repeat(20);7require_once("extractor.class.php");8$extractor = new extractor();9$extractor->repeat(30);10require_once("extractor.class.php");11$extractor = new extractor();12$extractor->repeat(40);13require_once("extractor.class.php");14$extractor = new extractor();15$extractor->repeat(50);16require_once("extractor.class.php");17$extractor = new extractor();18$extractor->repeat(60);19require_once("extractor.class.php");20$extractor = new extractor();21$extractor->repeat(70);22require_once("extractor.class.php");23$extractor = new extractor();24$extractor->repeat(80);25require_once("extractor.class.php");26$extractor = new extractor();27$extractor->repeat(90);28require_once("extractor.class.php");29$extractor = new extractor();30$extractor->repeat(100);31require_once("extractor.class.php");32$extractor = new extractor();33$extractor->repeat(110);34require_once("extractor.class.php");

repeat

Using AI Code Generation

1require_once 'extractor.php';2$extractor = new extractor;3$extractor->repeat('test', 5);4require_once 'extractor.php';5$extractor = new extractor;6$extractor->repeat('test', 5);7require_once 'extractor.php';8$extractor = new extractor;9$extractor->repeat('test', 5);10require_once 'extractor.php';11$extractor = new extractor;12$extractor->repeat('test', 5);13require_once 'extractor.php';14$extractor = new extractor;15$extractor->repeat('test', 5);16require_once 'extractor.php';17$extractor = new extractor;18$extractor->repeat('test', 5);19require_once 'extractor.php';20$extractor = new extractor;21$extractor->repeat('test', 5);22require_once 'extractor.php';23$extractor = new extractor;24$extractor->repeat('test', 5);25require_once 'extractor.php';26$extractor = new extractor;27$extractor->repeat('test', 5);28require_once 'extractor.php';29$extractor = new extractor;30$extractor->repeat('test', 5);31require_once 'extractor.php';32$extractor = new extractor;33$extractor->repeat('test', 5);34require_once 'extractor.php';35$extractor = new extractor;36$extractor->repeat('test', 5);

repeat

Using AI Code Generation

1require_once 'extractor.php';2$extractor = new Extractor();3$extractor->setRepeat(1);4$extractor->getRepeat();5$extractor->getHtml();6$extractor->getLinks();7require_once 'extractor.php';8$extractor = new Extractor();9$extractor->setRepeat(2);10$extractor->getRepeat();11$extractor->getHtml();12$extractor->getLinks();13require_once 'extractor.php';14$extractor = new Extractor();15$extractor->setRepeat(3);16$extractor->getRepeat();17$extractor->getHtml();18$extractor->getLinks();19require_once 'extractor.php';20$extractor = new Extractor();21$extractor->setRepeat(4);22$extractor->getRepeat();23$extractor->getHtml();24$extractor->getLinks();25require_once 'extractor.php';26$extractor = new Extractor();27$extractor->setRepeat(5);28$extractor->getRepeat();29$extractor->getHtml();30$extractor->getLinks();31require_once 'extractor.php';32$extractor = new Extractor();33$extractor->setRepeat(6);34$extractor->getRepeat();35$extractor->getHtml();36$extractor->getLinks();37require_once 'extractor.php';38$extractor = new Extractor();39$extractor->setRepeat(7);40$extractor->getRepeat();

Automation Testing Tutorials

Learn to execute automation testing from scratch with LambdaTest Learning Hub, from setting up the prerequisites and running your first automation test to following best practices and diving deeper into advanced test scenarios. LambdaTest Learning Hubs compile step-by-step guides to help you become proficient with different test automation frameworks, such as Selenium, Cypress, and TestNG.