Best Atoum code snippet using extractor.repeat
ContentExtractorTest.php
Source: ContentExtractorTest.php
/**
 * Data sets for testApplyStrip: XPath pattern to strip, HTML input,
 * and a string that must be absent from the extracted content.
 */
public function dataForStrip(): array
{
    return [
        // strip nav element and keep only the p
        ['//nav', '<html><body><nav id="high">hello !hello !hello !hello !hello !hello !hello !hello !hello !</nav><p>' . str_repeat('this is the best part of the show', 10) . '</p></body></html>', 'hello !'],
        // strip p element and keep the nav
        ['//p', '<html><body><nav id="high">' . str_repeat('hello !', 20) . '</nav><p>' . str_repeat('this is the best part of the show', 10) . '</p></body></html>', 'this is the best part of the show'],
    ];
}

/**
 * Content matched by a site-config "strip" pattern must not appear in the result.
 *
 * @dataProvider dataForStrip
 */
public function testApplyStrip(string $pattern, string $html, string $removedContent): void
{
    $contentExtractor = new ContentExtractor(self::$contentExtractorConfig);

    $config = new SiteConfig();
    $config->strip = [$pattern];

    $contentExtractor->process(
        $html,
        'https://lemonde.io/35941909',
        $config
    );

    $this->assertStringNotContainsString($removedContent, $this->getReadabilityContent($contentExtractor));
}

/**
 * Data sets for testApplyStripIdOrClass: id/class pattern, HTML input,
 * content expected to be removed (or null), and, when nothing is removed,
 * content expected to be kept.
 */
public function dataForStripIdOrClass(): array
{
    return [
        ['commentlist', '<html><body><nav id="commentlist">hello !hello !hello !hello !hello !hello !hello !hello !hello !</nav><p>' . str_repeat('this is the best part of the show', 10) . '</p></body></html>', 'hello !'],
        ['related_post', '<html><body><nav id="high">' . str_repeat('hello !', 20) . '</nav><p class="related_post">' . str_repeat('this is the best part of the show', 10) . '</p></body></html>', 'this is the best part of the show'],
        ['related', '<html><body><nav id="high">' . str_repeat('lorem ipsum dolor', 20) . '</nav><p class="related_post">' . str_repeat('this is the best part of the show', 10) . '</p></body></html>', null, 'class="related_post"'],
    ];
}

/**
 * When $removedContent is null the data set asserts that $matchContent is kept;
 * otherwise it asserts that $removedContent is gone.
 *
 * @dataProvider dataForStripIdOrClass
 */
public function testApplyStripIdOrClass(string $pattern, string $html, ?string $removedContent, ?string $matchContent = null): void
{
    $contentExtractor = new ContentExtractor(self::$contentExtractorConfig);

    $config = new SiteConfig();
    $config->strip_id_or_class = [$pattern];

    $contentExtractor->process(
        $html,
        'https://lemonde.io/35941909',
        $config
    );

    $content = $this->getReadabilityContent($contentExtractor);

    if (null === $removedContent) {
        $this->assertStringContainsString((string) $matchContent, $content);
    } else {
        $this->assertStringNotContainsString($removedContent, $content);
    }
}

/**
 * Data sets for testApplyStripImageSrc: image src pattern, HTML input,
 * and the image URL that must be absent from the extracted content.
 */
public function dataForStripImageSrc(): array
{
    return [
        ['doubleclick.net', '<html><body><img src="https://www.doubleclick.net/pub.jpg"/></nav><p>' . str_repeat('this is the best part of the show', 10) . '</p></body></html>', 'https://www.doubleclick.net/pub.jpg'],
    ];
}

/**
 * @dataProvider dataForStripImageSrc
 */
public function testApplyStripImageSrc(string $pattern, string $html, string $removedContent): void
{
    $contentExtractor = new ContentExtractor(self::$contentExtractorConfig);

    $config = new SiteConfig();
    $config->strip_image_src = [$pattern];

    $res = $contentExtractor->process(
        $html,
        'https://lemonde.io/35941909',
        $config
    );

    $this->assertTrue($res, 'Extraction went well');
    $this->assertStringNotContainsString($removedContent, $this->getReadabilityContent($contentExtractor));
}

/**
 * Data sets for testApplyStripDisplayNoneAndInstapaper: HTML input and
 * a string that must be absent from the extracted content.
 */
public function dataForStripDisplayNoneAndInstapaper(): array
{
    return [
        // remove element with class "instapaper_ignore"
        ['<html><body><p class="instapaper_ignore">hello !hello !hello !hello !hello !hello !hello !hello !hello !</p><p>' . str_repeat('this is the best part of the show', 10) . '</p></body></html>', 'hello !'],
        // remove element with class "entry-unrelated"
        ['<html><body><p class="entry-unrelated">hello !hello !hello !hello !hello !hello !hello !hello !hello !</p><p>' . str_repeat('this is the best part of the show', 10) . '</p></body></html>', 'hello !'],
    ];
}

/**
 * Uses an empty SiteConfig: the removal is expected without any site-specific rule.
 *
 * @dataProvider dataForStripDisplayNoneAndInstapaper
 */
public function testApplyStripDisplayNoneAndInstapaper(string $html, string $removedContent): void
{
    $contentExtractor = new ContentExtractor(self::$contentExtractorConfig);

    $config = new SiteConfig();

    $res = $contentExtractor->process(
        $html,
        'https://lemonde.io/35941909',
        $config
    );

    $this->assertTrue($res, 'Extraction went well');
    $this->assertStringNotContainsString($removedContent, $this->getReadabilityContent($contentExtractor));
}

/**
 * Data sets for testApplyStripAttr: list of attribute XPath patterns, HTML input,
 * and the 'removedContent' / 'keptContent' strings to check in the result.
 */
public function dataForStripAttr(): array
{
    return [
        [['//*/@class'], '<html><body><div class="hello world"><i class="class">bar</i>class="foo"' . str_repeat('this is the best part of the show', 10) . ' <a class="hc" href="void">link</a></div></body></html>', [
                'removedContent' => ['class="class"', 'class="hello world"', 'class="hc"'],
                'keptContent' => ['class="foo"', '<a href="void"', '<em>bar'],
            ],
        ],
        [['//img/@class', '//p/@class'], '<html><body><img class="bar-class" src="void" /><a class="hello" href="void">link</a> <p class="yes">' . str_repeat('this is the best part of the show', 10) . '</p></body></html>', [
                'removedContent' => ['class="bar-class"', 'class="yes"'],
                'keptContent' => ['class="hello"'],
            ],
        ],
    ];
}

/**
 * @dataProvider dataForStripAttr
 */
public function testApplyStripAttr(array $patterns, string $html, array $assertions): void
{
    $contentExtractor = new ContentExtractor(self::$contentExtractorConfig);

    $config = new SiteConfig();
    $config->strip = $patterns;

    $res = $contentExtractor->process(
        $html,
        'https://lemonde.io/35941909',
        $config
    );

    $content = $this->getReadabilityContent($contentExtractor);

    foreach ($assertions['removedContent'] as $removedContent) {
        $this->assertStringNotContainsString($removedContent, $content);
    }

    foreach ($assertions['keptContent'] as $keptContent) {
        $this->assertStringContainsString($keptContent, $content);
    }
}

/**
 * Data sets for testExtractBody: body XPath pattern, HTML input, expected extracted markup.
 */
public function dataForExtractBody(): array
{
    return [
        // extract one element
        [
            "//p[@class='content']",
            '<html><body><p class="content">hello !hello !hello !hello !hello !hello !hello !hello !hello !</p><p>' . str_repeat('this is the best part of the show', 10) . '</p></body></html>',
            '<p class="content">hello !hello !hello !hello !hello !hello !hello !hello !hello !</p>',
        ],
        // extract multiple element
        [
            "//p[@class='content_wrapper']",
            '<html><body><p class="content_wrapper">hello !hello !hello !hello !hello !hello !hello !hello !hello !</p><p class="content_wrapper">' . str_repeat('this is the best part of the show', 5) . '</p></body></html>',
            '<div><p class="content_wrapper">hello !hello !hello !hello !hello !hello !hello !hello !hello !</p><p class="content_wrapper">' . str_repeat('this is the best part of the show', 5) . '</p></div>',
        ],
    ];
}

/**
 * @dataProvider dataForExtractBody
 */
public function testExtractBody(string $pattern, string $html, string $expectedContent): void
{
    $contentExtractor = new ContentExtractor(self::$contentExtractorConfig);

    $config = new SiteConfig();
    $config->body = [$pattern];

    $res = $contentExtractor->process(
        $html,
        'https://lemonde.io/35941909',
        $config
    );

    $this->assertTrue($res, 'Extraction went well');
    $this->assertSame($expectedContent, $this->getXmlContent($contentExtractor));
}

/**
 * Data sets for testExtractHNews: HTML input, expected extracted markup, and a map of
 * element name => expected value (read back through the matching get<Name>() getter).
 */
public function dataForExtractHNews(): array
{
    return [
        // the all hNews tested
        [
            '<html><body><div class="hentry"><p class="entry-title">hello !</p><time pubdate="2015-01-01">2015-01-01</time><a class="vcard author">hello !</a>hello !hello !hello !hello !hello !hello !hello !<p class="entry-content">' . str_repeat('this is the best part of the show', 10) . '</p></div></body></html>',
            '<p class="entry-content">' . str_repeat('this is the best part of the show', 10) . '</p>',
            [
                'title' => 'hello !',
                'date' => '2015-01-01T00:00:00+01:00',
                'authors' => ['hello !'],
            ],
        ],
        // hNews with bad date
        [
            '<html><body><div class="hentry"><time pubdate="2015-01-01">aweomse!</time>hello !hello !hello !hello !hello !hello !hello !<p class="entry-content">' . str_repeat('this is the best part of the show', 10) . '</p></div></body></html>',
            '<p class="entry-content">' . str_repeat('this is the best part of the show', 10) . '</p>',
            [
                'date' => null,
            ],
        ],
        // hNews with many authors
        [
            '<html><body><div class="hentry"><p class="vcard author"><a class="fn">first boy</a><a class="fn">first girl</a></p>hello !hello !hello !hello !hello !hello !hello !<p class="entry-content">' . str_repeat('this is the best part of the show', 10) . '</p></div></body></html>',
            '<p class="entry-content">' . str_repeat('this is the best part of the show', 10) . '</p>',
            [
                'authors' => ['first boy', 'first girl'],
            ],
        ],
        // hNews with many content
        [
            '<html><body><div class="hentry"><p class="entry-content">hello !hello !hello !hello !hello !hello !hello !</p><p class="entry-content">' . str_repeat('this is the best part of the show', 10) . '</p></div></body></html>',
            '<div><p class="entry-content">hello !hello !hello !hello !hello !hello !hello !</p><p class="entry-content">' . str_repeat('this is the best part of the show', 10) . '</p></div>',
            [],
        ],
    ];
}

/**
 * @dataProvider dataForExtractHNews
 */
public function testExtractHNews(string $html, string $expectedContent, array $expectedElements): void
{
    $contentExtractor = new ContentExtractor(self::$contentExtractorConfig);

    $config = new SiteConfig();

    $res = $contentExtractor->process(
        $html,
        'https://lemonde.io/35941909',
        $config
    );

    $this->assertTrue($res, 'Extraction went well');
    $this->assertSame($expectedContent, $this->getXmlContent($contentExtractor));

    foreach ($expectedElements as $key => $value) {
        // expected value first, so that failure output labels both sides correctly
        $this->assertSame($value, $contentExtractor->{'get' . ucfirst($key)}());
    }
}

/**
 * Extract content from instapaper class.
 */
public function testExtractInstapaper(): void
{
    $contentExtractor = new ContentExtractor(self::$contentExtractorConfig);

    $config = new SiteConfig();

    $res = $contentExtractor->process(
        '<html><body><div><p class="instapaper_title">hello !</p>hello !hello !hello !hello !hello !hello !hello !<p class="instapaper_body">' . str_repeat('this is the best part of the show', 10) . '</p></div></body></html>',
        'https://lemonde.io/35941909',
        $config
    );

    $this->assertTrue($res, 'Extraction went well');
    $this->assertSame('<p class="instapaper_body">' . str_repeat('this is the best part of the show', 10) . '</p>', $this->getXmlContent($contentExtractor));
    $this->assertSame('hello !', $contentExtractor->getTitle());
}

/**
 * Data sets for testExtractSchemaOrg: HTML input and expected extracted markup.
 */
public function dataForExtractSchemaOrg(): array
{
    return [
        // articleBody on one element
        [
            '<html><body><div>hello !hello !hello !hello !hello !hello !hello !<p itemprop="articleBody">' . str_repeat('this is the best part of the show', 10) . '</p></div></body></html>',
            '<p itemprop="articleBody">' . str_repeat('this is the best part of the show', 10) . '</p>',
        ],
        // articleBody on two elements
        [
            '<html><body><div><p itemprop="articleBody">hello !hello !hello !hello !hello !hello !hello !</p><p itemprop="articleBody">' . str_repeat('this is the best part of the show', 10) . '</p></div></body></html>',
            '<div><p itemprop="articleBody">hello !hello !hello !hello !hello !hello !hello !</p><p itemprop="articleBody">' . str_repeat('this is the best part of the show', 10) . '</p></div>',
        ],
        // articleBody on img element
        [
            '<html><body><div><p itemprop="articleBody"><img src="http://0.0.0.0/image.jpg" /></p></div></body></html>',
            '<p itemprop="articleBody"><img src="http://0.0.0.0/image.jpg"/></p>',
        ],
    ];
}

/**
 * @dataProvider dataForExtractSchemaOrg
 */
public function testExtractSchemaOrg(string $html, string $expectedContent): void
{
    $contentExtractor = new ContentExtractor(self::$contentExtractorConfig);

    $config = new SiteConfig();

    $res = $contentExtractor->process(
        $html,
        'https://lemonde.io/35941909',
        $config
    );

    $this->assertTrue($res, 'Extraction went well');
    $this->assertSame($expectedContent, $this->getXmlContent($contentExtractor));
}

/**
 * Test that if the first h* found in the body is the same as the extracted title, it'll be removed.
 */
public function testRemoveHFromBody(): void
{
    $contentExtractor = new ContentExtractor(self::$contentExtractorConfig);

    $config = new SiteConfig();
    $config->body = ['//div'];
    $config->title = ['//title'];

    $res = $contentExtractor->process(
        '<html><head><title>My Title</title></head><body><div><h3>My Title</h3>' . str_repeat('this is the best part of the show', 10) . '</div></body></html>',
        'https://lemonde.io/35941909',
        $config
    );

    $this->assertTrue($res, 'Extraction went well');
    $this->assertStringNotContainsString('My Title', $this->getXmlContent($contentExtractor));
    $this->assertSame('My Title', $contentExtractor->getTitle());
}

/**
 * Data sets for testConvertLazyLoadImages: HTML input and a fragment expected in the output.
 */
public function dataForlazyLoad(): array
{
    return [
        // test with img attribute data-src
        [
            '<div>' . str_repeat('this is the best part of the show', 10) . '<img data-src="http://0.0.0.0/big_image.jpg" src=""></div>',
            '<img src="http://0.0.0.0/big_image.jpg"',
        ],
        // test with img attribute data-lazy-src
        [
            '<div>' . str_repeat('this is the best part of the show', 10) . '<img data-lazy-src="http://0.0.0.0/big_image.jpg" src=""></div>',
            '<img src="http://0.0.0.0/big_image.jpg"',
        ],
        // test with img attribute data-src and image in noscript
        [
            '<div>' . str_repeat('this is the best part of the show', 10) . '<img data-lazy-src="http://0.0.0.0/big_image.jpg" src=""><noscript><img src="http://0.0.0.0/big_image_noscript.jpg"></noscript></div>',
            '<img src="http://0.0.0.0/big_image_noscript.jpg"',
        ],
        // test with img attribute data-original
        [
            '<div>' . str_repeat('this is the best part of the show', 10) . '<img src="" data-original="http://0.0.0.0/big_image.jpg" class="lazy"/></div>',
            '<img src="http://0.0.0.0/big_image.jpg"',
        ],
        // test with img attribute data-sources
        [
            '<div>' . str_repeat('this is the best part of the show', 10) . '<img src="" data-sources="http://0.0.0.0/big_image.jpg"/></div>',
            '<img src="http://0.0.0.0/big_image.jpg"',
        ],
        // test with img attribute from site config
        [
            '<div>' . str_repeat('this is the best part of the show', 10) . '<img src="" data-toto-src="http://0.0.0.0/big_image.jpg"/></div>',
            '<img src="http://0.0.0.0/big_image.jpg"',
        ],
        // test with img attribute data-srcset
        [
            '<div>' . str_repeat('this is the best part of the show', 10) . '<img data-src="http://0.0.0.0/src.jpg" data-srcset="http://0.0.0.0/srcset1 680w, http://0.0.0.0/srcset2 1536w"/></div>',
            '<img src="http://0.0.0.0/src.jpg" srcset="http://0.0.0.0/srcset1 680w, http://0.0.0.0/srcset2 1536w"/>',
        ],
        // test with img attribute data-srcset empty
        [
            '<div>' . str_repeat('this is the best part of the show', 10) . '<img data-src="http://0.0.0.0/src.jpg" data-srcset=""/></div>',
            '<img src="http://0.0.0.0/src.jpg"/>',
        ],
    ];
}

/**
 * Test that lazy-load image attributes (including the one configured through
 * src_lazy_load_attr in the site config) end up as src / srcset in the output.
 *
 * @dataProvider dataForlazyLoad
 */
public function testConvertLazyLoadImages(string $html, string $htmlExpected): void
{
    $contentExtractor = new ContentExtractor(self::$contentExtractorConfig);

    $config = new SiteConfig();
    $config->body = ['//div'];
    $config->src_lazy_load_attr = 'data-toto-src';

    $res = $contentExtractor->process(
        $html,
        'https://lemonde.io/35941909',
        $config
    );

    $this->assertTrue($res, 'Extraction went well');
    $this->assertStringContainsString($htmlExpected, $this->getXmlContent($contentExtractor));
}

/**
 * An iframe next to the body must be kept, with "[embedded content]" as its text.
 */
public function testIframeEmbeddedContent(): void
{
    $contentExtractor = new ContentExtractor(self::$contentExtractorConfig);

    $config = new SiteConfig();
    // '//header' is a bad pattern, and it will jump to the next one
    $config->body = ['//header', '//div'];
    // obviously a bad parser which will be converted to use the default one
    $config->parser = 'toto';

    $res = $contentExtractor->process(
        '<div>' . str_repeat('this is the best part of the show', 10) . '</div><div class="video_player"><iframe src="http://www.dailymotion.com/embed/video/x2kjh59" frameborder="0" width="534" height="320"></iframe></div>',
        'https://lemonde.io/35941909',
        $config
    );

    $this->assertTrue($res, 'Extraction went well');
    $this->assertStringContainsString('<iframe src="http://www.dailymotion.com/embed/video/x2kjh59" frameborder="0" width="534" height="320">[embedded content]</iframe>', $this->getXmlContent($contentExtractor));
}

/**
 * Check the messages logged, in order, while processing a nearly empty document.
 */
public function testLogMessage(): void
{
    $logger = new Logger('foo');
    $handler = new TestHandler(Logger::INFO);
    $logger->pushHandler($handler);

    $contentExtractor = new ContentExtractor(self::$contentExtractorConfig);
    $contentExtractor->setLogger($logger);

    $config = new SiteConfig();

    $contentExtractor->process(
        '<html><iframe ></iframe></html>',
        'https://vimeo.com/35941909',
        $config
    );

    $records = $handler->getRecords();

    // Compare the number of records, not the array itself (an array always compares
    // greater than an int). Index 9 is read below, so at least 10 records are needed.
    $this->assertGreaterThanOrEqual(10, \count($records));
    $this->assertSame('Attempting to parse HTML with {parser}', $records[0]['message']);
    $this->assertSame('libxml', $records[0]['context']['parser']);
    $this->assertSame('Opengraph "og:" data: {ogData}', $records[2]['message']);
    $this->assertSame('Opengraph "article:" data: {ogData}', $records[3]['message']);
    $this->assertSame('Trying {pattern} for language', $records[4]['message']);
    $this->assertSame('Trying {pattern} for language', $records[5]['message']);
    $this->assertSame('Using Readability', $records[6]['message']);
    $this->assertSame('Date is bad (wrong year): {date}', $records[7]['message']);
    $this->assertSame('Attempting to parse HTML with {parser}', $records[9]['message']);
}

/**
 * Custom readability pre/post filters are merged into the extractor config;
 * the <head> and <base> elements must not appear in the result.
 */
public function testWithCustomFiltersForReadability(): void
{
    $contentExtractor = new ContentExtractor(
        self::$contentExtractorConfig
        + ['readability' => [
            'post_filters' => ['!<head[^>]*>(.*?)</head>!is' => ''],
            'pre_filters' => ['!</?noscript>!is' => ''],
        ]]
    );

    $config = new SiteConfig();

    $res = $contentExtractor->process(
        '<!DOCTYPE html>
<html lang="fr" dir="ltr">
<head>
<base href="http://www.lhc-france.fr/" />
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="generator" content="SPIP 3.0.17 [21515]" />
<link rel="shortcut icon" href="squelettes/favicon.ico" />
<script type=\'text/javascript\'>
document.createElement("header");document.createElement("footer");document.createElement("section");document.createElement("aside");document.createElement("nav");document.createElement("article");document.createElement("time");
</script>
<!--[if lt IE 9]>
 <meta http-equiv="X-UA-Compatible" content="IE=edge" />
 <script type="text/javascript" src="http://www.lhc-france.fr/squelettes/js/ie.js"></script>
<![endif]-->
<script type="text/javascript" src="http://www.lhc-france.fr/squelettes/js/modernizr.js"></script>
<script type="text/javascript">
function handleError(){return true;}
window.onerror = handleError;
dossier_squelettes = \'squelettes\';
secteurid=6;articleid=907;article_jour=19;article_mois=12;article_annee=2016;
</script>
<link rel="alternate" type="application/rss+xml" title="Actualités du LHC" href="http://feeds.feedburner.com/lhcfranceactus?format=xml" />
<link rel="alternate" type="application/rss+xml" title="La BD du LHC" href="http://www.lhc-france.fr/?page=backend&id_rubrique=65" />
<link rel="stylesheet" href="http://www.lhc-france.fr/local/cache-css/styles-urlabs-b1fc-urlabs-b1fc-minify-3f10.css" type="text/css" media="all" />
<link rel="stylesheet" href="http://www.lhc-france.fr/local/cache-css/milkbox-urlabs-fe01-urlabs-fe01-minify-1d16.css" media="screen" />
<link rel="stylesheet" href="http://www.lhc-france.fr/local/cache-css/styles.print-urlabs-2157-urlabs-2157-minify-d3e7.css" type="text/css" media="print" />
<link rel="stylesheet" href="http://www.lhc-france.fr/squelettes/styles.rouge.css" type="text/css" media="all" />
<script type="text/javascript" src="http://www.lhc-france.fr/local/cache-js/AC_RunActiveContent-minify-d850.js"></script>
<title>Novembre 2016 - Je voudrais de la matière noire à Noël... | LHC France</title>
<meta name="robots" content="index, follow, all" />
<meta name="description" content="La contribution du CNRS et du CEA au LHC, un instrument international de physique des particules situé au Cern. Avec toute l\'actualité du projet et la BD du LHC." />
<meta name="keywords" content="LHC,Higgs,Atlas,CMS,Alice,LHCb,accélérateur,particule,Cern,grille,détecteur,expériences,boson de higgs" />
<meta name="verify-v1" content="WWk3UJy6FdmEUs2ZATuUi6+OQnIL3Sci3WmPHmaWQWs=" />
<meta name="verify-v1" content="VAs7L6UxdHUoi699A76rt8aDBfL4c6hBE3vJw2SRbh4=" />
<meta property="og:image" content="http://www.lhc-france.fr/IMG/arton907.jpg" />
<meta property="fb:admins" content="thomas.diluccio,proyoledegieux"/>
</head>
<body class="rouge "><p>' . str_repeat('This is important. ', 20) . '</p></body></html>',
        'https://lemonde.io/35941909',
        $config
    );

    $this->assertTrue($res, 'Extraction went well');
    $this->assertStringNotContainsString('<head>', $this->getXmlContent($contentExtractor));
    $this->assertStringNotContainsString('<base>', $this->getXmlContent($contentExtractor));
}

/**
 * A page whose og:url differs from the requested URL is flagged as a native ad.
 */
public function testNativeAd(): void
{
    $contentExtractor = new ContentExtractor(self::$contentExtractorConfig);

    $res = $contentExtractor->process(
        ' <meta property="og:url" content="https://nativead.io/sponsored/woops"/><p>hihi</p>',
        'https://nativead.io/woops!'
    );

    $this->assertTrue($res, 'Extraction went well');
    $this->assertTrue($contentExtractor->isNativeAd());
    $this->assertStringContainsString('<p>hihi</p>', $this->getXmlContent($contentExtractor));
}

/**
 * Title, date, author and image are read from a JSON-LD NewsArticle block.
 */
public function testJsonLd(): void
{
    $contentExtractor = new ContentExtractor(self::$contentExtractorConfig);

    $res = $contentExtractor->process(
        ' <script type="application/ld+json">{ "@context": "https:\/\/schema.org", "@type": "NewsArticle", "headline": "title !!", "mainEntityOfPage": "http:\/\/jsonld.io\/toto", "datePublished": "2017-10-23T16:05:38+02:00", "dateModified": "2017-10-23T16:06:28+02:00", "description": "it is describe", "articlebody": " my body", "relatedLink": "", "image": { "@type": "ImageObject", "url": "https:\/\/static.jsonld.io\/medias.jpg", "height": "830", "width": "532" }, "author": { "@type": "Person", "name": "bob", "sameAs": ["https:\/\/twitter.com\/bob"] }, "keywords": ["syndicat", "usine", "licenciement", "Emmanuel Macron", "creuse", "plan social", "Automobile"] }</script><p>hihi</p>',
        'https://nativead.io/jsonld'
    );

    $this->assertTrue($res, 'Extraction went well');
    $this->assertSame('title !!', $contentExtractor->getTitle());
    $this->assertSame('2017-10-23T16:05:38+02:00', $contentExtractor->getDate());
    $this->assertStringContainsString('bob', (string) ((array) $contentExtractor->getAuthors())[0]);
    $this->assertSame('https://static.jsonld.io/medias.jpg', $contentExtractor->getImage());
    $this->assertStringContainsString('<p>hihi</p>', $this->getXmlContent($contentExtractor));
}

/**
 * A JSON-LD "author" array yields one author per entry, in order.
 */
public function testJsonLdWithMultipleAuthors(): void
{
    $contentExtractor = new ContentExtractor(self::$contentExtractorConfig);

    $res = $contentExtractor->process(
        '<script type="application/ld+json">{"@context":"https://schema.org","@type":"NewsArticle","author":[{"@type":"Person","name":"Elisa Thevenet"},{"@type":"Person","name":"Humphrey Bogart"}]}</script>',
        'https://nativead.io/jsonld'
    );

    /** @var \DOMNode */
    $contentBlock = $contentExtractor->getContent();

    $this->assertSame([
        'Elisa Thevenet',
        'Humphrey Bogart',
    ], $contentExtractor->getAuthors());
}

/**
 * Processing an empty document fails and leaves the image empty.
 */
public function testNoDefinedHtml(): void
{
    $contentExtractor = new ContentExtractor(self::$contentExtractorConfig);

    $res = $contentExtractor->process('', 'https://nativead.io/jsonld');

    $this->assertFalse($res);
    $this->assertEmpty($contentExtractor->getImage());
}

/**
 * Title, date, language and image are read from OpenGraph meta tags.
 */
public function testOpenGraph(): void
{
    $contentExtractor = new ContentExtractor(self::$contentExtractorConfig);

    $res = $contentExtractor->process(
        ' <meta property="og:title" content="title !!"/>
 <meta property="og:site_name" content="opengraph.io" />
 <meta property="og:type" content="article"/>
 <meta property="og:locale" content="fr_FR"/>
 <meta property="og:url" content="//opengraph.io/1954872.html"/>
 <meta property="article:published_time" content="2017-10-23T17:04:21Z-09:00"/>
 <meta property="article:modified_time" content="2017-10-23T17:04:17Z-09:00"/>
 <meta property="og:image" content="http://static.opengraph.io/medias_11570.jpg"/>
 <meta property="og:image:url" content="http://static.opengraph.io/medias_11570.jpg"/>
 <meta property="og:image:secure_url" content="https://static.opengraph.io/medias_11570.jpg"/>
 <p>hihi</p>',
        'https://nativead.io/opengraph'
    );

    $this->assertTrue($res);
    $this->assertSame('title !!', $contentExtractor->getTitle());
    $this->assertSame('2017-10-23T17:04:21+00:00', $contentExtractor->getDate());
    $this->assertSame('fr_FR', $contentExtractor->getLanguage());
    $this->assertSame('https://static.opengraph.io/medias_11570.jpg', $contentExtractor->getImage());
    $this->assertStringContainsString('<p>hihi</p>', $this->getXmlContent($contentExtractor));
}

/**
 * An empty og:image value must leave the image empty.
 */
public function testAvoidDataUriImageInOpenGraph(): void
{
    $contentExtractor = new ContentExtractor(self::$contentExtractorConfig);

    $res = $contentExtractor->process(
        ' <html><meta content="" property="og:image" /><meta content="http://www.io.lol" property="og:url"/><p>hihi</p></html>',
        'https://nativead.io/opengraph'
    );

    $this->assertTrue($res);
    $this->assertEmpty($contentExtractor->getImage());
    $this->assertStringContainsString('<p>hihi</p>', $this->getXmlContent($contentExtractor));
}

/**
 * With a NewsArticle block followed by an Organization block, title and author
 * come from the NewsArticle.
 */
public function testJsonLdIgnoreList(): void
{
    $contentExtractor = new ContentExtractor(self::$contentExtractorConfig);

    $res = $contentExtractor->process(
        '<html><body><script type="application/ld+json">{ "@context": "http:\/\/schema.org", "@type": "NewsArticle", "publisher": { "@type": "Organization", "name": "Foobar Company" }, "description": "A method for fooling tools", "mainEntityOfPage": { "@type": "WebPage", "@id": "https:\/\/www.example.com/foobar" }, "headline": "The Foobar Company is launching globally", "datePublished": "2019-01-14T16:02:00.000+00:00", "dateModified": "2019-01-14T13:25:09.980+00:00", "author": { "@type": "Person", "name": "Foobar CEO" } }</script> <script type="application/ld+json">{ "@context": "http:\/\/schema.org", "@type": "Organization", "name": "Foobar Company", "url": "https:\/\/www.example.com" }</script><p>' . str_repeat('this is the best part of the show', 10) . '</p></body></html>',
        'https://example.com/jsonld'
    );

    $this->assertTrue($res, 'Extraction went well');
    $this->assertSame('The Foobar Company is launching globally', $contentExtractor->getTitle());
    $this->assertStringContainsString('Foobar CEO', (string) ((array) $contentExtractor->getAuthors())[0]);
}

/**
 * With a Periodical block and an Organization block, the title is taken from the <h1>.
 */
public function testJsonLdIgnoreListWithPeriodical(): void
{
    $contentExtractor = new ContentExtractor(self::$contentExtractorConfig);

    $res = $contentExtractor->process(
        '<html><body><script type="application/ld+json">{ "@context": "http:\/\/schema.org", "@type": "Periodical", "publisher": { "@type": "Organization", "name": "Foobar Company" }, "description": "A method for fooling tools", "mainEntityOfPage": { "@type": "WebPage", "@id": "https:\/\/www.example.com/foobar" }, "name": "Foobar Company", "datePublished": "2019-01-14T16:02:00.000+00:00", "dateModified": "2019-01-14T13:25:09.980+00:00", "author": { "@type": "Person", "name": "Foobar CEO" } }</script> <script type="application/ld+json">{ "@context": "http:\/\/schema.org", "@type": "Organization", "name": "Foobar Company", "url": "https:\/\/www.example.com" }</script><h1>Hello world, this is title</h1><p>' . str_repeat('this is the best part of the show', 10) . '</p></body></html>',
        'https://example.com/jsonld'
    );

    $this->assertTrue($res, 'Extraction went well');
    $this->assertSame('Hello world, this is title', $contentExtractor->getTitle());
}

/**
 * With skip_json_ld enabled, no title, date or authors are taken from the JSON-LD block.
 */
public function testJsonLdSkipper(): void
{
    $contentExtractor = new ContentExtractor(self::$contentExtractorConfig);

    $config = new SiteConfig();
    $config->skip_json_ld = true;

    $res = $contentExtractor->process(
        '<html><script type="application/ld+json">{ "@context": "https:\/\/schema.org", "@type": "NewsArticle", "headline": "title !!", "mainEntityOfPage": "http:\/\/jsonld.io\/toto", "datePublished": "2017-10-23T16:05:38+02:00", "dateModified": "2017-10-23T16:06:28+02:00", "description": "it is describe", "articlebody": " my body", "relatedLink": "", "image": { "@type": "ImageObject", "url": "https:\/\/static.jsonld.io\/medias.jpg", "height": "830", "width": "532" }, "author": { "@type": "Person", "name": "bob", "sameAs": ["https:\/\/twitter.com\/bob"] }, "keywords": ["syndicat", "usine", "licenciement", "Emmanuel Macron", "creuse", "plan social", "Automobile"] }</script><body><div>hello !hello !hello !hello !hello !hello !hello !<p itemprop="articleBody">' . str_repeat('this is the best part of the show', 10) . '</p></div></body></html>',
        'https://skipjsonld.io/jsonld',
        $config
    );

    $this->assertTrue($res, 'Extraction went well');
    $this->assertEmpty($contentExtractor->getTitle());
    $this->assertNull($contentExtractor->getDate());
    $this->assertEmpty($contentExtractor->getAuthors());
    $this->assertStringContainsString('this is the best part of the show', $this->getXmlContent($contentExtractor));
}

/**
 * When both "headline" and "name" are present in JSON-LD, the title is the "name".
 */
public function testJsonLdName(): void
{
    $contentExtractor = new ContentExtractor(self::$contentExtractorConfig);

    $res = $contentExtractor->process(
        ' <script type="application/ld+json">{ "@context": "https:\/\/schema.org", "@type": "NewsArticle", "headline": "title !!", "name": "name !!", "mainEntityOfPage": "http:\/\/jsonld.io\/toto", "datePublished": "2017-10-23T16:05:38+02:00", "dateModified": "2017-10-23T16:06:28+02:00", "description": "it is describe", "articlebody": " my body", "relatedLink": "", "image": { "@type": "ImageObject", "url": "https:\/\/static.jsonld.io\/medias.jpg", "height": "830", "width": "532" }, "author": { "@type": "Person", "name": "bob", "sameAs": ["https:\/\/twitter.com\/bob"] }, "keywords": ["syndicat", "usine", "licenciement", "Emmanuel Macron", "creuse", "plan social", "Automobile"] }</script><p>hihi</p>',
        'https://nativead.io/jsonld'
    );

    $this->assertSame('name !!', $contentExtractor->getTitle());
}

/**
 * A JSON-LD "datePublished" given as an array is still parsed into a date.
 */
public function testJsonLdDateArray(): void
{
    $contentExtractor = new ContentExtractor(self::$contentExtractorConfig);

    // the apostrophe in "Editor's" is escaped so it does not end the single-quoted literal
    $res = $contentExtractor->process(
        ' <script type="application/ld+json">{ "@context": "http://schema.org", "@type": "NewsArticle", "description": "Smoke rises from the 998-tonne fuel tanker Shoko Maru after it exploded off the coast of Himeji, western Japan, in this photo taken and released May 29, 2014. REUTERS/5th Regional Coast Guard Headqua", "headline": "Editor\'s choice", "url": "https://www.reuters.com/news/picture/editors-choice-idUSRTR3RD95", "thumbnailUrl": "https://s3.reutersmedia.net/resources/r/?m=02&d=20140529&t=2&i=901254582&w=&fh=810&fw=545&ll=&pl=&sq=&r=2014-05-29T132753Z_2_GM1EA5T1BTD01_RTRMADP_0_JAPAN", "dateCreated": "2014-05-29T13:27:53+0000", "dateModified": "2014-05-29T13:27:53+0000", "articleSection": "RCOMUS_24", "creator": ["JaShong King"], "keywords": ["24 HOURS IN PICTURES", "Slideshow"], "about": "Slideshow", "author": ["JaShong King"], "datePublished": ["05/29/2014"] }</script><p>hihi</p>',
        'https://nativead.io/jsonld'
    );

    $this->assertSame('2014-05-29T00:00:00+02:00', $contentExtractor->getDate());
}

/**
 * A JSON-LD image "url" given as an array yields its first entry as the image.
 */
public function testJsonLdImageUrlArray(): void
{
    $contentExtractor = new ContentExtractor(self::$contentExtractorConfig);

    // the apostrophe in "Editor's" is escaped so it does not end the single-quoted literal
    $res = $contentExtractor->process(
        ' <script type="application/ld+json">{ "@context": "http://schema.org", "@type": "NewsArticle", "description": "Smoke rises from the 998-tonne fuel tanker Shoko Maru after it exploded off the coast of Himeji, western Japan, in this photo taken and released May 29, 2014. REUTERS/5th Regional Coast Guard Headqua", "headline": "Editor\'s choice", "url": "https://www.reuters.com/news/picture/editors-choice-idUSRTR3RD95", "thumbnailUrl": "https://s3.reutersmedia.net/resources/r/?m=02&d=20140529&t=2&i=901254582&w=&fh=810&fw=545&ll=&pl=&sq=&r=2014-05-29T132753Z_2_GM1EA5T1BTD01_RTRMADP_0_JAPAN", "dateCreated": "2014-05-29T13:27:53+0000", "dateModified": "2014-05-29T13:27:53+0000", "articleSection": "RCOMUS_24", "creator": ["JaShong King"], "keywords": ["24 HOURS IN PICTURES", "Slideshow"], "about": "Slideshow", "author": ["JaShong King"], "datePublished": ["05/29/2014"], "image": { "@type": "ImageObject", "url": [ "https://statics.estadao.com.br/s2016/portal/img/json-ld/estadao_1x1.png", "https://statics.estadao.com.br/s2016/portal/img/json-ld/estadao_4x3.png", "https://statics.estadao.com.br/s2016/portal/img/json-ld/estadao_16x9.png" ]} }</script><p>hihi</p>',
        'https://nativead.io/jsonld'
    );

    $this->assertSame('https://statics.estadao.com.br/s2016/portal/img/json-ld/estadao_1x1.png', $contentExtractor->getImage());
}

/**
 * The same author found both in JSON-LD and in the page markup must be reported once.
 */
public function testUniqueAuthors(): void
{
    $url = 'https://www.lemonde.fr/pixels/article/2018/05/30/bloodstained-curse-of-the-moon-delicieux-jeu-de-vampires-a-la-mode-des-annees-1980_5307173_4408996.html';
    $html = '<script type="application/ld+json">{"author":{"@type":"Person","name":"William Audureau"}}</script><a class="auteur" target="_blank" href="/journaliste/william-audureau/">William Audureau</a>';

    $contentExtractor = new ContentExtractor(self::$contentExtractorConfig);
    $siteConfig = $contentExtractor->buildSiteConfig($url);

    $contentExtractor->process(
        $html,
        $url,
        $siteConfig
    );

    $authors = (array) $contentExtractor->getAuthors();
    $authorsUnique = array_unique($authors);

    // assertSame reports both counts on failure, unlike assertTrue on a comparison
    $this->assertSame(\count($authorsUnique), \count($authors), 'There is no duplicate authors');
}

public function testBodyAsDomAttribute(): void
{
    $contentExtractor = new
ContentExtractor(self::$contentExtractorConfig);904 $config = new SiteConfig();905 // a xpath retrieving a dom attribute906 $config->body = ['//iframe/@src'];907 $res = $contentExtractor->process(908 ' <iframe src="blog_0x34.md.html" frameborder="0" style="overflow:hidden; display:block; position: absolute; height: 80%; width:100%;"></iframe>',909 'https://domattr.io/woops!',910 $config911 );912 $this->assertFalse($res, 'Extraction failed');913 }914 public function testBadDate(): void915 {916 $contentExtractor = new ContentExtractor(self::$contentExtractorConfig);917 $res = $contentExtractor->process(918 ' <meta property="article:published_time" content="-0001-11-30T00:00:00+00:00" /> <p>' . str_repeat('this is the best part of the show', 10) . '</p> ',919 'https://domattr.io/woops!'920 );921 $this->assertTrue($res, 'Extraction went fine');922 $this->assertNull($contentExtractor->getDate(), 'Date got vanish because it was wrong');923 }924 public function dataForProcessWrapIn(): array925 {926 return [927 // blockquote with a nested div928 [929 [930 'blockquote' => "//div[@class='cond1']",931 ],932 "//blockquote/div[@class='cond1']/p",...
extractor.php
Source:extractor.php
...283 ->array(annotations\extractor::toArray($value = uniqid()))->isEqualTo(array($value))284 ->array(annotations\extractor::toArray(($value = uniqid()) . ' ' . ($otherValue = uniqid())))->isEqualTo(array($value, $otherValue))285 ;286 }287 protected static function repeat($char, $max, $min = 1)288 {289 return str_repeat($char, rand($min, rand($min, $max)));290 }291 protected static function space($max = 10, $min = 1)292 {293 return self::repeat(' ', $max, $min);294 }295 protected static function star($max = 10, $min = 2)296 {297 return self::repeat('*', $max, $min);298 }299}...
repeat
Using AI Code Generation
1$extractor = new Extractor();2$extractor->repeat("Hello World", 5);3$extractor = new Extractor();4$extractor->repeat("Hello World", 5);5$extractor = new Extractor();6$extractor->repeat("Hello World", 5);7$extractor = new Extractor();8$extractor->repeat("Hello World", 5);9$extractor = new Extractor();10$extractor->repeat("Hello World", 5);11$extractor = new Extractor();12$extractor->repeat("Hello World", 5);13$extractor = new Extractor();14$extractor->repeat("Hello World", 5);15$extractor = new Extractor();16$extractor->repeat("Hello World", 5);17$extractor = new Extractor();18$extractor->repeat("Hello World", 5);19$extractor = new Extractor();20$extractor->repeat("Hello World", 5);21$extractor = new Extractor();22$extractor->repeat("Hello World", 5);23$extractor = new Extractor();24$extractor->repeat("Hello World", 5);25$extractor = new Extractor();26$extractor->repeat("Hello World", 5);27$extractor = new Extractor();28$extractor->repeat("Hello World",
repeat
Using AI Code Generation
1$extractor = new Extractor();2$extractor->repeat('Hello',3);3$extractor = new Extractor();4$extractor->repeat('Hello',3);5To solve this problem, we can use the include_once() function. This function is used to include a file only once. So, if we use include_once() function in the 1.php file, the Extractor class will be loaded only once and if we use include_once() function in the 2.php file, the Extractor class will not be loaded again. So, the repeat method will be called only once. The code will look like this:6include_once 'Extractor.php';7$extractor = new Extractor();8$extractor->repeat('Hello',3);9include_once 'Extractor.php';10$extractor = new Extractor();11$extractor->repeat('Hello',3);
repeat
Using AI Code Generation
1$extractor = new Extractor();2$extractor->repeat(2, 2);3$extractor->repeat(3, 3);4$extractor = new Extractor();5$extractor->repeat(2, 2);6$extractor->repeat(3, 3);7$extractor = new Extractor();8$extractor->repeat(2, 2);9$extractor->repeat(3, 3);10$extractor = new Extractor();11$extractor->repeat(2, 2);12$extractor->repeat(3, 3);13$extractor = new Extractor();14$extractor->repeat(2, 2);15$extractor->repeat(3, 3);16$extractor = new Extractor();17$extractor->repeat(2, 2);18$extractor->repeat(3, 3);19$extractor = new Extractor();20$extractor->repeat(2, 2);21$extractor->repeat(3, 3);22$extractor = new Extractor();23$extractor->repeat(2, 2);24$extractor->repeat(3, 3);25$extractor = new Extractor();26$extractor->repeat(2, 2);27$extractor->repeat(3, 3);28$extractor = new Extractor();29$extractor->repeat(2, 2);30$extractor->repeat(3, 3);31$extractor = new Extractor();32$extractor->repeat(2, 2);33$extractor->repeat(3,
repeat
Using AI Code Generation
1require_once("extractor.class.php");2$extractor = new extractor();3$extractor->repeat(10);4require_once("extractor.class.php");5$extractor = new extractor();6$extractor->repeat(20);7require_once("extractor.class.php");8$extractor = new extractor();9$extractor->repeat(30);10require_once("extractor.class.php");11$extractor = new extractor();12$extractor->repeat(40);13require_once("extractor.class.php");14$extractor = new extractor();15$extractor->repeat(50);16require_once("extractor.class.php");17$extractor = new extractor();18$extractor->repeat(60);19require_once("extractor.class.php");20$extractor = new extractor();21$extractor->repeat(70);22require_once("extractor.class.php");23$extractor = new extractor();24$extractor->repeat(80);25require_once("extractor.class.php");26$extractor = new extractor();27$extractor->repeat(90);28require_once("extractor.class.php");29$extractor = new extractor();30$extractor->repeat(100);31require_once("extractor.class.php");32$extractor = new extractor();33$extractor->repeat(110);34require_once("extractor.class.php");
repeat
Using AI Code Generation
1require_once 'extractor.php';2$extractor = new extractor;3$extractor->repeat('test', 5);4require_once 'extractor.php';5$extractor = new extractor;6$extractor->repeat('test', 5);7require_once 'extractor.php';8$extractor = new extractor;9$extractor->repeat('test', 5);10require_once 'extractor.php';11$extractor = new extractor;12$extractor->repeat('test', 5);13require_once 'extractor.php';14$extractor = new extractor;15$extractor->repeat('test', 5);16require_once 'extractor.php';17$extractor = new extractor;18$extractor->repeat('test', 5);19require_once 'extractor.php';20$extractor = new extractor;21$extractor->repeat('test', 5);22require_once 'extractor.php';23$extractor = new extractor;24$extractor->repeat('test', 5);25require_once 'extractor.php';26$extractor = new extractor;27$extractor->repeat('test', 5);28require_once 'extractor.php';29$extractor = new extractor;30$extractor->repeat('test', 5);31require_once 'extractor.php';32$extractor = new extractor;33$extractor->repeat('test', 5);34require_once 'extractor.php';35$extractor = new extractor;36$extractor->repeat('test', 5);
repeat
Using AI Code Generation
1require_once 'extractor.php';2$extractor = new Extractor();3$extractor->setRepeat(1);4$extractor->getRepeat();5$extractor->getHtml();6$extractor->getLinks();7require_once 'extractor.php';8$extractor = new Extractor();9$extractor->setRepeat(2);10$extractor->getRepeat();11$extractor->getHtml();12$extractor->getLinks();13require_once 'extractor.php';14$extractor = new Extractor();15$extractor->setRepeat(3);16$extractor->getRepeat();17$extractor->getHtml();18$extractor->getLinks();19require_once 'extractor.php';20$extractor = new Extractor();21$extractor->setRepeat(4);22$extractor->getRepeat();23$extractor->getHtml();24$extractor->getLinks();25require_once 'extractor.php';26$extractor = new Extractor();27$extractor->setRepeat(5);28$extractor->getRepeat();29$extractor->getHtml();30$extractor->getLinks();31require_once 'extractor.php';32$extractor = new Extractor();33$extractor->setRepeat(6);34$extractor->getRepeat();35$extractor->getHtml();36$extractor->getLinks();37require_once 'extractor.php';38$extractor = new Extractor();39$extractor->setRepeat(7);40$extractor->getRepeat();
Learn to execute automation testing from scratch with the LambdaTest Learning Hub, from setting up the prerequisites and running your first automation test to following best practices and diving deeper into advanced test scenarios. The LambdaTest Learning Hubs compile step-by-step guides to help you become proficient with different test automation frameworks, e.g. Selenium, Cypress, TestNG, etc.
You can also refer to the video tutorials on the LambdaTest YouTube channel for step-by-step demonstrations from industry experts.
Execute automation tests with repeat on a cloud-based Grid of 3000+ real browsers and operating systems for both web and mobile applications.
Test now for free. Get 100 automation test minutes FREE!!