Best Python code snippet using tempest_python
test_tree.py
Source:test_tree.py  
1import datetime2import difflib3import textwrap4from decimal import Decimal5from email.utils import format_datetime6from unittest import TestCase7import requests_mock8from dateutil.tz import tzoffset9from tests.helpers import gzip10from usp.log import create_logger11from usp.objects.page import (12    SitemapPage,13    SitemapNewsStory,14    SitemapPageChangeFrequency,15)16from usp.objects.sitemap import (17    IndexRobotsTxtSitemap,18    PagesXMLSitemap,19    IndexXMLSitemap,20    InvalidSitemap,21    PagesTextSitemap,22    IndexWebsiteSitemap,23    PagesRSSSitemap,24    PagesAtomSitemap,25)26from usp.tree import sitemap_tree_for_homepage27# FIXME various exotic properties28# FIXME XML vulnerabilities with Expat29# FIXME max. recursion level30# FIXME tests responses that are too big31log = create_logger(__name__)32class TestSitemapTree(TestCase):33    TEST_BASE_URL = 'http://test_ultimate-sitemap-parser.com'  # mocked by HTTPretty34    # Publication / "last modified" date35    TEST_DATE_DATETIME = datetime.datetime(36        year=2009, month=12, day=17, hour=12, minute=4, second=56,37        tzinfo=tzoffset(None, 7200),38    )39    TEST_DATE_STR_ISO8601 = TEST_DATE_DATETIME.isoformat()40    """Test string date formatted as ISO 8601 (for XML and Atom 0.3 / 1.0 sitemaps)."""41    TEST_DATE_STR_RFC2822 = format_datetime(TEST_DATE_DATETIME)42    """Test string date formatted as RFC 2822 (for RSS 2.0 sitemaps)."""43    TEST_PUBLICATION_NAME = 'Test publication'44    TEST_PUBLICATION_LANGUAGE = 'en'45    @staticmethod46    def fallback_to_404_not_found_matcher(request):47        """Reply with "404 Not Found" to unmatched URLs instead of throwing NoMockAddress."""48        return requests_mock.create_response(49            request,50            status_code=404,51            reason='Not Found',52            headers={'Content-Type': 'text/html'},53            text="<h1>404 Not Found!</h1>",54        )55    # noinspection DuplicatedCode56    def 
test_sitemap_tree_for_homepage(self):57        """Test sitemap_tree_for_homepage()."""58        with requests_mock.Mocker() as m:59            m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher)60            m.get(61                self.TEST_BASE_URL + '/',62                text='This is a homepage.',63            )64            m.get(65                self.TEST_BASE_URL + '/robots.txt',66                headers={'Content-Type': 'text/plain'},67                text=textwrap.dedent("""68                    User-agent: *69                    Disallow: /whatever70    71                    Sitemap: {base_url}/sitemap_pages.xml72                    73                    # Intentionally spelled as "Site-map" as Google tolerates this:74                    # https://github.com/google/robotstxt/blob/master/robots.cc#L703 75                    Site-map: {base_url}/sitemap_news_index_1.xml76                """.format(base_url=self.TEST_BASE_URL)).strip(),77            )78            # One sitemap for random static pages79            m.get(80                self.TEST_BASE_URL + '/sitemap_pages.xml',81                headers={'Content-Type': 'application/xml'},82                text=textwrap.dedent("""83                    <?xml version="1.0" encoding="UTF-8"?>84                    <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">85                        <url>86                            <loc>{base_url}/about.html</loc>87                            <lastmod>{last_modified_date}</lastmod>88                            <changefreq>monthly</changefreq>89                            <priority>0.8</priority>90                        </url>91                        <url>92                            <loc>{base_url}/contact.html</loc>93                            <lastmod>{last_modified_date}</lastmod>94    95                            <!-- Invalid change frequency -->96                            <changefreq>when we feel like it</changefreq>97    98           
                 <!-- Invalid priority -->99                            <priority>1.1</priority>100    101                        </url>102                    </urlset>103                """.format(base_url=self.TEST_BASE_URL, last_modified_date=self.TEST_DATE_STR_ISO8601)).strip(),104            )105            # Index sitemap pointing to sitemaps with stories106            m.get(107                self.TEST_BASE_URL + '/sitemap_news_index_1.xml',108                headers={'Content-Type': 'application/xml'},109                text=textwrap.dedent("""110                    <?xml version="1.0" encoding="UTF-8"?>111                    <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">112                        <sitemap>113                            <loc>{base_url}/sitemap_news_1.xml</loc>114                            <lastmod>{last_modified}</lastmod>115                        </sitemap>116                        <sitemap>117                            <loc>{base_url}/sitemap_news_index_2.xml</loc>118                            <lastmod>{last_modified}</lastmod>119                        </sitemap>120                    </sitemapindex>121                """.format(base_url=self.TEST_BASE_URL, last_modified=self.TEST_DATE_STR_ISO8601)).strip(),122            )123            # First sitemap with actual stories124            m.get(125                self.TEST_BASE_URL + '/sitemap_news_1.xml',126                headers={'Content-Type': 'application/xml'},127                text=textwrap.dedent("""128                    <?xml version="1.0" encoding="UTF-8"?>129                    <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"130                            xmlns:news="http://www.google.com/schemas/sitemap-news/0.9"131                            xmlns:xhtml="http://www.w3.org/1999/xhtml">132    133                        <url>134                            <loc>{base_url}/news/foo.html</loc>135    136                            <!-- Element 
present but empty -->137                            <lastmod />138    139                            <!-- Some other XML namespace -->140                            <xhtml:link rel="alternate"141                                        media="only screen and (max-width: 640px)"142                                        href="{base_url}/news/foo.html?mobile=1" />143    144                            <news:news>145                                <news:publication>146                                    <news:name>{publication_name}</news:name>147                                    <news:language>{publication_language}</news:language>148                                </news:publication>149                                <news:publication_date>{publication_date}</news:publication_date>150                                <news:title>Foo <foo></news:title>    <!-- HTML entity decoding -->151                            </news:news>152                        </url>153    154                        <!-- Has a duplicate story in /sitemap_news_2.xml -->155                        <url>156                            <loc>{base_url}/news/bar.html</loc>157                            <xhtml:link rel="alternate"158                                        media="only screen and (max-width: 640px)"159                                        href="{base_url}/news/bar.html?mobile=1" />160                            <news:news>161                                <news:publication>162                                    <news:name>{publication_name}</news:name>163                                    <news:language>{publication_language}</news:language>164                                </news:publication>165                                <news:publication_date>{publication_date}</news:publication_date>166                                <news:title>Bar & bar</news:title>167                            </news:news>168                        </url>169    170                    </urlset>171         
       """.format(172                    base_url=self.TEST_BASE_URL,173                    publication_name=self.TEST_PUBLICATION_NAME,174                    publication_language=self.TEST_PUBLICATION_LANGUAGE,175                    publication_date=self.TEST_DATE_STR_ISO8601,176                )).strip(),177            )178            # Another index sitemap pointing to a second sitemaps with stories179            m.get(180                self.TEST_BASE_URL + '/sitemap_news_index_2.xml',181                headers={'Content-Type': 'application/xml'},182                text=textwrap.dedent("""183                    <?xml version="1.0" encoding="UTF-8"?>184                    <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">185    186                        <sitemap>187                            <!-- Extra whitespace added around URL -->188                            <loc>  {base_url}/sitemap_news_2.xml  </loc>189                            <lastmod>{last_modified}</lastmod>190                        </sitemap>191    192                        <!-- Nonexistent sitemap -->193                        <sitemap>194                            <loc>{base_url}/sitemap_news_missing.xml</loc>195                            <lastmod>{last_modified}</lastmod>196                        </sitemap>197    198                    </sitemapindex>199                """.format(base_url=self.TEST_BASE_URL, last_modified=self.TEST_DATE_STR_ISO8601)).strip(),200            )201            # Second sitemap with actual stories202            m.get(203                self.TEST_BASE_URL + '/sitemap_news_2.xml',204                headers={'Content-Type': 'application/xml'},205                text=textwrap.dedent("""206                    <?xml version="1.0" encoding="UTF-8"?>207                    <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"208                            xmlns:news="http://www.google.com/schemas/sitemap-news/0.9"209                            
xmlns:xhtml="http://www.w3.org/1999/xhtml">210    211                        <!-- Has a duplicate story in /sitemap_news_1.xml -->212                        <url>213                            <!-- Extra whitespace added around URL -->214                            <loc>  {base_url}/news/bar.html  </loc>215                            <xhtml:link rel="alternate"216                                        media="only screen and (max-width: 640px)"217                                        href="{base_url}/news/bar.html?mobile=1#fragment_is_to_be_removed" />218                            <news:news>219                                <news:publication>220                                    <news:name>{publication_name}</news:name>221                                    <news:language>{publication_language}</news:language>222                                </news:publication>223                                <news:publication_date>{publication_date}</news:publication_date>224    225                                <tag_without_inner_character_data name="value" />226    227                                <news:title>Bar & bar</news:title>228                            </news:news>229                        </url>230    231                        <url>232                            <loc>{base_url}/news/baz.html</loc>233                            <xhtml:link rel="alternate"234                                        media="only screen and (max-width: 640px)"235                                        href="{base_url}/news/baz.html?mobile=1" />236                            <news:news>237                                <news:publication>238                                    <news:name>{publication_name}</news:name>239                                    <news:language>{publication_language}</news:language>240                                </news:publication>241                                <news:publication_date>{publication_date}</news:publication_date>242                  
              <news:title><![CDATA[BÄ
ž]]></news:title>    <!-- CDATA and UTF-8 -->243                            </news:news>244                        </url>245    246                    </urlset>247                """.format(248                    base_url=self.TEST_BASE_URL,249                    publication_name=self.TEST_PUBLICATION_NAME,250                    publication_language=self.TEST_PUBLICATION_LANGUAGE,251                    publication_date=self.TEST_DATE_STR_ISO8601,252                )).strip(),253            )254            # Nonexistent sitemap255            m.get(256                self.TEST_BASE_URL + '/sitemap_news_missing.xml',257                status_code=404,258                reason='Not Found',259                headers={'Content-Type': 'text/html'},260                text="<h1>404 Not Found!</h1>",261            )262            expected_sitemap_tree = IndexWebsiteSitemap(263                url='{}/'.format(self.TEST_BASE_URL),264                sub_sitemaps=[265                    IndexRobotsTxtSitemap(266                        url='{}/robots.txt'.format(self.TEST_BASE_URL),267                        sub_sitemaps=[268                            PagesXMLSitemap(269                                url='{}/sitemap_pages.xml'.format(self.TEST_BASE_URL),270                                pages=[271                                    SitemapPage(272                                        url='{}/about.html'.format(self.TEST_BASE_URL),273                                        last_modified=self.TEST_DATE_DATETIME,274                                        news_story=None,275                                        change_frequency=SitemapPageChangeFrequency.MONTHLY,276                                        priority=Decimal('0.8'),277                                    ),278                                    SitemapPage(279                                        url='{}/contact.html'.format(self.TEST_BASE_URL),280                                        
last_modified=self.TEST_DATE_DATETIME,281                                        news_story=None,282                                        # Invalid input -- should be reset to "always"283                                        change_frequency=SitemapPageChangeFrequency.ALWAYS,284                                        # Invalid input -- should be reset to 0.5 (the default as per the spec)285                                        priority=Decimal('0.5'),286                                    )287                                ],288                            ),289                            IndexXMLSitemap(290                                url='{}/sitemap_news_index_1.xml'.format(self.TEST_BASE_URL),291                                sub_sitemaps=[292                                    PagesXMLSitemap(293                                        url='{}/sitemap_news_1.xml'.format(self.TEST_BASE_URL),294                                        pages=[295                                            SitemapPage(296                                                url='{}/news/foo.html'.format(self.TEST_BASE_URL),297                                                news_story=SitemapNewsStory(298                                                    title='Foo <foo>',299                                                    publish_date=self.TEST_DATE_DATETIME,300                                                    publication_name=self.TEST_PUBLICATION_NAME,301                                                    publication_language=self.TEST_PUBLICATION_LANGUAGE,302                                                ),303                                            ),304                                            SitemapPage(305                                                url='{}/news/bar.html'.format(self.TEST_BASE_URL),306                                                news_story=SitemapNewsStory(307                                                    title='Bar & bar',308       
                                             publish_date=self.TEST_DATE_DATETIME,309                                                    publication_name=self.TEST_PUBLICATION_NAME,310                                                    publication_language=self.TEST_PUBLICATION_LANGUAGE,311                                                ),312                                            ),313                                        ]314                                    ),315                                    IndexXMLSitemap(316                                        url='{}/sitemap_news_index_2.xml'.format(self.TEST_BASE_URL),317                                        sub_sitemaps=[318                                            PagesXMLSitemap(319                                                url='{}/sitemap_news_2.xml'.format(self.TEST_BASE_URL),320                                                pages=[321                                                    SitemapPage(322                                                        url='{}/news/bar.html'.format(self.TEST_BASE_URL),323                                                        news_story=SitemapNewsStory(324                                                            title='Bar & bar',325                                                            publish_date=self.TEST_DATE_DATETIME,326                                                            publication_name=self.TEST_PUBLICATION_NAME,327                                                            publication_language=self.TEST_PUBLICATION_LANGUAGE,328                                                        ),329                                                    ),330                                                    SitemapPage(331                                                        url='{}/news/baz.html'.format(self.TEST_BASE_URL),332                                                        news_story=SitemapNewsStory(333                                
                            title='BÄ
ž',334                                                            publish_date=self.TEST_DATE_DATETIME,335                                                            publication_name=self.TEST_PUBLICATION_NAME,336                                                            publication_language=self.TEST_PUBLICATION_LANGUAGE,337                                                        ),338                                                    ),339                                                ],340                                            ),341                                            InvalidSitemap(342                                                url='{}/sitemap_news_missing.xml'.format(self.TEST_BASE_URL),343                                                reason=(344                                                    'Unable to fetch sitemap from {base_url}/sitemap_news_missing.xml: '345                                                    '404 Not Found'346                                                ).format(base_url=self.TEST_BASE_URL),347                                            ),348                                        ],349                                    ),350                                ],351                            ),352                        ],353                    )354                ]355            )356            actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL)357            expected_lines = str(expected_sitemap_tree).split()358            actual_lines = str(actual_sitemap_tree).split()359            diff = difflib.ndiff(expected_lines, actual_lines)360            diff_str = '\n'.join(diff)361            assert expected_sitemap_tree == actual_sitemap_tree, diff_str362            assert len(list(actual_sitemap_tree.all_pages())) == 6363    def test_sitemap_tree_for_homepage_gzip(self):364        """Test sitemap_tree_for_homepage() with gzipped sitemaps."""365        with requests_mock.Mocker() as 
m:366            m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher)367            m.get(368                self.TEST_BASE_URL + '/',369                text='This is a homepage.',370            )371            m.get(372                self.TEST_BASE_URL + '/robots.txt',373                headers={'Content-Type': 'text/plain'},374                text=textwrap.dedent("""375                    User-agent: *376                    Disallow: /whatever377    378                    Sitemap: {base_url}/sitemap_1.gz379                    Sitemap: {base_url}/sitemap_2.dat380                    Sitemap: {base_url}/sitemap_3.xml.gz381                """.format(base_url=self.TEST_BASE_URL)).strip(),382            )383            # Gzipped sitemap without correct HTTP header but with .gz extension384            m.get(385                self.TEST_BASE_URL + '/sitemap_1.gz',386                content=gzip(textwrap.dedent("""387                    <?xml version="1.0" encoding="UTF-8"?>388                    <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"389                            xmlns:news="http://www.google.com/schemas/sitemap-news/0.9">390                        <url>391                            <loc>{base_url}/news/foo.html</loc>392                            <news:news>393                                <news:publication>394                                    <news:name>{publication_name}</news:name>395                                    <news:language>{publication_language}</news:language>396                                </news:publication>397                                <news:publication_date>{publication_date}</news:publication_date>398                                <news:title>Foo <foo></news:title>    <!-- HTML entity decoding -->399                            </news:news>400                        </url>401                    </urlset>402                """.format(403                    base_url=self.TEST_BASE_URL,404                   
 publication_name=self.TEST_PUBLICATION_NAME,405                    publication_language=self.TEST_PUBLICATION_LANGUAGE,406                    publication_date=self.TEST_DATE_STR_ISO8601,407                )).strip()),408            )409            # Gzipped sitemap with correct HTTP header but without .gz extension410            m.get(411                self.TEST_BASE_URL + '/sitemap_2.dat',412                headers={'Content-Type': 'application/x-gzip'},413                content=gzip(textwrap.dedent("""414                    <?xml version="1.0" encoding="UTF-8"?>415                    <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"416                            xmlns:news="http://www.google.com/schemas/sitemap-news/0.9">417                        <url>418                            <loc>{base_url}/news/bar.html</loc>419                            <news:news>420                                <news:publication>421                                    <news:name>{publication_name}</news:name>422                                    <news:language>{publication_language}</news:language>423                                </news:publication>424                                <news:publication_date>{publication_date}</news:publication_date>425                                <news:title><![CDATA[BÄ
r]]></news:title>    <!-- CDATA and UTF-8 -->426                            </news:news>427                        </url>428                    </urlset>429                """.format(430                    base_url=self.TEST_BASE_URL,431                    publication_name=self.TEST_PUBLICATION_NAME,432                    publication_language=self.TEST_PUBLICATION_LANGUAGE,433                    publication_date=self.TEST_DATE_STR_ISO8601,434                )).strip()),435            )436            # Sitemap which appears to be gzipped (due to extension and Content-Type) but really isn't437            m.get(438                self.TEST_BASE_URL + '/sitemap_3.xml.gz',439                headers={'Content-Type': 'application/x-gzip'},440                text=textwrap.dedent("""441                    <?xml version="1.0" encoding="UTF-8"?>442                    <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"443                            xmlns:news="http://www.google.com/schemas/sitemap-news/0.9">444                        <url>445                            <loc>{base_url}/news/baz.html</loc>446                            <news:news>447                                <news:publication>448                                    <news:name>{publication_name}</news:name>449                                    <news:language>{publication_language}</news:language>450                                </news:publication>451                                <news:publication_date>{publication_date}</news:publication_date>452                                <news:title><![CDATA[BÄ
ž]]></news:title>    <!-- CDATA and UTF-8 -->453                            </news:news>454                        </url>455                    </urlset>456                """.format(457                    base_url=self.TEST_BASE_URL,458                    publication_name=self.TEST_PUBLICATION_NAME,459                    publication_language=self.TEST_PUBLICATION_LANGUAGE,460                    publication_date=self.TEST_DATE_STR_ISO8601,461                )).strip(),462            )463            actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL)464            # Don't do an in-depth check, we just need to make sure that gunzip works465            assert isinstance(actual_sitemap_tree, IndexWebsiteSitemap)466            assert len(actual_sitemap_tree.sub_sitemaps) == 1467            assert isinstance(actual_sitemap_tree.sub_sitemaps[0], IndexRobotsTxtSitemap)468            # noinspection PyUnresolvedReferences469            assert len(actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps) == 3470            # noinspection PyUnresolvedReferences471            sitemap_1 = actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps[0]472            assert isinstance(sitemap_1, PagesXMLSitemap)473            assert len(sitemap_1.pages) == 1474            # noinspection PyUnresolvedReferences475            sitemap_2 = actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps[1]476            assert isinstance(sitemap_2, PagesXMLSitemap)477            assert len(sitemap_2.pages) == 1478            # noinspection PyUnresolvedReferences479            sitemap_3 = actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps[2]480            assert isinstance(sitemap_3, PagesXMLSitemap)481            assert len(sitemap_3.pages) == 1482    def test_sitemap_tree_for_homepage_plain_text(self):483        """Test sitemap_tree_for_homepage() with plain text sitemaps."""484        with requests_mock.Mocker() as m:485            
m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher)486            m.get(487                self.TEST_BASE_URL + '/',488                text='This is a homepage.',489            )490            m.get(491                self.TEST_BASE_URL + '/robots.txt',492                headers={'Content-Type': 'text/plain'},493                text=textwrap.dedent("""494                    User-agent: *495                    Disallow: /whatever496    497                    Sitemap: {base_url}/sitemap_1.txt498                    Sitemap: {base_url}/sitemap_2.txt.dat499                """.format(base_url=self.TEST_BASE_URL)).strip(),500            )501            # Plain text uncompressed sitemap (no Content-Type header)502            m.get(503                self.TEST_BASE_URL + '/sitemap_1.txt',504                text=textwrap.dedent("""505    506                    {base_url}/news/foo.html507    508    509                    {base_url}/news/bar.html510    511                    Some other stuff which totally doesn't look like an URL512                """.format(base_url=self.TEST_BASE_URL)).strip(),513            )514            # Plain text compressed sitemap without .gz extension515            m.get(516                self.TEST_BASE_URL + '/sitemap_2.txt.dat',517                headers={'Content-Type': 'application/x-gzip'},518                content=gzip(textwrap.dedent("""519                    {base_url}/news/bar.html520                        {base_url}/news/baz.html521                """.format(base_url=self.TEST_BASE_URL)).strip()),522            )523            actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL)524            assert isinstance(actual_sitemap_tree, IndexWebsiteSitemap)525            assert len(actual_sitemap_tree.sub_sitemaps) == 1526            assert isinstance(actual_sitemap_tree.sub_sitemaps[0], IndexRobotsTxtSitemap)527            # noinspection PyUnresolvedReferences528            assert 
len(actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps) == 2529            # noinspection PyUnresolvedReferences530            sitemap_1 = actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps[0]531            assert isinstance(sitemap_1, PagesTextSitemap)532            assert len(sitemap_1.pages) == 2533            # noinspection PyUnresolvedReferences534            sitemap_2 = actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps[1]535            assert isinstance(sitemap_2, PagesTextSitemap)536            assert len(sitemap_2.pages) == 2537            pages = list(actual_sitemap_tree.all_pages())538            assert len(pages) == 4539            assert SitemapPage(url='{}/news/foo.html'.format(self.TEST_BASE_URL)) in pages540            assert SitemapPage(url='{}/news/bar.html'.format(self.TEST_BASE_URL)) in pages541            assert SitemapPage(url='{}/news/baz.html'.format(self.TEST_BASE_URL)) in pages542    # noinspection DuplicatedCode543    def test_sitemap_tree_for_homepage_rss_atom(self):544        """Test sitemap_tree_for_homepage() with RSS 2.0 / Atom 0.3 / Atom 1.0 feeds."""545        with requests_mock.Mocker() as m:546            m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher)547            m.get(548                self.TEST_BASE_URL + '/',549                text='This is a homepage.',550            )551            m.get(552                self.TEST_BASE_URL + '/robots.txt',553                headers={'Content-Type': 'text/plain'},554                text=textwrap.dedent("""555                    User-agent: *556                    Disallow: /whatever557                    Sitemap: {base_url}/sitemap_rss.xml558                    Sitemap: {base_url}/sitemap_atom_0_3.xml559                    Sitemap: {base_url}/sitemap_atom_1_0.xml560                """.format(base_url=self.TEST_BASE_URL)).strip(),561            )562            # RSS 2.0 sitemap563            m.get(564                self.TEST_BASE_URL + '/sitemap_rss.xml',565                
headers={'Content-Type': 'application/rss+xml'},566                text=textwrap.dedent("""567                    <?xml version="1.0" encoding="UTF-8"?>568                    <rss version="2.0">569                        <channel>570                            <title>Test RSS 2.0 feed</title>571                            <description>This is a test RSS 2.0 feed.</description>572                            <link>{base_url}</link>573                            <pubDate>{pub_date}</pubDate>574                            <item>575                                <title>Test RSS 2.0 story #1</title>576                                <description>This is a test RSS 2.0 story #1.</description>577                                <link>{base_url}/rss_story_1.html</link>578                                <guid isPermaLink="true">{base_url}/rss_story_1.html</guid>579                                <pubDate>{pub_date}</pubDate>580                            </item>581                            <item>582                                <title>Test RSS 2.0 story #2</title>583                                <description>This is a test RSS 2.0 story #2.</description>584                                <link>{base_url}/rss_story_2.html</link>585                                <guid isPermaLink="true">{base_url}/rss_story_2.html</guid>586                                <pubDate>{pub_date}</pubDate>587                            </item>588                        </channel>589                    </rss>590                """.format(base_url=self.TEST_BASE_URL, pub_date=self.TEST_DATE_STR_RFC2822)).strip(),591            )592            # Atom 0.3 sitemap593            m.get(594                self.TEST_BASE_URL + '/sitemap_atom_0_3.xml',595                headers={'Content-Type': 'application/atom+xml'},596                text=textwrap.dedent("""597                    <?xml version="1.0" encoding="UTF-8"?>598                    <feed version="0.3" xmlns="http://purl.org/atom/ns#">599     
                   <title>Test Atom 0.3 feed</title>600                        <link rel="alternate" type="text/html" href="{base_url}" />601                        <modified>{pub_date}</modified>602                        <entry>603                            <title>Test Atom 0.3 story #1</title>604                            <link rel="alternate" type="text/html" href="{base_url}/atom_0_3_story_1.html" />605                            <id>{base_url}/atom_0_3_story_1.html</id>606                            <issued>{pub_date}</issued>607                        </entry>608                        <entry>609                            <title>Test Atom 0.3 story #2</title>610                            <link rel="alternate" type="text/html" href="{base_url}/atom_0_3_story_2.html" />611                            <id>{base_url}/atom_0_3_story_2.html</id>612                            <issued>{pub_date}</issued>613                        </entry>614                    </feed>615                """.format(base_url=self.TEST_BASE_URL, pub_date=self.TEST_DATE_STR_ISO8601)).strip(),616            )617            # Atom 1.0 sitemap618            m.get(619                self.TEST_BASE_URL + '/sitemap_atom_1_0.xml',620                headers={'Content-Type': 'application/atom+xml'},621                text=textwrap.dedent("""622                    <?xml version="1.0" encoding="UTF-8"?>623                    <feed xmlns="http://www.w3.org/2005/Atom">624                        <title>Test Atom 1.0 feed</title>625                        <subtitle>This is a test Atom 1.0 feed.</subtitle>626                        <link href="{base_url}/sitemap_atom_1_0.xml" rel="self" />627                        <link href="{base_url}" />628                        <id>{base_url}</id>629                        <updated>{pub_date}</updated>630                        <entry>631                            <title>Test Atom 1.0 story #1</title>632                            <link 
href="{base_url}/atom_1_0_story_1.html" />633                            <link rel="alternate" type="text/html" href="{base_url}/atom_1_0_story_1.html?alt" />634                            <link rel="edit" href="{base_url}/atom_1_0_story_1.html?edit" />635                            <id>{base_url}/atom_1_0_story_1.html</id>636                            <updated>{pub_date}</updated>637                            <summary>This is test atom 1.0 story #1.</summary>638                            <content type="xhtml">639                                <div xmlns="http://www.w3.org/1999/xhtml">640                                    <p>This is test atom 1.0 story #1.</p>641                                </div>642                            </content>643                            <author>644                                <name>John Doe</name>645                                <email>johndoe@example.com</email>646                            </author>647                        </entry>648                        <entry>649                            <title>Test Atom 1.0 story #2</title>650                            <link href="{base_url}/atom_1_0_story_2.html" />651                            <link rel="alternate" type="text/html" href="{base_url}/atom_1_0_story_2.html?alt" />652                            <link rel="edit" href="{base_url}/atom_1_0_story_2.html?edit" />653                            <id>{base_url}/atom_1_0_story_2.html</id>654                            <updated>{pub_date}</updated>655                            <summary>This is test atom 1.0 story #2.</summary>656                            <content type="xhtml">657                                <div xmlns="http://www.w3.org/1999/xhtml">658                                    <p>This is test atom 1.0 story #2.</p>659                                </div>660                            </content>661                            <author>662                                <name>John Doe</name>663              
                  <email>johndoe@example.com</email>664                            </author>665                        </entry>666                    </feed>667                """.format(base_url=self.TEST_BASE_URL, pub_date=self.TEST_DATE_STR_ISO8601)).strip(),668            )669            expected_sitemap_tree = IndexWebsiteSitemap(670                url='{}/'.format(self.TEST_BASE_URL),671                sub_sitemaps=[672                    IndexRobotsTxtSitemap(673                        url='{}/robots.txt'.format(self.TEST_BASE_URL),674                        sub_sitemaps=[675                            PagesRSSSitemap(676                                url='{}/sitemap_rss.xml'.format(self.TEST_BASE_URL),677                                pages=[678                                    SitemapPage(679                                        url='{}/rss_story_1.html'.format(self.TEST_BASE_URL),680                                        news_story=SitemapNewsStory(681                                            title='Test RSS 2.0 story #1',682                                            publish_date=self.TEST_DATE_DATETIME,683                                        ),684                                    ),685                                    SitemapPage(686                                        url='{}/rss_story_2.html'.format(self.TEST_BASE_URL),687                                        news_story=SitemapNewsStory(688                                            title='Test RSS 2.0 story #2',689                                            publish_date=self.TEST_DATE_DATETIME,690                                        )691                                    )692                                ]693                            ),694                            PagesAtomSitemap(695                                url='{}/sitemap_atom_0_3.xml'.format(self.TEST_BASE_URL),696                                pages=[697                                    SitemapPage(698     
                                   url='{}/atom_0_3_story_1.html'.format(self.TEST_BASE_URL),699                                        news_story=SitemapNewsStory(700                                            title='Test Atom 0.3 story #1',701                                            publish_date=self.TEST_DATE_DATETIME,702                                        ),703                                    ),704                                    SitemapPage(705                                        url='{}/atom_0_3_story_2.html'.format(self.TEST_BASE_URL),706                                        news_story=SitemapNewsStory(707                                            title='Test Atom 0.3 story #2',708                                            publish_date=self.TEST_DATE_DATETIME,709                                        )710                                    )711                                ]712                            ),713                            PagesAtomSitemap(714                                url='{}/sitemap_atom_1_0.xml'.format(self.TEST_BASE_URL),715                                pages=[716                                    SitemapPage(717                                        url='{}/atom_1_0_story_1.html'.format(self.TEST_BASE_URL),718                                        news_story=SitemapNewsStory(719                                            title='Test Atom 1.0 story #1',720                                            publish_date=self.TEST_DATE_DATETIME,721                                        ),722                                    ),723                                    SitemapPage(724                                        url='{}/atom_1_0_story_2.html'.format(self.TEST_BASE_URL),725                                        news_story=SitemapNewsStory(726                                            title='Test Atom 1.0 story #2',727                                            publish_date=self.TEST_DATE_DATETIME,728         
                               )729                                    )730                                ]731                            ),732                        ]733                    )734                ]735            )736            actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL)737            expected_lines = str(expected_sitemap_tree).split()738            actual_lines = str(actual_sitemap_tree).split()739            diff = difflib.ndiff(expected_lines, actual_lines)740            diff_str = '\n'.join(diff)741            assert expected_sitemap_tree == actual_sitemap_tree, diff_str742            assert len(list(actual_sitemap_tree.all_pages())) == 6743    def test_sitemap_tree_for_homepage_rss_atom_empty(self):744        """Test sitemap_tree_for_homepage() with empty RSS 2.0 / Atom 0.3 / Atom 1.0 feeds."""745        with requests_mock.Mocker() as m:746            m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher)747            m.get(748                self.TEST_BASE_URL + '/',749                text='This is a homepage.',750            )751            m.get(752                self.TEST_BASE_URL + '/robots.txt',753                headers={'Content-Type': 'text/plain'},754                text=textwrap.dedent("""755                    User-agent: *756                    Disallow: /whatever757                    Sitemap: {base_url}/sitemap_rss.xml758                    Sitemap: {base_url}/sitemap_atom_0_3.xml759                    Sitemap: {base_url}/sitemap_atom_1_0.xml760                """.format(base_url=self.TEST_BASE_URL)).strip(),761            )762            # RSS 2.0 sitemap763            m.get(764                self.TEST_BASE_URL + '/sitemap_rss.xml',765                headers={'Content-Type': 'application/rss+xml'},766                text=textwrap.dedent("""767                    <?xml version="1.0" encoding="UTF-8"?>768                    <rss version="2.0">769                        <channel>770 
                           <title>Test RSS 2.0 feed</title>771                            <description>This is a test RSS 2.0 feed.</description>772                            <link>{base_url}</link>773                            <pubDate>{pub_date}</pubDate>774                        </channel>775                    </rss>776                """.format(base_url=self.TEST_BASE_URL, pub_date=self.TEST_DATE_STR_RFC2822)).strip(),777            )778            # Atom 0.3 sitemap779            m.get(780                self.TEST_BASE_URL + '/sitemap_atom_0_3.xml',781                headers={'Content-Type': 'application/atom+xml'},782                text=textwrap.dedent("""783                    <?xml version="1.0" encoding="UTF-8"?>784                    <feed version="0.3" xmlns="http://purl.org/atom/ns#">785                        <title>Test Atom 0.3 feed</title>786                        <link rel="alternate" type="text/html" href="{base_url}" />787                        <modified>{pub_date}</modified>788                    </feed>789                """.format(base_url=self.TEST_BASE_URL, pub_date=self.TEST_DATE_STR_ISO8601)).strip(),790            )791            # Atom 1.0 sitemap792            m.get(793                self.TEST_BASE_URL + '/sitemap_atom_1_0.xml',794                headers={'Content-Type': 'application/atom+xml'},795                text=textwrap.dedent("""796                    <?xml version="1.0" encoding="UTF-8"?>797                    <feed xmlns="http://www.w3.org/2005/Atom">798                        <title>Test Atom 1.0 feed</title>799                        <subtitle>This is a test Atom 1.0 feed.</subtitle>800                        <link href="{base_url}/sitemap_atom_1_0.xml" rel="self" />801                        <link href="{base_url}" />802                        <id>{base_url}</id>803                        <updated>{pub_date}</updated>804                    </feed>805                """.format(base_url=self.TEST_BASE_URL, 
pub_date=self.TEST_DATE_STR_ISO8601)).strip(),806            )807            expected_sitemap_tree = IndexWebsiteSitemap(808                url='{}/'.format(self.TEST_BASE_URL),809                sub_sitemaps=[810                    IndexRobotsTxtSitemap(811                        url='{}/robots.txt'.format(self.TEST_BASE_URL),812                        sub_sitemaps=[813                            PagesRSSSitemap(814                                url='{}/sitemap_rss.xml'.format(self.TEST_BASE_URL),815                                pages=[]816                            ),817                            PagesAtomSitemap(818                                url='{}/sitemap_atom_0_3.xml'.format(self.TEST_BASE_URL),819                                pages=[]820                            ),821                            PagesAtomSitemap(822                                url='{}/sitemap_atom_1_0.xml'.format(self.TEST_BASE_URL),823                                pages=[]824                            ),825                        ]826                    )827                ]828            )829            actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL)830            assert expected_sitemap_tree == actual_sitemap_tree831            assert len(list(actual_sitemap_tree.all_pages())) == 0832    def test_sitemap_tree_for_homepage_prematurely_ending_xml(self):833        """Test sitemap_tree_for_homepage() with clipped XML.834        Some webservers are misconfigured to limit the request length to a certain number of seconds, in which time the835        server is unable to generate and compress a 50 MB sitemap XML. 
Google News doesn't seem to have a problem with836        this behavior, so we have to support this too.837        """838        with requests_mock.Mocker() as m:839            m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher)840            m.get(841                self.TEST_BASE_URL + '/',842                text='This is a homepage.',843            )844            m.get(845                self.TEST_BASE_URL + '/robots.txt',846                headers={'Content-Type': 'text/plain'},847                text=textwrap.dedent("""848                    User-agent: *849                    Disallow: /whatever850    851                    Sitemap: {base_url}/sitemap.xml852                """.format(base_url=self.TEST_BASE_URL)).strip(),853            )854            m.get(855                self.TEST_BASE_URL + '/sitemap.xml',856                text=textwrap.dedent("""857                    <?xml version="1.0" encoding="UTF-8"?>858                    <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"859                            xmlns:news="http://www.google.com/schemas/sitemap-news/0.9">860                        <url>861                            <loc>{base_url}/news/first.html</loc>862                            <news:news>863                                <news:publication>864                                    <news:name>{publication_name}</news:name>865                                    <news:language>{publication_language}</news:language>866                                </news:publication>867                                <news:publication_date>{publication_date}</news:publication_date>868                                <news:title>First story</news:title>869                            </news:news>870                        </url>871                        <url>872                            <loc>{base_url}/news/second.html</loc>873                            <news:news>874                                <news:publication>875             
                       <news:name>{publication_name}</news:name>876                                    <news:language>{publication_language}</news:language>877                                </news:publication>878                                <news:publication_date>{publication_date}</news:publication_date>879                                <news:title>Second story</news:title>880                            </news:news>881                        </url>882    883                        <!-- The following story shouldn't get added as the XML ends prematurely -->884                        <url>885                            <loc>{base_url}/news/third.html</loc>886                            <news:news>887                                <news:publication>888                                    <news:name>{publication_name}</news:name>889                                    <news:language>{publication_language}</news:language>890                                </news:publication>891                                <news:publicat892                """.format(893                    base_url=self.TEST_BASE_URL,894                    publication_name=self.TEST_PUBLICATION_NAME,895                    publication_language=self.TEST_PUBLICATION_LANGUAGE,896                    publication_date=self.TEST_DATE_STR_ISO8601,897                )).strip(),898            )899            actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL)900            assert isinstance(actual_sitemap_tree, IndexWebsiteSitemap)901            assert len(actual_sitemap_tree.sub_sitemaps) == 1902            assert isinstance(actual_sitemap_tree.sub_sitemaps[0], IndexRobotsTxtSitemap)903            # noinspection PyUnresolvedReferences904            assert len(actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps) == 1905            # noinspection PyUnresolvedReferences906            sitemap = actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps[0]907            assert 
isinstance(sitemap, PagesXMLSitemap)908            assert len(sitemap.pages) == 2909    def test_sitemap_tree_for_homepage_no_sitemap(self):910        """Test sitemap_tree_for_homepage() with no sitemaps listed in robots.txt."""911        with requests_mock.Mocker() as m:912            m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher)913            m.get(914                self.TEST_BASE_URL + '/',915                text='This is a homepage.',916            )917            m.get(918                self.TEST_BASE_URL + '/robots.txt',919                headers={'Content-Type': 'text/plain'},920                text=textwrap.dedent("""921                    User-agent: *922                    Disallow: /whatever923                """.format(base_url=self.TEST_BASE_URL)).strip(),924            )925            expected_sitemap_tree = IndexWebsiteSitemap(926                url='{}/'.format(self.TEST_BASE_URL),927                sub_sitemaps=[928                    IndexRobotsTxtSitemap(929                        url='{}/robots.txt'.format(self.TEST_BASE_URL),930                        sub_sitemaps=[],931                    )932                ]933            )934            actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL)935            assert expected_sitemap_tree == actual_sitemap_tree936    def test_sitemap_tree_for_homepage_unpublished_sitemap(self):937        """Test sitemap_tree_for_homepage() with some sitemaps not published in robots.txt."""938        with requests_mock.Mocker() as m:939            m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher)940            m.get(941                self.TEST_BASE_URL + '/',942                text='This is a homepage.',943            )944            m.get(945                self.TEST_BASE_URL + '/robots.txt',946                headers={'Content-Type': 'text/plain'},947                text=textwrap.dedent("""948                    User-agent: *949                    
Disallow: /whatever950                    951                    Sitemap: {base_url}/sitemap_public.xml952                """.format(base_url=self.TEST_BASE_URL)).strip(),953            )954            # Public sitemap (linked to from robots.txt)955            m.get(956                self.TEST_BASE_URL + '/sitemap_public.xml',957                text=textwrap.dedent("""958                    <?xml version="1.0" encoding="UTF-8"?>959                    <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">960                        <url>961                            <loc>{base_url}/news/public.html</loc>962                        </url>963                    </urlset>964                """.format(965                    base_url=self.TEST_BASE_URL,966                    publication_name=self.TEST_PUBLICATION_NAME,967                    publication_language=self.TEST_PUBLICATION_LANGUAGE,968                    publication_date=self.TEST_DATE_STR_ISO8601,969                )).strip(),970            )971            # Private sitemap (to be discovered by trying out a few paths)972            m.get(973                self.TEST_BASE_URL + '/sitemap_index.xml',974                text=textwrap.dedent("""975                    <?xml version="1.0" encoding="UTF-8"?>976                    <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">977                        <url>978                            <loc>{base_url}/news/private.html</loc>979                        </url>980                    </urlset>981                """.format(982                    base_url=self.TEST_BASE_URL,983                    publication_name=self.TEST_PUBLICATION_NAME,984                    publication_language=self.TEST_PUBLICATION_LANGUAGE,985                    publication_date=self.TEST_DATE_STR_ISO8601,986                )).strip(),987            )988            expected_sitemap_tree = IndexWebsiteSitemap(989                url='{}/'.format(self.TEST_BASE_URL),990               
 sub_sitemaps=[991                    IndexRobotsTxtSitemap(992                        url='{}/robots.txt'.format(self.TEST_BASE_URL),993                        sub_sitemaps=[994                            PagesXMLSitemap(995                                url='{}/sitemap_public.xml'.format(self.TEST_BASE_URL),996                                pages=[997                                    SitemapPage(998                                        url='{}/news/public.html'.format(self.TEST_BASE_URL),999                                    ),1000                                ],1001                            ),1002                        ],1003                    ),1004                    PagesXMLSitemap(1005                        url='{}/sitemap_index.xml'.format(self.TEST_BASE_URL),1006                        pages=[1007                            SitemapPage(1008                                url='{}/news/private.html'.format(self.TEST_BASE_URL),1009                            ),1010                        ],1011                    ),1012                ]1013            )1014            actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL)1015            assert expected_sitemap_tree == actual_sitemap_tree1016    def test_sitemap_tree_for_homepage_robots_txt_no_content_type(self):1017        """Test sitemap_tree_for_homepage() with no Content-Type in robots.txt."""1018        with requests_mock.Mocker() as m:1019            m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher)1020            m.get(1021                self.TEST_BASE_URL + '/',1022                text='This is a homepage.',1023            )1024            m.get(1025                self.TEST_BASE_URL + '/robots.txt',1026                headers={'Content-Type': ''},1027                text=textwrap.dedent("""1028                    User-agent: *1029                    Disallow: /whatever1030                """.format(base_url=self.TEST_BASE_URL)).strip(),1031       
     )1032            expected_sitemap_tree = IndexWebsiteSitemap(1033                url='{}/'.format(self.TEST_BASE_URL),1034                sub_sitemaps=[1035                    IndexRobotsTxtSitemap(1036                        url='{}/robots.txt'.format(self.TEST_BASE_URL),1037                        sub_sitemaps=[],1038                    )1039                ]1040            )1041            actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL)1042            assert expected_sitemap_tree == actual_sitemap_tree1043    def test_sitemap_tree_for_homepage_no_robots_txt(self):1044        """Test sitemap_tree_for_homepage() with no robots.txt."""1045        with requests_mock.Mocker() as m:1046            m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher)1047            m.get(1048                self.TEST_BASE_URL + '/',1049                text='This is a homepage.',1050            )1051            # Nonexistent robots.txt1052            m.get(1053                self.TEST_BASE_URL + '/robots.txt',1054                status_code=404,1055                reason='Not Found',1056                headers={'Content-Type': 'text/html'},1057                text="<h1>404 Not Found!</h1>",1058            )1059            expected_sitemap_tree = IndexWebsiteSitemap(1060                url='{}/'.format(self.TEST_BASE_URL),1061                sub_sitemaps=[1062                    InvalidSitemap(1063                        url='{}/robots.txt'.format(self.TEST_BASE_URL),1064                        reason=(1065                            'Unable to fetch sitemap from {base_url}/robots.txt: 404 Not Found'1066                        ).format(base_url=self.TEST_BASE_URL),1067                    )1068                ]1069            )1070            actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL)1071            assert expected_sitemap_tree == actual_sitemap_tree1072    def 
test_sitemap_tree_for_homepage_huge_sitemap(self):1073        """Test sitemap_tree_for_homepage() with a huge sitemap (mostly for profiling)."""1074        page_count = 10001075        sitemap_xml = """<?xml version="1.0" encoding="UTF-8"?>1076            <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"1077                    xmlns:news="http://www.google.com/schemas/sitemap-news/0.9"1078                    xmlns:xhtml="http://www.w3.org/1999/xhtml">1079        """1080        for x in range(page_count):1081            sitemap_xml += """1082                <url>1083                    <loc>{base_url}/news/page_{x}.html</loc>1084                    <!-- Element present but empty -->1085                    <lastmod />1086                    <!-- Some other XML namespace -->1087                    <xhtml:link rel="alternate"1088                                media="only screen and (max-width: 640px)"1089                                href="{base_url}/news/page_{x}.html?mobile=1" />1090                    <news:news>1091                        <news:publication>1092                            <news:name>{publication_name}</news:name>1093                            <news:language>{publication_language}</news:language>1094                        </news:publication>1095                        <news:publication_date>{publication_date}</news:publication_date>1096                        <news:title>Foo <foo></news:title>    <!-- HTML entity decoding -->1097                    </news:news>1098                </url>1099            """.format(1100                x=x,1101                base_url=self.TEST_BASE_URL,1102                publication_name=self.TEST_PUBLICATION_NAME,1103                publication_language=self.TEST_PUBLICATION_LANGUAGE,1104                publication_date=self.TEST_DATE_STR_ISO8601,1105            )1106        sitemap_xml += "</urlset>"1107        with requests_mock.Mocker() as m:1108            
m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher)1109            m.get(1110                self.TEST_BASE_URL + '/',1111                text='This is a homepage.',1112            )1113            m.get(1114                self.TEST_BASE_URL + '/robots.txt',1115                headers={'Content-Type': 'text/plain'},1116                text=textwrap.dedent("""1117                    User-agent: *1118                    Disallow: /whatever1119    1120                    Sitemap: {base_url}/sitemap.xml.gz1121                """.format(base_url=self.TEST_BASE_URL)).strip(),1122            )1123            m.get(1124                self.TEST_BASE_URL + '/sitemap.xml.gz',1125                headers={'Content-Type': 'application/x-gzip'},1126                content=gzip(sitemap_xml),1127            )1128            actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL)1129            assert len(list(actual_sitemap_tree.all_pages())) == page_count1130    def test_sitemap_tree_for_homepage_robots_txt_weird_spacing(self):1131        """Test sitemap_tree_for_homepage() with weird (but valid) spacing."""1132        with requests_mock.Mocker() as m:1133            m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher)1134            m.get(1135                self.TEST_BASE_URL + '/',1136                text='This is a homepage.',1137            )1138            robots_txt_body = ""1139            robots_txt_body += "User-agent: *\n"1140            # Extra space before "Sitemap:", no space after "Sitemap:", and extra space after sitemap URL1141            robots_txt_body += " Sitemap:{base_url}/sitemap.xml    ".format(base_url=self.TEST_BASE_URL)1142            m.get(1143                self.TEST_BASE_URL + '/robots.txt',1144                headers={'Content-Type': 'text/plain'},1145                text=robots_txt_body,1146            )1147            m.get(1148                self.TEST_BASE_URL + '/sitemap.xml',1149              
  text=textwrap.dedent("""1150                    <?xml version="1.0" encoding="UTF-8"?>1151                    <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"1152                            xmlns:news="http://www.google.com/schemas/sitemap-news/0.9">1153                        <url>1154                            <loc>{base_url}/news/first.html</loc>1155                            <news:news>1156                                <news:publication>1157                                    <news:name>{publication_name}</news:name>1158                                    <news:language>{publication_language}</news:language>1159                                </news:publication>1160                                <news:publication_date>{publication_date}</news:publication_date>1161                                <news:title>First story</news:title>1162                            </news:news>1163                        </url>1164                    </urlset>1165                """.format(1166                    base_url=self.TEST_BASE_URL,1167                    publication_name=self.TEST_PUBLICATION_NAME,1168                    publication_language=self.TEST_PUBLICATION_LANGUAGE,1169                    publication_date=self.TEST_DATE_STR_ISO8601,1170                )).strip(),1171            )1172            actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL)1173            assert len(list(actual_sitemap_tree.all_pages())) == 11174    def test_sitemap_tree_for_homepage_utf8_bom(self):1175        """Test sitemap_tree_for_homepage() with UTF-8 BOM in both robots.txt and sitemap."""1176        robots_txt_body = textwrap.dedent("""1177            User-agent: *1178            Disallow: /whatever1179            Sitemap: {base_url}/sitemap.xml1180        """.format(base_url=self.TEST_BASE_URL)).strip()1181        sitemap_xml_body = textwrap.dedent("""1182            <?xml version="1.0" encoding="UTF-8"?>1183            <urlset 
xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"1184                    xmlns:news="http://www.google.com/schemas/sitemap-news/0.9">1185                <url>1186                    <loc>{base_url}/news/first.html</loc>1187                    <news:news>1188                        <news:publication>1189                            <news:name>{publication_name}</news:name>1190                            <news:language>{publication_language}</news:language>1191                        </news:publication>1192                        <news:publication_date>{publication_date}</news:publication_date>1193                        <news:title>First story</news:title>1194                    </news:news>1195                </url>1196            </urlset>1197        """.format(1198            base_url=self.TEST_BASE_URL,1199            publication_name=self.TEST_PUBLICATION_NAME,1200            publication_language=self.TEST_PUBLICATION_LANGUAGE,1201            publication_date=self.TEST_DATE_STR_ISO8601,1202        )).strip()1203        robots_txt_body_encoded = robots_txt_body.encode('utf-8-sig')1204        sitemap_xml_body_encoded = sitemap_xml_body.encode('utf-8-sig')1205        with requests_mock.Mocker() as m:1206            m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher)1207            m.get(1208                self.TEST_BASE_URL + '/',1209                text='This is a homepage.',1210            )1211            m.get(1212                self.TEST_BASE_URL + '/robots.txt',1213                headers={'Content-Type': 'text/plain'},1214                content=robots_txt_body_encoded,1215            )1216            m.get(1217                self.TEST_BASE_URL + '/sitemap.xml',1218                content=sitemap_xml_body_encoded,1219            )1220            actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL)...combinations.py
Source:combinations.py  
1'''Test cases arguments combinations.'''2import os3import sys4import inflection5TEST_DIR = os.path.abspath(os.path.dirname(__file__))6if TEST_DIR not in sys.path:7    sys.path.append(TEST_DIR)8from consts import TEMPDIR, TEST_BASE_URL  # noqa: E4029from http_request_codegen.hrc_string import replace_multiple  # noqa: E40210def argument_combination_to_filename(combination_name, index):11    return '{}.{}.expect.txt'.format(12        str(index).zfill(3),13        inflection.parameterize(14            replace_multiple(15                combination_name, replacements={16                    '"': '-double-quote-',17                    '\'': '-single-quote-',18                },19            ),20        ),21    )22def combination_arguments_to_kwargs(arguments):23    kwargs = {}24    for key, value in arguments.items():25        if key == 'kwargs':26            kwargs.update(value)27        else:28            kwargs[key] = value29    return kwargs30def get_argument_combinations(31    method='GET', include_filenames=True,32    dirpath=None,33):34    response = [35        {36            'name': 'URL',37            'arguments': {38                'url': TEST_BASE_URL,39            },40        },41        {42            'name': 'URL wrapping (no wrap)',43            'arguments': {44                'url': TEST_BASE_URL,45                'wrap': 99999,46            },47        },48        {49            'name': 'URL wrapping (wrap 15)',50            'arguments': {51                'url': TEST_BASE_URL,52                'wrap': 15,53            },54        },55        {56            'name': 'Parameter',57            'arguments': {58                'url': TEST_BASE_URL,59                'parameters': [60                    {61                        'name': 'param-1',62                        'value': 'value-1',63                    },64                ],65            },66        },67        {68            'name': 'Parameters',69            'arguments': {70                'url': 
TEST_BASE_URL,71                'parameters': [72                    {73                        'name': 'param-1',74                        'value': 'foo',75                    },76                    {77                        'name': 'param-2',78                        'value': 1,79                    },80                    {81                        'name': 'param-3',82                        'value': .777,83                    },84                    {85                        'name': 'param-4',86                        'value': True,87                    },88                ],89            },90        },91        {92            'name': 'Parameter wrapping value',93            'arguments': {94                'url': TEST_BASE_URL,95                'parameters': [96                    {97                        'name': 'param-1',98                        'value': 'foo-bar-baz' * 50,99                    },100                ],101            },102        },103        {104            'name': 'Parameters, one wrapping value',105            'arguments': {106                'url': TEST_BASE_URL,107                'parameters': [108                    {109                        'name': 'param-1',110                        'value': 'foo-bar-baz' * 50,111                    },112                    {113                        'name': 'param-2',114                        'value': 'value-2',115                    },116                ],117            },118        },119        {120            'name': 'Parameter escaping quotes',121            'arguments': {122                'url': TEST_BASE_URL,123                'parameters': [124                    {125                        'name': 'param-1-with-\'\'-quotes',126                        'value': 'value-1-with-\'\'-quotes',127                    },128                ],129            },130        },131        {132            'name': 'URL + header',133            'arguments': {134                'url': TEST_BASE_URL,135   
             'headers': {136                    'Content-Type': 'application/json',137                },138            },139        },140        {141            'name': 'URL + headers',142            'arguments': {143                'url': TEST_BASE_URL,144                'headers': {145                    'Content-Type': 'application/json',146                    'Accept-Language': 'es',147                },148            },149        },150        {151            'name': 'URL + header wrapping value',152            'arguments': {153                'url': TEST_BASE_URL,154                'headers': {155                    'Content-Type': 'application/json' * 5,156                },157            },158        },159        {160            'name': 'URL + headers, one wrapping value',161            'arguments': {162                'url': TEST_BASE_URL,163                'headers': {164                    'Content-Type': 'application/json' * 5,165                    'Accept-Language': '*',166                },167            },168        },169        {170            'name': 'URL + header escaping quotes',171            'arguments': {172                'url': TEST_BASE_URL,173                'headers': {174                    'Accept-Language': 'Header value with \'\' quotes',175                },176            },177        },178        {179            'name': 'URL + kwarg',180            'arguments': {181                'url': TEST_BASE_URL,182                'kwargs': {183                    'timeout': 5,184                },185            },186        },187        {188            'name': 'URL + kwargs',189            'arguments': {190                'url': TEST_BASE_URL,191                'kwargs': {192                    'timeout': 5,193                    'stream': True,194                },195            },196        },197        {198            'name': 'URL + kwarg escaping quotes',199            'arguments': {200                'url': TEST_BASE_URL,201              
  'kwargs': {202                    'cookies': {203                        'foo': 'value with \'\' quotes',204                    },205                },206            },207        },208        {209            'name': 'URL + kwarg wrapping value',210            'arguments': {211                'url': TEST_BASE_URL,212                'kwargs': {213                    'cookies': {214                        'bar': 'foo bar baz ' * 50,215                    },216                },217            },218        },219        {220            'name': 'URL + kwargs, one wrapping value',221            'arguments': {222                'url': TEST_BASE_URL,223                'kwargs': {224                    'cookies': {225                        'bar': 'foo bar baz ' * 50,226                    },227                    'stream': True,228                },229            },230        },231        {232            'name': 'Parameter + header',233            'arguments': {234                'url': TEST_BASE_URL,235                'parameters': [236                    {237                        'name': 'param-1',238                        'value': 'value-1',239                    },240                ],241                'headers': {242                    'Content-Type': 'application/json',243                },244            },245        },246        {247            'name': 'Parameter + header (oneline)',248            'arguments': {249                'url': TEST_BASE_URL,250                'parameters': [251                    {252                        'name': 'param-1',253                        'value': 'value-1',254                    },255                ],256                'headers': {257                    'Content-Type': 'application/json',258                },259                'oneline': True,260            },261        },262        {263            'name': 'Parameters + header',264            'arguments': {265                'url': TEST_BASE_URL,266                
'parameters': [267                    {268                        'name': 'param-1',269                        'value': 'value-1',270                    },271                    {272                        'name': 'param-2',273                        'value': 'value-2',274                    },275                ],276                'headers': {277                    'Content-Type': 'application/json',278                },279            },280        },281        {282            'name': 'Parameter + headers',283            'arguments': {284                'url': TEST_BASE_URL,285                'parameters': [286                    {287                        'name': 'param-1',288                        'value': 'value-1',289                    },290                ],291                'headers': {292                    'Content-Type': 'application/json',293                    'Accept-Language': '*',294                },295            },296        },297        {298            'name': 'Parameters + headers',299            'arguments': {300                'url': TEST_BASE_URL,301                'parameters': [302                    {303                        'name': 'param-1',304                        'value': 'value-1',305                    },306                    {307                        'name': 'param-2',308                        'value': 'value-2',309                    },310                ],311                'headers': {312                    'Content-Type': 'application/json',313                    'Accept-Language': '*',314                },315            },316        },317        {318            'name': 'Parameter + kwarg',319            'arguments': {320                'url': TEST_BASE_URL,321                'parameters': [322                    {323                        'name': 'param-1',324                        'value': 'value-1',325                    },326                ],327                'kwargs': {328                    'timeout': 
10,329                },330            },331        },332        {333            'name': 'Parameter + kwarg (oneline)',334            'arguments': {335                'url': TEST_BASE_URL,336                'parameters': [337                    {338                        'name': 'a',339                        'value': 'b',340                    },341                ],342                'kwargs': {343                    'timeout': 10,344                },345                'oneline': True,346            },347        },348        {349            'name': 'Parameters + kwarg',350            'arguments': {351                'url': TEST_BASE_URL,352                'parameters': [353                    {354                        'name': 'param-1',355                        'value': 'value-1',356                    },357                    {358                        'name': 'param-2',359                        'value': 'value-2',360                    },361                ],362                'kwargs': {363                    'timeout': 10,364                },365            },366        },367        {368            'name': 'Parameter + kwargs',369            'arguments': {370                'url': TEST_BASE_URL,371                'parameters': [372                    {373                        'name': 'param-1',374                        'value': 'value-1',375                    },376                ],377                'kwargs': {378                    'timeout': 10,379                    'stream': True,380                },381            },382        },383        {384            'name': 'Parameters + kwargs',385            'arguments': {386                'url': TEST_BASE_URL,387                'parameters': [388                    {389                        'name': 'param-1',390                        'value': 'value-1',391                    },392                    {393                        'name': 'param-2',394                        'value': 'value-2',395    
                },396                ],397                'kwargs': {398                    'timeout': 10,399                    'stream': True,400                },401            },402        },403        {404            'name': 'URL + header + kwarg',405            'arguments': {406                'url': TEST_BASE_URL,407                'headers': {408                    'Content-Type': 'application/json',409                },410                'kwargs': {411                    'timeout': 5,412                },413            },414        },415        {416            'name': 'URL + header + kwarg (oneline)',417            'arguments': {418                'url': TEST_BASE_URL,419                'headers': {420                    'Content-Type': 'application/json',421                },422                'kwargs': {423                    'timeout': 5,424                },425                'oneline': True,426            },427        },428        {429            'name': 'URL + headers + kwarg',430            'arguments': {431                'url': TEST_BASE_URL,432                'headers': {433                    'Content-Type': 'application/json',434                    'Accept-Language': '*',435                },436                'kwargs': {437                    'timeout': 5,438                },439            },440        },441        {442            'name': 'URL + header + kwargs',443            'arguments': {444                'url': TEST_BASE_URL,445                'headers': {446                    'Accept-Language': '*',447                },448                'kwargs': {449                    'timeout': 5,450                    'stream': False,451                },452            },453        },454        {455            'name': 'URL + headers + kwargs',456            'arguments': {457                'url': TEST_BASE_URL,458                'headers': {459                    'Content-Type': 'application/json',460                    'Accept-Language': '*',461  
              },462                'kwargs': {463                    'timeout': 5,464                    'stream': False,465                },466            },467        },468        {469            'name': 'Parameter + header + kwarg',470            'arguments': {471                'url': TEST_BASE_URL,472                'parameters': [473                    {474                        'name': 'param-1',475                        'value': 'value-1',476                    },477                ],478                'headers': {479                    'Content-Type': 'application/json',480                },481                'kwargs': {482                    'timeout': 5,483                },484            },485        },486        {487            'name': 'Parameter + header + kwargs',488            'arguments': {489                'url': TEST_BASE_URL,490                'parameters': [491                    {492                        'name': 'param-1',493                        'value': 'value-1',494                    },495                ],496                'headers': {497                    'Content-Type': 'application/json',498                },499                'kwargs': {500                    'timeout': 5,501                    'stream': True,502                },503            },504        },505        {506            'name': 'Parameters + header + kwarg',507            'arguments': {508                'url': TEST_BASE_URL,509                'parameters': [510                    {511                        'name': 'param-1',512                        'value': 'value-1',513                    },514                    {515                        'name': 'param-2',516                        'value': 7.77,517                    },518                ],519                'headers': {520                    'Content-Type': 'application/json',521                },522                'kwargs': {523                    'timeout': 5,524                },525            
},526        },527        {528            'name': 'Parameters + header + kwargs',529            'arguments': {530                'url': TEST_BASE_URL,531                'parameters': [532                    {533                        'name': 'param-1',534                        'value': 'value-1',535                    },536                    {537                        'name': 'param-2',538                        'value': 7.77,539                    },540                ],541                'headers': {542                    'Content-Type': 'application/json',543                },544                'kwargs': {545                    'timeout': 5,546                    'stream': False,547                },548            },549        },550        {551            'name': 'Parameters + headers + kwarg',552            'arguments': {553                'url': TEST_BASE_URL,554                'parameters': [555                    {556                        'name': 'param-1',557                        'value': 'value-1',558                    },559                    {560                        'name': 'param-2',561                        'value': 7.77,562                    },563                ],564                'headers': {565                    'Content-Type': 'application/json',566                    'Accept-Language': 'fr',567                },568                'kwargs': {569                    'timeout': 5,570                },571            },572        },573        {574            'name': 'Parameters + headers + kwargs',575            'arguments': {576                'url': TEST_BASE_URL,577                'parameters': [578                    {579                        'name': 'param-1',580                        'value': 'value-1',581                    },582                    {583                        'name': 'param-2',584                        'value': 7.77,585                    },586                ],587                'headers': {588               
     'Content-Type': 'application/json',589                    'Accept-Language': 'fr',590                },591                'kwargs': {592                    'timeout': 5,593                    'stream': True,594                },595            },596        },597        {598            'name': 'Setup',599            'arguments': {600                'url': TEST_BASE_URL,601                'setup': True,602            },603        },604        {605            'name': 'No setup',606            'arguments': {607                'url': TEST_BASE_URL,608                'setup': False,609            },610        },611        {612            'name': 'Custom setup',613            'arguments': {614                'url': TEST_BASE_URL,615                'setup': 'custom_setup=1\n\n',616            },617        },618        {619            'name': 'Custom teardown',620            'arguments': {621                'url': TEST_BASE_URL,622                'teardown': '\n\ncustom_teardown=1',623            },624        },625        {626            'name': 'Quote character \'',627            'arguments': {628                'url': TEST_BASE_URL,629                'quote_char': '\'',630            },631        },632        {633            'name': 'Quote character "',634            'arguments': {635                'url': TEST_BASE_URL,636                'quote_char': '"',637            },638        },639        {640            'name': 'Indent 2 spaces',641            'arguments': {642                'url': TEST_BASE_URL,643                'indent': '  ',644                'headers': {645                    'Accept-Language': 'es en fr * ' * 20,646                },647            },648        },649        {650            'name': 'Indent 4 spaces',651            'arguments': {652                'url': TEST_BASE_URL,653                'indent': '    ',654                'headers': {655                    'Accept-Language': 'es en fr * ' * 20,656                },657            },658    
    },659        {660            'name': 'One line',661            'arguments': {662                'url': TEST_BASE_URL,663                'oneline': True,664            },665        },666        {667            'name': 'One line + no setup',668            'arguments': {669                'url': TEST_BASE_URL,670                'oneline': True,671                'setup': False,672            },673        },674        {675            'name': 'Wrap 0',676            'arguments': {677                'url': TEST_BASE_URL,678                'wrap': 0,679            },680        },681        {682            'name': 'Wrap 1',683            'arguments': {684                'url': TEST_BASE_URL,685                'wrap': 1,686            },687        },688        {689            'name': 'Wrap 10',690            'arguments': {691                'url': TEST_BASE_URL,692                'wrap': 10,693            },694        },695        {696            'name': 'Wrap 20',697            'arguments': {698                'url': TEST_BASE_URL,699                'wrap': 20,700            },701        },702        {703            'name': 'Wrap 25',704            'arguments': {705                'url': TEST_BASE_URL,706                'wrap': 25,707            },708        },709        {710            'name': 'Wrap 30',711            'arguments': {712                'url': TEST_BASE_URL,713                'wrap': 30,714            },715        },716        {717            'name': 'Wrap 35',718            'arguments': {719                'url': TEST_BASE_URL,720                'wrap': 35,721            },722        },723        {724            'name': 'Wrap 40',725            'arguments': {726                'url': TEST_BASE_URL,727                'wrap': 40,728            },729        },730        {731            'name': 'Wrap infinite',732            'arguments': {733                'url': TEST_BASE_URL,734                'wrap': float('inf'),735            },736        },737        
{738            'name': 'Wrap null is infinite',739            'arguments': {740                'url': TEST_BASE_URL,741                'wrap': None,742            },743        },744    ]745    if method.lower() == 'post':746        response.extend([747            {748                'name': 'Data by parameter (text/plain)',749                'arguments': {750                    'url': TEST_BASE_URL,751                    'parameters': [752                        {753                            'name': '',754                            'value': 'foo bar baz ' * 3,755                        },756                    ],757                    'headers': {758                        'Content-Type': 'text/plain',759                    },760                },761            },762            {763                'name': 'Data by parameter (text/plain) wrapping value',764                'arguments': {765                    'url': TEST_BASE_URL,766                    'parameters': [767                        {768                            'name': '',769                            'value': 'foo bar baz ' * 30,770                        },771                    ],772                    'headers': {773                        'Content-Type': 'text/plain',774                    },775                },776            },777            {778                'name': 'Data by parameter (application/json)',779                'arguments': {780                    'url': TEST_BASE_URL,781                    'parameters': [782                        {783                            'name': 'param-1',784                            'value': 'value-1',785                        },786                    ],787                    'headers': {788                        'Content-Type': 'application/json',789                    },790                },791            },792            {793                'name': 'Data by parameters (application/json)',794                'arguments': {795                    
'url': TEST_BASE_URL,796                    'parameters': [797                        {798                            'name': 'param-int',799                            'value': 1,800                        },801                        {802                            'name': 'param-float',803                            'value': .777,804                        },805                        {806                            'name': 'param-bool',807                            'value': True,808                        },809                    ],810                    'headers': {811                        'Content-Type': 'application/json',812                    },813                },814            },815            {816                'name': (817                    'Data by parameter'818                    ' (application/x-www-form-urlencoded)'819                ),820                'arguments': {821                    'url': TEST_BASE_URL,822                    'parameters': [823                        {824                            'name': 'param-1',825                            'value': 'value-1',826                        },827                    ],828                    'headers': {829                        'Content-Type': 'application/x-www-form-urlencoded',830                    },831                },832            },833            {834                'name': (835                    'Data by parameters'836                    ' (application/x-www-form-urlencoded)'837                ),838                'arguments': {839                    'url': TEST_BASE_URL,840                    'parameters': [841                        {842                            'name': 'param-int',843                            'value': 1,844                        },845                        {846                            'name': 'param-float',847                            'value': .777,848                        },849                        {850                            'name': 
'param-bool',851                            'value': True,852                        },853                    ],854                    'headers': {855                        'Content-Type': 'application/x-www-form-urlencoded',856                    },857                },858            },859            {860                'name': 'File by filepath (multipart/form-data)',861                'arguments': {862                    'url': TEST_BASE_URL,863                    'files': {864                        'param-1': os.path.join(TEMPDIR, 'file-1.ext'),865                    },866                },867            },868            {869                'name': 'Files by filepath (multipart/form-data)',870                'arguments': {871                    'url': TEST_BASE_URL,872                    'files': {873                        'param-1': os.path.join(TEMPDIR, 'file-1.ext'),874                        'param-2': os.path.join(TEMPDIR, 'file-2.ext'),875                    },876                },877            },878            {879                'name': 'File by filepath (multipart/form-data) wrapping',880                'arguments': {881                    'url': TEST_BASE_URL,882                    'files': {883                        'param-1': os.path.join(884                            TEMPDIR, '%s.ext' % ('foo' * 40),885                        ),886                    },887                },888            },889            {890                'name': (891                    'Files by filepath (multipart/form-data)'892                    ' with Content-Type'893                ),894                'arguments': {895                    'url': TEST_BASE_URL,896                    'files': {897                        'param-1': (898                            os.path.join(TEMPDIR, 'file-1.ext'),899                            'text/plain',900                        ),901                        'param-2': (902                            os.path.join(TEMPDIR, 
'file-2.ext'),903                            'text/csv',904                        ),905                    },906                },907            },908            {909                'name': (910                    'File by filepath (multipart/form-data)'911                    ' with Content-Type wrapping'912                ),913                'arguments': {914                    'url': TEST_BASE_URL,915                    'files': {916                        'param-1': (917                            os.path.join(TEMPDIR, 'file-1.ext'),918                            'text/plain ' * 20,919                        ),920                    },921                },922            },923            {924                'name': (925                    'File by filepath (multipart/form-data),'926                    ' Content-Type, header'927                ),928                'arguments': {929                    'url': TEST_BASE_URL,930                    'files': {931                        'param-1': (932                            os.path.join(TEMPDIR, 'file-1.ext'),933                            'text/plain',934                            {'Accept-Language': 'es'},935                        ),936                    },937                },938            },939            {940                'name': (941                    'File by filepath (multipart/form-data),'942                    ' Content-Type, headers'943                ),944                'arguments': {945                    'url': TEST_BASE_URL,946                    'files': {947                        'param-1': (948                            os.path.join(TEMPDIR, 'file-1.ext'),949                            'text/plain',950                            {951                                'Accept-Language': 'es',952                                'Accept-Charset': 'utf-8',953                            },954                        ),955                    },956                },957            },958            
{959                'name': 'Files by filepath (multipart/form-data) + parameter',960                'arguments': {961                    'url': TEST_BASE_URL,962                    'files': {963                        'param-1': os.path.join(TEMPDIR, 'file-1.ext'),964                        'param-2': os.path.join(TEMPDIR, 'file-2.ext'),965                    },966                    'parameters': [967                        {968                            'name': 'param-1',969                            'value': 'value-1',970                        },971                    ],972                },973            },974            {975                'name': 'Files by filepath (multipart/form-data) + parameters',976                'arguments': {977                    'url': TEST_BASE_URL,978                    'files': {979                        'param-1': os.path.join(TEMPDIR, 'file-1.ext'),980                        'param-2': os.path.join(TEMPDIR, 'file-2.ext'),981                    },982                    'parameters': [983                        {984                            'name': 'param-1',985                            'value': 'value-1',986                        },987                        {988                            'name': 'param-2',989                            'value': 'value-2',990                        },991                    ],992                },993            },994            {995                'name': (996                    'Files by filepath (multipart/form-data) + parameter'997                    ' + header'998                ),999                'arguments': {1000                    'url': TEST_BASE_URL,1001                    'files': {1002                        'param-1': os.path.join(TEMPDIR, 'file-1.ext'),1003                        'param-2': os.path.join(TEMPDIR, 'file-2.ext'),1004                    },1005                    'parameters': [1006                        {1007                            'name': 
'param-1',1008                            'value': 'value-1',1009                        },1010                    ],1011                    'headers': {1012                        'Accept-Language': 'fr',1013                    },1014                },1015            },1016            {1017                'name': (1018                    'Files by filepath (multipart/form-data) + parameter'1019                    ' + headers'1020                ),1021                'arguments': {1022                    'url': TEST_BASE_URL,1023                    'files': {1024                        'param-1': os.path.join(TEMPDIR, 'file-1.ext'),1025                        'param-2': os.path.join(TEMPDIR, 'file-2.ext'),1026                    },1027                    'parameters': [1028                        {1029                            'name': 'param-1',1030                            'value': 'value-1',1031                        },1032                    ],1033                    'headers': {1034                        'Accept-Language': 'fr',1035                        'Accept-Charset': 'utf-8',1036                    },1037                },1038            },1039            {1040                'name': (1041                    'Files by filepath (multipart/form-data) + parameters'1042                    ' + header'1043                ),1044                'arguments': {1045                    'url': TEST_BASE_URL,1046                    'files': {1047                        'param-1': os.path.join(TEMPDIR, 'file-1.ext'),1048                        'param-2': os.path.join(TEMPDIR, 'file-2.ext'),1049                    },1050                    'parameters': [1051                        {1052                            'name': 'param-1',1053                            'value': 'value-1',1054                        },1055                        {1056                            'name': 'param-2',1057                            'value': 'value-2',1058                       
 },1059                    ],1060                    'headers': {1061                        'Accept-Language': 'es',1062                    },1063                },1064            },1065            {1066                'name': (1067                    'Files by filepath (multipart/form-data) + parameters'1068                    ' + headers'1069                ),1070                'arguments': {1071                    'url': TEST_BASE_URL,1072                    'files': {1073                        'param-1': os.path.join(TEMPDIR, 'file-1.ext'),1074                        'param-2': os.path.join(TEMPDIR, 'file-2.ext'),1075                    },1076                    'parameters': [1077                        {1078                            'name': 'param-1',1079                            'value': 'value-1',1080                        },1081                        {1082                            'name': 'param-2',1083                            'value': 'value-2',1084                        },1085                    ],1086                    'headers': {1087                        'Accept-Language': 'fr',1088                        'Accept-Charset': 'utf-8',1089                    },1090                },1091            },1092            {1093                'name': (1094                    'Files by filepath (multipart/form-data) + parameter'1095                    ' + header + kwarg'1096                ),1097                'arguments': {1098                    'url': TEST_BASE_URL,1099                    'files': {1100                        'param-1': os.path.join(TEMPDIR, 'file-1.ext'),1101                        'param-2': os.path.join(TEMPDIR, 'file-2.ext'),1102                    },1103                    'parameters': [1104                        {1105                            'name': 'param-1',1106                            'value': 'value-1',1107                        },1108                    ],1109                    'headers': {1110           
             'Accept-Language': 'fr',1111                    },1112                    'kwargs': {1113                        'timeout': 10,1114                    },1115                },1116            },1117            {1118                'name': (1119                    'Files by filepath (multipart/form-data) + parameter'1120                    ' + headers + kwarg'1121                ),1122                'arguments': {1123                    'url': TEST_BASE_URL,1124                    'files': {1125                        'param-1': os.path.join(TEMPDIR, 'file-1.ext'),1126                        'param-2': os.path.join(TEMPDIR, 'file-2.ext'),1127                    },1128                    'parameters': [1129                        {1130                            'name': 'param-1',1131                            'value': 'value-1',1132                        },1133                    ],1134                    'headers': {1135                        'Accept-Language': 'fr',1136                        'Accept-Charset': 'utf-8',1137                    },1138                    'kwargs': {1139                        'timeout': 10,1140                    },1141                },1142            },1143            {1144                'name': (1145                    'Files by filepath (multipart/form-data) + parameters'1146                    ' + header + kwarg'1147                ),1148                'arguments': {1149                    'url': TEST_BASE_URL,1150                    'files': {1151                        'param-1': os.path.join(TEMPDIR, 'file-1.ext'),1152                        'param-2': os.path.join(TEMPDIR, 'file-2.ext'),1153                    },1154                    'parameters': [1155                        {1156                            'name': 'param-1',1157                            'value': 'value-1',1158                        },1159                        {1160                            'name': 'param-2',1161                    
        'value': 'value-2',1162                        },1163                    ],1164                    'headers': {1165                        'Accept-Language': 'fr',1166                    },1167                    'kwargs': {1168                        'timeout': 10,1169                    },1170                },1171            },1172            {1173                'name': (1174                    'Files by filepath (multipart/form-data) + parameters'1175                    ' + headers + kwarg'1176                ),1177                'arguments': {1178                    'url': TEST_BASE_URL,1179                    'files': {1180                        'param-1': os.path.join(TEMPDIR, 'file-1.ext'),1181                        'param-2': os.path.join(TEMPDIR, 'file-2.ext'),1182                    },1183                    'parameters': [1184                        {1185                            'name': 'param-1',1186                            'value': 'value-1',1187                        },1188                        {1189                            'name': 'param-2',1190                            'value': 'value-2',1191                        },1192                    ],1193                    'headers': {1194                        'Accept-Language': 'fr',1195                        'Accept-Charset': 'utf-8',1196                    },1197                    'kwargs': {1198                        'timeout': 10,1199                    },1200                },1201            },1202            {1203                'name': (1204                    'Files by filepath (multipart/form-data) + parameter'1205                    ' + header + kwargs'1206                ),1207                'arguments': {1208                    'url': TEST_BASE_URL,1209                    'files': {1210                        'param-1': os.path.join(TEMPDIR, 'file-1.ext'),1211                        'param-2': os.path.join(TEMPDIR, 'file-2.ext'),1212                    },1213   
                 'parameters': [1214                        {1215                            'name': 'param-1',1216                            'value': 'value-1',1217                        },1218                    ],1219                    'headers': {1220                        'Accept-Language': 'fr',1221                    },1222                    'kwargs': {1223                        'timeout': 10,1224                        'cookies': {1225                            'hello': 'world',1226                        },1227                    },1228                },1229            },1230            {1231                'name': (1232                    'Files by filepath (multipart/form-data) + parameter'1233                    ' + headers + kwargs'1234                ),1235                'arguments': {1236                    'url': TEST_BASE_URL,1237                    'files': {1238                        'param-1': os.path.join(TEMPDIR, 'file-1.ext'),1239                        'param-2': os.path.join(TEMPDIR, 'file-2.ext'),1240                    },1241                    'parameters': [1242                        {1243                            'name': 'param-1',1244                            'value': 'value-1',1245                        },1246                    ],1247                    'headers': {1248                        'Accept-Language': 'fr',1249                        'Accept-Charset': 'utf-8',1250                    },1251                    'kwargs': {1252                        'timeout': 10,1253                        'stream': False,1254                    },1255                },1256            },1257            {1258                'name': (1259                    'Files by filepath (multipart/form-data) + parameters'1260                    ' + header + kwargs'1261                ),1262                'arguments': {1263                    'url': TEST_BASE_URL,1264                    'files': {1265                        'param-1': 
os.path.join(TEMPDIR, 'file-1.ext'),1266                        'param-2': os.path.join(TEMPDIR, 'file-2.ext'),1267                    },1268                    'parameters': [1269                        {1270                            'name': 'param-1',1271                            'value': 'value-1',1272                        },1273                        {1274                            'name': 'param-2',1275                            'value': 'value-2',1276                        },1277                    ],1278                    'headers': {1279                        'Accept-Language': 'fr',1280                    },1281                    'kwargs': {1282                        'timeout': 10,1283                        'stream': False,1284                    },1285                },1286            },1287            {1288                'name': (1289                    'Files by filepath (multipart/form-data) + parameters'1290                    ' + headers + kwargs'1291                ),1292                'arguments': {1293                    'url': TEST_BASE_URL,1294                    'files': {1295                        'param-1': os.path.join(TEMPDIR, 'file-1.ext'),1296                        'param-2': os.path.join(TEMPDIR, 'file-2.ext'),1297                    },1298                    'parameters': [1299                        {1300                            'name': 'param-1',1301                            'value': 'value-1',1302                        },1303                        {1304                            'name': 'param-2',1305                            'value': 'value-2',1306                        },1307                    ],1308                    'headers': {1309                        'Accept-Language': 'fr',1310                        'Accept-Charset': 'utf-8',1311                    },1312                    'kwargs': {1313                        'timeout': 10,1314                        'stream': False,1315                   
 },1316                },1317            },1318            {1319                'name': (1320                    'No setup + files by filepath (multipart/form-data)'1321                    ' + parameters + headers + kwargs + '1322                ),1323                'arguments': {1324                    'url': TEST_BASE_URL,1325                    'setup': False,1326                    'files': {1327                        'param-1': os.path.join(TEMPDIR, 'file-1.ext'),1328                        'param-2': os.path.join(TEMPDIR, 'file-2.ext'),1329                    },1330                    'parameters': [1331                        {1332                            'name': 'param-1',1333                            'value': 'value-1',1334                        },1335                        {1336                            'name': 'param-2',1337                            'value': 'value-2',1338                        },1339                    ],1340                    'headers': {1341                        'Accept-Language': 'fr',1342                        'Accept-Charset': 'utf-8',1343                    },1344                    'kwargs': {1345                        'timeout': 10,1346                        'stream': False,1347                    },1348                },1349            },1350        ])1351    if include_filenames:1352        for index, args_group in enumerate(response):1353            fname = argument_combination_to_filename(1354                args_group['name'], index,1355            )1356            if dirpath and os.path.exists(dirpath):1357                fname = os.path.join(dirpath, fname)1358            args_group['filename'] = fname...test_logoscraper.py
Source:test_logoscraper.py  
1from bs4 import BeautifulSoup2import scraper.logoscraper as logoscraper3test_base_url = "https://www.testbase.com"4def test_get_logo_should_pass():5    htmls = [6        f"<div class='logo'><img src='https://www.test.com'></img></div>",7        f"<div id='logo'><img src='https://www.test.com'></img></div>",8        f"<a><img src='https://www.test.com'></img></a>",9        f"<div><img src='https://www.test.com'></img></div>",10        f"<a href={test_base_url}><img src='https://www.test.com'></img></div>",11    ]12    for html in htmls:13        assert (14            logoscraper.get_logo(BeautifulSoup(html, "html.parser"), test_base_url)15            == "https://www.test.com"16        )17def test_find_image_tag():18    result = logoscraper.find_img_tag(19        BeautifulSoup(20            f'<a><img src="https://{test_base_url}"></img></a>', "html.parser"21        ),22        test_base_url,23    )24    assert result25def test_find_image_tag_return_itself():26    result = logoscraper.find_img_tag(27        BeautifulSoup(f'<img src="https://{test_base_url}"></img>', "html.parser"),28        test_base_url,29    )30    assert result31def test_find_image_tag_retrun_none():32    result = logoscraper.find_img_tag(33        BeautifulSoup(f'<a href="https://{test_base_url}"></a>', "html.parser"),34        test_base_url,35    )36    assert result is None37def test_format_image_source():38    html = BeautifulSoup(f"<img src='{test_base_url}'></img>", "html.parser").find(39        "img"40    )41    assert logoscraper.format_image_source(html, test_base_url) == test_base_url42def test_format_image_source_no_source():43    html = BeautifulSoup(f"<img></img>", "html.parser").find("img")44    assert logoscraper.format_image_source(html, test_base_url) is None45def test_format_image_source_relative_path():46    html = BeautifulSoup(47        f"<img src='resources/images/image.png'></img>", "html.parser"48    ).find("img")49    assert (50        logoscraper.format_image_source(html, 
test_base_url)51        == f"{test_base_url}/resources/images/image.png"...Learn to execute automation testing from scratch with LambdaTest Learning Hub, from setting up the prerequisites and running your first automation test to following best practices and diving deeper into advanced test scenarios. LambdaTest Learning Hubs compile step-by-step guides to help you become proficient with different test automation frameworks, e.g. Selenium, Cypress, and TestNG.
You can also refer to the video tutorials on the LambdaTest YouTube channel for step-by-step demonstrations from industry experts.
Get 100 minutes of automation testing FREE!!
