Best Python code snippet using selene_python
test_tree.py
Source: test_tree.py
1import datetime2import difflib3import textwrap4from decimal import Decimal5from email.utils import format_datetime6from unittest import TestCase7import requests_mock8from dateutil.tz import tzoffset9from tests.helpers import gzip10from usp.log import create_logger11from usp.objects.page import (12 SitemapPage,13 SitemapNewsStory,14 SitemapPageChangeFrequency,15)16from usp.objects.sitemap import (17 IndexRobotsTxtSitemap,18 PagesXMLSitemap,19 IndexXMLSitemap,20 InvalidSitemap,21 PagesTextSitemap,22 IndexWebsiteSitemap,23 PagesRSSSitemap,24 PagesAtomSitemap,25)26from usp.tree import sitemap_tree_for_homepage27# FIXME various exotic properties28# FIXME XML vulnerabilities with Expat29# FIXME max. recursion level30# FIXME tests responses that are too big31log = create_logger(__name__)32class TestSitemapTree(TestCase):33 TEST_BASE_URL = 'http:/β/βtest_ultimate-sitemap-parser.com' # mocked by HTTPretty34 # Publication /β "last modified" date35 TEST_DATE_DATETIME = datetime.datetime(36 year=2009, month=12, day=17, hour=12, minute=4, second=56,37 tzinfo=tzoffset(None, 7200),38 )39 TEST_DATE_STR_ISO8601 = TEST_DATE_DATETIME.isoformat()40 """Test string date formatted as ISO 8601 (for XML and Atom 0.3 /β 1.0 sitemaps)."""41 TEST_DATE_STR_RFC2822 = format_datetime(TEST_DATE_DATETIME)42 """Test string date formatted as RFC 2822 (for RSS 2.0 sitemaps)."""43 TEST_PUBLICATION_NAME = 'Test publication'44 TEST_PUBLICATION_LANGUAGE = 'en'45 @staticmethod46 def fallback_to_404_not_found_matcher(request):47 """Reply with "404 Not Found" to unmatched URLs instead of throwing NoMockAddress."""48 return requests_mock.create_response(49 request,50 status_code=404,51 reason='Not Found',52 headers={'Content-Type': 'text/βhtml'},53 text="<h1>404 Not Found!</βh1>",54 )55 # noinspection DuplicatedCode56 def test_sitemap_tree_for_homepage(self):57 """Test sitemap_tree_for_homepage()."""58 with requests_mock.Mocker() as m:59 m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher)60 
m.get(61 self.TEST_BASE_URL + '/β',62 text='This is a homepage.',63 )64 m.get(65 self.TEST_BASE_URL + '/βrobots.txt',66 headers={'Content-Type': 'text/βplain'},67 text=textwrap.dedent("""68 User-agent: *69 Disallow: /βwhatever70 71 Sitemap: {base_url}/βsitemap_pages.xml72 73 # Intentionally spelled as "Site-map" as Google tolerates this:74 # https:/β/βgithub.com/βgoogle/βrobotstxt/βblob/βmaster/βrobots.cc#L703 75 Site-map: {base_url}/βsitemap_news_index_1.xml76 """.format(base_url=self.TEST_BASE_URL)).strip(),77 )78 # One sitemap for random static pages79 m.get(80 self.TEST_BASE_URL + '/βsitemap_pages.xml',81 headers={'Content-Type': 'application/βxml'},82 text=textwrap.dedent("""83 <?xml version="1.0" encoding="UTF-8"?>84 <urlset xmlns="http:/β/βwww.sitemaps.org/βschemas/βsitemap/β0.9">85 <url>86 <loc>{base_url}/βabout.html</βloc>87 <lastmod>{last_modified_date}</βlastmod>88 <changefreq>monthly</βchangefreq>89 <priority>0.8</βpriority>90 </βurl>91 <url>92 <loc>{base_url}/βcontact.html</βloc>93 <lastmod>{last_modified_date}</βlastmod>94 95 <!-- Invalid change frequency -->96 <changefreq>when we feel like it</βchangefreq>97 98 <!-- Invalid priority -->99 <priority>1.1</βpriority>100 101 </βurl>102 </βurlset>103 """.format(base_url=self.TEST_BASE_URL, last_modified_date=self.TEST_DATE_STR_ISO8601)).strip(),104 )105 # Index sitemap pointing to sitemaps with stories106 m.get(107 self.TEST_BASE_URL + '/βsitemap_news_index_1.xml',108 headers={'Content-Type': 'application/βxml'},109 text=textwrap.dedent("""110 <?xml version="1.0" encoding="UTF-8"?>111 <sitemapindex xmlns="http:/β/βwww.sitemaps.org/βschemas/βsitemap/β0.9">112 <sitemap>113 <loc>{base_url}/βsitemap_news_1.xml</βloc>114 <lastmod>{last_modified}</βlastmod>115 </βsitemap>116 <sitemap>117 <loc>{base_url}/βsitemap_news_index_2.xml</βloc>118 <lastmod>{last_modified}</βlastmod>119 </βsitemap>120 </βsitemapindex>121 """.format(base_url=self.TEST_BASE_URL, last_modified=self.TEST_DATE_STR_ISO8601)).strip(),122 )123 # 
First sitemap with actual stories124 m.get(125 self.TEST_BASE_URL + '/βsitemap_news_1.xml',126 headers={'Content-Type': 'application/βxml'},127 text=textwrap.dedent("""128 <?xml version="1.0" encoding="UTF-8"?>129 <urlset xmlns="http:/β/βwww.sitemaps.org/βschemas/βsitemap/β0.9"130 xmlns:news="http:/β/βwww.google.com/βschemas/βsitemap-news/β0.9"131 xmlns:xhtml="http:/β/βwww.w3.org/β1999/βxhtml">132 133 <url>134 <loc>{base_url}/βnews/βfoo.html</βloc>135 136 <!-- Element present but empty -->137 <lastmod /β>138 139 <!-- Some other XML namespace -->140 <xhtml:link rel="alternate"141 media="only screen and (max-width: 640px)"142 href="{base_url}/βnews/βfoo.html?mobile=1" /β>143 144 <news:news>145 <news:publication>146 <news:name>{publication_name}</βnews:name>147 <news:language>{publication_language}</βnews:language>148 </βnews:publication>149 <news:publication_date>{publication_date}</βnews:publication_date>150 <news:title>Foo <foo></βnews:title> <!-- HTML entity decoding -->151 </βnews:news>152 </βurl>153 154 <!-- Has a duplicate story in /βsitemap_news_2.xml -->155 <url>156 <loc>{base_url}/βnews/βbar.html</βloc>157 <xhtml:link rel="alternate"158 media="only screen and (max-width: 640px)"159 href="{base_url}/βnews/βbar.html?mobile=1" /β>160 <news:news>161 <news:publication>162 <news:name>{publication_name}</βnews:name>163 <news:language>{publication_language}</βnews:language>164 </βnews:publication>165 <news:publication_date>{publication_date}</βnews:publication_date>166 <news:title>Bar & bar</βnews:title>167 </βnews:news>168 </βurl>169 170 </βurlset>171 """.format(172 base_url=self.TEST_BASE_URL,173 publication_name=self.TEST_PUBLICATION_NAME,174 publication_language=self.TEST_PUBLICATION_LANGUAGE,175 publication_date=self.TEST_DATE_STR_ISO8601,176 )).strip(),177 )178 # Another index sitemap pointing to a second sitemaps with stories179 m.get(180 self.TEST_BASE_URL + '/βsitemap_news_index_2.xml',181 headers={'Content-Type': 'application/βxml'},182 
text=textwrap.dedent("""183 <?xml version="1.0" encoding="UTF-8"?>184 <sitemapindex xmlns="http:/β/βwww.sitemaps.org/βschemas/βsitemap/β0.9">185 186 <sitemap>187 <!-- Extra whitespace added around URL -->188 <loc> {base_url}/βsitemap_news_2.xml </βloc>189 <lastmod>{last_modified}</βlastmod>190 </βsitemap>191 192 <!-- Nonexistent sitemap -->193 <sitemap>194 <loc>{base_url}/βsitemap_news_missing.xml</βloc>195 <lastmod>{last_modified}</βlastmod>196 </βsitemap>197 198 </βsitemapindex>199 """.format(base_url=self.TEST_BASE_URL, last_modified=self.TEST_DATE_STR_ISO8601)).strip(),200 )201 # Second sitemap with actual stories202 m.get(203 self.TEST_BASE_URL + '/βsitemap_news_2.xml',204 headers={'Content-Type': 'application/βxml'},205 text=textwrap.dedent("""206 <?xml version="1.0" encoding="UTF-8"?>207 <urlset xmlns="http:/β/βwww.sitemaps.org/βschemas/βsitemap/β0.9"208 xmlns:news="http:/β/βwww.google.com/βschemas/βsitemap-news/β0.9"209 xmlns:xhtml="http:/β/βwww.w3.org/β1999/βxhtml">210 211 <!-- Has a duplicate story in /βsitemap_news_1.xml -->212 <url>213 <!-- Extra whitespace added around URL -->214 <loc> {base_url}/βnews/βbar.html </βloc>215 <xhtml:link rel="alternate"216 media="only screen and (max-width: 640px)"217 href="{base_url}/βnews/βbar.html?mobile=1#fragment_is_to_be_removed" /β>218 <news:news>219 <news:publication>220 <news:name>{publication_name}</βnews:name>221 <news:language>{publication_language}</βnews:language>222 </βnews:publication>223 <news:publication_date>{publication_date}</βnews:publication_date>224 225 <tag_without_inner_character_data name="value" /β>226 227 <news:title>Bar & bar</βnews:title>228 </βnews:news>229 </βurl>230 231 <url>232 <loc>{base_url}/βnews/βbaz.html</βloc>233 <xhtml:link rel="alternate"234 media="only screen and (max-width: 640px)"235 href="{base_url}/βnews/βbaz.html?mobile=1" /β>236 <news:news>237 <news:publication>238 <news:name>{publication_name}</βnews:name>239 <news:language>{publication_language}</βnews:language>240 
</βnews:publication>241 <news:publication_date>{publication_date}</βnews:publication_date>242 <news:title><![CDATA[BΓΒ
Γ
ΒΎ]]></βnews:title> <!-- CDATA and UTF-8 -->243 </βnews:news>244 </βurl>245 246 </βurlset>247 """.format(248 base_url=self.TEST_BASE_URL,249 publication_name=self.TEST_PUBLICATION_NAME,250 publication_language=self.TEST_PUBLICATION_LANGUAGE,251 publication_date=self.TEST_DATE_STR_ISO8601,252 )).strip(),253 )254 # Nonexistent sitemap255 m.get(256 self.TEST_BASE_URL + '/βsitemap_news_missing.xml',257 status_code=404,258 reason='Not Found',259 headers={'Content-Type': 'text/βhtml'},260 text="<h1>404 Not Found!</βh1>",261 )262 expected_sitemap_tree = IndexWebsiteSitemap(263 url='{}/β'.format(self.TEST_BASE_URL),264 sub_sitemaps=[265 IndexRobotsTxtSitemap(266 url='{}/βrobots.txt'.format(self.TEST_BASE_URL),267 sub_sitemaps=[268 PagesXMLSitemap(269 url='{}/βsitemap_pages.xml'.format(self.TEST_BASE_URL),270 pages=[271 SitemapPage(272 url='{}/βabout.html'.format(self.TEST_BASE_URL),273 last_modified=self.TEST_DATE_DATETIME,274 news_story=None,275 change_frequency=SitemapPageChangeFrequency.MONTHLY,276 priority=Decimal('0.8'),277 ),278 SitemapPage(279 url='{}/βcontact.html'.format(self.TEST_BASE_URL),280 last_modified=self.TEST_DATE_DATETIME,281 news_story=None,282 # Invalid input -- should be reset to "always"283 change_frequency=SitemapPageChangeFrequency.ALWAYS,284 # Invalid input -- should be reset to 0.5 (the default as per the spec)285 priority=Decimal('0.5'),286 )287 ],288 ),289 IndexXMLSitemap(290 url='{}/βsitemap_news_index_1.xml'.format(self.TEST_BASE_URL),291 sub_sitemaps=[292 PagesXMLSitemap(293 url='{}/βsitemap_news_1.xml'.format(self.TEST_BASE_URL),294 pages=[295 SitemapPage(296 url='{}/βnews/βfoo.html'.format(self.TEST_BASE_URL),297 news_story=SitemapNewsStory(298 title='Foo <foo>',299 publish_date=self.TEST_DATE_DATETIME,300 publication_name=self.TEST_PUBLICATION_NAME,301 publication_language=self.TEST_PUBLICATION_LANGUAGE,302 ),303 ),304 SitemapPage(305 url='{}/βnews/βbar.html'.format(self.TEST_BASE_URL),306 news_story=SitemapNewsStory(307 title='Bar & 
bar',308 publish_date=self.TEST_DATE_DATETIME,309 publication_name=self.TEST_PUBLICATION_NAME,310 publication_language=self.TEST_PUBLICATION_LANGUAGE,311 ),312 ),313 ]314 ),315 IndexXMLSitemap(316 url='{}/βsitemap_news_index_2.xml'.format(self.TEST_BASE_URL),317 sub_sitemaps=[318 PagesXMLSitemap(319 url='{}/βsitemap_news_2.xml'.format(self.TEST_BASE_URL),320 pages=[321 SitemapPage(322 url='{}/βnews/βbar.html'.format(self.TEST_BASE_URL),323 news_story=SitemapNewsStory(324 title='Bar & bar',325 publish_date=self.TEST_DATE_DATETIME,326 publication_name=self.TEST_PUBLICATION_NAME,327 publication_language=self.TEST_PUBLICATION_LANGUAGE,328 ),329 ),330 SitemapPage(331 url='{}/βnews/βbaz.html'.format(self.TEST_BASE_URL),332 news_story=SitemapNewsStory(333 title='BΓΒ
Γ
ΒΎ',334 publish_date=self.TEST_DATE_DATETIME,335 publication_name=self.TEST_PUBLICATION_NAME,336 publication_language=self.TEST_PUBLICATION_LANGUAGE,337 ),338 ),339 ],340 ),341 InvalidSitemap(342 url='{}/βsitemap_news_missing.xml'.format(self.TEST_BASE_URL),343 reason=(344 'Unable to fetch sitemap from {base_url}/βsitemap_news_missing.xml: '345 '404 Not Found'346 ).format(base_url=self.TEST_BASE_URL),347 ),348 ],349 ),350 ],351 ),352 ],353 )354 ]355 )356 actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL)357 expected_lines = str(expected_sitemap_tree).split()358 actual_lines = str(actual_sitemap_tree).split()359 diff = difflib.ndiff(expected_lines, actual_lines)360 diff_str = '\n'.join(diff)361 assert expected_sitemap_tree == actual_sitemap_tree, diff_str362 assert len(list(actual_sitemap_tree.all_pages())) == 6363 def test_sitemap_tree_for_homepage_gzip(self):364 """Test sitemap_tree_for_homepage() with gzipped sitemaps."""365 with requests_mock.Mocker() as m:366 m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher)367 m.get(368 self.TEST_BASE_URL + '/β',369 text='This is a homepage.',370 )371 m.get(372 self.TEST_BASE_URL + '/βrobots.txt',373 headers={'Content-Type': 'text/βplain'},374 text=textwrap.dedent("""375 User-agent: *376 Disallow: /βwhatever377 378 Sitemap: {base_url}/βsitemap_1.gz379 Sitemap: {base_url}/βsitemap_2.dat380 Sitemap: {base_url}/βsitemap_3.xml.gz381 """.format(base_url=self.TEST_BASE_URL)).strip(),382 )383 # Gzipped sitemap without correct HTTP header but with .gz extension384 m.get(385 self.TEST_BASE_URL + '/βsitemap_1.gz',386 content=gzip(textwrap.dedent("""387 <?xml version="1.0" encoding="UTF-8"?>388 <urlset xmlns="http:/β/βwww.sitemaps.org/βschemas/βsitemap/β0.9"389 xmlns:news="http:/β/βwww.google.com/βschemas/βsitemap-news/β0.9">390 <url>391 <loc>{base_url}/βnews/βfoo.html</βloc>392 <news:news>393 <news:publication>394 <news:name>{publication_name}</βnews:name>395 
<news:language>{publication_language}</βnews:language>396 </βnews:publication>397 <news:publication_date>{publication_date}</βnews:publication_date>398 <news:title>Foo <foo></βnews:title> <!-- HTML entity decoding -->399 </βnews:news>400 </βurl>401 </βurlset>402 """.format(403 base_url=self.TEST_BASE_URL,404 publication_name=self.TEST_PUBLICATION_NAME,405 publication_language=self.TEST_PUBLICATION_LANGUAGE,406 publication_date=self.TEST_DATE_STR_ISO8601,407 )).strip()),408 )409 # Gzipped sitemap with correct HTTP header but without .gz extension410 m.get(411 self.TEST_BASE_URL + '/βsitemap_2.dat',412 headers={'Content-Type': 'application/βx-gzip'},413 content=gzip(textwrap.dedent("""414 <?xml version="1.0" encoding="UTF-8"?>415 <urlset xmlns="http:/β/βwww.sitemaps.org/βschemas/βsitemap/β0.9"416 xmlns:news="http:/β/βwww.google.com/βschemas/βsitemap-news/β0.9">417 <url>418 <loc>{base_url}/βnews/βbar.html</βloc>419 <news:news>420 <news:publication>421 <news:name>{publication_name}</βnews:name>422 <news:language>{publication_language}</βnews:language>423 </βnews:publication>424 <news:publication_date>{publication_date}</βnews:publication_date>425 <news:title><![CDATA[BΓΒ
r]]></βnews:title> <!-- CDATA and UTF-8 -->426 </βnews:news>427 </βurl>428 </βurlset>429 """.format(430 base_url=self.TEST_BASE_URL,431 publication_name=self.TEST_PUBLICATION_NAME,432 publication_language=self.TEST_PUBLICATION_LANGUAGE,433 publication_date=self.TEST_DATE_STR_ISO8601,434 )).strip()),435 )436 # Sitemap which appears to be gzipped (due to extension and Content-Type) but really isn't437 m.get(438 self.TEST_BASE_URL + '/βsitemap_3.xml.gz',439 headers={'Content-Type': 'application/βx-gzip'},440 text=textwrap.dedent("""441 <?xml version="1.0" encoding="UTF-8"?>442 <urlset xmlns="http:/β/βwww.sitemaps.org/βschemas/βsitemap/β0.9"443 xmlns:news="http:/β/βwww.google.com/βschemas/βsitemap-news/β0.9">444 <url>445 <loc>{base_url}/βnews/βbaz.html</βloc>446 <news:news>447 <news:publication>448 <news:name>{publication_name}</βnews:name>449 <news:language>{publication_language}</βnews:language>450 </βnews:publication>451 <news:publication_date>{publication_date}</βnews:publication_date>452 <news:title><![CDATA[BΓΒ
Γ
ΒΎ]]></βnews:title> <!-- CDATA and UTF-8 -->453 </βnews:news>454 </βurl>455 </βurlset>456 """.format(457 base_url=self.TEST_BASE_URL,458 publication_name=self.TEST_PUBLICATION_NAME,459 publication_language=self.TEST_PUBLICATION_LANGUAGE,460 publication_date=self.TEST_DATE_STR_ISO8601,461 )).strip(),462 )463 actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL)464 # Don't do an in-depth check, we just need to make sure that gunzip works465 assert isinstance(actual_sitemap_tree, IndexWebsiteSitemap)466 assert len(actual_sitemap_tree.sub_sitemaps) == 1467 assert isinstance(actual_sitemap_tree.sub_sitemaps[0], IndexRobotsTxtSitemap)468 # noinspection PyUnresolvedReferences469 assert len(actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps) == 3470 # noinspection PyUnresolvedReferences471 sitemap_1 = actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps[0]472 assert isinstance(sitemap_1, PagesXMLSitemap)473 assert len(sitemap_1.pages) == 1474 # noinspection PyUnresolvedReferences475 sitemap_2 = actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps[1]476 assert isinstance(sitemap_2, PagesXMLSitemap)477 assert len(sitemap_2.pages) == 1478 # noinspection PyUnresolvedReferences479 sitemap_3 = actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps[2]480 assert isinstance(sitemap_3, PagesXMLSitemap)481 assert len(sitemap_3.pages) == 1482 def test_sitemap_tree_for_homepage_plain_text(self):483 """Test sitemap_tree_for_homepage() with plain text sitemaps."""484 with requests_mock.Mocker() as m:485 m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher)486 m.get(487 self.TEST_BASE_URL + '/β',488 text='This is a homepage.',489 )490 m.get(491 self.TEST_BASE_URL + '/βrobots.txt',492 headers={'Content-Type': 'text/βplain'},493 text=textwrap.dedent("""494 User-agent: *495 Disallow: /βwhatever496 497 Sitemap: {base_url}/βsitemap_1.txt498 Sitemap: {base_url}/βsitemap_2.txt.dat499 """.format(base_url=self.TEST_BASE_URL)).strip(),500 )501 # Plain text uncompressed sitemap 
(no Content-Type header)502 m.get(503 self.TEST_BASE_URL + '/βsitemap_1.txt',504 text=textwrap.dedent("""505 506 {base_url}/βnews/βfoo.html507 508 509 {base_url}/βnews/βbar.html510 511 Some other stuff which totally doesn't look like an URL512 """.format(base_url=self.TEST_BASE_URL)).strip(),513 )514 # Plain text compressed sitemap without .gz extension515 m.get(516 self.TEST_BASE_URL + '/βsitemap_2.txt.dat',517 headers={'Content-Type': 'application/βx-gzip'},518 content=gzip(textwrap.dedent("""519 {base_url}/βnews/βbar.html520 {base_url}/βnews/βbaz.html521 """.format(base_url=self.TEST_BASE_URL)).strip()),522 )523 actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL)524 assert isinstance(actual_sitemap_tree, IndexWebsiteSitemap)525 assert len(actual_sitemap_tree.sub_sitemaps) == 1526 assert isinstance(actual_sitemap_tree.sub_sitemaps[0], IndexRobotsTxtSitemap)527 # noinspection PyUnresolvedReferences528 assert len(actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps) == 2529 # noinspection PyUnresolvedReferences530 sitemap_1 = actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps[0]531 assert isinstance(sitemap_1, PagesTextSitemap)532 assert len(sitemap_1.pages) == 2533 # noinspection PyUnresolvedReferences534 sitemap_2 = actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps[1]535 assert isinstance(sitemap_2, PagesTextSitemap)536 assert len(sitemap_2.pages) == 2537 pages = list(actual_sitemap_tree.all_pages())538 assert len(pages) == 4539 assert SitemapPage(url='{}/βnews/βfoo.html'.format(self.TEST_BASE_URL)) in pages540 assert SitemapPage(url='{}/βnews/βbar.html'.format(self.TEST_BASE_URL)) in pages541 assert SitemapPage(url='{}/βnews/βbaz.html'.format(self.TEST_BASE_URL)) in pages542 # noinspection DuplicatedCode543 def test_sitemap_tree_for_homepage_rss_atom(self):544 """Test sitemap_tree_for_homepage() with RSS 2.0 /β Atom 0.3 /β Atom 1.0 feeds."""545 with requests_mock.Mocker() as m:546 
m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher)547 m.get(548 self.TEST_BASE_URL + '/β',549 text='This is a homepage.',550 )551 m.get(552 self.TEST_BASE_URL + '/βrobots.txt',553 headers={'Content-Type': 'text/βplain'},554 text=textwrap.dedent("""555 User-agent: *556 Disallow: /βwhatever557 Sitemap: {base_url}/βsitemap_rss.xml558 Sitemap: {base_url}/βsitemap_atom_0_3.xml559 Sitemap: {base_url}/βsitemap_atom_1_0.xml560 """.format(base_url=self.TEST_BASE_URL)).strip(),561 )562 # RSS 2.0 sitemap563 m.get(564 self.TEST_BASE_URL + '/βsitemap_rss.xml',565 headers={'Content-Type': 'application/βrss+xml'},566 text=textwrap.dedent("""567 <?xml version="1.0" encoding="UTF-8"?>568 <rss version="2.0">569 <channel>570 <title>Test RSS 2.0 feed</βtitle>571 <description>This is a test RSS 2.0 feed.</βdescription>572 <link>{base_url}</βlink>573 <pubDate>{pub_date}</βpubDate>574 <item>575 <title>Test RSS 2.0 story #1</βtitle>576 <description>This is a test RSS 2.0 story #1.</βdescription>577 <link>{base_url}/βrss_story_1.html</βlink>578 <guid isPermaLink="true">{base_url}/βrss_story_1.html</βguid>579 <pubDate>{pub_date}</βpubDate>580 </βitem>581 <item>582 <title>Test RSS 2.0 story #2</βtitle>583 <description>This is a test RSS 2.0 story #2.</βdescription>584 <link>{base_url}/βrss_story_2.html</βlink>585 <guid isPermaLink="true">{base_url}/βrss_story_2.html</βguid>586 <pubDate>{pub_date}</βpubDate>587 </βitem>588 </βchannel>589 </βrss>590 """.format(base_url=self.TEST_BASE_URL, pub_date=self.TEST_DATE_STR_RFC2822)).strip(),591 )592 # Atom 0.3 sitemap593 m.get(594 self.TEST_BASE_URL + '/βsitemap_atom_0_3.xml',595 headers={'Content-Type': 'application/βatom+xml'},596 text=textwrap.dedent("""597 <?xml version="1.0" encoding="UTF-8"?>598 <feed version="0.3" xmlns="http:/β/βpurl.org/βatom/βns#">599 <title>Test Atom 0.3 feed</βtitle>600 <link rel="alternate" type="text/βhtml" href="{base_url}" /β>601 <modified>{pub_date}</βmodified>602 <entry>603 <title>Test Atom 0.3 story 
#1</βtitle>604 <link rel="alternate" type="text/βhtml" href="{base_url}/βatom_0_3_story_1.html" /β>605 <id>{base_url}/βatom_0_3_story_1.html</βid>606 <issued>{pub_date}</βissued>607 </βentry>608 <entry>609 <title>Test Atom 0.3 story #2</βtitle>610 <link rel="alternate" type="text/βhtml" href="{base_url}/βatom_0_3_story_2.html" /β>611 <id>{base_url}/βatom_0_3_story_2.html</βid>612 <issued>{pub_date}</βissued>613 </βentry>614 </βfeed>615 """.format(base_url=self.TEST_BASE_URL, pub_date=self.TEST_DATE_STR_ISO8601)).strip(),616 )617 # Atom 1.0 sitemap618 m.get(619 self.TEST_BASE_URL + '/βsitemap_atom_1_0.xml',620 headers={'Content-Type': 'application/βatom+xml'},621 text=textwrap.dedent("""622 <?xml version="1.0" encoding="UTF-8"?>623 <feed xmlns="http:/β/βwww.w3.org/β2005/βAtom">624 <title>Test Atom 1.0 feed</βtitle>625 <subtitle>This is a test Atom 1.0 feed.</βsubtitle>626 <link href="{base_url}/βsitemap_atom_1_0.xml" rel="self" /β>627 <link href="{base_url}" /β>628 <id>{base_url}</βid>629 <updated>{pub_date}</βupdated>630 <entry>631 <title>Test Atom 1.0 story #1</βtitle>632 <link href="{base_url}/βatom_1_0_story_1.html" /β>633 <link rel="alternate" type="text/βhtml" href="{base_url}/βatom_1_0_story_1.html?alt" /β>634 <link rel="edit" href="{base_url}/βatom_1_0_story_1.html?edit" /β>635 <id>{base_url}/βatom_1_0_story_1.html</βid>636 <updated>{pub_date}</βupdated>637 <summary>This is test atom 1.0 story #1.</βsummary>638 <content type="xhtml">639 <div xmlns="http:/β/βwww.w3.org/β1999/βxhtml">640 <p>This is test atom 1.0 story #1.</βp>641 </βdiv>642 </βcontent>643 <author>644 <name>John Doe</βname>645 <email>johndoe@example.com</βemail>646 </βauthor>647 </βentry>648 <entry>649 <title>Test Atom 1.0 story #2</βtitle>650 <link href="{base_url}/βatom_1_0_story_2.html" /β>651 <link rel="alternate" type="text/βhtml" href="{base_url}/βatom_1_0_story_2.html?alt" /β>652 <link rel="edit" href="{base_url}/βatom_1_0_story_2.html?edit" /β>653 
<id>{base_url}/βatom_1_0_story_2.html</βid>654 <updated>{pub_date}</βupdated>655 <summary>This is test atom 1.0 story #2.</βsummary>656 <content type="xhtml">657 <div xmlns="http:/β/βwww.w3.org/β1999/βxhtml">658 <p>This is test atom 1.0 story #2.</βp>659 </βdiv>660 </βcontent>661 <author>662 <name>John Doe</βname>663 <email>johndoe@example.com</βemail>664 </βauthor>665 </βentry>666 </βfeed>667 """.format(base_url=self.TEST_BASE_URL, pub_date=self.TEST_DATE_STR_ISO8601)).strip(),668 )669 expected_sitemap_tree = IndexWebsiteSitemap(670 url='{}/β'.format(self.TEST_BASE_URL),671 sub_sitemaps=[672 IndexRobotsTxtSitemap(673 url='{}/βrobots.txt'.format(self.TEST_BASE_URL),674 sub_sitemaps=[675 PagesRSSSitemap(676 url='{}/βsitemap_rss.xml'.format(self.TEST_BASE_URL),677 pages=[678 SitemapPage(679 url='{}/βrss_story_1.html'.format(self.TEST_BASE_URL),680 news_story=SitemapNewsStory(681 title='Test RSS 2.0 story #1',682 publish_date=self.TEST_DATE_DATETIME,683 ),684 ),685 SitemapPage(686 url='{}/βrss_story_2.html'.format(self.TEST_BASE_URL),687 news_story=SitemapNewsStory(688 title='Test RSS 2.0 story #2',689 publish_date=self.TEST_DATE_DATETIME,690 )691 )692 ]693 ),694 PagesAtomSitemap(695 url='{}/βsitemap_atom_0_3.xml'.format(self.TEST_BASE_URL),696 pages=[697 SitemapPage(698 url='{}/βatom_0_3_story_1.html'.format(self.TEST_BASE_URL),699 news_story=SitemapNewsStory(700 title='Test Atom 0.3 story #1',701 publish_date=self.TEST_DATE_DATETIME,702 ),703 ),704 SitemapPage(705 url='{}/βatom_0_3_story_2.html'.format(self.TEST_BASE_URL),706 news_story=SitemapNewsStory(707 title='Test Atom 0.3 story #2',708 publish_date=self.TEST_DATE_DATETIME,709 )710 )711 ]712 ),713 PagesAtomSitemap(714 url='{}/βsitemap_atom_1_0.xml'.format(self.TEST_BASE_URL),715 pages=[716 SitemapPage(717 url='{}/βatom_1_0_story_1.html'.format(self.TEST_BASE_URL),718 news_story=SitemapNewsStory(719 title='Test Atom 1.0 story #1',720 publish_date=self.TEST_DATE_DATETIME,721 ),722 ),723 SitemapPage(724 
url='{}/βatom_1_0_story_2.html'.format(self.TEST_BASE_URL),725 news_story=SitemapNewsStory(726 title='Test Atom 1.0 story #2',727 publish_date=self.TEST_DATE_DATETIME,728 )729 )730 ]731 ),732 ]733 )734 ]735 )736 actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL)737 expected_lines = str(expected_sitemap_tree).split()738 actual_lines = str(actual_sitemap_tree).split()739 diff = difflib.ndiff(expected_lines, actual_lines)740 diff_str = '\n'.join(diff)741 assert expected_sitemap_tree == actual_sitemap_tree, diff_str742 assert len(list(actual_sitemap_tree.all_pages())) == 6743 def test_sitemap_tree_for_homepage_rss_atom_empty(self):744 """Test sitemap_tree_for_homepage() with empty RSS 2.0 /β Atom 0.3 /β Atom 1.0 feeds."""745 with requests_mock.Mocker() as m:746 m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher)747 m.get(748 self.TEST_BASE_URL + '/β',749 text='This is a homepage.',750 )751 m.get(752 self.TEST_BASE_URL + '/βrobots.txt',753 headers={'Content-Type': 'text/βplain'},754 text=textwrap.dedent("""755 User-agent: *756 Disallow: /βwhatever757 Sitemap: {base_url}/βsitemap_rss.xml758 Sitemap: {base_url}/βsitemap_atom_0_3.xml759 Sitemap: {base_url}/βsitemap_atom_1_0.xml760 """.format(base_url=self.TEST_BASE_URL)).strip(),761 )762 # RSS 2.0 sitemap763 m.get(764 self.TEST_BASE_URL + '/βsitemap_rss.xml',765 headers={'Content-Type': 'application/βrss+xml'},766 text=textwrap.dedent("""767 <?xml version="1.0" encoding="UTF-8"?>768 <rss version="2.0">769 <channel>770 <title>Test RSS 2.0 feed</βtitle>771 <description>This is a test RSS 2.0 feed.</βdescription>772 <link>{base_url}</βlink>773 <pubDate>{pub_date}</βpubDate>774 </βchannel>775 </βrss>776 """.format(base_url=self.TEST_BASE_URL, pub_date=self.TEST_DATE_STR_RFC2822)).strip(),777 )778 # Atom 0.3 sitemap779 m.get(780 self.TEST_BASE_URL + '/βsitemap_atom_0_3.xml',781 headers={'Content-Type': 'application/βatom+xml'},782 text=textwrap.dedent("""783 <?xml version="1.0" 
encoding="UTF-8"?>784 <feed version="0.3" xmlns="http:/β/βpurl.org/βatom/βns#">785 <title>Test Atom 0.3 feed</βtitle>786 <link rel="alternate" type="text/βhtml" href="{base_url}" /β>787 <modified>{pub_date}</βmodified>788 </βfeed>789 """.format(base_url=self.TEST_BASE_URL, pub_date=self.TEST_DATE_STR_ISO8601)).strip(),790 )791 # Atom 1.0 sitemap792 m.get(793 self.TEST_BASE_URL + '/βsitemap_atom_1_0.xml',794 headers={'Content-Type': 'application/βatom+xml'},795 text=textwrap.dedent("""796 <?xml version="1.0" encoding="UTF-8"?>797 <feed xmlns="http:/β/βwww.w3.org/β2005/βAtom">798 <title>Test Atom 1.0 feed</βtitle>799 <subtitle>This is a test Atom 1.0 feed.</βsubtitle>800 <link href="{base_url}/βsitemap_atom_1_0.xml" rel="self" /β>801 <link href="{base_url}" /β>802 <id>{base_url}</βid>803 <updated>{pub_date}</βupdated>804 </βfeed>805 """.format(base_url=self.TEST_BASE_URL, pub_date=self.TEST_DATE_STR_ISO8601)).strip(),806 )807 expected_sitemap_tree = IndexWebsiteSitemap(808 url='{}/β'.format(self.TEST_BASE_URL),809 sub_sitemaps=[810 IndexRobotsTxtSitemap(811 url='{}/βrobots.txt'.format(self.TEST_BASE_URL),812 sub_sitemaps=[813 PagesRSSSitemap(814 url='{}/βsitemap_rss.xml'.format(self.TEST_BASE_URL),815 pages=[]816 ),817 PagesAtomSitemap(818 url='{}/βsitemap_atom_0_3.xml'.format(self.TEST_BASE_URL),819 pages=[]820 ),821 PagesAtomSitemap(822 url='{}/βsitemap_atom_1_0.xml'.format(self.TEST_BASE_URL),823 pages=[]824 ),825 ]826 )827 ]828 )829 actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL)830 assert expected_sitemap_tree == actual_sitemap_tree831 assert len(list(actual_sitemap_tree.all_pages())) == 0832 def test_sitemap_tree_for_homepage_prematurely_ending_xml(self):833 """Test sitemap_tree_for_homepage() with clipped XML.834 Some webservers are misconfigured to limit the request length to a certain number of seconds, in which time the835 server is unable to generate and compress a 50 MB sitemap XML. 
Google News doesn't seem to have a problem with836 this behavior, so we have to support this too.837 """838 with requests_mock.Mocker() as m:839 m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher)840 m.get(841 self.TEST_BASE_URL + '/β',842 text='This is a homepage.',843 )844 m.get(845 self.TEST_BASE_URL + '/βrobots.txt',846 headers={'Content-Type': 'text/βplain'},847 text=textwrap.dedent("""848 User-agent: *849 Disallow: /βwhatever850 851 Sitemap: {base_url}/βsitemap.xml852 """.format(base_url=self.TEST_BASE_URL)).strip(),853 )854 m.get(855 self.TEST_BASE_URL + '/βsitemap.xml',856 text=textwrap.dedent("""857 <?xml version="1.0" encoding="UTF-8"?>858 <urlset xmlns="http:/β/βwww.sitemaps.org/βschemas/βsitemap/β0.9"859 xmlns:news="http:/β/βwww.google.com/βschemas/βsitemap-news/β0.9">860 <url>861 <loc>{base_url}/βnews/βfirst.html</βloc>862 <news:news>863 <news:publication>864 <news:name>{publication_name}</βnews:name>865 <news:language>{publication_language}</βnews:language>866 </βnews:publication>867 <news:publication_date>{publication_date}</βnews:publication_date>868 <news:title>First story</βnews:title>869 </βnews:news>870 </βurl>871 <url>872 <loc>{base_url}/βnews/βsecond.html</βloc>873 <news:news>874 <news:publication>875 <news:name>{publication_name}</βnews:name>876 <news:language>{publication_language}</βnews:language>877 </βnews:publication>878 <news:publication_date>{publication_date}</βnews:publication_date>879 <news:title>Second story</βnews:title>880 </βnews:news>881 </βurl>882 883 <!-- The following story shouldn't get added as the XML ends prematurely -->884 <url>885 <loc>{base_url}/βnews/βthird.html</βloc>886 <news:news>887 <news:publication>888 <news:name>{publication_name}</βnews:name>889 <news:language>{publication_language}</βnews:language>890 </βnews:publication>891 <news:publicat892 """.format(893 base_url=self.TEST_BASE_URL,894 publication_name=self.TEST_PUBLICATION_NAME,895 publication_language=self.TEST_PUBLICATION_LANGUAGE,896 
publication_date=self.TEST_DATE_STR_ISO8601,897 )).strip(),898 )899 actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL)900 assert isinstance(actual_sitemap_tree, IndexWebsiteSitemap)901 assert len(actual_sitemap_tree.sub_sitemaps) == 1902 assert isinstance(actual_sitemap_tree.sub_sitemaps[0], IndexRobotsTxtSitemap)903 # noinspection PyUnresolvedReferences904 assert len(actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps) == 1905 # noinspection PyUnresolvedReferences906 sitemap = actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps[0]907 assert isinstance(sitemap, PagesXMLSitemap)908 assert len(sitemap.pages) == 2909 def test_sitemap_tree_for_homepage_no_sitemap(self):910 """Test sitemap_tree_for_homepage() with no sitemaps listed in robots.txt."""911 with requests_mock.Mocker() as m:912 m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher)913 m.get(914 self.TEST_BASE_URL + '/β',915 text='This is a homepage.',916 )917 m.get(918 self.TEST_BASE_URL + '/βrobots.txt',919 headers={'Content-Type': 'text/βplain'},920 text=textwrap.dedent("""921 User-agent: *922 Disallow: /βwhatever923 """.format(base_url=self.TEST_BASE_URL)).strip(),924 )925 expected_sitemap_tree = IndexWebsiteSitemap(926 url='{}/β'.format(self.TEST_BASE_URL),927 sub_sitemaps=[928 IndexRobotsTxtSitemap(929 url='{}/βrobots.txt'.format(self.TEST_BASE_URL),930 sub_sitemaps=[],931 )932 ]933 )934 actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL)935 assert expected_sitemap_tree == actual_sitemap_tree936 def test_sitemap_tree_for_homepage_unpublished_sitemap(self):937 """Test sitemap_tree_for_homepage() with some sitemaps not published in robots.txt."""938 with requests_mock.Mocker() as m:939 m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher)940 m.get(941 self.TEST_BASE_URL + '/β',942 text='This is a homepage.',943 )944 m.get(945 self.TEST_BASE_URL + '/βrobots.txt',946 headers={'Content-Type': 'text/βplain'},947 
text=textwrap.dedent("""948 User-agent: *949 Disallow: /βwhatever950 951 Sitemap: {base_url}/βsitemap_public.xml952 """.format(base_url=self.TEST_BASE_URL)).strip(),953 )954 # Public sitemap (linked to from robots.txt)955 m.get(956 self.TEST_BASE_URL + '/βsitemap_public.xml',957 text=textwrap.dedent("""958 <?xml version="1.0" encoding="UTF-8"?>959 <urlset xmlns="http:/β/βwww.sitemaps.org/βschemas/βsitemap/β0.9">960 <url>961 <loc>{base_url}/βnews/βpublic.html</βloc>962 </βurl>963 </βurlset>964 """.format(965 base_url=self.TEST_BASE_URL,966 publication_name=self.TEST_PUBLICATION_NAME,967 publication_language=self.TEST_PUBLICATION_LANGUAGE,968 publication_date=self.TEST_DATE_STR_ISO8601,969 )).strip(),970 )971 # Private sitemap (to be discovered by trying out a few paths)972 m.get(973 self.TEST_BASE_URL + '/βsitemap_index.xml',974 text=textwrap.dedent("""975 <?xml version="1.0" encoding="UTF-8"?>976 <urlset xmlns="http:/β/βwww.sitemaps.org/βschemas/βsitemap/β0.9">977 <url>978 <loc>{base_url}/βnews/βprivate.html</βloc>979 </βurl>980 </βurlset>981 """.format(982 base_url=self.TEST_BASE_URL,983 publication_name=self.TEST_PUBLICATION_NAME,984 publication_language=self.TEST_PUBLICATION_LANGUAGE,985 publication_date=self.TEST_DATE_STR_ISO8601,986 )).strip(),987 )988 expected_sitemap_tree = IndexWebsiteSitemap(989 url='{}/β'.format(self.TEST_BASE_URL),990 sub_sitemaps=[991 IndexRobotsTxtSitemap(992 url='{}/βrobots.txt'.format(self.TEST_BASE_URL),993 sub_sitemaps=[994 PagesXMLSitemap(995 url='{}/βsitemap_public.xml'.format(self.TEST_BASE_URL),996 pages=[997 SitemapPage(998 url='{}/βnews/βpublic.html'.format(self.TEST_BASE_URL),999 ),1000 ],1001 ),1002 ],1003 ),1004 PagesXMLSitemap(1005 url='{}/βsitemap_index.xml'.format(self.TEST_BASE_URL),1006 pages=[1007 SitemapPage(1008 url='{}/βnews/βprivate.html'.format(self.TEST_BASE_URL),1009 ),1010 ],1011 ),1012 ]1013 )1014 actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL)1015 assert 
expected_sitemap_tree == actual_sitemap_tree1016 def test_sitemap_tree_for_homepage_robots_txt_no_content_type(self):1017 """Test sitemap_tree_for_homepage() with no Content-Type in robots.txt."""1018 with requests_mock.Mocker() as m:1019 m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher)1020 m.get(1021 self.TEST_BASE_URL + '/β',1022 text='This is a homepage.',1023 )1024 m.get(1025 self.TEST_BASE_URL + '/βrobots.txt',1026 headers={'Content-Type': ''},1027 text=textwrap.dedent("""1028 User-agent: *1029 Disallow: /βwhatever1030 """.format(base_url=self.TEST_BASE_URL)).strip(),1031 )1032 expected_sitemap_tree = IndexWebsiteSitemap(1033 url='{}/β'.format(self.TEST_BASE_URL),1034 sub_sitemaps=[1035 IndexRobotsTxtSitemap(1036 url='{}/βrobots.txt'.format(self.TEST_BASE_URL),1037 sub_sitemaps=[],1038 )1039 ]1040 )1041 actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL)1042 assert expected_sitemap_tree == actual_sitemap_tree1043 def test_sitemap_tree_for_homepage_no_robots_txt(self):1044 """Test sitemap_tree_for_homepage() with no robots.txt."""1045 with requests_mock.Mocker() as m:1046 m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher)1047 m.get(1048 self.TEST_BASE_URL + '/β',1049 text='This is a homepage.',1050 )1051 # Nonexistent robots.txt1052 m.get(1053 self.TEST_BASE_URL + '/βrobots.txt',1054 status_code=404,1055 reason='Not Found',1056 headers={'Content-Type': 'text/βhtml'},1057 text="<h1>404 Not Found!</βh1>",1058 )1059 expected_sitemap_tree = IndexWebsiteSitemap(1060 url='{}/β'.format(self.TEST_BASE_URL),1061 sub_sitemaps=[1062 InvalidSitemap(1063 url='{}/βrobots.txt'.format(self.TEST_BASE_URL),1064 reason=(1065 'Unable to fetch sitemap from {base_url}/βrobots.txt: 404 Not Found'1066 ).format(base_url=self.TEST_BASE_URL),1067 )1068 ]1069 )1070 actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL)1071 assert expected_sitemap_tree == actual_sitemap_tree1072 def 
test_sitemap_tree_for_homepage_huge_sitemap(self):1073 """Test sitemap_tree_for_homepage() with a huge sitemap (mostly for profiling)."""1074 page_count = 10001075 sitemap_xml = """<?xml version="1.0" encoding="UTF-8"?>1076 <urlset xmlns="http:/β/βwww.sitemaps.org/βschemas/βsitemap/β0.9"1077 xmlns:news="http:/β/βwww.google.com/βschemas/βsitemap-news/β0.9"1078 xmlns:xhtml="http:/β/βwww.w3.org/β1999/βxhtml">1079 """1080 for x in range(page_count):1081 sitemap_xml += """1082 <url>1083 <loc>{base_url}/βnews/βpage_{x}.html</βloc>1084 <!-- Element present but empty -->1085 <lastmod /β>1086 <!-- Some other XML namespace -->1087 <xhtml:link rel="alternate"1088 media="only screen and (max-width: 640px)"1089 href="{base_url}/βnews/βpage_{x}.html?mobile=1" /β>1090 <news:news>1091 <news:publication>1092 <news:name>{publication_name}</βnews:name>1093 <news:language>{publication_language}</βnews:language>1094 </βnews:publication>1095 <news:publication_date>{publication_date}</βnews:publication_date>1096 <news:title>Foo <foo></βnews:title> <!-- HTML entity decoding -->1097 </βnews:news>1098 </βurl>1099 """.format(1100 x=x,1101 base_url=self.TEST_BASE_URL,1102 publication_name=self.TEST_PUBLICATION_NAME,1103 publication_language=self.TEST_PUBLICATION_LANGUAGE,1104 publication_date=self.TEST_DATE_STR_ISO8601,1105 )1106 sitemap_xml += "</βurlset>"1107 with requests_mock.Mocker() as m:1108 m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher)1109 m.get(1110 self.TEST_BASE_URL + '/β',1111 text='This is a homepage.',1112 )1113 m.get(1114 self.TEST_BASE_URL + '/βrobots.txt',1115 headers={'Content-Type': 'text/βplain'},1116 text=textwrap.dedent("""1117 User-agent: *1118 Disallow: /βwhatever1119 1120 Sitemap: {base_url}/βsitemap.xml.gz1121 """.format(base_url=self.TEST_BASE_URL)).strip(),1122 )1123 m.get(1124 self.TEST_BASE_URL + '/βsitemap.xml.gz',1125 headers={'Content-Type': 'application/βx-gzip'},1126 content=gzip(sitemap_xml),1127 )1128 actual_sitemap_tree = 
sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL)1129 assert len(list(actual_sitemap_tree.all_pages())) == page_count1130 def test_sitemap_tree_for_homepage_robots_txt_weird_spacing(self):1131 """Test sitemap_tree_for_homepage() with weird (but valid) spacing."""1132 with requests_mock.Mocker() as m:1133 m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher)1134 m.get(1135 self.TEST_BASE_URL + '/β',1136 text='This is a homepage.',1137 )1138 robots_txt_body = ""1139 robots_txt_body += "User-agent: *\n"1140 # Extra space before "Sitemap:", no space after "Sitemap:", and extra space after sitemap URL1141 robots_txt_body += " Sitemap:{base_url}/βsitemap.xml ".format(base_url=self.TEST_BASE_URL)1142 m.get(1143 self.TEST_BASE_URL + '/βrobots.txt',1144 headers={'Content-Type': 'text/βplain'},1145 text=robots_txt_body,1146 )1147 m.get(1148 self.TEST_BASE_URL + '/βsitemap.xml',1149 text=textwrap.dedent("""1150 <?xml version="1.0" encoding="UTF-8"?>1151 <urlset xmlns="http:/β/βwww.sitemaps.org/βschemas/βsitemap/β0.9"1152 xmlns:news="http:/β/βwww.google.com/βschemas/βsitemap-news/β0.9">1153 <url>1154 <loc>{base_url}/βnews/βfirst.html</βloc>1155 <news:news>1156 <news:publication>1157 <news:name>{publication_name}</βnews:name>1158 <news:language>{publication_language}</βnews:language>1159 </βnews:publication>1160 <news:publication_date>{publication_date}</βnews:publication_date>1161 <news:title>First story</βnews:title>1162 </βnews:news>1163 </βurl>1164 </βurlset>1165 """.format(1166 base_url=self.TEST_BASE_URL,1167 publication_name=self.TEST_PUBLICATION_NAME,1168 publication_language=self.TEST_PUBLICATION_LANGUAGE,1169 publication_date=self.TEST_DATE_STR_ISO8601,1170 )).strip(),1171 )1172 actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL)1173 assert len(list(actual_sitemap_tree.all_pages())) == 11174 def test_sitemap_tree_for_homepage_utf8_bom(self):1175 """Test sitemap_tree_for_homepage() with UTF-8 BOM in both robots.txt and 
sitemap."""1176 robots_txt_body = textwrap.dedent("""1177 User-agent: *1178 Disallow: /βwhatever1179 Sitemap: {base_url}/βsitemap.xml1180 """.format(base_url=self.TEST_BASE_URL)).strip()1181 sitemap_xml_body = textwrap.dedent("""1182 <?xml version="1.0" encoding="UTF-8"?>1183 <urlset xmlns="http:/β/βwww.sitemaps.org/βschemas/βsitemap/β0.9"1184 xmlns:news="http:/β/βwww.google.com/βschemas/βsitemap-news/β0.9">1185 <url>1186 <loc>{base_url}/βnews/βfirst.html</βloc>1187 <news:news>1188 <news:publication>1189 <news:name>{publication_name}</βnews:name>1190 <news:language>{publication_language}</βnews:language>1191 </βnews:publication>1192 <news:publication_date>{publication_date}</βnews:publication_date>1193 <news:title>First story</βnews:title>1194 </βnews:news>1195 </βurl>1196 </βurlset>1197 """.format(1198 base_url=self.TEST_BASE_URL,1199 publication_name=self.TEST_PUBLICATION_NAME,1200 publication_language=self.TEST_PUBLICATION_LANGUAGE,1201 publication_date=self.TEST_DATE_STR_ISO8601,1202 )).strip()1203 robots_txt_body_encoded = robots_txt_body.encode('utf-8-sig')1204 sitemap_xml_body_encoded = sitemap_xml_body.encode('utf-8-sig')1205 with requests_mock.Mocker() as m:1206 m.add_matcher(TestSitemapTree.fallback_to_404_not_found_matcher)1207 m.get(1208 self.TEST_BASE_URL + '/β',1209 text='This is a homepage.',1210 )1211 m.get(1212 self.TEST_BASE_URL + '/βrobots.txt',1213 headers={'Content-Type': 'text/βplain'},1214 content=robots_txt_body_encoded,1215 )1216 m.get(1217 self.TEST_BASE_URL + '/βsitemap.xml',1218 content=sitemap_xml_body_encoded,1219 )1220 actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL)...
combinations.py
Source:combinations.py
1'''Test cases arguments combinations.'''2import os3import sys4import inflection5TEST_DIR = os.path.abspath(os.path.dirname(__file__))6if TEST_DIR not in sys.path:7 sys.path.append(TEST_DIR)8from consts import TEMPDIR, TEST_BASE_URL # noqa: E4029from http_request_codegen.hrc_string import replace_multiple # noqa: E40210def argument_combination_to_filename(combination_name, index):11 return '{}.{}.expect.txt'.format(12 str(index).zfill(3),13 inflection.parameterize(14 replace_multiple(15 combination_name, replacements={16 '"': '-double-quote-',17 '\'': '-single-quote-',18 },19 ),20 ),21 )22def combination_arguments_to_kwargs(arguments):23 kwargs = {}24 for key, value in arguments.items():25 if key == 'kwargs':26 kwargs.update(value)27 else:28 kwargs[key] = value29 return kwargs30def get_argument_combinations(31 method='GET', include_filenames=True,32 dirpath=None,33):34 response = [35 {36 'name': 'URL',37 'arguments': {38 'url': TEST_BASE_URL,39 },40 },41 {42 'name': 'URL wrapping (no wrap)',43 'arguments': {44 'url': TEST_BASE_URL,45 'wrap': 99999,46 },47 },48 {49 'name': 'URL wrapping (wrap 15)',50 'arguments': {51 'url': TEST_BASE_URL,52 'wrap': 15,53 },54 },55 {56 'name': 'Parameter',57 'arguments': {58 'url': TEST_BASE_URL,59 'parameters': [60 {61 'name': 'param-1',62 'value': 'value-1',63 },64 ],65 },66 },67 {68 'name': 'Parameters',69 'arguments': {70 'url': TEST_BASE_URL,71 'parameters': [72 {73 'name': 'param-1',74 'value': 'foo',75 },76 {77 'name': 'param-2',78 'value': 1,79 },80 {81 'name': 'param-3',82 'value': .777,83 },84 {85 'name': 'param-4',86 'value': True,87 },88 ],89 },90 },91 {92 'name': 'Parameter wrapping value',93 'arguments': {94 'url': TEST_BASE_URL,95 'parameters': [96 {97 'name': 'param-1',98 'value': 'foo-bar-baz' * 50,99 },100 ],101 },102 },103 {104 'name': 'Parameters, one wrapping value',105 'arguments': {106 'url': TEST_BASE_URL,107 'parameters': [108 {109 'name': 'param-1',110 'value': 'foo-bar-baz' * 50,111 },112 {113 'name': 
'param-2',114 'value': 'value-2',115 },116 ],117 },118 },119 {120 'name': 'Parameter escaping quotes',121 'arguments': {122 'url': TEST_BASE_URL,123 'parameters': [124 {125 'name': 'param-1-with-\'\'-quotes',126 'value': 'value-1-with-\'\'-quotes',127 },128 ],129 },130 },131 {132 'name': 'URL + header',133 'arguments': {134 'url': TEST_BASE_URL,135 'headers': {136 'Content-Type': 'application/βjson',137 },138 },139 },140 {141 'name': 'URL + headers',142 'arguments': {143 'url': TEST_BASE_URL,144 'headers': {145 'Content-Type': 'application/βjson',146 'Accept-Language': 'es',147 },148 },149 },150 {151 'name': 'URL + header wrapping value',152 'arguments': {153 'url': TEST_BASE_URL,154 'headers': {155 'Content-Type': 'application/βjson' * 5,156 },157 },158 },159 {160 'name': 'URL + headers, one wrapping value',161 'arguments': {162 'url': TEST_BASE_URL,163 'headers': {164 'Content-Type': 'application/βjson' * 5,165 'Accept-Language': '*',166 },167 },168 },169 {170 'name': 'URL + header escaping quotes',171 'arguments': {172 'url': TEST_BASE_URL,173 'headers': {174 'Accept-Language': 'Header value with \'\' quotes',175 },176 },177 },178 {179 'name': 'URL + kwarg',180 'arguments': {181 'url': TEST_BASE_URL,182 'kwargs': {183 'timeout': 5,184 },185 },186 },187 {188 'name': 'URL + kwargs',189 'arguments': {190 'url': TEST_BASE_URL,191 'kwargs': {192 'timeout': 5,193 'stream': True,194 },195 },196 },197 {198 'name': 'URL + kwarg escaping quotes',199 'arguments': {200 'url': TEST_BASE_URL,201 'kwargs': {202 'cookies': {203 'foo': 'value with \'\' quotes',204 },205 },206 },207 },208 {209 'name': 'URL + kwarg wrapping value',210 'arguments': {211 'url': TEST_BASE_URL,212 'kwargs': {213 'cookies': {214 'bar': 'foo bar baz ' * 50,215 },216 },217 },218 },219 {220 'name': 'URL + kwargs, one wrapping value',221 'arguments': {222 'url': TEST_BASE_URL,223 'kwargs': {224 'cookies': {225 'bar': 'foo bar baz ' * 50,226 },227 'stream': True,228 },229 },230 },231 {232 'name': 'Parameter 
+ header',233 'arguments': {234 'url': TEST_BASE_URL,235 'parameters': [236 {237 'name': 'param-1',238 'value': 'value-1',239 },240 ],241 'headers': {242 'Content-Type': 'application/βjson',243 },244 },245 },246 {247 'name': 'Parameter + header (oneline)',248 'arguments': {249 'url': TEST_BASE_URL,250 'parameters': [251 {252 'name': 'param-1',253 'value': 'value-1',254 },255 ],256 'headers': {257 'Content-Type': 'application/βjson',258 },259 'oneline': True,260 },261 },262 {263 'name': 'Parameters + header',264 'arguments': {265 'url': TEST_BASE_URL,266 'parameters': [267 {268 'name': 'param-1',269 'value': 'value-1',270 },271 {272 'name': 'param-2',273 'value': 'value-2',274 },275 ],276 'headers': {277 'Content-Type': 'application/βjson',278 },279 },280 },281 {282 'name': 'Parameter + headers',283 'arguments': {284 'url': TEST_BASE_URL,285 'parameters': [286 {287 'name': 'param-1',288 'value': 'value-1',289 },290 ],291 'headers': {292 'Content-Type': 'application/βjson',293 'Accept-Language': '*',294 },295 },296 },297 {298 'name': 'Parameters + headers',299 'arguments': {300 'url': TEST_BASE_URL,301 'parameters': [302 {303 'name': 'param-1',304 'value': 'value-1',305 },306 {307 'name': 'param-2',308 'value': 'value-2',309 },310 ],311 'headers': {312 'Content-Type': 'application/βjson',313 'Accept-Language': '*',314 },315 },316 },317 {318 'name': 'Parameter + kwarg',319 'arguments': {320 'url': TEST_BASE_URL,321 'parameters': [322 {323 'name': 'param-1',324 'value': 'value-1',325 },326 ],327 'kwargs': {328 'timeout': 10,329 },330 },331 },332 {333 'name': 'Parameter + kwarg (oneline)',334 'arguments': {335 'url': TEST_BASE_URL,336 'parameters': [337 {338 'name': 'a',339 'value': 'b',340 },341 ],342 'kwargs': {343 'timeout': 10,344 },345 'oneline': True,346 },347 },348 {349 'name': 'Parameters + kwarg',350 'arguments': {351 'url': TEST_BASE_URL,352 'parameters': [353 {354 'name': 'param-1',355 'value': 'value-1',356 },357 {358 'name': 'param-2',359 'value': 
'value-2',360 },361 ],362 'kwargs': {363 'timeout': 10,364 },365 },366 },367 {368 'name': 'Parameter + kwargs',369 'arguments': {370 'url': TEST_BASE_URL,371 'parameters': [372 {373 'name': 'param-1',374 'value': 'value-1',375 },376 ],377 'kwargs': {378 'timeout': 10,379 'stream': True,380 },381 },382 },383 {384 'name': 'Parameters + kwargs',385 'arguments': {386 'url': TEST_BASE_URL,387 'parameters': [388 {389 'name': 'param-1',390 'value': 'value-1',391 },392 {393 'name': 'param-2',394 'value': 'value-2',395 },396 ],397 'kwargs': {398 'timeout': 10,399 'stream': True,400 },401 },402 },403 {404 'name': 'URL + header + kwarg',405 'arguments': {406 'url': TEST_BASE_URL,407 'headers': {408 'Content-Type': 'application/βjson',409 },410 'kwargs': {411 'timeout': 5,412 },413 },414 },415 {416 'name': 'URL + header + kwarg (oneline)',417 'arguments': {418 'url': TEST_BASE_URL,419 'headers': {420 'Content-Type': 'application/βjson',421 },422 'kwargs': {423 'timeout': 5,424 },425 'oneline': True,426 },427 },428 {429 'name': 'URL + headers + kwarg',430 'arguments': {431 'url': TEST_BASE_URL,432 'headers': {433 'Content-Type': 'application/βjson',434 'Accept-Language': '*',435 },436 'kwargs': {437 'timeout': 5,438 },439 },440 },441 {442 'name': 'URL + header + kwargs',443 'arguments': {444 'url': TEST_BASE_URL,445 'headers': {446 'Accept-Language': '*',447 },448 'kwargs': {449 'timeout': 5,450 'stream': False,451 },452 },453 },454 {455 'name': 'URL + headers + kwargs',456 'arguments': {457 'url': TEST_BASE_URL,458 'headers': {459 'Content-Type': 'application/βjson',460 'Accept-Language': '*',461 },462 'kwargs': {463 'timeout': 5,464 'stream': False,465 },466 },467 },468 {469 'name': 'Parameter + header + kwarg',470 'arguments': {471 'url': TEST_BASE_URL,472 'parameters': [473 {474 'name': 'param-1',475 'value': 'value-1',476 },477 ],478 'headers': {479 'Content-Type': 'application/βjson',480 },481 'kwargs': {482 'timeout': 5,483 },484 },485 },486 {487 'name': 'Parameter + 
header + kwargs',488 'arguments': {489 'url': TEST_BASE_URL,490 'parameters': [491 {492 'name': 'param-1',493 'value': 'value-1',494 },495 ],496 'headers': {497 'Content-Type': 'application/βjson',498 },499 'kwargs': {500 'timeout': 5,501 'stream': True,502 },503 },504 },505 {506 'name': 'Parameters + header + kwarg',507 'arguments': {508 'url': TEST_BASE_URL,509 'parameters': [510 {511 'name': 'param-1',512 'value': 'value-1',513 },514 {515 'name': 'param-2',516 'value': 7.77,517 },518 ],519 'headers': {520 'Content-Type': 'application/βjson',521 },522 'kwargs': {523 'timeout': 5,524 },525 },526 },527 {528 'name': 'Parameters + header + kwargs',529 'arguments': {530 'url': TEST_BASE_URL,531 'parameters': [532 {533 'name': 'param-1',534 'value': 'value-1',535 },536 {537 'name': 'param-2',538 'value': 7.77,539 },540 ],541 'headers': {542 'Content-Type': 'application/βjson',543 },544 'kwargs': {545 'timeout': 5,546 'stream': False,547 },548 },549 },550 {551 'name': 'Parameters + headers + kwarg',552 'arguments': {553 'url': TEST_BASE_URL,554 'parameters': [555 {556 'name': 'param-1',557 'value': 'value-1',558 },559 {560 'name': 'param-2',561 'value': 7.77,562 },563 ],564 'headers': {565 'Content-Type': 'application/βjson',566 'Accept-Language': 'fr',567 },568 'kwargs': {569 'timeout': 5,570 },571 },572 },573 {574 'name': 'Parameters + headers + kwargs',575 'arguments': {576 'url': TEST_BASE_URL,577 'parameters': [578 {579 'name': 'param-1',580 'value': 'value-1',581 },582 {583 'name': 'param-2',584 'value': 7.77,585 },586 ],587 'headers': {588 'Content-Type': 'application/βjson',589 'Accept-Language': 'fr',590 },591 'kwargs': {592 'timeout': 5,593 'stream': True,594 },595 },596 },597 {598 'name': 'Setup',599 'arguments': {600 'url': TEST_BASE_URL,601 'setup': True,602 },603 },604 {605 'name': 'No setup',606 'arguments': {607 'url': TEST_BASE_URL,608 'setup': False,609 },610 },611 {612 'name': 'Custom setup',613 'arguments': {614 'url': TEST_BASE_URL,615 'setup': 
'custom_setup=1\n\n',616 },617 },618 {619 'name': 'Custom teardown',620 'arguments': {621 'url': TEST_BASE_URL,622 'teardown': '\n\ncustom_teardown=1',623 },624 },625 {626 'name': 'Quote character \'',627 'arguments': {628 'url': TEST_BASE_URL,629 'quote_char': '\'',630 },631 },632 {633 'name': 'Quote character "',634 'arguments': {635 'url': TEST_BASE_URL,636 'quote_char': '"',637 },638 },639 {640 'name': 'Indent 2 spaces',641 'arguments': {642 'url': TEST_BASE_URL,643 'indent': ' ',644 'headers': {645 'Accept-Language': 'es en fr * ' * 20,646 },647 },648 },649 {650 'name': 'Indent 4 spaces',651 'arguments': {652 'url': TEST_BASE_URL,653 'indent': ' ',654 'headers': {655 'Accept-Language': 'es en fr * ' * 20,656 },657 },658 },659 {660 'name': 'One line',661 'arguments': {662 'url': TEST_BASE_URL,663 'oneline': True,664 },665 },666 {667 'name': 'One line + no setup',668 'arguments': {669 'url': TEST_BASE_URL,670 'oneline': True,671 'setup': False,672 },673 },674 {675 'name': 'Wrap 0',676 'arguments': {677 'url': TEST_BASE_URL,678 'wrap': 0,679 },680 },681 {682 'name': 'Wrap 1',683 'arguments': {684 'url': TEST_BASE_URL,685 'wrap': 1,686 },687 },688 {689 'name': 'Wrap 10',690 'arguments': {691 'url': TEST_BASE_URL,692 'wrap': 10,693 },694 },695 {696 'name': 'Wrap 20',697 'arguments': {698 'url': TEST_BASE_URL,699 'wrap': 20,700 },701 },702 {703 'name': 'Wrap 25',704 'arguments': {705 'url': TEST_BASE_URL,706 'wrap': 25,707 },708 },709 {710 'name': 'Wrap 30',711 'arguments': {712 'url': TEST_BASE_URL,713 'wrap': 30,714 },715 },716 {717 'name': 'Wrap 35',718 'arguments': {719 'url': TEST_BASE_URL,720 'wrap': 35,721 },722 },723 {724 'name': 'Wrap 40',725 'arguments': {726 'url': TEST_BASE_URL,727 'wrap': 40,728 },729 },730 {731 'name': 'Wrap infinite',732 'arguments': {733 'url': TEST_BASE_URL,734 'wrap': float('inf'),735 },736 },737 {738 'name': 'Wrap null is infinite',739 'arguments': {740 'url': TEST_BASE_URL,741 'wrap': None,742 },743 },744 ]745 if method.lower() 
== 'post':746 response.extend([747 {748 'name': 'Data by parameter (text/βplain)',749 'arguments': {750 'url': TEST_BASE_URL,751 'parameters': [752 {753 'name': '',754 'value': 'foo bar baz ' * 3,755 },756 ],757 'headers': {758 'Content-Type': 'text/βplain',759 },760 },761 },762 {763 'name': 'Data by parameter (text/βplain) wrapping value',764 'arguments': {765 'url': TEST_BASE_URL,766 'parameters': [767 {768 'name': '',769 'value': 'foo bar baz ' * 30,770 },771 ],772 'headers': {773 'Content-Type': 'text/βplain',774 },775 },776 },777 {778 'name': 'Data by parameter (application/βjson)',779 'arguments': {780 'url': TEST_BASE_URL,781 'parameters': [782 {783 'name': 'param-1',784 'value': 'value-1',785 },786 ],787 'headers': {788 'Content-Type': 'application/βjson',789 },790 },791 },792 {793 'name': 'Data by parameters (application/βjson)',794 'arguments': {795 'url': TEST_BASE_URL,796 'parameters': [797 {798 'name': 'param-int',799 'value': 1,800 },801 {802 'name': 'param-float',803 'value': .777,804 },805 {806 'name': 'param-bool',807 'value': True,808 },809 ],810 'headers': {811 'Content-Type': 'application/βjson',812 },813 },814 },815 {816 'name': (817 'Data by parameter'818 ' (application/βx-www-form-urlencoded)'819 ),820 'arguments': {821 'url': TEST_BASE_URL,822 'parameters': [823 {824 'name': 'param-1',825 'value': 'value-1',826 },827 ],828 'headers': {829 'Content-Type': 'application/βx-www-form-urlencoded',830 },831 },832 },833 {834 'name': (835 'Data by parameters'836 ' (application/βx-www-form-urlencoded)'837 ),838 'arguments': {839 'url': TEST_BASE_URL,840 'parameters': [841 {842 'name': 'param-int',843 'value': 1,844 },845 {846 'name': 'param-float',847 'value': .777,848 },849 {850 'name': 'param-bool',851 'value': True,852 },853 ],854 'headers': {855 'Content-Type': 'application/βx-www-form-urlencoded',856 },857 },858 },859 {860 'name': 'File by filepath (multipart/βform-data)',861 'arguments': {862 'url': TEST_BASE_URL,863 'files': {864 'param-1': 
os.path.join(TEMPDIR, 'file-1.ext'),865 },866 },867 },868 {869 'name': 'Files by filepath (multipart/βform-data)',870 'arguments': {871 'url': TEST_BASE_URL,872 'files': {873 'param-1': os.path.join(TEMPDIR, 'file-1.ext'),874 'param-2': os.path.join(TEMPDIR, 'file-2.ext'),875 },876 },877 },878 {879 'name': 'File by filepath (multipart/βform-data) wrapping',880 'arguments': {881 'url': TEST_BASE_URL,882 'files': {883 'param-1': os.path.join(884 TEMPDIR, '%s.ext' % ('foo' * 40),885 ),886 },887 },888 },889 {890 'name': (891 'Files by filepath (multipart/βform-data)'892 ' with Content-Type'893 ),894 'arguments': {895 'url': TEST_BASE_URL,896 'files': {897 'param-1': (898 os.path.join(TEMPDIR, 'file-1.ext'),899 'text/βplain',900 ),901 'param-2': (902 os.path.join(TEMPDIR, 'file-2.ext'),903 'text/βcsv',904 ),905 },906 },907 },908 {909 'name': (910 'File by filepath (multipart/βform-data)'911 ' with Content-Type wrapping'912 ),913 'arguments': {914 'url': TEST_BASE_URL,915 'files': {916 'param-1': (917 os.path.join(TEMPDIR, 'file-1.ext'),918 'text/βplain ' * 20,919 ),920 },921 },922 },923 {924 'name': (925 'File by filepath (multipart/βform-data),'926 ' Content-Type, header'927 ),928 'arguments': {929 'url': TEST_BASE_URL,930 'files': {931 'param-1': (932 os.path.join(TEMPDIR, 'file-1.ext'),933 'text/βplain',934 {'Accept-Language': 'es'},935 ),936 },937 },938 },939 {940 'name': (941 'File by filepath (multipart/βform-data),'942 ' Content-Type, headers'943 ),944 'arguments': {945 'url': TEST_BASE_URL,946 'files': {947 'param-1': (948 os.path.join(TEMPDIR, 'file-1.ext'),949 'text/βplain',950 {951 'Accept-Language': 'es',952 'Accept-Charset': 'utf-8',953 },954 ),955 },956 },957 },958 {959 'name': 'Files by filepath (multipart/βform-data) + parameter',960 'arguments': {961 'url': TEST_BASE_URL,962 'files': {963 'param-1': os.path.join(TEMPDIR, 'file-1.ext'),964 'param-2': os.path.join(TEMPDIR, 'file-2.ext'),965 },966 'parameters': [967 {968 'name': 'param-1',969 'value': 
'value-1',970 },971 ],972 },973 },974 {975 'name': 'Files by filepath (multipart/βform-data) + parameters',976 'arguments': {977 'url': TEST_BASE_URL,978 'files': {979 'param-1': os.path.join(TEMPDIR, 'file-1.ext'),980 'param-2': os.path.join(TEMPDIR, 'file-2.ext'),981 },982 'parameters': [983 {984 'name': 'param-1',985 'value': 'value-1',986 },987 {988 'name': 'param-2',989 'value': 'value-2',990 },991 ],992 },993 },994 {995 'name': (996 'Files by filepath (multipart/βform-data) + parameter'997 ' + header'998 ),999 'arguments': {1000 'url': TEST_BASE_URL,1001 'files': {1002 'param-1': os.path.join(TEMPDIR, 'file-1.ext'),1003 'param-2': os.path.join(TEMPDIR, 'file-2.ext'),1004 },1005 'parameters': [1006 {1007 'name': 'param-1',1008 'value': 'value-1',1009 },1010 ],1011 'headers': {1012 'Accept-Language': 'fr',1013 },1014 },1015 },1016 {1017 'name': (1018 'Files by filepath (multipart/βform-data) + parameter'1019 ' + headers'1020 ),1021 'arguments': {1022 'url': TEST_BASE_URL,1023 'files': {1024 'param-1': os.path.join(TEMPDIR, 'file-1.ext'),1025 'param-2': os.path.join(TEMPDIR, 'file-2.ext'),1026 },1027 'parameters': [1028 {1029 'name': 'param-1',1030 'value': 'value-1',1031 },1032 ],1033 'headers': {1034 'Accept-Language': 'fr',1035 'Accept-Charset': 'utf-8',1036 },1037 },1038 },1039 {1040 'name': (1041 'Files by filepath (multipart/βform-data) + parameters'1042 ' + header'1043 ),1044 'arguments': {1045 'url': TEST_BASE_URL,1046 'files': {1047 'param-1': os.path.join(TEMPDIR, 'file-1.ext'),1048 'param-2': os.path.join(TEMPDIR, 'file-2.ext'),1049 },1050 'parameters': [1051 {1052 'name': 'param-1',1053 'value': 'value-1',1054 },1055 {1056 'name': 'param-2',1057 'value': 'value-2',1058 },1059 ],1060 'headers': {1061 'Accept-Language': 'es',1062 },1063 },1064 },1065 {1066 'name': (1067 'Files by filepath (multipart/βform-data) + parameters'1068 ' + headers'1069 ),1070 'arguments': {1071 'url': TEST_BASE_URL,1072 'files': {1073 'param-1': os.path.join(TEMPDIR, 
'file-1.ext'),1074 'param-2': os.path.join(TEMPDIR, 'file-2.ext'),1075 },1076 'parameters': [1077 {1078 'name': 'param-1',1079 'value': 'value-1',1080 },1081 {1082 'name': 'param-2',1083 'value': 'value-2',1084 },1085 ],1086 'headers': {1087 'Accept-Language': 'fr',1088 'Accept-Charset': 'utf-8',1089 },1090 },1091 },1092 {1093 'name': (1094 'Files by filepath (multipart/βform-data) + parameter'1095 ' + header + kwarg'1096 ),1097 'arguments': {1098 'url': TEST_BASE_URL,1099 'files': {1100 'param-1': os.path.join(TEMPDIR, 'file-1.ext'),1101 'param-2': os.path.join(TEMPDIR, 'file-2.ext'),1102 },1103 'parameters': [1104 {1105 'name': 'param-1',1106 'value': 'value-1',1107 },1108 ],1109 'headers': {1110 'Accept-Language': 'fr',1111 },1112 'kwargs': {1113 'timeout': 10,1114 },1115 },1116 },1117 {1118 'name': (1119 'Files by filepath (multipart/βform-data) + parameter'1120 ' + headers + kwarg'1121 ),1122 'arguments': {1123 'url': TEST_BASE_URL,1124 'files': {1125 'param-1': os.path.join(TEMPDIR, 'file-1.ext'),1126 'param-2': os.path.join(TEMPDIR, 'file-2.ext'),1127 },1128 'parameters': [1129 {1130 'name': 'param-1',1131 'value': 'value-1',1132 },1133 ],1134 'headers': {1135 'Accept-Language': 'fr',1136 'Accept-Charset': 'utf-8',1137 },1138 'kwargs': {1139 'timeout': 10,1140 },1141 },1142 },1143 {1144 'name': (1145 'Files by filepath (multipart/βform-data) + parameters'1146 ' + header + kwarg'1147 ),1148 'arguments': {1149 'url': TEST_BASE_URL,1150 'files': {1151 'param-1': os.path.join(TEMPDIR, 'file-1.ext'),1152 'param-2': os.path.join(TEMPDIR, 'file-2.ext'),1153 },1154 'parameters': [1155 {1156 'name': 'param-1',1157 'value': 'value-1',1158 },1159 {1160 'name': 'param-2',1161 'value': 'value-2',1162 },1163 ],1164 'headers': {1165 'Accept-Language': 'fr',1166 },1167 'kwargs': {1168 'timeout': 10,1169 },1170 },1171 },1172 {1173 'name': (1174 'Files by filepath (multipart/βform-data) + parameters'1175 ' + headers + kwarg'1176 ),1177 'arguments': {1178 'url': 
TEST_BASE_URL,1179 'files': {1180 'param-1': os.path.join(TEMPDIR, 'file-1.ext'),1181 'param-2': os.path.join(TEMPDIR, 'file-2.ext'),1182 },1183 'parameters': [1184 {1185 'name': 'param-1',1186 'value': 'value-1',1187 },1188 {1189 'name': 'param-2',1190 'value': 'value-2',1191 },1192 ],1193 'headers': {1194 'Accept-Language': 'fr',1195 'Accept-Charset': 'utf-8',1196 },1197 'kwargs': {1198 'timeout': 10,1199 },1200 },1201 },1202 {1203 'name': (1204 'Files by filepath (multipart/βform-data) + parameter'1205 ' + header + kwargs'1206 ),1207 'arguments': {1208 'url': TEST_BASE_URL,1209 'files': {1210 'param-1': os.path.join(TEMPDIR, 'file-1.ext'),1211 'param-2': os.path.join(TEMPDIR, 'file-2.ext'),1212 },1213 'parameters': [1214 {1215 'name': 'param-1',1216 'value': 'value-1',1217 },1218 ],1219 'headers': {1220 'Accept-Language': 'fr',1221 },1222 'kwargs': {1223 'timeout': 10,1224 'cookies': {1225 'hello': 'world',1226 },1227 },1228 },1229 },1230 {1231 'name': (1232 'Files by filepath (multipart/βform-data) + parameter'1233 ' + headers + kwargs'1234 ),1235 'arguments': {1236 'url': TEST_BASE_URL,1237 'files': {1238 'param-1': os.path.join(TEMPDIR, 'file-1.ext'),1239 'param-2': os.path.join(TEMPDIR, 'file-2.ext'),1240 },1241 'parameters': [1242 {1243 'name': 'param-1',1244 'value': 'value-1',1245 },1246 ],1247 'headers': {1248 'Accept-Language': 'fr',1249 'Accept-Charset': 'utf-8',1250 },1251 'kwargs': {1252 'timeout': 10,1253 'stream': False,1254 },1255 },1256 },1257 {1258 'name': (1259 'Files by filepath (multipart/βform-data) + parameters'1260 ' + header + kwargs'1261 ),1262 'arguments': {1263 'url': TEST_BASE_URL,1264 'files': {1265 'param-1': os.path.join(TEMPDIR, 'file-1.ext'),1266 'param-2': os.path.join(TEMPDIR, 'file-2.ext'),1267 },1268 'parameters': [1269 {1270 'name': 'param-1',1271 'value': 'value-1',1272 },1273 {1274 'name': 'param-2',1275 'value': 'value-2',1276 },1277 ],1278 'headers': {1279 'Accept-Language': 'fr',1280 },1281 'kwargs': {1282 'timeout': 
10,1283 'stream': False,1284 },1285 },1286 },1287 {1288 'name': (1289 'Files by filepath (multipart/βform-data) + parameters'1290 ' + headers + kwargs'1291 ),1292 'arguments': {1293 'url': TEST_BASE_URL,1294 'files': {1295 'param-1': os.path.join(TEMPDIR, 'file-1.ext'),1296 'param-2': os.path.join(TEMPDIR, 'file-2.ext'),1297 },1298 'parameters': [1299 {1300 'name': 'param-1',1301 'value': 'value-1',1302 },1303 {1304 'name': 'param-2',1305 'value': 'value-2',1306 },1307 ],1308 'headers': {1309 'Accept-Language': 'fr',1310 'Accept-Charset': 'utf-8',1311 },1312 'kwargs': {1313 'timeout': 10,1314 'stream': False,1315 },1316 },1317 },1318 {1319 'name': (1320 'No setup + files by filepath (multipart/βform-data)'1321 ' + parameters + headers + kwargs + '1322 ),1323 'arguments': {1324 'url': TEST_BASE_URL,1325 'setup': False,1326 'files': {1327 'param-1': os.path.join(TEMPDIR, 'file-1.ext'),1328 'param-2': os.path.join(TEMPDIR, 'file-2.ext'),1329 },1330 'parameters': [1331 {1332 'name': 'param-1',1333 'value': 'value-1',1334 },1335 {1336 'name': 'param-2',1337 'value': 'value-2',1338 },1339 ],1340 'headers': {1341 'Accept-Language': 'fr',1342 'Accept-Charset': 'utf-8',1343 },1344 'kwargs': {1345 'timeout': 10,1346 'stream': False,1347 },1348 },1349 },1350 ])1351 if include_filenames:1352 for index, args_group in enumerate(response):1353 fname = argument_combination_to_filename(1354 args_group['name'], index,1355 )1356 if dirpath and os.path.exists(dirpath):1357 fname = os.path.join(dirpath, fname)1358 args_group['filename'] = fname...
test_logoscraper.py
Source:test_logoscraper.py
"""Tests for the logo-scraping helpers in ``scraper.logoscraper``."""
from bs4 import BeautifulSoup

import scraper.logoscraper as logoscraper

# Base URL passed to every logoscraper helper under test.
test_base_url = "https://www.testbase.com"

# NOTE(review): several fixtures below interpolate ``test_base_url`` after a
# literal "https://", producing "https://https://www.testbase.com". This looks
# unintentional but is kept as-is because the assertions may depend on it.


def test_get_logo_should_pass():
    """Check that get_logo() yields the expected image URL for each snippet."""
    htmls = [
        "<div class='logo'><img src='https://www.test.com'></img></div>",
        "<div id='logo'><img src='https://www.test.com'></img></div>",
        "<a><img src='https://www.test.com'></img></a>",
        "<div><img src='https://www.test.com'></img></div>",
        # NOTE(review): the <a> element is closed with </div>; presumably
        # deliberate malformed markup -- confirm.
        f"<a href={test_base_url}><img src='https://www.test.com'></img></div>",
    ]
    for html in htmls:
        assert (
            logoscraper.get_logo(BeautifulSoup(html, "html.parser"), test_base_url)
            == "https://www.test.com"
        )


def test_find_image_tag():
    """find_img_tag() gives a truthy result for an <img> nested in an <a>."""
    result = logoscraper.find_img_tag(
        BeautifulSoup(
            f'<a><img src="https://{test_base_url}"></img></a>', "html.parser"
        ),
        test_base_url,
    )
    assert result


def test_find_image_tag_return_itself():
    """find_img_tag() gives a truthy result for a bare <img> element."""
    result = logoscraper.find_img_tag(
        BeautifulSoup(f'<img src="https://{test_base_url}"></img>', "html.parser"),
        test_base_url,
    )
    assert result


def test_find_image_tag_retrun_none():
    """find_img_tag() gives None when the markup contains no <img>."""
    result = logoscraper.find_img_tag(
        BeautifulSoup(f'<a href="https://{test_base_url}"></a>', "html.parser"),
        test_base_url,
    )
    assert result is None


def test_format_image_source():
    """format_image_source() returns the src value when it equals the base URL."""
    html = BeautifulSoup(f"<img src='{test_base_url}'></img>", "html.parser").find(
        "img"
    )
    assert logoscraper.format_image_source(html, test_base_url) == test_base_url


def test_format_image_source_no_source():
    """format_image_source() returns None for an <img> without a src."""
    html = BeautifulSoup("<img></img>", "html.parser").find("img")
    assert logoscraper.format_image_source(html, test_base_url) is None


def test_format_image_source_relative_path():
    """format_image_source() prefixes a relative src with the base URL."""
    html = BeautifulSoup(
        "<img src='resources/images/image.png'></img>", "html.parser"
    ).find("img")
    assert (
        logoscraper.format_image_source(html, test_base_url)
        == f"{test_base_url}/resources/images/image.png"
    )
    # NOTE(review): the source listing is truncated after this assertion; any
    # further tests in the original module are not visible here.
Learn to execute automation testing from scratch with the LambdaTest Learning Hub, from setting up the prerequisites and running your first automation test to following best practices and diving deeper into advanced test scenarios. The LambdaTest Learning Hubs compile step-by-step guides to help you become proficient with different test automation frameworks such as Selenium, Cypress, and TestNG.
You can also refer to the video tutorials on the LambdaTest YouTube channel for step-by-step demonstrations from industry experts.
Get 100 automation test minutes FREE!!