diff --git a/brozzler/model.py b/brozzler/model.py
index e10a712..9832a40 100644
--- a/brozzler/model.py
+++ b/brozzler/model.py
@@ -242,8 +242,29 @@ class Site(doublethink.Document, ElapsedMixIn):
             self.scope["accepts"].append({"ssurt": ssurt})
 
     def note_seed_redirect(self, url):
+        """
+        Offers the ssurt of the url the seed redirected to for acceptance
+        into scope, via `_accept_ssurt_if_not_redundant`.
+
+        When the redirect keeps the seed's authority but uses a different
+        scheme (http://foo.com/ redirecting to https://foo.com/a/b/c), the
+        seed's own ssurt under the redirect's scheme is offered first, so
+        that all of https://foo.com/ is put in scope rather than only the
+        redirect target.
+        """
+        redirect = brozzler.site_surt_canon(url)
+        seed = brozzler.site_surt_canon(self.seed)
+
+        scheme_switched = (
+                redirect.authority == seed.authority
+                and redirect.scheme != seed.scheme)
+        if scheme_switched:
+            # swap the redirect's scheme into the seed before taking its ssurt
+            seed.scheme = redirect.scheme
+            self._accept_ssurt_if_not_redundant(
+                    seed.ssurt().decode('ascii'))
+
         self._accept_ssurt_if_not_redundant(
-                brozzler.site_surt_canon(url).ssurt().decode('ascii'))
+                redirect.ssurt().decode('ascii'))
 
     def extra_headers(self):
         hdrs = {}
diff --git a/tests/test_units.py b/tests/test_units.py
index 4a91e0c..1d62bc6 100644
--- a/tests/test_units.py
+++ b/tests/test_units.py
@@ -420,3 +420,24 @@ def test_needs_browsing():
     assert not brozzler.worker.BrozzlerWorker._needs_browsing(
             None, page, spy.fetches)
 
+def test_seed_redirect():
+    """
+    Checks `site.scope` after `note_seed_redirect` for three seed/redirect
+    pairs: http to https and https to http on the same authority, and a
+    redirect to a different authority.
+    """
+    # (seed, url redirected to, ssurts expected in scope['accepts'], in order)
+    cases = [
+        ('http://foo.com/', 'https://foo.com/a/b/c',
+            ['com,foo,//http:/', 'com,foo,//https:/']),
+        ('https://foo.com/', 'http://foo.com/a/b/c',
+            ['com,foo,//https:/', 'com,foo,//http:/']),
+        ('http://foo.com/', 'https://bar.com/a/b/c',
+            ['com,foo,//http:/', 'com,bar,//https:/a/b/c']),
+    ]
+    for seed, redirect_url, expected_ssurts in cases:
+        site = brozzler.Site(None, {'seed': seed})
+        site.note_seed_redirect(redirect_url)
+        assert site.scope == {
+                'accepts': [{'ssurt': ssurt} for ssurt in expected_ssurts]}
+