From a74f46dc53a1bd55f828f5e6dc7268d28305980a Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Fri, 21 Dec 2018 15:17:31 -0800
Subject: [PATCH] least surprise on http/https seed redirects

if http://foo.com/ redirects to https://foo.com/a/b/c let's also put all
of https://foo.com/ in scope
---
Note: this copy was restored from a whitespace-collapsed paste. Leading
indentation, blank lines and commit-message wrapping were inferred from
the hunk line counts and Python syntax, and the From: header carries no
address in this copy. Run `git apply --check` before applying.

 brozzler/model.py   | 13 ++++++++++++-
 tests/test_units.py | 19 +++++++++++++++++++
 2 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/brozzler/model.py b/brozzler/model.py
index e10a712..9832a40 100644
--- a/brozzler/model.py
+++ b/brozzler/model.py
@@ -242,8 +242,19 @@ class Site(doublethink.Document, ElapsedMixIn):
             self.scope["accepts"].append({"ssurt": ssurt})
 
     def note_seed_redirect(self, url):
+        canon_seed_redirect = brozzler.site_surt_canon(url)
+        canon_seed = brozzler.site_surt_canon(self.seed)
+
+        # if http://foo.com/ redirects to https://foo.com/a/b/c let's also
+        # put all of https://foo.com/ in scope
+        if (canon_seed_redirect.authority == canon_seed.authority
+                and canon_seed_redirect.scheme != canon_seed.scheme):
+            canon_seed.scheme = canon_seed_redirect.scheme
+            self._accept_ssurt_if_not_redundant(
+                    canon_seed.ssurt().decode('ascii'))
+
         self._accept_ssurt_if_not_redundant(
-                brozzler.site_surt_canon(url).ssurt().decode('ascii'))
+                canon_seed_redirect.ssurt().decode('ascii'))
 
     def extra_headers(self):
         hdrs = {}
diff --git a/tests/test_units.py b/tests/test_units.py
index 4a91e0c..1d62bc6 100644
--- a/tests/test_units.py
+++ b/tests/test_units.py
@@ -420,3 +420,22 @@ def test_needs_browsing():
     assert not brozzler.worker.BrozzlerWorker._needs_browsing(
             None, page, spy.fetches)
 
+def test_seed_redirect():
+    site = brozzler.Site(None, {'seed': 'http://foo.com/'})
+    site.note_seed_redirect('https://foo.com/a/b/c')
+    assert site.scope == {'accepts': [
+        {'ssurt': 'com,foo,//http:/',},
+        {'ssurt': 'com,foo,//https:/',}]}
+
+    site = brozzler.Site(None, {'seed': 'https://foo.com/'})
+    site.note_seed_redirect('http://foo.com/a/b/c')
+    assert site.scope == {'accepts': [
+        {'ssurt': 'com,foo,//https:/',},
+        {'ssurt': 'com,foo,//http:/',}]}
+
+    site = brozzler.Site(None, {'seed': 'http://foo.com/'})
+    site.note_seed_redirect('https://bar.com/a/b/c')
+    assert site.scope == {'accepts': [
+        {'ssurt': 'com,foo,//http:/',},
+        {'ssurt': 'com,bar,//https:/a/b/c',}]}
+