mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-07-20 13:38:49 -04:00
least surprise on http/https seed redirects
If http://foo.com/ redirects to https://foo.com/a/b/c, let's also put all of https://foo.com/ in scope.
This commit is contained in:
parent
6b8e597a43
commit
a74f46dc53
2 changed files with 31 additions and 1 deletions
|
@@ -242,8 +242,19 @@ class Site(doublethink.Document, ElapsedMixIn):
|
||||||
self.scope["accepts"].append({"ssurt": ssurt})
|
self.scope["accepts"].append({"ssurt": ssurt})
|
||||||
|
|
||||||
def note_seed_redirect(self, url):
|
def note_seed_redirect(self, url):
|
||||||
|
canon_seed_redirect = brozzler.site_surt_canon(url)
|
||||||
|
canon_seed = brozzler.site_surt_canon(self.seed)
|
||||||
|
|
||||||
|
# if http://foo.com/ redirects to https://foo.com/a/b/c let's also
|
||||||
|
# put all of https://foo.com/ in scope
|
||||||
|
if (canon_seed_redirect.authority == canon_seed.authority
|
||||||
|
and canon_seed_redirect.scheme != canon_seed.scheme):
|
||||||
|
canon_seed.scheme = canon_seed_redirect.scheme
|
||||||
self._accept_ssurt_if_not_redundant(
|
self._accept_ssurt_if_not_redundant(
|
||||||
brozzler.site_surt_canon(url).ssurt().decode('ascii'))
|
canon_seed.ssurt().decode('ascii'))
|
||||||
|
|
||||||
|
self._accept_ssurt_if_not_redundant(
|
||||||
|
canon_seed_redirect.ssurt().decode('ascii'))
|
||||||
|
|
||||||
def extra_headers(self):
|
def extra_headers(self):
|
||||||
hdrs = {}
|
hdrs = {}
|
||||||
|
|
|
@@ -420,3 +420,22 @@ def test_needs_browsing():
|
||||||
assert not brozzler.worker.BrozzlerWorker._needs_browsing(
|
assert not brozzler.worker.BrozzlerWorker._needs_browsing(
|
||||||
None, page, spy.fetches)
|
None, page, spy.fetches)
|
||||||
|
|
||||||
|
def test_seed_redirect():
|
||||||
|
site = brozzler.Site(None, {'seed': 'http://foo.com/'})
|
||||||
|
site.note_seed_redirect('https://foo.com/a/b/c')
|
||||||
|
assert site.scope == {'accepts': [
|
||||||
|
{'ssurt': 'com,foo,//http:/',},
|
||||||
|
{'ssurt': 'com,foo,//https:/',}]}
|
||||||
|
|
||||||
|
site = brozzler.Site(None, {'seed': 'https://foo.com/'})
|
||||||
|
site.note_seed_redirect('http://foo.com/a/b/c')
|
||||||
|
assert site.scope == {'accepts': [
|
||||||
|
{'ssurt': 'com,foo,//https:/',},
|
||||||
|
{'ssurt': 'com,foo,//http:/',}]}
|
||||||
|
|
||||||
|
site = brozzler.Site(None, {'seed': 'http://foo.com/'})
|
||||||
|
site.note_seed_redirect('https://bar.com/a/b/c')
|
||||||
|
assert site.scope == {'accepts': [
|
||||||
|
{'ssurt': 'com,foo,//http:/',},
|
||||||
|
{'ssurt': 'com,bar,//https:/a/b/c',}]}
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue