mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 00:29:53 -05:00
Catch edge cases where hop path information was not being set
This commit is contained in:
parent
d61cec399e
commit
f4a9e77b06
@ -271,12 +271,12 @@ class Site(doublethink.Document, ElapsedMixIn):
|
||||
def extra_headers(self, page=None):
|
||||
hdrs = {}
|
||||
if self.warcprox_meta:
|
||||
if page and "hop_path" in self.warcprox_meta:
|
||||
self.warcprox_meta["hop_path"] = page.hop_path
|
||||
self.warcprox_meta["hop_path_parent"] = page.url
|
||||
if page is not None:
|
||||
self.warcprox_meta["metadata"]["hop_path"] = page.hop_path
|
||||
self.warcprox_meta["metadata"]["hop_path_referer"] = page.url
|
||||
warcprox_meta_json = json.dumps(self.warcprox_meta, separators=(',', ':'))
|
||||
self.warcprox_meta["hop_path"] = None
|
||||
del self.warcprox_meta["hop_path_parent"]
|
||||
del self.warcprox_meta["metadata"]["hop_path"]
|
||||
del self.warcprox_meta["metadata"]["hop_path_referer"]
|
||||
else:
|
||||
warcprox_meta_json= json.dumps(self.warcprox_meta, separators=(',', ':'))
|
||||
hdrs["Warcprox-Meta"] = warcprox_meta_json
|
||||
|
@ -244,13 +244,13 @@ class BrozzlerWorker:
|
||||
url="screenshot:%s" % str(urlcanon.semantic(page.url)),
|
||||
warc_type="resource", content_type="image/jpeg",
|
||||
payload=screenshot_jpeg,
|
||||
extra_headers=site.extra_headers())
|
||||
extra_headers=site.extra_headers(page))
|
||||
self._warcprox_write_record(
|
||||
warcprox_address=self._proxy_for(site),
|
||||
url="thumbnail:%s" % str(urlcanon.semantic(page.url)),
|
||||
warc_type="resource", content_type="image/jpeg",
|
||||
payload=thumbnail_jpeg,
|
||||
extra_headers=site.extra_headers())
|
||||
extra_headers=site.extra_headers(page))
|
||||
|
||||
def _on_response(chrome_msg):
|
||||
if ('params' in chrome_msg
|
||||
|
@ -115,7 +115,7 @@ def final_bounces(fetches, url):
|
||||
|
||||
return final_bounces
|
||||
|
||||
def _build_youtube_dl(worker, destdir, site):
|
||||
def _build_youtube_dl(worker, destdir, site, page):
|
||||
'''
|
||||
Builds a `youtube_dl.YoutubeDL` for brozzling `site` with `worker`.
|
||||
|
||||
@ -269,7 +269,7 @@ def _build_youtube_dl(worker, destdir, site):
|
||||
ydl_opts["proxy"] = "http://{}".format(worker._proxy_for(site))
|
||||
ydl = _YoutubeDL(ydl_opts)
|
||||
if site.extra_headers():
|
||||
ydl._opener.add_handler(ExtraHeaderAdder(site.extra_headers()))
|
||||
ydl._opener.add_handler(ExtraHeaderAdder(site.extra_headers(page)))
|
||||
ydl.fetch_spy = YoutubeDLSpy()
|
||||
ydl.stitch_ups = []
|
||||
ydl._opener.add_handler(ydl.fetch_spy)
|
||||
@ -336,7 +336,7 @@ def _try_youtube_dl(worker, ydl, site, page):
|
||||
warc_type="metadata",
|
||||
content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
|
||||
payload=info_json.encode("utf-8"),
|
||||
extra_headers=site.extra_headers())
|
||||
extra_headers=site.extra_headers(page))
|
||||
return ie_result
|
||||
except brozzler.ShutdownRequested as e:
|
||||
raise
|
||||
@ -380,7 +380,7 @@ def do_youtube_dl(worker, site, page):
|
||||
`list` of `str`: outlink urls
|
||||
'''
|
||||
with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
|
||||
ydl = _build_youtube_dl(worker, tempdir, site)
|
||||
ydl = _build_youtube_dl(worker, tempdir, site, page)
|
||||
ie_result = _try_youtube_dl(worker, ydl, site, page)
|
||||
outlinks = set()
|
||||
if ie_result and ie_result.get('extractor') == 'youtube:playlist':
|
||||
|
Loading…
x
Reference in New Issue
Block a user