From f4a9e77b06112ccc86f2745670c8bbc796c15657 Mon Sep 17 00:00:00 2001 From: Adam Miller Date: Thu, 3 Mar 2022 00:15:20 +0000 Subject: [PATCH] Catching edge cases that were avoiding setting hop path information --- brozzler/model.py | 10 +++++----- brozzler/worker.py | 4 ++-- brozzler/ydl.py | 8 ++++---- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/brozzler/model.py b/brozzler/model.py index 9627a11..14211ee 100644 --- a/brozzler/model.py +++ b/brozzler/model.py @@ -271,12 +271,12 @@ class Site(doublethink.Document, ElapsedMixIn): def extra_headers(self, page=None): hdrs = {} if self.warcprox_meta: - if page and "hop_path" in self.warcprox_meta: - self.warcprox_meta["hop_path"] = page.hop_path - self.warcprox_meta["hop_path_parent"] = page.url + if page is not None: + self.warcprox_meta["metadata"]["hop_path"] = page.hop_path + self.warcprox_meta["metadata"]["hop_path_referer"] = page.url warcprox_meta_json = json.dumps(self.warcprox_meta, separators=(',', ':')) - self.warcprox_meta["hop_path"] = None - del self.warcprox_meta["hop_path_parent"] + del self.warcprox_meta["metadata"]["hop_path"] + del self.warcprox_meta["metadata"]["hop_path_referer"] else: warcprox_meta_json= json.dumps(self.warcprox_meta, separators=(',', ':')) hdrs["Warcprox-Meta"] = warcprox_meta_json diff --git a/brozzler/worker.py b/brozzler/worker.py index b911eca..f631ced 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -244,13 +244,13 @@ class BrozzlerWorker: url="screenshot:%s" % str(urlcanon.semantic(page.url)), warc_type="resource", content_type="image/jpeg", payload=screenshot_jpeg, - extra_headers=site.extra_headers()) + extra_headers=site.extra_headers(page)) self._warcprox_write_record( warcprox_address=self._proxy_for(site), url="thumbnail:%s" % str(urlcanon.semantic(page.url)), warc_type="resource", content_type="image/jpeg", payload=thumbnail_jpeg, - extra_headers=site.extra_headers()) + extra_headers=site.extra_headers(page)) def _on_response(chrome_msg): if ('params' in chrome_msg diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 768dfcf..33e8a2b 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -115,7 +115,7 @@ def final_bounces(fetches, url): return final_bounces -def _build_youtube_dl(worker, destdir, site): +def _build_youtube_dl(worker, destdir, site, page): ''' Builds a `youtube_dl.YoutubeDL` for brozzling `site` with `worker`. @@ -269,7 +269,7 @@ def _build_youtube_dl(worker, destdir, site): ydl_opts["proxy"] = "http://{}".format(worker._proxy_for(site)) ydl = _YoutubeDL(ydl_opts) if site.extra_headers(): - ydl._opener.add_handler(ExtraHeaderAdder(site.extra_headers())) + ydl._opener.add_handler(ExtraHeaderAdder(site.extra_headers(page))) ydl.fetch_spy = YoutubeDLSpy() ydl.stitch_ups = [] ydl._opener.add_handler(ydl.fetch_spy) @@ -336,7 +336,7 @@ def _try_youtube_dl(worker, ydl, site, page): warc_type="metadata", content_type="application/vnd.youtube-dl_formats+json;charset=utf-8", payload=info_json.encode("utf-8"), - extra_headers=site.extra_headers()) + extra_headers=site.extra_headers(page)) return ie_result except brozzler.ShutdownRequested as e: raise @@ -380,7 +380,7 @@ def do_youtube_dl(worker, site, page): `list` of `str`: outlink urls ''' with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir: - ydl = _build_youtube_dl(worker, tempdir, site) + ydl = _build_youtube_dl(worker, tempdir, site, page) ie_result = _try_youtube_dl(worker, ydl, site, page) outlinks = set() if ie_result and ie_result.get('extractor') == 'youtube:playlist':