From 0f72233f3ba74db4f4f111539f588a802b4396c1 Mon Sep 17 00:00:00 2001 From: Adam Miller Date: Tue, 31 Aug 2021 19:44:55 +0000 Subject: [PATCH 1/5] Adding support for hop path information to be stored and passed along to warcprox --- brozzler/frontier.py | 1 + brozzler/model.py | 18 ++++++++++++++---- brozzler/worker.py | 12 +++++++----- tests/test_units.py | 2 +- 4 files changed, 23 insertions(+), 10 deletions(-) diff --git a/brozzler/frontier.py b/brozzler/frontier.py index 6715eb3..5b7a95c 100644 --- a/brozzler/frontier.py +++ b/brozzler/frontier.py @@ -302,6 +302,7 @@ class RethinkDbFrontier: 'site_id': site.id, 'job_id': site.job_id, 'hops_from_seed': parent_page.hops_from_seed + 1, + 'hop_path': str(parent_page.hop_path if parent_page.hop_path else "") + "L", 'via_page_id': parent_page.id, 'hops_off_surt': hops_off, 'hashtags': [hashtag] if hashtag else []}) diff --git a/brozzler/model.py b/brozzler/model.py index c9ece80..9627a11 100644 --- a/brozzler/model.py +++ b/brozzler/model.py @@ -121,7 +121,8 @@ def new_seed_page(frontier, site): page = brozzler.Page(frontier.rr, { "url": str(url), "site_id": site.get("id"), "job_id": site.get("job_id"), "hops_from_seed": 0, - "priority": 1000, "needs_robots_check": True}) + "priority": 1000, "needs_robots_check": True, + "hop_path": None}) if hashtag: page.hashtags = [hashtag,] return page @@ -267,11 +268,18 @@ class Site(doublethink.Document, ElapsedMixIn): self._accept_ssurt_if_not_redundant( canon_seed_redirect.ssurt().decode('ascii')) - def extra_headers(self): + def extra_headers(self, page=None): hdrs = {} if self.warcprox_meta: - hdrs["Warcprox-Meta"] = json.dumps( - self.warcprox_meta, separators=(',', ':')) + if page and "hop_path" in self.warcprox_meta: + self.warcprox_meta["hop_path"] = page.hop_path + self.warcprox_meta["hop_path_parent"] = page.url + warcprox_meta_json = json.dumps(self.warcprox_meta, separators=(',', ':')) + self.warcprox_meta["hop_path"] = None + del self.warcprox_meta["hop_path_parent"] + else: + warcprox_meta_json= json.dumps(self.warcprox_meta, separators=(',', ':')) + hdrs["Warcprox-Meta"] = warcprox_meta_json return hdrs def accept_reject_or_neither(self, url, parent_page=None): @@ -338,6 +346,8 @@ class Page(doublethink.Document): def populate_defaults(self): if not "hops_from_seed" in self: self.hops_from_seed = 0 + if not "hop_path" in self: + self.hop_path = None if not "brozzle_count" in self: self.brozzle_count = 0 if not "claimed" in self: diff --git a/brozzler/worker.py b/brozzler/worker.py index d88893b..b911eca 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -224,7 +224,7 @@ class BrozzlerWorker: else: if not self._already_fetched(page, ydl_fetches): self.logger.info('needs fetch: %s', page) - self._fetch_url(site, page.url) + self._fetch_url(site, page=page) else: self.logger.info('already fetched: %s', page) @@ -287,7 +287,7 @@ class BrozzlerWorker: .get('scriptURL') if url and url not in sw_fetched: self.logger.info('fetching service worker script %s', url) - self._fetch_url(site, url) + self._fetch_url(site, url=url) sw_fetched.add(url) if not browser.is_running(): @@ -295,7 +295,7 @@ class BrozzlerWorker: proxy=self._proxy_for(site), cookie_db=site.get('cookie_db')) final_page_url, outlinks = browser.browse_page( - page.url, extra_headers=site.extra_headers(), + page.url, extra_headers=site.extra_headers(page), behavior_parameters=site.get('behavior_parameters'), username=site.get('username'), password=site.get('password'), user_agent=site.get('user_agent'), @@ -316,8 +316,10 @@ class BrozzlerWorker: page.note_redirect(final_page_url) return outlinks - def _fetch_url(self, site, url): + def _fetch_url(self, site, url=None, page=None): proxies = None + if page: + url=page.url if self._proxy_for(site): proxies = { 'http': 'http://%s' % self._proxy_for(site), @@ -328,7 +330,7 @@ class BrozzlerWorker: try: # response is ignored requests.get( - url, proxies=proxies, headers=site.extra_headers(), + url, proxies=proxies, headers=site.extra_headers(page), verify=False) except requests.exceptions.ProxyError as e: raise brozzler.ProxyError( diff --git a/tests/test_units.py b/tests/test_units.py index 5b0295c..4326867 100644 --- a/tests/test_units.py +++ b/tests/test_units.py @@ -234,7 +234,7 @@ def test_proxy_down(): # raw fetch with pytest.raises(brozzler.ProxyError): - worker._fetch_url(site, page.url) + worker._fetch_url(site, page=page) # WARCPROX_WRITE_RECORD with pytest.raises(brozzler.ProxyError): From f4a9e77b06112ccc86f2745670c8bbc796c15657 Mon Sep 17 00:00:00 2001 From: Adam Miller Date: Thu, 3 Mar 2022 00:15:20 +0000 Subject: [PATCH 2/5] Catching edge cases that were avoiding setting hop path information --- brozzler/model.py | 10 +++++----- brozzler/worker.py | 4 ++-- brozzler/ydl.py | 8 ++++---- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/brozzler/model.py b/brozzler/model.py index 9627a11..14211ee 100644 --- a/brozzler/model.py +++ b/brozzler/model.py @@ -271,12 +271,12 @@ class Site(doublethink.Document, ElapsedMixIn): def extra_headers(self, page=None): hdrs = {} if self.warcprox_meta: - if page and "hop_path" in self.warcprox_meta: - self.warcprox_meta["hop_path"] = page.hop_path - self.warcprox_meta["hop_path_parent"] = page.url + if page is not None: + self.warcprox_meta["metadata"]["hop_path"] = page.hop_path + self.warcprox_meta["metadata"]["hop_path_referer"] = page.url warcprox_meta_json = json.dumps(self.warcprox_meta, separators=(',', ':')) - self.warcprox_meta["hop_path"] = None - del self.warcprox_meta["hop_path_parent"] + del self.warcprox_meta["metadata"]["hop_path"] + del self.warcprox_meta["metadata"]["hop_path_referer"] else: warcprox_meta_json= json.dumps(self.warcprox_meta, separators=(',', ':')) hdrs["Warcprox-Meta"] = warcprox_meta_json diff --git a/brozzler/worker.py b/brozzler/worker.py index b911eca..f631ced 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -244,13 +244,13 @@ class BrozzlerWorker: url="screenshot:%s" % str(urlcanon.semantic(page.url)), warc_type="resource", content_type="image/jpeg", payload=screenshot_jpeg, - extra_headers=site.extra_headers()) + extra_headers=site.extra_headers(page)) self._warcprox_write_record( warcprox_address=self._proxy_for(site), url="thumbnail:%s" % str(urlcanon.semantic(page.url)), warc_type="resource", content_type="image/jpeg", payload=thumbnail_jpeg, - extra_headers=site.extra_headers()) + extra_headers=site.extra_headers(page)) def _on_response(chrome_msg): if ('params' in chrome_msg diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 768dfcf..33e8a2b 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -115,7 +115,7 @@ def final_bounces(fetches, url): return final_bounces -def _build_youtube_dl(worker, destdir, site): +def _build_youtube_dl(worker, destdir, site, page): ''' Builds a `youtube_dl.YoutubeDL` for brozzling `site` with `worker`. @@ -269,7 +269,7 @@ def _build_youtube_dl(worker, destdir, site): ydl_opts["proxy"] = "http://{}".format(worker._proxy_for(site)) ydl = _YoutubeDL(ydl_opts) if site.extra_headers(): - ydl._opener.add_handler(ExtraHeaderAdder(site.extra_headers())) + ydl._opener.add_handler(ExtraHeaderAdder(site.extra_headers(page))) ydl.fetch_spy = YoutubeDLSpy() ydl.stitch_ups = [] ydl._opener.add_handler(ydl.fetch_spy) @@ -336,7 +336,7 @@ def _try_youtube_dl(worker, ydl, site, page): warc_type="metadata", content_type="application/vnd.youtube-dl_formats+json;charset=utf-8", payload=info_json.encode("utf-8"), - extra_headers=site.extra_headers()) + extra_headers=site.extra_headers(page)) return ie_result except brozzler.ShutdownRequested as e: raise @@ -380,7 +380,7 @@ def do_youtube_dl(worker, site, page): `list` of `str`: outlink urls ''' with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir: - ydl = _build_youtube_dl(worker, tempdir, site) + ydl = _build_youtube_dl(worker, tempdir, site, page) ie_result = _try_youtube_dl(worker, ydl, site, page) outlinks = set() if ie_result and ie_result.get('extractor') == 'youtube:playlist': From cd16985724561ff1eebcc2839156c9af114c38e0 Mon Sep 17 00:00:00 2001 From: Adam Miller Date: Thu, 24 Mar 2022 21:38:47 +0000 Subject: [PATCH 3/5] Refactor of hop referrer passing --- brozzler/frontier.py | 1 + brozzler/model.py | 8 ++++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/brozzler/frontier.py b/brozzler/frontier.py index 5b7a95c..6e64e51 100644 --- a/brozzler/frontier.py +++ b/brozzler/frontier.py @@ -304,6 +304,7 @@ class RethinkDbFrontier: 'hops_from_seed': parent_page.hops_from_seed + 1, 'hop_path': str(parent_page.hop_path if parent_page.hop_path else "") + "L", 'via_page_id': parent_page.id, + 'via_page_url': parent_page.url, 'hops_off_surt': hops_off, 'hashtags': [hashtag] if hashtag else []}) return page diff --git a/brozzler/model.py b/brozzler/model.py index 14211ee..53a36d1 100644 --- a/brozzler/model.py +++ b/brozzler/model.py @@ -273,10 +273,12 @@ class Site(doublethink.Document, ElapsedMixIn): if self.warcprox_meta: if page is not None: self.warcprox_meta["metadata"]["hop_path"] = page.hop_path - self.warcprox_meta["metadata"]["hop_path_referer"] = page.url + self.warcprox_meta["metadata"]["brozzled_url"] = page.url + self.warcprox_meta["metadata"]["hop_via_url"] = page.via_page_url warcprox_meta_json = json.dumps(self.warcprox_meta, separators=(',', ':')) del self.warcprox_meta["metadata"]["hop_path"] - del self.warcprox_meta["metadata"]["hop_path_referer"] + del self.warcprox_meta["metadata"]["brozzled_url"] + del self.warcprox_meta["metadata"]["hop_via_url"] else: warcprox_meta_json= json.dumps(self.warcprox_meta, separators=(',', ':')) hdrs["Warcprox-Meta"] = warcprox_meta_json @@ -348,6 +350,8 @@ class Page(doublethink.Document): self.hops_from_seed = 0 if not "hop_path" in self: self.hop_path = None + if not "via_page_url" in self: + self.via_page_url = None if not "brozzle_count" in self: self.brozzle_count = 0 if not "claimed" in self: From 05826942a972b39f269f369dd2fc5f0df84e3ea1 Mon Sep 17 00:00:00 2001 From: Adam Miller Date: Wed, 20 Apr 2022 22:49:18 +0000 Subject: [PATCH 4/5] Style fix --- brozzler/model.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/brozzler/model.py b/brozzler/model.py index 53a36d1..689b268 100644 --- a/brozzler/model.py +++ b/brozzler/model.py @@ -31,6 +31,7 @@ import urlcanon import urllib import uuid import yaml +from typing import Optional def load_schema(): schema_file = os.path.join(os.path.dirname(__file__), 'job_schema.yaml') @@ -119,9 +120,12 @@ def new_seed_page(frontier, site): hashtag = (url.hash_sign + url.fragment).decode("utf-8") urlcanon.canon.remove_fragment(url) page = brozzler.Page(frontier.rr, { - "url": str(url), "site_id": site.get("id"), - "job_id": site.get("job_id"), "hops_from_seed": 0, - "priority": 1000, "needs_robots_check": True, + "url": str(url), + "site_id": site.get("id"), + "job_id": site.get("job_id"), + "hops_from_seed": 0, + "priority": 1000, + "needs_robots_check": True, "hop_path": None}) if hashtag: page.hashtags = [hashtag,] @@ -268,7 +272,7 @@ class Site(doublethink.Document, ElapsedMixIn): self._accept_ssurt_if_not_redundant( canon_seed_redirect.ssurt().decode('ascii')) - def extra_headers(self, page=None): + def extra_headers(self, page: Optional["Page"] = None): hdrs = {} if self.warcprox_meta: if page is not None: From eef8a1c432330d53a5f2db11b2257921d7895e97 Mon Sep 17 00:00:00 2001 From: Adam Miller Date: Tue, 26 Apr 2022 09:55:08 -0700 Subject: [PATCH 5/5] Bump version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 3bef00f..184f5d9 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.5.26', + version='1.5.27', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt',