From d488cdbb3bd7598d5c9f5fbeb26a094a71c9e94c Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Fri, 18 Oct 2024 16:40:24 -0700 Subject: [PATCH 1/4] fixes for chrome v.130 --- brozzler/browser.py | 2 -- brozzler/worker.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/brozzler/browser.py b/brozzler/browser.py index 9c72589..4b602f1 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -658,11 +658,9 @@ class Browser: ): headers = extra_headers or {} headers["Accept-Encoding"] = "gzip" # avoid encodings br, sdch - self.websock_thread.expect_result(self._command_id.peek()) msg_id = self.send_to_chrome( method="Network.setExtraHTTPHeaders", params={"headers": headers} ) - self._wait_for(lambda: self.websock_thread.received_result(msg_id), timeout=10) if user_agent: msg_id = self.send_to_chrome( method="Network.setUserAgentOverride", params={"userAgent": user_agent} diff --git a/brozzler/worker.py b/brozzler/worker.py index 116bb75..1d238a1 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -414,7 +414,7 @@ class BrozzlerWorker: self.logger.trace("%r", chrome_msg) if chrome_msg.get("params", {}).get("versions"): url = chrome_msg.get("params", {}).get("versions")[0].get("scriptURL") - if url and url not in sw_fetched: + if url and url.startswith('http') and url not in sw_fetched: self.logger.info("fetching service worker script %s", url) self._fetch_url(site, url=url) sw_fetched.add(url) From c781f53e3f8d53520ede477096d5fbdf72c9a4ca Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Mon, 2 Dec 2024 15:19:50 -0800 Subject: [PATCH 2/4] black'd --- brozzler/worker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/brozzler/worker.py b/brozzler/worker.py index 1d238a1..3f9d15a 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -414,7 +414,7 @@ class BrozzlerWorker: self.logger.trace("%r", chrome_msg) if chrome_msg.get("params", {}).get("versions"): url = chrome_msg.get("params", {}).get("versions")[0].get("scriptURL") - if url and url.startswith('http') and url not in sw_fetched: + if url and url.startswith("http") and url not in sw_fetched: self.logger.info("fetching service worker script %s", url) self._fetch_url(site, url=url) sw_fetched.add(url) From 42e2452363f298a21beb885f04074e8efa97ad52 Mon Sep 17 00:00:00 2001 From: Adam Miller Date: Thu, 5 Dec 2024 09:14:11 -0800 Subject: [PATCH 3/4] chore: Bump version to 1.6.4 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index a8633e1..c44871b 100644 --- a/setup.py +++ b/setup.py @@ -34,7 +34,7 @@ def find_package_data(package): setuptools.setup( name="brozzler", - version="1.6.3", + version="1.6.4", description="Distributed web crawling with browsers", url="https://github.com/internetarchive/brozzler", author="Noah Levitt", From 905c11d79564e2ef009475604eacc3033e079c4a Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Fri, 6 Dec 2024 17:17:57 -0800 Subject: [PATCH 4/4] ytdlp should use /tmp --- brozzler/ydl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 09593f1..7ce2686 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -420,7 +420,7 @@ def do_youtube_dl(worker, site, page): Returns: `list` of `str`: outlink urls """ - with tempfile.TemporaryDirectory(prefix="brzl-ydl-") as tempdir: + with tempfile.TemporaryDirectory(prefix="brzl-ydl-", dir="/tmp") as tempdir: ydl = _build_youtube_dl(worker, tempdir, site, page) ie_result = _try_youtube_dl(worker, ydl, site, page) outlinks = set()