From 8ed517c1c023ca9e49da5c1b8668b96dfa90caca Mon Sep 17 00:00:00 2001 From: Adam Miller Date: Thu, 6 Feb 2025 11:19:23 -0800 Subject: [PATCH 1/4] chore: bump version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 72452c6..1f84a50 100644 --- a/setup.py +++ b/setup.py @@ -34,7 +34,7 @@ def find_package_data(package): setuptools.setup( name="brozzler", - version="1.6.6", + version="1.6.7", description="Distributed web crawling with browsers", url="https://github.com/internetarchive/brozzler", author="Noah Levitt", From 65de0d2a5f252a7d6145c6c6a437ae425fee5e20 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Sun, 9 Feb 2025 11:13:03 -0800 Subject: [PATCH 2/4] timeout for fetch_url --- brozzler/worker.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/brozzler/worker.py b/brozzler/worker.py index 78bdb6a..a73129b 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -52,6 +52,7 @@ class BrozzlerWorker: HEARTBEAT_INTERVAL = 200.0 SITE_SESSION_MINUTES = 15 HEADER_REQUEST_TIMEOUT = 30 + FETCH_URL_TIMEOUT = 60 def __init__( self, @@ -334,6 +335,7 @@ class BrozzlerWorker: # bypassing warcprox, requests' stream=True defers downloading the body of the response # see https://docs.python-requests.org/en/latest/user/advanced/#body-content-workflow try: + self.logger.info("getting page headers for %s", page.url) with requests.get( page.url, stream=True, verify=False, timeout=self.HEADER_REQUEST_TIMEOUT ) as r: @@ -485,8 +487,14 @@ class BrozzlerWorker: try: # response is ignored requests.get( - url, proxies=proxies, headers=site.extra_headers(page), verify=False + url, + proxies=proxies, + headers=site.extra_headers(page), + verify=False, + timeout=self.FETCH_URL_TIMEOUT, ) + except requests.exceptions.Timeout as e: + self.logger.warning("Timed out fetching %s: %s", page.url, e) except requests.exceptions.ProxyError as e: raise brozzler.ProxyError("proxy error fetching %s" % url) from e 
From 561e0803c6f13209eed53405654aef9c6d7c0ad3 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 11 Feb 2025 12:27:50 -0800 Subject: [PATCH 3/4] requests timeout and user_agent --- brozzler/worker.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/brozzler/worker.py b/brozzler/worker.py index a73129b..022544b 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -272,7 +272,7 @@ class BrozzlerWorker: self.logger.info("brozzling {}".format(page)) outlinks = set() - page_headers = self._get_page_headers(page) + page_headers = self._get_page_headers(site, page) if not self._needs_browsing(page_headers): self.logger.info("needs fetch: %s", page) @@ -331,13 +331,19 @@ class BrozzlerWorker: @metrics.brozzler_header_processing_duration_seconds.time() @metrics.brozzler_in_progress_headers.track_inprogress() - def _get_page_headers(self, page): + def _get_page_headers(self, site, page): # bypassing warcprox, requests' stream=True defers downloading the body of the response # see https://docs.python-requests.org/en/latest/user/advanced/#body-content-workflow try: + user_agent = site.get("user_agent") + headers = {"User-Agent": user_agent} if user_agent else {} self.logger.info("getting page headers for %s", page.url) with requests.get( - page.url, stream=True, verify=False, timeout=self.HEADER_REQUEST_TIMEOUT + page.url, + stream=True, + verify=False, + headers=headers, + timeout=self.HEADER_REQUEST_TIMEOUT, ) as r: return r.headers except requests.exceptions.Timeout as e: @@ -482,14 +488,17 @@ class BrozzlerWorker: "http": "http://%s" % self._proxy_for(site), "https": "http://%s" % self._proxy_for(site), } + user_agent = site.get("user_agent") + headers = {"User-Agent": user_agent} if user_agent else {} + headers.update(site.extra_headers(page)) - self.logger.info("fetching %s", url) + self.logger.info("fetching url %s", url) try: # response is ignored requests.get( url, proxies=proxies, - headers=site.extra_headers(page), 
+ headers=headers, verify=False, timeout=self.FETCH_URL_TIMEOUT, ) From 430c0daf39d23a85120679d4e6b4b4b47d8567e9 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 11 Feb 2025 12:51:26 -0800 Subject: [PATCH 4/4] catch and log more exceptions on fetch_url error --- brozzler/worker.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/brozzler/worker.py b/brozzler/worker.py index 022544b..18ff319 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -506,6 +506,8 @@ class BrozzlerWorker: self.logger.warning("Timed out fetching %s: %s", page.url, e) except requests.exceptions.ProxyError as e: raise brozzler.ProxyError("proxy error fetching %s" % url) from e + except requests.exceptions.RequestException as e: + self.logger.warning("Failed to fetch %s: %s", page.url, e) def brozzle_site(self, browser, site): try: