From f40bbd13d936d08495fceab284ec53175092db42 Mon Sep 17 00:00:00 2001
From: Adam Miller
Date: Mon, 12 Aug 2024 23:33:53 +0000
Subject: [PATCH 1/2] fix: handle exceptions when requesting page headers for content-type sampling.

---
 brozzler/worker.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/brozzler/worker.py b/brozzler/worker.py
index 3d20035..7279753 100644
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@@ -295,9 +295,13 @@ class BrozzlerWorker:
     def _get_page_headers(self, page):
         # bypassing warcprox, requests' stream=True defers downloading the body of the response
         # see https://docs.python-requests.org/en/latest/user/advanced/#body-content-workflow
-        with requests.get(page.url, stream=True) as r:
-            page_headers = r.headers
-            return page_headers
+        try:
+            with requests.get(page.url, stream=True, verify=False) as r:
+                page_headers = r.headers
+            return page_headers
+        except requests.exceptions.RequestException as e:
+            self.logger.warning("Failed to get headers for %s: %s", page.url, e)
+            return {}
 
     def _needs_browsing(self, page_headers):
         if (

From 2dfe84149382d5523ef4dea5c4d08748f715818a Mon Sep 17 00:00:00 2001
From: Adam Miller
Date: Mon, 12 Aug 2024 23:53:04 +0000
Subject: [PATCH 2/2] fix: match original indenting

---
 brozzler/worker.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/brozzler/worker.py b/brozzler/worker.py
index 7279753..479dfa7 100644
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@@ -298,7 +298,7 @@ class BrozzlerWorker:
         try:
             with requests.get(page.url, stream=True, verify=False) as r:
                 page_headers = r.headers
-            return page_headers
+                return page_headers
         except requests.exceptions.RequestException as e:
             self.logger.warning("Failed to get headers for %s: %s", page.url, e)
             return {}