From 0feac5cd07f409ebb9d867cd28943b542703e5ac Mon Sep 17 00:00:00 2001
From: Adam Miller
Date: Tue, 4 Feb 2025 16:21:28 -0800
Subject: [PATCH] feat: add timeout to header check

Add a HEADER_REQUEST_TIMEOUT class constant (30, passed to requests as
its timeout, i.e. seconds) and use it for the header request, so the
request can no longer wait indefinitely on an unresponsive server.
Timeouts are logged separately from other request failures; both cases
return an empty dict of headers.

The constant is a class attribute, so the method must reference it as
self.HEADER_REQUEST_TIMEOUT; a bare HEADER_REQUEST_TIMEOUT is not in
scope inside the method and would raise NameError.

---
 brozzler/worker.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/brozzler/worker.py b/brozzler/worker.py
index 6a7667d..9f83eea 100644
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@@ -51,6 +51,7 @@ class BrozzlerWorker:
     # cluster with slow rethinkdb.
     HEARTBEAT_INTERVAL = 200.0
     SITE_SESSION_MINUTES = 15
+    HEADER_REQUEST_TIMEOUT = 30
 
     def __init__(
         self,
@@ -333,9 +334,16 @@ class BrozzlerWorker:
         # bypassing warcprox, requests' stream=True defers downloading the body of the response
         # see https://docs.python-requests.org/en/latest/user/advanced/#body-content-workflow
         try:
-            with requests.get(page.url, stream=True, verify=False) as r:
+            with requests.get(
+                page.url, stream=True, verify=False, timeout=self.HEADER_REQUEST_TIMEOUT
+            ) as r:
                 page_headers = r.headers
                 return page_headers
+        except requests.exceptions.Timeout as e:
+            self.logger.warning(
+                "Timed out trying to get headers for %s: %s", page.url, e
+            )
+            return {}
         except requests.exceptions.RequestException as e:
             self.logger.warning("Failed to get headers for %s: %s", page.url, e)
             return {}