Merge pull request #318 from internetarchive/adam/get-page-header-timeout

feat: add timeout to header check
2025-12-16 09:03:55 -05:00 · 2025-02-06 11:22:28 -08:00 · 2025-02-06 11:22:28 -08:00 · 7ededbc521
commit 7ededbc521
parent df4bd148d5 8ed517c1c0
2 changed files with 11 additions and 5 deletions
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@@ -51,6 +51,7 @@ class BrozzlerWorker:
    # cluster with slow rethinkdb.
    HEARTBEAT_INTERVAL = 200.0
    SITE_SESSION_MINUTES = 15
+    HEADER_REQUEST_TIMEOUT = 30

    def __init__(
        self,
@@ -333,12 +334,17 @@ class BrozzlerWorker:
        # bypassing warcprox, requests' stream=True defers downloading the body of the response
        # see https://docs.python-requests.org/en/latest/user/advanced/#body-content-workflow
        try:
-            with requests.get(page.url, stream=True, verify=False) as r:
-                page_headers = r.headers
-            return page_headers
+            with requests.get(
+                page.url, stream=True, verify=False, timeout=self.HEADER_REQUEST_TIMEOUT
+            ) as r:
+                return r.headers
+        except requests.exceptions.Timeout as e:
+            self.logger.warning(
+                "Timed out trying to get headers for %s: %s", page.url, e
+            )
        except requests.exceptions.RequestException as e:
            self.logger.warning("Failed to get headers for %s: %s", page.url, e)
-            return {}
+        return {}

    def _needs_browsing(self, page_headers):
        if (
--- a/setup.py
+++ b/setup.py
@@ -34,7 +34,7 @@ def find_package_data(package):

 setuptools.setup(
    name="brozzler",
-    version="1.6.6",
+    version="1.6.7",
    description="Distributed web crawling with browsers",
    url="https://github.com/internetarchive/brozzler",
    author="Noah Levitt",