Merge pull request #318 from internetarchive/adam/get-page-header-timeout
Some checks failed
Publish Artifacts / Build distribution 📦 (push) Has been cancelled
Python Formatting Check / formatting (push) Has been cancelled

feat: add timeout to header check
This commit is contained in:
Adam Miller 2025-02-06 11:22:28 -08:00 committed by GitHub
commit 7ededbc521
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 11 additions and 5 deletions

View File

@@ -51,6 +51,7 @@ class BrozzlerWorker:
# cluster with slow rethinkdb.
HEARTBEAT_INTERVAL = 200.0
SITE_SESSION_MINUTES = 15
+HEADER_REQUEST_TIMEOUT = 30
def __init__(
self,
@@ -333,12 +334,17 @@ class BrozzlerWorker:
# bypassing warcprox, requests' stream=True defers downloading the body of the response
# see https://docs.python-requests.org/en/latest/user/advanced/#body-content-workflow
try:
-with requests.get(page.url, stream=True, verify=False) as r:
-page_headers = r.headers
-return page_headers
+with requests.get(
+page.url, stream=True, verify=False, timeout=self.HEADER_REQUEST_TIMEOUT
+) as r:
+return r.headers
+except requests.exceptions.Timeout as e:
+self.logger.warning(
+"Timed out trying to get headers for %s: %s", page.url, e
+)
except requests.exceptions.RequestException as e:
self.logger.warning("Failed to get headers for %s: %s", page.url, e)
-return {}
+return {}
def _needs_browsing(self, page_headers):
if (

View File

@@ -34,7 +34,7 @@ def find_package_data(package):
setuptools.setup(
name="brozzler",
-version="1.6.6",
+version="1.6.7",
description="Distributed web crawling with browsers",
url="https://github.com/internetarchive/brozzler",
author="Noah Levitt",