timeout for fetch_url

This commit is contained in:
Barbara Miller 2025-02-09 11:13:03 -08:00
parent 7ededbc521
commit 65de0d2a5f

View File

@@ -52,6 +52,7 @@ class BrozzlerWorker:
HEARTBEAT_INTERVAL = 200.0 HEARTBEAT_INTERVAL = 200.0
SITE_SESSION_MINUTES = 15 SITE_SESSION_MINUTES = 15
HEADER_REQUEST_TIMEOUT = 30 HEADER_REQUEST_TIMEOUT = 30
FETCH_URL_TIMEOUT = 60
def __init__( def __init__(
self, self,
@@ -334,6 +335,7 @@ class BrozzlerWorker:
# bypassing warcprox, requests' stream=True defers downloading the body of the response # bypassing warcprox, requests' stream=True defers downloading the body of the response
# see https://docs.python-requests.org/en/latest/user/advanced/#body-content-workflow # see https://docs.python-requests.org/en/latest/user/advanced/#body-content-workflow
try: try:
self.logger.info("getting page headers for %s", page.url)
with requests.get( with requests.get(
page.url, stream=True, verify=False, timeout=self.HEADER_REQUEST_TIMEOUT page.url, stream=True, verify=False, timeout=self.HEADER_REQUEST_TIMEOUT
) as r: ) as r:
@@ -485,8 +487,14 @@ class BrozzlerWorker:
try: try:
# response is ignored # response is ignored
requests.get( requests.get(
url, proxies=proxies, headers=site.extra_headers(page), verify=False url,
proxies=proxies,
headers=site.extra_headers(page),
verify=False,
timeout=self.FETCH_URL_TIMEOUT,
) )
except requests.exceptions.Timeout as e:
self.logger.warning("Timed out fetching %s: %s", page.url, e)
except requests.exceptions.ProxyError as e: except requests.exceptions.ProxyError as e:
raise brozzler.ProxyError("proxy error fetching %s" % url) from e raise brozzler.ProxyError("proxy error fetching %s" % url) from e