mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-23 08:09:48 -05:00
feat: add timeout to header check
This commit is contained in:
parent
df4bd148d5
commit
0feac5cd07
@@ -51,6 +51,7 @@ class BrozzlerWorker:
|
|||||||
# cluster with slow rethinkdb.
|
# cluster with slow rethinkdb.
|
||||||
HEARTBEAT_INTERVAL = 200.0
|
HEARTBEAT_INTERVAL = 200.0
|
||||||
SITE_SESSION_MINUTES = 15
|
SITE_SESSION_MINUTES = 15
|
||||||
|
HEADER_REQUEST_TIMEOUT = 30
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -333,9 +334,16 @@ class BrozzlerWorker:
|
|||||||
# bypassing warcprox, requests' stream=True defers downloading the body of the response
|
# bypassing warcprox, requests' stream=True defers downloading the body of the response
|
||||||
# see https://docs.python-requests.org/en/latest/user/advanced/#body-content-workflow
|
# see https://docs.python-requests.org/en/latest/user/advanced/#body-content-workflow
|
||||||
try:
|
try:
|
||||||
with requests.get(page.url, stream=True, verify=False) as r:
|
with requests.get(
|
||||||
|
page.url, stream=True, verify=False, timeout=HEADER_REQUEST_TIMEOUT
|
||||||
|
) as r:
|
||||||
page_headers = r.headers
|
page_headers = r.headers
|
||||||
return page_headers
|
return page_headers
|
||||||
|
except requests.exceptions.Timeout as e:
|
||||||
|
self.logger.warning(
|
||||||
|
"Timed out trying to get headers for %s: %s", page.url, e
|
||||||
|
)
|
||||||
|
return {}
|
||||||
except requests.exceptions.RequestException as e:
|
except requests.exceptions.RequestException as e:
|
||||||
self.logger.warning("Failed to get headers for %s: %s", page.url, e)
|
self.logger.warning("Failed to get headers for %s: %s", page.url, e)
|
||||||
return {}
|
return {}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user