mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-07-10 16:49:41 -04:00
Merge pull request #318 from internetarchive/adam/get-page-header-timeout
feat: add timeout to header check
This commit is contained in:
commit
7ededbc521
2 changed files with 11 additions and 5 deletions
|
@@ -51,6 +51,7 @@ class BrozzlerWorker:
|
|||
# cluster with slow rethinkdb.
|
||||
HEARTBEAT_INTERVAL = 200.0
|
||||
SITE_SESSION_MINUTES = 15
|
||||
+HEADER_REQUEST_TIMEOUT = 30
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
@@ -333,12 +334,17 @@ class BrozzlerWorker:
|
|||
# bypassing warcprox, requests' stream=True defers downloading the body of the response
|
||||
# see https://docs.python-requests.org/en/latest/user/advanced/#body-content-workflow
|
||||
try:
|
||||
-with requests.get(page.url, stream=True, verify=False) as r:
|
||||
-page_headers = r.headers
|
||||
-return page_headers
|
||||
+with requests.get(
|
||||
+page.url, stream=True, verify=False, timeout=self.HEADER_REQUEST_TIMEOUT
|
||||
+) as r:
|
||||
+return r.headers
|
||||
+except requests.exceptions.Timeout as e:
|
||||
+self.logger.warning(
|
||||
+"Timed out trying to get headers for %s: %s", page.url, e
|
||||
+)
|
||||
except requests.exceptions.RequestException as e:
|
||||
self.logger.warning("Failed to get headers for %s: %s", page.url, e)
|
||||
-return {}
|
||||
+return {}
|
||||
|
||||
def _needs_browsing(self, page_headers):
|
||||
if (
|
||||
|
|
2
setup.py
2
setup.py
|
@@ -34,7 +34,7 @@ def find_package_data(package):
|
|||
|
||||
setuptools.setup(
|
||||
name="brozzler",
|
||||
-version="1.6.6",
|
||||
+version="1.6.7",
|
||||
description="Distributed web crawling with browsers",
|
||||
url="https://github.com/internetarchive/brozzler",
|
||||
author="Noah Levitt",
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue