Merge branch 'adam/adds_get_page_header_exception_handling' into qa

This commit is contained in:
Adam Miller 2024-08-12 23:42:31 +00:00
commit d9406485ae
2 changed files with 8 additions and 4 deletions

View File

@@ -296,9 +296,13 @@ class BrozzlerWorker:
def _get_page_headers(self, page):
# bypassing warcprox, requests' stream=True defers downloading the body of the response
# see https://docs.python-requests.org/en/latest/user/advanced/#body-content-workflow
with requests.get(page.url, stream=True) as r:
page_headers = r.headers
return page_headers
try:
with requests.get(page.url, stream=True, verify=False) as r:
page_headers = r.headers
return page_headers
except requests.exceptions.RequestException as e:
self.logger.warning("Failed to get headers for %s: %s", page.url, e)
return {}
def _needs_browsing(self, page_headers):
if (

View File

@@ -34,7 +34,7 @@ def find_package_data(package):
setuptools.setup(
name="brozzler",
version="1.5.53a1",
version="1.5.53a2",
description="Distributed web crawling with browsers",
url="https://github.com/internetarchive/brozzler",
author="Noah Levitt",