mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-07-23 06:50:37 -04:00
Merge branch 'adam/adds_get_page_header_exception_handling' into qa
This commit is contained in:
commit
d9406485ae
2 changed files with 8 additions and 4 deletions
|
@ -296,9 +296,13 @@ class BrozzlerWorker:
|
||||||
def _get_page_headers(self, page):
|
def _get_page_headers(self, page):
|
||||||
# bypassing warcprox, requests' stream=True defers downloading the body of the response
|
# bypassing warcprox, requests' stream=True defers downloading the body of the response
|
||||||
# see https://docs.python-requests.org/en/latest/user/advanced/#body-content-workflow
|
# see https://docs.python-requests.org/en/latest/user/advanced/#body-content-workflow
|
||||||
with requests.get(page.url, stream=True) as r:
|
try:
|
||||||
|
with requests.get(page.url, stream=True, verify=False) as r:
|
||||||
page_headers = r.headers
|
page_headers = r.headers
|
||||||
return page_headers
|
return page_headers
|
||||||
|
except requests.exceptions.RequestException as e:
|
||||||
|
self.logger.warning("Failed to get headers for %s: %s", page.url, e)
|
||||||
|
return {}
|
||||||
|
|
||||||
def _needs_browsing(self, page_headers):
|
def _needs_browsing(self, page_headers):
|
||||||
if (
|
if (
|
||||||
|
|
2
setup.py
2
setup.py
|
@ -34,7 +34,7 @@ def find_package_data(package):
|
||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name="brozzler",
|
name="brozzler",
|
||||||
version="1.5.53a1",
|
version="1.5.53a2",
|
||||||
description="Distributed web crawling with browsers",
|
description="Distributed web crawling with browsers",
|
||||||
url="https://github.com/internetarchive/brozzler",
|
url="https://github.com/internetarchive/brozzler",
|
||||||
author="Noah Levitt",
|
author="Noah Levitt",
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue