Merge branch 'bmiller/more_better_requests' into qa

This commit is contained in:
Barbara Miller 2025-02-11 16:30:36 -08:00
commit a2da84cad7
2 changed files with 25 additions and 6 deletions

View File

@@ -53,6 +53,7 @@ class BrozzlerWorker:
HEARTBEAT_INTERVAL = 200.0 HEARTBEAT_INTERVAL = 200.0
SITE_SESSION_MINUTES = 15 SITE_SESSION_MINUTES = 15
HEADER_REQUEST_TIMEOUT = 30 HEADER_REQUEST_TIMEOUT = 30
FETCH_URL_TIMEOUT = 60
def __init__( def __init__(
self, self,
@@ -271,7 +272,7 @@ class BrozzlerWorker:
self.logger.info("brozzling {}".format(page)) self.logger.info("brozzling {}".format(page))
outlinks = set() outlinks = set()
page_headers = self._get_page_headers(page) page_headers = self._get_page_headers(site, page)
if not self._needs_browsing(page_headers): if not self._needs_browsing(page_headers):
self.logger.info("needs fetch: %s", page) self.logger.info("needs fetch: %s", page)
@@ -338,12 +339,19 @@ class BrozzlerWorker:
@metrics.brozzler_header_processing_duration_seconds.time() @metrics.brozzler_header_processing_duration_seconds.time()
@metrics.brozzler_in_progress_headers.track_inprogress() @metrics.brozzler_in_progress_headers.track_inprogress()
def _get_page_headers(self, page): def _get_page_headers(self, site, page):
# bypassing warcprox, requests' stream=True defers downloading the body of the response # bypassing warcprox, requests' stream=True defers downloading the body of the response
# see https://docs.python-requests.org/en/latest/user/advanced/#body-content-workflow # see https://docs.python-requests.org/en/latest/user/advanced/#body-content-workflow
try: try:
user_agent = site.get("user_agent")
headers = {"User-Agent": user_agent} if user_agent else {}
self.logger.info("getting page headers for %s", page.url)
with requests.get( with requests.get(
page.url, stream=True, verify=False, timeout=self.HEADER_REQUEST_TIMEOUT page.url,
stream=True,
verify=False,
headers=headers,
timeout=self.HEADER_REQUEST_TIMEOUT,
) as r: ) as r:
return r.headers return r.headers
except requests.exceptions.Timeout as e: except requests.exceptions.Timeout as e:
@@ -504,15 +512,26 @@ class BrozzlerWorker:
"http": "http://%s" % self._proxy_for(site), "http": "http://%s" % self._proxy_for(site),
"https": "http://%s" % self._proxy_for(site), "https": "http://%s" % self._proxy_for(site),
} }
user_agent = site.get("user_agent")
headers = {"User-Agent": user_agent} if user_agent else {}
headers.update(site.extra_headers(page))
self.logger.info("fetching %s", url) self.logger.info("fetching url %s", url)
try: try:
# response is ignored # response is ignored
requests.get( requests.get(
url, proxies=proxies, headers=site.extra_headers(page), verify=False url,
proxies=proxies,
headers=headers,
verify=False,
timeout=self.FETCH_URL_TIMEOUT,
) )
except requests.exceptions.Timeout as e:
self.logger.warning("Timed out fetching %s: %s", page.url, e)
except requests.exceptions.ProxyError as e: except requests.exceptions.ProxyError as e:
raise brozzler.ProxyError("proxy error fetching %s" % url) from e raise brozzler.ProxyError("proxy error fetching %s" % url) from e
except requests.exceptions.RequestException as e:
self.logger.warning("Failed to get headers for %s: %s", page.url, e)
def brozzle_site(self, browser, site): def brozzle_site(self, browser, site):
try: try:

View File

@@ -34,7 +34,7 @@ def find_package_data(package):
setuptools.setup( setuptools.setup(
name="brozzler", name="brozzler",
version="1.6.7.a1", version="1.6.8.a0",
description="Distributed web crawling with browsers", description="Distributed web crawling with browsers",
url="https://github.com/internetarchive/brozzler", url="https://github.com/internetarchive/brozzler",
author="Noah Levitt", author="Noah Levitt",