requests timeout and user_agent

This commit is contained in:
Barbara Miller 2025-02-11 12:27:50 -08:00
parent 65de0d2a5f
commit 561e0803c6

View File

@ -272,7 +272,7 @@ class BrozzlerWorker:
self.logger.info("brozzling {}".format(page))
outlinks = set()
page_headers = self._get_page_headers(page)
page_headers = self._get_page_headers(site, page)
if not self._needs_browsing(page_headers):
self.logger.info("needs fetch: %s", page)
@ -331,13 +331,19 @@ class BrozzlerWorker:
@metrics.brozzler_header_processing_duration_seconds.time()
@metrics.brozzler_in_progress_headers.track_inprogress()
def _get_page_headers(self, page):
def _get_page_headers(self, site, page):
# bypassing warcprox, requests' stream=True defers downloading the body of the response
# see https://docs.python-requests.org/en/latest/user/advanced/#body-content-workflow
try:
user_agent = site.get("user_agent")
headers = {"User-Agent": user_agent} if user_agent else {}
self.logger.info("getting page headers for %s", page.url)
with requests.get(
page.url, stream=True, verify=False, timeout=self.HEADER_REQUEST_TIMEOUT
page.url,
stream=True,
verify=False,
headers=headers,
timeout=self.HEADER_REQUEST_TIMEOUT,
) as r:
return r.headers
except requests.exceptions.Timeout as e:
@ -482,14 +488,17 @@ class BrozzlerWorker:
"http": "http://%s" % self._proxy_for(site),
"https": "http://%s" % self._proxy_for(site),
}
user_agent = site.get("user_agent")
headers = {"User-Agent": user_agent} if user_agent else {}
headers.update(site.extra_headers(page))
self.logger.info("fetching %s", url)
self.logger.info("fetching url %s", url)
try:
# response is ignored
requests.get(
url,
proxies=proxies,
headers=site.extra_headers(page),
headers=headers,
verify=False,
timeout=self.FETCH_URL_TIMEOUT,
)