mirror of https://github.com/internetarchive/brozzler.git

commit 561e0803c6
parent 65de0d2a5f

    requests timeout and user_agent
@@ -272,7 +272,7 @@ class BrozzlerWorker:
         self.logger.info("brozzling {}".format(page))
         outlinks = set()
 
-        page_headers = self._get_page_headers(page)
+        page_headers = self._get_page_headers(site, page)
 
         if not self._needs_browsing(page_headers):
             self.logger.info("needs fetch: %s", page)
@@ -331,13 +331,19 @@ class BrozzlerWorker:
 
     @metrics.brozzler_header_processing_duration_seconds.time()
     @metrics.brozzler_in_progress_headers.track_inprogress()
-    def _get_page_headers(self, page):
+    def _get_page_headers(self, site, page):
         # bypassing warcprox, requests' stream=True defers downloading the body of the response
         # see https://docs.python-requests.org/en/latest/user/advanced/#body-content-workflow
         try:
+            user_agent = site.get("user_agent")
+            headers = {"User-Agent": user_agent} if user_agent else {}
             self.logger.info("getting page headers for %s", page.url)
             with requests.get(
-                page.url, stream=True, verify=False, timeout=self.HEADER_REQUEST_TIMEOUT
+                page.url,
+                stream=True,
+                verify=False,
+                headers=headers,
+                timeout=self.HEADER_REQUEST_TIMEOUT,
             ) as r:
                 return r.headers
         except requests.exceptions.Timeout as e:
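As a standalone illustration of the pattern this hunk introduces, here is a minimal sketch of fetching only the response headers with an optional per-site User-Agent (the function name, timeout value, and example URL are assumptions for illustration, not brozzler's own API):

import requests

HEADER_REQUEST_TIMEOUT = 30  # assumed value; brozzler keeps its own constant on BrozzlerWorker

def get_page_headers(url, user_agent=None):
    # Only send a User-Agent header when one is configured for the site.
    headers = {"User-Agent": user_agent} if user_agent else {}
    # stream=True defers downloading the response body, so this request only
    # pulls the headers; the timeout bounds how long we wait for them.
    with requests.get(
        url,
        stream=True,
        verify=False,
        headers=headers,
        timeout=HEADER_REQUEST_TIMEOUT,
    ) as r:
        return r.headers

page_headers = get_page_headers("https://example.com/", user_agent="mybot/1.0")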
@@ -482,14 +488,17 @@ class BrozzlerWorker:
                 "http": "http://%s" % self._proxy_for(site),
                 "https": "http://%s" % self._proxy_for(site),
             }
+        user_agent = site.get("user_agent")
+        headers = {"User-Agent": user_agent} if user_agent else {}
+        headers.update(site.extra_headers(page))
 
-        self.logger.info("fetching %s", url)
+        self.logger.info("fetching url %s", url)
         try:
             # response is ignored
             requests.get(
                 url,
                 proxies=proxies,
-                headers=site.extra_headers(page),
+                headers=headers,
                 verify=False,
                 timeout=self.FETCH_URL_TIMEOUT,
            )
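The _fetch_url hunk applies the same pattern but also merges the site's extra headers on top of the optional User-Agent; a minimal sketch of that merge order with hypothetical values (build_fetch_headers and the header values are illustrative, not brozzler code):

def build_fetch_headers(user_agent, extra_headers):
    # Start from the per-site User-Agent, if any...
    headers = {"User-Agent": user_agent} if user_agent else {}
    # ...then layer the site's extra headers on top; dict.update overwrites on
    # a key collision, so the extra headers win.
    headers.update(extra_headers)
    return headers

print(build_fetch_headers("mybot/1.0", {"X-Extra": "1"}))
# {'User-Agent': 'mybot/1.0', 'X-Extra': '1'}
print(build_fetch_headers(None, {"X-Extra": "1"}))
# {'X-Extra': '1'}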