mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-23 08:09:48 -05:00
requests timeout and user_agent
This commit is contained in:
parent
65de0d2a5f
commit
561e0803c6
@ -272,7 +272,7 @@ class BrozzlerWorker:
|
|||||||
self.logger.info("brozzling {}".format(page))
|
self.logger.info("brozzling {}".format(page))
|
||||||
outlinks = set()
|
outlinks = set()
|
||||||
|
|
||||||
page_headers = self._get_page_headers(page)
|
page_headers = self._get_page_headers(site, page)
|
||||||
|
|
||||||
if not self._needs_browsing(page_headers):
|
if not self._needs_browsing(page_headers):
|
||||||
self.logger.info("needs fetch: %s", page)
|
self.logger.info("needs fetch: %s", page)
|
||||||
@ -331,13 +331,19 @@ class BrozzlerWorker:
|
|||||||
|
|
||||||
@metrics.brozzler_header_processing_duration_seconds.time()
|
@metrics.brozzler_header_processing_duration_seconds.time()
|
||||||
@metrics.brozzler_in_progress_headers.track_inprogress()
|
@metrics.brozzler_in_progress_headers.track_inprogress()
|
||||||
def _get_page_headers(self, page):
|
def _get_page_headers(self, site, page):
|
||||||
# bypassing warcprox, requests' stream=True defers downloading the body of the response
|
# bypassing warcprox, requests' stream=True defers downloading the body of the response
|
||||||
# see https://docs.python-requests.org/en/latest/user/advanced/#body-content-workflow
|
# see https://docs.python-requests.org/en/latest/user/advanced/#body-content-workflow
|
||||||
try:
|
try:
|
||||||
|
user_agent = site.get("user_agent")
|
||||||
|
headers = {"User-Agent": user_agent} if user_agent else {}
|
||||||
self.logger.info("getting page headers for %s", page.url)
|
self.logger.info("getting page headers for %s", page.url)
|
||||||
with requests.get(
|
with requests.get(
|
||||||
page.url, stream=True, verify=False, timeout=self.HEADER_REQUEST_TIMEOUT
|
page.url,
|
||||||
|
stream=True,
|
||||||
|
verify=False,
|
||||||
|
headers=headers,
|
||||||
|
timeout=self.HEADER_REQUEST_TIMEOUT,
|
||||||
) as r:
|
) as r:
|
||||||
return r.headers
|
return r.headers
|
||||||
except requests.exceptions.Timeout as e:
|
except requests.exceptions.Timeout as e:
|
||||||
@ -482,14 +488,17 @@ class BrozzlerWorker:
|
|||||||
"http": "http://%s" % self._proxy_for(site),
|
"http": "http://%s" % self._proxy_for(site),
|
||||||
"https": "http://%s" % self._proxy_for(site),
|
"https": "http://%s" % self._proxy_for(site),
|
||||||
}
|
}
|
||||||
|
user_agent = site.get("user_agent")
|
||||||
|
headers = {"User-Agent": user_agent} if user_agent else {}
|
||||||
|
headers.update(site.extra_headers(page))
|
||||||
|
|
||||||
self.logger.info("fetching %s", url)
|
self.logger.info("fetching url %s", url)
|
||||||
try:
|
try:
|
||||||
# response is ignored
|
# response is ignored
|
||||||
requests.get(
|
requests.get(
|
||||||
url,
|
url,
|
||||||
proxies=proxies,
|
proxies=proxies,
|
||||||
headers=site.extra_headers(page),
|
headers=headers,
|
||||||
verify=False,
|
verify=False,
|
||||||
timeout=self.FETCH_URL_TIMEOUT,
|
timeout=self.FETCH_URL_TIMEOUT,
|
||||||
)
|
)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user