diff --git a/brozzler/worker.py b/brozzler/worker.py index a73129b..022544b 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -272,7 +272,7 @@ class BrozzlerWorker: self.logger.info("brozzling {}".format(page)) outlinks = set() - page_headers = self._get_page_headers(page) + page_headers = self._get_page_headers(site, page) if not self._needs_browsing(page_headers): self.logger.info("needs fetch: %s", page) @@ -331,13 +331,19 @@ class BrozzlerWorker: @metrics.brozzler_header_processing_duration_seconds.time() @metrics.brozzler_in_progress_headers.track_inprogress() - def _get_page_headers(self, page): + def _get_page_headers(self, site, page): # bypassing warcprox, requests' stream=True defers downloading the body of the response # see https://docs.python-requests.org/en/latest/user/advanced/#body-content-workflow try: + user_agent = site.get("user_agent") + headers = {"User-Agent": user_agent} if user_agent else {} self.logger.info("getting page headers for %s", page.url) with requests.get( - page.url, stream=True, verify=False, timeout=self.HEADER_REQUEST_TIMEOUT + page.url, + stream=True, + verify=False, + headers=headers, + timeout=self.HEADER_REQUEST_TIMEOUT, ) as r: return r.headers except requests.exceptions.Timeout as e: @@ -482,14 +488,17 @@ class BrozzlerWorker: "http": "http://%s" % self._proxy_for(site), "https": "http://%s" % self._proxy_for(site), } + user_agent = site.get("user_agent") + headers = {"User-Agent": user_agent} if user_agent else {} + headers.update(site.extra_headers(page)) - self.logger.info("fetching %s", url) + self.logger.info("fetching url %s", url) try: # response is ignored requests.get( url, proxies=proxies, - headers=site.extra_headers(page), + headers=headers, verify=False, timeout=self.FETCH_URL_TIMEOUT, )