def _fetch_url(self, site, url=None, page=None):
    """Issue a single GET request for a URL; the response is discarded.

    Args:
        site: site object. Supplies the optional ``user_agent`` value and
            extra request headers (``site.extra_headers(page)``), and is
            passed to ``self._proxy_for`` to look up a proxy.
        url: URL to fetch. Ignored when ``page`` is truthy.
        page: optional page object; when truthy, ``page.url`` is fetched
            instead of ``url``.

    Raises:
        brozzler.ProxyError: on a urllib3 timeout or a urllib3 proxy error.

    Any other urllib3 error is logged as a warning and swallowed, so the
    fetch is best-effort for non-proxy failures.
    """
    # Function-scope import keeps this method self-contained and avoids a
    # module-level ``from urllib3.exceptions import TimeoutError``, which
    # would shadow the builtin ``TimeoutError``.
    import urllib3

    proxy_url = self._proxy_for(site)
    if page:
        url = page.url

    user_agent = site.get("user_agent")
    headers = {"User-Agent": user_agent} if user_agent else {}
    headers.update(site.extra_headers(page))

    if proxy_url:
        # NOTE(review): the requests-based code this replaces addressed the
        # proxy as "http://%s"; "https://" makes urllib3 speak TLS to the
        # proxy itself -- confirm the proxy actually accepts TLS.
        manager = urllib3.ProxyManager(
            "https://%s" % proxy_url, cert_reqs="CERT_NONE"
        )
    else:
        # NOTE(review): the replaced code passed verify=False for every
        # request; here certificates are only unverified on the proxy
        # path -- confirm that direct fetches should verify.
        manager = urllib3.PoolManager()

    self.logger.info("fetching url %s", url)
    try:
        # The context manager clears the manager's connection pools on
        # exit, so the per-call manager does not leak sockets.
        with manager as http:
            # response is ignored
            http.request(
                "GET",
                url,
                headers=headers,
                timeout=self.FETCH_URL_TIMEOUT,
                retries=False,
            )
    except urllib3.exceptions.TimeoutError as e:
        self.logger.warning("Timed out fetching %s", url)
        raise brozzler.ProxyError("proxy error fetching %s" % url) from e
    except urllib3.exceptions.ProxyError as e:
        raise brozzler.ProxyError("proxy error fetching %s" % url) from e
    except urllib3.exceptions.HTTPError as e:
        # With retries=False urllib3 raises the underlying error directly
        # (e.g. ProtocolError, SSLError), not all of which derive from
        # RequestError; HTTPError is the common base of urllib3 errors.
        # Log ``url`` rather than ``page.url``: ``page`` may be None.
        self.logger.warning("Failed to fetch url %s: %s", url, e)