Merge branch 'bmiller/better_fetch_url_timeout_errors' into qa

This commit is contained in:
Barbara Miller 2025-02-13 16:33:32 -08:00
commit 381c94bbcc

View file

@ -32,6 +32,8 @@ import io
import socket import socket
import random import random
import requests import requests
import urllib3
from urllib3.exceptions import TimeoutError, ProxyError
import doublethink import doublethink
import tempfile import tempfile
import urlcanon import urlcanon
@ -504,14 +506,16 @@ class BrozzlerWorker:
return outlinks return outlinks
def _fetch_url(self, site, url=None, page=None): def _fetch_url(self, site, url=None, page=None):
proxies = None proxy_url = self._proxy_for(site)
if page: if page:
url = page.url url = page.url
if self._proxy_for(site):
proxies = { if proxy_url:
"http": "http://%s" % self._proxy_for(site), http = urllib3.ProxyManager("https://%s" % proxy_url, cert_reqs="CERT_NONE")
"https": "http://%s" % self._proxy_for(site), else:
} http = urllib3.PoolManager()
user_agent = site.get("user_agent") user_agent = site.get("user_agent")
headers = {"User-Agent": user_agent} if user_agent else {} headers = {"User-Agent": user_agent} if user_agent else {}
headers.update(site.extra_headers(page)) headers.update(site.extra_headers(page))
@ -519,24 +523,19 @@ class BrozzlerWorker:
self.logger.info("fetching url %s", url) self.logger.info("fetching url %s", url)
try: try:
# response is ignored # response is ignored
requests.get( http.request(
"GET",
url, url,
proxies=proxies,
headers=headers, headers=headers,
verify=False,
timeout=self.FETCH_URL_TIMEOUT, timeout=self.FETCH_URL_TIMEOUT,
retries=False
) )
except requests.exceptions.Timeout as e: except TimeoutError as e:
self.logger.warning("Timed out fetching %s", url) self.logger.warning("Timed out fetching %s", url)
if "archive.org" in e:
raise brozzler.ProxyError("proxy error fetching %s" % url) from e raise brozzler.ProxyError("proxy error fetching %s" % url) from e
else: except ProxyError as e:
raise brozzler.PageConnectionError(
"timeout error fetching %s" % url
) from e
except requests.exceptions.ProxyError as e:
raise brozzler.ProxyError("proxy error fetching %s" % url) from e raise brozzler.ProxyError("proxy error fetching %s" % url) from e
except requests.exceptions.RequestException as e: except urllib3.exceptions.RequestError as e:
self.logger.warning("Failed to fetch url %s", page.url, e) self.logger.warning("Failed to fetch url %s", page.url, e)
def brozzle_site(self, browser, site): def brozzle_site(self, browser, site):