Merge pull request #323 from galgeek/bmiller/better_fetch_url_timeout_errors

better error handling for _fetch_url
This commit is contained in:
Barbara Miller 2025-02-14 12:40:04 -08:00 committed by GitHub
commit c63f4296a6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -31,6 +31,8 @@ import io
import socket import socket
import random import random
import requests import requests
import urllib3
from urllib3.exceptions import TimeoutError, ProxyError
import doublethink import doublethink
import tempfile import tempfile
import urlcanon import urlcanon
@@ -480,14 +482,16 @@ class BrozzlerWorker:
return outlinks return outlinks
def _fetch_url(self, site, url=None, page=None): def _fetch_url(self, site, url=None, page=None):
proxies = None proxy_url = self._proxy_for(site)
if page: if page:
url = page.url url = page.url
if self._proxy_for(site):
proxies = { if proxy_url:
"http": "http://%s" % self._proxy_for(site), http = urllib3.ProxyManager("http://%s" % proxy_url, cert_reqs="CERT_NONE")
"https": "http://%s" % self._proxy_for(site), else:
} http = urllib3.PoolManager()
user_agent = site.get("user_agent") user_agent = site.get("user_agent")
headers = {"User-Agent": user_agent} if user_agent else {} headers = {"User-Agent": user_agent} if user_agent else {}
headers.update(site.extra_headers(page)) headers.update(site.extra_headers(page))
@@ -495,19 +499,22 @@ class BrozzlerWorker:
self.logger.info("fetching url %s", url) self.logger.info("fetching url %s", url)
try: try:
# response is ignored # response is ignored
requests.get( http.request(
"GET",
url, url,
proxies=proxies,
headers=headers, headers=headers,
verify=False,
timeout=self.FETCH_URL_TIMEOUT, timeout=self.FETCH_URL_TIMEOUT,
retries=False,
) )
except requests.exceptions.Timeout as e: self.logger.info("Completed fetching url %s", url)
self.logger.warning("Timed out fetching %s: %s", page.url, e) except TimeoutError as e:
except requests.exceptions.ProxyError as e: self.logger.warning("Timed out fetching %s", url)
raise brozzler.PageConnectionError() from e
except ProxyError as e:
raise brozzler.ProxyError("proxy error fetching %s" % url) from e raise brozzler.ProxyError("proxy error fetching %s" % url) from e
except requests.exceptions.RequestException as e: except urllib3.exceptions.RequestError as e:
self.logger.warning("Failed to fetch url %s", page.url, e) self.logger.warning("Failed to fetch url %s: %s", url, e)
raise brozzler.PageConnectionError() from e
def brozzle_site(self, browser, site): def brozzle_site(self, browser, site):
try: try: