Merge pull request #323 from galgeek/bmiller/better_fetch_url_timeout_errors

better error handling for _fetch_url
This commit is contained in:
Barbara Miller 2025-02-14 12:40:04 -08:00 committed by GitHub
commit c63f4296a6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -31,6 +31,8 @@ import io
import socket
import random
import requests
import urllib3
from urllib3.exceptions import TimeoutError, ProxyError
import doublethink
import tempfile
import urlcanon
@ -480,14 +482,16 @@ class BrozzlerWorker:
return outlinks
def _fetch_url(self, site, url=None, page=None):
proxies = None
proxy_url = self._proxy_for(site)
if page:
url = page.url
if self._proxy_for(site):
proxies = {
"http": "http://%s" % self._proxy_for(site),
"https": "http://%s" % self._proxy_for(site),
}
if proxy_url:
http = urllib3.ProxyManager("http://%s" % proxy_url, cert_reqs="CERT_NONE")
else:
http = urllib3.PoolManager()
user_agent = site.get("user_agent")
headers = {"User-Agent": user_agent} if user_agent else {}
headers.update(site.extra_headers(page))
@ -495,19 +499,22 @@ class BrozzlerWorker:
self.logger.info("fetching url %s", url)
try:
# response is ignored
requests.get(
http.request(
"GET",
url,
proxies=proxies,
headers=headers,
verify=False,
timeout=self.FETCH_URL_TIMEOUT,
retries=False,
)
except requests.exceptions.Timeout as e:
self.logger.warning("Timed out fetching %s: %s", page.url, e)
except requests.exceptions.ProxyError as e:
self.logger.info("Completed fetching url %s", url)
except TimeoutError as e:
self.logger.warning("Timed out fetching %s", url)
raise brozzler.PageConnectionError() from e
except ProxyError as e:
raise brozzler.ProxyError("proxy error fetching %s" % url) from e
except requests.exceptions.RequestException as e:
self.logger.warning("Failed to fetch url %s", page.url, e)
except urllib3.exceptions.RequestError as e:
self.logger.warning("Failed to fetch url %s: %s", url, e)
raise brozzler.PageConnectionError() from e
def brozzle_site(self, browser, site):
try: