From 53a1869def85bca6d4e6f7f9f5b38c5a142b43cc Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Thu, 13 Feb 2025 14:21:24 -0800 Subject: [PATCH 1/8] better error handling for _fetch_url --- brozzler/worker.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/brozzler/worker.py b/brozzler/worker.py index be1b196..f90bbd5 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -503,7 +503,11 @@ class BrozzlerWorker: timeout=self.FETCH_URL_TIMEOUT, ) except requests.exceptions.Timeout as e: - self.logger.warning("Timed out fetching %s: %s", page.url, e) + self.logger.warning("Timed out fetching %s", url) + if 'archive.org' in e: + raise brozzler.ProxyError("proxy error fetching %s" % url) from e + else: + raise brozzler.PageConnectionError(timeout error fetching %s" % url) from e except requests.exceptions.ProxyError as e: raise brozzler.ProxyError("proxy error fetching %s" % url) from e except requests.exceptions.RequestException as e: From 2c9c040b841940c89d505d88482b8851ff167f2e Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Thu, 13 Feb 2025 14:24:27 -0800 Subject: [PATCH 2/8] black'd --- brozzler/worker.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/brozzler/worker.py b/brozzler/worker.py index f90bbd5..9a1d64d 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -504,10 +504,12 @@ class BrozzlerWorker: ) except requests.exceptions.Timeout as e: self.logger.warning("Timed out fetching %s", url) - if 'archive.org' in e: + if "archive.org" in e: raise brozzler.ProxyError("proxy error fetching %s" % url) from e else: - raise brozzler.PageConnectionError(timeout error fetching %s" % url) from e + raise brozzler.PageConnectionError( + "timeout error fetching %s" % url + ) from e except requests.exceptions.ProxyError as e: raise brozzler.ProxyError("proxy error fetching %s" % url) from e except requests.exceptions.RequestException as e: From 4af48be6ca8d77d5ca588c5679d6d6e3e452b87c Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Thu, 13 Feb 2025 16:11:22 -0800 Subject: [PATCH 3/8] use urllib3 --- brozzler/worker.py | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/brozzler/worker.py b/brozzler/worker.py index 9a1d64d..cd4671b 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -31,6 +31,8 @@ import io import socket import random import requests +import urllib3 +from urllib3.exceptions import TimeoutError, ProxyError import doublethink import tempfile import urlcanon @@ -480,14 +482,16 @@ class BrozzlerWorker: return outlinks def _fetch_url(self, site, url=None, page=None): - proxies = None + proxy_url = self._proxy_for(site) + if page: url = page.url - if self._proxy_for(site): - proxies = { - "http": "http://%s" % self._proxy_for(site), - "https": "http://%s" % self._proxy_for(site), - } + + if proxy_url: + http = urllib3.ProxyManager("https://%s" % proxy_url) + else: + http = urllib3.PoolManager() + user_agent = site.get("user_agent") headers = {"User-Agent": user_agent} if user_agent else {} headers.update(site.extra_headers(page)) @@ -495,24 +499,19 @@ class BrozzlerWorker: self.logger.info("fetching url %s", url) try: # response is ignored - requests.get( + http.request( + "GET", url, - proxies=proxies, headers=headers, - verify=False, timeout=self.FETCH_URL_TIMEOUT, + retries=False ) - except requests.exceptions.Timeout as e: + except TimeoutError as e: self.logger.warning("Timed out fetching %s", url) - if "archive.org" in e: - raise brozzler.ProxyError("proxy error fetching %s" % url) from e - else: - raise brozzler.PageConnectionError( - "timeout error fetching %s" % url - ) from e - except requests.exceptions.ProxyError as e: raise brozzler.ProxyError("proxy error fetching %s" % url) from e - except requests.exceptions.RequestException as e: + except ProxyError as e: + raise brozzler.ProxyError("proxy error fetching %s" % url) from e + except urllib3.exceptions.RequestError as e: self.logger.warning("Failed to fetch url %s", page.url, e) def brozzle_site(self, browser, site): From 9dca20023090aa897c16480590a0dc2aa6edc2e1 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Thu, 13 Feb 2025 16:27:05 -0800 Subject: [PATCH 4/8] cert_reqs="CERT_NONE" --- brozzler/worker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/brozzler/worker.py b/brozzler/worker.py index cd4671b..cd065f3 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -488,7 +488,7 @@ class BrozzlerWorker: url = page.url if proxy_url: - http = urllib3.ProxyManager("https://%s" % proxy_url) + http = urllib3.ProxyManager("https://%s" % proxy_url, cert_reqs="CERT_NONE") else: http = urllib3.PoolManager() From 819a483227d03fab0ffca9fc00bb6c65940c59a3 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Thu, 13 Feb 2025 17:55:36 -0800 Subject: [PATCH 5/8] black'd --- brozzler/worker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/brozzler/worker.py b/brozzler/worker.py index cd065f3..0f1e72b 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -504,7 +504,7 @@ class BrozzlerWorker: url, headers=headers, timeout=self.FETCH_URL_TIMEOUT, - retries=False + retries=False, ) except TimeoutError as e: self.logger.warning("Timed out fetching %s", url) From 732a7943f0275e917ad037b5b8cc2c08855da827 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Thu, 13 Feb 2025 17:57:53 -0800 Subject: [PATCH 6/8] http, not https, maybe --- brozzler/worker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/brozzler/worker.py b/brozzler/worker.py index 0f1e72b..a6586bf 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -488,7 +488,7 @@ class BrozzlerWorker: url = page.url if proxy_url: - http = urllib3.ProxyManager("https://%s" % proxy_url, cert_reqs="CERT_NONE") + http = urllib3.ProxyManager("http://%s" % proxy_url, cert_reqs="CERT_NONE") else: http = urllib3.PoolManager() From ba7031f2da86075440db9f7c145eb16ec27e4989 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Fri, 14 Feb 2025 09:39:41 -0800 Subject: [PATCH 7/8] better exceptions for fetch_url --- brozzler/worker.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/brozzler/worker.py b/brozzler/worker.py index a6586bf..619ef35 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -508,11 +508,12 @@ class BrozzlerWorker: ) except TimeoutError as e: self.logger.warning("Timed out fetching %s", url) - raise brozzler.ProxyError("proxy error fetching %s" % url) from e + raise brozzler.PageConnectionError() from e except ProxyError as e: raise brozzler.ProxyError("proxy error fetching %s" % url) from e except urllib3.exceptions.RequestError as e: - self.logger.warning("Failed to fetch url %s", page.url, e) + self.logger.warning("Failed to fetch url %s: %s", url, e) + raise brozzler.PageConnectionError() from e def brozzle_site(self, browser, site): try: From 71ffbddfeb34480466a5c5ff55d16e05dd3e4dfc Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Fri, 14 Feb 2025 10:38:24 -0800 Subject: [PATCH 8/8] log _fetch_url completion --- brozzler/worker.py | 1 + 1 file changed, 1 insertion(+) diff --git a/brozzler/worker.py b/brozzler/worker.py index 619ef35..ad1a993 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -506,6 +506,7 @@ class BrozzlerWorker: timeout=self.FETCH_URL_TIMEOUT, retries=False, ) + self.logger.info("Completed fetching url %s", url) except TimeoutError as e: self.logger.warning("Timed out fetching %s", url) raise brozzler.PageConnectionError() from e