mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-23 08:09:48 -05:00
Merge pull request #323 from galgeek/bmiller/better_fetch_url_timeout_errors
better error handling for _fetch_url
This commit is contained in:
commit
c63f4296a6
@ -31,6 +31,8 @@ import io
|
||||
import socket
|
||||
import random
|
||||
import requests
|
||||
import urllib3
|
||||
from urllib3.exceptions import TimeoutError, ProxyError
|
||||
import doublethink
|
||||
import tempfile
|
||||
import urlcanon
|
||||
@ -480,14 +482,16 @@ class BrozzlerWorker:
|
||||
return outlinks
|
||||
|
||||
def _fetch_url(self, site, url=None, page=None):
|
||||
proxies = None
|
||||
proxy_url = self._proxy_for(site)
|
||||
|
||||
if page:
|
||||
url = page.url
|
||||
if self._proxy_for(site):
|
||||
proxies = {
|
||||
"http": "http://%s" % self._proxy_for(site),
|
||||
"https": "http://%s" % self._proxy_for(site),
|
||||
}
|
||||
|
||||
if proxy_url:
|
||||
http = urllib3.ProxyManager("http://%s" % proxy_url, cert_reqs="CERT_NONE")
|
||||
else:
|
||||
http = urllib3.PoolManager()
|
||||
|
||||
user_agent = site.get("user_agent")
|
||||
headers = {"User-Agent": user_agent} if user_agent else {}
|
||||
headers.update(site.extra_headers(page))
|
||||
@ -495,19 +499,22 @@ class BrozzlerWorker:
|
||||
self.logger.info("fetching url %s", url)
|
||||
try:
|
||||
# response is ignored
|
||||
requests.get(
|
||||
http.request(
|
||||
"GET",
|
||||
url,
|
||||
proxies=proxies,
|
||||
headers=headers,
|
||||
verify=False,
|
||||
timeout=self.FETCH_URL_TIMEOUT,
|
||||
retries=False,
|
||||
)
|
||||
except requests.exceptions.Timeout as e:
|
||||
self.logger.warning("Timed out fetching %s: %s", page.url, e)
|
||||
except requests.exceptions.ProxyError as e:
|
||||
self.logger.info("Completed fetching url %s", url)
|
||||
except TimeoutError as e:
|
||||
self.logger.warning("Timed out fetching %s", url)
|
||||
raise brozzler.PageConnectionError() from e
|
||||
except ProxyError as e:
|
||||
raise brozzler.ProxyError("proxy error fetching %s" % url) from e
|
||||
except requests.exceptions.RequestException as e:
|
||||
self.logger.warning("Failed to fetch url %s", page.url, e)
|
||||
except urllib3.exceptions.RequestError as e:
|
||||
self.logger.warning("Failed to fetch url %s: %s", url, e)
|
||||
raise brozzler.PageConnectionError() from e
|
||||
|
||||
def brozzle_site(self, browser, site):
|
||||
try:
|
||||
|
Loading…
x
Reference in New Issue
Block a user