mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-23 08:09:48 -05:00
Merge pull request #323 from galgeek/bmiller/better_fetch_url_timeout_errors
better error handling for _fetch_url
This commit is contained in:
commit
c63f4296a6
@ -31,6 +31,8 @@ import io
|
|||||||
import socket
|
import socket
|
||||||
import random
|
import random
|
||||||
import requests
|
import requests
|
||||||
|
import urllib3
|
||||||
|
from urllib3.exceptions import TimeoutError, ProxyError
|
||||||
import doublethink
|
import doublethink
|
||||||
import tempfile
|
import tempfile
|
||||||
import urlcanon
|
import urlcanon
|
||||||
@ -480,14 +482,16 @@ class BrozzlerWorker:
|
|||||||
return outlinks
|
return outlinks
|
||||||
|
|
||||||
def _fetch_url(self, site, url=None, page=None):
|
def _fetch_url(self, site, url=None, page=None):
|
||||||
proxies = None
|
proxy_url = self._proxy_for(site)
|
||||||
|
|
||||||
if page:
|
if page:
|
||||||
url = page.url
|
url = page.url
|
||||||
if self._proxy_for(site):
|
|
||||||
proxies = {
|
if proxy_url:
|
||||||
"http": "http://%s" % self._proxy_for(site),
|
http = urllib3.ProxyManager("http://%s" % proxy_url, cert_reqs="CERT_NONE")
|
||||||
"https": "http://%s" % self._proxy_for(site),
|
else:
|
||||||
}
|
http = urllib3.PoolManager()
|
||||||
|
|
||||||
user_agent = site.get("user_agent")
|
user_agent = site.get("user_agent")
|
||||||
headers = {"User-Agent": user_agent} if user_agent else {}
|
headers = {"User-Agent": user_agent} if user_agent else {}
|
||||||
headers.update(site.extra_headers(page))
|
headers.update(site.extra_headers(page))
|
||||||
@ -495,19 +499,22 @@ class BrozzlerWorker:
|
|||||||
self.logger.info("fetching url %s", url)
|
self.logger.info("fetching url %s", url)
|
||||||
try:
|
try:
|
||||||
# response is ignored
|
# response is ignored
|
||||||
requests.get(
|
http.request(
|
||||||
|
"GET",
|
||||||
url,
|
url,
|
||||||
proxies=proxies,
|
|
||||||
headers=headers,
|
headers=headers,
|
||||||
verify=False,
|
|
||||||
timeout=self.FETCH_URL_TIMEOUT,
|
timeout=self.FETCH_URL_TIMEOUT,
|
||||||
|
retries=False,
|
||||||
)
|
)
|
||||||
except requests.exceptions.Timeout as e:
|
self.logger.info("Completed fetching url %s", url)
|
||||||
self.logger.warning("Timed out fetching %s: %s", page.url, e)
|
except TimeoutError as e:
|
||||||
except requests.exceptions.ProxyError as e:
|
self.logger.warning("Timed out fetching %s", url)
|
||||||
|
raise brozzler.PageConnectionError() from e
|
||||||
|
except ProxyError as e:
|
||||||
raise brozzler.ProxyError("proxy error fetching %s" % url) from e
|
raise brozzler.ProxyError("proxy error fetching %s" % url) from e
|
||||||
except requests.exceptions.RequestException as e:
|
except urllib3.exceptions.RequestError as e:
|
||||||
self.logger.warning("Failed to fetch url %s", page.url, e)
|
self.logger.warning("Failed to fetch url %s: %s", url, e)
|
||||||
|
raise brozzler.PageConnectionError() from e
|
||||||
|
|
||||||
def brozzle_site(self, browser, site):
|
def brozzle_site(self, browser, site):
|
||||||
try:
|
try:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user