mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-07-22 22:40:47 -04:00
Merge branch 'bmiller/better_fetch_url_timeout_errors' into qa
This commit is contained in:
commit
381c94bbcc
1 changed file with 17 additions and 18 deletions
|
@@ -32,6 +32,8 @@ import io
|
||||||
import socket
|
import socket
|
||||||
import random
|
import random
|
||||||
import requests
|
import requests
|
||||||
|
import urllib3
|
||||||
|
from urllib3.exceptions import TimeoutError, ProxyError
|
||||||
import doublethink
|
import doublethink
|
||||||
import tempfile
|
import tempfile
|
||||||
import urlcanon
|
import urlcanon
|
||||||
|
@@ -504,14 +506,16 @@ class BrozzlerWorker:
|
||||||
return outlinks
|
return outlinks
|
||||||
|
|
||||||
def _fetch_url(self, site, url=None, page=None):
|
def _fetch_url(self, site, url=None, page=None):
|
||||||
proxies = None
|
proxy_url = self._proxy_for(site)
|
||||||
|
|
||||||
if page:
|
if page:
|
||||||
url = page.url
|
url = page.url
|
||||||
if self._proxy_for(site):
|
|
||||||
proxies = {
|
if proxy_url:
|
||||||
"http": "http://%s" % self._proxy_for(site),
|
http = urllib3.ProxyManager("https://%s" % proxy_url, cert_reqs="CERT_NONE")
|
||||||
"https": "http://%s" % self._proxy_for(site),
|
else:
|
||||||
}
|
http = urllib3.PoolManager()
|
||||||
|
|
||||||
user_agent = site.get("user_agent")
|
user_agent = site.get("user_agent")
|
||||||
headers = {"User-Agent": user_agent} if user_agent else {}
|
headers = {"User-Agent": user_agent} if user_agent else {}
|
||||||
headers.update(site.extra_headers(page))
|
headers.update(site.extra_headers(page))
|
||||||
|
@@ -519,24 +523,19 @@ class BrozzlerWorker:
|
||||||
self.logger.info("fetching url %s", url)
|
self.logger.info("fetching url %s", url)
|
||||||
try:
|
try:
|
||||||
# response is ignored
|
# response is ignored
|
||||||
requests.get(
|
http.request(
|
||||||
|
"GET",
|
||||||
url,
|
url,
|
||||||
proxies=proxies,
|
|
||||||
headers=headers,
|
headers=headers,
|
||||||
verify=False,
|
|
||||||
timeout=self.FETCH_URL_TIMEOUT,
|
timeout=self.FETCH_URL_TIMEOUT,
|
||||||
|
retries=False
|
||||||
)
|
)
|
||||||
except requests.exceptions.Timeout as e:
|
except TimeoutError as e:
|
||||||
self.logger.warning("Timed out fetching %s", url)
|
self.logger.warning("Timed out fetching %s", url)
|
||||||
if "archive.org" in e:
|
|
||||||
raise brozzler.ProxyError("proxy error fetching %s" % url) from e
|
raise brozzler.ProxyError("proxy error fetching %s" % url) from e
|
||||||
else:
|
except ProxyError as e:
|
||||||
raise brozzler.PageConnectionError(
|
|
||||||
"timeout error fetching %s" % url
|
|
||||||
) from e
|
|
||||||
except requests.exceptions.ProxyError as e:
|
|
||||||
raise brozzler.ProxyError("proxy error fetching %s" % url) from e
|
raise brozzler.ProxyError("proxy error fetching %s" % url) from e
|
||||||
except requests.exceptions.RequestException as e:
|
except urllib3.exceptions.RequestError as e:
|
||||||
self.logger.warning("Failed to fetch url %s", page.url, e)
|
self.logger.warning("Failed to fetch url %s", page.url, e)
|
||||||
|
|
||||||
def brozzle_site(self, browser, site):
|
def brozzle_site(self, browser, site):
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue