mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-06-06 14:22:14 -04:00
raise brozzler.ProxyError in case of proxy error fetching robots.txt, doing youtube-dl, or doing raw fetch
This commit is contained in:
parent
349b41ab32
commit
dc43794363
4 changed files with 61 additions and 26 deletions
|
@ -101,14 +101,14 @@ def is_permitted_by_robots(site, url, proxy=None):
|
||||||
result = _robots_cache(site, proxy).allowed(
|
result = _robots_cache(site, proxy).allowed(
|
||||||
url, site.user_agent or "brozzler")
|
url, site.user_agent or "brozzler")
|
||||||
return result
|
return result
|
||||||
except BaseException as e:
|
except Exception as e:
|
||||||
if (isinstance(e, reppy.exceptions.ServerError)
|
if isinstance(e, reppy.exceptions.ServerError) and isinstance(
|
||||||
and isinstance(e.args[0], brozzler.ReachedLimit)) or (
|
e.args[0], brozzler.ReachedLimit):
|
||||||
isinstance(e, reppy.exceptions.ConnectionException)
|
|
||||||
and isinstance(
|
|
||||||
e.args[0], requests.exceptions.ProxyError)):
|
|
||||||
# reppy has wrapped an exception that we want to bubble up
|
|
||||||
raise e.args[0]
|
raise e.args[0]
|
||||||
|
elif hasattr(e, 'args') and isinstance(
|
||||||
|
e.args[0], requests.exceptions.ProxyError):
|
||||||
|
# reppy has wrapped a requests ProxyError; surface it as brozzler.ProxyError
|
||||||
|
raise brozzler.ProxyError(e)
|
||||||
else:
|
else:
|
||||||
if tries_left > 0:
|
if tries_left > 0:
|
||||||
logging.warn(
|
logging.warn(
|
||||||
|
|
|
@ -251,10 +251,17 @@ class BrozzlerWorker:
|
||||||
except BaseException as e:
|
except BaseException as e:
|
||||||
if hasattr(e, "exc_info") and e.exc_info[0] == youtube_dl.utils.UnsupportedError:
|
if hasattr(e, "exc_info") and e.exc_info[0] == youtube_dl.utils.UnsupportedError:
|
||||||
pass
|
pass
|
||||||
elif (hasattr(e, "exc_info") and e.exc_info[0] ==
|
elif (hasattr(e, "exc_info")
|
||||||
urllib.error.HTTPError and hasattr(e.exc_info[1], "code")
|
and e.exc_info[0] == urllib.error.HTTPError
|
||||||
|
and hasattr(e.exc_info[1], "code")
|
||||||
and e.exc_info[1].code == 420):
|
and e.exc_info[1].code == 420):
|
||||||
raise brozzler.ReachedLimit(e.exc_info[1])
|
raise brozzler.ReachedLimit(e.exc_info[1])
|
||||||
|
elif (hasattr(e, 'exc_info')
|
||||||
|
and e.exc_info[0] == urllib.error.URLError
|
||||||
|
and self._proxy_for(site)):
|
||||||
|
# heuristic: treat a connection problem while using a proxy as a proxy error (XXX: unverified assumption)
|
||||||
|
raise brozzler.ProxyError(
|
||||||
|
'youtube-dl hit apparent proxy error', e)
|
||||||
else:
|
else:
|
||||||
raise
|
raise
|
||||||
|
|
||||||
|
@ -285,6 +292,8 @@ class BrozzlerWorker:
|
||||||
raise
|
raise
|
||||||
except brozzler.ShutdownRequested:
|
except brozzler.ShutdownRequested:
|
||||||
raise
|
raise
|
||||||
|
except brozzler.ProxyError:
|
||||||
|
raise
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
if (hasattr(e, 'exc_info') and len(e.exc_info) >= 2
|
if (hasattr(e, 'exc_info') and len(e.exc_info) >= 2
|
||||||
and hasattr(e.exc_info[1], 'code')
|
and hasattr(e.exc_info[1], 'code')
|
||||||
|
@ -294,7 +303,7 @@ class BrozzlerWorker:
|
||||||
e.exc_info[1].code, e.exc_info[1].msg, page.url)
|
e.exc_info[1].code, e.exc_info[1].msg, page.url)
|
||||||
else:
|
else:
|
||||||
self.logger.error(
|
self.logger.error(
|
||||||
"youtube_dl raised exception on %s", page,
|
'youtube_dl raised exception on %s', page,
|
||||||
exc_info=True)
|
exc_info=True)
|
||||||
|
|
||||||
if self._needs_browsing(page, ydl_spy):
|
if self._needs_browsing(page, ydl_spy):
|
||||||
|
@ -379,10 +388,13 @@ class BrozzlerWorker:
|
||||||
}
|
}
|
||||||
|
|
||||||
self.logger.info('fetching %s', page)
|
self.logger.info('fetching %s', page)
|
||||||
# response is ignored
|
try:
|
||||||
requests.get(
|
# response is ignored
|
||||||
page.url, proxies=proxies, headers=site.extra_headers(),
|
requests.get(
|
||||||
verify=False)
|
page.url, proxies=proxies, headers=site.extra_headers(),
|
||||||
|
verify=False)
|
||||||
|
except requests.exceptions.ProxyError as e:
|
||||||
|
raise brozzler.ProxyError(e)
|
||||||
|
|
||||||
def _needs_browsing(self, page, brozzler_spy):
|
def _needs_browsing(self, page, brozzler_spy):
|
||||||
final_bounces = brozzler_spy.final_bounces(page.url)
|
final_bounces = brozzler_spy.final_bounces(page.url)
|
||||||
|
|
2
setup.py
2
setup.py
|
@ -32,7 +32,7 @@ def find_package_data(package):
|
||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='brozzler',
|
name='brozzler',
|
||||||
version='1.1b11.dev232',
|
version='1.1b11.dev233',
|
||||||
description='Distributed web crawling with browsers',
|
description='Distributed web crawling with browsers',
|
||||||
url='https://github.com/internetarchive/brozzler',
|
url='https://github.com/internetarchive/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
|
|
@ -23,11 +23,11 @@ import threading
|
||||||
import os
|
import os
|
||||||
import brozzler
|
import brozzler
|
||||||
import brozzler.chrome
|
import brozzler.chrome
|
||||||
import socket
|
|
||||||
import logging
|
import logging
|
||||||
import yaml
|
import yaml
|
||||||
import datetime
|
import datetime
|
||||||
import requests
|
import requests
|
||||||
|
import tempfile
|
||||||
|
|
||||||
@pytest.fixture(scope='module')
|
@pytest.fixture(scope='module')
|
||||||
def httpd(request):
|
def httpd(request):
|
||||||
|
@ -108,18 +108,41 @@ blocks:
|
||||||
assert site.is_in_scope(
|
assert site.is_in_scope(
|
||||||
'https://www.youtube.com/watch?v=dUIn5OAPS5s', yt_user_page)
|
'https://www.youtube.com/watch?v=dUIn5OAPS5s', yt_user_page)
|
||||||
|
|
||||||
def test_robots_proxy_down(httpd):
|
def test_proxy_down():
|
||||||
'''
|
'''
|
||||||
Test that exception fetching robots.txt bubbles up if proxy is down.
|
Test all fetching scenarios raise `brozzler.ProxyError` when proxy is down.
|
||||||
'''
|
|
||||||
url = 'http://localhost:%s/' % httpd.server_port
|
|
||||||
site = brozzler.Site(None, {'seed':url,'user_agent':'im/a/GoOdbot/yep'})
|
|
||||||
|
|
||||||
sock = socket.socket()
|
This test needs to cover every possible fetch through the proxy other than
|
||||||
sock.bind(('127.0.0.1', 0))
|
fetches from the browser. For that, see test_brozzling.py.
|
||||||
not_listening_proxy = '127.0.0.1:%s' % sock.getsockname()[1]
|
'''
|
||||||
with pytest.raises(requests.exceptions.ProxyError):
|
# nobody listens on port 4 :)
|
||||||
brozzler.is_permitted_by_robots(site, url, proxy=not_listening_proxy)
|
not_listening_proxy = '127.0.0.1:4'
|
||||||
|
|
||||||
|
### binding and not listening produces another type of connection
|
||||||
|
### error, which we could test, but it takes a while
|
||||||
|
# sock = socket.socket()
|
||||||
|
# sock.bind(('127.0.0.1', 0))
|
||||||
|
# not_listening_proxy = '127.0.0.1:%s' % sock.getsockname()[1]
|
||||||
|
|
||||||
|
worker = brozzler.BrozzlerWorker(frontier=None, proxy=not_listening_proxy)
|
||||||
|
|
||||||
|
site = brozzler.Site(None, {'seed':'http://example.com/'})
|
||||||
|
page = brozzler.Page(None, {'url': 'http://example.com/'})
|
||||||
|
|
||||||
|
# robots.txt fetch
|
||||||
|
with pytest.raises(brozzler.ProxyError):
|
||||||
|
brozzler.is_permitted_by_robots(
|
||||||
|
site, 'http://example.com/', proxy=not_listening_proxy)
|
||||||
|
|
||||||
|
# youtube-dl fetch
|
||||||
|
with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
|
||||||
|
ydl = worker._youtube_dl(tempdir, site)
|
||||||
|
with pytest.raises(brozzler.ProxyError):
|
||||||
|
worker._try_youtube_dl(ydl, site, page)
|
||||||
|
|
||||||
|
# raw fetch
|
||||||
|
with pytest.raises(brozzler.ProxyError):
|
||||||
|
worker._fetch_url(site, page)
|
||||||
|
|
||||||
def test_start_stop_backwards_compat():
|
def test_start_stop_backwards_compat():
|
||||||
site = brozzler.Site(None, {'seed': 'http://example.com/'})
|
site = brozzler.Site(None, {'seed': 'http://example.com/'})
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue