Raise brozzler.ProxyError when a proxy error occurs while fetching robots.txt, running youtube-dl, or doing a raw fetch

This commit is contained in:
Noah Levitt 2017-04-17 18:15:22 -07:00
parent 349b41ab32
commit dc43794363
4 changed files with 61 additions and 26 deletions

View file

@ -101,14 +101,14 @@ def is_permitted_by_robots(site, url, proxy=None):
result = _robots_cache(site, proxy).allowed( result = _robots_cache(site, proxy).allowed(
url, site.user_agent or "brozzler") url, site.user_agent or "brozzler")
return result return result
except BaseException as e: except Exception as e:
if (isinstance(e, reppy.exceptions.ServerError) if isinstance(e, reppy.exceptions.ServerError) and isinstance(
and isinstance(e.args[0], brozzler.ReachedLimit)) or ( e.args[0], brozzler.ReachedLimit):
isinstance(e, reppy.exceptions.ConnectionException)
and isinstance(
e.args[0], requests.exceptions.ProxyError)):
# reppy has wrapped an exception that we want to bubble up
raise e.args[0] raise e.args[0]
elif hasattr(e, 'args') and isinstance(
e.args[0], requests.exceptions.ProxyError):
# reppy has wrapped an exception that we want to bubble up
raise brozzler.ProxyError(e)
else: else:
if tries_left > 0: if tries_left > 0:
logging.warn( logging.warn(

View file

@ -251,10 +251,17 @@ class BrozzlerWorker:
except BaseException as e: except BaseException as e:
if hasattr(e, "exc_info") and e.exc_info[0] == youtube_dl.utils.UnsupportedError: if hasattr(e, "exc_info") and e.exc_info[0] == youtube_dl.utils.UnsupportedError:
pass pass
elif (hasattr(e, "exc_info") and e.exc_info[0] == elif (hasattr(e, "exc_info")
urllib.error.HTTPError and hasattr(e.exc_info[1], "code") and e.exc_info[0] == urllib.error.HTTPError
and hasattr(e.exc_info[1], "code")
and e.exc_info[1].code == 420): and e.exc_info[1].code == 420):
raise brozzler.ReachedLimit(e.exc_info[1]) raise brozzler.ReachedLimit(e.exc_info[1])
elif (hasattr(e, 'exc_info')
and e.exc_info[0] == urllib.error.URLError
and self._proxy_for(site)):
# connection problem when using a proxy == proxy error (XXX?)
raise brozzler.ProxyError(
'youtube-dl hit apparent proxy error', e)
else: else:
raise raise
@ -285,6 +292,8 @@ class BrozzlerWorker:
raise raise
except brozzler.ShutdownRequested: except brozzler.ShutdownRequested:
raise raise
except brozzler.ProxyError:
raise
except Exception as e: except Exception as e:
if (hasattr(e, 'exc_info') and len(e.exc_info) >= 2 if (hasattr(e, 'exc_info') and len(e.exc_info) >= 2
and hasattr(e.exc_info[1], 'code') and hasattr(e.exc_info[1], 'code')
@ -294,7 +303,7 @@ class BrozzlerWorker:
e.exc_info[1].code, e.exc_info[1].msg, page.url) e.exc_info[1].code, e.exc_info[1].msg, page.url)
else: else:
self.logger.error( self.logger.error(
"youtube_dl raised exception on %s", page, 'youtube_dl raised exception on %s', page,
exc_info=True) exc_info=True)
if self._needs_browsing(page, ydl_spy): if self._needs_browsing(page, ydl_spy):
@ -379,10 +388,13 @@ class BrozzlerWorker:
} }
self.logger.info('fetching %s', page) self.logger.info('fetching %s', page)
# response is ignored try:
requests.get( # response is ignored
page.url, proxies=proxies, headers=site.extra_headers(), requests.get(
verify=False) page.url, proxies=proxies, headers=site.extra_headers(),
verify=False)
except requests.exceptions.ProxyError as e:
raise brozzler.ProxyError(e)
def _needs_browsing(self, page, brozzler_spy): def _needs_browsing(self, page, brozzler_spy):
final_bounces = brozzler_spy.final_bounces(page.url) final_bounces = brozzler_spy.final_bounces(page.url)

View file

@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup( setuptools.setup(
name='brozzler', name='brozzler',
version='1.1b11.dev232', version='1.1b11.dev233',
description='Distributed web crawling with browsers', description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler', url='https://github.com/internetarchive/brozzler',
author='Noah Levitt', author='Noah Levitt',

View file

@ -23,11 +23,11 @@ import threading
import os import os
import brozzler import brozzler
import brozzler.chrome import brozzler.chrome
import socket
import logging import logging
import yaml import yaml
import datetime import datetime
import requests import requests
import tempfile
@pytest.fixture(scope='module') @pytest.fixture(scope='module')
def httpd(request): def httpd(request):
@ -108,18 +108,41 @@ blocks:
assert site.is_in_scope( assert site.is_in_scope(
'https://www.youtube.com/watch?v=dUIn5OAPS5s', yt_user_page) 'https://www.youtube.com/watch?v=dUIn5OAPS5s', yt_user_page)
def test_robots_proxy_down(httpd): def test_proxy_down():
''' '''
Test that exception fetching robots.txt bubbles up if proxy is down. Test all fetching scenarios raise `brozzler.ProxyError` when proxy is down.
'''
url = 'http://localhost:%s/' % httpd.server_port
site = brozzler.Site(None, {'seed':url,'user_agent':'im/a/GoOdbot/yep'})
sock = socket.socket() This test needs to cover every possible fetch through the proxy other than
sock.bind(('127.0.0.1', 0)) fetches from the browser. For that, see test_brozzling.py.
not_listening_proxy = '127.0.0.1:%s' % sock.getsockname()[1] '''
with pytest.raises(requests.exceptions.ProxyError): # nobody listens on port 4 :)
brozzler.is_permitted_by_robots(site, url, proxy=not_listening_proxy) not_listening_proxy = '127.0.0.1:4'
### binding and not listening produces another type of connection
### error, which we could test, but it takes a while
# sock = socket.socket()
# sock.bind(('127.0.0.1', 0))
# not_listening_proxy = '127.0.0.1:%s' % sock.getsockname()[1]
worker = brozzler.BrozzlerWorker(frontier=None, proxy=not_listening_proxy)
site = brozzler.Site(None, {'seed':'http://example.com/'})
page = brozzler.Page(None, {'url': 'http://example.com/'})
# robots.txt fetch
with pytest.raises(brozzler.ProxyError):
brozzler.is_permitted_by_robots(
site, 'http://example.com/', proxy=not_listening_proxy)
# youtube-dl fetch
with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
ydl = worker._youtube_dl(tempdir, site)
with pytest.raises(brozzler.ProxyError):
worker._try_youtube_dl(ydl, site, page)
# raw fetch
with pytest.raises(brozzler.ProxyError):
worker._fetch_url(site, page)
def test_start_stop_backwards_compat(): def test_start_stop_backwards_compat():
site = brozzler.Site(None, {'seed': 'http://example.com/'}) site = brozzler.Site(None, {'seed': 'http://example.com/'})