Merge branch 'master' into qa

* master:
  raise brozzler.ProxyError in case of proxy error fetching robots.txt, doing youtube-dl, or doing raw fetch
  raise new exception brozzler.ProxyError in case of proxy error browsing a page
  make brozzle-page respect --proxy (no test for this!)
  oops, version bump for previous commit
  bubble up proxy errors fetching robots.txt, with unit test, and documentation
Noah Levitt 2017-04-17 18:15:32 -07:00
commit 6844cb5bcb
8 changed files with 126 additions and 15 deletions

brozzler/__init__.py

@@ -29,6 +29,9 @@ class NothingToClaim(Exception):
 class CrawlStopped(Exception):
     pass
 
+class ProxyError(Exception):
+    pass
+
 class ReachedLimit(Exception):
     def __init__(self, http_error=None, warcprox_meta=None, http_payload=None):
         import json
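
The new exception class gives callers a way to tell a proxy outage apart from an ordinary crawl failure. A minimal sketch of the distinction (not part of the commit; assumes worker, browser, site, and page objects like the ones constructed in the tests further down):

    import logging
    import brozzler

    try:
        worker.brozzle_page(browser, site, page)
    except brozzler.ProxyError:
        # proxy is down or unreachable: back off and retry later,
        # rather than recording the page as permanently failed
        logging.warning('proxy error brozzling %s, will retry', page.url)
    except brozzler.ReachedLimit:
        # warcprox responded 420 Reached Limit: stop crawling this site
        logging.info('reached limit on site %s', site.seed)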

brozzler/browser.py

@@ -239,6 +239,10 @@ class WebsockReceiverThread(threading.Thread):
                     message['params']['message']['text'])
             elif message['method'] == 'Page.javascriptDialogOpening':
                 self._javascript_dialog_opening(message)
+            elif (message['method'] == 'Network.loadingFailed'
+                    and 'params' in message and 'errorText' in message['params']
+                    and message['params']['errorText'] == 'net::ERR_PROXY_CONNECTION_FAILED'):
+                brozzler.thread_raise(self.calling_thread, brozzler.ProxyError)
             # else:
             #     self.logger.debug("%s %s", message["method"], json_message)
         elif 'result' in message:
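
Worth noting: the websocket receiver runs on its own thread, so it cannot simply raise into the browsing code; brozzler.thread_raise injects the exception into the thread driving the browse. A hedged sketch of how such a cross-thread raise can be done in CPython (brozzler's actual implementation may differ in details):

    import ctypes

    def thread_raise(thread, exctype):
        # ask the interpreter to raise exctype asynchronously in `thread`;
        # PyThreadState_SetAsyncExc returns the number of thread states modified
        ret = ctypes.pythonapi.PyThreadState_SetAsyncExc(
                ctypes.c_long(thread.ident), ctypes.py_object(exctype))
        if ret == 0:
            raise ValueError('invalid thread id %s' % thread.ident)
        elif ret > 1:
            # affected more than one thread state: undo and complain
            ctypes.pythonapi.PyThreadState_SetAsyncExc(
                    ctypes.c_long(thread.ident), None)
            raise SystemError('PyThreadState_SetAsyncExc failed')

The exception is only delivered when the target thread next executes Python bytecode, which suits this case, since the thread running the browse waits in a Python-level loop.
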
@@ -411,7 +415,8 @@ class Browser:
             outlinks: a list of navigational links extracted from the page
 
         Raises:
-            BrowsingException: if browsing the page fails
+            brozzler.ProxyError: in case of proxy connection error
+            BrowsingException: if browsing the page fails in some other way
         '''
         if not self.is_running():
             raise BrowsingException('browser has not been started')

brozzler/cli.py

@@ -166,7 +166,7 @@ def brozzle_page(argv=None):
             'id': -1, 'seed': args.url, 'behavior_parameters': behavior_parameters,
             'username': args.username, 'password': args.password})
     page = brozzler.Page(None, {'url': args.url, 'site_id': site.id})
-    worker = brozzler.BrozzlerWorker(frontier=None)
+    worker = brozzler.BrozzlerWorker(frontier=None, proxy=args.proxy)
 
     def on_screenshot(screenshot_png):
         OK_CHARS = (string.ascii_letters + string.digits)

brozzler/robots.py

@@ -73,6 +73,25 @@ def _robots_cache(site, proxy=None):
     return _robots_caches[site.id]
 
 def is_permitted_by_robots(site, url, proxy=None):
+    '''
+    Checks if `url` is permitted by robots.txt.
+
+    In case of problems fetching robots.txt, different things can happen.
+    Reppy (the robots.txt parsing library) handles some exceptions internally
+    and applies an appropriate policy. It bubbles up other exceptions. Of
+    these, there are two kinds that this function raises for the caller to
+    handle, described below. Yet other types of exceptions are caught, and the
+    fetch is retried up to 10 times. In this case, after the 10th failure, the
+    function returns `False` (i.e. forbidden by robots).
+
+    Returns:
+        bool: `True` if `site.ignore_robots` is set, or if `url` is permitted
+        by robots.txt, `False` otherwise
+
+    Raises:
+        brozzler.ReachedLimit: if warcprox responded with 420 Reached Limit
+        brozzler.ProxyError: if the proxy is down
+    '''
     if site.ignore_robots:
         return True
@@ -82,14 +101,23 @@ def is_permitted_by_robots(site, url, proxy=None):
             result = _robots_cache(site, proxy).allowed(
                     url, site.user_agent or "brozzler")
             return result
-        except BaseException as e:
-            if isinstance(e, reppy.exceptions.ServerError) and isinstance(e.args[0], brozzler.ReachedLimit):
+        except Exception as e:
+            if isinstance(e, reppy.exceptions.ServerError) and isinstance(
+                    e.args[0], brozzler.ReachedLimit):
                 raise e.args[0]
+            elif hasattr(e, 'args') and isinstance(
+                    e.args[0], requests.exceptions.ProxyError):
+                # reppy has wrapped an exception that we want to bubble up
+                raise brozzler.ProxyError(e)
             else:
                 if tries_left > 0:
-                    logging.warn("caught exception fetching robots.txt (%s tries left) for %s: %s", tries_left, url, repr(e))
+                    logging.warn(
+                            "caught exception fetching robots.txt (%s tries "
+                            "left) for %s: %s", tries_left, url, repr(e))
                     tries_left -= 1
                 else:
-                    logging.error("caught exception fetching robots.txt (0 tries left) for %s: %s", url, repr(e), exc_info=True)
+                    logging.error(
+                            "caught exception fetching robots.txt (0 tries "
+                            "left) for %s: %s", url, repr(e), exc_info=True)
                     return False
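
From the caller's side, the contract documented above plays out roughly like this (a sketch, not brozzler's actual call site; assumes site, page, and proxy are in scope):

    try:
        if not brozzler.is_permitted_by_robots(site, page.url, proxy):
            return  # disallowed by robots.txt, or all ten fetch attempts failed
    except brozzler.ReachedLimit:
        raise   # warcprox said 420 Reached Limit; handled at the site level
    except brozzler.ProxyError:
        raise   # proxy trouble, not a page problem; retry the page later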

brozzler/worker.py

@@ -251,10 +251,17 @@ class BrozzlerWorker:
         except BaseException as e:
             if hasattr(e, "exc_info") and e.exc_info[0] == youtube_dl.utils.UnsupportedError:
                 pass
-            elif (hasattr(e, "exc_info") and e.exc_info[0] ==
-                    urllib.error.HTTPError and hasattr(e.exc_info[1], "code")
+            elif (hasattr(e, "exc_info")
+                    and e.exc_info[0] == urllib.error.HTTPError
+                    and hasattr(e.exc_info[1], "code")
                     and e.exc_info[1].code == 420):
                 raise brozzler.ReachedLimit(e.exc_info[1])
+            elif (hasattr(e, 'exc_info')
+                    and e.exc_info[0] == urllib.error.URLError
+                    and self._proxy_for(site)):
+                # connection problem when using a proxy == proxy error (XXX?)
+                raise brozzler.ProxyError(
+                        'youtube-dl hit apparent proxy error', e)
             else:
                 raise
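
Background for the hasattr(e, 'exc_info') tests above: when an extraction or download fails, youtube-dl raises youtube_dl.utils.DownloadError with the original sys.exc_info() triple attached, which is what lets the worker dispatch on the underlying exception class. A small sketch of the unwrapping (illustrative helper, not brozzler code):

    def underlying_exception(e):
        '''
        Sketch: recover the exception youtube-dl wrapped, or None.
        youtube_dl.utils.DownloadError stores the original sys.exc_info()
        triple on e.exc_info, so e.exc_info[0] is the original class (e.g.
        urllib.error.URLError) and e.exc_info[1] the original instance.
        '''
        exc_info = getattr(e, 'exc_info', None)
        if exc_info and len(exc_info) >= 2:
            return exc_info[1]  # has .code and .msg when it is an HTTPError
        return None
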
@@ -285,6 +292,8 @@ class BrozzlerWorker:
             raise
         except brozzler.ShutdownRequested:
             raise
+        except brozzler.ProxyError:
+            raise
         except Exception as e:
             if (hasattr(e, 'exc_info') and len(e.exc_info) >= 2
                     and hasattr(e.exc_info[1], 'code')
@@ -294,7 +303,7 @@ class BrozzlerWorker:
                         e.exc_info[1].code, e.exc_info[1].msg, page.url)
             else:
                 self.logger.error(
-                        "youtube_dl raised exception on %s", page,
+                        'youtube_dl raised exception on %s', page,
                         exc_info=True)
 
         if self._needs_browsing(page, ydl_spy):
if self._needs_browsing(page, ydl_spy):
@@ -379,10 +388,13 @@ class BrozzlerWorker:
         }
         self.logger.info('fetching %s', page)
-        # response is ignored
-        requests.get(
-                page.url, proxies=proxies, headers=site.extra_headers(),
-                verify=False)
+        try:
+            # response is ignored
+            requests.get(
+                    page.url, proxies=proxies, headers=site.extra_headers(),
+                    verify=False)
+        except requests.exceptions.ProxyError as e:
+            raise brozzler.ProxyError(e)
 
     def _needs_browsing(self, page, brozzler_spy):
         final_bounces = brozzler_spy.final_bounces(page.url)
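
The proxies mapping whose closing brace opens this hunk is the standard requests proxies dict; a hedged sketch of its likely shape (the construction itself falls outside the lines shown):

    proxy = self._proxy_for(site)   # e.g. '127.0.0.1:8000'
    proxies = None
    if proxy:
        proxies = {
            'http': 'http://%s' % proxy,
            'https': 'http://%s' % proxy,
        }
    # with a dead proxy, the requests.get(...) above raises
    # requests.exceptions.ProxyError, translated here to brozzler.ProxyError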

setup.py

@@ -32,7 +32,7 @@ def find_package_data(package):
 setuptools.setup(
         name='brozzler',
-        version='1.1b11.dev229',
+        version='1.1b11.dev233',
         description='Distributed web crawling with browsers',
         url='https://github.com/internetarchive/brozzler',
         author='Noah Levitt',

tests/test_brozzling.py

@@ -26,6 +26,7 @@ import threading
 import argparse
 import urllib
 import json
+import threading
 
 args = argparse.Namespace()
 args.log_level = logging.INFO
@@ -186,3 +187,28 @@ def test_extract_outlinks(httpd):
         'http://localhost:%s/site8/fdjisapofdjisap#2' % httpd.server_port
     }
 
+def test_proxy_down():
+    '''
+    Test that browsing raises `brozzler.ProxyError` when proxy is down.
+
+    See also `test_proxy_down` in test_units.py.
+    '''
+    site = brozzler.Site(None, {'seed':'http://example.com/'})
+    page = brozzler.Page(None, {'url': 'http://example.com/'})
+
+    # nobody listens on port 4 :)
+    not_listening_proxy = '127.0.0.1:4'
+    ### binding and not listening produces another type of connection
+    ### error, which we could test, but it takes a while
+    # sock = socket.socket()
+    # sock.bind(('127.0.0.1', 0))
+    # not_listening_proxy = '127.0.0.1:%s' % sock.getsockname()[1]
+    worker = brozzler.BrozzlerWorker(frontier=None, proxy=not_listening_proxy)
+
+    chrome_exe = brozzler.suggest_default_chrome_exe()
+    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
+        with pytest.raises(brozzler.ProxyError):
+            worker.brozzle_page(browser, site, page)
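
For reference, the slower alternative sketched in the comments above, written out (per those comments it produces a different type of connection error; the tests use port 4 to keep the failure immediate):

    import socket

    sock = socket.socket()
    sock.bind(('127.0.0.1', 0))  # take an ephemeral port...
    # ...but never call sock.listen(), so the "proxy" never accepts
    not_listening_proxy = '127.0.0.1:%s' % sock.getsockname()[1]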

tests/test_units.py

@@ -23,10 +23,11 @@ import threading
 import os
 import brozzler
 import brozzler.chrome
 import socket
 import logging
 import yaml
 import datetime
 import requests
+import tempfile
 
 @pytest.fixture(scope='module')
 def httpd(request):
@@ -107,6 +108,42 @@ blocks:
     assert site.is_in_scope(
             'https://www.youtube.com/watch?v=dUIn5OAPS5s', yt_user_page)
 
+def test_proxy_down():
+    '''
+    Test all fetching scenarios raise `brozzler.ProxyError` when proxy is down.
+
+    This test needs to cover every possible fetch through the proxy other than
+    fetches from the browser. For that, see test_brozzling.py.
+    '''
+    # nobody listens on port 4 :)
+    not_listening_proxy = '127.0.0.1:4'
+    ### binding and not listening produces another type of connection
+    ### error, which we could test, but it takes a while
+    # sock = socket.socket()
+    # sock.bind(('127.0.0.1', 0))
+    # not_listening_proxy = '127.0.0.1:%s' % sock.getsockname()[1]
+    worker = brozzler.BrozzlerWorker(frontier=None, proxy=not_listening_proxy)
+
+    site = brozzler.Site(None, {'seed':'http://example.com/'})
+    page = brozzler.Page(None, {'url': 'http://example.com/'})
+
+    # robots.txt fetch
+    with pytest.raises(brozzler.ProxyError):
+        brozzler.is_permitted_by_robots(
+                site, 'http://example.com/', proxy=not_listening_proxy)
+
+    # youtube-dl fetch
+    with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
+        ydl = worker._youtube_dl(tempdir, site)
+        with pytest.raises(brozzler.ProxyError):
+            worker._try_youtube_dl(ydl, site, page)
+
+    # raw fetch
+    with pytest.raises(brozzler.ProxyError):
+        worker._fetch_url(site, page)
+
 def test_start_stop_backwards_compat():
     site = brozzler.Site(None, {'seed': 'http://example.com/'})
     assert len(site.starts_and_stops) == 1