mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-21 08:06:27 -04:00
Merge branch 'master' into qa
* master: raise brozzler.ProxyError in case of proxy error fetching robots.txt, doing youtube-dl, or doing raw fetch; raise new exception brozzler.ProxyError in case of proxy error browsing a page; make brozzle-page respect --proxy (no test for this!); oops, version bump for previous commit; bubble up proxy errors fetching robots.txt, with unit test, and documentation
This commit is contained in:
commit
6844cb5bcb
@ -29,6 +29,9 @@ class NothingToClaim(Exception):
|
||||
class CrawlStopped(Exception):
|
||||
pass
|
||||
|
||||
class ProxyError(Exception):
|
||||
pass
|
||||
|
||||
class ReachedLimit(Exception):
|
||||
def __init__(self, http_error=None, warcprox_meta=None, http_payload=None):
|
||||
import json
|
||||
|
@ -239,6 +239,10 @@ class WebsockReceiverThread(threading.Thread):
|
||||
message['params']['message']['text'])
|
||||
elif message['method'] == 'Page.javascriptDialogOpening':
|
||||
self._javascript_dialog_opening(message)
|
||||
elif (message['method'] == 'Network.loadingFailed'
|
||||
and 'params' in message and 'errorText' in message['params']
|
||||
and message['params']['errorText'] == 'net::ERR_PROXY_CONNECTION_FAILED'):
|
||||
brozzler.thread_raise(self.calling_thread, brozzler.ProxyError)
|
||||
# else:
|
||||
# self.logger.debug("%s %s", message["method"], json_message)
|
||||
elif 'result' in message:
|
||||
@ -411,7 +415,8 @@ class Browser:
|
||||
outlinks: a list of navigational links extracted from the page
|
||||
|
||||
Raises:
|
||||
BrowsingException: if browsing the page fails
|
||||
brozzler.ProxyError: in case of proxy connection error
|
||||
BrowsingException: if browsing the page fails in some other way
|
||||
'''
|
||||
if not self.is_running():
|
||||
raise BrowsingException('browser has not been started')
|
||||
|
@ -166,7 +166,7 @@ def brozzle_page(argv=None):
|
||||
'id': -1, 'seed': args.url, 'behavior_parameters': behavior_parameters,
|
||||
'username': args.username, 'password': args.password})
|
||||
page = brozzler.Page(None, {'url': args.url, 'site_id': site.id})
|
||||
worker = brozzler.BrozzlerWorker(frontier=None)
|
||||
worker = brozzler.BrozzlerWorker(frontier=None, proxy=args.proxy)
|
||||
|
||||
def on_screenshot(screenshot_png):
|
||||
OK_CHARS = (string.ascii_letters + string.digits)
|
||||
|
@ -73,6 +73,25 @@ def _robots_cache(site, proxy=None):
|
||||
return _robots_caches[site.id]
|
||||
|
||||
def is_permitted_by_robots(site, url, proxy=None):
|
||||
'''
|
||||
Checks if `url` is permitted by robots.txt.
|
||||
|
||||
In case of problems fetching robots.txt, different things can happen.
|
||||
Reppy (the robots.txt parsing library) handles some exceptions internally
|
||||
and applies an appropriate policy. It bubbles up other exceptions. Of
|
||||
these, there are two kinds that this function raises for the caller to
|
||||
handle, described below. Yet other types of exceptions are caught, and the
|
||||
fetch is retried up to 10 times. In this case, after the 10th failure, the
|
||||
function returns `False` (i.e. forbidden by robots).
|
||||
|
||||
Returns:
|
||||
bool: `True` if `site.ignore_robots` is set, or if `url` is permitted
|
||||
by robots.txt, `False` otherwise
|
||||
|
||||
Raises:
|
||||
brozzler.ReachedLimit: if warcprox responded with 420 Reached Limit
|
||||
brozzler.ProxyError: if the proxy is down
|
||||
'''
|
||||
if site.ignore_robots:
|
||||
return True
|
||||
|
||||
@ -82,14 +101,23 @@ def is_permitted_by_robots(site, url, proxy=None):
|
||||
result = _robots_cache(site, proxy).allowed(
|
||||
url, site.user_agent or "brozzler")
|
||||
return result
|
||||
except BaseException as e:
|
||||
if isinstance(e, reppy.exceptions.ServerError) and isinstance(e.args[0], brozzler.ReachedLimit):
|
||||
except Exception as e:
|
||||
if isinstance(e, reppy.exceptions.ServerError) and isinstance(
|
||||
e.args[0], brozzler.ReachedLimit):
|
||||
raise e.args[0]
|
||||
elif hasattr(e, 'args') and isinstance(
|
||||
e.args[0], requests.exceptions.ProxyError):
|
||||
# reppy has wrapped an exception that we want to bubble up
|
||||
raise brozzler.ProxyError(e)
|
||||
else:
|
||||
if tries_left > 0:
|
||||
logging.warn("caught exception fetching robots.txt (%s tries left) for %s: %s", tries_left, url, repr(e))
|
||||
logging.warn(
|
||||
"caught exception fetching robots.txt (%s tries "
|
||||
"left) for %s: %s", tries_left, url, repr(e))
|
||||
tries_left -= 1
|
||||
else:
|
||||
logging.error("caught exception fetching robots.txt (0 tries left) for %s: %s", url, repr(e), exc_info=True)
|
||||
logging.error(
|
||||
"caught exception fetching robots.txt (0 tries "
|
||||
"left) for %s: %s", url, repr(e), exc_info=True)
|
||||
return False
|
||||
|
||||
|
@ -251,10 +251,17 @@ class BrozzlerWorker:
|
||||
except BaseException as e:
|
||||
if hasattr(e, "exc_info") and e.exc_info[0] == youtube_dl.utils.UnsupportedError:
|
||||
pass
|
||||
elif (hasattr(e, "exc_info") and e.exc_info[0] ==
|
||||
urllib.error.HTTPError and hasattr(e.exc_info[1], "code")
|
||||
elif (hasattr(e, "exc_info")
|
||||
and e.exc_info[0] == urllib.error.HTTPError
|
||||
and hasattr(e.exc_info[1], "code")
|
||||
and e.exc_info[1].code == 420):
|
||||
raise brozzler.ReachedLimit(e.exc_info[1])
|
||||
elif (hasattr(e, 'exc_info')
|
||||
and e.exc_info[0] == urllib.error.URLError
|
||||
and self._proxy_for(site)):
|
||||
# connection problem when using a proxy == proxy error (XXX?)
|
||||
raise brozzler.ProxyError(
|
||||
'youtube-dl hit apparent proxy error', e)
|
||||
else:
|
||||
raise
|
||||
|
||||
@ -285,6 +292,8 @@ class BrozzlerWorker:
|
||||
raise
|
||||
except brozzler.ShutdownRequested:
|
||||
raise
|
||||
except brozzler.ProxyError:
|
||||
raise
|
||||
except Exception as e:
|
||||
if (hasattr(e, 'exc_info') and len(e.exc_info) >= 2
|
||||
and hasattr(e.exc_info[1], 'code')
|
||||
@ -294,7 +303,7 @@ class BrozzlerWorker:
|
||||
e.exc_info[1].code, e.exc_info[1].msg, page.url)
|
||||
else:
|
||||
self.logger.error(
|
||||
"youtube_dl raised exception on %s", page,
|
||||
'youtube_dl raised exception on %s', page,
|
||||
exc_info=True)
|
||||
|
||||
if self._needs_browsing(page, ydl_spy):
|
||||
@ -379,10 +388,13 @@ class BrozzlerWorker:
|
||||
}
|
||||
|
||||
self.logger.info('fetching %s', page)
|
||||
# response is ignored
|
||||
requests.get(
|
||||
page.url, proxies=proxies, headers=site.extra_headers(),
|
||||
verify=False)
|
||||
try:
|
||||
# response is ignored
|
||||
requests.get(
|
||||
page.url, proxies=proxies, headers=site.extra_headers(),
|
||||
verify=False)
|
||||
except requests.exceptions.ProxyError as e:
|
||||
raise brozzler.ProxyError(e)
|
||||
|
||||
def _needs_browsing(self, page, brozzler_spy):
|
||||
final_bounces = brozzler_spy.final_bounces(page.url)
|
||||
|
2
setup.py
2
setup.py
@ -32,7 +32,7 @@ def find_package_data(package):
|
||||
|
||||
setuptools.setup(
|
||||
name='brozzler',
|
||||
version='1.1b11.dev229',
|
||||
version='1.1b11.dev233',
|
||||
description='Distributed web crawling with browsers',
|
||||
url='https://github.com/internetarchive/brozzler',
|
||||
author='Noah Levitt',
|
||||
|
@ -26,6 +26,7 @@ import threading
|
||||
import argparse
|
||||
import urllib
|
||||
import json
|
||||
import threading
|
||||
|
||||
args = argparse.Namespace()
|
||||
args.log_level = logging.INFO
|
||||
@ -186,3 +187,28 @@ def test_extract_outlinks(httpd):
|
||||
'http://localhost:%s/site8/fdjisapofdjisap#2' % httpd.server_port
|
||||
}
|
||||
|
||||
def test_proxy_down():
|
||||
'''
|
||||
Test that browsing raises `brozzler.ProxyError` when proxy is down.
|
||||
|
||||
See also `test_proxy_down` in test_units.py.
|
||||
'''
|
||||
site = brozzler.Site(None, {'seed':'http://example.com/'})
|
||||
page = brozzler.Page(None, {'url': 'http://example.com/'})
|
||||
|
||||
# nobody listens on port 4 :)
|
||||
not_listening_proxy = '127.0.0.1:4'
|
||||
|
||||
### binding and not listening produces another type of connection
|
||||
### error, which we could test, but it takes a while
|
||||
# sock = socket.socket()
|
||||
# sock.bind(('127.0.0.1', 0))
|
||||
# not_listening_proxy = '127.0.0.1:%s' % sock.getsockname()[1]
|
||||
|
||||
worker = brozzler.BrozzlerWorker(frontier=None, proxy=not_listening_proxy)
|
||||
chrome_exe = brozzler.suggest_default_chrome_exe()
|
||||
|
||||
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
|
||||
with pytest.raises(brozzler.ProxyError):
|
||||
worker.brozzle_page(browser, site, page)
|
||||
|
||||
|
@ -23,10 +23,11 @@ import threading
|
||||
import os
|
||||
import brozzler
|
||||
import brozzler.chrome
|
||||
import socket
|
||||
import logging
|
||||
import yaml
|
||||
import datetime
|
||||
import requests
|
||||
import tempfile
|
||||
|
||||
@pytest.fixture(scope='module')
|
||||
def httpd(request):
|
||||
@ -107,6 +108,42 @@ blocks:
|
||||
assert site.is_in_scope(
|
||||
'https://www.youtube.com/watch?v=dUIn5OAPS5s', yt_user_page)
|
||||
|
||||
def test_proxy_down():
|
||||
'''
|
||||
Test all fetching scenarios raise `brozzler.ProxyError` when proxy is down.
|
||||
|
||||
This test needs to cover every possible fetch through the proxy other than
|
||||
fetches from the browser. For that, see test_brozzling.py.
|
||||
'''
|
||||
# nobody listens on port 4 :)
|
||||
not_listening_proxy = '127.0.0.1:4'
|
||||
|
||||
### binding and not listening produces another type of connection
|
||||
### error, which we could test, but it takes a while
|
||||
# sock = socket.socket()
|
||||
# sock.bind(('127.0.0.1', 0))
|
||||
# not_listening_proxy = '127.0.0.1:%s' % sock.getsockname()[1]
|
||||
|
||||
worker = brozzler.BrozzlerWorker(frontier=None, proxy=not_listening_proxy)
|
||||
|
||||
site = brozzler.Site(None, {'seed':'http://example.com/'})
|
||||
page = brozzler.Page(None, {'url': 'http://example.com/'})
|
||||
|
||||
# robots.txt fetch
|
||||
with pytest.raises(brozzler.ProxyError):
|
||||
brozzler.is_permitted_by_robots(
|
||||
site, 'http://example.com/', proxy=not_listening_proxy)
|
||||
|
||||
# youtube-dl fetch
|
||||
with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
|
||||
ydl = worker._youtube_dl(tempdir, site)
|
||||
with pytest.raises(brozzler.ProxyError):
|
||||
worker._try_youtube_dl(ydl, site, page)
|
||||
|
||||
# raw fetch
|
||||
with pytest.raises(brozzler.ProxyError):
|
||||
worker._fetch_url(site, page)
|
||||
|
||||
def test_start_stop_backwards_compat():
|
||||
site = brozzler.Site(None, {'seed': 'http://example.com/'})
|
||||
assert len(site.starts_and_stops) == 1
|
||||
|
Loading…
x
Reference in New Issue
Block a user