diff --git a/readme.rst b/README.rst
similarity index 99%
rename from readme.rst
rename to README.rst
index 9496508..491f0b8 100644
--- a/readme.rst
+++ b/README.rst
@@ -1,7 +1,7 @@
 .. image:: https://travis-ci.org/internetarchive/brozzler.svg?branch=master
     :target: https://travis-ci.org/internetarchive/brozzler
 
-.. |logo| image:: https://cdn.rawgit.com/internetarchive/brozzler/1.1b5/brozzler/webconsole/static/brozzler.svg
+.. |logo| image:: https://cdn.rawgit.com/internetarchive/brozzler/1.1b12/brozzler/dashboard/static/brozzler.svg
    :width: 60px
 
 |logo| brozzler
diff --git a/brozzler/behaviors.yaml b/brozzler/behaviors.yaml
index ca98e05..28cceba 100644
--- a/brozzler/behaviors.yaml
+++ b/brozzler/behaviors.yaml
@@ -27,7 +27,7 @@
   default_parameters:
     actions:
       - selector: a.coreSpriteDismissLarge
-      - selector: div._mck9w a
+      - selector: a>div[role='button']
         firstMatchOnly: true
      - selector: a.coreSpriteRightPaginationArrow
        repeatSameElement: true
diff --git a/brozzler/dashboard/__init__.py b/brozzler/dashboard/__init__.py
index 36251cd..54e74ec 100644
--- a/brozzler/dashboard/__init__.py
+++ b/brozzler/dashboard/__init__.py
@@ -24,7 +24,7 @@ try:
 except ImportError as e:
     logging.critical(
             '%s: %s\n\nYou might need to run "pip install '
-            'brozzler[dashboard]".\nSee readme.rst for more information.',
+            'brozzler[dashboard]".\nSee README.rst for more information.',
             type(e).__name__, e)
     sys.exit(1)
 import doublethink
diff --git a/brozzler/easy.py b/brozzler/easy.py
index d4ccd5a..83cf1ba 100644
--- a/brozzler/easy.py
+++ b/brozzler/easy.py
@@ -31,7 +31,7 @@ try:
 except ImportError as e:
     logging.critical(
             '%s: %s\n\nYou might need to run "pip install '
-            'brozzler[easy]".\nSee readme.rst for more information.',
+            'brozzler[easy]".\nSee README.rst for more information.',
             type(e).__name__, e)
     sys.exit(1)
 import argparse
diff --git a/brozzler/pywb.py b/brozzler/pywb.py
index ff26653..5932f0b 100644
--- a/brozzler/pywb.py
+++ b/brozzler/pywb.py
@@ -31,7 +31,7 @@ try:
 except ImportError as e:
     logging.critical(
             '%s: %s\n\nYou might need to run "pip install '
-            'brozzler[easy]".\nSee readme.rst for more information.',
+            'brozzler[easy]".\nSee README.rst for more information.',
             type(e).__name__, e)
     sys.exit(1)
 import doublethink
@@ -270,7 +270,7 @@
 Run pywb like so:
 
     $ PYWB_CONFIG_FILE=pywb.yml brozzler-wayback
 
-See readme.rst for more information.
+See README.rst for more information.
 '''
 # copied and pasted from cdxdomainspecific.py, only changes are commented as
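
The dashboard, easy, and pywb hunks above touch three copies of the same guarded-import pattern for optional setuptools extras: try the import, and on ImportError log an actionable message and exit, rather than crashing later with a confusing NameError. A minimal sketch of the pattern, using a hypothetical package `mypackage` with a hypothetical `shiny` extra (both names are illustrative only, not brozzler's):

    import logging
    import sys

    try:
        import shiny  # provided by `pip install mypackage[shiny]`
    except ImportError as e:
        logging.critical(
                '%s: %s\n\nYou might need to run "pip install '
                'mypackage[shiny]".\nSee README.rst for more information.',
                type(e).__name__, e)
        sys.exit(1)
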
diff --git a/brozzler/robots.py b/brozzler/robots.py
index aef9913..5b96423 100644
--- a/brozzler/robots.py
+++ b/brozzler/robots.py
@@ -46,20 +46,21 @@ def _reppy_rules_getitem(self, agent):
     return self.agents.get('*')
 reppy.parser.Rules.__getitem__ = _reppy_rules_getitem
 
+class _SessionRaiseOn420(requests.Session):
+    timeout = 60
+    def get(self, url, *args, **kwargs):
+        res = super().get(url, timeout=self.timeout, *args, **kwargs)
+        if res.status_code == 420 and 'warcprox-meta' in res.headers:
+            raise brozzler.ReachedLimit(
+                    warcprox_meta=json.loads(res.headers['warcprox-meta']),
+                    http_payload=res.text)
+        else:
+            return res
+
 _robots_caches = {}  # {site_id:reppy.cache.RobotsCache}
 
 def _robots_cache(site, proxy=None):
-    class SessionRaiseOn420(requests.Session):
-        def get(self, url, *args, **kwargs):
-            res = super().get(url, *args, **kwargs)
-            if res.status_code == 420 and 'warcprox-meta' in res.headers:
-                raise brozzler.ReachedLimit(
-                        warcprox_meta=json.loads(res.headers['warcprox-meta']),
-                        http_payload=res.text)
-            else:
-                return res
-
     if not site.id in _robots_caches:
-        req_sesh = SessionRaiseOn420()
+        req_sesh = _SessionRaiseOn420()
         req_sesh.verify = False  # ignore cert errors
         if proxy:
             proxie = "http://%s" % proxy
@@ -68,7 +69,8 @@ def _robots_cache(site, proxy=None):
         req_sesh.headers.update(site.extra_headers())
         if site.user_agent:
             req_sesh.headers['User-Agent'] = site.user_agent
-        _robots_caches[site.id] = reppy.cache.RobotsCache(session=req_sesh)
+        _robots_caches[site.id] = reppy.cache.RobotsCache(
+                session=req_sesh, disallow_forbidden=False)
     return _robots_caches[site.id]
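
Moving the session class out of `_robots_cache()` to module level, and giving it a class-level `timeout`, means the timeout can be patched in one place for all future instances, which the new `test_robots_socket_timeout` below relies on. A rough standalone sketch of the idea (`DefaultTimeoutSession` is a made-up name, not brozzler code):

    import requests

    class DefaultTimeoutSession(requests.Session):
        timeout = 60  # seconds; class attribute shared by all instances

        def get(self, url, *args, **kwargs):
            # fall back to the class default unless a timeout was passed in
            kwargs.setdefault('timeout', self.timeout)
            return super().get(url, *args, **kwargs)

    # a test can shorten the timeout globally, then restore the original:
    DefaultTimeoutSession.timeout = 2

The `setdefault` variant also tolerates callers who pass their own `timeout=`, which `get(url, timeout=self.timeout, *args, **kwargs)` above would reject with a duplicate-keyword TypeError.
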
@@ -76,13 +78,9 @@ def is_permitted_by_robots(site, url, proxy=None):
     '''
     Checks if `url` is permitted by robots.txt.
 
-    In case of problems fetching robots.txt, different things can happen.
-    Reppy (the robots.txt parsing library) handles some exceptions internally
-    and applies an appropriate policy. It bubbles up other exceptions. Of
-    these, there are two kinds that this function raises for the caller to
-    handle, described below. Yet other types of exceptions are caught, and the
-    fetch is retried up to 10 times. In this case, after the 10th failure, the
-    function returns `False` (i.e. forbidden by robots).
+    Treats any kind of error fetching robots.txt as "allow all". See
+    http://builds.archive.org/javadoc/heritrix-3.x-snapshot/org/archive/modules/net/CrawlServer.html#updateRobots(org.archive.modules.CrawlURI)
+    for some background on that policy.
 
     Returns:
         bool: `True` if `site.ignore_robots` is set, or if `url` is permitted
@@ -95,29 +93,21 @@ def is_permitted_by_robots(site, url, proxy=None):
     if site.ignore_robots:
         return True
 
-    tries_left = 10
-    while True:
-        try:
-            result = _robots_cache(site, proxy).allowed(
-                    url, site.user_agent or "brozzler")
-            return result
-        except Exception as e:
-            if isinstance(e, reppy.exceptions.ServerError) and isinstance(
-                    e.args[0], brozzler.ReachedLimit):
-                raise e.args[0]
-            elif hasattr(e, 'args') and isinstance(
-                    e.args[0], requests.exceptions.ProxyError):
-                # reppy has wrapped an exception that we want to bubble up
-                raise brozzler.ProxyError(e)
-            else:
-                if tries_left > 0:
-                    logging.warn(
-                            "caught exception fetching robots.txt (%r tries "
-                            "left) for %r: %r", tries_left, url, e)
-                    tries_left -= 1
-                else:
-                    logging.error(
-                            "caught exception fetching robots.txt (0 tries "
-                            "left) for %r: %r", url, e, exc_info=True)
-                    return False
+    try:
+        result = _robots_cache(site, proxy).allowed(
+                url, site.user_agent or "brozzler")
+        return result
+    except Exception as e:
+        if isinstance(e, reppy.exceptions.ServerError) and isinstance(
+                e.args[0], brozzler.ReachedLimit):
+            raise e.args[0]
+        elif hasattr(e, 'args') and isinstance(
+                e.args[0], requests.exceptions.ProxyError):
+            # reppy has wrapped an exception that we want to bubble up
+            raise brozzler.ProxyError(e)
+        else:
+            logging.warn(
+                    "returning true (permitted) after problem fetching "
+                    "robots.txt for %r: %r", url, e)
+            return True
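
The upshot of the simplified `is_permitted_by_robots()`: `ReachedLimit` and `ProxyError` still propagate to the caller, and any other failure to fetch robots.txt now means "permitted" rather than retry-then-deny. A hedged illustration of the caller-visible behavior (assumes a brozzler install; mirrors the new `test_robots_connection_failure` below):

    import brozzler

    # nothing listens on port 4, so the robots.txt fetch fails outright;
    # under the fail-open policy the URL is nonetheless permitted
    site = brozzler.Site(None, {'seed': 'http://localhost:4/'})
    assert brozzler.is_permitted_by_robots(site, 'http://localhost:4/')
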
diff --git a/brozzler/worker.py b/brozzler/worker.py
index 5dbb330..d47f939 100644
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@@ -113,7 +113,11 @@ class YoutubeDLSpy(urllib.request.BaseHandler):
 class BrozzlerWorker:
     logger = logging.getLogger(__module__ + "." + __qualname__)
 
-    HEARTBEAT_INTERVAL = 20.0
+    # 3⅓ min heartbeat interval => 10 min ttl
+    # This is kind of a long time, because `frontier.claim_sites()`, which runs
+    # in the same thread as the heartbeats, can take a while on a busy brozzler
+    # cluster with slow rethinkdb.
+    HEARTBEAT_INTERVAL = 200.0
     SITE_SESSION_MINUTES = 15
 
     def __init__(
diff --git a/setup.py b/setup.py
index d37da36..659ce46 100644
--- a/setup.py
+++ b/setup.py
@@ -32,12 +32,12 @@ def find_package_data(package):
 
 setuptools.setup(
         name='brozzler',
-        version='1.1b13.dev291',
+        version='1.4.dev295',
         description='Distributed web crawling with browsers',
         url='https://github.com/internetarchive/brozzler',
         author='Noah Levitt',
         author_email='nlevitt@archive.org',
-        long_description=open('readme.rst', mode='rb').read().decode('UTF-8'),
+        long_description=open('README.rst', mode='rb').read().decode('UTF-8'),
         license='Apache License 2.0',
         packages=['brozzler', 'brozzler.dashboard'],
         package_data={
diff --git a/tests/test_cluster.py b/tests/test_cluster.py
index 0ec5026..ae4f7a4 100644
--- a/tests/test_cluster.py
+++ b/tests/test_cluster.py
@@ -769,7 +769,7 @@ def test_time_limit(httpd):
     rr = doublethink.Rethinker('localhost', db='brozzler')
     frontier = brozzler.RethinkDbFrontier(rr)
 
-    # create a new job with three sites that could be crawled forever
+    # create a new job with one seed that could be crawled forever
     job_conf = {'seeds': [{
         'url': 'http://localhost:%s/infinite/foo/' % httpd.server_port,
         'time_limit': 20}]}
@@ -789,6 +789,10 @@ def test_time_limit(httpd):
     assert sites[0].status == 'FINISHED_TIME_LIMIT'
 
     # all sites finished so job should be finished too
+    start = time.time()
     job.refresh()
+    while not job.status == 'FINISHED' and time.time() - start < 10:
+        time.sleep(0.5)
+        job.refresh()
     assert job.status == 'FINISHED'
diff --git a/tests/test_units.py b/tests/test_units.py
index eed034e..47307d0 100644
--- a/tests/test_units.py
+++ b/tests/test_units.py
@@ -32,6 +32,7 @@ import uuid
 import socket
 import time
 import sys
+import threading
 
 logging.basicConfig(
         stream=sys.stderr, level=logging.INFO, format=(
@@ -67,6 +68,87 @@ def test_robots(httpd):
     site = brozzler.Site(None, {'seed':url,'user_agent':'im/a bAdBOt/uh huh'})
     assert not brozzler.is_permitted_by_robots(site, url)
 
+def test_robots_http_statuses():
+    for status in (
+            200, 204, 400, 401, 402, 403, 404, 405,
+            500, 501, 502, 503, 504, 505):
+        class Handler(http.server.BaseHTTPRequestHandler):
+            def do_GET(self):
+                response = (('HTTP/1.1 %s Meaningless message\r\n'
+                             + 'Content-length: 0\r\n'
+                             + '\r\n') % status).encode('utf-8')
+                self.connection.sendall(response)
+                # self.send_response(status)
+                # self.end_headers()
+        httpd = http.server.HTTPServer(('localhost', 0), Handler)
+        httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
+        httpd_thread.start()
+
+        try:
+            url = 'http://localhost:%s/' % httpd.server_port
+            site = brozzler.Site(None, {'seed': url})
+            assert brozzler.is_permitted_by_robots(site, url)
+        finally:
+            httpd.shutdown()
+            httpd.server_close()
+            httpd_thread.join()
+
+def test_robots_empty_response():
+    class Handler(http.server.BaseHTTPRequestHandler):
+        def do_GET(self):
+            self.connection.shutdown(socket.SHUT_RDWR)
+            self.connection.close()
+    httpd = http.server.HTTPServer(('localhost', 0), Handler)
+    httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
+    httpd_thread.start()
+
+    try:
+        url = 'http://localhost:%s/' % httpd.server_port
+        site = brozzler.Site(None, {'seed': url})
+        assert brozzler.is_permitted_by_robots(site, url)
+    finally:
+        httpd.shutdown()
+        httpd.server_close()
+        httpd_thread.join()
+
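
`test_robots_http_statuses` and `test_robots_empty_response` above, and `test_robots_socket_timeout` below, all repeat the same serve-on-an-ephemeral-port-in-a-thread scaffolding. A sketch of a context manager that could factor it out (a hypothetical helper, not part of this patch):

    import contextlib
    import http.server
    import threading

    @contextlib.contextmanager
    def ephemeral_httpd(handler_class):
        # port 0 asks the OS for any free port; read it from httpd.server_port
        httpd = http.server.HTTPServer(('localhost', 0), handler_class)
        httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
        httpd_thread.start()
        try:
            yield httpd
        finally:
            httpd.shutdown()
            httpd.server_close()
            httpd_thread.join()

With it, each test's try/finally would collapse to `with ephemeral_httpd(Handler) as httpd: ...`.
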
+def test_robots_socket_timeout():
+    stop_hanging = threading.Event()
+    class Handler(http.server.BaseHTTPRequestHandler):
+        def do_GET(self):
+            stop_hanging.wait(60)
+            self.connection.sendall(
+                    b'HTTP/1.1 200 OK\r\nContent-length: 0\r\n\r\n')
+
+    orig_timeout = brozzler.robots._SessionRaiseOn420.timeout
+
+    httpd = http.server.HTTPServer(('localhost', 0), Handler)
+    httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
+    httpd_thread.start()
+
+    try:
+        url = 'http://localhost:%s/' % httpd.server_port
+        site = brozzler.Site(None, {'seed': url})
+        brozzler.robots._SessionRaiseOn420.timeout = 2
+        assert brozzler.is_permitted_by_robots(site, url)
+    finally:
+        brozzler.robots._SessionRaiseOn420.timeout = orig_timeout
+        stop_hanging.set()
+        httpd.shutdown()
+        httpd.server_close()
+        httpd_thread.join()
+
+def test_robots_dns_failure():
+    # .invalid. is guaranteed nonexistent per rfc 6761
+    url = 'http://whatever.invalid./'
+    site = brozzler.Site(None, {'seed': url})
+    assert brozzler.is_permitted_by_robots(site, url)
+
+def test_robots_connection_failure():
+    url = 'http://localhost:4/'  # nobody listens on port 4
+    site = brozzler.Site(None, {'seed': url})
+    assert brozzler.is_permitted_by_robots(site, url)
+
 def test_scoping():
     test_scope = yaml.load('''
 max_hops: 100