Merge branch 'master' into karl

* master:
  bump up heartbeat interval (see comment)
  back to dev version
  version 1.3 (messed up 1.2)
  setuptools wants README not readme
  back to dev version number
  version 1.2
  bump dev version after merge
  is test_time_limit is failing because of timing?
  fix bug in test, add another one
  treat any error fetching robots.txt as "allow all"
  update instagram behavior
2025-04-19 23:35:54 -04:00 · 2018-07-23 23:28:42 +00:00 · 2018-07-23 23:28:42 +00:00 · a7fb7bcc37
commit a7fb7bcc37
parent bd78e07232 9d18dc6aeb
10 changed files with 134 additions and 54 deletions
--- a/README.rst
+++ b/README.rst
@ -1,7 +1,7 @@
 .. image:: https://travis-ci.org/internetarchive/brozzler.svg?branch=master
    :target: https://travis-ci.org/internetarchive/brozzler

-.. |logo| image:: https://cdn.rawgit.com/internetarchive/brozzler/1.1b5/brozzler/webconsole/static/brozzler.svg
+.. |logo| image:: https://cdn.rawgit.com/internetarchive/brozzler/1.1b12/brozzler/dashboard/static/brozzler.svg
   :width: 60px

 |logo| brozzler
--- a/brozzler/behaviors.yaml
+++ b/brozzler/behaviors.yaml
@ -27,7 +27,7 @@
  default_parameters:
    actions:
      - selector: a.coreSpriteDismissLarge
-      - selector: div._mck9w a
+      - selector: a>div[role='button']
        firstMatchOnly: true
      - selector: a.coreSpriteRightPaginationArrow
        repeatSameElement: true
--- a/brozzler/dashboard/init.py
+++ b/brozzler/dashboard/init.py
@ -24,7 +24,7 @@ try:
 except ImportError as e:
    logging.critical(
            '%s: %s\n\nYou might need to run "pip install '
-            'brozzler[dashboard]".\nSee readme.rst for more information.',
+            'brozzler[dashboard]".\nSee README.rst for more information.',
            type(e).__name__, e)
    sys.exit(1)
 import doublethink
--- a/brozzler/easy.py
+++ b/brozzler/easy.py
@ -31,7 +31,7 @@ try:
 except ImportError as e:
    logging.critical(
            '%s: %s\n\nYou might need to run "pip install '
-            'brozzler[easy]".\nSee readme.rst for more information.',
+            'brozzler[easy]".\nSee README.rst for more information.',
            type(e).__name__, e)
    sys.exit(1)
 import argparse
--- a/brozzler/pywb.py
+++ b/brozzler/pywb.py
@ -31,7 +31,7 @@ try:
 except ImportError as e:
    logging.critical(
            '%s: %s\n\nYou might need to run "pip install '
-            'brozzler[easy]".\nSee readme.rst for more information.',
+            'brozzler[easy]".\nSee README.rst for more information.',
            type(e).__name__, e)
    sys.exit(1)
 import doublethink
@ -270,7 +270,7 @@ Run pywb like so:

    $ PYWB_CONFIG_FILE=pywb.yml brozzler-wayback

-See readme.rst for more information.
+See README.rst for more information.
 '''

 # copied and pasted from cdxdomainspecific.py, only changes are commented as
--- a/brozzler/robots.py
+++ b/brozzler/robots.py
@ -46,20 +46,21 @@ def _reppy_rules_getitem(self, agent):
    return self.agents.get('*')
 reppy.parser.Rules.__getitem__ = _reppy_rules_getitem

+class _SessionRaiseOn420(requests.Session):
+    timeout = 60
+    def get(self, url, *args, **kwargs):
+        res = super().get(url, timeout=self.timeout, *args, **kwargs)
+        if res.status_code == 420 and 'warcprox-meta' in res.headers:
+            raise brozzler.ReachedLimit(
+                    warcprox_meta=json.loads(res.headers['warcprox-meta']),
+                    http_payload=res.text)
+        else:
+            return res
+
 _robots_caches = {}  # {site_id:reppy.cache.RobotsCache}
 def _robots_cache(site, proxy=None):
-    class SessionRaiseOn420(requests.Session):
-        def get(self, url, *args, **kwargs):
-            res = super().get(url, *args, **kwargs)
-            if res.status_code == 420 and 'warcprox-meta' in res.headers:
-                raise brozzler.ReachedLimit(
-                        warcprox_meta=json.loads(res.headers['warcprox-meta']),
-                        http_payload=res.text)
-            else:
-                return res
-
    if not site.id in _robots_caches:
-        req_sesh = SessionRaiseOn420()
+        req_sesh = _SessionRaiseOn420()
        req_sesh.verify = False   # ignore cert errors
        if proxy:
            proxie = "http://%s" % proxy
@ -68,7 +69,8 @@ def _robots_cache(site, proxy=None):
            req_sesh.headers.update(site.extra_headers())
        if site.user_agent:
            req_sesh.headers['User-Agent'] = site.user_agent
-        _robots_caches[site.id] = reppy.cache.RobotsCache(session=req_sesh)
+        _robots_caches[site.id] = reppy.cache.RobotsCache(
+                session=req_sesh, disallow_forbidden=False)

    return _robots_caches[site.id]

@ -76,13 +78,9 @@ def is_permitted_by_robots(site, url, proxy=None):
    '''
    Checks if `url` is permitted by robots.txt.

-    In case of problems fetching robots.txt, different things can happen.
-    Reppy (the robots.txt parsing library) handles some exceptions internally
-    and applies an appropriate policy. It bubbles up other exceptions. Of
-    these, there are two kinds that this function raises for the caller to
-    handle, described below. Yet other types of exceptions are caught, and the
-    fetch is retried up to 10 times. In this case, after the 10th failure, the
-    function returns `False` (i.e. forbidden by robots).
+    Treats any kind of error fetching robots.txt as "allow all". See
+    http://builds.archive.org/javadoc/heritrix-3.x-snapshot/org/archive/modules/net/CrawlServer.html#updateRobots(org.archive.modules.CrawlURI)
+    for some background on that policy.

    Returns:
        bool: `True` if `site.ignore_robots` is set, or if `url` is permitted
@ -95,29 +93,21 @@ def is_permitted_by_robots(site, url, proxy=None):
    if site.ignore_robots:
        return True

-    tries_left = 10
-    while True:
-        try:
-            result = _robots_cache(site, proxy).allowed(
-                    url, site.user_agent or "brozzler")
-            return result
-        except Exception as e:
-            if isinstance(e, reppy.exceptions.ServerError) and isinstance(
-                    e.args[0], brozzler.ReachedLimit):
-                raise e.args[0]
-            elif hasattr(e, 'args') and isinstance(
-                    e.args[0], requests.exceptions.ProxyError):
-                # reppy has wrapped an exception that we want to bubble up
-                raise brozzler.ProxyError(e)
-            else:
-                if tries_left > 0:
-                    logging.warn(
-                            "caught exception fetching robots.txt (%r tries "
-                            "left) for %r: %r", tries_left, url, e)
-                    tries_left -= 1
-                else:
-                    logging.error(
-                            "caught exception fetching robots.txt (0 tries "
-                            "left) for %r: %r", url, e, exc_info=True)
-                    return False
+    try:
+        result = _robots_cache(site, proxy).allowed(
+                url, site.user_agent or "brozzler")
+        return result
+    except Exception as e:
+        if isinstance(e, reppy.exceptions.ServerError) and isinstance(
+                e.args[0], brozzler.ReachedLimit):
+            raise e.args[0]
+        elif hasattr(e, 'args') and isinstance(
+                e.args[0], requests.exceptions.ProxyError):
+            # reppy has wrapped an exception that we want to bubble up
+            raise brozzler.ProxyError(e)
+        else:
+            logging.warn(
+                    "returning true (permitted) after problem fetching "
+                    "robots.txt for %r: %r", url, e)
+            return True

--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@ -113,7 +113,11 @@ class YoutubeDLSpy(urllib.request.BaseHandler):
 class BrozzlerWorker:
    logger = logging.getLogger(__module__ + "." + __qualname__)

-    HEARTBEAT_INTERVAL = 20.0
+    # 3⅓ min heartbeat interval => 10 min ttl
+    # This is kind of a long time, because `frontier.claim_sites()`, which runs
+    # in the same thread as the heartbeats, can take a while on a busy brozzler
+    # cluster with slow rethinkdb.
+    HEARTBEAT_INTERVAL = 200.0
    SITE_SESSION_MINUTES = 15

    def __init__(
--- a/setup.py
+++ b/setup.py
@ -32,12 +32,12 @@ def find_package_data(package):

 setuptools.setup(
        name='brozzler',
-        version='1.1b13.dev291',
+        version='1.4.dev295',
        description='Distributed web crawling with browsers',
        url='https://github.com/internetarchive/brozzler',
        author='Noah Levitt',
        author_email='nlevitt@archive.org',
-        long_description=open('readme.rst', mode='rb').read().decode('UTF-8'),
+        long_description=open('README.rst', mode='rb').read().decode('UTF-8'),
        license='Apache License 2.0',
        packages=['brozzler', 'brozzler.dashboard'],
        package_data={
--- a/tests/test_cluster.py
+++ b/tests/test_cluster.py
@ -769,7 +769,7 @@ def test_time_limit(httpd):
    rr = doublethink.Rethinker('localhost', db='brozzler')
    frontier = brozzler.RethinkDbFrontier(rr)

-    # create a new job with three sites that could be crawled forever
+    # create a new job with one seed that could be crawled forever
    job_conf = {'seeds': [{
        'url': 'http://localhost:%s/infinite/foo/' % httpd.server_port,
        'time_limit': 20}]}
@ -789,6 +789,10 @@ def test_time_limit(httpd):
    assert sites[0].status == 'FINISHED_TIME_LIMIT'

    # all sites finished so job should be finished too
+    start = time.time()
    job.refresh()
+    while not job.status == 'FINISHED' and time.time() - start < 10:
+        time.sleep(0.5)
+        job.refresh()
    assert job.status == 'FINISHED'

--- a/tests/test_units.py
+++ b/tests/test_units.py
@ -32,6 +32,7 @@ import uuid
 import socket
 import time
 import sys
+import threading

 logging.basicConfig(
        stream=sys.stderr, level=logging.INFO, format=(
@ -67,6 +68,87 @@ def test_robots(httpd):
    site = brozzler.Site(None, {'seed':url,'user_agent':'im/a bAdBOt/uh huh'})
    assert not brozzler.is_permitted_by_robots(site, url)

+def test_robots_http_statuses():
+    for status in (
+            200, 204, 400, 401, 402, 403, 404, 405,
+            500, 501, 502, 503, 504, 505):
+        class Handler(http.server.BaseHTTPRequestHandler):
+            def do_GET(self):
+                response = (('HTTP/1.1 %s Meaningless message\r\n'
+                          + 'Content-length: 0\r\n'
+                          + '\r\n') % status).encode('utf-8')
+                self.connection.sendall(response)
+                # self.send_response(status)
+                # self.end_headers()
+        httpd = http.server.HTTPServer(('localhost', 0), Handler)
+        httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
+        httpd_thread.start()
+
+        try:
+            url = 'http://localhost:%s/' % httpd.server_port
+            site = brozzler.Site(None, {'seed': url})
+            assert brozzler.is_permitted_by_robots(site, url)
+        finally:
+            httpd.shutdown()
+            httpd.server_close()
+            httpd_thread.join()
+
+def test_robots_empty_response():
+    class Handler(http.server.BaseHTTPRequestHandler):
+        def do_GET(self):
+            self.connection.shutdown(socket.SHUT_RDWR)
+            self.connection.close()
+    httpd = http.server.HTTPServer(('localhost', 0), Handler)
+    httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
+    httpd_thread.start()
+
+    try:
+        url = 'http://localhost:%s/' % httpd.server_port
+        site = brozzler.Site(None, {'seed': url})
+        assert brozzler.is_permitted_by_robots(site, url)
+    finally:
+        httpd.shutdown()
+        httpd.server_close()
+        httpd_thread.join()
+
+def test_robots_socket_timeout():
+    stop_hanging = threading.Event()
+    class Handler(http.server.BaseHTTPRequestHandler):
+        def do_GET(self):
+            stop_hanging.wait(60)
+            self.connection.sendall(
+                    b'HTTP/1.1 200 OK\r\nContent-length: 0\r\n\r\n')
+
+    orig_timeout = brozzler.robots._SessionRaiseOn420.timeout
+
+    httpd = http.server.HTTPServer(('localhost', 0), Handler)
+    httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
+    httpd_thread.start()
+
+    try:
+        url = 'http://localhost:%s/' % httpd.server_port
+        site = brozzler.Site(None, {'seed': url})
+        brozzler.robots._SessionRaiseOn420.timeout = 2
+        assert brozzler.is_permitted_by_robots(site, url)
+    finally:
+        brozzler.robots._SessionRaiseOn420.timeout = orig_timeout
+        stop_hanging.set()
+        httpd.shutdown()
+        httpd.server_close()
+        httpd_thread.join()
+
+def test_robots_dns_failure():
+    # .invalid. is guaranteed nonexistent per rfc 6761
+    url = 'http://whatever.invalid./'
+    site = brozzler.Site(None, {'seed': url})
+    assert brozzler.is_permitted_by_robots(site, url)
+
+def test_robots_connection_failure():
+    # expect the connection attempt itself to fail (no listener on the port)
+    url = 'http://localhost:4/' # nobody listens on port 4
+    site = brozzler.Site(None, {'seed': url})
+    assert brozzler.is_permitted_by_robots(site, url)
+
 def test_scoping():
    test_scope = yaml.load('''
 max_hops: 100