Mirror of https://github.com/internetarchive/brozzler.git
Merge branch 'master' into qa
* master:
  - don't check robots.txt when scheduling a new site to be crawled, but mark the seed Page as needs_robots_check, and delegate the robots check to brozzler-worker
  - new test of robots.txt adherence
  - robots.txt for testing
  - monkey-patch reppy to support substring user-agent matching
  - give vagrant vm enough memory so that tests pass consistently
  - need warcprox to listen on public address because that's what it puts in the service registry
  - looks like the problem may have been a bug in ansible 2.2.0.0, so pin to 2.1.3.0
commit eaa32ad3fc
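In short, this change moves the robots.txt check from site-scheduling time to crawl time. A condensed sketch of the two sides (simplified from the diffs below, not the literal code; the worker-side snippet omits the surrounding claim/complete loop):

# scheduling side (new_site): no robots.txt fetch any more, just flag the seed page
page = brozzler.Page(
        site.seed, site_id=site.id, job_id=site.job_id,
        hops_from_seed=0, priority=1000, needs_robots_check=True)
frontier.new_page(page)

# crawling side (brozzler-worker): check robots.txt lazily, right before brozzling
if (page.needs_robots_check
        and not brozzler.is_permitted_by_robots(site, page.url)):
    logging.warn("page %s is blocked by robots.txt", page.url)
else:
    outlinks = self.brozzle_page(browser, site, page)
    self._frontier.scope_and_schedule_outlinks(site, page, outlinks)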
@@ -4,9 +4,7 @@ python:
 sudo: required
 dist: trusty
 before_install:
-- sudo apt-add-repository -y ppa:ansible/ansible
-- sudo apt-get -qq update
-- sudo apt-get install -y ansible
+- sudo pip install ansible==2.1.3.0
 install:
 - ansible-playbook --extra-vars="brozzler_pip_name=file://$TRAVIS_BUILD_DIR#egg=brozzler user=travis" --inventory-file=ansible/hosts-localhost ansible/playbook.yml
 - pip install $TRAVIS_BUILD_DIR pytest
@@ -14,6 +14,7 @@ console log
 # --profile
 exec nice warcprox \
+        --address=0.0.0.0 \
         --dir={{warcs_dir}} \
         --base32 \
         --gzip \
brozzler/cli.py (Executable file → Normal file, 0 changed lines)
@@ -106,13 +106,11 @@ def new_site(frontier, site):
     # where a brozzler worker immediately claims the site, finds no pages
     # to crawl, and decides the site is finished
     try:
-        if brozzler.is_permitted_by_robots(site, site.seed):
-            page = brozzler.Page(site.seed, site_id=site.id,
-                    job_id=site.job_id, hops_from_seed=0, priority=1000)
-            frontier.new_page(page)
-            logging.info("queued page %s", page)
-        else:
-            logging.warn("seed url %s is blocked by robots.txt", site.seed)
+        page = brozzler.Page(
+                site.seed, site_id=site.id, job_id=site.job_id,
+                hops_from_seed=0, priority=1000, needs_robots_check=True)
+        frontier.new_page(page)
+        logging.info("queued page %s", page)
     finally:
         # finally block because we want to insert the Site no matter what
         frontier.new_site(site)
@@ -1,6 +1,12 @@
 '''
 brozzler/robots.py - robots.txt support
 
+Uses the reppy library version 0.3.4. Monkey-patches reppy to support substring
+user-agent matching. We're sticking with 0.3.4 because later versions don't
+support supplying a custom requests.Session.
+
+See also https://github.com/seomoz/reppy/issues/37
+
 Copyright (C) 2014-2016 Internet Archive
 
 Licensed under the Apache License, Version 2.0 (the "License");
@@ -19,11 +25,27 @@ limitations under the License.
 import json
 import logging
 import brozzler
+import reppy
 import reppy.cache
+import reppy.parser
 import requests
 
 __all__ = ["is_permitted_by_robots"]
 
+# monkey-patch reppy to do substring user-agent matching, see top of file
+reppy.Utility.short_user_agent = lambda strng: strng
+def _reppy_rules_getitem(self, agent):
+    '''
+    Find the user-agent token matching the supplied full user-agent, using
+    a case-insensitive substring search.
+    '''
+    lc_agent = agent.lower()
+    for s in self.agents:
+        if s in lc_agent:
+            return self.agents[s]
+    return self.agents.get('*')
+reppy.parser.Rules.__getitem__ = _reppy_rules_getitem
+
 _robots_caches = {} # {site_id:reppy.cache.RobotsCache}
 def _robots_cache(site):
     class SessionRaiseOn420(requests.Session):
@@ -55,7 +77,8 @@ def is_permitted_by_robots(site, url):
     tries_left = 10
     while True:
         try:
-            result = _robots_cache(site).allowed(url, "brozzler")
+            result = _robots_cache(site).allowed(
+                    url, site.user_agent or "brozzler")
             return result
         except BaseException as e:
             if isinstance(e, reppy.exceptions.ServerError) and isinstance(e.args[0], brozzler.ReachedLimit):
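To see what the substring matching buys: stock reppy looks up a robots.txt group by exact user-agent token, while the patched __getitem__ above matches group names as case-insensitive substrings of the full user-agent string. A minimal, self-contained illustration of that lookup logic (FakeRules and the user-agent strings are invented for the example; real callers go through brozzler.is_permitted_by_robots, as in the new tests further down):

def _substring_getitem(self, agent):
    # same logic as the patched _reppy_rules_getitem above
    lc_agent = agent.lower()
    for s in self.agents:
        if s in lc_agent:
            return self.agents[s]
    return self.agents.get('*')

class FakeRules:
    # stand-in for reppy.parser.Rules, which keeps per-token rules in .agents
    agents = {'badbot': 'rules for badbot', '*': 'default rules'}
    __getitem__ = _substring_getitem

rules = FakeRules()
assert rules['Mozilla/5.0 (compatible; im a BadBot/1.0)'] == 'rules for badbot'
assert rules['Mozilla/5.0 (compatible; GoOdbot/1.0)'] == 'default rules'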
@@ -80,9 +80,8 @@ class Url:
             pass
 
         # if we get here, we're looking at two hostnames
-        # XXX do we need to handle case of one punycoded idn, other not?
-        domain_parts = ip_or_domain.split(".")
-        host_parts = self.host.split(".")
+        domain_parts = ip_or_domain.encode("idna").decode("ascii").lower().split(".")
+        host_parts = self.host.encode("idna").decode("ascii").lower().split(".")
 
         return host_parts[-len(domain_parts):] == domain_parts
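The effect of this change is that both the candidate domain and the url's host are IDNA-encoded and lowercased before the label-by-label suffix comparison, so a Unicode hostname and its punycoded spelling now compare equal. A rough illustration using only the stdlib codec the new lines rely on (the example hostname is made up):

# both spellings normalize to the same label list
assert 'café.example'.encode('idna').decode('ascii').lower().split('.') == \
        ['xn--caf-dma', 'example']
assert 'xn--caf-dma.example'.encode('idna').decode('ascii').lower().split('.') == \
        ['xn--caf-dma', 'example']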
@@ -228,7 +227,7 @@ class Page(brozzler.BaseDictable):
             self, url, id=None, site_id=None, job_id=None, hops_from_seed=0,
             redirect_url=None, priority=None, claimed=False, brozzle_count=0,
             via_page_id=None, last_claimed_by=None, hops_off_surt=0,
-            outlinks=None):
+            outlinks=None, needs_robots_check=False):
         self.site_id = site_id
         self.job_id = job_id
         self.url = url
@@ -240,6 +239,7 @@ class Page(brozzler.BaseDictable):
         self.via_page_id = via_page_id
         self.hops_off_surt = hops_off_surt
         self.outlinks = outlinks
+        self.needs_robots_check = needs_robots_check
         self._canon_hurl = None
 
         if priority is not None:
@@ -327,12 +327,18 @@ class BrozzlerWorker:
                 self._frontier.honor_stop_request(site.job_id)
                 page = self._frontier.claim_page(site, "%s:%s" % (
                     socket.gethostname(), browser.chrome_port))
-                outlinks = self.brozzle_page(browser, site, page)
-                if browser.is_running():
-                    site.cookie_db = browser.persist_and_read_cookie_db()
+
+                if (page.needs_robots_check and
+                        not brozzler.is_permitted_by_robots(site, page.url)):
+                    logging.warn("page %s is blocked by robots.txt", page.url)
+                else:
+                    outlinks = self.brozzle_page(browser, site, page)
+                    self._frontier.scope_and_schedule_outlinks(
+                            site, page, outlinks)
+                if browser.is_running():
+                    site.cookie_db = browser.persist_and_read_cookie_db()
+
                 self._frontier.completed_page(site, page)
-                self._frontier.scope_and_schedule_outlinks(
-                        site, page, outlinks)
                 page = None
             except brozzler.NothingToClaim:
                 self.logger.info("no pages left for site %s", site)
setup.py (2 changed lines)
@@ -32,7 +32,7 @@ def find_package_data(package):
 setuptools.setup(
         name='brozzler',
-        version='1.1b8.dev123',
+        version='1.1b8.dev127',
         description='Distributed web crawling with browsers',
         url='https://github.com/internetarchive/brozzler',
         author='Noah Levitt',
tests/htdocs/robots.txt (new Normal file, 2 lines)
@@ -0,0 +1,2 @@
+User-agent: badbot
+Disallow: /
@@ -29,6 +29,13 @@ import time
 import brozzler
 import datetime
 import requests
+import subprocess
 
+def start_service(service):
+    subprocess.check_call(['sudo', 'service', service, 'start'])
+
+def stop_service(service):
+    subprocess.check_call(['sudo', 'service', service, 'stop'])
+
 @pytest.fixture(scope='module')
 def httpd(request):
@@ -102,12 +109,18 @@ def test_brozzle_site(httpd):
     page1 = 'http://localhost:%s/' % httpd.server_port
     page2 = 'http://localhost:%s/file1.txt' % httpd.server_port
 
-    assert site.id is None
-    r = rethinkstuff.Rethinker('localhost', db='brozzler')
-    frontier = brozzler.RethinkDbFrontier(r)
-    brozzler.new_site(frontier, site)
-    assert site.id is not None
-    assert len(list(frontier.site_pages(site.id))) == 1
+    # so we can examine rethinkdb before it does anything
+    try:
+        stop_service('brozzler-worker')
+
+        assert site.id is None
+        r = rethinkstuff.Rethinker('localhost', db='brozzler')
+        frontier = brozzler.RethinkDbFrontier(r)
+        brozzler.new_site(frontier, site)
+        assert site.id is not None
+        assert len(list(frontier.site_pages(site.id))) == 1
+    finally:
+        start_service('brozzler-worker')
 
     # the site should be brozzled fairly quickly
     start = time.time()
@@ -118,14 +131,17 @@ def test_brozzle_site(httpd):
 
     # check that we got the two pages we expected
     pages = list(frontier.site_pages(site.id))
-    assert len(pages) == 2
+    assert len(pages) == 3
     assert {page.url for page in pages} == {
             'http://localhost:%s/' % httpd.server_port,
+            'http://localhost:%s/robots.txt' % httpd.server_port,
             'http://localhost:%s/file1.txt' % httpd.server_port}
 
     time.sleep(2) # in case warcprox hasn't finished processing urls
     # take a look at the captures table
     captures = r.table('captures').filter({'test_id':test_id}).run()
-    captures_by_url = {c['url']:c for c in captures if c['http_method'] != 'HEAD'}
+    captures_by_url = {
+            c['url']: c for c in captures if c['http_method'] != 'HEAD'}
     assert page1 in captures_by_url
+    assert '%srobots.txt' % page1 in captures_by_url
     assert page2 in captures_by_url
@@ -140,7 +156,6 @@ def test_brozzle_site(httpd):
             os.path.dirname(__file__), 'htdocs', 'file1.txt'), 'rb').read()
     assert requests.get(wb_url).content == expected_payload
 
-
 def test_warcprox_selection(httpd):
     ''' When enable_warcprox_features is true, brozzler is expected to choose
     and instance of warcprox '''
@@ -156,12 +171,17 @@ def test_warcprox_selection(httpd):
             enable_warcprox_features=True,
             warcprox_meta={'captures-table-extra-fields':{'test_id':test_id}})
 
-    assert site.id is None
-    r = rethinkstuff.Rethinker('localhost', db='brozzler')
-    frontier = brozzler.RethinkDbFrontier(r)
-    brozzler.new_site(frontier, site)
-    assert site.id is not None
-    assert len(list(frontier.site_pages(site.id))) == 1
+    # so we can examine rethinkdb before it does anything
+    try:
+        stop_service('brozzler-worker')
+        assert site.id is None
+        r = rethinkstuff.Rethinker('localhost', db='brozzler')
+        frontier = brozzler.RethinkDbFrontier(r)
+        brozzler.new_site(frontier, site)
+        assert site.id is not None
+        assert len(list(frontier.site_pages(site.id))) == 1
+    finally:
+        start_service('brozzler-worker')
 
     # check proxy is set in rethink
     start = time.time()
@@ -179,14 +199,17 @@ def test_warcprox_selection(httpd):
 
     # check that we got the two pages we expected
     pages = list(frontier.site_pages(site.id))
-    assert len(pages) == 2
+    assert len(pages) == 3
     assert {page.url for page in pages} == {
             'http://localhost:%s/' % httpd.server_port,
+            'http://localhost:%s/robots.txt' % httpd.server_port,
             'http://localhost:%s/file1.txt' % httpd.server_port}
 
     time.sleep(2) # in case warcprox hasn't finished processing urls
     # take a look at the captures table
     captures = r.table('captures').filter({'test_id':test_id}).run()
-    captures_by_url = {c['url']:c for c in captures if c['http_method'] != 'HEAD'}
+    captures_by_url = {
+            c['url']:c for c in captures if c['http_method'] != 'HEAD'}
     assert page1 in captures_by_url
+    assert '%srobots.txt' % page1 in captures_by_url
     assert page2 in captures_by_url
@@ -199,4 +222,57 @@ def test_warcprox_selection(httpd):
     wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, page2)
     expected_payload = open(os.path.join(
             os.path.dirname(__file__), 'htdocs', 'file1.txt'), 'rb').read()
-    assert requests.get(wb_url).content == expected_payload
+    assert requests.get(
+            wb_url, allow_redirects=False).content == expected_payload
+
+def test_obey_robots(httpd):
+    test_id = 'test_obey_robots-%s' % datetime.datetime.utcnow().isoformat()
+    site = brozzler.Site(
+            seed='http://localhost:%s/' % httpd.server_port,
+            proxy='localhost:8000', enable_warcprox_features=True,
+            user_agent='im a badbot',   # robots.txt blocks badbot
+            warcprox_meta={'captures-table-extra-fields':{'test_id':test_id}})
+
+    # so we can examine rethinkdb before it does anything
+    try:
+        stop_service('brozzler-worker')
+
+        assert site.id is None
+        r = rethinkstuff.Rethinker('localhost', db='brozzler')
+        frontier = brozzler.RethinkDbFrontier(r)
+        brozzler.new_site(frontier, site)
+        assert site.id is not None
+        site_pages = list(frontier.site_pages(site.id))
+        assert len(site_pages) == 1
+        assert site_pages[0].url == site.seed
+        assert site_pages[0].needs_robots_check
+    finally:
+        start_service('brozzler-worker')
+
+    # the site should be brozzled fairly quickly
+    start = time.time()
+    while site.status != 'FINISHED' and time.time() - start < 300:
+        time.sleep(0.5)
+        site = frontier.site(site.id)
+    assert site.status == 'FINISHED'
+
+    # check that we got the two pages we expected
+    pages = list(frontier.site_pages(site.id))
+    assert len(pages) == 1
+    assert {page.url for page in pages} == {
+            'http://localhost:%s/' % httpd.server_port}
+
+    # take a look at the captures table
+    time.sleep(2) # in case warcprox hasn't finished processing urls
+    robots_url = 'http://localhost:%s/robots.txt' % httpd.server_port
+    captures = list(r.table('captures').filter({'test_id':test_id}).run())
+    assert len(captures) == 1
+    assert captures[0]['url'] == robots_url
+
+    # check pywb
+    t14 = captures[0]['timestamp'].strftime('%Y%m%d%H%M%S')
+    wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, robots_url)
+    expected_payload = open(os.path.join(
+            os.path.dirname(__file__), 'htdocs', 'robots.txt'), 'rb').read()
+    assert requests.get(
+            wb_url, allow_redirects=False).content == expected_payload
tests/test_units.py (new Normal file, 54 lines)
@@ -0,0 +1,54 @@
+#!/usr/bin/env python
+'''
+test_units.py - some unit tests for parts of brozzler amenable to that
+
+Copyright (C) 2016 Internet Archive
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+'''
+
+import pytest
+import http.server
+import threading
+import os
+import brozzler
+
+@pytest.fixture(scope='module')
+def httpd(request):
+    # SimpleHTTPRequestHandler always uses CWD so we have to chdir
+    os.chdir(os.path.join(os.path.dirname(__file__), 'htdocs'))
+
+    httpd = http.server.HTTPServer(
+            ('localhost', 0), http.server.SimpleHTTPRequestHandler)
+    httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
+    httpd_thread.start()
+
+    def fin():
+        httpd.shutdown()
+        httpd.server_close()
+        httpd_thread.join()
+    request.addfinalizer(fin)
+
+    return httpd
+
+def test_robots(httpd):
+    '''
+    Basic test of robots.txt user-agent substring matching.
+    '''
+    url = 'http://localhost:%s/' % httpd.server_port
+    site = brozzler.Site(seed=url, user_agent='im/a/GoOdbot/yep')
+    assert brozzler.is_permitted_by_robots(site, url)
+
+    site = brozzler.Site(seed=url, user_agent='im/a bAdBOt/uh huh')
+    assert not brozzler.is_permitted_by_robots(site, url)
vagrant/Vagrantfile (vendored, 5 changed lines)
@@ -6,6 +6,11 @@ Vagrant.configure(2) do |config|
   config.vm.synced_folder "..", "/brozzler"
 
+  # bump up memory to avoid "can't start new thread" errors
+  config.vm.provider "virtualbox" do |v|
+    v.memory = 1024
+  end
+
   config.vm.provision "ansible" do |ansible|
     ansible.inventory_path = "../ansible/hosts-vagrant"
     ansible.playbook = "../ansible/playbook.yml"
@@ -1,4 +1,9 @@
 #!/bin/bash
+#
+# any arguments are passed on to py.test
+# so for example to run only "test_obey_robots" you could run
+# ./run-tests.sh -k test_obey_robots
+#
 
 cd $(dirname "${BASH_SOURCE[0]}")
@@ -11,4 +16,4 @@ vagrant ssh -- 'status warcprox ;
 echo
 
 vagrant ssh -- 'source /opt/brozzler-ve34/bin/activate && pip install pytest'
-vagrant ssh -- 'source /opt/brozzler-ve34/bin/activate && py.test -v -s /brozzler/tests'
+vagrant ssh -- "source /opt/brozzler-ve34/bin/activate && py.test -v -s /brozzler/tests $@"