diff --git a/.travis.yml b/.travis.yml
index 67e80b9..552bf6b 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -4,9 +4,7 @@ python:
 sudo: required
 dist: trusty
 before_install:
-- sudo apt-add-repository -y ppa:ansible/ansible
-- sudo apt-get -qq update
-- sudo apt-get install -y ansible
+- sudo pip install ansible==2.1.3.0
 install:
 - ansible-playbook --extra-vars="brozzler_pip_name=file://$TRAVIS_BUILD_DIR#egg=brozzler user=travis" --inventory-file=ansible/hosts-localhost ansible/playbook.yml
 - pip install $TRAVIS_BUILD_DIR pytest
diff --git a/ansible/roles/warcprox/templates/warcprox.conf.j2 b/ansible/roles/warcprox/templates/warcprox.conf.j2
index 34101d6..917d072 100644
--- a/ansible/roles/warcprox/templates/warcprox.conf.j2
+++ b/ansible/roles/warcprox/templates/warcprox.conf.j2
@@ -14,6 +14,7 @@ console log
 
 # --profile
 exec nice warcprox \
+    --address=0.0.0.0 \
     --dir={{warcs_dir}} \
     --base32 \
     --gzip \
diff --git a/brozzler/cli.py b/brozzler/cli.py
old mode 100755
new mode 100644
diff --git a/brozzler/job.py b/brozzler/job.py
index a213eae..a178884 100644
--- a/brozzler/job.py
+++ b/brozzler/job.py
@@ -106,13 +106,11 @@ def new_site(frontier, site):
     # where a brozzler worker immediately claims the site, finds no pages
     # to crawl, and decides the site is finished
     try:
-        if brozzler.is_permitted_by_robots(site, site.seed):
-            page = brozzler.Page(site.seed, site_id=site.id,
-                    job_id=site.job_id, hops_from_seed=0, priority=1000)
-            frontier.new_page(page)
-            logging.info("queued page %s", page)
-        else:
-            logging.warn("seed url %s is blocked by robots.txt", site.seed)
+        page = brozzler.Page(
+                site.seed, site_id=site.id, job_id=site.job_id,
+                hops_from_seed=0, priority=1000, needs_robots_check=True)
+        frontier.new_page(page)
+        logging.info("queued page %s", page)
     finally:
         # finally block because we want to insert the Site no matter what
         frontier.new_site(site)
diff --git a/brozzler/robots.py b/brozzler/robots.py
index cb5ffe1..26329d1 100644
--- a/brozzler/robots.py
+++ b/brozzler/robots.py
@@ -1,6 +1,12 @@
 '''
 brozzler/robots.py - robots.txt support
 
+Uses the reppy library version 0.3.4. Monkey-patches reppy to support substring
+user-agent matching. We're sticking with 0.3.4 because later versions don't
+support supplying a custom requests.Session.
+
+See also https://github.com/seomoz/reppy/issues/37
+
 Copyright (C) 2014-2016 Internet Archive
 
 Licensed under the Apache License, Version 2.0 (the "License");
@@ -19,11 +25,27 @@ limitations under the License.
 import json
 import logging
 import brozzler
+import reppy
 import reppy.cache
+import reppy.parser
 import requests
 
 __all__ = ["is_permitted_by_robots"]
 
+# monkey-patch reppy to do substring user-agent matching, see top of file
+reppy.Utility.short_user_agent = lambda strng: strng
+def _reppy_rules_getitem(self, agent):
+    '''
+    Find the user-agent token matching the supplied full user-agent, using
+    a case-insensitive substring search.
+    '''
+    lc_agent = agent.lower()
+    for s in self.agents:
+        if s in lc_agent:
+            return self.agents[s]
+    return self.agents.get('*')
+reppy.parser.Rules.__getitem__ = _reppy_rules_getitem
+
 _robots_caches = {} # {site_id:reppy.cache.RobotsCache}
 def _robots_cache(site):
     class SessionRaiseOn420(requests.Session):
@@ -55,7 +77,8 @@ def is_permitted_by_robots(site, url):
     tries_left = 10
     while True:
         try:
-            result = _robots_cache(site).allowed(url, "brozzler")
+            result = _robots_cache(site).allowed(
+                    url, site.user_agent or "brozzler")
             return result
         except BaseException as e:
             if isinstance(e, reppy.exceptions.ServerError) and isinstance(e.args[0], brozzler.ReachedLimit):
diff --git a/brozzler/site.py b/brozzler/site.py
index 5cccac8..8ff692a 100644
--- a/brozzler/site.py
+++ b/brozzler/site.py
@@ -80,9 +80,8 @@ class Url:
             pass
 
         # if we get here, we're looking at two hostnames
-        # XXX do we need to handle case of one punycoded idn, other not?
-        domain_parts = ip_or_domain.split(".")
-        host_parts = self.host.split(".")
+        domain_parts = ip_or_domain.encode("idna").decode("ascii").lower().split(".")
+        host_parts = self.host.encode("idna").decode("ascii").lower().split(".")
 
         return host_parts[-len(domain_parts):] == domain_parts
 
@@ -228,7 +227,7 @@ class Page(brozzler.BaseDictable):
             self, url, id=None, site_id=None, job_id=None, hops_from_seed=0,
             redirect_url=None, priority=None, claimed=False, brozzle_count=0,
             via_page_id=None, last_claimed_by=None, hops_off_surt=0,
-            outlinks=None):
+            outlinks=None, needs_robots_check=False):
         self.site_id = site_id
         self.job_id = job_id
         self.url = url
@@ -240,6 +239,7 @@ class Page(brozzler.BaseDictable):
         self.via_page_id = via_page_id
         self.hops_off_surt = hops_off_surt
         self.outlinks = outlinks
+        self.needs_robots_check = needs_robots_check
         self._canon_hurl = None
 
         if priority is not None:
diff --git a/brozzler/worker.py b/brozzler/worker.py
index c306d95..21aa22e 100644
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@@ -327,12 +327,18 @@ class BrozzlerWorker:
                 self._frontier.honor_stop_request(site.job_id)
                 page = self._frontier.claim_page(site, "%s:%s" % (
                     socket.gethostname(), browser.chrome_port))
-                outlinks = self.brozzle_page(browser, site, page)
-                if browser.is_running():
-                    site.cookie_db = browser.persist_and_read_cookie_db()
+
+                if (page.needs_robots_check and
+                        not brozzler.is_permitted_by_robots(site, page.url)):
+                    logging.warn("page %s is blocked by robots.txt", page.url)
+                else:
+                    outlinks = self.brozzle_page(browser, site, page)
+                    self._frontier.scope_and_schedule_outlinks(
+                            site, page, outlinks)
+                    if browser.is_running():
+                        site.cookie_db = browser.persist_and_read_cookie_db()
+
                 self._frontier.completed_page(site, page)
-                self._frontier.scope_and_schedule_outlinks(
-                    site, page, outlinks)
                 page = None
             except brozzler.NothingToClaim:
                 self.logger.info("no pages left for site %s", site)
diff --git a/setup.py b/setup.py
index d70d02b..dc70a9c 100644
--- a/setup.py
+++ b/setup.py
@@ -32,7 +32,7 @@ def find_package_data(package):
 
 setuptools.setup(
         name='brozzler',
-        version='1.1b8.dev123',
+        version='1.1b8.dev127',
         description='Distributed web crawling with browsers',
         url='https://github.com/internetarchive/brozzler',
         author='Noah Levitt',
diff --git a/tests/htdocs/robots.txt b/tests/htdocs/robots.txt
new file mode 100644
index 0000000..05ce8f2
--- /dev/null
+++ b/tests/htdocs/robots.txt
@@ -0,0 +1,2 @@
+User-agent: badbot
+Disallow: /
diff --git a/tests/test_cluster.py b/tests/test_cluster.py
index 1815cc9..ef4c51a 100644
--- a/tests/test_cluster.py
+++ b/tests/test_cluster.py
@@ -29,6 +29,13 @@ import time
 import brozzler
 import datetime
 import requests
+import subprocess
+
+def start_service(service):
+    subprocess.check_call(['sudo', 'service', service, 'start'])
+
+def stop_service(service):
+    subprocess.check_call(['sudo', 'service', service, 'stop'])
 
 @pytest.fixture(scope='module')
 def httpd(request):
@@ -102,12 +109,18 @@
     page1 = 'http://localhost:%s/' % httpd.server_port
     page2 = 'http://localhost:%s/file1.txt' % httpd.server_port
 
-    assert site.id is None
-    r = rethinkstuff.Rethinker('localhost', db='brozzler')
-    frontier = brozzler.RethinkDbFrontier(r)
-    brozzler.new_site(frontier, site)
-    assert site.id is not None
-    assert len(list(frontier.site_pages(site.id))) == 1
+    # so we can examine rethinkdb before it does anything
+    try:
+        stop_service('brozzler-worker')
+
+        assert site.id is None
+        r = rethinkstuff.Rethinker('localhost', db='brozzler')
+        frontier = brozzler.RethinkDbFrontier(r)
+        brozzler.new_site(frontier, site)
+        assert site.id is not None
+        assert len(list(frontier.site_pages(site.id))) == 1
+    finally:
+        start_service('brozzler-worker')
 
     # the site should be brozzled fairly quickly
     start = time.time()
@@ -118,14 +131,17 @@
 
     # check that we got the two pages we expected
     pages = list(frontier.site_pages(site.id))
-    assert len(pages) == 2
+    assert len(pages) == 3
     assert {page.url for page in pages} == {
             'http://localhost:%s/' % httpd.server_port,
+            'http://localhost:%s/robots.txt' % httpd.server_port,
             'http://localhost:%s/file1.txt' % httpd.server_port}
 
+    time.sleep(2) # in case warcprox hasn't finished processing urls
     # take a look at the captures table
     captures = r.table('captures').filter({'test_id':test_id}).run()
-    captures_by_url = {c['url']:c for c in captures if c['http_method'] != 'HEAD'}
+    captures_by_url = {
+            c['url']: c for c in captures if c['http_method'] != 'HEAD'}
     assert page1 in captures_by_url
     assert '%srobots.txt' % page1 in captures_by_url
     assert page2 in captures_by_url
@@ -140,7 +156,6 @@
             os.path.dirname(__file__), 'htdocs', 'file1.txt'), 'rb').read()
     assert requests.get(wb_url).content == expected_payload
 
-
 def test_warcprox_selection(httpd):
     ''' When enable_warcprox_features is true, brozzler is expected to
     choose and instance of warcprox '''
@@ -156,12 +171,17 @@
             enable_warcprox_features=True,
             warcprox_meta={'captures-table-extra-fields':{'test_id':test_id}})
 
-    assert site.id is None
-    r = rethinkstuff.Rethinker('localhost', db='brozzler')
-    frontier = brozzler.RethinkDbFrontier(r)
-    brozzler.new_site(frontier, site)
-    assert site.id is not None
-    assert len(list(frontier.site_pages(site.id))) == 1
+    # so we can examine rethinkdb before it does anything
+    try:
+        stop_service('brozzler-worker')
+        assert site.id is None
+        r = rethinkstuff.Rethinker('localhost', db='brozzler')
+        frontier = brozzler.RethinkDbFrontier(r)
+        brozzler.new_site(frontier, site)
+        assert site.id is not None
+        assert len(list(frontier.site_pages(site.id))) == 1
+    finally:
+        start_service('brozzler-worker')
 
     # check proxy is set in rethink
     start = time.time()
@@ -179,14 +199,17 @@
 
     # check that we got the two pages we expected
    pages = list(frontier.site_pages(site.id))
-    assert len(pages) == 2
+    assert len(pages) == 3
     assert {page.url for page in pages} == {
             'http://localhost:%s/' % httpd.server_port,
+            'http://localhost:%s/robots.txt' % httpd.server_port,
             'http://localhost:%s/file1.txt' % httpd.server_port}
 
+    time.sleep(2) # in case warcprox hasn't finished processing urls
     # take a look at the captures table
     captures = r.table('captures').filter({'test_id':test_id}).run()
-    captures_by_url = {c['url']:c for c in captures if c['http_method'] != 'HEAD'}
+    captures_by_url = {
+            c['url']:c for c in captures if c['http_method'] != 'HEAD'}
     assert page1 in captures_by_url
     assert '%srobots.txt' % page1 in captures_by_url
     assert page2 in captures_by_url
@@ -199,4 +222,57 @@
     wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, page2)
     expected_payload = open(os.path.join(
             os.path.dirname(__file__), 'htdocs', 'file1.txt'), 'rb').read()
-    assert requests.get(wb_url).content == expected_payload
+    assert requests.get(
+            wb_url, allow_redirects=False).content == expected_payload
+
+def test_obey_robots(httpd):
+    test_id = 'test_obey_robots-%s' % datetime.datetime.utcnow().isoformat()
+    site = brozzler.Site(
+            seed='http://localhost:%s/' % httpd.server_port,
+            proxy='localhost:8000', enable_warcprox_features=True,
+            user_agent='im a badbot', # robots.txt blocks badbot
+            warcprox_meta={'captures-table-extra-fields':{'test_id':test_id}})
+
+    # so we can examine rethinkdb before it does anything
+    try:
+        stop_service('brozzler-worker')
+
+        assert site.id is None
+        r = rethinkstuff.Rethinker('localhost', db='brozzler')
+        frontier = brozzler.RethinkDbFrontier(r)
+        brozzler.new_site(frontier, site)
+        assert site.id is not None
+        site_pages = list(frontier.site_pages(site.id))
+        assert len(site_pages) == 1
+        assert site_pages[0].url == site.seed
+        assert site_pages[0].needs_robots_check
+    finally:
+        start_service('brozzler-worker')
+
+    # the site should be brozzled fairly quickly
+    start = time.time()
+    while site.status != 'FINISHED' and time.time() - start < 300:
+        time.sleep(0.5)
+        site = frontier.site(site.id)
+    assert site.status == 'FINISHED'
+
+    # check that we got the two pages we expected
+    pages = list(frontier.site_pages(site.id))
+    assert len(pages) == 1
+    assert {page.url for page in pages} == {
+            'http://localhost:%s/' % httpd.server_port}
+
+    # take a look at the captures table
+    time.sleep(2) # in case warcprox hasn't finished processing urls
+    robots_url = 'http://localhost:%s/robots.txt' % httpd.server_port
+    captures = list(r.table('captures').filter({'test_id':test_id}).run())
+    assert len(captures) == 1
+    assert captures[0]['url'] == robots_url
+
+    # check pywb
+    t14 = captures[0]['timestamp'].strftime('%Y%m%d%H%M%S')
+    wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, robots_url)
+    expected_payload = open(os.path.join(
+            os.path.dirname(__file__), 'htdocs', 'robots.txt'), 'rb').read()
+    assert requests.get(
+            wb_url, allow_redirects=False).content == expected_payload
diff --git a/tests/test_units.py b/tests/test_units.py
new file mode 100644
index 0000000..2fee049
--- /dev/null
+++ b/tests/test_units.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python
+'''
+test_units.py - some unit tests for parts of brozzler amenable to that
+
+Copyright (C) 2016 Internet Archive
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+'''
+
+import pytest
+import http.server
+import threading
+import os
+import brozzler
+
+@pytest.fixture(scope='module')
+def httpd(request):
+    # SimpleHTTPRequestHandler always uses CWD so we have to chdir
+    os.chdir(os.path.join(os.path.dirname(__file__), 'htdocs'))
+
+    httpd = http.server.HTTPServer(
+            ('localhost', 0), http.server.SimpleHTTPRequestHandler)
+    httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
+    httpd_thread.start()
+
+    def fin():
+        httpd.shutdown()
+        httpd.server_close()
+        httpd_thread.join()
+    request.addfinalizer(fin)
+
+    return httpd
+
+def test_robots(httpd):
+    '''
+    Basic test of robots.txt user-agent substring matching.
+    '''
+    url = 'http://localhost:%s/' % httpd.server_port
+    site = brozzler.Site(seed=url, user_agent='im/a/GoOdbot/yep')
+    assert brozzler.is_permitted_by_robots(site, url)
+
+    site = brozzler.Site(seed=url, user_agent='im/a bAdBOt/uh huh')
+    assert not brozzler.is_permitted_by_robots(site, url)
+
diff --git a/vagrant/Vagrantfile b/vagrant/Vagrantfile
index 5a1c8ae..f983e7c 100644
--- a/vagrant/Vagrantfile
+++ b/vagrant/Vagrantfile
@@ -6,6 +6,11 @@ Vagrant.configure(2) do |config|
 
   config.vm.synced_folder "..", "/brozzler"
 
+  # bump up memory to avoid "can't start new thread" errors
+  config.vm.provider "virtualbox" do |v|
+    v.memory = 1024
+  end
+
   config.vm.provision "ansible" do |ansible|
     ansible.inventory_path = "../ansible/hosts-vagrant"
     ansible.playbook = "../ansible/playbook.yml"
diff --git a/vagrant/run-tests.sh b/vagrant/run-tests.sh
index 2a8a0b5..710438b 100755
--- a/vagrant/run-tests.sh
+++ b/vagrant/run-tests.sh
@@ -1,4 +1,9 @@
 #!/bin/bash
+#
+# any arguments are passed on to py.test
+# so for example to run only "test_obey_robots" you could run
+# ./run-tests.sh -k test_obey_robots
+#
 
 cd $(dirname "${BASH_SOURCE[0]}")
 
@@ -11,4 +16,4 @@ vagrant ssh -- 'status warcprox ;
 echo
 vagrant ssh -- 'source /opt/brozzler-ve34/bin/activate && pip install pytest'
 
-vagrant ssh -- 'source /opt/brozzler-ve34/bin/activate && py.test -v -s /brozzler/tests'
+vagrant ssh -- "source /opt/brozzler-ve34/bin/activate && py.test -v -s /brozzler/tests $@"
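
Note (not part of the patch): the substring user-agent matching that the robots.py monkey-patch gives reppy 0.3.4 can be illustrated with a small standalone sketch. The function name match_robots_agent and the example rules dict below are hypothetical, for illustration only; they assume, as the patched __getitem__ does, that the robots.txt user-agent tokens used as dict keys are already lowercase.

    # Hypothetical standalone sketch (no reppy required) of the lookup behavior
    # _reppy_rules_getitem patches into reppy: a robots.txt user-agent token
    # matches any full user-agent string that contains it, case-insensitively,
    # falling back to the '*' rules when no token matches.
    def match_robots_agent(agents, full_user_agent):
        # `agents` maps lowercase robots.txt user-agent tokens to their rules
        lc_agent = full_user_agent.lower()
        for token in agents:
            if token in lc_agent:
                return agents[token]
        return agents.get('*')

    if __name__ == '__main__':
        rules = {'badbot': 'blocked', '*': 'allowed'}
        assert match_robots_agent(rules, 'im a badbot') == 'blocked'       # substring hit
        assert match_robots_agent(rules, 'im/a/GoOdbot/yep') == 'allowed'  # falls back to '*'
        print('ok')

The two assertions mirror the cases exercised by test_robots in tests/test_units.py above: a good bot falls through to the wildcard rules, while any user-agent containing "badbot" gets the blocked rules from tests/htdocs/robots.txt.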