Merge branch 'master' into qa

* master:
  don't check robots.txt when scheduling a new site to be crawled, but mark the seed Page as needs_robots_check, and delegate the robots check to brozzler-worker; new test of robots.txt adherence
  robots.txt for testing
  monkey-patch reppy to support substring user-agent matching
  give vagrant vm enough memory so that tests pass consistently
  need warcprox to listen on public address because that's what it puts in the service registry
  looks like the problem may have been a bug in ansible 2.2.0.0, so pin to 2.1.3.0
Noah Levitt 2016-11-16 12:24:30 -08:00
commit eaa32ad3fc
13 changed files with 208 additions and 40 deletions
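In outline, this change moves the robots.txt check from scheduling time (brozzler.new_site) to crawl time (brozzler-worker). A minimal runnable sketch of the new division of labor, using a simplified stand-in for brozzler's Page class rather than the real thing:

    import logging

    class Page:  # simplified stand-in for brozzler.Page
        def __init__(self, url, needs_robots_check=False):
            self.url = url
            self.needs_robots_check = needs_robots_check

    def schedule_seed(url):
        # the scheduler no longer fetches robots.txt; it just flags the seed
        return Page(url, needs_robots_check=True)

    def worker_should_brozzle(page, is_permitted_by_robots):
        # the worker performs the deferred robots check right before crawling
        if page.needs_robots_check and not is_permitted_by_robots(page.url):
            logging.warning('page %s is blocked by robots.txt', page.url)
            return False
        return True

    page = schedule_seed('http://example.com/')
    print(worker_should_brozzle(page, lambda url: False))  # blocked -> False

The diffs below are the real implementation; the sketch only shows the shape of the hand-off.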


@@ -4,9 +4,7 @@ python:
sudo: required
dist: trusty
before_install:
- sudo apt-add-repository -y ppa:ansible/ansible
- sudo apt-get -qq update
- sudo apt-get install -y ansible
- sudo pip install ansible==2.1.3.0
install:
- ansible-playbook --extra-vars="brozzler_pip_name=file://$TRAVIS_BUILD_DIR#egg=brozzler user=travis" --inventory-file=ansible/hosts-localhost ansible/playbook.yml
- pip install $TRAVIS_BUILD_DIR pytest


@@ -14,6 +14,7 @@ console log
# --profile
exec nice warcprox \
--address=0.0.0.0 \
--dir={{warcs_dir}} \
--base32 \
--gzip \
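warcprox advertises its own hostname and port in the service registry, so a loopback-only listener would publish an address that other hosts cannot reach; binding 0.0.0.0 makes the advertised port reachable from outside the VM. A stdlib-only illustration of the difference (the "registry entry" printed here is a made-up stand-in, not warcprox's actual record format):

    import socket

    # 127.0.0.1 accepts connections only from the local machine;
    # 0.0.0.0 accepts them on every network interface
    s = socket.socket()
    s.bind(('0.0.0.0', 0))  # any interface, ephemeral port
    port = s.getsockname()[1]
    print('would advertise %s:%s' % (socket.gethostname(), port))
    s.close()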

brozzler/cli.py Executable file → Normal file

@@ -106,13 +106,11 @@ def new_site(frontier, site):
# where a brozzler worker immediately claims the site, finds no pages
# to crawl, and decides the site is finished
try:
if brozzler.is_permitted_by_robots(site, site.seed):
page = brozzler.Page(site.seed, site_id=site.id,
job_id=site.job_id, hops_from_seed=0, priority=1000)
frontier.new_page(page)
logging.info("queued page %s", page)
else:
logging.warn("seed url %s is blocked by robots.txt", site.seed)
page = brozzler.Page(
site.seed, site_id=site.id, job_id=site.job_id,
hops_from_seed=0, priority=1000, needs_robots_check=True)
frontier.new_page(page)
logging.info("queued page %s", page)
finally:
# finally block because we want to insert the Site no matter what
frontier.new_site(site)


@@ -1,6 +1,12 @@
'''
brozzler/robots.py - robots.txt support
Uses the reppy library version 0.3.4. Monkey-patches reppy to support substring
user-agent matching. We're sticking with 0.3.4 because later versions don't
support supplying a custom requests.Session.
See also https://github.com/seomoz/reppy/issues/37
Copyright (C) 2014-2016 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License");
@@ -19,11 +25,27 @@ limitations under the License.
import json
import logging
import brozzler
import reppy
import reppy.cache
import reppy.parser
import requests
__all__ = ["is_permitted_by_robots"]
# monkey-patch reppy to do substring user-agent matching, see top of file
reppy.Utility.short_user_agent = lambda strng: strng
def _reppy_rules_getitem(self, agent):
'''
Find the user-agent token matching the supplied full user-agent, using
a case-insensitive substring search.
'''
lc_agent = agent.lower()
for s in self.agents:
if s in lc_agent:
return self.agents[s]
return self.agents.get('*')
reppy.parser.Rules.__getitem__ = _reppy_rules_getitem
_robots_caches = {} # {site_id:reppy.cache.RobotsCache}
def _robots_cache(site):
class SessionRaiseOn420(requests.Session):
@@ -55,7 +77,8 @@ def is_permitted_by_robots(site, url):
tries_left = 10
while True:
try:
result = _robots_cache(site).allowed(url, "brozzler")
result = _robots_cache(site).allowed(
url, site.user_agent or "brozzler")
return result
except BaseException as e:
if isinstance(e, reppy.exceptions.ServerError) and isinstance(e.args[0], brozzler.ReachedLimit):

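To see what the patched lookup does, here is the same substring matching reimplemented against a plain dict, so it runs without reppy (the rule values are arbitrary placeholders):

    def lookup_agent(agents, agent):
        # keys are lowercased user-agent tokens from robots.txt
        lc_agent = agent.lower()
        for s in agents:
            if s in lc_agent:  # substring match, not exact-token match
                return agents[s]
        return agents.get('*')

    rules = {'badbot': 'disallow all', '*': 'allow all'}
    print(lookup_agent(rules, 'im a bAdBOt/uh huh'))  # disallow all
    print(lookup_agent(rules, 'im/a/GoOdbot/yep'))    # allow all

This is why the new tests below can block a crawler just by putting 'badbot' somewhere inside its full user-agent string.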

@@ -80,9 +80,8 @@ class Url:
pass
# if we get here, we're looking at two hostnames
# XXX do we need to handle case of one punycoded idn, other not?
domain_parts = ip_or_domain.split(".")
host_parts = self.host.split(".")
domain_parts = ip_or_domain.encode("idna").decode("ascii").lower().split(".")
host_parts = self.host.encode("idna").decode("ascii").lower().split(".")
return host_parts[-len(domain_parts):] == domain_parts
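Encoding both sides with the idna codec before splitting keeps an internationalized hostname and its punycoded equivalent from falsely mismatching. A quick stdlib-only illustration (example hostnames are made up):

    # both spellings normalize to the same lowercased, punycoded labels
    host = 'Sub.MÜNCHEN.example'.encode('idna').decode('ascii').lower().split('.')
    domain = 'münchen.example'.encode('idna').decode('ascii').lower().split('.')
    print(host)    # ['sub', 'xn--mnchen-3ya', 'example']
    print(domain)  # ['xn--mnchen-3ya', 'example']
    print(host[-len(domain):] == domain)  # True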
@@ -228,7 +227,7 @@ class Page(brozzler.BaseDictable):
self, url, id=None, site_id=None, job_id=None, hops_from_seed=0,
redirect_url=None, priority=None, claimed=False, brozzle_count=0,
via_page_id=None, last_claimed_by=None, hops_off_surt=0,
outlinks=None):
outlinks=None, needs_robots_check=False):
self.site_id = site_id
self.job_id = job_id
self.url = url
@@ -240,6 +239,7 @@
self.via_page_id = via_page_id
self.hops_off_surt = hops_off_surt
self.outlinks = outlinks
self.needs_robots_check = needs_robots_check
self._canon_hurl = None
if priority is not None:


@@ -327,12 +327,18 @@ class BrozzlerWorker:
self._frontier.honor_stop_request(site.job_id)
page = self._frontier.claim_page(site, "%s:%s" % (
socket.gethostname(), browser.chrome_port))
outlinks = self.brozzle_page(browser, site, page)
if browser.is_running():
site.cookie_db = browser.persist_and_read_cookie_db()
if (page.needs_robots_check and
not brozzler.is_permitted_by_robots(site, page.url)):
logging.warn("page %s is blocked by robots.txt", page.url)
else:
outlinks = self.brozzle_page(browser, site, page)
self._frontier.scope_and_schedule_outlinks(
site, page, outlinks)
if browser.is_running():
site.cookie_db = browser.persist_and_read_cookie_db()
self._frontier.completed_page(site, page)
self._frontier.scope_and_schedule_outlinks(
site, page, outlinks)
page = None
except brozzler.NothingToClaim:
self.logger.info("no pages left for site %s", site)


@@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup(
name='brozzler',
version='1.1b8.dev123',
version='1.1b8.dev127',
description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler',
author='Noah Levitt',

tests/htdocs/robots.txt Normal file

@@ -0,0 +1,2 @@
User-agent: badbot
Disallow: /


@@ -29,6 +29,13 @@ import time
import brozzler
import datetime
import requests
import subprocess
def start_service(service):
subprocess.check_call(['sudo', 'service', service, 'start'])
def stop_service(service):
subprocess.check_call(['sudo', 'service', service, 'stop'])
@pytest.fixture(scope='module')
def httpd(request):
@@ -102,12 +109,18 @@ def test_brozzle_site(httpd):
page1 = 'http://localhost:%s/' % httpd.server_port
page2 = 'http://localhost:%s/file1.txt' % httpd.server_port
assert site.id is None
r = rethinkstuff.Rethinker('localhost', db='brozzler')
frontier = brozzler.RethinkDbFrontier(r)
brozzler.new_site(frontier, site)
assert site.id is not None
assert len(list(frontier.site_pages(site.id))) == 1
# so we can examine rethinkdb before it does anything
try:
stop_service('brozzler-worker')
assert site.id is None
r = rethinkstuff.Rethinker('localhost', db='brozzler')
frontier = brozzler.RethinkDbFrontier(r)
brozzler.new_site(frontier, site)
assert site.id is not None
assert len(list(frontier.site_pages(site.id))) == 1
finally:
start_service('brozzler-worker')
# the site should be brozzled fairly quickly
start = time.time()
@@ -118,14 +131,17 @@
# check that we got the three pages we expected
pages = list(frontier.site_pages(site.id))
assert len(pages) == 2
assert len(pages) == 3
assert {page.url for page in pages} == {
'http://localhost:%s/' % httpd.server_port,
'http://localhost:%s/robots.txt' % httpd.server_port,
'http://localhost:%s/file1.txt' % httpd.server_port}
time.sleep(2) # in case warcprox hasn't finished processing urls
# take a look at the captures table
captures = r.table('captures').filter({'test_id':test_id}).run()
captures_by_url = {c['url']:c for c in captures if c['http_method'] != 'HEAD'}
captures_by_url = {
c['url']: c for c in captures if c['http_method'] != 'HEAD'}
assert page1 in captures_by_url
assert '%srobots.txt' % page1 in captures_by_url
assert page2 in captures_by_url
@@ -140,7 +156,6 @@ def test_brozzle_site(httpd):
os.path.dirname(__file__), 'htdocs', 'file1.txt'), 'rb').read()
assert requests.get(wb_url).content == expected_payload
def test_warcprox_selection(httpd):
''' When enable_warcprox_features is true, brozzler is expected to choose
an instance of warcprox '''
@@ -156,12 +171,17 @@ def test_warcprox_selection(httpd):
enable_warcprox_features=True,
warcprox_meta={'captures-table-extra-fields':{'test_id':test_id}})
assert site.id is None
r = rethinkstuff.Rethinker('localhost', db='brozzler')
frontier = brozzler.RethinkDbFrontier(r)
brozzler.new_site(frontier, site)
assert site.id is not None
assert len(list(frontier.site_pages(site.id))) == 1
# so we can examine rethinkdb before it does anything
try:
stop_service('brozzler-worker')
assert site.id is None
r = rethinkstuff.Rethinker('localhost', db='brozzler')
frontier = brozzler.RethinkDbFrontier(r)
brozzler.new_site(frontier, site)
assert site.id is not None
assert len(list(frontier.site_pages(site.id))) == 1
finally:
start_service('brozzler-worker')
# check proxy is set in rethink
start = time.time()
@@ -179,14 +199,17 @@ def test_warcprox_selection(httpd):
# check that we got the three pages we expected
pages = list(frontier.site_pages(site.id))
assert len(pages) == 2
assert len(pages) == 3
assert {page.url for page in pages} == {
'http://localhost:%s/' % httpd.server_port,
'http://localhost:%s/robots.txt' % httpd.server_port,
'http://localhost:%s/file1.txt' % httpd.server_port}
time.sleep(2) # in case warcprox hasn't finished processing urls
# take a look at the captures table
captures = r.table('captures').filter({'test_id':test_id}).run()
captures_by_url = {c['url']:c for c in captures if c['http_method'] != 'HEAD'}
captures_by_url = {
c['url']:c for c in captures if c['http_method'] != 'HEAD'}
assert page1 in captures_by_url
assert '%srobots.txt' % page1 in captures_by_url
assert page2 in captures_by_url
@@ -199,4 +222,57 @@ def test_warcprox_selection(httpd):
wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, page2)
expected_payload = open(os.path.join(
os.path.dirname(__file__), 'htdocs', 'file1.txt'), 'rb').read()
assert requests.get(wb_url).content == expected_payload
assert requests.get(
wb_url, allow_redirects=False).content == expected_payload
def test_obey_robots(httpd):
test_id = 'test_obey_robots-%s' % datetime.datetime.utcnow().isoformat()
site = brozzler.Site(
seed='http://localhost:%s/' % httpd.server_port,
proxy='localhost:8000', enable_warcprox_features=True,
user_agent='im a badbot', # robots.txt blocks badbot
warcprox_meta={'captures-table-extra-fields':{'test_id':test_id}})
# so we can examine rethinkdb before it does anything
try:
stop_service('brozzler-worker')
assert site.id is None
r = rethinkstuff.Rethinker('localhost', db='brozzler')
frontier = brozzler.RethinkDbFrontier(r)
brozzler.new_site(frontier, site)
assert site.id is not None
site_pages = list(frontier.site_pages(site.id))
assert len(site_pages) == 1
assert site_pages[0].url == site.seed
assert site_pages[0].needs_robots_check
finally:
start_service('brozzler-worker')
# the site should be brozzled fairly quickly
start = time.time()
while site.status != 'FINISHED' and time.time() - start < 300:
time.sleep(0.5)
site = frontier.site(site.id)
assert site.status == 'FINISHED'
# check that we got the one page we expected
pages = list(frontier.site_pages(site.id))
assert len(pages) == 1
assert {page.url for page in pages} == {
'http://localhost:%s/' % httpd.server_port}
# take a look at the captures table
time.sleep(2) # in case warcprox hasn't finished processing urls
robots_url = 'http://localhost:%s/robots.txt' % httpd.server_port
captures = list(r.table('captures').filter({'test_id':test_id}).run())
assert len(captures) == 1
assert captures[0]['url'] == robots_url
# check pywb
t14 = captures[0]['timestamp'].strftime('%Y%m%d%H%M%S')
wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, robots_url)
expected_payload = open(os.path.join(
os.path.dirname(__file__), 'htdocs', 'robots.txt'), 'rb').read()
assert requests.get(
wb_url, allow_redirects=False).content == expected_payload

tests/test_units.py Normal file

@@ -0,0 +1,54 @@
#!/usr/bin/env python
'''
test_units.py - some unit tests for parts of brozzler amenable to that
Copyright (C) 2016 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
import pytest
import http.server
import threading
import os
import brozzler
@pytest.fixture(scope='module')
def httpd(request):
# SimpleHTTPRequestHandler always uses CWD so we have to chdir
os.chdir(os.path.join(os.path.dirname(__file__), 'htdocs'))
httpd = http.server.HTTPServer(
('localhost', 0), http.server.SimpleHTTPRequestHandler)
httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
httpd_thread.start()
def fin():
httpd.shutdown()
httpd.server_close()
httpd_thread.join()
request.addfinalizer(fin)
return httpd
def test_robots(httpd):
'''
Basic test of robots.txt user-agent substring matching.
'''
url = 'http://localhost:%s/' % httpd.server_port
site = brozzler.Site(seed=url, user_agent='im/a/GoOdbot/yep')
assert brozzler.is_permitted_by_robots(site, url)
site = brozzler.Site(seed=url, user_agent='im/a bAdBOt/uh huh')
assert not brozzler.is_permitted_by_robots(site, url)

vagrant/Vagrantfile vendored

@@ -6,6 +6,11 @@ Vagrant.configure(2) do |config|
config.vm.synced_folder "..", "/brozzler"
# bump up memory to avoid "can't start new thread" errors
config.vm.provider "virtualbox" do |v|
v.memory = 1024
end
config.vm.provision "ansible" do |ansible|
ansible.inventory_path = "../ansible/hosts-vagrant"
ansible.playbook = "../ansible/playbook.yml"


@@ -1,4 +1,9 @@
#!/bin/bash
#
# any arguments are passed on to py.test
# so for example to run only "test_obey_robots" you could run
# ./run-tests.sh -k test_obey_robots
#
cd $(dirname "${BASH_SOURCE[0]}")
@@ -11,4 +16,4 @@ vagrant ssh -- 'status warcprox ;
echo
vagrant ssh -- 'source /opt/brozzler-ve34/bin/activate && pip install pytest'
vagrant ssh -- 'source /opt/brozzler-ve34/bin/activate && py.test -v -s /brozzler/tests'
vagrant ssh -- "source /opt/brozzler-ve34/bin/activate && py.test -v -s /brozzler/tests $@"