Mirror of https://github.com/internetarchive/brozzler.git
Merge branch 'master' into qa
* master:
  - don't check robots.txt when scheduling a new site to be crawled, but mark the seed Page as needs_robots_check, and delegate the robots check to brozzler-worker
  - new test of robots.txt adherence
  - robots.txt for testing
  - monkey-patch reppy to support substring user-agent matching
  - give vagrant vm enough memory so that tests pass consistently
  - need warcprox to listen on public address because that's what it puts in the service registry
  - looks like the problem may have been a bug in ansible 2.2.0.0, so pin to 2.1.3.0
commit eaa32ad3fc
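In short, this change moves the robots.txt check from site-scheduling time to crawl time. A condensed sketch of the two sides (simplified from the diffs below, not the literal code; the worker-side snippet omits the surrounding claim/complete loop):

# scheduling side (new_site): no robots.txt fetch any more, just flag the seed page
page = brozzler.Page(
        site.seed, site_id=site.id, job_id=site.job_id,
        hops_from_seed=0, priority=1000, needs_robots_check=True)
frontier.new_page(page)

# crawling side (brozzler-worker): check robots.txt lazily, right before brozzling
if (page.needs_robots_check
        and not brozzler.is_permitted_by_robots(site, page.url)):
    logging.warn("page %s is blocked by robots.txt", page.url)
else:
    outlinks = self.brozzle_page(browser, site, page)
    self._frontier.scope_and_schedule_outlinks(site, page, outlinks)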
@@ -4,9 +4,7 @@ python:
 sudo: required
 dist: trusty
 before_install:
-- sudo apt-add-repository -y ppa:ansible/ansible
-- sudo apt-get -qq update
-- sudo apt-get install -y ansible
+- sudo pip install ansible==2.1.3.0
 install:
 - ansible-playbook --extra-vars="brozzler_pip_name=file://$TRAVIS_BUILD_DIR#egg=brozzler user=travis" --inventory-file=ansible/hosts-localhost ansible/playbook.yml
 - pip install $TRAVIS_BUILD_DIR pytest
@@ -14,6 +14,7 @@ console log
 # --profile
 exec nice warcprox \
+        --address=0.0.0.0 \
         --dir={{warcs_dir}} \
         --base32 \
         --gzip \
brozzler/cli.py (Executable file → Normal file, 0 changed lines)
@@ -106,13 +106,11 @@ def new_site(frontier, site):
     # where a brozzler worker immediately claims the site, finds no pages
     # to crawl, and decides the site is finished
     try:
-        if brozzler.is_permitted_by_robots(site, site.seed):
-            page = brozzler.Page(site.seed, site_id=site.id,
-                    job_id=site.job_id, hops_from_seed=0, priority=1000)
-            frontier.new_page(page)
-            logging.info("queued page %s", page)
-        else:
-            logging.warn("seed url %s is blocked by robots.txt", site.seed)
+        page = brozzler.Page(
+                site.seed, site_id=site.id, job_id=site.job_id,
+                hops_from_seed=0, priority=1000, needs_robots_check=True)
+        frontier.new_page(page)
+        logging.info("queued page %s", page)
     finally:
         # finally block because we want to insert the Site no matter what
         frontier.new_site(site)
@@ -1,6 +1,12 @@
 '''
 brozzler/robots.py - robots.txt support
 
+Uses the reppy library version 0.3.4. Monkey-patches reppy to support substring
+user-agent matching. We're sticking with 0.3.4 because later versions don't
+support supplying a custom requests.Session.
+
+See also https://github.com/seomoz/reppy/issues/37
+
 Copyright (C) 2014-2016 Internet Archive
 
 Licensed under the Apache License, Version 2.0 (the "License");
@@ -19,11 +25,27 @@ limitations under the License.
 import json
 import logging
 import brozzler
+import reppy
 import reppy.cache
+import reppy.parser
 import requests
 
 __all__ = ["is_permitted_by_robots"]
 
+# monkey-patch reppy to do substring user-agent matching, see top of file
+reppy.Utility.short_user_agent = lambda strng: strng
+def _reppy_rules_getitem(self, agent):
+    '''
+    Find the user-agent token matching the supplied full user-agent, using
+    a case-insensitive substring search.
+    '''
+    lc_agent = agent.lower()
+    for s in self.agents:
+        if s in lc_agent:
+            return self.agents[s]
+    return self.agents.get('*')
+reppy.parser.Rules.__getitem__ = _reppy_rules_getitem
+
 _robots_caches = {} # {site_id:reppy.cache.RobotsCache}
 def _robots_cache(site):
     class SessionRaiseOn420(requests.Session):
@@ -55,7 +77,8 @@ def is_permitted_by_robots(site, url):
     tries_left = 10
     while True:
         try:
-            result = _robots_cache(site).allowed(url, "brozzler")
+            result = _robots_cache(site).allowed(
+                    url, site.user_agent or "brozzler")
             return result
         except BaseException as e:
             if isinstance(e, reppy.exceptions.ServerError) and isinstance(e.args[0], brozzler.ReachedLimit):
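To see what the substring matching buys: stock reppy looks up a robots.txt group by exact user-agent token, while the patched __getitem__ above matches group names as case-insensitive substrings of the full user-agent string. A minimal, self-contained illustration of that lookup logic (FakeRules and the user-agent strings are invented for the example; real callers go through brozzler.is_permitted_by_robots, as in the new tests further down):

def _substring_getitem(self, agent):
    # same logic as the patched _reppy_rules_getitem above
    lc_agent = agent.lower()
    for s in self.agents:
        if s in lc_agent:
            return self.agents[s]
    return self.agents.get('*')

class FakeRules:
    # stand-in for reppy.parser.Rules, which keeps per-token rules in .agents
    agents = {'badbot': 'rules for badbot', '*': 'default rules'}
    __getitem__ = _substring_getitem

rules = FakeRules()
assert rules['Mozilla/5.0 (compatible; im a BadBot/1.0)'] == 'rules for badbot'
assert rules['Mozilla/5.0 (compatible; GoOdbot/1.0)'] == 'default rules'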
@@ -80,9 +80,8 @@ class Url:
             pass
 
         # if we get here, we're looking at two hostnames
-        # XXX do we need to handle case of one punycoded idn, other not?
-        domain_parts = ip_or_domain.split(".")
-        host_parts = self.host.split(".")
+        domain_parts = ip_or_domain.encode("idna").decode("ascii").lower().split(".")
+        host_parts = self.host.encode("idna").decode("ascii").lower().split(".")
 
         return host_parts[-len(domain_parts):] == domain_parts
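The effect of this change is that both the candidate domain and the url's host are IDNA-encoded and lowercased before the label-by-label suffix comparison, so a Unicode hostname and its punycoded spelling now compare equal. A rough illustration using only the stdlib codec the new lines rely on (the example hostname is made up):

# both spellings normalize to the same label list
assert 'café.example'.encode('idna').decode('ascii').lower().split('.') == \
        ['xn--caf-dma', 'example']
assert 'xn--caf-dma.example'.encode('idna').decode('ascii').lower().split('.') == \
        ['xn--caf-dma', 'example']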
@@ -228,7 +227,7 @@ class Page(brozzler.BaseDictable):
             self, url, id=None, site_id=None, job_id=None, hops_from_seed=0,
             redirect_url=None, priority=None, claimed=False, brozzle_count=0,
             via_page_id=None, last_claimed_by=None, hops_off_surt=0,
-            outlinks=None):
+            outlinks=None, needs_robots_check=False):
         self.site_id = site_id
         self.job_id = job_id
         self.url = url
@@ -240,6 +239,7 @@ class Page(brozzler.BaseDictable):
         self.via_page_id = via_page_id
         self.hops_off_surt = hops_off_surt
         self.outlinks = outlinks
+        self.needs_robots_check = needs_robots_check
         self._canon_hurl = None
 
         if priority is not None:
@@ -327,12 +327,18 @@ class BrozzlerWorker:
                 self._frontier.honor_stop_request(site.job_id)
                 page = self._frontier.claim_page(site, "%s:%s" % (
                     socket.gethostname(), browser.chrome_port))
-                outlinks = self.brozzle_page(browser, site, page)
-                if browser.is_running():
-                    site.cookie_db = browser.persist_and_read_cookie_db()
+
+                if (page.needs_robots_check and
+                        not brozzler.is_permitted_by_robots(site, page.url)):
+                    logging.warn("page %s is blocked by robots.txt", page.url)
+                else:
+                    outlinks = self.brozzle_page(browser, site, page)
+                    self._frontier.scope_and_schedule_outlinks(
+                            site, page, outlinks)
+                if browser.is_running():
+                    site.cookie_db = browser.persist_and_read_cookie_db()
+
                 self._frontier.completed_page(site, page)
-                self._frontier.scope_and_schedule_outlinks(
-                        site, page, outlinks)
                 page = None
             except brozzler.NothingToClaim:
                 self.logger.info("no pages left for site %s", site)
setup.py (2 changed lines)
@@ -32,7 +32,7 @@ def find_package_data(package):
 setuptools.setup(
         name='brozzler',
-        version='1.1b8.dev123',
+        version='1.1b8.dev127',
         description='Distributed web crawling with browsers',
         url='https://github.com/internetarchive/brozzler',
         author='Noah Levitt',
tests/htdocs/robots.txt (new Normal file, 2 lines)
@@ -0,0 +1,2 @@
+User-agent: badbot
+Disallow: /
@@ -29,6 +29,13 @@ import time
 import brozzler
 import datetime
 import requests
+import subprocess
 
+def start_service(service):
+    subprocess.check_call(['sudo', 'service', service, 'start'])
+
+def stop_service(service):
+    subprocess.check_call(['sudo', 'service', service, 'stop'])
+
 @pytest.fixture(scope='module')
 def httpd(request):
@@ -102,12 +109,18 @@ def test_brozzle_site(httpd):
     page1 = 'http://localhost:%s/' % httpd.server_port
     page2 = 'http://localhost:%s/file1.txt' % httpd.server_port
 
-    assert site.id is None
-    r = rethinkstuff.Rethinker('localhost', db='brozzler')
-    frontier = brozzler.RethinkDbFrontier(r)
-    brozzler.new_site(frontier, site)
-    assert site.id is not None
-    assert len(list(frontier.site_pages(site.id))) == 1
+    # so we can examine rethinkdb before it does anything
+    try:
+        stop_service('brozzler-worker')
+
+        assert site.id is None
+        r = rethinkstuff.Rethinker('localhost', db='brozzler')
+        frontier = brozzler.RethinkDbFrontier(r)
+        brozzler.new_site(frontier, site)
+        assert site.id is not None
+        assert len(list(frontier.site_pages(site.id))) == 1
+    finally:
+        start_service('brozzler-worker')
 
     # the site should be brozzled fairly quickly
     start = time.time()
@@ -118,14 +131,17 @@ def test_brozzle_site(httpd):
 
     # check that we got the two pages we expected
     pages = list(frontier.site_pages(site.id))
-    assert len(pages) == 2
+    assert len(pages) == 3
     assert {page.url for page in pages} == {
             'http://localhost:%s/' % httpd.server_port,
+            'http://localhost:%s/robots.txt' % httpd.server_port,
             'http://localhost:%s/file1.txt' % httpd.server_port}
 
     time.sleep(2) # in case warcprox hasn't finished processing urls
     # take a look at the captures table
     captures = r.table('captures').filter({'test_id':test_id}).run()
-    captures_by_url = {c['url']:c for c in captures if c['http_method'] != 'HEAD'}
+    captures_by_url = {
+            c['url']: c for c in captures if c['http_method'] != 'HEAD'}
     assert page1 in captures_by_url
+    assert '%srobots.txt' % page1 in captures_by_url
     assert page2 in captures_by_url
@@ -140,7 +156,6 @@ def test_brozzle_site(httpd):
             os.path.dirname(__file__), 'htdocs', 'file1.txt'), 'rb').read()
     assert requests.get(wb_url).content == expected_payload
 
-
 def test_warcprox_selection(httpd):
     ''' When enable_warcprox_features is true, brozzler is expected to choose
     and instance of warcprox '''
@@ -156,12 +171,17 @@ def test_warcprox_selection(httpd):
             enable_warcprox_features=True,
             warcprox_meta={'captures-table-extra-fields':{'test_id':test_id}})
 
-    assert site.id is None
-    r = rethinkstuff.Rethinker('localhost', db='brozzler')
-    frontier = brozzler.RethinkDbFrontier(r)
-    brozzler.new_site(frontier, site)
-    assert site.id is not None
-    assert len(list(frontier.site_pages(site.id))) == 1
+    # so we can examine rethinkdb before it does anything
+    try:
+        stop_service('brozzler-worker')
+        assert site.id is None
+        r = rethinkstuff.Rethinker('localhost', db='brozzler')
+        frontier = brozzler.RethinkDbFrontier(r)
+        brozzler.new_site(frontier, site)
+        assert site.id is not None
+        assert len(list(frontier.site_pages(site.id))) == 1
+    finally:
+        start_service('brozzler-worker')
 
     # check proxy is set in rethink
     start = time.time()
@@ -179,14 +199,17 @@ def test_warcprox_selection(httpd):
 
     # check that we got the two pages we expected
     pages = list(frontier.site_pages(site.id))
-    assert len(pages) == 2
+    assert len(pages) == 3
     assert {page.url for page in pages} == {
             'http://localhost:%s/' % httpd.server_port,
+            'http://localhost:%s/robots.txt' % httpd.server_port,
             'http://localhost:%s/file1.txt' % httpd.server_port}
 
     time.sleep(2) # in case warcprox hasn't finished processing urls
     # take a look at the captures table
     captures = r.table('captures').filter({'test_id':test_id}).run()
-    captures_by_url = {c['url']:c for c in captures if c['http_method'] != 'HEAD'}
+    captures_by_url = {
+            c['url']:c for c in captures if c['http_method'] != 'HEAD'}
     assert page1 in captures_by_url
+    assert '%srobots.txt' % page1 in captures_by_url
     assert page2 in captures_by_url
@@ -199,4 +222,57 @@ def test_warcprox_selection(httpd):
     wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, page2)
     expected_payload = open(os.path.join(
             os.path.dirname(__file__), 'htdocs', 'file1.txt'), 'rb').read()
-    assert requests.get(wb_url).content == expected_payload
+    assert requests.get(
+            wb_url, allow_redirects=False).content == expected_payload
+
+def test_obey_robots(httpd):
+    test_id = 'test_obey_robots-%s' % datetime.datetime.utcnow().isoformat()
+    site = brozzler.Site(
+            seed='http://localhost:%s/' % httpd.server_port,
+            proxy='localhost:8000', enable_warcprox_features=True,
+            user_agent='im a badbot',   # robots.txt blocks badbot
+            warcprox_meta={'captures-table-extra-fields':{'test_id':test_id}})
+
+    # so we can examine rethinkdb before it does anything
+    try:
+        stop_service('brozzler-worker')
+
+        assert site.id is None
+        r = rethinkstuff.Rethinker('localhost', db='brozzler')
+        frontier = brozzler.RethinkDbFrontier(r)
+        brozzler.new_site(frontier, site)
+        assert site.id is not None
+        site_pages = list(frontier.site_pages(site.id))
+        assert len(site_pages) == 1
+        assert site_pages[0].url == site.seed
+        assert site_pages[0].needs_robots_check
+    finally:
+        start_service('brozzler-worker')
+
+    # the site should be brozzled fairly quickly
+    start = time.time()
+    while site.status != 'FINISHED' and time.time() - start < 300:
+        time.sleep(0.5)
+        site = frontier.site(site.id)
+    assert site.status == 'FINISHED'
+
+    # check that we got the two pages we expected
+    pages = list(frontier.site_pages(site.id))
+    assert len(pages) == 1
+    assert {page.url for page in pages} == {
+            'http://localhost:%s/' % httpd.server_port}
+
+    # take a look at the captures table
+    time.sleep(2) # in case warcprox hasn't finished processing urls
+    robots_url = 'http://localhost:%s/robots.txt' % httpd.server_port
+    captures = list(r.table('captures').filter({'test_id':test_id}).run())
+    assert len(captures) == 1
+    assert captures[0]['url'] == robots_url
+
+    # check pywb
+    t14 = captures[0]['timestamp'].strftime('%Y%m%d%H%M%S')
+    wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, robots_url)
+    expected_payload = open(os.path.join(
+            os.path.dirname(__file__), 'htdocs', 'robots.txt'), 'rb').read()
+    assert requests.get(
+            wb_url, allow_redirects=False).content == expected_payload
tests/test_units.py (new Normal file, 54 lines)
@@ -0,0 +1,54 @@
+#!/usr/bin/env python
+'''
+test_units.py - some unit tests for parts of brozzler amenable to that
+
+Copyright (C) 2016 Internet Archive
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+'''
+
+import pytest
+import http.server
+import threading
+import os
+import brozzler
+
+@pytest.fixture(scope='module')
+def httpd(request):
+    # SimpleHTTPRequestHandler always uses CWD so we have to chdir
+    os.chdir(os.path.join(os.path.dirname(__file__), 'htdocs'))
+
+    httpd = http.server.HTTPServer(
+            ('localhost', 0), http.server.SimpleHTTPRequestHandler)
+    httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
+    httpd_thread.start()
+
+    def fin():
+        httpd.shutdown()
+        httpd.server_close()
+        httpd_thread.join()
+    request.addfinalizer(fin)
+
+    return httpd
+
+def test_robots(httpd):
+    '''
+    Basic test of robots.txt user-agent substring matching.
+    '''
+    url = 'http://localhost:%s/' % httpd.server_port
+    site = brozzler.Site(seed=url, user_agent='im/a/GoOdbot/yep')
+    assert brozzler.is_permitted_by_robots(site, url)
+
+    site = brozzler.Site(seed=url, user_agent='im/a bAdBOt/uh huh')
+    assert not brozzler.is_permitted_by_robots(site, url)
vagrant/Vagrantfile (vendored, 5 changed lines)
@@ -6,6 +6,11 @@ Vagrant.configure(2) do |config|
   config.vm.synced_folder "..", "/brozzler"
 
+  # bump up memory to avoid "can't start new thread" errors
+  config.vm.provider "virtualbox" do |v|
+    v.memory = 1024
+  end
+
   config.vm.provision "ansible" do |ansible|
     ansible.inventory_path = "../ansible/hosts-vagrant"
     ansible.playbook = "../ansible/playbook.yml"
@@ -1,4 +1,9 @@
 #!/bin/bash
+#
+# any arguments are passed on to py.test
+# so for example to run only "test_obey_robots" you could run
+# ./run-tests.sh -k test_obey_robots
+#
 
 cd $(dirname "${BASH_SOURCE[0]}")
@@ -11,4 +16,4 @@ vagrant ssh -- 'status warcprox ;
 echo
 
 vagrant ssh -- 'source /opt/brozzler-ve34/bin/activate && pip install pytest'
-vagrant ssh -- 'source /opt/brozzler-ve34/bin/activate && py.test -v -s /brozzler/tests'
+vagrant ssh -- "source /opt/brozzler-ve34/bin/activate && py.test -v -s /brozzler/tests $@"