Mirror of https://github.com/internetarchive/brozzler.git, synced 2025-02-24 00:29:53 -05:00
don't check robots.txt when scheduling a new site to be crawled, but mark the seed Page as needs_robots_check, and delegate the robots check to brozzler-worker; new test of robots.txt adherence
This commit is contained in:
parent 24cc8377fb
commit 72816d1058
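The gist of the change, as a minimal illustrative sketch (not the brozzler API verbatim): new_site() no longer calls is_permitted_by_robots() up front; it queues the seed Page with needs_robots_check=True, and the worker performs the robots.txt check just before brozzling the page. The Page class, new_site(), worker_step(), the dict-based site and the list-based frontier below are simplified stand-ins for the real classes this commit touches.

# minimal sketch of the scheduling/worker split introduced by this commit
# (simplified stand-ins, not brozzler's actual classes)
class Page:
    def __init__(self, url, needs_robots_check=False):
        self.url = url
        self.needs_robots_check = needs_robots_check

def is_permitted_by_robots(site, url):
    # stand-in for brozzler.is_permitted_by_robots(), which fetches and
    # parses the site's robots.txt (through the configured proxy)
    return not site.get('robots_blocks_us', False)

def new_site(frontier, site):
    # scheduling side: always queue the seed, defer the robots check
    frontier.append(Page(site['seed'], needs_robots_check=True))

def worker_step(frontier, site):
    # worker side: skip brozzling when a flagged page is disallowed
    page = frontier.pop(0)
    if page.needs_robots_check and not is_permitted_by_robots(site, page.url):
        print('page %s is blocked by robots.txt' % page.url)
    else:
        print('brozzling %s' % page.url)

site = {'seed': 'http://example.com/', 'robots_blocks_us': True}
frontier = []
new_site(frontier, site)
worker_step(frontier, site)  # prints: page http://example.com/ is blocked by robots.txt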
brozzler/cli.py  (Executable file → Normal file)
@@ -106,13 +106,11 @@ def new_site(frontier, site):
     # where a brozzler worker immediately claims the site, finds no pages
     # to crawl, and decides the site is finished
     try:
-        if brozzler.is_permitted_by_robots(site, site.seed):
-            page = brozzler.Page(site.seed, site_id=site.id,
-                    job_id=site.job_id, hops_from_seed=0, priority=1000)
-            frontier.new_page(page)
-            logging.info("queued page %s", page)
-        else:
-            logging.warn("seed url %s is blocked by robots.txt", site.seed)
+        page = brozzler.Page(
+                site.seed, site_id=site.id, job_id=site.job_id,
+                hops_from_seed=0, priority=1000, needs_robots_check=True)
+        frontier.new_page(page)
+        logging.info("queued page %s", page)
     finally:
         # finally block because we want to insert the Site no matter what
         frontier.new_site(site)
|
@@ -80,9 +80,8 @@ class Url:
             pass

         # if we get here, we're looking at two hostnames
-        # XXX do we need to handle case of one punycoded idn, other not?
-        domain_parts = ip_or_domain.split(".")
-        host_parts = self.host.split(".")
+        domain_parts = ip_or_domain.encode("idna").decode("ascii").lower().split(".")
+        host_parts = self.host.encode("idna").decode("ascii").lower().split(".")

         return host_parts[-len(domain_parts):] == domain_parts

@@ -228,7 +227,7 @@ class Page(brozzler.BaseDictable):
             self, url, id=None, site_id=None, job_id=None, hops_from_seed=0,
             redirect_url=None, priority=None, claimed=False, brozzle_count=0,
             via_page_id=None, last_claimed_by=None, hops_off_surt=0,
-            outlinks=None):
+            outlinks=None, needs_robots_check=False):
         self.site_id = site_id
         self.job_id = job_id
         self.url = url
@@ -240,6 +239,7 @@ class Page(brozzler.BaseDictable):
         self.via_page_id = via_page_id
         self.hops_off_surt = hops_off_surt
         self.outlinks = outlinks
+        self.needs_robots_check = needs_robots_check
         self._canon_hurl = None

         if priority is not None:
@@ -327,12 +327,18 @@ class BrozzlerWorker:
                 self._frontier.honor_stop_request(site.job_id)
                 page = self._frontier.claim_page(site, "%s:%s" % (
                     socket.gethostname(), browser.chrome_port))
-                outlinks = self.brozzle_page(browser, site, page)
-                if browser.is_running():
-                    site.cookie_db = browser.persist_and_read_cookie_db()
+
+                if (page.needs_robots_check and
+                        not brozzler.is_permitted_by_robots(site, page.url)):
+                    logging.warn("page %s is blocked by robots.txt", page.url)
+                else:
+                    outlinks = self.brozzle_page(browser, site, page)
+                    self._frontier.scope_and_schedule_outlinks(
+                            site, page, outlinks)
+                    if browser.is_running():
+                        site.cookie_db = browser.persist_and_read_cookie_db()
+
                 self._frontier.completed_page(site, page)
-                self._frontier.scope_and_schedule_outlinks(
-                        site, page, outlinks)
                 page = None
             except brozzler.NothingToClaim:
                 self.logger.info("no pages left for site %s", site)
setup.py
@@ -32,7 +32,7 @@ def find_package_data(package):

 setuptools.setup(
         name='brozzler',
-        version='1.1b8.dev126',
+        version='1.1b8.dev127',
         description='Distributed web crawling with browsers',
         url='https://github.com/internetarchive/brozzler',
         author='Noah Levitt',
@@ -29,6 +29,13 @@ import time
 import brozzler
 import datetime
 import requests
+import subprocess

+def start_service(service):
+    subprocess.check_call(['sudo', 'service', service, 'start'])
+
+def stop_service(service):
+    subprocess.check_call(['sudo', 'service', service, 'stop'])
+
 @pytest.fixture(scope='module')
 def httpd(request):
@@ -102,12 +109,18 @@ def test_brozzle_site(httpd):
     page1 = 'http://localhost:%s/' % httpd.server_port
     page2 = 'http://localhost:%s/file1.txt' % httpd.server_port

-    assert site.id is None
-    r = rethinkstuff.Rethinker('localhost', db='brozzler')
-    frontier = brozzler.RethinkDbFrontier(r)
-    brozzler.new_site(frontier, site)
-    assert site.id is not None
-    assert len(list(frontier.site_pages(site.id))) == 1
+    # so we can examine rethinkdb before it does anything
+    try:
+        stop_service('brozzler-worker')
+
+        assert site.id is None
+        r = rethinkstuff.Rethinker('localhost', db='brozzler')
+        frontier = brozzler.RethinkDbFrontier(r)
+        brozzler.new_site(frontier, site)
+        assert site.id is not None
+        assert len(list(frontier.site_pages(site.id))) == 1
+    finally:
+        start_service('brozzler-worker')

     # the site should be brozzled fairly quickly
     start = time.time()
@@ -118,14 +131,17 @@ def test_brozzle_site(httpd):

     # check that we got the two pages we expected
     pages = list(frontier.site_pages(site.id))
-    assert len(pages) == 2
+    assert len(pages) == 3
     assert {page.url for page in pages} == {
             'http://localhost:%s/' % httpd.server_port,
+            'http://localhost:%s/robots.txt' % httpd.server_port,
             'http://localhost:%s/file1.txt' % httpd.server_port}

+    time.sleep(2) # in case warcprox hasn't finished processing urls
     # take a look at the captures table
     captures = r.table('captures').filter({'test_id':test_id}).run()
-    captures_by_url = {c['url']:c for c in captures if c['http_method'] != 'HEAD'}
+    captures_by_url = {
+            c['url']: c for c in captures if c['http_method'] != 'HEAD'}
     assert page1 in captures_by_url
     assert '%srobots.txt' % page1 in captures_by_url
     assert page2 in captures_by_url
@@ -140,7 +156,6 @@ def test_brozzle_site(httpd):
             os.path.dirname(__file__), 'htdocs', 'file1.txt'), 'rb').read()
     assert requests.get(wb_url).content == expected_payload

-
 def test_warcprox_selection(httpd):
     ''' When enable_warcprox_features is true, brozzler is expected to choose
     and instance of warcprox '''
@@ -156,12 +171,17 @@ def test_warcprox_selection(httpd):
             enable_warcprox_features=True,
             warcprox_meta={'captures-table-extra-fields':{'test_id':test_id}})

-    assert site.id is None
-    r = rethinkstuff.Rethinker('localhost', db='brozzler')
-    frontier = brozzler.RethinkDbFrontier(r)
-    brozzler.new_site(frontier, site)
-    assert site.id is not None
-    assert len(list(frontier.site_pages(site.id))) == 1
+    # so we can examine rethinkdb before it does anything
+    try:
+        stop_service('brozzler-worker')
+        assert site.id is None
+        r = rethinkstuff.Rethinker('localhost', db='brozzler')
+        frontier = brozzler.RethinkDbFrontier(r)
+        brozzler.new_site(frontier, site)
+        assert site.id is not None
+        assert len(list(frontier.site_pages(site.id))) == 1
+    finally:
+        start_service('brozzler-worker')

     # check proxy is set in rethink
     start = time.time()
@@ -179,14 +199,17 @@ def test_warcprox_selection(httpd):

     # check that we got the two pages we expected
     pages = list(frontier.site_pages(site.id))
-    assert len(pages) == 2
+    assert len(pages) == 3
     assert {page.url for page in pages} == {
             'http://localhost:%s/' % httpd.server_port,
+            'http://localhost:%s/robots.txt' % httpd.server_port,
             'http://localhost:%s/file1.txt' % httpd.server_port}

+    time.sleep(2) # in case warcprox hasn't finished processing urls
     # take a look at the captures table
     captures = r.table('captures').filter({'test_id':test_id}).run()
-    captures_by_url = {c['url']:c for c in captures if c['http_method'] != 'HEAD'}
+    captures_by_url = {
+            c['url']:c for c in captures if c['http_method'] != 'HEAD'}
     assert page1 in captures_by_url
     assert '%srobots.txt' % page1 in captures_by_url
     assert page2 in captures_by_url
@@ -199,4 +222,57 @@ def test_warcprox_selection(httpd):
     wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, page2)
     expected_payload = open(os.path.join(
             os.path.dirname(__file__), 'htdocs', 'file1.txt'), 'rb').read()
-    assert requests.get(wb_url).content == expected_payload
+    assert requests.get(
+            wb_url, allow_redirects=False).content == expected_payload
+
+def test_obey_robots(httpd):
+    test_id = 'test_obey_robots-%s' % datetime.datetime.utcnow().isoformat()
+    site = brozzler.Site(
+            seed='http://localhost:%s/' % httpd.server_port,
+            proxy='localhost:8000', enable_warcprox_features=True,
+            user_agent='im a badbot', # robots.txt blocks badbot
+            warcprox_meta={'captures-table-extra-fields':{'test_id':test_id}})
+
+    # so we can examine rethinkdb before it does anything
+    try:
+        stop_service('brozzler-worker')
+
+        assert site.id is None
+        r = rethinkstuff.Rethinker('localhost', db='brozzler')
+        frontier = brozzler.RethinkDbFrontier(r)
+        brozzler.new_site(frontier, site)
+        assert site.id is not None
+        site_pages = list(frontier.site_pages(site.id))
+        assert len(site_pages) == 1
+        assert site_pages[0].url == site.seed
+        assert site_pages[0].needs_robots_check
+    finally:
+        start_service('brozzler-worker')
+
+    # the site should be brozzled fairly quickly
+    start = time.time()
+    while site.status != 'FINISHED' and time.time() - start < 300:
+        time.sleep(0.5)
+        site = frontier.site(site.id)
+    assert site.status == 'FINISHED'
+
+    # check that we got the two pages we expected
+    pages = list(frontier.site_pages(site.id))
+    assert len(pages) == 1
+    assert {page.url for page in pages} == {
+            'http://localhost:%s/' % httpd.server_port}
+
+    # take a look at the captures table
+    time.sleep(2) # in case warcprox hasn't finished processing urls
+    robots_url = 'http://localhost:%s/robots.txt' % httpd.server_port
+    captures = list(r.table('captures').filter({'test_id':test_id}).run())
+    assert len(captures) == 1
+    assert captures[0]['url'] == robots_url
+
+    # check pywb
+    t14 = captures[0]['timestamp'].strftime('%Y%m%d%H%M%S')
+    wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, robots_url)
+    expected_payload = open(os.path.join(
+            os.path.dirname(__file__), 'htdocs', 'robots.txt'), 'rb').read()
+    assert requests.get(
+            wb_url, allow_redirects=False).content == expected_payload
@@ -1,4 +1,9 @@
 #!/bin/bash
+#
+# any arguments are passed on to py.test
+# so for example to run only "test_obey_robots" you could run
+# ./run-tests.sh -k test_obey_robots
+#

 cd $(dirname "${BASH_SOURCE[0]}")

@@ -11,4 +16,4 @@ vagrant ssh -- 'status warcprox ;
 echo

 vagrant ssh -- 'source /opt/brozzler-ve34/bin/activate && pip install pytest'
-vagrant ssh -- 'source /opt/brozzler-ve34/bin/activate && py.test -v -s /brozzler/tests'
+vagrant ssh -- "source /opt/brozzler-ve34/bin/activate && py.test -v -s /brozzler/tests $@"
|