new flag Page.blocked_by_robots

This commit is contained in:
Noah Levitt 2017-01-30 10:43:25 -08:00
parent a8b564f100
commit 4b6831b464
4 changed files with 10 additions and 7 deletions

View File

@@ -1,7 +1,7 @@
'''
brozzler/site.py - classes representing sites and pages
Copyright (C) 2014-2016 Internet Archive
Copyright (C) 2014-2017 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -235,7 +235,7 @@ class Page(brozzler.BaseDictable):
self, url, id=None, site_id=None, job_id=None, hops_from_seed=0,
redirect_url=None, priority=None, claimed=False, brozzle_count=0,
via_page_id=None, last_claimed_by=None, hops_off_surt=0,
outlinks=None, needs_robots_check=False):
outlinks=None, needs_robots_check=False, blocked_by_robots=None):
self.site_id = site_id
self.job_id = job_id
self.url = url
@@ -248,6 +248,7 @@ class Page(brozzler.BaseDictable):
self.hops_off_surt = hops_off_surt
self.outlinks = outlinks
self.needs_robots_check = needs_robots_check
self.blocked_by_robots = blocked_by_robots
self._canon_hurl = None
if priority is not None:

View File

@@ -3,7 +3,7 @@ brozzler/worker.py - BrozzlerWorker brozzles pages from the frontier, meaning
it runs youtube-dl on them, browses them and runs behaviors if appropriate,
scopes and adds outlinks to the frontier
Copyright (C) 2014-2016 Internet Archive
Copyright (C) 2014-2017 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -337,6 +337,7 @@ class BrozzlerWorker:
if (page.needs_robots_check and
not brozzler.is_permitted_by_robots(site, page.url)):
logging.warn("page %s is blocked by robots.txt", page.url)
page.blocked_by_robots = True
else:
outlinks = self.brozzle_page(browser, site, page)
self._frontier.scope_and_schedule_outlinks(

View File

@@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup(
name='brozzler',
version='1.1b9.dev175',
version='1.1b9.dev176',
description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler',
author='Noah Levitt',

View File

@@ -3,7 +3,7 @@
test_cluster.py - integration tests for a brozzler cluster, expects brozzler,
warcprox, pywb, rethinkdb and other dependencies to be running already
Copyright (C) 2016 Internet Archive
Copyright (C) 2016-2017 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -257,8 +257,9 @@ def test_obey_robots(httpd):
# check that only the one page is in rethinkdb
pages = list(frontier.site_pages(site.id))
assert len(pages) == 1
assert {page.url for page in pages} == {
'http://localhost:%s/site1/' % httpd.server_port}
page = pages[0]
assert page.url == 'http://localhost:%s/site1/' % httpd.server_port
assert page.blocked_by_robots
# take a look at the captures table
time.sleep(2) # in case warcprox hasn't finished processing urls