From 72816d105843ac7d878fe9ddf0f2a4bd823c0e4a Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Wed, 16 Nov 2016 12:23:59 -0800
Subject: [PATCH] don't check robots.txt when scheduling a new site to be
 crawled, but mark the seed Page as needs_robots_check, and delegate the
 robots check to brozzler-worker; new test of robots.txt adherence

---
 brozzler/cli.py       |   0
 brozzler/job.py       |  12 ++---
 brozzler/site.py      |   8 +--
 brozzler/worker.py    |  16 ++++--
 setup.py              |   2 +-
 tests/test_cluster.py | 112 +++++++++++++++++++++++++++++++++++-------
 vagrant/run-tests.sh  |   7 ++-
 7 files changed, 121 insertions(+), 36 deletions(-)
 mode change 100755 => 100644 brozzler/cli.py

diff --git a/brozzler/cli.py b/brozzler/cli.py
old mode 100755
new mode 100644
diff --git a/brozzler/job.py b/brozzler/job.py
index a213eae..a178884 100644
--- a/brozzler/job.py
+++ b/brozzler/job.py
@@ -106,13 +106,11 @@ def new_site(frontier, site):
     # where a brozzler worker immediately claims the site, finds no pages
     # to crawl, and decides the site is finished
     try:
-        if brozzler.is_permitted_by_robots(site, site.seed):
-            page = brozzler.Page(site.seed, site_id=site.id,
-                    job_id=site.job_id, hops_from_seed=0, priority=1000)
-            frontier.new_page(page)
-            logging.info("queued page %s", page)
-        else:
-            logging.warn("seed url %s is blocked by robots.txt", site.seed)
+        page = brozzler.Page(
+                site.seed, site_id=site.id, job_id=site.job_id,
+                hops_from_seed=0, priority=1000, needs_robots_check=True)
+        frontier.new_page(page)
+        logging.info("queued page %s", page)
     finally:
         # finally block because we want to insert the Site no matter what
         frontier.new_site(site)
diff --git a/brozzler/site.py b/brozzler/site.py
index 5cccac8..8ff692a 100644
--- a/brozzler/site.py
+++ b/brozzler/site.py
@@ -80,9 +80,8 @@ class Url:
             pass
 
         # if we get here, we're looking at two hostnames
-        # XXX do we need to handle case of one punycoded idn, other not?
-        domain_parts = ip_or_domain.split(".")
-        host_parts = self.host.split(".")
+        domain_parts = ip_or_domain.encode("idna").decode("ascii").lower().split(".")
+        host_parts = self.host.encode("idna").decode("ascii").lower().split(".")

         return host_parts[-len(domain_parts):] == domain_parts

@@ -228,7 +227,7 @@ class Page(brozzler.BaseDictable):
             self, url, id=None, site_id=None, job_id=None, hops_from_seed=0,
             redirect_url=None, priority=None, claimed=False, brozzle_count=0,
             via_page_id=None, last_claimed_by=None, hops_off_surt=0,
-            outlinks=None):
+            outlinks=None, needs_robots_check=False):
         self.site_id = site_id
         self.job_id = job_id
         self.url = url
@@ -240,6 +239,7 @@ class Page(brozzler.BaseDictable):
         self.via_page_id = via_page_id
         self.hops_off_surt = hops_off_surt
         self.outlinks = outlinks
+        self.needs_robots_check = needs_robots_check
         self._canon_hurl = None

         if priority is not None:
diff --git a/brozzler/worker.py b/brozzler/worker.py
index c306d95..21aa22e 100644
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@@ -327,12 +327,18 @@ class BrozzlerWorker:
                 self._frontier.honor_stop_request(site.job_id)
                 page = self._frontier.claim_page(site, "%s:%s" % (
                     socket.gethostname(), browser.chrome_port))
-                outlinks = self.brozzle_page(browser, site, page)
-                if browser.is_running():
-                    site.cookie_db = browser.persist_and_read_cookie_db()
+
+                if (page.needs_robots_check and
+                        not brozzler.is_permitted_by_robots(site, page.url)):
+                    logging.warn("page %s is blocked by robots.txt", page.url)
+                else:
+                    outlinks = self.brozzle_page(browser, site, page)
+                    self._frontier.scope_and_schedule_outlinks(
+                            site, page, outlinks)
+                    if browser.is_running():
+                        site.cookie_db = browser.persist_and_read_cookie_db()
+
                 self._frontier.completed_page(site, page)
-                self._frontier.scope_and_schedule_outlinks(
-                        site, page, outlinks)
                 page = None
         except brozzler.NothingToClaim:
             self.logger.info("no pages left for site %s", site)
diff --git a/setup.py b/setup.py
index 823fb3f..dc70a9c 100644
--- a/setup.py
+++ b/setup.py
@@ -32,7 +32,7 @@ def find_package_data(package):

 setuptools.setup(
         name='brozzler',
-        version='1.1b8.dev126',
+        version='1.1b8.dev127',
         description='Distributed web crawling with browsers',
         url='https://github.com/internetarchive/brozzler',
         author='Noah Levitt',
diff --git a/tests/test_cluster.py b/tests/test_cluster.py
index 1815cc9..ef4c51a 100644
--- a/tests/test_cluster.py
+++ b/tests/test_cluster.py
@@ -29,6 +29,13 @@ import time
 import brozzler
 import datetime
 import requests
+import subprocess
+
+def start_service(service):
+    subprocess.check_call(['sudo', 'service', service, 'start'])
+
+def stop_service(service):
+    subprocess.check_call(['sudo', 'service', service, 'stop'])

 @pytest.fixture(scope='module')
 def httpd(request):
@@ -102,12 +109,18 @@ def test_brozzle_site(httpd):
     page1 = 'http://localhost:%s/' % httpd.server_port
     page2 = 'http://localhost:%s/file1.txt' % httpd.server_port

-    assert site.id is None
-    r = rethinkstuff.Rethinker('localhost', db='brozzler')
-    frontier = brozzler.RethinkDbFrontier(r)
-    brozzler.new_site(frontier, site)
-    assert site.id is not None
-    assert len(list(frontier.site_pages(site.id))) == 1
+    # so we can examine rethinkdb before it does anything
+    try:
+        stop_service('brozzler-worker')
+
+        assert site.id is None
+        r = rethinkstuff.Rethinker('localhost', db='brozzler')
+        frontier = brozzler.RethinkDbFrontier(r)
+        brozzler.new_site(frontier, site)
+        assert site.id is not None
+        assert len(list(frontier.site_pages(site.id))) == 1
+    finally:
+        start_service('brozzler-worker')

     # the site should be brozzled fairly quickly
     start = time.time()
@@ -118,14 +131,17 @@ def test_brozzle_site(httpd):

     # check that we got the two pages we expected
     pages = list(frontier.site_pages(site.id))
-    assert len(pages) == 2
+    assert len(pages) == 3
     assert {page.url for page in pages} == {
             'http://localhost:%s/' % httpd.server_port,
+            'http://localhost:%s/robots.txt' % httpd.server_port,
             'http://localhost:%s/file1.txt' % httpd.server_port}

+    time.sleep(2)  # in case warcprox hasn't finished processing urls
     # take a look at the captures table
     captures = r.table('captures').filter({'test_id':test_id}).run()
-    captures_by_url = {c['url']:c for c in captures if c['http_method'] != 'HEAD'}
+    captures_by_url = {
+            c['url']: c for c in captures if c['http_method'] != 'HEAD'}
     assert page1 in captures_by_url
     assert '%srobots.txt' % page1 in captures_by_url
     assert page2 in captures_by_url
@@ -140,7 +156,6 @@ def test_brozzle_site(httpd):
             os.path.dirname(__file__), 'htdocs', 'file1.txt'), 'rb').read()
     assert requests.get(wb_url).content == expected_payload

-
 def test_warcprox_selection(httpd):
     ''' When enable_warcprox_features is true, brozzler is expected to choose and instance of warcprox '''

@@ -156,12 +171,17 @@ def test_warcprox_selection(httpd):
             enable_warcprox_features=True,
             warcprox_meta={'captures-table-extra-fields':{'test_id':test_id}})

-    assert site.id is None
-    r = rethinkstuff.Rethinker('localhost', db='brozzler')
-    frontier = brozzler.RethinkDbFrontier(r)
-    brozzler.new_site(frontier, site)
-    assert site.id is not None
-    assert len(list(frontier.site_pages(site.id))) == 1
+    # so we can examine rethinkdb before it does anything
+    try:
+        stop_service('brozzler-worker')
+        assert site.id is None
+        r = rethinkstuff.Rethinker('localhost', db='brozzler')
+        frontier = brozzler.RethinkDbFrontier(r)
+        brozzler.new_site(frontier, site)
+        assert site.id is not None
+        assert len(list(frontier.site_pages(site.id))) == 1
+    finally:
+        start_service('brozzler-worker')

     # check proxy is set in rethink
     start = time.time()
@@ -179,14 +199,17 @@ def test_warcprox_selection(httpd):

     # check that we got the two pages we expected
     pages = list(frontier.site_pages(site.id))
-    assert len(pages) == 2
+    assert len(pages) == 3
     assert {page.url for page in pages} == {
             'http://localhost:%s/' % httpd.server_port,
+            'http://localhost:%s/robots.txt' % httpd.server_port,
             'http://localhost:%s/file1.txt' % httpd.server_port}

+    time.sleep(2)  # in case warcprox hasn't finished processing urls
     # take a look at the captures table
     captures = r.table('captures').filter({'test_id':test_id}).run()
-    captures_by_url = {c['url']:c for c in captures if c['http_method'] != 'HEAD'}
+    captures_by_url = {
+            c['url']:c for c in captures if c['http_method'] != 'HEAD'}
     assert page1 in captures_by_url
     assert '%srobots.txt' % page1 in captures_by_url
     assert page2 in captures_by_url
@@ -199,4 +222,57 @@ def test_warcprox_selection(httpd):
     wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, page2)
     expected_payload = open(os.path.join(
             os.path.dirname(__file__), 'htdocs', 'file1.txt'), 'rb').read()
-    assert requests.get(wb_url).content == expected_payload
+    assert requests.get(
+            wb_url, allow_redirects=False).content == expected_payload
+
+def test_obey_robots(httpd):
+    test_id = 'test_obey_robots-%s' % datetime.datetime.utcnow().isoformat()
+    site = brozzler.Site(
+            seed='http://localhost:%s/' % httpd.server_port,
+            proxy='localhost:8000', enable_warcprox_features=True,
+            user_agent='im a badbot',  # robots.txt blocks badbot
+            warcprox_meta={'captures-table-extra-fields':{'test_id':test_id}})
+
+    # so we can examine rethinkdb before it does anything
+    try:
+        stop_service('brozzler-worker')
+
+        assert site.id is None
+        r = rethinkstuff.Rethinker('localhost', db='brozzler')
+        frontier = brozzler.RethinkDbFrontier(r)
+        brozzler.new_site(frontier, site)
+        assert site.id is not None
+        site_pages = list(frontier.site_pages(site.id))
+        assert len(site_pages) == 1
+        assert site_pages[0].url == site.seed
+        assert site_pages[0].needs_robots_check
+    finally:
+        start_service('brozzler-worker')
+
+    # the site should be brozzled fairly quickly
+    start = time.time()
+    while site.status != 'FINISHED' and time.time() - start < 300:
+        time.sleep(0.5)
+        site = frontier.site(site.id)
+    assert site.status == 'FINISHED'
+
+    # check that we got the one page we expected
+    pages = list(frontier.site_pages(site.id))
+    assert len(pages) == 1
+    assert {page.url for page in pages} == {
+            'http://localhost:%s/' % httpd.server_port}
+
+    # take a look at the captures table
+    time.sleep(2)  # in case warcprox hasn't finished processing urls
+    robots_url = 'http://localhost:%s/robots.txt' % httpd.server_port
+    captures = list(r.table('captures').filter({'test_id':test_id}).run())
+    assert len(captures) == 1
+    assert captures[0]['url'] == robots_url
+
+    # check pywb
+    t14 = captures[0]['timestamp'].strftime('%Y%m%d%H%M%S')
+    wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, robots_url)
+    expected_payload = open(os.path.join(
+            os.path.dirname(__file__), 'htdocs', 'robots.txt'), 'rb').read()
+    assert requests.get(
+            wb_url, allow_redirects=False).content == expected_payload
diff --git a/vagrant/run-tests.sh b/vagrant/run-tests.sh
index 2a8a0b5..710438b 100755
--- a/vagrant/run-tests.sh
+++ b/vagrant/run-tests.sh
@@ -1,4 +1,9 @@
 #!/bin/bash
+#
+# any arguments are passed on to py.test
+# so for example to run only "test_obey_robots" you could run
+#   ./run-tests.sh -k test_obey_robots
+#

 cd $(dirname "${BASH_SOURCE[0]}")

@@ -11,4 +16,4 @@ vagrant ssh -- 'status warcprox ;
 echo

 vagrant ssh -- 'source /opt/brozzler-ve34/bin/activate && pip install pytest'
-vagrant ssh -- 'source /opt/brozzler-ve34/bin/activate && py.test -v -s /brozzler/tests'
+vagrant ssh -- "source /opt/brozzler-ve34/bin/activate && py.test -v -s /brozzler/tests $@"