diff --git a/brozzler/chrome.py b/brozzler/chrome.py
index 874b40d..4ca476b 100644
--- a/brozzler/chrome.py
+++ b/brozzler/chrome.py
@@ -177,6 +177,7 @@ class Chrome:
         json_url = 'http://localhost:%s/json' % self.port
         # make this a member variable so that kill -QUIT reports it
         self._start = time.time()
+        self._last_warning = self._start
         while True:
             try:
                 raw_json = urllib.request.urlopen(json_url, timeout=30).read()
@@ -194,11 +195,11 @@ class Chrome:
             except brozzler.ShutdownRequested:
                 raise
             except BaseException as e:
-                if int(time.time() - self._start) % 10 == 5:
+                if time.time() - self._last_warning > 30:
                     self.logger.warn(
                             'problem with %s (will keep trying until timeout '
                             'of %d seconds): %s', json_url, timeout_sec, e)
-                pass
+                    self._last_warning = time.time()
             finally:
                 if time.time() - self._start > timeout_sec:
                     self.logger.error(
diff --git a/brozzler/site.py b/brozzler/site.py
index fe86f0b..a1414c1 100644
--- a/brozzler/site.py
+++ b/brozzler/site.py
@@ -1,7 +1,7 @@
 '''
 brozzler/site.py - classes representing sites and pages
 
-Copyright (C) 2014-2016 Internet Archive
+Copyright (C) 2014-2017 Internet Archive
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -235,7 +235,7 @@ class Page(brozzler.BaseDictable):
             self, url, id=None, site_id=None, job_id=None, hops_from_seed=0,
             redirect_url=None, priority=None, claimed=False, brozzle_count=0,
             via_page_id=None, last_claimed_by=None, hops_off_surt=0,
-            outlinks=None, needs_robots_check=False):
+            outlinks=None, needs_robots_check=False, blocked_by_robots=None):
         self.site_id = site_id
         self.job_id = job_id
         self.url = url
@@ -248,6 +248,7 @@ class Page(brozzler.BaseDictable):
         self.hops_off_surt = hops_off_surt
         self.outlinks = outlinks
         self.needs_robots_check = needs_robots_check
+        self.blocked_by_robots = blocked_by_robots
         self._canon_hurl = None
 
         if priority is not None:
diff --git a/brozzler/worker.py b/brozzler/worker.py
index 5eb3128..58a9079 100644
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@@ -3,7 +3,7 @@ brozzler/worker.py - BrozzlerWorker brozzles pages from the frontier, meaning
 it runs youtube-dl on them, browses them and runs behaviors if appropriate,
 scopes and adds outlinks to the frontier
 
-Copyright (C) 2014-2016 Internet Archive
+Copyright (C) 2014-2017 Internet Archive
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -337,6 +337,7 @@ class BrozzlerWorker:
                 if (page.needs_robots_check and
                         not brozzler.is_permitted_by_robots(site, page.url)):
                     logging.warn("page %s is blocked by robots.txt", page.url)
+                    page.blocked_by_robots = True
                 else:
                     outlinks = self.brozzle_page(browser, site, page)
                     self._frontier.scope_and_schedule_outlinks(
diff --git a/setup.py b/setup.py
index a71986f..72ac014 100644
--- a/setup.py
+++ b/setup.py
@@ -32,7 +32,7 @@ def find_package_data(package):
 
 setuptools.setup(
         name='brozzler',
-        version='1.1b9.dev174',
+        version='1.1b9.dev176',
         description='Distributed web crawling with browsers',
         url='https://github.com/internetarchive/brozzler',
         author='Noah Levitt',
diff --git a/tests/test_cluster.py b/tests/test_cluster.py
index 829790b..a878474 100644
--- a/tests/test_cluster.py
+++ b/tests/test_cluster.py
@@ -3,7 +3,7 @@ test_cluster.py - integration tests for a brozzler cluster, expects
 brozzler, warcprox, pywb, rethinkdb and other dependencies to be running
 already
 
-Copyright (C) 2016 Internet Archive
+Copyright (C) 2016-2017 Internet Archive
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -257,8 +257,9 @@ def test_obey_robots(httpd):
     # check that only the one page is in rethinkdb
     pages = list(frontier.site_pages(site.id))
     assert len(pages) == 1
-    assert {page.url for page in pages} == {
-            'http://localhost:%s/site1/' % httpd.server_port}
+    page = pages[0]
+    assert page.url == 'http://localhost:%s/site1/' % httpd.server_port
+    assert page.blocked_by_robots
 
     # take a look at the captures table
     time.sleep(2) # in case warcprox hasn't finished processing urls