Merge branch 'master' into qa

* master:
  new flag Page.blocked_by_robots
  be more patient to avoid spurious warnings waiting for browser to start up
Noah Levitt 2017-01-30 11:39:37 -08:00
commit 8df96ceb59
5 changed files with 13 additions and 9 deletions

brozzler/chrome.py

@@ -177,6 +177,7 @@ class Chrome:
         json_url = 'http://localhost:%s/json' % self.port
         # make this a member variable so that kill -QUIT reports it
         self._start = time.time()
+        self._last_warning = self._start
         while True:
             try:
                 raw_json = urllib.request.urlopen(json_url, timeout=30).read()
@@ -194,11 +195,11 @@ class Chrome:
             except brozzler.ShutdownRequested:
                 raise
             except BaseException as e:
-                if int(time.time() - self._start) % 10 == 5:
+                if time.time() - self._last_warning > 30:
                     self.logger.warn(
                             'problem with %s (will keep trying until timeout '
                             'of %d seconds): %s', json_url, timeout_sec, e)
-                    pass
+                    self._last_warning = time.time()
             finally:
                 if time.time() - self._start > timeout_sec:
                     self.logger.error(
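
The old condition, int(time.time() - self._start) % 10 == 5, only warned when an exception happened to be caught during the right one-second window, so the warning could fire several times in a burst or not at all. Tracking the time of the last warning throttles the message to at most one every 30 seconds regardless of how often the probe fails. A standalone sketch of the same pattern (illustrative names, not brozzler code):

    import logging
    import time

    logger = logging.getLogger('startup-probe')

    def wait_for(probe, timeout_sec=600, warn_interval=30):
        '''Call probe() until it succeeds or timeout_sec elapses, logging a
        warning at most once every warn_interval seconds while it fails.'''
        start = time.time()
        last_warning = start
        while True:
            try:
                return probe()
            except Exception as e:
                if time.time() - last_warning > warn_interval:
                    logger.warning(
                            'still failing after %d seconds (will keep trying '
                            'until timeout of %d seconds): %s',
                            time.time() - start, timeout_sec, e)
                    last_warning = time.time()
            if time.time() - start > timeout_sec:
                raise TimeoutError('gave up after %d seconds' % timeout_sec)
            time.sleep(0.5)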

brozzler/site.py

@ -1,7 +1,7 @@
''' '''
brozzler/site.py - classes representing sites and pages brozzler/site.py - classes representing sites and pages
Copyright (C) 2014-2016 Internet Archive Copyright (C) 2014-2017 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
@ -235,7 +235,7 @@ class Page(brozzler.BaseDictable):
self, url, id=None, site_id=None, job_id=None, hops_from_seed=0, self, url, id=None, site_id=None, job_id=None, hops_from_seed=0,
redirect_url=None, priority=None, claimed=False, brozzle_count=0, redirect_url=None, priority=None, claimed=False, brozzle_count=0,
via_page_id=None, last_claimed_by=None, hops_off_surt=0, via_page_id=None, last_claimed_by=None, hops_off_surt=0,
outlinks=None, needs_robots_check=False): outlinks=None, needs_robots_check=False, blocked_by_robots=None):
self.site_id = site_id self.site_id = site_id
self.job_id = job_id self.job_id = job_id
self.url = url self.url = url
@ -248,6 +248,7 @@ class Page(brozzler.BaseDictable):
self.hops_off_surt = hops_off_surt self.hops_off_surt = hops_off_surt
self.outlinks = outlinks self.outlinks = outlinks
self.needs_robots_check = needs_robots_check self.needs_robots_check = needs_robots_check
self.blocked_by_robots = blocked_by_robots
self._canon_hurl = None self._canon_hurl = None
if priority is not None: if priority is not None:
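
Note the default: blocked_by_robots=None rather than False keeps the flag tri-state, with None meaning "robots.txt has not been consulted for this page yet" and True meaning the worker actually skipped the page. It also keeps page records written before this change loadable: if a stored dict is passed back to the constructor as keyword arguments, a record without the key simply leaves the attribute at None instead of raising. A rough sketch of that round-trip, assuming brozzler is installed and Page is exposed as brozzler.Page:

    import brozzler

    # a page record persisted before this change has no 'blocked_by_robots' key
    old_record = {
        'url': 'http://example.com/',
        'site_id': 'some-site-id',    # illustrative value
        'hops_from_seed': 1,
    }
    page = brozzler.Page(**old_record)
    assert page.blocked_by_robots is None    # "unknown", not False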

brozzler/worker.py

@@ -3,7 +3,7 @@ brozzler/worker.py - BrozzlerWorker brozzles pages from the frontier, meaning
 it runs youtube-dl on them, browses them and runs behaviors if appropriate,
 scopes and adds outlinks to the frontier
 
-Copyright (C) 2014-2016 Internet Archive
+Copyright (C) 2014-2017 Internet Archive
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -337,6 +337,7 @@ class BrozzlerWorker:
             if (page.needs_robots_check and
                     not brozzler.is_permitted_by_robots(site, page.url)):
                 logging.warn("page %s is blocked by robots.txt", page.url)
+                page.blocked_by_robots = True
             else:
                 outlinks = self.brozzle_page(browser, site, page)
                 self._frontier.scope_and_schedule_outlinks(
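
Because pages are persisted back to rethinkdb, setting the flag here makes robots-blocked pages queryable after a crawl. A rough sketch using the plain rethinkdb Python driver, assuming a server on localhost and brozzler's default database and table names ('brozzler' / 'pages'):

    import rethinkdb as r

    # assumes rethinkdb on localhost:28015 and brozzler's default
    # 'brozzler' database with its 'pages' table
    conn = r.connect(host='localhost', port=28015, db='brozzler')
    blocked = r.table('pages').filter({'blocked_by_robots': True}).run(conn)
    for page in blocked:
        print(page['url'])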

setup.py

@@ -32,7 +32,7 @@ def find_package_data(package):
 
 setuptools.setup(
         name='brozzler',
-        version='1.1b9.dev174',
+        version='1.1b9.dev176',
         description='Distributed web crawling with browsers',
         url='https://github.com/internetarchive/brozzler',
         author='Noah Levitt',

tests/test_cluster.py

@@ -3,7 +3,7 @@
 test_cluster.py - integration tests for a brozzler cluster, expects brozzler,
 warcprox, pywb, rethinkdb and other dependencies to be running already
 
-Copyright (C) 2016 Internet Archive
+Copyright (C) 2016-2017 Internet Archive
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -257,8 +257,9 @@ def test_obey_robots(httpd):
     # check that only the one page is in rethinkdb
     pages = list(frontier.site_pages(site.id))
     assert len(pages) == 1
-    assert {page.url for page in pages} == {
-            'http://localhost:%s/site1/' % httpd.server_port}
+    page = pages[0]
+    assert page.url == 'http://localhost:%s/site1/' % httpd.server_port
+    assert page.blocked_by_robots
 
     # take a look at the captures table
     time.sleep(2) # in case warcprox hasn't finished processing urls