mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-21 08:06:27 -04:00
Merge branch 'master' into qa
* master: new flag Page.blocked_by_robots; be more patient to avoid spurious warnings waiting for browser to start up
This commit is contained in:
commit
8df96ceb59
@ -177,6 +177,7 @@ class Chrome:
|
||||
json_url = 'http://localhost:%s/json' % self.port
|
||||
# make this a member variable so that kill -QUIT reports it
|
||||
self._start = time.time()
|
||||
self._last_warning = self._start
|
||||
while True:
|
||||
try:
|
||||
raw_json = urllib.request.urlopen(json_url, timeout=30).read()
|
||||
@ -194,11 +195,11 @@ class Chrome:
|
||||
except brozzler.ShutdownRequested:
|
||||
raise
|
||||
except BaseException as e:
|
||||
if int(time.time() - self._start) % 10 == 5:
|
||||
if time.time() - self._last_warning > 30:
|
||||
self.logger.warn(
|
||||
'problem with %s (will keep trying until timeout '
|
||||
'of %d seconds): %s', json_url, timeout_sec, e)
|
||||
pass
|
||||
self._last_warning = time.time()
|
||||
finally:
|
||||
if time.time() - self._start > timeout_sec:
|
||||
self.logger.error(
|
||||
|
@ -1,7 +1,7 @@
|
||||
'''
|
||||
brozzler/site.py - classes representing sites and pages
|
||||
|
||||
Copyright (C) 2014-2016 Internet Archive
|
||||
Copyright (C) 2014-2017 Internet Archive
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
@ -235,7 +235,7 @@ class Page(brozzler.BaseDictable):
|
||||
self, url, id=None, site_id=None, job_id=None, hops_from_seed=0,
|
||||
redirect_url=None, priority=None, claimed=False, brozzle_count=0,
|
||||
via_page_id=None, last_claimed_by=None, hops_off_surt=0,
|
||||
outlinks=None, needs_robots_check=False):
|
||||
outlinks=None, needs_robots_check=False, blocked_by_robots=None):
|
||||
self.site_id = site_id
|
||||
self.job_id = job_id
|
||||
self.url = url
|
||||
@ -248,6 +248,7 @@ class Page(brozzler.BaseDictable):
|
||||
self.hops_off_surt = hops_off_surt
|
||||
self.outlinks = outlinks
|
||||
self.needs_robots_check = needs_robots_check
|
||||
self.blocked_by_robots = blocked_by_robots
|
||||
self._canon_hurl = None
|
||||
|
||||
if priority is not None:
|
||||
|
@ -3,7 +3,7 @@ brozzler/worker.py - BrozzlerWorker brozzles pages from the frontier, meaning
|
||||
it runs youtube-dl on them, browses them and runs behaviors if appropriate,
|
||||
scopes and adds outlinks to the frontier
|
||||
|
||||
Copyright (C) 2014-2016 Internet Archive
|
||||
Copyright (C) 2014-2017 Internet Archive
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
@ -337,6 +337,7 @@ class BrozzlerWorker:
|
||||
if (page.needs_robots_check and
|
||||
not brozzler.is_permitted_by_robots(site, page.url)):
|
||||
logging.warn("page %s is blocked by robots.txt", page.url)
|
||||
page.blocked_by_robots = True
|
||||
else:
|
||||
outlinks = self.brozzle_page(browser, site, page)
|
||||
self._frontier.scope_and_schedule_outlinks(
|
||||
|
2
setup.py
2
setup.py
@ -32,7 +32,7 @@ def find_package_data(package):
|
||||
|
||||
setuptools.setup(
|
||||
name='brozzler',
|
||||
version='1.1b9.dev174',
|
||||
version='1.1b9.dev176',
|
||||
description='Distributed web crawling with browsers',
|
||||
url='https://github.com/internetarchive/brozzler',
|
||||
author='Noah Levitt',
|
||||
|
@ -3,7 +3,7 @@
|
||||
test_cluster.py - integration tests for a brozzler cluster, expects brozzler,
|
||||
warcprox, pywb, rethinkdb and other dependencies to be running already
|
||||
|
||||
Copyright (C) 2016 Internet Archive
|
||||
Copyright (C) 2016-2017 Internet Archive
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
@ -257,8 +257,9 @@ def test_obey_robots(httpd):
|
||||
# check that only the one page is in rethinkdb
|
||||
pages = list(frontier.site_pages(site.id))
|
||||
assert len(pages) == 1
|
||||
assert {page.url for page in pages} == {
|
||||
'http://localhost:%s/site1/' % httpd.server_port}
|
||||
page = pages[0]
|
||||
assert page.url == 'http://localhost:%s/site1/' % httpd.server_port
|
||||
assert page.blocked_by_robots
|
||||
|
||||
# take a look at the captures table
|
||||
time.sleep(2) # in case warcprox hasn't finished processing urls
|
||||
|
Loading…
x
Reference in New Issue
Block a user