Merge branch 'master' into qa

* master:
  fix attempt for deadlock-ish situation
  fix unclosed file warnings when running python in debug mode
  give vagrant vm a name in virtualbox
  add note to readme about browser version
  check browser version at startup
  Reinstate logging
  Fix typo and block legacy google-analytics.com/ga.js
  Use Network.setBlockedUrls instead of Debugger to block URLs
  bump dev version after PR merge
  back to dev version number
  commit for beta release
  this should fix travis build?
  fix tests
  update brozzler-easy for current warcprox api
  claim sites to brozzle in batches to reduce contention over sites table
  lengthen site session brozzling time to 15 minutes
  fix needs_browsing check
  new test test_needs_browsing
  increase timeout waiting for screenshot
  Use TCP_NODELAY in websocket connection to improve performance
Noah Levitt 2018-02-13 17:10:10 -08:00
commit df0717d072
15 changed files with 236 additions and 114 deletions

.travis.yml

@@ -9,7 +9,7 @@ before_install:
 - sudo pip install ansible==2.1.3.0
 install:
 - ansible-playbook --extra-vars="brozzler_pip_name=file://$TRAVIS_BUILD_DIR#egg=brozzler user=travis" --inventory-file=ansible/hosts-localhost ansible/playbook.yml
-- pip install $TRAVIS_BUILD_DIR 'warcprox==2.3' pytest
+- pip install $TRAVIS_BUILD_DIR 'warcprox>=2.4b1' pytest
 script:
 - DISPLAY=:1 py.test -v tests
 after_failure:

README.rst

@@ -22,7 +22,7 @@ Requirements
 - Python 3.4 or later
 - RethinkDB deployment
-- Chromium or Google Chrome browser
+- Chromium or Google Chrome >= version 64

 Worth noting is that the browser requires a graphical environment to run. You
 already have this on your laptop, but on a server it will probably require

ansible/roles/pywb/tasks/main.yml

@@ -5,6 +5,7 @@
   become: true
 - name: install pywb in virtualenv
   pip: name=pywb
+       version=0.33.2
        virtualenv={{venv_root}}/pywb-ve34
        virtualenv_python=python3.4
        extra_args='--no-input --upgrade --pre --cache-dir=/tmp/pip-cache'

brozzler/__init__.py

@@ -67,6 +67,21 @@ logging.Logger.trace = _logger_trace
 logging._levelToName[TRACE] = 'TRACE'
 logging._nameToLevel['TRACE'] = TRACE

+# see https://github.com/internetarchive/brozzler/issues/91
+def _logging_handler_handle(self, record):
+    rv = self.filter(record)
+    if rv:
+        try:
+            self.acquire()
+            self.emit(record)
+        finally:
+            try:
+                self.release()
+            except:
+                pass
+    return rv
+logging.Handler.handle = _logging_handler_handle
+
 _behaviors = None
 def behaviors(behaviors_dir=None):
     """Return list of JS behaviors loaded from YAML file.

brozzler/browser.py

@@ -61,6 +61,33 @@ class BrowserPool:
         self._in_use = set()
         self._lock = threading.Lock()

+    def _fresh_browser(self):
+        # choose available port
+        sock = socket.socket()
+        sock.bind(('0.0.0.0', 0))
+        port = sock.getsockname()[1]
+        sock.close()
+        browser = Browser(port=port, **self.kwargs)
+        return browser
+
+    def acquire_multi(self, n=1):
+        '''
+        Returns a list of up to `n` browsers.
+
+        Raises:
+            NoBrowsersAvailable if none available
+        '''
+        browsers = []
+        with self._lock:
+            if len(self._in_use) >= self.size:
+                raise NoBrowsersAvailable
+            while len(self._in_use) < self.size and len(browsers) < n:
+                browser = self._fresh_browser()
+                browsers.append(browser)
+                self._in_use.add(browser)
+        return browsers
+
     def acquire(self):
         '''
         Returns an available instance.
@@ -74,14 +101,7 @@ class BrowserPool:
         with self._lock:
             if len(self._in_use) >= self.size:
                 raise NoBrowsersAvailable
-
-            # choose available port
-            sock = socket.socket()
-            sock.bind(('0.0.0.0', 0))
-            port = sock.getsockname()[1]
-            sock.close()
-
-            browser = Browser(port=port, **self.kwargs)
+            browser = self._fresh_browser()
             self._in_use.add(browser)
             return browser
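
The "bind to port 0" idiom _fresh_browser() uses to pick a devtools port, as
a standalone sketch (not from this commit): binding port 0 asks the kernel
for any free ephemeral port. There is a small race window, since another
process could grab the port between sock.close() and chrome binding it.

    import socket

    def pick_free_port():
        sock = socket.socket()
        sock.bind(('0.0.0.0', 0))
        port = sock.getsockname()[1]
        sock.close()
        return port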
@@ -90,6 +110,13 @@ class BrowserPool:
         with self._lock:
             self._in_use.remove(browser)

+    def release_all(self, browsers):
+        for browser in browsers:
+            browser.stop()  # make sure
+        with self._lock:
+            for browser in browsers:
+                self._in_use.remove(browser)
+
     def shutdown_now(self):
         self.logger.info(
                 'shutting down browser pool (%s browsers in use)',
@@ -162,7 +189,8 @@ class WebsockReceiverThread(threading.Thread):
         # ping_timeout is used as the timeout for the call to select.select()
         # in addition to its documented purpose, and must have a value to avoid
         # hangs in certain situations
-        self.websock.run_forever(ping_timeout=0.5)
+        self.websock.run_forever(sockopt=((socket.IPPROTO_TCP, socket.TCP_NODELAY, 1),),
+                                 ping_timeout=0.5)

     def _on_message(self, websock, message):
         try:
@@ -172,21 +200,6 @@ class WebsockReceiverThread(threading.Thread):
                     'uncaught exception in _handle_message message=%s',
                     message, exc_info=True)

-    def _debugger_paused(self, message):
-        # we hit the breakpoint set in start(), get rid of google analytics
-        self.logger.debug('debugger paused! message=%s', message)
-        scriptId = message['params']['callFrames'][0]['location']['scriptId']
-
-        # replace script
-        self.websock.send(
-                json.dumps(dict(
-                    id=0, method='Debugger.setScriptSource',
-                    params={'scriptId': scriptId,
-                        'scriptSource': 'console.log("google analytics is no more!");'})))
-
-        # resume execution
-        self.websock.send(json.dumps(dict(id=0, method='Debugger.resume')))
-
     def _network_response_received(self, message):
         if (message['params']['response']['status'] == 420
                 and 'Warcprox-Meta' in CaseInsensitiveDict(
@@ -227,8 +240,6 @@ class WebsockReceiverThread(threading.Thread):
             elif message['method'] == 'Network.requestWillBeSent':
                 if self.on_request:
                     self.on_request(message)
-            elif message['method'] == 'Debugger.paused':
-                self._debugger_paused(message)
             elif message['method'] == 'Page.interstitialShown':
                 # for AITFIVE-1529: handle http auth
                 # for now, we should consider killing the browser when we receive Page.interstitialShown and
@@ -330,16 +341,14 @@ class Browser:
         self.send_to_chrome(method='Network.enable')
         self.send_to_chrome(method='Page.enable')
         self.send_to_chrome(method='Console.enable')
-        self.send_to_chrome(method='Debugger.enable')
         self.send_to_chrome(method='Runtime.enable')

-        # disable google analytics, see _handle_message() where breakpoint
-        # is caught Debugger.paused
+        # disable google analytics
         self.send_to_chrome(
-                method='Debugger.setBreakpointByUrl',
-                params={
-                    'lineNumber': 1,
-                    'urlRegex': 'https?://www.google-analytics.com/analytics.js'})
+                method='Network.setBlockedURLs',
+                params={'urls': ['*google-analytics.com/analytics.js',
+                                 '*google-analytics.com/ga.js']}
+        )

     def stop(self):
         '''
@@ -549,7 +558,7 @@ class Browser:
                     'problem extracting outlinks, result message: %s', message)
             return frozenset()

-    def screenshot(self, timeout=30):
+    def screenshot(self, timeout=90):
         self.logger.info('taking screenshot')
         self.websock_thread.expect_result(self._command_id.peek())
         msg_id = self.send_to_chrome(method='Page.captureScreenshot')
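
A note on the run_forever() change above (not part of the diff): it passes a
socket option through the websocket-client library. TCP_NODELAY disables
Nagle's algorithm, so the many small devtools messages go out on the wire
immediately instead of being buffered while waiting for ACKs. The same option
on a plain socket:

    import socket

    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1)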

brozzler/chrome.py

@@ -29,6 +29,35 @@
 import signal
 import sqlite3
 import json
 import tempfile
+import sys
+
+def check_version(chrome_exe):
+    '''
+    Raises SystemExit if `chrome_exe` is not a supported browser version.
+
+    Must run in the main thread to have the desired effect.
+    '''
+    # mac$ /Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome --version
+    # Google Chrome 64.0.3282.140
+    # mac$ /Applications/Google\ Chrome\ Canary.app/Contents/MacOS/Google\ Chrome\ Canary --version
+    # Google Chrome 66.0.3341.0 canary
+    # linux$ chromium-browser --version
+    # Using PPAPI flash.
+    #  --ppapi-flash-path=/usr/lib/adobe-flashplugin/libpepflashplayer.so --ppapi-flash-version=
+    # Chromium 61.0.3163.100 Built on Ubuntu , running on Ubuntu 16.04
+    cmd = [chrome_exe, '--version']
+    out = subprocess.check_output(cmd, timeout=60)
+    m = re.search(br'(Chromium|Google Chrome) ([\d.]+)', out)
+    if not m:
+        sys.exit(
+                'unable to parse browser version from output of '
+                '%r: %r' % (subprocess.list2cmdline(cmd), out))
+    version_str = m.group(2).decode()
+    major_version = int(version_str.split('.')[0])
+    if major_version < 64:
+        sys.exit('brozzler requires chrome/chromium version 64 or '
+                 'later but %s reports version %s' % (
+                     chrome_exe, version_str))
+
 class Chrome:
     logger = logging.getLogger(__module__ + '.' + __qualname__)
@@ -279,6 +308,8 @@ class Chrome:
                         'SIGKILL', self.chrome_process.pid, status)
         finally:
+            self.chrome_process.stdout.close()
+            self.chrome_process.stderr.close()
             try:
                 self._home_tmpdir.cleanup()
             except:
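
How the new check is used, as a sketch (not from this commit; the path is
illustrative). The cli.py and easy.py hunks below add exactly this call at
startup, right after argument parsing:

    import brozzler.chrome

    # raises SystemExit if the browser is older than Chrome/Chromium 64
    brozzler.chrome.check_version('/usr/bin/chromium-browser')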

brozzler/cli.py

@@ -167,6 +167,7 @@ def brozzle_page(argv=None):
     args = arg_parser.parse_args(args=argv[1:])
     configure_logging(args)
+    brozzler.chrome.check_version(args.chrome_exe)

     behavior_parameters = {}
     if args.behavior_parameters:
@@ -326,6 +327,7 @@ def brozzler_worker(argv=None):
     args = arg_parser.parse_args(args=argv[1:])
     configure_logging(args)
+    brozzler.chrome.check_version(args.chrome_exe)

     def dump_state(signum, frame):
         signal.signal(signal.SIGQUIT, signal.SIG_IGN)

brozzler/easy.py

@@ -3,7 +3,7 @@
 brozzler-easy - brozzler-worker, warcprox, pywb, and brozzler-dashboard all
 working together in a single process

-Copyright (C) 2016 Internet Archive
+Copyright (C) 2016-2018 Internet Archive

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -122,8 +122,8 @@ class BrozzlerEasyController:
     def __init__(self, args):
         self.stop = threading.Event()
         self.args = args
-        self.warcprox_controller = warcprox.main.init_controller(
-                self._warcprox_args(args))
+        self.warcprox_controller = warcprox.controller.WarcproxController(
+                self._warcprox_opts(args))
         self.brozzler_worker = self._init_brozzler_worker(args)
         self.pywb_httpd = self._init_pywb(args)
         self.dashboard_httpd = self._init_brozzler_dashboard(args)
@@ -221,40 +221,38 @@ class BrozzlerEasyController:
         finally:
             self.shutdown()

-    def _warcprox_args(self, args):
+    def _warcprox_opts(self, args):
         '''
         Takes args as produced by the argument parser built by
         _build_arg_parser and builds warcprox arguments object suitable to pass
         to warcprox.main.init_controller. Copies some arguments, renames some,
         populates some with defaults appropriate for brozzler-easy, etc.
         '''
-        warcprox_args = argparse.Namespace()
-        warcprox_args.address = 'localhost'
+        warcprox_opts = warcprox.Options()
+        warcprox_opts.address = 'localhost'
         # let the OS choose an available port; discover it later using
         # sock.getsockname()[1]
-        warcprox_args.port = 0
-        warcprox_args.cacert = args.cacert
-        warcprox_args.certs_dir = args.certs_dir
-        warcprox_args.directory = args.warcs_dir
-        warcprox_args.gzip = True
-        warcprox_args.prefix = 'brozzler'
-        warcprox_args.size = 1000 * 1000* 1000
-        warcprox_args.rollover_idle_time = 3 * 60
-        warcprox_args.digest_algorithm = 'sha1'
-        warcprox_args.base32 = True
-        warcprox_args.stats_db_file = None
-        warcprox_args.playback_port = None
-        warcprox_args.playback_index_db_file = None
-        warcprox_args.rethinkdb_servers = args.rethinkdb_servers
-        warcprox_args.rethinkdb_db = args.rethinkdb_db
-        warcprox_args.rethinkdb_big_table = True
-        warcprox_args.kafka_broker_list = None
-        warcprox_args.kafka_capture_feed_topic = None
-        warcprox_args.queue_size = 500
-        warcprox_args.max_threads = None
-        warcprox_args.profile = False
-        warcprox_args.onion_tor_socks_proxy = args.onion_tor_socks_proxy
-        return warcprox_args
+        warcprox_opts.port = 0
+        warcprox_opts.cacert = args.cacert
+        warcprox_opts.certs_dir = args.certs_dir
+        warcprox_opts.directory = args.warcs_dir
+        warcprox_opts.gzip = True
+        warcprox_opts.prefix = 'brozzler'
+        warcprox_opts.size = 1000 * 1000 * 1000
+        warcprox_opts.rollover_idle_time = 3 * 60
+        warcprox_opts.digest_algorithm = 'sha1'
+        warcprox_opts.base32 = True
+        warcprox_opts.stats_db_file = None
+        warcprox_opts.playback_port = None
+        warcprox_opts.playback_index_db_file = None
+        warcprox_opts.rethinkdb_big_table_url = (
+                'rethinkdb://%s/%s/captures' % (
+                    args.rethinkdb_servers, args.rethinkdb_db))
+        warcprox_opts.queue_size = 500
+        warcprox_opts.max_threads = None
+        warcprox_opts.profile = False
+        warcprox_opts.onion_tor_socks_proxy = args.onion_tor_socks_proxy
+        return warcprox_opts

     def dump_state(self, signum=None, frame=None):
         state_strs = []
@@ -270,6 +268,7 @@ def main(argv=None):
     arg_parser = _build_arg_parser(argv)
     args = arg_parser.parse_args(args=argv[1:])
     brozzler.cli.configure_logging(args)
+    brozzler.chrome.check_version(args.chrome_exe)

     controller = BrozzlerEasyController(args)
     signal.signal(signal.SIGTERM, lambda a,b: controller.stop.set())
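
The controller now targets the warcprox 2.4 API: a WarcproxController is
constructed directly from a warcprox.Options object instead of going through
warcprox.main.init_controller. Reduced to its core as a sketch (not from this
commit; attribute values illustrative):

    import warcprox

    opts = warcprox.Options()
    opts.address = 'localhost'
    opts.port = 0  # let the OS pick a free port
    controller = warcprox.controller.WarcproxController(opts)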

brozzler/frontier.py

@@ -93,7 +93,7 @@ class RethinkDbFrontier:
             raise UnexpectedDbResult("expected %r to be %r in %r" % (
                 k, expected, result))

-    def claim_site(self, worker_id):
+    def claim_sites(self, n=1):
         # XXX keep track of aggregate priority and prioritize sites accordingly?
         while True:
             result = (
@@ -104,34 +104,43 @@ class RethinkDbFrontier:
                     .order_by(index="sites_last_disclaimed")
                     .filter((r.row["claimed"] != True) | (
                         r.row["last_claimed"] < r.now() - 60*60))
-                    .limit(1)
+                    .limit(n)
                     .update(
                         # try to avoid a race condition resulting in multiple
                         # brozzler-workers claiming the same site
                         # see https://github.com/rethinkdb/rethinkdb/issues/3235#issuecomment-60283038
                         r.branch((r.row["claimed"] != True) | (
                             r.row["last_claimed"] < r.now() - 60*60), {
-                                "claimed": True, "last_claimed_by": worker_id,
+                                "claimed": True,
                                 "last_claimed": doublethink.utcnow()}, {}),
                         return_changes=True)).run()
-            self._vet_result(result, replaced=[0,1], unchanged=[0,1])
-            if result["replaced"] == 1:
-                if result["changes"][0]["old_val"]["claimed"]:
+            self._vet_result(
+                    result, replaced=list(range(n+1)),
+                    unchanged=list(range(n+1)))
+            sites = []
+            for i in range(result["replaced"]):
+                if result["changes"][i]["old_val"]["claimed"]:
                     self.logger.warn(
                             "re-claimed site that was still marked 'claimed' "
                             "because it was last claimed a long time ago "
                             "at %s, and presumably some error stopped it from "
                             "being disclaimed",
-                            result["changes"][0]["old_val"]["last_claimed"])
-                site = brozzler.Site(self.rr, result["changes"][0]["new_val"])
-            else:
+                            result["changes"][i]["old_val"]["last_claimed"])
+                site = brozzler.Site(self.rr, result["changes"][i]["new_val"])
+                sites.append(site)
+            if not sites:
                 raise brozzler.NothingToClaim
             # XXX This is the only place we enforce time limit for now. Worker
             # loop should probably check time limit. Maybe frontier needs a
             # housekeeping thread to ensure that time limits get enforced in a
             # timely fashion.
-            if not self._enforce_time_limit(site):
-                return site
+            for site in list(sites):
+                if self._enforce_time_limit(site):
+                    sites.remove(site)
+            if sites:
+                return sites
             # else try again

     def _enforce_time_limit(self, site):
         if (site.time_limit and site.time_limit > 0
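
The claim query relies on a RethinkDB idiom worth spelling out: filter() and
update() do not run atomically together, so the predicate is re-checked
inside update() with r.branch() (see the rethinkdb issue linked in the
comment). Stripped to its core as a sketch (not from this commit; table and
field names illustrative):

    import rethinkdb as r

    def claim_one(conn):
        result = (
                r.table('sites')
                .filter(r.row['claimed'] != True)
                .limit(1)
                .update(
                    # only the worker whose re-check still sees
                    # claimed != True performs the write
                    r.branch(
                        r.row['claimed'] != True, {'claimed': True}, {}),
                    return_changes=True)
                .run(conn))
        return result['replaced'] == 1  # False: another worker won the race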

brozzler/worker.py

@@ -37,6 +37,7 @@ import urlcanon
 from requests.structures import CaseInsensitiveDict
 import rethinkdb as r
 import datetime
+import urllib.parse

 class ExtraHeaderAdder(urllib.request.BaseHandler):
     def __init__(self, extra_headers):
@@ -87,7 +88,9 @@ class YoutubeDLSpy(urllib.request.BaseHandler):
         final_url = url
         while final_url in redirects:
-            final_url = redirects.pop(final_url)['response_headers']['location']
+            txn = redirects.pop(final_url)
+            final_url = urllib.parse.urljoin(
+                    txn['url'], txn['response_headers']['location'])

         final_bounces = []
         for txn in self.transactions:
@@ -100,6 +103,7 @@ class BrozzlerWorker:
     logger = logging.getLogger(__module__ + "." + __qualname__)

     HEARTBEAT_INTERVAL = 20.0
+    SITE_SESSION_MINUTES = 15

     def __init__(
             self, frontier, service_registry=None, max_browsers=1,
@@ -188,7 +192,7 @@ class BrozzlerWorker:
         # in case youtube-dl takes a long time, heartbeat site.last_claimed
         # to prevent another brozzler-worker from claiming the site
         try:
-            if site.rr and doublethink.utcnow() - site.last_claimed > datetime.timedelta(minutes=7):
+            if site.rr and doublethink.utcnow() - site.last_claimed > datetime.timedelta(minutes=self.SITE_SESSION_MINUTES):
                 self.logger.debug(
                         'heartbeating site.last_claimed to prevent another '
                         'brozzler-worker claiming this site id=%r', site.id)
@@ -498,13 +502,16 @@ class BrozzlerWorker:
     def brozzle_site(self, browser, site):
         try:
+            site.last_claimed_by = '%s:%s' % (
+                    socket.gethostname(), browser.chrome.port)
+            site.save()
             start = time.time()
             page = None
             self._frontier.honor_stop_request(site)
             self.logger.info(
                     "brozzling site (proxy=%r) %r",
                     self._proxy_for(site), site)
-            while time.time() - start < 7 * 60:
+            while time.time() - start < self.SITE_SESSION_MINUTES * 60:
                 site.refresh()
                 self._frontier.honor_stop_request(site)
                 page = self._frontier.claim_page(site, "%s:%s" % (
@@ -517,8 +524,9 @@ class BrozzlerWorker:
                     page.blocked_by_robots = True
                     self._frontier.completed_page(site, page)
                 else:
-                    outlinks = self.brozzle_page(browser, site, page,
-                        enable_youtube_dl=not self._skip_youtube_dl)
+                    outlinks = self.brozzle_page(
+                            browser, site, page,
+                            enable_youtube_dl=not self._skip_youtube_dl)
                     self._frontier.completed_page(site, page)
                     self._frontier.scope_and_schedule_outlinks(
                             site, page, outlinks)
@@ -598,36 +606,50 @@ class BrozzlerWorker:
             if due:
                 self._service_heartbeat()

+    def _start_browsing_some_sites(self):
+        '''
+        Starts browsing some sites.
+
+        Raises:
+            NoBrowsersAvailable if none available
+        '''
+        browsers = self._browser_pool.acquire_multi(
+                (self._browser_pool.num_available() + 1) // 2)
+        try:
+            sites = self._frontier.claim_sites(len(browsers))
+        except:
+            self._browser_pool.release_all(browsers)
+            raise
+        for i in range(len(browsers)):
+            if i < len(sites):
+                th = threading.Thread(
+                        target=self._brozzle_site_thread_target,
+                        args=(browsers[i], sites[i]),
+                        name="BrozzlingThread:%s" % browsers[i].chrome.port,
+                        daemon=True)
+                with self._browsing_threads_lock:
+                    self._browsing_threads.add(th)
+                th.start()
+            else:
+                self._browser_pool.release(browsers[i])
+
     def run(self):
         self.logger.info("brozzler worker starting")
         try:
-            latest_state = None
             while not self._shutdown.is_set():
                 self._service_heartbeat_if_due()
                 try:
-                    browser = self._browser_pool.acquire()
-                    try:
-                        site = self._frontier.claim_site("%s:%s" % (
-                            socket.gethostname(), browser.chrome.port))
-                        th = threading.Thread(
-                                target=self._brozzle_site_thread_target,
-                                args=(browser, site),
-                                name="BrozzlingThread:%s" % browser.chrome.port,
-                                daemon=True)
-                        with self._browsing_threads_lock:
-                            self._browsing_threads.add(th)
-                        th.start()
-                    except:
-                        self._browser_pool.release(browser)
-                        raise
+                    self._start_browsing_some_sites()
                 except brozzler.browser.NoBrowsersAvailable:
-                    if latest_state != "browsers-busy":
-                        self.logger.info(
-                                "all %s browsers are busy", self._max_browsers)
-                        latest_state = "browsers-busy"
+                    logging.trace(
+                            "all %s browsers are in use", self._max_browsers)
                 except brozzler.NothingToClaim:
-                    pass
+                    logging.trace(
+                            "all active sites are already claimed by a "
+                            "brozzler worker")
                 time.sleep(0.5)

             self.logger.info("shutdown requested")
         except r.ReqlError as e:
             self.logger.error(
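
A note on the YoutubeDLSpy change above (not part of the diff): a relative
Location header such as '/b' was previously used verbatim as the next URL in
the redirect chain; resolving it against the request URL fixes that, which is
what the new test_needs_browsing in tests/test_units.py exercises:

    import urllib.parse

    urllib.parse.urljoin('http://example.com/a', '/b')
    # -> 'http://example.com/b'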

setup.py

@@ -32,7 +32,7 @@ def find_package_data(package):

 setuptools.setup(
         name='brozzler',
-        version='1.1b12.dev279',
+        version='1.1b13.dev285',
         description='Distributed web crawling with browsers',
         url='https://github.com/internetarchive/brozzler',
         author='Noah Levitt',
@@ -79,8 +79,8 @@ setuptools.setup(
         extras_require={
             'dashboard': ['flask>=0.11', 'gunicorn'],
             'easy': [
-                'warcprox>=2.1b1.dev87',
-                'pywb',
+                'warcprox>=2.4b1.dev145',
+                'pywb<2',
                 'flask>=0.11',
                 'gunicorn'
             ],

tests/test_cluster.py

@@ -661,11 +661,10 @@ def test_warcprox_outage_resiliency(httpd):
     opts = warcprox.Options()
     opts.address = '0.0.0.0'
     opts.port = 0
+    opts.rethinkdb_services_url = 'rethinkdb://localhost/brozzler/services'

-    warcprox1 = warcprox.controller.WarcproxController(
-            service_registry=svcreg, options=opts)
-    warcprox2 = warcprox.controller.WarcproxController(
-            service_registry=svcreg, options=opts)
+    warcprox1 = warcprox.controller.WarcproxController(opts)
+    warcprox2 = warcprox.controller.WarcproxController(opts)

     warcprox1_thread = threading.Thread(
             target=warcprox1.run_until_shutdown, name='warcprox1')
     warcprox2_thread = threading.Thread(

tests/test_frontier.py

@@ -826,30 +826,34 @@ def test_claim_site():
     rr.table('sites').delete().run()  # clean slate

     with pytest.raises(brozzler.NothingToClaim):
-        claimed_site = frontier.claim_site(worker_id='test_claim_site')
+        claimed_site = frontier.claim_sites()

     site = brozzler.Site(rr, {'seed': 'http://example.org/'})
     brozzler.new_site(frontier, site)

-    claimed_site = frontier.claim_site(worker_id='test_claim_site')
+    claimed_sites = frontier.claim_sites()
+    assert len(claimed_sites) == 1
+    claimed_site = claimed_sites[0]
     assert claimed_site.id == site.id
     assert claimed_site.claimed
     assert claimed_site.last_claimed >= doublethink.utcnow() - datetime.timedelta(minutes=1)
     with pytest.raises(brozzler.NothingToClaim):
-        claimed_site = frontier.claim_site(worker_id='test_claim_site')
+        claimed_site = frontier.claim_sites()

     # site last_claimed less than 1 hour ago still not to be reclaimed
     claimed_site.last_claimed = doublethink.utcnow() - datetime.timedelta(minutes=55)
     claimed_site.save()
     with pytest.raises(brozzler.NothingToClaim):
-        claimed_site = frontier.claim_site(worker_id='test_claim_site')
+        claimed_site = frontier.claim_sites()

     # site last_claimed more than 1 hour ago can be reclaimed
     site = claimed_site
     claimed_site = None
     site.last_claimed = doublethink.utcnow() - datetime.timedelta(minutes=65)
     site.save()
-    claimed_site = frontier.claim_site(worker_id='test_claim_site')
+    claimed_sites = frontier.claim_sites()
+    assert len(claimed_sites) == 1
+    claimed_site = claimed_sites[0]
     assert claimed_site.id == site.id

     # clean up

tests/test_units.py

@@ -310,3 +310,30 @@ def test_thread_raise_second_with_block():
     th.join()
     assert isinstance(thread_caught_exception, Exception2)

+def test_needs_browsing():
+    # only one test case here right now, which exposed a bug
+
+    class ConvenientHeaders(http.client.HTTPMessage):
+        def __init__(self, headers):
+            http.client.HTTPMessage.__init__(self)
+            for (k, v) in headers.items():
+                self.add_header(k, v)
+
+    page = brozzler.Page(None, {
+        'url': 'http://example.com/a'})
+
+    spy = brozzler.worker.YoutubeDLSpy()
+    spy.transactions.append({
+        'url': 'http://example.com/a',
+        'method': 'HEAD',
+        'status_code': 301,
+        'response_headers': ConvenientHeaders({'Location': '/b'})})
+    spy.transactions.append({
+        'url': 'http://example.com/b',
+        'method': 'GET',
+        'status_code': 200,
+        'response_headers': ConvenientHeaders({
+            'Content-Type': 'application/pdf'})})
+
+    assert not brozzler.worker.BrozzlerWorker._needs_browsing(None, page, spy)

vagrant/Vagrantfile

@@ -15,4 +15,8 @@ Vagrant.configure(2) do |config|
     ansible.inventory_path = "../ansible/hosts-vagrant"
     ansible.playbook = "../ansible/playbook.yml"
   end
+
+  config.vm.provider 'virtualbox' do |v|
+    v.name = 'brozzler-test-vm'
+  end
 end