diff --git a/.travis.yml b/.travis.yml index cef3d80..80b82d4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,7 +9,7 @@ before_install: - sudo pip install ansible==2.1.3.0 install: - ansible-playbook --extra-vars="brozzler_pip_name=file://$TRAVIS_BUILD_DIR#egg=brozzler user=travis" --inventory-file=ansible/hosts-localhost ansible/playbook.yml -- pip install $TRAVIS_BUILD_DIR 'warcprox==2.3' pytest +- pip install $TRAVIS_BUILD_DIR 'warcprox>=2.4b1' pytest script: - DISPLAY=:1 py.test -v tests after_failure: diff --git a/README.rst b/README.rst index 6dee908..2e6a3a9 100644 --- a/README.rst +++ b/README.rst @@ -22,7 +22,7 @@ Requirements - Python 3.4 or later - RethinkDB deployment -- Chromium or Google Chrome browser +- Chromium or Google Chrome >= version 64 Worth noting is that the browser requires a graphical environment to run. You already have this on your laptop, but on a server it will probably require diff --git a/ansible/roles/pywb/tasks/main.yml b/ansible/roles/pywb/tasks/main.yml index 975359f..16b9ea7 100644 --- a/ansible/roles/pywb/tasks/main.yml +++ b/ansible/roles/pywb/tasks/main.yml @@ -5,6 +5,7 @@ become: true - name: install pywb in virtualenv pip: name=pywb + version=0.33.2 virtualenv={{venv_root}}/pywb-ve34 virtualenv_python=python3.4 extra_args='--no-input --upgrade --pre --cache-dir=/tmp/pip-cache' diff --git a/brozzler/__init__.py b/brozzler/__init__.py index c44f72f..baf5592 100644 --- a/brozzler/__init__.py +++ b/brozzler/__init__.py @@ -67,6 +67,21 @@ logging.Logger.trace = _logger_trace logging._levelToName[TRACE] = 'TRACE' logging._nameToLevel['TRACE'] = TRACE +# see https://github.com/internetarchive/brozzler/issues/91 +def _logging_handler_handle(self, record): + rv = self.filter(record) + if rv: + try: + self.acquire() + self.emit(record) + finally: + try: + self.release() + except: + pass + return rv +logging.Handler.handle = _logging_handler_handle + _behaviors = None def behaviors(behaviors_dir=None): """Return list of JS behaviors loaded from YAML file. diff --git a/brozzler/browser.py b/brozzler/browser.py index a4af3e8..1014961 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -61,6 +61,33 @@ class BrowserPool: self._in_use = set() self._lock = threading.Lock() + def _fresh_browser(self): + # choose available port + sock = socket.socket() + sock.bind(('0.0.0.0', 0)) + port = sock.getsockname()[1] + sock.close() + + browser = Browser(port=port, **self.kwargs) + return browser + + def acquire_multi(self, n=1): + ''' + Returns a list of up to `n` browsers. + + Raises: + NoBrowsersAvailable if none available + ''' + browsers = [] + with self._lock: + if len(self._in_use) >= self.size: + raise NoBrowsersAvailable + while len(self._in_use) < self.size and len(browsers) < n: + browser = self._fresh_browser() + browsers.append(browser) + self._in_use.add(browser) + return browsers + def acquire(self): ''' Returns an available instance. @@ -74,14 +101,7 @@ class BrowserPool: with self._lock: if len(self._in_use) >= self.size: raise NoBrowsersAvailable - - # choose available port - sock = socket.socket() - sock.bind(('0.0.0.0', 0)) - port = sock.getsockname()[1] - sock.close() - - browser = Browser(port=port, **self.kwargs) + browser = self._fresh_browser() self._in_use.add(browser) return browser @@ -90,6 +110,13 @@ class BrowserPool: with self._lock: self._in_use.remove(browser) + def release_all(self, browsers): + for browser in browsers: + browser.stop() # make sure + with self._lock: + for browser in browsers: + self._in_use.remove(browser) + def shutdown_now(self): self.logger.info( 'shutting down browser pool (%s browsers in use)', @@ -162,7 +189,8 @@ class WebsockReceiverThread(threading.Thread): # ping_timeout is used as the timeout for the call to select.select() # in addition to its documented purpose, and must have a value to avoid # hangs in certain situations - self.websock.run_forever(ping_timeout=0.5) + self.websock.run_forever(sockopt=((socket.IPPROTO_TCP, socket.TCP_NODELAY, 1),), + ping_timeout=0.5) def _on_message(self, websock, message): try: @@ -172,21 +200,6 @@ class WebsockReceiverThread(threading.Thread): 'uncaught exception in _handle_message message=%s', message, exc_info=True) - def _debugger_paused(self, message): - # we hit the breakpoint set in start(), get rid of google analytics - self.logger.debug('debugger paused! message=%s', message) - scriptId = message['params']['callFrames'][0]['location']['scriptId'] - - # replace script - self.websock.send( - json.dumps(dict( - id=0, method='Debugger.setScriptSource', - params={'scriptId': scriptId, - 'scriptSource': 'console.log("google analytics is no more!");'}))) - - # resume execution - self.websock.send(json.dumps(dict(id=0, method='Debugger.resume'))) - def _network_response_received(self, message): if (message['params']['response']['status'] == 420 and 'Warcprox-Meta' in CaseInsensitiveDict( @@ -227,8 +240,6 @@ class WebsockReceiverThread(threading.Thread): elif message['method'] == 'Network.requestWillBeSent': if self.on_request: self.on_request(message) - elif message['method'] == 'Debugger.paused': - self._debugger_paused(message) elif message['method'] == 'Page.interstitialShown': # for AITFIVE-1529: handle http auth # for now, we should consider killing the browser when we receive Page.interstitialShown and @@ -330,16 +341,14 @@ class Browser: self.send_to_chrome(method='Network.enable') self.send_to_chrome(method='Page.enable') self.send_to_chrome(method='Console.enable') - self.send_to_chrome(method='Debugger.enable') self.send_to_chrome(method='Runtime.enable') - # disable google analytics, see _handle_message() where breakpoint - # is caught Debugger.paused + # disable google analytics self.send_to_chrome( - method='Debugger.setBreakpointByUrl', - params={ - 'lineNumber': 1, - 'urlRegex': 'https?://www.google-analytics.com/analytics.js'}) + method='Network.setBlockedURLs', + params={'urls': ['*google-analytics.com/analytics.js', + '*google-analytics.com/ga.js']} + ) def stop(self): ''' @@ -549,7 +558,7 @@ class Browser: 'problem extracting outlinks, result message: %s', message) return frozenset() - def screenshot(self, timeout=30): + def screenshot(self, timeout=90): self.logger.info('taking screenshot') self.websock_thread.expect_result(self._command_id.peek()) msg_id = self.send_to_chrome(method='Page.captureScreenshot') diff --git a/brozzler/chrome.py b/brozzler/chrome.py index 3b44773..1855bec 100644 --- a/brozzler/chrome.py +++ b/brozzler/chrome.py @@ -29,6 +29,35 @@ import signal import sqlite3 import json import tempfile +import sys + +def check_version(chrome_exe): + ''' + Raises SystemExit if `chrome_exe` is not a supported browser version. + + Must run in the main thread to have the desired effect. + ''' + # mac$ /Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome --version + # Google Chrome 64.0.3282.140 + # mac$ /Applications/Google\ Chrome\ Canary.app/Contents/MacOS/Google\ Chrome\ Canary --version + # Google Chrome 66.0.3341.0 canary + # linux$ chromium-browser --version + # Using PPAPI flash. + # --ppapi-flash-path=/usr/lib/adobe-flashplugin/libpepflashplayer.so --ppapi-flash-version= + # Chromium 61.0.3163.100 Built on Ubuntu , running on Ubuntu 16.04 + cmd = [chrome_exe, '--version'] + out = subprocess.check_output(cmd, timeout=60) + m = re.search(br'(Chromium|Google Chrome) ([\d.]+)', out) + if not m: + sys.exit( + 'unable to parse browser version from output of ' + '%r: %r' % (subprocess.list2cmdline(cmd), out)) + version_str = m.group(2).decode() + major_version = int(version_str.split('.')[0]) + if major_version < 64: + sys.exit('brozzler requires chrome/chromium version 64 or ' + 'later but %s reports version %s' % ( + chrome_exe, version_str)) class Chrome: logger = logging.getLogger(__module__ + '.' + __qualname__) @@ -279,6 +308,8 @@ class Chrome: 'SIGKILL', self.chrome_process.pid, status) finally: + self.chrome_process.stdout.close() + self.chrome_process.stderr.close() try: self._home_tmpdir.cleanup() except: diff --git a/brozzler/cli.py b/brozzler/cli.py index b4f9417..942fde8 100644 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -167,6 +167,7 @@ def brozzle_page(argv=None): args = arg_parser.parse_args(args=argv[1:]) configure_logging(args) + brozzler.chrome.check_version(args.chrome_exe) behavior_parameters = {} if args.behavior_parameters: @@ -326,6 +327,7 @@ def brozzler_worker(argv=None): args = arg_parser.parse_args(args=argv[1:]) configure_logging(args) + brozzler.chrome.check_version(args.chrome_exe) def dump_state(signum, frame): signal.signal(signal.SIGQUIT, signal.SIG_IGN) diff --git a/brozzler/easy.py b/brozzler/easy.py index e41f013..83cf1ba 100644 --- a/brozzler/easy.py +++ b/brozzler/easy.py @@ -3,7 +3,7 @@ brozzler-easy - brozzler-worker, warcprox, pywb, and brozzler-dashboard all working together in a single process -Copyright (C) 2016 Internet Archive +Copyright (C) 2016-2018 Internet Archive Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -122,8 +122,8 @@ class BrozzlerEasyController: def __init__(self, args): self.stop = threading.Event() self.args = args - self.warcprox_controller = warcprox.main.init_controller( - self._warcprox_args(args)) + self.warcprox_controller = warcprox.controller.WarcproxController( + self._warcprox_opts(args)) self.brozzler_worker = self._init_brozzler_worker(args) self.pywb_httpd = self._init_pywb(args) self.dashboard_httpd = self._init_brozzler_dashboard(args) @@ -221,40 +221,38 @@ class BrozzlerEasyController: finally: self.shutdown() - def _warcprox_args(self, args): + def _warcprox_opts(self, args): ''' Takes args as produced by the argument parser built by _build_arg_parser and builds warcprox arguments object suitable to pass to warcprox.main.init_controller. Copies some arguments, renames some, populates some with defaults appropriate for brozzler-easy, etc. ''' - warcprox_args = argparse.Namespace() - warcprox_args.address = 'localhost' + warcprox_opts = warcprox.Options() + warcprox_opts.address = 'localhost' # let the OS choose an available port; discover it later using # sock.getsockname()[1] - warcprox_args.port = 0 - warcprox_args.cacert = args.cacert - warcprox_args.certs_dir = args.certs_dir - warcprox_args.directory = args.warcs_dir - warcprox_args.gzip = True - warcprox_args.prefix = 'brozzler' - warcprox_args.size = 1000 * 1000* 1000 - warcprox_args.rollover_idle_time = 3 * 60 - warcprox_args.digest_algorithm = 'sha1' - warcprox_args.base32 = True - warcprox_args.stats_db_file = None - warcprox_args.playback_port = None - warcprox_args.playback_index_db_file = None - warcprox_args.rethinkdb_servers = args.rethinkdb_servers - warcprox_args.rethinkdb_db = args.rethinkdb_db - warcprox_args.rethinkdb_big_table = True - warcprox_args.kafka_broker_list = None - warcprox_args.kafka_capture_feed_topic = None - warcprox_args.queue_size = 500 - warcprox_args.max_threads = None - warcprox_args.profile = False - warcprox_args.onion_tor_socks_proxy = args.onion_tor_socks_proxy - return warcprox_args + warcprox_opts.port = 0 + warcprox_opts.cacert = args.cacert + warcprox_opts.certs_dir = args.certs_dir + warcprox_opts.directory = args.warcs_dir + warcprox_opts.gzip = True + warcprox_opts.prefix = 'brozzler' + warcprox_opts.size = 1000 * 1000* 1000 + warcprox_opts.rollover_idle_time = 3 * 60 + warcprox_opts.digest_algorithm = 'sha1' + warcprox_opts.base32 = True + warcprox_opts.stats_db_file = None + warcprox_opts.playback_port = None + warcprox_opts.playback_index_db_file = None + warcprox_opts.rethinkdb_big_table_url = ( + 'rethinkdb://%s/%s/captures' % ( + args.rethinkdb_servers, args.rethinkdb_db)) + warcprox_opts.queue_size = 500 + warcprox_opts.max_threads = None + warcprox_opts.profile = False + warcprox_opts.onion_tor_socks_proxy = args.onion_tor_socks_proxy + return warcprox_opts def dump_state(self, signum=None, frame=None): state_strs = [] @@ -270,6 +268,7 @@ def main(argv=None): arg_parser = _build_arg_parser(argv) args = arg_parser.parse_args(args=argv[1:]) brozzler.cli.configure_logging(args) + brozzler.chrome.check_version(args.chrome_exe) controller = BrozzlerEasyController(args) signal.signal(signal.SIGTERM, lambda a,b: controller.stop.set()) diff --git a/brozzler/frontier.py b/brozzler/frontier.py index 3bd6221..ee43966 100644 --- a/brozzler/frontier.py +++ b/brozzler/frontier.py @@ -93,7 +93,7 @@ class RethinkDbFrontier: raise UnexpectedDbResult("expected %r to be %r in %r" % ( k, expected, result)) - def claim_site(self, worker_id): + def claim_sites(self, n=1): # XXX keep track of aggregate priority and prioritize sites accordingly? while True: result = ( @@ -104,34 +104,43 @@ class RethinkDbFrontier: .order_by(index="sites_last_disclaimed") .filter((r.row["claimed"] != True) | ( r.row["last_claimed"] < r.now() - 60*60)) - .limit(1) + .limit(n) .update( # try to avoid a race condition resulting in multiple # brozzler-workers claiming the same site # see https://github.com/rethinkdb/rethinkdb/issues/3235#issuecomment-60283038 r.branch((r.row["claimed"] != True) | ( r.row["last_claimed"] < r.now() - 60*60), { - "claimed": True, "last_claimed_by": worker_id, + "claimed": True, "last_claimed": doublethink.utcnow()}, {}), return_changes=True)).run() - self._vet_result(result, replaced=[0,1], unchanged=[0,1]) - if result["replaced"] == 1: - if result["changes"][0]["old_val"]["claimed"]: + + self._vet_result( + result, replaced=list(range(n+1)), + unchanged=list(range(n+1))) + sites = [] + for i in range(result["replaced"]): + if result["changes"][i]["old_val"]["claimed"]: self.logger.warn( "re-claimed site that was still marked 'claimed' " "because it was last claimed a long time ago " "at %s, and presumably some error stopped it from " "being disclaimed", - result["changes"][0]["old_val"]["last_claimed"]) - site = brozzler.Site(self.rr, result["changes"][0]["new_val"]) - else: + result["changes"][i]["old_val"]["last_claimed"]) + site = brozzler.Site(self.rr, result["changes"][i]["new_val"]) + sites.append(site) + if not sites: raise brozzler.NothingToClaim # XXX This is the only place we enforce time limit for now. Worker # loop should probably check time limit. Maybe frontier needs a # housekeeping thread to ensure that time limits get enforced in a # timely fashion. - if not self._enforce_time_limit(site): - return site + for site in list(sites): + if self._enforce_time_limit(site): + sites.remove(site) + if sites: + return sites + # else try again def _enforce_time_limit(self, site): if (site.time_limit and site.time_limit > 0 diff --git a/brozzler/worker.py b/brozzler/worker.py index 268374e..9bd6169 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -37,6 +37,7 @@ import urlcanon from requests.structures import CaseInsensitiveDict import rethinkdb as r import datetime +import urllib.parse class ExtraHeaderAdder(urllib.request.BaseHandler): def __init__(self, extra_headers): @@ -87,7 +88,9 @@ class YoutubeDLSpy(urllib.request.BaseHandler): final_url = url while final_url in redirects: - final_url = redirects.pop(final_url)['response_headers']['location'] + txn = redirects.pop(final_url) + final_url = urllib.parse.urljoin( + txn['url'], txn['response_headers']['location']) final_bounces = [] for txn in self.transactions: @@ -100,6 +103,7 @@ class BrozzlerWorker: logger = logging.getLogger(__module__ + "." + __qualname__) HEARTBEAT_INTERVAL = 20.0 + SITE_SESSION_MINUTES = 15 def __init__( self, frontier, service_registry=None, max_browsers=1, @@ -188,7 +192,7 @@ class BrozzlerWorker: # in case youtube-dl takes a long time, heartbeat site.last_claimed # to prevent another brozzler-worker from claiming the site try: - if site.rr and doublethink.utcnow() - site.last_claimed > datetime.timedelta(minutes=7): + if site.rr and doublethink.utcnow() - site.last_claimed > datetime.timedelta(minutes=self.SITE_SESSION_MINUTES): self.logger.debug( 'heartbeating site.last_claimed to prevent another ' 'brozzler-worker claiming this site id=%r', site.id) @@ -498,13 +502,16 @@ class BrozzlerWorker: def brozzle_site(self, browser, site): try: + site.last_claimed_by = '%s:%s' % ( + socket.gethostname(), browser.chrome.port) + site.save() start = time.time() page = None self._frontier.honor_stop_request(site) self.logger.info( "brozzling site (proxy=%r) %r", self._proxy_for(site), site) - while time.time() - start < 7 * 60: + while time.time() - start < self.SITE_SESSION_MINUTES * 60: site.refresh() self._frontier.honor_stop_request(site) page = self._frontier.claim_page(site, "%s:%s" % ( @@ -517,8 +524,9 @@ class BrozzlerWorker: page.blocked_by_robots = True self._frontier.completed_page(site, page) else: - outlinks = self.brozzle_page(browser, site, page, - enable_youtube_dl=not self._skip_youtube_dl) + outlinks = self.brozzle_page( + browser, site, page, + enable_youtube_dl=not self._skip_youtube_dl) self._frontier.completed_page(site, page) self._frontier.scope_and_schedule_outlinks( site, page, outlinks) @@ -598,36 +606,50 @@ class BrozzlerWorker: if due: self._service_heartbeat() + def _start_browsing_some_sites(self): + ''' + Starts browsing some sites. + + Raises: + NoBrowsersAvailable if none available + ''' + browsers = self._browser_pool.acquire_multi( + (self._browser_pool.num_available() + 1) // 2) + try: + sites = self._frontier.claim_sites(len(browsers)) + except: + self._browser_pool.release_all(browsers) + raise + + for i in range(len(browsers)): + if i < len(sites): + th = threading.Thread( + target=self._brozzle_site_thread_target, + args=(browsers[i], sites[i]), + name="BrozzlingThread:%s" % browsers[i].chrome.port, + daemon=True) + with self._browsing_threads_lock: + self._browsing_threads.add(th) + th.start() + else: + self._browser_pool.release(browsers[i]) + def run(self): self.logger.info("brozzler worker starting") try: - latest_state = None while not self._shutdown.is_set(): self._service_heartbeat_if_due() try: - browser = self._browser_pool.acquire() - try: - site = self._frontier.claim_site("%s:%s" % ( - socket.gethostname(), browser.chrome.port)) - th = threading.Thread( - target=self._brozzle_site_thread_target, - args=(browser, site), - name="BrozzlingThread:%s" % browser.chrome.port, - daemon=True) - with self._browsing_threads_lock: - self._browsing_threads.add(th) - th.start() - except: - self._browser_pool.release(browser) - raise + self._start_browsing_some_sites() except brozzler.browser.NoBrowsersAvailable: - if latest_state != "browsers-busy": - self.logger.info( - "all %s browsers are busy", self._max_browsers) - latest_state = "browsers-busy" + logging.trace( + "all %s browsers are in use", self._max_browsers) except brozzler.NothingToClaim: - pass + logging.trace( + "all active sites are already claimed by a " + "brozzler worker") time.sleep(0.5) + self.logger.info("shutdown requested") except r.ReqlError as e: self.logger.error( diff --git a/setup.py b/setup.py index 9e73c33..bbfd2bd 100755 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.1b12.dev279', + version='1.1b13.dev285', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', @@ -79,8 +79,8 @@ setuptools.setup( extras_require={ 'dashboard': ['flask>=0.11', 'gunicorn'], 'easy': [ - 'warcprox>=2.1b1.dev87', - 'pywb', + 'warcprox>=2.4b1.dev145', + 'pywb<2', 'flask>=0.11', 'gunicorn' ], diff --git a/tests/test_cluster.py b/tests/test_cluster.py index 48a9384..32e9734 100644 --- a/tests/test_cluster.py +++ b/tests/test_cluster.py @@ -661,11 +661,10 @@ def test_warcprox_outage_resiliency(httpd): opts = warcprox.Options() opts.address = '0.0.0.0' opts.port = 0 + opts.rethinkdb_services_url = 'rethinkdb://localhost/brozzler/services' - warcprox1 = warcprox.controller.WarcproxController( - service_registry=svcreg, options=opts) - warcprox2 = warcprox.controller.WarcproxController( - service_registry=svcreg, options=opts) + warcprox1 = warcprox.controller.WarcproxController(opts) + warcprox2 = warcprox.controller.WarcproxController(opts) warcprox1_thread = threading.Thread( target=warcprox1.run_until_shutdown, name='warcprox1') warcprox2_thread = threading.Thread( diff --git a/tests/test_frontier.py b/tests/test_frontier.py index fdae726..5170a04 100644 --- a/tests/test_frontier.py +++ b/tests/test_frontier.py @@ -826,30 +826,34 @@ def test_claim_site(): rr.table('sites').delete().run() # clean slate with pytest.raises(brozzler.NothingToClaim): - claimed_site = frontier.claim_site(worker_id='test_claim_site') + claimed_site = frontier.claim_sites() site = brozzler.Site(rr, {'seed': 'http://example.org/'}) brozzler.new_site(frontier, site) - claimed_site = frontier.claim_site(worker_id='test_claim_site') + claimed_sites = frontier.claim_sites() + assert len(claimed_sites) == 1 + claimed_site = claimed_sites[0] assert claimed_site.id == site.id assert claimed_site.claimed assert claimed_site.last_claimed >= doublethink.utcnow() - datetime.timedelta(minutes=1) with pytest.raises(brozzler.NothingToClaim): - claimed_site = frontier.claim_site(worker_id='test_claim_site') + claimed_site = frontier.claim_sites() # site last_claimed less than 1 hour ago still not to be reclaimed claimed_site.last_claimed = doublethink.utcnow() - datetime.timedelta(minutes=55) claimed_site.save() with pytest.raises(brozzler.NothingToClaim): - claimed_site = frontier.claim_site(worker_id='test_claim_site') + claimed_site = frontier.claim_sites() # site last_claimed more than 1 hour ago can be reclaimed site = claimed_site claimed_site = None site.last_claimed = doublethink.utcnow() - datetime.timedelta(minutes=65) site.save() - claimed_site = frontier.claim_site(worker_id='test_claim_site') + claimed_sites = frontier.claim_sites() + assert len(claimed_sites) == 1 + claimed_site = claimed_sites[0] assert claimed_site.id == site.id # clean up diff --git a/tests/test_units.py b/tests/test_units.py index 3f40863..ce5067c 100644 --- a/tests/test_units.py +++ b/tests/test_units.py @@ -310,3 +310,30 @@ def test_thread_raise_second_with_block(): th.join() assert isinstance(thread_caught_exception, Exception2) +def test_needs_browsing(): + # only one test case here right now, which exposed a bug + + class ConvenientHeaders(http.client.HTTPMessage): + def __init__(self, headers): + http.client.HTTPMessage.__init__(self) + for (k, v) in headers.items(): + self.add_header(k, v) + + page = brozzler.Page(None, { + 'url':'http://example.com/a'}) + + spy = brozzler.worker.YoutubeDLSpy() + spy.transactions.append({ + 'url': 'http://example.com/a', + 'method': 'HEAD', + 'status_code': 301, + 'response_headers': ConvenientHeaders({'Location': '/b'})}) + spy.transactions.append({ + 'url': 'http://example.com/b', + 'method': 'GET', + 'status_code': 200, + 'response_headers': ConvenientHeaders({ + 'Content-Type': 'application/pdf'})}) + + assert not brozzler.worker.BrozzlerWorker._needs_browsing(None, page, spy) + diff --git a/vagrant/Vagrantfile b/vagrant/Vagrantfile index f983e7c..73f13fd 100644 --- a/vagrant/Vagrantfile +++ b/vagrant/Vagrantfile @@ -15,4 +15,8 @@ Vagrant.configure(2) do |config| ansible.inventory_path = "../ansible/hosts-vagrant" ansible.playbook = "../ansible/playbook.yml" end + + config.vm.provider 'virtualbox' do |v| + v.name = 'brozzler-test-vm' + end end