From 934190084c73699747cf3f4c4d2ee7e268927eae Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 24 Mar 2017 13:55:23 -0700 Subject: [PATCH] Refactor the way the proxy is configured. Job/site settings "proxy" and "enable_warcprox_features" are gone. Brozzler-worker now has mutually exclusive options --proxy and --warcprox-auto. --warcprox-auto means find an instance of warcprox in the service registry, and enable warcprox features. If --proxy is provided, determines if proxy is warcprox by consulting http://{proxy_address}/status (see https://github.com/internetarchive/warcprox/commit/8caae0d7d3), and enables warcprox features if so. --- README.rst | 6 +- .../templates/brozzler-worker.conf.j2 | 3 +- brozzler/cli.py | 22 +-- brozzler/easy.py | 6 +- brozzler/job_schema.yaml | 13 +- brozzler/robots.py | 14 +- brozzler/site.py | 2 +- brozzler/worker.py | 85 +++++---- job-conf.rst | 24 --- setup.py | 4 +- tests/test_cluster.py | 169 +++++++++++++----- tests/test_frontier.py | 7 +- vagrant/README.rst | 4 +- vagrant/vagrant-brozzler-new-site.py | 2 +- 14 files changed, 208 insertions(+), 153 deletions(-) diff --git a/README.rst b/README.rst index b986d71..9632761 100644 --- a/README.rst +++ b/README.rst @@ -78,7 +78,7 @@ Launch one or more workers: :: - brozzler-worker + brozzler-worker --warcprox-auto Submit jobs: @@ -90,8 +90,7 @@ Submit sites not tied to a job: :: - brozzler-new-site --proxy=localhost:8000 --enable-warcprox-features \ - --time-limit=600 http://example.com/ + brozzler-new-site --time-limit=600 http://example.com/ Job Configuration ----------------- @@ -106,7 +105,6 @@ everything else is optional. For details, see ``_. 
time_limit: 60 # seconds proxy: 127.0.0.1:8000 # point at warcprox for archiving ignore_robots: false - enable_warcprox_features: false warcprox_meta: null metadata: {} seeds: diff --git a/ansible/roles/brozzler-worker/templates/brozzler-worker.conf.j2 b/ansible/roles/brozzler-worker/templates/brozzler-worker.conf.j2 index 3d8906e..ccb6716 100644 --- a/ansible/roles/brozzler-worker/templates/brozzler-worker.conf.j2 +++ b/ansible/roles/brozzler-worker/templates/brozzler-worker.conf.j2 @@ -20,4 +20,5 @@ kill timeout 60 exec nice brozzler-worker \ --rethinkdb-servers={{groups['rethinkdb'] | join(',')}} \ - --max-browsers=4 + --max-browsers=4 \ + --warcprox-auto diff --git a/brozzler/cli.py b/brozzler/cli.py index aca352a..a0f5027 100644 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -81,15 +81,6 @@ def rethinker(args): 'BROZZLER_RETHINKDB_DB') or 'brozzler' return doublethink.Rethinker(servers.split(','), db) -def _add_proxy_options(arg_parser): - arg_parser.add_argument( - '--proxy', dest='proxy', default=None, help='http proxy') - arg_parser.add_argument( - '--enable-warcprox-features', dest='enable_warcprox_features', - action='store_true', default=None, help=( - 'enable special features that assume the configured proxy is ' - 'warcprox')) - def configure_logging(args): logging.basicConfig( stream=sys.stderr, level=args.log_level, format=( @@ -159,7 +150,8 @@ def brozzle_page(): arg_parser.add_argument( '--password', dest='password', default=None, help='use this password to try to log in if a login form is found') - _add_proxy_options(arg_parser) + arg_parser.add_argument( + '--proxy', dest='proxy', default=None, help='http proxy') add_common_options(arg_parser) args = arg_parser.parse_args(args=sys.argv[1:]) @@ -170,7 +162,6 @@ def brozzle_page(): behavior_parameters = json.loads(args.behavior_parameters) site = brozzler.Site(None, { 'id': -1, 'seed': args.url, 'proxy': args.proxy, - 'enable_warcprox_features': args.enable_warcprox_features, 'behavior_parameters': 
behavior_parameters, 'username': args.username, 'password': args.password}) page = brozzler.Page(None, {'url': args.url, 'site_id': site.id}) @@ -237,7 +228,6 @@ def brozzler_new_site(): formatter_class=BetterArgumentDefaultsHelpFormatter) arg_parser.add_argument('seed', metavar='SEED', help='seed url') add_rethinkdb_options(arg_parser) - _add_proxy_options(arg_parser) arg_parser.add_argument( '--time-limit', dest='time_limit', default=None, help='time limit in seconds for this site') @@ -273,7 +263,6 @@ def brozzler_new_site(): 'proxy': args.proxy, 'time_limit': int(args.time_limit) if args.time_limit else None, 'ignore_robots': args.ignore_robots, - 'enable_warcprox_features': args.enable_warcprox_features, 'warcprox_meta': json.loads( args.warcprox_meta) if args.warcprox_meta else None, 'behavior_parameters': json.loads( @@ -300,6 +289,13 @@ def brozzler_worker(): arg_parser.add_argument( '-n', '--max-browsers', dest='max_browsers', default='1', help='max number of chrome instances simultaneously browsing pages') + arg_parser.add_argument( + '--proxy', dest='proxy', default=None, help='http proxy') + arg_parser.add_argument( + '--warcprox-auto', dest='warcprox_auto', action='store_true', + help=( + 'when needed, choose an available instance of warcprox from ' + 'the rethinkdb service registry')) add_common_options(arg_parser) args = arg_parser.parse_args(args=sys.argv[1:]) diff --git a/brozzler/easy.py b/brozzler/easy.py index 4b2c2cc..6ba406b 100644 --- a/brozzler/easy.py +++ b/brozzler/easy.py @@ -138,11 +138,9 @@ class BrozzlerEasyController: frontier = brozzler.RethinkDbFrontier(rr) service_registry = doublethink.ServiceRegistry(rr) worker = brozzler.worker.BrozzlerWorker( - frontier, service_registry, - max_browsers=args.max_browsers, - chrome_exe=args.chrome_exe, + frontier, service_registry, chrome_exe=args.chrome_exe, proxy='%s:%s' % self.warcprox_controller.proxy.server_address, - enable_warcprox_features=True) + max_browsers=args.max_browsers) return 
worker def _init_pywb(self, args): diff --git a/brozzler/job_schema.yaml b/brozzler/job_schema.yaml index 9a2ba58..9a65489 100644 --- a/brozzler/job_schema.yaml +++ b/brozzler/job_schema.yaml @@ -9,9 +9,6 @@ id: type: number min: 0 - enable_warcprox_features: - type: boolean - ignore_robots: type: boolean @@ -19,10 +16,6 @@ id: type: dict nullable: true - proxy: - type: string - nullable: true - scope: type: dict schema: @@ -42,7 +35,7 @@ id: type: string regex: - type: string # code up a regex type? + type: string # code up a cerberus regex type? ssurt: type: string @@ -75,10 +68,6 @@ id: max_hops_off_surt: type: integer - # ignored, left for backward compatibility - remember_outlinks: - type: boolean - metadata: type: dict diff --git a/brozzler/robots.py b/brozzler/robots.py index 26329d1..54ee706 100644 --- a/brozzler/robots.py +++ b/brozzler/robots.py @@ -47,20 +47,22 @@ def _reppy_rules_getitem(self, agent): reppy.parser.Rules.__getitem__ = _reppy_rules_getitem _robots_caches = {} # {site_id:reppy.cache.RobotsCache} -def _robots_cache(site): +def _robots_cache(site, proxy=None): class SessionRaiseOn420(requests.Session): def get(self, url, *args, **kwargs): res = super().get(url, *args, **kwargs) if res.status_code == 420 and 'warcprox-meta' in res.headers: - raise brozzler.ReachedLimit(warcprox_meta=json.loads(res.headers['warcprox-meta']), http_payload=res.text) + raise brozzler.ReachedLimit( + warcprox_meta=json.loads(res.headers['warcprox-meta']), + http_payload=res.text) else: return res if not site.id in _robots_caches: req_sesh = SessionRaiseOn420() req_sesh.verify = False # ignore cert errors - if site.proxy: - proxie = "http://{}".format(site.proxy) + if proxy: + proxie = "http://%s" % proxy req_sesh.proxies = {"http":proxie,"https":proxie} if site.extra_headers(): req_sesh.headers.update(site.extra_headers()) @@ -70,14 +72,14 @@ def _robots_cache(site): return _robots_caches[site.id] -def is_permitted_by_robots(site, url): +def 
is_permitted_by_robots(site, url, proxy=None): if site.ignore_robots: return True tries_left = 10 while True: try: - result = _robots_cache(site).allowed( + result = _robots_cache(site, proxy).allowed( url, site.user_agent or "brozzler") return result except BaseException as e: diff --git a/brozzler/site.py b/brozzler/site.py index 8a36256..7580e17 100644 --- a/brozzler/site.py +++ b/brozzler/site.py @@ -80,7 +80,7 @@ class Site(doublethink.Document): def extra_headers(self): hdrs = {} - if self.enable_warcprox_features and self.warcprox_meta: + if self.warcprox_meta: hdrs["Warcprox-Meta"] = json.dumps( self.warcprox_meta, separators=(',', ':')) return hdrs diff --git a/brozzler/worker.py b/brozzler/worker.py index 97f23d9..b9ad609 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -101,15 +101,15 @@ class BrozzlerWorker: def __init__( self, frontier, service_registry=None, max_browsers=1, - chrome_exe="chromium-browser", proxy=None, - enable_warcprox_features=False): + chrome_exe="chromium-browser", warcprox_auto=False, proxy=None): self._frontier = frontier self._service_registry = service_registry self._max_browsers = max_browsers - # these two settings can be overridden by the job/site configuration - self._default_proxy = proxy - self._default_enable_warcprox_features = enable_warcprox_features + self._warcprox_auto = warcprox_auto + self._proxy = proxy + assert not (warcprox_auto and proxy) + self._proxy_is_warcprox = None self._browser_pool = brozzler.browser.BrowserPool( max_browsers, chrome_exe=chrome_exe, ignore_cert_errors=True) @@ -119,12 +119,12 @@ class BrozzlerWorker: self._thread = None self._start_stop_lock = threading.Lock() - def _proxy(self, site): - if site.proxy: + def _proxy_for(self, site): + if self._proxy: + return self._proxy + elif site.proxy: return site.proxy - elif self._default_proxy: - return self._default_proxy - elif self._service_registry and self._enable_warcprox_features(site): + elif self._warcprox_auto: svc = 
self._service_registry.available_service('warcprox') if svc is None: raise Exception( @@ -138,11 +138,21 @@ class BrozzlerWorker: return site.proxy return None - def _enable_warcprox_features(self, site): - if site.enable_warcprox_features is not None: - return site.enable_warcprox_features + def _using_warcprox(self, site): + if self._proxy: + if self._proxy_is_warcprox is None: + try: + response = requests.get('http://%s/status' % self._proxy) + status = json.loads(response.text) + self._proxy_is_warcprox = (status['role'] == 'warcprox') + except Exception as e: + self._proxy_is_warcprox = False + logging.info( + '%s %s warcprox', self._proxy, + 'IS' if self._proxy_is_warcprox else 'IS NOT') + return self._proxy_is_warcprox else: - return self._default_enable_warcprox_features + return bool(site.proxy or self._warcprox_auto) def _youtube_dl(self, destdir, site): ydl_opts = { @@ -156,12 +166,12 @@ class BrozzlerWorker: "nopart": True, "no_color": True, } - if self._proxy(site): - ydl_opts["proxy"] = "http://{}".format(self._proxy(site)) + if self._proxy_for(site): + ydl_opts["proxy"] = "http://{}".format(self._proxy_for(site)) ## XXX (sometimes?) causes chrome debug websocket to go through ## proxy. Maybe not needed thanks to hls_prefer_native. 
## # see https://github.com/rg3/youtube-dl/issues/6087 - ## os.environ["http_proxy"] = "http://{}".format(self._proxy(site)) + ## os.environ["http_proxy"] = "http://{}".format(self._proxy_for(site)) ydl = youtube_dl.YoutubeDL(ydl_opts) if site.extra_headers(): ydl._opener.add_handler(ExtraHeaderAdder(site.extra_headers())) @@ -224,13 +234,13 @@ class BrozzlerWorker: info = ydl.extract_info(page.url) self._remember_videos(page, ydl.brozzler_spy) # logging.info('XXX %s', json.dumps(info)) - if self._proxy(site) and self._enable_warcprox_features(site): + if self._using_warcprox(site): info_json = json.dumps(info, sort_keys=True, indent=4) self.logger.info( "sending WARCPROX_WRITE_RECORD request to warcprox " "with youtube-dl json for %s", page) self._warcprox_write_record( - warcprox_address=self._proxy(site), + warcprox_address=self._proxy_for(site), url="youtube-dl:%s" % str(urlcanon.semantic(page.url)), warc_type="metadata", content_type="application/vnd.youtube-dl_formats+json;charset=utf-8", @@ -303,20 +313,20 @@ class BrozzlerWorker: def _on_screenshot(screenshot_png): if on_screenshot: on_screenshot(screenshot_png) - if self._proxy(site) and self._enable_warcprox_features(site): + if self._using_warcprox(site): self.logger.info( "sending WARCPROX_WRITE_RECORD request to %s with " - "screenshot for %s", self._proxy(site), page) + "screenshot for %s", self._proxy_for(site), page) screenshot_jpeg, thumbnail_jpeg = self.full_and_thumb_jpegs( screenshot_png) self._warcprox_write_record( - warcprox_address=self._proxy(site), + warcprox_address=self._proxy_for(site), url="screenshot:%s" % str(urlcanon.semantic(page.url)), warc_type="resource", content_type="image/jpeg", payload=screenshot_jpeg, extra_headers=site.extra_headers()) self._warcprox_write_record( - warcprox_address=self._proxy(site), + warcprox_address=self._proxy_for(site), url="thumbnail:%s" % str(urlcanon.semantic(page.url)), warc_type="resource", content_type="image/jpeg", payload=thumbnail_jpeg, @@ 
-347,7 +357,8 @@ class BrozzlerWorker: if not browser.is_running(): browser.start( - proxy=self._proxy(site), cookie_db=site.get('cookie_db')) + proxy=self._proxy_for(site), + cookie_db=site.get('cookie_db')) final_page_url, outlinks = browser.browse_page( page.url, extra_headers=site.extra_headers(), behavior_parameters=site.get('behavior_parameters'), @@ -360,10 +371,10 @@ class BrozzlerWorker: def _fetch_url(self, site, page): proxies = None - if self._proxy(site): + if self._proxy_for(site): proxies = { - 'http': 'http://%s' % self._proxy(site), - 'https': 'http://%s' % self._proxy(site), + 'http': 'http://%s' % self._proxy_for(site), + 'https': 'http://%s' % self._proxy_for(site), } self.logger.info('fetching %s', page) @@ -388,17 +399,19 @@ class BrozzlerWorker: return True return False - def _brozzle_site(self, browser, site): - page = None + def brozzle_site(self, browser, site): try: + page = None start = time.time() while time.time() - start < 7 * 60: + site.refresh() self._frontier.honor_stop_request(site.job_id) page = self._frontier.claim_page(site, "%s:%s" % ( socket.gethostname(), browser.chrome.port)) if (page.needs_robots_check and - not brozzler.is_permitted_by_robots(site, page.url)): + not brozzler.is_permitted_by_robots( + site, page.url, self._proxy_for(site))): logging.warn("page %s is blocked by robots.txt", page.url) page.blocked_by_robots = True self._frontier.completed_page(site, page) @@ -424,8 +437,13 @@ class BrozzlerWorker: except: self.logger.critical("unexpected exception", exc_info=True) finally: - browser.stop() self._frontier.disclaim_site(site, page) + + def _brozzle_site_thread_target(self, browser, site): + try: + self.brozzle_site(browser, site) + finally: + browser.stop() self._browser_pool.release(browser) with self._browsing_threads_lock: self._browsing_threads.remove(threading.current_thread()) @@ -477,9 +495,10 @@ class BrozzlerWorker: socket.gethostname(), browser.chrome.port)) self.logger.info( "brozzling site 
(proxy=%s) %s", - repr(self._proxy(site)), site) + repr(self._proxy_for(site)), site) th = threading.Thread( - target=self._brozzle_site, args=(browser, site), + target=self._brozzle_site_thread_target, + args=(browser, site), name="BrozzlingThread:%s" % browser.chrome.port, daemon=True) with self._browsing_threads_lock: diff --git a/job-conf.rst b/job-conf.rst index c36ab05..9d8c209 100644 --- a/job-conf.rst +++ b/job-conf.rst @@ -14,7 +14,6 @@ an example time_limit: 60 # seconds proxy: 127.0.0.1:8000 # point at warcprox for archiving ignore_robots: false - enable_warcprox_features: false warcprox_meta: warc-prefix: job1 stats: @@ -135,29 +134,6 @@ proxy HTTP proxy, with the format ``host:port``. Typically configured to point to warcprox for archival crawling. -enable_warcprox_features ------------------------- -+-----------------------+---------+----------+---------+ -| scope | type | required | default | -+=======================+=========+==========+=========+ -| seed-level, top-level | boolean | no | false | -+-----------------------+---------+----------+---------+ -If true for a given seed, and the seed is configured to use a proxy, enables -special features that assume the proxy is an instance of warcprox. As of this -writing, the special features that are enabled are: - -- sending screenshots and thumbnails to warcprox using a WARCPROX_WRITE_RECORD - request -- sending youtube-dl metadata json to warcprox using a WARCPROX_WRITE_RECORD - request - -See the warcprox docs for information on the WARCPROX_WRITE_RECORD method (XXX -not yet written). 
- -*Note that if* ``warcprox_meta`` *and* ``proxy`` *are configured, the -Warcprox-Meta header will be sent even if* ``enable_warcprox_features`` *is not -set.* - ignore_robots ------------- +-----------------------+---------+----------+---------+ diff --git a/setup.py b/setup.py index 0aa6338..f28c537 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.1b10.dev218', + version='1.1b10.dev219', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', @@ -77,7 +77,7 @@ setuptools.setup( extras_require={ 'dashboard': ['flask>=0.11', 'gunicorn'], 'easy': [ - 'warcprox>=2.1b1.dev57', + 'warcprox>=2.1b1.dev60', 'pywb', 'flask>=0.11', 'gunicorn' diff --git a/tests/test_cluster.py b/tests/test_cluster.py index 6a9ddd7..669c7bf 100644 --- a/tests/test_cluster.py +++ b/tests/test_cluster.py @@ -30,6 +30,8 @@ import brozzler import datetime import requests import subprocess +import http.server +import logging def start_service(service): subprocess.check_call(['sudo', 'service', service, 'start']) @@ -113,7 +115,7 @@ def test_brozzle_site(httpd): rr = doublethink.Rethinker('localhost', db='brozzler') site = brozzler.Site(rr, { 'seed': 'http://localhost:%s/site1/' % httpd.server_port, - 'proxy': 'localhost:8000', 'enable_warcprox_features': True, + 'proxy': 'localhost:8000', 'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}}) # the two pages we expect to be crawled @@ -180,11 +182,89 @@ def test_brozzle_site(httpd): assert response.status_code == 200 assert response.headers['content-type'] == 'image/jpeg' -def test_warcprox_selection(httpd): - ''' When enable_warcprox_features is true, brozzler is expected to choose - and instance of warcprox ''' +def test_proxy_warcprox(httpd): + '''Test --proxy with proxy that happens to be warcprox''' + try: + stop_service('brozzler-worker') + _test_proxy_setting( + httpd, 
proxy='localhost:8000', warcprox_auto=False, + is_warcprox=True) + finally: + start_service('brozzler-worker') - test_id = 'test_warcprox_selection-%s' % datetime.datetime.utcnow().isoformat() +def test_proxy_non_warcprox(httpd): + '''Test --proxy with proxy that happens not to be warcprox''' + class DumbProxyRequestHandler(http.server.SimpleHTTPRequestHandler): + def do_HEAD(self): + if not hasattr(self.server, 'requests'): + self.server.requests = [] + logging.info('%s %s', self.command, self.path) + self.server.requests.append('%s %s' % (self.command, self.path)) + response = urllib.request.urlopen(self.path) + self.wfile.write(('HTTP/1.0 %s %s\r\n' % ( + response.code, response.reason)).encode('ascii')) + for header in response.getheaders(): + self.wfile.write(('%s: %s\r\n' % ( + header[0], header[1])).encode('ascii')) + self.wfile.write(b'\r\n') + return response + def do_GET(self): + response = self.do_HEAD() + self.copyfile(response, self.wfile) + def do_WARCPROX_WRITE_RECORD(self): + if not hasattr(self.server, 'requests'): + self.server.requests = [] + logging.info('%s %s', self.command, self.path) + self.send_error(400) + + proxy = http.server.HTTPServer(('localhost', 0), DumbProxyRequestHandler) + th = threading.Thread(name='dumb-proxy', target=proxy.serve_forever) + th.start() + + try: + stop_service('brozzler-worker') + _test_proxy_setting( + httpd, proxy='localhost:%s' % proxy.server_port, + warcprox_auto=False, is_warcprox=False) + finally: + start_service('brozzler-worker') + assert len(proxy.requests) <= 15 + assert proxy.requests.count('GET /status') == 1 + assert ('GET http://localhost:%s/site1/' % httpd.server_port) in proxy.requests + assert ('GET http://localhost:%s/site1/file1.txt' % httpd.server_port) in proxy.requests + assert [req for req in proxy.requests if req.startswith('WARCPROX_WRITE_RECORD')] == [] + + proxy.shutdown() + th.join() + +def test_no_proxy(httpd): + try: + stop_service('brozzler-worker') + _test_proxy_setting( + httpd, 
proxy=None, warcprox_auto=False, is_warcprox=False) + finally: + start_service('brozzler-worker') + # XXX how to check that no proxy was used? + +def test_warcprox_auto(httpd): + '''Test --warcprox-auto''' + try: + stop_service('brozzler-worker') + _test_proxy_setting( + httpd, proxy=None, warcprox_auto=True, is_warcprox=True) + finally: + start_service('brozzler-worker') + +def test_proxy_conflict(): + with pytest.raises(AssertionError) as excinfo: + worker = brozzler.worker.BrozzlerWorker( + None, None, warcprox_auto=True, proxy='localhost:12345') + +def _test_proxy_setting( + httpd, proxy=None, warcprox_auto=False, is_warcprox=False): + test_id = 'test_proxy=%s_warcprox_auto=%s_is_warcprox=%s-%s' % ( + proxy, warcprox_auto, is_warcprox, + datetime.datetime.utcnow().isoformat()) # the two pages we expect to be crawled page1 = 'http://localhost:%s/site1/' % httpd.server_port @@ -192,35 +272,36 @@ def test_warcprox_selection(httpd): robots = 'http://localhost:%s/robots.txt' % httpd.server_port rr = doublethink.Rethinker('localhost', db='brozzler') + service_registry = doublethink.ServiceRegistry(rr) site = brozzler.Site(rr, { 'seed': 'http://localhost:%s/site1/' % httpd.server_port, - 'enable_warcprox_features': True, 'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}}) + assert site.id is None + frontier = brozzler.RethinkDbFrontier(rr) + brozzler.new_site(frontier, site) + assert site.id is not None + assert len(list(frontier.site_pages(site.id))) == 1 - # so we can examine rethinkdb before it does anything - try: - stop_service('brozzler-worker') - assert site.id is None - frontier = brozzler.RethinkDbFrontier(rr) - brozzler.new_site(frontier, site) - assert site.id is not None - assert len(list(frontier.site_pages(site.id))) == 1 - finally: - start_service('brozzler-worker') + worker = brozzler.worker.BrozzlerWorker( + frontier, service_registry, max_browsers=1, + chrome_exe=brozzler.suggest_default_chrome_exe(), + warcprox_auto=warcprox_auto, 
proxy=proxy) + browser = worker._browser_pool.acquire() + worker.brozzle_site(browser, site) + worker._browser_pool.release(browser) - # check proxy is set in rethink - start = time.time() - while not site.proxy and time.time() - start < 20: - time.sleep(0.5) - site.refresh() - assert site.proxy[-5:] == ':8000' - - # the site should be brozzled fairly quickly - start = time.time() - while site.status != 'FINISHED' and time.time() - start < 300: - time.sleep(0.5) - site.refresh() + # check proxy is set assert site.status == 'FINISHED' + if warcprox_auto: + assert site.proxy[-5:] == ':8000' + else: + assert not site.proxy + site.refresh() # check that these things were persisted + assert site.status == 'FINISHED' + if warcprox_auto: + assert site.proxy[-5:] == ':8000' + else: + assert not site.proxy # check that we got the two pages we expected pages = list(frontier.site_pages(site.id)) @@ -234,26 +315,28 @@ def test_warcprox_selection(httpd): captures = rr.table('captures').filter({'test_id':test_id}).run() captures_by_url = { c['url']: c for c in captures if c['http_method'] != 'HEAD'} - assert robots in captures_by_url - assert page1 in captures_by_url - assert page2 in captures_by_url - assert 'screenshot:%s' % page1 in captures_by_url - assert 'thumbnail:%s' % page1 in captures_by_url - # no screenshots of plaintext + if is_warcprox: + assert robots in captures_by_url + assert page1 in captures_by_url + assert page2 in captures_by_url + assert 'screenshot:%s' % page1 in captures_by_url + assert 'thumbnail:%s' % page1 in captures_by_url - # check pywb - t14 = captures_by_url[page2]['timestamp'].strftime('%Y%m%d%H%M%S') - wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, page2) - expected_payload = open(os.path.join( - os.path.dirname(__file__), 'htdocs', 'site1', 'file1.txt'), 'rb').read() - assert requests.get(wb_url).content == expected_payload + # check pywb + t14 = captures_by_url[page2]['timestamp'].strftime('%Y%m%d%H%M%S') + wb_url = 
'http://localhost:8880/brozzler/%s/%s' % (t14, page2) + expected_payload = open(os.path.join( + os.path.dirname(__file__), 'htdocs', 'site1', 'file1.txt'), 'rb').read() + assert requests.get(wb_url).content == expected_payload + else: + assert captures_by_url == {} def test_obey_robots(httpd): test_id = 'test_obey_robots-%s' % datetime.datetime.utcnow().isoformat() rr = doublethink.Rethinker('localhost', db='brozzler') site = brozzler.Site(rr, { 'seed': 'http://localhost:%s/site1/' % httpd.server_port, - 'proxy': 'localhost:8000', 'enable_warcprox_features': True, + 'proxy': 'localhost:8000', 'user_agent': 'im a badbot', # robots.txt blocks badbot 'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}}) @@ -306,7 +389,7 @@ def test_login(httpd): rr = doublethink.Rethinker('localhost', db='brozzler') site = brozzler.Site(rr, { 'seed': 'http://localhost:%s/site2/' % httpd.server_port, - 'proxy': 'localhost:8000', 'enable_warcprox_features': True, + 'proxy': 'localhost:8000', 'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}, 'username': 'test_username', 'password': 'test_password'}) @@ -347,7 +430,7 @@ def test_seed_redirect(httpd): seed_url = 'http://localhost:%s/site5/redirect/' % httpd.server_port site = brozzler.Site(rr, { 'seed': 'http://localhost:%s/site5/redirect/' % httpd.server_port, - 'proxy': 'localhost:8000', 'enable_warcprox_features': True, + 'proxy': 'localhost:8000', 'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}}) assert site.scope['surt'] == 'http://(localhost:%s,)/site5/redirect/' % httpd.server_port diff --git a/tests/test_frontier.py b/tests/test_frontier.py index 71caa19..5b0faa9 100644 --- a/tests/test_frontier.py +++ b/tests/test_frontier.py @@ -314,9 +314,7 @@ def test_field_defaults(): # site brozzler.Site.table_ensure(rr) - site = brozzler.Site(rr, { - 'seed': 'http://example.com/', 'enable_warcprox_features': True}) - assert site.enable_warcprox_features is True + site = 
brozzler.Site(rr, {'seed': 'http://example.com/'}) assert site.id is None assert site.scope assert site.scope['surt'] == 'http://(com,example,)/' @@ -325,15 +323,12 @@ def test_field_defaults(): assert site.scope tite = brozzler.Site.load(rr, site.id) - assert tite.enable_warcprox_features is True assert tite.id == site.id assert tite.scope == site.scope tite.save() - assert tite.enable_warcprox_features is True assert tite.id == site.id assert tite.scope == site.scope tite.refresh() - assert tite.enable_warcprox_features is True assert tite.id == site.id assert tite.scope == site.scope diff --git a/vagrant/README.rst b/vagrant/README.rst index f546da8..d9e1545 100644 --- a/vagrant/README.rst +++ b/vagrant/README.rst @@ -33,8 +33,7 @@ Then you can run brozzler-new-site: :: (brozzler-ve34)vagrant@brozzler-easy:~$ brozzler-new-site \ - --proxy=localhost:8000 --enable-warcprox-features \ - http://example.com/ + --proxy=localhost:8000 http://example.com/ Or brozzler-new-job (make sure to set the proxy to localhost:8000): @@ -44,7 +43,6 @@ Or brozzler-new-job (make sure to set the proxy to localhost:8000): (brozzler-ve34)vagrant@brozzler-easy:~$ cat >job1.yml id: job1 proxy: localhost:8000 # point at warcprox for archiving - enable_warcprox_features: true seeds: - url: https://example.org/ (brozzler-ve34)vagrant@brozzler-easy:~$ brozzler-new-job job1.yml diff --git a/vagrant/vagrant-brozzler-new-site.py b/vagrant/vagrant-brozzler-new-site.py index aef5bf2..99401c5 100755 --- a/vagrant/vagrant-brozzler-new-site.py +++ b/vagrant/vagrant-brozzler-new-site.py @@ -77,7 +77,7 @@ def main(argv=[]): 'PYTHONPATH=/home/vagrant/brozzler-ve34/lib/python3.4/site-packages ' '/home/vagrant/brozzler-ve34/bin/python ' '/home/vagrant/brozzler-ve34/bin/brozzler-new-site ' - '--proxy=localhost:8000 --enable-warcprox-features %s %s') % ( + '--proxy=localhost:8000 %s %s') % ( ' '.join(options), args.seed) subprocess.call(['vagrant', 'ssh', '--', cmd])