From 8f5003b7840775eb3c88c7efd8511b58cd3d38be Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Mon, 30 Jan 2017 23:47:39 -0800 Subject: [PATCH 1/2] fix oops --- brozzler/cli.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/brozzler/cli.py b/brozzler/cli.py index 987dea6..fcd5904 100644 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -285,7 +285,7 @@ def brozzler_new_site(): args.behavior_parameters) if args.behavior_parameters else None, username=args.username, password=args.password) - r = rethinker() + r = rethinker(args) frontier = brozzler.RethinkDbFrontier(r) brozzler.new_site(frontier, site) diff --git a/setup.py b/setup.py index 72ac014..001593a 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.1b9.dev176', + version='1.1b9.dev177', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', From 5c684779e52fade6ff55a25640447feb75fb6708 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 31 Jan 2017 10:26:38 -0800 Subject: [PATCH 2/2] pywb support for thumbnail: and screenshot: urls --- README.rst | 46 +++++++++++++++++++-- brozzler/easy.py | 1 + brozzler/pywb.py | 95 +++++++++++++++++++++++++++++++++++++++---- setup.py | 2 +- tests/test_cluster.py | 14 +++++++ 5 files changed, 147 insertions(+), 11 deletions(-) diff --git a/README.rst b/README.rst index 2329f97..6573708 100644 --- a/README.rst +++ b/README.rst @@ -1,10 +1,10 @@ .. image:: https://travis-ci.org/internetarchive/brozzler.svg?branch=master :target: https://travis-ci.org/internetarchive/brozzler - + .. |logo| image:: https://cdn.rawgit.com/internetarchive/brozzler/1.1b5/brozzler/webconsole/static/brozzler.svg :width: 7% -|logo| brozzler +|logo| brozzler =============== "browser" \| "crawler" = "brozzler" @@ -139,6 +139,46 @@ To start the app, run See ``brozzler-dashboard --help`` for configuration options. 
+Brozzler Wayback +---------------- + +Brozzler comes with a customized version of +`pywb <https://github.com/ikreymer/pywb>`_ which supports using the rethinkdb +"captures" table (populated by warcprox) as its index. + +To use, first install dependencies. + +:: + + pip install brozzler[easy] + +Write a configuration file pywb.yml. + +:: + + # 'archive_paths' should point to the output directory of warcprox + archive_paths: warcs/ # pywb will fail without a trailing slash + collections: + brozzler: + index_paths: !!python/object:brozzler.pywb.RethinkCDXSource + db: brozzler + table: captures + servers: + - localhost + enable_auto_colls: false + enable_cdx_api: true + framed_replay: true + port: 8880 + +Run pywb like so: + +:: + + $ PYWB_CONFIG_FILE=pywb.yml brozzler-wayback + +Then browse http://localhost:8880/brozzler/. + + Headless Chromium ----------------- @@ -208,7 +248,7 @@ to load the plugin by adding this option to your wrapper script: License ------- -Copyright 2015-2016 Internet Archive +Copyright 2015-2017 Internet Archive Licensed under the Apache License, Version 2.0 (the "License"); you may not use this software except in compliance with the License. 
You may diff --git a/brozzler/easy.py b/brozzler/easy.py index 899c15a..d31bae3 100644 --- a/brozzler/easy.py +++ b/brozzler/easy.py @@ -149,6 +149,7 @@ class BrozzlerEasyController: brozzler.pywb.TheGoodUrlCanonicalizer.replace_default_canonicalizer() brozzler.pywb.TheGoodUrlCanonicalizer.monkey_patch_dsrules_init() brozzler.pywb.support_in_progress_warcs() + brozzler.pywb.monkey_patch_wburl() if args.warcs_dir.endswith('/'): warcs_dir = args.warcs_dir diff --git a/brozzler/pywb.py b/brozzler/pywb.py index b88eef0..84773ea 100644 --- a/brozzler/pywb.py +++ b/brozzler/pywb.py @@ -1,9 +1,9 @@ ''' brozzler/pywb.py - pywb customizations for brozzler including rethinkdb index, -loading from warcs still being written to, and canonicalization rules matching -brozzler conventions +loading from warcs still being written to, canonicalization rules matching +brozzler conventions, support for screenshot: and thumbnail: urls -Copyright (C) 2016 Internet Archive +Copyright (C) 2016-2017 Internet Archive Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -26,6 +26,8 @@ try: import pywb.cdx.cdxobject import pywb.cdx.cdxserver import pywb.webapp.query_handler + import pywb.framework.basehandlers + import pywb.rewrite.wburl except ImportError as e: logging.critical( '%s: %s\n\nYou might need to run "pip install ' @@ -37,6 +39,7 @@ import rethinkdb import surt import json import brozzler +import argparse class RethinkCDXSource(pywb.cdx.cdxsource.CDXSource): def __init__(self, servers, db, table): @@ -65,7 +68,7 @@ class RethinkCDXSource(pywb.cdx.cdxsource.CDXSource): 'url': record['url'], 'status': str(record['response_code']), 'digest': record['sha1base32'], - 'length': str(record['record_length']), + 'length': str(record.get('record_length', '-')), 'offset': str(record['offset']), 'filename': record['filename'], } @@ -120,8 +123,7 @@ class TheGoodUrlCanonicalizer(object): # logging.debug('%s -> %s', url, key) return key except Exception as e: - raise pywb.utils.canonicalize.UrlCanonicalizeException( - 'Invalid Url: ' + url) + return url def replace_default_canonicalizer(): '''Replace parent class of CustomUrlCanonicalizer with this class.''' @@ -193,11 +195,90 @@ def support_in_progress_warcs(): return results pywb.warc.pathresolvers.PrefixResolver.__call__ = _prefix_resolver_call +class SomeWbUrl(pywb.rewrite.wburl.WbUrl): + def __init__(self, orig_url): + import re + import six + + from six.moves.urllib.parse import urlsplit, urlunsplit + from six.moves.urllib.parse import quote_plus, quote, unquote_plus + + from pywb.utils.loaders import to_native_str + from pywb.rewrite.wburl import WbUrl + + pywb.rewrite.wburl.BaseWbUrl.__init__(self) + + if six.PY2 and isinstance(orig_url, six.text_type): + orig_url = orig_url.encode('utf-8') + orig_url = quote(orig_url) + + self._original_url = orig_url + + if not self._init_query(orig_url): + if not self._init_replay(orig_url): + raise Exception('Invalid WbUrl: ', orig_url) + + new_uri = WbUrl.to_uri(self.url) + + self._do_percent_encode = True + + self.url = new_uri + + # 
begin brozzler changes + if (self.url.startswith('urn:') or self.url.startswith('screenshot:') + or self.url.startswith('thumbnail:')): + return + # end brozzler changes + + # protocol agnostic url -> http:// + # no protocol -> http:// + #inx = self.url.find('://') + inx = -1 + m = self.SCHEME_RX.match(self.url) + if m: + inx = m.span(1)[0] + + #if inx < 0: + # check for other partially encoded variants + # m = self.PARTIAL_ENC_RX.match(self.url) + # if m: + # len_ = len(m.group(0)) + # self.url = (urllib.unquote_plus(self.url[:len_]) + + # self.url[len_:]) + # inx = self.url.find(':/') + + if inx < 0: + self.url = self.DEFAULT_SCHEME + self.url + else: + inx += 2 + if inx < len(self.url) and self.url[inx] != '/': + self.url = self.url[:inx] + '/' + self.url[inx:] + +def _get_wburl_type(self): + return SomeWbUrl + +def monkey_patch_wburl(): + pywb.framework.basehandlers.WbUrlHandler.get_wburl_type = _get_wburl_type + +class BrozzlerWaybackCli(pywb.apps.cli.WaybackCli): + def _extend_parser(self, arg_parser): + super()._extend_parser(arg_parser) + arg_parser._actions[4].help = argparse.SUPPRESS # --autoindex + arg_parser.formatter_class = argparse.RawDescriptionHelpFormatter + arg_parser.epilog = ''' +Run pywb like so: + + $ PYWB_CONFIG_FILE=pywb.yml brozzler-wayback + +See README.rst for more information. 
+''' + def main(argv=sys.argv): brozzler.pywb.TheGoodUrlCanonicalizer.replace_default_canonicalizer() brozzler.pywb.TheGoodUrlCanonicalizer.monkey_patch_dsrules_init() brozzler.pywb.support_in_progress_warcs() - wayback_cli = pywb.apps.cli.WaybackCli( + brozzler.pywb.monkey_patch_wburl() + wayback_cli = BrozzlerWaybackCli( args=argv[1:], default_port=8880, desc=('brozzler-wayback - pywb wayback (monkey-patched for use ' 'with brozzler)')) diff --git a/setup.py b/setup.py index 001593a..00b4713 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.1b9.dev177', + version='1.1b9.dev178', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', diff --git a/tests/test_cluster.py b/tests/test_cluster.py index a878474..45a04b7 100644 --- a/tests/test_cluster.py +++ b/tests/test_cluster.py @@ -155,6 +155,20 @@ def test_brozzle_site(httpd): os.path.dirname(__file__), 'htdocs', 'site1', 'file1.txt'), 'rb').read() assert requests.get(wb_url).content == expected_payload + url = 'screenshot:%s' % page1 + t14 = captures_by_url[url]['timestamp'].strftime('%Y%m%d%H%M%S') + wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, url) + response = requests.get(wb_url) + assert response.status_code == 200 + assert response.headers['content-type'] == 'image/jpeg' + + url = 'thumbnail:%s' % page1 + t14 = captures_by_url[url]['timestamp'].strftime('%Y%m%d%H%M%S') + wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, url) + response = requests.get(wb_url) + assert response.status_code == 200 + assert response.headers['content-type'] == 'image/jpeg' + def test_warcprox_selection(httpd): ''' When enable_warcprox_features is true, brozzler is expected to choose and instance of warcprox '''