From 8f5003b7840775eb3c88c7efd8511b58cd3d38be Mon Sep 17 00:00:00 2001
From: Noah Levitt <nlevitt@archive.org>
Date: Mon, 30 Jan 2017 23:47:39 -0800
Subject: [PATCH 1/2] fix oops: pass args to rethinker() in brozzler_new_site

---
 brozzler/cli.py | 2 +-
 setup.py        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/brozzler/cli.py b/brozzler/cli.py
index 987dea6..fcd5904 100644
--- a/brozzler/cli.py
+++ b/brozzler/cli.py
@@ -285,7 +285,7 @@ def brozzler_new_site():
                 args.behavior_parameters) if args.behavior_parameters else None,
             username=args.username, password=args.password)
 
-    r = rethinker()
+    r = rethinker(args)
     frontier = brozzler.RethinkDbFrontier(r)
     brozzler.new_site(frontier, site)
 
diff --git a/setup.py b/setup.py
index 72ac014..001593a 100644
--- a/setup.py
+++ b/setup.py
@@ -32,7 +32,7 @@ def find_package_data(package):
 
 setuptools.setup(
         name='brozzler',
-        version='1.1b9.dev176',
+        version='1.1b9.dev177',
         description='Distributed web crawling with browsers',
         url='https://github.com/internetarchive/brozzler',
         author='Noah Levitt',

From 5c684779e52fade6ff55a25640447feb75fb6708 Mon Sep 17 00:00:00 2001
From: Noah Levitt <nlevitt@archive.org>
Date: Tue, 31 Jan 2017 10:26:38 -0800
Subject: [PATCH 2/2] pywb support for thumbnail: and screenshot: urls

---
 README.rst            | 46 +++++++++++++++++++--
 brozzler/easy.py      |  1 +
 brozzler/pywb.py      | 95 +++++++++++++++++++++++++++++++++++++++----
 setup.py              |  2 +-
 tests/test_cluster.py | 14 +++++++
 5 files changed, 147 insertions(+), 11 deletions(-)

diff --git a/README.rst b/README.rst
index 2329f97..6573708 100644
--- a/README.rst
+++ b/README.rst
@@ -1,10 +1,10 @@
 .. image:: https://travis-ci.org/internetarchive/brozzler.svg?branch=master
     :target: https://travis-ci.org/internetarchive/brozzler
-    
+
 .. |logo| image:: https://cdn.rawgit.com/internetarchive/brozzler/1.1b5/brozzler/webconsole/static/brozzler.svg
    :width: 7%
 
-|logo| brozzler 
+|logo| brozzler
 ===============
 "browser" \| "crawler" = "brozzler"
 
@@ -139,6 +139,46 @@ To start the app, run
 
 See ``brozzler-dashboard --help`` for configuration options.
 
+Brozzler Wayback
+----------------
+
+Brozzler comes with a customized version of
+`pywb <https://github.com/ikreymer/pywb>`_ which supports using the rethinkdb
+"captures" table (populated by warcprox) as its index.
+
+To use, first install dependencies.
+
+::
+
+    pip install brozzler[easy]
+
+Write a configuration file named ``pywb.yml``.
+
+::
+
+    # 'archive_paths' should point to the output directory of warcprox
+    archive_paths: warcs/  # pywb will fail without a trailing slash
+    collections:
+      brozzler:
+        index_paths: !!python/object:brozzler.pywb.RethinkCDXSource
+          db: brozzler
+          table: captures
+          servers:
+          - localhost
+    enable_auto_colls: false
+    enable_cdx_api: true
+    framed_replay: true
+    port: 8880
+
+Run pywb like so:
+
+::
+
+    $ PYWB_CONFIG_FILE=pywb.yml brozzler-wayback
+
+Then browse http://localhost:8880/brozzler/.
+
+
 Headless Chromium
 -----------------
 
@@ -208,7 +248,7 @@ to load the plugin by adding this option to your wrapper script:
 License
 -------
 
-Copyright 2015-2016 Internet Archive
+Copyright 2015-2017 Internet Archive
 
 Licensed under the Apache License, Version 2.0 (the "License"); you may
 not use this software except in compliance with the License. You may
diff --git a/brozzler/easy.py b/brozzler/easy.py
index 899c15a..d31bae3 100644
--- a/brozzler/easy.py
+++ b/brozzler/easy.py
@@ -149,6 +149,7 @@ class BrozzlerEasyController:
         brozzler.pywb.TheGoodUrlCanonicalizer.replace_default_canonicalizer()
         brozzler.pywb.TheGoodUrlCanonicalizer.monkey_patch_dsrules_init()
         brozzler.pywb.support_in_progress_warcs()
+        brozzler.pywb.monkey_patch_wburl()
 
         if args.warcs_dir.endswith('/'):
             warcs_dir = args.warcs_dir
diff --git a/brozzler/pywb.py b/brozzler/pywb.py
index b88eef0..84773ea 100644
--- a/brozzler/pywb.py
+++ b/brozzler/pywb.py
@@ -1,9 +1,9 @@
 '''
 brozzler/pywb.py - pywb customizations for brozzler including rethinkdb index,
-loading from warcs still being written to, and canonicalization rules matching
-brozzler conventions
+loading from warcs still being written to, canonicalization rules matching
+brozzler conventions, support for screenshot: and thumbnail: urls
 
-Copyright (C) 2016 Internet Archive
+Copyright (C) 2016-2017 Internet Archive
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -26,6 +26,8 @@ try:
     import pywb.cdx.cdxobject
     import pywb.cdx.cdxserver
     import pywb.webapp.query_handler
+    import pywb.framework.basehandlers
+    import pywb.rewrite.wburl
 except ImportError as e:
     logging.critical(
             '%s: %s\n\nYou might need to run "pip install '
@@ -37,6 +39,7 @@ import rethinkdb
 import surt
 import json
 import brozzler
+import argparse
 
 class RethinkCDXSource(pywb.cdx.cdxsource.CDXSource):
     def __init__(self, servers, db, table):
@@ -65,7 +68,7 @@ class RethinkCDXSource(pywb.cdx.cdxsource.CDXSource):
                 'url': record['url'],
                 'status': str(record['response_code']),
                 'digest': record['sha1base32'],
-                'length': str(record['record_length']),
+                'length': str(record.get('record_length', '-')),
                 'offset': str(record['offset']),
                 'filename': record['filename'],
             }
@@ -120,8 +123,7 @@ class TheGoodUrlCanonicalizer(object):
             # logging.debug('%s -> %s', url, key)
             return key
         except Exception as e:
-            raise pywb.utils.canonicalize.UrlCanonicalizeException(
-                    'Invalid Url: ' + url)
+            return url
 
     def replace_default_canonicalizer():
         '''Replace parent class of CustomUrlCanonicalizer with this class.'''
@@ -193,11 +195,90 @@ def support_in_progress_warcs():
         return results
     pywb.warc.pathresolvers.PrefixResolver.__call__ = _prefix_resolver_call
 
+class SomeWbUrl(pywb.rewrite.wburl.WbUrl):
+    '''
+    WbUrl subclass that leaves urn:, screenshot: and thumbnail: urls alone
+    instead of prepending the default scheme to them.
+    '''
+    def __init__(self, orig_url):
+        '''
+        Parses orig_url and normalizes self.url.
+
+        Apart from the section marked "brozzler changes", this appears to be
+        adapted from pywb's WbUrl.__init__ (TODO confirm against the pywb
+        version in use).
+
+        Raises Exception if orig_url is accepted by neither _init_query nor
+        _init_replay.
+        '''
+        import six
+        from six.moves.urllib.parse import quote
+        from pywb.rewrite.wburl import WbUrl
+
+        pywb.rewrite.wburl.BaseWbUrl.__init__(self)
+
+        if six.PY2 and isinstance(orig_url, six.text_type):
+            orig_url = orig_url.encode('utf-8')
+            orig_url = quote(orig_url)
+
+        self._original_url = orig_url
+
+        if not self._init_query(orig_url):
+            if not self._init_replay(orig_url):
+                # build one formatted message; passing two arguments to
+                # Exception makes str(e) render as a tuple
+                raise Exception('Invalid WbUrl: %s' % orig_url)
+
+        new_uri = WbUrl.to_uri(self.url)
+
+        self._do_percent_encode = True
+
+        self.url = new_uri
+
+        # begin brozzler changes
+        if self.url.startswith(('urn:', 'screenshot:', 'thumbnail:')):
+            return
+        # end brozzler changes
+
+        # protocol agnostic url -> http://
+        # no protocol -> http://
+        inx = -1
+        m = self.SCHEME_RX.match(self.url)
+        if m:
+            inx = m.span(1)[0]
+
+        if inx < 0:
+            self.url = self.DEFAULT_SCHEME + self.url
+        else:
+            inx += 2
+            if inx < len(self.url) and self.url[inx] != '/':
+                self.url = self.url[:inx] + '/' + self.url[inx:]
+
+def _get_wburl_type(self):  # assigned to WbUrlHandler.get_wburl_type below
+    return SomeWbUrl
+
+def monkey_patch_wburl():  # WbUrlHandler.get_wburl_type -> SomeWbUrl
+    pywb.framework.basehandlers.WbUrlHandler.get_wburl_type = _get_wburl_type
+
+class BrozzlerWaybackCli(pywb.apps.cli.WaybackCli):
+    '''WaybackCli whose --help hides --autoindex and adds a usage epilog.'''
+    def _extend_parser(self, arg_parser):
+        super()._extend_parser(arg_parser)
+        for action in arg_parser._actions:
+            # look the option up by name, not by its position in the parser
+            if '--autoindex' in action.option_strings:
+                action.help = argparse.SUPPRESS
+        arg_parser.formatter_class = argparse.RawDescriptionHelpFormatter
+        arg_parser.epilog = (
+                '\nRun pywb like so:\n\n    $ PYWB_CONFIG_FILE=pywb.yml '
+                'brozzler-wayback\n\nSee README.rst for more information.\n')
+
 def main(argv=sys.argv):
     brozzler.pywb.TheGoodUrlCanonicalizer.replace_default_canonicalizer()
     brozzler.pywb.TheGoodUrlCanonicalizer.monkey_patch_dsrules_init()
     brozzler.pywb.support_in_progress_warcs()
-    wayback_cli = pywb.apps.cli.WaybackCli(
+    brozzler.pywb.monkey_patch_wburl()
+    wayback_cli = BrozzlerWaybackCli(
             args=argv[1:], default_port=8880,
             desc=('brozzler-wayback - pywb wayback (monkey-patched for use '
                   'with brozzler)'))
diff --git a/setup.py b/setup.py
index 001593a..00b4713 100644
--- a/setup.py
+++ b/setup.py
@@ -32,7 +32,7 @@ def find_package_data(package):
 
 setuptools.setup(
         name='brozzler',
-        version='1.1b9.dev177',
+        version='1.1b9.dev178',
         description='Distributed web crawling with browsers',
         url='https://github.com/internetarchive/brozzler',
         author='Noah Levitt',
diff --git a/tests/test_cluster.py b/tests/test_cluster.py
index a878474..45a04b7 100644
--- a/tests/test_cluster.py
+++ b/tests/test_cluster.py
@@ -155,6 +155,20 @@ def test_brozzle_site(httpd):
         os.path.dirname(__file__), 'htdocs', 'site1', 'file1.txt'), 'rb').read()
     assert requests.get(wb_url).content == expected_payload
 
+    url = 'screenshot:%s' % page1
+    t14 = captures_by_url[url]['timestamp'].strftime('%Y%m%d%H%M%S')
+    wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, url)
+    response = requests.get(wb_url)
+    assert response.status_code == 200
+    assert response.headers['content-type'] == 'image/jpeg'
+
+    url = 'thumbnail:%s' % page1
+    t14 = captures_by_url[url]['timestamp'].strftime('%Y%m%d%H%M%S')
+    wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, url)
+    response = requests.get(wb_url)
+    assert response.status_code == 200
+    assert response.headers['content-type'] == 'image/jpeg'
+
 def test_warcprox_selection(httpd):
     ''' When enable_warcprox_features is true, brozzler is expected to choose
     and instance of warcprox '''