Merge branch 'master' into qa

* master:
  pywb support for thumbnail: and screenshot: urls
  fix oops
This commit is contained in:
Noah Levitt 2017-01-31 10:26:48 -08:00
commit 1874434d60
6 changed files with 148 additions and 12 deletions

View File

@ -1,10 +1,10 @@
.. image:: https://travis-ci.org/internetarchive/brozzler.svg?branch=master
:target: https://travis-ci.org/internetarchive/brozzler
.. |logo| image:: https://cdn.rawgit.com/internetarchive/brozzler/1.1b5/brozzler/webconsole/static/brozzler.svg
:width: 7%
|logo| brozzler
|logo| brozzler
===============
"browser" \| "crawler" = "brozzler"
@ -139,6 +139,46 @@ To start the app, run
See ``brozzler-dashboard --help`` for configuration options.
Brozzler Wayback
----------------
Brozzler comes with a customized version of
`pywb <https://github.com/ikreymer/pywb>`_ which supports using the rethinkdb
"captures" table (populated by warcprox) as its index.
To use, first install dependencies.
::
pip install brozzler[easy]
Write a configuration file pywb.yml.
::
# 'archive_paths' should point to the output directory of warcprox
archive_paths: warcs/ # pywb will fail without a trailing slash
collections:
brozzler:
index_paths: !!python/object:brozzler.pywb.RethinkCDXSource
db: brozzler
table: captures
servers:
- localhost
enable_auto_colls: false
enable_cdx_api: true
framed_replay: true
port: 8880
Run pywb like so:
::
$ PYWB_CONFIG_FILE=pywb.yml brozzler-wayback
Then browse http://localhost:8880/brozzler/.
Headless Chromium
-----------------
@ -208,7 +248,7 @@ to load the plugin by adding this option to your wrapper script:
License
-------
Copyright 2015-2016 Internet Archive
Copyright 2015-2017 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License"); you may
not use this software except in compliance with the License. You may

View File

@ -285,7 +285,7 @@ def brozzler_new_site():
args.behavior_parameters) if args.behavior_parameters else None,
username=args.username, password=args.password)
r = rethinker()
r = rethinker(args)
frontier = brozzler.RethinkDbFrontier(r)
brozzler.new_site(frontier, site)

View File

@ -149,6 +149,7 @@ class BrozzlerEasyController:
brozzler.pywb.TheGoodUrlCanonicalizer.replace_default_canonicalizer()
brozzler.pywb.TheGoodUrlCanonicalizer.monkey_patch_dsrules_init()
brozzler.pywb.support_in_progress_warcs()
brozzler.pywb.monkey_patch_wburl()
if args.warcs_dir.endswith('/'):
warcs_dir = args.warcs_dir

View File

@ -1,9 +1,9 @@
'''
brozzler/pywb.py - pywb customizations for brozzler including rethinkdb index,
loading from warcs still being written to, and canonicalization rules matching
brozzler conventions
loading from warcs still being written to, canonicalization rules matching
brozzler conventions, support for screenshot: and thumbnail: urls
Copyright (C) 2016 Internet Archive
Copyright (C) 2016-2017 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@ -26,6 +26,8 @@ try:
import pywb.cdx.cdxobject
import pywb.cdx.cdxserver
import pywb.webapp.query_handler
import pywb.framework.basehandlers
import pywb.rewrite.wburl
except ImportError as e:
logging.critical(
'%s: %s\n\nYou might need to run "pip install '
@ -37,6 +39,7 @@ import rethinkdb
import surt
import json
import brozzler
import argparse
class RethinkCDXSource(pywb.cdx.cdxsource.CDXSource):
def __init__(self, servers, db, table):
@ -65,7 +68,7 @@ class RethinkCDXSource(pywb.cdx.cdxsource.CDXSource):
'url': record['url'],
'status': str(record['response_code']),
'digest': record['sha1base32'],
'length': str(record['record_length']),
'length': str(record.get('record_length', '-')),
'offset': str(record['offset']),
'filename': record['filename'],
}
@ -120,8 +123,7 @@ class TheGoodUrlCanonicalizer(object):
# logging.debug('%s -> %s', url, key)
return key
except Exception as e:
raise pywb.utils.canonicalize.UrlCanonicalizeException(
'Invalid Url: ' + url)
return url
def replace_default_canonicalizer():
'''Replace parent class of CustomUrlCanonicalizer with this class.'''
@ -193,11 +195,90 @@ def support_in_progress_warcs():
return results
pywb.warc.pathresolvers.PrefixResolver.__call__ = _prefix_resolver_call
class SomeWbUrl(pywb.rewrite.wburl.WbUrl):
    '''
    WbUrl subclass that accepts ``screenshot:`` and ``thumbnail:`` urls.

    The constructor follows the same steps as the parent ``WbUrl``
    constructor, except that urls starting with ``urn:``, ``screenshot:`` or
    ``thumbnail:`` are returned as-is instead of having a scheme prepended
    or the slashes after the scheme repaired.
    '''

    # url prefixes that are exempt from scheme normalization
    _PASSTHROUGH_PREFIXES = ('urn:', 'screenshot:', 'thumbnail:')

    def __init__(self, orig_url):
        '''
        Parse `orig_url` as a wayback query or replay url.

        Args:
            orig_url: the url as requested, e.g. a timestamp/modifier
                prefix followed by the target url

        Raises:
            Exception: if `orig_url` matches neither the query nor the
                replay url form
        '''
        import six
        from six.moves.urllib.parse import quote

        pywb.rewrite.wburl.BaseWbUrl.__init__(self)

        # on python 2, unicode input is utf-8 encoded and percent-quoted
        if six.PY2 and isinstance(orig_url, six.text_type):
            orig_url = orig_url.encode('utf-8')
            orig_url = quote(orig_url)

        self._original_url = orig_url

        # _init_query is tried first; _init_replay only if that fails
        if (not self._init_query(orig_url)
                and not self._init_replay(orig_url)):
            raise Exception('Invalid WbUrl: ', orig_url)

        new_uri = pywb.rewrite.wburl.WbUrl.to_uri(self.url)
        self._do_percent_encode = True
        self.url = new_uri

        # begin brozzler changes
        if self.url.startswith(self._PASSTHROUGH_PREFIXES):
            return
        # end brozzler changes

        # protocol agnostic url -> http://
        # no protocol -> http://
        inx = -1
        m = self.SCHEME_RX.match(self.url)
        if m:
            inx = m.span(1)[0]

        if inx < 0:
            self.url = self.DEFAULT_SCHEME + self.url
        else:
            # make sure the scheme separator is followed by a second slash
            inx += 2
            if inx < len(self.url) and self.url[inx] != '/':
                self.url = self.url[:inx] + '/' + self.url[inx:]
def _get_wburl_type(self):
    '''
    Replacement for ``WbUrlHandler.get_wburl_type``.

    Returns:
        the `SomeWbUrl` class, whatever handler instance it is called on
    '''
    wburl_class = SomeWbUrl
    return wburl_class
def monkey_patch_wburl():
    '''
    Install `_get_wburl_type` as ``get_wburl_type`` on pywb's
    ``WbUrlHandler`` class.
    '''
    handler_class = pywb.framework.basehandlers.WbUrlHandler
    setattr(handler_class, 'get_wburl_type', _get_wburl_type)
class BrozzlerWaybackCli(pywb.apps.cli.WaybackCli):
    '''
    pywb ``WaybackCli`` with command line help adjusted for brozzler: the
    ``--autoindex`` option is hidden from the help text and an epilog
    showing how to launch ``brozzler-wayback`` is appended.
    '''

    def _extend_parser(self, arg_parser):
        '''
        Add the parent's options to `arg_parser`, then adjust its help.

        Args:
            arg_parser: the ``argparse.ArgumentParser`` being built
        '''
        super()._extend_parser(arg_parser)

        # Find --autoindex by its option string rather than by its position
        # in the parser's private action list, so the right option is
        # hidden even if the parent class adds or reorders arguments.
        for action in arg_parser._actions:
            if '--autoindex' in action.option_strings:
                action.help = argparse.SUPPRESS

        # raw formatter keeps the epilog's line breaks intact
        arg_parser.formatter_class = argparse.RawDescriptionHelpFormatter
        arg_parser.epilog = '''
Run pywb like so:
$ PYWB_CONFIG_FILE=pywb.yml brozzler-wayback
See README.rst for more information.
'''
def main(argv=sys.argv):
brozzler.pywb.TheGoodUrlCanonicalizer.replace_default_canonicalizer()
brozzler.pywb.TheGoodUrlCanonicalizer.monkey_patch_dsrules_init()
brozzler.pywb.support_in_progress_warcs()
wayback_cli = pywb.apps.cli.WaybackCli(
brozzler.pywb.monkey_patch_wburl()
wayback_cli = BrozzlerWaybackCli(
args=argv[1:], default_port=8880,
desc=('brozzler-wayback - pywb wayback (monkey-patched for use '
'with brozzler)'))

View File

@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup(
name='brozzler',
version='1.1b9.dev176',
version='1.1b9.dev178',
description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler',
author='Noah Levitt',

View File

@ -155,6 +155,20 @@ def test_brozzle_site(httpd):
os.path.dirname(__file__), 'htdocs', 'site1', 'file1.txt'), 'rb').read()
assert requests.get(wb_url).content == expected_payload
url = 'screenshot:%s' % page1
t14 = captures_by_url[url]['timestamp'].strftime('%Y%m%d%H%M%S')
wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, url)
response = requests.get(wb_url)
assert response.status_code == 200
assert response.headers['content-type'] == 'image/jpeg'
url = 'thumbnail:%s' % page1
t14 = captures_by_url[url]['timestamp'].strftime('%Y%m%d%H%M%S')
wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, url)
response = requests.get(wb_url)
assert response.status_code == 200
assert response.headers['content-type'] == 'image/jpeg'
def test_warcprox_selection(httpd):
''' When enable_warcprox_features is true, brozzler is expected to choose
an instance of warcprox '''