new prog "brozzler-wayback" runs monkey-patched pywb

This commit is contained in:
Noah Levitt 2016-09-14 17:04:01 -07:00
parent 1c5c9417d2
commit 85073ab82b
2 changed files with 16 additions and 3 deletions

View File

@ -1,6 +1,7 @@
#!/usr/bin/env python
'''
brozzler/pywb.py - pywb support for rethinkdb index
brozzler/pywb.py - pywb customizations for brozzler including rethinkdb index,
loading from warcs still being written to, and canonicalization rules matching
brozzler conventions
Copyright (C) 2016 Internet Archive
@ -35,6 +36,7 @@ import rethinkstuff
import rethinkdb
import surt
import json
import brozzler
class RethinkCDXSource(pywb.cdx.cdxsource.CDXSource):
def __init__(self, servers, db, table):
@ -192,3 +194,13 @@ def support_in_progress_warcs():
results.append('%s.open' % warc_path)
return results
pywb.warc.pathresolvers.PrefixResolver.__call__ = _prefix_resolver_call
def main(argv=None):
    '''
    Entry point for the "brozzler-wayback" program.

    Applies the brozzler monkey patches to pywb (the canonicalizer
    replacement, the dsrules init patch, and support for in-progress warcs),
    then constructs pywb's wayback command line app and runs it.

    Args:
        argv: full argument list including the program name at index 0; only
            `argv[1:]` is handed to pywb. Defaults to the value of `sys.argv`
            at the time of the call.
    '''
    # Resolve the default here instead of in the signature: a default of
    # `sys.argv` is evaluated once, when the module is imported, and would
    # miss any later rebinding of `sys.argv`.
    if argv is None:
        argv = sys.argv

    # The patches are applied before the cli object is constructed.
    brozzler.pywb.TheGoodUrlCanonicalizer.replace_default_canonicalizer()
    brozzler.pywb.TheGoodUrlCanonicalizer.monkey_patch_dsrules_init()
    brozzler.pywb.support_in_progress_warcs()

    wayback_cli = pywb.apps.cli.WaybackCli(
            args=argv[1:], default_port=8880,
            desc=('brozzler-wayback - pywb wayback (monkey-patched for use '
                  'with brozzler)'))
    wayback_cli.run()

View File

@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup(
name='brozzler',
version='1.1b6.dev78',
version='1.1b6.dev79',
description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler',
author='Noah Levitt',
@ -53,6 +53,7 @@ setuptools.setup(
'brozzler-ensure-tables=brozzler.cli:brozzler_ensure_tables',
'brozzler-webconsole=brozzler.webconsole:main',
'brozzler-easy=brozzler.easy:main',
'brozzler-wayback=brozzler.pywb:main',
],
},
install_requires=[