Add new program "brozzler-wayback", which runs monkey-patched pywb

This commit is contained in:
Noah Levitt 2016-09-14 17:04:01 -07:00
parent 1c5c9417d2
commit 85073ab82b
2 changed files with 16 additions and 3 deletions

View File

@ -1,6 +1,7 @@
#!/usr/bin/env python
''' '''
brozzler/pywb.py - pywb support for rethinkdb index brozzler/pywb.py - pywb customizations for brozzler including rethinkdb index,
loading from warcs still being written to, and canonicalization rules matching
brozzler conventions
Copyright (C) 2016 Internet Archive Copyright (C) 2016 Internet Archive
@ -35,6 +36,7 @@ import rethinkstuff
import rethinkdb import rethinkdb
import surt import surt
import json import json
import brozzler
class RethinkCDXSource(pywb.cdx.cdxsource.CDXSource): class RethinkCDXSource(pywb.cdx.cdxsource.CDXSource):
def __init__(self, servers, db, table): def __init__(self, servers, db, table):
@ -192,3 +194,13 @@ def support_in_progress_warcs():
results.append('%s.open' % warc_path) results.append('%s.open' % warc_path)
return results return results
pywb.warc.pathresolvers.PrefixResolver.__call__ = _prefix_resolver_call pywb.warc.pathresolvers.PrefixResolver.__call__ = _prefix_resolver_call
def main(argv=sys.argv):
    '''
    Entry point for the "brozzler-wayback" command line program.

    Calls the three brozzler patching hooks
    (TheGoodUrlCanonicalizer.replace_default_canonicalizer(),
    TheGoodUrlCanonicalizer.monkey_patch_dsrules_init() and
    support_in_progress_warcs()), then constructs a pywb WaybackCli and
    runs it.

    Args:
        argv: full argument list, program name first; everything after
            index 0 is passed to WaybackCli as `args`
    '''
    # The patching hooks run before the cli object is constructed.
    canonicalizer = brozzler.pywb.TheGoodUrlCanonicalizer
    canonicalizer.replace_default_canonicalizer()
    canonicalizer.monkey_patch_dsrules_init()
    brozzler.pywb.support_in_progress_warcs()

    description = (
            'brozzler-wayback - pywb wayback (monkey-patched for use '
            'with brozzler)')
    cli_args = argv[1:]  # drop the program name
    cli = pywb.apps.cli.WaybackCli(
            args=cli_args, default_port=8880, desc=description)
    cli.run()

View File

@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup( setuptools.setup(
name='brozzler', name='brozzler',
version='1.1b6.dev78', version='1.1b6.dev79',
description='Distributed web crawling with browsers', description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler', url='https://github.com/internetarchive/brozzler',
author='Noah Levitt', author='Noah Levitt',
@ -53,6 +53,7 @@ setuptools.setup(
'brozzler-ensure-tables=brozzler.cli:brozzler_ensure_tables', 'brozzler-ensure-tables=brozzler.cli:brozzler_ensure_tables',
'brozzler-webconsole=brozzler.webconsole:main', 'brozzler-webconsole=brozzler.webconsole:main',
'brozzler-easy=brozzler.easy:main', 'brozzler-easy=brozzler.easy:main',
'brozzler-wayback=brozzler.pywb:main',
], ],
}, },
install_requires=[ install_requires=[