mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 08:39:59 -05:00
new prog "brozzler-wayback" runs monkey-patched pywb
This commit is contained in:
parent
1c5c9417d2
commit
85073ab82b
@ -1,6 +1,7 @@
|
|||||||
#!/usr/bin/env python
|
|
||||||
'''
|
'''
|
||||||
brozzler/pywb.py - pywb support for rethinkdb index
|
brozzler/pywb.py - pywb customizations for brozzler including rethinkdb index,
|
||||||
|
loading from warcs still being written to, and canonicalization rules matching
|
||||||
|
brozzler conventions
|
||||||
|
|
||||||
Copyright (C) 2016 Internet Archive
|
Copyright (C) 2016 Internet Archive
|
||||||
|
|
||||||
@ -35,6 +36,7 @@ import rethinkstuff
|
|||||||
import rethinkdb
|
import rethinkdb
|
||||||
import surt
|
import surt
|
||||||
import json
|
import json
|
||||||
|
import brozzler
|
||||||
|
|
||||||
class RethinkCDXSource(pywb.cdx.cdxsource.CDXSource):
|
class RethinkCDXSource(pywb.cdx.cdxsource.CDXSource):
|
||||||
def __init__(self, servers, db, table):
|
def __init__(self, servers, db, table):
|
||||||
@ -192,3 +194,13 @@ def support_in_progress_warcs():
|
|||||||
results.append('%s.open' % warc_path)
|
results.append('%s.open' % warc_path)
|
||||||
return results
|
return results
|
||||||
pywb.warc.pathresolvers.PrefixResolver.__call__ = _prefix_resolver_call
|
pywb.warc.pathresolvers.PrefixResolver.__call__ = _prefix_resolver_call
|
||||||
|
|
||||||
|
def main(argv=sys.argv):
    '''
    Entry point for the "brozzler-wayback" command.

    Applies brozzler's pywb monkey patches (canonicalizer replacement,
    dsrules init patch, support for warcs still being written), then builds
    pywb's wayback cli from the command line arguments and runs it.

    argv is the full argument vector; argv[0] is dropped and the remainder
    is passed to pywb. The wayback cli is given a default port of 8880.
    '''
    # apply brozzler's pywb patches before constructing the wayback cli
    brozzler.pywb.TheGoodUrlCanonicalizer.replace_default_canonicalizer()
    brozzler.pywb.TheGoodUrlCanonicalizer.monkey_patch_dsrules_init()
    brozzler.pywb.support_in_progress_warcs()

    description = (
            'brozzler-wayback - pywb wayback (monkey-patched for use '
            'with brozzler)')
    cli_args = argv[1:]
    cli = pywb.apps.cli.WaybackCli(
            args=cli_args, default_port=8880, desc=description)
    cli.run()
|
||||||
|
3
setup.py
3
setup.py
@ -32,7 +32,7 @@ def find_package_data(package):
|
|||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='brozzler',
|
name='brozzler',
|
||||||
version='1.1b6.dev78',
|
version='1.1b6.dev79',
|
||||||
description='Distributed web crawling with browsers',
|
description='Distributed web crawling with browsers',
|
||||||
url='https://github.com/internetarchive/brozzler',
|
url='https://github.com/internetarchive/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
@ -53,6 +53,7 @@ setuptools.setup(
|
|||||||
'brozzler-ensure-tables=brozzler.cli:brozzler_ensure_tables',
|
'brozzler-ensure-tables=brozzler.cli:brozzler_ensure_tables',
|
||||||
'brozzler-webconsole=brozzler.webconsole:main',
|
'brozzler-webconsole=brozzler.webconsole:main',
|
||||||
'brozzler-easy=brozzler.easy:main',
|
'brozzler-easy=brozzler.easy:main',
|
||||||
|
'brozzler-wayback=brozzler.pywb:main',
|
||||||
],
|
],
|
||||||
},
|
},
|
||||||
install_requires=[
|
install_requires=[
|
||||||
|
Loading…
x
Reference in New Issue
Block a user