diff --git a/brozzler/easy.py b/brozzler/easy.py index aeed9ed..e8a0d21 100644 --- a/brozzler/easy.py +++ b/brozzler/easy.py @@ -64,11 +64,6 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])): arg_parser.add_argument( '-d', '--warcs-dir', dest='warcs_dir', default='./warcs', help='where to write warcs') - arg_parser.add_argument( - '-v', '--verbose', dest='verbose', action='store_true') - arg_parser.add_argument('-q', '--quiet', dest='quiet', action='store_true') - # arg_parser.add_argument('--version', action='version', - # version="warcprox {}".format(warcprox.__version__)) # === warcprox args === arg_parser.add_argument( @@ -100,6 +95,17 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])): '--pywb-port', dest='pywb_port', type=int, default=8091, help='pywb wayback port') + # === common at the bottom args === + arg_parser.add_argument( + '-v', '--verbose', dest='verbose', action='store_true') + arg_parser.add_argument('-q', '--quiet', dest='quiet', action='store_true') + # arg_parser.add_argument( + # '-s', '--silent', dest='log_level', action='store_const', + # default=logging.INFO, const=logging.CRITICAL) + arg_parser.add_argument( + '--version', action='version', + version='brozzler %s - %s' % (brozzler.__version__, prog)) + return arg_parser class BrozzlerEasyController: @@ -127,9 +133,9 @@ class BrozzlerEasyController: return worker def _init_pywb(self, args): - # replace parent class of CustomUrlCanonicalizer - pywb.cdx.cdxdomainspecific.CustomUrlCanonicalizer.__bases__ = ( - brozzler.pywb.TheGoodUrlCanonicalizer,) + brozzler.pywb.TheGoodUrlCanonicalizer.replace_default_canonicalizer() + brozzler.pywb.support_in_progress_warcs() + if args.warcs_dir.endswith('/'): warcs_dir = args.warcs_dir else: diff --git a/brozzler/pywb.py b/brozzler/pywb.py index 3e19a7f..633b959 100644 --- a/brozzler/pywb.py +++ b/brozzler/pywb.py @@ -18,6 +18,8 @@ You should have received a copy of the GNU Affero General Public License along with this program. 
If not, see <http://www.gnu.org/licenses/>. ''' +import sys +import logging try: import pywb.apps.cli import pywb.cdx.cdxdomainspecific @@ -30,8 +32,6 @@ except ImportError as e: 'brozzler[easy]".\nSee README.rst for more information.', type(e).__name__, e) sys.exit(1) -import sys -import logging import rethinkstuff import rethinkdb import surt @@ -124,3 +124,25 @@ class TheGoodUrlCanonicalizer(object): raise pywb.utils.canonicalize.UrlCanonicalizeException( 'Invalid Url: ' + url) + def replace_default_canonicalizer(): + '''Replace parent class of CustomUrlCanonicalizer with this class.''' + pywb.cdx.cdxdomainspecific.CustomUrlCanonicalizer.__bases__ = ( + TheGoodUrlCanonicalizer,) + +def support_in_progress_warcs(): + ''' + Monkey-patch pywb.warc.pathresolvers.PrefixResolver to include warcs still + being written to (warcs having ".open" suffix). This way if a cdx entry + references foo.warc.gz, pywb will try both foo.warc.gz and + foo.warc.gz.open. + ''' + _orig_prefix_resolver_call = pywb.warc.pathresolvers.PrefixResolver.__call__ + def _prefix_resolver_call(self, filename, cdx=None): + raw_results = _orig_prefix_resolver_call(self, filename, cdx) + results = [] + for warc_path in raw_results: + results.append(warc_path) + results.append('%s.open' % warc_path) + return results + pywb.warc.pathresolvers.PrefixResolver.__call__ = _prefix_resolver_call + diff --git a/setup.py b/setup.py index 8a9a962..9a57376 100644 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ import setuptools setuptools.setup( name='brozzler', - version='1.1b3.dev52', + version='1.1b3.dev53', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt',