Have pywb support loading WARC records from WARC files that are still being written (in addition to foo.warc.gz, also look for foo.warc.gz.open)

This commit is contained in:
Noah Levitt 2016-07-17 20:09:56 -05:00
parent b62d5a6350
commit 7d9f019e67
3 changed files with 39 additions and 11 deletions

View File

@ -64,11 +64,6 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])):
arg_parser.add_argument(
'-d', '--warcs-dir', dest='warcs_dir', default='./warcs',
help='where to write warcs')
arg_parser.add_argument(
'-v', '--verbose', dest='verbose', action='store_true')
arg_parser.add_argument('-q', '--quiet', dest='quiet', action='store_true')
# arg_parser.add_argument('--version', action='version',
# version="warcprox {}".format(warcprox.__version__))
# === warcprox args ===
arg_parser.add_argument(
@ -100,6 +95,17 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])):
'--pywb-port', dest='pywb_port', type=int, default=8091,
help='pywb wayback port')
# === common at the bottom args ===
arg_parser.add_argument(
'-v', '--verbose', dest='verbose', action='store_true')
arg_parser.add_argument('-q', '--quiet', dest='quiet', action='store_true')
# arg_parser.add_argument(
# '-s', '--silent', dest='log_level', action='store_const',
# default=logging.INFO, const=logging.CRITICAL)
arg_parser.add_argument(
'--version', action='version',
version='brozzler %s - %s' % (brozzler.__version__, prog))
return arg_parser
class BrozzlerEasyController:
@ -127,9 +133,9 @@ class BrozzlerEasyController:
return worker
def _init_pywb(self, args):
# replace parent class of CustomUrlCanonicalizer
pywb.cdx.cdxdomainspecific.CustomUrlCanonicalizer.__bases__ = (
brozzler.pywb.TheGoodUrlCanonicalizer,)
brozzler.pywb.TheGoodUrlCanonicalizer.replace_default_canonicalizer()
brozzler.pywb.support_in_progress_warcs()
if args.warcs_dir.endswith('/'):
warcs_dir = args.warcs_dir
else:

View File

@ -18,6 +18,8 @@ You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
'''
import sys
import logging
try:
import pywb.apps.cli
import pywb.cdx.cdxdomainspecific
@ -30,8 +32,6 @@ except ImportError as e:
'brozzler[easy]".\nSee README.rst for more information.',
type(e).__name__, e)
sys.exit(1)
import sys
import logging
import rethinkstuff
import rethinkdb
import surt
@ -124,3 +124,25 @@ class TheGoodUrlCanonicalizer(object):
raise pywb.utils.canonicalize.UrlCanonicalizeException(
'Invalid Url: ' + url)
@staticmethod
def replace_default_canonicalizer():
    '''
    Replace parent class of CustomUrlCanonicalizer with this class.

    Rebinds pywb.cdx.cdxdomainspecific.CustomUrlCanonicalizer.__bases__ to
    (TheGoodUrlCanonicalizer,). Takes no arguments and returns nothing.

    Declared as a staticmethod because it accepts neither an instance nor a
    class argument; that way it behaves the same whether it is looked up on
    the class or on an instance.
    '''
    pywb.cdx.cdxdomainspecific.CustomUrlCanonicalizer.__bases__ = (
            TheGoodUrlCanonicalizer,)
def support_in_progress_warcs():
    '''
    Monkey-patch pywb.warc.pathresolvers.PrefixResolver to include warcs still
    being written to (warcs having ".open" suffix). This way if a cdx entry
    references foo.warc.gz, pywb will try both foo.warc.gz and
    foo.warc.gz.open.

    Safe to call more than once: the patch is installed only the first time,
    so repeated calls never stack wrappers (which would otherwise produce
    duplicate candidates and paths ending in ".open.open").
    '''
    resolver_cls = pywb.warc.pathresolvers.PrefixResolver
    _orig_prefix_resolver_call = resolver_cls.__call__

    # The wrapper installed below carries this marker attribute; if it is
    # already present, an earlier call has patched the class and there is
    # nothing left to do.
    if getattr(_orig_prefix_resolver_call, '_tries_open_warcs', False):
        return

    def _prefix_resolver_call(self, filename, cdx=None):
        '''
        Call the original resolver and return each of its paths followed
        immediately by the same path with ".open" appended.
        '''
        results = []
        for warc_path in _orig_prefix_resolver_call(self, filename, cdx):
            # finished warc first, then its in-progress counterpart
            results.append(warc_path)
            results.append('%s.open' % warc_path)
        return results

    _prefix_resolver_call._tries_open_warcs = True
    resolver_cls.__call__ = _prefix_resolver_call

View File

@ -21,7 +21,7 @@ import setuptools
setuptools.setup(
name='brozzler',
version='1.1b3.dev52',
version='1.1b3.dev53',
description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler',
author='Noah Levitt',