mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-20 23:56:34 -04:00
Have pywb support loading WARC records from WARC files that are still being written (by also looking for foo.warc.gz.open)
This commit is contained in:
parent
b62d5a6350
commit
7d9f019e67
@ -64,11 +64,6 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])):
|
||||
arg_parser.add_argument(
|
||||
'-d', '--warcs-dir', dest='warcs_dir', default='./warcs',
|
||||
help='where to write warcs')
|
||||
arg_parser.add_argument(
|
||||
'-v', '--verbose', dest='verbose', action='store_true')
|
||||
arg_parser.add_argument('-q', '--quiet', dest='quiet', action='store_true')
|
||||
# arg_parser.add_argument('--version', action='version',
|
||||
# version="warcprox {}".format(warcprox.__version__))
|
||||
|
||||
# === warcprox args ===
|
||||
arg_parser.add_argument(
|
||||
@ -100,6 +95,17 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])):
|
||||
'--pywb-port', dest='pywb_port', type=int, default=8091,
|
||||
help='pywb wayback port')
|
||||
|
||||
# === common at the bottom args ===
|
||||
arg_parser.add_argument(
|
||||
'-v', '--verbose', dest='verbose', action='store_true')
|
||||
arg_parser.add_argument('-q', '--quiet', dest='quiet', action='store_true')
|
||||
# arg_parser.add_argument(
|
||||
# '-s', '--silent', dest='log_level', action='store_const',
|
||||
# default=logging.INFO, const=logging.CRITICAL)
|
||||
arg_parser.add_argument(
|
||||
'--version', action='version',
|
||||
version='brozzler %s - %s' % (brozzler.__version__, prog))
|
||||
|
||||
return arg_parser
|
||||
|
||||
class BrozzlerEasyController:
|
||||
@ -127,9 +133,9 @@ class BrozzlerEasyController:
|
||||
return worker
|
||||
|
||||
def _init_pywb(self, args):
|
||||
# replace parent class of CustomUrlCanonicalizer
|
||||
pywb.cdx.cdxdomainspecific.CustomUrlCanonicalizer.__bases__ = (
|
||||
brozzler.pywb.TheGoodUrlCanonicalizer,)
|
||||
brozzler.pywb.TheGoodUrlCanonicalizer.replace_default_canonicalizer()
|
||||
brozzler.pywb.support_in_progress_warcs()
|
||||
|
||||
if args.warcs_dir.endswith('/'):
|
||||
warcs_dir = args.warcs_dir
|
||||
else:
|
||||
|
@ -18,6 +18,8 @@ You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
'''
|
||||
|
||||
import sys
|
||||
import logging
|
||||
try:
|
||||
import pywb.apps.cli
|
||||
import pywb.cdx.cdxdomainspecific
|
||||
@ -30,8 +32,6 @@ except ImportError as e:
|
||||
'brozzler[easy]".\nSee README.rst for more information.',
|
||||
type(e).__name__, e)
|
||||
sys.exit(1)
|
||||
import sys
|
||||
import logging
|
||||
import rethinkstuff
|
||||
import rethinkdb
|
||||
import surt
|
||||
@ -124,3 +124,25 @@ class TheGoodUrlCanonicalizer(object):
|
||||
raise pywb.utils.canonicalize.UrlCanonicalizeException(
|
||||
'Invalid Url: ' + url)
|
||||
|
||||
def replace_default_canonicalizer():
    '''
    Make TheGoodUrlCanonicalizer the only base class of pywb's
    CustomUrlCanonicalizer.

    This reassigns `__bases__` on
    pywb.cdx.cdxdomainspecific.CustomUrlCanonicalizer, mutating that pywb
    class in place. Takes no arguments and returns None.
    '''
    canonicalizer_cls = pywb.cdx.cdxdomainspecific.CustomUrlCanonicalizer
    canonicalizer_cls.__bases__ = (TheGoodUrlCanonicalizer,)
|
||||
|
||||
def support_in_progress_warcs():
    '''
    Teach pywb to also look for warcs that are still being written to.

    Monkey-patches pywb.warc.pathresolvers.PrefixResolver.__call__ with a
    wrapper around the original method. For every path the original
    resolver returns, the wrapper returns that path followed by the same
    path with ".open" appended, so a cdx entry that references foo.warc.gz
    makes pywb try foo.warc.gz and then foo.warc.gz.open.
    '''
    resolver_cls = pywb.warc.pathresolvers.PrefixResolver
    # Capture the unpatched method so the wrapper can delegate to it.
    original_call = resolver_cls.__call__

    def _prefix_resolver_call(self, filename, cdx=None):
        candidates = []
        for resolved_path in original_call(self, filename, cdx):
            # Finished warc first, then its in-progress ".open" variant.
            candidates.extend((resolved_path, '%s.open' % resolved_path))
        return candidates

    resolver_cls.__call__ = _prefix_resolver_call
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user