mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-06-22 13:54:31 -04:00
Have pywb support loading WARC records from WARC files that are still being written (by also looking for foo.warc.gz.open)
This commit is contained in:
parent
b62d5a6350
commit
7d9f019e67
3 changed files with 39 additions and 11 deletions
|
@ -64,11 +64,6 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])):
|
||||||
arg_parser.add_argument(
|
arg_parser.add_argument(
|
||||||
'-d', '--warcs-dir', dest='warcs_dir', default='./warcs',
|
'-d', '--warcs-dir', dest='warcs_dir', default='./warcs',
|
||||||
help='where to write warcs')
|
help='where to write warcs')
|
||||||
arg_parser.add_argument(
|
|
||||||
'-v', '--verbose', dest='verbose', action='store_true')
|
|
||||||
arg_parser.add_argument('-q', '--quiet', dest='quiet', action='store_true')
|
|
||||||
# arg_parser.add_argument('--version', action='version',
|
|
||||||
# version="warcprox {}".format(warcprox.__version__))
|
|
||||||
|
|
||||||
# === warcprox args ===
|
# === warcprox args ===
|
||||||
arg_parser.add_argument(
|
arg_parser.add_argument(
|
||||||
|
@ -100,6 +95,17 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])):
|
||||||
'--pywb-port', dest='pywb_port', type=int, default=8091,
|
'--pywb-port', dest='pywb_port', type=int, default=8091,
|
||||||
help='pywb wayback port')
|
help='pywb wayback port')
|
||||||
|
|
||||||
|
# === common at the bottom args ===
|
||||||
|
arg_parser.add_argument(
|
||||||
|
'-v', '--verbose', dest='verbose', action='store_true')
|
||||||
|
arg_parser.add_argument('-q', '--quiet', dest='quiet', action='store_true')
|
||||||
|
# arg_parser.add_argument(
|
||||||
|
# '-s', '--silent', dest='log_level', action='store_const',
|
||||||
|
# default=logging.INFO, const=logging.CRITICAL)
|
||||||
|
arg_parser.add_argument(
|
||||||
|
'--version', action='version',
|
||||||
|
version='brozzler %s - %s' % (brozzler.__version__, prog))
|
||||||
|
|
||||||
return arg_parser
|
return arg_parser
|
||||||
|
|
||||||
class BrozzlerEasyController:
|
class BrozzlerEasyController:
|
||||||
|
@ -127,9 +133,9 @@ class BrozzlerEasyController:
|
||||||
return worker
|
return worker
|
||||||
|
|
||||||
def _init_pywb(self, args):
|
def _init_pywb(self, args):
|
||||||
# replace parent class of CustomUrlCanonicalizer
|
brozzler.pywb.TheGoodUrlCanonicalizer.replace_default_canonicalizer()
|
||||||
pywb.cdx.cdxdomainspecific.CustomUrlCanonicalizer.__bases__ = (
|
brozzler.pywb.support_in_progress_warcs()
|
||||||
brozzler.pywb.TheGoodUrlCanonicalizer,)
|
|
||||||
if args.warcs_dir.endswith('/'):
|
if args.warcs_dir.endswith('/'):
|
||||||
warcs_dir = args.warcs_dir
|
warcs_dir = args.warcs_dir
|
||||||
else:
|
else:
|
||||||
|
|
|
@ -18,6 +18,8 @@ You should have received a copy of the GNU Affero General Public License
|
||||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
'''
|
'''
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import logging
|
||||||
try:
|
try:
|
||||||
import pywb.apps.cli
|
import pywb.apps.cli
|
||||||
import pywb.cdx.cdxdomainspecific
|
import pywb.cdx.cdxdomainspecific
|
||||||
|
@ -30,8 +32,6 @@ except ImportError as e:
|
||||||
'brozzler[easy]".\nSee README.rst for more information.',
|
'brozzler[easy]".\nSee README.rst for more information.',
|
||||||
type(e).__name__, e)
|
type(e).__name__, e)
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
import sys
|
|
||||||
import logging
|
|
||||||
import rethinkstuff
|
import rethinkstuff
|
||||||
import rethinkdb
|
import rethinkdb
|
||||||
import surt
|
import surt
|
||||||
|
@ -124,3 +124,25 @@ class TheGoodUrlCanonicalizer(object):
|
||||||
raise pywb.utils.canonicalize.UrlCanonicalizeException(
|
raise pywb.utils.canonicalize.UrlCanonicalizeException(
|
||||||
'Invalid Url: ' + url)
|
'Invalid Url: ' + url)
|
||||||
|
|
||||||
|
def replace_default_canonicalizer():
|
||||||
|
'''Replace parent class of CustomUrlCanonicalizer with this class.'''
|
||||||
|
pywb.cdx.cdxdomainspecific.CustomUrlCanonicalizer.__bases__ = (
|
||||||
|
TheGoodUrlCanonicalizer,)
|
||||||
|
|
||||||
|
def support_in_progress_warcs():
|
||||||
|
'''
|
||||||
|
Monkey-patch pywb.warc.pathresolvers.PrefixResolver to include warcs still
|
||||||
|
being written to (warcs having ".open" suffix). This way if a cdx entry
|
||||||
|
references foo.warc.gz, pywb will try both foo.warc.gz and
|
||||||
|
foo.warc.gz.open.
|
||||||
|
'''
|
||||||
|
_orig_prefix_resolver_call = pywb.warc.pathresolvers.PrefixResolver.__call__
|
||||||
|
def _prefix_resolver_call(self, filename, cdx=None):
|
||||||
|
raw_results = _orig_prefix_resolver_call(self, filename, cdx)
|
||||||
|
results = []
|
||||||
|
for warc_path in raw_results:
|
||||||
|
results.append(warc_path)
|
||||||
|
results.append('%s.open' % warc_path)
|
||||||
|
return results
|
||||||
|
pywb.warc.pathresolvers.PrefixResolver.__call__ = _prefix_resolver_call
|
||||||
|
|
||||||
|
|
2
setup.py
2
setup.py
|
@ -21,7 +21,7 @@ import setuptools
|
||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='brozzler',
|
name='brozzler',
|
||||||
version='1.1b3.dev52',
|
version='1.1b3.dev53',
|
||||||
description='Distributed web crawling with browsers',
|
description='Distributed web crawling with browsers',
|
||||||
url='https://github.com/internetarchive/brozzler',
|
url='https://github.com/internetarchive/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue