mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-25 09:59:25 -04:00
207 lines
7.8 KiB
Python
207 lines
7.8 KiB
Python
'''
brozzler/pywb.py - pywb customizations for brozzler including rethinkdb index,
loading from warcs still being written to, and canonicalization rules matching
brozzler conventions

Copyright (C) 2016 Internet Archive

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
|
|
|
|
import sys
|
|
import logging
|
|
try:
|
|
import pywb.apps.cli
|
|
import pywb.cdx.cdxdomainspecific
|
|
import pywb.cdx.cdxobject
|
|
import pywb.cdx.cdxserver
|
|
import pywb.webapp.query_handler
|
|
except ImportError as e:
|
|
logging.critical(
|
|
'%s: %s\n\nYou might need to run "pip install '
|
|
'brozzler[easy]".\nSee README.rst for more information.',
|
|
type(e).__name__, e)
|
|
sys.exit(1)
|
|
import rethinkstuff
|
|
import rethinkdb
|
|
import surt
|
|
import json
|
|
import brozzler
|
|
|
|
class RethinkCDXSource(pywb.cdx.cdxsource.CDXSource):
    '''
    pywb CDXSource that answers cdx queries from a rethinkdb table of
    captures instead of from cdx files.
    '''
    def __init__(self, servers, db, table):
        '''
        Remembers the rethinkdb connection parameters; no connection object
        is created until the `r` property is first read.
        '''
        self.servers, self.db, self.table = servers, db, table

    @property
    def r(self):
        '''
        The rethinkstuff.Rethinker for self.servers and self.db, created on
        first access and reused afterwards.
        '''
        if not hasattr(self, '_r'):
            self._r = rethinkstuff.Rethinker(self.servers, self.db)
        return self._r

    def load_cdx(self, cdx_query):
        '''
        Runs `cdx_query` against rethinkdb and returns a generator of utf-8
        encoded cdx lines, one per matching capture.
        '''
        return self._gen_cdx_lines(self._query_rethinkdb(cdx_query))

    def _gen_cdx_lines(self, rethink_results):
        '''
        Yields one cdx line (bytes) per capture record, of the form
        b'<canon_surt> <YYYYmmddHHMMSS> <json blob>', e.g.

        b'org,archive)/ 20160427215530 {"url": "https://archive.org/", ...}'
        '''
        for capture in rethink_results:
            # XXX inefficient, the line gets parsed again later; figure out
            # how to short-circuit this step and create the CDXObject directly
            fields = {
                'url': capture['url'],
                'mime': capture['content_type'],
                'status': str(capture['response_code']),
                'digest': capture['sha1base32'],
                # XXX is this the right length?
                'length': str(capture['length']),
                'offset': str(capture['offset']),
                'filename': capture['filename'],
            }
            surt_key = capture['canon_surt']
            captured_at = capture['timestamp']
            line = '{} {:%Y%m%d%H%M%S} {}'.format(
                    surt_key, captured_at, json.dumps(fields))
            yield line.encode('utf-8')

    def _query_rethinkdb(self, cdx_query):
        '''
        Builds and runs the reql query for `cdx_query`, returning whatever
        reql.run() returns.
        '''
        first_key = cdx_query.key.decode('utf-8')
        last_key = cdx_query.end_key.decode('utf-8')
        index_name = 'abbr_canon_surt_timestamp'

        def method_is_replayable(capture):
            # TODO support for POST, etc
            # http_method='WARCPROX_WRITE_RECORD' for screenshots, thumbnails
            replayable = rethinkdb.expr(['WARCPROX_WRITE_RECORD','GET'])
            return replayable.contains(capture['http_method'])

        def surt_in_range(capture):
            return ((capture['canon_surt'] >= first_key)
                    & (capture['canon_surt'] < last_key))

        # The index range is built from keys truncated to 150 characters; the
        # exact canon_surt bounds are re-applied by surt_in_range below.
        reql = self.r.table(self.table).between(
                [first_key[:150], rethinkdb.minval],
                [last_key[:150]+'!', rethinkdb.maxval],
                index=index_name).order_by(index=index_name)

        # filters have to come after order_by apparently
        reql = reql.filter(method_is_replayable).filter(surt_in_range)

        if cdx_query.limit:
            reql = reql.limit(cdx_query.limit)

        logging.debug('rethinkdb query: %s', reql)
        return reql.run()
|
|
|
|
class TheGoodUrlCanonicalizer(object):
    '''
    Replacement for pywb.utils.canonicalize.UrlCanonicalizer that produces
    surts with scheme and with trailing comma, and does not "massage"
    www.foo.org into foo.org.

    The helper functions below take no instance state and are declared as
    static methods, so they can be called on the class or on an instance.
    '''
    def __init__(self, surt_ordered=True):
        '''We are always surt ordered (surt_ordered param is ignored)'''
        self.surt_ordered = True

    def __call__(self, url):
        '''
        Returns the surt key for `url`, computed with scheme, with trailing
        comma and without host massaging.

        Raises pywb.utils.canonicalize.UrlCanonicalizeException if
        surt.surt() raises anything; the underlying exception is chained as
        the cause.
        '''
        try:
            return surt.surt(
                    url, trailing_comma=True, host_massage=False,
                    with_scheme=True)
        except Exception as e:
            # %s formatting (rather than str concatenation) so that a
            # non-str url cannot raise TypeError while reporting the error
            raise pywb.utils.canonicalize.UrlCanonicalizeException(
                    'Invalid Url: %s' % (url,)) from e

    @staticmethod
    def replace_default_canonicalizer():
        '''Replace parent class of CustomUrlCanonicalizer with this class.'''
        pywb.cdx.cdxdomainspecific.CustomUrlCanonicalizer.__bases__ = (
                TheGoodUrlCanonicalizer,)

    @staticmethod
    def good_surts_from_default(default_surt):
        '''
        Takes a standard surt without scheme and without trailing comma, and
        returns a list of "good" surts that together match the same set of
        urls. For example:

             good_surts_from_default('com,example)/path')

        returns

            ['http://(com,example,)/path',
             'https://(com,example,)/path',
             'http://(com,example,www,)/path',
             'https://(com,example,www,)/path']

        An empty surt is returned as [''], and a surt with no ')' (no path
        part) yields only the http and https host prefixes.
        '''
        if default_surt == '':
            return ['']

        parts = default_surt.split(')', 1)
        if len(parts) != 2:
            # no path part
            host_part = parts[0]
            return [
                'http://(%s' % host_part,
                'https://(%s' % host_part,
            ]

        orig_host_part, path_part = parts
        return [
            'http://(%s,)%s' % (orig_host_part, path_part),
            'https://(%s,)%s' % (orig_host_part, path_part),
            'http://(%s,www,)%s' % (orig_host_part, path_part),
            'https://(%s,www,)%s' % (orig_host_part, path_part),
        ]

    @staticmethod
    def monkey_patch_dsrules_init():
        '''
        Wraps pywb.cdx.cdxdomainspecific.CDXDomainSpecificRule.__init__ so
        that, after the original initializer has run, self.url_prefix is
        replaced with the good surts generated from each of its entries.
        '''
        orig_init = pywb.cdx.cdxdomainspecific.CDXDomainSpecificRule.__init__

        def cdx_dsrule_init(self, url_prefix, rules):
            orig_init(self, url_prefix, rules)
            good_surts = []
            # iterate what the original initializer stored, not the raw arg
            for default_surt in self.url_prefix:
                good_surts.extend(
                        TheGoodUrlCanonicalizer.good_surts_from_default(
                            default_surt))
            self.url_prefix = good_surts

        pywb.cdx.cdxdomainspecific.CDXDomainSpecificRule.__init__ = cdx_dsrule_init
|
|
|
|
def support_in_progress_warcs():
    '''
    Monkey-patch pywb.warc.pathresolvers.PrefixResolver so that warcs still
    being written to (those with an ".open" suffix) are also candidates. For
    every path the original resolver returns, e.g. foo.warc.gz, the patched
    resolver returns both foo.warc.gz and foo.warc.gz.open, in that order.
    '''
    wrapped_call = pywb.warc.pathresolvers.PrefixResolver.__call__

    def _call_including_open_warcs(self, filename, cdx=None):
        candidates = []
        for path in wrapped_call(self, filename, cdx):
            # the finished warc first, then its in-progress counterpart
            candidates += [path, '%s.open' % path]
        return candidates

    pywb.warc.pathresolvers.PrefixResolver.__call__ = _call_including_open_warcs
|
|
|
|
def main(argv=sys.argv):
    '''
    Entry point for brozzler-wayback: applies the brozzler monkey patches to
    pywb, then runs the pywb wayback cli with the arguments after argv[0]
    and a default port of 8880.
    '''
    # patches must be in place before the wayback app is constructed
    canonicalizer = brozzler.pywb.TheGoodUrlCanonicalizer
    canonicalizer.replace_default_canonicalizer()
    canonicalizer.monkey_patch_dsrules_init()
    brozzler.pywb.support_in_progress_warcs()

    description = ('brozzler-wayback - pywb wayback (monkey-patched for use '
                   'with brozzler)')
    cli = pywb.apps.cli.WaybackCli(
            args=argv[1:], default_port=8880, desc=description)
    cli.run()
|