implement sha1 lookup and url prefix lookup for brozzler-list-captures

This commit is contained in:
Noah Levitt 2017-01-12 01:26:09 +00:00
parent 32097a8f8b
commit 64a0ea879a
3 changed files with 34 additions and 30 deletions

View file

@@ -364,6 +364,11 @@ def brozzler_list_captures():
arg_parser = argparse.ArgumentParser( arg_parser = argparse.ArgumentParser(
prog=os.path.basename(sys.argv[0]), prog=os.path.basename(sys.argv[0]),
formatter_class=argparse.ArgumentDefaultsHelpFormatter) formatter_class=argparse.ArgumentDefaultsHelpFormatter)
arg_parser.add_argument(
'-p', '--prefix', dest='prefix', action='store_true', help=(
'use prefix match for url (n.b. may not work as expected if '
'searching key has query string because canonicalization can '
'reorder query parameters)'))
_add_rethinkdb_options(arg_parser) _add_rethinkdb_options(arg_parser)
_add_common_options(arg_parser) _add_common_options(arg_parser)
arg_parser.add_argument( arg_parser.add_argument(
@@ -383,36 +388,40 @@ def brozzler_list_captures():
return json.JSONEncoder.default(self, o) return json.JSONEncoder.default(self, o)
if args.url_or_sha1[:5] == 'sha1:': if args.url_or_sha1[:5] == 'sha1:':
raise Exception('not implemented') if args.prefix:
# def find_response_by_digest(self, algo, raw_digest, bucket="__unspecified__"): logging.warn(
# if algo != "sha1": 'ignoring supplied --prefix option which does not apply '
# raise Exception( 'to lookup by sha1')
# "digest type is %s but big captures table is indexed by " # assumes it's already base32 (XXX could detect if hex and convert)
# "sha1" % algo) sha1base32 = args.url_or_sha1[5:].upper()
# sha1base32 = base64.b32encode(raw_digest).decode("utf-8") reql = r.table('captures').between(
# results_iter = self.r.table(self.table).get_all([sha1base32, "response", bucket], index="sha1_warc_type").run() [sha1base32, rethinkdb.minval, rethinkdb.minval],
# results = list(results_iter) [sha1base32, rethinkdb.maxval, rethinkdb.maxval],
# if len(results) > 0: index='sha1_warc_type')
# if len(results) > 1: logging.debug('rethinkdb query: %s', reql)
# self.logger.debug("expected 0 or 1 but found %s results for sha1base32=%s bucket=%s (will use first result)", len(results), sha1base32, bucket) results = reql.run()
# result = results[0] for result in results:
# else: print(json.dumps(result, cls=Jsonner, indent=2))
# result = None
# self.logger.debug("returning %s for sha1base32=%s bucket=%s",
# result, sha1base32, bucket)
# return result
else: else:
key = surt.surt( key = surt.surt(
args.url_or_sha1, trailing_comma=True, host_massage=False, args.url_or_sha1, trailing_comma=True, host_massage=False,
with_scheme=True) with_scheme=True)
abbr_start_key = key[:150]
if args.prefix:
# surt is necessarily ascii and \x7f is the last ascii character
abbr_end_key = key[:150] + '\x7f'
end_key = key + '\x7f'
else:
abbr_end_key = key[:150]
end_key = key
reql = r.table('captures').between( reql = r.table('captures').between(
[key[:150], rethinkdb.minval], [abbr_start_key, rethinkdb.minval],
[key[:150]+'!', rethinkdb.maxval], [abbr_end_key, rethinkdb.maxval],
index='abbr_canon_surt_timestamp') index='abbr_canon_surt_timestamp', right_bound='closed')
reql = reql.order_by(index='abbr_canon_surt_timestamp') reql = reql.order_by(index='abbr_canon_surt_timestamp')
reql = reql.filter( reql = reql.filter(
lambda capture: (capture['canon_surt'] >= key) lambda capture: (capture['canon_surt'] >= key)
& (capture['canon_surt'] <= key)) & (capture['canon_surt'] <= end_key))
logging.debug('rethinkdb query: %s', reql) logging.debug('rethinkdb query: %s', reql)
results = reql.run() results = reql.run()
for result in results: for result in results:

View file

@@ -84,12 +84,9 @@ class RethinkCDXSource(pywb.cdx.cdxsource.CDXSource):
end_key = cdx_query.end_key.decode('utf-8') end_key = cdx_query.end_key.decode('utf-8')
reql = self.r.table(self.table).between( reql = self.r.table(self.table).between(
[start_key[:150], rethinkdb.minval], [start_key[:150], rethinkdb.minval],
[end_key[:150]+'!', rethinkdb.maxval], [end_key[:150], rethinkdb.maxval],
index='abbr_canon_surt_timestamp') index='abbr_canon_surt_timestamp', right_bound='closed')
reql = reql.order_by(index='abbr_canon_surt_timestamp') reql = reql.order_by(index='abbr_canon_surt_timestamp')
# filters have to come after order_by apparently
# TODO support for POST, etc # TODO support for POST, etc
# http_method='WARCPROX_WRITE_RECORD' for screenshots, thumbnails # http_method='WARCPROX_WRITE_RECORD' for screenshots, thumbnails
reql = reql.filter( reql = reql.filter(
@@ -99,10 +96,8 @@ class RethinkCDXSource(pywb.cdx.cdxsource.CDXSource):
reql = reql.filter( reql = reql.filter(
lambda capture: (capture['canon_surt'] >= start_key) lambda capture: (capture['canon_surt'] >= start_key)
& (capture['canon_surt'] < end_key)) & (capture['canon_surt'] < end_key))
if cdx_query.limit: if cdx_query.limit:
reql = reql.limit(cdx_query.limit) reql = reql.limit(cdx_query.limit)
logging.debug('rethinkdb query: %s', reql) logging.debug('rethinkdb query: %s', reql)
results = reql.run() results = reql.run()
return results return results

View file

@@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup( setuptools.setup(
name='brozzler', name='brozzler',
version='1.1b9.dev161', version='1.1b9.dev162',
description='Distributed web crawling with browsers', description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler', url='https://github.com/internetarchive/brozzler',
author='Noah Levitt', author='Noah Levitt',