mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-06-15 02:19:25 -04:00
implement sha1 lookup and url prefix lookup for brozzler-list-captures
This commit is contained in:
parent
32097a8f8b
commit
64a0ea879a
3 changed files with 34 additions and 30 deletions
|
@ -364,6 +364,11 @@ def brozzler_list_captures():
|
||||||
arg_parser = argparse.ArgumentParser(
|
arg_parser = argparse.ArgumentParser(
|
||||||
prog=os.path.basename(sys.argv[0]),
|
prog=os.path.basename(sys.argv[0]),
|
||||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
||||||
|
arg_parser.add_argument(
|
||||||
|
'-p', '--prefix', dest='prefix', action='store_true', help=(
|
||||||
|
'use prefix match for url (n.b. may not work as expected if '
|
||||||
|
'searching key has query string because canonicalization can '
|
||||||
|
'reorder query parameters)'))
|
||||||
_add_rethinkdb_options(arg_parser)
|
_add_rethinkdb_options(arg_parser)
|
||||||
_add_common_options(arg_parser)
|
_add_common_options(arg_parser)
|
||||||
arg_parser.add_argument(
|
arg_parser.add_argument(
|
||||||
|
@ -383,36 +388,40 @@ def brozzler_list_captures():
|
||||||
return json.JSONEncoder.default(self, o)
|
return json.JSONEncoder.default(self, o)
|
||||||
|
|
||||||
if args.url_or_sha1[:5] == 'sha1:':
|
if args.url_or_sha1[:5] == 'sha1:':
|
||||||
raise Exception('not implemented')
|
if args.prefix:
|
||||||
# def find_response_by_digest(self, algo, raw_digest, bucket="__unspecified__"):
|
logging.warn(
|
||||||
# if algo != "sha1":
|
'ignoring supplied --prefix option which does not apply '
|
||||||
# raise Exception(
|
'to lookup by sha1')
|
||||||
# "digest type is %s but big captures table is indexed by "
|
# assumes it's already base32 (XXX could detect if hex and convert)
|
||||||
# "sha1" % algo)
|
sha1base32 = args.url_or_sha1[5:].upper()
|
||||||
# sha1base32 = base64.b32encode(raw_digest).decode("utf-8")
|
reql = r.table('captures').between(
|
||||||
# results_iter = self.r.table(self.table).get_all([sha1base32, "response", bucket], index="sha1_warc_type").run()
|
[sha1base32, rethinkdb.minval, rethinkdb.minval],
|
||||||
# results = list(results_iter)
|
[sha1base32, rethinkdb.maxval, rethinkdb.maxval],
|
||||||
# if len(results) > 0:
|
index='sha1_warc_type')
|
||||||
# if len(results) > 1:
|
logging.debug('rethinkdb query: %s', reql)
|
||||||
# self.logger.debug("expected 0 or 1 but found %s results for sha1base32=%s bucket=%s (will use first result)", len(results), sha1base32, bucket)
|
results = reql.run()
|
||||||
# result = results[0]
|
for result in results:
|
||||||
# else:
|
print(json.dumps(result, cls=Jsonner, indent=2))
|
||||||
# result = None
|
|
||||||
# self.logger.debug("returning %s for sha1base32=%s bucket=%s",
|
|
||||||
# result, sha1base32, bucket)
|
|
||||||
# return result
|
|
||||||
else:
|
else:
|
||||||
key = surt.surt(
|
key = surt.surt(
|
||||||
args.url_or_sha1, trailing_comma=True, host_massage=False,
|
args.url_or_sha1, trailing_comma=True, host_massage=False,
|
||||||
with_scheme=True)
|
with_scheme=True)
|
||||||
|
abbr_start_key = key[:150]
|
||||||
|
if args.prefix:
|
||||||
|
# surt is necessarily ascii and \x7f is the last ascii character
|
||||||
|
abbr_end_key = key[:150] + '\x7f'
|
||||||
|
end_key = key + '\x7f'
|
||||||
|
else:
|
||||||
|
abbr_end_key = key[:150]
|
||||||
|
end_key = key
|
||||||
reql = r.table('captures').between(
|
reql = r.table('captures').between(
|
||||||
[key[:150], rethinkdb.minval],
|
[abbr_start_key, rethinkdb.minval],
|
||||||
[key[:150]+'!', rethinkdb.maxval],
|
[abbr_end_key, rethinkdb.maxval],
|
||||||
index='abbr_canon_surt_timestamp')
|
index='abbr_canon_surt_timestamp', right_bound='closed')
|
||||||
reql = reql.order_by(index='abbr_canon_surt_timestamp')
|
reql = reql.order_by(index='abbr_canon_surt_timestamp')
|
||||||
reql = reql.filter(
|
reql = reql.filter(
|
||||||
lambda capture: (capture['canon_surt'] >= key)
|
lambda capture: (capture['canon_surt'] >= key)
|
||||||
& (capture['canon_surt'] <= key))
|
& (capture['canon_surt'] <= end_key))
|
||||||
logging.debug('rethinkdb query: %s', reql)
|
logging.debug('rethinkdb query: %s', reql)
|
||||||
results = reql.run()
|
results = reql.run()
|
||||||
for result in results:
|
for result in results:
|
||||||
|
|
|
@ -84,12 +84,9 @@ class RethinkCDXSource(pywb.cdx.cdxsource.CDXSource):
|
||||||
end_key = cdx_query.end_key.decode('utf-8')
|
end_key = cdx_query.end_key.decode('utf-8')
|
||||||
reql = self.r.table(self.table).between(
|
reql = self.r.table(self.table).between(
|
||||||
[start_key[:150], rethinkdb.minval],
|
[start_key[:150], rethinkdb.minval],
|
||||||
[end_key[:150]+'!', rethinkdb.maxval],
|
[end_key[:150], rethinkdb.maxval],
|
||||||
index='abbr_canon_surt_timestamp')
|
index='abbr_canon_surt_timestamp', right_bound='closed')
|
||||||
reql = reql.order_by(index='abbr_canon_surt_timestamp')
|
reql = reql.order_by(index='abbr_canon_surt_timestamp')
|
||||||
|
|
||||||
# filters have to come after order_by apparently
|
|
||||||
|
|
||||||
# TODO support for POST, etc
|
# TODO support for POST, etc
|
||||||
# http_method='WARCPROX_WRITE_RECORD' for screenshots, thumbnails
|
# http_method='WARCPROX_WRITE_RECORD' for screenshots, thumbnails
|
||||||
reql = reql.filter(
|
reql = reql.filter(
|
||||||
|
@ -99,10 +96,8 @@ class RethinkCDXSource(pywb.cdx.cdxsource.CDXSource):
|
||||||
reql = reql.filter(
|
reql = reql.filter(
|
||||||
lambda capture: (capture['canon_surt'] >= start_key)
|
lambda capture: (capture['canon_surt'] >= start_key)
|
||||||
& (capture['canon_surt'] < end_key))
|
& (capture['canon_surt'] < end_key))
|
||||||
|
|
||||||
if cdx_query.limit:
|
if cdx_query.limit:
|
||||||
reql = reql.limit(cdx_query.limit)
|
reql = reql.limit(cdx_query.limit)
|
||||||
|
|
||||||
logging.debug('rethinkdb query: %s', reql)
|
logging.debug('rethinkdb query: %s', reql)
|
||||||
results = reql.run()
|
results = reql.run()
|
||||||
return results
|
return results
|
||||||
|
|
2
setup.py
2
setup.py
|
@ -32,7 +32,7 @@ def find_package_data(package):
|
||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='brozzler',
|
name='brozzler',
|
||||||
version='1.1b9.dev161',
|
version='1.1b9.dev162',
|
||||||
description='Distributed web crawling with browsers',
|
description='Distributed web crawling with browsers',
|
||||||
url='https://github.com/internetarchive/brozzler',
|
url='https://github.com/internetarchive/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue