new utility brozzler-list-captures for looking up entries in the "captures" table

This commit is contained in:
Noah Levitt 2016-11-30 00:52:14 +00:00
parent 9567c088c8
commit 3c43fdaced
2 changed files with 68 additions and 1 deletions

View File

@ -350,3 +350,69 @@ def brozzler_ensure_tables():
# sites, pages, jobs tables
brozzler.frontier.RethinkDbFrontier(r)
def brozzler_list_captures():
    '''
    Handy utility for looking up entries in the rethinkdb "captures" table by
    url or sha1.

    Reads its arguments from the command line: the rethinkdb and common
    options, plus one positional URL_or_SHA1 argument. Each matching capture
    is printed to stdout as indented json, in the order of the
    "abbr_canon_surt_timestamp" index.

    Raises:
        NotImplementedError: if the argument starts with "sha1:"; lookup by
            sha1 has not been written yet.
    '''
    # imported here rather than at module level, as in the original code
    import surt
    import rethinkdb

    arg_parser = argparse.ArgumentParser(
            prog=os.path.basename(sys.argv[0]),
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    _add_rethinkdb_options(arg_parser)
    _add_common_options(arg_parser)
    arg_parser.add_argument(
            'url_or_sha1', metavar='URL_or_SHA1',
            help='url or sha1 to look up in captures table')
    args = arg_parser.parse_args(args=sys.argv[1:])

    _configure_logging(args)

    r = rethinkstuff.Rethinker(
            args.rethinkdb_servers.split(','), args.rethinkdb_db)

    class Jsonner(json.JSONEncoder):
        '''json encoder that renders datetime values as iso 8601 strings.'''
        def default(self, o):
            if isinstance(o, datetime.datetime):
                return o.isoformat()
            return json.JSONEncoder.default(self, o)

    if args.url_or_sha1.startswith('sha1:'):
        # TODO: implement lookup by sha1. The sketch that used to live here
        # base32-encoded the raw digest and ran
        # get_all([sha1base32, "response", bucket], index="sha1_warc_type")
        # against the captures table, using the first result if any.
        raise NotImplementedError('not implemented')

    key = surt.surt(
            args.url_or_sha1, trailing_comma=True, host_massage=False,
            with_scheme=True)

    # Narrow the scan using the compound index on the first 150 characters
    # of the surt (presumably the index stores an abbreviated surt plus a
    # timestamp -- confirm against the table definition).
    abbr_key = key[:150]
    reql = r.table('captures').between(
            [abbr_key, rethinkdb.minval],
            [abbr_key + '!', rethinkdb.maxval],
            index='abbr_canon_surt_timestamp')
    reql = reql.order_by(index='abbr_canon_surt_timestamp')

    # The index range only matches on the abbreviated key, so keep just the
    # captures whose full canon_surt is exactly `key` (>= and <= together
    # amount to equality).
    reql = reql.filter(
            lambda capture: (capture['canon_surt'] >= key)
            & (capture['canon_surt'] <= key))

    logging.debug('rethinkdb query: %s', reql)
    results = reql.run()
    for result in results:
        print(json.dumps(result, cls=Jsonner, indent=2))

View File

@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup(
name='brozzler',
version='1.1b8.dev129',
version='1.1b8.dev130',
description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler',
author='Noah Levitt',
@ -51,6 +51,7 @@ setuptools.setup(
'brozzler-new-site=brozzler.cli:brozzler_new_site',
'brozzler-worker=brozzler.cli:brozzler_worker',
'brozzler-ensure-tables=brozzler.cli:brozzler_ensure_tables',
'brozzler-list-captures=brozzler.cli:brozzler_list_captures',
'brozzler-dashboard=brozzler.dashboard:main',
'brozzler-easy=brozzler.easy:main',
'brozzler-wayback=brozzler.pywb:main',