mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-20 23:56:34 -04:00
new utility brozzler-list-captures for looking up entries in the "captures" table
This commit is contained in:
parent
9567c088c8
commit
3c43fdaced
@ -350,3 +350,69 @@ def brozzler_ensure_tables():
|
||||
|
||||
# sites, pages, jobs tables
|
||||
brozzler.frontier.RethinkDbFrontier(r)
|
||||
|
||||
def brozzler_list_captures():
|
||||
'''
|
||||
Handy utility for looking up entries in the rethinkdb "captures" table by
|
||||
url or sha1.
|
||||
'''
|
||||
import surt
|
||||
import rethinkdb
|
||||
|
||||
arg_parser = argparse.ArgumentParser(
|
||||
prog=os.path.basename(sys.argv[0]),
|
||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
||||
_add_rethinkdb_options(arg_parser)
|
||||
_add_common_options(arg_parser)
|
||||
arg_parser.add_argument(
|
||||
'url_or_sha1', metavar='URL_or_SHA1',
|
||||
help='url or sha1 to look up in captures table')
|
||||
|
||||
args = arg_parser.parse_args(args=sys.argv[1:])
|
||||
_configure_logging(args)
|
||||
|
||||
r = rethinkstuff.Rethinker(
|
||||
args.rethinkdb_servers.split(','), args.rethinkdb_db)
|
||||
|
||||
class Jsonner(json.JSONEncoder):
|
||||
def default(self, o):
|
||||
if isinstance(o, datetime.datetime):
|
||||
return o.isoformat()
|
||||
return json.JSONEncoder.default(self, o)
|
||||
|
||||
if args.url_or_sha1[:5] == 'sha1:':
|
||||
raise Exception('not implemented')
|
||||
# def find_response_by_digest(self, algo, raw_digest, bucket="__unspecified__"):
|
||||
# if algo != "sha1":
|
||||
# raise Exception(
|
||||
# "digest type is %s but big captures table is indexed by "
|
||||
# "sha1" % algo)
|
||||
# sha1base32 = base64.b32encode(raw_digest).decode("utf-8")
|
||||
# results_iter = self.r.table(self.table).get_all([sha1base32, "response", bucket], index="sha1_warc_type").run()
|
||||
# results = list(results_iter)
|
||||
# if len(results) > 0:
|
||||
# if len(results) > 1:
|
||||
# self.logger.debug("expected 0 or 1 but found %s results for sha1base32=%s bucket=%s (will use first result)", len(results), sha1base32, bucket)
|
||||
# result = results[0]
|
||||
# else:
|
||||
# result = None
|
||||
# self.logger.debug("returning %s for sha1base32=%s bucket=%s",
|
||||
# result, sha1base32, bucket)
|
||||
# return result
|
||||
else:
|
||||
key = surt.surt(
|
||||
args.url_or_sha1, trailing_comma=True, host_massage=False,
|
||||
with_scheme=True)
|
||||
reql = r.table('captures').between(
|
||||
[key[:150], rethinkdb.minval],
|
||||
[key[:150]+'!', rethinkdb.maxval],
|
||||
index='abbr_canon_surt_timestamp')
|
||||
reql = reql.order_by(index='abbr_canon_surt_timestamp')
|
||||
reql = reql.filter(
|
||||
lambda capture: (capture['canon_surt'] >= key)
|
||||
& (capture['canon_surt'] <= key))
|
||||
logging.debug('rethinkdb query: %s', reql)
|
||||
results = reql.run()
|
||||
for result in results:
|
||||
print(json.dumps(result, cls=Jsonner, indent=2))
|
||||
|
||||
|
3
setup.py
3
setup.py
@ -32,7 +32,7 @@ def find_package_data(package):
|
||||
|
||||
setuptools.setup(
|
||||
name='brozzler',
|
||||
version='1.1b8.dev129',
|
||||
version='1.1b8.dev130',
|
||||
description='Distributed web crawling with browsers',
|
||||
url='https://github.com/internetarchive/brozzler',
|
||||
author='Noah Levitt',
|
||||
@ -51,6 +51,7 @@ setuptools.setup(
|
||||
'brozzler-new-site=brozzler.cli:brozzler_new_site',
|
||||
'brozzler-worker=brozzler.cli:brozzler_worker',
|
||||
'brozzler-ensure-tables=brozzler.cli:brozzler_ensure_tables',
|
||||
'brozzler-list-captures=brozzler.cli:brozzler_list_captures',
|
||||
'brozzler-dashboard=brozzler.dashboard:main',
|
||||
'brozzler-easy=brozzler.easy:main',
|
||||
'brozzler-wayback=brozzler.pywb:main',
|
||||
|
Loading…
x
Reference in New Issue
Block a user