feat: Add new CLI command (brozzler-new-page) to insert a URL into an existing crawl.

This commit is contained in:
Adam Miller 2024-09-24 15:21:36 -07:00
parent 0d8721a4d3
commit 16d2b17772
2 changed files with 41 additions and 2 deletions

View File

@ -437,6 +437,44 @@ def brozzler_new_site(argv=None):
brozzler.new_site(frontier, site)
def brozzler_new_page(argv=None):
    """
    Command line utility entry point for queuing a new brozzler page.

    Takes a url, a site_id and a parent_page_id, looks up the site and the
    parent page in rethinkdb, and hands the url to the frontier as an outlink
    of the parent page so that brozzler-workers can pick it up and crawl it.

    Args:
        argv: full argument vector including the program name at index 0;
            defaults to ``sys.argv`` when omitted or empty.

    Raises:
        Exception: if no document with the given site_id exists in the
            "sites" table, or no document with the given parent_page_id
            exists in the "pages" table.
    """
    argv = argv or sys.argv
    arg_parser = argparse.ArgumentParser(
        prog=os.path.basename(argv[0]),
        description="brozzler-new-page - queue url to brozzler site",
        formatter_class=BetterArgumentDefaultsHelpFormatter,
    )
    arg_parser.add_argument("url", metavar="URL", help="URL to add to site")
    arg_parser.add_argument(
        "site_id",
        metavar="SITE_ID",
        help="UUID of site object to add the page to",
    )
    arg_parser.add_argument(
        "parent_page_id",
        metavar="PARENT_PAGE_ID",
        help="ID of Page object to add the page as an outlink to",
    )
    add_rethinkdb_options(arg_parser)
    add_common_options(arg_parser, argv)

    args = arg_parser.parse_args(args=argv[1:])
    configure_logging(args)

    rr = rethinker(args)

    # A falsy result means the lookup found nothing; fail with a message that
    # says which id was missing instead of an empty Exception.
    site_result = rr.table("sites").get(args.site_id).run()
    if not site_result:
        raise Exception(
            "site with id %r not found in rethinkdb table 'sites'"
            % (args.site_id,)
        )
    site = brozzler.Site(rr, site_result)

    parent_page_result = rr.table("pages").get(args.parent_page_id).run()
    if not parent_page_result:
        raise Exception(
            "parent page with id %r not found in rethinkdb table 'pages'"
            % (args.parent_page_id,)
        )
    parent_page = brozzler.Page(rr, parent_page_result)

    # The new url is submitted as a single outlink of the parent page.
    frontier = brozzler.RethinkDbFrontier(rr)
    frontier.scope_and_schedule_outlinks(
        site=site, parent_page=parent_page, outlinks=[args.url]
    )
def brozzler_worker(argv=None):
"""
Main entry point for brozzler, gets sites and pages to brozzle from
@ -1065,7 +1103,7 @@ def brozzler_list_captures(argv=None):
reql = reql.order_by(index="abbr_canon_surt_timestamp")
reql = reql.filter(
lambda capture: (capture["canon_surt"] >= key)
& (capture["canon_surt"] <= end_key)
& (capture["canon_surt"] <= end_key)
)
logging.debug("querying rethinkdb: %s", reql)
results = reql.run()

View File

@ -34,7 +34,7 @@ def find_package_data(package):
setuptools.setup(
name="brozzler",
version="1.5.54",
version="1.5.55",
description="Distributed web crawling with browsers",
url="https://github.com/internetarchive/brozzler",
author="Noah Levitt",
@ -51,6 +51,7 @@ setuptools.setup(
"brozzle-page=brozzler.cli:brozzle_page",
"brozzler-new-job=brozzler.cli:brozzler_new_job",
"brozzler-new-site=brozzler.cli:brozzler_new_site",
"brozzler-new-page=brozzler.cli:brozzler_new_page",
"brozzler-worker=brozzler.cli:brozzler_worker",
"brozzler-ensure-tables=brozzler.cli:brozzler_ensure_tables",
"brozzler-list-captures=brozzler.cli:brozzler_list_captures",