Merge 5138fe95846b0982c8d100a8afbd0a0a18916b1f into d987ba2e4b1d9a61070a3edf5f796d652e9f5e87

This commit is contained in:
Adam Miller 2024-09-29 11:25:40 +08:00 committed by GitHub
commit f1ef6fe752
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 45 additions and 0 deletions

View File

@ -459,6 +459,50 @@ def brozzler_new_site(argv=None):
brozzler.new_site(frontier, site)
def brozzler_new_page(argv=None):
    """
    Command line utility entry point for queuing a new brozzler page.

    Takes a url, site_id, and parent_page_id, loads the matching site and
    parent page records from rethinkdb, and passes the url as a single
    outlink of the parent page to
    ``RethinkDbFrontier.scope_and_schedule_outlinks``.

    NOTE(review): whether the url actually ends up queued presumably depends
    on the scoping done by ``scope_and_schedule_outlinks`` -- confirm against
    the frontier implementation.

    Args:
        argv: full argument vector, program name first; falls back to
            ``sys.argv`` when None or empty.

    Raises:
        Exception: if no record is found in the "sites" table for SITE_ID or
            in the "pages" table for PARENT_PAGE_ID.
    """
    argv = argv or sys.argv
    arg_parser = argparse.ArgumentParser(
        prog=os.path.basename(argv[0]),
        description="brozzler-new-page - queue url to brozzler site",
        formatter_class=BetterArgumentDefaultsHelpFormatter,
    )
    arg_parser.add_argument("url", metavar="URL", help="URL to add to site")
    arg_parser.add_argument(
        "site_id", metavar="SITE_ID", help="UUID of site object to add the page to"
    )
    arg_parser.add_argument(
        "parent_page_id",
        metavar="PARENT_PAGE_ID",
        help="ID of Page object to add the page as an outlink to",
    )
    add_rethinkdb_options(arg_parser)
    add_common_options(arg_parser, argv)

    args = arg_parser.parse_args(args=argv[1:])
    configure_logging(args)

    rr = rethinker(args)

    # Fail with a message that names the missing record, so the operator can
    # tell which of the two ids was wrong.
    site_result = rr.table("sites").get(args.site_id).run()
    if not site_result:
        raise Exception("site not found in rethinkdb: %r" % (args.site_id,))
    site = brozzler.Site(rr, site_result)

    parent_page_result = rr.table("pages").get(args.parent_page_id).run()
    if not parent_page_result:
        raise Exception(
            "parent page not found in rethinkdb: %r" % (args.parent_page_id,)
        )
    parent_page = brozzler.Page(rr, parent_page_result)

    frontier = brozzler.RethinkDbFrontier(rr)
    frontier.scope_and_schedule_outlinks(
        site=site, parent_page=parent_page, outlinks=[args.url]
    )
def brozzler_worker(argv=None):
"""
Main entry point for brozzler, gets sites and pages to brozzle from

View File

@ -51,6 +51,7 @@ setuptools.setup(
"brozzle-page=brozzler.cli:brozzle_page",
"brozzler-new-job=brozzler.cli:brozzler_new_job",
"brozzler-new-site=brozzler.cli:brozzler_new_site",
"brozzler-new-page=brozzler.cli:brozzler_new_page",
"brozzler-worker=brozzler.cli:brozzler_worker",
"brozzler-ensure-tables=brozzler.cli:brozzler_ensure_tables",
"brozzler-list-captures=brozzler.cli:brozzler_list_captures",