mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-20 23:56:34 -04:00
Merge 5138fe95846b0982c8d100a8afbd0a0a18916b1f into d987ba2e4b1d9a61070a3edf5f796d652e9f5e87
This commit is contained in:
commit
f1ef6fe752
@ -459,6 +459,50 @@ def brozzler_new_site(argv=None):
|
||||
brozzler.new_site(frontier, site)
|
||||
|
||||
|
||||
def brozzler_new_page(argv=None):
    """
    Command line utility entry point for queuing a new brozzler page.

    Takes a url, site_id, and parent_page_id on the command line, loads the
    corresponding site and parent page records from the rethinkdb "sites" and
    "pages" tables, and hands the url to
    RethinkDbFrontier.scope_and_schedule_outlinks() as a single outlink of the
    parent page.

    Args:
        argv: full argument vector including the program name at index 0;
            falls back to sys.argv when None or empty.

    Raises:
        Exception: if no site with the given site_id, or no page with the
            given parent_page_id, is found in rethinkdb.
    """
    argv = argv or sys.argv
    arg_parser = argparse.ArgumentParser(
        prog=os.path.basename(argv[0]),
        description="brozzler-new-page - queue url to brozzler site",
        formatter_class=BetterArgumentDefaultsHelpFormatter,
    )
    arg_parser.add_argument("url", metavar="URL", help="URL to add to site")
    arg_parser.add_argument(
        "site_id", metavar="SITE_ID", help="UUID of site object to add the page to"
    )
    arg_parser.add_argument(
        "parent_page_id",
        metavar="PARENT_PAGE_ID",
        help="ID of Page object to add the page as an outlink to",
    )
    add_rethinkdb_options(arg_parser)
    add_common_options(arg_parser, argv)
    args = arg_parser.parse_args(args=argv[1:])
    configure_logging(args)

    rr = rethinker(args)

    # Fail with an explanatory message rather than a bare Exception so the
    # operator can tell which of the two lookups came back empty.
    site_result = rr.table("sites").get(args.site_id).run()
    if not site_result:
        raise Exception("no site found with id {!r}".format(args.site_id))
    site = brozzler.Site(rr, site_result)

    parent_page_result = rr.table("pages").get(args.parent_page_id).run()
    if not parent_page_result:
        raise Exception(
            "no parent page found with id {!r}".format(args.parent_page_id)
        )
    parent_page = brozzler.Page(rr, parent_page_result)

    frontier = brozzler.RethinkDbFrontier(rr)
    frontier.scope_and_schedule_outlinks(
        site=site, parent_page=parent_page, outlinks=[args.url]
    )
|
||||
|
||||
|
||||
def brozzler_worker(argv=None):
|
||||
"""
|
||||
Main entry point for brozzler, gets sites and pages to brozzle from
|
||||
|
1
setup.py
1
setup.py
@ -51,6 +51,7 @@ setuptools.setup(
|
||||
"brozzle-page=brozzler.cli:brozzle_page",
|
||||
"brozzler-new-job=brozzler.cli:brozzler_new_job",
|
||||
"brozzler-new-site=brozzler.cli:brozzler_new_site",
|
||||
"brozzler-new-page=brozzler.cli:brozzler_new_page",
|
||||
"brozzler-worker=brozzler.cli:brozzler_worker",
|
||||
"brozzler-ensure-tables=brozzler.cli:brozzler_ensure_tables",
|
||||
"brozzler-list-captures=brozzler.cli:brozzler_list_captures",
|
||||
|
Loading…
x
Reference in New Issue
Block a user