mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-20 23:56:34 -04:00
feat: Add new cli option to insert url into existing crawl.
This commit is contained in:
parent
0d8721a4d3
commit
16d2b17772
@ -437,6 +437,44 @@ def brozzler_new_site(argv=None):
|
||||
brozzler.new_site(frontier, site)
|
||||
|
||||
|
||||
def brozzler_new_page(argv=None):
    """
    Command line utility entry point for queuing a new brozzler page.

    Takes a url, a site_id and a parent_page_id. Loads the site and the
    parent page from rethinkdb, then passes the url to the frontier's
    ``scope_and_schedule_outlinks`` as a single outlink of the parent page.
    Presumably brozzler-workers then pick up the resulting page and crawl it,
    provided the frontier accepts the url -- that logic lives in the frontier,
    not here.

    Args:
        argv: full argument vector including the program name at index 0;
            falls back to ``sys.argv`` when None or empty.

    Raises:
        LookupError: if no document with the given SITE_ID exists in the
            "sites" table, or none with PARENT_PAGE_ID in the "pages" table.
    """
    argv = argv or sys.argv
    arg_parser = argparse.ArgumentParser(
        prog=os.path.basename(argv[0]),
        description="brozzler-new-page - queue url to brozzler site",
        formatter_class=BetterArgumentDefaultsHelpFormatter,
    )
    arg_parser.add_argument("url", metavar="URL", help="URL to add to site")
    arg_parser.add_argument(
        "site_id",
        metavar="SITE_ID",
        help="UUID of site object to add the page to",
    )
    arg_parser.add_argument(
        "parent_page_id",
        metavar="PARENT_PAGE_ID",
        help="ID of Page object to add the page as an outlink to",
    )
    add_rethinkdb_options(arg_parser)
    add_common_options(arg_parser, argv)
    args = arg_parser.parse_args(args=argv[1:])
    configure_logging(args)

    rr = rethinker(args)

    # Fail with a message that names the missing id; a bare Exception() gave
    # the operator no hint about which of the two lookups came back empty.
    site_result = rr.table("sites").get(args.site_id).run()
    if not site_result:
        raise LookupError("site not found: %r" % (args.site_id,))
    site = brozzler.Site(rr, site_result)

    parent_page_result = rr.table("pages").get(args.parent_page_id).run()
    if not parent_page_result:
        raise LookupError("parent page not found: %r" % (args.parent_page_id,))
    parent_page = brozzler.Page(rr, parent_page_result)

    frontier = brozzler.RethinkDbFrontier(rr)
    frontier.scope_and_schedule_outlinks(
        site=site, parent_page=parent_page, outlinks=[args.url]
    )
|
||||
|
||||
|
||||
def brozzler_worker(argv=None):
|
||||
"""
|
||||
Main entry point for brozzler, gets sites and pages to brozzle from
|
||||
@ -1065,7 +1103,7 @@ def brozzler_list_captures(argv=None):
|
||||
reql = reql.order_by(index="abbr_canon_surt_timestamp")
|
||||
reql = reql.filter(
|
||||
lambda capture: (capture["canon_surt"] >= key)
|
||||
& (capture["canon_surt"] <= end_key)
|
||||
& (capture["canon_surt"] <= end_key)
|
||||
)
|
||||
logging.debug("querying rethinkdb: %s", reql)
|
||||
results = reql.run()
|
||||
|
3
setup.py
3
setup.py
@ -34,7 +34,7 @@ def find_package_data(package):
|
||||
|
||||
setuptools.setup(
|
||||
name="brozzler",
|
||||
version="1.5.54",
|
||||
version="1.5.55",
|
||||
description="Distributed web crawling with browsers",
|
||||
url="https://github.com/internetarchive/brozzler",
|
||||
author="Noah Levitt",
|
||||
@ -51,6 +51,7 @@ setuptools.setup(
|
||||
"brozzle-page=brozzler.cli:brozzle_page",
|
||||
"brozzler-new-job=brozzler.cli:brozzler_new_job",
|
||||
"brozzler-new-site=brozzler.cli:brozzler_new_site",
|
||||
"brozzler-new-page=brozzler.cli:brozzler_new_page",
|
||||
"brozzler-worker=brozzler.cli:brozzler_worker",
|
||||
"brozzler-ensure-tables=brozzler.cli:brozzler_ensure_tables",
|
||||
"brozzler-list-captures=brozzler.cli:brozzler_list_captures",
|
||||
|
Loading…
x
Reference in New Issue
Block a user