Merge branch 'master' into qa

* master:
  bump version number for last pull request
This commit is contained in:
Noah Levitt 2017-04-05 16:15:33 -07:00
commit 40022ad0c2
2 changed files with 38 additions and 29 deletions

View File

@@ -58,7 +58,7 @@ def add_common_options(arg_parser):
arg_parser.add_argument(
'--version', action='version',
version='brozzler %s - %s' % (
brozzler.__version__, os.path.basename(sys.argv[0])))
brozzler.__version__, os.path.basename(argv[0])))
def add_rethinkdb_options(arg_parser):
arg_parser.add_argument(
@@ -124,13 +124,14 @@ class BetterArgumentDefaultsHelpFormatter(
else:
return super()._get_help_string(action)
def brozzle_page():
def brozzle_page(argv=None):
'''
Command line utility entry point for brozzling a single page. Opens url in
a browser, running some javascript behaviors, and prints outlinks.
'''
argv = argv or sys.argv
arg_parser = argparse.ArgumentParser(
prog=os.path.basename(sys.argv[0]),
prog=os.path.basename(argv[0]),
description='brozzle-page - brozzle a single page',
formatter_class=BetterArgumentDefaultsHelpFormatter)
arg_parser.add_argument('url', metavar='URL', help='page url')
@@ -154,7 +155,7 @@ def brozzle_page():
'--proxy', dest='proxy', default=None, help='http proxy')
add_common_options(arg_parser)
args = arg_parser.parse_args(args=sys.argv[1:])
args = arg_parser.parse_args(args=argv[1:])
configure_logging(args)
behavior_parameters = {}
@@ -187,14 +188,15 @@ def brozzle_page():
finally:
browser.stop()
def brozzler_new_job():
def brozzler_new_job(argv=None):
'''
Command line utility entry point for queuing a new brozzler job. Takes a
yaml brozzler job configuration file, creates job, sites, and pages objects
in rethinkdb, which brozzler-workers will look at and start crawling.
'''
argv = argv or sys.argv
arg_parser = argparse.ArgumentParser(
prog=os.path.basename(sys.argv[0]),
prog=os.path.basename(argv[0]),
description='brozzler-new-job - queue new job with brozzler',
formatter_class=BetterArgumentDefaultsHelpFormatter)
arg_parser.add_argument(
@@ -203,7 +205,7 @@ def brozzler_new_job():
add_rethinkdb_options(arg_parser)
add_common_options(arg_parser)
args = arg_parser.parse_args(args=sys.argv[1:])
args = arg_parser.parse_args(args=argv[1:])
configure_logging(args)
rr = rethinker(args)
@@ -215,14 +217,15 @@ def brozzler_new_job():
print(' ' + yaml.dump(e.errors).rstrip().replace('\n', '\n '), file=sys.stderr)
sys.exit(1)
def brozzler_new_site():
def brozzler_new_site(argv=None):
'''
Command line utility entry point for queuing a new brozzler site.
Takes a seed url and creates a site and page object in rethinkdb, which
brozzler-workers will look at and start crawling.
'''
argv = argv or sys.argv
arg_parser = argparse.ArgumentParser(
prog=os.path.basename(sys.argv[0]),
prog=os.path.basename(argv[0]),
description='brozzler-new-site - register site to brozzle',
formatter_class=BetterArgumentDefaultsHelpFormatter)
arg_parser.add_argument('seed', metavar='SEED', help='seed url')
@@ -253,7 +256,7 @@ def brozzler_new_site():
help='use this password to try to log in if a login form is found')
add_common_options(arg_parser)
args = arg_parser.parse_args(args=sys.argv[1:])
args = arg_parser.parse_args(args=argv[1:])
configure_logging(args)
rr = rethinker(args)
@@ -271,13 +274,14 @@ def brozzler_new_site():
frontier = brozzler.RethinkDbFrontier(rr)
brozzler.new_site(frontier, site)
def brozzler_worker():
def brozzler_worker(argv=None):
'''
Main entry point for brozzler, gets sites and pages to brozzle from
rethinkdb, brozzles them.
'''
argv = argv or sys.argv
arg_parser = argparse.ArgumentParser(
prog=os.path.basename(sys.argv[0]),
prog=os.path.basename(argv[0]),
formatter_class=BetterArgumentDefaultsHelpFormatter)
add_rethinkdb_options(arg_parser)
arg_parser.add_argument(
@@ -296,7 +300,7 @@ def brozzler_worker():
'the rethinkdb service registry'))
add_common_options(arg_parser)
args = arg_parser.parse_args(args=sys.argv[1:])
args = arg_parser.parse_args(args=argv[1:])
configure_logging(args)
def sigterm(signum, frame):
@@ -341,7 +345,7 @@ def brozzler_worker():
logging.info('brozzler-worker is all done, exiting')
def brozzler_ensure_tables():
def brozzler_ensure_tables(argv=None):
'''
Creates rethinkdb tables if they don't already exist. Brozzler
(brozzler-worker, brozzler-new-job, etc) normally creates the tables it
@@ -349,13 +353,14 @@ def brozzler_ensure_tables():
the same time, you can end up with duplicate broken tables. So it's a good
idea to use this utility at an early step when spinning up a cluster.
'''
argv = argv or sys.argv
arg_parser = argparse.ArgumentParser(
prog=os.path.basename(sys.argv[0]),
prog=os.path.basename(argv[0]),
formatter_class=BetterArgumentDefaultsHelpFormatter)
add_rethinkdb_options(arg_parser)
add_common_options(arg_parser)
args = arg_parser.parse_args(args=sys.argv[1:])
args = arg_parser.parse_args(args=argv[1:])
configure_logging(args)
rr = rethinker(args)
@@ -375,9 +380,10 @@ class Jsonner(json.JSONEncoder):
else:
return json.JSONEncoder.default(self, o)
def brozzler_list_jobs():
def brozzler_list_jobs(argv=None):
argv = argv or sys.argv
arg_parser = argparse.ArgumentParser(
prog=os.path.basename(sys.argv[0]),
prog=os.path.basename(argv[0]),
formatter_class=BetterArgumentDefaultsHelpFormatter)
arg_parser.add_argument(
'--yaml', dest='yaml', action='store_true', help=(
@@ -395,7 +401,7 @@ def brozzler_list_jobs():
add_rethinkdb_options(arg_parser)
add_common_options(arg_parser)
args = arg_parser.parse_args(args=sys.argv[1:])
args = arg_parser.parse_args(args=argv[1:])
configure_logging(args)
rr = rethinker(args)
@@ -426,9 +432,10 @@ def brozzler_list_jobs():
for result in results:
print(json.dumps(result, cls=Jsonner, indent=2))
def brozzler_list_sites():
def brozzler_list_sites(argv=None):
argv = argv or sys.argv
arg_parser = argparse.ArgumentParser(
prog=os.path.basename(sys.argv[0]),
prog=os.path.basename(argv[0]),
formatter_class=BetterArgumentDefaultsHelpFormatter)
arg_parser.add_argument(
'--yaml', dest='yaml', action='store_true', help=(
@@ -452,7 +459,7 @@ def brozzler_list_sites():
add_rethinkdb_options(arg_parser)
add_common_options(arg_parser)
args = arg_parser.parse_args(args=sys.argv[1:])
args = arg_parser.parse_args(args=argv[1:])
configure_logging(args)
rr = rethinker(args)
@@ -478,9 +485,10 @@ def brozzler_list_sites():
for result in results:
print(json.dumps(result, cls=Jsonner, indent=2))
def brozzler_list_pages():
def brozzler_list_pages(argv=None):
argv = argv or sys.argv
arg_parser = argparse.ArgumentParser(
prog=os.path.basename(sys.argv[0]),
prog=os.path.basename(argv[0]),
formatter_class=BetterArgumentDefaultsHelpFormatter)
arg_parser.add_argument(
'--yaml', dest='yaml', action='store_true', help=(
@@ -509,7 +517,7 @@ def brozzler_list_pages():
add_rethinkdb_options(arg_parser)
add_common_options(arg_parser)
args = arg_parser.parse_args(args=sys.argv[1:])
args = arg_parser.parse_args(args=argv[1:])
configure_logging(args)
rr = rethinker(args)
@@ -554,15 +562,16 @@ def brozzler_list_pages():
for result in results:
print(json.dumps(result, cls=Jsonner, indent=2))
def brozzler_list_captures():
def brozzler_list_captures(argv=None):
'''
Handy utility for looking up entries in the rethinkdb "captures" table by
url or sha1.
'''
import urlcanon
argv = argv or sys.argv
arg_parser = argparse.ArgumentParser(
prog=os.path.basename(sys.argv[0]),
prog=os.path.basename(argv[0]),
formatter_class=BetterArgumentDefaultsHelpFormatter)
arg_parser.add_argument(
'-p', '--prefix', dest='prefix', action='store_true', help=(
@@ -578,7 +587,7 @@ def brozzler_list_captures():
'url_or_sha1', metavar='URL_or_SHA1',
help='url or sha1 to look up in captures table')
args = arg_parser.parse_args(args=sys.argv[1:])
args = arg_parser.parse_args(args=argv[1:])
configure_logging(args)
rr = rethinker(args)

View File

@@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup(
name='brozzler',
version='1.1b11.dev226',
version='1.1b11.dev227',
description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler',
author='Noah Levitt',