mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-08-09 15:02:28 -04:00
Merge branch 'master' into qa
* master:
  add --yaml option to brozzler-list-* commands
  take screenshot before running behavior (but after login) - thanks danielbicho
This commit is contained in:
commit
f6fdb91d57
3 changed files with 42 additions and 31 deletions
|
@ -426,13 +426,12 @@ class Browser:
|
||||||
user_agent=user_agent, timeout=300)
|
user_agent=user_agent, timeout=300)
|
||||||
if password:
|
if password:
|
||||||
self.try_login(username, password, timeout=300)
|
self.try_login(username, password, timeout=300)
|
||||||
|
if on_screenshot:
|
||||||
|
jpeg_bytes = self.screenshot()
|
||||||
|
on_screenshot(jpeg_bytes)
|
||||||
behavior_script = brozzler.behavior_script(
|
behavior_script = brozzler.behavior_script(
|
||||||
page_url, behavior_parameters)
|
page_url, behavior_parameters)
|
||||||
self.run_behavior(behavior_script, timeout=900)
|
self.run_behavior(behavior_script, timeout=900)
|
||||||
if on_screenshot:
|
|
||||||
self.scroll_to_top()
|
|
||||||
jpeg_bytes = self.screenshot()
|
|
||||||
on_screenshot(jpeg_bytes)
|
|
||||||
outlinks = self.extract_outlinks()
|
outlinks = self.extract_outlinks()
|
||||||
## for each hashtag not already visited:
|
## for each hashtag not already visited:
|
||||||
## navigate_to_hashtag (nothing to wait for so no timeout?)
|
## navigate_to_hashtag (nothing to wait for so no timeout?)
|
||||||
|
@ -502,17 +501,6 @@ class Browser:
|
||||||
jpeg_bytes = base64.b64decode(message['result']['data'])
|
jpeg_bytes = base64.b64decode(message['result']['data'])
|
||||||
return jpeg_bytes
|
return jpeg_bytes
|
||||||
|
|
||||||
def scroll_to_top(self, timeout=30):
|
|
||||||
self.logger.info('scrolling to top')
|
|
||||||
self.websock_thread.expect_result(self._command_id.peek())
|
|
||||||
msg_id = self.send_to_chrome(
|
|
||||||
method='Runtime.evaluate',
|
|
||||||
params={'expression': 'window.scrollTo(0, 0);'})
|
|
||||||
self._wait_for(
|
|
||||||
lambda: self.websock_thread.received_result(msg_id),
|
|
||||||
timeout=timeout)
|
|
||||||
self.websock_thread.pop_result(msg_id)
|
|
||||||
|
|
||||||
def url(self, timeout=30):
|
def url(self, timeout=30):
|
||||||
'''
|
'''
|
||||||
Returns value of document.URL from the browser.
|
Returns value of document.URL from the browser.
|
||||||
|
|
|
@ -212,7 +212,7 @@ def brozzler_new_job():
|
||||||
arg_parser = argparse.ArgumentParser(
|
arg_parser = argparse.ArgumentParser(
|
||||||
prog=os.path.basename(sys.argv[0]),
|
prog=os.path.basename(sys.argv[0]),
|
||||||
description='brozzler-new-job - queue new job with brozzler',
|
description='brozzler-new-job - queue new job with brozzler',
|
||||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
formatter_class=BetterArgumentDefaultsHelpFormatter)
|
||||||
arg_parser.add_argument(
|
arg_parser.add_argument(
|
||||||
'job_conf_file', metavar='JOB_CONF_FILE',
|
'job_conf_file', metavar='JOB_CONF_FILE',
|
||||||
help='brozzler job configuration file in yaml')
|
help='brozzler job configuration file in yaml')
|
||||||
|
@ -240,7 +240,7 @@ def brozzler_new_site():
|
||||||
arg_parser = argparse.ArgumentParser(
|
arg_parser = argparse.ArgumentParser(
|
||||||
prog=os.path.basename(sys.argv[0]),
|
prog=os.path.basename(sys.argv[0]),
|
||||||
description='brozzler-new-site - register site to brozzle',
|
description='brozzler-new-site - register site to brozzle',
|
||||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
formatter_class=BetterArgumentDefaultsHelpFormatter)
|
||||||
arg_parser.add_argument('seed', metavar='SEED', help='seed url')
|
arg_parser.add_argument('seed', metavar='SEED', help='seed url')
|
||||||
add_rethinkdb_options(arg_parser)
|
add_rethinkdb_options(arg_parser)
|
||||||
_add_proxy_options(arg_parser)
|
_add_proxy_options(arg_parser)
|
||||||
|
@ -295,7 +295,7 @@ def brozzler_worker():
|
||||||
'''
|
'''
|
||||||
arg_parser = argparse.ArgumentParser(
|
arg_parser = argparse.ArgumentParser(
|
||||||
prog=os.path.basename(__file__),
|
prog=os.path.basename(__file__),
|
||||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
formatter_class=BetterArgumentDefaultsHelpFormatter)
|
||||||
add_rethinkdb_options(arg_parser)
|
add_rethinkdb_options(arg_parser)
|
||||||
arg_parser.add_argument(
|
arg_parser.add_argument(
|
||||||
'-e', '--chrome-exe', dest='chrome_exe',
|
'-e', '--chrome-exe', dest='chrome_exe',
|
||||||
|
@ -360,7 +360,7 @@ def brozzler_ensure_tables():
|
||||||
'''
|
'''
|
||||||
arg_parser = argparse.ArgumentParser(
|
arg_parser = argparse.ArgumentParser(
|
||||||
prog=os.path.basename(sys.argv[0]),
|
prog=os.path.basename(sys.argv[0]),
|
||||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
formatter_class=BetterArgumentDefaultsHelpFormatter)
|
||||||
add_rethinkdb_options(arg_parser)
|
add_rethinkdb_options(arg_parser)
|
||||||
add_common_options(arg_parser)
|
add_common_options(arg_parser)
|
||||||
|
|
||||||
|
@ -387,7 +387,7 @@ class Jsonner(json.JSONEncoder):
|
||||||
def brozzler_list_jobs():
|
def brozzler_list_jobs():
|
||||||
arg_parser = argparse.ArgumentParser(
|
arg_parser = argparse.ArgumentParser(
|
||||||
prog=os.path.basename(sys.argv[0]),
|
prog=os.path.basename(sys.argv[0]),
|
||||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
formatter_class=BetterArgumentDefaultsHelpFormatter)
|
||||||
arg_parser.add_argument(
|
arg_parser.add_argument(
|
||||||
'-a', '--all', dest='all', action='store_true', help=(
|
'-a', '--all', dest='all', action='store_true', help=(
|
||||||
'list all jobs (by default, only active jobs are listed)'))
|
'list all jobs (by default, only active jobs are listed)'))
|
||||||
|
@ -409,10 +409,13 @@ def brozzler_list_jobs():
|
||||||
def brozzler_list_sites():
|
def brozzler_list_sites():
|
||||||
arg_parser = argparse.ArgumentParser(
|
arg_parser = argparse.ArgumentParser(
|
||||||
prog=os.path.basename(sys.argv[0]),
|
prog=os.path.basename(sys.argv[0]),
|
||||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
formatter_class=BetterArgumentDefaultsHelpFormatter)
|
||||||
arg_parser.add_argument(
|
arg_parser.add_argument(
|
||||||
'-a', '--all', dest='all', action='store_true', help=(
|
'-a', '--all', dest='all', action='store_true', help=(
|
||||||
'list all sites (by default, only active sites are listed)'))
|
'list all sites (by default, only active sites are listed)'))
|
||||||
|
arg_parser.add_argument(
|
||||||
|
'--yaml', dest='yaml', action='store_true', help=(
|
||||||
|
'yaml output (default is json)'))
|
||||||
group = arg_parser.add_mutually_exclusive_group()
|
group = arg_parser.add_mutually_exclusive_group()
|
||||||
group.add_argument(
|
group.add_argument(
|
||||||
'--jobless', dest='jobless', action='store_true', help=(
|
'--jobless', dest='jobless', action='store_true', help=(
|
||||||
|
@ -441,19 +444,27 @@ def brozzler_list_sites():
|
||||||
reql = reql.filter({'status': 'ACTIVE'})
|
reql = reql.filter({'status': 'ACTIVE'})
|
||||||
logging.debug('querying rethinkdb: %s', reql)
|
logging.debug('querying rethinkdb: %s', reql)
|
||||||
results = reql.run()
|
results = reql.run()
|
||||||
for result in results:
|
if args.yaml:
|
||||||
print(json.dumps(result, cls=Jsonner, indent=2))
|
yaml.dump_all(
|
||||||
|
results, stream=sys.stdout, explicit_start=True,
|
||||||
|
default_flow_style=False)
|
||||||
|
else:
|
||||||
|
for result in results:
|
||||||
|
print(json.dumps(result, cls=Jsonner, indent=2))
|
||||||
|
|
||||||
def brozzler_list_pages():
|
def brozzler_list_pages():
|
||||||
arg_parser = argparse.ArgumentParser(
|
arg_parser = argparse.ArgumentParser(
|
||||||
prog=os.path.basename(sys.argv[0]),
|
prog=os.path.basename(sys.argv[0]),
|
||||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
formatter_class=BetterArgumentDefaultsHelpFormatter)
|
||||||
|
arg_parser.add_argument(
|
||||||
|
'--yaml', dest='yaml', action='store_true', help=(
|
||||||
|
'yaml output (default is json)'))
|
||||||
group = arg_parser.add_mutually_exclusive_group(required=True)
|
group = arg_parser.add_mutually_exclusive_group(required=True)
|
||||||
group.add_argument(
|
group.add_argument(
|
||||||
'--job', dest='job', metavar='JOB_ID', help=(
|
'--job', dest='job', metavar='JOB_ID', help=(
|
||||||
'list pages for all sites of the supplied job'))
|
'list pages for all sites of the supplied job'))
|
||||||
group.add_argument(
|
group.add_argument(
|
||||||
'--site', dest='site', metavar='SITE', help=(
|
'--site', dest='site', metavar='SITE_ID', help=(
|
||||||
'list pages of the supplied site'))
|
'list pages of the supplied site'))
|
||||||
group = arg_parser.add_mutually_exclusive_group()
|
group = arg_parser.add_mutually_exclusive_group()
|
||||||
group.add_argument(
|
group.add_argument(
|
||||||
|
@ -506,8 +517,13 @@ def brozzler_list_pages():
|
||||||
reql = reql.filter({'claimed': True})
|
reql = reql.filter({'claimed': True})
|
||||||
logging.debug('querying rethinkb: %s', reql)
|
logging.debug('querying rethinkb: %s', reql)
|
||||||
results = reql.run()
|
results = reql.run()
|
||||||
for result in results:
|
if args.yaml:
|
||||||
print(json.dumps(result, cls=Jsonner, indent=2))
|
yaml.dump_all(
|
||||||
|
results, stream=sys.stdout, explicit_start=True,
|
||||||
|
default_flow_style=False)
|
||||||
|
else:
|
||||||
|
for result in results:
|
||||||
|
print(json.dumps(result, cls=Jsonner, indent=2))
|
||||||
|
|
||||||
def brozzler_list_captures():
|
def brozzler_list_captures():
|
||||||
'''
|
'''
|
||||||
|
@ -519,12 +535,15 @@ def brozzler_list_captures():
|
||||||
|
|
||||||
arg_parser = argparse.ArgumentParser(
|
arg_parser = argparse.ArgumentParser(
|
||||||
prog=os.path.basename(sys.argv[0]),
|
prog=os.path.basename(sys.argv[0]),
|
||||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
formatter_class=BetterArgumentDefaultsHelpFormatter)
|
||||||
arg_parser.add_argument(
|
arg_parser.add_argument(
|
||||||
'-p', '--prefix', dest='prefix', action='store_true', help=(
|
'-p', '--prefix', dest='prefix', action='store_true', help=(
|
||||||
'use prefix match for url (n.b. may not work as expected if '
|
'use prefix match for url (n.b. may not work as expected if '
|
||||||
'searching key has query string because canonicalization can '
|
'searching key has query string because canonicalization can '
|
||||||
'reorder query parameters)'))
|
'reorder query parameters)'))
|
||||||
|
arg_parser.add_argument(
|
||||||
|
'--yaml', dest='yaml', action='store_true', help=(
|
||||||
|
'yaml output (default is json)'))
|
||||||
add_rethinkdb_options(arg_parser)
|
add_rethinkdb_options(arg_parser)
|
||||||
add_common_options(arg_parser)
|
add_common_options(arg_parser)
|
||||||
arg_parser.add_argument(
|
arg_parser.add_argument(
|
||||||
|
@ -549,8 +568,6 @@ def brozzler_list_captures():
|
||||||
index='sha1_warc_type')
|
index='sha1_warc_type')
|
||||||
logging.debug('querying rethinkdb: %s', reql)
|
logging.debug('querying rethinkdb: %s', reql)
|
||||||
results = reql.run()
|
results = reql.run()
|
||||||
for result in results:
|
|
||||||
print(json.dumps(result, cls=Jsonner, indent=2))
|
|
||||||
else:
|
else:
|
||||||
key = surt.surt(
|
key = surt.surt(
|
||||||
args.url_or_sha1, trailing_comma=True, host_massage=False,
|
args.url_or_sha1, trailing_comma=True, host_massage=False,
|
||||||
|
@ -573,6 +590,12 @@ def brozzler_list_captures():
|
||||||
& (capture['canon_surt'] <= end_key))
|
& (capture['canon_surt'] <= end_key))
|
||||||
logging.debug('querying rethinkdb: %s', reql)
|
logging.debug('querying rethinkdb: %s', reql)
|
||||||
results = reql.run()
|
results = reql.run()
|
||||||
|
|
||||||
|
if args.yaml:
|
||||||
|
yaml.dump_all(
|
||||||
|
results, stream=sys.stdout, explicit_start=True,
|
||||||
|
default_flow_style=False)
|
||||||
|
else:
|
||||||
for result in results:
|
for result in results:
|
||||||
print(json.dumps(result, cls=Jsonner, indent=2))
|
print(json.dumps(result, cls=Jsonner, indent=2))
|
||||||
|
|
||||||
|
|
2
setup.py
2
setup.py
|
@ -32,7 +32,7 @@ def find_package_data(package):
|
||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='brozzler',
|
name='brozzler',
|
||||||
version='1.1b9.dev186',
|
version='1.1b9.dev188',
|
||||||
description='Distributed web crawling with browsers',
|
description='Distributed web crawling with browsers',
|
||||||
url='https://github.com/internetarchive/brozzler',
|
url='https://github.com/internetarchive/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue