diff --git a/README.rst b/README.rst index 2b47792..3e53867 100644 --- a/README.rst +++ b/README.rst @@ -95,7 +95,8 @@ Job Configuration Jobs are defined using yaml files. Options may be specified either at the top-level or on individual seeds. A job id and at least one seed url -must be specified, everything else is optional. +must be specified, everything else is optional. For details, see +`<job-conf.rst>`_. :: diff --git a/brozzler/job.py b/brozzler/job.py index ba259ec..85e955d 100644 --- a/brozzler/job.py +++ b/brozzler/job.py @@ -87,7 +87,7 @@ def new_site(frontier, site): frontier.new_page(page) logging.info("queued page %s", page) else: - logging.warn("seed url {} is blocked by robots.txt".format(site.seed)) + logging.warn("seed url %s is blocked by robots.txt", site.seed) finally: # finally block because we want to insert the Site no matter what frontier.new_site(site) diff --git a/brozzler/webconsole/static/partials/workers.html b/brozzler/webconsole/static/partials/workers.html index 61f9a61..5f39c77 100644 --- a/brozzler/webconsole/static/partials/workers.html +++ b/brozzler/webconsole/static/partials/workers.html @@ -12,11 +12,19 @@

Workers

+

This page depends on some deployment details outside of brozzler + itself, namely that port 8901 on each brozzler-worker is running + websockify bridging VNC running on the same host. The vagrant+ansible + configuration in the brozzler repo contains an example of that. + https://github.com/internetarchive/brozzler/tree/master/vagrant +

+
+
{{worker}}
{{worker.host}}
-
diff --git a/job-conf.rst b/job-conf.rst new file mode 100644 index 0000000..a073bed --- /dev/null +++ b/job-conf.rst @@ -0,0 +1,81 @@ +brozzler job configuration +========================== + +Jobs are defined using yaml files. Options may be specified either at the +top-level or on individual seeds. A job id and at least one seed url +must be specified, everything else is optional. + +an example +---------- + +:: + + id: myjob + time_limit: 60 # seconds + proxy: 127.0.0.1:8000 # point at warcprox for archiving + ignore_robots: false + enable_warcprox_features: false + warcprox_meta: + warc-prefix: job1 + stats: + buckets: + - job1-stats + metadata: {} + seeds: + - url: http://one.example.org/ + warcprox_meta: + warc-prefix: job1-seed1 + stats: + buckets: + - job1-seed1-stats + - url: http://two.example.org/ + time_limit: 30 + - url: http://three.example.org/ + time_limit: 10 + ignore_robots: true + scope: + surt: http://(org,example, + +how inheritance works +--------------------- + +Most of the available options apply to seeds. Such options can also be +specified at the top level, in which case the seeds inherit the options. If +an option is specified both at the top level and at the level of an individual +seed, the results are merged with the seed-level value taking precedence in +case of conflicts. It's probably easiest to make sense of this by way of an +example. + +In the example yaml above, ``warcprox_meta`` is specified at the top level and +at the seed level for the seed http://one.example.org/. 
At the top level we +have:: + + warcprox_meta: + warc-prefix: job1 + stats: + buckets: + - job1-stats + +At the seed level we have:: + + warcprox_meta: + warc-prefix: job1-seed1 + stats: + buckets: + - job1-seed1-stats + +The merged configuration as applied to the seed http://one.example.org/ will +be:: + + warcprox_meta: + warc-prefix: job1-seed1 + stats: + buckets: + - job1-stats + - job1-seed1-stats + +Notice that: + +- There is a collision on ``warc-prefix`` and the seed-level value wins. +- Since ``buckets`` is a list, the merged result includes all the values from + both the top level and the seed level. diff --git a/setup.py b/setup.py index de5c821..047a2e2 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.1b6.dev85', + version='1.1b6.dev86', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', diff --git a/vagrant/vagrant-brozzler-new-job.py b/vagrant/vagrant-brozzler-new-job.py new file mode 100755 index 0000000..767091b --- /dev/null +++ b/vagrant/vagrant-brozzler-new-job.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python +''' +vagrant-brozzler-new-job.py - runs brozzler-new-job inside the vagrant vm to +queue a job for your vagrant brozzler deployment. + +This is a standalone script with no dependencies other than python, and should +work with python 2.7 or python 3.2+. The only reason it's not a bash script is +so we can use the argparse library. 
+'''
+
+import sys
+import os
+import argparse
+import subprocess
+
+def main(argv=[]):
+    '''Copy JOB_CONF_FILE to a temp file inside the vagrant vm.'''
+    argv = argv or sys.argv  # the empty default would have no argv[0]
+    arg_parser = argparse.ArgumentParser(prog=os.path.basename(argv[0]))
+    arg_parser.add_argument(
+            'job_conf_file', metavar='JOB_CONF_FILE',
+            help='brozzler job configuration file in yaml')
+    args = arg_parser.parse_args(args=argv[1:])
+
+    # run in the path with Vagrantfile so "vagrant ssh" knows what to do
+    vagrant_dir = os.path.dirname(os.path.abspath(__file__))
+    with open(args.job_conf_file, 'rb') as f:
+        subprocess.call(
+                ['vagrant', 'ssh', '--', 'f=`mktemp` && cat > $f'],
+                stdin=f, cwd=vagrant_dir)  # stdin needs a file, not bytes
+
+if __name__ == '__main__':
+    main(sys.argv)
+
+## # cd to path with Vagrantfile so "vagrant ssh" knows what to do
+## script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+## cd $script_dir
+##
+## vagrant ssh -- \
+##     PYTHONPATH=/home/vagrant/brozzler-ve34/lib/python3.4/site-packages \
+##     /home/vagrant/brozzler-ve34/bin/python \
+##     /home/vagrant/brozzler-ve34/bin/brozzler-new-job "$@"
diff --git a/vagrant/vagrant-brozzler-new-site.py b/vagrant/vagrant-brozzler-new-site.py
index 7e073aa..9986fef 100755
--- a/vagrant/vagrant-brozzler-new-site.py
+++ b/vagrant/vagrant-brozzler-new-site.py
@@ -3,7 +3,7 @@
 vagrant-brozzler-new-site.py - runs brozzler-new-site inside the vagrant vm to
 queue a site for your vagrant brozzler deployment.
 
-Fills in the --proxy option automatically. some other options are passed
+Fills in the --proxy option automatically. Some other options are passed
 through.
 
 This is a standalone script with no dependencies other than python, and should