let rethinkdb generate job.id if not supplied in configuration

This commit is contained in:
Noah Levitt 2017-02-03 14:53:50 -08:00
parent 129a1e8f47
commit 5a0301ac12
6 changed files with 22 additions and 18 deletions

View File

@ -97,9 +97,8 @@ Job Configuration
-----------------
Jobs are defined using yaml files. Options may be specified either at the
top-level or on individual seeds. A job id and at least one seed url
must be specified, everything else is optional. For details, see
`<job-conf.rst>`_.
top-level or on individual seeds. At least one seed url must be specified,
everything else is optional. For details, see `<job-conf.rst>`_.
::
@ -238,7 +237,7 @@ option:
brozzler-worker --chrome-exe ~/bin/headless_chromium.sh
To render Flash content, `download <https://get.adobe.com/flashplayer/otherversions/>`_
and extract the Linux (.tar.gz) PPAPI plugin. Configure Headless Chromium
and extract the Linux (.tar.gz) PPAPI plugin. Configure Headless Chromium
to load the plugin by adding this option to your wrapper script:
::

View File

@ -1,7 +1,7 @@
'''
brozzler/frontier.py - RethinkDbFrontier manages crawl jobs, sites and pages
Copyright (C) 2014-2016 Internet Archive
Copyright (C) 2014-2017 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@ -99,6 +99,7 @@ class RethinkDbFrontier:
if not job.id:
# only if "id" has not already been set
job.id = result["generated_keys"][0]
return job
def new_site(self, site):
self.logger.info("inserting into 'sites' table %s", site)

View File

@ -2,7 +2,7 @@
brozzler/job.py - Job class representing a brozzler crawl job, and functions
for setting up a job with supplied configuration
Copyright (C) 2014-2016 Internet Archive
Copyright (C) 2014-2017 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@ -61,17 +61,22 @@ def merge(a, b):
return a
def new_job_file(frontier, job_conf_file):
'''Returns new Job.'''
logging.info("loading %s", job_conf_file)
with open(job_conf_file) as f:
job_conf = yaml.load(f)
new_job(frontier, job_conf)
return new_job(frontier, job_conf)
def new_job(frontier, job_conf):
'''Returns new Job.'''
validate_conf(job_conf)
job = Job(
id=job_conf.get("id"), conf=job_conf, status="ACTIVE",
started=rethinkstuff.utcnow())
# insert the job now to make sure it has an id
job = frontier.new_job(job)
sites = []
for seed_conf in job_conf["seeds"]:
merged_conf = merge(seed_conf, job_conf)
@ -92,11 +97,10 @@ def new_job(frontier, job_conf):
password=merged_conf.get("password"))
sites.append(site)
# insert all the sites into database (the job itself was already inserted above)
for site in sites:
new_site(frontier, site)
frontier.new_job(job)
return job
def new_site(frontier, site):
site.id = str(uuid.uuid4())

View File

@ -2,7 +2,7 @@ id:
type:
- string
- integer
required: true
required: false
<<: &multi_level_options
time_limit:

View File

@ -2,8 +2,8 @@ brozzler job configuration
**************************
Jobs are defined using yaml files. Options may be specified either at the
top-level or on individual seeds. A job id and at least one seed url
must be specified, everything else is optional.
top-level or on individual seeds. At least one seed url must be specified,
everything else is optional.
an example
==========
@ -85,11 +85,11 @@ settings reference
id
--
+-----------+--------+----------+---------+
| scope | type | required | default |
+===========+========+==========+=========+
| top-level | string | yes? | *n/a* |
+-----------+--------+----------+---------+
+-----------+--------+----------+--------------------------+
| scope | type | required | default |
+===========+========+==========+==========================+
| top-level | string | no | *generated by rethinkdb* |
+-----------+--------+----------+--------------------------+
An arbitrary identifier for this job. Must be unique across this deployment of
brozzler. If omitted, rethinkdb generates an id when the job is inserted.

View File

@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup(
name='brozzler',
version='1.1b9.dev181',
version='1.1b9.dev182',
description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler',
author='Noah Levitt',