diff --git a/README.rst b/README.rst index 6573708..0c00b73 100644 --- a/README.rst +++ b/README.rst @@ -97,9 +97,8 @@ Job Configuration ----------------- Jobs are defined using yaml files. Options may be specified either at the -top-level or on individual seeds. A job id and at least one seed url -must be specified, everything else is optional. For details, see -`<job-conf.rst>`_. +top-level or on individual seeds. At least one seed url must be specified, +everything else is optional. For details, see `<job-conf.rst>`_. :: @@ -238,7 +237,7 @@ option: brozzler-worker --chrome-exe ~/bin/headless_chromium.sh To render Flash content, `download `_ -and extract the Linux (.tar.gz) PPAPI plugin. Configure Headless Chromium +and extract the Linux (.tar.gz) PPAPI plugin. Configure Headless Chromium to load the plugin by adding this option to your wrapper script: :: diff --git a/brozzler/frontier.py b/brozzler/frontier.py index 8bf8e8f..33c2053 100644 --- a/brozzler/frontier.py +++ b/brozzler/frontier.py @@ -1,7 +1,7 @@ ''' brozzler/frontier.py - RethinkDbFrontier manages crawl jobs, sites and pages -Copyright (C) 2014-2016 Internet Archive +Copyright (C) 2014-2017 Internet Archive Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -99,6 +99,7 @@ class RethinkDbFrontier: if not job.id: # only if "id" has not already been set job.id = result["generated_keys"][0] + return job def new_site(self, site): self.logger.info("inserting into 'sites' table %s", site) diff --git a/brozzler/job.py b/brozzler/job.py index ede81b8..bf6f571 100644 --- a/brozzler/job.py +++ b/brozzler/job.py @@ -2,7 +2,7 @@ brozzler/job.py - Job class representing a brozzler crawl job, and functions for setting up a job with supplied configuration -Copyright (C) 2014-2016 Internet Archive +Copyright (C) 2014-2017 Internet Archive Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -61,17 +61,22 @@ def merge(a, b): return a def new_job_file(frontier, job_conf_file): + '''Returns new Job.''' logging.info("loading %s", job_conf_file) with open(job_conf_file) as f: job_conf = yaml.load(f) - new_job(frontier, job_conf) + return new_job(frontier, job_conf) def new_job(frontier, job_conf): + '''Returns new Job.''' validate_conf(job_conf) job = Job( id=job_conf.get("id"), conf=job_conf, status="ACTIVE", started=rethinkstuff.utcnow()) + # insert the job now to make sure it has an id + job = frontier.new_job(job) + sites = [] for seed_conf in job_conf["seeds"]: merged_conf = merge(seed_conf, job_conf) @@ -92,11 +97,10 @@ def new_job(frontier, job_conf): password=merged_conf.get("password")) sites.append(site) - # insert all the sites into database before the job for site in sites: new_site(frontier, site) - frontier.new_job(job) + return job def new_site(frontier, site): site.id = str(uuid.uuid4()) diff --git a/brozzler/job_schema.yaml b/brozzler/job_schema.yaml index d9c1df9..3b5ae7d 100644 --- a/brozzler/job_schema.yaml +++ b/brozzler/job_schema.yaml @@ -2,7 +2,7 @@ id: type: - string - integer - required: true + required: false <<: &multi_level_options time_limit: diff --git a/job-conf.rst b/job-conf.rst index 6773afd..c36ab05 100644 --- 
a/job-conf.rst +++ b/job-conf.rst @@ -2,8 +2,8 @@ brozzler job configuration ************************** Jobs are defined using yaml files. Options may be specified either at the -top-level or on individual seeds. A job id and at least one seed url -must be specified, everything else is optional. +top-level or on individual seeds. At least one seed url must be specified, +everything else is optional. an example ========== @@ -85,11 +85,11 @@ settings reference id -- -+-----------+--------+----------+---------+ -| scope | type | required | default | -+===========+========+==========+=========+ -| top-level | string | yes? | *n/a* | -+-----------+--------+----------+---------+ ++-----------+--------+----------+--------------------------+ +| scope | type | required | default | ++===========+========+==========+==========================+ +| top-level | string | no | *generated by rethinkdb* | ++-----------+--------+----------+--------------------------+ An arbitrary identifier for this job. Must be unique across this deployment of brozzler. diff --git a/setup.py b/setup.py index 35dd97f..830b51e 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.1b9.dev181', + version='1.1b9.dev182', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt',