let rethinkdb generate job.id if not supplied in configuration

This commit is contained in:
Noah Levitt 2017-02-03 14:53:50 -08:00
parent 129a1e8f47
commit 5a0301ac12
6 changed files with 22 additions and 18 deletions

View file

@@ -97,9 +97,8 @@ Job Configuration
----------------- -----------------
Jobs are defined using yaml files. Options may be specified either at the Jobs are defined using yaml files. Options may be specified either at the
top-level or on individual seeds. A job id and at least one seed url top-level or on individual seeds. At least one seed url must be specified,
must be specified, everything else is optional. For details, see everything else is optional. For details, see `<job-conf.rst>`_.
`<job-conf.rst>`_.
:: ::

View file

@@ -1,7 +1,7 @@
''' '''
brozzler/frontier.py - RethinkDbFrontier manages crawl jobs, sites and pages brozzler/frontier.py - RethinkDbFrontier manages crawl jobs, sites and pages
Copyright (C) 2014-2016 Internet Archive Copyright (C) 2014-2017 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
@@ -99,6 +99,7 @@ class RethinkDbFrontier:
if not job.id: if not job.id:
# only if "id" has not already been set # only if "id" has not already been set
job.id = result["generated_keys"][0] job.id = result["generated_keys"][0]
return job
def new_site(self, site): def new_site(self, site):
self.logger.info("inserting into 'sites' table %s", site) self.logger.info("inserting into 'sites' table %s", site)

View file

@@ -2,7 +2,7 @@
brozzler/job.py - Job class representing a brozzler crawl job, and functions brozzler/job.py - Job class representing a brozzler crawl job, and functions
for setting up a job with supplied configuration for setting up a job with supplied configuration
Copyright (C) 2014-2016 Internet Archive Copyright (C) 2014-2017 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
@@ -61,17 +61,22 @@ def merge(a, b):
return a return a
def new_job_file(frontier, job_conf_file): def new_job_file(frontier, job_conf_file):
'''Returns new Job.'''
logging.info("loading %s", job_conf_file) logging.info("loading %s", job_conf_file)
with open(job_conf_file) as f: with open(job_conf_file) as f:
job_conf = yaml.load(f) job_conf = yaml.load(f)
new_job(frontier, job_conf) return new_job(frontier, job_conf)
def new_job(frontier, job_conf): def new_job(frontier, job_conf):
'''Returns new Job.'''
validate_conf(job_conf) validate_conf(job_conf)
job = Job( job = Job(
id=job_conf.get("id"), conf=job_conf, status="ACTIVE", id=job_conf.get("id"), conf=job_conf, status="ACTIVE",
started=rethinkstuff.utcnow()) started=rethinkstuff.utcnow())
# insert the job now to make sure it has an id
job = frontier.new_job(job)
sites = [] sites = []
for seed_conf in job_conf["seeds"]: for seed_conf in job_conf["seeds"]:
merged_conf = merge(seed_conf, job_conf) merged_conf = merge(seed_conf, job_conf)
@@ -92,11 +97,10 @@ def new_job(frontier, job_conf):
password=merged_conf.get("password")) password=merged_conf.get("password"))
sites.append(site) sites.append(site)
# insert all the sites into database before the job
for site in sites: for site in sites:
new_site(frontier, site) new_site(frontier, site)
frontier.new_job(job) return job
def new_site(frontier, site): def new_site(frontier, site):
site.id = str(uuid.uuid4()) site.id = str(uuid.uuid4())

View file

@@ -2,7 +2,7 @@ id:
type: type:
- string - string
- integer - integer
required: true required: false
<<: &multi_level_options <<: &multi_level_options
time_limit: time_limit:

View file

@@ -2,8 +2,8 @@ brozzler job configuration
************************** **************************
Jobs are defined using yaml files. Options may be specified either at the Jobs are defined using yaml files. Options may be specified either at the
top-level or on individual seeds. A job id and at least one seed url top-level or on individual seeds. At least one seed url must be specified,
must be specified, everything else is optional. everything else is optional.
an example an example
========== ==========
@@ -85,11 +85,11 @@ settings reference
id id
-- --
+-----------+--------+----------+---------+ +-----------+--------+----------+--------------------------+
| scope | type | required | default | | scope | type | required | default |
+===========+========+==========+=========+ +===========+========+==========+==========================+
| top-level | string | yes? | *n/a* | | top-level | string | no | *generated by rethinkdb* |
+-----------+--------+----------+---------+ +-----------+--------+----------+--------------------------+
An arbitrary identifier for this job. Must be unique across this deployment of An arbitrary identifier for this job. Must be unique across this deployment of
brozzler. brozzler.

View file

@@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup( setuptools.setup(
name='brozzler', name='brozzler',
version='1.1b9.dev181', version='1.1b9.dev182',
description='Distributed web crawling with browsers', description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler', url='https://github.com/internetarchive/brozzler',
author='Noah Levitt', author='Noah Levitt',