mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-19 23:35:54 -04:00
let rethinkdb generate job.id if not supplied in configuration
This commit is contained in:
parent
129a1e8f47
commit
5a0301ac12
@ -97,9 +97,8 @@ Job Configuration
|
||||
-----------------
|
||||
|
||||
Jobs are defined using yaml files. Options may be specified either at the
|
||||
top-level or on individual seeds. A job id and at least one seed url
|
||||
must be specified, everything else is optional. For details, see
|
||||
`<job-conf.rst>`_.
|
||||
top-level or on individual seeds. At least one seed URL must be specified;
|
||||
everything else is optional. For details, see `<job-conf.rst>`_.
|
||||
|
||||
::
|
||||
|
||||
@ -238,7 +237,7 @@ option:
|
||||
brozzler-worker --chrome-exe ~/bin/headless_chromium.sh
|
||||
|
||||
To render Flash content, `download <https://get.adobe.com/flashplayer/otherversions/>`_
|
||||
and extract the Linux (.tar.gz) PPAPI plugin. Configure Headless Chromium
|
||||
and extract the Linux (.tar.gz) PPAPI plugin. Configure Headless Chromium
|
||||
to load the plugin by adding this option to your wrapper script:
|
||||
|
||||
::
|
||||
|
@ -1,7 +1,7 @@
|
||||
'''
|
||||
brozzler/frontier.py - RethinkDbFrontier manages crawl jobs, sites and pages
|
||||
|
||||
Copyright (C) 2014-2016 Internet Archive
|
||||
Copyright (C) 2014-2017 Internet Archive
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
@ -99,6 +99,7 @@ class RethinkDbFrontier:
|
||||
if not job.id:
|
||||
# only if "id" has not already been set
|
||||
job.id = result["generated_keys"][0]
|
||||
return job
|
||||
|
||||
def new_site(self, site):
|
||||
self.logger.info("inserting into 'sites' table %s", site)
|
||||
|
@ -2,7 +2,7 @@
|
||||
brozzler/job.py - Job class representing a brozzler crawl job, and functions
|
||||
for setting up a job with supplied configuration
|
||||
|
||||
Copyright (C) 2014-2016 Internet Archive
|
||||
Copyright (C) 2014-2017 Internet Archive
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
@ -61,17 +61,22 @@ def merge(a, b):
|
||||
return a
|
||||
|
||||
def new_job_file(frontier, job_conf_file):
|
||||
'''Returns new Job.'''
|
||||
logging.info("loading %s", job_conf_file)
|
||||
with open(job_conf_file) as f:
|
||||
job_conf = yaml.load(f)
|
||||
new_job(frontier, job_conf)
|
||||
return new_job(frontier, job_conf)
|
||||
|
||||
def new_job(frontier, job_conf):
|
||||
'''Returns new Job.'''
|
||||
validate_conf(job_conf)
|
||||
job = Job(
|
||||
id=job_conf.get("id"), conf=job_conf, status="ACTIVE",
|
||||
started=rethinkstuff.utcnow())
|
||||
|
||||
# insert the job now to make sure it has an id
|
||||
job = frontier.new_job(job)
|
||||
|
||||
sites = []
|
||||
for seed_conf in job_conf["seeds"]:
|
||||
merged_conf = merge(seed_conf, job_conf)
|
||||
@ -92,11 +97,10 @@ def new_job(frontier, job_conf):
|
||||
password=merged_conf.get("password"))
|
||||
sites.append(site)
|
||||
|
||||
# insert all the sites into database before the job
|
||||
for site in sites:
|
||||
new_site(frontier, site)
|
||||
|
||||
frontier.new_job(job)
|
||||
return job
|
||||
|
||||
def new_site(frontier, site):
|
||||
site.id = str(uuid.uuid4())
|
||||
|
@ -2,7 +2,7 @@ id:
|
||||
type:
|
||||
- string
|
||||
- integer
|
||||
required: true
|
||||
required: false
|
||||
|
||||
<<: &multi_level_options
|
||||
time_limit:
|
||||
|
14
job-conf.rst
14
job-conf.rst
@ -2,8 +2,8 @@ brozzler job configuration
|
||||
**************************
|
||||
|
||||
Jobs are defined using yaml files. Options may be specified either at the
|
||||
top-level or on individual seeds. A job id and at least one seed url
|
||||
must be specified, everything else is optional.
|
||||
top-level or on individual seeds. At least one seed URL must be specified;
|
||||
everything else is optional.
|
||||
|
||||
an example
|
||||
==========
|
||||
@ -85,11 +85,11 @@ settings reference
|
||||
|
||||
id
|
||||
--
|
||||
+-----------+--------+----------+---------+
|
||||
| scope | type | required | default |
|
||||
+===========+========+==========+=========+
|
||||
| top-level | string | yes? | *n/a* |
|
||||
+-----------+--------+----------+---------+
|
||||
+-----------+--------+----------+--------------------------+
|
||||
| scope | type | required | default |
|
||||
+===========+========+==========+==========================+
|
||||
| top-level | string | no | *generated by rethinkdb* |
|
||||
+-----------+--------+----------+--------------------------+
|
||||
An arbitrary identifier for this job. Must be unique across this deployment of
|
||||
brozzler.
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user