mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-06-20 12:54:23 -04:00
let rethinkdb generate job.id if not supplied in configuration
This commit is contained in:
parent
129a1e8f47
commit
5a0301ac12
6 changed files with 22 additions and 18 deletions
|
@ -97,9 +97,8 @@ Job Configuration
|
||||||
-----------------
|
-----------------
|
||||||
|
|
||||||
Jobs are defined using yaml files. Options may be specified either at the
|
Jobs are defined using yaml files. Options may be specified either at the
|
||||||
top-level or on individual seeds. A job id and at least one seed url
|
top-level or on individual seeds. At least one seed url must be specified,
|
||||||
must be specified, everything else is optional. For details, see
|
everything else is optional. For details, see `<job-conf.rst>`_.
|
||||||
`<job-conf.rst>`_.
|
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
'''
|
'''
|
||||||
brozzler/frontier.py - RethinkDbFrontier manages crawl jobs, sites and pages
|
brozzler/frontier.py - RethinkDbFrontier manages crawl jobs, sites and pages
|
||||||
|
|
||||||
Copyright (C) 2014-2016 Internet Archive
|
Copyright (C) 2014-2017 Internet Archive
|
||||||
|
|
||||||
Licensed under the Apache License, Version 2.0 (the "License");
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
you may not use this file except in compliance with the License.
|
you may not use this file except in compliance with the License.
|
||||||
|
@ -99,6 +99,7 @@ class RethinkDbFrontier:
|
||||||
if not job.id:
|
if not job.id:
|
||||||
# only if "id" has not already been set
|
# only if "id" has not already been set
|
||||||
job.id = result["generated_keys"][0]
|
job.id = result["generated_keys"][0]
|
||||||
|
return job
|
||||||
|
|
||||||
def new_site(self, site):
|
def new_site(self, site):
|
||||||
self.logger.info("inserting into 'sites' table %s", site)
|
self.logger.info("inserting into 'sites' table %s", site)
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
brozzler/job.py - Job class representing a brozzler crawl job, and functions
|
brozzler/job.py - Job class representing a brozzler crawl job, and functions
|
||||||
for setting up a job with supplied configuration
|
for setting up a job with supplied configuration
|
||||||
|
|
||||||
Copyright (C) 2014-2016 Internet Archive
|
Copyright (C) 2014-2017 Internet Archive
|
||||||
|
|
||||||
Licensed under the Apache License, Version 2.0 (the "License");
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
you may not use this file except in compliance with the License.
|
you may not use this file except in compliance with the License.
|
||||||
|
@ -61,17 +61,22 @@ def merge(a, b):
|
||||||
return a
|
return a
|
||||||
|
|
||||||
def new_job_file(frontier, job_conf_file):
|
def new_job_file(frontier, job_conf_file):
|
||||||
|
'''Returns new Job.'''
|
||||||
logging.info("loading %s", job_conf_file)
|
logging.info("loading %s", job_conf_file)
|
||||||
with open(job_conf_file) as f:
|
with open(job_conf_file) as f:
|
||||||
job_conf = yaml.load(f)
|
job_conf = yaml.load(f)
|
||||||
new_job(frontier, job_conf)
|
return new_job(frontier, job_conf)
|
||||||
|
|
||||||
def new_job(frontier, job_conf):
|
def new_job(frontier, job_conf):
|
||||||
|
'''Returns new Job.'''
|
||||||
validate_conf(job_conf)
|
validate_conf(job_conf)
|
||||||
job = Job(
|
job = Job(
|
||||||
id=job_conf.get("id"), conf=job_conf, status="ACTIVE",
|
id=job_conf.get("id"), conf=job_conf, status="ACTIVE",
|
||||||
started=rethinkstuff.utcnow())
|
started=rethinkstuff.utcnow())
|
||||||
|
|
||||||
|
# insert the job now to make sure it has an id
|
||||||
|
job = frontier.new_job(job)
|
||||||
|
|
||||||
sites = []
|
sites = []
|
||||||
for seed_conf in job_conf["seeds"]:
|
for seed_conf in job_conf["seeds"]:
|
||||||
merged_conf = merge(seed_conf, job_conf)
|
merged_conf = merge(seed_conf, job_conf)
|
||||||
|
@ -92,11 +97,10 @@ def new_job(frontier, job_conf):
|
||||||
password=merged_conf.get("password"))
|
password=merged_conf.get("password"))
|
||||||
sites.append(site)
|
sites.append(site)
|
||||||
|
|
||||||
# insert all the sites into database before the job
|
|
||||||
for site in sites:
|
for site in sites:
|
||||||
new_site(frontier, site)
|
new_site(frontier, site)
|
||||||
|
|
||||||
frontier.new_job(job)
|
return job
|
||||||
|
|
||||||
def new_site(frontier, site):
|
def new_site(frontier, site):
|
||||||
site.id = str(uuid.uuid4())
|
site.id = str(uuid.uuid4())
|
||||||
|
|
|
@ -2,7 +2,7 @@ id:
|
||||||
type:
|
type:
|
||||||
- string
|
- string
|
||||||
- integer
|
- integer
|
||||||
required: true
|
required: false
|
||||||
|
|
||||||
<<: &multi_level_options
|
<<: &multi_level_options
|
||||||
time_limit:
|
time_limit:
|
||||||
|
|
12
job-conf.rst
12
job-conf.rst
|
@ -2,8 +2,8 @@ brozzler job configuration
|
||||||
**************************
|
**************************
|
||||||
|
|
||||||
Jobs are defined using yaml files. Options may be specified either at the
|
Jobs are defined using yaml files. Options may be specified either at the
|
||||||
top-level or on individual seeds. A job id and at least one seed url
|
top-level or on individual seeds. At least one seed url must be specified,
|
||||||
must be specified, everything else is optional.
|
everything else is optional.
|
||||||
|
|
||||||
an example
|
an example
|
||||||
==========
|
==========
|
||||||
|
@ -85,11 +85,11 @@ settings reference
|
||||||
|
|
||||||
id
|
id
|
||||||
--
|
--
|
||||||
+-----------+--------+----------+---------+
|
+-----------+--------+----------+--------------------------+
|
||||||
| scope | type | required | default |
|
| scope | type | required | default |
|
||||||
+===========+========+==========+=========+
|
+===========+========+==========+==========================+
|
||||||
| top-level | string | yes? | *n/a* |
|
| top-level | string | no | *generated by rethinkdb* |
|
||||||
+-----------+--------+----------+---------+
|
+-----------+--------+----------+--------------------------+
|
||||||
An arbitrary identifier for this job. Must be unique across this deployment of
|
An arbitrary identifier for this job. Must be unique across this deployment of
|
||||||
brozzler.
|
brozzler.
|
||||||
|
|
||||||
|
|
2
setup.py
2
setup.py
|
@ -32,7 +32,7 @@ def find_package_data(package):
|
||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='brozzler',
|
name='brozzler',
|
||||||
version='1.1b9.dev181',
|
version='1.1b9.dev182',
|
||||||
description='Distributed web crawling with browsers',
|
description='Distributed web crawling with browsers',
|
||||||
url='https://github.com/internetarchive/brozzler',
|
url='https://github.com/internetarchive/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue