mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 16:49:56 -05:00
Merge pull request #17 from ato/validate-conf
Validate job conf against a simple schema
This commit is contained in:
commit
692c1c48e1
@ -182,7 +182,12 @@ def brozzler_new_job():
|
|||||||
r = rethinkstuff.Rethinker(
|
r = rethinkstuff.Rethinker(
|
||||||
args.rethinkdb_servers.split(','), args.rethinkdb_db)
|
args.rethinkdb_servers.split(','), args.rethinkdb_db)
|
||||||
frontier = brozzler.RethinkDbFrontier(r)
|
frontier = brozzler.RethinkDbFrontier(r)
|
||||||
|
try:
|
||||||
brozzler.job.new_job_file(frontier, args.job_conf_file)
|
brozzler.job.new_job_file(frontier, args.job_conf_file)
|
||||||
|
except brozzler.job.InvalidJobConf as e:
|
||||||
|
print('brozzler-new-job: invalid job file:', args.job_conf_file, file=sys.stderr)
|
||||||
|
print(' ' + yaml.dump(e.errors).rstrip().replace('\n', '\n '), file=sys.stderr)
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
def brozzler_new_site():
|
def brozzler_new_site():
|
||||||
'''
|
'''
|
||||||
|
@ -24,6 +24,28 @@ import json
|
|||||||
import datetime
|
import datetime
|
||||||
import uuid
|
import uuid
|
||||||
import rethinkstuff
|
import rethinkstuff
|
||||||
|
import os
|
||||||
|
import cerberus
|
||||||
|
import urllib
|
||||||
|
|
||||||
|
def load_schema():
    """Load the job configuration schema that ships next to this module.

    Returns:
        The parsed contents of ``job_schema.yml``, read from the directory
        containing this module.
    """
    schema_file = os.path.join(os.path.dirname(__file__), 'job_schema.yml')
    with open(schema_file, encoding='utf-8') as f:
        # safe_load only constructs plain YAML types (it still resolves
        # anchors, aliases and `<<` merge keys); a bare yaml.load(f) can
        # instantiate arbitrary Python objects and is rejected outright by
        # newer PyYAML releases that require an explicit Loader.
        return yaml.safe_load(f)
|
||||||
|
|
||||||
|
class JobValidator(cerberus.Validator):
    """Cerberus validator extended with a custom ``url`` type."""

    def _validate_type_url(self, value):
        """Check whether ``value`` is a string holding an http, https or ftp URL.

        Returns True when the value is acceptable, False otherwise.
        """
        # A bare `import urllib` does not guarantee the `parse` submodule is
        # loaded, so import it explicitly where it is used.
        import urllib.parse

        # Non-string values (numbers, lists, None, ...) can never be a URL;
        # report them as invalid instead of letting urlparse() blow up.
        if not isinstance(value, str):
            return False
        try:
            url = urllib.parse.urlparse(value)
        except ValueError:
            # urlparse rejects some malformed inputs, e.g. an unbalanced
            # IPv6 bracket in the host part.
            return False
        return url.scheme in ('http', 'https', 'ftp')
|
||||||
|
|
||||||
|
class InvalidJobConf(Exception):
    """Raised when a job configuration does not pass validation.

    Args:
        errors: the validation errors describing what is wrong with the
            configuration; kept on the instance as ``errors``.
    """

    def __init__(self, errors):
        # Hand the errors to Exception as well so that str(e), e.args and
        # tracebacks of an uncaught exception show what failed instead of
        # an empty message.
        super().__init__(errors)
        self.errors = errors
|
||||||
|
|
||||||
|
def validate_conf(job_conf, schema=load_schema()):
    """Validate ``job_conf`` against ``schema``.

    The default schema is obtained from ``load_schema()`` once, when this
    function is defined.

    Raises:
        InvalidJobConf: if validation fails; the exception carries the
            validator's ``errors``.
    """
    validator = JobValidator(schema)
    is_valid = validator.validate(job_conf)
    if is_valid:
        return
    raise InvalidJobConf(validator.errors)
|
||||||
|
|
||||||
def merge(a, b):
|
def merge(a, b):
|
||||||
if isinstance(a, dict) and isinstance(b, dict):
|
if isinstance(a, dict) and isinstance(b, dict):
|
||||||
@ -45,6 +67,7 @@ def new_job_file(frontier, job_conf_file):
|
|||||||
new_job(frontier, job_conf)
|
new_job(frontier, job_conf)
|
||||||
|
|
||||||
def new_job(frontier, job_conf):
|
def new_job(frontier, job_conf):
|
||||||
|
validate_conf(job_conf)
|
||||||
job = Job(
|
job = Job(
|
||||||
id=job_conf.get("id"), conf=job_conf, status="ACTIVE",
|
id=job_conf.get("id"), conf=job_conf, status="ACTIVE",
|
||||||
started=rethinkstuff.utcnow())
|
started=rethinkstuff.utcnow())
|
||||||
@ -52,8 +75,6 @@ def new_job(frontier, job_conf):
|
|||||||
sites = []
|
sites = []
|
||||||
for seed_conf in job_conf["seeds"]:
|
for seed_conf in job_conf["seeds"]:
|
||||||
merged_conf = merge(seed_conf, job_conf)
|
merged_conf = merge(seed_conf, job_conf)
|
||||||
# XXX check for unknown settings, invalid url, etc
|
|
||||||
|
|
||||||
site = brozzler.Site(
|
site = brozzler.Site(
|
||||||
job_id=job.id, seed=merged_conf["url"],
|
job_id=job.id, seed=merged_conf["url"],
|
||||||
scope=merged_conf.get("scope"),
|
scope=merged_conf.get("scope"),
|
||||||
|
79
brozzler/job_schema.yml
Normal file
79
brozzler/job_schema.yml
Normal file
@ -0,0 +1,79 @@
|
|||||||
|
id:
|
||||||
|
type: string
|
||||||
|
required: true
|
||||||
|
|
||||||
|
<<: &multi_level_options
|
||||||
|
time_limit:
|
||||||
|
type: number
|
||||||
|
min: 0
|
||||||
|
|
||||||
|
enable_warcprox_features:
|
||||||
|
type: boolean
|
||||||
|
|
||||||
|
ignore_robots:
|
||||||
|
type: boolean
|
||||||
|
|
||||||
|
warcprox_meta:
|
||||||
|
type: dict
|
||||||
|
nullable: true
|
||||||
|
|
||||||
|
proxy:
|
||||||
|
type: string
|
||||||
|
nullable: true
|
||||||
|
|
||||||
|
scope:
|
||||||
|
type: dict
|
||||||
|
schema:
|
||||||
|
surt:
|
||||||
|
type: string
|
||||||
|
|
||||||
|
accepts:
|
||||||
|
type: list
|
||||||
|
schema: &scope_rule
|
||||||
|
type: dict
|
||||||
|
schema:
|
||||||
|
|
||||||
|
domain:
|
||||||
|
type: string
|
||||||
|
|
||||||
|
url_match:
|
||||||
|
type: string
|
||||||
|
allowed:
|
||||||
|
- STRING_MATCH
|
||||||
|
- SURT_MATCH
|
||||||
|
- REGEX_MATCH
|
||||||
|
|
||||||
|
value:
|
||||||
|
type: string
|
||||||
|
dependencies:
|
||||||
|
- url_match
|
||||||
|
|
||||||
|
blocks:
|
||||||
|
type: list
|
||||||
|
schema: *scope_rule
|
||||||
|
|
||||||
|
max_hops:
|
||||||
|
type: integer
|
||||||
|
|
||||||
|
max_hops_off_surt:
|
||||||
|
type: integer
|
||||||
|
|
||||||
|
remember_outlinks:
|
||||||
|
type: boolean
|
||||||
|
|
||||||
|
metadata:
|
||||||
|
type: dict
|
||||||
|
|
||||||
|
seeds:
|
||||||
|
type: list
|
||||||
|
required: true
|
||||||
|
minlength: 1
|
||||||
|
schema:
|
||||||
|
type: dict
|
||||||
|
schema:
|
||||||
|
|
||||||
|
url:
|
||||||
|
type: url
|
||||||
|
required: true
|
||||||
|
|
||||||
|
<<: *multi_level_options
|
1
setup.py
1
setup.py
@ -67,6 +67,7 @@ setuptools.setup(
|
|||||||
'rethinkstuff>=0.1.5',
|
'rethinkstuff>=0.1.5',
|
||||||
'rethinkdb>=2.3,<2.4',
|
'rethinkdb>=2.3,<2.4',
|
||||||
'psutil==4.3.0',
|
'psutil==4.3.0',
|
||||||
|
'cerberus==1.0.1',
|
||||||
],
|
],
|
||||||
extras_require={
|
extras_require={
|
||||||
'webconsole': ['flask>=0.11', 'gunicorn'],
|
'webconsole': ['flask>=0.11', 'gunicorn'],
|
||||||
|
Loading…
x
Reference in New Issue
Block a user