Merge pull request #17 from ato/validate-conf

Validate job conf against a simple schema
This commit is contained in:
Noah Levitt 2016-10-04 14:13:47 -07:00 committed by GitHub
commit 692c1c48e1
4 changed files with 109 additions and 3 deletions

View File

@ -182,7 +182,12 @@ def brozzler_new_job():
r = rethinkstuff.Rethinker(
args.rethinkdb_servers.split(','), args.rethinkdb_db)
frontier = brozzler.RethinkDbFrontier(r)
try:
brozzler.job.new_job_file(frontier, args.job_conf_file)
except brozzler.job.InvalidJobConf as e:
print('brozzler-new-job: invalid job file:', args.job_conf_file, file=sys.stderr)
print(' ' + yaml.dump(e.errors).rstrip().replace('\n', '\n '), file=sys.stderr)
sys.exit(1)
def brozzler_new_site():
'''

View File

@ -24,6 +24,28 @@ import json
import datetime
import uuid
import rethinkstuff
import os
import cerberus
import urllib
def load_schema():
    """Load the job configuration schema stored next to this module.

    Returns:
        The parsed contents of ``job_schema.yml`` found in this module's
        directory.
    """
    schema_file = os.path.join(os.path.dirname(__file__), 'job_schema.yml')
    with open(schema_file) as f:
        # safe_load only builds plain YAML types (anchors and `<<` merge keys
        # are still resolved) and, unlike a bare yaml.load(f), does not rely
        # on an implicit Loader that newer PyYAML releases no longer accept.
        return yaml.safe_load(f)
class JobValidator(cerberus.Validator):
    """Cerberus validator extended with a custom ``url`` type."""

    def _validate_type_url(self, value):
        """Type check for schema fields declared with ``type: url``.

        Args:
            value: the candidate value taken from the document being
                validated.

        Returns:
            True when ``value`` is a string whose URL scheme is ``http``,
            ``https`` or ``ftp``; False otherwise, including for non-string
            values and strings that cannot be parsed as a URL.
        """
        # Imported here because a bare `import urllib` does not guarantee that
        # the `parse` submodule has been loaded.
        import urllib.parse

        # urlparse() raises on non-string input such as numbers or None;
        # report those as invalid instead of aborting the whole validation.
        if not isinstance(value, str):
            return False
        try:
            url = urllib.parse.urlparse(value)
        except ValueError:
            # e.g. an unbalanced bracket in the host part ("http://[")
            return False
        return url.scheme in ('http', 'https', 'ftp')
class InvalidJobConf(Exception):
    """Signals that a job configuration did not pass validation."""

    def __init__(self, errors):
        """Remember the validation errors on the exception.

        Args:
            errors: the error report describing what is wrong with the job
                configuration; stored unchanged as ``self.errors``.
        """
        # Kept as an attribute so handlers can render the report themselves.
        self.errors = errors
def validate_conf(job_conf, schema=load_schema()):
    """Validate a job configuration and raise if it does not conform.

    Args:
        job_conf: the job configuration to check.
        schema: the schema handed to ``JobValidator``. The default is the
            result of ``load_schema()``, evaluated once when this function is
            defined.

    Raises:
        InvalidJobConf: when validation fails; carries the validator's
            ``errors``.
    """
    validator = JobValidator(schema)
    if validator.validate(job_conf):
        return
    raise InvalidJobConf(validator.errors)
def merge(a, b):
if isinstance(a, dict) and isinstance(b, dict):
@ -45,6 +67,7 @@ def new_job_file(frontier, job_conf_file):
new_job(frontier, job_conf)
def new_job(frontier, job_conf):
validate_conf(job_conf)
job = Job(
id=job_conf.get("id"), conf=job_conf, status="ACTIVE",
started=rethinkstuff.utcnow())
@ -52,8 +75,6 @@ def new_job(frontier, job_conf):
sites = []
for seed_conf in job_conf["seeds"]:
merged_conf = merge(seed_conf, job_conf)
# XXX check for unknown settings, invalid url, etc
site = brozzler.Site(
job_id=job.id, seed=merged_conf["url"],
scope=merged_conf.get("scope"),

79
brozzler/job_schema.yml Normal file
View File

@ -0,0 +1,79 @@
# Schema used to validate brozzler job configuration files.
# NOTE(review): nesting is not visible in this view; the comments below
# describe individual rules only — confirm placement against the real file.

# Job identifier; must be present.
id:
type: string
required: true
# Options anchored as `multi_level_options`; the same anchor is merged again
# into the per-seed schema at the end of this file.
<<: &multi_level_options
time_limit:
type: number
min: 0
enable_warcprox_features:
type: boolean
ignore_robots:
type: boolean
warcprox_meta:
type: dict
nullable: true
proxy:
type: string
nullable: true
scope:
type: dict
schema:
surt:
type: string
accepts:
type: list
# Rule shape anchored as `scope_rule`; reused for `blocks` below.
schema: &scope_rule
type: dict
schema:
domain:
type: string
url_match:
type: string
allowed:
- STRING_MATCH
- SURT_MATCH
- REGEX_MATCH
value:
type: string
# `value` depends on `url_match` being given as well.
dependencies:
- url_match
blocks:
type: list
schema: *scope_rule
max_hops:
type: integer
max_hops_off_surt:
type: integer
remember_outlinks:
type: boolean
metadata:
type: dict
# Seed list; must be present and contain at least one entry.
seeds:
type: list
required: true
minlength: 1
schema:
type: dict
schema:
url:
# `url` is a custom type supplied by the validator class, not a Cerberus
# built-in.
type: url
required: true
<<: *multi_level_options

View File

@ -67,6 +67,7 @@ setuptools.setup(
'rethinkstuff>=0.1.5',
'rethinkdb>=2.3,<2.4',
'psutil==4.3.0',
'cerberus==1.0.1',
],
extras_require={
'webconsole': ['flask>=0.11', 'gunicorn'],