mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 08:39:59 -05:00
Merge pull request #17 from ato/validate-conf
Validate job conf against a simple schema
This commit is contained in:
commit
692c1c48e1
@ -182,7 +182,12 @@ def brozzler_new_job():
|
||||
r = rethinkstuff.Rethinker(
|
||||
args.rethinkdb_servers.split(','), args.rethinkdb_db)
|
||||
frontier = brozzler.RethinkDbFrontier(r)
|
||||
try:
|
||||
brozzler.job.new_job_file(frontier, args.job_conf_file)
|
||||
except brozzler.job.InvalidJobConf as e:
|
||||
print('brozzler-new-job: invalid job file:', args.job_conf_file, file=sys.stderr)
|
||||
print(' ' + yaml.dump(e.errors).rstrip().replace('\n', '\n '), file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
def brozzler_new_site():
|
||||
'''
|
||||
|
@ -24,6 +24,28 @@ import json
|
||||
import datetime
|
||||
import uuid
|
||||
import rethinkstuff
|
||||
import os
|
||||
import cerberus
|
||||
import urllib
|
||||
|
||||
def load_schema():
    """Load the job configuration schema that ships next to this module.

    Returns:
        The parsed contents of ``job_schema.yml``, read from the directory
        containing this source file.
    """
    schema_file = os.path.join(os.path.dirname(__file__), 'job_schema.yml')
    with open(schema_file) as f:
        # safe_load only constructs plain YAML types (anchors and `<<` merge
        # keys are still resolved). A bare yaml.load(f) can construct
        # arbitrary objects and is rejected by PyYAML releases that require
        # an explicit Loader argument.
        return yaml.safe_load(f)
|
||||
|
||||
class JobValidator(cerberus.Validator):
    """Cerberus validator extended with a custom ``url`` type."""

    def _validate_type_url(self, value):
        """Check whether *value* is a URL with a supported scheme.

        Args:
            value: the candidate value taken from the document being
                validated.

        Returns:
            True if *value* is a string whose scheme is ``http``, ``https``
            or ``ftp``; False otherwise.
        """
        # Imported here because a bare `import urllib` does not guarantee
        # that the `urllib.parse` submodule has been loaded.
        import urllib.parse

        if not isinstance(value, str):
            # urlparse() raises on non-string input such as an integer;
            # report that as a failed type check instead of crashing.
            return False
        return urllib.parse.urlparse(value).scheme in ('http', 'https', 'ftp')
|
||||
|
||||
class InvalidJobConf(Exception):
    """Raised when a job configuration fails validation.

    Args:
        errors: the validation errors describing what is wrong with the
            configuration; stored unchanged on the ``errors`` attribute.
    """

    def __init__(self, errors):
        # Forward to Exception so that e.args, str(e) and tracebacks carry
        # the errors instead of being empty.
        super().__init__(errors)
        self.errors = errors
|
||||
|
||||
def validate_conf(job_conf, schema=load_schema()):
    """Validate *job_conf* against *schema*.

    Args:
        job_conf: the job configuration to check.
        schema: the schema handed to ``JobValidator``. The default is the
            result of ``load_schema()``, evaluated once when this function
            is defined.

    Raises:
        InvalidJobConf: if validation fails; the validator's errors are
            attached to the exception.
    """
    validator = JobValidator(schema)
    if validator.validate(job_conf):
        return
    raise InvalidJobConf(validator.errors)
|
||||
|
||||
def merge(a, b):
|
||||
if isinstance(a, dict) and isinstance(b, dict):
|
||||
@ -45,6 +67,7 @@ def new_job_file(frontier, job_conf_file):
|
||||
new_job(frontier, job_conf)
|
||||
|
||||
def new_job(frontier, job_conf):
|
||||
validate_conf(job_conf)
|
||||
job = Job(
|
||||
id=job_conf.get("id"), conf=job_conf, status="ACTIVE",
|
||||
started=rethinkstuff.utcnow())
|
||||
@ -52,8 +75,6 @@ def new_job(frontier, job_conf):
|
||||
sites = []
|
||||
for seed_conf in job_conf["seeds"]:
|
||||
merged_conf = merge(seed_conf, job_conf)
|
||||
# XXX check for unknown settings, invalid url, etc
|
||||
|
||||
site = brozzler.Site(
|
||||
job_id=job.id, seed=merged_conf["url"],
|
||||
scope=merged_conf.get("scope"),
|
||||
|
79
brozzler/job_schema.yml
Normal file
79
brozzler/job_schema.yml
Normal file
@ -0,0 +1,79 @@
|
||||
id:
|
||||
type: string
|
||||
required: true
|
||||
|
||||
<<: &multi_level_options
|
||||
time_limit:
|
||||
type: number
|
||||
min: 0
|
||||
|
||||
enable_warcprox_features:
|
||||
type: boolean
|
||||
|
||||
ignore_robots:
|
||||
type: boolean
|
||||
|
||||
warcprox_meta:
|
||||
type: dict
|
||||
nullable: true
|
||||
|
||||
proxy:
|
||||
type: string
|
||||
nullable: true
|
||||
|
||||
scope:
|
||||
type: dict
|
||||
schema:
|
||||
surt:
|
||||
type: string
|
||||
|
||||
accepts:
|
||||
type: list
|
||||
schema: &scope_rule
|
||||
type: dict
|
||||
schema:
|
||||
|
||||
domain:
|
||||
type: string
|
||||
|
||||
url_match:
|
||||
type: string
|
||||
allowed:
|
||||
- STRING_MATCH
|
||||
- SURT_MATCH
|
||||
- REGEX_MATCH
|
||||
|
||||
value:
|
||||
type: string
|
||||
dependencies:
|
||||
- url_match
|
||||
|
||||
blocks:
|
||||
type: list
|
||||
schema: *scope_rule
|
||||
|
||||
max_hops:
|
||||
type: integer
|
||||
|
||||
max_hops_off_surt:
|
||||
type: integer
|
||||
|
||||
remember_outlinks:
|
||||
type: boolean
|
||||
|
||||
metadata:
|
||||
type: dict
|
||||
|
||||
seeds:
|
||||
type: list
|
||||
required: true
|
||||
minlength: 1
|
||||
schema:
|
||||
type: dict
|
||||
schema:
|
||||
|
||||
url:
|
||||
type: url
|
||||
required: true
|
||||
|
||||
<<: *multi_level_options
|
Loading…
x
Reference in New Issue
Block a user