From 5ac67fe513e1962abb77a8eb0beb4df3d0a394e5 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Tue, 4 Oct 2016 21:16:16 +1100
Subject: [PATCH 1/2] Validate job conf against a Cerberus schema

---
Review notes (this section is ignored by git am):
- job.py imports urllib.parse explicitly; a bare "import urllib" does not
  guarantee that the parse submodule has been loaded.
- The schema is read with yaml.safe_load, which still resolves the anchors,
  aliases and merge keys used in job_schema.yml.
- The url type check reports non-string values as invalid instead of
  letting urlparse() raise on them.
- InvalidJobConf passes its errors to Exception.__init__ so str(e) and
  tracebacks are not empty.
- Line breaks and indentation were restored from a whitespace-collapsed
  copy of this series; check context-line indentation against the tree
  before applying. The job.py index line still carries the original
  post-image hash.

 brozzler/cli.py         |  7 +++++-
 brozzler/job.py         | 37 +++++++++++++++++--
 brozzler/job_schema.yml | 77 +++++++++++++++++++++++++++++++++++++++++
 setup.py                |  1 +
 4 files changed, 119 insertions(+), 3 deletions(-)
 create mode 100644 brozzler/job_schema.yml

diff --git a/brozzler/cli.py b/brozzler/cli.py
index 7d7e7a7..1265290 100644
--- a/brozzler/cli.py
+++ b/brozzler/cli.py
@@ -182,7 +182,12 @@ def brozzler_new_job():
     r = rethinkstuff.Rethinker(
             args.rethinkdb_servers.split(','), args.rethinkdb_db)
     frontier = brozzler.RethinkDbFrontier(r)
-    brozzler.job.new_job_file(frontier, args.job_conf_file)
+    try:
+        brozzler.job.new_job_file(frontier, args.job_conf_file)
+    except brozzler.job.InvalidJobConf as e:
+        print('brozzler-new-job: invalid job file:', args.job_conf_file, file=sys.stderr)
+        print(' ' + yaml.dump(e.errors).rstrip().replace('\n', '\n '), file=sys.stderr)
+        sys.exit(1)

 def brozzler_new_site():
     '''
diff --git a/brozzler/job.py b/brozzler/job.py
index 85e955d..3a7b874 100644
--- a/brozzler/job.py
+++ b/brozzler/job.py
@@ -24,6 +24,40 @@ import json
 import datetime
 import uuid
 import rethinkstuff
+import os
+import cerberus
+import urllib.parse
+
+def load_schema():
+    '''Loads and returns the job conf schema from job_schema.yml.'''
+    schema_file = os.path.join(os.path.dirname(__file__), 'job_schema.yml')
+    with open(schema_file) as f:
+        # safe_load still handles the anchors and merge keys the schema uses
+        return yaml.safe_load(f)
+
+class JobValidator(cerberus.Validator):
+    '''Cerberus validator that adds the `url` type used by the job schema.'''
+    def _validate_type_url(self, value):
+        # non-strings (e.g. ints) make urlparse() raise; call them invalid
+        if not isinstance(value, str):
+            return False
+        url = urllib.parse.urlparse(value)
+        return url.scheme in ('http', 'https', 'ftp')
+
+class InvalidJobConf(Exception):
+    '''Raised when a job conf fails validation; see the errors attribute.'''
+    def __init__(self, errors):
+        super().__init__(errors)
+        self.errors = errors
+
+def validate_conf(job_conf, schema=load_schema()):
+    '''
+    Raises InvalidJobConf if job_conf does not validate against schema,
+    which defaults to job_schema.yml as loaded when this module is imported.
+    '''
+    v = JobValidator(schema)
+    if not v.validate(job_conf):
+        raise InvalidJobConf(v.errors)

 def merge(a, b):
     if isinstance(a, dict) and isinstance(b, dict):
@@ -45,6 +79,7 @@ def new_job_file(frontier, job_conf_file):
         new_job(frontier, job_conf)

 def new_job(frontier, job_conf):
+    validate_conf(job_conf)
     job = Job(
             id=job_conf.get("id"), conf=job_conf, status="ACTIVE",
             started=rethinkstuff.utcnow())
@@ -52,8 +87,6 @@ def new_job(frontier, job_conf):
     sites = []
     for seed_conf in job_conf["seeds"]:
         merged_conf = merge(seed_conf, job_conf)
-        # XXX check for unknown settings, invalid url, etc
-
         site = brozzler.Site(
                 job_id=job.id, seed=merged_conf["url"],
                 scope=merged_conf.get("scope"),
diff --git a/brozzler/job_schema.yml b/brozzler/job_schema.yml
new file mode 100644
index 0000000..e9d2078
--- /dev/null
+++ b/brozzler/job_schema.yml
@@ -0,0 +1,77 @@
+id:
+  type: string
+  required: true
+
+<<: &multi_level_options
+  time_limit:
+    type: number
+    min: 0
+
+  enable_warcprox_features:
+    type: boolean
+
+  ignore_robots:
+    type: boolean
+
+  warcprox_meta:
+    type: dict
+    nullable: true
+
+  proxy:
+    type: string
+    nullable: true
+
+  scope:
+    type: dict
+    schema:
+      surt:
+        type: string
+
+      accepts:
+        type: list
+        schema: &scope_rule
+          type: dict
+          schema:
+
+            domain:
+              type: string
+
+            url_match:
+              type: string
+              allowed:
+                - STRING_MATCH
+                - SURT_MATCH
+                - REGEX_MATCH
+
+            value:
+              type: string
+              dependencies:
+                - url_match
+
+      blocks:
+        type: list
+        schema: *scope_rule
+
+      max_hops:
+        type: integer
+
+      max_hops_off_surt:
+        type: integer
+
+  remember_outlinks:
+    type: boolean
+
+  metadata:
+    type: dict
+
+seeds:
+  type: list
+  schema:
+    type: dict
+    schema:
+
+      url:
+        type: url
+        required: true
+
+      <<: *multi_level_options
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 620216a..4f20061 100644
--- a/setup.py
+++ b/setup.py
@@ -67,6 +67,7 @@ setuptools.setup(
             'rethinkstuff>=0.1.5',
             'rethinkdb>=2.3,<2.4',
             'psutil==4.3.0',
+            'cerberus==1.0.1',
             ],
         extras_require={
             'webconsole': ['flask>=0.11', 'gunicorn'],

From eafc65938b3bc7230ee0133eecc703592c05a313 Mon Sep 17 00:00:00 2001
From: Alex Osborne
Date: Wed, 5 Oct 2016 04:31:22 +1100
Subject: [PATCH 2/2] Enforce at least one seed

---
 brozzler/job_schema.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/brozzler/job_schema.yml b/brozzler/job_schema.yml
index e9d2078..a8f4b4e 100644
--- a/brozzler/job_schema.yml
+++ b/brozzler/job_schema.yml
@@ -66,6 +66,8 @@ id:

 seeds:
   type: list
+  required: true
+  minlength: 1
   schema:
     type: dict
     schema: