refactor to simplify starting new job from code

This commit is contained in:
Noah Levitt 2015-08-25 19:52:33 +00:00
parent 68de85022a
commit efa640c640
4 changed files with 76 additions and 61 deletions

View File

@ -20,7 +20,7 @@ cd brozzler
# set up virtualenv if desired # set up virtualenv if desired
pip install -r requirements.txt . pip install -r requirements.txt .
``` ```
Brozzler also requires a rabbitmq server. Brozzler also requires rethinkdb.
Fonts for good screenshots Fonts for good screenshots
-------------------------- --------------------------

View File

@ -24,52 +24,6 @@ args = arg_parser.parse_args(args=sys.argv[1:])
logging.basicConfig(stream=sys.stdout, level=args.log_level, logging.basicConfig(stream=sys.stdout, level=args.log_level,
format="%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s") format="%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s")
def merge(a, b):
if isinstance(a, dict) and isinstance(b, dict):
merged = dict(a)
b_tmp = dict(b)
for k in a:
merged[k] = merge(a[k], b_tmp.pop(k, None))
merged.update(b_tmp)
return merged
elif isinstance(a, list) and isinstance(b, list):
return a + b
else:
return a
logging.info("loading %s", args.job_conf_file)
with open(args.job_conf_file) as f:
job_conf = yaml.load(f)
logging.info("job_conf=%s", job_conf)
seeds = job_conf.pop("seeds")
# logging.info("=== global settings ===\n%s", yaml.dump(job_conf))
sites = []
for seed_conf in seeds:
if "id" in seed_conf:
seed_conf.pop("id")
merged_conf = merge(seed_conf, job_conf)
# XXX check for unknown settings, invalid url, etc
# logging.info("merge(%s, %s) = %s", seed_conf, global_conf, merged_conf)
# logging.info("=== seed_conf ===\n%s", yaml.dump(seed_conf))
# logging.info("=== merged_conf ===\n%s", yaml.dump(merged_conf))
extra_headers = None
if "warcprox_meta" in merged_conf:
warcprox_meta = json.dumps(merged_conf["warcprox_meta"], separators=(',', ':'))
extra_headers = {"Warcprox-Meta":warcprox_meta}
site = brozzler.Site(seed=merged_conf["url"],
scope=merged_conf.get("scope"),
time_limit=merged_conf.get("time_limit"),
proxy=merged_conf.get("proxy"),
ignore_robots=merged_conf.get("ignore_robots"),
enable_warcprox_features=merged_conf.get("enable_warcprox_features"),
extra_headers=extra_headers)
sites.append(site)
frontier = brozzler.RethinkDbFrontier(args.db.split(",")) frontier = brozzler.RethinkDbFrontier(args.db.split(","))
for site in sites: brozzler.job.new_job_file(frontier, job_conf_file)
brozzler.new_site(frontier, site)

View File

@ -5,6 +5,7 @@ from brozzler.worker import BrozzlerWorker
from brozzler.robots import is_permitted_by_robots from brozzler.robots import is_permitted_by_robots
from brozzler.frontier import RethinkDbFrontier from brozzler.frontier import RethinkDbFrontier
from brozzler.browser import Browser, BrowserPool from brozzler.browser import Browser, BrowserPool
from brozzler.job import new_job, new_site
def _read_version(): def _read_version():
import os import os
@ -44,17 +45,4 @@ class ReachedLimit(Exception):
def __str__(self): def __str__(self):
return self.__repr__() return self.__repr__()
def new_site(db, site):
_logging.info("new site {}".format(site))
db.new_site(site)
try:
if is_permitted_by_robots(site, site.seed):
page = Page(site.seed, site_id=site.id, hops_from_seed=0, priority=1000)
db.new_page(page)
else:
_logging.warn("seed url {} is blocked by robots.txt".format(site.seed))
except ReachedLimit as e:
site.note_limit_reached(e)
db.update_site(site)
# vim: set sw=4 et: # vim: set sw=4 et:

73
brozzler/job.py Normal file
View File

@ -0,0 +1,73 @@
# vim: set sw=4 et:
import logging
import brozzler
import yaml
import json
def merge(a, b):
    """Recursively combine two configuration values.

    Dicts are merged key by key (recursing into shared keys), lists are
    concatenated, and for anything else *a* wins. Keys present only in *b*
    are carried over. Neither input is mutated; a new object is returned
    for the dict/list cases.
    """
    if isinstance(a, dict) and isinstance(b, dict):
        leftovers = dict(b)
        # recurse on keys from a; pop matching keys out of b's copy so
        # whatever remains in leftovers is b-only and can be added wholesale
        combined = {key: merge(val, leftovers.pop(key, None))
                    for key, val in a.items()}
        combined.update(leftovers)
        return combined
    if isinstance(a, list) and isinstance(b, list):
        return a + b
    return a
def new_job_file(frontier, job_conf_file):
    """Load a yaml job configuration from *job_conf_file* and submit it.

    Reads the file, parses it as yaml, and hands the resulting dict to
    new_job() which creates the sites in *frontier*.
    """
    # fix: original referenced args.job_conf_file, but "args" does not
    # exist in this module -- use the parameter instead
    logging.info("loading %s", job_conf_file)
    with open(job_conf_file) as f:
        # XXX yaml.load is unsafe on untrusted input (can construct
        # arbitrary python objects); consider yaml.safe_load
        job_conf = yaml.load(f)
    new_job(frontier, job_conf)
def new_job(frontier, job_conf):
    """Create and enqueue the sites described by a job configuration.

    *job_conf* is a dict with a "seeds" list plus job-wide settings; each
    seed's own settings are merged over the job-wide ones (seed values win
    for scalars, see merge()). One brozzler.Site is built per seed and
    registered with *frontier* via new_site().

    Note: mutates *job_conf* (pops "seeds") and the individual seed dicts
    (pops "id").
    """
    seeds = job_conf.pop("seeds")

    sites = []
    for seed_conf in seeds:
        # drop any database id so the merged conf describes a fresh site
        seed_conf.pop("id", None)
        merged_conf = merge(seed_conf, job_conf)
        # XXX check for unknown settings, invalid url, etc

        extra_headers = None
        if "warcprox_meta" in merged_conf:
            # compact json so the header value has no spaces
            warcprox_meta = json.dumps(merged_conf["warcprox_meta"], separators=(',', ':'))
            extra_headers = {"Warcprox-Meta":warcprox_meta}

        site = brozzler.Site(seed=merged_conf["url"],
                scope=merged_conf.get("scope"),
                time_limit=merged_conf.get("time_limit"),
                proxy=merged_conf.get("proxy"),
                ignore_robots=merged_conf.get("ignore_robots"),
                enable_warcprox_features=merged_conf.get("enable_warcprox_features"),
                extra_headers=extra_headers)
        sites.append(site)

    for site in sites:
        new_site(frontier, site)
def new_site(frontier, site):
    """Register *site* with *frontier* and enqueue its seed page.

    The seed page is only enqueued if robots.txt permits it; if the robots
    check raises ReachedLimit (e.g. a warcprox limit was hit), the limit is
    recorded on the site and the site is updated in the frontier.
    """
    logging.info("new site {}".format(site))
    frontier.new_site(site)
    try:
        # fix: is_permitted_by_robots, Page and ReachedLimit are not
        # imported into this module; reach them through the brozzler
        # package, which re-exports them (TODO confirm Page is exported)
        if brozzler.is_permitted_by_robots(site, site.seed):
            page = brozzler.Page(site.seed, site_id=site.id, hops_from_seed=0, priority=1000)
            frontier.new_page(page)
        else:
            # logging.warn is a deprecated alias of logging.warning
            logging.warning("seed url {} is blocked by robots.txt".format(site.seed))
    except brozzler.ReachedLimit as e:
        site.note_limit_reached(e)
        frontier.update_site(site)