mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 00:29:53 -05:00
refactor to simplify starting new job from code
This commit is contained in:
parent
68de85022a
commit
efa640c640
@ -20,7 +20,7 @@ cd brozzler
|
||||
# set up virtualenv if desired
|
||||
pip install -r requirements.txt .
|
||||
```
|
||||
Brozzler also requires a rabbitmq server.
|
||||
Brozzler also requires rethinkdb.
|
||||
|
||||
Fonts for good screenshots
|
||||
--------------------------
|
||||
|
@ -24,52 +24,6 @@ args = arg_parser.parse_args(args=sys.argv[1:])
|
||||
logging.basicConfig(stream=sys.stdout, level=args.log_level,
|
||||
format="%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s")
|
||||
|
||||
def merge(a, b):
|
||||
if isinstance(a, dict) and isinstance(b, dict):
|
||||
merged = dict(a)
|
||||
b_tmp = dict(b)
|
||||
for k in a:
|
||||
merged[k] = merge(a[k], b_tmp.pop(k, None))
|
||||
merged.update(b_tmp)
|
||||
return merged
|
||||
elif isinstance(a, list) and isinstance(b, list):
|
||||
return a + b
|
||||
else:
|
||||
return a
|
||||
|
||||
logging.info("loading %s", args.job_conf_file)
|
||||
with open(args.job_conf_file) as f:
|
||||
job_conf = yaml.load(f)
|
||||
logging.info("job_conf=%s", job_conf)
|
||||
|
||||
seeds = job_conf.pop("seeds")
|
||||
# logging.info("=== global settings ===\n%s", yaml.dump(job_conf))
|
||||
|
||||
sites = []
|
||||
for seed_conf in seeds:
|
||||
if "id" in seed_conf:
|
||||
seed_conf.pop("id")
|
||||
merged_conf = merge(seed_conf, job_conf)
|
||||
# XXX check for unknown settings, invalid url, etc
|
||||
# logging.info("merge(%s, %s) = %s", seed_conf, global_conf, merged_conf)
|
||||
# logging.info("=== seed_conf ===\n%s", yaml.dump(seed_conf))
|
||||
# logging.info("=== merged_conf ===\n%s", yaml.dump(merged_conf))
|
||||
|
||||
extra_headers = None
|
||||
if "warcprox_meta" in merged_conf:
|
||||
warcprox_meta = json.dumps(merged_conf["warcprox_meta"], separators=(',', ':'))
|
||||
extra_headers = {"Warcprox-Meta":warcprox_meta}
|
||||
|
||||
site = brozzler.Site(seed=merged_conf["url"],
|
||||
scope=merged_conf.get("scope"),
|
||||
time_limit=merged_conf.get("time_limit"),
|
||||
proxy=merged_conf.get("proxy"),
|
||||
ignore_robots=merged_conf.get("ignore_robots"),
|
||||
enable_warcprox_features=merged_conf.get("enable_warcprox_features"),
|
||||
extra_headers=extra_headers)
|
||||
sites.append(site)
|
||||
|
||||
frontier = brozzler.RethinkDbFrontier(args.db.split(","))
|
||||
for site in sites:
|
||||
brozzler.new_site(frontier, site)
|
||||
brozzler.job.new_job_file(frontier, job_conf_file)
|
||||
|
||||
|
@ -5,6 +5,7 @@ from brozzler.worker import BrozzlerWorker
|
||||
from brozzler.robots import is_permitted_by_robots
|
||||
from brozzler.frontier import RethinkDbFrontier
|
||||
from brozzler.browser import Browser, BrowserPool
|
||||
from brozzler.job import new_job, new_site
|
||||
|
||||
def _read_version():
|
||||
import os
|
||||
@ -44,17 +45,4 @@ class ReachedLimit(Exception):
|
||||
def __str__(self):
|
||||
return self.__repr__()
|
||||
|
||||
def new_site(db, site):
|
||||
_logging.info("new site {}".format(site))
|
||||
db.new_site(site)
|
||||
try:
|
||||
if is_permitted_by_robots(site, site.seed):
|
||||
page = Page(site.seed, site_id=site.id, hops_from_seed=0, priority=1000)
|
||||
db.new_page(page)
|
||||
else:
|
||||
_logging.warn("seed url {} is blocked by robots.txt".format(site.seed))
|
||||
except ReachedLimit as e:
|
||||
site.note_limit_reached(e)
|
||||
db.update_site(site)
|
||||
|
||||
# vim: set sw=4 et:
|
||||
|
73
brozzler/job.py
Normal file
73
brozzler/job.py
Normal file
@ -0,0 +1,73 @@
|
||||
# vim: set sw=4 et:
|
||||
|
||||
import logging
|
||||
import brozzler
|
||||
import yaml
|
||||
import json
|
||||
|
||||
def merge(a, b):
|
||||
if isinstance(a, dict) and isinstance(b, dict):
|
||||
merged = dict(a)
|
||||
b_tmp = dict(b)
|
||||
for k in a:
|
||||
merged[k] = merge(a[k], b_tmp.pop(k, None))
|
||||
merged.update(b_tmp)
|
||||
return merged
|
||||
elif isinstance(a, list) and isinstance(b, list):
|
||||
return a + b
|
||||
else:
|
||||
return a
|
||||
|
||||
def new_job_file(frontier, job_conf_file):
|
||||
logging.info("loading %s", args.job_conf_file)
|
||||
with open(args.job_conf_file) as f:
|
||||
job_conf = yaml.load(f)
|
||||
new_job(frontier, job_conf)
|
||||
|
||||
def new_job(frontier, job_conf):
|
||||
# logging.info("job_conf=%s", job_conf)
|
||||
seeds = job_conf.pop("seeds")
|
||||
# logging.info("=== global settings ===\n%s", yaml.dump(job_conf))
|
||||
|
||||
sites = []
|
||||
for seed_conf in seeds:
|
||||
if "id" in seed_conf:
|
||||
seed_conf.pop("id")
|
||||
merged_conf = merge(seed_conf, job_conf)
|
||||
# XXX check for unknown settings, invalid url, etc
|
||||
# logging.info("merge(%s, %s) = %s", seed_conf, global_conf, merged_conf)
|
||||
# logging.info("=== seed_conf ===\n%s", yaml.dump(seed_conf))
|
||||
# logging.info("=== merged_conf ===\n%s", yaml.dump(merged_conf))
|
||||
|
||||
extra_headers = None
|
||||
if "warcprox_meta" in merged_conf:
|
||||
warcprox_meta = json.dumps(merged_conf["warcprox_meta"], separators=(',', ':'))
|
||||
extra_headers = {"Warcprox-Meta":warcprox_meta}
|
||||
|
||||
site = brozzler.Site(seed=merged_conf["url"],
|
||||
scope=merged_conf.get("scope"),
|
||||
time_limit=merged_conf.get("time_limit"),
|
||||
proxy=merged_conf.get("proxy"),
|
||||
ignore_robots=merged_conf.get("ignore_robots"),
|
||||
enable_warcprox_features=merged_conf.get("enable_warcprox_features"),
|
||||
extra_headers=extra_headers)
|
||||
sites.append(site)
|
||||
|
||||
# frontier = brozzler.RethinkDbFrontier(args.db.split(","))
|
||||
for site in sites:
|
||||
new_site(frontier, site)
|
||||
|
||||
def new_site(frontier, site):
|
||||
logging.info("new site {}".format(site))
|
||||
frontier.new_site(site)
|
||||
try:
|
||||
if is_permitted_by_robots(site, site.seed):
|
||||
page = Page(site.seed, site_id=site.id, hops_from_seed=0, priority=1000)
|
||||
frontier.new_page(page)
|
||||
else:
|
||||
logging.warn("seed url {} is blocked by robots.txt".format(site.seed))
|
||||
except ReachedLimit as e:
|
||||
site.note_limit_reached(e)
|
||||
frontier.update_site(site)
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user