mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 08:39:59 -05:00
refactor to simplify starting new job from code
This commit is contained in:
parent
68de85022a
commit
efa640c640
@ -20,7 +20,7 @@ cd brozzler
|
|||||||
# set up virtualenv if desired
|
# set up virtualenv if desired
|
||||||
pip install -r requirements.txt .
|
pip install -r requirements.txt .
|
||||||
```
|
```
|
||||||
Brozzler also requires a rabbitmq server.
|
Brozzler also requires rethinkdb.
|
||||||
|
|
||||||
Fonts for good screenshots
|
Fonts for good screenshots
|
||||||
--------------------------
|
--------------------------
|
||||||
|
@ -24,52 +24,6 @@ args = arg_parser.parse_args(args=sys.argv[1:])
|
|||||||
logging.basicConfig(stream=sys.stdout, level=args.log_level,
|
logging.basicConfig(stream=sys.stdout, level=args.log_level,
|
||||||
format="%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s")
|
format="%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s")
|
||||||
|
|
||||||
def merge(a, b):
|
|
||||||
if isinstance(a, dict) and isinstance(b, dict):
|
|
||||||
merged = dict(a)
|
|
||||||
b_tmp = dict(b)
|
|
||||||
for k in a:
|
|
||||||
merged[k] = merge(a[k], b_tmp.pop(k, None))
|
|
||||||
merged.update(b_tmp)
|
|
||||||
return merged
|
|
||||||
elif isinstance(a, list) and isinstance(b, list):
|
|
||||||
return a + b
|
|
||||||
else:
|
|
||||||
return a
|
|
||||||
|
|
||||||
logging.info("loading %s", args.job_conf_file)
|
|
||||||
with open(args.job_conf_file) as f:
|
|
||||||
job_conf = yaml.load(f)
|
|
||||||
logging.info("job_conf=%s", job_conf)
|
|
||||||
|
|
||||||
seeds = job_conf.pop("seeds")
|
|
||||||
# logging.info("=== global settings ===\n%s", yaml.dump(job_conf))
|
|
||||||
|
|
||||||
sites = []
|
|
||||||
for seed_conf in seeds:
|
|
||||||
if "id" in seed_conf:
|
|
||||||
seed_conf.pop("id")
|
|
||||||
merged_conf = merge(seed_conf, job_conf)
|
|
||||||
# XXX check for unknown settings, invalid url, etc
|
|
||||||
# logging.info("merge(%s, %s) = %s", seed_conf, global_conf, merged_conf)
|
|
||||||
# logging.info("=== seed_conf ===\n%s", yaml.dump(seed_conf))
|
|
||||||
# logging.info("=== merged_conf ===\n%s", yaml.dump(merged_conf))
|
|
||||||
|
|
||||||
extra_headers = None
|
|
||||||
if "warcprox_meta" in merged_conf:
|
|
||||||
warcprox_meta = json.dumps(merged_conf["warcprox_meta"], separators=(',', ':'))
|
|
||||||
extra_headers = {"Warcprox-Meta":warcprox_meta}
|
|
||||||
|
|
||||||
site = brozzler.Site(seed=merged_conf["url"],
|
|
||||||
scope=merged_conf.get("scope"),
|
|
||||||
time_limit=merged_conf.get("time_limit"),
|
|
||||||
proxy=merged_conf.get("proxy"),
|
|
||||||
ignore_robots=merged_conf.get("ignore_robots"),
|
|
||||||
enable_warcprox_features=merged_conf.get("enable_warcprox_features"),
|
|
||||||
extra_headers=extra_headers)
|
|
||||||
sites.append(site)
|
|
||||||
|
|
||||||
frontier = brozzler.RethinkDbFrontier(args.db.split(","))
|
frontier = brozzler.RethinkDbFrontier(args.db.split(","))
|
||||||
for site in sites:
|
brozzler.job.new_job_file(frontier, job_conf_file)
|
||||||
brozzler.new_site(frontier, site)
|
|
||||||
|
|
||||||
|
@ -5,6 +5,7 @@ from brozzler.worker import BrozzlerWorker
|
|||||||
from brozzler.robots import is_permitted_by_robots
|
from brozzler.robots import is_permitted_by_robots
|
||||||
from brozzler.frontier import RethinkDbFrontier
|
from brozzler.frontier import RethinkDbFrontier
|
||||||
from brozzler.browser import Browser, BrowserPool
|
from brozzler.browser import Browser, BrowserPool
|
||||||
|
from brozzler.job import new_job, new_site
|
||||||
|
|
||||||
def _read_version():
|
def _read_version():
|
||||||
import os
|
import os
|
||||||
@ -44,17 +45,4 @@ class ReachedLimit(Exception):
|
|||||||
def __str__(self):
|
def __str__(self):
|
||||||
return self.__repr__()
|
return self.__repr__()
|
||||||
|
|
||||||
def new_site(db, site):
|
|
||||||
_logging.info("new site {}".format(site))
|
|
||||||
db.new_site(site)
|
|
||||||
try:
|
|
||||||
if is_permitted_by_robots(site, site.seed):
|
|
||||||
page = Page(site.seed, site_id=site.id, hops_from_seed=0, priority=1000)
|
|
||||||
db.new_page(page)
|
|
||||||
else:
|
|
||||||
_logging.warn("seed url {} is blocked by robots.txt".format(site.seed))
|
|
||||||
except ReachedLimit as e:
|
|
||||||
site.note_limit_reached(e)
|
|
||||||
db.update_site(site)
|
|
||||||
|
|
||||||
# vim: set sw=4 et:
|
# vim: set sw=4 et:
|
||||||
|
73
brozzler/job.py
Normal file
73
brozzler/job.py
Normal file
@ -0,0 +1,73 @@
|
|||||||
|
# vim: set sw=4 et:
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import brozzler
|
||||||
|
import yaml
|
||||||
|
import json
|
||||||
|
|
||||||
|
def merge(a, b):
|
||||||
|
if isinstance(a, dict) and isinstance(b, dict):
|
||||||
|
merged = dict(a)
|
||||||
|
b_tmp = dict(b)
|
||||||
|
for k in a:
|
||||||
|
merged[k] = merge(a[k], b_tmp.pop(k, None))
|
||||||
|
merged.update(b_tmp)
|
||||||
|
return merged
|
||||||
|
elif isinstance(a, list) and isinstance(b, list):
|
||||||
|
return a + b
|
||||||
|
else:
|
||||||
|
return a
|
||||||
|
|
||||||
|
def new_job_file(frontier, job_conf_file):
|
||||||
|
logging.info("loading %s", args.job_conf_file)
|
||||||
|
with open(args.job_conf_file) as f:
|
||||||
|
job_conf = yaml.load(f)
|
||||||
|
new_job(frontier, job_conf)
|
||||||
|
|
||||||
|
def new_job(frontier, job_conf):
|
||||||
|
# logging.info("job_conf=%s", job_conf)
|
||||||
|
seeds = job_conf.pop("seeds")
|
||||||
|
# logging.info("=== global settings ===\n%s", yaml.dump(job_conf))
|
||||||
|
|
||||||
|
sites = []
|
||||||
|
for seed_conf in seeds:
|
||||||
|
if "id" in seed_conf:
|
||||||
|
seed_conf.pop("id")
|
||||||
|
merged_conf = merge(seed_conf, job_conf)
|
||||||
|
# XXX check for unknown settings, invalid url, etc
|
||||||
|
# logging.info("merge(%s, %s) = %s", seed_conf, global_conf, merged_conf)
|
||||||
|
# logging.info("=== seed_conf ===\n%s", yaml.dump(seed_conf))
|
||||||
|
# logging.info("=== merged_conf ===\n%s", yaml.dump(merged_conf))
|
||||||
|
|
||||||
|
extra_headers = None
|
||||||
|
if "warcprox_meta" in merged_conf:
|
||||||
|
warcprox_meta = json.dumps(merged_conf["warcprox_meta"], separators=(',', ':'))
|
||||||
|
extra_headers = {"Warcprox-Meta":warcprox_meta}
|
||||||
|
|
||||||
|
site = brozzler.Site(seed=merged_conf["url"],
|
||||||
|
scope=merged_conf.get("scope"),
|
||||||
|
time_limit=merged_conf.get("time_limit"),
|
||||||
|
proxy=merged_conf.get("proxy"),
|
||||||
|
ignore_robots=merged_conf.get("ignore_robots"),
|
||||||
|
enable_warcprox_features=merged_conf.get("enable_warcprox_features"),
|
||||||
|
extra_headers=extra_headers)
|
||||||
|
sites.append(site)
|
||||||
|
|
||||||
|
# frontier = brozzler.RethinkDbFrontier(args.db.split(","))
|
||||||
|
for site in sites:
|
||||||
|
new_site(frontier, site)
|
||||||
|
|
||||||
|
def new_site(frontier, site):
|
||||||
|
logging.info("new site {}".format(site))
|
||||||
|
frontier.new_site(site)
|
||||||
|
try:
|
||||||
|
if is_permitted_by_robots(site, site.seed):
|
||||||
|
page = Page(site.seed, site_id=site.id, hops_from_seed=0, priority=1000)
|
||||||
|
frontier.new_page(page)
|
||||||
|
else:
|
||||||
|
logging.warn("seed url {} is blocked by robots.txt".format(site.seed))
|
||||||
|
except ReachedLimit as e:
|
||||||
|
site.note_limit_reached(e)
|
||||||
|
frontier.update_site(site)
|
||||||
|
|
||||||
|
|
Loading…
x
Reference in New Issue
Block a user