diff --git a/bin/brozzler-new-job b/bin/brozzler-new-job index f949a8d..f2c799c 100755 --- a/bin/brozzler-new-job +++ b/bin/brozzler-new-job @@ -14,7 +14,7 @@ arg_parser = argparse.ArgumentParser(prog=os.path.basename(__file__), formatter_class=argparse.ArgumentDefaultsHelpFormatter) arg_parser.add_argument('job_conf_file', metavar='JOB_CONF_FILE', help='brozzler job configuration file in yaml') arg_parser.add_argument("--db", dest="db", default="localhost", - help="comma-separated list of RethinkDB server addresses, e.g. db0.example.com,db0.example.com:39015,db1.example.com") + help="comma-separated list of RethinkDB server addresses, e.g. db0.foo.org,db0.foo.org:38015,db1.foo.org") arg_parser.add_argument("-v", "--verbose", dest="log_level", action="store_const", default=logging.INFO, const=logging.DEBUG) arg_parser.add_argument("--version", action="version", @@ -69,7 +69,7 @@ for seed_conf in seeds: extra_headers=extra_headers) sites.append(site) -db = brozzler.BrozzlerRethinkDb(args.db.split(",")) +frontier = brozzler.RethinkDbFrontier(args.db.split(",")) for site in sites: - brozzler.new_site(db, site) + brozzler.new_site(frontier, site) diff --git a/bin/brozzler-new-site b/bin/brozzler-new-site index 0c90122..ca97b9a 100755 --- a/bin/brozzler-new-site +++ b/bin/brozzler-new-site @@ -43,6 +43,6 @@ site = brozzler.Site(seed=args.seed, proxy=args.proxy, enable_warcprox_features=args.enable_warcprox_features, extra_headers=extra_headers) -db = brozzler.BrozzlerRethinkDb(args.db.split(",")) -brozzler.new_site(db, site) +frontier = brozzler.RethinkDbFrontier(args.db.split(",")) +brozzler.new_site(frontier, site) diff --git a/bin/brozzler-worker b/bin/brozzler-worker index 184d4e2..c7bd3f0 100755 --- a/bin/brozzler-worker +++ b/bin/brozzler-worker @@ -15,7 +15,7 @@ import traceback arg_parser = argparse.ArgumentParser(prog=os.path.basename(__file__), formatter_class=argparse.ArgumentDefaultsHelpFormatter) arg_parser.add_argument("--db", dest="db", default="localhost", - help="comma-separated list of RethinkDB server addresses, e.g. db0.example.org,db0.example.org:39015,db1.example.org") + help="comma-separated list of RethinkDB server addresses, e.g. db0.foo.org,db0.foo.org:38015,db1.foo.org") arg_parser.add_argument('-e', '--executable', dest='chrome_exe', default='chromium-browser', help='executable to use to invoke chrome') arg_parser.add_argument('-n', '--max-browsers', dest='max_browsers', default='1', @@ -49,8 +49,8 @@ signal.signal(signal.SIGQUIT, dump_state) signal.signal(signal.SIGTERM, sigterm) signal.signal(signal.SIGINT, sigint) -db = brozzler.BrozzlerRethinkDb(args.db.split(",")) -worker = brozzler.BrozzlerWorker(db, max_browsers=int(args.max_browsers), chrome_exe=args.chrome_exe) +frontier = brozzler.RethinkDbFrontier(args.db.split(",")) +worker = brozzler.BrozzlerWorker(frontier, max_browsers=int(args.max_browsers), chrome_exe=args.chrome_exe) worker.start() diff --git a/brozzler/__init__.py b/brozzler/__init__.py index bfac5f9..e76d059 100644 --- a/brozzler/__init__.py +++ b/brozzler/__init__.py @@ -3,7 +3,7 @@ import logging as _logging from brozzler.site import Page, Site from brozzler.worker import BrozzlerWorker from brozzler.robots import is_permitted_by_robots -from brozzler.db import BrozzlerRethinkDb +from brozzler.frontier import RethinkDbFrontier from brozzler.browser import Browser, BrowserPool def _read_version(): diff --git a/brozzler/db.py b/brozzler/frontier.py similarity index 99% rename from brozzler/db.py rename to brozzler/frontier.py index dc5dc06..100784a 100644 --- a/brozzler/db.py +++ b/brozzler/frontier.py @@ -9,7 +9,7 @@ import random class UnexpectedDbResult(Exception): pass -class BrozzlerRethinkDb: +class RethinkDbFrontier: logger = logging.getLogger(__module__ + "." + __qualname__) def __init__(self, servers=["localhost"], db="brozzler", shards=3, replicas=3): diff --git a/brozzler/worker.py b/brozzler/worker.py index 98ae5f1..9c0c39e 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -13,8 +13,8 @@ import json class BrozzlerWorker: logger = logging.getLogger(__module__ + "." + __qualname__) - def __init__(self, db, max_browsers=1, chrome_exe="chromium-browser"): - self._db = db + def __init__(self, frontier, max_browsers=1, chrome_exe="chromium-browser"): + self._frontier = frontier self._max_browsers = max_browsers self._browser_pool = brozzler.browser.BrowserPool(max_browsers, chrome_exe=chrome_exe, ignore_cert_errors=True) @@ -46,21 +46,21 @@ class BrozzlerWorker: page.brozzle_count += 1 page.claimed = False # XXX set priority? - self._db.update_page(page) + self._frontier.update_page(page) if page.redirect_url and page.hops_from_seed == 0: site.note_seed_redirect(page.redirect_url) - self._db.update_site(site) + self._frontier.update_site(site) def _disclaim_site(self, site, page=None): self.logger.info("disclaiming %s", site) site.claimed = False - if not page and not self._db.has_outstanding_pages(site): + if not page and not self._frontier.has_outstanding_pages(site): self.logger.info("site FINISHED! %s", site) site.status = "FINISHED" - self._db.update_site(site) + self._frontier.update_site(site) if page: page.claimed = False - self._db.update_page(page) + self._frontier.update_page(page) def _putmeta(self, warcprox_address, url, content_type, payload, extra_headers=None): headers = {"Content-Type":content_type} @@ -109,13 +109,13 @@ class BrozzlerWorker: if site.is_in_scope(url, parent_page): if brozzler.is_permitted_by_robots(site, url): new_child_page = brozzler.Page(url, site_id=site.id, hops_from_seed=parent_page.hops_from_seed+1) - existing_child_page = self._db.get_page(new_child_page) + existing_child_page = self._frontier.get_page(new_child_page) if existing_child_page: existing_child_page.priority += new_child_page.priority - self._db.update_page(existing_child_page) + self._frontier.update_page(existing_child_page) counts["updated"] += 1 else: - self._db.new_page(new_child_page) + self._frontier.new_page(new_child_page) counts["added"] += 1 else: counts["blocked"] += 1 @@ -152,7 +152,7 @@ class BrozzlerWorker: try: browser.start(proxy=site.proxy) while not self._shutdown_requested.is_set() and time.time() - start < 300: - page = self._db.claim_page(site) + page = self._frontier.claim_page(site) self.brozzle_page(browser, ydl, site, page) self._completed_page(site, page) page = None @@ -176,7 +176,7 @@ class BrozzlerWorker: try: browser = self._browser_pool.acquire() try: - site = self._db.claim_site() + site = self._frontier.claim_site() self.logger.info("brozzling site %s", site) ydl = self._youtube_dl(site) th = threading.Thread(target=lambda: self._brozzle_site(browser, ydl, site),