rename "db" to "frontier"

This commit is contained in:
Noah Levitt 2015-08-19 17:47:05 +00:00
parent cd3a644298
commit b8506a2ab4
6 changed files with 22 additions and 22 deletions

View file

@ -14,7 +14,7 @@ arg_parser = argparse.ArgumentParser(prog=os.path.basename(__file__),
formatter_class=argparse.ArgumentDefaultsHelpFormatter) formatter_class=argparse.ArgumentDefaultsHelpFormatter)
arg_parser.add_argument('job_conf_file', metavar='JOB_CONF_FILE', help='brozzler job configuration file in yaml') arg_parser.add_argument('job_conf_file', metavar='JOB_CONF_FILE', help='brozzler job configuration file in yaml')
arg_parser.add_argument("--db", dest="db", default="localhost", arg_parser.add_argument("--db", dest="db", default="localhost",
help="comma-separated list of RethinkDB server addresses, e.g. db0.example.com,db0.example.com:39015,db1.example.com") help="comma-separated list of RethinkDB server addresses, e.g. db0.foo.org,db0.foo.org:38015,db1.foo.org")
arg_parser.add_argument("-v", "--verbose", dest="log_level", arg_parser.add_argument("-v", "--verbose", dest="log_level",
action="store_const", default=logging.INFO, const=logging.DEBUG) action="store_const", default=logging.INFO, const=logging.DEBUG)
arg_parser.add_argument("--version", action="version", arg_parser.add_argument("--version", action="version",
@ -69,7 +69,7 @@ for seed_conf in seeds:
extra_headers=extra_headers) extra_headers=extra_headers)
sites.append(site) sites.append(site)
db = brozzler.BrozzlerRethinkDb(args.db.split(",")) frontier = brozzler.RethinkDbFrontier(args.db.split(","))
for site in sites: for site in sites:
brozzler.new_site(db, site) brozzler.new_site(frontier, site)

View file

@ -43,6 +43,6 @@ site = brozzler.Site(seed=args.seed, proxy=args.proxy,
enable_warcprox_features=args.enable_warcprox_features, enable_warcprox_features=args.enable_warcprox_features,
extra_headers=extra_headers) extra_headers=extra_headers)
db = brozzler.BrozzlerRethinkDb(args.db.split(",")) frontier = brozzler.RethinkDbFrontier(args.db.split(","))
brozzler.new_site(db, site) brozzler.new_site(frontier, site)

View file

@ -15,7 +15,7 @@ import traceback
arg_parser = argparse.ArgumentParser(prog=os.path.basename(__file__), arg_parser = argparse.ArgumentParser(prog=os.path.basename(__file__),
formatter_class=argparse.ArgumentDefaultsHelpFormatter) formatter_class=argparse.ArgumentDefaultsHelpFormatter)
arg_parser.add_argument("--db", dest="db", default="localhost", arg_parser.add_argument("--db", dest="db", default="localhost",
help="comma-separated list of RethinkDB server addresses, e.g. db0.example.org,db0.example.org:39015,db1.example.org") help="comma-separated list of RethinkDB server addresses, e.g. db0.foo.org,db0.foo.org:38015,db1.foo.org")
arg_parser.add_argument('-e', '--executable', dest='chrome_exe', default='chromium-browser', arg_parser.add_argument('-e', '--executable', dest='chrome_exe', default='chromium-browser',
help='executable to use to invoke chrome') help='executable to use to invoke chrome')
arg_parser.add_argument('-n', '--max-browsers', dest='max_browsers', default='1', arg_parser.add_argument('-n', '--max-browsers', dest='max_browsers', default='1',
@ -49,8 +49,8 @@ signal.signal(signal.SIGQUIT, dump_state)
signal.signal(signal.SIGTERM, sigterm) signal.signal(signal.SIGTERM, sigterm)
signal.signal(signal.SIGINT, sigint) signal.signal(signal.SIGINT, sigint)
db = brozzler.BrozzlerRethinkDb(args.db.split(",")) frontier = brozzler.RethinkDbFrontier(args.db.split(","))
worker = brozzler.BrozzlerWorker(db, max_browsers=int(args.max_browsers), chrome_exe=args.chrome_exe) worker = brozzler.BrozzlerWorker(frontier, max_browsers=int(args.max_browsers), chrome_exe=args.chrome_exe)
worker.start() worker.start()

View file

@ -3,7 +3,7 @@ import logging as _logging
from brozzler.site import Page, Site from brozzler.site import Page, Site
from brozzler.worker import BrozzlerWorker from brozzler.worker import BrozzlerWorker
from brozzler.robots import is_permitted_by_robots from brozzler.robots import is_permitted_by_robots
from brozzler.db import BrozzlerRethinkDb from brozzler.frontier import RethinkDbFrontier
from brozzler.browser import Browser, BrowserPool from brozzler.browser import Browser, BrowserPool
def _read_version(): def _read_version():

View file

@ -9,7 +9,7 @@ import random
class UnexpectedDbResult(Exception): class UnexpectedDbResult(Exception):
pass pass
class BrozzlerRethinkDb: class RethinkDbFrontier:
logger = logging.getLogger(__module__ + "." + __qualname__) logger = logging.getLogger(__module__ + "." + __qualname__)
def __init__(self, servers=["localhost"], db="brozzler", shards=3, replicas=3): def __init__(self, servers=["localhost"], db="brozzler", shards=3, replicas=3):

View file

@ -13,8 +13,8 @@ import json
class BrozzlerWorker: class BrozzlerWorker:
logger = logging.getLogger(__module__ + "." + __qualname__) logger = logging.getLogger(__module__ + "." + __qualname__)
def __init__(self, db, max_browsers=1, chrome_exe="chromium-browser"): def __init__(self, frontier, max_browsers=1, chrome_exe="chromium-browser"):
self._db = db self._frontier = frontier
self._max_browsers = max_browsers self._max_browsers = max_browsers
self._browser_pool = brozzler.browser.BrowserPool(max_browsers, self._browser_pool = brozzler.browser.BrowserPool(max_browsers,
chrome_exe=chrome_exe, ignore_cert_errors=True) chrome_exe=chrome_exe, ignore_cert_errors=True)
@ -46,21 +46,21 @@ class BrozzlerWorker:
page.brozzle_count += 1 page.brozzle_count += 1
page.claimed = False page.claimed = False
# XXX set priority? # XXX set priority?
self._db.update_page(page) self._frontier.update_page(page)
if page.redirect_url and page.hops_from_seed == 0: if page.redirect_url and page.hops_from_seed == 0:
site.note_seed_redirect(page.redirect_url) site.note_seed_redirect(page.redirect_url)
self._db.update_site(site) self._frontier.update_site(site)
def _disclaim_site(self, site, page=None): def _disclaim_site(self, site, page=None):
self.logger.info("disclaiming %s", site) self.logger.info("disclaiming %s", site)
site.claimed = False site.claimed = False
if not page and not self._db.has_outstanding_pages(site): if not page and not self._frontier.has_outstanding_pages(site):
self.logger.info("site FINISHED! %s", site) self.logger.info("site FINISHED! %s", site)
site.status = "FINISHED" site.status = "FINISHED"
self._db.update_site(site) self._frontier.update_site(site)
if page: if page:
page.claimed = False page.claimed = False
self._db.update_page(page) self._frontier.update_page(page)
def _putmeta(self, warcprox_address, url, content_type, payload, extra_headers=None): def _putmeta(self, warcprox_address, url, content_type, payload, extra_headers=None):
headers = {"Content-Type":content_type} headers = {"Content-Type":content_type}
@ -109,13 +109,13 @@ class BrozzlerWorker:
if site.is_in_scope(url, parent_page): if site.is_in_scope(url, parent_page):
if brozzler.is_permitted_by_robots(site, url): if brozzler.is_permitted_by_robots(site, url):
new_child_page = brozzler.Page(url, site_id=site.id, hops_from_seed=parent_page.hops_from_seed+1) new_child_page = brozzler.Page(url, site_id=site.id, hops_from_seed=parent_page.hops_from_seed+1)
existing_child_page = self._db.get_page(new_child_page) existing_child_page = self._frontier.get_page(new_child_page)
if existing_child_page: if existing_child_page:
existing_child_page.priority += new_child_page.priority existing_child_page.priority += new_child_page.priority
self._db.update_page(existing_child_page) self._frontier.update_page(existing_child_page)
counts["updated"] += 1 counts["updated"] += 1
else: else:
self._db.new_page(new_child_page) self._frontier.new_page(new_child_page)
counts["added"] += 1 counts["added"] += 1
else: else:
counts["blocked"] += 1 counts["blocked"] += 1
@ -152,7 +152,7 @@ class BrozzlerWorker:
try: try:
browser.start(proxy=site.proxy) browser.start(proxy=site.proxy)
while not self._shutdown_requested.is_set() and time.time() - start < 300: while not self._shutdown_requested.is_set() and time.time() - start < 300:
page = self._db.claim_page(site) page = self._frontier.claim_page(site)
self.brozzle_page(browser, ydl, site, page) self.brozzle_page(browser, ydl, site, page)
self._completed_page(site, page) self._completed_page(site, page)
page = None page = None
@ -176,7 +176,7 @@ class BrozzlerWorker:
try: try:
browser = self._browser_pool.acquire() browser = self._browser_pool.acquire()
try: try:
site = self._db.claim_site() site = self._frontier.claim_site()
self.logger.info("brozzling site %s", site) self.logger.info("brozzling site %s", site)
ydl = self._youtube_dl(site) ydl = self._youtube_dl(site)
th = threading.Thread(target=lambda: self._brozzle_site(browser, ydl, site), th = threading.Thread(target=lambda: self._brozzle_site(browser, ydl, site),