rename "db" to "frontier"

This commit is contained in:
Noah Levitt 2015-08-19 17:47:05 +00:00
parent cd3a644298
commit b8506a2ab4
6 changed files with 22 additions and 22 deletions

View File

@@ -14,7 +14,7 @@ arg_parser = argparse.ArgumentParser(prog=os.path.basename(__file__),
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
arg_parser.add_argument('job_conf_file', metavar='JOB_CONF_FILE', help='brozzler job configuration file in yaml')
arg_parser.add_argument("--db", dest="db", default="localhost",
help="comma-separated list of RethinkDB server addresses, e.g. db0.example.com,db0.example.com:39015,db1.example.com")
help="comma-separated list of RethinkDB server addresses, e.g. db0.foo.org,db0.foo.org:38015,db1.foo.org")
arg_parser.add_argument("-v", "--verbose", dest="log_level",
action="store_const", default=logging.INFO, const=logging.DEBUG)
arg_parser.add_argument("--version", action="version",
@@ -69,7 +69,7 @@ for seed_conf in seeds:
extra_headers=extra_headers)
sites.append(site)
db = brozzler.BrozzlerRethinkDb(args.db.split(","))
frontier = brozzler.RethinkDbFrontier(args.db.split(","))
for site in sites:
brozzler.new_site(db, site)
brozzler.new_site(frontier, site)

View File

@@ -43,6 +43,6 @@ site = brozzler.Site(seed=args.seed, proxy=args.proxy,
enable_warcprox_features=args.enable_warcprox_features,
extra_headers=extra_headers)
db = brozzler.BrozzlerRethinkDb(args.db.split(","))
brozzler.new_site(db, site)
frontier = brozzler.RethinkDbFrontier(args.db.split(","))
brozzler.new_site(frontier, site)

View File

@@ -15,7 +15,7 @@ import traceback
arg_parser = argparse.ArgumentParser(prog=os.path.basename(__file__),
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
arg_parser.add_argument("--db", dest="db", default="localhost",
help="comma-separated list of RethinkDB server addresses, e.g. db0.example.org,db0.example.org:39015,db1.example.org")
help="comma-separated list of RethinkDB server addresses, e.g. db0.foo.org,db0.foo.org:38015,db1.foo.org")
arg_parser.add_argument('-e', '--executable', dest='chrome_exe', default='chromium-browser',
help='executable to use to invoke chrome')
arg_parser.add_argument('-n', '--max-browsers', dest='max_browsers', default='1',
@@ -49,8 +49,8 @@ signal.signal(signal.SIGQUIT, dump_state)
signal.signal(signal.SIGTERM, sigterm)
signal.signal(signal.SIGINT, sigint)
db = brozzler.BrozzlerRethinkDb(args.db.split(","))
worker = brozzler.BrozzlerWorker(db, max_browsers=int(args.max_browsers), chrome_exe=args.chrome_exe)
frontier = brozzler.RethinkDbFrontier(args.db.split(","))
worker = brozzler.BrozzlerWorker(frontier, max_browsers=int(args.max_browsers), chrome_exe=args.chrome_exe)
worker.start()

View File

@@ -3,7 +3,7 @@ import logging as _logging
from brozzler.site import Page, Site
from brozzler.worker import BrozzlerWorker
from brozzler.robots import is_permitted_by_robots
from brozzler.db import BrozzlerRethinkDb
from brozzler.frontier import RethinkDbFrontier
from brozzler.browser import Browser, BrowserPool
def _read_version():

View File

@@ -9,7 +9,7 @@ import random
class UnexpectedDbResult(Exception):
pass
class BrozzlerRethinkDb:
class RethinkDbFrontier:
logger = logging.getLogger(__module__ + "." + __qualname__)
def __init__(self, servers=["localhost"], db="brozzler", shards=3, replicas=3):

View File

@@ -13,8 +13,8 @@ import json
class BrozzlerWorker:
logger = logging.getLogger(__module__ + "." + __qualname__)
def __init__(self, db, max_browsers=1, chrome_exe="chromium-browser"):
self._db = db
def __init__(self, frontier, max_browsers=1, chrome_exe="chromium-browser"):
self._frontier = frontier
self._max_browsers = max_browsers
self._browser_pool = brozzler.browser.BrowserPool(max_browsers,
chrome_exe=chrome_exe, ignore_cert_errors=True)
@@ -46,21 +46,21 @@ class BrozzlerWorker:
page.brozzle_count += 1
page.claimed = False
# XXX set priority?
self._db.update_page(page)
self._frontier.update_page(page)
if page.redirect_url and page.hops_from_seed == 0:
site.note_seed_redirect(page.redirect_url)
self._db.update_site(site)
self._frontier.update_site(site)
def _disclaim_site(self, site, page=None):
self.logger.info("disclaiming %s", site)
site.claimed = False
if not page and not self._db.has_outstanding_pages(site):
if not page and not self._frontier.has_outstanding_pages(site):
self.logger.info("site FINISHED! %s", site)
site.status = "FINISHED"
self._db.update_site(site)
self._frontier.update_site(site)
if page:
page.claimed = False
self._db.update_page(page)
self._frontier.update_page(page)
def _putmeta(self, warcprox_address, url, content_type, payload, extra_headers=None):
headers = {"Content-Type":content_type}
@@ -109,13 +109,13 @@ class BrozzlerWorker:
if site.is_in_scope(url, parent_page):
if brozzler.is_permitted_by_robots(site, url):
new_child_page = brozzler.Page(url, site_id=site.id, hops_from_seed=parent_page.hops_from_seed+1)
existing_child_page = self._db.get_page(new_child_page)
existing_child_page = self._frontier.get_page(new_child_page)
if existing_child_page:
existing_child_page.priority += new_child_page.priority
self._db.update_page(existing_child_page)
self._frontier.update_page(existing_child_page)
counts["updated"] += 1
else:
self._db.new_page(new_child_page)
self._frontier.new_page(new_child_page)
counts["added"] += 1
else:
counts["blocked"] += 1
@@ -152,7 +152,7 @@ class BrozzlerWorker:
try:
browser.start(proxy=site.proxy)
while not self._shutdown_requested.is_set() and time.time() - start < 300:
page = self._db.claim_page(site)
page = self._frontier.claim_page(site)
self.brozzle_page(browser, ydl, site, page)
self._completed_page(site, page)
page = None
@@ -176,7 +176,7 @@ class BrozzlerWorker:
try:
browser = self._browser_pool.acquire()
try:
site = self._db.claim_site()
site = self._frontier.claim_site()
self.logger.info("brozzling site %s", site)
ydl = self._youtube_dl(site)
th = threading.Thread(target=lambda: self._brozzle_site(browser, ydl, site),