mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-20 23:56:34 -04:00
rename "db" to "frontier"
This commit is contained in:
parent
cd3a644298
commit
b8506a2ab4
@ -14,7 +14,7 @@ arg_parser = argparse.ArgumentParser(prog=os.path.basename(__file__),
|
||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
||||
arg_parser.add_argument('job_conf_file', metavar='JOB_CONF_FILE', help='brozzler job configuration file in yaml')
|
||||
arg_parser.add_argument("--db", dest="db", default="localhost",
|
||||
help="comma-separated list of RethinkDB server addresses, e.g. db0.example.com,db0.example.com:39015,db1.example.com")
|
||||
help="comma-separated list of RethinkDB server addresses, e.g. db0.foo.org,db0.foo.org:38015,db1.foo.org")
|
||||
arg_parser.add_argument("-v", "--verbose", dest="log_level",
|
||||
action="store_const", default=logging.INFO, const=logging.DEBUG)
|
||||
arg_parser.add_argument("--version", action="version",
|
||||
@ -69,7 +69,7 @@ for seed_conf in seeds:
|
||||
extra_headers=extra_headers)
|
||||
sites.append(site)
|
||||
|
||||
db = brozzler.BrozzlerRethinkDb(args.db.split(","))
|
||||
frontier = brozzler.RethinkDbFrontier(args.db.split(","))
|
||||
for site in sites:
|
||||
brozzler.new_site(db, site)
|
||||
brozzler.new_site(frontier, site)
|
||||
|
||||
|
@ -43,6 +43,6 @@ site = brozzler.Site(seed=args.seed, proxy=args.proxy,
|
||||
enable_warcprox_features=args.enable_warcprox_features,
|
||||
extra_headers=extra_headers)
|
||||
|
||||
db = brozzler.BrozzlerRethinkDb(args.db.split(","))
|
||||
brozzler.new_site(db, site)
|
||||
frontier = brozzler.RethinkDbFrontier(args.db.split(","))
|
||||
brozzler.new_site(frontier, site)
|
||||
|
||||
|
@ -15,7 +15,7 @@ import traceback
|
||||
arg_parser = argparse.ArgumentParser(prog=os.path.basename(__file__),
|
||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
||||
arg_parser.add_argument("--db", dest="db", default="localhost",
|
||||
help="comma-separated list of RethinkDB server addresses, e.g. db0.example.org,db0.example.org:39015,db1.example.org")
|
||||
help="comma-separated list of RethinkDB server addresses, e.g. db0.foo.org,db0.foo.org:38015,db1.foo.org")
|
||||
arg_parser.add_argument('-e', '--executable', dest='chrome_exe', default='chromium-browser',
|
||||
help='executable to use to invoke chrome')
|
||||
arg_parser.add_argument('-n', '--max-browsers', dest='max_browsers', default='1',
|
||||
@ -49,8 +49,8 @@ signal.signal(signal.SIGQUIT, dump_state)
|
||||
signal.signal(signal.SIGTERM, sigterm)
|
||||
signal.signal(signal.SIGINT, sigint)
|
||||
|
||||
db = brozzler.BrozzlerRethinkDb(args.db.split(","))
|
||||
worker = brozzler.BrozzlerWorker(db, max_browsers=int(args.max_browsers), chrome_exe=args.chrome_exe)
|
||||
frontier = brozzler.RethinkDbFrontier(args.db.split(","))
|
||||
worker = brozzler.BrozzlerWorker(frontier, max_browsers=int(args.max_browsers), chrome_exe=args.chrome_exe)
|
||||
|
||||
worker.start()
|
||||
|
||||
|
@ -3,7 +3,7 @@ import logging as _logging
|
||||
from brozzler.site import Page, Site
|
||||
from brozzler.worker import BrozzlerWorker
|
||||
from brozzler.robots import is_permitted_by_robots
|
||||
from brozzler.db import BrozzlerRethinkDb
|
||||
from brozzler.frontier import RethinkDbFrontier
|
||||
from brozzler.browser import Browser, BrowserPool
|
||||
|
||||
def _read_version():
|
||||
|
@ -9,7 +9,7 @@ import random
|
||||
class UnexpectedDbResult(Exception):
|
||||
pass
|
||||
|
||||
class BrozzlerRethinkDb:
|
||||
class RethinkDbFrontier:
|
||||
logger = logging.getLogger(__module__ + "." + __qualname__)
|
||||
|
||||
def __init__(self, servers=["localhost"], db="brozzler", shards=3, replicas=3):
|
@ -13,8 +13,8 @@ import json
|
||||
class BrozzlerWorker:
|
||||
logger = logging.getLogger(__module__ + "." + __qualname__)
|
||||
|
||||
def __init__(self, db, max_browsers=1, chrome_exe="chromium-browser"):
|
||||
self._db = db
|
||||
def __init__(self, frontier, max_browsers=1, chrome_exe="chromium-browser"):
|
||||
self._frontier = frontier
|
||||
self._max_browsers = max_browsers
|
||||
self._browser_pool = brozzler.browser.BrowserPool(max_browsers,
|
||||
chrome_exe=chrome_exe, ignore_cert_errors=True)
|
||||
@ -46,21 +46,21 @@ class BrozzlerWorker:
|
||||
page.brozzle_count += 1
|
||||
page.claimed = False
|
||||
# XXX set priority?
|
||||
self._db.update_page(page)
|
||||
self._frontier.update_page(page)
|
||||
if page.redirect_url and page.hops_from_seed == 0:
|
||||
site.note_seed_redirect(page.redirect_url)
|
||||
self._db.update_site(site)
|
||||
self._frontier.update_site(site)
|
||||
|
||||
def _disclaim_site(self, site, page=None):
|
||||
self.logger.info("disclaiming %s", site)
|
||||
site.claimed = False
|
||||
if not page and not self._db.has_outstanding_pages(site):
|
||||
if not page and not self._frontier.has_outstanding_pages(site):
|
||||
self.logger.info("site FINISHED! %s", site)
|
||||
site.status = "FINISHED"
|
||||
self._db.update_site(site)
|
||||
self._frontier.update_site(site)
|
||||
if page:
|
||||
page.claimed = False
|
||||
self._db.update_page(page)
|
||||
self._frontier.update_page(page)
|
||||
|
||||
def _putmeta(self, warcprox_address, url, content_type, payload, extra_headers=None):
|
||||
headers = {"Content-Type":content_type}
|
||||
@ -109,13 +109,13 @@ class BrozzlerWorker:
|
||||
if site.is_in_scope(url, parent_page):
|
||||
if brozzler.is_permitted_by_robots(site, url):
|
||||
new_child_page = brozzler.Page(url, site_id=site.id, hops_from_seed=parent_page.hops_from_seed+1)
|
||||
existing_child_page = self._db.get_page(new_child_page)
|
||||
existing_child_page = self._frontier.get_page(new_child_page)
|
||||
if existing_child_page:
|
||||
existing_child_page.priority += new_child_page.priority
|
||||
self._db.update_page(existing_child_page)
|
||||
self._frontier.update_page(existing_child_page)
|
||||
counts["updated"] += 1
|
||||
else:
|
||||
self._db.new_page(new_child_page)
|
||||
self._frontier.new_page(new_child_page)
|
||||
counts["added"] += 1
|
||||
else:
|
||||
counts["blocked"] += 1
|
||||
@ -152,7 +152,7 @@ class BrozzlerWorker:
|
||||
try:
|
||||
browser.start(proxy=site.proxy)
|
||||
while not self._shutdown_requested.is_set() and time.time() - start < 300:
|
||||
page = self._db.claim_page(site)
|
||||
page = self._frontier.claim_page(site)
|
||||
self.brozzle_page(browser, ydl, site, page)
|
||||
self._completed_page(site, page)
|
||||
page = None
|
||||
@ -176,7 +176,7 @@ class BrozzlerWorker:
|
||||
try:
|
||||
browser = self._browser_pool.acquire()
|
||||
try:
|
||||
site = self._db.claim_site()
|
||||
site = self._frontier.claim_site()
|
||||
self.logger.info("brozzling site %s", site)
|
||||
ydl = self._youtube_dl(site)
|
||||
th = threading.Thread(target=lambda: self._brozzle_site(browser, ydl, site),
|
||||
|
Loading…
x
Reference in New Issue
Block a user