Mirror of https://github.com/internetarchive/brozzler.git (last synced 2025-08-02 03:26:11 -04:00)
rename "db" to "frontier"
This commit is contained in:
parent: cd3a644298
commit: b8506a2ab4
6 changed files with 22 additions and 22 deletions
|
@ -14,7 +14,7 @@ arg_parser = argparse.ArgumentParser(prog=os.path.basename(__file__),
|
||||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
||||||
arg_parser.add_argument('job_conf_file', metavar='JOB_CONF_FILE', help='brozzler job configuration file in yaml')
|
arg_parser.add_argument('job_conf_file', metavar='JOB_CONF_FILE', help='brozzler job configuration file in yaml')
|
||||||
arg_parser.add_argument("--db", dest="db", default="localhost",
|
arg_parser.add_argument("--db", dest="db", default="localhost",
|
||||||
help="comma-separated list of RethinkDB server addresses, e.g. db0.example.com,db0.example.com:39015,db1.example.com")
|
help="comma-separated list of RethinkDB server addresses, e.g. db0.foo.org,db0.foo.org:38015,db1.foo.org")
|
||||||
arg_parser.add_argument("-v", "--verbose", dest="log_level",
|
arg_parser.add_argument("-v", "--verbose", dest="log_level",
|
||||||
action="store_const", default=logging.INFO, const=logging.DEBUG)
|
action="store_const", default=logging.INFO, const=logging.DEBUG)
|
||||||
arg_parser.add_argument("--version", action="version",
|
arg_parser.add_argument("--version", action="version",
|
||||||
|
@ -69,7 +69,7 @@ for seed_conf in seeds:
|
||||||
extra_headers=extra_headers)
|
extra_headers=extra_headers)
|
||||||
sites.append(site)
|
sites.append(site)
|
||||||
|
|
||||||
db = brozzler.BrozzlerRethinkDb(args.db.split(","))
|
frontier = brozzler.RethinkDbFrontier(args.db.split(","))
|
||||||
for site in sites:
|
for site in sites:
|
||||||
brozzler.new_site(db, site)
|
brozzler.new_site(frontier, site)
|
||||||
|
|
||||||
|
|
|
@ -43,6 +43,6 @@ site = brozzler.Site(seed=args.seed, proxy=args.proxy,
|
||||||
enable_warcprox_features=args.enable_warcprox_features,
|
enable_warcprox_features=args.enable_warcprox_features,
|
||||||
extra_headers=extra_headers)
|
extra_headers=extra_headers)
|
||||||
|
|
||||||
db = brozzler.BrozzlerRethinkDb(args.db.split(","))
|
frontier = brozzler.RethinkDbFrontier(args.db.split(","))
|
||||||
brozzler.new_site(db, site)
|
brozzler.new_site(frontier, site)
|
||||||
|
|
||||||
|
|
|
@ -15,7 +15,7 @@ import traceback
|
||||||
arg_parser = argparse.ArgumentParser(prog=os.path.basename(__file__),
|
arg_parser = argparse.ArgumentParser(prog=os.path.basename(__file__),
|
||||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
||||||
arg_parser.add_argument("--db", dest="db", default="localhost",
|
arg_parser.add_argument("--db", dest="db", default="localhost",
|
||||||
help="comma-separated list of RethinkDB server addresses, e.g. db0.example.org,db0.example.org:39015,db1.example.org")
|
help="comma-separated list of RethinkDB server addresses, e.g. db0.foo.org,db0.foo.org:38015,db1.foo.org")
|
||||||
arg_parser.add_argument('-e', '--executable', dest='chrome_exe', default='chromium-browser',
|
arg_parser.add_argument('-e', '--executable', dest='chrome_exe', default='chromium-browser',
|
||||||
help='executable to use to invoke chrome')
|
help='executable to use to invoke chrome')
|
||||||
arg_parser.add_argument('-n', '--max-browsers', dest='max_browsers', default='1',
|
arg_parser.add_argument('-n', '--max-browsers', dest='max_browsers', default='1',
|
||||||
|
@ -49,8 +49,8 @@ signal.signal(signal.SIGQUIT, dump_state)
|
||||||
signal.signal(signal.SIGTERM, sigterm)
|
signal.signal(signal.SIGTERM, sigterm)
|
||||||
signal.signal(signal.SIGINT, sigint)
|
signal.signal(signal.SIGINT, sigint)
|
||||||
|
|
||||||
db = brozzler.BrozzlerRethinkDb(args.db.split(","))
|
frontier = brozzler.RethinkDbFrontier(args.db.split(","))
|
||||||
worker = brozzler.BrozzlerWorker(db, max_browsers=int(args.max_browsers), chrome_exe=args.chrome_exe)
|
worker = brozzler.BrozzlerWorker(frontier, max_browsers=int(args.max_browsers), chrome_exe=args.chrome_exe)
|
||||||
|
|
||||||
worker.start()
|
worker.start()
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,7 @@ import logging as _logging
|
||||||
from brozzler.site import Page, Site
|
from brozzler.site import Page, Site
|
||||||
from brozzler.worker import BrozzlerWorker
|
from brozzler.worker import BrozzlerWorker
|
||||||
from brozzler.robots import is_permitted_by_robots
|
from brozzler.robots import is_permitted_by_robots
|
||||||
from brozzler.db import BrozzlerRethinkDb
|
from brozzler.frontier import RethinkDbFrontier
|
||||||
from brozzler.browser import Browser, BrowserPool
|
from brozzler.browser import Browser, BrowserPool
|
||||||
|
|
||||||
def _read_version():
|
def _read_version():
|
||||||
|
|
|
@ -9,7 +9,7 @@ import random
|
||||||
class UnexpectedDbResult(Exception):
|
class UnexpectedDbResult(Exception):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
class BrozzlerRethinkDb:
|
class RethinkDbFrontier:
|
||||||
logger = logging.getLogger(__module__ + "." + __qualname__)
|
logger = logging.getLogger(__module__ + "." + __qualname__)
|
||||||
|
|
||||||
def __init__(self, servers=["localhost"], db="brozzler", shards=3, replicas=3):
|
def __init__(self, servers=["localhost"], db="brozzler", shards=3, replicas=3):
|
|
@ -13,8 +13,8 @@ import json
|
||||||
class BrozzlerWorker:
|
class BrozzlerWorker:
|
||||||
logger = logging.getLogger(__module__ + "." + __qualname__)
|
logger = logging.getLogger(__module__ + "." + __qualname__)
|
||||||
|
|
||||||
def __init__(self, db, max_browsers=1, chrome_exe="chromium-browser"):
|
def __init__(self, frontier, max_browsers=1, chrome_exe="chromium-browser"):
|
||||||
self._db = db
|
self._frontier = frontier
|
||||||
self._max_browsers = max_browsers
|
self._max_browsers = max_browsers
|
||||||
self._browser_pool = brozzler.browser.BrowserPool(max_browsers,
|
self._browser_pool = brozzler.browser.BrowserPool(max_browsers,
|
||||||
chrome_exe=chrome_exe, ignore_cert_errors=True)
|
chrome_exe=chrome_exe, ignore_cert_errors=True)
|
||||||
|
@ -46,21 +46,21 @@ class BrozzlerWorker:
|
||||||
page.brozzle_count += 1
|
page.brozzle_count += 1
|
||||||
page.claimed = False
|
page.claimed = False
|
||||||
# XXX set priority?
|
# XXX set priority?
|
||||||
self._db.update_page(page)
|
self._frontier.update_page(page)
|
||||||
if page.redirect_url and page.hops_from_seed == 0:
|
if page.redirect_url and page.hops_from_seed == 0:
|
||||||
site.note_seed_redirect(page.redirect_url)
|
site.note_seed_redirect(page.redirect_url)
|
||||||
self._db.update_site(site)
|
self._frontier.update_site(site)
|
||||||
|
|
||||||
def _disclaim_site(self, site, page=None):
|
def _disclaim_site(self, site, page=None):
|
||||||
self.logger.info("disclaiming %s", site)
|
self.logger.info("disclaiming %s", site)
|
||||||
site.claimed = False
|
site.claimed = False
|
||||||
if not page and not self._db.has_outstanding_pages(site):
|
if not page and not self._frontier.has_outstanding_pages(site):
|
||||||
self.logger.info("site FINISHED! %s", site)
|
self.logger.info("site FINISHED! %s", site)
|
||||||
site.status = "FINISHED"
|
site.status = "FINISHED"
|
||||||
self._db.update_site(site)
|
self._frontier.update_site(site)
|
||||||
if page:
|
if page:
|
||||||
page.claimed = False
|
page.claimed = False
|
||||||
self._db.update_page(page)
|
self._frontier.update_page(page)
|
||||||
|
|
||||||
def _putmeta(self, warcprox_address, url, content_type, payload, extra_headers=None):
|
def _putmeta(self, warcprox_address, url, content_type, payload, extra_headers=None):
|
||||||
headers = {"Content-Type":content_type}
|
headers = {"Content-Type":content_type}
|
||||||
|
@ -109,13 +109,13 @@ class BrozzlerWorker:
|
||||||
if site.is_in_scope(url, parent_page):
|
if site.is_in_scope(url, parent_page):
|
||||||
if brozzler.is_permitted_by_robots(site, url):
|
if brozzler.is_permitted_by_robots(site, url):
|
||||||
new_child_page = brozzler.Page(url, site_id=site.id, hops_from_seed=parent_page.hops_from_seed+1)
|
new_child_page = brozzler.Page(url, site_id=site.id, hops_from_seed=parent_page.hops_from_seed+1)
|
||||||
existing_child_page = self._db.get_page(new_child_page)
|
existing_child_page = self._frontier.get_page(new_child_page)
|
||||||
if existing_child_page:
|
if existing_child_page:
|
||||||
existing_child_page.priority += new_child_page.priority
|
existing_child_page.priority += new_child_page.priority
|
||||||
self._db.update_page(existing_child_page)
|
self._frontier.update_page(existing_child_page)
|
||||||
counts["updated"] += 1
|
counts["updated"] += 1
|
||||||
else:
|
else:
|
||||||
self._db.new_page(new_child_page)
|
self._frontier.new_page(new_child_page)
|
||||||
counts["added"] += 1
|
counts["added"] += 1
|
||||||
else:
|
else:
|
||||||
counts["blocked"] += 1
|
counts["blocked"] += 1
|
||||||
|
@ -152,7 +152,7 @@ class BrozzlerWorker:
|
||||||
try:
|
try:
|
||||||
browser.start(proxy=site.proxy)
|
browser.start(proxy=site.proxy)
|
||||||
while not self._shutdown_requested.is_set() and time.time() - start < 300:
|
while not self._shutdown_requested.is_set() and time.time() - start < 300:
|
||||||
page = self._db.claim_page(site)
|
page = self._frontier.claim_page(site)
|
||||||
self.brozzle_page(browser, ydl, site, page)
|
self.brozzle_page(browser, ydl, site, page)
|
||||||
self._completed_page(site, page)
|
self._completed_page(site, page)
|
||||||
page = None
|
page = None
|
||||||
|
@ -176,7 +176,7 @@ class BrozzlerWorker:
|
||||||
try:
|
try:
|
||||||
browser = self._browser_pool.acquire()
|
browser = self._browser_pool.acquire()
|
||||||
try:
|
try:
|
||||||
site = self._db.claim_site()
|
site = self._frontier.claim_site()
|
||||||
self.logger.info("brozzling site %s", site)
|
self.logger.info("brozzling site %s", site)
|
||||||
ydl = self._youtube_dl(site)
|
ydl = self._youtube_dl(site)
|
||||||
th = threading.Thread(target=lambda: self._brozzle_site(browser, ydl, site),
|
th = threading.Thread(target=lambda: self._brozzle_site(browser, ydl, site),
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue