mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 08:39:59 -05:00
register with service registry; only start chrome right before using it, so that web console vnc windows aren't always full of about:blank
This commit is contained in:
parent
b91d7e4c3f
commit
343b5c0f82
@ -13,6 +13,8 @@ import signal
|
||||
import pprint
|
||||
import traceback
|
||||
import rethinkstuff
|
||||
import warnings
|
||||
import requests
|
||||
|
||||
arg_parser = argparse.ArgumentParser(prog=os.path.basename(__file__),
|
||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
||||
@ -32,6 +34,9 @@ args = arg_parser.parse_args(args=sys.argv[1:])
|
||||
|
||||
logging.basicConfig(stream=sys.stdout, level=args.log_level,
|
||||
format='%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s')
|
||||
logging.getLogger("requests.packages.urllib3").setLevel(logging.WARN)
|
||||
warnings.simplefilter("ignore", category=requests.packages.urllib3.exceptions.InsecureRequestWarning)
|
||||
warnings.simplefilter("ignore", category=requests.packages.urllib3.exceptions.InsecurePlatformWarning)
|
||||
|
||||
def sigterm(signum, frame):
|
||||
raise brozzler.ShutdownRequested('shutdown requested (caught SIGTERM)')
|
||||
@ -55,7 +60,8 @@ signal.signal(signal.SIGINT, sigint)
|
||||
|
||||
r = rethinkstuff.Rethinker(args.rethinkdb_servers.split(","), args.rethinkdb_db)
|
||||
frontier = brozzler.RethinkDbFrontier(r)
|
||||
worker = brozzler.worker.BrozzlerWorker(frontier, max_browsers=int(args.max_browsers), chrome_exe=args.chrome_exe)
|
||||
service_registry = rethinkstuff.ServiceRegistry(r)
|
||||
worker = brozzler.worker.BrozzlerWorker(frontier, service_registry, max_browsers=int(args.max_browsers), chrome_exe=args.chrome_exe)
|
||||
|
||||
worker.start()
|
||||
|
||||
|
@ -28,6 +28,7 @@ class BrowserPool:
|
||||
|
||||
def __init__(self, size=3, **kwargs):
|
||||
"""kwargs are passed on to Browser.__init__"""
|
||||
self.size = size
|
||||
self._available = set()
|
||||
self._in_use = set()
|
||||
|
||||
@ -58,6 +59,12 @@ class BrowserPool:
|
||||
for browser in self._in_use:
|
||||
browser.abort_browse_page()
|
||||
|
||||
def num_available(self):
|
||||
return len(self._available)
|
||||
|
||||
def num_in_use(self):
|
||||
return len(self._in_use)
|
||||
|
||||
class NoBrowsersAvailable(Exception):
|
||||
pass
|
||||
|
||||
@ -68,10 +75,10 @@ class BrowsingAborted(BrowsingException):
|
||||
pass
|
||||
|
||||
class Browser:
|
||||
"""Runs chrome/chromium to synchronously browse one page at a time using
|
||||
worker.browse_page(). Currently the implementation starts up a new instance
|
||||
of chrome for each page browsed, always on the same debug port. (In the
|
||||
future, it may keep the browser running indefinitely.)"""
|
||||
"""
|
||||
Runs chrome/chromium to synchronously browse one page at a time using
|
||||
worker.browse_page(). Should not be accessed from multiple threads.
|
||||
"""
|
||||
|
||||
logger = logging.getLogger(__module__ + "." + __qualname__)
|
||||
|
||||
@ -88,6 +95,8 @@ class Browser:
|
||||
self._abort_browse_page = False
|
||||
self._chrome_instance = None
|
||||
self._aw_snap_hes_dead_jim = None
|
||||
self._work_dir = None
|
||||
self._websocket_url = None
|
||||
|
||||
def __repr__(self):
|
||||
return "{}.{}:{}".format(Browser.__module__, Browser.__qualname__, self.chrome_port)
|
||||
@ -100,27 +109,31 @@ class Browser:
|
||||
self.stop()
|
||||
|
||||
def start(self, proxy=None):
|
||||
# these can raise exceptions
|
||||
self._work_dir = tempfile.TemporaryDirectory()
|
||||
self._chrome_instance = Chrome(port=self.chrome_port,
|
||||
executable=self.chrome_exe,
|
||||
user_home_dir=self._work_dir.name,
|
||||
user_data_dir=os.sep.join([self._work_dir.name, "chrome-user-data"]),
|
||||
ignore_cert_errors=self.ignore_cert_errors,
|
||||
proxy=proxy or self.proxy)
|
||||
self._websocket_url = self._chrome_instance.start()
|
||||
if not self._chrome_instance:
|
||||
# these can raise exceptions
|
||||
self._work_dir = tempfile.TemporaryDirectory()
|
||||
self._chrome_instance = Chrome(port=self.chrome_port,
|
||||
executable=self.chrome_exe,
|
||||
user_home_dir=self._work_dir.name,
|
||||
user_data_dir=os.sep.join([self._work_dir.name, "chrome-user-data"]),
|
||||
ignore_cert_errors=self.ignore_cert_errors,
|
||||
proxy=proxy or self.proxy)
|
||||
self._websocket_url = self._chrome_instance.start()
|
||||
|
||||
def stop(self):
|
||||
try:
|
||||
if self._chrome_instance:
|
||||
if self.is_running():
|
||||
self._chrome_instance.stop()
|
||||
self._chrome_instance = None
|
||||
if self._work_dir:
|
||||
self._work_dir.cleanup()
|
||||
self._work_dir = None
|
||||
self._websocket_url = None
|
||||
except:
|
||||
self.logger.error("problem stopping", exc_info=True)
|
||||
|
||||
def is_running(self):
|
||||
return bool(self._websocket_url)
|
||||
|
||||
def abort_browse_page(self):
|
||||
self._abort_browse_page = True
|
||||
|
||||
@ -133,6 +146,8 @@ class Browser:
|
||||
|
||||
Returns extracted outlinks.
|
||||
"""
|
||||
if not self.is_running():
|
||||
raise BrowsingException("browser has not been started")
|
||||
self.url = url
|
||||
self.extra_headers = extra_headers
|
||||
self.on_request = on_request
|
||||
@ -430,8 +445,9 @@ class Chrome:
|
||||
logging.error("unexpected exception", exc_info=True)
|
||||
|
||||
def stop(self):
|
||||
if self._shutdown.is_set():
|
||||
if not self.chrome_process or self._shutdown.is_set():
|
||||
return
|
||||
|
||||
timeout_sec = 300
|
||||
self._shutdown.set()
|
||||
self.logger.info("terminating chrome pid {}".format(self.chrome_process.pid))
|
||||
|
@ -1,5 +1,3 @@
|
||||
# vim: set sw=4 et:
|
||||
|
||||
import os
|
||||
import logging
|
||||
import brozzler
|
||||
@ -13,12 +11,16 @@ import json
|
||||
import PIL.Image
|
||||
import io
|
||||
import socket
|
||||
import datetime
|
||||
|
||||
class BrozzlerWorker:
|
||||
logger = logging.getLogger(__module__ + "." + __qualname__)
|
||||
|
||||
def __init__(self, frontier, max_browsers=1, chrome_exe="chromium-browser"):
|
||||
HEARTBEAT_INTERVAL = 20.0
|
||||
|
||||
def __init__(self, frontier, service_registry=None, max_browsers=1, chrome_exe="chromium-browser"):
|
||||
self._frontier = frontier
|
||||
self._service_registry = service_registry
|
||||
self._max_browsers = max_browsers
|
||||
self._browser_pool = brozzler.browser.BrowserPool(max_browsers,
|
||||
chrome_exe=chrome_exe, ignore_cert_errors=True)
|
||||
@ -74,7 +76,7 @@ class BrozzlerWorker:
|
||||
info_json = json.dumps(info, sort_keys=True, indent=4)
|
||||
self.logger.info("sending WARCPROX_WRITE_RECORD request to warcprox with youtube-dl json for %s", page)
|
||||
self._warcprox_write_record(warcprox_address=site.proxy,
|
||||
url=page.url, warc_type="metadata",
|
||||
url="youtube-dl:%s" % page.url, warc_type="metadata",
|
||||
content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
|
||||
payload=info_json.encode("utf-8"),
|
||||
extra_headers=site.extra_headers)
|
||||
@ -128,6 +130,8 @@ class BrozzlerWorker:
|
||||
except:
|
||||
self.logger.error("youtube_dl raised exception on {}".format(page), exc_info=True)
|
||||
|
||||
if not browser.is_running():
|
||||
browser.start(proxy=site.proxy)
|
||||
outlinks = browser.browse_page(page.url,
|
||||
extra_headers=site.extra_headers, on_screenshot=on_screenshot,
|
||||
on_url_change=page.note_redirect)
|
||||
@ -137,8 +141,7 @@ class BrozzlerWorker:
|
||||
start = time.time()
|
||||
page = None
|
||||
try:
|
||||
browser.start(proxy=site.proxy)
|
||||
while not self._shutdown_requested.is_set() and time.time() - start < 60:
|
||||
while not self._shutdown_requested.is_set() and time.time() - start < 7 * 60:
|
||||
page = self._frontier.claim_page(site, self._id)
|
||||
outlinks = self.brozzle_page(browser, ydl, site, page)
|
||||
self._frontier.completed_page(site, page)
|
||||
@ -158,10 +161,28 @@ class BrozzlerWorker:
|
||||
self._frontier.disclaim_site(site, page)
|
||||
self._browser_pool.release(browser)
|
||||
|
||||
def _service_heartbeat(self):
|
||||
if hasattr(self, "status_info"):
|
||||
status_info = self.status_info
|
||||
else:
|
||||
status_info = {
|
||||
"role": "brozzler-worker",
|
||||
"heartbeat_interval": self.HEARTBEAT_INTERVAL,
|
||||
}
|
||||
status_info["load"] = 1.0 * self._browser_pool.num_in_use() / self._browser_pool.size
|
||||
status_info["browser_pool_size"] = self._browser_pool.size
|
||||
status_info["browsers_in_use"] = self._browser_pool.num_in_use()
|
||||
|
||||
self.status_info = self._service_registry.heartbeat(status_info)
|
||||
self.logger.debug("status in service registry: %s", self.status_info)
|
||||
|
||||
def run(self):
|
||||
try:
|
||||
latest_state = None
|
||||
while not self._shutdown_requested.is_set():
|
||||
if self._service_registry and (not hasattr(self, "status_info") or (datetime.datetime.now(datetime.timezone.utc) - self.status_info["last_heartbeat"]).total_seconds() > self.HEARTBEAT_INTERVAL):
|
||||
self._service_heartbeat()
|
||||
|
||||
try:
|
||||
browser = self._browser_pool.acquire()
|
||||
try:
|
||||
@ -185,6 +206,9 @@ class BrozzlerWorker:
|
||||
time.sleep(0.5)
|
||||
except:
|
||||
self.logger.critical("thread exiting due to unexpected exception", exc_info=True)
|
||||
finally:
|
||||
if self._service_registry and hasattr(self, "status_info"):
|
||||
self._service_registry.unregister(self.status_info["id"])
|
||||
|
||||
def start(self):
|
||||
th = threading.Thread(target=self.run, name="BrozzlerWorker")
|
||||
|
@ -70,8 +70,13 @@ def job(job_id):
|
||||
|
||||
@app.route("/api/workers")
|
||||
def workers():
|
||||
workers_ = [{"host":host,"vnc_websocket_port":8901} for host in ["aidata400", "aidata401", "aidata400-bu", "aidata401-bu"]]
|
||||
return flask.jsonify(workers=workers_)
|
||||
workers_ = r.table("services").filter({"role":"brozzler-worker"}).run()
|
||||
return flask.jsonify(workers=list(workers_))
|
||||
|
||||
@app.route("/api/services")
|
||||
def services():
|
||||
services_ = r.table("services").run()
|
||||
return flask.jsonify(services=list(services_))
|
||||
|
||||
@app.route("/api/jobs")
|
||||
def jobs():
|
||||
|
Loading…
x
Reference in New Issue
Block a user