2024-02-08 12:07:41 -08:00

359 lines
11 KiB
Python

"""
brozzler/dashboard/__init__.py - flask app for brozzler dashboard, defines api
endpoints etc
Copyright (C) 2014-2017 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import logging
import sys
try:
import flask
except ImportError as e:
logging.critical(
'%s: %s\n\nYou might need to run "pip install '
'brozzler[dashboard]".\nSee README.rst for more information.',
type(e).__name__,
e,
)
sys.exit(1)
import doublethink
import json
import os
import importlib
import rethinkdb as rdb
import yaml
import base64
# RethinkDB driver namespace used to build query terms (r.minval, r.desc, ...)
r = rdb.RethinkDB()

# the flask application that every route decorator in this module attaches to
app = flask.Flask(__name__)

# configure with environment variables; each entry falls back to the default
# shown when the variable is unset
SETTINGS = {
    # comma-separated list of servers, split into a python list
    "RETHINKDB_SERVERS": os.getenv("BROZZLER_RETHINKDB_SERVERS", "localhost").split(
        ","
    ),
    "RETHINKDB_DB": os.getenv("BROZZLER_RETHINKDB_DB", "brozzler"),
    "WAYBACK_BASEURL": os.getenv("WAYBACK_BASEURL", "http://localhost:8880/brozzler"),
    # NOTE: kept as a string, exactly as read from the environment
    "DASHBOARD_PORT": os.getenv("DASHBOARD_PORT", "8000"),
    "DASHBOARD_INTERFACE": os.getenv("DASHBOARD_INTERFACE", "localhost"),
}

# doublethink handle through which all queries in this module are issued
rr = doublethink.Rethinker(SETTINGS["RETHINKDB_SERVERS"], db=SETTINGS["RETHINKDB_DB"])
# lazily-created doublethink.ServiceRegistry shared by the whole module
_svc_reg = None


def service_registry():
    """Return the module-wide service registry, building it on first use."""
    global _svc_reg
    if _svc_reg:
        return _svc_reg
    _svc_reg = doublethink.ServiceRegistry(rr)
    return _svc_reg
@app.route("/api/sites/<site_id>/queued_count")
@app.route("/api/site/<site_id>/queued_count")
def queued_count(site_id):
    """Return ``{"count": N}`` for the site's pages in the queued key range.

    Counts rows of the ``pages`` table whose ``priority_by_site`` index key
    lies between ``[site_id, 0, False, minval]`` and
    ``[site_id, 0, False, maxval]``.
    """
    # presumably the 0/False components select never-brozzled, unclaimed
    # pages -- confirm against the index definition
    lower_key = [site_id, 0, False, r.minval]
    upper_key = [site_id, 0, False, r.maxval]
    in_range = rr.table("pages").between(
        lower_key, upper_key, index="priority_by_site"
    )
    reql = in_range.count()
    logging.debug("querying rethinkdb: %s", reql)
    return flask.jsonify(count=reql.run())
@app.route("/api/sites/<site_id>/queue")
@app.route("/api/site/<site_id>/queue")
def queue(site_id):
    """Return a slice of the site's queued pages as ``{"queue_": [...]}``.

    Query-string parameters:
        start: index of the first row to return (default 0).
        end: index one past the last row to return (default ``start + 90``).

    Rows come from the ``pages`` table, restricted to ``priority_by_site``
    index keys between ``[site_id, 0, False, minval]`` and
    ``[site_id, 0, False, maxval]``.

    Raises:
        ValueError: if ``start`` or ``end`` is not an integer (same
            behavior as ``pages()``).
    """
    logging.debug("flask.request.args=%s", flask.request.args)
    # Query-string values arrive as strings; convert them before doing
    # arithmetic or slicing. Without int(), "start + 90" raises TypeError
    # whenever the caller supplies start, and string slice bounds are invalid.
    start = int(flask.request.args.get("start", 0))
    end = int(flask.request.args.get("end", start + 90))
    reql = rr.table("pages").between(
        [site_id, 0, False, r.minval],
        [site_id, 0, False, r.maxval],
        index="priority_by_site",
    )[start:end]
    logging.debug("querying rethinkdb: %s", reql)
    queue_ = reql.run()
    return flask.jsonify(queue_=list(queue_))
@app.route("/api/sites/<site_id>/pages_count")
@app.route("/api/site/<site_id>/pages_count")
@app.route("/api/sites/<site_id>/page_count")
@app.route("/api/site/<site_id>/page_count")
def page_count(site_id):
    """Return ``{"count": N}`` for the site's pages in the counted key range.

    Counts rows of the ``pages`` table whose ``priority_by_site`` index key
    lies between ``[site_id, 1, False, minval]`` and
    ``[site_id, maxval, False, maxval]``.
    """
    key_from = [site_id, 1, False, r.minval]
    key_to = [site_id, r.maxval, False, r.maxval]
    matching = rr.table("pages").between(key_from, key_to, index="priority_by_site")
    reql = matching.count()
    logging.debug("querying rethinkdb: %s", reql)
    return flask.jsonify(count=reql.run())
@app.route("/api/sites/<site_id>/pages")
@app.route("/api/site/<site_id>/pages")
def pages(site_id):
    """Pages already crawled.

    Returns ``{"pages": [...]}`` holding rows ``start`` (default 0) up to
    ``end`` (default ``start + 90``) of the site's pages, ordered by the
    ``least_hops`` index. Both bounds are read from the query string.
    """
    args = flask.request.args
    start = int(args.get("start", 0))
    end = int(args.get("end", start + 90))

    lower_key = [site_id, 1, r.minval]
    upper_key = [site_id, r.maxval, r.maxval]
    ordered = (
        rr.table("pages")
        .between(lower_key, upper_key, index="least_hops")
        .order_by(index="least_hops")
    )
    reql = ordered[start:end]
    logging.debug("querying rethinkdb: %s", reql)
    rows = reql.run()
    return flask.jsonify(pages=list(rows))
@app.route("/api/pages/<page_id>")
@app.route("/api/page/<page_id>")
def page(page_id):
    """Return the ``pages`` table document with primary key *page_id* as JSON."""
    pages_table = rr.table("pages")
    reql = pages_table.get(page_id)
    logging.debug("querying rethinkdb: %s", reql)
    document = reql.run()
    return flask.jsonify(document)
@app.route("/api/pages/<page_id>/yaml")
@app.route("/api/page/<page_id>/yaml")
def page_yaml(page_id):
    """Return the ``pages`` document *page_id* rendered as block-style YAML."""
    reql = rr.table("pages").get(page_id)
    logging.debug("querying rethinkdb: %s", reql)
    document = reql.run()
    body = yaml.dump(document, default_flow_style=False)
    return app.response_class(body, mimetype="application/yaml")
@app.route("/api/sites/<site_id>")
@app.route("/api/site/<site_id>")
def site(site_id):
    """Return the ``sites`` table document with primary key *site_id* as JSON.

    A binary ``cookie_db`` field, when present, is replaced by its base64
    text so the document can be JSON-serialized. When no such site exists
    the response body is JSON ``null``, matching ``page()`` and ``job()``.
    """
    reql = rr.table("sites").get(site_id)
    logging.debug("querying rethinkdb: %s", reql)
    s = reql.run()
    # get() yields None for an unknown id; guard so the membership test
    # below does not raise TypeError (which surfaced as an HTTP 500).
    if s is not None and "cookie_db" in s:
        s["cookie_db"] = base64.b64encode(s["cookie_db"]).decode("ascii")
    return flask.jsonify(s)
@app.route("/api/sites/<site_id>/yaml")
@app.route("/api/site/<site_id>/yaml")
def site_yaml(site_id):
    """Return the ``sites`` document *site_id* rendered as block-style YAML."""
    reql = rr.table("sites").get(site_id)
    logging.debug("querying rethinkdb: %s", reql)
    document = reql.run()
    body = yaml.dump(document, default_flow_style=False)
    return app.response_class(body, mimetype="application/yaml")
@app.route("/api/stats/<bucket>")
def stats(bucket):
    """Return the ``stats`` table document with primary key *bucket* as JSON."""
    stats_table = rr.table("stats")
    reql = stats_table.get(bucket)
    logging.debug("querying rethinkdb: %s", reql)
    return flask.jsonify(reql.run())
@app.route("/api/jobs/<job_id>/sites")
@app.route("/api/job/<job_id>/sites")
def sites(job_id):
    """Return ``{"sites": [...]}`` for every site whose ``job_id`` matches.

    *job_id* is looked up as an integer when it parses as one, otherwise as
    the raw string from the url.
    """
    # fall back to the raw string when the url component is not an integer
    jid = job_id
    try:
        jid = int(job_id)
    except ValueError:
        pass

    reql = rr.table("sites").get_all(jid, index="job_id")
    logging.debug("querying rethinkdb: %s", reql)
    found = list(reql.run())

    # TypeError: <binary, 7168 bytes, '53 51 4c 69 74 65...'> is not JSON serializable
    for doc in found:
        if "cookie_db" not in doc:
            continue
        encoded = base64.b64encode(doc["cookie_db"])
        doc["cookie_db"] = encoded.decode("ascii")
    return flask.jsonify(sites=found)
@app.route("/api/jobless-sites")
def jobless_sites():
    """Return ``{"sites": [...]}`` for every site lacking a ``job_id`` field."""
    # XXX inefficient (unindexed) query
    has_no_job = ~r.row.has_fields("job_id")
    reql = rr.table("sites").filter(has_no_job)
    logging.debug("querying rethinkdb: %s", reql)
    found = list(reql.run())

    # TypeError: <binary, 7168 bytes, '53 51 4c 69 74 65...'> is not JSON serializable
    for doc in found:
        if "cookie_db" not in doc:
            continue
        encoded = base64.b64encode(doc["cookie_db"])
        doc["cookie_db"] = encoded.decode("ascii")
    return flask.jsonify(sites=found)
@app.route("/api/jobs/<job_id>")
@app.route("/api/job/<job_id>")
def job(job_id):
    """Return the ``jobs`` table document for *job_id* as JSON.

    *job_id* is looked up as an integer when it parses as one, otherwise as
    the raw string from the url.
    """
    try:
        lookup_key = int(job_id)
    except ValueError:
        # not an integer: use the url component unchanged
        lookup_key = job_id
    reql = rr.table("jobs").get(lookup_key)
    logging.debug("querying rethinkdb: %s", reql)
    return flask.jsonify(reql.run())
@app.route("/api/jobs/<job_id>/yaml")
@app.route("/api/job/<job_id>/yaml")
def job_yaml(job_id):
    """Return the ``jobs`` document for *job_id* rendered as block-style YAML.

    *job_id* is looked up as an integer when it parses as one, otherwise as
    the raw string from the url.
    """
    try:
        lookup_key = int(job_id)
    except ValueError:
        # not an integer: use the url component unchanged
        lookup_key = job_id
    reql = rr.table("jobs").get(lookup_key)
    logging.debug("querying rethinkdb: %s", reql)
    document = reql.run()
    body = yaml.dump(document, default_flow_style=False)
    return app.response_class(body, mimetype="application/yaml")
@app.route("/api/workers")
def workers():
    """Return ``{"workers": [...]}`` listing available "brozzler-worker" services."""
    registry = service_registry()
    available = registry.available_services("brozzler-worker")
    return flask.jsonify(workers=list(available))
@app.route("/api/services")
def services():
    """Return ``{"services": [...]}`` listing every available service."""
    registry = service_registry()
    available = registry.available_services()
    return flask.jsonify(services=list(available))
@app.route("/api/jobs")
def jobs():
    """Return ``{"jobs": [...]}`` with all jobs ordered by descending ``id``."""
    newest_first = r.desc("id")
    reql = rr.table("jobs").order_by(newest_first)
    logging.debug("querying rethinkdb: %s", reql)
    all_jobs = list(reql.run())
    return flask.jsonify(jobs=all_jobs)
@app.route("/api/config")
def config():
    """Return the module's SETTINGS dict as ``{"config": {...}}``."""
    response = flask.jsonify(config=SETTINGS)
    return response
@app.route("/api/<path:path>")
@app.route("/api", defaults={"path": ""})
def api404(path):
    """Answer any /api url not matched by another route with HTTP 404."""
    not_found = 404
    flask.abort(not_found)
@app.route("/", defaults={"path": ""})
@app.route("/<path:path>")
def root(path):
    """Render index.html for "/" and for any other path, ignoring *path*."""
    template_name = "index.html"
    return flask.render_template(template_name)
# Prefer gunicorn when it is installed; otherwise fall back to flask's built-in
# server. Either way a module-level run() is defined.
try:
    import gunicorn.app.base
    import gunicorn.glogging

    # NOTE: this used to import iteritems from gunicorn.six, which newer
    # gunicorn releases no longer ship; the resulting ImportError silently
    # selected the flask fallback even with gunicorn installed. Plain
    # dict.items() needs no such import.

    class BypassGunicornLogging(gunicorn.glogging.Logger):
        """Gunicorn logger that reuses the root logger's handlers."""

        def setup(self, cfg):
            # send both gunicorn logs wherever the root logger is configured
            # to go, instead of gunicorn's own handlers
            self.error_log.handlers = logging.root.handlers
            self.access_log.handlers = logging.root.handlers

    class GunicornBrozzlerDashboard(gunicorn.app.base.BaseApplication):
        """Gunicorn application wrapper serving a given WSGI app.

        Args:
            app: the WSGI application to serve.
            options: optional dict of gunicorn settings; unknown keys and
                None values are ignored.
        """

        def __init__(self, app, options=None):
            self.options = options or {}
            self.application = app
            super(GunicornBrozzlerDashboard, self).__init__()

        def load_config(self):
            """Apply the recognized options, then force the logging setup."""
            config = {
                key: value
                for key, value in self.options.items()
                if key in self.cfg.settings and value is not None
            }
            for key, value in config.items():
                self.cfg.set(key.lower(), value)
            self.cfg.set("logger_class", BypassGunicornLogging)
            # presumably any non-empty value just switches access logging
            # on; the handlers are replaced in BypassGunicornLogging.setup
            self.cfg.set("accesslog", "dummy-value")

        def load(self):
            """Return the WSGI application for gunicorn to serve."""
            return self.application

    def run(**options):
        """Serve the dashboard with gunicorn.

        Args:
            **options: gunicorn settings. ``bind`` defaults to the
                DASHBOARD_INTERFACE / DASHBOARD_PORT settings so this path
                listens on the same address as the flask fallback.
        """
        options.setdefault(
            "bind",
            "%s:%s" % (SETTINGS["DASHBOARD_INTERFACE"], SETTINGS["DASHBOARD_PORT"]),
        )
        logging.info("running brozzler-dashboard using gunicorn")
        GunicornBrozzlerDashboard(app, options).run()

except ImportError:

    def run():
        """Serve the dashboard with flask's built-in server."""
        logging.info("running brozzler-dashboard using simple flask app.run")
        app.run(host=SETTINGS["DASHBOARD_INTERFACE"], port=SETTINGS["DASHBOARD_PORT"])
def main(argv=None):
    """Command-line entry point: parse common options, then run the dashboard.

    Args:
        argv: full argument vector including the program name; defaults to
            ``sys.argv`` when omitted or empty.
    """
    import argparse
    import brozzler.cli

    argv = argv or sys.argv
    arg_parser = argparse.ArgumentParser(
        prog=os.path.basename(argv[0]),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="brozzler-dashboard - web application for viewing brozzler crawl status",
        epilog=(
            "brozzler-dashboard has no command line options, but can be "
            "configured using the following environment variables:\n\n"
            " BROZZLER_RETHINKDB_SERVERS rethinkdb servers, e.g. "
            "db0.foo.org,db0.foo.org:38015,db1.foo.org (default: "
            "localhost)\n"
            " BROZZLER_RETHINKDB_DB rethinkdb database name "
            "(default: brozzler)\n"
            " WAYBACK_BASEURL base url for constructing wayback "
            # the trailing newline was missing here, which glued the
            # DASHBOARD_PORT entry onto the end of this line in --help
            "links (default http://localhost:8880/brozzler)\n"
            " DASHBOARD_PORT brozzler-dashboard listening port (default: 8000)\n"
            " DASHBOARD_INTERFACE brozzler-dashboard network interface binding (default: localhost)"
        ),
    )
    brozzler.cli.add_common_options(arg_parser, argv)
    args = arg_parser.parse_args(args=argv[1:])
    brozzler.cli.configure_logging(args)
    run()
# start the dashboard when this module is executed as a script
if __name__ == "__main__":
    main()