mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-20 23:56:34 -04:00
Merge branch 'master' into qa
This commit is contained in:
commit
8f726eac76
55
README.rst
55
README.rst
@ -20,19 +20,56 @@ archiving.
|
||||
Installation
|
||||
------------
|
||||
|
||||
XXX These instructions don't work at the moment. Brozzler requires some
|
||||
customized packages not easily installable in the outside world. I intend to
|
||||
remedy the situation soon.
|
||||
|
||||
::
|
||||
|
||||
# set up virtualenv if desired
|
||||
pip install git+https://github.com/nlevitt/brozzler.git
|
||||
pip install brozzler
|
||||
|
||||
Brozzler also requires a rethinkdb deployment.
|
||||
|
||||
Fonts for good screenshots
|
||||
--------------------------
|
||||
Usage
|
||||
-----
|
||||
|
||||
Launch one or more workers:
|
||||
|
||||
::
|
||||
|
||||
brozzler-worker -e chromium
|
||||
|
||||
Submit jobs:
|
||||
|
||||
::
|
||||
|
||||
brozzler-new-job myjob.yaml
|
||||
|
||||
Job Configuration
|
||||
-----------------
|
||||
|
||||
Jobs are defined using yaml files. Options may be specified either at the
|
||||
top-level or on individual seeds. A job id and at least one seed url
|
||||
must be specified, everything else is optional.
|
||||
|
||||
::
|
||||
|
||||
id: myjob
|
||||
time_limit: 60 # seconds
|
||||
proxy: 127.0.0.1:8000 # point at warcprox for archiving
|
||||
ignore_robots: false
|
||||
enable_warcprox_features: false
|
||||
warcprox_meta: null
|
||||
metadata: {}
|
||||
seeds:
|
||||
- url: http://one.example.org/
|
||||
- url: http://two.example.org/
|
||||
time_limit: 30
|
||||
- url: http://three.example.org/
|
||||
time_limit: 10
|
||||
ignore_robots: true
|
||||
scope:
|
||||
surt: http://(org,example,
|
||||
|
||||
Fonts (for decent screenshots)
|
||||
------------------------------
|
||||
|
||||
On ubuntu 14.04 trusty I installed these packages:
|
||||
|
||||
@ -42,12 +79,10 @@ fonts-arphic-ukai fonts-farsiweb fonts-nafees fonts-sil-abyssinica
|
||||
fonts-sil-ezra fonts-sil-padauk fonts-unfonts-extra fonts-unfonts-core
|
||||
ttf-indic-fonts fonts-thai-tlwg fonts-lklug-sinhala
|
||||
|
||||
Haven't looked much at the resulting screenshots yet though.
|
||||
|
||||
License
|
||||
-------
|
||||
|
||||
Copyright 2015 Internet Archive
|
||||
Copyright 2015-2016 Internet Archive
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
not use this software except in compliance with the License. You may
|
||||
|
@ -27,6 +27,7 @@ import re
|
||||
import rethinkstuff
|
||||
import warnings
|
||||
import requests
|
||||
import json
|
||||
|
||||
arg_parser = argparse.ArgumentParser(prog=os.path.basename(__file__),
|
||||
description="brozzler-new-site - register site to brozzle",
|
||||
@ -63,7 +64,7 @@ site = brozzler.Site(
|
||||
time_limit=int(args.time_limit) if args.time_limit else None,
|
||||
ignore_robots=args.ignore_robots,
|
||||
enable_warcprox_features=args.enable_warcprox_features,
|
||||
warcprox_meta=json.loads(args.warcprox_meta))
|
||||
warcprox_meta=json.loads(args.warcprox_meta) if args.warcprox_meta else None)
|
||||
|
||||
r = rethinkstuff.Rethinker(args.rethinkdb_servers.split(","), args.rethinkdb_db)
|
||||
frontier = brozzler.RethinkDbFrontier(r)
|
||||
|
159
brozzler/behaviors.d/fec_gov.js
Normal file
159
brozzler/behaviors.d/fec_gov.js
Normal file
@ -0,0 +1,159 @@
|
||||
/*
|
||||
* brozzler/behaviors.d/fec_gov.js - click on links that execute JavaScript to
|
||||
* download report csv files for fec.gov/data
|
||||
*
|
||||
* Copyright (C) 2014-2016 Internet Archive
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/*
 * Behavior object for fec.gov/data pages. Every 250ms it looks for csv
 * download links (anchors whose id starts with "id_csv") in the top document
 * and in every reachable iframe, clicks them (selecting each associated
 * radio button first, when there are any), and scrolls the page when nothing
 * on screen is left to click. isFinished() reports completion once nothing
 * has happened for IDLE_TIMEOUT_SEC seconds.
 */
var umbraBehavior = {
    // Seconds without a click or scroll after which isFinished() returns true.
    IDLE_TIMEOUT_SEC : 10,
    // Date.now() timestamp of the most recent activity; null until the first
    // polling pass has run.
    idleSince : null,
    // Not referenced anywhere in this script; left in place in case outside
    // code reads it.
    alreadyClicked : {},

    /*
     * One polling pass: click whatever can be clicked right now, otherwise
     * scroll towards targets that are off screen, otherwise leave the idle
     * clock running.
     */
    intervalFunc : function() {
        var clickedSomething = false;
        var somethingLeftBelow = false;
        var somethingLeftAbove = false;
        var cssDownloadLinkSelector = "a[id^='id_csv']";

        var documents = this.reachableDocuments();

        for (var j = 0; j < documents.length; j++) {
            var clickDownloadLinkTargets = documents[j].querySelectorAll(cssDownloadLinkSelector);
            for (var i = 0; i < clickDownloadLinkTargets.length; i++) {
                // The part of the link id after its first 7 characters is
                // used as the name of the radio button group for the link.
                var sourceName = clickDownloadLinkTargets[i].id.substring(7);
                var clickRadioButtonTargets = documents[j].querySelectorAll("input[name='" + sourceName + "']");

                if (clickRadioButtonTargets.length == 0) {
                    // No radio buttons: the link is clicked exactly once,
                    // wherever it is on the page.
                    if (clickDownloadLinkTargets[i].umbraClicked) {
                        continue;
                    }

                    this.dispatchMouseOver(clickDownloadLinkTargets[i]);
                    clickDownloadLinkTargets[i].click(); //click the link to download the csv
                    clickedSomething = true;
                    this.idleSince = null;
                    clickDownloadLinkTargets[i].umbraClicked = true;
                }
                else {
                    for (var k = 0; k < clickRadioButtonTargets.length; ++k) {
                        if (clickRadioButtonTargets[k].umbraClicked) {
                            continue;
                        }

                        var where = this.aboveBelowOrOnScreen(clickRadioButtonTargets[k]);
                        if (where == 0) {
                            console.log("clicking on " + clickRadioButtonTargets[k]);
                            // do mouse over event on click target
                            // since some urls are requested only on
                            // this event - see
                            // https://webarchive.jira.com/browse/AITFIVE-451
                            this.dispatchMouseOver(clickRadioButtonTargets[k]);
                            clickRadioButtonTargets[k].click(); //select the correct date with the radio button
                            this.dispatchMouseOver(clickDownloadLinkTargets[i]);
                            clickDownloadLinkTargets[i].click(); //click the link to download the csv for the selected date
                            clickedSomething = true;
                            this.idleSince = null;
                            clickRadioButtonTargets[k].umbraClicked = true;

                            // At most one radio button per link per pass;
                            // the remaining ones are handled by later passes.
                            break; //break from clickTargets loop, but not from iframe loop
                        } else if (where > 0) {
                            somethingLeftBelow = true;
                        } else if (where < 0) {
                            somethingLeftAbove = true;
                        }
                    }
                }
            }
        }

        if (!clickedSomething) {
            if (somethingLeftAbove) {
                // everything on this screen has been clicked but we missed
                // something above
                window.scrollBy(0, -500);
                this.idleSince = null;
            } else if (somethingLeftBelow) {
                // everything on this screen has been clicked but there's
                // more below
                window.scrollBy(0, 200);
                this.idleSince = null;
            } else if (window.scrollY + window.innerHeight < document.documentElement.scrollHeight) {
                // not at the bottom of the page yet
                window.scrollBy(0, 200);
                this.idleSince = null;
            } else if (this.idleSince == null) {
                this.idleSince = Date.now();
            }
        }

        // idleSince was cleared above whenever this pass clicked or scrolled;
        // restart the clock here so that isFinished() measures the time since
        // the most recent activity.
        if (!this.idleSince) {
            this.idleSince = Date.now();
        }
    },

    /*
     * Returns the top-level document followed by the document of every
     * iframe whose contents this script is allowed to read.
     */
    reachableDocuments : function() {
        var docs = [document];
        var iframes = document.querySelectorAll("iframe");

        for (var i = 0; i < iframes.length; i++) {
            try {
                var frameDocument = iframes[i].contentWindow.document;
                if (frameDocument) {
                    docs.push(frameDocument);
                }
            } catch (e) {
                // Reading the document of a cross-origin frame throws. Such
                // frames cannot be inspected, so skip them instead of
                // aborting the whole pass.
            }
        }

        return docs;
    },

    /*
     * Fires a bubbling, non-cancelable "mouseover" event at `target`.
     */
    dispatchMouseOver : function(target) {
        var mouseOverEvent = document.createEvent('Events');
        mouseOverEvent.initEvent("mouseover",true, false);
        target.dispatchEvent(mouseOverEvent);
    },

    /*
     * Starts polling: runs intervalFunc every 250ms and remembers the timer
     * id in this.intervalId so that isFinished() can stop it.
     */
    start : function() {
        var that = this;
        this.intervalId = setInterval(function() {
            that.intervalFunc();
        }, 250);
    },

    /*
     * Returns true, and stops the polling timer, once more than
     * IDLE_TIMEOUT_SEC seconds have passed since idleSince. Returns false
     * otherwise, including before the first pass has run.
     */
    isFinished : function() {
        if (this.idleSince != null) {
            var idleTimeMs = Date.now() - this.idleSince;
            if (idleTimeMs / 1000 > this.IDLE_TIMEOUT_SEC) {
                clearInterval(this.intervalId);
                return true;
            }
        }
        return false;
    },

    /*
     * Classifies element `e` by the position of its top edge relative to the
     * visible viewport: -1 if above it, 1 if below it, 0 if inside it.
     *
     * getBoundingClientRect() is already relative to the viewport, so the
     * top edge is compared against 0 and window.innerHeight directly, with
     * no scroll offset added.
     * NOTE(review): for elements inside an iframe the rect is relative to
     * that iframe's own viewport - confirm whether that matters for fec.gov.
     */
    aboveBelowOrOnScreen : function(e) {
        var eTop = e.getBoundingClientRect().top;
        if (eTop < 0) {
            return -1; // above
        } else if (eTop > window.innerHeight) {
            return 1;  // below
        } else {
            return 0;  // on screen
        }
    },
};
|
||||
|
||||
// Completion hook; invoked from outside of this script. Reports whatever
// umbraBehavior.isFinished() reports.
var umbraBehaviorFinished = function() {
    var finished = umbraBehavior.isFinished();
    return finished;
};

// Begin running the behavior as soon as this script is evaluated.
umbraBehavior.start();
|
@ -98,6 +98,10 @@ behaviors:
|
||||
click_css_selector: button[data-more-results-bottom-button]
|
||||
click_until_hard_timeout: True
|
||||
request_idle_timeout_sec: 10
|
||||
- # https://webarchive.jira.com/browse/ARI-4692
|
||||
url_regex: '^https?://(?:www\.)?fec.gov/data/.*$'
|
||||
behavior_js: fec_gov.js
|
||||
request_idle_timeout_sec: 10
|
||||
- # default fallback behavior
|
||||
url_regex: '^.*$'
|
||||
request_idle_timeout_sec: 10
|
||||
|
@ -1,21 +1,21 @@
|
||||
#
|
||||
# brozzler/browser.py - classes responsible for running web browsers
|
||||
# (chrome/chromium) and browsing web pages in them
|
||||
#
|
||||
# Copyright (C) 2014-2016 Internet Archive
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
'''
|
||||
brozzler/browser.py - classes responsible for running web browsers
|
||||
(chrome/chromium) and browsing web pages in them
|
||||
|
||||
Copyright (C) 2014-2016 Internet Archive
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
'''
|
||||
|
||||
import logging
|
||||
import json
|
||||
@ -58,7 +58,10 @@ class BrowserPool:
|
||||
self.logger.info("browser ports: {}".format([browser.chrome_port for browser in self._available]))
|
||||
|
||||
def acquire(self):
|
||||
"""Returns browser from pool if available, raises NoBrowsersAvailable otherwise."""
|
||||
"""
|
||||
Returns browser from pool if available, raises NoBrowsersAvailable
|
||||
otherwise.
|
||||
"""
|
||||
with self._lock:
|
||||
try:
|
||||
browser = self._available.pop()
|
||||
@ -277,11 +280,23 @@ class Browser:
|
||||
self.logger.info("retrieving outlinks for %s", self.url)
|
||||
self._waiting_on_outlinks_msg_id = self.send_to_chrome(
|
||||
method="Runtime.evaluate",
|
||||
params={"expression":"Array.prototype.slice.call(document.querySelectorAll('a[href]')).join(' ')"})
|
||||
params={"expression": self.OUTLINKS_JS})
|
||||
return False
|
||||
else: # self._waiting_on_outlinks_msg_id
|
||||
return False
|
||||
|
||||
OUTLINKS_JS = """
|
||||
var compileOutlinks = function(frame) {
|
||||
var outlinks = Array.prototype.slice.call(
|
||||
frame.document.querySelectorAll('a[href]'));
|
||||
for (var i = 0; i < frame.frames.length; i++) {
|
||||
outlinks = outlinks.concat(compileOutlinks(frame.frames[i]));
|
||||
}
|
||||
return outlinks;
|
||||
}
|
||||
compileOutlinks(window).join(' ');
|
||||
"""
|
||||
|
||||
def _browse_interval_func(self):
|
||||
"""Called periodically while page is being browsed. Returns True when
|
||||
finished browsing."""
|
||||
@ -393,7 +408,8 @@ class Browser:
|
||||
self._waiting_on_scroll_to_top_msg_id = None
|
||||
elif message["id"] == self._waiting_on_outlinks_msg_id:
|
||||
self.logger.debug("got outlinks message=%s", message)
|
||||
self._outlinks = frozenset(message["result"]["result"]["value"].split(" "))
|
||||
self._outlinks = frozenset(
|
||||
message["result"]["result"]["value"].split())
|
||||
elif message["id"] == self._waiting_on_document_url_msg_id:
|
||||
if message["result"]["result"]["value"] != self.url:
|
||||
if self.on_url_change:
|
||||
|
@ -39,21 +39,43 @@ class RethinkDbFrontier:
|
||||
def _ensure_db(self):
|
||||
dbs = self.r.db_list().run()
|
||||
if not self.r.dbname in dbs:
|
||||
self.logger.info("creating rethinkdb database %s", repr(self.r.dbname))
|
||||
self.logger.info(
|
||||
"creating rethinkdb database %s", repr(self.r.dbname))
|
||||
self.r.db_create(self.r.dbname).run()
|
||||
tables = self.r.table_list().run()
|
||||
if not "sites" in tables:
|
||||
self.logger.info("creating rethinkdb table 'sites' in database %s", repr(self.r.dbname))
|
||||
self.r.table_create("sites", shards=self.shards, replicas=self.replicas).run()
|
||||
self.r.table("sites").index_create("sites_last_disclaimed", [self.r.row["status"], self.r.row["last_disclaimed"]]).run()
|
||||
self.logger.info(
|
||||
"creating rethinkdb table 'sites' in database %s",
|
||||
repr(self.r.dbname))
|
||||
self.r.table_create(
|
||||
"sites", shards=self.shards, replicas=self.replicas).run()
|
||||
self.r.table("sites").index_create(
|
||||
"sites_last_disclaimed", [
|
||||
self.r.row["status"],
|
||||
self.r.row["last_disclaimed"]]).run()
|
||||
self.r.table("sites").index_create("job_id").run()
|
||||
if not "pages" in tables:
|
||||
self.logger.info("creating rethinkdb table 'pages' in database %s", repr(self.r.dbname))
|
||||
self.r.table_create("pages", shards=self.shards, replicas=self.replicas).run()
|
||||
self.r.table("pages").index_create("priority_by_site", [self.r.row["site_id"], self.r.row["brozzle_count"], self.r.row["claimed"], self.r.row["priority"]]).run()
|
||||
self.logger.info(
|
||||
"creating rethinkdb table 'pages' in database %s",
|
||||
repr(self.r.dbname))
|
||||
self.r.table_create(
|
||||
"pages", shards=self.shards, replicas=self.replicas).run()
|
||||
self.r.table("pages").index_create(
|
||||
"priority_by_site", [
|
||||
self.r.row["site_id"], self.r.row["brozzle_count"],
|
||||
self.r.row["claimed"], self.r.row["priority"]]).run()
|
||||
# this index is for displaying pages in a sensible order in the web
|
||||
# console
|
||||
self.r.table("pages").index_create(
|
||||
"least_hops", [
|
||||
r.row["site_id"], r.row["brozzle_count"],
|
||||
r.row["hops_from_seed"]])
|
||||
if not "jobs" in tables:
|
||||
self.logger.info("creating rethinkdb table 'jobs' in database %s", repr(self.r.dbname))
|
||||
self.r.table_create("jobs", shards=self.shards, replicas=self.replicas).run()
|
||||
self.logger.info(
|
||||
"creating rethinkdb table 'jobs' in database %s",
|
||||
repr(self.r.dbname))
|
||||
self.r.table_create(
|
||||
"jobs", shards=self.shards, replicas=self.replicas).run()
|
||||
|
||||
def _vet_result(self, result, **kwargs):
|
||||
# self.logger.debug("vetting expected=%s result=%s", kwargs, result)
|
||||
|
11
setup.py
11
setup.py
@ -19,10 +19,11 @@
|
||||
import setuptools
|
||||
import glob
|
||||
|
||||
setuptools.setup(name='brozzler',
|
||||
version='1.1.dev10',
|
||||
setuptools.setup(
|
||||
name='brozzler',
|
||||
version='1.1.dev20',
|
||||
description='Distributed web crawling with browsers',
|
||||
url='https://github.com/nlevitt/brozzler',
|
||||
url='https://github.com/internetarchive/brozzler',
|
||||
author='Noah Levitt',
|
||||
author_email='nlevitt@archive.org',
|
||||
long_description=open('README.rst', encoding='UTF-8').read(),
|
||||
@ -41,10 +42,10 @@ setuptools.setup(name='brozzler',
|
||||
'rethinkstuff',
|
||||
'rethinkdb>=2.3,<2.4',
|
||||
'psutil',
|
||||
],
|
||||
],
|
||||
zip_safe=False,
|
||||
classifiers=[
|
||||
'Development Status :: 3 - Alpha',
|
||||
'Development Status :: 4 - Beta',
|
||||
'Environment :: Console',
|
||||
'License :: OSI Approved :: Apache Software License',
|
||||
'Programming Language :: Python :: 3.4',
|
||||
|
@ -1 +1 @@
|
||||
flask --debug --app=brozzler-webconsole.py run --host=0.0.0.0 --port=8081
|
||||
gunicorn --bind=0.0.0.0:8081 brozzler-webconsole:app
|
||||
|
@ -1,21 +1,21 @@
|
||||
#
|
||||
# brozzler-webconsole/__init__.py - flask app for brozzler web console, defines
|
||||
# api endpoints etc
|
||||
#
|
||||
# Copyright (C) 2014-2016 Internet Archive
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
'''
|
||||
brozzler-webconsole/__init__.py - flask app for brozzler web console, defines
|
||||
api endpoints etc
|
||||
|
||||
Copyright (C) 2014-2016 Internet Archive
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
'''
|
||||
|
||||
import flask
|
||||
import rethinkstuff
|
||||
@ -24,16 +24,26 @@ import sys
|
||||
import os
|
||||
import importlib
|
||||
import rethinkdb
|
||||
import logging
|
||||
import yaml
|
||||
|
||||
# XXX flask does its own logging config
|
||||
# import logging
|
||||
# logging.basicConfig(stream=sys.stdout, level=logging.INFO,
|
||||
# format="%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s")
|
||||
# flask does its own logging config
|
||||
# logging.basicConfig(
|
||||
# stream=sys.stdout, level=logging.INFO,
|
||||
# format=(
|
||||
# "%(asctime)s %(process)d %(levelname)s %(threadName)s "
|
||||
# "%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s")
|
||||
|
||||
app = flask.Flask(__name__)
|
||||
|
||||
# http://stackoverflow.com/questions/26578733/why-is-flask-application-not-creating-any-logs-when-hosted-by-gunicorn
|
||||
gunicorn_error_logger = logging.getLogger('gunicorn.error')
|
||||
app.logger.handlers.extend(gunicorn_error_logger.handlers)
|
||||
app.logger.setLevel(logging.INFO)
|
||||
app.logger.info('will this show in the log?')
|
||||
|
||||
# configure with environment variables
|
||||
SETTINGS= {
|
||||
SETTINGS = {
|
||||
'RETHINKDB_SERVERS': os.environ.get(
|
||||
'RETHINKDB_SERVERS', 'localhost').split(','),
|
||||
'RETHINKDB_DB': os.environ.get('RETHINKDB_DB', 'brozzler'),
|
||||
@ -81,10 +91,10 @@ def pages(site_id):
|
||||
app.logger.info("flask.request.args=%s", flask.request.args)
|
||||
start = int(flask.request.args.get("start", 0))
|
||||
end = int(flask.request.args.get("end", start + 90))
|
||||
app.logger.info("yes new query")
|
||||
pages_ = r.table("pages").between(
|
||||
[site_id, 1, False, r.minval],
|
||||
[site_id, r.maxval, False, r.maxval],
|
||||
index="priority_by_site")[start:end].run()
|
||||
[site_id, 1, r.minval], [site_id, r.maxval, r.maxval],
|
||||
index="least_hops").order_by(index="least_hops")[start:end].run()
|
||||
return flask.jsonify(pages=list(pages_))
|
||||
|
||||
@app.route("/api/sites/<site_id>")
|
||||
@ -110,6 +120,14 @@ def job(job_id):
|
||||
job_ = r.table("jobs").get(job_id).run()
|
||||
return flask.jsonify(job_)
|
||||
|
||||
@app.route("/api/jobs/<int:job_id>/yaml")
@app.route("/api/job/<int:job_id>/yaml")
def job_yaml(job_id):
    """
    Looks up the job document with id `job_id` in the "jobs" table and
    returns it serialized as block-style yaml with mimetype
    application/yaml.
    """
    job_doc = r.table("jobs").get(job_id).run()
    yaml_text = yaml.dump(job_doc, default_flow_style=False)
    return app.response_class(yaml_text, mimetype='application/yaml')
|
||||
|
||||
@app.route("/api/workers")
|
||||
def workers():
|
||||
workers_ = service_registry.available_services("brozzler-worker")
|
||||
|
@ -125,11 +125,10 @@ function loadSiteStats($http, site, job) {
|
||||
$http.get("/api/sites/" + site.id + "/page_count").success(pageCountSuccessCallback(site, job));
|
||||
$http.get("/api/sites/" + site.id + "/queued_count").success(queuedCountSuccessCallback(site, job));
|
||||
|
||||
// parse Warcprox-Meta to find stats bucket
|
||||
var warcprox_meta = angular.fromJson(site.extra_headers["Warcprox-Meta"]);
|
||||
for (var j = 0; j < warcprox_meta.stats.buckets.length; j++) {
|
||||
if (warcprox_meta.stats.buckets[j].indexOf("seed") >= 0) {
|
||||
var bucket = warcprox_meta.stats.buckets[j];
|
||||
// look at Warcprox-Meta to find stats bucket
|
||||
for (var j = 0; j < site.warcprox_meta.stats.buckets.length; j++) {
|
||||
if (site.warcprox_meta.stats.buckets[j].indexOf("seed") >= 0) {
|
||||
var bucket = site.warcprox_meta.stats.buckets[j];
|
||||
// console.log("warcprox_meta.stats.buckets[" + j + "]=" + bucket);
|
||||
$http.get("/api/stats/" + bucket).success(statsSuccessCallback(site, bucket));
|
||||
}
|
||||
@ -138,7 +137,8 @@ function loadSiteStats($http, site, job) {
|
||||
|
||||
brozzlerControllers.controller("JobController", ["$scope", "$routeParams", "$http",
|
||||
function($scope, $routeParams, $http) {
|
||||
console.log('JobController');
|
||||
$scope.show_yaml = false;
|
||||
// console.log('JobController');
|
||||
$http.get("/api/config").success(function(data) {
|
||||
$scope.config = data.config;
|
||||
});
|
||||
@ -159,6 +159,9 @@ brozzlerControllers.controller("JobController", ["$scope", "$routeParams", "$htt
|
||||
}
|
||||
});
|
||||
});
|
||||
$http.get("/api/jobs/" + $routeParams.id + "/yaml").success(function(data) {
|
||||
$scope.job_yaml = data;
|
||||
});
|
||||
}]);
|
||||
|
||||
brozzlerControllers.controller("SiteController", ["$scope", "$routeParams", "$http", "$window",
|
||||
|
@ -10,7 +10,12 @@
|
||||
</div>
|
||||
|
||||
<div>
|
||||
<h2>Job {{job.id}} <small>{{job.started}}-{{job.finished}} {{job.status}}</small></h2>
|
||||
<h2 ng-click="show_yaml = !show_yaml">
|
||||
<span class="fa fa-caret-right"
|
||||
ng-class="{ 'fa-caret-right': !show_yaml, 'fa-caret-down': !!show_yaml }"></span>
|
||||
Job {{job.id}} <small>{{job.started}}-{{job.finished}}: {{job.status}}</small>
|
||||
</h2>
|
||||
<pre style="display:{{show_yaml?'block':'none'}}">{{job_yaml}}</pre>
|
||||
|
||||
<div class="row bigstats">
|
||||
<div class="col-sm-6 col-md-3">
|
||||
|
@ -40,12 +40,22 @@
|
||||
<div class="col-sm-12">
|
||||
<h2>Pages</h2>
|
||||
<div class="col-sm-6 col-md-4" ng-repeat="page in pages">
|
||||
<a class="thumbnail" href="{{config.WAYBACK_BASEURL}}/3/{{page.url}}">
|
||||
<img style="width:300px;height:190px" src="{{config.WAYBACK_BASEURL}}/3/thumbnail:{{page.url}}" alt="thumb">
|
||||
<div class="thumbnail">
|
||||
<img style="border:1px solid #ddd;width:300px;height:190px" src="{{config.WAYBACK_BASEURL}}/3/thumbnail:{{page.url}}" alt="thumb">
|
||||
<div class="caption">
|
||||
<h5>{{page.url}}</h5>
|
||||
<ul class="fa-ul">
|
||||
<li>
|
||||
<span class="fa fa-li fa-camera"></span>
|
||||
<a target="_blank" href="{{config.WAYBACK_BASEURL}}/3/screenshot:{{page.url}}">full size screenshot ></a>
|
||||
</li>
|
||||
<li>
|
||||
<span class="fa fa-li fa-university"></span>
|
||||
<a target="_blank" href="{{config.WAYBACK_BASEURL}}/3/{{page.url}}">wayback ></a>
|
||||
</li>
|
||||
</ul>
|
||||
</div>
|
||||
</a>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="col-sm-12" ng-show="loading">
|
||||
|
@ -1,2 +1,4 @@
|
||||
git+https://github.com/mitsuhiko/flask.git
|
||||
rethinkstuff>=0.1.5
|
||||
flask>=0.11
|
||||
gunicorn
|
||||
PyYAML
|
||||
|
Loading…
x
Reference in New Issue
Block a user