mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-08-12 16:25:34 -04:00
Merge branch 'master' into qa
This commit is contained in:
commit
8f726eac76
13 changed files with 360 additions and 84 deletions
55
README.rst
55
README.rst
|
@ -20,19 +20,56 @@ archiving.
|
||||||
Installation
|
Installation
|
||||||
------------
|
------------
|
||||||
|
|
||||||
XXX These instructions don't work at the moment. Brozzler requires some
|
|
||||||
customized packages not easily installable in the outside world. I intend to
|
|
||||||
remedy the situation soon.
|
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
# set up virtualenv if desired
|
# set up virtualenv if desired
|
||||||
pip install git+https://github.com/nlevitt/brozzler.git
|
pip install brozzler
|
||||||
|
|
||||||
Brozzler also requires a rethinkdb deployment.
|
Brozzler also requires a rethinkdb deployment.
|
||||||
|
|
||||||
Fonts for good screenshots
|
Usage
|
||||||
--------------------------
|
-----
|
||||||
|
|
||||||
|
Launch one or more workers:
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
brozzler-worker -e chromium
|
||||||
|
|
||||||
|
Submit jobs:
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
brozzler-new-job myjob.yaml
|
||||||
|
|
||||||
|
Job Configuration
|
||||||
|
-----------------
|
||||||
|
|
||||||
|
Jobs are defined using yaml files. Options may be specified either at the
|
||||||
|
top-level or on individual seeds. A job id and at least one seed url
|
||||||
|
must be specified; everything else is optional.
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
id: myjob
|
||||||
|
time_limit: 60 # seconds
|
||||||
|
proxy: 127.0.0.1:8000 # point at warcprox for archiving
|
||||||
|
ignore_robots: false
|
||||||
|
enable_warcprox_features: false
|
||||||
|
warcprox_meta: null
|
||||||
|
metadata: {}
|
||||||
|
seeds:
|
||||||
|
- url: http://one.example.org/
|
||||||
|
- url: http://two.example.org/
|
||||||
|
time_limit: 30
|
||||||
|
- url: http://three.example.org/
|
||||||
|
time_limit: 10
|
||||||
|
ignore_robots: true
|
||||||
|
scope:
|
||||||
|
surt: http://(org,example,
|
||||||
|
|
||||||
|
Fonts (for decent screenshots)
|
||||||
|
------------------------------
|
||||||
|
|
||||||
On ubuntu 14.04 trusty I installed these packages:
|
On ubuntu 14.04 trusty I installed these packages:
|
||||||
|
|
||||||
|
@ -42,12 +79,10 @@ fonts-arphic-ukai fonts-farsiweb fonts-nafees fonts-sil-abyssinica
|
||||||
fonts-sil-ezra fonts-sil-padauk fonts-unfonts-extra fonts-unfonts-core
|
fonts-sil-ezra fonts-sil-padauk fonts-unfonts-extra fonts-unfonts-core
|
||||||
ttf-indic-fonts fonts-thai-tlwg fonts-lklug-sinhala
|
ttf-indic-fonts fonts-thai-tlwg fonts-lklug-sinhala
|
||||||
|
|
||||||
Haven't looked much at the resulting screenshots yet though.
|
|
||||||
|
|
||||||
License
|
License
|
||||||
-------
|
-------
|
||||||
|
|
||||||
Copyright 2015 Internet Archive
|
Copyright 2015-2016 Internet Archive
|
||||||
|
|
||||||
Licensed under the Apache License, Version 2.0 (the "License"); you may
|
Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||||
not use this software except in compliance with the License. You may
|
not use this software except in compliance with the License. You may
|
||||||
|
|
|
@ -27,6 +27,7 @@ import re
|
||||||
import rethinkstuff
|
import rethinkstuff
|
||||||
import warnings
|
import warnings
|
||||||
import requests
|
import requests
|
||||||
|
import json
|
||||||
|
|
||||||
arg_parser = argparse.ArgumentParser(prog=os.path.basename(__file__),
|
arg_parser = argparse.ArgumentParser(prog=os.path.basename(__file__),
|
||||||
description="brozzler-new-site - register site to brozzle",
|
description="brozzler-new-site - register site to brozzle",
|
||||||
|
@ -63,7 +64,7 @@ site = brozzler.Site(
|
||||||
time_limit=int(args.time_limit) if args.time_limit else None,
|
time_limit=int(args.time_limit) if args.time_limit else None,
|
||||||
ignore_robots=args.ignore_robots,
|
ignore_robots=args.ignore_robots,
|
||||||
enable_warcprox_features=args.enable_warcprox_features,
|
enable_warcprox_features=args.enable_warcprox_features,
|
||||||
warcprox_meta=json.loads(args.warcprox_meta))
|
warcprox_meta=json.loads(args.warcprox_meta) if args.warcprox_meta else None)
|
||||||
|
|
||||||
r = rethinkstuff.Rethinker(args.rethinkdb_servers.split(","), args.rethinkdb_db)
|
r = rethinkstuff.Rethinker(args.rethinkdb_servers.split(","), args.rethinkdb_db)
|
||||||
frontier = brozzler.RethinkDbFrontier(r)
|
frontier = brozzler.RethinkDbFrontier(r)
|
||||||
|
|
159
brozzler/behaviors.d/fec_gov.js
Normal file
159
brozzler/behaviors.d/fec_gov.js
Normal file
|
@ -0,0 +1,159 @@
|
||||||
|
/*
|
||||||
|
* brozzler/behaviors.d/fec_gov.js - click on links that execute JavaScript to
|
||||||
|
* download report csv files for fec.gov/data
|
||||||
|
*
|
||||||
|
* Copyright (C) 2014-2016 Internet Archive
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
 * Behavior object driven by a 250 ms timer (see start()). On each tick it
 * looks for csv download links (anchors whose id starts with "id_csv") in the
 * top document and in every iframe it is allowed to read, clicks the ones it
 * has not clicked yet, and scrolls when nothing on screen is left to click.
 * isFinished() reports true once IDLE_TIMEOUT_SEC seconds have passed without
 * a click or a scroll.
 */
var umbraBehavior = {
    IDLE_TIMEOUT_SEC : 10,
    // Timestamp (ms) of the most recent tick that clicked or scrolled, or of
    // the first tick that found nothing to do; null until the first tick.
    idleSince : null,
    // Not used by this behavior; kept so the object's shape is unchanged.
    alreadyClicked : {},

    /*
     * One timer tick: click whatever is clickable, otherwise scroll toward
     * unclicked targets or the bottom of the page, and update idleSince.
     */
    intervalFunc : function() {
        var clickedSomething = false;
        var somethingLeftBelow = false;
        var somethingLeftAbove = false;
        var cssDownloadLinkSelector = "a[id^='id_csv']";

        var documents = this.reachableDocuments();

        for (var j = 0; j < documents.length; j++) {
            var doc = documents[j];
            var downloadLinks = doc.querySelectorAll(cssDownloadLinkSelector);

            for (var i = 0; i < downloadLinks.length; i++) {
                var downloadLink = downloadLinks[i];
                // Drops the first 7 characters of the id (presumably the
                // "id_csv_" prefix); the remainder is used as the name of
                // the radio button group that belongs to this link.
                var sourceName = downloadLink.id.substring(7);
                var radioButtons = doc.querySelectorAll(
                        "input[name='" + sourceName + "']");

                if (radioButtons.length === 0) {
                    // Plain download link with no date selection: click once.
                    if (downloadLink.umbraClicked) {
                        continue;
                    }
                    this.hoverAndClick(downloadLink); // download the csv
                    clickedSomething = true;
                    downloadLink.umbraClicked = true;
                    continue;
                }

                for (var k = 0; k < radioButtons.length; k++) {
                    var radioButton = radioButtons[k];
                    if (radioButton.umbraClicked) {
                        continue;
                    }

                    var where = this.aboveBelowOrOnScreen(radioButton);
                    if (where === 0) {
                        console.log("clicking on " + radioButton);
                        // do mouse over event on click target
                        // since some urls are requested only on
                        // this event - see
                        // https://webarchive.jira.com/browse/AITFIVE-451
                        this.hoverAndClick(radioButton);  // select the date
                        this.hoverAndClick(downloadLink); // csv for that date
                        clickedSomething = true;
                        radioButton.umbraClicked = true;

                        // One radio button per link per tick; keep going
                        // with the remaining links and documents.
                        break;
                    } else if (where > 0) {
                        somethingLeftBelow = true;
                    } else {
                        somethingLeftAbove = true;
                    }
                }
            }
        }

        var scrolled = false;
        if (!clickedSomething) {
            if (somethingLeftAbove) {
                // everything on screen has been clicked but we missed
                // something above
                window.scrollBy(0, -500);
                scrolled = true;
            } else if (somethingLeftBelow) {
                // everything on screen has been clicked but there is more
                // below
                window.scrollBy(0, 200);
                scrolled = true;
            } else if (window.scrollY + window.innerHeight
                    < document.documentElement.scrollHeight) {
                // not at the bottom of the page yet
                window.scrollBy(0, 200);
                scrolled = true;
            }
        }

        // Any activity restarts the idle clock; the first idle tick starts it.
        if (clickedSomething || scrolled || this.idleSince == null) {
            this.idleSince = Date.now();
        }
    },

    /*
     * Returns the top document followed by the document of every iframe that
     * can be read. Reading the document of a cross-origin frame throws; such
     * frames are skipped on purpose so that one foreign iframe cannot abort
     * every tick and keep the behavior from ever finishing.
     */
    reachableDocuments : function() {
        var documents = [document];
        var iframes = document.querySelectorAll("iframe");
        for (var i = 0; i < iframes.length; i++) {
            try {
                var frameWindow = iframes[i].contentWindow;
                if (frameWindow && frameWindow.document) {
                    documents.push(frameWindow.document);
                }
            } catch (e) {
                // inaccessible (e.g. cross-origin) frame - nothing to click
            }
        }
        return documents;
    },

    /*
     * Dispatches a bubbling, non-cancelable "mouseover" event on the target
     * and then clicks it.
     */
    hoverAndClick : function(target) {
        var mouseOverEvent = document.createEvent('Events');
        mouseOverEvent.initEvent("mouseover", true, false);
        target.dispatchEvent(mouseOverEvent);
        target.click();
    },

    /* Starts the 250 ms timer that drives intervalFunc(). */
    start : function() {
        var that = this;
        this.intervalId = setInterval(function() {
            that.intervalFunc();
        }, 250);
    },

    /*
     * Returns true, and stops the timer, once more than IDLE_TIMEOUT_SEC
     * seconds have elapsed since idleSince; returns false otherwise.
     */
    isFinished : function() {
        if (this.idleSince != null) {
            var idleTimeMs = Date.now() - this.idleSince;
            if (idleTimeMs / 1000 > this.IDLE_TIMEOUT_SEC) {
                clearInterval(this.intervalId);
                return true;
            }
        }
        return false;
    },

    /*
     * Returns -1 if the top edge of e is above the visible area, 1 if it is
     * below, 0 if it is on screen. getBoundingClientRect() is already
     * relative to the viewport, so the top edge is compared with 0 and
     * window.innerHeight rather than with the page scroll offset.
     * NOTE(review): for an element inside an iframe the rectangle is relative
     * to that iframe's viewport, not the top window - confirm whether the
     * frame offset needs to be added for the fec.gov pages.
     */
    aboveBelowOrOnScreen : function(e) {
        var eTop = e.getBoundingClientRect().top;
        if (eTop < 0) {
            return -1; // above
        } else if (eTop > window.innerHeight) {
            return 1;  // below
        } else {
            return 0;  // on screen
        }
    },
};

// Called from outside of this script.
var umbraBehaviorFinished = function() {
    return umbraBehavior.isFinished();
};
|
||||||
|
|
||||||
|
umbraBehavior.start();
|
|
@ -98,6 +98,10 @@ behaviors:
|
||||||
click_css_selector: button[data-more-results-bottom-button]
|
click_css_selector: button[data-more-results-bottom-button]
|
||||||
click_until_hard_timeout: True
|
click_until_hard_timeout: True
|
||||||
request_idle_timeout_sec: 10
|
request_idle_timeout_sec: 10
|
||||||
|
- # https://webarchive.jira.com/browse/ARI-4692
  # the dot in the host name is escaped so that it matches only a literal "."
  url_regex: '^https?://(?:www\.)?fec\.gov/data/.*$'
  behavior_js: fec_gov.js
  request_idle_timeout_sec: 10
|
||||||
- # default fallback behavior
|
- # default fallback behavior
|
||||||
url_regex: '^.*$'
|
url_regex: '^.*$'
|
||||||
request_idle_timeout_sec: 10
|
request_idle_timeout_sec: 10
|
||||||
|
|
|
@ -1,21 +1,21 @@
|
||||||
#
|
'''
|
||||||
# brozzler/browser.py - classes responsible for running web browsers
|
brozzler/browser.py - classes responsible for running web browsers
|
||||||
# (chromium/chromium) and browsing web pages in them
|
(chromium/chromium) and browsing web pages in them
|
||||||
#
|
|
||||||
# Copyright (C) 2014-2016 Internet Archive
|
Copyright (C) 2014-2016 Internet Archive
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
# you may not use this file except in compliance with the License.
|
you may not use this file except in compliance with the License.
|
||||||
# You may obtain a copy of the License at
|
You may obtain a copy of the License at
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
Unless required by applicable law or agreed to in writing, software
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
# See the License for the specific language governing permissions and
|
See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
limitations under the License.
|
||||||
#
|
'''
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
import json
|
import json
|
||||||
|
@ -58,7 +58,10 @@ class BrowserPool:
|
||||||
self.logger.info("browser ports: {}".format([browser.chrome_port for browser in self._available]))
|
self.logger.info("browser ports: {}".format([browser.chrome_port for browser in self._available]))
|
||||||
|
|
||||||
def acquire(self):
|
def acquire(self):
|
||||||
"""Returns browser from pool if available, raises NoBrowsersAvailable otherwise."""
|
"""
|
||||||
|
Returns browser from pool if available, raises NoBrowsersAvailable
|
||||||
|
otherwise.
|
||||||
|
"""
|
||||||
with self._lock:
|
with self._lock:
|
||||||
try:
|
try:
|
||||||
browser = self._available.pop()
|
browser = self._available.pop()
|
||||||
|
@ -277,11 +280,23 @@ class Browser:
|
||||||
self.logger.info("retrieving outlinks for %s", self.url)
|
self.logger.info("retrieving outlinks for %s", self.url)
|
||||||
self._waiting_on_outlinks_msg_id = self.send_to_chrome(
|
self._waiting_on_outlinks_msg_id = self.send_to_chrome(
|
||||||
method="Runtime.evaluate",
|
method="Runtime.evaluate",
|
||||||
params={"expression":"Array.prototype.slice.call(document.querySelectorAll('a[href]')).join(' ')"})
|
params={"expression": self.OUTLINKS_JS})
|
||||||
return False
|
return False
|
||||||
else: # self._waiting_on_outlinks_msg_id
|
else: # self._waiting_on_outlinks_msg_id
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
OUTLINKS_JS = """
|
||||||
|
var compileOutlinks = function(frame) {
|
||||||
|
var outlinks = Array.prototype.slice.call(
|
||||||
|
frame.document.querySelectorAll('a[href]'));
|
||||||
|
for (var i = 0; i < frame.frames.length; i++) {
|
||||||
|
outlinks = outlinks.concat(compileOutlinks(frame.frames[i]));
|
||||||
|
}
|
||||||
|
return outlinks;
|
||||||
|
}
|
||||||
|
compileOutlinks(window).join(' ');
|
||||||
|
"""
|
||||||
|
|
||||||
def _browse_interval_func(self):
|
def _browse_interval_func(self):
|
||||||
"""Called periodically while page is being browsed. Returns True when
|
"""Called periodically while page is being browsed. Returns True when
|
||||||
finished browsing."""
|
finished browsing."""
|
||||||
|
@ -393,7 +408,8 @@ class Browser:
|
||||||
self._waiting_on_scroll_to_top_msg_id = None
|
self._waiting_on_scroll_to_top_msg_id = None
|
||||||
elif message["id"] == self._waiting_on_outlinks_msg_id:
|
elif message["id"] == self._waiting_on_outlinks_msg_id:
|
||||||
self.logger.debug("got outlinks message=%s", message)
|
self.logger.debug("got outlinks message=%s", message)
|
||||||
self._outlinks = frozenset(message["result"]["result"]["value"].split(" "))
|
self._outlinks = frozenset(
|
||||||
|
message["result"]["result"]["value"].split())
|
||||||
elif message["id"] == self._waiting_on_document_url_msg_id:
|
elif message["id"] == self._waiting_on_document_url_msg_id:
|
||||||
if message["result"]["result"]["value"] != self.url:
|
if message["result"]["result"]["value"] != self.url:
|
||||||
if self.on_url_change:
|
if self.on_url_change:
|
||||||
|
|
|
@ -39,21 +39,43 @@ class RethinkDbFrontier:
|
||||||
def _ensure_db(self):
|
def _ensure_db(self):
|
||||||
dbs = self.r.db_list().run()
|
dbs = self.r.db_list().run()
|
||||||
if not self.r.dbname in dbs:
|
if not self.r.dbname in dbs:
|
||||||
self.logger.info("creating rethinkdb database %s", repr(self.r.dbname))
|
self.logger.info(
|
||||||
|
"creating rethinkdb database %s", repr(self.r.dbname))
|
||||||
self.r.db_create(self.r.dbname).run()
|
self.r.db_create(self.r.dbname).run()
|
||||||
tables = self.r.table_list().run()
|
tables = self.r.table_list().run()
|
||||||
if not "sites" in tables:
|
if not "sites" in tables:
|
||||||
self.logger.info("creating rethinkdb table 'sites' in database %s", repr(self.r.dbname))
|
self.logger.info(
|
||||||
self.r.table_create("sites", shards=self.shards, replicas=self.replicas).run()
|
"creating rethinkdb table 'sites' in database %s",
|
||||||
self.r.table("sites").index_create("sites_last_disclaimed", [self.r.row["status"], self.r.row["last_disclaimed"]]).run()
|
repr(self.r.dbname))
|
||||||
|
self.r.table_create(
|
||||||
|
"sites", shards=self.shards, replicas=self.replicas).run()
|
||||||
|
self.r.table("sites").index_create(
|
||||||
|
"sites_last_disclaimed", [
|
||||||
|
self.r.row["status"],
|
||||||
|
self.r.row["last_disclaimed"]]).run()
|
||||||
self.r.table("sites").index_create("job_id").run()
|
self.r.table("sites").index_create("job_id").run()
|
||||||
if not "pages" in tables:
|
if not "pages" in tables:
|
||||||
self.logger.info("creating rethinkdb table 'pages' in database %s", repr(self.r.dbname))
|
self.logger.info(
|
||||||
self.r.table_create("pages", shards=self.shards, replicas=self.replicas).run()
|
"creating rethinkdb table 'pages' in database %s",
|
||||||
self.r.table("pages").index_create("priority_by_site", [self.r.row["site_id"], self.r.row["brozzle_count"], self.r.row["claimed"], self.r.row["priority"]]).run()
|
repr(self.r.dbname))
|
||||||
|
self.r.table_create(
|
||||||
|
"pages", shards=self.shards, replicas=self.replicas).run()
|
||||||
|
self.r.table("pages").index_create(
|
||||||
|
"priority_by_site", [
|
||||||
|
self.r.row["site_id"], self.r.row["brozzle_count"],
|
||||||
|
self.r.row["claimed"], self.r.row["priority"]]).run()
|
||||||
|
# this index is for displaying pages in a sensible order in the web
|
||||||
|
# console
|
||||||
|
self.r.table("pages").index_create(
|
||||||
|
"least_hops", [
|
||||||
|
r.row["site_id"], r.row["brozzle_count"],
|
||||||
|
r.row["hops_from_seed"]])
|
||||||
if not "jobs" in tables:
|
if not "jobs" in tables:
|
||||||
self.logger.info("creating rethinkdb table 'jobs' in database %s", repr(self.r.dbname))
|
self.logger.info(
|
||||||
self.r.table_create("jobs", shards=self.shards, replicas=self.replicas).run()
|
"creating rethinkdb table 'jobs' in database %s",
|
||||||
|
repr(self.r.dbname))
|
||||||
|
self.r.table_create(
|
||||||
|
"jobs", shards=self.shards, replicas=self.replicas).run()
|
||||||
|
|
||||||
def _vet_result(self, result, **kwargs):
|
def _vet_result(self, result, **kwargs):
|
||||||
# self.logger.debug("vetting expected=%s result=%s", kwargs, result)
|
# self.logger.debug("vetting expected=%s result=%s", kwargs, result)
|
||||||
|
|
11
setup.py
11
setup.py
|
@ -19,10 +19,11 @@
|
||||||
import setuptools
|
import setuptools
|
||||||
import glob
|
import glob
|
||||||
|
|
||||||
setuptools.setup(name='brozzler',
|
setuptools.setup(
|
||||||
version='1.1.dev10',
|
name='brozzler',
|
||||||
|
version='1.1.dev20',
|
||||||
description='Distributed web crawling with browsers',
|
description='Distributed web crawling with browsers',
|
||||||
url='https://github.com/nlevitt/brozzler',
|
url='https://github.com/internetarchive/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
author_email='nlevitt@archive.org',
|
author_email='nlevitt@archive.org',
|
||||||
long_description=open('README.rst', encoding='UTF-8').read(),
|
long_description=open('README.rst', encoding='UTF-8').read(),
|
||||||
|
@ -41,10 +42,10 @@ setuptools.setup(name='brozzler',
|
||||||
'rethinkstuff',
|
'rethinkstuff',
|
||||||
'rethinkdb>=2.3,<2.4',
|
'rethinkdb>=2.3,<2.4',
|
||||||
'psutil',
|
'psutil',
|
||||||
],
|
],
|
||||||
zip_safe=False,
|
zip_safe=False,
|
||||||
classifiers=[
|
classifiers=[
|
||||||
'Development Status :: 3 - Alpha',
|
'Development Status :: 4 - Beta',
|
||||||
'Environment :: Console',
|
'Environment :: Console',
|
||||||
'License :: OSI Approved :: Apache Software License',
|
'License :: OSI Approved :: Apache Software License',
|
||||||
'Programming Language :: Python :: 3.4',
|
'Programming Language :: Python :: 3.4',
|
||||||
|
|
|
@ -1 +1 @@
|
||||||
flask --debug --app=brozzler-webconsole.py run --host=0.0.0.0 --port=8081
|
gunicorn --bind=0.0.0.0:8081 brozzler-webconsole:app
|
||||||
|
|
|
@ -1,21 +1,21 @@
|
||||||
#
|
'''
|
||||||
# brozzler-webconsole/__init__.py - flask app for brozzler web console, defines
|
brozzler-webconsole/__init__.py - flask app for brozzler web console, defines
|
||||||
# api endspoints etc
|
api endpoints etc
|
||||||
#
|
|
||||||
# Copyright (C) 2014-2016 Internet Archive
|
Copyright (C) 2014-2016 Internet Archive
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
# you may not use this file except in compliance with the License.
|
you may not use this file except in compliance with the License.
|
||||||
# You may obtain a copy of the License at
|
You may obtain a copy of the License at
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
Unless required by applicable law or agreed to in writing, software
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
# See the License for the specific language governing permissions and
|
See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
limitations under the License.
|
||||||
#
|
'''
|
||||||
|
|
||||||
import flask
|
import flask
|
||||||
import rethinkstuff
|
import rethinkstuff
|
||||||
|
@ -24,16 +24,26 @@ import sys
|
||||||
import os
|
import os
|
||||||
import importlib
|
import importlib
|
||||||
import rethinkdb
|
import rethinkdb
|
||||||
|
import logging
|
||||||
|
import yaml
|
||||||
|
|
||||||
# XXX flask does its own logging config
|
# flask does its own logging config
|
||||||
# import logging
|
# logging.basicConfig(
|
||||||
# logging.basicConfig(stream=sys.stdout, level=logging.INFO,
|
# stream=sys.stdout, level=logging.INFO,
|
||||||
# format="%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s")
|
# format=(
|
||||||
|
# "%(asctime)s %(process)d %(levelname)s %(threadName)s "
|
||||||
|
# "%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s")
|
||||||
|
|
||||||
app = flask.Flask(__name__)
|
app = flask.Flask(__name__)
|
||||||
|
|
||||||
|
# http://stackoverflow.com/questions/26578733/why-is-flask-application-not-creating-any-logs-when-hosted-by-gunicorn
|
||||||
|
gunicorn_error_logger = logging.getLogger('gunicorn.error')
|
||||||
|
app.logger.handlers.extend(gunicorn_error_logger.handlers)
|
||||||
|
app.logger.setLevel(logging.INFO)
|
||||||
|
app.logger.info('will this show in the log?')
|
||||||
|
|
||||||
# configure with environment variables
|
# configure with environment variables
|
||||||
SETTINGS= {
|
SETTINGS = {
|
||||||
'RETHINKDB_SERVERS': os.environ.get(
|
'RETHINKDB_SERVERS': os.environ.get(
|
||||||
'RETHINKDB_SERVERS', 'localhost').split(','),
|
'RETHINKDB_SERVERS', 'localhost').split(','),
|
||||||
'RETHINKDB_DB': os.environ.get('RETHINKDB_DB', 'brozzler'),
|
'RETHINKDB_DB': os.environ.get('RETHINKDB_DB', 'brozzler'),
|
||||||
|
@ -81,10 +91,10 @@ def pages(site_id):
|
||||||
app.logger.info("flask.request.args=%s", flask.request.args)
|
app.logger.info("flask.request.args=%s", flask.request.args)
|
||||||
start = int(flask.request.args.get("start", 0))
|
start = int(flask.request.args.get("start", 0))
|
||||||
end = int(flask.request.args.get("end", start + 90))
|
end = int(flask.request.args.get("end", start + 90))
|
||||||
|
app.logger.info("yes new query")
|
||||||
pages_ = r.table("pages").between(
|
pages_ = r.table("pages").between(
|
||||||
[site_id, 1, False, r.minval],
|
[site_id, 1, r.minval], [site_id, r.maxval, r.maxval],
|
||||||
[site_id, r.maxval, False, r.maxval],
|
index="least_hops").order_by(index="least_hops")[start:end].run()
|
||||||
index="priority_by_site")[start:end].run()
|
|
||||||
return flask.jsonify(pages=list(pages_))
|
return flask.jsonify(pages=list(pages_))
|
||||||
|
|
||||||
@app.route("/api/sites/<site_id>")
|
@app.route("/api/sites/<site_id>")
|
||||||
|
@ -110,6 +120,14 @@ def job(job_id):
|
||||||
job_ = r.table("jobs").get(job_id).run()
|
job_ = r.table("jobs").get(job_id).run()
|
||||||
return flask.jsonify(job_)
|
return flask.jsonify(job_)
|
||||||
|
|
||||||
|
@app.route("/api/jobs/<int:job_id>/yaml")
|
||||||
|
@app.route("/api/job/<int:job_id>/yaml")
|
||||||
|
def job_yaml(job_id):
|
||||||
|
job_ = r.table("jobs").get(job_id).run()
|
||||||
|
return app.response_class(
|
||||||
|
yaml.dump(job_, default_flow_style=False),
|
||||||
|
mimetype='application/yaml')
|
||||||
|
|
||||||
@app.route("/api/workers")
|
@app.route("/api/workers")
|
||||||
def workers():
|
def workers():
|
||||||
workers_ = service_registry.available_services("brozzler-worker")
|
workers_ = service_registry.available_services("brozzler-worker")
|
||||||
|
|
|
@ -125,11 +125,10 @@ function loadSiteStats($http, site, job) {
|
||||||
$http.get("/api/sites/" + site.id + "/page_count").success(pageCountSuccessCallback(site, job));
|
$http.get("/api/sites/" + site.id + "/page_count").success(pageCountSuccessCallback(site, job));
|
||||||
$http.get("/api/sites/" + site.id + "/queued_count").success(queuedCountSuccessCallback(site, job));
|
$http.get("/api/sites/" + site.id + "/queued_count").success(queuedCountSuccessCallback(site, job));
|
||||||
|
|
||||||
// parse Warcprox-Meta to find stats bucket
|
// look at Warcprox-Meta to find stats bucket
|
||||||
var warcprox_meta = angular.fromJson(site.extra_headers["Warcprox-Meta"]);
|
for (var j = 0; j < site.warcprox_meta.stats.buckets.length; j++) {
|
||||||
for (var j = 0; j < warcprox_meta.stats.buckets.length; j++) {
|
if (site.warcprox_meta.stats.buckets[j].indexOf("seed") >= 0) {
|
||||||
if (warcprox_meta.stats.buckets[j].indexOf("seed") >= 0) {
|
var bucket = site.warcprox_meta.stats.buckets[j];
|
||||||
var bucket = warcprox_meta.stats.buckets[j];
|
|
||||||
// console.log("warcprox_meta.stats.buckets[" + j + "]=" + bucket);
|
// console.log("warcprox_meta.stats.buckets[" + j + "]=" + bucket);
|
||||||
$http.get("/api/stats/" + bucket).success(statsSuccessCallback(site, bucket));
|
$http.get("/api/stats/" + bucket).success(statsSuccessCallback(site, bucket));
|
||||||
}
|
}
|
||||||
|
@ -138,7 +137,8 @@ function loadSiteStats($http, site, job) {
|
||||||
|
|
||||||
brozzlerControllers.controller("JobController", ["$scope", "$routeParams", "$http",
|
brozzlerControllers.controller("JobController", ["$scope", "$routeParams", "$http",
|
||||||
function($scope, $routeParams, $http) {
|
function($scope, $routeParams, $http) {
|
||||||
console.log('JobController');
|
$scope.show_yaml = false;
|
||||||
|
// console.log('JobController');
|
||||||
$http.get("/api/config").success(function(data) {
|
$http.get("/api/config").success(function(data) {
|
||||||
$scope.config = data.config;
|
$scope.config = data.config;
|
||||||
});
|
});
|
||||||
|
@ -159,6 +159,9 @@ brozzlerControllers.controller("JobController", ["$scope", "$routeParams", "$htt
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
$http.get("/api/jobs/" + $routeParams.id + "/yaml").success(function(data) {
|
||||||
|
$scope.job_yaml = data;
|
||||||
|
});
|
||||||
}]);
|
}]);
|
||||||
|
|
||||||
brozzlerControllers.controller("SiteController", ["$scope", "$routeParams", "$http", "$window",
|
brozzlerControllers.controller("SiteController", ["$scope", "$routeParams", "$http", "$window",
|
||||||
|
|
|
@ -10,7 +10,12 @@
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div>
|
<div>
|
||||||
<h2>Job {{job.id}} <small>{{job.started}}-{{job.finished}} {{job.status}}</small></h2>
|
<h2 ng-click="show_yaml = !show_yaml">
|
||||||
|
<span class="fa fa-caret-right"
|
||||||
|
ng-class="{ 'fa-caret-right': !show_yaml, 'fa-caret-down': !!show_yaml }"></span>
|
||||||
|
Job {{job.id}} <small>{{job.started}}-{{job.finished}}: {{job.status}}</small>
|
||||||
|
</h2>
|
||||||
|
<pre style="display:{{show_yaml?'block':'none'}}">{{job_yaml}}</pre>
|
||||||
|
|
||||||
<div class="row bigstats">
|
<div class="row bigstats">
|
||||||
<div class="col-sm-6 col-md-3">
|
<div class="col-sm-6 col-md-3">
|
||||||
|
|
|
@ -40,12 +40,22 @@
|
||||||
<div class="col-sm-12">
|
<div class="col-sm-12">
|
||||||
<h2>Pages</h2>
|
<h2>Pages</h2>
|
||||||
<div class="col-sm-6 col-md-4" ng-repeat="page in pages">
|
<div class="col-sm-6 col-md-4" ng-repeat="page in pages">
|
||||||
<a class="thumbnail" href="{{config.WAYBACK_BASEURL}}/3/{{page.url}}">
|
<div class="thumbnail">
|
||||||
<img style="width:300px;height:190px" src="{{config.WAYBACK_BASEURL}}/3/thumbnail:{{page.url}}" alt="thumb">
|
<img style="border:1px solid #ddd;width:300px;height:190px" src="{{config.WAYBACK_BASEURL}}/3/thumbnail:{{page.url}}" alt="thumb">
|
||||||
<div class="caption">
|
<div class="caption">
|
||||||
<h5>{{page.url}}</h5>
|
<h5>{{page.url}}</h5>
|
||||||
|
<ul class="fa-ul">
|
||||||
|
<li>
|
||||||
|
<span class="fa fa-li fa-camera"></span>
|
||||||
|
<a target="_blank" href="{{config.WAYBACK_BASEURL}}/3/screenshot:{{page.url}}">full size screenshot ></a>
|
||||||
|
</li>
|
||||||
|
<li>
|
||||||
|
<span class="fa fa-li fa-university"></span>
|
||||||
|
<a target="_blank" href="{{config.WAYBACK_BASEURL}}/3/{{page.url}}">wayback ></a>
|
||||||
|
</li>
|
||||||
|
</ul>
|
||||||
</div>
|
</div>
|
||||||
</a>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
<div class="col-sm-12" ng-show="loading">
|
<div class="col-sm-12" ng-show="loading">
|
||||||
|
|
|
@ -1,2 +1,4 @@
|
||||||
git+https://github.com/mitsuhiko/flask.git
|
|
||||||
rethinkstuff>=0.1.5
|
rethinkstuff>=0.1.5
|
||||||
|
flask>=0.11
|
||||||
|
gunicorn
|
||||||
|
PyYAML
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue