Merge branch 'master' into qa

This commit is contained in:
Barbara Miller 2016-06-22 17:45:44 -07:00
commit 8f726eac76
13 changed files with 360 additions and 84 deletions

View File

@ -20,19 +20,56 @@ archiving.
Installation
------------
XXX These instructions don't work at the moment. Brozzler requires some
customized packages not easily installable in the outside world. I intend to
remedy the situation soon.
::
# set up virtualenv if desired
pip install git+https://github.com/nlevitt/brozzler.git
pip install brozzler
Brozzler also requires a rethinkdb deployment.
Fonts for good screenshots
--------------------------
Usage
-----
Launch one or more workers:
::
brozzler-worker -e chromium
Submit jobs:
::
brozzler-new-job myjob.yaml
Job Configuration
-----------------
Jobs are defined using yaml files. Options may be specified either at the
top level or on individual seeds. A job id and at least one seed url
must be specified; everything else is optional.
::
id: myjob
time_limit: 60 # seconds
proxy: 127.0.0.1:8000 # point at warcprox for archiving
ignore_robots: false
enable_warcprox_features: false
warcprox_meta: null
metadata: {}
seeds:
- url: http://one.example.org/
- url: http://two.example.org/
time_limit: 30
- url: http://three.example.org/
time_limit: 10
ignore_robots: true
scope:
surt: http://(org,example,
Fonts (for decent screenshots)
------------------------------
On ubuntu 14.04 trusty I installed these packages:
@ -42,12 +79,10 @@ fonts-arphic-ukai fonts-farsiweb fonts-nafees fonts-sil-abyssinica
fonts-sil-ezra fonts-sil-padauk fonts-unfonts-extra fonts-unfonts-core
ttf-indic-fonts fonts-thai-tlwg fonts-lklug-sinhala
Haven't looked much at the resulting screenshots yet though.
License
-------
Copyright 2015 Internet Archive
Copyright 2015-2016 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License"); you may
not use this software except in compliance with the License. You may

View File

@ -27,6 +27,7 @@ import re
import rethinkstuff
import warnings
import requests
import json
arg_parser = argparse.ArgumentParser(prog=os.path.basename(__file__),
description="brozzler-new-site - register site to brozzle",
@ -63,7 +64,7 @@ site = brozzler.Site(
time_limit=int(args.time_limit) if args.time_limit else None,
ignore_robots=args.ignore_robots,
enable_warcprox_features=args.enable_warcprox_features,
warcprox_meta=json.loads(args.warcprox_meta))
warcprox_meta=json.loads(args.warcprox_meta) if args.warcprox_meta else None)
r = rethinkstuff.Rethinker(args.rethinkdb_servers.split(","), args.rethinkdb_db)
frontier = brozzler.RethinkDbFrontier(r)

View File

@ -0,0 +1,159 @@
/*
* brozzler/behaviors.d/fec_gov.js - click on links that execute JavaScript to
* download report csv files for fec.gov/data
*
* Copyright (C) 2014-2016 Internet Archive
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
 * Behavior object for fec.gov/data pages. Every 250ms it looks, in the top
 * document and in every reachable iframe document, for csv download links
 * (anchors whose id starts with "id_csv"). A link with no matching radio
 * buttons is clicked once; a link with matching radio buttons
 * (input[name=<link id minus its first 7 characters>]) is clicked once per
 * radio button, after selecting that button. When nothing was clicked on a
 * tick the page is scrolled so that remaining targets come into view.
 */
var umbraBehavior = {
    // isFinished() reports true once this many seconds pass without activity
    IDLE_TIMEOUT_SEC : 10,
    // ms timestamp of the most recent tick that clicked or scrolled, or of
    // the first tick that found nothing to do; null before the first tick
    idleSince : null,
    alreadyClicked : {},

    /*
     * One polling tick: click whatever can be clicked right now, otherwise
     * scroll toward remaining targets, and keep idleSince up to date.
     */
    intervalFunc : function() {
        var clickedSomething = false;
        var somethingLeftBelow = false;
        var somethingLeftAbove = false;

        var cssDownloadLinkSelector = "a[id^='id_csv']";
        var documents = this.reachableDocuments();

        for (var j = 0; j < documents.length; j++) {
            var clickDownloadLinkTargets = documents[j].querySelectorAll(cssDownloadLinkSelector);
            for (var i = 0; i < clickDownloadLinkTargets.length; i++) {
                var downloadLink = clickDownloadLinkTargets[i];
                var sourceName = downloadLink.id.substring(7);
                var clickRadioButtonTargets = documents[j].querySelectorAll("input[name='" + sourceName + "']");

                if (clickRadioButtonTargets.length === 0) {
                    // plain download link, nothing to select first
                    if (downloadLink.umbraClicked) {
                        continue;
                    }
                    this.mouseOverAndClick(downloadLink); // click the link to download the csv
                    clickedSomething = true;
                    this.idleSince = null;
                    downloadLink.umbraClicked = true;
                    continue;
                }

                for (var k = 0; k < clickRadioButtonTargets.length; ++k) {
                    var radioButton = clickRadioButtonTargets[k];
                    if (radioButton.umbraClicked) {
                        continue;
                    }

                    var where = this.aboveBelowOrOnScreen(radioButton);
                    if (where === 0) {
                        console.log("clicking on " + radioButton);
                        // do mouse over event on click target
                        // since some urls are requested only on
                        // this event - see
                        // https://webarchive.jira.com/browse/AITFIVE-451
                        this.mouseOverAndClick(radioButton); // select the correct date with the radio button
                        this.mouseOverAndClick(downloadLink); // click the link to download the csv for the selected date
                        clickedSomething = true;
                        this.idleSince = null;
                        radioButton.umbraClicked = true;
                        // at most one radio button per link per tick; carry
                        // on with the next link and the remaining documents
                        break;
                    } else if (where > 0) {
                        somethingLeftBelow = true;
                    } else if (where < 0) {
                        somethingLeftAbove = true;
                    }
                }
            }
        }

        if (!clickedSomething) {
            if (somethingLeftAbove) {
                // everything on this screen has been clicked but we missed
                // something above
                window.scrollBy(0, -500);
                this.idleSince = null;
            } else if (somethingLeftBelow) {
                // everything on this screen has been clicked but there's
                // more below
                window.scrollBy(0, 200);
                this.idleSince = null;
            } else if (window.scrollY + window.innerHeight < document.documentElement.scrollHeight) {
                // not at the bottom of the page yet
                window.scrollBy(0, 200);
                this.idleSince = null;
            }
        }

        // Any activity above reset idleSince to null, so this restarts the
        // idle clock from now; with no activity an already-set timestamp is
        // left alone and keeps aging.
        if (!this.idleSince) {
            this.idleSince = Date.now();
        }
    },

    /*
     * Returns the top-level document followed by the document of every
     * iframe that can be accessed. A frame whose document cannot be read
     * (the access throws, or yields nothing) is skipped so that one
     * unreadable frame does not abort the whole tick.
     */
    reachableDocuments : function() {
        var documents = [document];
        var iframes = document.querySelectorAll("iframe");
        for (var i = 0; i < iframes.length; i++) {
            try {
                var frameDocument = iframes[i].contentWindow.document;
                if (frameDocument) {
                    documents.push(frameDocument);
                }
            } catch (e) {
                // inaccessible frame, ignore it
            }
        }
        return documents;
    },

    /*
     * Dispatches a bubbling, non-cancelable "mouseover" event on target and
     * then clicks it.
     */
    mouseOverAndClick : function(target) {
        var mouseOverEvent = document.createEvent('Events');
        mouseOverEvent.initEvent("mouseover",true, false);
        target.dispatchEvent(mouseOverEvent);
        target.click();
    },

    /* Starts polling: runs intervalFunc every 250ms. */
    start : function() {
        var that = this;
        this.intervalId = setInterval(function() {
            that.intervalFunc()
        }, 250);
    },

    /*
     * Returns true, and stops the polling timer, once more than
     * IDLE_TIMEOUT_SEC seconds have elapsed since idleSince; false otherwise.
     */
    isFinished : function() {
        if (this.idleSince != null) {
            var idleTimeMs = Date.now() - this.idleSince;
            if (idleTimeMs / 1000 > this.IDLE_TIMEOUT_SEC) {
                clearInterval(this.intervalId);
                return true;
            }
        }
        return false;
    },

    /*
     * Classifies the top edge of element e against the window's viewport:
     * -1 if above it, 1 if below it, 0 if inside it.
     *
     * getBoundingClientRect() is already relative to the viewport, so the
     * bounds are 0 and window.innerHeight; adding window.scrollY (a document
     * coordinate) would misclassify elements once the page has scrolled.
     * NOTE(review): for an element inside an iframe the rect is relative to
     * that frame's own viewport, so this is only approximate there.
     */
    aboveBelowOrOnScreen : function(e) {
        var eTop = e.getBoundingClientRect().top;
        if (eTop < 0) {
            return -1; // above
        } else if (eTop > window.innerHeight) {
            return 1; // below
        } else {
            return 0; // on screen
        }
    },
};
// Called from outside of this script.
// Entry point polled by the embedding code: reports whether the behavior
// has been idle long enough to be considered done.
var umbraBehaviorFinished = function() {
    var finished = umbraBehavior.isFinished();
    return finished;
};
// Kick off the 250ms polling loop as soon as this script is evaluated.
umbraBehavior.start();

View File

@ -98,6 +98,10 @@ behaviors:
click_css_selector: button[data-more-results-bottom-button]
click_until_hard_timeout: True
request_idle_timeout_sec: 10
- # https://webarchive.jira.com/browse/ARI-4692
  # dots in the host name are escaped so they only match a literal "."
  url_regex: '^https?://(?:www\.)?fec\.gov/data/.*$'
  behavior_js: fec_gov.js
  request_idle_timeout_sec: 10
- # default fallback behavior
url_regex: '^.*$'
request_idle_timeout_sec: 10

View File

@ -1,21 +1,21 @@
#
# brozzler/browser.py - classes responsible for running web browsers
# (chromium/chromium) and browsing web pages in them
#
# Copyright (C) 2014-2016 Internet Archive
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
'''
brozzler/browser.py - classes responsible for running web browsers
(chrome/chromium) and browsing web pages in them
Copyright (C) 2014-2016 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
import logging
import json
@ -58,7 +58,10 @@ class BrowserPool:
self.logger.info("browser ports: {}".format([browser.chrome_port for browser in self._available]))
def acquire(self):
"""Returns browser from pool if available, raises NoBrowsersAvailable otherwise."""
"""
Returns browser from pool if available, raises NoBrowsersAvailable
otherwise.
"""
with self._lock:
try:
browser = self._available.pop()
@ -277,11 +280,23 @@ class Browser:
self.logger.info("retrieving outlinks for %s", self.url)
self._waiting_on_outlinks_msg_id = self.send_to_chrome(
method="Runtime.evaluate",
params={"expression":"Array.prototype.slice.call(document.querySelectorAll('a[href]')).join(' ')"})
params={"expression": self.OUTLINKS_JS})
return False
else: # self._waiting_on_outlinks_msg_id
return False
OUTLINKS_JS = """
var compileOutlinks = function(frame) {
var outlinks = Array.prototype.slice.call(
frame.document.querySelectorAll('a[href]'));
for (var i = 0; i < frame.frames.length; i++) {
outlinks = outlinks.concat(compileOutlinks(frame.frames[i]));
}
return outlinks;
}
compileOutlinks(window).join(' ');
"""
def _browse_interval_func(self):
"""Called periodically while page is being browsed. Returns True when
finished browsing."""
@ -393,7 +408,8 @@ class Browser:
self._waiting_on_scroll_to_top_msg_id = None
elif message["id"] == self._waiting_on_outlinks_msg_id:
self.logger.debug("got outlinks message=%s", message)
self._outlinks = frozenset(message["result"]["result"]["value"].split(" "))
self._outlinks = frozenset(
message["result"]["result"]["value"].split())
elif message["id"] == self._waiting_on_document_url_msg_id:
if message["result"]["result"]["value"] != self.url:
if self.on_url_change:

View File

@ -39,21 +39,43 @@ class RethinkDbFrontier:
def _ensure_db(self):
dbs = self.r.db_list().run()
if not self.r.dbname in dbs:
self.logger.info("creating rethinkdb database %s", repr(self.r.dbname))
self.logger.info(
"creating rethinkdb database %s", repr(self.r.dbname))
self.r.db_create(self.r.dbname).run()
tables = self.r.table_list().run()
if not "sites" in tables:
self.logger.info("creating rethinkdb table 'sites' in database %s", repr(self.r.dbname))
self.r.table_create("sites", shards=self.shards, replicas=self.replicas).run()
self.r.table("sites").index_create("sites_last_disclaimed", [self.r.row["status"], self.r.row["last_disclaimed"]]).run()
self.logger.info(
"creating rethinkdb table 'sites' in database %s",
repr(self.r.dbname))
self.r.table_create(
"sites", shards=self.shards, replicas=self.replicas).run()
self.r.table("sites").index_create(
"sites_last_disclaimed", [
self.r.row["status"],
self.r.row["last_disclaimed"]]).run()
self.r.table("sites").index_create("job_id").run()
if not "pages" in tables:
self.logger.info("creating rethinkdb table 'pages' in database %s", repr(self.r.dbname))
self.r.table_create("pages", shards=self.shards, replicas=self.replicas).run()
self.r.table("pages").index_create("priority_by_site", [self.r.row["site_id"], self.r.row["brozzle_count"], self.r.row["claimed"], self.r.row["priority"]]).run()
self.logger.info(
"creating rethinkdb table 'pages' in database %s",
repr(self.r.dbname))
self.r.table_create(
"pages", shards=self.shards, replicas=self.replicas).run()
self.r.table("pages").index_create(
"priority_by_site", [
self.r.row["site_id"], self.r.row["brozzle_count"],
self.r.row["claimed"], self.r.row["priority"]]).run()
# this index is for displaying pages in a sensible order in the web
# console; like the other index_create calls here it must go through
# self.r and be executed with .run(), otherwise the index never exists
self.r.table("pages").index_create(
    "least_hops", [
        self.r.row["site_id"], self.r.row["brozzle_count"],
        self.r.row["hops_from_seed"]]).run()
if not "jobs" in tables:
self.logger.info("creating rethinkdb table 'jobs' in database %s", repr(self.r.dbname))
self.r.table_create("jobs", shards=self.shards, replicas=self.replicas).run()
self.logger.info(
"creating rethinkdb table 'jobs' in database %s",
repr(self.r.dbname))
self.r.table_create(
"jobs", shards=self.shards, replicas=self.replicas).run()
def _vet_result(self, result, **kwargs):
# self.logger.debug("vetting expected=%s result=%s", kwargs, result)

View File

@ -19,10 +19,11 @@
import setuptools
import glob
setuptools.setup(name='brozzler',
version='1.1.dev10',
setuptools.setup(
name='brozzler',
version='1.1.dev20',
description='Distributed web crawling with browsers',
url='https://github.com/nlevitt/brozzler',
url='https://github.com/internetarchive/brozzler',
author='Noah Levitt',
author_email='nlevitt@archive.org',
long_description=open('README.rst', encoding='UTF-8').read(),
@ -41,10 +42,10 @@ setuptools.setup(name='brozzler',
'rethinkstuff',
'rethinkdb>=2.3,<2.4',
'psutil',
],
],
zip_safe=False,
classifiers=[
'Development Status :: 3 - Alpha',
'Development Status :: 4 - Beta',
'Environment :: Console',
'License :: OSI Approved :: Apache Software License',
'Programming Language :: Python :: 3.4',

View File

@ -1 +1 @@
flask --debug --app=brozzler-webconsole.py run --host=0.0.0.0 --port=8081
gunicorn --bind=0.0.0.0:8081 brozzler-webconsole:app

View File

@ -1,21 +1,21 @@
#
# brozzler-webconsole/__init__.py - flask app for brozzler web console, defines
# api endspoints etc
#
# Copyright (C) 2014-2016 Internet Archive
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
'''
brozzler-webconsole/__init__.py - flask app for brozzler web console, defines
api endpoints etc
Copyright (C) 2014-2016 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
import flask
import rethinkstuff
@ -24,16 +24,26 @@ import sys
import os
import importlib
import rethinkdb
import logging
import yaml
# XXX flask does its own logging config
# import logging
# logging.basicConfig(stream=sys.stdout, level=logging.INFO,
# format="%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s")
# flask does its own logging config
# logging.basicConfig(
# stream=sys.stdout, level=logging.INFO,
# format=(
# "%(asctime)s %(process)d %(levelname)s %(threadName)s "
# "%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s")
app = flask.Flask(__name__)
# http://stackoverflow.com/questions/26578733/why-is-flask-application-not-creating-any-logs-when-hosted-by-gunicorn
gunicorn_error_logger = logging.getLogger('gunicorn.error')
app.logger.handlers.extend(gunicorn_error_logger.handlers)
app.logger.setLevel(logging.INFO)
app.logger.info('will this show in the log?')
# configure with environment variables
SETTINGS= {
SETTINGS = {
'RETHINKDB_SERVERS': os.environ.get(
'RETHINKDB_SERVERS', 'localhost').split(','),
'RETHINKDB_DB': os.environ.get('RETHINKDB_DB', 'brozzler'),
@ -81,10 +91,10 @@ def pages(site_id):
app.logger.info("flask.request.args=%s", flask.request.args)
start = int(flask.request.args.get("start", 0))
end = int(flask.request.args.get("end", start + 90))
app.logger.info("yes new query")
pages_ = r.table("pages").between(
[site_id, 1, False, r.minval],
[site_id, r.maxval, False, r.maxval],
index="priority_by_site")[start:end].run()
[site_id, 1, r.minval], [site_id, r.maxval, r.maxval],
index="least_hops").order_by(index="least_hops")[start:end].run()
return flask.jsonify(pages=list(pages_))
@app.route("/api/sites/<site_id>")
@ -110,6 +120,14 @@ def job(job_id):
job_ = r.table("jobs").get(job_id).run()
return flask.jsonify(job_)
@app.route("/api/jobs/<int:job_id>/yaml")
@app.route("/api/job/<int:job_id>/yaml")
def job_yaml(job_id):
    """
    Looks up job `job_id` in the "jobs" table and returns it dumped as
    block-style yaml with mimetype application/yaml.
    """
    job_doc = r.table("jobs").get(job_id).run()
    yaml_text = yaml.dump(job_doc, default_flow_style=False)
    return app.response_class(yaml_text, mimetype='application/yaml')
@app.route("/api/workers")
def workers():
workers_ = service_registry.available_services("brozzler-worker")

View File

@ -125,11 +125,10 @@ function loadSiteStats($http, site, job) {
$http.get("/api/sites/" + site.id + "/page_count").success(pageCountSuccessCallback(site, job));
$http.get("/api/sites/" + site.id + "/queued_count").success(queuedCountSuccessCallback(site, job));
// parse Warcprox-Meta to find stats bucket
var warcprox_meta = angular.fromJson(site.extra_headers["Warcprox-Meta"]);
for (var j = 0; j < warcprox_meta.stats.buckets.length; j++) {
if (warcprox_meta.stats.buckets[j].indexOf("seed") >= 0) {
var bucket = warcprox_meta.stats.buckets[j];
// look at Warcprox-Meta to find stats bucket
for (var j = 0; j < site.warcprox_meta.stats.buckets.length; j++) {
if (site.warcprox_meta.stats.buckets[j].indexOf("seed") >= 0) {
var bucket = site.warcprox_meta.stats.buckets[j];
// console.log("warcprox_meta.stats.buckets[" + j + "]=" + bucket);
$http.get("/api/stats/" + bucket).success(statsSuccessCallback(site, bucket));
}
@ -138,7 +137,8 @@ function loadSiteStats($http, site, job) {
brozzlerControllers.controller("JobController", ["$scope", "$routeParams", "$http",
function($scope, $routeParams, $http) {
console.log('JobController');
$scope.show_yaml = false;
// console.log('JobController');
$http.get("/api/config").success(function(data) {
$scope.config = data.config;
});
@ -159,6 +159,9 @@ brozzlerControllers.controller("JobController", ["$scope", "$routeParams", "$htt
}
});
});
$http.get("/api/jobs/" + $routeParams.id + "/yaml").success(function(data) {
$scope.job_yaml = data;
});
}]);
brozzlerControllers.controller("SiteController", ["$scope", "$routeParams", "$http", "$window",

View File

@ -10,7 +10,12 @@
</div>
<div>
<h2>Job {{job.id}} <small>{{job.started}}-{{job.finished}} {{job.status}}</small></h2>
<h2 ng-click="show_yaml = !show_yaml">
<span class="fa fa-caret-right"
ng-class="{ 'fa-caret-right': !show_yaml, 'fa-caret-down': !!show_yaml }"></span>
Job {{job.id}} <small>{{job.started}}-{{job.finished}}: {{job.status}}</small>
</h2>
<pre style="display:{{show_yaml?'block':'none'}}">{{job_yaml}}</pre>
<div class="row bigstats">
<div class="col-sm-6 col-md-3">

View File

@ -40,12 +40,22 @@
<div class="col-sm-12">
<h2>Pages</h2>
<div class="col-sm-6 col-md-4" ng-repeat="page in pages">
<a class="thumbnail" href="{{config.WAYBACK_BASEURL}}/3/{{page.url}}">
<img style="width:300px;height:190px" src="{{config.WAYBACK_BASEURL}}/3/thumbnail:{{page.url}}" alt="thumb">
<div class="thumbnail">
<img style="border:1px solid #ddd;width:300px;height:190px" src="{{config.WAYBACK_BASEURL}}/3/thumbnail:{{page.url}}" alt="thumb">
<div class="caption">
<h5>{{page.url}}</h5>
<ul class="fa-ul">
<li>
<span class="fa fa-li fa-camera"></span>
<a target="_blank" href="{{config.WAYBACK_BASEURL}}/3/screenshot:{{page.url}}">full size screenshot &gt;</a>
</li>
<li>
<span class="fa fa-li fa-university"></span>
<a target="_blank" href="{{config.WAYBACK_BASEURL}}/3/{{page.url}}">wayback &gt;</a>
</li>
</ul>
</div>
</a>
</div>
</div>
</div>
<div class="col-sm-12" ng-show="loading">

View File

@ -1,2 +1,4 @@
git+https://github.com/mitsuhiko/flask.git
rethinkstuff>=0.1.5
flask>=0.11
gunicorn
PyYAML