add license headers

This commit is contained in:
Noah Levitt 2016-04-25 20:02:11 +00:00
parent e210d417fb
commit df61e55b6b
24 changed files with 497 additions and 78 deletions

View file

@ -1,5 +1,22 @@
#!/usr/bin/env python #!/usr/bin/env python
# vim: set sw=4 et: #
# brozzle-page - command line utility for brozzling a single page, i.e. opening
# it in a browser, running some javascript behaviors, and printing outlinks
#
# Copyright (C) 2014-2016 Internet Archive
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import argparse import argparse
import os import os

View file

@ -1,4 +1,23 @@
#!/usr/bin/env python #!/usr/bin/env python
#
# brozzler-new-job - takes a yaml brozzler job configuration file, creates
# job, sites, and pages objects in rethinkdb, which brozzler-workers will look
# at and start crawling
#
# Copyright (C) 2014-2016 Internet Archive
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import argparse import argparse
import os import os

View file

@ -1,5 +1,22 @@
#!/usr/bin/env python #!/usr/bin/env python
# vim: set sw=4 et: #
# brozzler-new-site - takes a seed url and creates a site and page object in
# rethinkdb, which brozzler-workers will look at and start crawling
#
# Copyright (C) 2014-2016 Internet Archive
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import argparse import argparse
import os import os

View file

@ -1,5 +1,23 @@
#!/usr/bin/env python #!/usr/bin/env python
# vim: set sw=4 et: #
# brozzler-worker - main entrypoint for brozzler, gets sites and pages to
# brozzle from rethinkdb, brozzles them
#
# Copyright (C) 2014-2016 Internet Archive
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import argparse import argparse
import os import os

View file

@ -1,3 +1,22 @@
#
# brozzler/__init__.py - __init__.py for brozzler package, contains some common
# code
#
# Copyright (C) 2014-2016 Internet Archive
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import json as _json import json as _json
import logging as _logging import logging as _logging
from pkg_resources import get_distribution as _get_distribution from pkg_resources import get_distribution as _get_distribution

View file

@ -1,8 +1,21 @@
// vim:set sw=8 et: /*
// * brozzler/behaviors.d/default.js - default behavior, scrolls to the bottom of
// Scrolls to the bottom of the page, and clicks on embedded soundcloud * the page and clicks on embedded soundcloud elements
// elements. *
// * Copyright (C) 2014-2016 Internet Archive
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
var umbraAboveBelowOrOnScreen = function(e) { var umbraAboveBelowOrOnScreen = function(e) {
var eTop = e.getBoundingClientRect().top; var eTop = e.getBoundingClientRect().top;
@ -26,21 +39,21 @@ var umbraIntervalFunc = function() {
var umbraSoundCloudEmbeddedElements = []; var umbraSoundCloudEmbeddedElements = [];
getUmbraSoundCloudEmbeddedElements(umbraSoundCloudEmbeddedElements); getUmbraSoundCloudEmbeddedElements(umbraSoundCloudEmbeddedElements);
var clickedSomething = false; var clickedSomething = false;
var somethingLeftBelow = false; var somethingLeftBelow = false;
var somethingLeftAbove = false; var somethingLeftAbove = false;
var missedAbove = 0; var missedAbove = 0;
for (var i = 0; i < umbraSoundCloudEmbeddedElements.length; i++) { for (var i = 0; i < umbraSoundCloudEmbeddedElements.length; i++) {
var targetId = umbraSoundCloudEmbeddedElements[i].id; var targetId = umbraSoundCloudEmbeddedElements[i].id;
var target = umbraSoundCloudEmbeddedElements[i].target; var target = umbraSoundCloudEmbeddedElements[i].target;
if (!(targetId in umbraAlreadyClicked)) { if (!(targetId in umbraAlreadyClicked)) {
var where = umbraAboveBelowOrOnScreen(target); var where = umbraAboveBelowOrOnScreen(target);
if (where == 0) { // on screen if (where == 0) { // on screen
// var pos = target.getBoundingClientRect().top; // var pos = target.getBoundingClientRect().top;
// window.scrollTo(0, target.getBoundingClientRect().top - 100); // window.scrollTo(0, target.getBoundingClientRect().top - 100);
@ -52,14 +65,14 @@ var umbraIntervalFunc = function() {
clickedSomething = true; clickedSomething = true;
umbraState.idleSince = null; umbraState.idleSince = null;
break; break;
} else if (where > 0) { } else if (where > 0) {
somethingLeftBelow = true; somethingLeftBelow = true;
} else if (where < 0) { } else if (where < 0) {
somethingLeftAbove = true; somethingLeftAbove = true;
} }
} }
} }
if (!clickedSomething) { if (!clickedSomething) {
if (somethingLeftAbove) { if (somethingLeftAbove) {
console.log("scrolling UP because everything on this screen has been clicked but we missed something above"); console.log("scrolling UP because everything on this screen has been clicked but we missed something above");
@ -77,7 +90,7 @@ var umbraIntervalFunc = function() {
umbraState.idleSince = Date.now(); umbraState.idleSince = Date.now();
} }
} }
if (umbraState.idleSince == null) { if (umbraState.idleSince == null) {
umbraState.idleSince = Date.now(); umbraState.idleSince = Date.now();
} }
@ -86,31 +99,31 @@ var umbraIntervalFunc = function() {
//try to detect sound cloud "Play" buttons and return them as targets for clicking //try to detect sound cloud "Play" buttons and return them as targets for clicking
var getUmbraSoundCloudEmbeddedElements = function(soundCloudEmbeddedElements, currentIframeDepth, currentDocument, var getUmbraSoundCloudEmbeddedElements = function(soundCloudEmbeddedElements, currentIframeDepth, currentDocument,
iframeElement) { iframeElement) {
//set default values for parameters //set default values for parameters
currentIframeDepth = currentIframeDepth || 0; currentIframeDepth = currentIframeDepth || 0;
currentDocument = currentDocument || document; currentDocument = currentDocument || document;
if (currentIframeDepth > MAX_IFRAME_RECURSE_DEPTH) { if (currentIframeDepth > MAX_IFRAME_RECURSE_DEPTH) {
return; return;
} }
//collect all buttons on current document first //collect all buttons on current document first
var button = []; var button = [];
button = currentDocument.querySelectorAll(UMBRA_THINGS_TO_CLICK_SOUNDCLOUD_EMBEDDED_SELECTOR); button = currentDocument.querySelectorAll(UMBRA_THINGS_TO_CLICK_SOUNDCLOUD_EMBEDDED_SELECTOR);
var cssPathIframe = iframeElement ? getElementCssPath(iframeElement) : ""; var cssPathIframe = iframeElement ? getElementCssPath(iframeElement) : "";
for (var i = 0; i < button.length; i++) { for (var i = 0; i < button.length; i++) {
soundCloudEmbeddedElements.push({"id" : cssPathIframe + getElementCssPath(button.item(i)), "target" : button.item(i)}); soundCloudEmbeddedElements.push({"id" : cssPathIframe + getElementCssPath(button.item(i)), "target" : button.item(i)});
} }
//now get all buttons in embedded iframes //now get all buttons in embedded iframes
var iframe = []; var iframe = [];
iframe = currentDocument.querySelectorAll(UMBRA_IFRAME_SOUNDCLOUD_EMBEDDED_SELECTOR); iframe = currentDocument.querySelectorAll(UMBRA_IFRAME_SOUNDCLOUD_EMBEDDED_SELECTOR);
for (var i = 0; i < iframe.length; i++) { for (var i = 0; i < iframe.length; i++) {
getUmbraSoundCloudEmbeddedElements(soundCloudEmbeddedElements, currentIframeDepth + 1, iframe[i].contentWindow.document.body, iframe[i]); getUmbraSoundCloudEmbeddedElements(soundCloudEmbeddedElements, currentIframeDepth + 1, iframe[i].contentWindow.document.body, iframe[i]);
} }
@ -135,7 +148,7 @@ var umbraBehaviorFinished = function() {
var getElementCssPath = function(element) { var getElementCssPath = function(element) {
var names = []; var names = [];
while (element.parentNode){ while (element.parentNode){
if (element.id){ if (element.id){
names.unshift('#' + element.id); names.unshift('#' + element.id);
@ -146,14 +159,14 @@ var getElementCssPath = function(element) {
} }
else { else {
for (var c = 1, e = element; e.previousElementSibling; e = e.previousElementSibling, c++); for (var c = 1, e = element; e.previousElementSibling; e = e.previousElementSibling, c++);
names.unshift(element.tagName + ":nth-child(" + c + ")"); names.unshift(element.tagName + ":nth-child(" + c + ")");
} }
element = element.parentNode; element = element.parentNode;
} }
} }
return names.join(" > "); return names.join(" > ");
} }

View file

@ -1,4 +1,21 @@
// vim:set sw=8 et: /*
* brozzler/behaviors.d/facebook.js - facebook behavior, scrolls to the bottom
* of the page, clicks to expand images, a few other things
*
* Copyright (C) 2014-2016 Internet Archive
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
var umbraAboveBelowOrOnScreen = function(e) { var umbraAboveBelowOrOnScreen = function(e) {
var eTop = e.getBoundingClientRect().top; var eTop = e.getBoundingClientRect().top;
@ -28,26 +45,26 @@ var umbraScrolledThingFailedScrollAttempts = {};
var umbraState = {'idleSince':null,'expectingSomething':null,'bottomReachedScrollY':0}; var umbraState = {'idleSince':null,'expectingSomething':null,'bottomReachedScrollY':0};
var umbraIntervalFunc = function() { var umbraIntervalFunc = function() {
var thingsToScroll = document.querySelectorAll(UMBRA_THINGS_TO_SCROLL_SELECTOR); var thingsToScroll = document.querySelectorAll(UMBRA_THINGS_TO_SCROLL_SELECTOR);
var everythingScrolled = true; var everythingScrolled = true;
for (var i = 0; i < thingsToScroll.length; i++) { for (var i = 0; i < thingsToScroll.length; i++) {
var target = thingsToScroll[i]; var target = thingsToScroll[i];
if (!(target in umbraAlreadyScrolledThing)) { if (!(target in umbraAlreadyScrolledThing)) {
everythingScrolled = false; everythingScrolled = false;
console.log("scrolling to " + target.scrollHeight + " on element with nodeName " + target.nodeName + " with id of " + target.id); console.log("scrolling to " + target.scrollHeight + " on element with nodeName " + target.nodeName + " with id of " + target.id);
var lastScrollTop = target.scrollTop; var lastScrollTop = target.scrollTop;
target.scrollTop = target.scrollHeight; target.scrollTop = target.scrollHeight;
umbraState.idleSince = null; umbraState.idleSince = null;
if (target.scrollTop >= target.scrollHeight) { if (target.scrollTop >= target.scrollHeight) {
umbraAlreadyScrolledThing[target] = true; umbraAlreadyScrolledThing[target] = true;
} }
else if (target.scrollTop == lastScrollTop) { else if (target.scrollTop == lastScrollTop) {
if (umbraScrolledThingFailedScrollAttempts[target]) { if (umbraScrolledThingFailedScrollAttempts[target]) {
umbraScrolledThingFailedScrollAttempts[target]++; umbraScrolledThingFailedScrollAttempts[target]++;
@ -55,7 +72,7 @@ var umbraIntervalFunc = function() {
else { else {
umbraScrolledThingFailedScrollAttempts[target] = 1; umbraScrolledThingFailedScrollAttempts[target] = 1;
} }
if (umbraScrolledThingFailedScrollAttempts[target] >= NUMBER_FAILED_SCROLL_ATTEMPTS_ON_THING_TO_SCROLL_BEFORE_STOP_SCROLLING) { if (umbraScrolledThingFailedScrollAttempts[target] >= NUMBER_FAILED_SCROLL_ATTEMPTS_ON_THING_TO_SCROLL_BEFORE_STOP_SCROLLING) {
umbraAlreadyScrolledThing[target] = true; umbraAlreadyScrolledThing[target] = true;
} }
@ -67,24 +84,24 @@ var umbraIntervalFunc = function() {
} }
else { else {
console.log("done scrolling for element with nodeName " + target.nodeName + " with id of " + target.id) console.log("done scrolling for element with nodeName " + target.nodeName + " with id of " + target.id)
} }
umbraState.expectingSomething = null; umbraState.expectingSomething = null;
} }
if (thingsToScroll && thingsToScroll.length > 0 && everythingScrolled) { if (thingsToScroll && thingsToScroll.length > 0 && everythingScrolled) {
if (umbraState.idleSince == null) { if (umbraState.idleSince == null) {
umbraState.idleSince = Date.now(); umbraState.idleSince = Date.now();
} }
return; return;
} }
var closeButtons = document.querySelectorAll('a[title="Close"], a.closeTheater, a[aria-label="Press Esc to close"]'); var closeButtons = document.querySelectorAll('a[title="Close"], a.closeTheater, a[aria-label="Press Esc to close"]');
for (var i = 0; i < closeButtons.length; i++) { for (var i = 0; i < closeButtons.length; i++) {
// XXX closeTheater buttons stick around in the dom after closing, clientWidth>0 is one way to check if they're visible // XXX closeTheater buttons stick around in the dom after closing, clientWidth>0 is one way to check if they're visible
if (closeButtons[i].clientWidth > 0) { if (closeButtons[i].clientWidth > 0) {
if (umbraState.expectingSomething == 'closeButton') { if (umbraState.expectingSomething == 'closeButton') {
console.log("found expected close button, clicking on it " + closeButtons[i].outerHTML); console.log("found expected close button, clicking on it " + closeButtons[i].outerHTML);
umbraState.expectingSomething = null; umbraState.expectingSomething = null;
} else { } else {
@ -106,7 +123,7 @@ var umbraIntervalFunc = function() {
var missedAbove = 0; var missedAbove = 0;
for (var i = 0; i < thingsToClick.length; i++) { for (var i = 0; i < thingsToClick.length; i++) {
var target = thingsToClick[i]; var target = thingsToClick[i];
if (!(target in umbraAlreadyClicked)) { if (!(target in umbraAlreadyClicked)) {
var where = umbraAboveBelowOrOnScreen(target); var where = umbraAboveBelowOrOnScreen(target);
if (where == 0) { // on screen if (where == 0) { // on screen
@ -122,14 +139,14 @@ var umbraIntervalFunc = function() {
clickedSomething = true; clickedSomething = true;
umbraState.idleSince = null; umbraState.idleSince = null;
break; break;
} else if (where > 0) { } else if (where > 0) {
somethingLeftBelow = true; somethingLeftBelow = true;
} else if (where < 0) { } else if (where < 0) {
somethingLeftAbove = true; somethingLeftAbove = true;
} }
} }
} }
if (window.scrollY > umbraState.bottomReachedScrollY) { if (window.scrollY > umbraState.bottomReachedScrollY) {
umbraState.bottomReachedScrollY = window.scrollY; umbraState.bottomReachedScrollY = window.scrollY;
} }
@ -159,7 +176,7 @@ var UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC = 10;
// Called from outside of this script. // Called from outside of this script.
var umbraBehaviorFinished = function() { var umbraBehaviorFinished = function() {
if (umbraState.idleSince != null) { if (umbraState.idleSince != null) {
var idleTimeMs = Date.now() - umbraState.idleSince; var idleTimeMs = Date.now() - umbraState.idleSince;
if (idleTimeMs / 1000 > UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC) { if (idleTimeMs / 1000 > UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC) {

View file

@ -1,17 +1,33 @@
// vim:set sw=8 et: /*
* brozzler/behaviors.d/flickr.js - behavior for flickr.com
*
* Copyright (C) 2014-2016 Internet Archive
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
setInterval(function() { window.scrollBy(0,50); }, 100); setInterval(function() { window.scrollBy(0,50); }, 100);
setTimeout(function() { setTimeout(function() {
a = document.evaluate("//a[contains(@class, 'sn-ico-slideshow')]", document, null, XPathResult.UNORDERED_NODE_ITERATOR_TYPE, null ); a = document.evaluate("//a[contains(@class, 'sn-ico-slideshow')]", document, null, XPathResult.UNORDERED_NODE_ITERATOR_TYPE, null );
f = a.iterateNext(); f = a.iterateNext();
f.click(); f.click();
}, 5000); }, 5000);
setTimeout(function() { setTimeout(function() {
a = document.evaluate("//a[contains(@data-track, 'photo-click')]", document, null, XPathResult.UNORDERED_NODE_ITERATOR_TYPE, null ); a = document.evaluate("//a[contains(@data-track, 'photo-click')]", document, null, XPathResult.UNORDERED_NODE_ITERATOR_TYPE, null );
setInterval(function() { setInterval(function() {
f = a.iterateNext(); f = a.iterateNext();
f.click(); f.click();
}, 5000); }, 5000);
}, 5000); }, 5000);

View file

@ -1,5 +1,20 @@
// vim:set sw=8 et: /*
// * brozzler/behaviors.d/instagram.js - behavior for instagram
*
* Copyright (C) 2014-2016 Internet Archive
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
var umbraInstagramBehavior = { var umbraInstagramBehavior = {
IDLE_TIMEOUT_SEC: 20, IDLE_TIMEOUT_SEC: 20,
@ -12,11 +27,11 @@ var umbraInstagramBehavior = {
intervalFunc: function() { intervalFunc: function() {
if (this.state === "loading-thumbs") { if (this.state === "loading-thumbs") {
if (window.scrollY + window.innerHeight < document.documentElement.scrollHeight) { if (window.scrollY + window.innerHeight < document.documentElement.scrollHeight) {
window.scrollBy(0, 200); window.scrollBy(0, 200);
this.idleSince = null; this.idleSince = null;
return; return;
} }
var moreButtons = document.querySelectorAll(".PhotoGridMoreButton:not(.pgmbDisabled)"); var moreButtons = document.querySelectorAll(".PhotoGridMoreButton:not(.pgmbDisabled)");
if (moreButtons.length > 0) { if (moreButtons.length > 0) {
@ -24,8 +39,8 @@ var umbraInstagramBehavior = {
moreButtons[0].click(); moreButtons[0].click();
this.idleSince = null; this.idleSince = null;
return; return;
} }
if (this.idleSince == null) { if (this.idleSince == null) {
console.log("nothing to do at the moment, might be waiting for something to load, setting this.idleSince=Date.now()"); console.log("nothing to do at the moment, might be waiting for something to load, setting this.idleSince=Date.now()");
this.idleSince = Date.now(); this.idleSince = Date.now();
@ -37,12 +52,12 @@ var umbraInstagramBehavior = {
this.state = "clicking-first-thumb"; this.state = "clicking-first-thumb";
this.idleSince = null; this.idleSince = null;
return; return;
} else { } else {
// console.log("still might be waiting for something to load..."); // console.log("still might be waiting for something to load...");
return; return;
} }
} }
} }
if (this.state === "clicking-first-thumb") { if (this.state === "clicking-first-thumb") {
var images = document.querySelectorAll("a.pgmiImageLink"); var images = document.querySelectorAll("a.pgmiImageLink");

View file

@ -1,4 +1,21 @@
// vim:set sw=8 et: /*
* brozzler/behaviors.d/marquette_edu.js - behavior for marquette.edu, clicks to
* play/crawl embedded videos
*
* Copyright (C) 2014-2016 Internet Archive
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
var umbraState = {'idleSince':null}; var umbraState = {'idleSince':null};
var umbraIntervalID = setInterval(umbraScrollInterval,50); var umbraIntervalID = setInterval(umbraScrollInterval,50);
@ -10,7 +27,7 @@ function umbraScrollInterval() {
umbraScroll(); umbraScroll();
umbraState.idleSince=null; umbraState.idleSince=null;
} }
else { else {
var videoBoxes = document.querySelectorAll("div#vid_box a"); var videoBoxes = document.querySelectorAll("div#vid_box a");
var clickedVideo = false; var clickedVideo = false;
@ -50,4 +67,4 @@ var umbraBehaviorFinished = function() {
} }
return false; return false;
} }

View file

@ -1,3 +1,21 @@
/*
* brozzler/behaviors.d/flickr.js - behavior for marquette.edu, clicks to
* play/crawl embedded videos
*
* Copyright (C) 2014-2016 Internet Archive
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
var umbraBehavior = { var umbraBehavior = {
IDLE_TIMEOUT_SEC : 10, IDLE_TIMEOUT_SEC : 10,
@ -12,7 +30,7 @@ var umbraBehavior = {
var iframes = document.querySelectorAll("iframe"); var iframes = document.querySelectorAll("iframe");
var documents = Array(iframes.length + 1); var documents = Array(iframes.length + 1);
documents[0] = document; documents[0] = document;
for (var i = 0; i < iframes.length; i++) { for (var i = 0; i < iframes.length; i++) {
documents[i+1] = iframes[i].contentWindow.document; documents[i+1] = iframes[i].contentWindow.document;
} }

View file

@ -1,3 +1,22 @@
/*
* brozzler/behaviors.d/simpleclicks.js.in - simpleclicks behavior template,
* clicks on elements matching templatized css selector
*
* Copyright (C) 2014-2016 Internet Archive
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
var umbraBehavior = { var umbraBehavior = {
IDLE_TIMEOUT_SEC : 10, IDLE_TIMEOUT_SEC : 10,
idleSince : null, idleSince : null,
@ -12,11 +31,11 @@ var umbraBehavior = {
//handle Python to JavaScript boolean conversion //handle Python to JavaScript boolean conversion
clickUntilTimeout == "True" ? clickUntilTimeout = true : clickUntilTimeout = false; clickUntilTimeout == "True" ? clickUntilTimeout = true : clickUntilTimeout = false;
var iframes = document.querySelectorAll("iframe"); var iframes = document.querySelectorAll("iframe");
var documents = Array(iframes.length + 1); var documents = Array(iframes.length + 1);
documents[0] = document; documents[0] = document;
for (var i = 0; i < iframes.length; i++) { for (var i = 0; i < iframes.length; i++) {
documents[i+1] = iframes[i].contentWindow.document; documents[i+1] = iframes[i].contentWindow.document;
} }

View file

@ -1,4 +1,21 @@
// vim:set sw=8 et: /*
* brozzler/behaviors.d/vimeo.js - behavior for vimeo.com, clicks to play/crawl
* videos
*
* Copyright (C) 2014-2016 Internet Archive
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
var umbraState = {'idleSince':null}; var umbraState = {'idleSince':null};
var umbraVideoElements = document.getElementsByTagName('video'); var umbraVideoElements = document.getElementsByTagName('video');

View file

@ -1,4 +1,21 @@
# vim: set sw=4 et: #
# brozzler/behaviors.py - manages behaviors, which are javascript scripts that
# run in brozzled web pages
#
# Copyright (C) 2014-2016 Internet Archive
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import json import json
import itertools import itertools

View file

@ -1,3 +1,21 @@
#
# brozzler/behaviors.yaml - behavior configuration
#
# Copyright (C) 2014-2016 Internet Archive
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# first matched behavior is used, so order matters here # first matched behavior is used, so order matters here
behaviors: behaviors:
- -

View file

@ -1,5 +1,21 @@
#!/usr/bin/env python #
# vim: set sw=4 et: # brozzler/browser.py - classes responsible for running web browsers
# (chrome/chromium) and browsing web pages in them
#
# Copyright (C) 2014-2016 Internet Archive
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging import logging
import json import json

View file

@ -1,3 +1,21 @@
#
# brozzler/frontier.py - RethinkDbFrontier manages crawl jobs, sites and pages
#
# Copyright (C) 2014-2016 Internet Archive
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging import logging
import brozzler import brozzler
import random import random

View file

@ -1,3 +1,22 @@
#
# brozzler/job.py - Job class representing a brozzler crawl job, and functions
# for setting up a job with supplied configuration
#
# Copyright (C) 2014-2016 Internet Archive
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging import logging
import brozzler import brozzler
import yaml import yaml

View file

@ -1,4 +1,20 @@
# vim: set sw=4 et: #
# brozzler/robots.py - robots.txt support
#
# Copyright (C) 2014-2016 Internet Archive
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import json import json
import logging import logging

View file

@ -1,3 +1,21 @@
#
# brozzler/site.py - classes representing sites and pages
#
# Copyright (C) 2014-2016 Internet Archive
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import surt import surt
import json import json
import logging import logging

View file

@ -1,3 +1,23 @@
#
# brozzler/worker.py - BrozzlerWorker brozzles pages from the frontier, meaning
# it runs youtube-dl on them, browses them and runs behaviors if appropriate,
# scopes and adds outlinks to the frontier
#
# Copyright (C) 2014-2016 Internet Archive
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os import os
import logging import logging
import brozzler import brozzler

View file

@ -1,3 +1,21 @@
#
# setup.py - brozzler setup script
#
# Copyright (C) 2014-2016 Internet Archive
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import setuptools import setuptools
import glob import glob

View file

@ -1,3 +1,22 @@
#
# brozzler-webconsole/__init__.py - flask app for brozzler web console, defines
# api endpoints etc
#
# Copyright (C) 2014-2016 Internet Archive
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import flask import flask
import rethinkstuff import rethinkstuff
import json import json

View file

@ -1,3 +1,21 @@
/*
* brozzler-webconsole/static/js/app.js - brozzler console angularjs code
*
* Copyright (C) 2014-2016 Internet Archive
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
"use strict"; "use strict";
var brozzlerConsoleApp = angular.module("brozzlerConsoleApp", [ var brozzlerConsoleApp = angular.module("brozzlerConsoleApp", [