mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-20 15:55:49 -04:00
add license headers
This commit is contained in:
parent
e210d417fb
commit
df61e55b6b
@ -1,5 +1,22 @@
|
||||
#!/usr/bin/env python
|
||||
# vim: set sw=4 et:
|
||||
#
|
||||
# brozzle-page - command line utility for brozzling a single page, i.e. opening
|
||||
# it in a browser, running some javascript behaviors, and printing outlinks
|
||||
#
|
||||
# Copyright (C) 2014-2016 Internet Archive
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import argparse
|
||||
import os
|
||||
|
@ -1,4 +1,23 @@
|
||||
#!/usr/bin/env python
|
||||
#
|
||||
# brozzler-new-job - takes a yaml brozzler job configuration file, creates
|
||||
# job, sites, and pages objects in rethinkdb, which brozzler-workers will look
|
||||
# at and start crawling
|
||||
#
|
||||
# Copyright (C) 2014-2016 Internet Archive
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import argparse
|
||||
import os
|
||||
|
@ -1,5 +1,22 @@
|
||||
#!/usr/bin/env python
|
||||
# vim: set sw=4 et:
|
||||
#
|
||||
# brozzler-new-site - takes a seed url and creates a site and page object in
|
||||
# rethinkdb, which brozzler-workers will look at and start crawling
|
||||
#
|
||||
# Copyright (C) 2014-2016 Internet Archive
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import argparse
|
||||
import os
|
||||
|
@ -1,5 +1,23 @@
|
||||
#!/usr/bin/env python
|
||||
# vim: set sw=4 et:
|
||||
#
|
||||
# brozzler-worker - main entrypoint for brozzler, gets sites and pages to
|
||||
# brozzle from rethinkdb, brozzles them
|
||||
#
|
||||
# Copyright (C) 2014-2016 Internet Archive
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
|
||||
import argparse
|
||||
import os
|
||||
|
@ -1,3 +1,22 @@
|
||||
#
|
||||
# brozzler/__init__.py - __init__.py for brozzler package, contains some common
|
||||
# code
|
||||
#
|
||||
# Copyright (C) 2014-2016 Internet Archive
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import json as _json
|
||||
import logging as _logging
|
||||
from pkg_resources import get_distribution as _get_distribution
|
||||
|
@ -1,8 +1,21 @@
|
||||
// vim:set sw=8 et:
|
||||
//
|
||||
// Scrolls to the bottom of the page, and clicks on embedded soundcloud
|
||||
// elements.
|
||||
//
|
||||
/*
|
||||
* brozzler/behaviors.d/default.js - default behavior, scrolls to the bottom of
|
||||
* the page and clicks on embedded soundcloud elements
|
||||
*
|
||||
* Copyright (C) 2014-2016 Internet Archive
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
var umbraAboveBelowOrOnScreen = function(e) {
|
||||
var eTop = e.getBoundingClientRect().top;
|
||||
@ -26,21 +39,21 @@ var umbraIntervalFunc = function() {
|
||||
var umbraSoundCloudEmbeddedElements = [];
|
||||
|
||||
getUmbraSoundCloudEmbeddedElements(umbraSoundCloudEmbeddedElements);
|
||||
|
||||
|
||||
var clickedSomething = false;
|
||||
var somethingLeftBelow = false;
|
||||
var somethingLeftAbove = false;
|
||||
var missedAbove = 0;
|
||||
|
||||
|
||||
for (var i = 0; i < umbraSoundCloudEmbeddedElements.length; i++) {
|
||||
|
||||
|
||||
var targetId = umbraSoundCloudEmbeddedElements[i].id;
|
||||
var target = umbraSoundCloudEmbeddedElements[i].target;
|
||||
|
||||
|
||||
if (!(targetId in umbraAlreadyClicked)) {
|
||||
|
||||
|
||||
var where = umbraAboveBelowOrOnScreen(target);
|
||||
|
||||
|
||||
if (where == 0) { // on screen
|
||||
// var pos = target.getBoundingClientRect().top;
|
||||
// window.scrollTo(0, target.getBoundingClientRect().top - 100);
|
||||
@ -52,14 +65,14 @@ var umbraIntervalFunc = function() {
|
||||
clickedSomething = true;
|
||||
umbraState.idleSince = null;
|
||||
break;
|
||||
} else if (where > 0) {
|
||||
} else if (where > 0) {
|
||||
somethingLeftBelow = true;
|
||||
} else if (where < 0) {
|
||||
somethingLeftAbove = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (!clickedSomething) {
|
||||
if (somethingLeftAbove) {
|
||||
console.log("scrolling UP because everything on this screen has been clicked but we missed something above");
|
||||
@ -77,7 +90,7 @@ var umbraIntervalFunc = function() {
|
||||
umbraState.idleSince = Date.now();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (umbraState.idleSince == null) {
|
||||
umbraState.idleSince = Date.now();
|
||||
}
|
||||
@ -86,31 +99,31 @@ var umbraIntervalFunc = function() {
|
||||
//try to detect sound cloud "Play" buttons and return them as targets for clicking
|
||||
var getUmbraSoundCloudEmbeddedElements = function(soundCloudEmbeddedElements, currentIframeDepth, currentDocument,
|
||||
iframeElement) {
|
||||
|
||||
|
||||
//set default values for parameters
|
||||
currentIframeDepth = currentIframeDepth || 0;
|
||||
currentDocument = currentDocument || document;
|
||||
|
||||
|
||||
if (currentIframeDepth > MAX_IFRAME_RECURSE_DEPTH) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
//collect all buttons on current document first
|
||||
var button = [];
|
||||
|
||||
|
||||
button = currentDocument.querySelectorAll(UMBRA_THINGS_TO_CLICK_SOUNDCLOUD_EMBEDDED_SELECTOR);
|
||||
|
||||
var cssPathIframe = iframeElement ? getElementCssPath(iframeElement) : "";
|
||||
|
||||
|
||||
for (var i = 0; i < button.length; i++) {
|
||||
soundCloudEmbeddedElements.push({"id" : cssPathIframe + getElementCssPath(button.item(i)), "target" : button.item(i)});
|
||||
}
|
||||
|
||||
|
||||
//now get all buttons in embedded iframes
|
||||
var iframe = [];
|
||||
|
||||
|
||||
iframe = currentDocument.querySelectorAll(UMBRA_IFRAME_SOUNDCLOUD_EMBEDDED_SELECTOR);
|
||||
|
||||
|
||||
for (var i = 0; i < iframe.length; i++) {
|
||||
getUmbraSoundCloudEmbeddedElements(soundCloudEmbeddedElements, currentIframeDepth + 1, iframe[i].contentWindow.document.body, iframe[i]);
|
||||
}
|
||||
@ -135,7 +148,7 @@ var umbraBehaviorFinished = function() {
|
||||
var getElementCssPath = function(element) {
|
||||
|
||||
var names = [];
|
||||
|
||||
|
||||
while (element.parentNode){
|
||||
if (element.id){
|
||||
names.unshift('#' + element.id);
|
||||
@ -146,14 +159,14 @@ var getElementCssPath = function(element) {
|
||||
}
|
||||
else {
|
||||
for (var c = 1, e = element; e.previousElementSibling; e = e.previousElementSibling, c++);
|
||||
|
||||
|
||||
names.unshift(element.tagName + ":nth-child(" + c + ")");
|
||||
}
|
||||
|
||||
|
||||
element = element.parentNode;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return names.join(" > ");
|
||||
}
|
||||
|
||||
|
@ -1,4 +1,21 @@
|
||||
// vim:set sw=8 et:
|
||||
/*
|
||||
* brozzler/behaviors.d/facebook.js - facebook behavior, scrolls to the bottom
|
||||
* of the page, clicks to expand images, a few other things
|
||||
*
|
||||
* Copyright (C) 2014-2016 Internet Archive
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
var umbraAboveBelowOrOnScreen = function(e) {
|
||||
var eTop = e.getBoundingClientRect().top;
|
||||
@ -28,26 +45,26 @@ var umbraScrolledThingFailedScrollAttempts = {};
|
||||
var umbraState = {'idleSince':null,'expectingSomething':null,'bottomReachedScrollY':0};
|
||||
|
||||
var umbraIntervalFunc = function() {
|
||||
|
||||
|
||||
var thingsToScroll = document.querySelectorAll(UMBRA_THINGS_TO_SCROLL_SELECTOR);
|
||||
var everythingScrolled = true;
|
||||
|
||||
|
||||
for (var i = 0; i < thingsToScroll.length; i++) {
|
||||
var target = thingsToScroll[i];
|
||||
|
||||
|
||||
if (!(target in umbraAlreadyScrolledThing)) {
|
||||
|
||||
|
||||
everythingScrolled = false;
|
||||
|
||||
|
||||
console.log("scrolling to " + target.scrollHeight + " on element with nodeName " + target.nodeName + " with id of " + target.id);
|
||||
var lastScrollTop = target.scrollTop;
|
||||
target.scrollTop = target.scrollHeight;
|
||||
|
||||
|
||||
umbraState.idleSince = null;
|
||||
|
||||
|
||||
if (target.scrollTop >= target.scrollHeight) {
|
||||
umbraAlreadyScrolledThing[target] = true;
|
||||
}
|
||||
}
|
||||
else if (target.scrollTop == lastScrollTop) {
|
||||
if (umbraScrolledThingFailedScrollAttempts[target]) {
|
||||
umbraScrolledThingFailedScrollAttempts[target]++;
|
||||
@ -55,7 +72,7 @@ var umbraIntervalFunc = function() {
|
||||
else {
|
||||
umbraScrolledThingFailedScrollAttempts[target] = 1;
|
||||
}
|
||||
|
||||
|
||||
if (umbraScrolledThingFailedScrollAttempts[target] >= NUMBER_FAILED_SCROLL_ATTEMPTS_ON_THING_TO_SCROLL_BEFORE_STOP_SCROLLING) {
|
||||
umbraAlreadyScrolledThing[target] = true;
|
||||
}
|
||||
@ -67,24 +84,24 @@ var umbraIntervalFunc = function() {
|
||||
}
|
||||
else {
|
||||
console.log("done scrolling for element with nodeName " + target.nodeName + " with id of " + target.id)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
umbraState.expectingSomething = null;
|
||||
}
|
||||
|
||||
|
||||
if (thingsToScroll && thingsToScroll.length > 0 && everythingScrolled) {
|
||||
if (umbraState.idleSince == null) {
|
||||
umbraState.idleSince = Date.now();
|
||||
}
|
||||
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
var closeButtons = document.querySelectorAll('a[title="Close"], a.closeTheater, a[aria-label="Press Esc to close"]');
|
||||
for (var i = 0; i < closeButtons.length; i++) {
|
||||
// XXX closeTheater buttons stick around in the dom after closing, clientWidth>0 is one way to check if they're visible
|
||||
if (closeButtons[i].clientWidth > 0) {
|
||||
if (umbraState.expectingSomething == 'closeButton') {
|
||||
if (closeButtons[i].clientWidth > 0) {
|
||||
if (umbraState.expectingSomething == 'closeButton') {
|
||||
console.log("found expected close button, clicking on it " + closeButtons[i].outerHTML);
|
||||
umbraState.expectingSomething = null;
|
||||
} else {
|
||||
@ -106,7 +123,7 @@ var umbraIntervalFunc = function() {
|
||||
var missedAbove = 0;
|
||||
|
||||
for (var i = 0; i < thingsToClick.length; i++) {
|
||||
var target = thingsToClick[i];
|
||||
var target = thingsToClick[i];
|
||||
if (!(target in umbraAlreadyClicked)) {
|
||||
var where = umbraAboveBelowOrOnScreen(target);
|
||||
if (where == 0) { // on screen
|
||||
@ -122,14 +139,14 @@ var umbraIntervalFunc = function() {
|
||||
clickedSomething = true;
|
||||
umbraState.idleSince = null;
|
||||
break;
|
||||
} else if (where > 0) {
|
||||
} else if (where > 0) {
|
||||
somethingLeftBelow = true;
|
||||
} else if (where < 0) {
|
||||
somethingLeftAbove = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (window.scrollY > umbraState.bottomReachedScrollY) {
|
||||
umbraState.bottomReachedScrollY = window.scrollY;
|
||||
}
|
||||
@ -159,7 +176,7 @@ var UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC = 10;
|
||||
|
||||
// Called from outside of this script.
|
||||
var umbraBehaviorFinished = function() {
|
||||
|
||||
|
||||
if (umbraState.idleSince != null) {
|
||||
var idleTimeMs = Date.now() - umbraState.idleSince;
|
||||
if (idleTimeMs / 1000 > UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC) {
|
||||
|
@ -1,17 +1,33 @@
|
||||
// vim:set sw=8 et:
|
||||
/*
|
||||
* brozzler/behaviors.d/flickr.js - behavior for flickr.com
|
||||
*
|
||||
* Copyright (C) 2014-2016 Internet Archive
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
setInterval(function() { window.scrollBy(0,50); }, 100);
|
||||
|
||||
setTimeout(function() {
|
||||
a = document.evaluate("//a[contains(@class, 'sn-ico-slideshow')]", document, null, XPathResult.UNORDERED_NODE_ITERATOR_TYPE, null );
|
||||
f = a.iterateNext();
|
||||
setTimeout(function() {
|
||||
a = document.evaluate("//a[contains(@class, 'sn-ico-slideshow')]", document, null, XPathResult.UNORDERED_NODE_ITERATOR_TYPE, null );
|
||||
f = a.iterateNext();
|
||||
f.click();
|
||||
}, 5000);
|
||||
|
||||
setTimeout(function() {
|
||||
a = document.evaluate("//a[contains(@data-track, 'photo-click')]", document, null, XPathResult.UNORDERED_NODE_ITERATOR_TYPE, null );
|
||||
setInterval(function() {
|
||||
f = a.iterateNext();
|
||||
setTimeout(function() {
|
||||
a = document.evaluate("//a[contains(@data-track, 'photo-click')]", document, null, XPathResult.UNORDERED_NODE_ITERATOR_TYPE, null );
|
||||
setInterval(function() {
|
||||
f = a.iterateNext();
|
||||
f.click();
|
||||
}, 5000);
|
||||
}, 5000);
|
||||
|
@ -1,5 +1,20 @@
|
||||
// vim:set sw=8 et:
|
||||
//
|
||||
/*
|
||||
* brozzler/behaviors.d/instagram.js - behavior for instagram
|
||||
*
|
||||
* Copyright (C) 2014-2016 Internet Archive
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
var umbraInstagramBehavior = {
|
||||
IDLE_TIMEOUT_SEC: 20,
|
||||
@ -12,11 +27,11 @@ var umbraInstagramBehavior = {
|
||||
|
||||
intervalFunc: function() {
|
||||
if (this.state === "loading-thumbs") {
|
||||
if (window.scrollY + window.innerHeight < document.documentElement.scrollHeight) {
|
||||
if (window.scrollY + window.innerHeight < document.documentElement.scrollHeight) {
|
||||
window.scrollBy(0, 200);
|
||||
this.idleSince = null;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
var moreButtons = document.querySelectorAll(".PhotoGridMoreButton:not(.pgmbDisabled)");
|
||||
if (moreButtons.length > 0) {
|
||||
@ -24,8 +39,8 @@ var umbraInstagramBehavior = {
|
||||
moreButtons[0].click();
|
||||
this.idleSince = null;
|
||||
return;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if (this.idleSince == null) {
|
||||
console.log("nothing to do at the moment, might be waiting for something to load, setting this.idleSince=Date.now()");
|
||||
this.idleSince = Date.now();
|
||||
@ -37,12 +52,12 @@ var umbraInstagramBehavior = {
|
||||
this.state = "clicking-first-thumb";
|
||||
this.idleSince = null;
|
||||
return;
|
||||
} else {
|
||||
} else {
|
||||
// console.log("still might be waiting for something to load...");
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (this.state === "clicking-first-thumb") {
|
||||
var images = document.querySelectorAll("a.pgmiImageLink");
|
||||
|
@ -1,4 +1,21 @@
|
||||
// vim:set sw=8 et:
|
||||
/*
|
||||
* brozzler/behaviors.d/marquette_edu.js - behavior for marquette.edu, clicks to
|
||||
* play/crawl embedded videos
|
||||
*
|
||||
* Copyright (C) 2014-2016 Internet Archive
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
var umbraState = {'idleSince':null};
|
||||
var umbraIntervalID = setInterval(umbraScrollInterval,50);
|
||||
@ -10,7 +27,7 @@ function umbraScrollInterval() {
|
||||
umbraScroll();
|
||||
umbraState.idleSince=null;
|
||||
}
|
||||
else {
|
||||
else {
|
||||
var videoBoxes = document.querySelectorAll("div#vid_box a");
|
||||
var clickedVideo = false;
|
||||
|
||||
@ -50,4 +67,4 @@ var umbraBehaviorFinished = function() {
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
|
@ -1,3 +1,21 @@
|
||||
/*
|
||||
* brozzler/behaviors.d/flickr.js - behavior for marquette.edu, clicks to
|
||||
* play/crawl embedded videos
|
||||
*
|
||||
* Copyright (C) 2014-2016 Internet Archive
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
var umbraBehavior = {
|
||||
IDLE_TIMEOUT_SEC : 10,
|
||||
@ -12,7 +30,7 @@ var umbraBehavior = {
|
||||
var iframes = document.querySelectorAll("iframe");
|
||||
var documents = Array(iframes.length + 1);
|
||||
documents[0] = document;
|
||||
|
||||
|
||||
for (var i = 0; i < iframes.length; i++) {
|
||||
documents[i+1] = iframes[i].contentWindow.document;
|
||||
}
|
||||
|
@ -1,3 +1,22 @@
|
||||
/*
|
||||
* brozzler/behaviors.d/simpleclicks.js.in - simpleclicks behavior template,
|
||||
* clicks on elements matching templatized css selector
|
||||
*
|
||||
* Copyright (C) 2014-2016 Internet Archive
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
var umbraBehavior = {
|
||||
IDLE_TIMEOUT_SEC : 10,
|
||||
idleSince : null,
|
||||
@ -12,11 +31,11 @@ var umbraBehavior = {
|
||||
|
||||
//handle Python to JavaScript boolean conversion
|
||||
clickUntilTimeout == "True" ? clickUntilTimeout = true : clickUntilTimeout = false;
|
||||
|
||||
|
||||
var iframes = document.querySelectorAll("iframe");
|
||||
var documents = Array(iframes.length + 1);
|
||||
documents[0] = document;
|
||||
|
||||
|
||||
for (var i = 0; i < iframes.length; i++) {
|
||||
documents[i+1] = iframes[i].contentWindow.document;
|
||||
}
|
||||
|
@ -1,4 +1,21 @@
|
||||
// vim:set sw=8 et:
|
||||
/*
|
||||
* brozzler/behaviors.d/vimeo.js - behavior for vimeo.com, clicks to play/crawl
|
||||
* videos
|
||||
*
|
||||
* Copyright (C) 2014-2016 Internet Archive
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
var umbraState = {'idleSince':null};
|
||||
var umbraVideoElements = document.getElementsByTagName('video');
|
||||
|
@ -1,4 +1,21 @@
|
||||
# vim: set sw=4 et:
|
||||
#
|
||||
# brozzler/behaviors.py - manages behaviors, which are javascript scripts that
|
||||
# run in brozzled web pages
|
||||
#
|
||||
# Copyright (C) 2014-2016 Internet Archive
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import json
|
||||
import itertools
|
||||
|
@ -1,3 +1,21 @@
|
||||
#
|
||||
# brozzler/behaviors.yaml - behavior configuration
|
||||
#
|
||||
# Copyright (C) 2014-2016 Internet Archive
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
# first matched behavior is used, so order matters here
|
||||
behaviors:
|
||||
-
|
||||
|
@ -1,5 +1,21 @@
|
||||
#!/usr/bin/env python
|
||||
# vim: set sw=4 et:
|
||||
#
|
||||
# brozzler/browser.py - classes responsible for running web browsers
|
||||
# (chrome/chromium) and browsing web pages in them
|
||||
#
|
||||
# Copyright (C) 2014-2016 Internet Archive
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import logging
|
||||
import json
|
||||
|
@ -1,3 +1,21 @@
|
||||
#
|
||||
# brozzler/frontier.py - RethinkDbFrontier manages crawl jobs, sites and pages
|
||||
#
|
||||
# Copyright (C) 2014-2016 Internet Archive
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import logging
|
||||
import brozzler
|
||||
import random
|
||||
|
@ -1,3 +1,22 @@
|
||||
#
|
||||
# brozzler/job.py - Job class representing a brozzler crawl job, and functions
|
||||
# for setting up a job with supplied configuration
|
||||
#
|
||||
# Copyright (C) 2014-2016 Internet Archive
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import logging
|
||||
import brozzler
|
||||
import yaml
|
||||
|
@ -1,4 +1,20 @@
|
||||
# vim: set sw=4 et:
|
||||
#
|
||||
# brozzler/robots.py - robots.txt support
|
||||
#
|
||||
# Copyright (C) 2014-2016 Internet Archive
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import json
|
||||
import logging
|
||||
|
@ -1,3 +1,21 @@
|
||||
#
|
||||
# brozzler/site.py - classes representing sites and pages
|
||||
#
|
||||
# Copyright (C) 2014-2016 Internet Archive
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import surt
|
||||
import json
|
||||
import logging
|
||||
|
@ -1,3 +1,23 @@
|
||||
#
|
||||
# brozzler/worker.py - BrozzlerWorker brozzles pages from the frontier, meaning
|
||||
# it runs youtube-dl on them, browses them and runs behaviors if appropriate,
|
||||
# scopes and adds outlinks to the frontier
|
||||
#
|
||||
# Copyright (C) 2014-2016 Internet Archive
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import os
|
||||
import logging
|
||||
import brozzler
|
||||
|
18
setup.py
18
setup.py
@ -1,3 +1,21 @@
|
||||
#
|
||||
# setup.py - brozzler setup script
|
||||
#
|
||||
# Copyright (C) 2014-2016 Internet Archive
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import setuptools
|
||||
import glob
|
||||
|
||||
|
@ -1,3 +1,22 @@
|
||||
#
|
||||
# brozzler-webconsole/__init__.py - flask app for brozzler web console, defines
|
||||
# api endpoints etc
|
||||
#
|
||||
# Copyright (C) 2014-2016 Internet Archive
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import flask
|
||||
import rethinkstuff
|
||||
import json
|
||||
|
@ -1,3 +1,21 @@
|
||||
/*
|
||||
* brozzler-webconsole/static/js/app.js - brozzler console angularjs code
|
||||
*
|
||||
* Copyright (C) 2014-2016 Internet Archive
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
"use strict";
|
||||
|
||||
var brozzlerConsoleApp = angular.module("brozzlerConsoleApp", [
|
||||
|
Loading…
x
Reference in New Issue
Block a user