From df61e55b6bb26092f9e05bbb64564a201d86331d Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Mon, 25 Apr 2016 20:02:11 +0000 Subject: [PATCH] add license headers --- bin/brozzle-page | 19 +++++- bin/brozzler-new-job | 19 ++++++ bin/brozzler-new-site | 19 +++++- bin/brozzler-worker | 20 +++++- brozzler/__init__.py | 19 ++++++ brozzler/behaviors.d/default.js | 65 +++++++++++-------- brozzler/behaviors.d/facebook.js | 59 +++++++++++------ brozzler/behaviors.d/flickr.js | 32 ++++++--- brozzler/behaviors.d/instagram.js | 33 +++++++--- brozzler/behaviors.d/marquette_edu.js | 23 ++++++- brozzler/behaviors.d/psu24.js | 20 +++++- brozzler/behaviors.d/simpleclicks.js.in | 23 ++++++- brozzler/behaviors.d/vimeo.js | 19 +++++- brozzler/behaviors.py | 19 +++++- brozzler/behaviors.yaml | 18 +++++ brozzler/browser.py | 20 +++++- brozzler/frontier.py | 18 +++++ brozzler/job.py | 19 ++++++ brozzler/robots.py | 18 ++++- brozzler/site.py | 18 +++++ brozzler/worker.py | 20 ++++++ setup.py | 18 +++++ webconsole/brozzler-webconsole/__init__.py | 19 ++++++ .../brozzler-webconsole/static/js/app.js | 18 +++++ 24 files changed, 497 insertions(+), 78 deletions(-) diff --git a/bin/brozzle-page b/bin/brozzle-page index 24daca8..d51aaf5 100755 --- a/bin/brozzle-page +++ b/bin/brozzle-page @@ -1,5 +1,22 @@ #!/usr/bin/env python -# vim: set sw=4 et: +# +# brozzle-page - command line utility for brozzling a single page, i.e. opening +# it in a browser, running some javascript behaviors, and printing outlinks +# +# Copyright (C) 2014-2016 Internet Archive +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# import argparse import os diff --git a/bin/brozzler-new-job b/bin/brozzler-new-job index 5db58e2..4ddb65e 100755 --- a/bin/brozzler-new-job +++ b/bin/brozzler-new-job @@ -1,4 +1,23 @@ #!/usr/bin/env python +# +# brozzler-new-job - takes a yaml brozzler job configuration file, creates +# job, sites, and pages objects in rethinkdb, which brozzler-workers will look +# at and start crawling +# +# Copyright (C) 2014-2016 Internet Archive +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# import argparse import os diff --git a/bin/brozzler-new-site b/bin/brozzler-new-site index ee1540e..511ab51 100755 --- a/bin/brozzler-new-site +++ b/bin/brozzler-new-site @@ -1,5 +1,22 @@ #!/usr/bin/env python -# vim: set sw=4 et: +# +# brozzler-new-site - takes a seed url and creates a site and page object in +# rethinkdb, which brozzler-workers will look at and start crawling +# +# Copyright (C) 2014-2016 Internet Archive +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# import argparse import os diff --git a/bin/brozzler-worker b/bin/brozzler-worker index b6c3506..0db2bed 100755 --- a/bin/brozzler-worker +++ b/bin/brozzler-worker @@ -1,5 +1,23 @@ #!/usr/bin/env python -# vim: set sw=4 et: +# +# brozzler-worker - main entrypoint for brozzler, gets sites and pages to +# brozzle from rethinkdb, brozzles them +# +# Copyright (C) 2014-2016 Internet Archive +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + import argparse import os diff --git a/brozzler/__init__.py b/brozzler/__init__.py index 2990f1c..b0890e0 100644 --- a/brozzler/__init__.py +++ b/brozzler/__init__.py @@ -1,3 +1,22 @@ +# +# brozzler/__init__.py - __init__.py for brozzler package, contains some common +# code +# +# Copyright (C) 2014-2016 Internet Archive +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + import json as _json import logging as _logging from pkg_resources import get_distribution as _get_distribution diff --git a/brozzler/behaviors.d/default.js b/brozzler/behaviors.d/default.js index 67b5691..04f81d7 100644 --- a/brozzler/behaviors.d/default.js +++ b/brozzler/behaviors.d/default.js @@ -1,8 +1,21 @@ -// vim:set sw=8 et: -// -// Scrolls to the bottom of the page, and clicks on embedded soundcloud -// elements. -// +/* + * brozzler/behaviors.d/default.js - default behavior, scrolls to the bottom of + * the page and clicks on embedded soundcloud elements + * + * Copyright (C) 2014-2016 Internet Archive + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ var umbraAboveBelowOrOnScreen = function(e) { var eTop = e.getBoundingClientRect().top; @@ -26,21 +39,21 @@ var umbraIntervalFunc = function() { var umbraSoundCloudEmbeddedElements = []; getUmbraSoundCloudEmbeddedElements(umbraSoundCloudEmbeddedElements); - + var clickedSomething = false; var somethingLeftBelow = false; var somethingLeftAbove = false; var missedAbove = 0; - + for (var i = 0; i < umbraSoundCloudEmbeddedElements.length; i++) { - + var targetId = umbraSoundCloudEmbeddedElements[i].id; var target = umbraSoundCloudEmbeddedElements[i].target; - + if (!(targetId in umbraAlreadyClicked)) { - + var where = umbraAboveBelowOrOnScreen(target); - + if (where == 0) { // on screen // var pos = target.getBoundingClientRect().top; // window.scrollTo(0, target.getBoundingClientRect().top - 100); @@ -52,14 +65,14 @@ var umbraIntervalFunc = function() { clickedSomething = true; umbraState.idleSince = null; break; - } else if (where > 0) { + } else if (where > 0) { somethingLeftBelow = true; } else if (where < 0) { somethingLeftAbove = true; } } } - + if (!clickedSomething) { if (somethingLeftAbove) { console.log("scrolling UP because everything on this screen has been clicked but we missed something above"); @@ -77,7 +90,7 @@ var umbraIntervalFunc = function() { umbraState.idleSince = Date.now(); } } - + if (umbraState.idleSince == null) { umbraState.idleSince = Date.now(); } @@ -86,31 +99,31 @@ var umbraIntervalFunc = function() { //try to detect sound cloud "Play" buttons and return them as targets for clicking var getUmbraSoundCloudEmbeddedElements = function(soundCloudEmbeddedElements, currentIframeDepth, currentDocument, iframeElement) { - + //set default values for parameters currentIframeDepth = currentIframeDepth || 0; currentDocument = currentDocument || document; - + if (currentIframeDepth > MAX_IFRAME_RECURSE_DEPTH) { return; } - + //collect all buttons on current document first var button = []; - + button = 
currentDocument.querySelectorAll(UMBRA_THINGS_TO_CLICK_SOUNDCLOUD_EMBEDDED_SELECTOR); var cssPathIframe = iframeElement ? getElementCssPath(iframeElement) : ""; - + for (var i = 0; i < button.length; i++) { soundCloudEmbeddedElements.push({"id" : cssPathIframe + getElementCssPath(button.item(i)), "target" : button.item(i)}); } - + //now get all buttons in embedded iframes var iframe = []; - + iframe = currentDocument.querySelectorAll(UMBRA_IFRAME_SOUNDCLOUD_EMBEDDED_SELECTOR); - + for (var i = 0; i < iframe.length; i++) { getUmbraSoundCloudEmbeddedElements(soundCloudEmbeddedElements, currentIframeDepth + 1, iframe[i].contentWindow.document.body, iframe[i]); } @@ -135,7 +148,7 @@ var umbraBehaviorFinished = function() { var getElementCssPath = function(element) { var names = []; - + while (element.parentNode){ if (element.id){ names.unshift('#' + element.id); @@ -146,14 +159,14 @@ var getElementCssPath = function(element) { } else { for (var c = 1, e = element; e.previousElementSibling; e = e.previousElementSibling, c++); - + names.unshift(element.tagName + ":nth-child(" + c + ")"); } - + element = element.parentNode; } } - + return names.join(" > "); } diff --git a/brozzler/behaviors.d/facebook.js b/brozzler/behaviors.d/facebook.js index d6baf20..55d1a25 100644 --- a/brozzler/behaviors.d/facebook.js +++ b/brozzler/behaviors.d/facebook.js @@ -1,4 +1,21 @@ -// vim:set sw=8 et: +/* + * brozzler/behaviors.d/facebook.js - facebook behavior, scrolls to the bottom + * of the page, clicks to expand images, a few other things + * + * Copyright (C) 2014-2016 Internet Archive + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ var umbraAboveBelowOrOnScreen = function(e) { var eTop = e.getBoundingClientRect().top; @@ -28,26 +45,26 @@ var umbraScrolledThingFailedScrollAttempts = {}; var umbraState = {'idleSince':null,'expectingSomething':null,'bottomReachedScrollY':0}; var umbraIntervalFunc = function() { - + var thingsToScroll = document.querySelectorAll(UMBRA_THINGS_TO_SCROLL_SELECTOR); var everythingScrolled = true; - + for (var i = 0; i < thingsToScroll.length; i++) { var target = thingsToScroll[i]; - + if (!(target in umbraAlreadyScrolledThing)) { - + everythingScrolled = false; - + console.log("scrolling to " + target.scrollHeight + " on element with nodeName " + target.nodeName + " with id of " + target.id); var lastScrollTop = target.scrollTop; target.scrollTop = target.scrollHeight; - + umbraState.idleSince = null; - + if (target.scrollTop >= target.scrollHeight) { umbraAlreadyScrolledThing[target] = true; - } + } else if (target.scrollTop == lastScrollTop) { if (umbraScrolledThingFailedScrollAttempts[target]) { umbraScrolledThingFailedScrollAttempts[target]++; @@ -55,7 +72,7 @@ var umbraIntervalFunc = function() { else { umbraScrolledThingFailedScrollAttempts[target] = 1; } - + if (umbraScrolledThingFailedScrollAttempts[target] >= NUMBER_FAILED_SCROLL_ATTEMPTS_ON_THING_TO_SCROLL_BEFORE_STOP_SCROLLING) { umbraAlreadyScrolledThing[target] = true; } @@ -67,24 +84,24 @@ var umbraIntervalFunc = function() { } else { console.log("done scrolling for element with nodeName " + target.nodeName + " with id of " + target.id) - } - + } + umbraState.expectingSomething = 
null; } - + if (thingsToScroll && thingsToScroll.length > 0 && everythingScrolled) { if (umbraState.idleSince == null) { umbraState.idleSince = Date.now(); } - + return; } - + var closeButtons = document.querySelectorAll('a[title="Close"], a.closeTheater, a[aria-label="Press Esc to close"]'); for (var i = 0; i < closeButtons.length; i++) { // XXX closeTheater buttons stick around in the dom after closing, clientWidth>0 is one way to check if they're visible - if (closeButtons[i].clientWidth > 0) { - if (umbraState.expectingSomething == 'closeButton') { + if (closeButtons[i].clientWidth > 0) { + if (umbraState.expectingSomething == 'closeButton') { console.log("found expected close button, clicking on it " + closeButtons[i].outerHTML); umbraState.expectingSomething = null; } else { @@ -106,7 +123,7 @@ var umbraIntervalFunc = function() { var missedAbove = 0; for (var i = 0; i < thingsToClick.length; i++) { - var target = thingsToClick[i]; + var target = thingsToClick[i]; if (!(target in umbraAlreadyClicked)) { var where = umbraAboveBelowOrOnScreen(target); if (where == 0) { // on screen @@ -122,14 +139,14 @@ var umbraIntervalFunc = function() { clickedSomething = true; umbraState.idleSince = null; break; - } else if (where > 0) { + } else if (where > 0) { somethingLeftBelow = true; } else if (where < 0) { somethingLeftAbove = true; } } } - + if (window.scrollY > umbraState.bottomReachedScrollY) { umbraState.bottomReachedScrollY = window.scrollY; } @@ -159,7 +176,7 @@ var UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC = 10; // Called from outside of this script. 
var umbraBehaviorFinished = function() { - + if (umbraState.idleSince != null) { var idleTimeMs = Date.now() - umbraState.idleSince; if (idleTimeMs / 1000 > UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC) { diff --git a/brozzler/behaviors.d/flickr.js b/brozzler/behaviors.d/flickr.js index a341480..6e5077a 100644 --- a/brozzler/behaviors.d/flickr.js +++ b/brozzler/behaviors.d/flickr.js @@ -1,17 +1,33 @@ -// vim:set sw=8 et: +/* + * brozzler/behaviors.d/flickr.js - behavior for flickr.com + * + * Copyright (C) 2014-2016 Internet Archive + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ setInterval(function() { window.scrollBy(0,50); }, 100); -setTimeout(function() { - a = document.evaluate("//a[contains(@class, 'sn-ico-slideshow')]", document, null, XPathResult.UNORDERED_NODE_ITERATOR_TYPE, null ); - f = a.iterateNext(); +setTimeout(function() { + a = document.evaluate("//a[contains(@class, 'sn-ico-slideshow')]", document, null, XPathResult.UNORDERED_NODE_ITERATOR_TYPE, null ); + f = a.iterateNext(); f.click(); }, 5000); -setTimeout(function() { - a = document.evaluate("//a[contains(@data-track, 'photo-click')]", document, null, XPathResult.UNORDERED_NODE_ITERATOR_TYPE, null ); - setInterval(function() { - f = a.iterateNext(); +setTimeout(function() { + a = document.evaluate("//a[contains(@data-track, 'photo-click')]", document, null, XPathResult.UNORDERED_NODE_ITERATOR_TYPE, null ); + setInterval(function() { + f = a.iterateNext(); f.click(); }, 5000); }, 5000); diff --git a/brozzler/behaviors.d/instagram.js b/brozzler/behaviors.d/instagram.js index ecda823..9bc0842 100644 --- a/brozzler/behaviors.d/instagram.js +++ b/brozzler/behaviors.d/instagram.js @@ -1,5 +1,20 @@ -// vim:set sw=8 et: -// +/* + * brozzler/behaviors.d/instagram.js - behavior for instagram + * + * Copyright (C) 2014-2016 Internet Archive + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ var umbraInstagramBehavior = { IDLE_TIMEOUT_SEC: 20, @@ -12,11 +27,11 @@ var umbraInstagramBehavior = { intervalFunc: function() { if (this.state === "loading-thumbs") { - if (window.scrollY + window.innerHeight < document.documentElement.scrollHeight) { + if (window.scrollY + window.innerHeight < document.documentElement.scrollHeight) { window.scrollBy(0, 200); this.idleSince = null; return; - } + } var moreButtons = document.querySelectorAll(".PhotoGridMoreButton:not(.pgmbDisabled)"); if (moreButtons.length > 0) { @@ -24,8 +39,8 @@ var umbraInstagramBehavior = { moreButtons[0].click(); this.idleSince = null; return; - } - + } + if (this.idleSince == null) { console.log("nothing to do at the moment, might be waiting for something to load, setting this.idleSince=Date.now()"); this.idleSince = Date.now(); @@ -37,12 +52,12 @@ var umbraInstagramBehavior = { this.state = "clicking-first-thumb"; this.idleSince = null; return; - } else { + } else { // console.log("still might be waiting for something to load..."); return; } - } - } + } + } if (this.state === "clicking-first-thumb") { var images = document.querySelectorAll("a.pgmiImageLink"); diff --git a/brozzler/behaviors.d/marquette_edu.js b/brozzler/behaviors.d/marquette_edu.js index 76cd85c..b3a70f1 100644 --- a/brozzler/behaviors.d/marquette_edu.js +++ b/brozzler/behaviors.d/marquette_edu.js @@ -1,4 +1,21 @@ -// vim:set sw=8 et: +/* + * brozzler/behaviors.d/marquette_edu.js - behavior for marquette.edu, clicks to + * play/crawl embedded videos + * + * Copyright (C) 2014-2016 Internet Archive + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ var umbraState = {'idleSince':null}; var umbraIntervalID = setInterval(umbraScrollInterval,50); @@ -10,7 +27,7 @@ function umbraScrollInterval() { umbraScroll(); umbraState.idleSince=null; } - else { + else { var videoBoxes = document.querySelectorAll("div#vid_box a"); var clickedVideo = false; @@ -50,4 +67,4 @@ var umbraBehaviorFinished = function() { } return false; } - + diff --git a/brozzler/behaviors.d/psu24.js b/brozzler/behaviors.d/psu24.js index 8084bca..a551961 100644 --- a/brozzler/behaviors.d/psu24.js +++ b/brozzler/behaviors.d/psu24.js @@ -1,3 +1,21 @@ +/* + * brozzler/behaviors.d/psu24.js - behavior for marquette.edu, clicks to + * play/crawl embedded videos + * + * Copyright (C) 2014-2016 Internet Archive + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ var umbraBehavior = { IDLE_TIMEOUT_SEC : 10, @@ -12,7 +30,7 @@ var umbraBehavior = { var iframes = document.querySelectorAll("iframe"); var documents = Array(iframes.length + 1); documents[0] = document; - + for (var i = 0; i < iframes.length; i++) { documents[i+1] = iframes[i].contentWindow.document; } diff --git a/brozzler/behaviors.d/simpleclicks.js.in b/brozzler/behaviors.d/simpleclicks.js.in index c9e8c90..425cecf 100644 --- a/brozzler/behaviors.d/simpleclicks.js.in +++ b/brozzler/behaviors.d/simpleclicks.js.in @@ -1,3 +1,22 @@ +/* + * brozzler/behaviors.d/simpleclicks.js.in - simpleclicks behavior template, + * clicks on elements matching templatized css selector + * + * Copyright (C) 2014-2016 Internet Archive + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + var umbraBehavior = { IDLE_TIMEOUT_SEC : 10, idleSince : null, @@ -12,11 +31,11 @@ var umbraBehavior = { //handle Python to JavaScript boolean conversion clickUntilTimeout == "True" ? 
clickUntilTimeout = true : clickUntilTimeout = false; - + var iframes = document.querySelectorAll("iframe"); var documents = Array(iframes.length + 1); documents[0] = document; - + for (var i = 0; i < iframes.length; i++) { documents[i+1] = iframes[i].contentWindow.document; } diff --git a/brozzler/behaviors.d/vimeo.js b/brozzler/behaviors.d/vimeo.js index 72a10cb..446937a 100644 --- a/brozzler/behaviors.d/vimeo.js +++ b/brozzler/behaviors.d/vimeo.js @@ -1,4 +1,21 @@ -// vim:set sw=8 et: +/* + * brozzler/behaviors.d/vimeo.js - behavior for vimeo.com, clicks to play/crawl + * videos + * + * Copyright (C) 2014-2016 Internet Archive + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ var umbraState = {'idleSince':null}; var umbraVideoElements = document.getElementsByTagName('video'); diff --git a/brozzler/behaviors.py b/brozzler/behaviors.py index 3ab35de..62de529 100644 --- a/brozzler/behaviors.py +++ b/brozzler/behaviors.py @@ -1,4 +1,21 @@ -# vim: set sw=4 et: +# +# brozzler/behaviors.py - manages behaviors, which are javascript scripts that +# run in brozzled web pages +# +# Copyright (C) 2014-2016 Internet Archive +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# import json import itertools diff --git a/brozzler/behaviors.yaml b/brozzler/behaviors.yaml index 2284d88..7d980d1 100644 --- a/brozzler/behaviors.yaml +++ b/brozzler/behaviors.yaml @@ -1,3 +1,21 @@ +# +# brozzler/behaviors.yaml - behavior configuration +# +# Copyright (C) 2014-2016 Internet Archive +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + # first matched behavior is used, so order matters here behaviors: - diff --git a/brozzler/browser.py b/brozzler/browser.py index df462f9..4344a0c 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -1,5 +1,21 @@ -#!/usr/bin/env python -# vim: set sw=4 et: +# +# brozzler/browser.py - classes responsible for running web browsers +# (chromium) and browsing web pages in them +# +# Copyright (C) 2014-2016 Internet Archive +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# import logging import json diff --git a/brozzler/frontier.py b/brozzler/frontier.py index 3cf9c5f..37b75d7 100644 --- a/brozzler/frontier.py +++ b/brozzler/frontier.py @@ -1,3 +1,21 @@ +# +# brozzler/frontier.py - RethinkDbFrontier manages crawl jobs, sites and pages +# +# Copyright (C) 2014-2016 Internet Archive +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + import logging import brozzler import random diff --git a/brozzler/job.py b/brozzler/job.py index 225becd..625e455 100644 --- a/brozzler/job.py +++ b/brozzler/job.py @@ -1,3 +1,22 @@ +# +# brozzler/job.py - Job class representing a brozzler crawl job, and functions +# for setting up a job with supplied configuration +# +# Copyright (C) 2014-2016 Internet Archive +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + import logging import brozzler import yaml diff --git a/brozzler/robots.py b/brozzler/robots.py index 433caee..131fdcc 100644 --- a/brozzler/robots.py +++ b/brozzler/robots.py @@ -1,4 +1,20 @@ -# vim: set sw=4 et: +# +# brozzler/robots.py - robots.txt support +# +# Copyright (C) 2014-2016 Internet Archive +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# import json import logging diff --git a/brozzler/site.py b/brozzler/site.py index c4538ce..74e692b 100644 --- a/brozzler/site.py +++ b/brozzler/site.py @@ -1,3 +1,21 @@ +# +# brozzler/site.py - classes representing sites and pages +# +# Copyright (C) 2014-2016 Internet Archive +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + import surt import json import logging diff --git a/brozzler/worker.py b/brozzler/worker.py index 9ebf88d..c20defd 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -1,3 +1,23 @@ +# +# brozzler/worker.py - BrozzlerWorker brozzles pages from the frontier, meaning +# it runs youtube-dl on them, browses them and runs behaviors if appropriate, +# scopes and adds outlinks to the frontier +# +# Copyright (C) 2014-2016 Internet Archive +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + import os import logging import brozzler diff --git a/setup.py b/setup.py index e71d3ff..6dab8ec 100644 --- a/setup.py +++ b/setup.py @@ -1,3 +1,21 @@ +# +# setup.py - brozzler setup script +# +# Copyright (C) 2014-2016 Internet Archive +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + import setuptools import glob diff --git a/webconsole/brozzler-webconsole/__init__.py b/webconsole/brozzler-webconsole/__init__.py index 5948a26..645b0f8 100644 --- a/webconsole/brozzler-webconsole/__init__.py +++ b/webconsole/brozzler-webconsole/__init__.py @@ -1,3 +1,22 @@ +# +# brozzler-webconsole/__init__.py - flask app for brozzler web console, defines +# api endpoints etc +# +# Copyright (C) 2014-2016 Internet Archive +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + "use strict"; var brozzlerConsoleApp = angular.module("brozzlerConsoleApp", [