From a147ed1940224edb3ce568ca37089d951681325b Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Mon, 9 May 2016 22:50:40 -0700 Subject: [PATCH 01/33] multiclicks behavior --- brozzler/behaviors.d/multiclicks.js.template | 105 +++++++++++++++++++ brozzler/behaviors.yaml | 6 ++ 2 files changed, 111 insertions(+) create mode 100644 brozzler/behaviors.d/multiclicks.js.template diff --git a/brozzler/behaviors.d/multiclicks.js.template b/brozzler/behaviors.d/multiclicks.js.template new file mode 100644 index 0000000..7a39469 --- /dev/null +++ b/brozzler/behaviors.d/multiclicks.js.template @@ -0,0 +1,105 @@ +/* + * brozzler/behaviors.d/multiclicks.js.template - click on each of several elements + * + * Copyright (C) 2014-2016 Internet Archive + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +var umbraAboveBelowOrOnScreen = function(e) { + var eTop = e.getBoundingClientRect().top; + if (eTop < window.scrollY) { + return -1; // above + } else if (eTop > window.scrollY + window.innerHeight) { + return 1; // below + } else { + return 0; // on screen + } +} + +var umbraState = {'idleSince':null}; +var umbraAlreadyClicked = {}; + +var umbraIntervalFunc = function() { + var clickedSomething = false; + var somethingLeftBelow = false; + var somethingLeftAbove = false; + var cssSelector = "${css_selector}"; + + var clickTargets = document.querySelectorAll(cssSelector); + + for (var i = 0; i < clickTargets.length; i++) { + targetID = clickTargets[i].id; + if (targetID === "") { + targetID = location.host + "-" + i; + } + if (!(targetID in umbraAlreadyClicked)) { + var where = umbraAboveBelowOrOnScreen(clickTargets[i]); + if (where === 0) { + var mouseOverEvent = document.createEvent('Events'); + mouseOverEvent.initEvent("mouseover", true, false); + clickTargets[i].dispatchEvent(mouseOverEvent); + clickTargets[i].click(); + clickedSomething = true; + umbraState.idleSince = null; + umbraAlreadyClicked[targetID] = true; + break; //break from clickTargets loop + + } else if (where > 0) { + somethingLeftBelow = true; + } else if (where < 0) { + somethingLeftAbove = true; + } + } + } + + if (!clickedSomething) { + if (somethingLeftAbove) { + // console.log("scrolling UP because everything on this screen has been clicked but we missed something above"); + window.scrollBy(0, -500); + umbraState.idleSince = null; + } else if (somethingLeftBelow) { + // console.log("scrolling because everything on this screen has been clicked but there's more below document.body.clientHeight=" + // + document.body.clientHeight); + window.scrollBy(0, 200); + umbraState.idleSince = null; + } else if (window.scrollY + window.innerHeight < document.documentElement.scrollHeight) { + window.scrollBy(0, 200); + umbraState.idleSince = null; + } else if (umbraState.idleSince == null) { + umbraState.idleSince = Date.now(); + } + } + + if (umbraState.idleSince == null) { + umbraState.idleSince = Date.now(); + } +} + +// If we haven't had anything to do (scrolled, clicked, etc) in this amount of +// time, then we consider ourselves finished with the page. +var UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC = 12; + +// Called from outside of this script. +var umbraBehaviorFinished = function() { + if (umbraState.idleSince != null) { + var idleTimeMs = Date.now() - umbraState.idleSince; + if (idleTimeMs / 1000 > UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC) { + clearInterval(umbraIntervalId); + return true; + } + } + return false; +} + +var umbraIntervalId = setInterval(umbraIntervalFunc, 5000); diff --git a/brozzler/behaviors.yaml b/brozzler/behaviors.yaml index a44f7af..a87f483 100644 --- a/brozzler/behaviors.yaml +++ b/brozzler/behaviors.yaml @@ -45,6 +45,12 @@ behaviors: url_regex: '^https?://(?:www\.)?instagram\.com/.*$' behavior_js: instagram.js request_idle_timeout_sec: 10 + - # ARI-4838 racineco.com document viewers + url_regex: '^https?://(?:www\.)?racineco\.com/.*$' + behavior_js_template: multiclicks.js.template + default_parameters: + css_selector: img[id^='NavtwocolUserControl11_NavMeeting_item'] + request_idle_timeout_sec: 10 - url_regex: '^https?://(?:www\.)?brooklynmuseum\.org/exhibitions/.*$' behavior_js_template: simpleclicks.js.template From 3d0c6b7e723d518e120a82228f504588f336a1b1 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 17 May 2016 17:40:41 -0700 Subject: [PATCH 02/33] huffpo slideshow custom behavior --- brozzler/behaviors.d/huffpostslides.js | 114 +++++++++++++++++++++++++ brozzler/behaviors.yaml | 4 + 2 files changed, 118 insertions(+) create mode 100644 brozzler/behaviors.d/huffpostslides.js diff --git a/brozzler/behaviors.d/huffpostslides.js b/brozzler/behaviors.d/huffpostslides.js new file mode 100644 index 0000000..411dc13 --- /dev/null +++ b/brozzler/behaviors.d/huffpostslides.js @@ -0,0 +1,114 @@ +/* + * brozzler/behaviors.d/huffpostslides.js - from article, start slideshow and + * click through end + * + * Copyright (C) 2014-2016 Internet Archive + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +var umbraAboveBelowOrOnScreen = function(e) { + var eTop = e.getBoundingClientRect().top; + if (eTop < window.scrollY) { + return -1; // above + } else if (eTop > window.scrollY + window.innerHeight) { + return 1; // below + } else { + return 0; // on screen + } +} + +var umbraState = {'idleSince':null}; +var umbraAlreadyClicked = {}; + +var umbraIntervalFunc = function() { + var clickedSomething = false; + var somethingLeftBelow = false; + var somethingLeftAbove = false; + + if (!('slides' in umbraAlreadyClicked)) { + var target = document.querySelector('.slideshow'); + var where = umbraAboveBelowOrOnScreen(target); + if (where === 0) { + var mouseOverEvent = document.createEvent('Events'); + mouseOverEvent.initEvent("mouseover", true, false); + target.dispatchEvent(mouseOverEvent); + target.click(); + clickedSomething = true; + umbraState.idleSince = null; + umbraAlreadyClicked['slides'] = true; + } else if (where > 0) { + somethingLeftBelow = true; + } else if (where < 0) { + somethingLeftAbove = true; + } + } else if (!(location.href in umbraAlreadyClicked)){ + var target = document.querySelector('.slideshow-overlay__container__left__nav__next'); + target.id = location.href + var where = umbraAboveBelowOrOnScreen(target); + if (where === 0) { + var mouseOverEvent = document.createEvent('Events'); + mouseOverEvent.initEvent("mouseover", true, false); + target.dispatchEvent(mouseOverEvent); + target.click(); + clickedSomething = true; + umbraState.idleSince = null; + console.log('clicked ' + target.id); + umbraAlreadyClicked[target.id] = true; + } else if (where > 0) { + somethingLeftBelow = true; + } else if (where < 0) { + somethingLeftAbove = true; + } + } + + if (!clickedSomething) { + if (somethingLeftAbove) { + // console.log("scrolling UP because everything on this screen has been clicked but we missed something above"); + window.scrollBy(0, -500); + umbraState.idleSince = null; + } else if (somethingLeftBelow) { + // console.log("scrolling because everything on this screen has been clicked but there's more below document.body.clientHeight=" + // + document.body.clientHeight); + window.scrollBy(0, 200); + umbraState.idleSince = null; + } else if (window.scrollY + window.innerHeight < document.documentElement.scrollHeight) { + window.scrollBy(0, 200); + umbraState.idleSince = null; + } else if (umbraState.idleSince == null) { + umbraState.idleSince = Date.now(); + } + } + + if (umbraState.idleSince == null) { + umbraState.idleSince = Date.now(); + } +} + +// If we haven't had anything to do (scrolled, clicked, etc) in this amount of +// time, then we consider ourselves finished with the page. +var UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC = 5; + +// Called from outside of this script. +var umbraBehaviorFinished = function() { + if (umbraState.idleSince != null) { + var idleTimeMs = Date.now() - umbraState.idleSince; + if (idleTimeMs / 1000 > UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC) { + clearInterval(umbraIntervalId); + return true; + } + } + return false; +} + +var umbraIntervalId = setInterval(umbraIntervalFunc, 2000); diff --git a/brozzler/behaviors.yaml b/brozzler/behaviors.yaml index a87f483..2bc56a4 100644 --- a/brozzler/behaviors.yaml +++ b/brozzler/behaviors.yaml @@ -51,6 +51,10 @@ behaviors: default_parameters: css_selector: img[id^='NavtwocolUserControl11_NavMeeting_item'] request_idle_timeout_sec: 10 + - + url_regex: '^https?://(?:www\.)?huffingtonpost\.com/.*$' + behavior_js_template: huffpostslides.js + request_idle_timeout_sec: 10 - url_regex: '^https?://(?:www\.)?brooklynmuseum\.org/exhibitions/.*$' behavior_js_template: simpleclicks.js.template From 3c5c6a714a115466da4f45b66c90e9e90067d9ae Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Sat, 21 May 2016 19:27:05 -0700 Subject: [PATCH 03/33] clickGetPDFs.template --- brozzler/behaviors.d/clickGetPDFs.js.template | 134 ++++++++++++++++++ brozzler/behaviors.yaml | 2 +- 2 files changed, 135 insertions(+), 1 deletion(-) create mode 100644 brozzler/behaviors.d/clickGetPDFs.js.template diff --git a/brozzler/behaviors.d/clickGetPDFs.js.template b/brozzler/behaviors.d/clickGetPDFs.js.template new file mode 100644 index 0000000..65f62e7 --- /dev/null +++ b/brozzler/behaviors.d/clickGetPDFs.js.template @@ -0,0 +1,134 @@ +/* + * brozzler/behaviors.d/clickGetPDFs.js.template - click on each of several elements and + * click on linked PDFs found + * + * Copyright (C) 2014-2016 Internet Archive + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +var umbraAboveBelowOrOnScreen = function(e) { + var eTop = e.getBoundingClientRect().top; + if (eTop < window.scrollY) { + return -1; // above + } else if (eTop > window.scrollY + window.innerHeight) { + return 1; // below + } else { + return 0; // on screen + } +} + +var umbraState = {'idleSince':null}; +var umbraAlreadyClicked = {}; + +var umbraIntervalFunc = function() { + var clickedSomething = false; + var somethingLeftBelow = false; + var somethingLeftAbove = false; + var cssSelector = "${css_selector}"; + var pdfSelector = "a"; + var i = 0; + + var clickTargets = document.querySelectorAll(cssSelector); + var pdfTargets = document.querySelectorAll(pdfSelector); + + if (pdfTargets.length > 0) { + for (i = 0; i < pdfTargets.length; i++) { + if (pdfTargets[i].href.toLowerCase().lastIndexOf('pdf') > 0) { + if (!(pdfTargets[i].href in umbraAlreadyClicked)){ + var where = umbraAboveBelowOrOnScreen(pdfTargets[i]); + if (where === 0) { + var mouseOverEvent = document.createEvent('Events'); + mouseOverEvent.initEvent("mouseover", true, false); + pdfTargets[i].dispatchEvent(mouseOverEvent); + pdfTargets[i].click(); + clickedSomething = true; + umbraState.idleSince = null; + umbraAlreadyClicked[pdfTargets[i].href] = true; + break; // break from loop + } else if (where > 0) { + somethingLeftBelow = true; + } else if (where < 0) { + somethingLeftAbove = true; + } + } + } + } + } + + if((i === pdfTargets.length) && !clickedSomething) { + for (var i = 0; i < clickTargets.length; i++) { + targetID = clickTargets[i].id; + if (targetID === "") { + targetID = location.host + "-" + i; + } + if (!(targetID in umbraAlreadyClicked)) { + var where = umbraAboveBelowOrOnScreen(clickTargets[i]); + if (where === 0) { + var mouseOverEvent = document.createEvent('Events'); + mouseOverEvent.initEvent("mouseover", true, false); + clickTargets[i].dispatchEvent(mouseOverEvent); + clickTargets[i].click(); + clickedSomething = true; + umbraState.idleSince = null; + umbraAlreadyClicked[targetID] = true; + break; // break from loop + } else if (where > 0) { + somethingLeftBelow = true; + } else if (where < 0) { + somethingLeftAbove = true; + } + } + } + } + + if (!clickedSomething) { + if (somethingLeftAbove) { + // console.log("scrolling UP because everything on this screen has been clicked but we missed something above"); + window.scrollBy(0, -500); + umbraState.idleSince = null; + } else if (somethingLeftBelow) { + // console.log("scrolling because everything on this screen has been clicked but there's more below document.body.clientHeight=" + // + document.body.clientHeight); + window.scrollBy(0, 200); + umbraState.idleSince = null; + } else if (window.scrollY + window.innerHeight < document.documentElement.scrollHeight) { + window.scrollBy(0, 200); + umbraState.idleSince = null; + } else if (umbraState.idleSince == null) { + umbraState.idleSince = Date.now(); + } + } + + if (umbraState.idleSince == null) { + umbraState.idleSince = Date.now(); + } +} + +// If we haven't had anything to do (scrolled, clicked, etc) in this amount of +// time, then we consider ourselves finished with the page. +var UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC = 7; + +// Called from outside of this script. +var umbraBehaviorFinished = function() { + if (umbraState.idleSince != null) { + var idleTimeMs = Date.now() - umbraState.idleSince; + if (idleTimeMs / 1000 > UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC) { + clearInterval(umbraIntervalId); + return true; + } + } + return false; +} + +var umbraIntervalId = setInterval(umbraIntervalFunc, 5000); diff --git a/brozzler/behaviors.yaml b/brozzler/behaviors.yaml index 2bc56a4..e4b7b90 100644 --- a/brozzler/behaviors.yaml +++ b/brozzler/behaviors.yaml @@ -47,7 +47,7 @@ behaviors: request_idle_timeout_sec: 10 - # ARI-4838 racineco.com document viewers url_regex: '^https?://(?:www\.)?racineco\.com/.*$' - behavior_js_template: multiclicks.js.template + behavior_js_template: clickGetPDFs.js.template default_parameters: css_selector: img[id^='NavtwocolUserControl11_NavMeeting_item'] request_idle_timeout_sec: 10 From d7bd19a7f6b1c77a194d55361166d2582ef74aaf Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Fri, 27 May 2016 22:20:36 -0700 Subject: [PATCH 04/33] kcfed test --- brozzler/behaviors.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/brozzler/behaviors.yaml b/brozzler/behaviors.yaml index e4b7b90..abaae51 100644 --- a/brozzler/behaviors.yaml +++ b/brozzler/behaviors.yaml @@ -51,6 +51,12 @@ behaviors: default_parameters: css_selector: img[id^='NavtwocolUserControl11_NavMeeting_item'] request_idle_timeout_sec: 10 + - # ARI-4930 test + url_regex: '^https?://(?:www\.)?kansascityfed\.org/publications/research/er/archive/.*$' + behavior_js_template: clickGetPDFs.js.template + default_parameters: + css_selector: li.years>a + request_idle_timeout_sec: 10 - url_regex: '^https?://(?:www\.)?huffingtonpost\.com/.*$' behavior_js_template: huffpostslides.js From 73454d2ac74e4a13ec29fb7e3a10cb85de13b7c3 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Sun, 8 May 2016 19:55:50 -0700 Subject: [PATCH 05/33] clickGetPDFs for kansascityfed --- brozzler/behaviors.d/clickGetPDFs.js.template | 66 +++++++++++++------ brozzler/behaviors.yaml | 6 -- 2 files changed, 47 insertions(+), 25 deletions(-) diff --git a/brozzler/behaviors.d/clickGetPDFs.js.template b/brozzler/behaviors.d/clickGetPDFs.js.template index 65f62e7..7eaced5 100644 --- a/brozzler/behaviors.d/clickGetPDFs.js.template +++ b/brozzler/behaviors.d/clickGetPDFs.js.template @@ -1,5 +1,5 @@ /* - * brozzler/behaviors.d/clickGetPDFs.js.template - click on each of several elements and + * brozzler/behaviors.d/clickGetPDFs.js.template - click on each of several elements and * click on linked PDFs found * * Copyright (C) 2014-2016 Internet Archive @@ -17,6 +17,12 @@ * limitations under the License. */ +/* + * to do: + * disable multiple file download warning dialog + * enable storage across page loads to capture all PDFs AND decrease work + */ + var umbraAboveBelowOrOnScreen = function(e) { var eTop = e.getBoundingClientRect().top; if (eTop < window.scrollY) { @@ -26,35 +32,58 @@ var umbraAboveBelowOrOnScreen = function(e) { } else { return 0; // on screen } -} +}; var umbraState = {'idleSince':null}; -var umbraAlreadyClicked = {}; +var umbraAlreadyClicked = {'2016':true}; + +var clickTargets = document.querySelectorAll("${css_selector}"); + +var pdfSelector = "a"; var umbraIntervalFunc = function() { var clickedSomething = false; var somethingLeftBelow = false; var somethingLeftAbove = false; - var cssSelector = "${css_selector}"; - var pdfSelector = "a"; var i = 0; - var clickTargets = document.querySelectorAll(cssSelector); var pdfTargets = document.querySelectorAll(pdfSelector); - if (pdfTargets.length > 0) { + if (!(clickTargets[0].text in umbraAlreadyClicked)) { + targetID = clickTargets[0].text; + logmsg = 'clicking ' + targetID; + console.log(logmsg); + var where = umbraAboveBelowOrOnScreen(clickTargets[0]); + if (where === 0) { + var mouseOverEvent = document.createEvent('Events'); + mouseOverEvent.initEvent("mouseover", true, false); + clickTargets[0].dispatchEvent(mouseOverEvent); + clickTargets[0].click(); + clickedSomething = true; + umbraState.idleSince = null; + umbraAlreadyClicked[targetID] = true; + } else if (where > 0) { + somethingLeftBelow = true; + } else if (where < 0) { + somethingLeftAbove = true; + } + } else if (pdfTargets.length > 0) { for (i = 0; i < pdfTargets.length; i++) { if (pdfTargets[i].href.toLowerCase().lastIndexOf('pdf') > 0) { - if (!(pdfTargets[i].href in umbraAlreadyClicked)){ + pdfID = pdfTargets[i].href; + logmsg = 'clicking ' + pdfID; + console.log(logmsg); + if (!(pdfID in umbraAlreadyClicked)){ var where = umbraAboveBelowOrOnScreen(pdfTargets[i]); if (where === 0) { + pdfTargets[i].setAttribute('download',''); var mouseOverEvent = document.createEvent('Events'); mouseOverEvent.initEvent("mouseover", true, false); pdfTargets[i].dispatchEvent(mouseOverEvent); pdfTargets[i].click(); clickedSomething = true; umbraState.idleSince = null; - umbraAlreadyClicked[pdfTargets[i].href] = true; + umbraAlreadyClicked[pdfID] = true; break; // break from loop } else if (where > 0) { somethingLeftBelow = true; @@ -67,11 +96,10 @@ var umbraIntervalFunc = function() { } if((i === pdfTargets.length) && !clickedSomething) { - for (var i = 0; i < clickTargets.length; i++) { - targetID = clickTargets[i].id; - if (targetID === "") { - targetID = location.host + "-" + i; - } + for (i = 1; i < clickTargets.length; i++) { + targetID = clickTargets[i].text; + logmsg = 'clicking ' + targetID; + console.log(logmsg); if (!(targetID in umbraAlreadyClicked)) { var where = umbraAboveBelowOrOnScreen(clickTargets[i]); if (where === 0) { @@ -105,15 +133,15 @@ var umbraIntervalFunc = function() { } else if (window.scrollY + window.innerHeight < document.documentElement.scrollHeight) { window.scrollBy(0, 200); umbraState.idleSince = null; - } else if (umbraState.idleSince == null) { + } else if (umbraState.idleSince === null) { umbraState.idleSince = Date.now(); } } - if (umbraState.idleSince == null) { + if (umbraState.idleSince === null) { umbraState.idleSince = Date.now(); } -} +}; // If we haven't had anything to do (scrolled, clicked, etc) in this amount of // time, then we consider ourselves finished with the page. @@ -121,7 +149,7 @@ var UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC = 7; // Called from outside of this script. var umbraBehaviorFinished = function() { - if (umbraState.idleSince != null) { + if (umbraState.idleSince !== null) { var idleTimeMs = Date.now() - umbraState.idleSince; if (idleTimeMs / 1000 > UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC) { clearInterval(umbraIntervalId); @@ -129,6 +157,6 @@ var umbraBehaviorFinished = function() { } } return false; -} +}; var umbraIntervalId = setInterval(umbraIntervalFunc, 5000); diff --git a/brozzler/behaviors.yaml b/brozzler/behaviors.yaml index abaae51..6cdd902 100644 --- a/brozzler/behaviors.yaml +++ b/brozzler/behaviors.yaml @@ -45,12 +45,6 @@ behaviors: url_regex: '^https?://(?:www\.)?instagram\.com/.*$' behavior_js: instagram.js request_idle_timeout_sec: 10 - - # ARI-4838 racineco.com document viewers - url_regex: '^https?://(?:www\.)?racineco\.com/.*$' - behavior_js_template: clickGetPDFs.js.template - default_parameters: - css_selector: img[id^='NavtwocolUserControl11_NavMeeting_item'] - request_idle_timeout_sec: 10 - # ARI-4930 test url_regex: '^https?://(?:www\.)?kansascityfed\.org/publications/research/er/archive/.*$' behavior_js_template: clickGetPDFs.js.template From d398d2d006b7623e38ee1a275f55d38b94f09d87 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Sat, 25 Jun 2016 15:05:37 -0700 Subject: [PATCH 06/33] disable downloads & kccfed behavior --- brozzler/behaviors.d/clickGetPDFs.js.template | 4 ++-- brozzler/behaviors.yaml | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/brozzler/behaviors.d/clickGetPDFs.js.template b/brozzler/behaviors.d/clickGetPDFs.js.template index 7eaced5..4349835 100644 --- a/brozzler/behaviors.d/clickGetPDFs.js.template +++ b/brozzler/behaviors.d/clickGetPDFs.js.template @@ -1,5 +1,5 @@ /* - * brozzler/behaviors.d/clickGetPDFs.js.template - click on each of several elements and + * brozzler/behaviors.d/clickGetPDFs.js.template - WIP click on each of several elements and * click on linked PDFs found * * Copyright (C) 2014-2016 Internet Archive @@ -76,7 +76,7 @@ var umbraIntervalFunc = function() { if (!(pdfID in umbraAlreadyClicked)){ var where = umbraAboveBelowOrOnScreen(pdfTargets[i]); if (where === 0) { - pdfTargets[i].setAttribute('download',''); + // pdfTargets[i].setAttribute('download',''); // umbra ignores downloads var mouseOverEvent = document.createEvent('Events'); mouseOverEvent.initEvent("mouseover", true, false); pdfTargets[i].dispatchEvent(mouseOverEvent); diff --git a/brozzler/behaviors.yaml b/brozzler/behaviors.yaml index 6cdd902..764012c 100644 --- a/brozzler/behaviors.yaml +++ b/brozzler/behaviors.yaml @@ -45,12 +45,12 @@ behaviors: url_regex: '^https?://(?:www\.)?instagram\.com/.*$' behavior_js: instagram.js request_idle_timeout_sec: 10 - - # ARI-4930 test - url_regex: '^https?://(?:www\.)?kansascityfed\.org/publications/research/er/archive/.*$' - behavior_js_template: clickGetPDFs.js.template - default_parameters: - css_selector: li.years>a - request_idle_timeout_sec: 10 +# - # ARI-4930 test +# url_regex: '^https?://(?:www\.)?kansascityfed\.org/publications/research/er/archive/.*$' +# behavior_js_template: clickGetPDFs.js.template +# default_parameters: +# css_selector: li.years>a +# request_idle_timeout_sec: 10 - url_regex: '^https?://(?:www\.)?huffingtonpost\.com/.*$' behavior_js_template: huffpostslides.js From c9a0e1ed245fff72e623580549c85f578057c875 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Wed, 14 Sep 2016 20:24:01 -0700 Subject: [PATCH 07/33] facebook login troubleshooting --- brozzler/behaviors.d/facebook.js.template | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/brozzler/behaviors.d/facebook.js.template b/brozzler/behaviors.d/facebook.js.template index d93e127..8d3d049 100644 --- a/brozzler/behaviors.d/facebook.js.template +++ b/brozzler/behaviors.d/facebook.js.template @@ -48,6 +48,8 @@ var umbraState = {'idleSince':null,'expectingSomething':null,'bottomReachedScrol var umbraIntervalFunc = function() { + var fbLogoutMenu = document.querySelector("div#logoutMenu") + if (!(fbLogoutMenu)) { console.log("no #logoutMenu; not logged in?");} var thingsToScroll = document.querySelectorAll(UMBRA_THINGS_TO_SCROLL_SELECTOR); var everythingScrolled = true; @@ -173,12 +175,17 @@ var umbraIntervalFunc = function() { } var umbraFacebookLogin = function() { + console.log("starting facebook login...") var emailInput = document.querySelector("form#login_form input#email"); + if (emailInput) { console.log("emailInput found"); } var passwordInput = document.querySelector("form#login_form input#pass"); + if (passwordInput) { console.log("passwordInput found"); } var loginButton = document.querySelector("form#login_form label#loginbutton > input"); + if (loginButton) { console.log("loginButton found"); } emailInput.value=UMBRA_FB_USER_NAME; passwordInput.value=UMBRA_FB_PASSWORD; loginButton.click(); + console.log("logging in with facebook user: " + UMBRA_FB_USER_NAME + " with password: " + UMBRA_FB_PASSWORD); } // If we haven't had anything to do (scrolled, clicked, etc) in this amount of From 76d85168981c0f25f283329a1fe09b6588607398 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Thu, 15 Sep 2016 15:13:51 -0700 Subject: [PATCH 08/33] better facebook login logging --- brozzler/behaviors.d/facebook.js.template | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/brozzler/behaviors.d/facebook.js.template b/brozzler/behaviors.d/facebook.js.template index 8d3d049..da8348a 100644 --- a/brozzler/behaviors.d/facebook.js.template +++ b/brozzler/behaviors.d/facebook.js.template @@ -48,8 +48,6 @@ var umbraState = {'idleSince':null,'expectingSomething':null,'bottomReachedScrol var umbraIntervalFunc = function() { - var fbLogoutMenu = document.querySelector("div#logoutMenu") - if (!(fbLogoutMenu)) { console.log("no #logoutMenu; not logged in?");} var thingsToScroll = document.querySelectorAll(UMBRA_THINGS_TO_SCROLL_SELECTOR); var everythingScrolled = true; @@ -175,15 +173,12 @@ var umbraIntervalFunc = function() { } var umbraFacebookLogin = function() { - console.log("starting facebook login...") var emailInput = document.querySelector("form#login_form input#email"); - if (emailInput) { console.log("emailInput found"); } var passwordInput = document.querySelector("form#login_form input#pass"); - if (passwordInput) { console.log("passwordInput found"); } var loginButton = document.querySelector("form#login_form label#loginbutton > input"); - if (loginButton) { console.log("loginButton found"); } emailInput.value=UMBRA_FB_USER_NAME; passwordInput.value=UMBRA_FB_PASSWORD; + if (loginButton) { console.log("clicking #loginButton"); } loginButton.click(); console.log("logging in with facebook user: " + UMBRA_FB_USER_NAME + " with password: " + UMBRA_FB_PASSWORD); } From a8430baf1bdbe9fa50e9360466726696c40f4fc0 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Fri, 16 Sep 2016 13:39:00 -0700 Subject: [PATCH 09/33] better still facebook login logging --- brozzler/behaviors.d/facebook.js.template | 2 -- 1 file changed, 2 deletions(-) diff --git a/brozzler/behaviors.d/facebook.js.template b/brozzler/behaviors.d/facebook.js.template index da8348a..d93e127 100644 --- a/brozzler/behaviors.d/facebook.js.template +++ b/brozzler/behaviors.d/facebook.js.template @@ -178,9 +178,7 @@ var umbraFacebookLogin = function() { var loginButton = document.querySelector("form#login_form label#loginbutton > input"); emailInput.value=UMBRA_FB_USER_NAME; passwordInput.value=UMBRA_FB_PASSWORD; - if (loginButton) { console.log("clicking #loginButton"); } loginButton.click(); - console.log("logging in with facebook user: " + UMBRA_FB_USER_NAME + " with password: " + UMBRA_FB_PASSWORD); } // If we haven't had anything to do (scrolled, clicked, etc) in this amount of From 016367bc4622b970fd75a423d45e35f85888b079 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Thu, 29 Sep 2016 20:58:10 -0700 Subject: [PATCH 10/33] clean up Browser dirs and add flags_location --- brozzler/browser.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/brozzler/browser.py b/brozzler/browser.py index 1b3301b..a9b3851 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -137,9 +137,10 @@ class Browser: # these can raise exceptions self.chrome_port = self._find_available_port() self._work_dir = tempfile.TemporaryDirectory() + data_dir = os.path.join(self._work_dir.name, "chrome-user-data") + flags_location = os.path.join(data_dir, "Local State") if cookie_db is not None: - cookie_dir = os.path.join( - self._work_dir.name, "chrome-user-data", "Default") + cookie_dir = os.path.join(data_dir, "Default") cookie_location = os.path.join(cookie_dir, "Cookies") self.logger.debug( "cookie DB provided, writing to %s", cookie_location) From 0b9518bc5c617059250dcb4e19e80f66ef14ae34 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Thu, 29 Sep 2016 23:13:15 -0700 Subject: [PATCH 11/33] read/write chromium local state --- brozzler/browser.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/brozzler/browser.py b/brozzler/browser.py index a9b3851..69dee7c 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -138,7 +138,6 @@ class Browser: self.chrome_port = self._find_available_port() self._work_dir = tempfile.TemporaryDirectory() data_dir = os.path.join(self._work_dir.name, "chrome-user-data") - flags_location = os.path.join(data_dir, "Local State") if cookie_db is not None: cookie_dir = os.path.join(data_dir, "Default") cookie_location = os.path.join(cookie_dir, "Cookies") @@ -154,6 +153,15 @@ class Browser: "exception writing cookie file at %s", cookie_location, exc_info=True) + flags_location = os.path.join(data_dir, "Local State") + try: + with open(flags_location, 'w+') as f: + json.dump(make_local_state, f) + except OSError: + self.logger.error( + "exception writing local state file at %s", + flags_location, exc_info=True) + self._chrome_instance = Chrome( port=self.chrome_port, executable=self.chrome_exe, user_home_dir=self._work_dir.name, @@ -205,6 +213,17 @@ class Browser: cookie_location, exc_info=True) return cookie_db + def make_chromium_local_state(self): + local_state_location = '/tmp/chromium/Local State' + with open(local_state_location) as j: + local_state = json.load(j) + if 'enabled_labs_experiments' in local_state['browser']: + if 'enable-brotli@2' not in local_state['browser']['enabled_labs_experiments']: + local_state['browser']['enabled_labs_experiments'].append('enable-brotli@2') + else: + local_state['browser']['enabled_labs_experiments'] = ['enable-brotli@2'] + return local_state + def _find_available_port(self): port_available = False port = self.chrome_port From b5f207749146d93f71a0f110721f9865e5a7f6fb Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Thu, 29 Sep 2016 20:58:10 -0700 Subject: [PATCH 12/33] clean up Browser dirs and add flags_location --- brozzler/browser.py | 1 + 1 file changed, 1 insertion(+) diff --git a/brozzler/browser.py b/brozzler/browser.py index 69dee7c..1f20daf 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -138,6 +138,7 @@ class Browser: self.chrome_port = self._find_available_port() self._work_dir = tempfile.TemporaryDirectory() data_dir = os.path.join(self._work_dir.name, "chrome-user-data") + flags_location = os.path.join(data_dir, "Local State") if cookie_db is not None: cookie_dir = os.path.join(data_dir, "Default") cookie_location = os.path.join(cookie_dir, "Cookies") From 022a4a60ca25bdec243afac3a83acce34e7deb4a Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Thu, 29 Sep 2016 23:13:15 -0700 Subject: [PATCH 13/33] read/write chromium local state --- brozzler/browser.py | 1 - 1 file changed, 1 deletion(-) diff --git a/brozzler/browser.py b/brozzler/browser.py index 1f20daf..69dee7c 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -138,7 +138,6 @@ class Browser: self.chrome_port = self._find_available_port() self._work_dir = tempfile.TemporaryDirectory() data_dir = os.path.join(self._work_dir.name, "chrome-user-data") - flags_location = os.path.join(data_dir, "Local State") if cookie_db is not None: cookie_dir = os.path.join(data_dir, "Default") cookie_location = os.path.join(cookie_dir, "Cookies") From 8b63555f737757e97b82e19b2c155589675ba1d0 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Thu, 29 Sep 2016 23:58:51 -0700 Subject: [PATCH 14/33] better variable name --- brozzler/browser.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/brozzler/browser.py b/brozzler/browser.py index 69dee7c..97edbd7 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -153,14 +153,14 @@ class Browser: "exception writing cookie file at %s", cookie_location, exc_info=True) - flags_location = os.path.join(data_dir, "Local State") + local_state_location = os.path.join(data_dir, 'Local State') try: - with open(flags_location, 'w+') as f: - json.dump(make_local_state, f) + with open(local_state_location, 'w+') as f: + json.dump(make_chromium_local_state(), f) except OSError: self.logger.error( "exception writing local state file at %s", - flags_location, exc_info=True) + local_state_location, exc_info=True) self._chrome_instance = Chrome( port=self.chrome_port, executable=self.chrome_exe, From abc7c0bfca631d347c8f25b85f374e857177bd89 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Fri, 30 Sep 2016 00:18:41 -0700 Subject: [PATCH 15/33] simplify to troubleshoot --- brozzler/browser.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/brozzler/browser.py b/brozzler/browser.py index 97edbd7..ce3dbe2 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -153,10 +153,21 @@ class Browser: "exception writing cookie file at %s", cookie_location, exc_info=True) + local_state_origin = '/tmp/chromium/Local State' + with open(local_state_origin) as j: + local_state = json.load(j) + if 'enabled_labs_experiments' in local_state['browser']: + if 'enable-brotli@2' not in local_state['browser']['enabled_labs_experiments']: + local_state['browser']['enabled_labs_experiments'].append('enable-brotli@2') + else: + local_state['browser']['enabled_labs_experiments'] = ['enable-brotli@2'] + + print(local_state) + local_state_location = os.path.join(data_dir, 'Local State') try: with open(local_state_location, 'w+') as f: - json.dump(make_chromium_local_state(), f) + json.dump(local_state, f) except OSError: self.logger.error( "exception writing local state file at %s", @@ -213,17 +224,6 @@ class Browser: cookie_location, exc_info=True) return cookie_db - def make_chromium_local_state(self): - local_state_location = '/tmp/chromium/Local State' - with open(local_state_location) as j: - local_state = json.load(j) - if 'enabled_labs_experiments' in local_state['browser']: - if 'enable-brotli@2' not in local_state['browser']['enabled_labs_experiments']: - local_state['browser']['enabled_labs_experiments'].append('enable-brotli@2') - else: - local_state['browser']['enabled_labs_experiments'] = ['enable-brotli@2'] - return local_state - def _find_available_port(self): port_available = False port = self.chrome_port From 84947c11487045ee1574c08a05678bd69bf97b09 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Fri, 30 Sep 2016 09:30:39 -0700 Subject: [PATCH 16/33] make data_dir --- brozzler/browser.py | 1 + 1 file changed, 1 insertion(+) diff --git a/brozzler/browser.py b/brozzler/browser.py index ce3dbe2..8da494c 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -138,6 +138,7 @@ class Browser: self.chrome_port = self._find_available_port() self._work_dir = tempfile.TemporaryDirectory() data_dir = os.path.join(self._work_dir.name, "chrome-user-data") + os.makedirs(data_dir, exist_ok=True) if cookie_db is not None: cookie_dir = os.path.join(data_dir, "Default") cookie_location = os.path.join(cookie_dir, "Cookies") From cf82d266bd3b08bdeb53033c9b977242a9afbeee Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Fri, 30 Sep 2016 11:17:30 -0700 Subject: [PATCH 17/33] disable SDCH? --- brozzler/browser.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/brozzler/browser.py b/brozzler/browser.py index 8da494c..6d822c8 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -620,7 +620,8 @@ class Chrome: "--homepage=about:blank", "--disable-direct-npapi-requests", "--disable-web-security", "--disable-notifications", "--disable-extensions", - "--disable-save-password-bubble"] + "--disable-save-password-bubble", + "--enable-sdch=0"] if self.ignore_cert_errors: chrome_args.append("--ignore-certificate-errors") if self.proxy: From 15db75e825a1746517c0d85995848b37c0014512 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Fri, 30 Sep 2016 10:55:13 -0700 Subject: [PATCH 18/33] try saving only Brotli conifg --- brozzler/browser.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/brozzler/browser.py b/brozzler/browser.py index 6d822c8..3ae1568 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -154,17 +154,7 @@ class Browser: "exception writing cookie file at %s", cookie_location, exc_info=True) - local_state_origin = '/tmp/chromium/Local State' - with open(local_state_origin) as j: - local_state = json.load(j) - if 'enabled_labs_experiments' in local_state['browser']: - if 'enable-brotli@2' not in local_state['browser']['enabled_labs_experiments']: - local_state['browser']['enabled_labs_experiments'].append('enable-brotli@2') - else: - local_state['browser']['enabled_labs_experiments'] = ['enable-brotli@2'] - - print(local_state) - + local_state = {'browser': ['enable-brotli@2']} local_state_location = os.path.join(data_dir, 'Local State') try: with open(local_state_location, 'w+') as f: From 5d0f11be09d9439064763ec5d2f156e8439ee5ae Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Fri, 30 Sep 2016 11:20:54 -0700 Subject: [PATCH 19/33] correct local_state --- brozzler/browser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/brozzler/browser.py b/brozzler/browser.py index 3ae1568..e743d65 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -154,7 +154,7 @@ class Browser: "exception writing cookie file at %s", cookie_location, exc_info=True) - local_state = {'browser': ['enable-brotli@2']} + local_state = {'browser':{'enabled_labs_experiments':['enable-brotli@2']}} local_state_location = os.path.join(data_dir, 'Local State') try: with open(local_state_location, 'w+') as f: From d2b51224775a5c3c73de21600e64f166105832b2 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Fri, 30 Sep 2016 12:34:05 -0700 Subject: [PATCH 20/33] test extra headers --- brozzler/browser.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/brozzler/browser.py b/brozzler/browser.py index e743d65..ad5f852 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -154,6 +154,7 @@ class Browser: "exception writing cookie file at %s", cookie_location, exc_info=True) +''' local_state = {'browser':{'enabled_labs_experiments':['enable-brotli@2']}} local_state_location = os.path.join(data_dir, 'Local State') try: @@ -163,6 +164,7 @@ class Browser: self.logger.error( "exception writing local state file at %s", local_state_location, exc_info=True) +''' self._chrome_instance = Chrome( port=self.chrome_port, executable=self.chrome_exe, @@ -470,8 +472,11 @@ __brzl_compileOutlinks(window).join(' '); self.send_to_chrome(method="Debugger.enable") self.send_to_chrome(method="Runtime.enable") - if self.extra_headers: - self.send_to_chrome(method="Network.setExtraHTTPHeaders", params={"headers":self.extra_headers}) + headers = self.extra_headers or {} + headers['Accept-Encoding'] = 'gzip, deflate' + self.send_to_chrome( + method="Network.setExtraHTTPHeaders", + params={"headers":self.extra_headers}) if self.user_agent: self.send_to_chrome(method="Network.setUserAgentOverride", params={"userAgent": self.user_agent}) @@ -610,8 +615,7 @@ class Chrome: "--homepage=about:blank", "--disable-direct-npapi-requests", "--disable-web-security", "--disable-notifications", "--disable-extensions", - "--disable-save-password-bubble", - "--enable-sdch=0"] + "--disable-save-password-bubble"] if self.ignore_cert_errors: chrome_args.append("--ignore-certificate-errors") if self.proxy: From fd76c819abce18b507ae6ca5ca2581306a569c7b Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Fri, 30 Sep 2016 12:36:10 -0700 Subject: [PATCH 21/33] test identity encoding --- brozzler/browser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/brozzler/browser.py b/brozzler/browser.py index ad5f852..1fa1f4a 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -473,7 +473,7 @@ __brzl_compileOutlinks(window).join(' '); self.send_to_chrome(method="Runtime.enable") headers = self.extra_headers or {} - headers['Accept-Encoding'] = 'gzip, deflate' + headers['Accept-Encoding'] = 'identity' self.send_to_chrome( method="Network.setExtraHTTPHeaders", params={"headers":self.extra_headers}) From 025394516162058b2245617f9bccf652eba2e847 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Fri, 30 Sep 2016 14:08:00 -0700 Subject: [PATCH 22/33] just test gzip,deflate after all --- brozzler/browser.py | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/brozzler/browser.py b/brozzler/browser.py index 1fa1f4a..3ac90e3 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -154,18 +154,6 @@ class Browser: "exception writing cookie file at %s", cookie_location, exc_info=True) -''' - local_state = {'browser':{'enabled_labs_experiments':['enable-brotli@2']}} - local_state_location = os.path.join(data_dir, 'Local State') - try: - with open(local_state_location, 'w+') as f: - json.dump(local_state, f) - except OSError: - self.logger.error( - "exception writing local state file at %s", - local_state_location, exc_info=True) -''' - self._chrome_instance = Chrome( port=self.chrome_port, executable=self.chrome_exe, user_home_dir=self._work_dir.name, @@ -473,7 +461,7 @@ __brzl_compileOutlinks(window).join(' '); self.send_to_chrome(method="Runtime.enable") headers = self.extra_headers or {} - headers['Accept-Encoding'] = 'identity' + headers['Accept-Encoding'] = 'gzip, deflate' self.send_to_chrome( method="Network.setExtraHTTPHeaders", params={"headers":self.extra_headers}) From 8d0b5d4bfac8f48eea6cb5e4a59a5460102dedf5 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Fri, 30 Sep 2016 14:53:25 -0700 Subject: [PATCH 23/33] fixing mis-copy --- brozzler/browser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/brozzler/browser.py b/brozzler/browser.py index 3ac90e3..8a02753 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -464,7 +464,7 @@ __brzl_compileOutlinks(window).join(' '); headers['Accept-Encoding'] = 'gzip, deflate' self.send_to_chrome( method="Network.setExtraHTTPHeaders", - params={"headers":self.extra_headers}) + params={"headers":headers}) if self.user_agent: self.send_to_chrome(method="Network.setUserAgentOverride", params={"userAgent": self.user_agent}) From e884bee6c992c7e76e639dd03f39aaefaca0240b Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Wed, 12 Oct 2016 14:10:28 -0700 Subject: [PATCH 24/33] rm old WIP item --- brozzler/behaviors.d/multiclicks.js.template | 105 ------------------- 1 file changed, 105 deletions(-) delete mode 100644 brozzler/behaviors.d/multiclicks.js.template diff --git a/brozzler/behaviors.d/multiclicks.js.template b/brozzler/behaviors.d/multiclicks.js.template deleted file mode 100644 index 7a39469..0000000 --- a/brozzler/behaviors.d/multiclicks.js.template +++ /dev/null @@ -1,105 +0,0 @@ -/* - * brozzler/behaviors.d/multiclicks.js.template - click on each of several elements - * - * Copyright (C) 2014-2016 Internet Archive - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -var umbraAboveBelowOrOnScreen = function(e) { - var eTop = e.getBoundingClientRect().top; - if (eTop < window.scrollY) { - return -1; // above - } else if (eTop > window.scrollY + window.innerHeight) { - return 1; // below - } else { - return 0; // on screen - } -} - -var umbraState = {'idleSince':null}; -var umbraAlreadyClicked = {}; - -var umbraIntervalFunc = function() { - var clickedSomething = false; - var somethingLeftBelow = false; - var somethingLeftAbove = false; - var cssSelector = "${css_selector}"; - - var clickTargets = document.querySelectorAll(cssSelector); - - for (var i = 0; i < clickTargets.length; i++) { - targetID = clickTargets[i].id; - if (targetID === "") { - targetID = location.host + "-" + i; - } - if (!(targetID in umbraAlreadyClicked)) { - var where = umbraAboveBelowOrOnScreen(clickTargets[i]); - if (where === 0) { - var mouseOverEvent = document.createEvent('Events'); - mouseOverEvent.initEvent("mouseover", true, false); - clickTargets[i].dispatchEvent(mouseOverEvent); - clickTargets[i].click(); - clickedSomething = true; - umbraState.idleSince = null; - umbraAlreadyClicked[targetID] = true; - break; //break from clickTargets loop - - } else if (where > 0) { - somethingLeftBelow = true; - } else if (where < 0) { - somethingLeftAbove = true; - } - } - } - - if (!clickedSomething) { - if (somethingLeftAbove) { - // console.log("scrolling UP because everything on this screen has been clicked but we missed something above"); - window.scrollBy(0, -500); - umbraState.idleSince = null; - } else if (somethingLeftBelow) { - // console.log("scrolling because everything on this screen has been clicked but there's more below document.body.clientHeight=" - // + document.body.clientHeight); - window.scrollBy(0, 200); - umbraState.idleSince = null; - } else if (window.scrollY + window.innerHeight < document.documentElement.scrollHeight) { - window.scrollBy(0, 200); - umbraState.idleSince = null; - } else if (umbraState.idleSince == null) { - umbraState.idleSince = Date.now(); - } - } - - if (umbraState.idleSince == null) { - umbraState.idleSince = Date.now(); - } -} - -// If we haven't had anything to do (scrolled, clicked, etc) in this amount of -// time, then we consider ourselves finished with the page. -var UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC = 12; - -// Called from outside of this script. -var umbraBehaviorFinished = function() { - if (umbraState.idleSince != null) { - var idleTimeMs = Date.now() - umbraState.idleSince; - if (idleTimeMs / 1000 > UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC) { - clearInterval(umbraIntervalId); - return true; - } - } - return false; -} - -var umbraIntervalId = setInterval(umbraIntervalFunc, 5000); From 393f40d5ff76ab79bd238a6086c0c578583dfb48 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Wed, 12 Oct 2016 17:42:00 -0700 Subject: [PATCH 25/33] custom behavior for catalogue.noguchi.org --- brozzler/behaviors.d/noguchi.js.template | 119 +++++++++++++++++++++++ brozzler/behaviors.yaml | 14 +-- 2 files changed, 127 insertions(+), 6 deletions(-) create mode 100644 brozzler/behaviors.d/noguchi.js.template diff --git a/brozzler/behaviors.d/noguchi.js.template b/brozzler/behaviors.d/noguchi.js.template new file mode 100644 index 0000000..258df80 --- /dev/null +++ b/brozzler/behaviors.d/noguchi.js.template @@ -0,0 +1,119 @@ +/* + * brozzler/behaviors.d/noguchi.js - from ARTWORKS or EXHIBITIONS main pages, + * click through end + * + * Copyright (C) 2014-2016 Internet Archive + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +var umbraAboveBelowOrOnScreen = function(e) { + var eTop = e.getBoundingClientRect().top; + if (eTop < window.scrollY) { + return -1; // above + } else if (eTop > window.scrollY + window.innerHeight) { + return 1; // below + } else { + return 0; // on screen + } +} + +var UMBRA_N_USER_NAME = "${parameter_username}"; +var UMBRA_N_PASSWORD = "${parameter_password}"; + +var umbraState = {'idleSince':null}; +var umbraAlreadyClicked = {}; +var re = /(?:‹ Previous){0,1}   (page [\d]+/[\d]+)   (?:Next ›){0,1}/ + +var umbraIntervalFunc = function() { + var clickedSomething = false; + var somethingLeftBelow = false; + var somethingLeftAbove = false; + + var target = document.querySelectorAll("div.nav > a")[1]; + if (target) { + var where = umbraAboveBelowOrOnScreen(target); + if (where === 0) { + var mouseOverEvent = document.createEvent('Events'); + mouseOverEvent.initEvent("mouseover", true, false); + target.dispatchEvent(mouseOverEvent); + target.click(); + clickedSomething = true; + umbraState.idleSince = null; + target_page = re.exec(document.querySelector("div.nav").textContent) + console.log('clicked ' + target_page); + umbraAlreadyClicked[target_page] = true; + } else if (where > 0) { + somethingLeftBelow = true; + } else if (where < 0) { + somethingLeftAbove = true; + } + } + + if (!clickedSomething) { + if (somethingLeftAbove) { + // console.log("scrolling UP because everything on this screen has been clicked but we missed something above"); + window.scrollBy(0, -500); + umbraState.idleSince = null; + } else if (somethingLeftBelow) { + // console.log("scrolling because everything on this screen has been clicked but there's more below document.body.clientHeight=" + // + document.body.clientHeight); + window.scrollBy(0, 200); + umbraState.idleSince = null; + } else if (window.scrollY + window.innerHeight < document.documentElement.scrollHeight) { + window.scrollBy(0, 200); + umbraState.idleSince = null; + } else if (umbraState.idleSince == null) { + umbraState.idleSince = Date.now(); + } + } + + if (umbraState.idleSince == null) { + umbraState.idleSince = Date.now(); + } +} + +// If we haven't had anything to do (scrolled, clicked, etc) in this amount of +// time, then we consider ourselves finished with the page. +var UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC = 5; + +// Called from outside of this script. +var umbraBehaviorFinished = function() { + if (umbraState.idleSince != null) { + var idleTimeMs = Date.now() - umbraState.idleSince; + if (idleTimeMs / 1000 > UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC) { + clearInterval(umbraIntervalId); + return true; + } + } + return false; +} + +var umbraNLogin = function() { + login_inputs = document.querySelectorAll("#loginForm > form > div > input"); + var emailInput = login_inputs[0]; + var passwordInput = login_inputs[1]; + var loginButton = document.querySelector("div.form-submit"); + emailInput.value=UMBRA_N_USER_NAME; + passwordInput.value=UMBRA_N_PASSWORD; + loginButton.click(); +} + +if (document.getElementById("loginForm") == null || UMBRA_N_USER_NAME.indexOf("parameter")>0 || UMBRA_N_PASSWORD.indexOf("parameter")>0 ) {//check for unset parameters + console.log("missing #loginForm or login credentials; maybe already logged in for " + location.href); + var umbraIntervalId = setInterval(umbraIntervalFunc, 200); +} +else {//login + console.log("#loginForm and credentials found for " + location.href); + umbraNLogin(); +} diff --git a/brozzler/behaviors.yaml b/brozzler/behaviors.yaml index 764012c..90322e6 100644 --- a/brozzler/behaviors.yaml +++ b/brozzler/behaviors.yaml @@ -45,12 +45,14 @@ behaviors: url_regex: '^https?://(?:www\.)?instagram\.com/.*$' behavior_js: instagram.js request_idle_timeout_sec: 10 -# - # ARI-4930 test -# url_regex: '^https?://(?:www\.)?kansascityfed\.org/publications/research/er/archive/.*$' -# behavior_js_template: clickGetPDFs.js.template -# default_parameters: -# css_selector: li.years>a -# request_idle_timeout_sec: 10 + - + url_regex: '^https?://catalogue\.noguchi\.org/index.php/LoginReg/form$' + behavior_js_template: noguchi.js.template + request_idle_timeout_sec: 10 + - + url_regex: '^https?://catalogue\.noguchi\.org/index.php/Search/Index/search/.*/target/ca_.*$' + behavior_js_template: noguchi.js.template + request_idle_timeout_sec: 10 - url_regex: '^https?://(?:www\.)?huffingtonpost\.com/.*$' behavior_js_template: huffpostslides.js From 9062bee99d8722677f6164d8e85696d460356b96 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Wed, 12 Oct 2016 18:16:12 -0700 Subject: [PATCH 26/33] better seed selection --- brozzler/behaviors.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/brozzler/behaviors.yaml b/brozzler/behaviors.yaml index 90322e6..83d784d 100644 --- a/brozzler/behaviors.yaml +++ b/brozzler/behaviors.yaml @@ -46,7 +46,7 @@ behaviors: behavior_js: instagram.js request_idle_timeout_sec: 10 - - url_regex: '^https?://catalogue\.noguchi\.org/index.php/LoginReg/form$' + url_regex: '^https?://catalogue\.noguchi\.org/index.php$' behavior_js_template: noguchi.js.template request_idle_timeout_sec: 10 - From f55c12ffe27408a804fc3720dba9f1ac6b138237 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Thu, 13 Oct 2016 16:40:43 -0700 Subject: [PATCH 27/33] more better yaml, target check --- brozzler/behaviors.d/noguchi.js.template | 13 ++++++++++++- brozzler/behaviors.yaml | 4 ++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/brozzler/behaviors.d/noguchi.js.template b/brozzler/behaviors.d/noguchi.js.template index 258df80..2ece5c0 100644 --- a/brozzler/behaviors.d/noguchi.js.template +++ b/brozzler/behaviors.d/noguchi.js.template @@ -39,8 +39,19 @@ var umbraIntervalFunc = function() { var clickedSomething = false; var somethingLeftBelow = false; var somethingLeftAbove = false; + var target = null; + + var navlinks = document.querySelectorAll("div.nav > a"); + if (navlinks) { + if (navlinks.length > 1) { + target = navlinks[1]; + } else { + if navlinks[0].textContent.startsWith("Next") { + target = navlinks[0]; + } + } + } - var target = document.querySelectorAll("div.nav > a")[1]; if (target) { var where = umbraAboveBelowOrOnScreen(target); if (where === 0) { diff --git a/brozzler/behaviors.yaml b/brozzler/behaviors.yaml index 83d784d..ba47af9 100644 --- a/brozzler/behaviors.yaml +++ b/brozzler/behaviors.yaml @@ -49,6 +49,10 @@ behaviors: url_regex: '^https?://catalogue\.noguchi\.org/index.php$' behavior_js_template: noguchi.js.template request_idle_timeout_sec: 10 + - + url_regex: '^https?://catalogue\.noguchi\.org/index.php/LoginReg/form$' + behavior_js_template: noguchi.js.template + request_idle_timeout_sec: 10 - url_regex: '^https?://catalogue\.noguchi\.org/index.php/Search/Index/search/.*/target/ca_.*$' behavior_js_template: noguchi.js.template From eafd14f380991d756c6f8b1a499e2dc867f8fa98 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Thu, 13 Oct 2016 17:09:10 -0700 Subject: [PATCH 28/33] correct re --- brozzler/behaviors.d/noguchi.js.template | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/brozzler/behaviors.d/noguchi.js.template b/brozzler/behaviors.d/noguchi.js.template index 2ece5c0..a49c427 100644 --- a/brozzler/behaviors.d/noguchi.js.template +++ b/brozzler/behaviors.d/noguchi.js.template @@ -33,7 +33,7 @@ var UMBRA_N_PASSWORD = "${parameter_password}"; var umbraState = {'idleSince':null}; var umbraAlreadyClicked = {}; -var re = /(?:‹ Previous){0,1}   (page [\d]+/[\d]+)   (?:Next ›){0,1}/ +var re = /(?:Previous){0,1}   (page \d+\/\d+)   (?:Next){0,1}/; var umbraIntervalFunc = function() { var clickedSomething = false; From 21383e0964811e14b29e55d36315d28331121d9e Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Thu, 13 Oct 2016 17:29:49 -0700 Subject: [PATCH 29/33] correct jslint errors --- brozzler/behaviors.d/noguchi.js.template | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/brozzler/behaviors.d/noguchi.js.template b/brozzler/behaviors.d/noguchi.js.template index a49c427..acf5dc9 100644 --- a/brozzler/behaviors.d/noguchi.js.template +++ b/brozzler/behaviors.d/noguchi.js.template @@ -26,14 +26,14 @@ var umbraAboveBelowOrOnScreen = function(e) { } else { return 0; // on screen } -} +}; var UMBRA_N_USER_NAME = "${parameter_username}"; var UMBRA_N_PASSWORD = "${parameter_password}"; var umbraState = {'idleSince':null}; var umbraAlreadyClicked = {}; -var re = /(?:Previous){0,1}   (page \d+\/\d+)   (?:Next){0,1}/; +var re = /(?:Previous){0,1}\s+(page \d+\/\d+)\s+(?:Next){0,1}/; var umbraIntervalFunc = function() { var clickedSomething = false; @@ -46,7 +46,7 @@ var umbraIntervalFunc = function() { if (navlinks.length > 1) { target = navlinks[1]; } else { - if navlinks[0].textContent.startsWith("Next") { + if (navlinks[0].textContent.startsWith("Next")) { target = navlinks[0]; } } @@ -61,7 +61,7 @@ var umbraIntervalFunc = function() { target.click(); clickedSomething = true; umbraState.idleSince = null; - target_page = re.exec(document.querySelector("div.nav").textContent) + var target_page = re.exec(document.querySelector("div.nav").textContent); console.log('clicked ' + target_page); umbraAlreadyClicked[target_page] = true; } else if (where > 0) { @@ -111,7 +111,7 @@ var umbraBehaviorFinished = function() { } var umbraNLogin = function() { - login_inputs = document.querySelectorAll("#loginForm > form > div > input"); + var login_inputs = document.querySelectorAll("#loginForm > form > div > input"); var emailInput = login_inputs[0]; var passwordInput = login_inputs[1]; var loginButton = document.querySelector("div.form-submit"); From 4e4159717bffafc78bce24b1fb34026ba56b2663 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Thu, 13 Oct 2016 17:49:37 -0700 Subject: [PATCH 30/33] better targeting --- brozzler/behaviors.d/noguchi.js.template | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/brozzler/behaviors.d/noguchi.js.template b/brozzler/behaviors.d/noguchi.js.template index acf5dc9..f420177 100644 --- a/brozzler/behaviors.d/noguchi.js.template +++ b/brozzler/behaviors.d/noguchi.js.template @@ -42,13 +42,9 @@ var umbraIntervalFunc = function() { var target = null; var navlinks = document.querySelectorAll("div.nav > a"); - if (navlinks) { - if (navlinks.length > 1) { - target = navlinks[1]; - } else { - if (navlinks[0].textContent.startsWith("Next")) { - target = navlinks[0]; - } + for (link in navlinks) { + if (link.textContent.indexOf("Next") > 0) { + target = link; } } @@ -62,7 +58,7 @@ var umbraIntervalFunc = function() { clickedSomething = true; umbraState.idleSince = null; var target_page = re.exec(document.querySelector("div.nav").textContent); - console.log('clicked ' + target_page); + console.log('clicked ' + target_page[1]); umbraAlreadyClicked[target_page] = true; } else if (where > 0) { somethingLeftBelow = true; From 6196e32f42e7d682e44ad73207478da829b25502 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Thu, 13 Oct 2016 18:08:58 -0700 Subject: [PATCH 31/33] still better targetting --- brozzler/behaviors.d/noguchi.js.template | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/brozzler/behaviors.d/noguchi.js.template b/brozzler/behaviors.d/noguchi.js.template index f420177..b297084 100644 --- a/brozzler/behaviors.d/noguchi.js.template +++ b/brozzler/behaviors.d/noguchi.js.template @@ -40,11 +40,14 @@ var umbraIntervalFunc = function() { var somethingLeftBelow = false; var somethingLeftAbove = false; var target = null; + var linktext = ""; var navlinks = document.querySelectorAll("div.nav > a"); - for (link in navlinks) { - if (link.textContent.indexOf("Next") > 0) { - target = link; + + for (i = 0; i < navlinks.length; i++) { + linktext = navlinks[i].textContent; + if (linktext.indexOf("Next") > -1) { + target = navlinks[i]; } } From 2df1177752adbcfcee18b58e93268c316efefa77 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Thu, 13 Oct 2016 18:26:28 -0700 Subject: [PATCH 32/33] better interval --- brozzler/behaviors.d/noguchi.js.template | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/brozzler/behaviors.d/noguchi.js.template b/brozzler/behaviors.d/noguchi.js.template index b297084..f79597c 100644 --- a/brozzler/behaviors.d/noguchi.js.template +++ b/brozzler/behaviors.d/noguchi.js.template @@ -121,7 +121,7 @@ var umbraNLogin = function() { if (document.getElementById("loginForm") == null || UMBRA_N_USER_NAME.indexOf("parameter")>0 || UMBRA_N_PASSWORD.indexOf("parameter")>0 ) {//check for unset parameters console.log("missing #loginForm or login credentials; maybe already logged in for " + location.href); - var umbraIntervalId = setInterval(umbraIntervalFunc, 200); + var umbraIntervalId = setInterval(umbraIntervalFunc, 2000); } else {//login console.log("#loginForm and credentials found for " + location.href); From a6d6af207dbf2b2814eded18ae3641daad346993 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 18 Oct 2016 21:34:39 -0700 Subject: [PATCH 33/33] alternate login code, better crawl? --- brozzler/behaviors.d/noguchi.js.template | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/brozzler/behaviors.d/noguchi.js.template b/brozzler/behaviors.d/noguchi.js.template index f79597c..9196b38 100644 --- a/brozzler/behaviors.d/noguchi.js.template +++ b/brozzler/behaviors.d/noguchi.js.template @@ -109,6 +109,16 @@ var umbraBehaviorFinished = function() { return false; } +var umbraNLogin = function() { + var login_inputs = document.querySelectorAll("#loginForm > form > div > input"); + var emailInput = login_inputs[0]; + var passwordInput = login_inputs[1]; + emailInput.value=UMBRA_N_USER_NAME; + passwordInput.value=UMBRA_N_PASSWORD; + document.forms.login.submit(); +} + +/* var umbraNLogin = function() { var login_inputs = document.querySelectorAll("#loginForm > form > div > input"); var emailInput = login_inputs[0]; @@ -118,6 +128,7 @@ var umbraNLogin = function() { passwordInput.value=UMBRA_N_PASSWORD; loginButton.click(); } +*/ if (document.getElementById("loginForm") == null || UMBRA_N_USER_NAME.indexOf("parameter")>0 || UMBRA_N_PASSWORD.indexOf("parameter")>0 ) {//check for unset parameters console.log("missing #loginForm or login credentials; maybe already logged in for " + location.href);