diff --git a/brozzler/behaviors.yaml b/brozzler/behaviors.yaml index e0cd199..46dccc1 100644 --- a/brozzler/behaviors.yaml +++ b/brozzler/behaviors.yaml @@ -1,7 +1,7 @@ # # brozzler/behaviors.yaml - behavior configuration # -# Copyright (C) 2014-2018 Internet Archive +# Copyright (C) 2014-2019 Internet Archive # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -88,11 +88,10 @@ closeSelector: .pmf-artist-modal__close-btn - url_regex: '^https?://(?:www\.)?brooklynmuseum\.org/exhibitions/.*$' - behavior_js_template: simpleclicks.js.j2 + behavior_js_template: umbraBehavior.js.j2 default_parameters: - click_css_selector: img.img-responsive - click_until_hard_timeout: False - request_idle_timeout_sec: 10 + actions: + - selector: img.img-responsive - # https://webarchive.jira.com/browse/ARI-5517 url_regex: '^https?://(?:www\.)?thejewishnews.com/.*$' behavior_js_template: umbraBehavior.js.j2 @@ -101,17 +100,10 @@ - selector: a#get_more - # acalog https://webarchive.jira.com/browse/ARI-3775 url_regex: '^https?://.*[?&]catoid=[^?]*$' - behavior_js_template: simpleclicks.js.j2 + behavior_js_template: umbraBehavior.js.j2 default_parameters: - click_css_selector: a[onclick] - click_until_hard_timeout: False - request_idle_timeout_sec: 10 -- # https://webarchive.jira.com/browse/ARI-5366 - url_regex: '^https?://(?:(ici|beta)\.)radio-canada\.ca/.*$' - behavior_js_template: simpleclicks.js.j2 - default_parameters: - click_css_selector: .bigBtnPlay - click_until_hard_timeout: False + actions: + - selector: a[onclick] - # https://webarchive.jira.com/browse/ARI-5294 url_regex: '^https?://citymedfordwi\.civicweb\.net/.*$' behavior_js_template: umbraBehavior.js.j2 @@ -120,46 +112,46 @@ - selector: div.meeting-document-type-buttons button.button-small - # https://webarchive.jira.com/browse/ARI-5409 url_regex: '^https?://(?:www\.)?tuebingen.de/.*$' - behavior_js_template: simpleclicks.js.j2 + behavior_js_template: umbraBehavior.js.j2 default_parameters: - click_css_selector: a.kl - click_until_hard_timeout: False - request_idle_timeout_sec: 10 + actions: + - selector: a.kl - # https://webarchive.jira.com/browse/ARI-3956 url_regex: '^https?://(?:www\.)?usask.ca/.*$' - behavior_js_template: simpleclicks.js.j2 + behavior_js_template: umbraBehavior.js.j2 default_parameters: - click_css_selector: a[id='feature-next'] - click_until_hard_timeout: False - request_idle_timeout_sec: 10 + actions: + - selector: a[id='feature-next'] - # https://webarchive.jira.com/browse/AITFIVE-451 url_regex: '^https?://(?:www\.)?soundcloud.com/.*$' - behavior_js_template: simpleclicks.js.j2 + behavior_js_template: umbraBehavior.js.j2 default_parameters: - click_css_selector: button.sc-button-play, .playButton, div.compactTrackListItem - click_until_hard_timeout: False - request_idle_timeout_sec: 10 + actions: + - selector: button.sc-button-play, .playButton, div.compactTrackListItem - # https://webarchive.jira.com/browse/AITFIVE-463 url_regex: '^https?://(?:www\.)?christophercerrone.com/.*$' - behavior_js_template: simpleclicks.js.j2 + behavior_js_template: umbraBehavior.js.j2 default_parameters: - click_css_selector: button.playButton.medium - click_until_hard_timeout: False - request_idle_timeout_sec: 10 + actions: + - selector: button.playButton.medium - # https://webarchive.jira.com/browse/ARI-4690 url_regex: '^https?://(?:www\.)?youtube.com/.*$' - behavior_js_template: simpleclicks.js.j2 + behavior_js_template: umbraBehavior.js.j2 default_parameters: - click_css_selector: span.load-more-text - click_until_hard_timeout: False - request_idle_timeout_sec: 10 + actions: + - selector: span.load-more-text +- # https://webarchive.jira.com/browse/ARI-5453 + url_regex: '^https?://.*\.wixsite.com/.*$' + behavior_js_template: umbraBehavior.js.j2 + default_parameters: + actions: + - selector: .ddm1repeaterButtonlabel - # https://webarchive.jira.com/browse/ARI-4725 url_regex: '^https?://(?:www\.)?moma.org/.*$' - behavior_js_template: simpleclicks.js.j2 + behavior_js_template: umbraBehavior.js.j2 default_parameters: - click_css_selector: button[data-more-results-bottom-button] - click_until_hard_timeout: True - request_idle_timeout_sec: 10 + actions: + - selector: button[data-more-results-bottom-button] - # https://webarchive.jira.com/browse/ARI-4692 url_regex: '^https?://(?:www\.)?fec.gov/data/.*$' behavior_js_template: fec_gov.js @@ -172,67 +164,35 @@ do: mouseover - # https://webarchive.jira.com/browse/ARI-5259 url_regex: '^https?://blog\.sina\.com\.cn/.*$' - behavior_js_template: simpleclicks.js.j2 + behavior_js_template: umbraBehavior.js.j2 default_parameters: - click_css_selector: li.SG_pgnext a - click_until_hard_timeout: False - request_idle_timeout_sec: 10 -- # https://webarchive.jira.com/browse/ARI-5334 - url_regex: '^https?://(?:www\.)?google\.com/search.*$' - behavior_js_template: simpleclicks.js.j2 - default_parameters: - click_css_selector: a#pnnext - click_until_hard_timeout: True - request_idle_timeout_sec: 10 -- # https://webarchive.jira.com/browse/ARI-5259 - url_regex: '^https?://blog\.sina\.com\.cn/.*$' - behavior_js_template: simpleclicks.js.j2 - default_parameters: - click_css_selector: li.SG_pgnext a - click_until_hard_timeout: False - request_idle_timeout_sec: 10 + actions: + - selector: li.SG_pgnext a - # https://webarchive.jira.com/browse/ARI-5313 url_regex: '^https?://.*\.ky\.gov/.*$' - behavior_js_template: mouseovers.js.j2 + behavior_js_template: umbraBehavior.js.j2 default_parameters: - mouseover_css_selector: .zz1_AgencyListingMenu_1 - mouseover_until_hard_timeout: False - request_idle_timeout_sec: 10 -- # https://webarchive.jira.com/browse/ARI-5433 - url_regex: '^https?://(?:www\.)?vermontcountrystore\.com/.*$' - behavior_js_template: simpleclicks.js.j2 - default_parameters: - click_css_selector: .navonetitle a - click_until_hard_timeout: False - request_idle_timeout_sec: 10 + actions: + - selector: .zz1_AgencyListingMenu_1 + do: mouseover - # https://webarchive.jira.com/browse/ARI-5384 url_regex: '^https?://issuu\.com/.*$' - behavior_js_template: simpleclicks.js.j2 + behavior_js_template: umbraBehavior.js.j2 default_parameters: - click_css_selector: button#ird3-button-next - click_until_hard_timeout: True - request_idle_timeout_sec: 10 + actions: + - selector: button#ird3-button-next - # https://webarchive.jira.com/browse/ARI-5241 url_regex: '^https?://(?:www\.)?colonialart\.org/.*$' - behavior_js_template: simpleclicks.js.j2 + behavior_js_template: umbraBehavior.js.j2 default_parameters: - click_css_selector: img.link-overlay - click_until_hard_timeout: False - request_idle_timeout_sec: 10 + actions: + - selector: img.link-overlay - # https://webarchive.jira.com/browse/ARI-4960 url_regex: '^https?://(?:www\.)?fortstjames.ca/community-events-calendar/$' - behavior_js_template: simpleclicks.js.j2 + behavior_js_template: umbraBehavior.js.j2 default_parameters: - click_css_selector: img#navForward1 - click_until_hard_timeout: True - request_idle_timeout_sec: 10 -- # https://webarchive.jira.com/browse/ARI-5210 - url_regex: '^https?://(?:www\.)?ssab.gov/Our-Work$' - behavior_js_template: simpleclicks.js.j2 - default_parameters: - click_css_selector: input[type=button][value=Next] - click_until_hard_timeout: False - request_idle_timeout_sec: 10 + actions: + - selector: img#navForward1 - # default fallback behavior url_regex: '^.*$' behavior_js_template: umbraBehavior.js.j2 diff --git a/brozzler/js-templates/mouseovers.js.j2 b/brozzler/js-templates/mouseovers.js.j2 deleted file mode 100644 index 8521387..0000000 --- a/brozzler/js-templates/mouseovers.js.j2 +++ /dev/null @@ -1,133 +0,0 @@ -/* - * brozzler/behaviors.d/mouseovers.js.in - mouseovers behavior template, - * mouseovers on elements matching templatized css selector - * - * Copyright (C) 2014-2016 Internet Archive - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -var umbraBehavior = { - IDLE_TIMEOUT_SEC : 10, - idleSince : null, - alreadyMouseovered : {}, - - intervalFunc : function() { - var mouseoveredSomething = false; - var somethingLeftBelow = false; - var somethingLeftAbove = false; - var cssSelector = {{mouseover_css_selector|json}}; - var mouseoverUntilTimeout = {{mouseover_until_hard_timeout|json}}; - - var iframes = document.querySelectorAll("iframe"); - var documents = Array(iframes.length + 1); - documents[0] = document; - - for (var i = 0; i < iframes.length; i++) { - documents[i+1] = iframes[i].contentWindow.document; - } - - for (var j = 0; j < documents.length; j++) { - - var mouseoverTargets = documents[j].querySelectorAll(cssSelector); - - for ( var i = 0; i < mouseoverTargets.length; i++) { - if (mouseoverTargets[i].umbraMouseovered && !mouseoverUntilTimeout) { - continue; - } - - var where = this.aboveBelowOrOnScreen(mouseoverTargets[i]); - - if (where == 0) { - console.log("mouseovering on " + mouseoverTargets[i].outerHTML); - // do mouse over event on mouseover target - // since some urls are requsted only on - // this event - see - // https://webarchive.jira.com/browse/AITFIVE-451 - var mouseOverEvent = document.createEvent('Events'); - mouseOverEvent.initEvent("mouseover",true, false); - mouseoverTargets[i].dispatchEvent(mouseOverEvent); - mouseoveredSomething = true; - this.idleSince = null; - mouseoverTargets[i].umbraMouseovered = true; - - break; //break from mouseoverTargets loop, but not from iframe loop - } else if (where > 0) { - somethingLeftBelow = true; - } else if (where < 0) { - somethingLeftAbove = true; - } - } - } - - if (!mouseoveredSomething) { - if (somethingLeftAbove) { - // console.log("scrolling UP because everything on this screen has been mouseovered but we missed something above"); - window.scrollBy(0, -500); - this.idleSince = null; - } else if (somethingLeftBelow) { - // console.log("scrolling because everything on this screen has been mouseovered but there's more below document.body.clientHeight=" - // + document.body.clientHeight); - window.scrollBy(0, 200); - this.idleSince = null; - } else if (window.scrollY + window.innerHeight < document.documentElement.scrollHeight) { - // console.log("scrolling because we're not to the bottom yet document.body.clientHeight=" - // + document.body.clientHeight); - window.scrollBy(0, 200); - this.idleSince = null; - } else if (this.idleSince == null) { - this.idleSince = Date.now(); - } - } - - if (!this.idleSince) { - this.idleSince = Date.now(); - } - }, - - start : function() { - var that = this; - this.intervalId = setInterval(function() { - that.intervalFunc() - }, 250); - }, - - isFinished : function() { - if (this.idleSince != null) { - var idleTimeMs = Date.now() - this.idleSince; - if (idleTimeMs / 1000 > this.IDLE_TIMEOUT_SEC) { - clearInterval(this.intervalId); - return true; - } - } - return false; - }, - - aboveBelowOrOnScreen : function(e) { - var eTop = e.getBoundingClientRect().top; - if (eTop < window.scrollY) { - return -1; // above - } else if (eTop > window.scrollY + window.innerHeight) { - return 1; // below - } else { - return 0; // on screen - } - }, -}; - -// Called from outside of this script. -var umbraBehaviorFinished = function() { - return umbraBehavior.isFinished() -}; - -umbraBehavior.start(); diff --git a/brozzler/js-templates/simpleclicks.js.j2 b/brozzler/js-templates/simpleclicks.js.j2 deleted file mode 100644 index 14652b8..0000000 --- a/brozzler/js-templates/simpleclicks.js.j2 +++ /dev/null @@ -1,142 +0,0 @@ -/* - * brozzler/behaviors.d/simpleclicks.js.in - simpleclicks behavior template, - * clicks on elements matching templatized css selector - * - * Copyright (C) 2014-2016 Internet Archive - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -var umbraBehavior = { - IDLE_TIMEOUT_SEC : 10, - idleSince : null, - alreadyClicked : {}, - - // https://github.com/jquery/jquery/blob/master/src/css/hiddenVisibleSelectors.js - // n.b. returns true for elements with visibility:hidden, which occupy - // screen real estate but are not visible, or clickable with the ui - isVisible : function(elem) { - return !!(elem.offsetWidth || elem.offsetHeight || elem.getClientRects().length); - }, - - intervalFunc : function() { - var clickedSomething = false; - var somethingLeftBelow = false; - var somethingLeftAbove = false; - var cssSelector = {{click_css_selector|json}}; - var clickUntilTimeout = {{click_until_hard_timeout|json}}; - - var iframes = document.querySelectorAll("iframe"); - var documents = Array(iframes.length + 1); - documents[0] = document; - - for (var i = 0; i < iframes.length; i++) { - documents[i+1] = iframes[i].contentWindow.document; - } - - for (var j = 0; j < documents.length; j++) { - var clickTargets = documents[j].querySelectorAll(cssSelector); - for ( var i = 0; i < clickTargets.length; i++) { - if (!this.isVisible(clickTargets[i])) { - continue; - } - if (clickTargets[i].umbraClicked && !clickUntilTimeout) { - continue; - } - - var where = this.aboveBelowOrOnScreen(clickTargets[i]); - - if (where == 0) { - console.log("clicking on " + clickTargets[i].outerHTML); - // do mouse over event on click target - // since some urls are requsted only on - // this event - see - // https://webarchive.jira.com/browse/AITFIVE-451 - var mouseOverEvent = document.createEvent('Events'); - mouseOverEvent.initEvent("mouseover",true, false); - clickTargets[i].dispatchEvent(mouseOverEvent); - clickTargets[i].click(); - clickedSomething = true; - this.idleSince = null; - clickTargets[i].umbraClicked = true; - - break; //break from clickTargets loop, but not from iframe loop - } else if (where > 0) { - somethingLeftBelow = true; - } else if (where < 0) { - somethingLeftAbove = true; - } - } - } - - if (!clickedSomething) { - if (somethingLeftAbove) { - // console.log("scrolling UP because everything on this screen has been clicked but we missed something above"); - window.scrollBy(0, -500); - this.idleSince = null; - } else if (somethingLeftBelow) { - // console.log("scrolling because everything on this screen has been clicked but there's more below document.body.clientHeight=" - // + document.body.clientHeight); - window.scrollBy(0, 200); - this.idleSince = null; - } else if (window.scrollY + window.innerHeight < document.documentElement.scrollHeight) { - // console.log("scrolling because we're not to the bottom yet document.body.clientHeight=" - // + document.body.clientHeight); - window.scrollBy(0, 200); - this.idleSince = null; - } else if (this.idleSince == null) { - this.idleSince = Date.now(); - } - } - - if (!this.idleSince) { - this.idleSince = Date.now(); - } - }, - - start : function() { - var that = this; - this.intervalId = setInterval(function() { - that.intervalFunc() - }, 250); - }, - - isFinished : function() { - if (this.idleSince != null) { - var idleTimeMs = Date.now() - this.idleSince; - if (idleTimeMs / 1000 > this.IDLE_TIMEOUT_SEC) { - clearInterval(this.intervalId); - return true; - } - } - return false; - }, - - aboveBelowOrOnScreen : function(e) { - var eTop = e.getBoundingClientRect().top; - if (eTop < window.scrollY) { - return -1; // above - } else if (eTop > window.scrollY + window.innerHeight) { - return 1; // below - } else { - return 0; // on screen - } - }, -}; - -// Called from outside of this script. -var umbraBehaviorFinished = function() { - return umbraBehavior.isFinished() -}; - -umbraBehavior.start(); diff --git a/brozzler/model.py b/brozzler/model.py index e10a712..9832a40 100644 --- a/brozzler/model.py +++ b/brozzler/model.py @@ -242,8 +242,19 @@ class Site(doublethink.Document, ElapsedMixIn): self.scope["accepts"].append({"ssurt": ssurt}) def note_seed_redirect(self, url): + canon_seed_redirect = brozzler.site_surt_canon(url) + canon_seed = brozzler.site_surt_canon(self.seed) + + # if http://foo.com/ redirects to https://foo.com/a/b/c let's also + # put all of https://foo.com/ in scope + if (canon_seed_redirect.authority == canon_seed.authority + and canon_seed_redirect.scheme != canon_seed.scheme): + canon_seed.scheme = canon_seed_redirect.scheme + self._accept_ssurt_if_not_redundant( + canon_seed.ssurt().decode('ascii')) + self._accept_ssurt_if_not_redundant( - brozzler.site_surt_canon(url).ssurt().decode('ascii')) + canon_seed_redirect.ssurt().decode('ascii')) def extra_headers(self): hdrs = {} diff --git a/tests/test_units.py b/tests/test_units.py index 4a91e0c..1d62bc6 100644 --- a/tests/test_units.py +++ b/tests/test_units.py @@ -420,3 +420,22 @@ def test_needs_browsing(): assert not brozzler.worker.BrozzlerWorker._needs_browsing( None, page, spy.fetches) +def test_seed_redirect(): + site = brozzler.Site(None, {'seed': 'http://foo.com/'}) + site.note_seed_redirect('https://foo.com/a/b/c') + assert site.scope == {'accepts': [ + {'ssurt': 'com,foo,//http:/',}, + {'ssurt': 'com,foo,//https:/',}]} + + site = brozzler.Site(None, {'seed': 'https://foo.com/'}) + site.note_seed_redirect('http://foo.com/a/b/c') + assert site.scope == {'accepts': [ + {'ssurt': 'com,foo,//https:/',}, + {'ssurt': 'com,foo,//http:/',}]} + + site = brozzler.Site(None, {'seed': 'http://foo.com/'}) + site.note_seed_redirect('https://bar.com/a/b/c') + assert site.scope == {'accepts': [ + {'ssurt': 'com,foo,//http:/',}, + {'ssurt': 'com,bar,//https:/a/b/c',}]} +