From b577fe3c365145f7faa2934f09aeed80a10703e9 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 13 Dec 2018 15:45:35 -0800 Subject: [PATCH 1/4] log browser uncaught exceptions at debug level didn't realize these weren't showing up as console messages --- brozzler/browser.py | 2 ++ setup.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/brozzler/browser.py b/brozzler/browser.py index 972a764..e4372e0 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -256,6 +256,8 @@ class WebsockReceiverThread(threading.Thread): self.logger.debug( 'console.%s %s', message['params']['message']['level'], message['params']['message']['text']) + elif message['method'] == 'Runtime.exceptionThrown': + self.logger.debug('uncaught exception: %s', message) elif message['method'] == 'Page.javascriptDialogOpening': self._javascript_dialog_opening(message) elif (message['method'] == 'Network.loadingFailed' diff --git a/setup.py b/setup.py index bc4b868..40aba7d 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.5.dev315', + version='1.5.dev316', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', From 15870e60100f4c49a9f0402b84f3b2542164d9a7 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 13 Dec 2018 15:49:38 -0800 Subject: [PATCH 2/4] avoid IndexError in some cases we receive this event from the browser: {"method":"ServiceWorker.workerVersionUpdated","params":{"versions":[]}} --- brozzler/worker.py | 12 +++++++----- setup.py | 2 +- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/brozzler/worker.py b/brozzler/worker.py index 3cfa9fc..fba83aa 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -278,11 +278,13 @@ class BrozzlerWorker: def _on_service_worker_version_updated(chrome_msg): # https://github.com/internetarchive/brozzler/issues/140 self.logger.trace('%r', chrome_msg) 
- url = chrome_msg.get('params', {}).get('versions', [{}])[0].get('scriptURL') - if url not in sw_fetched: - self.logger.info('fetching service worker script %s', url) - self._fetch_url(site, url) - sw_fetched.add(url) + if chrome_msg.get('params', {}).get('versions'): + url = chrome_msg.get('params', {}).get('versions')[0]\ + .get('scriptURL') + if url and url not in sw_fetched: + self.logger.info('fetching service worker script %s', url) + self._fetch_url(site, url) + sw_fetched.add(url) if not browser.is_running(): browser.start( diff --git a/setup.py b/setup.py index 40aba7d..1fb7ff8 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.5.dev316', + version='1.5.dev317', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', From 6c21a9f77319b9dbe00e230f3aa7ee0b702bd019 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Sun, 9 Dec 2018 14:25:59 -0800 Subject: [PATCH 3/4] iframe option and other instagram updates --- brozzler/behaviors.yaml | 8 +- brozzler/js-templates/umbraBehavior18q4.js.j2 | 177 ++++++++++++++++++ 2 files changed, 182 insertions(+), 3 deletions(-) create mode 100644 brozzler/js-templates/umbraBehavior18q4.js.j2 diff --git a/brozzler/behaviors.yaml b/brozzler/behaviors.yaml index 28cceba..6c46c7a 100644 --- a/brozzler/behaviors.yaml +++ b/brozzler/behaviors.yaml @@ -23,11 +23,13 @@ request_idle_timeout_sec: 30 - url_regex: '^https?://(?:www\.)?instagram\.com/.*$' - behavior_js_template: umbraBehavior.js.j2 + behavior_js_template: umbraBehavior18q4.js.j2 default_parameters: + interval: 500 + iframes: false actions: - - selector: a.coreSpriteDismissLarge - - selector: a>div[role='button'] + - selector: button.coreSpriteDismissLarge + - selector: 'a>.eLAPa>.KL4Bh' firstMatchOnly: true - selector: a.coreSpriteRightPaginationArrow repeatSameElement: true diff --git 
a/brozzler/js-templates/umbraBehavior18q4.js.j2 b/brozzler/js-templates/umbraBehavior18q4.js.j2 new file mode 100644 index 0000000..100b5b7 --- /dev/null +++ b/brozzler/js-templates/umbraBehavior18q4.js.j2 @@ -0,0 +1,177 @@ +/* + * brozzler/js-templates/umbrabehavior.js.j2 - an umbra/brozzler behavior class + * + * Copyright (C) 2017-2018 Internet Archive + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +class UmbraBehavior { + + constructor(actions) { + this.IDLE_TIMEOUT_SEC = 10; + this.actions = actions; + this.alreadyDone = []; + this.idleSince = null; + this.intervalId = null; + this.intervalTimeMs = {{interval or 300}}; + this.doIframes = {{iframes or true}}; + this.index = 0; + } + + simpleIntervalFunc() { + // should match older default and simpleclicks behavior, and more + var k = this.index; + var selector = this.actions[k].selector; + var repeatSameElement = this.actions[k].repeatSameElement ? this.actions[k].repeatSameElement : false; + var firstMatchOnly = this.actions[k].firstMatchOnly ? this.actions[k].firstMatchOnly : false; + var action = this.actions[k].do ? this.actions[k].do : 'click'; + var closeSelector = this.actions[k].closeSelector ? 
this.actions[k].closeSelector : null; + var didSomething = false; + var somethingLeftAbove = false; + var somethingLeftBelow = false; + + var documents = []; + documents[0] = document; + if (this.doIframes) { + var iframes = document.querySelectorAll("iframe"); + var iframesLength = iframes.length; + for (var i = 0; i < iframesLength; i++) { + documents.push(iframes[i].contentWindow.document); + } + } + var documentsLength = documents.length; + for (var j = 0; j < documentsLength; j++) { + if (closeSelector) { + var closeTargets = documents[j].querySelectorAll(closeSelector); + if ((closeTargets.length > 0) && + (this.alreadyDone.indexOf(closeTargets[0]) === -1) && + (this.isVisible(closeTargets[0]))) { + doTarget(closeTargets[0], 'click'); + } + } + if (firstMatchOnly) { + var doTargets = [ documents[j].querySelector(selector) ]; + } else { + var doTargets = documents[j].querySelectorAll(selector); + } + var doTargetsLength = doTargets.length; + if (!(doTargetsLength > 0)) { + continue; + } + for ( var i = 0; i < doTargetsLength; i++) { + if (!repeatSameElement && this.alreadyDone.indexOf(doTargets[i]) > -1) { + continue; + } + if (!this.isVisible(doTargets[i])) { + continue; + } + var where = this.aboveBelowOrOnScreen(doTargets[i]); + if (where == 0) { + this.doTarget(doTargets[i], action); + didSomething = true; + break; + } else if (where > 0) { + somethingLeftBelow = true; + } else if (where < 0) { + somethingLeftAbove = true; + } + } + } + + if (!didSomething) { + if (somethingLeftAbove) { + window.scrollBy(0, -500); + this.idleSince = null; + } else if (somethingLeftBelow || ( (window.scrollY + window.innerHeight) < document.documentElement.scrollHeight)) { + window.scrollBy(0, 200); + this.idleSince = null; + } else if (this.idleSince == null) { + this.idleSince = Date.now(); + } + } + + if (!this.idleSince) { + this.idleSince = Date.now(); + } else { + var idleTimeMs = Date.now() - this.idleSince; + if ((idleTimeMs / 1000) > (this.IDLE_TIMEOUT_SEC - 1) && 
(this.index < (this.actions.length - 1))) { + console.log("ready for next action"); + this.index += 1; + this.idleSince = null; + window.scroll(0,0); + } + } + } + + aboveBelowOrOnScreen(elem) { + var eTop = elem.getBoundingClientRect().top; + if (eTop < window.scrollY) { + return -1; // above + } else if (eTop > window.scrollY + window.innerHeight) { + return 1; // below + } else { + return 0; // on screen + } + } + + isVisible(elem) { + return elem && !!(elem.offsetWidth || elem.offsetHeight || elem.getClientRects().length); + } + + doTarget(target, action) { + // console.log("doing " + action + target.outerHTML); + // do mouse over event on target + // since some urls are requsted only on + // this event - see + // https://webarchive.jira.com/browse/AITFIVE-451 + var mouseOverEvent = document.createEvent("Events"); + mouseOverEvent.initEvent("mouseover", true, false); + target.dispatchEvent(mouseOverEvent); + + if (action == "click") { + target.click(); + } // add new do's here! + + this.alreadyDone.push(target); + this.idleSince = null; + } + + start() { + var that = this; + this.intervalId = setInterval(function() { + that.simpleIntervalFunc() + }, this.intervalTimeMs); + } + + isFinished() { + if (this.idleSince != null) { + var idleTimeMs = Date.now() - this.idleSince; + if (idleTimeMs / 1000 > this.IDLE_TIMEOUT_SEC) { + clearInterval(this.intervalId); + return true; + } + } + return false; + } +} + +var umbraBehavior = new UmbraBehavior( {{actions|json}} ); + +// Called from outside of this script. 
+var umbraBehaviorFinished = function() { + return umbraBehavior.isFinished(); +}; + +umbraBehavior.start(); From 425d44bf4ac8eec097a0ce490e63dea94971d7de Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Thu, 13 Dec 2018 15:13:03 -0800 Subject: [PATCH 4/4] updates for jinja2 --- brozzler/behaviors.yaml | 2 +- brozzler/js-templates/umbraBehavior18q4.js.j2 | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/brozzler/behaviors.yaml b/brozzler/behaviors.yaml index 6c46c7a..e29fab6 100644 --- a/brozzler/behaviors.yaml +++ b/brozzler/behaviors.yaml @@ -26,7 +26,7 @@ behavior_js_template: umbraBehavior18q4.js.j2 default_parameters: interval: 500 - iframes: false + skip_iframes: true actions: - selector: button.coreSpriteDismissLarge - selector: 'a>.eLAPa>.KL4Bh' diff --git a/brozzler/js-templates/umbraBehavior18q4.js.j2 b/brozzler/js-templates/umbraBehavior18q4.js.j2 index 100b5b7..7c24a13 100644 --- a/brozzler/js-templates/umbraBehavior18q4.js.j2 +++ b/brozzler/js-templates/umbraBehavior18q4.js.j2 @@ -26,7 +26,11 @@ class UmbraBehavior { this.idleSince = null; this.intervalId = null; this.intervalTimeMs = {{interval or 300}}; - this.doIframes = {{iframes or true}}; + {% if skip_iframes %} + this.skipIframes = true; + {% else %} + this.skipIframes = false; + {% endif %} this.index = 0; } @@ -44,7 +48,7 @@ class UmbraBehavior { var documents = []; documents[0] = document; - if (this.doIframes) { + if (!(this.skipIframes)) { var iframes = document.querySelectorAll("iframe"); var iframesLength = iframes.length; for (var i = 0; i < iframesLength; i++) {