From 6bd4fd6532cd6cddee6c8f0e2d9141187dc7e108 Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Wed, 26 Jun 2019 21:19:35 +0000 Subject: [PATCH 1/3] Block AMP analytics JS script AMP analytics is part of Google analytics. We need to block it for similar reasons. AMP analytics reference: https://developers.google.com/analytics/devguides/collection/amp-analytics/ --- brozzler/browser.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/brozzler/browser.py b/brozzler/browser.py index e4372e0..da8ab95 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -354,11 +354,12 @@ class Browser: self.send_to_chrome(method='ServiceWorker.enable') self.send_to_chrome(method='ServiceWorker.setForceUpdateOnPageLoad') - # disable google analytics + # disable google analytics and amp analytics self.send_to_chrome( method='Network.setBlockedURLs', params={'urls': ['*google-analytics.com/analytics.js', - '*google-analytics.com/ga.js']}) + '*google-analytics.com/ga.js', + '*cdn.ampproject.org/*/amp-analytics*.js']}) def stop(self): ''' From 94cd6cacb6bed7a440a7a1c27fdd42eaf9f1e9a3 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 18 Jul 2019 11:07:27 -0700 Subject: [PATCH 2/3] bump version after merge --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 1181bea..27eb4b6 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.5.6', + version='1.5.7', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', From 14e3d56cd28343c24049d9833c547e74d8000b5c Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 20 Aug 2019 13:34:02 -0700 Subject: [PATCH 3/3] add popup urls as outlinks --- brozzler/js-templates/extract-outlinks.js | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/brozzler/js-templates/extract-outlinks.js b/brozzler/js-templates/extract-outlinks.js index e3a04ca..4d38962 100644 --- a/brozzler/js-templates/extract-outlinks.js +++ b/brozzler/js-templates/extract-outlinks.js @@ -8,6 +8,14 @@ var __brzl_compileOutlinks = function(frame) { if (frame && frame.document) { outlinks = Array.prototype.slice.call( frame.document.querySelectorAll('a[href], area[href]')); + popups = Array.prototype.slice.call( + frame.document.querySelectorAll('a[onclick]')); + if (popups && popups.length > 0) { + for (var p=0; p < popups.length; p++) { + popups[p] = popups[p].onclick.toString().split("'")[1]; + } + outlinks = outlinks.concat(popups); + } for (var i = 0; i < frame.frames.length; i++) { if (frame.frames[i] && !__brzl_framesDone.has(frame.frames[i])) { outlinks = outlinks.concat(