From 1073431f76f7d427c1af7b4acaa3b214d28e7f82 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Mon, 29 Oct 2018 17:42:25 -0700 Subject: [PATCH] handle exceptions extracting links like this one: Uncaught DOMException: Blocked a frame with origin "https://www.youtube.com" from accessing a cross-origin frame. at __brzl_compileOutlinks (:4:24) at __brzl_compileOutlinks (:10:29) at :16:1 __brzl_compileOutlinks @ VM194:4 __brzl_compileOutlinks @ VM194:10 not sure exactly why this happens but we just have to handle it --- brozzler/js-templates/extract-outlinks.js | 18 ++++++++++++------ setup.py | 2 +- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/brozzler/js-templates/extract-outlinks.js b/brozzler/js-templates/extract-outlinks.js index e9e8a47..e3a04ca 100644 --- a/brozzler/js-templates/extract-outlinks.js +++ b/brozzler/js-templates/extract-outlinks.js @@ -3,16 +3,22 @@ var __brzl_framesDone = new Set(); var __brzl_compileOutlinks = function(frame) { __brzl_framesDone.add(frame); - if (frame && frame.document) { - var outlinks = Array.prototype.slice.call( + var outlinks = []; + try { + if (frame && frame.document) { + outlinks = Array.prototype.slice.call( frame.document.querySelectorAll('a[href], area[href]')); - for (var i = 0; i < frame.frames.length; i++) { - if (frame.frames[i] && !__brzl_framesDone.has(frame.frames[i])) { - outlinks = outlinks.concat( - __brzl_compileOutlinks(frame.frames[i])); + for (var i = 0; i < frame.frames.length; i++) { + if (frame.frames[i] && !__brzl_framesDone.has(frame.frames[i])) { + outlinks = outlinks.concat( + __brzl_compileOutlinks(frame.frames[i])); + } } } + } catch (e) { + console.log("exception looking at frame" + frame + ": " + e); } + return outlinks; } __brzl_compileOutlinks(window).join('\n'); diff --git a/setup.py b/setup.py index d0f8cff..948bb4f 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.5.dev311', + version='1.5.dev312', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt',