diff --git a/brozzler/js-templates/extract-outlinks.js b/brozzler/js-templates/extract-outlinks.js index 65c4098..e9e8a47 100644 --- a/brozzler/js-templates/extract-outlinks.js +++ b/brozzler/js-templates/extract-outlinks.js @@ -5,7 +5,7 @@ var __brzl_compileOutlinks = function(frame) { __brzl_framesDone.add(frame); if (frame && frame.document) { var outlinks = Array.prototype.slice.call( - frame.document.querySelectorAll('a[href]')); + frame.document.querySelectorAll('a[href], area[href]')); for (var i = 0; i < frame.frames.length; i++) { if (frame.frames[i] && !__brzl_framesDone.has(frame.frames[i])) { outlinks = outlinks.concat( diff --git a/setup.py b/setup.py index 66d9b61..2182db2 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.1b11.dev225', + version='1.1b11.dev226', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', diff --git a/tests/htdocs/site8/index.html b/tests/htdocs/site8/index.html new file mode 100644 index 0000000..0bf7d6e --- /dev/null +++ b/tests/htdocs/site8/index.html @@ -0,0 +1,14 @@ + + + outlinks + + + baz/quux/../zuh + fdjisapofdjisap#yessss + + + + + + + diff --git a/tests/test_brozzling.py b/tests/test_brozzling.py index b033cfd..c43c2f1 100644 --- a/tests/test_brozzling.py +++ b/tests/test_brozzling.py @@ -2,7 +2,7 @@ ''' test_brozzling.py - XXX explain -Copyright (C) 2016 Internet Archive +Copyright (C) 2016-2017 Internet Archive Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -147,7 +147,6 @@ def test_page_videos(httpd): # to be adjusted on youtube-dl or chromium updates chrome_exe = brozzler.suggest_default_chrome_exe() worker = brozzler.BrozzlerWorker(None) - chrome_exe = brozzler.suggest_default_chrome_exe() site = brozzler.Site(None, {}) page = brozzler.Page(None, { 'url':'http://localhost:%s/site6/' % httpd.server_port}) @@ -172,3 +171,18 @@ def test_page_videos(httpd): 'url': 'http://localhost:%s/site6/small.webm' % httpd.server_port, } +def test_extract_outlinks(httpd): + chrome_exe = brozzler.suggest_default_chrome_exe() + worker = brozzler.BrozzlerWorker(None) + site = brozzler.Site(None, {}) + page = brozzler.Page(None, { + 'url':'http://localhost:%s/site8/' % httpd.server_port}) + with brozzler.Browser(chrome_exe=chrome_exe) as browser: + outlinks = worker.brozzle_page(browser, site, page) + assert outlinks == { + 'http://example.com/offsite', + 'http://localhost:%s/site8/baz/zuh' % httpd.server_port, + 'http://localhost:%s/site8/fdjisapofdjisap#1' % httpd.server_port, + 'http://localhost:%s/site8/fdjisapofdjisap#2' % httpd.server_port + } +