mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 00:29:53 -05:00
extract area/@href links, and add test for outlink extraction
This commit is contained in:
parent
d4d3ef4fd3
commit
5bcd10c228
@ -5,7 +5,7 @@ var __brzl_compileOutlinks = function(frame) {
|
|||||||
__brzl_framesDone.add(frame);
|
__brzl_framesDone.add(frame);
|
||||||
if (frame && frame.document) {
|
if (frame && frame.document) {
|
||||||
var outlinks = Array.prototype.slice.call(
|
var outlinks = Array.prototype.slice.call(
|
||||||
frame.document.querySelectorAll('a[href]'));
|
frame.document.querySelectorAll('a[href], area[href]'));
|
||||||
for (var i = 0; i < frame.frames.length; i++) {
|
for (var i = 0; i < frame.frames.length; i++) {
|
||||||
if (frame.frames[i] && !__brzl_framesDone.has(frame.frames[i])) {
|
if (frame.frames[i] && !__brzl_framesDone.has(frame.frames[i])) {
|
||||||
outlinks = outlinks.concat(
|
outlinks = outlinks.concat(
|
||||||
|
2
setup.py
2
setup.py
@ -32,7 +32,7 @@ def find_package_data(package):
|
|||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='brozzler',
|
name='brozzler',
|
||||||
version='1.1b11.dev225',
|
version='1.1b11.dev226',
|
||||||
description='Distributed web crawling with browsers',
|
description='Distributed web crawling with browsers',
|
||||||
url='https://github.com/internetarchive/brozzler',
|
url='https://github.com/internetarchive/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
14
tests/htdocs/site8/index.html
Normal file
14
tests/htdocs/site8/index.html
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>outlinks</title>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<a href='baz/quux/../zuh'>baz/quux/../zuh</a>
|
||||||
|
<a href='fdjisapofdjisap#1'>fdjisapofdjisap#yessss</a>
|
||||||
|
<map name="fakemap">
|
||||||
|
<area href="fdjisapofdjisap#2">
|
||||||
|
<area href="./fdjisapofdjisap#1">
|
||||||
|
<area href="http://example.com/offsite">
|
||||||
|
</map>
|
||||||
|
</body>
|
||||||
|
</html>
|
@ -2,7 +2,7 @@
|
|||||||
'''
|
'''
|
||||||
test_brozzling.py - XXX explain
|
test_brozzling.py - XXX explain
|
||||||
|
|
||||||
Copyright (C) 2016 Internet Archive
|
Copyright (C) 2016-2017 Internet Archive
|
||||||
|
|
||||||
Licensed under the Apache License, Version 2.0 (the "License");
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
you may not use this file except in compliance with the License.
|
you may not use this file except in compliance with the License.
|
||||||
@ -147,7 +147,6 @@ def test_page_videos(httpd):
|
|||||||
# to be adjusted on youtube-dl or chromium updates
|
# to be adjusted on youtube-dl or chromium updates
|
||||||
chrome_exe = brozzler.suggest_default_chrome_exe()
|
chrome_exe = brozzler.suggest_default_chrome_exe()
|
||||||
worker = brozzler.BrozzlerWorker(None)
|
worker = brozzler.BrozzlerWorker(None)
|
||||||
chrome_exe = brozzler.suggest_default_chrome_exe()
|
|
||||||
site = brozzler.Site(None, {})
|
site = brozzler.Site(None, {})
|
||||||
page = brozzler.Page(None, {
|
page = brozzler.Page(None, {
|
||||||
'url':'http://localhost:%s/site6/' % httpd.server_port})
|
'url':'http://localhost:%s/site6/' % httpd.server_port})
|
||||||
@ -172,3 +171,18 @@ def test_page_videos(httpd):
|
|||||||
'url': 'http://localhost:%s/site6/small.webm' % httpd.server_port,
|
'url': 'http://localhost:%s/site6/small.webm' % httpd.server_port,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def test_extract_outlinks(httpd):
|
||||||
|
chrome_exe = brozzler.suggest_default_chrome_exe()
|
||||||
|
worker = brozzler.BrozzlerWorker(None)
|
||||||
|
site = brozzler.Site(None, {})
|
||||||
|
page = brozzler.Page(None, {
|
||||||
|
'url':'http://localhost:%s/site8/' % httpd.server_port})
|
||||||
|
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
|
||||||
|
outlinks = worker.brozzle_page(browser, site, page)
|
||||||
|
assert outlinks == {
|
||||||
|
'http://example.com/offsite',
|
||||||
|
'http://localhost:%s/site8/baz/zuh' % httpd.server_port,
|
||||||
|
'http://localhost:%s/site8/fdjisapofdjisap#1' % httpd.server_port,
|
||||||
|
'http://localhost:%s/site8/fdjisapofdjisap#2' % httpd.server_port
|
||||||
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user