diff --git a/README.rst b/README.rst index 0fc5c4a..b2fe7f2 100644 --- a/README.rst +++ b/README.rst @@ -14,7 +14,7 @@ or Chromium) to fetch pages and embedded URLs and to extract links. It employs capabilities and `rethinkdb `_ to manage crawl state. -Brozzler is designed to work in conjuction with warcprox for web archiving. +Brozzler is designed to work in conjunction with warcprox for web archiving. Requirements ------------ diff --git a/brozzler/browser.py b/brozzler/browser.py index 1960b9e..8692e1d 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -28,6 +28,7 @@ import brozzler from requests.structures import CaseInsensitiveDict import datetime import base64 +from ipaddress import AddressValueError from brozzler.chrome import Chrome import socket import urlcanon @@ -619,8 +620,13 @@ class Browser: if ('result' in message and 'result' in message['result'] and 'value' in message['result']['result']): if message['result']['result']['value']: - return frozenset([str(urlcanon.whatwg(link)) for link in - message['result']['result']['value'].split('\n')]) + out = [] + for link in message['result']['result']['value'].split('\n'): + try: + out.append(str(urlcanon.whatwg(link))) + except AddressValueError: + self.logger.warning('skip invalid outlink: %s', link) + return frozenset(out) else: # no links found return frozenset() diff --git a/brozzler/js-templates/fec_gov.js b/brozzler/js-templates/fec_gov.js index 25cfd8b..bdd03d9 100644 --- a/brozzler/js-templates/fec_gov.js +++ b/brozzler/js-templates/fec_gov.js @@ -65,7 +65,7 @@ var umbraBehavior = { if (where == 0) { console.log("clicking on " + clickRadioButtonTargets[k]); // do mouse over event on click target - // since some urls are requsted only on + // since some urls are requested only on // this event - see // https://webarchive.jira.com/browse/AITFIVE-451 var mouseOverEvent = document.createEvent('Events'); diff --git a/brozzler/js-templates/psu24.js b/brozzler/js-templates/psu24.js index 15388fc..fdcd865 100644 --- a/brozzler/js-templates/psu24.js +++ b/brozzler/js-templates/psu24.js @@ -46,7 +46,7 @@ var umbraBehavior = { if (where == 0) { console.log("clicking on " + clickTargets[i].outerHTML); // do mouse over event on click target - // since some urls are requsted only on + // since some urls are requested only on // this event - see // https://webarchive.jira.com/browse/AITFIVE-451 var mouseOverEvent = document.createEvent('Events'); diff --git a/brozzler/js-templates/umbraBehavior.js.j2 b/brozzler/js-templates/umbraBehavior.js.j2 index 7ed736a..6ee8fcc 100644 --- a/brozzler/js-templates/umbraBehavior.js.j2 +++ b/brozzler/js-templates/umbraBehavior.js.j2 @@ -167,7 +167,7 @@ class UmbraBehavior { doTarget(target, action) { // console.log("doing " + action + target.outerHTML); // do mouse over event on target - // since some urls are requsted only on + // since some urls are requested only on // this event - see // https://webarchive.jira.com/browse/AITFIVE-451 var mouseOverEvent = document.createEvent("Events"); diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 4a8d71a..9c25981 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -154,7 +154,7 @@ def _build_youtube_dl(worker, destdir, site): if ie_result.get('_type') == 'playlist': self.logger.info( 'extractor %r found playlist in %s', ie.IE_NAME, url) - if ie.IE_NAME in {'youtube:playlist', 'soundcloud:user', 'instagram:user'}: + if ie.IE_NAME in {'youtube:playlist', 'youtube:tab', 'soundcloud:user', 'instagram:user'}: # At this point ie_result['entries'] is an iterator that # will fetch more metadata from youtube to list all the # videos. We unroll that iterator here partly because diff --git a/setup.py b/setup.py index c2eb300..e105941 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ ''' setup.py - brozzler setup script -Copyright (C) 2014-2020 Internet Archive +Copyright (C) 2014-2021 Internet Archive Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.5.24', + version='1.5.25', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', diff --git a/tests/test_brozzling.py b/tests/test_brozzling.py index 507379e..bd4a032 100755 --- a/tests/test_brozzling.py +++ b/tests/test_brozzling.py @@ -75,7 +75,7 @@ def httpd(request): def do_POST(self): if self.path == '/login-action': self.send_response(200) - payload = b'login successfull\n' + payload = b'login successful\n' self.send_header('Content-Type', 'text/plain;charset=utf-8') self.send_header('Content-Length', len(payload)) self.end_headers()