Merge branch 'master' of github.com:internetarchive/brozzler

2025-04-20 23:56:34 -04:00 · 2022-01-03 16:37:40 -08:00 · 2022-01-03 16:37:40 -08:00 · 040a942ef2
commit 040a942ef2
parent 6290692ac4 427908e821
8 changed files with 16 additions and 10 deletions
--- a/README.rst
+++ b/README.rst
@@ -14,7 +14,7 @@ or Chromium) to fetch pages and embedded URLs and to extract links. It employs
 capabilities and `rethinkdb <https://github.com/rethinkdb/rethinkdb>`_ to
 manage crawl state.

-Brozzler is designed to work in conjuction with warcprox for web archiving.
+Brozzler is designed to work in conjunction with warcprox for web archiving.

 Requirements
 ------------
--- a/brozzler/browser.py
+++ b/brozzler/browser.py
@@ -28,6 +28,7 @@ import brozzler
 from requests.structures import CaseInsensitiveDict
 import datetime
 import base64
+from ipaddress import AddressValueError
 from brozzler.chrome import Chrome
 import socket
 import urlcanon
@@ -619,8 +620,13 @@ class Browser:
        if ('result' in message and 'result' in message['result']
                and 'value' in message['result']['result']):
            if message['result']['result']['value']:
-                return frozenset([str(urlcanon.whatwg(link)) for link in
-                                  message['result']['result']['value'].split('\n')])
+                out = []
+                for link in message['result']['result']['value'].split('\n'):
+                    try:
+                        out.append(str(urlcanon.whatwg(link)))
+                    except AddressValueError:
+                        self.logger.warning('skip invalid outlink: %s', link)
+                return frozenset(out)
            else:
                # no links found
                return frozenset()
--- a/brozzler/js-templates/fec_gov.js
+++ b/brozzler/js-templates/fec_gov.js
@@ -65,7 +65,7 @@ var umbraBehavior = {
 						if (where == 0) {
 							console.log("clicking on " + clickRadioButtonTargets[k]);
 							// do mouse over event on click target
-							// since some urls are requsted only on
+							// since some urls are requested only on
 							// this event - see
 							// https://webarchive.jira.com/browse/AITFIVE-451
 							var mouseOverEvent = document.createEvent('Events');
--- a/brozzler/js-templates/psu24.js
+++ b/brozzler/js-templates/psu24.js
@@ -46,7 +46,7 @@ var umbraBehavior = {
 				if (where == 0) {
 					console.log("clicking on " + clickTargets[i].outerHTML);
 					// do mouse over event on click target
-					// since some urls are requsted only on
+					// since some urls are requested only on
 					// this event - see
 					// https://webarchive.jira.com/browse/AITFIVE-451
 					var mouseOverEvent = document.createEvent('Events');
--- a/brozzler/js-templates/umbraBehavior.js.j2
+++ b/brozzler/js-templates/umbraBehavior.js.j2
@@ -167,7 +167,7 @@ class UmbraBehavior {
    doTarget(target, action) {
        // console.log("doing " + action + target.outerHTML);
        // do mouse over event on target
-        // since some urls are requsted only on
+        // since some urls are requested only on
        // this event - see
        // https://webarchive.jira.com/browse/AITFIVE-451
        var mouseOverEvent = document.createEvent("Events");
--- a/brozzler/ydl.py
+++ b/brozzler/ydl.py
@@ -154,7 +154,7 @@ def _build_youtube_dl(worker, destdir, site):
            if ie_result.get('_type') == 'playlist':
                self.logger.info(
                        'extractor %r found playlist in %s', ie.IE_NAME, url)
-                if ie.IE_NAME in {'youtube:playlist', 'soundcloud:user', 'instagram:user'}:
+                if ie.IE_NAME in {'youtube:playlist', 'youtube:tab', 'soundcloud:user', 'instagram:user'}:
                    # At this point ie_result['entries'] is an iterator that
                    # will fetch more metadata from youtube to list all the
                    # videos. We unroll that iterator here partly because
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@
 '''
 setup.py - brozzler setup script

-Copyright (C) 2014-2020 Internet Archive
+Copyright (C) 2014-2021 Internet Archive

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -32,7 +32,7 @@ def find_package_data(package):

 setuptools.setup(
        name='brozzler',
-        version='1.5.24',
+        version='1.5.25',
        description='Distributed web crawling with browsers',
        url='https://github.com/internetarchive/brozzler',
        author='Noah Levitt',
--- a/tests/test_brozzling.py
+++ b/tests/test_brozzling.py
@@ -75,7 +75,7 @@ def httpd(request):
        def do_POST(self):
            if self.path == '/login-action':
                self.send_response(200)
-                payload = b'login successfull\n'
+                payload = b'login successful\n'
                self.send_header('Content-Type', 'text/plain;charset=utf-8')
                self.send_header('Content-Length', len(payload))
                self.end_headers()