Merge branch 'master' of github.com:internetarchive/brozzler

This commit is contained in:
Barbara Miller 2022-01-03 16:37:40 -08:00
commit 040a942ef2
8 changed files with 16 additions and 10 deletions

View File

@@ -14,7 +14,7 @@ or Chromium) to fetch pages and embedded URLs and to extract links. It employs
capabilities and `rethinkdb <https://github.com/rethinkdb/rethinkdb>`_ to
manage crawl state.
Brozzler is designed to work in conjuction with warcprox for web archiving.
Brozzler is designed to work in conjunction with warcprox for web archiving.
Requirements
------------

View File

@@ -28,6 +28,7 @@ import brozzler
from requests.structures import CaseInsensitiveDict
import datetime
import base64
from ipaddress import AddressValueError
from brozzler.chrome import Chrome
import socket
import urlcanon
@@ -619,8 +620,13 @@ class Browser:
if ('result' in message and 'result' in message['result']
and 'value' in message['result']['result']):
if message['result']['result']['value']:
return frozenset([str(urlcanon.whatwg(link)) for link in
message['result']['result']['value'].split('\n')])
out = []
for link in message['result']['result']['value'].split('\n'):
try:
out.append(str(urlcanon.whatwg(link)))
except AddressValueError:
self.logger.warning('skip invalid outlink: %s', link)
return frozenset(out)
else:
# no links found
return frozenset()

View File

@@ -65,7 +65,7 @@ var umbraBehavior = {
if (where == 0) {
console.log("clicking on " + clickRadioButtonTargets[k]);
// do mouse over event on click target
// since some urls are requsted only on
// since some urls are requested only on
// this event - see
// https://webarchive.jira.com/browse/AITFIVE-451
var mouseOverEvent = document.createEvent('Events');

View File

@@ -46,7 +46,7 @@ var umbraBehavior = {
if (where == 0) {
console.log("clicking on " + clickTargets[i].outerHTML);
// do mouse over event on click target
// since some urls are requsted only on
// since some urls are requested only on
// this event - see
// https://webarchive.jira.com/browse/AITFIVE-451
var mouseOverEvent = document.createEvent('Events');

View File

@@ -167,7 +167,7 @@ class UmbraBehavior {
doTarget(target, action) {
// console.log("doing " + action + target.outerHTML);
// do mouse over event on target
// since some urls are requsted only on
// since some urls are requested only on
// this event - see
// https://webarchive.jira.com/browse/AITFIVE-451
var mouseOverEvent = document.createEvent("Events");

View File

@@ -154,7 +154,7 @@ def _build_youtube_dl(worker, destdir, site):
if ie_result.get('_type') == 'playlist':
self.logger.info(
'extractor %r found playlist in %s', ie.IE_NAME, url)
if ie.IE_NAME in {'youtube:playlist', 'soundcloud:user', 'instagram:user'}:
if ie.IE_NAME in {'youtube:playlist', 'youtube:tab', 'soundcloud:user', 'instagram:user'}:
# At this point ie_result['entries'] is an iterator that
# will fetch more metadata from youtube to list all the
# videos. We unroll that iterator here partly because

View File

@@ -2,7 +2,7 @@
'''
setup.py - brozzler setup script
Copyright (C) 2014-2020 Internet Archive
Copyright (C) 2014-2021 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup(
name='brozzler',
version='1.5.24',
version='1.5.25',
description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler',
author='Noah Levitt',

View File

@@ -75,7 +75,7 @@ def httpd(request):
def do_POST(self):
if self.path == '/login-action':
self.send_response(200)
payload = b'login successfull\n'
payload = b'login successful\n'
self.send_header('Content-Type', 'text/plain;charset=utf-8')
self.send_header('Content-Length', len(payload))
self.end_headers()