mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-20 23:56:34 -04:00
Merge branch 'master' of github.com:internetarchive/brozzler
This commit is contained in:
commit
040a942ef2
@ -14,7 +14,7 @@ or Chromium) to fetch pages and embedded URLs and to extract links. It employs
|
||||
capabilities and `rethinkdb <https://github.com/rethinkdb/rethinkdb>`_ to
|
||||
manage crawl state.
|
||||
|
||||
Brozzler is designed to work in conjuction with warcprox for web archiving.
|
||||
Brozzler is designed to work in conjunction with warcprox for web archiving.
|
||||
|
||||
Requirements
|
||||
------------
|
||||
|
@ -28,6 +28,7 @@ import brozzler
|
||||
from requests.structures import CaseInsensitiveDict
|
||||
import datetime
|
||||
import base64
|
||||
from ipaddress import AddressValueError
|
||||
from brozzler.chrome import Chrome
|
||||
import socket
|
||||
import urlcanon
|
||||
@ -619,8 +620,13 @@ class Browser:
|
||||
if ('result' in message and 'result' in message['result']
|
||||
and 'value' in message['result']['result']):
|
||||
if message['result']['result']['value']:
|
||||
return frozenset([str(urlcanon.whatwg(link)) for link in
|
||||
message['result']['result']['value'].split('\n')])
|
||||
out = []
|
||||
for link in message['result']['result']['value'].split('\n'):
|
||||
try:
|
||||
out.append(str(urlcanon.whatwg(link)))
|
||||
except AddressValueError:
|
||||
self.logger.warning('skip invalid outlink: %s', link)
|
||||
return frozenset(out)
|
||||
else:
|
||||
# no links found
|
||||
return frozenset()
|
||||
|
@ -65,7 +65,7 @@ var umbraBehavior = {
|
||||
if (where == 0) {
|
||||
console.log("clicking on " + clickRadioButtonTargets[k]);
|
||||
// do mouse over event on click target
|
||||
// since some urls are requsted only on
|
||||
// since some urls are requested only on
|
||||
// this event - see
|
||||
// https://webarchive.jira.com/browse/AITFIVE-451
|
||||
var mouseOverEvent = document.createEvent('Events');
|
||||
|
@ -46,7 +46,7 @@ var umbraBehavior = {
|
||||
if (where == 0) {
|
||||
console.log("clicking on " + clickTargets[i].outerHTML);
|
||||
// do mouse over event on click target
|
||||
// since some urls are requsted only on
|
||||
// since some urls are requested only on
|
||||
// this event - see
|
||||
// https://webarchive.jira.com/browse/AITFIVE-451
|
||||
var mouseOverEvent = document.createEvent('Events');
|
||||
|
@ -167,7 +167,7 @@ class UmbraBehavior {
|
||||
doTarget(target, action) {
|
||||
// console.log("doing " + action + target.outerHTML);
|
||||
// do mouse over event on target
|
||||
// since some urls are requsted only on
|
||||
// since some urls are requested only on
|
||||
// this event - see
|
||||
// https://webarchive.jira.com/browse/AITFIVE-451
|
||||
var mouseOverEvent = document.createEvent("Events");
|
||||
|
@ -154,7 +154,7 @@ def _build_youtube_dl(worker, destdir, site):
|
||||
if ie_result.get('_type') == 'playlist':
|
||||
self.logger.info(
|
||||
'extractor %r found playlist in %s', ie.IE_NAME, url)
|
||||
if ie.IE_NAME in {'youtube:playlist', 'soundcloud:user', 'instagram:user'}:
|
||||
if ie.IE_NAME in {'youtube:playlist', 'youtube:tab', 'soundcloud:user', 'instagram:user'}:
|
||||
# At this point ie_result['entries'] is an iterator that
|
||||
# will fetch more metadata from youtube to list all the
|
||||
# videos. We unroll that iterator here partly because
|
||||
|
4
setup.py
4
setup.py
@ -2,7 +2,7 @@
|
||||
'''
|
||||
setup.py - brozzler setup script
|
||||
|
||||
Copyright (C) 2014-2020 Internet Archive
|
||||
Copyright (C) 2014-2021 Internet Archive
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
@ -32,7 +32,7 @@ def find_package_data(package):
|
||||
|
||||
setuptools.setup(
|
||||
name='brozzler',
|
||||
version='1.5.24',
|
||||
version='1.5.25',
|
||||
description='Distributed web crawling with browsers',
|
||||
url='https://github.com/internetarchive/brozzler',
|
||||
author='Noah Levitt',
|
||||
|
@ -75,7 +75,7 @@ def httpd(request):
|
||||
def do_POST(self):
|
||||
if self.path == '/login-action':
|
||||
self.send_response(200)
|
||||
payload = b'login successfull\n'
|
||||
payload = b'login successful\n'
|
||||
self.send_header('Content-Type', 'text/plain;charset=utf-8')
|
||||
self.send_header('Content-Length', len(payload))
|
||||
self.end_headers()
|
||||
|
Loading…
x
Reference in New Issue
Block a user