2016-11-16 11:41:34 -08:00
|
|
|
#!/usr/bin/env python
|
|
|
|
'''
|
|
|
|
test_units.py - some unit tests for parts of brozzler amenable to that
|
|
|
|
|
2017-02-15 16:46:45 -08:00
|
|
|
Copyright (C) 2016-2017 Internet Archive
|
2016-11-16 11:41:34 -08:00
|
|
|
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
you may not use this file except in compliance with the License.
|
|
|
|
You may obtain a copy of the License at
|
|
|
|
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
See the License for the specific language governing permissions and
|
|
|
|
limitations under the License.
|
|
|
|
'''
|
|
|
|
|
|
|
|
import pytest
|
|
|
|
import http.server
|
|
|
|
import threading
|
|
|
|
import os
|
|
|
|
import brozzler
|
2016-12-06 17:12:20 -08:00
|
|
|
import brozzler.chrome
|
2018-08-16 11:40:54 -07:00
|
|
|
import brozzler.ydl
|
2016-12-06 17:12:20 -08:00
|
|
|
import logging
|
2017-02-15 16:46:45 -08:00
|
|
|
import yaml
|
2017-03-02 16:53:24 -08:00
|
|
|
import datetime
|
2017-04-17 16:47:05 -07:00
|
|
|
import requests
|
2017-04-17 18:15:22 -07:00
|
|
|
import tempfile
|
2017-04-18 12:00:23 -07:00
|
|
|
import uuid
|
|
|
|
import socket
|
2017-04-20 17:08:16 -07:00
|
|
|
import time
|
|
|
|
import sys
|
2018-06-22 14:50:57 -05:00
|
|
|
import threading
|
2019-12-04 12:38:22 -08:00
|
|
|
from unittest import mock
|
2017-04-20 17:08:16 -07:00
|
|
|
|
|
|
|
# configure the root logger once at import time: INFO and above to stderr,
# with process id, thread name and source location to help debug test runs
logging.basicConfig(
        stream=sys.stderr, level=logging.INFO, format=(
            '%(asctime)s %(process)d %(levelname)s %(threadName)s '
            '%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s'))
|
2016-11-16 11:41:34 -08:00
|
|
|
|
|
|
|
@pytest.fixture(scope='module')
def httpd(request):
    '''
    Module-scoped pytest fixture that serves the files under ./htdocs over
    http on an ephemeral localhost port, in a background thread. The server
    is shut down by a finalizer at module teardown.

    Note: this chdir()s the whole process into the htdocs directory and
    never chdirs back, which other tests in the session could observe.
    '''
    # SimpleHTTPRequestHandler always uses CWD so we have to chdir
    os.chdir(os.path.join(os.path.dirname(__file__), 'htdocs'))

    # port 0 lets the os pick a free ephemeral port
    httpd = http.server.HTTPServer(
            ('localhost', 0), http.server.SimpleHTTPRequestHandler)
    httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
    httpd_thread.start()

    def fin():
        # stop the serve_forever loop, release the listening socket, and
        # wait for the server thread to exit
        httpd.shutdown()
        httpd.server_close()
        httpd_thread.join()
    request.addfinalizer(fin)

    return httpd
|
|
|
|
|
|
|
|
def test_robots(httpd):
    '''
    Basic test of robots.txt user-agent substring matching.
    '''
    url = 'http://localhost:%s/' % httpd.server_port

    # htdocs robots.txt allows "goodbot" substring user agents
    good_site = brozzler.Site(
            None, {'seed': url, 'user_agent': 'im/a/GoOdbot/yep'})
    assert brozzler.is_permitted_by_robots(good_site, url)

    # ...and disallows "badbot" substring user agents
    bad_site = brozzler.Site(
            None, {'seed': url, 'user_agent': 'im/a bAdBOt/uh huh'})
    assert not brozzler.is_permitted_by_robots(bad_site, url)
|
|
|
|
|
2018-06-22 14:50:57 -05:00
|
|
|
def test_robots_http_statuses():
    '''
    Robots.txt handling is fail-open: whatever http status the server
    returns for /robots.txt, the url is considered permitted.
    '''
    for status in (
            200, 204, 400, 401, 402, 403, 404, 405,
            500, 501, 502, 503, 504, 505):
        # handler class is defined inside the loop so do_GET closes over
        # the current `status`
        class Handler(http.server.BaseHTTPRequestHandler):
            def do_GET(self):
                # write raw bytes on the socket so even unusual statuses
                # go over the wire exactly as constructed
                response = (('HTTP/1.1 %s Meaningless message\r\n'
                        + 'Content-length: 0\r\n'
                        + '\r\n') % status).encode('utf-8')
                self.connection.sendall(response)
                # self.send_response(status)
                # self.end_headers()
        httpd = http.server.HTTPServer(('localhost', 0), Handler)
        httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
        httpd_thread.start()

        try:
            url = 'http://localhost:%s/' % httpd.server_port
            site = brozzler.Site(None, {'seed': url})
            assert brozzler.is_permitted_by_robots(site, url)
        finally:
            # always tear the throwaway server down, even on assert failure
            httpd.shutdown()
            httpd.server_close()
            httpd_thread.join()
|
|
|
|
|
2018-06-22 16:10:23 -05:00
|
|
|
def test_robots_empty_response():
    '''
    Robots handling is fail-open: a server that drops the connection
    without sending a single byte still results in "permitted".
    '''
    class DropConnectionHandler(http.server.BaseHTTPRequestHandler):
        def do_GET(self):
            # hang up immediately without writing a response
            self.connection.shutdown(socket.SHUT_RDWR)
            self.connection.close()

    server = http.server.HTTPServer(('localhost', 0), DropConnectionHandler)
    server_thread = threading.Thread(name='httpd', target=server.serve_forever)
    server_thread.start()

    try:
        url = 'http://localhost:%s/' % server.server_port
        site = brozzler.Site(None, {'seed': url})
        assert brozzler.is_permitted_by_robots(site, url)
    finally:
        # tear down the throwaway server regardless of the outcome
        server.shutdown()
        server.server_close()
        server_thread.join()
|
|
|
|
|
|
|
|
def test_robots_socket_timeout():
    '''
    A robots.txt fetch that hits the client-side socket timeout is treated
    permissively rather than raising.
    '''
    stop_hanging = threading.Event()
    class Handler(http.server.BaseHTTPRequestHandler):
        def do_GET(self):
            # stall (up to 60s) before answering, to trigger the 2 second
            # client timeout installed below
            stop_hanging.wait(60)
            self.connection.sendall(
                    b'HTTP/1.1 200 OK\r\nContent-length: 0\r\n\r\n')

    # remember the class-level timeout so it can be restored in `finally`
    orig_timeout = brozzler.robots._SessionRaiseOn420.timeout

    httpd = http.server.HTTPServer(('localhost', 0), Handler)
    httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
    httpd_thread.start()

    try:
        url = 'http://localhost:%s/' % httpd.server_port
        site = brozzler.Site(None, {'seed': url})
        # shrink the robots-fetch timeout so the test completes quickly
        brozzler.robots._SessionRaiseOn420.timeout = 2
        assert brozzler.is_permitted_by_robots(site, url)
    finally:
        # undo the monkeypatch, release the hanging handler, tear down
        brozzler.robots._SessionRaiseOn420.timeout = orig_timeout
        stop_hanging.set()
        httpd.shutdown()
        httpd.server_close()
        httpd_thread.join()
|
|
|
|
|
|
|
|
def test_robots_dns_failure():
    '''Robots fetch against an unresolvable hostname is permissive.'''
    # .invalid. is guaranteed nonexistent per rfc 6761
    seed_url = 'http://whatever.invalid./'
    assert brozzler.is_permitted_by_robots(
            brozzler.Site(None, {'seed': seed_url}), seed_url)
|
|
|
|
|
2018-06-22 16:10:23 -05:00
|
|
|
def test_robots_connection_failure():
    '''Robots fetch against a refusing port is permissive.'''
    # (the original ".invalid. per rfc 6761" comment here was a copy-paste
    # leftover from test_robots_dns_failure; this test exercises a tcp
    # connection failure, not a dns failure)
    url = 'http://localhost:4/' # nobody listens on port 4
    site = brozzler.Site(None, {'seed': url})
    assert brozzler.is_permitted_by_robots(site, url)
|
|
|
|
|
2017-02-15 16:46:45 -08:00
|
|
|
def test_scoping():
    '''
    Exercises Site.accept_reject_or_neither() against a scope combining
    regex/surt/string/substring accepts, a parent-url-conditioned accept,
    and a domain-scoped block rule. True=accept, False=reject, None=neither.
    '''
    test_scope = yaml.safe_load('''
max_hops: 100
accepts:
- url_match: REGEX_MATCH
  value: ^.*/audio_file/.*\.mp3$
- url_match: SURT_MATCH
  value: http://(com,vimeocdn,
- url_match: STRING_MATCH
  value: ec-media.soundcloud.com
- regex: ^https?://twitter\.com.*$
- substring: facebook.com
- regex: ^https?://(www.)?youtube.com/watch?.*$
  parent_url_regex: ^https?://(www.)?youtube.com/user/.*$
blocks:
- domain: twitter.com
  url_match: REGEX_MATCH
  value: ^.*lang=(?!en).*$
''')

    site = brozzler.Site(None, {
        'id': 1, 'seed': 'http://example.com/foo/bar?baz=quux#monkey',
        'scope': test_scope})
    page = brozzler.Page(None, {
        'url': 'http://example.com/foo/bar?baz=quux#monkey',
        'site_id': site.id})

    # implicit seed-surt scoping: under the seed path accepts, sibling is
    # neither accepted nor rejected
    assert site.accept_reject_or_neither('http://example.com/foo/bar', page) is True
    assert site.accept_reject_or_neither('http://example.com/foo/baz', page) is None

    # REGEX_MATCH accept: only urls with /audio_file/ in the path match
    assert site.accept_reject_or_neither('http://foo.com/some.mp3', page) is None
    assert site.accept_reject_or_neither('http://foo.com/blah/audio_file/some.mp3', page) is True

    # SURT_MATCH accept covers http:// but not https://
    assert site.accept_reject_or_neither('http://a.b.vimeocdn.com/blahblah', page) is True
    assert site.accept_reject_or_neither('https://a.b.vimeocdn.com/blahblah', page) is None

    # twitter is accepted by regex, but the block rule rejects non-english
    # lang= query parameters
    assert site.accept_reject_or_neither('https://twitter.com/twit', page) is True
    assert site.accept_reject_or_neither('https://twitter.com/twit?lang=en', page) is True
    assert site.accept_reject_or_neither('https://twitter.com/twit?lang=es', page) is False

    # substring accept
    assert site.accept_reject_or_neither('https://www.facebook.com/whatevz', page) is True

    # parent_url_regex: a watch url is accepted only when its parent page
    # is a youtube user page
    assert site.accept_reject_or_neither(
            'https://www.youtube.com/watch?v=dUIn5OAPS5s', page) is None
    yt_user_page = brozzler.Page(None, {
        'url': 'https://www.youtube.com/user/SonoraSantaneraVEVO',
        'site_id': site.id, 'hops_from_seed': 10})
    assert site.accept_reject_or_neither(
            'https://www.youtube.com/watch?v=dUIn5OAPS5s', yt_user_page) is True
|
2017-02-15 16:46:45 -08:00
|
|
|
|
2017-04-17 18:15:22 -07:00
|
|
|
def test_proxy_down():
    '''
    Test all fetching scenarios raise `brozzler.ProxyError` when proxy is down.

    This test needs to cover every possible fetch through the proxy other than
    fetches from the browser. For that, see test_brozzling.py.

    Tests two different kinds of connection error:
    - nothing listening the port (nobody listens on on port 4 :))
    - port bound but not accepting connections
    '''
    # bound but never listen()ed: connect attempts are refused/hang
    sock = socket.socket()
    sock.bind(('127.0.0.1', 0))
    for not_listening_proxy in (
            '127.0.0.1:4', '127.0.0.1:%s' % sock.getsockname()[1]):
        worker = brozzler.BrozzlerWorker(
                frontier=None, proxy=not_listening_proxy)
        site = brozzler.Site(None, {
            'id': str(uuid.uuid4()), 'seed': 'http://example.com/'})
        page = brozzler.Page(None, {'url': 'http://example.com/'})

        # robots.txt fetch
        with pytest.raises(brozzler.ProxyError):
            brozzler.is_permitted_by_robots(
                    site, 'http://example.com/', proxy=not_listening_proxy)

        # youtube-dl fetch
        with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
            with pytest.raises(brozzler.ProxyError):
                brozzler.ydl.do_youtube_dl(worker, site, page)

        # raw fetch
        with pytest.raises(brozzler.ProxyError):
            worker._fetch_url(site, page=page)

        # WARCPROX_WRITE_RECORD
        with pytest.raises(brozzler.ProxyError):
            worker._warcprox_write_record(
                    warcprox_address=not_listening_proxy,
                    url='test://proxy_down/warcprox_write_record',
                    warc_type='metadata',
                    content_type='text/plain',
                    payload=b'''payload doesn't matter here''')
|
|
|
|
|
2017-03-02 16:53:24 -08:00
|
|
|
def test_start_stop_backwards_compat():
    '''
    Legacy `start_time` (site) and `started`/`finished` (job) fields are
    migrated into the newer `starts_and_stops` list representation and
    removed from the document.
    '''
    # a fresh site gets one open interval and no legacy field
    site = brozzler.Site(None, {'seed': 'http://example.com/'})
    assert len(site.starts_and_stops) == 1
    assert site.starts_and_stops[0]['start']
    assert site.starts_and_stops[0]['stop'] is None
    assert 'start_time' not in site

    # legacy `start_time` becomes the interval's start
    site = brozzler.Site(None, {
        'seed': 'http://example.com/',
        'start_time': datetime.datetime(2017,1,1)})
    assert len(site.starts_and_stops) == 1
    assert site.starts_and_stops[0]['start'] == datetime.datetime(2017, 1, 1)
    assert site.starts_and_stops[0]['stop'] is None
    assert 'start_time' not in site

    # a fresh job likewise gets one open interval and no legacy fields
    job = brozzler.Job(None, {'seeds': [{'url':'https://example.com/'}]})
    assert job.starts_and_stops[0]['start']
    assert job.starts_and_stops[0]['stop'] is None
    assert 'started' not in job
    assert 'finished' not in job

    # legacy `started`/`finished` become the interval's start/stop
    job = brozzler.Job(None, {
        'seeds': [{'url':'https://example.com/'}],
        'started': datetime.datetime(2017, 1, 1),
        'finished': datetime.datetime(2017, 1, 2)})
    assert job.starts_and_stops[0]['start'] == datetime.datetime(2017, 1, 1)
    assert job.starts_and_stops[0]['stop'] == datetime.datetime(2017, 1, 2)
    assert 'started' not in job
    assert 'finished' not in job
|
|
|
|
|
2017-05-16 14:00:10 -07:00
|
|
|
class Exception1(Exception):
    # distinct marker exception used by the thread_raise tests to verify
    # exactly which injected exception a thread caught
    pass
|
|
|
|
class Exception2(Exception):
    # second marker exception, distinguishable from Exception1, for tests
    # that inject two exceptions into the same thread
    pass
|
2017-05-15 16:20:20 -07:00
|
|
|
|
2017-05-16 14:00:10 -07:00
|
|
|
def test_thread_raise_not_accept():
    '''
    thread_raise() must be a no-op for a thread that never enters a
    `with thread_accept_exceptions()` block.
    '''
    thread_caught_exception = None

    def never_accepting_target():
        nonlocal thread_caught_exception
        try:
            brozzler.sleep(2)
        except Exception as e:
            thread_caught_exception = e

    # test that thread_raise does not raise exception in a thread that has no
    # `with thread_exception_gate()` block
    th = threading.Thread(target=never_accepting_target)
    th.start()
    brozzler.thread_raise(th, Exception1)
    th.join()
    assert thread_caught_exception is None
|
|
|
|
|
2017-05-16 14:00:10 -07:00
|
|
|
def test_thread_raise_immediate():
    '''
    A thread currently inside `with thread_accept_exceptions()` receives
    the injected exception right away, not after the sleep finishes.
    '''
    thread_caught_exception = None

    def accepting_target():
        nonlocal thread_caught_exception
        try:
            with brozzler.thread_accept_exceptions():
                brozzler.sleep(2)
        except Exception as e:
            thread_caught_exception = e

    # test immediate exception raise
    th = threading.Thread(target=accepting_target)
    th.start()
    brozzler.thread_raise(th, Exception1)
    start = time.time()
    th.join()
    assert thread_caught_exception
    assert isinstance(thread_caught_exception, Exception1)
    # the thread exited well before its 2 second sleep elapsed
    assert time.time() - start < 1.0
|
|
|
|
|
2017-05-16 14:00:10 -07:00
|
|
|
def test_thread_raise_safe_exit():
    def delay_context_exit():
        # grab a gate instance just to get at its class's __exit__
        gate = brozzler.thread_accept_exceptions()
        orig_exit = type(gate).__exit__
        try:
            # monkeypatch __exit__ to dawdle 2 seconds before the real
            # exit, so the second thread_raise() below arrives while
            # __exit__ is still running; the (a, b, False)[-1] tuple trick
            # sequences the calls and returns False (don't suppress)
            type(gate).__exit__ = lambda self, et, ev, t: (
                brozzler.sleep(2), orig_exit(self, et, ev, t), False)[-1]
            with brozzler.thread_accept_exceptions() as gate:
                brozzler.sleep(2)
        except Exception as e:
            nonlocal thread_caught_exception
            thread_caught_exception = e
        finally:
            # undo the class-level monkeypatch
            type(gate).__exit__ = orig_exit

    # test that a second thread_raise() doesn't result in an exception in
    # ThreadExceptionGate.__exit__
    thread_caught_exception = None
    th = threading.Thread(target=delay_context_exit)
    th.start()
    time.sleep(0.2)
    brozzler.thread_raise(th, Exception1)
    time.sleep(0.2)
    brozzler.thread_raise(th, Exception2)
    th.join()
    # only the first exception was delivered; the second was swallowed
    # rather than blowing up inside __exit__
    assert thread_caught_exception
    assert isinstance(thread_caught_exception, Exception1)
|
|
|
|
|
2017-05-16 14:00:10 -07:00
|
|
|
def test_thread_raise_pending_exception():
    def accept_eventually():
        try:
            # not accepting exceptions yet: the thread_raise() below
            # stays pending during this sleep
            brozzler.sleep(2)
            # entering the block delivers the pending exception
            with brozzler.thread_accept_exceptions():
                pass
        except Exception as e:
            nonlocal thread_caught_exception
            thread_caught_exception = e

    # test exception that has to wait for `with thread_exception_gate()` block
    thread_caught_exception = None
    th = threading.Thread(target=accept_eventually)
    th.start()
    brozzler.thread_raise(th, Exception1)
    start = time.time()
    th.join()
    assert isinstance(thread_caught_exception, Exception1)
    # delivery waited for the 2 second sleep to finish
    assert time.time() - start > 1.0
|
2017-04-20 17:08:16 -07:00
|
|
|
|
2017-05-16 14:00:10 -07:00
|
|
|
def test_thread_raise_second_with_block():
    '''
    A second thread_raise() delivered while the thread is handling the
    first exception is raised in the thread's *second* accept-exceptions
    block.
    '''
    def two_with_blocks():
        try:
            with brozzler.thread_accept_exceptions():
                time.sleep(2)
            return # test fails
        except Exception1:
            # expected: the first injected exception lands here
            pass
        except:
            return # fail test

        try:
            with brozzler.thread_accept_exceptions():
                brozzler.sleep(2)
        except Exception as e:
            nonlocal thread_caught_exception
            thread_caught_exception = e

    # test that second `with` block gets second exception raised during first
    # `with` block
    thread_caught_exception = None
    th = threading.Thread(target=two_with_blocks)
    th.start()
    brozzler.thread_raise(th, Exception1)
    brozzler.thread_raise(th, Exception2)
    th.join()
    assert isinstance(thread_caught_exception, Exception2)
|
|
|
|
|
2018-01-26 10:59:18 -08:00
|
|
|
def test_needs_browsing():
    # only one test case here right now, which exposed a bug

    class ConvenientHeaders(http.client.HTTPMessage):
        # HTTPMessage subclass constructible from a plain dict of headers
        def __init__(self, headers):
            http.client.HTTPMessage.__init__(self)
            for (k, v) in headers.items():
                self.add_header(k, v)

    page = brozzler.Page(None, {
        'url':'http://example.com/a'})

    # simulate youtube-dl's fetches: the page url redirects...
    spy = brozzler.ydl.YoutubeDLSpy()
    spy.fetches.append({
        'url': 'http://example.com/a',
        'method': 'HEAD',
        'response_code': 301,
        'response_headers': ConvenientHeaders({'Location': '/b'})})
    # ...to a pdf, a content type that does not need browsing
    spy.fetches.append({
        'url': 'http://example.com/b',
        'method': 'GET',
        'response_code': 200,
        'response_headers': ConvenientHeaders({
            'Content-Type': 'application/pdf'})})

    # _needs_browsing doesn't use self, so None stands in for the worker
    assert not brozzler.worker.BrozzlerWorker._needs_browsing(
            None, page, spy.fetches)
|
2018-01-26 10:59:18 -08:00
|
|
|
|
2018-12-21 15:17:31 -08:00
|
|
|
def test_seed_redirect():
    '''
    note_seed_redirect() widens the site scope to cover the redirect
    target.
    '''
    # same host, http->https: the whole host is accepted for both schemes
    site = brozzler.Site(None, {'seed': 'http://foo.com/'})
    site.note_seed_redirect('https://foo.com/a/b/c')
    assert site.scope == {'accepts': [
        {'ssurt': 'com,foo,//http:/',},
        {'ssurt': 'com,foo,//https:/',}]}

    # same host, https->http: likewise in the other direction
    site = brozzler.Site(None, {'seed': 'https://foo.com/'})
    site.note_seed_redirect('http://foo.com/a/b/c')
    assert site.scope == {'accepts': [
        {'ssurt': 'com,foo,//https:/',},
        {'ssurt': 'com,foo,//http:/',}]}

    # different host: only the exact redirect target url is added
    site = brozzler.Site(None, {'seed': 'http://foo.com/'})
    site.note_seed_redirect('https://bar.com/a/b/c')
    assert site.scope == {'accepts': [
        {'ssurt': 'com,foo,//http:/',},
        {'ssurt': 'com,bar,//https:/a/b/c',}]}
|
|
|
|
|
2019-12-04 12:38:22 -08:00
|
|
|
def test_limit_failures():
    '''
    After three failed brozzling attempts a page is given up on (counted
    as brozzled) and the site is finished.
    '''
    page = mock.Mock()
    page.failed_attempts = None
    page.brozzle_count = 0

    site = mock.Mock()
    site.status = 'ACTIVE'
    site.active_brozzling_time = 0
    site.starts_and_stops = [{'start':datetime.datetime.utcnow()}]

    # mock just enough of the rethinkdb query chain
    # (r.table(...).between(...).limit(...).run() -> []) for
    # RethinkDbFrontier to be instantiated and queried
    rr = mock.Mock()
    rr.servers = [mock.Mock()]
    rethink_query = mock.Mock(run=mock.Mock(return_value=[]))
    rr.db_list = mock.Mock(return_value=rethink_query)
    rr.table_list = mock.Mock(return_value=rethink_query)
    rr.table = mock.Mock(
        return_value=mock.Mock(
            between=mock.Mock(
                return_value=mock.Mock(
                    limit=mock.Mock(
                        return_value=rethink_query)))))
    # sanity-check the mock chain before handing it to the frontier
    assert rr.table().between().limit().run() == []
    frontier = brozzler.RethinkDbFrontier(rr)
    frontier.enforce_time_limit = mock.Mock()
    frontier.honor_stop_request = mock.Mock()
    frontier.claim_page = mock.Mock(return_value=page)
    frontier._maybe_finish_job = mock.Mock()

    browser = mock.Mock()

    worker = brozzler.BrozzlerWorker(frontier)
    # every brozzle attempt raises
    worker.brozzle_page = mock.Mock(side_effect=Exception)

    # baseline state before any attempts
    assert page.failed_attempts is None
    assert page.brozzle_count == 0
    assert site.status == 'ACTIVE'

    # first failure: retry
    worker.brozzle_site(browser, site)
    assert page.failed_attempts == 1
    assert page.brozzle_count == 0
    assert site.status == 'ACTIVE'

    # second failure: retry
    worker.brozzle_site(browser, site)
    assert page.failed_attempts == 2
    assert page.brozzle_count == 0
    assert site.status == 'ACTIVE'

    # third failure: page counted as brozzled, site finished
    worker.brozzle_site(browser, site)
    assert page.failed_attempts == 3
    assert page.brozzle_count == 1
    assert site.status == 'FINISHED'
|
|
|
|
|