mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-07-24 07:20:53 -04:00
Use black, enforce with GitHub Actions
This commit is contained in:
parent
c4620c3018
commit
8b23430a87
23 changed files with 4048 additions and 2797 deletions
|
@ -1,5 +1,5 @@
|
|||
#!/usr/bin/env python
|
||||
'''
|
||||
"""
|
||||
test_units.py - some unit tests for parts of brozzler amenable to that
|
||||
|
||||
Copyright (C) 2016-2017 Internet Archive
|
||||
|
@ -15,7 +15,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
|
|||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
'''
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import http.server
|
||||
|
@ -37,99 +37,131 @@ import threading
|
|||
from unittest import mock
|
||||
|
||||
logging.basicConfig(
|
||||
stream=sys.stderr, level=logging.INFO, format=(
|
||||
'%(asctime)s %(process)d %(levelname)s %(threadName)s '
|
||||
'%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s'))
|
||||
stream=sys.stderr,
|
||||
level=logging.INFO,
|
||||
format=(
|
||||
"%(asctime)s %(process)d %(levelname)s %(threadName)s "
|
||||
"%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s"
|
||||
),
|
||||
)
|
||||
|
||||
@pytest.fixture(scope='module')
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def httpd(request):
|
||||
# SimpleHTTPRequestHandler always uses CWD so we have to chdir
|
||||
os.chdir(os.path.join(os.path.dirname(__file__), 'htdocs'))
|
||||
os.chdir(os.path.join(os.path.dirname(__file__), "htdocs"))
|
||||
|
||||
httpd = http.server.HTTPServer(
|
||||
('localhost', 0), http.server.SimpleHTTPRequestHandler)
|
||||
httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
|
||||
("localhost", 0), http.server.SimpleHTTPRequestHandler
|
||||
)
|
||||
httpd_thread = threading.Thread(name="httpd", target=httpd.serve_forever)
|
||||
httpd_thread.start()
|
||||
|
||||
def fin():
|
||||
httpd.shutdown()
|
||||
httpd.server_close()
|
||||
httpd_thread.join()
|
||||
|
||||
request.addfinalizer(fin)
|
||||
|
||||
return httpd
|
||||
|
||||
|
||||
def test_robots(httpd):
|
||||
'''
|
||||
"""
|
||||
Basic test of robots.txt user-agent substring matching.
|
||||
'''
|
||||
url = 'http://localhost:%s/' % httpd.server_port
|
||||
site = brozzler.Site(None, {'seed':url,'user_agent':'im/a/GoOdbot/yep'})
|
||||
"""
|
||||
url = "http://localhost:%s/" % httpd.server_port
|
||||
site = brozzler.Site(None, {"seed": url, "user_agent": "im/a/GoOdbot/yep"})
|
||||
assert brozzler.is_permitted_by_robots(site, url)
|
||||
|
||||
site = brozzler.Site(None, {'seed':url,'user_agent':'im/a bAdBOt/uh huh'})
|
||||
site = brozzler.Site(None, {"seed": url, "user_agent": "im/a bAdBOt/uh huh"})
|
||||
assert not brozzler.is_permitted_by_robots(site, url)
|
||||
|
||||
|
||||
def test_robots_http_statuses():
|
||||
for status in (
|
||||
200, 204, 400, 401, 402, 403, 404, 405,
|
||||
500, 501, 502, 503, 504, 505):
|
||||
200,
|
||||
204,
|
||||
400,
|
||||
401,
|
||||
402,
|
||||
403,
|
||||
404,
|
||||
405,
|
||||
500,
|
||||
501,
|
||||
502,
|
||||
503,
|
||||
504,
|
||||
505,
|
||||
):
|
||||
|
||||
class Handler(http.server.BaseHTTPRequestHandler):
|
||||
def do_GET(self):
|
||||
response = (('HTTP/1.1 %s Meaningless message\r\n'
|
||||
+ 'Content-length: 0\r\n'
|
||||
+ '\r\n') % status).encode('utf-8')
|
||||
response = (
|
||||
(
|
||||
"HTTP/1.1 %s Meaningless message\r\n"
|
||||
+ "Content-length: 0\r\n"
|
||||
+ "\r\n"
|
||||
)
|
||||
% status
|
||||
).encode("utf-8")
|
||||
self.connection.sendall(response)
|
||||
# self.send_response(status)
|
||||
# self.end_headers()
|
||||
httpd = http.server.HTTPServer(('localhost', 0), Handler)
|
||||
httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
|
||||
|
||||
httpd = http.server.HTTPServer(("localhost", 0), Handler)
|
||||
httpd_thread = threading.Thread(name="httpd", target=httpd.serve_forever)
|
||||
httpd_thread.start()
|
||||
|
||||
try:
|
||||
url = 'http://localhost:%s/' % httpd.server_port
|
||||
site = brozzler.Site(None, {'seed': url})
|
||||
url = "http://localhost:%s/" % httpd.server_port
|
||||
site = brozzler.Site(None, {"seed": url})
|
||||
assert brozzler.is_permitted_by_robots(site, url)
|
||||
finally:
|
||||
httpd.shutdown()
|
||||
httpd.server_close()
|
||||
httpd_thread.join()
|
||||
|
||||
|
||||
def test_robots_empty_response():
|
||||
class Handler(http.server.BaseHTTPRequestHandler):
|
||||
def do_GET(self):
|
||||
self.connection.shutdown(socket.SHUT_RDWR)
|
||||
self.connection.close()
|
||||
httpd = http.server.HTTPServer(('localhost', 0), Handler)
|
||||
httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
|
||||
|
||||
httpd = http.server.HTTPServer(("localhost", 0), Handler)
|
||||
httpd_thread = threading.Thread(name="httpd", target=httpd.serve_forever)
|
||||
httpd_thread.start()
|
||||
|
||||
try:
|
||||
url = 'http://localhost:%s/' % httpd.server_port
|
||||
site = brozzler.Site(None, {'seed': url})
|
||||
url = "http://localhost:%s/" % httpd.server_port
|
||||
site = brozzler.Site(None, {"seed": url})
|
||||
assert brozzler.is_permitted_by_robots(site, url)
|
||||
finally:
|
||||
httpd.shutdown()
|
||||
httpd.server_close()
|
||||
httpd_thread.join()
|
||||
|
||||
|
||||
def test_robots_socket_timeout():
|
||||
stop_hanging = threading.Event()
|
||||
|
||||
class Handler(http.server.BaseHTTPRequestHandler):
|
||||
def do_GET(self):
|
||||
stop_hanging.wait(60)
|
||||
self.connection.sendall(
|
||||
b'HTTP/1.1 200 OK\r\nContent-length: 0\r\n\r\n')
|
||||
self.connection.sendall(b"HTTP/1.1 200 OK\r\nContent-length: 0\r\n\r\n")
|
||||
|
||||
orig_timeout = brozzler.robots._SessionRaiseOn420.timeout
|
||||
|
||||
httpd = http.server.HTTPServer(('localhost', 0), Handler)
|
||||
httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
|
||||
httpd = http.server.HTTPServer(("localhost", 0), Handler)
|
||||
httpd_thread = threading.Thread(name="httpd", target=httpd.serve_forever)
|
||||
httpd_thread.start()
|
||||
|
||||
try:
|
||||
url = 'http://localhost:%s/' % httpd.server_port
|
||||
site = brozzler.Site(None, {'seed': url})
|
||||
url = "http://localhost:%s/" % httpd.server_port
|
||||
site = brozzler.Site(None, {"seed": url})
|
||||
brozzler.robots._SessionRaiseOn420.timeout = 2
|
||||
assert brozzler.is_permitted_by_robots(site, url)
|
||||
finally:
|
||||
|
@ -139,20 +171,24 @@ def test_robots_socket_timeout():
|
|||
httpd.server_close()
|
||||
httpd_thread.join()
|
||||
|
||||
|
||||
def test_robots_dns_failure():
|
||||
# .invalid. is guaranteed nonexistent per rfc 6761
|
||||
url = 'http://whatever.invalid./'
|
||||
site = brozzler.Site(None, {'seed': url})
|
||||
url = "http://whatever.invalid./"
|
||||
site = brozzler.Site(None, {"seed": url})
|
||||
assert brozzler.is_permitted_by_robots(site, url)
|
||||
|
||||
|
||||
def test_robots_connection_failure():
|
||||
# nothing listens on localhost port 4, so the connection attempt fails
|
||||
url = 'http://localhost:4/' # nobody listens on port 4
|
||||
site = brozzler.Site(None, {'seed': url})
|
||||
url = "http://localhost:4/" # nobody listens on port 4
|
||||
site = brozzler.Site(None, {"seed": url})
|
||||
assert brozzler.is_permitted_by_robots(site, url)
|
||||
|
||||
|
||||
def test_scoping():
|
||||
test_scope = yaml.safe_load('''
|
||||
test_scope = yaml.safe_load(
|
||||
"""
|
||||
max_hops: 100
|
||||
accepts:
|
||||
- url_match: REGEX_MATCH
|
||||
|
@ -169,40 +205,73 @@ blocks:
|
|||
- domain: twitter.com
|
||||
url_match: REGEX_MATCH
|
||||
value: ^.*lang=(?!en).*$
|
||||
''')
|
||||
"""
|
||||
)
|
||||
|
||||
site = brozzler.Site(None, {
|
||||
'id': 1, 'seed': 'http://example.com/foo/bar?baz=quux#monkey',
|
||||
'scope': test_scope})
|
||||
page = brozzler.Page(None, {
|
||||
'url': 'http://example.com/foo/bar?baz=quux#monkey',
|
||||
'site_id': site.id})
|
||||
site = brozzler.Site(
|
||||
None,
|
||||
{
|
||||
"id": 1,
|
||||
"seed": "http://example.com/foo/bar?baz=quux#monkey",
|
||||
"scope": test_scope,
|
||||
},
|
||||
)
|
||||
page = brozzler.Page(
|
||||
None, {"url": "http://example.com/foo/bar?baz=quux#monkey", "site_id": site.id}
|
||||
)
|
||||
|
||||
assert site.accept_reject_or_neither('http://example.com/foo/bar', page) is True
|
||||
assert site.accept_reject_or_neither('http://example.com/foo/baz', page) is None
|
||||
assert site.accept_reject_or_neither("http://example.com/foo/bar", page) is True
|
||||
assert site.accept_reject_or_neither("http://example.com/foo/baz", page) is None
|
||||
|
||||
assert site.accept_reject_or_neither('http://foo.com/some.mp3', page) is None
|
||||
assert site.accept_reject_or_neither('http://foo.com/blah/audio_file/some.mp3', page) is True
|
||||
assert site.accept_reject_or_neither("http://foo.com/some.mp3", page) is None
|
||||
assert (
|
||||
site.accept_reject_or_neither("http://foo.com/blah/audio_file/some.mp3", page)
|
||||
is True
|
||||
)
|
||||
|
||||
assert site.accept_reject_or_neither('http://a.b.vimeocdn.com/blahblah', page) is True
|
||||
assert site.accept_reject_or_neither('https://a.b.vimeocdn.com/blahblah', page) is None
|
||||
assert (
|
||||
site.accept_reject_or_neither("http://a.b.vimeocdn.com/blahblah", page) is True
|
||||
)
|
||||
assert (
|
||||
site.accept_reject_or_neither("https://a.b.vimeocdn.com/blahblah", page) is None
|
||||
)
|
||||
|
||||
assert site.accept_reject_or_neither('https://twitter.com/twit', page) is True
|
||||
assert site.accept_reject_or_neither('https://twitter.com/twit?lang=en', page) is True
|
||||
assert site.accept_reject_or_neither('https://twitter.com/twit?lang=es', page) is False
|
||||
assert site.accept_reject_or_neither("https://twitter.com/twit", page) is True
|
||||
assert (
|
||||
site.accept_reject_or_neither("https://twitter.com/twit?lang=en", page) is True
|
||||
)
|
||||
assert (
|
||||
site.accept_reject_or_neither("https://twitter.com/twit?lang=es", page) is False
|
||||
)
|
||||
|
||||
assert site.accept_reject_or_neither('https://www.facebook.com/whatevz', page) is True
|
||||
assert (
|
||||
site.accept_reject_or_neither("https://www.facebook.com/whatevz", page) is True
|
||||
)
|
||||
|
||||
assert (
|
||||
site.accept_reject_or_neither(
|
||||
"https://www.youtube.com/watch?v=dUIn5OAPS5s", page
|
||||
)
|
||||
is None
|
||||
)
|
||||
yt_user_page = brozzler.Page(
|
||||
None,
|
||||
{
|
||||
"url": "https://www.youtube.com/user/SonoraSantaneraVEVO",
|
||||
"site_id": site.id,
|
||||
"hops_from_seed": 10,
|
||||
},
|
||||
)
|
||||
assert (
|
||||
site.accept_reject_or_neither(
|
||||
"https://www.youtube.com/watch?v=dUIn5OAPS5s", yt_user_page
|
||||
)
|
||||
is True
|
||||
)
|
||||
|
||||
assert site.accept_reject_or_neither(
|
||||
'https://www.youtube.com/watch?v=dUIn5OAPS5s', page) is None
|
||||
yt_user_page = brozzler.Page(None, {
|
||||
'url': 'https://www.youtube.com/user/SonoraSantaneraVEVO',
|
||||
'site_id': site.id, 'hops_from_seed': 10})
|
||||
assert site.accept_reject_or_neither(
|
||||
'https://www.youtube.com/watch?v=dUIn5OAPS5s', yt_user_page) is True
|
||||
|
||||
def test_proxy_down():
|
||||
'''
|
||||
"""
|
||||
Test all fetching scenarios raise `brozzler.ProxyError` when proxy is down.
|
||||
|
||||
This test needs to cover every possible fetch through the proxy other than
|
||||
|
@ -211,24 +280,24 @@ def test_proxy_down():
|
|||
Tests two different kinds of connection error:
|
||||
- nothing listening on the port (nobody listens on port 4 :))
|
||||
- port bound but not accepting connections
|
||||
'''
|
||||
"""
|
||||
sock = socket.socket()
|
||||
sock.bind(('127.0.0.1', 0))
|
||||
for not_listening_proxy in (
|
||||
'127.0.0.1:4', '127.0.0.1:%s' % sock.getsockname()[1]):
|
||||
worker = brozzler.BrozzlerWorker(
|
||||
frontier=None, proxy=not_listening_proxy)
|
||||
site = brozzler.Site(None, {
|
||||
'id': str(uuid.uuid4()), 'seed': 'http://example.com/'})
|
||||
page = brozzler.Page(None, {'url': 'http://example.com/'})
|
||||
sock.bind(("127.0.0.1", 0))
|
||||
for not_listening_proxy in ("127.0.0.1:4", "127.0.0.1:%s" % sock.getsockname()[1]):
|
||||
worker = brozzler.BrozzlerWorker(frontier=None, proxy=not_listening_proxy)
|
||||
site = brozzler.Site(
|
||||
None, {"id": str(uuid.uuid4()), "seed": "http://example.com/"}
|
||||
)
|
||||
page = brozzler.Page(None, {"url": "http://example.com/"})
|
||||
|
||||
# robots.txt fetch
|
||||
with pytest.raises(brozzler.ProxyError):
|
||||
brozzler.is_permitted_by_robots(
|
||||
site, 'http://example.com/', proxy=not_listening_proxy)
|
||||
site, "http://example.com/", proxy=not_listening_proxy
|
||||
)
|
||||
|
||||
# youtube-dl fetch
|
||||
with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
|
||||
with tempfile.TemporaryDirectory(prefix="brzl-ydl-") as tempdir:
|
||||
with pytest.raises(brozzler.ProxyError):
|
||||
brozzler.ydl.do_youtube_dl(worker, site, page)
|
||||
|
||||
|
@ -239,47 +308,58 @@ def test_proxy_down():
|
|||
# WARCPROX_WRITE_RECORD
|
||||
with pytest.raises(brozzler.ProxyError):
|
||||
worker._warcprox_write_record(
|
||||
warcprox_address=not_listening_proxy,
|
||||
url='test://proxy_down/warcprox_write_record',
|
||||
warc_type='metadata',
|
||||
content_type='text/plain',
|
||||
payload=b'''payload doesn't matter here''')
|
||||
warcprox_address=not_listening_proxy,
|
||||
url="test://proxy_down/warcprox_write_record",
|
||||
warc_type="metadata",
|
||||
content_type="text/plain",
|
||||
payload=b"""payload doesn't matter here""",
|
||||
)
|
||||
|
||||
|
||||
def test_start_stop_backwards_compat():
|
||||
site = brozzler.Site(None, {'seed': 'http://example.com/'})
|
||||
site = brozzler.Site(None, {"seed": "http://example.com/"})
|
||||
assert len(site.starts_and_stops) == 1
|
||||
assert site.starts_and_stops[0]['start']
|
||||
assert site.starts_and_stops[0]['stop'] is None
|
||||
assert not 'start_time' in site
|
||||
assert site.starts_and_stops[0]["start"]
|
||||
assert site.starts_and_stops[0]["stop"] is None
|
||||
assert not "start_time" in site
|
||||
|
||||
site = brozzler.Site(None, {
|
||||
'seed': 'http://example.com/',
|
||||
'start_time': datetime.datetime(2017,1,1)})
|
||||
site = brozzler.Site(
|
||||
None,
|
||||
{"seed": "http://example.com/", "start_time": datetime.datetime(2017, 1, 1)},
|
||||
)
|
||||
assert len(site.starts_and_stops) == 1
|
||||
assert site.starts_and_stops[0]['start'] == datetime.datetime(2017, 1, 1)
|
||||
assert site.starts_and_stops[0]['stop'] is None
|
||||
assert not 'start_time' in site
|
||||
assert site.starts_and_stops[0]["start"] == datetime.datetime(2017, 1, 1)
|
||||
assert site.starts_and_stops[0]["stop"] is None
|
||||
assert not "start_time" in site
|
||||
|
||||
job = brozzler.Job(None, {'seeds': [{'url':'https://example.com/'}]})
|
||||
assert job.starts_and_stops[0]['start']
|
||||
assert job.starts_and_stops[0]['stop'] is None
|
||||
assert not 'started' in job
|
||||
assert not 'finished' in job
|
||||
job = brozzler.Job(None, {"seeds": [{"url": "https://example.com/"}]})
|
||||
assert job.starts_and_stops[0]["start"]
|
||||
assert job.starts_and_stops[0]["stop"] is None
|
||||
assert not "started" in job
|
||||
assert not "finished" in job
|
||||
|
||||
job = brozzler.Job(
|
||||
None,
|
||||
{
|
||||
"seeds": [{"url": "https://example.com/"}],
|
||||
"started": datetime.datetime(2017, 1, 1),
|
||||
"finished": datetime.datetime(2017, 1, 2),
|
||||
},
|
||||
)
|
||||
assert job.starts_and_stops[0]["start"] == datetime.datetime(2017, 1, 1)
|
||||
assert job.starts_and_stops[0]["stop"] == datetime.datetime(2017, 1, 2)
|
||||
assert not "started" in job
|
||||
assert not "finished" in job
|
||||
|
||||
job = brozzler.Job(None, {
|
||||
'seeds': [{'url':'https://example.com/'}],
|
||||
'started': datetime.datetime(2017, 1, 1),
|
||||
'finished': datetime.datetime(2017, 1, 2)})
|
||||
assert job.starts_and_stops[0]['start'] == datetime.datetime(2017, 1, 1)
|
||||
assert job.starts_and_stops[0]['stop'] == datetime.datetime(2017, 1, 2)
|
||||
assert not 'started' in job
|
||||
assert not 'finished' in job
|
||||
|
||||
class Exception1(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class Exception2(Exception):
|
||||
pass
|
||||
|
||||
|
||||
def test_thread_raise_not_accept():
|
||||
def never_accept():
|
||||
try:
|
||||
|
@ -297,6 +377,7 @@ def test_thread_raise_not_accept():
|
|||
th.join()
|
||||
assert thread_caught_exception is None
|
||||
|
||||
|
||||
def test_thread_raise_immediate():
|
||||
def accept_immediately():
|
||||
try:
|
||||
|
@ -317,13 +398,17 @@ def test_thread_raise_immediate():
|
|||
assert isinstance(thread_caught_exception, Exception1)
|
||||
assert time.time() - start < 1.0
|
||||
|
||||
|
||||
def test_thread_raise_safe_exit():
|
||||
def delay_context_exit():
|
||||
gate = brozzler.thread_accept_exceptions()
|
||||
orig_exit = type(gate).__exit__
|
||||
try:
|
||||
type(gate).__exit__ = lambda self, et, ev, t: (
|
||||
brozzler.sleep(2), orig_exit(self, et, ev, t), False)[-1]
|
||||
brozzler.sleep(2),
|
||||
orig_exit(self, et, ev, t),
|
||||
False,
|
||||
)[-1]
|
||||
with brozzler.thread_accept_exceptions() as gate:
|
||||
brozzler.sleep(2)
|
||||
except Exception as e:
|
||||
|
@ -345,6 +430,7 @@ def test_thread_raise_safe_exit():
|
|||
assert thread_caught_exception
|
||||
assert isinstance(thread_caught_exception, Exception1)
|
||||
|
||||
|
||||
def test_thread_raise_pending_exception():
|
||||
def accept_eventually():
|
||||
try:
|
||||
|
@ -365,16 +451,17 @@ def test_thread_raise_pending_exception():
|
|||
assert isinstance(thread_caught_exception, Exception1)
|
||||
assert time.time() - start > 1.0
|
||||
|
||||
|
||||
def test_thread_raise_second_with_block():
|
||||
def two_with_blocks():
|
||||
try:
|
||||
with brozzler.thread_accept_exceptions():
|
||||
time.sleep(2)
|
||||
return # test fails
|
||||
return # test fails
|
||||
except Exception1 as e:
|
||||
pass
|
||||
except:
|
||||
return # fail test
|
||||
return # fail test
|
||||
|
||||
try:
|
||||
with brozzler.thread_accept_exceptions():
|
||||
|
@ -393,52 +480,79 @@ def test_thread_raise_second_with_block():
|
|||
th.join()
|
||||
assert isinstance(thread_caught_exception, Exception2)
|
||||
|
||||
|
||||
def test_needs_browsing():
|
||||
# only one test case here right now, which exposed a bug
|
||||
|
||||
class ConvenientHeaders(http.client.HTTPMessage):
|
||||
def __init__(self, headers):
|
||||
http.client.HTTPMessage.__init__(self)
|
||||
for (k, v) in headers.items():
|
||||
for k, v in headers.items():
|
||||
self.add_header(k, v)
|
||||
|
||||
page = brozzler.Page(None, {
|
||||
'url':'http://example.com/a'})
|
||||
page = brozzler.Page(None, {"url": "http://example.com/a"})
|
||||
|
||||
spy = brozzler.ydl.YoutubeDLSpy()
|
||||
spy.fetches.append({
|
||||
'url': 'http://example.com/a',
|
||||
'method': 'HEAD',
|
||||
'response_code': 301,
|
||||
'response_headers': ConvenientHeaders({'Location': '/b'})})
|
||||
spy.fetches.append({
|
||||
'url': 'http://example.com/b',
|
||||
'method': 'GET',
|
||||
'response_code': 200,
|
||||
'response_headers': ConvenientHeaders({
|
||||
'Content-Type': 'application/pdf'})})
|
||||
spy.fetches.append(
|
||||
{
|
||||
"url": "http://example.com/a",
|
||||
"method": "HEAD",
|
||||
"response_code": 301,
|
||||
"response_headers": ConvenientHeaders({"Location": "/b"}),
|
||||
}
|
||||
)
|
||||
spy.fetches.append(
|
||||
{
|
||||
"url": "http://example.com/b",
|
||||
"method": "GET",
|
||||
"response_code": 200,
|
||||
"response_headers": ConvenientHeaders({"Content-Type": "application/pdf"}),
|
||||
}
|
||||
)
|
||||
|
||||
assert not brozzler.worker.BrozzlerWorker._needs_browsing(None, page, spy.fetches)
|
||||
|
||||
assert not brozzler.worker.BrozzlerWorker._needs_browsing(
|
||||
None, page, spy.fetches)
|
||||
|
||||
def test_seed_redirect():
|
||||
site = brozzler.Site(None, {'seed': 'http://foo.com/'})
|
||||
site.note_seed_redirect('https://foo.com/a/b/c')
|
||||
assert site.scope == {'accepts': [
|
||||
{'ssurt': 'com,foo,//http:/',},
|
||||
{'ssurt': 'com,foo,//https:/',}]}
|
||||
site = brozzler.Site(None, {"seed": "http://foo.com/"})
|
||||
site.note_seed_redirect("https://foo.com/a/b/c")
|
||||
assert site.scope == {
|
||||
"accepts": [
|
||||
{
|
||||
"ssurt": "com,foo,//http:/",
|
||||
},
|
||||
{
|
||||
"ssurt": "com,foo,//https:/",
|
||||
},
|
||||
]
|
||||
}
|
||||
|
||||
site = brozzler.Site(None, {'seed': 'https://foo.com/'})
|
||||
site.note_seed_redirect('http://foo.com/a/b/c')
|
||||
assert site.scope == {'accepts': [
|
||||
{'ssurt': 'com,foo,//https:/',},
|
||||
{'ssurt': 'com,foo,//http:/',}]}
|
||||
site = brozzler.Site(None, {"seed": "https://foo.com/"})
|
||||
site.note_seed_redirect("http://foo.com/a/b/c")
|
||||
assert site.scope == {
|
||||
"accepts": [
|
||||
{
|
||||
"ssurt": "com,foo,//https:/",
|
||||
},
|
||||
{
|
||||
"ssurt": "com,foo,//http:/",
|
||||
},
|
||||
]
|
||||
}
|
||||
|
||||
site = brozzler.Site(None, {"seed": "http://foo.com/"})
|
||||
site.note_seed_redirect("https://bar.com/a/b/c")
|
||||
assert site.scope == {
|
||||
"accepts": [
|
||||
{
|
||||
"ssurt": "com,foo,//http:/",
|
||||
},
|
||||
{
|
||||
"ssurt": "com,bar,//https:/a/b/c",
|
||||
},
|
||||
]
|
||||
}
|
||||
|
||||
site = brozzler.Site(None, {'seed': 'http://foo.com/'})
|
||||
site.note_seed_redirect('https://bar.com/a/b/c')
|
||||
assert site.scope == {'accepts': [
|
||||
{'ssurt': 'com,foo,//http:/',},
|
||||
{'ssurt': 'com,bar,//https:/a/b/c',}]}
|
||||
|
||||
def test_limit_failures():
|
||||
page = mock.Mock()
|
||||
|
@ -446,9 +560,9 @@ def test_limit_failures():
|
|||
page.brozzle_count = 0
|
||||
|
||||
site = mock.Mock()
|
||||
site.status = 'ACTIVE'
|
||||
site.status = "ACTIVE"
|
||||
site.active_brozzling_time = 0
|
||||
site.starts_and_stops = [{'start':datetime.datetime.utcnow()}]
|
||||
site.starts_and_stops = [{"start": datetime.datetime.utcnow()}]
|
||||
|
||||
rr = mock.Mock()
|
||||
rr.servers = [mock.Mock()]
|
||||
|
@ -456,11 +570,12 @@ def test_limit_failures():
|
|||
rr.db_list = mock.Mock(return_value=rethink_query)
|
||||
rr.table_list = mock.Mock(return_value=rethink_query)
|
||||
rr.table = mock.Mock(
|
||||
return_value=mock.Mock(
|
||||
between=mock.Mock(
|
||||
return_value=mock.Mock(
|
||||
limit=mock.Mock(
|
||||
return_value=rethink_query)))))
|
||||
return_value=mock.Mock(
|
||||
between=mock.Mock(
|
||||
return_value=mock.Mock(limit=mock.Mock(return_value=rethink_query))
|
||||
)
|
||||
)
|
||||
)
|
||||
assert rr.table().between().limit().run() == []
|
||||
frontier = brozzler.RethinkDbFrontier(rr)
|
||||
frontier.enforce_time_limit = mock.Mock()
|
||||
|
@ -475,20 +590,19 @@ def test_limit_failures():
|
|||
|
||||
assert page.failed_attempts is None
|
||||
assert page.brozzle_count == 0
|
||||
assert site.status == 'ACTIVE'
|
||||
assert site.status == "ACTIVE"
|
||||
|
||||
worker.brozzle_site(browser, site)
|
||||
assert page.failed_attempts == 1
|
||||
assert page.brozzle_count == 0
|
||||
assert site.status == 'ACTIVE'
|
||||
assert site.status == "ACTIVE"
|
||||
|
||||
worker.brozzle_site(browser, site)
|
||||
assert page.failed_attempts == 2
|
||||
assert page.brozzle_count == 0
|
||||
assert site.status == 'ACTIVE'
|
||||
assert site.status == "ACTIVE"
|
||||
|
||||
worker.brozzle_site(browser, site)
|
||||
assert page.failed_attempts == 3
|
||||
assert page.brozzle_count == 1
|
||||
assert site.status == 'FINISHED'
|
||||
|
||||
assert site.status == "FINISHED"
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue