#!/usr/bin/env python
"""
test_brozzling.py - tests that drive a real browser against a local test http
server to exercise brozzler's browsing behavior

Copyright (C) 2016-2018 Internet Archive

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import pytest
import brozzler
import brozzler.cli
import logging
import os
import http.server
import threading
import argparse
import urllib.request
import urllib.error
import json
import socket

args = argparse.Namespace()
args.log_level = logging.INFO
brozzler.cli.configure_logging(args)
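

# Warcprox-Meta value that the test http server sends back with its 420
# "Reached limit" response; test_420 expects brozzler to expose this dict
# verbatim on the ReachedLimit exception it raises.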
WARCPROX_META_420 = {
    "stats": {
        "test_limits_bucket": {
            "total": {"urls": 0, "wire_bytes": 0},
            "new": {"urls": 0, "wire_bytes": 0},
            "revisit": {"urls": 0, "wire_bytes": 0},
            "bucket": "test_limits_bucket",
        }
    },
    "reached-limit": {"test_limits_bucket/total/urls": 0},
}
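

# Module-scoped test http server. It serves the static files under
# tests/htdocs and also fakes a few special endpoints: /420 mimics warcprox's
# "reached limit" response (with a Warcprox-Meta header), /401 demands basic
# auth, and /login-action accepts the login form POSTs used by test_try_login.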
@pytest.fixture(scope="module")
def httpd(request):
    class RequestHandler(http.server.SimpleHTTPRequestHandler):
        def __init__(self, *args, **kwargs):
            self.extensions_map[".mpd"] = "video/vnd.mpeg.dash.mpd"
            http.server.SimpleHTTPRequestHandler.__init__(self, *args, **kwargs)

        def do_GET(self):
            if self.path == "/420":
                self.send_response(420, "Reached limit")
                self.send_header("Connection", "close")
                self.send_header("Warcprox-Meta", json.dumps(WARCPROX_META_420))
                payload = b"request rejected by warcprox: reached limit test_limits_bucket/total/urls=0\n"
                self.send_header("Content-Type", "text/plain;charset=utf-8")
                self.send_header("Content-Length", len(payload))
                self.end_headers()
                self.wfile.write(payload)
            elif self.path == "/401":
                self.send_response(401)
                self.send_header("WWW-Authenticate", 'Basic realm="Test"')
                self.send_header("Content-type", "text/html")
                self.end_headers()
                auth = self.headers.get("Authorization", "")
                self.wfile.write(auth.encode("utf-8"))
                self.wfile.write(b"not authenticated")
            else:
                super().do_GET()

        def do_POST(self):
            if self.path == "/login-action":
                self.send_response(200)
                payload = b"login successful\n"
                self.send_header("Content-Type", "text/plain;charset=utf-8")
                self.send_header("Content-Length", len(payload))
                self.end_headers()
                self.wfile.write(payload)
            else:
                super().do_POST()

    # SimpleHTTPRequestHandler always uses CWD so we have to chdir
    os.chdir(os.path.join(os.path.dirname(__file__), "htdocs"))

    httpd = http.server.HTTPServer(("localhost", 0), RequestHandler)
    httpd_thread = threading.Thread(name="httpd", target=httpd.serve_forever)
    httpd_thread.start()

    def fin():
        httpd.shutdown()
        httpd.server_close()
        httpd_thread.join()

    request.addfinalizer(fin)

    return httpd


def test_httpd(httpd):
    """
    Tests that our http server is working as expected, and that two fetches
    of the same url return the same payload, proving it can be used to test
    deduplication.
    """
    payload1 = payload2 = None
    url = "http://localhost:%s/site1/file1.txt" % httpd.server_port
    with urllib.request.urlopen(url) as response:
        assert response.status == 200
        payload1 = response.read()
        assert payload1

    with urllib.request.urlopen(url) as response:
        assert response.status == 200
        payload2 = response.read()
        assert payload2

    assert payload1 == payload2

    url = "http://localhost:%s/420" % httpd.server_port
    with pytest.raises(urllib.error.HTTPError) as excinfo:
        urllib.request.urlopen(url)
    assert excinfo.value.getcode() == 420
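

# chrome://crash deliberately crashes the browser tab; brozzler should notice
# the crash and raise BrowsingException rather than hanging.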
def test_aw_snap_hes_dead_jim():
    chrome_exe = brozzler.suggest_default_chrome_exe()
    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        with pytest.raises(brozzler.BrowsingException):
            browser.browse_page("chrome://crash")


# chromium's 401 handling changed???
@pytest.mark.xfail
def test_page_interstitial_exception(httpd):
    chrome_exe = brozzler.suggest_default_chrome_exe()
    url = "http://localhost:%s/401" % httpd.server_port
    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        with pytest.raises(brozzler.PageInterstitialShown):
            browser.browse_page(url)
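

# browse_page's on_response callback is invoked for each network response the
# browser reports, so collecting the urls lets the test assert exactly which
# resources were fetched and in what order.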
def test_on_response(httpd):
    response_urls = []

    def on_response(msg):
        response_urls.append(msg["params"]["response"]["url"])

    chrome_exe = brozzler.suggest_default_chrome_exe()
    url = "http://localhost:%s/site3/page.html" % httpd.server_port
    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        browser.browse_page(url, on_response=on_response)
    assert response_urls[0] == "http://localhost:%s/site3/page.html" % httpd.server_port
    assert (
        response_urls[1] == "http://localhost:%s/site3/brozzler.svg" % httpd.server_port
    )
    assert response_urls[2] == "http://localhost:%s/favicon.ico" % httpd.server_port


def test_420(httpd):
    chrome_exe = brozzler.suggest_default_chrome_exe()
    url = "http://localhost:%s/420" % httpd.server_port
    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        with pytest.raises(brozzler.ReachedLimit) as excinfo:
            browser.browse_page(url)

    assert excinfo.value.warcprox_meta == WARCPROX_META_420


def test_js_dialogs(httpd):
    chrome_exe = brozzler.suggest_default_chrome_exe()
    url = "http://localhost:%s/site4/alert.html" % httpd.server_port
    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        # before commit d2ed6b97a24 these would hang and eventually raise
        # brozzler.browser.BrowsingTimeout, which would cause this test to fail
        browser.browse_page(url)
        browser.browse_page(
            "http://localhost:%s/site4/confirm.html" % httpd.server_port
        )
        browser.browse_page(
            "http://localhost:%s/site4/prompt.html" % httpd.server_port
        )
        # XXX print dialog unresolved
        # browser.browse_page(
        #     'http://localhost:%s/site4/print.html' % httpd.server_port)


def test_page_videos(httpd):
    # test depends on behavior of youtube-dl and chromium, could fail and need
    # to be adjusted on youtube-dl or chromium updates
    chrome_exe = brozzler.suggest_default_chrome_exe()
    worker = brozzler.BrozzlerWorker(None)
    site = brozzler.Site(None, {})
    page = brozzler.Page(
        None, {"url": "http://localhost:%s/site6/" % httpd.server_port}
    )
    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        worker.brozzle_page(browser, site, page)
    assert page.videos
    assert len(page.videos) == 4
    assert page.videos[0] == {
        "blame": "youtube-dl",
        "response_code": 200,
        "content-length": 383631,
        "content-type": "video/mp4",
        "url": "http://localhost:%s/site6/small.mp4" % httpd.server_port,
    }
    assert page.videos[1] == {
        "blame": "youtube-dl",
        "content-length": 92728,
        "content-type": "video/webm",
        "response_code": 200,
        "url": "http://localhost:%s/site6/small-video_280x160_100k.webm"
        % httpd.server_port,
    }
    assert page.videos[2] == {
        "blame": "youtube-dl",
        "content-length": 101114,
        "content-type": "video/webm",
        "response_code": 200,
        "url": "http://localhost:%s/site6/small-audio.webm" % httpd.server_port,
    }
    assert page.videos[3] == {
        "blame": "browser",
        # 'response_code': 206,
        # 'content-range': 'bytes 0-229454/229455',
        "response_code": 200,
        "content-length": 229455,
        "content-type": "video/webm",
        "url": "http://localhost:%s/site6/small.webm" % httpd.server_port,
    }
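

# brozzle_page returns the outlinks extracted from the page as a set; the
# expected set includes an offsite link and urls that differ only by fragment.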
def test_extract_outlinks(httpd):
    chrome_exe = brozzler.suggest_default_chrome_exe()
    worker = brozzler.BrozzlerWorker(None)
    site = brozzler.Site(None, {})
    page = brozzler.Page(
        None, {"url": "http://localhost:%s/site8/" % httpd.server_port}
    )
    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        outlinks = worker.brozzle_page(browser, site, page)
    assert outlinks == {
        "http://example.com/offsite",
        "http://localhost:%s/site8/baz/zuh" % httpd.server_port,
        "http://localhost:%s/site8/fdjisapofdjisap#1" % httpd.server_port,
        "http://localhost:%s/site8/fdjisapofdjisap#2" % httpd.server_port,
    }


def test_proxy_down():
    """
    Test that browsing raises `brozzler.ProxyError` when the proxy is down.

    See also `test_proxy_down` in test_units.py.

    Tests two different kinds of connection error:
    - nothing listening on the port (nobody listens on port 4 :))
    - port bound but not accepting connections
    """
    sock = socket.socket()
    sock.bind(("127.0.0.1", 0))
    for not_listening_proxy in ("127.0.0.1:4", "127.0.0.1:%s" % sock.getsockname()[1]):
        site = brozzler.Site(None, {"seed": "http://example.com/"})
        page = brozzler.Page(None, {"url": "http://example.com/"})
        worker = brozzler.BrozzlerWorker(frontier=None, proxy=not_listening_proxy)
        chrome_exe = brozzler.suggest_default_chrome_exe()
        with brozzler.Browser(chrome_exe=chrome_exe) as browser:
            with pytest.raises(brozzler.ProxyError):
                worker.brozzle_page(browser, site, page)
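

# try_login is exercised indirectly through browse_page: when credentials are
# supplied and the page has a recognizable login form, brozzler submits the
# form and then returns to the original page, so the response sequence
# observed via on_response is: form page, favicon, login action, form page.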
def test_try_login(httpd):
    """Test try_login behavior with and without credentials and login forms."""
    response_urls = []

    def on_response(msg):
        response_urls.append(msg["params"]["response"]["url"])

    chrome_exe = brozzler.suggest_default_chrome_exe()
    form_url = "http://localhost:%s/site11/form1.html" % httpd.server_port
    form_url_other = "http://localhost:%s/site11/form2.html" % httpd.server_port
    favicon_url = "http://localhost:%s/favicon.ico" % httpd.server_port
    login_url = "http://localhost:%s/login-action" % httpd.server_port

    # When username and password are defined and the initial page has a login
    # form, detect the login form, submit the login, and then return to the
    # initial page.
    username = "user1"
    password = "pass1"
    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        browser.browse_page(
            form_url, username=username, password=password, on_response=on_response
        )
    assert len(response_urls) == 4
    assert response_urls[0] == form_url
    assert response_urls[1] == favicon_url
    assert response_urls[2] == login_url
    assert response_urls[3] == form_url

    # We also support a second style of login form; test it the same way.
    response_urls = []
    username = "user1"
    password = "pass1"
    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        browser.browse_page(
            form_url_other,
            username=username,
            password=password,
            on_response=on_response,
        )
    assert len(response_urls) == 4
    assert response_urls[0] == form_url_other
    assert response_urls[1] == favicon_url
    assert response_urls[2] == login_url
    assert response_urls[3] == form_url_other

    # When username and password are not defined, just load the initial page.
    response_urls = []
    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        browser.browse_page(form_url, on_response=on_response)
    assert len(response_urls) == 2
    assert response_urls[0] == form_url
    assert response_urls[1] == favicon_url

    # When the page doesn't have a form with username/password, don't submit it.
    response_urls = []
    form_without_login_url = (
        "http://localhost:%s/site11/form-no-login.html" % httpd.server_port
    )
    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        browser.browse_page(
            form_without_login_url,
            username=username,
            password=password,
            on_response=on_response,
        )
    assert len(response_urls) == 2
    assert response_urls[0] == form_without_login_url
    assert response_urls[1] == favicon_url