#!/usr/bin/env python
"""
test_cluster.py - integration tests for a brozzler cluster, expects brozzler,
warcprox, pywb, rethinkdb and other dependencies to be running already

Copyright (C) 2016-2018 Internet Archive

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import pytest
import http.server
import threading
import urllib.request
import os
import socket
import doublethink
import time
import brozzler
import datetime
import requests
import subprocess
import logging
import sys
import warcprox

# https://stackoverflow.com/questions/166506/finding-local-ip-addresses-using-pythons-stdlib
def _local_address():
    import socket

    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    try:
        s.connect(("10.255.255.255", 1))  # ip doesn't need to be reachable
        return s.getsockname()[0]
    except:
        return "127.0.0.1"
    finally:
        s.close()


local_address = _local_address()
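# Note: the test http server below binds to this address rather than
# 127.0.0.1, presumably so that the brozzled browser and warcprox can reach
# it at a routable address; _local_address() falls back to the loopback
# address if no route can be determined.

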
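# start_service() and stop_service() control daemontools-supervised services
# ("svc -u" starts, "svc -d" stops); stop_service() polls "svstat" until the
# service actually reports itself down.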
def start_service(service):
    subprocess.check_call(["sudo", "svc", "-u", "/etc/service/" + service])


def stop_service(service):
    subprocess.check_call(["sudo", "svc", "-d", "/etc/service/" + service])
    while True:
        status = subprocess.check_output(["sudo", "svstat", "/etc/service/" + service])
        if b" down " in status:
            break
        time.sleep(0.5)


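# Module-scoped fixture: serves the files under the htdocs directory next to
# this file on an ephemeral port, with a few special cases wired in below:
# /site5/redirect/ and /site9/redirect.html respond with 303 redirects, and
# anything under /infinite/ returns a page linking to ever deeper
# subdirectories.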
@pytest.fixture(scope="module")
def httpd(request):
    class RequestHandler(http.server.SimpleHTTPRequestHandler):
        def do_POST(self):
            logging.info("\n%s\n%s", self.requestline, self.headers)
            self.do_GET()

        def do_GET(self):
            logging.info("\n%s\n%s", self.requestline, self.headers)
            if self.path == "/site5/redirect/":
                self.send_response(303, "See other")
                self.send_header("Connection", "close")
                self.send_header("Content-Length", 0)
                self.send_header("Location", "/site5/destination/")
                self.end_headers()
                self.wfile.write(b"")
            elif self.path == "/site9/redirect.html":
                self.send_response(303, "See other")
                self.send_header("Connection", "close")
                self.send_header("Content-Length", 0)
                self.send_header("Location", "/site9/destination.html")
                self.end_headers()
                self.wfile.write(b"")
            elif self.path.startswith("/infinite/"):
                payload = b"""
<html>
<head>
<title>infinite site</title>
</head>
<body>
<a href='a/'>a/</a> <a href='b/'>b/</a> <a href='c/'>c/</a>
<a href='d/'>d/</a> <a href='e/'>e/</a> <a href='f/'>f/</a>
<a href='g/'>g/</a> <a href='h/'>h/</a> <a href='i/'>i/</a>
</body>
</html>
"""
                self.send_response(200, "OK")
                self.send_header("Connection", "close")
                self.send_header("Content-Length", len(payload))
                self.end_headers()
                self.wfile.write(payload)
            else:
                super().do_GET()

    # SimpleHTTPRequestHandler always uses CWD so we have to chdir
    os.chdir(os.path.join(os.path.dirname(__file__), "htdocs"))

    httpd = http.server.HTTPServer((local_address, 0), RequestHandler)
    httpd_thread = threading.Thread(name="httpd", target=httpd.serve_forever)
    httpd_thread.start()

    def fin():
        httpd.shutdown()
        httpd.server_close()
        httpd_thread.join()

    request.addfinalizer(fin)

    return httpd


def make_url(httpd, rel_url):
    return "http://%s:%s%s" % (local_address, httpd.server_port, rel_url)
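# e.g. make_url(httpd, "/site1/") might return "http://10.0.2.15:46953/site1/"
# (illustrative values only: the host is this machine's local address and the
# port is whatever ephemeral port the fixture's HTTPServer happened to bind)

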
def test_httpd(httpd):
    """
    Tests that our http server is working as expected, and that two fetches
    of the same url return the same payload, proving it can be used to test
    deduplication.
    """
    payload1 = payload2 = None
    url = make_url(httpd, "/site1/file1.txt")
    with urllib.request.urlopen(url) as response:
        assert response.status == 200
        payload1 = response.read()
        assert payload1

    with urllib.request.urlopen(url) as response:
        assert response.status == 200
        payload2 = response.read()
        assert payload2

    assert payload1 == payload2


def test_services_up():
    """Check that the expected services are up and running."""
    # check that rethinkdb is listening and looks sane
    rr = doublethink.Rethinker(db="rethinkdb")  # built-in db
    tbls = rr.table_list().run()
    assert len(tbls) > 10

    # check that warcprox is listening
    with socket.socket() as s:
        # if the connect fails an exception is raised and the test fails
        s.connect(("localhost", 8000))

    # check that pywb is listening
    with socket.socket() as s:
        # if the connect fails an exception is raised and the test fails
        s.connect(("localhost", 8880))

    # check that brozzler dashboard is listening
    with socket.socket() as s:
        # if the connect fails an exception is raised and the test fails
        s.connect(("localhost", 8881))


def test_brozzle_site(httpd):
    test_id = "test_brozzle_site-%s" % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker("localhost", db="brozzler")
    site = brozzler.Site(
        rr,
        {
            "seed": make_url(httpd, "/site1/"),
            "warcprox_meta": {"captures-table-extra-fields": {"test_id": test_id}},
        },
    )

    # the two pages we expect to be crawled
    page1 = make_url(httpd, "/site1/")
    page2 = make_url(httpd, "/site1/file1.txt")
    robots = make_url(httpd, "/robots.txt")

    # so we can examine rethinkdb before it does anything
    try:
        stop_service("brozzler-worker")

        assert site.id is None
        frontier = brozzler.RethinkDbFrontier(rr)
        brozzler.new_site(frontier, site)
        assert site.id is not None
        assert len(list(frontier.site_pages(site.id))) == 1
    finally:
        start_service("brozzler-worker")

    # the site should be brozzled fairly quickly
    start = time.time()
    while site.status != "FINISHED" and time.time() - start < 300:
        time.sleep(0.5)
        site.refresh()
    assert site.status == "FINISHED"

    # check that we got the two pages we expected
    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 2
    assert {page.url for page in pages} == {
        make_url(httpd, "/site1/"),
        make_url(httpd, "/site1/file1.txt"),
    }

    time.sleep(2)  # in case warcprox hasn't finished processing urls
    # take a look at the captures table
    captures = rr.table("captures").filter({"test_id": test_id}).run()
    captures_by_url = {c["url"]: c for c in captures if c["http_method"] != "HEAD"}
    assert robots in captures_by_url
    assert page1 in captures_by_url
    assert page2 in captures_by_url
    assert "screenshot:%s" % page1 in captures_by_url
    assert "thumbnail:%s" % page1 in captures_by_url
    # no screenshots of plaintext

    # check pywb
    t14 = captures_by_url[page2]["timestamp"].strftime("%Y%m%d%H%M%S")
    wb_url = "http://localhost:8880/brozzler/%s/%s" % (t14, page2)
    expected_payload = open(
        os.path.join(os.path.dirname(__file__), "htdocs", "site1", "file1.txt"), "rb"
    ).read()
    assert requests.get(wb_url).content == expected_payload

    url = "screenshot:%s" % page1
    t14 = captures_by_url[url]["timestamp"].strftime("%Y%m%d%H%M%S")
    wb_url = "http://localhost:8880/brozzler/%s/%s" % (t14, url)
    response = requests.get(wb_url)
    assert response.status_code == 200
    assert response.headers["content-type"] == "image/jpeg"

    url = "thumbnail:%s" % page1
    t14 = captures_by_url[url]["timestamp"].strftime("%Y%m%d%H%M%S")
    wb_url = "http://localhost:8880/brozzler/%s/%s" % (t14, url)
    response = requests.get(wb_url)
    assert response.status_code == 200
    assert response.headers["content-type"] == "image/jpeg"


def test_proxy_warcprox(httpd):
    """Test --proxy with proxy that happens to be warcprox"""
    try:
        stop_service("brozzler-worker")
        _test_proxy_setting(
            httpd, proxy="localhost:8000", warcprox_auto=False, is_warcprox=True
        )
    finally:
        start_service("brozzler-worker")


def test_proxy_non_warcprox(httpd):
    """Test --proxy with proxy that happens not to be warcprox"""

    class DumbProxyRequestHandler(http.server.SimpleHTTPRequestHandler):
        def do_HEAD(self):
            if not hasattr(self.server, "requests"):
                self.server.requests = []
            logging.info("%s %s", self.command, self.path)
            self.server.requests.append("%s %s" % (self.command, self.path))
            response = urllib.request.urlopen(self.path)
            self.wfile.write(
                ("HTTP/1.0 %s %s\r\n" % (response.code, response.reason)).encode(
                    "ascii"
                )
            )
            for header in response.getheaders():
                self.wfile.write(
                    ("%s: %s\r\n" % (header[0], header[1])).encode("ascii")
                )
            self.wfile.write(b"\r\n")
            return response

        def do_GET(self):
            response = self.do_HEAD()
            self.copyfile(response, self.wfile)

        def do_WARCPROX_WRITE_RECORD(self):
            if not hasattr(self.server, "requests"):
                self.server.requests = []
            logging.info("%s %s", self.command, self.path)
            self.send_error(400)

    proxy = http.server.HTTPServer(("localhost", 0), DumbProxyRequestHandler)
    th = threading.Thread(name="dumb-proxy", target=proxy.serve_forever)
    th.start()

    try:
        stop_service("brozzler-worker")
        _test_proxy_setting(
            httpd,
            proxy="localhost:%s" % proxy.server_port,
            warcprox_auto=False,
            is_warcprox=False,
        )
    finally:
        start_service("brozzler-worker")
    assert len(proxy.requests) <= 15
    assert proxy.requests.count("GET /status") == 1
    assert ("GET %s" % make_url(httpd, "/site1/")) in proxy.requests
    assert ("GET %s" % make_url(httpd, "/site1/file1.txt")) in proxy.requests
    assert [
        req for req in proxy.requests if req.startswith("WARCPROX_WRITE_RECORD")
    ] == []

    proxy.shutdown()
    th.join()


def test_no_proxy(httpd):
    try:
        stop_service("brozzler-worker")
        _test_proxy_setting(httpd, proxy=None, warcprox_auto=False, is_warcprox=False)
    finally:
        start_service("brozzler-worker")
    # XXX how to check that no proxy was used?


def test_warcprox_auto(httpd):
    """Test --warcprox-auto"""
    try:
        stop_service("brozzler-worker")
        _test_proxy_setting(httpd, proxy=None, warcprox_auto=True, is_warcprox=True)
    finally:
        start_service("brozzler-worker")


def test_proxy_conflict():
    with pytest.raises(AssertionError) as excinfo:
        worker = brozzler.worker.BrozzlerWorker(
            None, None, warcprox_auto=True, proxy="localhost:12345"
        )


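# helper for the test_proxy_* / test_no_proxy / test_warcprox_auto tests above:
# instead of relying on the brozzler-worker service (which the callers stop),
# it brozzles the site with an in-process BrozzlerWorker and then checks which
# proxy was used and what ended up in the captures table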
def _test_proxy_setting(httpd, proxy=None, warcprox_auto=False, is_warcprox=False):
    test_id = "test_proxy=%s_warcprox_auto=%s_is_warcprox=%s-%s" % (
        proxy,
        warcprox_auto,
        is_warcprox,
        datetime.datetime.utcnow().isoformat(),
    )

    # the two pages we expect to be crawled
    page1 = make_url(httpd, "/site1/")
    page2 = make_url(httpd, "/site1/file1.txt")
    robots = make_url(httpd, "/robots.txt")

    rr = doublethink.Rethinker("localhost", db="brozzler")
    service_registry = doublethink.ServiceRegistry(rr)
    site = brozzler.Site(
        rr,
        {
            "seed": make_url(httpd, "/site1/"),
            "warcprox_meta": {"captures-table-extra-fields": {"test_id": test_id}},
        },
    )
    assert site.id is None
    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)
    assert site.id is not None
    assert len(list(frontier.site_pages(site.id))) == 1

    worker = brozzler.worker.BrozzlerWorker(
        frontier,
        service_registry,
        max_browsers=1,
        chrome_exe=brozzler.suggest_default_chrome_exe(),
        warcprox_auto=warcprox_auto,
        proxy=proxy,
    )
    browser = worker._browser_pool.acquire()
    worker.brozzle_site(browser, site)
    worker._browser_pool.release(browser)

    # check proxy is set
    assert site.status == "FINISHED"
    if warcprox_auto:
        assert site.proxy[-5:] == ":8000"
    else:
        assert not site.proxy
    site.refresh()  # check that these things were persisted
    assert site.status == "FINISHED"
    if warcprox_auto:
        assert site.proxy[-5:] == ":8000"
    else:
        assert not site.proxy

    # check that we got the two pages we expected
    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 2
    assert {page.url for page in pages} == {
        make_url(httpd, "/site1/"),
        make_url(httpd, "/site1/file1.txt"),
    }

    time.sleep(2)  # in case warcprox hasn't finished processing urls
    # take a look at the captures table
    captures = rr.table("captures").filter({"test_id": test_id}).run()
    captures_by_url = {c["url"]: c for c in captures if c["http_method"] != "HEAD"}
    if is_warcprox:
        assert robots in captures_by_url
        assert page1 in captures_by_url
        assert page2 in captures_by_url
        assert "screenshot:%s" % page1 in captures_by_url
        assert "thumbnail:%s" % page1 in captures_by_url

        # check pywb
        t14 = captures_by_url[page2]["timestamp"].strftime("%Y%m%d%H%M%S")
        wb_url = "http://localhost:8880/brozzler/%s/%s" % (t14, page2)
        expected_payload = open(
            os.path.join(os.path.dirname(__file__), "htdocs", "site1", "file1.txt"),
            "rb",
        ).read()
        assert requests.get(wb_url).content == expected_payload
    else:
        assert captures_by_url == {}


def test_obey_robots(httpd):
    test_id = "test_obey_robots-%s" % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker("localhost", db="brozzler")
    site = brozzler.Site(
        rr,
        {
            "seed": make_url(httpd, "/site1/"),
            "user_agent": "im a badbot",  # robots.txt blocks badbot
            "warcprox_meta": {"captures-table-extra-fields": {"test_id": test_id}},
        },
    )

    # so we can examine rethinkdb before it does anything
    try:
        stop_service("brozzler-worker")

        assert site.id is None
        frontier = brozzler.RethinkDbFrontier(rr)
        brozzler.new_site(frontier, site)
        assert site.id is not None
        site_pages = list(frontier.site_pages(site.id))
        assert len(site_pages) == 1
        assert site_pages[0].url == site.seed
        assert site_pages[0].needs_robots_check
    finally:
        start_service("brozzler-worker")

    # the site should be brozzled fairly quickly
    start = time.time()
    while site.status != "FINISHED" and time.time() - start < 300:
        time.sleep(0.5)
        site.refresh()
    assert site.status == "FINISHED"

    # check that only the one page is in rethinkdb
    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 1
    page = pages[0]
    assert page.url == make_url(httpd, "/site1/")
    assert page.blocked_by_robots

    # take a look at the captures table
    time.sleep(2)  # in case warcprox hasn't finished processing urls
    robots_url = make_url(httpd, "/robots.txt")
    captures = list(rr.table("captures").filter({"test_id": test_id}).run())
    assert len(captures) == 1
    assert captures[0]["url"] == robots_url

    # check pywb
    t14 = captures[0]["timestamp"].strftime("%Y%m%d%H%M%S")
    wb_url = "http://localhost:8880/brozzler/%s/%s" % (t14, robots_url)
    expected_payload = open(
        os.path.join(os.path.dirname(__file__), "htdocs", "robots.txt"), "rb"
    ).read()
    assert requests.get(wb_url, allow_redirects=False).content == expected_payload


def test_login(httpd):
    test_id = "test_login-%s" % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker("localhost", db="brozzler")
    site = brozzler.Site(
        rr,
        {
            "seed": make_url(httpd, "/site2/"),
            "warcprox_meta": {"captures-table-extra-fields": {"test_id": test_id}},
            "username": "test_username",
            "password": "test_password",
        },
    )

    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)

    # the site should be brozzled fairly quickly
    start = time.time()
    while site.status != "FINISHED" and time.time() - start < 300:
        time.sleep(0.5)
        site.refresh()
    assert site.status == "FINISHED"

    # take a look at the captures table
    time.sleep(2)  # in case warcprox hasn't finished processing urls
    robots_url = make_url(httpd, "/robots.txt")
    captures = list(
        rr.table("captures").filter({"test_id": test_id}).order_by("timestamp").run()
    )
    meth_url = ["%s %s" % (c["http_method"], c["url"]) for c in captures]

    # there are several forms in htdocs/site2/login.html but only one
    # that brozzler's heuristic should match and try to submit, and it has
    # action='00', so we can check for that here
    assert ("POST %s" % make_url(httpd, "/site2/00")) in meth_url

    # sanity check the rest of the crawl
    assert ("GET %s" % make_url(httpd, "/robots.txt")) in meth_url
    assert ("GET %s" % make_url(httpd, "/site2/")) in meth_url
    assert (
        "WARCPROX_WRITE_RECORD screenshot:%s" % make_url(httpd, "/site2/")
    ) in meth_url
    assert (
        "WARCPROX_WRITE_RECORD thumbnail:%s" % make_url(httpd, "/site2/")
    ) in meth_url
    assert ("GET %s" % make_url(httpd, "/site2/login.html")) in meth_url
    assert (
        "WARCPROX_WRITE_RECORD screenshot:%s" % make_url(httpd, "/site2/login.html")
    ) in meth_url
    assert (
        "WARCPROX_WRITE_RECORD thumbnail:%s" % make_url(httpd, "/site2/login.html")
    ) in meth_url


def test_seed_redirect(httpd):
    test_id = "test_seed_redirect-%s" % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker("localhost", db="brozzler")
    seed_url = make_url(httpd, "/site5/redirect/")
    site = brozzler.Site(
        rr,
        {
            "seed": make_url(httpd, "/site5/redirect/"),
            "warcprox_meta": {"captures-table-extra-fields": {"test_id": test_id}},
        },
    )
    assert site.scope == {
        "accepts": [
            {
                "ssurt": "%s//%s:http:/site5/redirect/"
                % (local_address, httpd.server_port)
            }
        ]
    }

    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)
    assert site.id

    # the site should be brozzled fairly quickly
    start = time.time()
    while site.status != "FINISHED" and time.time() - start < 300:
        time.sleep(0.5)
        site.refresh()
    assert site.status == "FINISHED"

    # take a look at the pages table
    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 2
    pages.sort(key=lambda page: page.hops_from_seed)
    assert pages[0].hops_from_seed == 0
    assert pages[0].url == seed_url
    assert pages[0].redirect_url == make_url(httpd, "/site5/destination/")
    assert pages[1].hops_from_seed == 1
    assert pages[1].url == make_url(httpd, "/site5/destination/page2.html")

    # check that scope has been updated properly
    assert site.scope == {
        "accepts": [
            {
                "ssurt": "%s//%s:http:/site5/redirect/"
                % (local_address, httpd.server_port)
            },
            {
                "ssurt": "%s//%s:http:/site5/destination/"
                % (local_address, httpd.server_port)
            },
        ]
    }


def test_hashtags(httpd):
    test_id = "test_hashtags-%s" % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker("localhost", db="brozzler")
    seed_url = make_url(httpd, "/site7/")
    site = brozzler.Site(
        rr,
        {
            "seed": seed_url,
            "warcprox_meta": {"captures-table-extra-fields": {"test_id": test_id}},
        },
    )

    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)
    assert site.id

    # the site should be brozzled fairly quickly
    start = time.time()
    while site.status != "FINISHED" and time.time() - start < 300:
        time.sleep(0.5)
        site.refresh()
    assert site.status == "FINISHED"

    # check that we got the pages we expected
    pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url)
    assert len(pages) == 2
    assert pages[0].url == seed_url
    assert pages[0].hops_from_seed == 0
    assert pages[0].brozzle_count == 1
    assert pages[0].outlinks["accepted"] == [make_url(httpd, "/site7/foo.html")]
    assert not pages[0].hashtags
    assert pages[1].url == make_url(httpd, "/site7/foo.html")
    assert pages[1].hops_from_seed == 1
    assert pages[1].brozzle_count == 1
    assert sorted(pages[1].hashtags) == [
        "#boosh",
        "#ignored",
        "#whee",
    ]

    time.sleep(2)  # in case warcprox hasn't finished processing urls
    # take a look at the captures table
    captures = rr.table("captures").filter({"test_id": test_id}).run()
    captures_by_url = {c["url"]: c for c in captures if c["http_method"] != "HEAD"}
    assert seed_url in captures_by_url
    assert make_url(httpd, "/site7/foo.html") in captures_by_url
    assert make_url(httpd, "/site7/whee.txt") in captures_by_url
    assert make_url(httpd, "/site7/boosh.txt") in captures_by_url
    assert "screenshot:%s" % seed_url in captures_by_url
    assert "thumbnail:%s" % seed_url in captures_by_url
    assert "screenshot:%s" % make_url(httpd, "/site7/foo.html") in captures_by_url
    assert "thumbnail:%s" % make_url(httpd, "/site7/foo.html") in captures_by_url


def test_redirect_hashtags(httpd):
    test_id = "test_hashtags-%s" % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker("localhost", db="brozzler")
    seed_url = make_url(httpd, "/site9/")
    site = brozzler.Site(
        rr,
        {
            "seed": seed_url,
            "warcprox_meta": {"captures-table-extra-fields": {"test_id": test_id}},
        },
    )

    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)
    assert site.id

    # the site should be brozzled fairly quickly
    start = time.time()
    while site.status != "FINISHED" and time.time() - start < 300:
        time.sleep(0.5)
        site.refresh()
    assert site.status == "FINISHED"

    # check that we got the pages we expected
    pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url)
    assert len(pages) == 2
    assert pages[0].url == seed_url
    assert pages[0].hops_from_seed == 0
    assert pages[0].brozzle_count == 1
    assert pages[0].outlinks["accepted"] == [make_url(httpd, "/site9/redirect.html")]
    assert not pages[0].hashtags
    assert pages[1].url == make_url(httpd, "/site9/redirect.html")
    assert pages[1].hops_from_seed == 1
    assert pages[1].brozzle_count == 1
    assert sorted(pages[1].hashtags) == [
        "#hash1",
        "#hash2",
    ]

    time.sleep(2)  # in case warcprox hasn't finished processing urls
    # take a look at the captures table
    captures = rr.table("captures").filter({"test_id": test_id}).run()
    redirect_captures = [
        c
        for c in captures
        if c["url"] == make_url(httpd, "/site9/redirect.html")
        and c["http_method"] == "GET"
    ]
    assert len(redirect_captures) == 2  # youtube-dl + browser, no hashtags

    # === expected captures ===
    #  1. GET http://localhost:41243/favicon.ico
    #  2. GET http://localhost:41243/robots.txt
    #  3. GET http://localhost:41243/site9/
    #  4. GET http://localhost:41243/site9/
    #  5. GET http://localhost:41243/site9/destination.html
    #  6. GET http://localhost:41243/site9/destination.html
    #  7. GET http://localhost:41243/site9/redirect.html
    #  8. GET http://localhost:41243/site9/redirect.html
    #  9. HEAD http://localhost:41243/site9/
    # 10. HEAD http://localhost:41243/site9/redirect.html
    # 11. WARCPROX_WRITE_RECORD screenshot:http://localhost:41243/site9/
    # 12. WARCPROX_WRITE_RECORD screenshot:http://localhost:41243/site9/redirect.html
    # 13. WARCPROX_WRITE_RECORD thumbnail:http://localhost:41243/site9/
    # 14. WARCPROX_WRITE_RECORD thumbnail:http://localhost:41243/site9/redirect.html


def test_stop_crawl(httpd):
    test_id = "test_stop_crawl_job-%s" % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker("localhost", db="brozzler")
    frontier = brozzler.RethinkDbFrontier(rr)

    # create a new job with three sites that could be crawled forever
    job_conf = {
        "seeds": [
            {"url": make_url(httpd, "/infinite/foo/")},
            {"url": make_url(httpd, "/infinite/bar/")},
            {"url": make_url(httpd, "/infinite/baz/")},
        ]
    }
    job = brozzler.new_job(frontier, job_conf)
    assert job.id

    sites = list(frontier.job_sites(job.id))
    assert not sites[0].stop_requested
    assert not sites[1].stop_requested

    # request crawl stop for one site using the command line entrypoint
    brozzler.cli.brozzler_stop_crawl(["brozzler-stop-crawl", "--site=%s" % sites[0].id])
    sites[0].refresh()
    assert sites[0].stop_requested

    # stop request should be honored quickly
    start = time.time()
    while not sites[0].status.startswith("FINISHED") and time.time() - start < 120:
        time.sleep(0.5)
        sites[0].refresh()
    assert sites[0].status == "FINISHED_STOP_REQUESTED"

    # but the other sites and the job as a whole should still be crawling
    sites[1].refresh()
    assert sites[1].status == "ACTIVE"
    sites[2].refresh()
    assert sites[2].status == "ACTIVE"
    job.refresh()
    assert job.status == "ACTIVE"

    # request crawl stop for the job using the command line entrypoint
    brozzler.cli.brozzler_stop_crawl(["brozzler-stop-crawl", "--job=%s" % job.id])
    job.refresh()
    assert job.stop_requested

    # stop request should be honored quickly
    start = time.time()
    while not job.status.startswith("FINISHED") and time.time() - start < 120:
        time.sleep(0.5)
        job.refresh()
    assert job.status == "FINISHED"

    # the other sites should also be FINISHED_STOP_REQUESTED
    sites[0].refresh()
    assert sites[0].status == "FINISHED_STOP_REQUESTED"
    sites[1].refresh()
    assert sites[1].status == "FINISHED_STOP_REQUESTED"
    sites[2].refresh()
    assert sites[2].status == "FINISHED_STOP_REQUESTED"


def test_warcprox_outage_resiliency(httpd):
    """
    Tests resiliency to warcprox outage.

    If no instances of warcprox are healthy when starting to crawl a site,
    brozzler-worker should sit there and wait until a healthy instance appears.

    If an instance goes down, sites assigned to that instance should bounce
    over to a healthy instance.

    If all instances of warcprox go down, brozzler-worker should sit and wait.
    """
    rr = doublethink.Rethinker("localhost", db="brozzler")
    frontier = brozzler.RethinkDbFrontier(rr)
    svcreg = doublethink.ServiceRegistry(rr)

    # run two instances of warcprox
    opts = warcprox.Options()
    opts.address = "0.0.0.0"
    opts.port = 0
    opts.rethinkdb_services_url = "rethinkdb://localhost/brozzler/services"
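    # both instances register themselves in the same rethinkdb service
    # registry that brozzler-worker watches, which is how the worker
    # discovers healthy warcprox instances to proxy through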

    warcprox1 = warcprox.controller.WarcproxController(opts)
    warcprox2 = warcprox.controller.WarcproxController(opts)
    warcprox1_thread = threading.Thread(
        target=warcprox1.run_until_shutdown, name="warcprox1"
    )
    warcprox2_thread = threading.Thread(
        target=warcprox2.run_until_shutdown, name="warcprox2"
    )

    # put together a site to crawl
    test_id = "test_warcprox_death-%s" % datetime.datetime.utcnow().isoformat()
    site = brozzler.Site(
        rr,
        {
            "seed": make_url(httpd, "/infinite/"),
            "warcprox_meta": {"captures-table-extra-fields": {"test_id": test_id}},
        },
    )

    try:
        # we manage warcprox instances ourselves, so stop the one running on
        # the system, if any
        try:
            stop_service("warcprox")
        except Exception as e:
            logging.warning("problem stopping warcprox service: %s", e)

        # queue the site for brozzling
        brozzler.new_site(frontier, site)

        # check that nothing happens
        # XXX tail brozzler-worker.log or something?
        time.sleep(30)
        site.refresh()
        assert site.status == "ACTIVE"
        assert not site.proxy
        assert len(list(frontier.site_pages(site.id))) == 1

        # start one instance of warcprox
        warcprox1_thread.start()

        # check that it started using that instance
        start = time.time()
        while not site.proxy and time.time() - start < 30:
            time.sleep(0.5)
            site.refresh()
        assert site.proxy.endswith(":%s" % warcprox1.proxy.server_port)

        # check that the site accumulates pages in the frontier, confirming
        # that crawling is really happening
        start = time.time()
        while len(list(frontier.site_pages(site.id))) <= 1 and time.time() - start < 60:
            time.sleep(0.5)
            site.refresh()
        assert len(list(frontier.site_pages(site.id))) > 1

        # stop warcprox #1, start warcprox #2
        warcprox2_thread.start()
        warcprox1.stop.set()
        warcprox1_thread.join()

        # check that it switched over to warcprox #2
        start = time.time()
        while (
            not site.proxy
            or not site.proxy.endswith(":%s" % warcprox2.proxy.server_port)
        ) and time.time() - start < 30:
            time.sleep(0.5)
            site.refresh()
        assert site.proxy.endswith(":%s" % warcprox2.proxy.server_port)

        # stop warcprox #2
        warcprox2.stop.set()
        warcprox2_thread.join()

        page_count = len(list(frontier.site_pages(site.id)))
        assert page_count > 1

        # check that it is waiting for a warcprox to appear
        time.sleep(30)
        site.refresh()
        assert site.status == "ACTIVE"
        assert not site.proxy
        assert len(list(frontier.site_pages(site.id))) == page_count

        # stop crawling the site, else it can pollute subsequent test runs
        brozzler.cli.brozzler_stop_crawl(["brozzler-stop-crawl", "--site=%s" % site.id])
        site.refresh()
        assert site.stop_requested

        # stop request should be honored quickly
        start = time.time()
        while not site.status.startswith("FINISHED") and time.time() - start < 120:
            time.sleep(0.5)
            site.refresh()
        assert site.status == "FINISHED_STOP_REQUESTED"
    finally:
        warcprox1.stop.set()
        warcprox2.stop.set()
        warcprox1_thread.join()
        warcprox2_thread.join()
        start_service("warcprox")


def test_time_limit(httpd):
    test_id = "test_time_limit-%s" % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker("localhost", db="brozzler")
    frontier = brozzler.RethinkDbFrontier(rr)

    # create a new job with one seed that could be crawled forever
    job_conf = {"seeds": [{"url": make_url(httpd, "/infinite/foo/"), "time_limit": 20}]}
    job = brozzler.new_job(frontier, job_conf)
    assert job.id

    sites = list(frontier.job_sites(job.id))
    assert len(sites) == 1
    site = sites[0]

    # time limit should be enforced pretty soon
    start = time.time()
    while not sites[0].status.startswith("FINISHED") and time.time() - start < 120:
        time.sleep(0.5)
        sites[0].refresh()
    assert sites[0].status == "FINISHED_TIME_LIMIT"

    # all sites finished so job should be finished too
    start = time.time()
    job.refresh()
    while not job.status == "FINISHED" and time.time() - start < 10:
        time.sleep(0.5)
        job.refresh()
    assert job.status == "FINISHED"


def test_ydl_stitching(httpd):
    test_id = "test_ydl_stitching-%s" % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker("localhost", db="brozzler")
    frontier = brozzler.RethinkDbFrontier(rr)
    site = brozzler.Site(
        rr,
        {
            "seed": make_url(httpd, "/site10/"),
            "warcprox_meta": {
                "warc-prefix": "test_ydl_stitching",
                "captures-table-extra-fields": {"test_id": test_id},
            },
        },
    )
    brozzler.new_site(frontier, site)

    # the site should be brozzled fairly quickly
    start = time.time()
    while site.status != "FINISHED" and time.time() - start < 300:
        time.sleep(0.5)
        site.refresh()
    assert site.status == "FINISHED"

    # check page.videos
    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 1
    page = pages[0]
    assert len(page.videos) == 6
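    # the video that youtube-dl stitches together out of the page's media
    # segments is recorded by warcprox under a synthetic url like the one
    # below (the "youtube-dl:00001:" prefix and the expected metadata reflect
    # how this test site is set up)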
    stitched_url = "youtube-dl:00001:%s" % make_url(httpd, "/site10/")
    assert {
        "blame": "youtube-dl",
        "content-length": 267900,
        "content-type": "video/mp4",
        "response_code": 204,
        "url": stitched_url,
    } in page.videos

    time.sleep(2)  # in case warcprox hasn't finished processing urls
    # take a look at the captures table
    captures = list(rr.table("captures").filter({"test_id": test_id}).run())
    l = [c for c in captures if c["url"] == stitched_url]
    assert len(l) == 1
    c = l[0]
    assert c["filename"].startswith("test_ydl_stitching")
    assert c["content_type"] == "video/mp4"
    assert c["http_method"] == "WARCPROX_WRITE_RECORD"