mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-07-26 08:15:43 -04:00
chore: disable cluster tests, add frontier load test
This commit is contained in:
parent
addf73f865
commit
cdb81496f6
2 changed files with 93 additions and 24 deletions
|
@ -68,6 +68,13 @@ def stop_service(service):
|
||||||
time.sleep(0.5)
|
time.sleep(0.5)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="module")
def rethinker(request):
    """Module-scoped fixture providing a ``doublethink.Rethinker`` client.

    The database name is taken from ``request.param`` when the fixture is
    parametrized (e.g. via ``indirect=True``); otherwise ``"ignoreme"`` is
    used. Server addresses are read from the comma-separated
    ``BROZZLER_RETHINKDB_SERVERS`` environment variable, falling back to
    ``"localhost"`` when it is unset.
    """
    db_name = getattr(request, "param", "ignoreme")
    server_list = os.environ.get("BROZZLER_RETHINKDB_SERVERS", "localhost").split(
        ","
    )
    return doublethink.Rethinker(db=db_name, servers=server_list)
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="module")
|
@pytest.fixture(scope="module")
|
||||||
def httpd(request):
|
def httpd(request):
|
||||||
class RequestHandler(http.server.SimpleHTTPRequestHandler):
|
class RequestHandler(http.server.SimpleHTTPRequestHandler):
|
||||||
|
@ -162,10 +169,11 @@ def test_httpd(httpd):
|
||||||
assert payload1 == payload2
|
assert payload1 == payload2
|
||||||
|
|
||||||
|
|
||||||
def test_services_up():
|
@pytest.mark.skip()
|
||||||
|
def test_services_up(rethinker):
|
||||||
"""Check that the expected services are up and running."""
|
"""Check that the expected services are up and running."""
|
||||||
# check that rethinkdb is listening and looks sane
|
# check that rethinkdb is listening and looks sane
|
||||||
rr = doublethink.Rethinker(db="rethinkdb") # built-in db
|
rr = rethinker
|
||||||
tbls = rr.table_list().run()
|
tbls = rr.table_list().run()
|
||||||
assert len(tbls) > 10
|
assert len(tbls) > 10
|
||||||
|
|
||||||
|
@ -185,9 +193,11 @@ def test_services_up():
|
||||||
s.connect(("localhost", 8881))
|
s.connect(("localhost", 8881))
|
||||||
|
|
||||||
|
|
||||||
def test_brozzle_site(httpd):
|
@pytest.mark.parametrize("rethinker", ["brozzler"], indirect=True)
|
||||||
|
@pytest.mark.skip(reason="expects brozzler worker daemon running")
|
||||||
|
def test_brozzle_site(httpd, rethinker):
|
||||||
test_id = "test_brozzle_site-%s" % datetime.datetime.utcnow().isoformat()
|
test_id = "test_brozzle_site-%s" % datetime.datetime.utcnow().isoformat()
|
||||||
rr = doublethink.Rethinker("localhost", db="brozzler")
|
rr = rethinker
|
||||||
site = brozzler.Site(
|
site = brozzler.Site(
|
||||||
rr,
|
rr,
|
||||||
{
|
{
|
||||||
|
@ -262,6 +272,7 @@ def test_brozzle_site(httpd):
|
||||||
assert response.headers["content-type"] == "image/jpeg"
|
assert response.headers["content-type"] == "image/jpeg"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skip(reason="expects warcprox daemon running")
|
||||||
def test_proxy_warcprox(httpd):
|
def test_proxy_warcprox(httpd):
|
||||||
"""Test --proxy with proxy that happens to be warcprox"""
|
"""Test --proxy with proxy that happens to be warcprox"""
|
||||||
try:
|
try:
|
||||||
|
@ -273,6 +284,7 @@ def test_proxy_warcprox(httpd):
|
||||||
start_service("brozzler-worker")
|
start_service("brozzler-worker")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skip(reason="expects warcprox daemon running")
|
||||||
def test_proxy_non_warcprox(httpd):
|
def test_proxy_non_warcprox(httpd):
|
||||||
"""Test --proxy with proxy that happens not to be warcprox"""
|
"""Test --proxy with proxy that happens not to be warcprox"""
|
||||||
|
|
||||||
|
@ -331,6 +343,7 @@ def test_proxy_non_warcprox(httpd):
|
||||||
th.join()
|
th.join()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skip()
|
||||||
def test_no_proxy(httpd):
|
def test_no_proxy(httpd):
|
||||||
try:
|
try:
|
||||||
stop_service("brozzler-worker")
|
stop_service("brozzler-worker")
|
||||||
|
@ -340,6 +353,7 @@ def test_no_proxy(httpd):
|
||||||
# XXX how to check that no proxy was used?
|
# XXX how to check that no proxy was used?
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skip()
|
||||||
def test_warcprox_auto(httpd):
|
def test_warcprox_auto(httpd):
|
||||||
"""Test --warcprox-auto"""
|
"""Test --warcprox-auto"""
|
||||||
try:
|
try:
|
||||||
|
@ -349,6 +363,7 @@ def test_warcprox_auto(httpd):
|
||||||
start_service("brozzler-worker")
|
start_service("brozzler-worker")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skip()
|
||||||
def test_proxy_conflict():
|
def test_proxy_conflict():
|
||||||
with pytest.raises(AssertionError):
|
with pytest.raises(AssertionError):
|
||||||
brozzler.worker.BrozzlerWorker(
|
brozzler.worker.BrozzlerWorker(
|
||||||
|
@ -356,7 +371,11 @@ def test_proxy_conflict():
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def _test_proxy_setting(httpd, proxy=None, warcprox_auto=False, is_warcprox=False):
|
@pytest.mark.skip()
|
||||||
|
@pytest.mark.parametrize("rethinker", ["brozzler"], indirect=True)
|
||||||
|
def _test_proxy_setting(
|
||||||
|
httpd, rethinker, proxy=None, warcprox_auto=False, is_warcprox=False
|
||||||
|
):
|
||||||
test_id = "test_proxy=%s_warcprox_auto=%s_is_warcprox=%s-%s" % (
|
test_id = "test_proxy=%s_warcprox_auto=%s_is_warcprox=%s-%s" % (
|
||||||
proxy,
|
proxy,
|
||||||
warcprox_auto,
|
warcprox_auto,
|
||||||
|
@ -369,7 +388,7 @@ def _test_proxy_setting(httpd, proxy=None, warcprox_auto=False, is_warcprox=Fals
|
||||||
page2 = make_url(httpd, "/site1/file1.txt")
|
page2 = make_url(httpd, "/site1/file1.txt")
|
||||||
robots = make_url(httpd, "/robots.txt")
|
robots = make_url(httpd, "/robots.txt")
|
||||||
|
|
||||||
rr = doublethink.Rethinker("localhost", db="brozzler")
|
rr = rethinker
|
||||||
service_registry = doublethink.ServiceRegistry(rr)
|
service_registry = doublethink.ServiceRegistry(rr)
|
||||||
site = brozzler.Site(
|
site = brozzler.Site(
|
||||||
rr,
|
rr,
|
||||||
|
@ -440,9 +459,11 @@ def _test_proxy_setting(httpd, proxy=None, warcprox_auto=False, is_warcprox=Fals
|
||||||
assert captures_by_url == {}
|
assert captures_by_url == {}
|
||||||
|
|
||||||
|
|
||||||
def test_obey_robots(httpd):
|
@pytest.mark.parametrize("rethinker", ["brozzler"], indirect=True)
|
||||||
|
@pytest.mark.skip(reason="expects brozzler worker daemon running")
|
||||||
|
def test_obey_robots(httpd, rethinker):
|
||||||
test_id = "test_obey_robots-%s" % datetime.datetime.utcnow().isoformat()
|
test_id = "test_obey_robots-%s" % datetime.datetime.utcnow().isoformat()
|
||||||
rr = doublethink.Rethinker("localhost", db="brozzler")
|
rr = rethinker
|
||||||
site = brozzler.Site(
|
site = brozzler.Site(
|
||||||
rr,
|
rr,
|
||||||
{
|
{
|
||||||
|
@ -497,9 +518,11 @@ def test_obey_robots(httpd):
|
||||||
assert requests.get(wb_url, allow_redirects=False).content == expected_payload
|
assert requests.get(wb_url, allow_redirects=False).content == expected_payload
|
||||||
|
|
||||||
|
|
||||||
def test_login(httpd):
|
@pytest.mark.parametrize("rethinker", ["brozzler"], indirect=True)
|
||||||
|
@pytest.mark.skip(reason="expects brozzler worker daemon running")
|
||||||
|
def test_login(httpd, rethinker):
|
||||||
test_id = "test_login-%s" % datetime.datetime.utcnow().isoformat()
|
test_id = "test_login-%s" % datetime.datetime.utcnow().isoformat()
|
||||||
rr = doublethink.Rethinker("localhost", db="brozzler")
|
rr = rethinker
|
||||||
site = brozzler.Site(
|
site = brozzler.Site(
|
||||||
rr,
|
rr,
|
||||||
{
|
{
|
||||||
|
@ -550,9 +573,11 @@ def test_login(httpd):
|
||||||
) in meth_url
|
) in meth_url
|
||||||
|
|
||||||
|
|
||||||
def test_seed_redirect(httpd):
|
@pytest.mark.parametrize("rethinker", ["brozzler"], indirect=True)
|
||||||
|
@pytest.mark.skip(reason="expects brozzler worker daemon running")
|
||||||
|
def test_seed_redirect(httpd, rethinker):
|
||||||
test_id = "test_seed_redirect-%s" % datetime.datetime.utcnow().isoformat()
|
test_id = "test_seed_redirect-%s" % datetime.datetime.utcnow().isoformat()
|
||||||
rr = doublethink.Rethinker("localhost", db="brozzler")
|
rr = rethinker
|
||||||
seed_url = make_url(httpd, "/site5/redirect/")
|
seed_url = make_url(httpd, "/site5/redirect/")
|
||||||
site = brozzler.Site(
|
site = brozzler.Site(
|
||||||
rr,
|
rr,
|
||||||
|
@ -606,9 +631,11 @@ def test_seed_redirect(httpd):
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def test_hashtags(httpd):
|
@pytest.mark.parametrize("rethinker", ["brozzler"], indirect=True)
|
||||||
|
@pytest.mark.skip(reason="expects brozzler worker daemon running")
|
||||||
|
def test_hashtags(httpd, rethinker):
|
||||||
test_id = "test_hashtags-%s" % datetime.datetime.utcnow().isoformat()
|
test_id = "test_hashtags-%s" % datetime.datetime.utcnow().isoformat()
|
||||||
rr = doublethink.Rethinker("localhost", db="brozzler")
|
rr = rethinker
|
||||||
seed_url = make_url(httpd, "/site7/")
|
seed_url = make_url(httpd, "/site7/")
|
||||||
site = brozzler.Site(
|
site = brozzler.Site(
|
||||||
rr,
|
rr,
|
||||||
|
@ -660,9 +687,11 @@ def test_hashtags(httpd):
|
||||||
assert "thumbnail:%s" % make_url(httpd, "/site7/foo.html") in captures_by_url
|
assert "thumbnail:%s" % make_url(httpd, "/site7/foo.html") in captures_by_url
|
||||||
|
|
||||||
|
|
||||||
def test_redirect_hashtags(httpd):
|
@pytest.mark.parametrize("rethinker", ["brozzler"], indirect=True)
|
||||||
|
@pytest.mark.skip(reason="expects brozzler worker daemon running")
|
||||||
|
def test_redirect_hashtags(httpd, rethinker):
|
||||||
test_id = "test_hashtags-%s" % datetime.datetime.utcnow().isoformat()
|
test_id = "test_hashtags-%s" % datetime.datetime.utcnow().isoformat()
|
||||||
rr = doublethink.Rethinker("localhost", db="brozzler")
|
rr = rethinker
|
||||||
seed_url = make_url(httpd, "/site9/")
|
seed_url = make_url(httpd, "/site9/")
|
||||||
site = brozzler.Site(
|
site = brozzler.Site(
|
||||||
rr,
|
rr,
|
||||||
|
@ -727,8 +756,10 @@ def test_redirect_hashtags(httpd):
|
||||||
# 14. WARCPROX_WRITE_RECORD thumbnail:http://localhost:41243/site9/redirect.html
|
# 14. WARCPROX_WRITE_RECORD thumbnail:http://localhost:41243/site9/redirect.html
|
||||||
|
|
||||||
|
|
||||||
def test_stop_crawl(httpd):
|
@pytest.mark.parametrize("rethinker", ["brozzler"], indirect=True)
|
||||||
rr = doublethink.Rethinker("localhost", db="brozzler")
|
@pytest.mark.skip(reason="expects brozzler worker daemon running")
|
||||||
|
def test_stop_crawl(httpd, rethinker):
|
||||||
|
rr = rethinker
|
||||||
frontier = brozzler.RethinkDbFrontier(rr)
|
frontier = brozzler.RethinkDbFrontier(rr)
|
||||||
|
|
||||||
# create a new job with three sites that could be crawled forever
|
# create a new job with three sites that could be crawled forever
|
||||||
|
@ -787,7 +818,9 @@ def test_stop_crawl(httpd):
|
||||||
assert sites[2].status == "FINISHED_STOP_REQUESTED"
|
assert sites[2].status == "FINISHED_STOP_REQUESTED"
|
||||||
|
|
||||||
|
|
||||||
def test_warcprox_outage_resiliency(httpd):
|
@pytest.mark.parametrize("rethinker", ["brozzler"], indirect=True)
|
||||||
|
@pytest.mark.skip(reason="expects brozzler worker daemon running")
|
||||||
|
def test_warcprox_outage_resiliency(httpd, rethinker):
|
||||||
"""
|
"""
|
||||||
Tests resiliency to warcprox outage.
|
Tests resiliency to warcprox outage.
|
||||||
|
|
||||||
|
@ -799,7 +832,7 @@ def test_warcprox_outage_resiliency(httpd):
|
||||||
|
|
||||||
If all instances of warcprox go down, brozzler-worker should sit and wait.
|
If all instances of warcprox go down, brozzler-worker should sit and wait.
|
||||||
"""
|
"""
|
||||||
rr = doublethink.Rethinker("localhost", db="brozzler")
|
rr = rethinker
|
||||||
frontier = brozzler.RethinkDbFrontier(rr)
|
frontier = brozzler.RethinkDbFrontier(rr)
|
||||||
|
|
||||||
# run two instances of warcprox
|
# run two instances of warcprox
|
||||||
|
@ -912,8 +945,10 @@ def test_warcprox_outage_resiliency(httpd):
|
||||||
start_service("warcprox")
|
start_service("warcprox")
|
||||||
|
|
||||||
|
|
||||||
def test_time_limit(httpd):
|
@pytest.mark.parametrize("rethinker", ["brozzler"], indirect=True)
|
||||||
rr = doublethink.Rethinker("localhost", db="brozzler")
|
@pytest.mark.skip(reason="expects brozzler worker daemon running")
|
||||||
|
def test_time_limit(httpd, rethinker):
|
||||||
|
rr = rethinker
|
||||||
frontier = brozzler.RethinkDbFrontier(rr)
|
frontier = brozzler.RethinkDbFrontier(rr)
|
||||||
|
|
||||||
# create a new job with one seed that could be crawled forever
|
# create a new job with one seed that could be crawled forever
|
||||||
|
@ -940,9 +975,11 @@ def test_time_limit(httpd):
|
||||||
assert job.status == "FINISHED"
|
assert job.status == "FINISHED"
|
||||||
|
|
||||||
|
|
||||||
def test_ydl_stitching(httpd):
|
@pytest.mark.parametrize("rethinker", ["brozzler"], indirect=True)
|
||||||
|
@pytest.mark.skip(reason="expects brozzler worker daemon running")
|
||||||
|
def test_ydl_stitching(httpd, rethinker):
|
||||||
test_id = "test_ydl_stitching-%s" % datetime.datetime.utcnow().isoformat()
|
test_id = "test_ydl_stitching-%s" % datetime.datetime.utcnow().isoformat()
|
||||||
rr = doublethink.Rethinker("localhost", db="brozzler")
|
rr = rethinker
|
||||||
frontier = brozzler.RethinkDbFrontier(rr)
|
frontier = brozzler.RethinkDbFrontier(rr)
|
||||||
site = brozzler.Site(
|
site = brozzler.Site(
|
||||||
rr,
|
rr,
|
||||||
|
|
|
@ -1052,6 +1052,38 @@ def test_max_claimed_sites_cross_job(rethinker):
|
||||||
rr.table("sites").delete().run()
|
rr.table("sites").delete().run()
|
||||||
|
|
||||||
|
|
||||||
|
def test_max_claimed_sites_load_perf(rethinker):
    """Check that claiming sites stays fast with a heavily loaded frontier.

    Creates 19 jobs (ids 1 through 19), each with 1000 seed URLs and a
    ``max_claimed_sites`` setting of 25, then asserts that one
    ``frontier.claim_sites(50)`` call returns 50 sites in under two seconds
    of wall-clock time.

    The ``jobs`` and ``sites`` tables are emptied before the test and again
    afterwards, even when an assertion fails, so leftover rows do not leak
    into other tests using the same database.
    """
    rr = rethinker
    frontier = brozzler.RethinkDbFrontier(rr)

    seeds_per_job = 1000

    # clean slate
    rr.table("jobs").delete().run()
    rr.table("sites").delete().run()

    try:
        for job_id in range(1, 20):
            # Build a fresh config per job rather than mutating one shared
            # dict, so nothing retained from a previous new_job() call can be
            # altered by the next iteration.
            job_conf = {
                "id": job_id,
                "seeds": [
                    {"url": "http://example.com/{}".format(j)}
                    for j in range(seeds_per_job)
                ],
                "max_claimed_sites": 25,
            }

            assert len(job_conf["seeds"]) == seeds_per_job
            brozzler.new_job(frontier, job_conf)
            assert len(list(frontier.job_sites(job_id))) == seeds_per_job

        # Time only the claim itself, not the setup above.
        claim_start_time = time.perf_counter()
        claimed_sites = frontier.claim_sites(50)
        claim_elapsed = time.perf_counter() - claim_start_time

        assert claim_elapsed < 2, "claim_sites(50) took %.3fs" % claim_elapsed
        assert len(claimed_sites) == 50
    finally:
        # Always remove the rows created above, pass or fail.
        rr.table("jobs").delete().run()
        rr.table("sites").delete().run()
|
||||||
|
|
||||||
|
|
||||||
def test_choose_warcprox(rethinker):
|
def test_choose_warcprox(rethinker):
|
||||||
rr = rethinker
|
rr = rethinker
|
||||||
svcreg = doublethink.ServiceRegistry(rr)
|
svcreg = doublethink.ServiceRegistry(rr)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue