mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-20 15:55:49 -04:00
chore: disable cluster tests, add frontier load test
This commit is contained in:
parent
addf73f865
commit
cdb81496f6
@ -68,6 +68,13 @@ def stop_service(service):
|
||||
time.sleep(0.5)
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def rethinker(request):
|
||||
db = request.param if hasattr(request, "param") else "ignoreme"
|
||||
servers = os.environ.get("BROZZLER_RETHINKDB_SERVERS", "localhost")
|
||||
return doublethink.Rethinker(db=db, servers=servers.split(","))
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def httpd(request):
|
||||
class RequestHandler(http.server.SimpleHTTPRequestHandler):
|
||||
@ -162,10 +169,11 @@ def test_httpd(httpd):
|
||||
assert payload1 == payload2
|
||||
|
||||
|
||||
def test_services_up():
|
||||
@pytest.mark.skip()
|
||||
def test_services_up(rethinker):
|
||||
"""Check that the expected services are up and running."""
|
||||
# check that rethinkdb is listening and looks sane
|
||||
rr = doublethink.Rethinker(db="rethinkdb") # built-in db
|
||||
rr = rethinker
|
||||
tbls = rr.table_list().run()
|
||||
assert len(tbls) > 10
|
||||
|
||||
@ -185,9 +193,11 @@ def test_services_up():
|
||||
s.connect(("localhost", 8881))
|
||||
|
||||
|
||||
def test_brozzle_site(httpd):
|
||||
@pytest.mark.parametrize("rethinker", ["brozzler"], indirect=True)
|
||||
@pytest.mark.skip(reason="expects brozzler worker daemon running")
|
||||
def test_brozzle_site(httpd, rethinker):
|
||||
test_id = "test_brozzle_site-%s" % datetime.datetime.utcnow().isoformat()
|
||||
rr = doublethink.Rethinker("localhost", db="brozzler")
|
||||
rr = rethinker
|
||||
site = brozzler.Site(
|
||||
rr,
|
||||
{
|
||||
@ -262,6 +272,7 @@ def test_brozzle_site(httpd):
|
||||
assert response.headers["content-type"] == "image/jpeg"
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="expects warcprox daemon running")
|
||||
def test_proxy_warcprox(httpd):
|
||||
"""Test --proxy with proxy that happens to be warcprox"""
|
||||
try:
|
||||
@ -273,6 +284,7 @@ def test_proxy_warcprox(httpd):
|
||||
start_service("brozzler-worker")
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="expects warcprox daemon running")
|
||||
def test_proxy_non_warcprox(httpd):
|
||||
"""Test --proxy with proxy that happens not to be warcprox"""
|
||||
|
||||
@ -331,6 +343,7 @@ def test_proxy_non_warcprox(httpd):
|
||||
th.join()
|
||||
|
||||
|
||||
@pytest.mark.skip()
|
||||
def test_no_proxy(httpd):
|
||||
try:
|
||||
stop_service("brozzler-worker")
|
||||
@ -340,6 +353,7 @@ def test_no_proxy(httpd):
|
||||
# XXX how to check that no proxy was used?
|
||||
|
||||
|
||||
@pytest.mark.skip()
|
||||
def test_warcprox_auto(httpd):
|
||||
"""Test --warcprox-auto"""
|
||||
try:
|
||||
@ -349,6 +363,7 @@ def test_warcprox_auto(httpd):
|
||||
start_service("brozzler-worker")
|
||||
|
||||
|
||||
@pytest.mark.skip()
|
||||
def test_proxy_conflict():
|
||||
with pytest.raises(AssertionError):
|
||||
brozzler.worker.BrozzlerWorker(
|
||||
@ -356,7 +371,11 @@ def test_proxy_conflict():
|
||||
)
|
||||
|
||||
|
||||
def _test_proxy_setting(httpd, proxy=None, warcprox_auto=False, is_warcprox=False):
|
||||
@pytest.mark.skip()
|
||||
@pytest.mark.parametrize("rethinker", ["brozzler"], indirect=True)
|
||||
def _test_proxy_setting(
|
||||
httpd, rethinker, proxy=None, warcprox_auto=False, is_warcprox=False
|
||||
):
|
||||
test_id = "test_proxy=%s_warcprox_auto=%s_is_warcprox=%s-%s" % (
|
||||
proxy,
|
||||
warcprox_auto,
|
||||
@ -369,7 +388,7 @@ def _test_proxy_setting(httpd, proxy=None, warcprox_auto=False, is_warcprox=Fals
|
||||
page2 = make_url(httpd, "/site1/file1.txt")
|
||||
robots = make_url(httpd, "/robots.txt")
|
||||
|
||||
rr = doublethink.Rethinker("localhost", db="brozzler")
|
||||
rr = rethinker
|
||||
service_registry = doublethink.ServiceRegistry(rr)
|
||||
site = brozzler.Site(
|
||||
rr,
|
||||
@ -440,9 +459,11 @@ def _test_proxy_setting(httpd, proxy=None, warcprox_auto=False, is_warcprox=Fals
|
||||
assert captures_by_url == {}
|
||||
|
||||
|
||||
def test_obey_robots(httpd):
|
||||
@pytest.mark.parametrize("rethinker", ["brozzler"], indirect=True)
|
||||
@pytest.mark.skip(reason="expects brozzler worker daemon running")
|
||||
def test_obey_robots(httpd, rethinker):
|
||||
test_id = "test_obey_robots-%s" % datetime.datetime.utcnow().isoformat()
|
||||
rr = doublethink.Rethinker("localhost", db="brozzler")
|
||||
rr = rethinker
|
||||
site = brozzler.Site(
|
||||
rr,
|
||||
{
|
||||
@ -497,9 +518,11 @@ def test_obey_robots(httpd):
|
||||
assert requests.get(wb_url, allow_redirects=False).content == expected_payload
|
||||
|
||||
|
||||
def test_login(httpd):
|
||||
@pytest.mark.parametrize("rethinker", ["brozzler"], indirect=True)
|
||||
@pytest.mark.skip(reason="expects brozzler worker daemon running")
|
||||
def test_login(httpd, rethinker):
|
||||
test_id = "test_login-%s" % datetime.datetime.utcnow().isoformat()
|
||||
rr = doublethink.Rethinker("localhost", db="brozzler")
|
||||
rr = rethinker
|
||||
site = brozzler.Site(
|
||||
rr,
|
||||
{
|
||||
@ -550,9 +573,11 @@ def test_login(httpd):
|
||||
) in meth_url
|
||||
|
||||
|
||||
def test_seed_redirect(httpd):
|
||||
@pytest.mark.parametrize("rethinker", ["brozzler"], indirect=True)
|
||||
@pytest.mark.skip(reason="expects brozzler worker daemon running")
|
||||
def test_seed_redirect(httpd, rethinker):
|
||||
test_id = "test_seed_redirect-%s" % datetime.datetime.utcnow().isoformat()
|
||||
rr = doublethink.Rethinker("localhost", db="brozzler")
|
||||
rr = rethinker
|
||||
seed_url = make_url(httpd, "/site5/redirect/")
|
||||
site = brozzler.Site(
|
||||
rr,
|
||||
@ -606,9 +631,11 @@ def test_seed_redirect(httpd):
|
||||
}
|
||||
|
||||
|
||||
def test_hashtags(httpd):
|
||||
@pytest.mark.parametrize("rethinker", ["brozzler"], indirect=True)
|
||||
@pytest.mark.skip(reason="expects brozzler worker daemon running")
|
||||
def test_hashtags(httpd, rethinker):
|
||||
test_id = "test_hashtags-%s" % datetime.datetime.utcnow().isoformat()
|
||||
rr = doublethink.Rethinker("localhost", db="brozzler")
|
||||
rr = rethinker
|
||||
seed_url = make_url(httpd, "/site7/")
|
||||
site = brozzler.Site(
|
||||
rr,
|
||||
@ -660,9 +687,11 @@ def test_hashtags(httpd):
|
||||
assert "thumbnail:%s" % make_url(httpd, "/site7/foo.html") in captures_by_url
|
||||
|
||||
|
||||
def test_redirect_hashtags(httpd):
|
||||
@pytest.mark.parametrize("rethinker", ["brozzler"], indirect=True)
|
||||
@pytest.mark.skip(reason="expects brozzler worker daemon running")
|
||||
def test_redirect_hashtags(httpd, rethinker):
|
||||
test_id = "test_hashtags-%s" % datetime.datetime.utcnow().isoformat()
|
||||
rr = doublethink.Rethinker("localhost", db="brozzler")
|
||||
rr = rethinker
|
||||
seed_url = make_url(httpd, "/site9/")
|
||||
site = brozzler.Site(
|
||||
rr,
|
||||
@ -727,8 +756,10 @@ def test_redirect_hashtags(httpd):
|
||||
# 14. WARCPROX_WRITE_RECORD thumbnail:http://localhost:41243/site9/redirect.html
|
||||
|
||||
|
||||
def test_stop_crawl(httpd):
|
||||
rr = doublethink.Rethinker("localhost", db="brozzler")
|
||||
@pytest.mark.parametrize("rethinker", ["brozzler"], indirect=True)
|
||||
@pytest.mark.skip(reason="expects brozzler worker daemon running")
|
||||
def test_stop_crawl(httpd, rethinker):
|
||||
rr = rethinker
|
||||
frontier = brozzler.RethinkDbFrontier(rr)
|
||||
|
||||
# create a new job with three sites that could be crawled forever
|
||||
@ -787,7 +818,9 @@ def test_stop_crawl(httpd):
|
||||
assert sites[2].status == "FINISHED_STOP_REQUESTED"
|
||||
|
||||
|
||||
def test_warcprox_outage_resiliency(httpd):
|
||||
@pytest.mark.parametrize("rethinker", ["brozzler"], indirect=True)
|
||||
@pytest.mark.skip(reason="expects brozzler worker daemon running")
|
||||
def test_warcprox_outage_resiliency(httpd, rethinker):
|
||||
"""
|
||||
Tests resiliency to warcprox outage.
|
||||
|
||||
@ -799,7 +832,7 @@ def test_warcprox_outage_resiliency(httpd):
|
||||
|
||||
If all instances of warcprox go down, brozzler-worker should sit and wait.
|
||||
"""
|
||||
rr = doublethink.Rethinker("localhost", db="brozzler")
|
||||
rr = rethinker
|
||||
frontier = brozzler.RethinkDbFrontier(rr)
|
||||
|
||||
# run two instances of warcprox
|
||||
@ -912,8 +945,10 @@ def test_warcprox_outage_resiliency(httpd):
|
||||
start_service("warcprox")
|
||||
|
||||
|
||||
def test_time_limit(httpd):
|
||||
rr = doublethink.Rethinker("localhost", db="brozzler")
|
||||
@pytest.mark.parametrize("rethinker", ["brozzler"], indirect=True)
|
||||
@pytest.mark.skip(reason="expects brozzler worker daemon running")
|
||||
def test_time_limit(httpd, rethinker):
|
||||
rr = rethinker
|
||||
frontier = brozzler.RethinkDbFrontier(rr)
|
||||
|
||||
# create a new job with one seed that could be crawled forever
|
||||
@ -940,9 +975,11 @@ def test_time_limit(httpd):
|
||||
assert job.status == "FINISHED"
|
||||
|
||||
|
||||
def test_ydl_stitching(httpd):
|
||||
@pytest.mark.parametrize("rethinker", ["brozzler"], indirect=True)
|
||||
@pytest.mark.skip(reason="expects brozzler worker daemon running")
|
||||
def test_ydl_stitching(httpd, rethinker):
|
||||
test_id = "test_ydl_stitching-%s" % datetime.datetime.utcnow().isoformat()
|
||||
rr = doublethink.Rethinker("localhost", db="brozzler")
|
||||
rr = rethinker
|
||||
frontier = brozzler.RethinkDbFrontier(rr)
|
||||
site = brozzler.Site(
|
||||
rr,
|
||||
|
@ -1052,6 +1052,38 @@ def test_max_claimed_sites_cross_job(rethinker):
|
||||
rr.table("sites").delete().run()
|
||||
|
||||
|
||||
def test_max_claimed_sites_load_perf(rethinker):
|
||||
rr = rethinker
|
||||
frontier = brozzler.RethinkDbFrontier(rr)
|
||||
|
||||
# clean slate
|
||||
rr.table("jobs").delete().run()
|
||||
rr.table("sites").delete().run()
|
||||
|
||||
job_conf = {
|
||||
"id": 1,
|
||||
"seeds": [],
|
||||
"max_claimed_sites": 25,
|
||||
}
|
||||
for i in range(1, 20):
|
||||
job_conf["seeds"].clear()
|
||||
for j in range(0, 1000):
|
||||
job_conf["id"] = i
|
||||
job_conf["seeds"].append({"url": "http://example.com/{}".format(j)})
|
||||
|
||||
assert (len(job_conf["seeds"])) == 1000
|
||||
brozzler.new_job(frontier, job_conf)
|
||||
assert len(list(frontier.job_sites(i))) == 1000
|
||||
|
||||
claim_start_time = time.perf_counter()
|
||||
claimed_sites = frontier.claim_sites(50)
|
||||
claim_end_time = time.perf_counter()
|
||||
assert claim_end_time - claim_start_time < 2
|
||||
assert len(claimed_sites) == 50
|
||||
rr.table("jobs").delete().run()
|
||||
rr.table("sites").delete().run()
|
||||
|
||||
|
||||
def test_choose_warcprox(rethinker):
|
||||
rr = rethinker
|
||||
svcreg = doublethink.ServiceRegistry(rr)
|
||||
|
Loading…
x
Reference in New Issue
Block a user