diff --git a/Makefile b/Makefile
index 409d8d9..cbe9c8f 100644
--- a/Makefile
+++ b/Makefile
@@ -59,3 +59,8 @@ check-format:
 format:
 	$(VIRTUAL_ENV_DIR)/bin/ruff check $(OUTPUT_FLAGS) --select I --fix .
 	$(VIRTUAL_ENV_DIR)/bin/ruff format .
+
+.PHONY: test
+test:
+	uv sync --all-extras
+	uv run py.test tests
diff --git a/brozzler/frontier.py b/brozzler/frontier.py
index 2c872d8..b199c1c 100644
--- a/brozzler/frontier.py
+++ b/brozzler/frontier.py
@@ -65,9 +65,6 @@ def filter_claimable_site_ids(
         if is_claimable:
             claimable_sites.append(site)
 
-        if len(claimable_sites) >= max_sites_to_claim:
-            break
-
     site_ids_to_claim = []
     # gather sites that are under the max without going over
     for site in claimable_sites:
diff --git a/tests/test_frontier.py b/tests/test_frontier.py
index da6f5bf..c0ba2b3 100644
--- a/tests/test_frontier.py
+++ b/tests/test_frontier.py
@@ -1052,6 +1052,51 @@ def test_max_claimed_sites_cross_job(rethinker):
     rr.table("sites").delete().run()
 
 
+def test_many_active_claimed_sites_cross_job(rethinker):
+    """A job at its max_claimed_sites limit must not starve other jobs.
+
+    Job 1 has far more claimable sites than it may claim at once; once it
+    has reached its limit, sites from job 2 must still be claimable.
+    """
+    rr = rethinker
+    frontier = brozzler.RethinkDbFrontier(rr)
+
+    # clean slate
+    rr.table("jobs").delete().run()
+    rr.table("sites").delete().run()
+
+    job_conf_1 = {
+        "id": 1,
+        "seeds": [{"url": f"http://example.com/{i}"} for i in range(2000)],
+        "max_claimed_sites": 3,
+    }
+    job_conf_2 = {
+        "id": 2,
+        "seeds": [{"url": f"http://example.com/{i}"} for i in range(1, 6)],
+        "max_claimed_sites": 5,
+    }
+
+    brozzler.new_job(frontier, job_conf_1)
+
+    # Claim all possible sites from job 1.
+    # We should only get 3 due to max_claimed_sites
+    claimed_sites_1 = frontier.claim_sites(4)
+    assert len(claimed_sites_1) == 3
+
+    # Add a second job with 5 more seeds
+    job_2 = brozzler.new_job(frontier, job_conf_2)
+
+    # We shouldn't have trouble getting seeds from job 2, even though job 1
+    # still has many unclaimed sites
+    claimed_sites_2 = frontier.claim_sites(5)
+    assert len(claimed_sites_2) == 5
+    assert all(site.job_id == job_2.id for site in claimed_sites_2)
+
+    # clean up
+    rr.table("jobs").delete().run()
+    rr.table("sites").delete().run()
+
+
 # Works locally, but reliably fails in CI.
 @pytest.mark.xfail
 def test_max_claimed_sites_load_perf(rethinker):