mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-12-01 20:54:58 -05:00
fix: We were applying the max_sites_to_claim filter too early. Many sites in a single crawl prevent claimable sites from getting through.
This commit is contained in:
parent
4d1fb31bc6
commit
96942d40f9
3 changed files with 45 additions and 3 deletions
|
|
@ -1052,6 +1052,46 @@ def test_max_claimed_sites_cross_job(rethinker):
|
|||
rr.table("sites").delete().run()
|
||||
|
||||
|
||||
def test_many_active_claimed_sites_cross_job(rethinker):
    """Claiming the max from a large job must not starve a smaller job.

    Regression test for applying the max_claimed_sites filter too early:
    with many sites in a single crawl, claimable sites from other jobs
    were prevented from getting through.
    """
    rr = rethinker
    frontier = brozzler.RethinkDbFrontier(rr)

    # clean slate
    rr.table("jobs").delete().run()
    rr.table("sites").delete().run()

    # Job 1: many seeds, but capped at 3 simultaneously-claimed sites.
    job_conf_1 = {
        "id": 1,
        "seeds": [{"url": f"http://example.com/{i}"} for i in range(0, 2000)],
        "max_claimed_sites": 3,
    }
    # Job 2: five seeds, cap high enough that all five are claimable.
    job_conf_2 = {
        "id": 2,
        "seeds": [
            {"url": "http://example.com/1"},
            {"url": "http://example.com/2"},
            {"url": "http://example.com/3"},
            {"url": "http://example.com/4"},
            {"url": "http://example.com/5"},
        ],
        "max_claimed_sites": 5,
    }

    # new_job() persists the job and its sites; the returned handle is unused.
    brozzler.new_job(frontier, job_conf_1)

    # Claim all possible sites from job 1. We should only get 3 due to max_claimed_sites
    claimed_sites_1 = frontier.claim_sites(4)
    assert len(claimed_sites_1) == 3

    # Add 5 more seeds
    brozzler.new_job(frontier, job_conf_2)

    # We shouldn't have trouble getting seeds from job 2, even though
    # job 1 still has ~2000 unclaimed (but uncl aimable) sites queued.
    claimed_sites_2 = frontier.claim_sites(5)
    assert len(claimed_sites_2) == 5
|
||||
|
||||
|
||||
# Works locally, but reliably fails in CI.
|
||||
@pytest.mark.xfail
|
||||
def test_max_claimed_sites_load_perf(rethinker):
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue