mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-12-01 20:54:58 -05:00
fix: We were applying the max_sites_to_claim filter too early. Many sites in a single crawl prevent claimable sites from getting through.
This commit is contained in:
parent
4d1fb31bc6
commit
96942d40f9
3 changed files with 45 additions and 3 deletions
|
|
@ -1052,6 +1052,46 @@ def test_max_claimed_sites_cross_job(rethinker):
|
|||
rr.table("sites").delete().run()
|
||||
|
||||
|
||||
def test_many_active_claimed_sites_cross_job(rethinker):
    """Claiming the max from a large job must not starve a smaller job.

    Regression test for applying the max_claimed_sites filter too early:
    with many sites in a single crawl, claimable sites from other jobs
    were prevented from getting through.
    """
    rr = rethinker
    frontier = brozzler.RethinkDbFrontier(rr)

    # clean slate
    rr.table("jobs").delete().run()
    rr.table("sites").delete().run()

    # Job 1: many seeds, but capped at 3 simultaneously-claimed sites.
    job_conf_1 = {
        "id": 1,
        "seeds": [{"url": f"http://example.com/{i}"} for i in range(0, 2000)],
        "max_claimed_sites": 3,
    }
    # Job 2: five seeds, cap high enough that all five are claimable.
    job_conf_2 = {
        "id": 2,
        "seeds": [
            {"url": "http://example.com/1"},
            {"url": "http://example.com/2"},
            {"url": "http://example.com/3"},
            {"url": "http://example.com/4"},
            {"url": "http://example.com/5"},
        ],
        "max_claimed_sites": 5,
    }

    # new_job() persists the job and its sites; the returned handle is unused.
    brozzler.new_job(frontier, job_conf_1)

    # Claim all possible sites from job 1. We should only get 3 due to max_claimed_sites
    claimed_sites_1 = frontier.claim_sites(4)
    assert len(claimed_sites_1) == 3

    # Add 5 more seeds
    brozzler.new_job(frontier, job_conf_2)

    # We shouldn't have trouble getting seeds from job 2, even though
    # job 1 still has ~2000 unclaimed (but uncl aimable) sites queued.
    claimed_sites_2 = frontier.claim_sites(5)
    assert len(claimed_sites_2) == 5
|
||||
|
||||
|
||||
# Works locally, but reliably fails in CI.
|
||||
@pytest.mark.xfail
|
||||
def test_max_claimed_sites_load_perf(rethinker):
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue