chore: Additional frontier testing and reformat

parent e7e4225bf2
commit addf73f865
@@ -17,7 +17,7 @@ limitations under the License.
 """

 import datetime
-from typing import List, Dict
+from typing import Dict, List

 import doublethink
 import rethinkdb as rdb
@@ -34,7 +34,9 @@ class UnexpectedDbResult(Exception):


 def filter_claimable_site_ids(
-    active_sites: List[Dict], max_sites_to_claim=1
+    active_sites: List[Dict],
+    reclaim_cooldown: int,
+    max_sites_to_claim=1,
 ) -> List[str]:
     job_counts = {}
     claimable_sites = []
@@ -45,7 +47,7 @@ def filter_claimable_site_ids(

         # If site not claimed and not disclaimed within last 20 seconds
         if not site["claimed"] and site.get("last_disclaimed", 0) <= (
-            now - datetime.timedelta(seconds=20)
+            now - datetime.timedelta(seconds=reclaim_cooldown)
         ):
             is_claimable = True

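For orientation, the eligibility check this hunk parameterizes can be sketched standalone. The helper below is hypothetical: the field names come from the diff, but the epoch-seconds arithmetic is an assumption (the real frontier compares datetime objects).

    import time

    def is_past_cooldown(site, reclaim_cooldown):
        # A site may be (re)claimed only if it is unclaimed and was last
        # disclaimed more than `reclaim_cooldown` seconds ago.
        last_disclaimed = site.get("last_disclaimed", 0)
        return not site["claimed"] and last_disclaimed <= time.time() - reclaim_cooldown

    # e.g. a site disclaimed 30 seconds ago clears a 20-second cooldown
    # but not a 60-second one:
    site = {"claimed": False, "last_disclaimed": time.time() - 30}
    assert is_past_cooldown(site, reclaim_cooldown=20)
    assert not is_past_cooldown(site, reclaim_cooldown=60)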
@@ -176,11 +178,13 @@ class RethinkDbFrontier:
         )
         return active_sites

-    def claim_sites(self, n=1) -> List[Dict]:
+    def claim_sites(self, n=1, reclaim_cooldown=20) -> List[Dict]:
         self.logger.debug("claiming up to %s sites to brozzle", n)

         active_sites = self.get_active_sites()
-        site_ids_to_claim = filter_claimable_site_ids(active_sites, n)
+        site_ids_to_claim = filter_claimable_site_ids(
+            active_sites, reclaim_cooldown, max_sites_to_claim=n
+        )
         result = (
             self.rr.table("sites", read_mode="majority")
             .get_all(r.args(site_ids_to_claim))
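With the new keyword argument, callers can tune or disable the reclaim cooldown. A hedged usage sketch; the connection setup is an assumption (the tests obtain `rr` from a fixture):

    import doublethink
    import brozzler

    # Assumed connection details.
    rr = doublethink.Rethinker(["localhost"], db="brozzler")
    frontier = brozzler.RethinkDbFrontier(rr)

    # Default behavior: sites disclaimed within the last 20 seconds are skipped.
    sites = frontier.claim_sites(n=3)

    # Cooldown disabled: freshly disclaimed sites are immediately reclaimable.
    sites = frontier.claim_sites(n=3, reclaim_cooldown=0)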
@@ -271,7 +271,7 @@ def test_proxy_down():
     chrome_exe = brozzler.suggest_default_chrome_exe()

     with brozzler.Browser(chrome_exe=chrome_exe) as browser:
-        browser.stop() # We're manually instantiating the browser without arguments,
+        browser.stop()  # We're manually instantiating the browser without arguments,
         # so it is running without a proxy. Stop it first.
         with pytest.raises(brozzler.ProxyError):
             worker.brozzle_page(browser, site, page)
@@ -76,8 +76,11 @@ def test_run_command(capsys, cmd):
     )
     out, err = proc.communicate()
     # Remove lines from syntax warning in imported library
-    filtered_lines = [line for line in err.decode("utf-8").splitlines() if "reppy" not in line and
-                      "re.compile" not in line]
+    filtered_lines = [
+        line
+        for line in err.decode("utf-8").splitlines()
+        if "reppy" not in line and "re.compile" not in line
+    ]
     assert filtered_lines == []
     assert out == ("brozzler %s - %s\n" % (brozzler.__version__, cmd)).encode("ascii")

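The reformatted comprehension screens out `SyntaxWarning` noise that the `reppy` dependency emits on import, so the empty-stderr assertion stays meaningful. A self-contained sketch of the same pattern; the subprocess command is illustrative, not the real brozzler CLI invocation:

    import subprocess
    import sys

    # Stand-in command that writes a known warning line to stderr.
    proc = subprocess.Popen(
        [sys.executable, "-c", "import sys; sys.stderr.write('noise from reppy\\n')"],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    out, err = proc.communicate()
    filtered_lines = [
        line
        for line in err.decode("utf-8").splitlines()
        if "reppy" not in line and "re.compile" not in line
    ]
    assert filtered_lines == []  # only the known warning lines were present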
@@ -20,6 +20,7 @@ limitations under the License.

 import argparse
 import datetime
+import itertools
 import logging
 import os
 import time
@@ -933,6 +934,7 @@ def test_max_claimed_sites(rethinker):
     rr.table("sites").delete().run()
+
     job_conf = {
         "id": 1,
         "seeds": [
             {"url": "http://example.com/1"},
             {"url": "http://example.com/2"},
@@ -942,7 +944,7 @@ def test_max_claimed_sites(rethinker):
         ],
         "max_claimed_sites": 3,
     }
-
+    seeds_seen = []
     job = brozzler.new_job(frontier, job_conf)

     assert job.id
@@ -962,13 +964,88 @@ def test_max_claimed_sites(rethinker):
     rr.table("jobs").delete().run()
     rr.table("sites").delete().run()

     job = brozzler.new_job(frontier, job_conf)
     claimed_sites = frontier.claim_sites(2)
     assert len(claimed_sites) == 2
     claimed_sites = frontier.claim_sites(1)
     assert len(claimed_sites) == 1

+
+def test_max_claimed_sites_cross_job(rethinker):
+    rr = rethinker
+    frontier = brozzler.RethinkDbFrontier(rr)
+
+    # clean slate
+    rr.table("jobs").delete().run()
+    rr.table("sites").delete().run()
+
+    job_conf_1 = {
+        "id": 1,
+        "seeds": [
+            {"url": "http://example.com/1"},
+            {"url": "http://example.com/2"},
+            {"url": "http://example.com/3"},
+            {"url": "http://example.com/4"},
+            {"url": "http://example.com/5"},
+        ],
+        "max_claimed_sites": 3,
+    }
+    job_conf_2 = {
+        "id": 2,
+        "seeds": [
+            {"url": "http://example.com/6"},
+            {"url": "http://example.com/7"},
+            {"url": "http://example.com/8"},
+            {"url": "http://example.com/9"},
+            {"url": "http://example.com/10"},
+        ],
+        "max_claimed_sites": 3,
+    }
+
+    seeds_seen = []
+    job_1 = brozzler.new_job(frontier, job_conf_1)
+    job_2 = brozzler.new_job(frontier, job_conf_2)
+
+    assert len(list(frontier.job_sites(job_1.id))) == 5
+    assert len(list(frontier.job_sites(job_2.id))) == 5
+
+    claimed_sites_1 = frontier.claim_sites(4)
+    assert len(claimed_sites_1) == 4
+
+    sites_per_job = {}
+    for site in claimed_sites_1:
+        sites_per_job[site["job_id"]] = sites_per_job.get(site["job_id"], 0) + 1
+
+    # 2 jobs, max of 3 each.
+    assert len(sites_per_job.keys()) == 2
+    assert sites_per_job[1] + sites_per_job[2] == 4
+    assert sites_per_job[1] <= 3 and sites_per_job[2] <= 3
+
+    # 6 sites left in queue, but only 2 are still claimable due to max
+    claimed_sites_2 = frontier.claim_sites(6)
+    assert len(claimed_sites_2) == 2
+
+    # disclaim sites
+    for site in itertools.chain(claimed_sites_1, claimed_sites_2):
+        frontier.disclaim_site(site)
+        seeds_seen.append(site["seed"])
+
+    # Only 4 sites left in queue, that aren't recently claimed
+    claimed_sites_3 = frontier.claim_sites(6)
+    assert len(claimed_sites_3) == 4
+
+    with pytest.raises(brozzler.NothingToClaim):
+        claimed_sites = frontier.claim_sites(1)
+        assert len(claimed_sites) == 1
+
+    for site in claimed_sites_3:
+        seeds_seen.append(site["seed"])
+
+    # ensure all sites have been claimed at this point
+    for seed in itertools.chain(job_conf_1["seeds"], job_conf_2["seeds"]):
+        assert seed["url"] in seeds_seen
+
+    # All unclaimed sites have been recently disclaimed and are not claimable
+    with pytest.raises(brozzler.NothingToClaim):
+        frontier.claim_sites(3)
+
+    # Disable reclaim cooldown. With 4 claimed, we should have 2 available
+    claimed_sites_4 = frontier.claim_sites(4, reclaim_cooldown=0)
+    assert len(claimed_sites_4) == 2
+
     # clean slate for the next one
     rr.table("jobs").delete().run()
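The cross-job assertions above depend on the per-job budget inside `filter_claimable_site_ids`. A minimal sketch of that counting logic, ignoring the cooldown handling; only the field names come from the diff, the helper name and the rest are assumed:

    from typing import Dict, List

    def claimable_ids_sketch(active_sites: List[Dict], max_sites_to_claim: int = 1) -> List[str]:
        job_counts: Dict[int, int] = {}
        claimable: List[str] = []
        for site in active_sites:
            job_id = site["job_id"]
            if site["claimed"]:
                # Already-claimed sites count against their job's budget.
                job_counts[job_id] = job_counts.get(job_id, 0) + 1
                continue
            # Claim an unclaimed site only while its job is under max_claimed_sites.
            if job_counts.get(job_id, 0) < site["max_claimed_sites"]:
                job_counts[job_id] = job_counts.get(job_id, 0) + 1
                claimable.append(site["id"])
            if len(claimable) >= max_sites_to_claim:
                break
        return claimable

    sites = [
        {"id": "a", "job_id": 1, "claimed": True, "max_claimed_sites": 3},
        {"id": "b", "job_id": 1, "claimed": False, "max_claimed_sites": 3},
        {"id": "c", "job_id": 2, "claimed": False, "max_claimed_sites": 3},
    ]
    assert claimable_ids_sketch(sites, max_sites_to_claim=2) == ["b", "c"]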
@@ -277,6 +277,7 @@ def test_ydl_proxy_down():
     with pytest.raises(brozzler.ProxyError):
         brozzler.ydl.do_youtube_dl(worker, site, page)

+
 def test_proxy_down():
     """
     Test all fetching scenarios raise `brozzler.ProxyError` when proxy is down.