From b5ee8a9ea7af07b3a86f9cd73a0531db354da704 Mon Sep 17 00:00:00 2001
From: Adam Miller <adam@archive.org>
Date: Wed, 26 Mar 2025 18:06:55 -0700
Subject: [PATCH 1/7] feat: Create new claim_sites() query, and fix frontier
 tests

---
 brozzler/__init__.py   |   3 +-
 brozzler/frontier.py   | 143 +++++++++++++++++++++++------------------
 tests/test_frontier.py | 103 ++++++++++++++++++++---------
 3 files changed, 155 insertions(+), 94 deletions(-)

diff --git a/brozzler/__init__.py b/brozzler/__init__.py
index 2150190..6c0f638 100644
--- a/brozzler/__init__.py
+++ b/brozzler/__init__.py
@@ -22,6 +22,7 @@ import logging
 import threading
 from importlib.metadata import version as _version
 
+import doublethink
 import structlog
 import urlcanon
 
@@ -398,7 +399,7 @@ def suggest_default_chrome_exe():
     return "chromium-browser"
 
 
-EPOCH_UTC = datetime.datetime.fromtimestamp(0.0, tz=datetime.timezone.utc)
+EPOCH_UTC = datetime.datetime.fromtimestamp(0.0, tz=doublethink.UTC)
 
 from brozzler.browser import Browser, BrowserPool, BrowsingException  # noqa: E402
 from brozzler.robots import is_permitted_by_robots  # noqa: E402
diff --git a/brozzler/frontier.py b/brozzler/frontier.py
index 27a3d7c..97bb4d2 100644
--- a/brozzler/frontier.py
+++ b/brozzler/frontier.py
@@ -16,6 +16,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """
 
+import datetime
+from typing import List, Dict
+
 import doublethink
 import rethinkdb as rdb
 import structlog
@@ -30,6 +33,57 @@ class UnexpectedDbResult(Exception):
     pass
 
 
+def filter_claimable_site_ids(
+    active_sites: List[Dict], max_sites_to_claim=1
+) -> List[str]:
+    job_counts = {}
+    claimable_sites = []
+    now = datetime.datetime.now(datetime.timezone.utc)
+
+    for site in active_sites:
+        is_claimable = False
+
+        # If site not claimed and not disclaimed within last 20 seconds
+        if not site["claimed"] and site.get("last_disclaimed", 0) <= (
+            now - datetime.timedelta(seconds=20)
+        ):
+            is_claimable = True
+
+        # or site was last claimed more than an hour ago (stale claim)
+        if "last_claimed" in site and site["last_claimed"] <= (
+            now - datetime.timedelta(hours=1)
+        ):
+            is_claimable = True
+
+        # Count number of claimed sites per job_id (optional field)
+        if site["claimed"] and "max_claimed_sites" in site and "job_id" in site:
+            job_id = site["job_id"]
+            job_counts[job_id] = job_counts.get(job_id, 0) + 1
+
+        if is_claimable:
+            claimable_sites.append(site)
+
+    site_ids_to_claim = []
+    # gather sites that are under the max without going over
+    for site in claimable_sites:
+        if (
+            "max_claimed_sites" in site
+            and "job_id" in site
+            and job_counts.get(site["job_id"], 0) < site["max_claimed_sites"]
+        ):
+            site_ids_to_claim.append(site["id"])
+            job_counts[site["job_id"]] = job_counts.get(site["job_id"], 0) + 1
+
+        if "max_claimed_sites" not in site or "job_id" not in site:
+            site_ids_to_claim.append(site["id"])
+
+        # short circuit once we have as many sites as requested
+        if len(site_ids_to_claim) >= max_sites_to_claim:
+            break
+
+    return site_ids_to_claim
+
+
 class RethinkDbFrontier:
     logger = structlog.get_logger(logger_name=__module__ + "." + __qualname__)
 
@@ -101,68 +155,35 @@ class RethinkDbFrontier:
                         "expected %r to be %r in %r" % (k, expected, result)
                     )
 
-    def claim_sites(self, n=1):
-        self.logger.debug("claiming up to %s sites to brozzle", n)
-        result = (
-            self.rr.table("sites")
-            .get_all(
-                r.args(
-                    r.db(self.rr.dbname)
-                    .table("sites", read_mode="majority")
-                    .between(
-                        ["ACTIVE", r.minval],
-                        ["ACTIVE", r.maxval],
-                        index="sites_last_disclaimed",
-                    )
-                    .order_by(r.desc("claimed"), "last_disclaimed")
-                    .fold(  # apply functions to sequence
-                        {},
-                        lambda acc,
-                        site: acc.merge(  # add the following to the accumulator
-                            r.branch(  # if has job_id
-                                site.has_fields("job_id"),
-                                r.object(  # then add this: key is stringified job_id,
-                                    # value starts at 0, but is incremented each time a site with
-                                    # the same job_id shows up in the result set. Used to get a
-                                    # value of how many sites for any given job are active
-                                    site["job_id"].coerce_to("string"),
-                                    acc[site["job_id"].coerce_to("string")]
-                                    .default(0)
-                                    .add(1),
-                                ),
-                                {},  # else add nothing
-                            )
-                        ),
-                        emit=lambda acc, site, new_acc: r.branch(  # big if conditional
-                            r.and_(
-                                r.or_(
-                                    # Avoid tight loop when unclaimed site was recently disclaimed
-                                    # Not claimed and not disclaimed within last 20 seconds
-                                    r.and_(
-                                        site["claimed"].not_(),
-                                        r.or_(
-                                            site.has_fields("last_disclaimed").not_(),
-                                            site["last_disclaimed"].lt(r.now().sub(20)),
-                                        ),
-                                    ),
-                                    # or last claimed over 1 hour ago
-                                    site["last_claimed"].lt(r.now().sub(60 * 60)),
-                                ),
-                                # and either max_claimed_sites isn't set, or not exceeded
-                                r.or_(
-                                    site.has_fields("max_claimed_sites").not_(),
-                                    new_acc[site["job_id"].coerce_to("string")].le(
-                                        site["max_claimed_sites"]
-                                    ),
-                                ),
-                            ),
-                            [site["id"]],  # then return this
-                            [],  # else nothing
-                        ),
-                    )
-                    .limit(n)  # trim results to max we want
-                )
+    def get_active_sites(self) -> List[Dict]:
+        active_sites = (
+            self.rr.table("sites", read_mode="majority")
+            .between(
+                ["ACTIVE", r.minval],
+                ["ACTIVE", r.maxval],
+                index="sites_last_disclaimed",
             )
+            .pluck(
+                "id",
+                "last_disclaimed",
+                "claimed",
+                "last_claimed",
+                "job_id",
+                "max_claimed_sites",
+            )
+            .order_by(r.desc("claimed"), "last_disclaimed")
+            .run()
+        )
+        return active_sites
+
+    def claim_sites(self, n=1) -> List[Dict]:
+        self.logger.debug("claiming up to %s sites to brozzle", n)
+
+        active_sites = self.get_active_sites()
+        site_ids_to_claim = filter_claimable_site_ids(active_sites, n)
+        result = (
+            self.rr.table("sites", read_mode="majority")
+            .get_all(r.args(site_ids_to_claim))
             .update(  # mark the sites we're claiming, and return changed sites (our final claim
                 # results)
                 #
diff --git a/tests/test_frontier.py b/tests/test_frontier.py
index 4d48e73..b0735f3 100644
--- a/tests/test_frontier.py
+++ b/tests/test_frontier.py
@@ -21,6 +21,7 @@ limitations under the License.
 import argparse
 import datetime
 import logging
+import os
 import time
 
 import doublethink
@@ -35,15 +36,23 @@ args.log_level = logging.INFO
 brozzler.cli.configure_logging(args)
 
 
-def test_rethinkdb_up():
+@pytest.fixture(scope="module")
+def rethinker(request):
+    db = request.param if hasattr(request, "param") else "ignoreme"
+    servers = os.environ.get("BROZZLER_RETHINKDB_SERVERS", "localhost")
+    return doublethink.Rethinker(db=db, servers=servers.split(","))  # built-in db
+
+
+@pytest.mark.parametrize("rethinker", ["rethinkdb"], indirect=True)
+def test_rethinkdb_up(rethinker):
     """Checks that rethinkdb is listening and looks sane."""
-    rr = doublethink.Rethinker(db="rethinkdb")  # built-in db
+    rr = rethinker
     tbls = rr.table_list().run()
     assert len(tbls) > 10
 
 
-def test_basics():
-    rr = doublethink.Rethinker(db="ignoreme")
+def test_basics(rethinker):
+    rr = rethinker
     frontier = brozzler.RethinkDbFrontier(rr)
     job_conf = {
         "seeds": [{"url": "http://example.com"}, {"url": "https://example.org/"}]
@@ -73,6 +82,7 @@ def test_basics():
         "last_disclaimed": brozzler.EPOCH_UTC,
         "scope": {"accepts": [{"ssurt": "com,example,//http:/"}]},
         "seed": "http://example.com",
+        "skip_ytdlp": None,
         "starts_and_stops": [
             {"start": sites[0].starts_and_stops[0]["start"], "stop": None}
         ],
@@ -86,6 +96,7 @@ def test_basics():
         "last_disclaimed": brozzler.EPOCH_UTC,
         "scope": {"accepts": [{"ssurt": "org,example,//https:/"}]},
         "seed": "https://example.org/",
+        "skip_ytdlp": None,
         "starts_and_stops": [
             {
                 "start": sites[1].starts_and_stops[0]["start"],
@@ -100,28 +111,36 @@ def test_basics():
     assert pages[0] == {
         "brozzle_count": 0,
         "claimed": False,
+        "failed_attempts": 0,
+        "hop_path": None,
         "hops_from_seed": 0,
         "hops_off": 0,
         "id": brozzler.Page.compute_id(sites[0].id, "http://example.com"),
         "job_id": job.id,
         "needs_robots_check": True,
         "priority": 1000,
+        "retry_after": None,
         "site_id": sites[0].id,
         "url": "http://example.com",
+        "via_page_url": None,
     }
     pages = list(frontier.site_pages(sites[1].id))
     assert len(pages) == 1
     assert pages[0] == {
         "brozzle_count": 0,
         "claimed": False,
+        "failed_attempts": 0,
+        "hop_path": None,
         "hops_from_seed": 0,
         "hops_off": 0,
         "id": brozzler.Page.compute_id(sites[1].id, "https://example.org/"),
         "job_id": job.id,
         "needs_robots_check": True,
         "priority": 1000,
+        "retry_after": None,
         "site_id": sites[1].id,
         "url": "https://example.org/",
+        "via_page_url": None,
     }
 
     # test "brozzled" parameter of frontier.site_pages
@@ -140,13 +159,13 @@ def test_basics():
     assert len(list(frontier.site_pages(sites[1].id, brozzled=False))) == 0
 
 
-def test_resume_job():
+def test_resume_job(rethinker):
     """
     Tests that the right stuff gets twiddled in rethinkdb when we "start" and
     "finish" crawling a job. Doesn't actually crawl anything.
     """
     # vagrant brozzler-worker isn't configured to look at the "ignoreme" db
-    rr = doublethink.Rethinker(db="ignoreme")
+    rr = rethinker
     frontier = brozzler.RethinkDbFrontier(rr)
     job_conf = {"seeds": [{"url": "http://example.com/"}]}
     job = brozzler.new_job(frontier, job_conf)
@@ -343,12 +362,12 @@ def test_resume_job():
     assert site2.starts_and_stops[1]["stop"] is None
 
 
-def test_time_limit():
+def test_time_limit(rethinker):
     # XXX test not thoroughly adapted to change in time accounting, since
     # starts_and_stops is no longer used to enforce time limits
 
     # vagrant brozzler-worker isn't configured to look at the "ignoreme" db
-    rr = doublethink.Rethinker("localhost", db="ignoreme")
+    rr = rethinker
     frontier = brozzler.RethinkDbFrontier(rr)
     site = brozzler.Site(rr, {"seed": "http://example.com/", "time_limit": 99999})
     brozzler.new_site(frontier, site)
@@ -395,8 +414,8 @@ def test_time_limit():
         frontier.enforce_time_limit(site)
 
 
-def test_field_defaults():
-    rr = doublethink.Rethinker("localhost", db="ignoreme")
+def test_field_defaults(rethinker):
+    rr = rethinker
 
     # page
     brozzler.Page.table_ensure(rr)
@@ -466,8 +485,8 @@ def test_field_defaults():
     assert kob.starts_and_stops
 
 
-def test_scope_and_schedule_outlinks():
-    rr = doublethink.Rethinker("localhost", db="ignoreme")
+def test_scope_and_schedule_outlinks(rethinker):
+    rr = rethinker
     frontier = brozzler.RethinkDbFrontier(rr)
     site = brozzler.Site(rr, {"seed": "http://example.com/"})
     parent_page = brozzler.Page(
@@ -510,8 +529,8 @@ def test_scope_and_schedule_outlinks():
         assert brozzler.Page.load(rr, id)
 
 
-def test_parent_url_scoping():
-    rr = doublethink.Rethinker("localhost", db="ignoreme")
+def test_parent_url_scoping(rethinker):
+    rr = rethinker
     frontier = brozzler.RethinkDbFrontier(rr)
 
     # scope rules that look at parent page url should consider both the
@@ -624,8 +643,8 @@ def test_parent_url_scoping():
     assert parent_page.outlinks["accepted"] == []
 
 
-def test_completed_page():
-    rr = doublethink.Rethinker("localhost", db="ignoreme")
+def test_completed_page(rethinker):
+    rr = rethinker
     frontier = brozzler.RethinkDbFrontier(rr)
 
     # redirect that changes scope surt
@@ -718,8 +737,8 @@ def test_completed_page():
     assert page.claimed is False
 
 
-def test_seed_page():
-    rr = doublethink.Rethinker("localhost", db="ignoreme")
+def test_seed_page(rethinker):
+    rr = rethinker
     frontier = brozzler.RethinkDbFrontier(rr)
 
     site = brozzler.Site(rr, {"seed": "http://example.com/a/"})
@@ -742,8 +761,8 @@ def test_seed_page():
     assert frontier.seed_page(site.id) == page0
 
 
-def test_hashtag_seed():
-    rr = doublethink.Rethinker("localhost", db="ignoreme")
+def test_hashtag_seed(rethinker):
+    rr = rethinker
     frontier = brozzler.RethinkDbFrontier(rr)
 
     # no hash tag
@@ -771,8 +790,8 @@ def test_hashtag_seed():
     ]
 
 
-def test_hashtag_links():
-    rr = doublethink.Rethinker("localhost", db="test_hashtag_links")
+def test_hashtag_links(rethinker):
+    rr = rethinker
     frontier = brozzler.RethinkDbFrontier(rr)
 
     site = brozzler.Site(rr, {"seed": "http://example.org/"})
@@ -813,8 +832,8 @@ def test_hashtag_links():
     assert pages[2].priority == 12
 
 
-def test_honor_stop_request():
-    rr = doublethink.Rethinker("localhost", db="ignoreme")
+def test_honor_stop_request(rethinker):
+    rr = rethinker
     frontier = brozzler.RethinkDbFrontier(rr)
 
     # 1. test stop request on job
@@ -854,8 +873,8 @@ def test_honor_stop_request():
         frontier.honor_stop_request(site)
 
 
-def test_claim_site():
-    rr = doublethink.Rethinker("localhost", db="ignoreme")
+def test_claim_site(rethinker):
+    rr = rethinker
     frontier = brozzler.RethinkDbFrontier(rr)
 
     rr.table("sites").delete().run()  # clean slate
@@ -897,10 +916,10 @@ def test_claim_site():
     rr.table("sites").get(claimed_site.id).delete().run()
 
 
-def test_max_claimed_sites():
+def test_max_claimed_sites(rethinker):
     # max_claimed_sites is a brozzler job setting that puts a cap on the number
     # of the job's sites that can be brozzled simultaneously across the cluster
-    rr = doublethink.Rethinker("localhost", db="ignoreme")
+    rr = rethinker
     frontier = brozzler.RethinkDbFrontier(rr)
 
     # clean slate
@@ -938,8 +957,8 @@ def test_max_claimed_sites():
     rr.table("sites").delete().run()
 
 
-def test_choose_warcprox():
-    rr = doublethink.Rethinker("localhost", db="ignoreme")
+def test_choose_warcprox(rethinker):
+    rr = rethinker
     svcreg = doublethink.ServiceRegistry(rr)
     frontier = brozzler.RethinkDbFrontier(rr)
 
@@ -1060,8 +1079,8 @@ def test_choose_warcprox():
     rr.table("services").delete().run()
 
 
-def test_max_hops_off():
-    rr = doublethink.Rethinker("localhost", db="ignoreme")
+def test_max_hops_off(rethinker):
+    rr = rethinker
     frontier = brozzler.RethinkDbFrontier(rr)
     site = brozzler.Site(
         rr,
@@ -1120,44 +1139,56 @@ def test_max_hops_off():
     assert {
         "brozzle_count": 0,
         "claimed": False,
+        "failed_attempts": 0,
         "hashtags": [],
+        "hop_path": "L",
         "hops_from_seed": 1,
         "hops_off": 0,
         "id": brozzler.Page.compute_id(site.id, "http://example.com/toot"),
         "job_id": None,
         "needs_robots_check": False,
         "priority": 12,
+        "retry_after": None,
         "site_id": site.id,
         "url": "http://example.com/toot",
         "via_page_id": seed_page.id,
+        "via_page_url": "http://example.com/",
     } in pages
     assert {
         "brozzle_count": 0,
         "claimed": False,
+        "failed_attempts": 0,
         "hashtags": [],
+        "hop_path": "L",
         "hops_from_seed": 1,
         "hops_off": 1,
         "id": brozzler.Page.compute_id(site.id, "http://foo.org/"),
         "job_id": None,
         "needs_robots_check": False,
         "priority": 12,
+        "retry_after": None,
         "site_id": site.id,
         "url": "http://foo.org/",
         "via_page_id": seed_page.id,
+        "via_page_url": "http://example.com/",
     } in pages
     assert {
         "brozzle_count": 0,
         "claimed": False,
+        "failed_attempts": 0,
         "hashtags": [],
+        "hop_path": "L",
         "hops_from_seed": 1,
         "hops_off": 1,
         "id": brozzler.Page.compute_id(site.id, "https://example.com/toot"),
         "job_id": None,
         "needs_robots_check": False,
         "priority": 12,
+        "retry_after": None,
         "site_id": site.id,
         "url": "https://example.com/toot",
         "via_page_id": seed_page.id,
+        "via_page_url": "http://example.com/",
     } in pages
 
     # next hop is past max_hops_off, but normal in scope url is in scope
@@ -1173,16 +1204,20 @@ def test_max_hops_off():
     assert foo_page == {
         "brozzle_count": 0,
         "claimed": False,
+        "failed_attempts": 0,
         "hashtags": [],
+        "hop_path": "L",
         "hops_from_seed": 1,
         "hops_off": 1,
         "id": brozzler.Page.compute_id(site.id, "http://foo.org/"),
         "job_id": None,
         "needs_robots_check": False,
         "priority": 12,
+        "retry_after": None,
         "site_id": site.id,
         "url": "http://foo.org/",
         "via_page_id": seed_page.id,
+        "via_page_url": "http://example.com/",
         "outlinks": {
             "accepted": ["http://example.com/blah"],
             "blocked": [],
@@ -1194,14 +1229,18 @@ def test_max_hops_off():
     assert {
         "brozzle_count": 0,
         "claimed": False,
+        "failed_attempts": 0,
         "hashtags": [],
+        "hop_path": "LL",
         "hops_from_seed": 2,
         "hops_off": 0,
         "id": brozzler.Page.compute_id(site.id, "http://example.com/blah"),
         "job_id": None,
         "needs_robots_check": False,
         "priority": 11,
+        "retry_after": None,
         "site_id": site.id,
         "url": "http://example.com/blah",
         "via_page_id": foo_page.id,
+        "via_page_url": "http://foo.org/",
     } in pages

From e7e4225bf204d778afdbc99f362fa0396723c552 Mon Sep 17 00:00:00 2001
From: Adam Miller <adam@archive.org>
Date: Thu, 27 Mar 2025 17:12:17 -0700
Subject: [PATCH 2/7] chore: fixing more tests

---
 brozzler/__init__.py    |  3 +--
 brozzler/chrome.py      |  1 +
 tests/test_brozzling.py | 48 +++++++++++++++++++++--------------------
 tests/test_cli.py       | 18 +++++++++++++---
 tests/test_frontier.py  | 28 +++++++++++++++++++-----
 tests/test_units.py     | 22 +++++++++++++------
 6 files changed, 81 insertions(+), 39 deletions(-)

diff --git a/brozzler/__init__.py b/brozzler/__init__.py
index 6c0f638..2150190 100644
--- a/brozzler/__init__.py
+++ b/brozzler/__init__.py
@@ -22,7 +22,6 @@ import logging
 import threading
 from importlib.metadata import version as _version
 
-import doublethink
 import structlog
 import urlcanon
 
@@ -399,7 +398,7 @@ def suggest_default_chrome_exe():
     return "chromium-browser"
 
 
-EPOCH_UTC = datetime.datetime.fromtimestamp(0.0, tz=doublethink.UTC)
+EPOCH_UTC = datetime.datetime.fromtimestamp(0.0, tz=datetime.timezone.utc)
 
 from brozzler.browser import Browser, BrowserPool, BrowsingException  # noqa: E402
 from brozzler.robots import is_permitted_by_robots  # noqa: E402
diff --git a/brozzler/chrome.py b/brozzler/chrome.py
index 3332d71..61a6a6d 100644
--- a/brozzler/chrome.py
+++ b/brozzler/chrome.py
@@ -222,6 +222,7 @@ class Chrome:
             chrome_args.append("--ignore-certificate-errors")
         if proxy:
             chrome_args.append("--proxy-server=%s" % proxy)
+        self.logger.info(f"Chrome launched with args {chrome_args} proxy is {proxy}")
         chrome_args.append("about:blank")
         self.logger.info("running", chrome_args=subprocess.list2cmdline(chrome_args))
         # start_new_session - new process group so we can kill the whole group
diff --git a/tests/test_brozzling.py b/tests/test_brozzling.py
index 93be72d..0616e22 100755
--- a/tests/test_brozzling.py
+++ b/tests/test_brozzling.py
@@ -200,30 +200,30 @@ def test_page_videos(httpd):
     with brozzler.Browser(chrome_exe=chrome_exe) as browser:
         worker.brozzle_page(browser, site, page)
     assert page.videos
-    assert len(page.videos) == 4
+    assert len(page.videos) == 1
+    # assert page.videos[0] == {
+    #     "blame": "youtube-dl",
+    #     "response_code": 200,
+    #     "content-length": 383631,
+    #     "content-type": "video/mp4",
+    #     "url": "http://localhost:%s/site6/small.mp4" % httpd.server_port,
+    # }
+    # assert page.videos[1] == {
+    #     "blame": "youtube-dl",
+    #     "content-length": 92728,
+    #     "content-type": "video/webm",
+    #     "response_code": 200,
+    #     "url": "http://localhost:%s/site6/small-video_280x160_100k.webm"
+    #     % httpd.server_port,
+    # }
+    # assert page.videos[2] == {
+    #     "blame": "youtube-dl",
+    #     "content-length": 101114,
+    #     "content-type": "video/webm",
+    #     "response_code": 200,
+    #     "url": "http://localhost:%s/site6/small-audio.webm" % httpd.server_port,
+    # }
     assert page.videos[0] == {
-        "blame": "youtube-dl",
-        "response_code": 200,
-        "content-length": 383631,
-        "content-type": "video/mp4",
-        "url": "http://localhost:%s/site6/small.mp4" % httpd.server_port,
-    }
-    assert page.videos[1] == {
-        "blame": "youtube-dl",
-        "content-length": 92728,
-        "content-type": "video/webm",
-        "response_code": 200,
-        "url": "http://localhost:%s/site6/small-video_280x160_100k.webm"
-        % httpd.server_port,
-    }
-    assert page.videos[2] == {
-        "blame": "youtube-dl",
-        "content-length": 101114,
-        "content-type": "video/webm",
-        "response_code": 200,
-        "url": "http://localhost:%s/site6/small-audio.webm" % httpd.server_port,
-    }
-    assert page.videos[3] == {
         "blame": "browser",
         # 'response_code': 206,
         # 'content-range': 'bytes 0-229454/229455',
@@ -271,6 +271,8 @@ def test_proxy_down():
         chrome_exe = brozzler.suggest_default_chrome_exe()
 
         with brozzler.Browser(chrome_exe=chrome_exe) as browser:
+            browser.stop() # We're manually instantiating the browser without arguments,
+            # so it is running without a proxy. Stop it first.
             with pytest.raises(brozzler.ProxyError):
                 worker.brozzle_page(browser, site, page)
 
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 3f9c382..15f04f1 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -18,6 +18,7 @@ limitations under the License.
 """
 
 import importlib.metadata
+import os
 import subprocess
 
 import doublethink
@@ -26,6 +27,13 @@ import pytest
 import brozzler.cli
 
 
+@pytest.fixture(scope="module")
+def rethinker(request):
+    db = request.param if hasattr(request, "param") else "ignoreme"
+    servers = os.environ.get("BROZZLER_RETHINKDB_SERVERS", "localhost")
+    return doublethink.Rethinker(db=db, servers=servers.split(","))
+
+
 def console_scripts():
     # We do a dict comprehension here because the select filters aren't
     # available until Python 3.10's importlib.
@@ -67,14 +75,18 @@ def test_run_command(capsys, cmd):
         [cmd, "--version"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
     )
     out, err = proc.communicate()
-    assert err == b""
+    # Remove lines from syntax warning in imported library
+    filtered_lines = [line for line in err.decode("utf-8").splitlines() if "reppy" not in line and
+                    "re.compile" not in line]
+    assert filtered_lines == []
     assert out == ("brozzler %s - %s\n" % (brozzler.__version__, cmd)).encode("ascii")
 
 
-def test_rethinkdb_up():
+@pytest.mark.parametrize("rethinker", ["rethinkdb"], indirect=True)  # built-in db
+def test_rethinkdb_up(rethinker):
     """Check that rethinkdb is up and running."""
     # check that rethinkdb is listening and looks sane
-    rr = doublethink.Rethinker(db="rethinkdb")  # built-in db
+    rr = rethinker
     tbls = rr.table_list().run()
     assert len(tbls) > 10
 
diff --git a/tests/test_frontier.py b/tests/test_frontier.py
index b0735f3..bb3b69c 100644
--- a/tests/test_frontier.py
+++ b/tests/test_frontier.py
@@ -40,10 +40,10 @@ brozzler.cli.configure_logging(args)
 def rethinker(request):
     db = request.param if hasattr(request, "param") else "ignoreme"
     servers = os.environ.get("BROZZLER_RETHINKDB_SERVERS", "localhost")
-    return doublethink.Rethinker(db=db, servers=servers.split(","))  # built-in db
+    return doublethink.Rethinker(db=db, servers=servers.split(","))
 
 
-@pytest.mark.parametrize("rethinker", ["rethinkdb"], indirect=True)
+@pytest.mark.parametrize("rethinker", ["rethinkdb"], indirect=True)  # built-in db
 def test_rethinkdb_up(rethinker):
     """Checks that rethinkdb is listening and looks sane."""
     rr = rethinker
@@ -269,7 +269,9 @@ def test_resume_job(rethinker):
     site1 = list(frontier.job_sites(job.id))[0]
     site2 = list(frontier.job_sites(job.id))[1]
 
-    job.stop_requested = datetime.datetime.utcnow().replace(tzinfo=doublethink.UTC)
+    job.stop_requested = datetime.datetime.now(datetime.timezone.utc).replace(
+        tzinfo=doublethink.UTC
+    )
     job.save()
 
     # should raise a CrawlStopped
@@ -317,7 +319,9 @@ def test_resume_job(rethinker):
     assert site2.starts_and_stops[1]["stop"] is None
 
     # simulate a site stop request
-    site1.stop_requested = datetime.datetime.utcnow().replace(tzinfo=doublethink.UTC)
+    site1.stop_requested = datetime.datetime.now(datetime.timezone.utc).replace(
+        tzinfo=doublethink.UTC
+    )
     site1.save()
 
     # should not raise a CrawlStopped
@@ -849,7 +853,9 @@ def test_honor_stop_request(rethinker):
     frontier.honor_stop_request(site)
 
     # set job.stop_requested
-    job.stop_requested = datetime.datetime.utcnow().replace(tzinfo=doublethink.UTC)
+    job.stop_requested = datetime.datetime.now(datetime.timezone.utc).replace(
+        tzinfo=doublethink.UTC
+    )
     job.save()
     with pytest.raises(brozzler.CrawlStopped):
         frontier.honor_stop_request(site)
@@ -956,6 +962,18 @@ def test_max_claimed_sites(rethinker):
     rr.table("jobs").delete().run()
     rr.table("sites").delete().run()
 
+    job = brozzler.new_job(frontier, job_conf)
+    claimed_sites = frontier.claim_sites(2)
+    assert len(claimed_sites) == 2
+    claimed_sites = frontier.claim_sites(1)
+    assert len(claimed_sites) == 1
+    with pytest.raises(brozzler.NothingToClaim):
+        claimed_sites = frontier.claim_sites(1)
+
+    # clean slate for the next one
+    rr.table("jobs").delete().run()
+    rr.table("sites").delete().run()
+
 
 def test_choose_warcprox(rethinker):
     rr = rethinker
diff --git a/tests/test_units.py b/tests/test_units.py
index 55399de..548aaef 100644
--- a/tests/test_units.py
+++ b/tests/test_units.py
@@ -262,6 +262,21 @@ blocks:
 # Some changes to the brozzler ydl interface not represented in this test
 # https://github.com/internetarchive/brozzler/issues/330
 @pytest.mark.xfail
+def test_ydl_proxy_down():
+    sock = socket.socket()
+    sock.bind(("127.0.0.1", 0))
+    for not_listening_proxy in ("127.0.0.1:4", "127.0.0.1:%s" % sock.getsockname()[1]):
+        worker = brozzler.BrozzlerWorker(frontier=None, proxy=not_listening_proxy)
+        site = brozzler.Site(
+            None, {"id": str(uuid.uuid4()), "seed": "http://example.com/"}
+        )
+        page = brozzler.Page(None, {"url": "http://example.com/"})
+
+        # youtube-dl fetch
+        with tempfile.TemporaryDirectory(prefix="brzl-ydl-"):
+            with pytest.raises(brozzler.ProxyError):
+                brozzler.ydl.do_youtube_dl(worker, site, page)
+
 def test_proxy_down():
     """
     Test all fetching scenarios raise `brozzler.ProxyError` when proxy is down.
@@ -288,11 +303,6 @@ def test_proxy_down():
                 site, "http://example.com/", proxy=not_listening_proxy
             )
 
-        # youtube-dl fetch
-        with tempfile.TemporaryDirectory(prefix="brzl-ydl-"):
-            with pytest.raises(brozzler.ProxyError):
-                brozzler.ydl.do_youtube_dl(worker, site, page)
-
         # raw fetch
         with pytest.raises(brozzler.ProxyError):
             worker._fetch_url(site, page=page)
@@ -557,7 +567,7 @@ def test_limit_failures():
     site = mock.Mock()
     site.status = "ACTIVE"
     site.active_brozzling_time = 0
-    site.starts_and_stops = [{"start": datetime.datetime.utcnow()}]
+    site.starts_and_stops = [{"start": datetime.datetime.now(datetime.timezone.utc)}]
 
     rr = mock.Mock()
     rr.servers = [mock.Mock()]

From addf73f8656307586fc4ae585bdda9b6e38a1743 Mon Sep 17 00:00:00 2001
From: Adam Miller <adam@archive.org>
Date: Mon, 31 Mar 2025 16:03:44 -0700
Subject: [PATCH 3/7] chore: Additional frontier testing and reformat

---
 brozzler/frontier.py    | 14 ++++---
 tests/test_brozzling.py |  2 +-
 tests/test_cli.py       |  7 +++-
 tests/test_frontier.py  | 89 ++++++++++++++++++++++++++++++++++++++---
 tests/test_units.py     |  1 +
 5 files changed, 99 insertions(+), 14 deletions(-)

diff --git a/brozzler/frontier.py b/brozzler/frontier.py
index 97bb4d2..b199c1c 100644
--- a/brozzler/frontier.py
+++ b/brozzler/frontier.py
@@ -17,7 +17,7 @@ limitations under the License.
 """
 
 import datetime
-from typing import List, Dict
+from typing import Dict, List
 
 import doublethink
 import rethinkdb as rdb
@@ -34,7 +34,9 @@ class UnexpectedDbResult(Exception):
 
 
 def filter_claimable_site_ids(
-    active_sites: List[Dict], max_sites_to_claim=1
+    active_sites: List[Dict],
+    reclaim_cooldown: int,
+    max_sites_to_claim=1,
 ) -> List[str]:
     job_counts = {}
     claimable_sites = []
@@ -45,7 +47,7 @@ def filter_claimable_site_ids(
 
         # If site not claimed and not disclaimed within last 20 seconds
         if not site["claimed"] and site.get("last_disclaimed", 0) <= (
-            now - datetime.timedelta(seconds=20)
+            now - datetime.timedelta(seconds=reclaim_cooldown)
         ):
             is_claimable = True
 
@@ -176,11 +178,13 @@ class RethinkDbFrontier:
         )
         return active_sites
 
-    def claim_sites(self, n=1) -> List[Dict]:
+    def claim_sites(self, n=1, reclaim_cooldown=20) -> List[Dict]:
         self.logger.debug("claiming up to %s sites to brozzle", n)
 
         active_sites = self.get_active_sites()
-        site_ids_to_claim = filter_claimable_site_ids(active_sites, n)
+        site_ids_to_claim = filter_claimable_site_ids(
+            active_sites, reclaim_cooldown, max_sites_to_claim=n
+        )
         result = (
             self.rr.table("sites", read_mode="majority")
             .get_all(r.args(site_ids_to_claim))
diff --git a/tests/test_brozzling.py b/tests/test_brozzling.py
index 0616e22..6216637 100755
--- a/tests/test_brozzling.py
+++ b/tests/test_brozzling.py
@@ -271,7 +271,7 @@ def test_proxy_down():
         chrome_exe = brozzler.suggest_default_chrome_exe()
 
         with brozzler.Browser(chrome_exe=chrome_exe) as browser:
-            browser.stop() # We're manually instantiating the browser without arguments,
+            browser.stop()  # We're manually instantiating the browser without arguments,
             # so it is running without a proxy. Stop it first.
             with pytest.raises(brozzler.ProxyError):
                 worker.brozzle_page(browser, site, page)
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 15f04f1..2e23ac1 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -76,8 +76,11 @@ def test_run_command(capsys, cmd):
     )
     out, err = proc.communicate()
     # Remove lines from syntax warning in imported library
-    filtered_lines = [line for line in err.decode("utf-8").splitlines() if "reppy" not in line and
-                    "re.compile" not in line]
+    filtered_lines = [
+        line
+        for line in err.decode("utf-8").splitlines()
+        if "reppy" not in line and "re.compile" not in line
+    ]
     assert filtered_lines == []
     assert out == ("brozzler %s - %s\n" % (brozzler.__version__, cmd)).encode("ascii")
 
diff --git a/tests/test_frontier.py b/tests/test_frontier.py
index bb3b69c..4af473c 100644
--- a/tests/test_frontier.py
+++ b/tests/test_frontier.py
@@ -20,6 +20,7 @@ limitations under the License.
 
 import argparse
 import datetime
+import itertools
 import logging
 import os
 import time
@@ -933,6 +934,7 @@ def test_max_claimed_sites(rethinker):
     rr.table("sites").delete().run()
 
     job_conf = {
+        "id": 1,
         "seeds": [
             {"url": "http://example.com/1"},
             {"url": "http://example.com/2"},
@@ -942,7 +944,7 @@ def test_max_claimed_sites(rethinker):
         ],
         "max_claimed_sites": 3,
     }
-
+    seeds_seen = []
     job = brozzler.new_job(frontier, job_conf)
 
     assert job.id
@@ -962,13 +964,88 @@ def test_max_claimed_sites(rethinker):
     rr.table("jobs").delete().run()
     rr.table("sites").delete().run()
 
-    job = brozzler.new_job(frontier, job_conf)
-    claimed_sites = frontier.claim_sites(2)
-    assert len(claimed_sites) == 2
-    claimed_sites = frontier.claim_sites(1)
-    assert len(claimed_sites) == 1
+
+def test_max_claimed_sites_cross_job(rethinker):
+    """Per-job max_claimed_sites is enforced when one claim spans two jobs."""
+    rr = rethinker
+    frontier = brozzler.RethinkDbFrontier(rr)
+
+    # clean slate
+    rr.table("jobs").delete().run()
+    rr.table("sites").delete().run()
+
+    job_conf_1 = {
+        "id": 1,
+        "seeds": [
+            {"url": "http://example.com/1"},
+            {"url": "http://example.com/2"},
+            {"url": "http://example.com/3"},
+            {"url": "http://example.com/4"},
+            {"url": "http://example.com/5"},
+        ],
+        "max_claimed_sites": 3,
+    }
+    job_conf_2 = {
+        "id": 2,
+        "seeds": [
+            {"url": "http://example.com/6"},
+            {"url": "http://example.com/7"},
+            {"url": "http://example.com/8"},
+            {"url": "http://example.com/9"},
+            {"url": "http://example.com/10"},
+        ],
+        "max_claimed_sites": 3,
+    }
+
+    seeds_seen = []
+    job_1 = brozzler.new_job(frontier, job_conf_1)
+    job_2 = brozzler.new_job(frontier, job_conf_2)
+
+    assert len(list(frontier.job_sites(job_1.id))) == 5
+    assert len(list(frontier.job_sites(job_2.id))) == 5
+
+    claimed_sites_1 = frontier.claim_sites(4)
+    assert len(claimed_sites_1) == 4
+
+    sites_per_job = {}
+    for site in claimed_sites_1:
+        sites_per_job[site["job_id"]] = sites_per_job.get(site["job_id"], 0) + 1
+
+    # 2 jobs, max of 3 each.
+    assert len(sites_per_job.keys()) == 2
+    assert sites_per_job[1] + sites_per_job[2] == 4
+    assert sites_per_job[1] <= 3 and sites_per_job[2] <= 3
+
+    # 6 sites left in queue, but only 2 are still claimable due to max
+    claimed_sites_2 = frontier.claim_sites(6)
+    assert len(claimed_sites_2) == 2
+
+    # disclaim sites
+    for site in itertools.chain(claimed_sites_1, claimed_sites_2):
+        frontier.disclaim_site(site)
+        seeds_seen.append(site["seed"])
+
+    # Only the 4 never-claimed sites qualify; the 6 just disclaimed are cooling down
+    claimed_sites_3 = frontier.claim_sites(6)
+    assert len(claimed_sites_3) == 4
+
     with pytest.raises(brozzler.NothingToClaim):
         claimed_sites = frontier.claim_sites(1)
+
+    for site in claimed_sites_3:
+        seeds_seen.append(site["seed"])
+
+    # ensure all sites have been claimed at this point
+    for seed in itertools.chain(job_conf_1["seeds"], job_conf_2["seeds"]):
+        assert seed["url"] in seeds_seen
+
+    # All unclaimed sites have been recently disclaimed and are not claimable
+    with pytest.raises(brozzler.NothingToClaim):
+        frontier.claim_sites(3)
+
+    # Disable reclaim cooldown. With 4 claimed, we should have 2 available
+    claimed_sites_4 = frontier.claim_sites(4, reclaim_cooldown=0)
+    assert len(claimed_sites_4) == 2
 
     # clean slate for the next one
     rr.table("jobs").delete().run()
diff --git a/tests/test_units.py b/tests/test_units.py
index 548aaef..a6ff9db 100644
--- a/tests/test_units.py
+++ b/tests/test_units.py
@@ -277,6 +277,7 @@ def test_ydl_proxy_down():
             with pytest.raises(brozzler.ProxyError):
                 brozzler.ydl.do_youtube_dl(worker, site, page)
 
+
 def test_proxy_down():
     """
     Test all fetching scenarios raise `brozzler.ProxyError` when proxy is down.

From cdb81496f693fc3a10d3d3842e367d6e9ea08a7f Mon Sep 17 00:00:00 2001
From: Adam Miller <adam@archive.org>
Date: Tue, 1 Apr 2025 14:16:42 -0700
Subject: [PATCH 4/7] chore: disable cluster tests, add frontier load test

---
 tests/test_cluster.py  | 85 ++++++++++++++++++++++++++++++------------
 tests/test_frontier.py | 32 ++++++++++++++++
 2 files changed, 93 insertions(+), 24 deletions(-)

diff --git a/tests/test_cluster.py b/tests/test_cluster.py
index b31a83b..4d4690f 100644
--- a/tests/test_cluster.py
+++ b/tests/test_cluster.py
@@ -68,6 +68,13 @@ def stop_service(service):
         time.sleep(0.5)
 
 
+@pytest.fixture(scope="module")
+def rethinker(request):
+    db = getattr(request, "param", "ignoreme")  # set via indirect parametrize
+    hosts = os.environ.get("BROZZLER_RETHINKDB_SERVERS", "localhost").split(",")
+    return doublethink.Rethinker(db=db, servers=hosts)
+
+
 @pytest.fixture(scope="module")
 def httpd(request):
     class RequestHandler(http.server.SimpleHTTPRequestHandler):
@@ -162,10 +169,11 @@ def test_httpd(httpd):
     assert payload1 == payload2
 
 
-def test_services_up():
+@pytest.mark.skip()
+def test_services_up(rethinker):
     """Check that the expected services are up and running."""
     # check that rethinkdb is listening and looks sane
-    rr = doublethink.Rethinker(db="rethinkdb")  # built-in db
+    rr = rethinker
     tbls = rr.table_list().run()
     assert len(tbls) > 10
 
@@ -185,9 +193,11 @@ def test_services_up():
         s.connect(("localhost", 8881))
 
 
-def test_brozzle_site(httpd):
+@pytest.mark.parametrize("rethinker", ["brozzler"], indirect=True)
+@pytest.mark.skip(reason="expects brozzler worker daemon running")
+def test_brozzle_site(httpd, rethinker):
     test_id = "test_brozzle_site-%s" % datetime.datetime.utcnow().isoformat()
-    rr = doublethink.Rethinker("localhost", db="brozzler")
+    rr = rethinker
     site = brozzler.Site(
         rr,
         {
@@ -262,6 +272,7 @@ def test_brozzle_site(httpd):
     assert response.headers["content-type"] == "image/jpeg"
 
 
+@pytest.mark.skip(reason="expects warcprox daemon running")
 def test_proxy_warcprox(httpd):
     """Test --proxy with proxy that happens to be warcprox"""
     try:
@@ -273,6 +284,7 @@ def test_proxy_warcprox(httpd):
         start_service("brozzler-worker")
 
 
+@pytest.mark.skip(reason="expects warcprox daemon running")
 def test_proxy_non_warcprox(httpd):
     """Test --proxy with proxy that happens not to be warcprox"""
 
@@ -331,6 +343,7 @@ def test_proxy_non_warcprox(httpd):
     th.join()
 
 
+@pytest.mark.skip()
 def test_no_proxy(httpd):
     try:
         stop_service("brozzler-worker")
@@ -340,6 +353,7 @@ def test_no_proxy(httpd):
     # XXX how to check that no proxy was used?
 
 
+@pytest.mark.skip()
 def test_warcprox_auto(httpd):
     """Test --warcprox-auto"""
     try:
@@ -349,6 +363,7 @@ def test_warcprox_auto(httpd):
         start_service("brozzler-worker")
 
 
+@pytest.mark.skip()
 def test_proxy_conflict():
     with pytest.raises(AssertionError):
         brozzler.worker.BrozzlerWorker(
@@ -356,7 +371,11 @@ def test_proxy_conflict():
         )
 
 
-def _test_proxy_setting(httpd, proxy=None, warcprox_auto=False, is_warcprox=False):
+@pytest.mark.skip()
+@pytest.mark.parametrize("rethinker", ["brozzler"], indirect=True)
+def _test_proxy_setting(
+    httpd, rethinker, proxy=None, warcprox_auto=False, is_warcprox=False
+):
     test_id = "test_proxy=%s_warcprox_auto=%s_is_warcprox=%s-%s" % (
         proxy,
         warcprox_auto,
@@ -369,7 +388,7 @@ def _test_proxy_setting(httpd, proxy=None, warcprox_auto=False, is_warcprox=Fals
     page2 = make_url(httpd, "/site1/file1.txt")
     robots = make_url(httpd, "/robots.txt")
 
-    rr = doublethink.Rethinker("localhost", db="brozzler")
+    rr = rethinker
     service_registry = doublethink.ServiceRegistry(rr)
     site = brozzler.Site(
         rr,
@@ -440,9 +459,11 @@ def _test_proxy_setting(httpd, proxy=None, warcprox_auto=False, is_warcprox=Fals
         assert captures_by_url == {}
 
 
-def test_obey_robots(httpd):
+@pytest.mark.parametrize("rethinker", ["brozzler"], indirect=True)
+@pytest.mark.skip(reason="expects brozzler worker daemon running")
+def test_obey_robots(httpd, rethinker):
     test_id = "test_obey_robots-%s" % datetime.datetime.utcnow().isoformat()
-    rr = doublethink.Rethinker("localhost", db="brozzler")
+    rr = rethinker
     site = brozzler.Site(
         rr,
         {
@@ -497,9 +518,11 @@ def test_obey_robots(httpd):
     assert requests.get(wb_url, allow_redirects=False).content == expected_payload
 
 
-def test_login(httpd):
+@pytest.mark.parametrize("rethinker", ["brozzler"], indirect=True)
+@pytest.mark.skip(reason="expects brozzler worker daemon running")
+def test_login(httpd, rethinker):
     test_id = "test_login-%s" % datetime.datetime.utcnow().isoformat()
-    rr = doublethink.Rethinker("localhost", db="brozzler")
+    rr = rethinker
     site = brozzler.Site(
         rr,
         {
@@ -550,9 +573,11 @@ def test_login(httpd):
     ) in meth_url
 
 
-def test_seed_redirect(httpd):
+@pytest.mark.parametrize("rethinker", ["brozzler"], indirect=True)
+@pytest.mark.skip(reason="expects brozzler worker daemon running")
+def test_seed_redirect(httpd, rethinker):
     test_id = "test_seed_redirect-%s" % datetime.datetime.utcnow().isoformat()
-    rr = doublethink.Rethinker("localhost", db="brozzler")
+    rr = rethinker
     seed_url = make_url(httpd, "/site5/redirect/")
     site = brozzler.Site(
         rr,
@@ -606,9 +631,11 @@ def test_seed_redirect(httpd):
     }
 
 
-def test_hashtags(httpd):
+@pytest.mark.parametrize("rethinker", ["brozzler"], indirect=True)
+@pytest.mark.skip(reason="expects brozzler worker daemon running")
+def test_hashtags(httpd, rethinker):
     test_id = "test_hashtags-%s" % datetime.datetime.utcnow().isoformat()
-    rr = doublethink.Rethinker("localhost", db="brozzler")
+    rr = rethinker
     seed_url = make_url(httpd, "/site7/")
     site = brozzler.Site(
         rr,
@@ -660,9 +687,11 @@ def test_hashtags(httpd):
     assert "thumbnail:%s" % make_url(httpd, "/site7/foo.html") in captures_by_url
 
 
-def test_redirect_hashtags(httpd):
+@pytest.mark.parametrize("rethinker", ["brozzler"], indirect=True)
+@pytest.mark.skip(reason="expects brozzler worker daemon running")
+def test_redirect_hashtags(httpd, rethinker):
     test_id = "test_hashtags-%s" % datetime.datetime.utcnow().isoformat()
-    rr = doublethink.Rethinker("localhost", db="brozzler")
+    rr = rethinker
     seed_url = make_url(httpd, "/site9/")
     site = brozzler.Site(
         rr,
@@ -727,8 +756,10 @@ def test_redirect_hashtags(httpd):
     # 14. WARCPROX_WRITE_RECORD thumbnail:http://localhost:41243/site9/redirect.html
 
 
-def test_stop_crawl(httpd):
-    rr = doublethink.Rethinker("localhost", db="brozzler")
+@pytest.mark.parametrize("rethinker", ["brozzler"], indirect=True)
+@pytest.mark.skip(reason="expects brozzler worker daemon running")
+def test_stop_crawl(httpd, rethinker):
+    rr = rethinker
     frontier = brozzler.RethinkDbFrontier(rr)
 
     # create a new job with three sites that could be crawled forever
@@ -787,7 +818,9 @@ def test_stop_crawl(httpd):
     assert sites[2].status == "FINISHED_STOP_REQUESTED"
 
 
-def test_warcprox_outage_resiliency(httpd):
+@pytest.mark.parametrize("rethinker", ["brozzler"], indirect=True)
+@pytest.mark.skip(reason="expects brozzler worker daemon running")
+def test_warcprox_outage_resiliency(httpd, rethinker):
     """
     Tests resiliency to warcprox outage.
 
@@ -799,7 +832,7 @@ def test_warcprox_outage_resiliency(httpd):
 
     If all instances of warcprox go down, brozzler-worker should sit and wait.
     """
-    rr = doublethink.Rethinker("localhost", db="brozzler")
+    rr = rethinker
     frontier = brozzler.RethinkDbFrontier(rr)
 
     # run two instances of warcprox
@@ -912,8 +945,10 @@ def test_warcprox_outage_resiliency(httpd):
         start_service("warcprox")
 
 
-def test_time_limit(httpd):
-    rr = doublethink.Rethinker("localhost", db="brozzler")
+@pytest.mark.parametrize("rethinker", ["brozzler"], indirect=True)
+@pytest.mark.skip(reason="expects brozzler worker daemon running")
+def test_time_limit(httpd, rethinker):
+    rr = rethinker
     frontier = brozzler.RethinkDbFrontier(rr)
 
     # create a new job with one seed that could be crawled forever
@@ -940,9 +975,11 @@ def test_time_limit(httpd):
     assert job.status == "FINISHED"
 
 
-def test_ydl_stitching(httpd):
+@pytest.mark.parametrize("rethinker", ["brozzler"], indirect=True)
+@pytest.mark.skip(reason="expects brozzler worker daemon running")
+def test_ydl_stitching(httpd, rethinker):
     test_id = "test_ydl_stitching-%s" % datetime.datetime.utcnow().isoformat()
-    rr = doublethink.Rethinker("localhost", db="brozzler")
+    rr = rethinker
     frontier = brozzler.RethinkDbFrontier(rr)
     site = brozzler.Site(
         rr,
diff --git a/tests/test_frontier.py b/tests/test_frontier.py
index 4af473c..a25d11b 100644
--- a/tests/test_frontier.py
+++ b/tests/test_frontier.py
@@ -1052,6 +1052,38 @@ def test_max_claimed_sites_cross_job(rethinker):
     rr.table("sites").delete().run()
 
 
+def test_max_claimed_sites_load_perf(rethinker):
+    """Claiming 50 of 19,000 queued sites (19 jobs x 1000 seeds) takes under 2s."""
+    frontier = brozzler.RethinkDbFrontier(rethinker)
+
+    # clean slate
+    rethinker.table("jobs").delete().run()
+    rethinker.table("sites").delete().run()
+
+    seeds_per_job = 1000
+    job_conf = {"id": 1, "seeds": [], "max_claimed_sites": 25}
+    for job_id in range(1, 20):
+        # reuse one conf dict across jobs, swapping in this job's id and seeds
+        job_conf["id"] = job_id
+        job_conf["seeds"].clear()
+        job_conf["seeds"].extend(
+            {"url": "http://example.com/{}".format(n)} for n in range(seeds_per_job)
+        )
+
+        assert len(job_conf["seeds"]) == seeds_per_job
+        brozzler.new_job(frontier, job_conf)
+        assert len(list(frontier.job_sites(job_id))) == seeds_per_job
+
+    started = time.perf_counter()
+    claimed_sites = frontier.claim_sites(50)
+    elapsed = time.perf_counter() - started
+    assert elapsed < 2
+    assert len(claimed_sites) == 50
+    # leave both tables empty for whichever test runs next
+    rethinker.table("jobs").delete().run()
+    rethinker.table("sites").delete().run()
+
+
 def test_choose_warcprox(rethinker):
     rr = rethinker
     svcreg = doublethink.ServiceRegistry(rr)

From f0d527cda76add5cd43c3d5d4530966cd5cefac0 Mon Sep 17 00:00:00 2001
From: Adam Miller <adam@archive.org>
Date: Tue, 15 Apr 2025 13:40:37 -0700
Subject: [PATCH 5/7] chore: merge logged proxy info into existing log call

---
 brozzler/chrome.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/brozzler/chrome.py b/brozzler/chrome.py
index 61a6a6d..876eebf 100644
--- a/brozzler/chrome.py
+++ b/brozzler/chrome.py
@@ -222,9 +222,8 @@ class Chrome:
             chrome_args.append("--ignore-certificate-errors")
         if proxy:
             chrome_args.append("--proxy-server=%s" % proxy)
-        self.logger.info(f"Chrome launched with args {chrome_args} proxy is {proxy}")
         chrome_args.append("about:blank")
-        self.logger.info("running", chrome_args=subprocess.list2cmdline(chrome_args))
+        self.logger.info("running", chrome_args=subprocess.list2cmdline(chrome_args), proxy=proxy)
         # start_new_session - new process group so we can kill the whole group
         self.chrome_process = subprocess.Popen(
             chrome_args,

From 0f57188a2cef254658e8acb324a4b1d315fcefbf Mon Sep 17 00:00:00 2001
From: Adam Miller <adam@archive.org>
Date: Tue, 15 Apr 2025 14:03:15 -0700
Subject: [PATCH 6/7] refactor: short circuit claimable sites loop when we have
 enough sites

---
 brozzler/frontier.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/brozzler/frontier.py b/brozzler/frontier.py
index b199c1c..2c872d8 100644
--- a/brozzler/frontier.py
+++ b/brozzler/frontier.py
@@ -65,6 +65,9 @@ def filter_claimable_site_ids(
         if is_claimable:
             claimable_sites.append(site)
 
+        if len(claimable_sites) >= max_sites_to_claim:
+            break
+
     site_ids_to_claim = []
     # gather sites that are under the max without going over
     for site in claimable_sites:

From d36313f08fd1986c8fc637fde5a61b1039c3c3ef Mon Sep 17 00:00:00 2001
From: Adam Miller <adam@archive.org>
Date: Tue, 15 Apr 2025 14:05:54 -0700
Subject: [PATCH 7/7] chore: ruff format pass

---
 brozzler/chrome.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/brozzler/chrome.py b/brozzler/chrome.py
index 876eebf..aa771b6 100644
--- a/brozzler/chrome.py
+++ b/brozzler/chrome.py
@@ -223,7 +223,9 @@ class Chrome:
         if proxy:
             chrome_args.append("--proxy-server=%s" % proxy)
         chrome_args.append("about:blank")
-        self.logger.info("running", chrome_args=subprocess.list2cmdline(chrome_args), proxy=proxy)
+        self.logger.info(
+            "running", chrome_args=subprocess.list2cmdline(chrome_args), proxy=proxy
+        )
         # start_new_session - new process group so we can kill the whole group
         self.chrome_process = subprocess.Popen(
             chrome_args,