partition_id, not account_id

2025-12-01 04:37:09 -05:00 · 2025-07-26 15:47:16 -07:00 · 2025-07-26 15:47:16 -07:00 · d2ec6de49e
commit d2ec6de49e
parent a93bf13252
4 changed files with 24 additions and 23 deletions
--- a/brozzler/job_schema.yaml
+++ b/brozzler/job_schema.yaml
@ -98,7 +98,7 @@ seeds:
      video_capture:
        type: string

-      account_id:
+      partition_id:
        type:
        - string
        - integer
@ -111,7 +111,7 @@ max_claimed_sites:
 pdfs_only:
  type: boolean

-account_id:
+partition_id:
  type:
  - string
  - integer
--- a/brozzler/model.py
+++ b/brozzler/model.py
@ -104,8 +104,8 @@ def new_job(frontier, job_conf):
        job.max_claimed_sites = job_conf["max_claimed_sites"]
    if "pdfs_only" in job_conf:
        job.pdfs_only = job_conf["pdfs_only"]
-    if "account_id" in job_conf:
-        job.account_id = job_conf["account_id"]
+    if "partition_id" in job_conf:
+        job.partition_id = job_conf["partition_id"]
    job.save()

    sites = []
@ -117,7 +117,7 @@ def new_job(frontier, job_conf):
        merged_conf["seed"] = merged_conf.pop("url")
        site = brozzler.Site(frontier.rr, merged_conf)
        site.id = str(uuid.uuid4())
-        site.account_id = job.account_id
+        site.partition_id = job.partition_id
        sites.append(site)
        pages.append(new_seed_page(frontier, site))

@ -205,8 +205,8 @@ class Job(doublethink.Document, ElapsedMixIn):
    def populate_defaults(self):
        if "status" not in self:
            self.status = "ACTIVE"
-        if "account_id" not in self:
-            self.account_id = None
+        if "partition_id" not in self:
+            self.partition_id = None
        if "pdfs_only" not in self:
            self.pdfs_only = False
        if "starts_and_stops" not in self:
@ -268,8 +268,8 @@ class Site(doublethink.Document, ElapsedMixIn):
            self.scope = {}
        if "video_capture" not in self:
            self.video_capture = VideoCaptureOptions.ENABLE_VIDEO_CAPTURE.value
-        if "account_id" not in self:
-            self.account_id = None
+        if "partition_id" not in self:
+            self.partition_id = None

        # backward compatibility
        if "surt" in self.scope:
--- a/brozzler/ydl.py
+++ b/brozzler/ydl.py
@ -108,16 +108,16 @@ class VideoDataClient:
        return None

    def get_recent_video_capture(self, site=None, containing_page_url=None) -> List:
-        account_id = site["account_id"] if site["account_id"] else None
+        partition_id = site["partition_id"] if site["partition_id"] else None
        seed_id = (
            site["metadata"]["ait_seed_id"] if site["metadata"]["ait_seed_id"] else None
        )

-        if account_id and seed_id and containing_page_url:
+        if partition_id and seed_id and containing_page_url:
            # check for postgres query for most recent record
            pg_query = (
-                "SELECT * from video where account_id = %s and seed_id = %s and containing_page_url = %s ORDER BY video_timestamp LIMIT 1",
-                (account_id, seed_id, str(urlcanon.aggressive(containing_page_url))),
+                "SELECT * from video where partition_id = %s and seed_id = %s and containing_page_url = %s ORDER BY video_timestamp LIMIT 1",
+                (partition_id, seed_id, str(urlcanon.aggressive(containing_page_url))),
            )
            try:
                results = self._execute_pg_query(pg_query, fetchall=True)
@ -125,13 +125,13 @@ class VideoDataClient:
                logger.warn("postgres query failed: %s", e)
                results = []
        else:
-            logger.warn("missing account_id, seed_id, or containing_page_url")
+            logger.warn("missing partition_id, seed_id, or containing_page_url")
            results = []

        return results

    def get_video_captures(self, site=None, source=None) -> List[str]:
-        account_id = site["account_id"] if site["account_id"] else None
+        partition_id = site["partition_id"] if site["partition_id"] else None
        seed_id = (
            site["metadata"]["ait_seed_id"] if site["metadata"]["ait_seed_id"] else None
        )
@ -143,11 +143,11 @@ class VideoDataClient:
            containing_page_url_pattern = "http://youtube.com/watch%"  # yes, video data canonicalization uses "http"
        # support other media sources here

-        if account_id and seed_id and source:
+        if partition_id and seed_id and source:
            pg_query = (
-                "SELECT containing_page_url from video where account_id = %s and seed_id = %s and containing_page_url like %s",
+                "SELECT containing_page_url from video where partition_id = %s and seed_id = %s and containing_page_url like %s",
                (
-                    account_id,
+                    partition_id,
                    seed_id,
                    containing_page_url_pattern,
                ),
@ -160,7 +160,7 @@ class VideoDataClient:
                logger.warn("postgres query failed: %s", e)
                results = []
        else:
-            logger.warn("missing account_id, seed_id, or source")
+            logger.warn("missing partition_id, seed_id, or source")
            results = []

        return results
@ -619,7 +619,7 @@ def do_youtube_dl(worker, site, page, ytdlp_proxy_endpoints):
            if worker._video_data:
                logger.info(
                    "checking for previously captured youtube watch pages for account %s, seed_id %s",
-                    site["account_id"],
+                    site["partition_id"],
                    site["metadata"]["ait_seed_id"],
                )
                try:
--- a/tests/test_frontier.py
+++ b/tests/test_frontier.py
@ -70,7 +70,7 @@ def test_basics(rethinker):
        "status": "ACTIVE",
        "pdfs_only": False,
        "starts_and_stops": [{"start": job.starts_and_stops[0]["start"], "stop": None}],
-        "account_id": None,
+        "partition_id": None,
    }

    sites = sorted(list(frontier.job_sites(job.id)), key=lambda x: x.seed)
@ -89,8 +89,8 @@ def test_basics(rethinker):
            {"start": sites[0].starts_and_stops[0]["start"], "stop": None}
        ],
        "status": "ACTIVE",
-        "account_id": None,
        "video_capture": "ENABLE_VIDEO_CAPTURE",
+        "partition_id": None,
    }
    assert sites[1] == {
        "claimed": False,
@ -107,8 +107,9 @@ def test_basics(rethinker):
            },
        ],
        "status": "ACTIVE",
-        "account_id": None,
+        "partition_id": None,
        "video_capture": "ENABLE_VIDEO_CAPTURE",
+        "partition_id": None,
    }

    pages = list(frontier.site_pages(sites[0].id))