simplify job conf
parent 353a74546a
commit 4a73b1e8d8
4 changed files with 20 additions and 28 deletions
@@ -98,11 +98,6 @@ seeds:
       video_capture:
         type: string

-      partition_id:
-        type:
-          - string
-          - integer
-
       <<: *multi_level_options

 max_claimed_sites:
@@ -110,8 +105,3 @@ max_claimed_sites:
 pdfs_only:
   type: boolean

-partition_id:
-  type:
-    - string
-    - integer
-
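For orientation, the sketch below shows what a minimal job configuration might look like after this change, written as the Python dict that new_job(frontier, job_conf) (modified in the hunks below) receives once a YAML job conf is parsed. This is a hypothetical example, not part of the commit; the field names come from the schema fragments and test expectations in this diff, and it assumes partition_id is no longer an accepted key.

# Hypothetical minimal job conf after this commit, expressed as the
# parsed-YAML dict that new_job(frontier, job_conf) consumes. Keys shown
# here appear in the schema/test fragments above; partition_id is assumed
# to be gone. The placement of video_capture under a seed is illustrative.
job_conf = {
    "id": "example-job",
    "max_claimed_sites": 2,
    "pdfs_only": False,
    "seeds": [
        {
            "url": "https://example.com/",
            "video_capture": "ENABLE_VIDEO_CAPTURE",
        },
    ],
}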
@@ -104,8 +104,6 @@ def new_job(frontier, job_conf):
         job.max_claimed_sites = job_conf["max_claimed_sites"]
     if "pdfs_only" in job_conf:
         job.pdfs_only = job_conf["pdfs_only"]
-    if "partition_id" in job_conf:
-        job.partition_id = job_conf["partition_id"]
     job.save()

     sites = []
@@ -117,7 +115,6 @@ def new_job(frontier, job_conf):
         merged_conf["seed"] = merged_conf.pop("url")
         site = brozzler.Site(frontier.rr, merged_conf)
         site.id = str(uuid.uuid4())
-        site.partition_id = job.partition_id
         sites.append(site)
         pages.append(new_seed_page(frontier, site))

@@ -205,8 +202,6 @@ class Job(doublethink.Document, ElapsedMixIn):
     def populate_defaults(self):
         if "status" not in self:
             self.status = "ACTIVE"
-        if "partition_id" not in self:
-            self.partition_id = None
         if "pdfs_only" not in self:
             self.pdfs_only = False
         if "starts_and_stops" not in self:
@@ -268,8 +263,6 @@ class Site(doublethink.Document, ElapsedMixIn):
             self.scope = {}
         if "video_capture" not in self:
             self.video_capture = VideoCaptureOptions.ENABLE_VIDEO_CAPTURE.value
-        if "partition_id" not in self:
-            self.partition_id = None

         # backward compatibility
         if "surt" in self.scope:
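The populate_defaults edits above all follow the same fill-missing-keys pattern: a default is applied only when the key is absent, so values already stored in rethinkdb are never overwritten. A minimal standalone sketch of that pattern follows (an illustration only, not the actual doublethink.Document, Job, or Site classes):

# Simplified illustration of the populate_defaults pattern used by Job and
# Site above. DocSketch is a stand-in class, not brozzler or doublethink code.
class DocSketch(dict):
    def populate_defaults(self):
        # only fill keys that are missing; never clobber loaded values
        if "status" not in self:
            self["status"] = "ACTIVE"
        if "pdfs_only" not in self:
            self["pdfs_only"] = False


doc = DocSketch({"status": "FINISHED"})
doc.populate_defaults()
assert doc["status"] == "FINISHED"  # existing value preserved
assert doc["pdfs_only"] is False    # missing key gets its default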
@@ -108,11 +108,19 @@ class VideoDataClient:
         return None

     def get_recent_video_capture(self, site=None, containing_page_url=None) -> List:
-        partition_id = site["partition_id"] if site["partition_id"] else None
+        # using ait_account_id as postgres partition id
+        partition_id = (
+            site["metadata"]["ait_account_id"]
+            if site["metadata"]["ait_account_id"]
+            else None
+        )
+        seed_id = (
+            site["metadata"]["ait_seed_id"] if site["metadata"]["ait_seed_id"] else None
+        )

         # TODO: generalize, make variable?
         # containing_page_timestamp_pattern = "2025%" # for future pre-dup additions

         if partition_id and seed_id and containing_page_url:
             # check for postgres query for most recent record
             pg_query = (
@@ -125,20 +133,24 @@ class VideoDataClient:
                 logger.warn("postgres query failed: %s", e)
                 results = []
         else:
-            logger.warn("missing partition_id, seed_id, or containing_page_url")
+            logger.warn(
+                "missing partition_id/account_id, seed_id, or containing_page_url"
+            )
             results = []

         return results

     def get_video_captures(self, site=None, source=None) -> List[str]:
-        partition_id = site["partition_id"] if site["partition_id"] else None
+        # using ait_account_id as postgres partition id
+        partition_id = (
+            site["metadata"]["ait_account_id"]
+            if site["metadata"]["ait_account_id"]
+            else None
+        )
         seed_id = (
             site["metadata"]["ait_seed_id"] if site["metadata"]["ait_seed_id"] else None
         )

         # TODO: generalize, maybe make variable?
         # containing_page_timestamp_pattern = "2025%" # for future pre-dup additions

         if source == "youtube":
             containing_page_url_pattern = "http://youtube.com/watch%" # yes, video data canonicalization uses "http"
             # support other media sources here
@@ -160,7 +172,7 @@ class VideoDataClient:
                 logger.warn("postgres query failed: %s", e)
                 results = []
         else:
-            logger.warn("missing partition_id, seed_id, or source")
+            logger.warn("missing partition_id/account_id, seed_id, or source")
             results = []

         return results
@@ -619,7 +631,7 @@ def do_youtube_dl(worker, site, page, ytdlp_proxy_endpoints):
     if worker._video_data:
         logger.info(
             "checking for previously captured youtube watch pages for account %s, seed_id %s",
-            site["partition_id"],
+            site["metadata"]["ait_account_id"],
             site["metadata"]["ait_seed_id"],
         )
         try:
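The recurring expressions in the two VideoDataClient methods above reduce to a "use the value if truthy, else None" lookup on the site's AIT metadata, with ait_account_id standing in for the postgres partition id. Below is a small standalone sketch of that lookup (a hypothetical helper for illustration; it assumes, as the diffed code does, that site["metadata"] always carries ait_account_id and ait_seed_id):

# Hypothetical helper mirroring the expressions above: ait_account_id is used
# as the postgres partition id, and empty/zero metadata values fall back to
# None. Illustrative only; the real code inlines these expressions.
def postgres_ids(site):
    metadata = site["metadata"]
    partition_id = metadata["ait_account_id"] if metadata["ait_account_id"] else None
    seed_id = metadata["ait_seed_id"] if metadata["ait_seed_id"] else None
    return partition_id, seed_id


partition_id, seed_id = postgres_ids(
    {"metadata": {"ait_account_id": 1234, "ait_seed_id": 0}}
)
assert partition_id == 1234
assert seed_id is None  # falsy metadata values are normalized to None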
@@ -70,7 +70,6 @@ def test_basics(rethinker):
         "status": "ACTIVE",
         "pdfs_only": False,
         "starts_and_stops": [{"start": job.starts_and_stops[0]["start"], "stop": None}],
-        "partition_id": None,
     }

     sites = sorted(list(frontier.job_sites(job.id)), key=lambda x: x.seed)
@@ -90,7 +89,6 @@ def test_basics(rethinker):
         ],
         "status": "ACTIVE",
         "video_capture": "ENABLE_VIDEO_CAPTURE",
-        "partition_id": None,
     }
     assert sites[1] == {
         "claimed": False,
@@ -108,7 +106,6 @@ def test_basics(rethinker):
         ],
         "status": "ACTIVE",
         "video_capture": "ENABLE_VIDEO_CAPTURE",
-        "partition_id": None,
     }

     pages = list(frontier.site_pages(sites[0].id))