From 4a73b1e8d8542274006cd700964e77bee2c035c6 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Sun, 27 Jul 2025 08:05:48 -0700 Subject: [PATCH] simplify job conf --- brozzler/job_schema.yaml | 10 ---------- brozzler/model.py | 7 ------- brozzler/ydl.py | 28 ++++++++++++++++++++-------- tests/test_frontier.py | 3 --- 4 files changed, 20 insertions(+), 28 deletions(-) diff --git a/brozzler/job_schema.yaml b/brozzler/job_schema.yaml index 76e9cc7..59b831f 100644 --- a/brozzler/job_schema.yaml +++ b/brozzler/job_schema.yaml @@ -98,11 +98,6 @@ seeds: video_capture: type: string - partition_id: - type: - - string - - integer - <<: *multi_level_options max_claimed_sites: @@ -110,8 +105,3 @@ max_claimed_sites: pdfs_only: type: boolean - -partition_id: - type: - - string - - integer diff --git a/brozzler/model.py b/brozzler/model.py index b696a5f..28979bb 100644 --- a/brozzler/model.py +++ b/brozzler/model.py @@ -104,8 +104,6 @@ def new_job(frontier, job_conf): job.max_claimed_sites = job_conf["max_claimed_sites"] if "pdfs_only" in job_conf: job.pdfs_only = job_conf["pdfs_only"] - if "partition_id" in job_conf: - job.partition_id = job_conf["partition_id"] job.save() sites = [] @@ -117,7 +115,6 @@ def new_job(frontier, job_conf): merged_conf["seed"] = merged_conf.pop("url") site = brozzler.Site(frontier.rr, merged_conf) site.id = str(uuid.uuid4()) - site.partition_id = job.partition_id sites.append(site) pages.append(new_seed_page(frontier, site)) @@ -205,8 +202,6 @@ class Job(doublethink.Document, ElapsedMixIn): def populate_defaults(self): if "status" not in self: self.status = "ACTIVE" - if "partition_id" not in self: - self.partition_id = None if "pdfs_only" not in self: self.pdfs_only = False if "starts_and_stops" not in self: @@ -268,8 +263,6 @@ class Site(doublethink.Document, ElapsedMixIn): self.scope = {} if "video_capture" not in self: self.video_capture = VideoCaptureOptions.ENABLE_VIDEO_CAPTURE.value - if "partition_id" not in self: - self.partition_id = None # backward compatibility if "surt" in self.scope: diff --git a/brozzler/ydl.py b/brozzler/ydl.py index ccf5f7a..0220c4c 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -108,11 +108,19 @@ class VideoDataClient: return None def get_recent_video_capture(self, site=None, containing_page_url=None) -> List: - partition_id = site["partition_id"] if site["partition_id"] else None + # using ait_account_id as postgres partition id + partition_id = ( + site["metadata"]["ait_account_id"] + if site["metadata"]["ait_account_id"] + else None + ) seed_id = ( site["metadata"]["ait_seed_id"] if site["metadata"]["ait_seed_id"] else None ) + # TODO: generalize, make variable? + # containing_page_timestamp_pattern = "2025%" # for future pre-dup additions + if partition_id and seed_id and containing_page_url: # check for postgres query for most recent record pg_query = ( @@ -125,20 +133,24 @@ class VideoDataClient: logger.warn("postgres query failed: %s", e) results = [] else: - logger.warn("missing partition_id, seed_id, or containing_page_url") + logger.warn( + "missing partition_id/account_id, seed_id, or containing_page_url" + ) results = [] return results def get_video_captures(self, site=None, source=None) -> List[str]: - partition_id = site["partition_id"] if site["partition_id"] else None + # using ait_account_id as postgres partition id + partition_id = ( + site["metadata"]["ait_account_id"] + if site["metadata"]["ait_account_id"] + else None + ) seed_id = ( site["metadata"]["ait_seed_id"] if site["metadata"]["ait_seed_id"] else None ) - # TODO: generalize, maybe make variable? - # containing_page_timestamp_pattern = "2025%" # for future pre-dup additions - if source == "youtube": containing_page_url_pattern = "http://youtube.com/watch%" # yes, video data canonicalization uses "http" # support other media sources here @@ -160,7 +172,7 @@ class VideoDataClient: logger.warn("postgres query failed: %s", e) results = [] else: - logger.warn("missing partition_id, seed_id, or source") + logger.warn("missing partition_id/account_id, seed_id, or source") results = [] return results @@ -619,7 +631,7 @@ def do_youtube_dl(worker, site, page, ytdlp_proxy_endpoints): if worker._video_data: logger.info( "checking for previously captured youtube watch pages for account %s, seed_id %s", - site["partition_id"], + site["metadata"]["ait_account_id"], site["metadata"]["ait_seed_id"], ) try: diff --git a/tests/test_frontier.py b/tests/test_frontier.py index 2d9525b..da6f5bf 100644 --- a/tests/test_frontier.py +++ b/tests/test_frontier.py @@ -70,7 +70,6 @@ def test_basics(rethinker): "status": "ACTIVE", "pdfs_only": False, "starts_and_stops": [{"start": job.starts_and_stops[0]["start"], "stop": None}], - "partition_id": None, } sites = sorted(list(frontier.job_sites(job.id)), key=lambda x: x.seed) @@ -90,7 +89,6 @@ def test_basics(rethinker): ], "status": "ACTIVE", "video_capture": "ENABLE_VIDEO_CAPTURE", - "partition_id": None, } assert sites[1] == { "claimed": False, @@ -108,7 +106,6 @@ def test_basics(rethinker): ], "status": "ACTIVE", "video_capture": "ENABLE_VIDEO_CAPTURE", - "partition_id": None, } pages = list(frontier.site_pages(sites[0].id))