diff --git a/brozzler/job_schema.yaml b/brozzler/job_schema.yaml index 6cc4b04..76e9cc7 100644 --- a/brozzler/job_schema.yaml +++ b/brozzler/job_schema.yaml @@ -98,7 +98,7 @@ seeds: video_capture: type: string - account_id: + partition_id: type: - string - integer @@ -111,7 +111,7 @@ max_claimed_sites: pdfs_only: type: boolean -account_id: +partition_id: type: - string - integer diff --git a/brozzler/model.py b/brozzler/model.py index 624c7df..b696a5f 100644 --- a/brozzler/model.py +++ b/brozzler/model.py @@ -104,8 +104,8 @@ def new_job(frontier, job_conf): job.max_claimed_sites = job_conf["max_claimed_sites"] if "pdfs_only" in job_conf: job.pdfs_only = job_conf["pdfs_only"] - if "account_id" in job_conf: - job.account_id = job_conf["account_id"] + if "partition_id" in job_conf: + job.partition_id = job_conf["partition_id"] job.save() sites = [] @@ -117,7 +117,7 @@ def new_job(frontier, job_conf): merged_conf["seed"] = merged_conf.pop("url") site = brozzler.Site(frontier.rr, merged_conf) site.id = str(uuid.uuid4()) - site.account_id = job.account_id + site.partition_id = job.partition_id sites.append(site) pages.append(new_seed_page(frontier, site)) @@ -205,8 +205,8 @@ class Job(doublethink.Document, ElapsedMixIn): def populate_defaults(self): if "status" not in self: self.status = "ACTIVE" - if "account_id" not in self: - self.account_id = None + if "partition_id" not in self: + self.partition_id = None if "pdfs_only" not in self: self.pdfs_only = False if "starts_and_stops" not in self: @@ -268,8 +268,8 @@ class Site(doublethink.Document, ElapsedMixIn): self.scope = {} if "video_capture" not in self: self.video_capture = VideoCaptureOptions.ENABLE_VIDEO_CAPTURE.value - if "account_id" not in self: - self.account_id = None + if "partition_id" not in self: + self.partition_id = None # backward compatibility if "surt" in self.scope: diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 7ec1310..972fb52 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -108,16 +108,16 @@ class VideoDataClient: return None def get_recent_video_capture(self, site=None, containing_page_url=None) -> List: - account_id = site["account_id"] if site["account_id"] else None + partition_id = site["partition_id"] if site["partition_id"] else None seed_id = ( site["metadata"]["ait_seed_id"] if site["metadata"]["ait_seed_id"] else None ) - if account_id and seed_id and containing_page_url: + if partition_id and seed_id and containing_page_url: # check for postgres query for most recent record pg_query = ( - "SELECT * from video where account_id = %s and seed_id = %s and containing_page_url = %s ORDER BY video_timestamp LIMIT 1", - (account_id, seed_id, str(urlcanon.aggressive(containing_page_url))), + "SELECT * from video where partition_id = %s and seed_id = %s and containing_page_url = %s ORDER BY video_timestamp LIMIT 1", + (partition_id, seed_id, str(urlcanon.aggressive(containing_page_url))), ) try: results = self._execute_pg_query(pg_query, fetchall=True) @@ -125,13 +125,13 @@ class VideoDataClient: logger.warn("postgres query failed: %s", e) results = [] else: - logger.warn("missing account_id, seed_id, or containing_page_url") + logger.warn("missing partition_id, seed_id, or containing_page_url") results = [] return results def get_video_captures(self, site=None, source=None) -> List[str]: - account_id = site["account_id"] if site["account_id"] else None + partition_id = site["partition_id"] if site["partition_id"] else None seed_id = ( site["metadata"]["ait_seed_id"] if site["metadata"]["ait_seed_id"] else None ) @@ -143,11 +143,11 @@ class VideoDataClient: containing_page_url_pattern = "http://youtube.com/watch%" # yes, video data canonicalization uses "http" # support other media sources here - if account_id and seed_id and source: + if partition_id and seed_id and source: pg_query = ( - "SELECT containing_page_url from video where account_id = %s and seed_id = %s and containing_page_url like %s", + "SELECT containing_page_url from video where partition_id = %s and seed_id = %s and containing_page_url like %s", ( - account_id, + partition_id, seed_id, containing_page_url_pattern, ), @@ -160,7 +160,7 @@ class VideoDataClient: logger.warn("postgres query failed: %s", e) results = [] else: - logger.warn("missing account_id, seed_id, or source") + logger.warn("missing partition_id, seed_id, or source") results = [] return results @@ -619,7 +619,7 @@ def do_youtube_dl(worker, site, page, ytdlp_proxy_endpoints): if worker._video_data: logger.info( "checking for previously captured youtube watch pages for account %s, seed_id %s", - site["account_id"], + site["partition_id"], site["metadata"]["ait_seed_id"], ) try: diff --git a/tests/test_frontier.py b/tests/test_frontier.py index 3836e37..e14e823 100644 --- a/tests/test_frontier.py +++ b/tests/test_frontier.py @@ -70,7 +70,7 @@ def test_basics(rethinker): "status": "ACTIVE", "pdfs_only": False, "starts_and_stops": [{"start": job.starts_and_stops[0]["start"], "stop": None}], - "account_id": None, + "partition_id": None, } sites = sorted(list(frontier.job_sites(job.id)), key=lambda x: x.seed) @@ -89,8 +89,8 @@ def test_basics(rethinker): {"start": sites[0].starts_and_stops[0]["start"], "stop": None} ], "status": "ACTIVE", - "account_id": None, "video_capture": "ENABLE_VIDEO_CAPTURE", + "partition_id": None, } assert sites[1] == { "claimed": False, @@ -107,8 +107,9 @@ def test_basics(rethinker): }, ], "status": "ACTIVE", - "account_id": None, + "partition_id": None, "video_capture": "ENABLE_VIDEO_CAPTURE", + "partition_id": None, } pages = list(frontier.site_pages(sites[0].id))