partition_id, not account_id

Barbara Miller 2025-07-26 15:47:16 -07:00
parent a93bf13252
commit d2ec6de49e
4 changed files with 24 additions and 23 deletions

View file

@@ -98,7 +98,7 @@ seeds:
       video_capture:
         type: string
-      account_id:
+      partition_id:
         type:
           - string
           - integer
@@ -111,7 +111,7 @@ max_claimed_sites:
 pdfs_only:
   type: boolean
-account_id:
+partition_id:
   type:
     - string
     - integer
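
With the schema change above, a job conf carries partition_id where account_id used to go, both at the top level and per seed. A minimal, hypothetical example (values are illustrative, shown as the Python dict a YAML job conf would load into):

# hypothetical job conf after yaml.safe_load(); field names follow the
# schema hunks above, values are made up for illustration
job_conf = {
    "pdfs_only": False,
    "partition_id": 1234,  # the schema accepts either a string or an integer
    "seeds": [
        {"url": "https://example.com/"},
        {"url": "https://example.org/", "partition_id": "team-a"},  # per-seed form
    ],
}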

View file

@@ -104,8 +104,8 @@ def new_job(frontier, job_conf):
         job.max_claimed_sites = job_conf["max_claimed_sites"]
     if "pdfs_only" in job_conf:
         job.pdfs_only = job_conf["pdfs_only"]
-    if "account_id" in job_conf:
-        job.account_id = job_conf["account_id"]
+    if "partition_id" in job_conf:
+        job.partition_id = job_conf["partition_id"]
     job.save()

     sites = []
@@ -117,7 +117,7 @@ def new_job(frontier, job_conf):
         merged_conf["seed"] = merged_conf.pop("url")
         site = brozzler.Site(frontier.rr, merged_conf)
         site.id = str(uuid.uuid4())
-        site.account_id = job.account_id
+        site.partition_id = job.partition_id
         sites.append(site)
         pages.append(new_seed_page(frontier, site))
@@ -205,8 +205,8 @@ class Job(doublethink.Document, ElapsedMixIn):
     def populate_defaults(self):
         if "status" not in self:
             self.status = "ACTIVE"
-        if "account_id" not in self:
-            self.account_id = None
+        if "partition_id" not in self:
+            self.partition_id = None
         if "pdfs_only" not in self:
             self.pdfs_only = False
         if "starts_and_stops" not in self:
@@ -268,8 +268,8 @@ class Site(doublethink.Document, ElapsedMixIn):
             self.scope = {}
         if "video_capture" not in self:
             self.video_capture = VideoCaptureOptions.ENABLE_VIDEO_CAPTURE.value
-        if "account_id" not in self:
-            self.account_id = None
+        if "partition_id" not in self:
+            self.partition_id = None
         # backward compatibility
         if "surt" in self.scope:

View file

@@ -108,16 +108,16 @@ class VideoDataClient:
             return None

     def get_recent_video_capture(self, site=None, containing_page_url=None) -> List:
-        account_id = site["account_id"] if site["account_id"] else None
+        partition_id = site["partition_id"] if site["partition_id"] else None
         seed_id = (
             site["metadata"]["ait_seed_id"] if site["metadata"]["ait_seed_id"] else None
         )
-        if account_id and seed_id and containing_page_url:
+        if partition_id and seed_id and containing_page_url:
             # check for postgres query for most recent record
             pg_query = (
-                "SELECT * from video where account_id = %s and seed_id = %s and containing_page_url = %s ORDER BY video_timestamp LIMIT 1",
-                (account_id, seed_id, str(urlcanon.aggressive(containing_page_url))),
+                "SELECT * from video where partition_id = %s and seed_id = %s and containing_page_url = %s ORDER BY video_timestamp LIMIT 1",
+                (partition_id, seed_id, str(urlcanon.aggressive(containing_page_url))),
             )
             try:
                 results = self._execute_pg_query(pg_query, fetchall=True)
@@ -125,13 +125,13 @@ class VideoDataClient:
                 logger.warn("postgres query failed: %s", e)
                 results = []
         else:
-            logger.warn("missing account_id, seed_id, or containing_page_url")
+            logger.warn("missing partition_id, seed_id, or containing_page_url")
             results = []
         return results

     def get_video_captures(self, site=None, source=None) -> List[str]:
-        account_id = site["account_id"] if site["account_id"] else None
+        partition_id = site["partition_id"] if site["partition_id"] else None
         seed_id = (
             site["metadata"]["ait_seed_id"] if site["metadata"]["ait_seed_id"] else None
         )
@@ -143,11 +143,11 @@ class VideoDataClient:
         containing_page_url_pattern = "http://youtube.com/watch%"  # yes, video data canonicalization uses "http"
         # support other media sources here
-        if account_id and seed_id and source:
+        if partition_id and seed_id and source:
             pg_query = (
-                "SELECT containing_page_url from video where account_id = %s and seed_id = %s and containing_page_url like %s",
+                "SELECT containing_page_url from video where partition_id = %s and seed_id = %s and containing_page_url like %s",
                 (
-                    account_id,
+                    partition_id,
                     seed_id,
                     containing_page_url_pattern,
                 ),
@@ -160,7 +160,7 @@ class VideoDataClient:
                 logger.warn("postgres query failed: %s", e)
                 results = []
         else:
-            logger.warn("missing account_id, seed_id, or source")
+            logger.warn("missing partition_id, seed_id, or source")
             results = []
         return results
@@ -619,7 +619,7 @@ def do_youtube_dl(worker, site, page, ytdlp_proxy_endpoints):
     if worker._video_data:
         logger.info(
             "checking for previously captured youtube watch pages for account %s, seed_id %s",
-            site["account_id"],
+            site["partition_id"],
             site["metadata"]["ait_seed_id"],
         )
         try:
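
The renamed column flows through the same (sql, params) tuple shape: pg_query pairs a statement containing %s placeholders with the values, so partition_id is bound as a query parameter rather than formatted into the string. _execute_pg_query() itself is outside this diff; a hedged psycopg2 sketch of how such a tuple could be executed:

# hypothetical executor for a (sql, params) pair like pg_query above;
# the real _execute_pg_query() may differ, this only shows parameter binding
import psycopg2

def execute_pg_query(dsn, pg_query, fetchall=False):
    sql, params = pg_query
    with psycopg2.connect(dsn) as conn:
        with conn.cursor() as cur:
            cur.execute(sql, params)  # %s placeholders filled by the driver
            return cur.fetchall() if fetchall else cur.fetchone()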

View file

@@ -70,7 +70,7 @@ def test_basics(rethinker):
         "status": "ACTIVE",
         "pdfs_only": False,
         "starts_and_stops": [{"start": job.starts_and_stops[0]["start"], "stop": None}],
-        "account_id": None,
+        "partition_id": None,
     }

     sites = sorted(list(frontier.job_sites(job.id)), key=lambda x: x.seed)
@@ -89,8 +89,8 @@ def test_basics(rethinker):
             {"start": sites[0].starts_and_stops[0]["start"], "stop": None}
         ],
         "status": "ACTIVE",
-        "account_id": None,
         "video_capture": "ENABLE_VIDEO_CAPTURE",
+        "partition_id": None,
     }
     assert sites[1] == {
         "claimed": False,
@@ -107,8 +107,9 @@ def test_basics(rethinker):
             },
         ],
         "status": "ACTIVE",
-        "account_id": None,
+        "partition_id": None,
         "video_capture": "ENABLE_VIDEO_CAPTURE",
+        "partition_id": None,
     }

     pages = list(frontier.site_pages(sites[0].id))