mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-08-07 05:52:27 -04:00
partition_id, not account_id
This commit is contained in:
parent
a93bf13252
commit
d2ec6de49e
4 changed files with 24 additions and 23 deletions
|
@ -98,7 +98,7 @@ seeds:
|
||||||
video_capture:
|
video_capture:
|
||||||
type: string
|
type: string
|
||||||
|
|
||||||
account_id:
|
partition_id:
|
||||||
type:
|
type:
|
||||||
- string
|
- string
|
||||||
- integer
|
- integer
|
||||||
|
@ -111,7 +111,7 @@ max_claimed_sites:
|
||||||
pdfs_only:
|
pdfs_only:
|
||||||
type: boolean
|
type: boolean
|
||||||
|
|
||||||
account_id:
|
partition_id:
|
||||||
type:
|
type:
|
||||||
- string
|
- string
|
||||||
- integer
|
- integer
|
||||||
|
|
|
@ -104,8 +104,8 @@ def new_job(frontier, job_conf):
|
||||||
job.max_claimed_sites = job_conf["max_claimed_sites"]
|
job.max_claimed_sites = job_conf["max_claimed_sites"]
|
||||||
if "pdfs_only" in job_conf:
|
if "pdfs_only" in job_conf:
|
||||||
job.pdfs_only = job_conf["pdfs_only"]
|
job.pdfs_only = job_conf["pdfs_only"]
|
||||||
if "account_id" in job_conf:
|
if "partition_id" in job_conf:
|
||||||
job.account_id = job_conf["account_id"]
|
job.partition_id = job_conf["partition_id"]
|
||||||
job.save()
|
job.save()
|
||||||
|
|
||||||
sites = []
|
sites = []
|
||||||
|
@ -117,7 +117,7 @@ def new_job(frontier, job_conf):
|
||||||
merged_conf["seed"] = merged_conf.pop("url")
|
merged_conf["seed"] = merged_conf.pop("url")
|
||||||
site = brozzler.Site(frontier.rr, merged_conf)
|
site = brozzler.Site(frontier.rr, merged_conf)
|
||||||
site.id = str(uuid.uuid4())
|
site.id = str(uuid.uuid4())
|
||||||
site.account_id = job.account_id
|
site.partition_id = job.partition_id
|
||||||
sites.append(site)
|
sites.append(site)
|
||||||
pages.append(new_seed_page(frontier, site))
|
pages.append(new_seed_page(frontier, site))
|
||||||
|
|
||||||
|
@ -205,8 +205,8 @@ class Job(doublethink.Document, ElapsedMixIn):
|
||||||
def populate_defaults(self):
|
def populate_defaults(self):
|
||||||
if "status" not in self:
|
if "status" not in self:
|
||||||
self.status = "ACTIVE"
|
self.status = "ACTIVE"
|
||||||
if "account_id" not in self:
|
if "partition_id" not in self:
|
||||||
self.account_id = None
|
self.partition_id = None
|
||||||
if "pdfs_only" not in self:
|
if "pdfs_only" not in self:
|
||||||
self.pdfs_only = False
|
self.pdfs_only = False
|
||||||
if "starts_and_stops" not in self:
|
if "starts_and_stops" not in self:
|
||||||
|
@ -268,8 +268,8 @@ class Site(doublethink.Document, ElapsedMixIn):
|
||||||
self.scope = {}
|
self.scope = {}
|
||||||
if "video_capture" not in self:
|
if "video_capture" not in self:
|
||||||
self.video_capture = VideoCaptureOptions.ENABLE_VIDEO_CAPTURE.value
|
self.video_capture = VideoCaptureOptions.ENABLE_VIDEO_CAPTURE.value
|
||||||
if "account_id" not in self:
|
if "partition_id" not in self:
|
||||||
self.account_id = None
|
self.partition_id = None
|
||||||
|
|
||||||
# backward compatibility
|
# backward compatibility
|
||||||
if "surt" in self.scope:
|
if "surt" in self.scope:
|
||||||
|
|
|
@ -108,16 +108,16 @@ class VideoDataClient:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def get_recent_video_capture(self, site=None, containing_page_url=None) -> List:
|
def get_recent_video_capture(self, site=None, containing_page_url=None) -> List:
|
||||||
account_id = site["account_id"] if site["account_id"] else None
|
partition_id = site["partition_id"] if site["partition_id"] else None
|
||||||
seed_id = (
|
seed_id = (
|
||||||
site["metadata"]["ait_seed_id"] if site["metadata"]["ait_seed_id"] else None
|
site["metadata"]["ait_seed_id"] if site["metadata"]["ait_seed_id"] else None
|
||||||
)
|
)
|
||||||
|
|
||||||
if account_id and seed_id and containing_page_url:
|
if partition_id and seed_id and containing_page_url:
|
||||||
# check for postgres query for most recent record
|
# check for postgres query for most recent record
|
||||||
pg_query = (
|
pg_query = (
|
||||||
"SELECT * from video where account_id = %s and seed_id = %s and containing_page_url = %s ORDER BY video_timestamp LIMIT 1",
|
"SELECT * from video where partition_id = %s and seed_id = %s and containing_page_url = %s ORDER BY video_timestamp LIMIT 1",
|
||||||
(account_id, seed_id, str(urlcanon.aggressive(containing_page_url))),
|
(partition_id, seed_id, str(urlcanon.aggressive(containing_page_url))),
|
||||||
)
|
)
|
||||||
try:
|
try:
|
||||||
results = self._execute_pg_query(pg_query, fetchall=True)
|
results = self._execute_pg_query(pg_query, fetchall=True)
|
||||||
|
@ -125,13 +125,13 @@ class VideoDataClient:
|
||||||
logger.warn("postgres query failed: %s", e)
|
logger.warn("postgres query failed: %s", e)
|
||||||
results = []
|
results = []
|
||||||
else:
|
else:
|
||||||
logger.warn("missing account_id, seed_id, or containing_page_url")
|
logger.warn("missing partition_id, seed_id, or containing_page_url")
|
||||||
results = []
|
results = []
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
def get_video_captures(self, site=None, source=None) -> List[str]:
|
def get_video_captures(self, site=None, source=None) -> List[str]:
|
||||||
account_id = site["account_id"] if site["account_id"] else None
|
partition_id = site["partition_id"] if site["partition_id"] else None
|
||||||
seed_id = (
|
seed_id = (
|
||||||
site["metadata"]["ait_seed_id"] if site["metadata"]["ait_seed_id"] else None
|
site["metadata"]["ait_seed_id"] if site["metadata"]["ait_seed_id"] else None
|
||||||
)
|
)
|
||||||
|
@ -143,11 +143,11 @@ class VideoDataClient:
|
||||||
containing_page_url_pattern = "http://youtube.com/watch%" # yes, video data canonicalization uses "http"
|
containing_page_url_pattern = "http://youtube.com/watch%" # yes, video data canonicalization uses "http"
|
||||||
# support other media sources here
|
# support other media sources here
|
||||||
|
|
||||||
if account_id and seed_id and source:
|
if partition_id and seed_id and source:
|
||||||
pg_query = (
|
pg_query = (
|
||||||
"SELECT containing_page_url from video where account_id = %s and seed_id = %s and containing_page_url like %s",
|
"SELECT containing_page_url from video where partition_id = %s and seed_id = %s and containing_page_url like %s",
|
||||||
(
|
(
|
||||||
account_id,
|
partition_id,
|
||||||
seed_id,
|
seed_id,
|
||||||
containing_page_url_pattern,
|
containing_page_url_pattern,
|
||||||
),
|
),
|
||||||
|
@ -160,7 +160,7 @@ class VideoDataClient:
|
||||||
logger.warn("postgres query failed: %s", e)
|
logger.warn("postgres query failed: %s", e)
|
||||||
results = []
|
results = []
|
||||||
else:
|
else:
|
||||||
logger.warn("missing account_id, seed_id, or source")
|
logger.warn("missing partition_id, seed_id, or source")
|
||||||
results = []
|
results = []
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
@ -619,7 +619,7 @@ def do_youtube_dl(worker, site, page, ytdlp_proxy_endpoints):
|
||||||
if worker._video_data:
|
if worker._video_data:
|
||||||
logger.info(
|
logger.info(
|
||||||
"checking for previously captured youtube watch pages for account %s, seed_id %s",
|
"checking for previously captured youtube watch pages for account %s, seed_id %s",
|
||||||
site["account_id"],
|
site["partition_id"],
|
||||||
site["metadata"]["ait_seed_id"],
|
site["metadata"]["ait_seed_id"],
|
||||||
)
|
)
|
||||||
try:
|
try:
|
||||||
|
|
|
@ -70,7 +70,7 @@ def test_basics(rethinker):
|
||||||
"status": "ACTIVE",
|
"status": "ACTIVE",
|
||||||
"pdfs_only": False,
|
"pdfs_only": False,
|
||||||
"starts_and_stops": [{"start": job.starts_and_stops[0]["start"], "stop": None}],
|
"starts_and_stops": [{"start": job.starts_and_stops[0]["start"], "stop": None}],
|
||||||
"account_id": None,
|
"partition_id": None,
|
||||||
}
|
}
|
||||||
|
|
||||||
sites = sorted(list(frontier.job_sites(job.id)), key=lambda x: x.seed)
|
sites = sorted(list(frontier.job_sites(job.id)), key=lambda x: x.seed)
|
||||||
|
@ -89,8 +89,8 @@ def test_basics(rethinker):
|
||||||
{"start": sites[0].starts_and_stops[0]["start"], "stop": None}
|
{"start": sites[0].starts_and_stops[0]["start"], "stop": None}
|
||||||
],
|
],
|
||||||
"status": "ACTIVE",
|
"status": "ACTIVE",
|
||||||
"account_id": None,
|
|
||||||
"video_capture": "ENABLE_VIDEO_CAPTURE",
|
"video_capture": "ENABLE_VIDEO_CAPTURE",
|
||||||
|
"partition_id": None,
|
||||||
}
|
}
|
||||||
assert sites[1] == {
|
assert sites[1] == {
|
||||||
"claimed": False,
|
"claimed": False,
|
||||||
|
@ -107,8 +107,9 @@ def test_basics(rethinker):
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
"status": "ACTIVE",
|
"status": "ACTIVE",
|
||||||
"account_id": None,
|
"partition_id": None,
|
||||||
"video_capture": "ENABLE_VIDEO_CAPTURE",
|
"video_capture": "ENABLE_VIDEO_CAPTURE",
|
||||||
|
"partition_id": None,
|
||||||
}
|
}
|
||||||
|
|
||||||
pages = list(frontier.site_pages(sites[0].id))
|
pages = list(frontier.site_pages(sites[0].id))
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue