simplify job conf
parent 353a74546a
commit 4a73b1e8d8
4 changed files with 20 additions and 28 deletions
@@ -98,11 +98,6 @@ seeds:
       video_capture:
         type: string

-      partition_id:
-        type:
-          - string
-          - integer
-
       <<: *multi_level_options

 max_claimed_sites:
@@ -110,8 +105,3 @@ max_claimed_sites:
 pdfs_only:
   type: boolean

-partition_id:
-  type:
-    - string
-    - integer
-
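For orientation, the sketch below shows what a minimal job configuration might look like after this change, written as the Python dict that new_job(frontier, job_conf) (modified in the hunks below) receives once a YAML job conf is parsed. This is a hypothetical example, not part of the commit; the field names come from the schema fragments and test expectations in this diff, and it assumes partition_id is no longer an accepted key.

# Hypothetical minimal job conf after this commit, expressed as the
# parsed-YAML dict that new_job(frontier, job_conf) consumes. Keys shown
# here appear in the schema/test fragments above; partition_id is assumed
# to be gone. The placement of video_capture under a seed is illustrative.
job_conf = {
    "id": "example-job",
    "max_claimed_sites": 2,
    "pdfs_only": False,
    "seeds": [
        {
            "url": "https://example.com/",
            "video_capture": "ENABLE_VIDEO_CAPTURE",
        },
    ],
}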
@@ -104,8 +104,6 @@ def new_job(frontier, job_conf):
         job.max_claimed_sites = job_conf["max_claimed_sites"]
     if "pdfs_only" in job_conf:
         job.pdfs_only = job_conf["pdfs_only"]
-    if "partition_id" in job_conf:
-        job.partition_id = job_conf["partition_id"]
     job.save()

     sites = []
@@ -117,7 +115,6 @@ def new_job(frontier, job_conf):
         merged_conf["seed"] = merged_conf.pop("url")
         site = brozzler.Site(frontier.rr, merged_conf)
         site.id = str(uuid.uuid4())
-        site.partition_id = job.partition_id
         sites.append(site)
         pages.append(new_seed_page(frontier, site))

@@ -205,8 +202,6 @@ class Job(doublethink.Document, ElapsedMixIn):
     def populate_defaults(self):
         if "status" not in self:
             self.status = "ACTIVE"
-        if "partition_id" not in self:
-            self.partition_id = None
         if "pdfs_only" not in self:
             self.pdfs_only = False
         if "starts_and_stops" not in self:
@@ -268,8 +263,6 @@ class Site(doublethink.Document, ElapsedMixIn):
             self.scope = {}
         if "video_capture" not in self:
             self.video_capture = VideoCaptureOptions.ENABLE_VIDEO_CAPTURE.value
-        if "partition_id" not in self:
-            self.partition_id = None

         # backward compatibility
         if "surt" in self.scope:
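The populate_defaults edits above all follow the same fill-missing-keys pattern: a default is applied only when the key is absent, so values already stored in rethinkdb are never overwritten. A minimal standalone sketch of that pattern follows (an illustration only, not the actual doublethink.Document, Job, or Site classes):

# Simplified illustration of the populate_defaults pattern used by Job and
# Site above. DocSketch is a stand-in class, not brozzler or doublethink code.
class DocSketch(dict):
    def populate_defaults(self):
        # only fill keys that are missing; never clobber loaded values
        if "status" not in self:
            self["status"] = "ACTIVE"
        if "pdfs_only" not in self:
            self["pdfs_only"] = False


doc = DocSketch({"status": "FINISHED"})
doc.populate_defaults()
assert doc["status"] == "FINISHED"  # existing value preserved
assert doc["pdfs_only"] is False    # missing key gets its default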
@@ -108,11 +108,19 @@ class VideoDataClient:
         return None

     def get_recent_video_capture(self, site=None, containing_page_url=None) -> List:
-        partition_id = site["partition_id"] if site["partition_id"] else None
+        # using ait_account_id as postgres partition id
+        partition_id = (
+            site["metadata"]["ait_account_id"]
+            if site["metadata"]["ait_account_id"]
+            else None
+        )
+        seed_id = (
+            site["metadata"]["ait_seed_id"] if site["metadata"]["ait_seed_id"] else None
+        )

         # TODO: generalize, make variable?
         # containing_page_timestamp_pattern = "2025%" # for future pre-dup additions

         if partition_id and seed_id and containing_page_url:
             # check for postgres query for most recent record
             pg_query = (
@@ -125,20 +133,24 @@ class VideoDataClient:
                 logger.warn("postgres query failed: %s", e)
                 results = []
         else:
-            logger.warn("missing partition_id, seed_id, or containing_page_url")
+            logger.warn(
+                "missing partition_id/account_id, seed_id, or containing_page_url"
+            )
             results = []

         return results

     def get_video_captures(self, site=None, source=None) -> List[str]:
-        partition_id = site["partition_id"] if site["partition_id"] else None
+        # using ait_account_id as postgres partition id
+        partition_id = (
+            site["metadata"]["ait_account_id"]
+            if site["metadata"]["ait_account_id"]
+            else None
+        )
         seed_id = (
             site["metadata"]["ait_seed_id"] if site["metadata"]["ait_seed_id"] else None
         )

         # TODO: generalize, maybe make variable?
         # containing_page_timestamp_pattern = "2025%" # for future pre-dup additions

         if source == "youtube":
             containing_page_url_pattern = "http://youtube.com/watch%" # yes, video data canonicalization uses "http"
             # support other media sources here
@@ -160,7 +172,7 @@ class VideoDataClient:
                 logger.warn("postgres query failed: %s", e)
                 results = []
         else:
-            logger.warn("missing partition_id, seed_id, or source")
+            logger.warn("missing partition_id/account_id, seed_id, or source")
             results = []

         return results
@@ -619,7 +631,7 @@ def do_youtube_dl(worker, site, page, ytdlp_proxy_endpoints):
     if worker._video_data:
         logger.info(
             "checking for previously captured youtube watch pages for account %s, seed_id %s",
-            site["partition_id"],
+            site["metadata"]["ait_account_id"],
             site["metadata"]["ait_seed_id"],
         )
         try:
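The recurring expressions in the two VideoDataClient methods above reduce to a "use the value if truthy, else None" lookup on the site's AIT metadata, with ait_account_id standing in for the postgres partition id. Below is a small standalone sketch of that lookup (a hypothetical helper for illustration; it assumes, as the diffed code does, that site["metadata"] always carries ait_account_id and ait_seed_id):

# Hypothetical helper mirroring the expressions above: ait_account_id is used
# as the postgres partition id, and empty/zero metadata values fall back to
# None. Illustrative only; the real code inlines these expressions.
def postgres_ids(site):
    metadata = site["metadata"]
    partition_id = metadata["ait_account_id"] if metadata["ait_account_id"] else None
    seed_id = metadata["ait_seed_id"] if metadata["ait_seed_id"] else None
    return partition_id, seed_id


partition_id, seed_id = postgres_ids(
    {"metadata": {"ait_account_id": 1234, "ait_seed_id": 0}}
)
assert partition_id == 1234
assert seed_id is None  # falsy metadata values are normalized to None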
@@ -70,7 +70,6 @@ def test_basics(rethinker):
         "status": "ACTIVE",
         "pdfs_only": False,
         "starts_and_stops": [{"start": job.starts_and_stops[0]["start"], "stop": None}],
-        "partition_id": None,
     }

     sites = sorted(list(frontier.job_sites(job.id)), key=lambda x: x.seed)
@@ -90,7 +89,6 @@ def test_basics(rethinker):
         ],
         "status": "ACTIVE",
         "video_capture": "ENABLE_VIDEO_CAPTURE",
-        "partition_id": None,
     }
     assert sites[1] == {
         "claimed": False,
@@ -108,7 +106,6 @@ def test_basics(rethinker):
         ],
         "status": "ACTIVE",
         "video_capture": "ENABLE_VIDEO_CAPTURE",
-        "partition_id": None,
     }

     pages = list(frontier.site_pages(sites[0].id))