more updates post-QA-deploy

This commit is contained in:
Barbara Miller 2025-07-16 16:30:05 -07:00
parent 3d377e7943
commit 3adae49be7

View file

@ -114,11 +114,11 @@ class VideoDataClient:
if account_id and seed_id and containing_page_url: if account_id and seed_id and containing_page_url:
# check for postgres query for most recent record # check for postgres query for most recent record
pg_query = ( pg_query = (
"SELECT * from video where account_id = %s and seed_id = %s and containing_page_url = %s LIMIT 1", "SELECT * from video where account_id = %s and seed_id = %s and containing_page_url = %s ORDER BY video_timestamp LIMIT 1",
(account_id, seed_id, str(urlcanon.aggressive(containing_page_url))), (account_id, seed_id, str(urlcanon.aggressive(containing_page_url))),
) )
try: try:
results = self._execute_query(pg_query, fetchall=True) results = self._execute_pg_query(pg_query, fetchall=True)
except Exception as e: except Exception as e:
logger.warn("postgres query failed: %s", e) logger.warn("postgres query failed: %s", e)
results = [] results = []
@ -149,7 +149,7 @@ class VideoDataClient:
), ),
) )
try: try:
results = self._execute_query( results = self._execute_pg_query(
pg_query, row_factory=psycopg.rows.scalar_row, fetchall=True pg_query, row_factory=psycopg.rows.scalar_row, fetchall=True
) )
except Exception as e: except Exception as e:
@ -162,16 +162,15 @@ class VideoDataClient:
return results return results
def create_video_capture_record(self, video_capture_record): def create_video_capture_record(self, video_capture_record):
# note: brozzler postcrawl step 72 includes info from crawl-log — watch for differences! # NOTE: we want to do this in brozzler postcrawl for now
# WIP
# TODO: needs added fields added to postgres table, refinement # TODO: needs added fields added to postgres table, refinement
pg_query = ( pg_query = (
f"INSERT INTO video ({VideoCaptureRecord - items}) VALUES (%s, %s, ...)", f"INSERT INTO video ({VideoCaptureRecord - items}) VALUES (%s, %s, ...)",
VideoCaptureRecord - values, VideoCaptureRecord - values,
) )
try: try:
results = self._execute_query( results = self._execute_pg_query(pg_query)
pg_query, row_factory=psycopg.rows.scalar_row, fetchall=True
)
except Exception as e: except Exception as e:
logger.warn("postgres query failed: %s", e) logger.warn("postgres query failed: %s", e)
results = [] results = []
@ -444,9 +443,10 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
return ydl return ydl
def _remember_videos(page, site, worker, ydl, ie_result, pushed_videos=None): # new maybe? def _remember_videos(page, site, worker, ydl, ie_result, pushed_videos=None):
def _remember_videos(page, pushed_videos=None):
""" """
Saves info about videos captured by yt-dlp in `page.videos` and postgres. Saves info about videos captured by yt-dlp in `page.videos`
""" """
if "videos" not in page: if "videos" not in page:
page.videos = [] page.videos = []
@ -480,13 +480,11 @@ def _remember_videos(page, site, worker, ydl, ie_result, pushed_videos=None):
urlcanon.aggressive(ydl.url) urlcanon.aggressive(ydl.url)
) # probably? ) # probably?
video_record.video_url = pushed_video["url"] video_record.video_url = pushed_video["url"]
# note: ie_result may not be correct when multiple videos present # note: NEW! ie_result may not be correct when multiple videos present
video_record.video_title = ie_result.get("title") video_record.video_title = ie_result.get("title")
video_record.video_display_id = ie_result.get("display_id") video_record.video_display_id = ie_result.get("display_id")
video_record.video_resolution = ie_result.get("resolution") video_record.video_resolution = ie_result.get("resolution")
video_record.video_capture_status = None # "recrawl" maybe video_record.video_capture_status = None # "recrawl" maybe
worker._video_data.save_video_capture_record(video_record)
""" """
logger.debug("embedded video", video=video) logger.debug("embedded video", video=video)
page.videos.append(video) page.videos.append(video)
@ -563,7 +561,7 @@ def _try_youtube_dl(worker, ydl, site, page):
logger.info("ytdlp completed successfully") logger.info("ytdlp completed successfully")
info_json = json.dumps(ie_result, sort_keys=True, indent=4) info_json = json.dumps(ie_result, sort_keys=True, indent=4)
_remember_videos(page, info_json, ydl.pushed_videos) _remember_videos(page, ydl.pushed_videos)
if worker._using_warcprox(site): if worker._using_warcprox(site):
logger.info( logger.info(
"sending WARCPROX_WRITE_RECORD request to warcprox with yt-dlp json", "sending WARCPROX_WRITE_RECORD request to warcprox with yt-dlp json",
@ -610,9 +608,7 @@ def do_youtube_dl(worker, site, page, ytdlp_proxy_endpoints):
if worker._video_data: if worker._video_data:
captured_youtube_watch_pages = set() captured_youtube_watch_pages = set()
captured_youtube_watch_pages.add( captured_youtube_watch_pages.add(
worker._video_data.get_video_captures_by_source( worker._video_data.get_video_captures(site, source="youtube")
site, source="youtube"
)
) )
uncaptured_youtube_watch_pages = [] uncaptured_youtube_watch_pages = []
for e in ie_result.get("entries_no_dl", []): for e in ie_result.get("entries_no_dl", []):