From f979bbf15412b90b442c443470e6e13b582b5e47 Mon Sep 17 00:00:00 2001
From: Barbara Miller <barbara@archive.org>
Date: Thu, 17 Jul 2025 19:09:58 -0700
Subject: [PATCH] even more updates post-QA-deploy

---
 brozzler/ydl.py | 65 +++++++++++++++++++++++++++++++++----------------
 1 file changed, 44 insertions(+), 21 deletions(-)

diff --git a/brozzler/ydl.py b/brozzler/ydl.py
index 4a0e88b..6f457c4 100644
--- a/brozzler/ydl.py
+++ b/brozzler/ydl.py
@@ -108,8 +108,10 @@ class VideoDataClient:
         return None
 
     def get_recent_video_capture(self, site=None, containing_page_url=None) -> List:
-        account_id = site.account_id if site.account_id else None
-        seed_id = site.metadata.ait_seed_id if site.metadata.ait_seed_id else None
+        account_id = site["account_id"] if site["account_id"] else None
+        seed_id = (
+            site["metadata"]["ait_seed_id"] if site["metadata"]["ait_seed_id"] else None
+        )
 
         if account_id and seed_id and containing_page_url:
             # check for postgres query for most recent record
@@ -129,8 +131,10 @@ class VideoDataClient:
         return results
 
     def get_video_captures(self, site=None, source=None) -> List[str]:
-        account_id = site.account_id if site.account_id else None
-        seed_id = site.metadata.ait_seed_id if site.metadata.ait_seed_id else None
+        account_id = site["account_id"] if site["account_id"] else None
+        seed_id = (
+            site["metadata"]["ait_seed_id"] if site["metadata"]["ait_seed_id"] else None
+        )
 
         # TODO: generalize, maybe make variable?
         # containing_page_timestamp_pattern = "2025%"  # for future pre-dup additions
@@ -465,7 +469,7 @@ def _remember_videos(page, pushed_videos=None):
         video_record = worker._video_data.VideoCaptureRecord()
         video_record.crawl_job_id = site.job_id
         video_record.is_test_crawl = True if warc_prefix_items[2] == "TEST" else False
-        video_record.seed_id = site.ait_seed_id
+        video_record.seed_id = site["metadata"]["ait_seed_id"]
         video_record.collection_id = int(warc_prefix_items[1])
         video_record.containing_page_timestamp = None
         video_record.containing_page_digest = None
@@ -606,23 +610,42 @@ def do_youtube_dl(worker, site, page, ytdlp_proxy_endpoints):
             or ie_result.get("extractor") == "youtube:tab"
         ):
             if worker._video_data:
-                captured_youtube_watch_pages = set()
-                captured_youtube_watch_pages.add(
-                    worker._video_data.get_video_captures(site, source="youtube")
+                logger.info(
+                    "checking for previously captured youtube watch pages for account %s, seed_id %s",
+                    site["account_id"],
+                    site["metadata"]["ait_seed_id"],
                 )
-                uncaptured_youtube_watch_pages = []
-                for e in ie_result.get("entries_no_dl", []):
-                    # note: http needed for match
-                    youtube_watch_url = str(
-                        urlcanon.aggressive(f"http://www.youtube.com/watch?v={e['id']}")
+                uncaptured_youtube_watch_pages = []
+                try:
+                    captured_youtube_watch_pages = set(
+                        worker._video_data.get_video_captures(site, source="youtube")
                     )
-                    if youtube_watch_url in captured_youtube_watch_pages:
-                        logger.info("skipping adding %s to outlinks", youtube_watch_url)
-                        continue
-                    uncaptured_youtube_watch_pages.append(
-                        f"https://www.youtube.com/watch?v={e['id']}"
-                    )
-                if uncaptured_youtube_watch_pages:
-                    outlinks.add(uncaptured_youtube_watch_pages)
+                    # collect uncaptured watch pages; added to outlinks after the try/except
+                    for e in ie_result.get("entries_no_dl", []):
+                        # note: http needed for match
+                        youtube_watch_url = str(
+                            urlcanon.aggressive(
+                                f"http://www.youtube.com/watch?v={e['id']}"
+                            )
+                        )
+                        if youtube_watch_url in captured_youtube_watch_pages:
+                            logger.info(
+                                "skipping adding %s to yt-dlp outlinks",
+                                youtube_watch_url,
+                            )
+                            continue
+                        uncaptured_youtube_watch_pages.append(
+                            f"https://www.youtube.com/watch?v={e['id']}"
+                        )
+                except Exception as exc:
+                    logger.warning("hit exception processing worker._video_data: %s", exc)
+                if uncaptured_youtube_watch_pages:
+                    outlinks.update(uncaptured_youtube_watch_pages)
+            else:
+                outlinks = {
+                    "https://www.youtube.com/watch?v=%s" % e["id"]
+                    for e in ie_result.get("entries_no_dl", [])
+                }
+
         # todo: handle outlinks for instagram and soundcloud, other media source, here (if anywhere)
         return outlinks