even more updates post-QA-deploy

2025-09-21 13:14:49 -04:00 · 2025-07-17 19:09:58 -07:00 · 2025-07-17 19:09:58 -07:00 · f979bbf154
commit f979bbf154
parent 3adae49be7
1 changed file with 44 additions and 21 deletions
--- a/brozzler/ydl.py
+++ b/brozzler/ydl.py
@@ -108,8 +108,10 @@ class VideoDataClient:
        return None
    def get_recent_video_capture(self, site=None, containing_page_url=None) -> List:
-        account_id = site.account_id if site.account_id else None
+        account_id = site["account_id"] if site["account_id"] else None
-        seed_id = site.metadata.ait_seed_id if site.metadata.ait_seed_id else None
+        seed_id = (
            site["metadata"]["ait_seed_id"] if site["metadata"]["ait_seed_id"] else None
        )
        if account_id and seed_id and containing_page_url:
            # check for postgres query for most recent record
@@ -129,8 +131,10 @@ class VideoDataClient:
        return results
    def get_video_captures(self, site=None, source=None) -> List[str]:
-        account_id = site.account_id if site.account_id else None
+        account_id = site["account_id"] if site["account_id"] else None
-        seed_id = site.metadata.ait_seed_id if site.metadata.ait_seed_id else None
+        seed_id = (
            site["metadata"]["ait_seed_id"] if site["metadata"]["ait_seed_id"] else None
        )
        # TODO: generalize, maybe make variable?
        # containing_page_timestamp_pattern = "2025%"  # for future pre-dup additions
@@ -465,7 +469,7 @@ def _remember_videos(page, pushed_videos=None):
        video_record = worker._video_data.VideoCaptureRecord()
        video_record.crawl_job_id = site.job_id
        video_record.is_test_crawl = True if warc_prefix_items[2] == "TEST" else False
-        video_record.seed_id = site.ait_seed_id
+        video_record.seed_id = site["metadata"]["ait_seed_id"]
        video_record.collection_id = int(warc_prefix_items[1])
        video_record.containing_page_timestamp = None
        video_record.containing_page_digest = None
@@ -606,23 +610,42 @@ def do_youtube_dl(worker, site, page, ytdlp_proxy_endpoints):
            or ie_result.get("extractor") == "youtube:tab"
        ):
            if worker._video_data:
-                captured_youtube_watch_pages = set()
+                logger.info(
-                captured_youtube_watch_pages.add(
+                    "checking for previously captured youtube watch pages for account %s, seed_id %s",
-                    worker._video_data.get_video_captures(site, source="youtube")
+                    site["account_id"],
                    site["metadata"]["ait_seed_id"],
                )
-                uncaptured_youtube_watch_pages = []
+                try:
-                for e in ie_result.get("entries_no_dl", []):
+                    captured_youtube_watch_pages = set()
-                    # note: http needed for match
+                    captured_youtube_watch_pages.update(
-                    youtube_watch_url = str(
+                        worker._video_data.get_video_captures(site, source="youtube")
                        urlcanon.aggressive(f"http://www.youtube.com/watch?v={e['id']}")
                    )
-                    if youtube_watch_url in captured_youtube_watch_pages:
+                    uncaptured_youtube_watch_pages = []
-                        logger.info("skipping adding %s to outlinks", youtube_watch_url)
+                    for e in ie_result.get("entries_no_dl", []):
-                        continue
+                        # note: http needed for match
-                    uncaptured_youtube_watch_pages.append(
+                        youtube_watch_url = str(
-                        f"https://www.youtube.com/watch?v={e['id']}"
+                            urlcanon.aggressive(
-                    )
+                                f"http://www.youtube.com/watch?v={e['id']}"
-                if uncaptured_youtube_watch_pages:
+                            )
-                    outlinks.add(uncaptured_youtube_watch_pages)
+                        )
                        if youtube_watch_url in captured_youtube_watch_pages:
                            logger.info(
                                "skipping adding %s to yt-dlp outlinks",
                                youtube_watch_url,
                            )
                            continue
                        uncaptured_youtube_watch_pages.append(
                            f"https://www.youtube.com/watch?v={e['id']}"
                        )
                except Exception as e:
                    logger.warning("hit exception processing worker._video_data: %s", e)
                    if uncaptured_youtube_watch_pages:
                        outlinks.update(uncaptured_youtube_watch_pages)
            else:
                outlinks = {
                    "https://www.youtube.com/watch?v=%s" % e["id"]
                    for e in ie_result.get("entries_no_dl", [])
                }
        # todo: handle outlinks for instagram and soundcloud, other media source, here (if anywhere)
        return outlinks