From 72cb2a6e4ab8d08ed5a90e36ec000a3f48824359 Mon Sep 17 00:00:00 2001
From: Barbara Miller <barbara@archive.org>
Date: Wed, 3 Apr 2024 16:42:44 -0700
Subject: [PATCH 1/2] predup youtube watch pages

---
 brozzler/worker.py |  2 +-
 brozzler/ydl.py    | 51 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 52 insertions(+), 1 deletion(-)

diff --git a/brozzler/worker.py b/brozzler/worker.py
index 86977cf..13b4fb7 100644
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@@ -244,7 +244,7 @@ class BrozzlerWorker:
         self.logger.info("brozzling {}".format(page))
         ydl_fetches = None
         outlinks = set()
-        if enable_youtube_dl and not page.url.lower().endswith(".pdf"):
+        if enable_youtube_dl and ydl.should_ytdlp(page, site):
             try:
                 ydl_fetches, outlinks = ydl.do_youtube_dl(self, site, page)
             except brozzler.ReachedLimit as e:
diff --git a/brozzler/ydl.py b/brozzler/ydl.py
index 4281d4a..e1069c3 100644
--- a/brozzler/ydl.py
+++ b/brozzler/ydl.py
@@ -27,11 +27,62 @@ import os
 import json
 import doublethink
 import datetime
+from cassandra import ReadTimeout
+from cassandra.cluster import Cluster
+
 import threading
 
 thread_local = threading.local()
 
 
+def _timestamp4datetime(timestamp):
+    """split `timestamp` into a tuple of 6 integers.
+
+    :param timestamp: full-length timestamp.
+    :type timestamp: bytes
+    """
+    timestamp = timestamp[:14]
+    return (
+        int(timestamp[:-10]),
+        int(timestamp[-10:-8]),
+        int(timestamp[-8:-6]),
+        int(timestamp[-6:-4]),
+        int(timestamp[-4:-2]),
+        int(timestamp[-2:])
+        )
+
+def should_ytdlp(page, site):
+    ytdlp_url = page.redirect_url if page.redirect_url else page.url
+    logging.debug("ytdlp_url: %s", ytdlp_url)
+    ytdlp_seed = site.get("warcprox-meta", {}).get("metadata", {}).get("ait_seed_id", "")
+    logging.debug("ytdlp_seed: %s", ytdlp_seed)
+
+    if ".pdf" in ytdlp_url.lower():
+        return False
+
+    if ytdlp_seed and "youtube.com/watch?v" in ytdlp_url:
+        # connect to bmiller-dev cluster, keyspace video; we can modify default timeout in cassandra.yaml
+        cluster = Cluster(["207.241.235.189"], protocol_version=5)
+        session = cluster.connect("video")
+        containing_page_query = "SELECT * from videos where scope=%s and containing_page_url=%s LIMIT 1"
+        future = session.execute_async(containing_page_query, [f"s:{ytdlp_seed}", ytdlp_url])
+        logging.debug(f"s:{ytdlp_seed}, {ytdlp_url}")
+        try:
+            record = future.result()
+            logging.debug("record: %s", record)
+        except ReadTimeout:
+            log.exception("Query timed out:")
+        if record and record.video_timestamp:
+            logging.debug(f"video_timestamp: {record.video_timestamp}")
+            ytdlp_timestamp = datetime(*_timestamp4datetime(record.video_timestamp))
+            logging.debug("ytdlp_timestamp: %s", ytdlp_timestamp)
+            time_diff = datetime.now() - ytdlp_timestamp
+            # TODO: make veriable for timedelta
+            if time_diff > timedelta(days = 90):
+                return False
+
+    return True
+
 class ExtraHeaderAdder(urllib.request.BaseHandler):
     def __init__(self, extra_headers):
         self.extra_headers = extra_headers

From 2e79e31ae95532cca3cf7181f534bcb680fef98d Mon Sep 17 00:00:00 2001
From: Barbara Miller <barbara@archive.org>
Date: Thu, 4 Apr 2024 14:22:22 -0700
Subject: [PATCH 2/2] tidying; invert predup age comparison

---
 brozzler/ydl.py | 23 +++++++++--------------
 1 file changed, 9 insertions(+), 14 deletions(-)

diff --git a/brozzler/ydl.py b/brozzler/ydl.py
index e1069c3..75d63c6 100644
--- a/brozzler/ydl.py
+++ b/brozzler/ydl.py
@@ -1,7 +1,7 @@
 """
 brozzler/ydl.py - youtube-dl / yt-dlp support for brozzler
 
-Copyright (C) 2023 Internet Archive
+Copyright (C) 2024 Internet Archive
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -38,8 +38,7 @@ thread_local = threading.local()
 def _timestamp4datetime(timestamp):
     """split `timestamp` into a tuple of 6 integers.
 
-    :param timestamp: full-length timestamp.
-    :type timestamp: bytes
+    :param timestamp: full-length timestamp
     """
     timestamp = timestamp[:14]
     return (
@@ -53,32 +52,46 @@ def _timestamp4datetime(timestamp):
 
 def should_ytdlp(page, site):
+    """Decide whether yt-dlp should be run for `page`.
+
+    Returns False for a URL containing ".pdf", and for a YouTube watch
+    page whose seed already has a row in the `videos` table with a
+    `video_timestamp` less than 90 days old; returns True otherwise.
+
+    :param page: page to check; `redirect_url` is used when set, else `url`
+    :param site: site of the page; the seed id is read from its
+        "warcprox-meta" -> "metadata" -> "ait_seed_id" entry
+    """
     ytdlp_url = page.redirect_url if page.redirect_url else page.url
-    logging.debug("ytdlp_url: %s", ytdlp_url)
     ytdlp_seed = site.get("warcprox-meta", {}).get("metadata", {}).get("ait_seed_id", "")
-    logging.debug("ytdlp_seed: %s", ytdlp_seed)
+    logging.info("ytdlp_seed: %s", ytdlp_seed)
 
     if ".pdf" in ytdlp_url.lower():
         return False
 
     if ytdlp_seed and "youtube.com/watch?v" in ytdlp_url:
         # connect to bmiller-dev cluster, keyspace video; we can modify default timeout in cassandra.yaml
         cluster = Cluster(["207.241.235.189"], protocol_version=5)
-        session = cluster.connect("video")
-        containing_page_query = "SELECT * from videos where scope=%s and containing_page_url=%s LIMIT 1"
-        future = session.execute_async(containing_page_query, [f"s:{ytdlp_seed}", ytdlp_url])
-        logging.debug(f"s:{ytdlp_seed}, {ytdlp_url}")
-        try:
-            record = future.result()
-            logging.debug("record: %s", record)
+        record = None
+        try:
+            session = cluster.connect("video")
+            containing_page_query = "SELECT * from videos where scope=%s and containing_page_url=%s LIMIT 1"
+            future = session.execute_async(containing_page_query, [f"s:{ytdlp_seed}", ytdlp_url])
+            # result() returns a ResultSet; one() is the first row, or None if empty
+            record = future.result().one()
+            logging.info("record: %s", record)
         except ReadTimeout:
-            log.exception("Query timed out:")
+            logging.exception("Query timed out:")
+        finally:
+            # a new Cluster is built on every call, so close it before returning
+            cluster.shutdown()
         if record and record.video_timestamp:
-            logging.debug(f"video_timestamp: {record.video_timestamp}")
-            ytdlp_timestamp = datetime(*_timestamp4datetime(record.video_timestamp))
-            logging.debug("ytdlp_timestamp: %s", ytdlp_timestamp)
-            time_diff = datetime.now() - ytdlp_timestamp
-            # TODO: make veriable for timedelta
-            if time_diff > timedelta(days = 90):
+            logging.info("video_timestamp: %s", record.video_timestamp)
+            # this file does `import datetime`, so the classes need the module prefix
+            ytdlp_timestamp = datetime.datetime(*_timestamp4datetime(record.video_timestamp))
+            logging.info("ytdlp_timestamp: %s", ytdlp_timestamp)
+            time_diff = datetime.datetime.now() - ytdlp_timestamp
+            # TODO: make variable for timedelta
+            if time_diff < datetime.timedelta(days=90):
                 return False
 
     return True