predup youtube watch pages

2025-07-14 10:39:27 -04:00 · 2024-04-03 16:42:44 -07:00 · 2024-04-03 16:42:44 -07:00 · 72cb2a6e4a
commit 72cb2a6e4a
parent 376f3139ce
2 changed files with 52 additions and 1 deletions
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@ -244,7 +244,7 @@ class BrozzlerWorker:
        self.logger.info("brozzling {}".format(page))
        ydl_fetches = None
        outlinks = set()
-        if enable_youtube_dl and not page.url.lower().endswith(".pdf"):
+        if enable_youtube_dl and ydl.should_ytdlp(page, site):
            try:
                ydl_fetches, outlinks = ydl.do_youtube_dl(self, site, page)
            except brozzler.ReachedLimit as e:
--- a/brozzler/ydl.py
+++ b/brozzler/ydl.py
@ -27,11 +27,62 @@ import os
 import json
 import doublethink
 import datetime
 from cassandra import ReadTimeout
 from cassandra.cluster import Cluster
 import threading
 thread_local = threading.local()
 def _timestamp4datetime(timestamp):
    """split `timestamp` into a tuple of 6 integers.
    :param timestamp: full-length timestamp.
    :type timestamp: bytes
    """
    timestamp = timestamp[:14]
    return (
        int(timestamp[:-10]),
        int(timestamp[-10:-8]),
        int(timestamp[-8:-6]),
        int(timestamp[-6:-4]),
        int(timestamp[-4:-2]),
        int(timestamp[-2:])
        )
 def should_ytdlp(page, site):
    ytdlp_url = page.redirect_url if page.redirect_url else page.url
    logging.debug("ytdlp_url: %s", ytdlp_url)
    ytdlp_seed = site.get("warcprox-meta", {}).get("metadata", {}).get("ait_seed_id", "")
    logging.debug("ytdlp_seed: %s", ytdlp_seed)
    if ".pdf" in ytdlp_url.lower():
        return False
    if ytdlp_seed and "youtube.com/watch?v" in ytdlp_url:
        # connect to bmiller-dev cluster, keyspace video; we can modify default timeout in cassandra.yaml
        cluster = Cluster(["207.241.235.189"], protocol_version=5)
        session = cluster.connect("video")
        containing_page_query = "SELECT * from videos where scope=%s and containing_page_url=%s LIMIT 1"
        future = session.execute_async(containing_page_query, [f"s:{ytdlp_seed}", ytdlp_url])
        logging.debug(f"s:{ytdlp_seed}, {ytdlp_url}")
        try:
            record = future.result()
            logging.debug("record: %s", record)
        except ReadTimeout:
            log.exception("Query timed out:")
        if record and record.video_timestamp:
            logging.debug(f"video_timestamp: {record.video_timestamp}")
            ytdlp_timestamp = datetime(*_timestamp4datetime(record.video_timestamp))
            logging.debug("ytdlp_timestamp: %s", ytdlp_timestamp)
            time_diff = datetime.now() - ytdlp_timestamp
            # TODO: make veriable for timedelta
            if time_diff > timedelta(days = 90):
                return False
    return True
 class ExtraHeaderAdder(urllib.request.BaseHandler):
    def __init__(self, extra_headers):
        self.extra_headers = extra_headers