From 72cb2a6e4ab8d08ed5a90e36ec000a3f48824359 Mon Sep 17 00:00:00 2001
From: Barbara Miller
Date: Wed, 3 Apr 2024 16:42:44 -0700
Subject: [PATCH 1/2] predup youtube watch pages

---
 brozzler/worker.py |  2 +-
 brozzler/ydl.py    | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 52 insertions(+), 1 deletion(-)

diff --git a/brozzler/worker.py b/brozzler/worker.py
index 86977cf..13b4fb7 100644
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@@ -244,7 +244,7 @@ class BrozzlerWorker:
         self.logger.info("brozzling {}".format(page))
         ydl_fetches = None
         outlinks = set()
-        if enable_youtube_dl and not page.url.lower().endswith(".pdf"):
+        if enable_youtube_dl and ydl.should_ytdlp(page, site):
             try:
                 ydl_fetches, outlinks = ydl.do_youtube_dl(self, site, page)
             except brozzler.ReachedLimit as e:
diff --git a/brozzler/ydl.py b/brozzler/ydl.py
index 4281d4a..e1069c3 100644
--- a/brozzler/ydl.py
+++ b/brozzler/ydl.py
@@ -27,11 +27,62 @@ import os
 import json
 import doublethink
 import datetime
+from cassandra import ReadTimeout
+from cassandra.cluster import Cluster
+
 import threading
 
 thread_local = threading.local()
 
 
+def _timestamp4datetime(timestamp):
+    """split `timestamp` into a tuple of 6 integers.
+
+    :param timestamp: full-length timestamp.
+    :type timestamp: bytes
+    """
+    timestamp = timestamp[:14]
+    return (
+        int(timestamp[:-10]),
+        int(timestamp[-10:-8]),
+        int(timestamp[-8:-6]),
+        int(timestamp[-6:-4]),
+        int(timestamp[-4:-2]),
+        int(timestamp[-2:])
+    )
+
+def should_ytdlp(page, site):
+    ytdlp_url = page.redirect_url if page.redirect_url else page.url
+    logging.debug("ytdlp_url: %s", ytdlp_url)
+    ytdlp_seed = site.get("warcprox-meta", {}).get("metadata", {}).get("ait_seed_id", "")
+    logging.debug("ytdlp_seed: %s", ytdlp_seed)
+
+    if ".pdf" in ytdlp_url.lower():
+        return False
+
+    if ytdlp_seed and "youtube.com/watch?v" in ytdlp_url:
+        # connect to bmiller-dev cluster, keyspace video; we can modify default timeout in cassandra.yaml
+        cluster = Cluster(["207.241.235.189"], protocol_version=5)
+        session = cluster.connect("video")
+        containing_page_query = "SELECT * from videos where scope=%s and containing_page_url=%s LIMIT 1"
+        future = session.execute_async(containing_page_query, [f"s:{ytdlp_seed}", ytdlp_url])
+        logging.debug(f"s:{ytdlp_seed}, {ytdlp_url}")
+        try:
+            record = future.result()
+            logging.debug("record: %s", record)
+        except ReadTimeout:
+            log.exception("Query timed out:")
+        if record and record.video_timestamp:
+            logging.debug(f"video_timestamp: {record.video_timestamp}")
+            ytdlp_timestamp = datetime(*_timestamp4datetime(record.video_timestamp))
+            logging.debug("ytdlp_timestamp: %s", ytdlp_timestamp)
+            time_diff = datetime.now() - ytdlp_timestamp
+            # TODO: make veriable for timedelta
+            if time_diff > timedelta(days = 90):
+                return False
+
+    return True
+
 class ExtraHeaderAdder(urllib.request.BaseHandler):
     def __init__(self, extra_headers):
         self.extra_headers = extra_headers
From 2e79e31ae95532cca3cf7181f534bcb680fef98d Mon Sep 17 00:00:00 2001
From: Barbara Miller
Date: Thu, 4 Apr 2024 14:22:22 -0700
Subject: [PATCH 2/2] tidying

Tidy the docstring and log levels, and fix the runtime errors that
should_ytdlp() would hit on a youtube watch page:

* use logging.exception(); `log` is not defined in this module
* `datetime` is imported as a module, so spell out datetime.datetime
  and datetime.timedelta
* future.result() returns a ResultSet, so take the row with .one()
  before reading video_timestamp; pre-set record to None so a read
  timeout falls through to "run yt-dlp"
* shut the per-call Cluster down once the query has finished
* skip yt-dlp when the existing capture is newer than 90 days (the
  comparison was inverted)

The ".pdf" guard stays in should_ytdlp(): worker.py no longer checks
for PDFs itself, so dropping it here would send PDFs to yt-dlp again.
---
 brozzler/ydl.py | 32 +++++++++++++++++---------------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/brozzler/ydl.py b/brozzler/ydl.py
index e1069c3..75d63c6 100644
--- a/brozzler/ydl.py
+++ b/brozzler/ydl.py
@@ -1,7 +1,7 @@
 """
 brozzler/ydl.py - youtube-dl / yt-dlp support for brozzler
 
-Copyright (C) 2023 Internet Archive
+Copyright (C) 2024 Internet Archive
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -38,8 +38,7 @@ thread_local = threading.local()
 def _timestamp4datetime(timestamp):
     """split `timestamp` into a tuple of 6 integers.
 
-    :param timestamp: full-length timestamp.
-    :type timestamp: bytes
+    :param timestamp: full-length timestamp
     """
     timestamp = timestamp[:14]
     return (
@@ -53,9 +52,8 @@ def _timestamp4datetime(timestamp):
 
 def should_ytdlp(page, site):
     ytdlp_url = page.redirect_url if page.redirect_url else page.url
-    logging.debug("ytdlp_url: %s", ytdlp_url)
     ytdlp_seed = site.get("warcprox-meta", {}).get("metadata", {}).get("ait_seed_id", "")
-    logging.debug("ytdlp_seed: %s", ytdlp_seed)
+    logging.info("ytdlp_seed: %s", ytdlp_seed)
 
     if ".pdf" in ytdlp_url.lower():
         return False
@@ -66,19 +64,23 @@ def should_ytdlp(page, site):
         session = cluster.connect("video")
         containing_page_query = "SELECT * from videos where scope=%s and containing_page_url=%s LIMIT 1"
         future = session.execute_async(containing_page_query, [f"s:{ytdlp_seed}", ytdlp_url])
-        logging.debug(f"s:{ytdlp_seed}, {ytdlp_url}")
+        record = None
         try:
-            record = future.result()
-            logging.debug("record: %s", record)
+            # result() returns a ResultSet; one() gives the first row or None
+            record = future.result().one()
+            logging.info("record: %s", record)
         except ReadTimeout:
-            log.exception("Query timed out:")
+            logging.exception("Query timed out:")
+        finally:
+            # a Cluster is created on every call, so release its connections
+            cluster.shutdown()
         if record and record.video_timestamp:
-            logging.debug(f"video_timestamp: {record.video_timestamp}")
-            ytdlp_timestamp = datetime(*_timestamp4datetime(record.video_timestamp))
-            logging.debug("ytdlp_timestamp: %s", ytdlp_timestamp)
-            time_diff = datetime.now() - ytdlp_timestamp
-            # TODO: make veriable for timedelta
-            if time_diff > timedelta(days = 90):
+            logging.info(f"video_timestamp: {record.video_timestamp}")
+            ytdlp_timestamp = datetime.datetime(*_timestamp4datetime(record.video_timestamp))
+            logging.info("ytdlp_timestamp: %s", ytdlp_timestamp)
+            time_diff = datetime.datetime.now() - ytdlp_timestamp
+            # TODO: make variable for timedelta
+            if time_diff < datetime.timedelta(days=90):
                 return False
 
     return True