From fe5ad0c31d7a1b06d78ce74517b29c7cdd5875a4 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Mon, 23 Jun 2025 16:46:37 -0700 Subject: [PATCH] VIDEO_DATA_SOURCE --- brozzler/ydl.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 92972b5..b52ebf7 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -42,7 +42,7 @@ PROXY_ATTEMPTS = 4 YTDLP_WAIT = 10 YTDLP_MAX_REDIRECTS = 5 -VIDEO_DATA = "" +VIDEO_DATA_SOURCE = os.getenv("VIDEO_DATA_SOURCE") logger = structlog.get_logger(logger_name=__name__) @@ -421,23 +421,22 @@ def _try_youtube_dl(worker, ydl, site, page): def get_video_captures(site, source="youtube"): - if not VIDEO_DATA: + if not VIDEO_DATA_SOURCE: return None - if VIDEO_DATA and VIDEO_DATA.startswith("postgresql"): + if VIDEO_DATA_SOURCE and VIDEO_DATA_SOURCE.startswith("postgresql"): import psycopg - pg_url = VIDEO_DATA account_id = site.account_id if site.account_id else None seed = site.metadata.ait_seed_id if site.metadata.ait_seed_id else None if source == "youtube": - containing_page_url_pattern = "http://youtube.com/watch" + containing_page_url_pattern = "http://youtube.com/watch" # yes, video data canonicalization uses "http" # support other sources here else: containing_page_url_pattern = None if account_id and seed and source: pg_query = ( - "SELECT containing_page_url from video where account_id = %s and seed = %s and containing_page_url like %s", + "SELECT distinct(containing_page_url) from video where account_id = %s and seed = %s and containing_page_url like %s", ( account_id, seed, @@ -451,10 +450,11 @@ def get_video_captures(site, source="youtube"): ) else: return None - with psycopg.connect(pg_url) as conn: + with psycopg.connect(VIDEO_DATA_SOURCE) as conn: with conn.cursor(row_factory=psycopg.rows.scalar_row) as cur: cur.execute(pg_query) return cur.fetchall() + return None @metrics.brozzler_ytdlp_duration_seconds.time()