From 72e549694c154c6666b12cbecc1cf084a01c5315 Mon Sep 17 00:00:00 2001 From: Alex Dempsey Date: Tue, 4 Mar 2025 13:58:52 -0800 Subject: [PATCH] Only import yt-dlp if we're using it --- brozzler/worker.py | 39 ++++++++++++++++++++++++++++++++++++--- brozzler/ydl.py | 33 --------------------------------- 2 files changed, 36 insertions(+), 36 deletions(-) diff --git a/brozzler/worker.py b/brozzler/worker.py index b8befef..5500e4a 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -39,7 +39,6 @@ import urlcanon from requests.structures import CaseInsensitiveDict import rethinkdb as rdb from . import metrics -from . import ydl r = rdb.RethinkDB() @@ -260,6 +259,38 @@ class BrozzlerWorker: img.save(out, "jpeg", quality=95) return out.getbuffer() + def should_ytdlp(self, logger, site, page, page_status, skip_av_seeds): + # called only after we've passed needs_browsing() check + + if page_status != 200: + logger.info("skipping ytdlp: non-200 page status", page_status=page_status) + return False + if site.skip_ytdlp: + logger.info("skipping ytdlp: site marked skip_ytdlp") + return False + + ytdlp_url = page.redirect_url if page.redirect_url else page.url + + if "chrome-error:" in ytdlp_url: + return False + + ytdlp_seed = ( + site["metadata"]["ait_seed_id"] + if "metadata" in site and "ait_seed_id" in site["metadata"] + else None + ) + + # TODO: develop UI and refactor + if ytdlp_seed: + if site.skip_ytdlp is None and ytdlp_seed in skip_av_seeds: + logger.info("skipping ytdlp: site in skip_av_seeds") + site.skip_ytdlp = True + return False + else: + site.skip_ytdlp = False + + return True + @metrics.brozzler_page_processing_duration_seconds.time() @metrics.brozzler_in_progress_pages.track_inprogress() def brozzle_page( @@ -293,10 +324,12 @@ class BrozzlerWorker: except brozzler.PageInterstitialShown: page_logger.info("page interstitial shown (http auth)") - if enable_youtube_dl and ydl.should_ytdlp( - site, page, status_code, self._skip_av_seeds + if enable_youtube_dl and self.should_ytdlp( + page_logger, site, page, status_code, self._skip_av_seeds ): try: + from . import ydl + ydl_outlinks = ydl.do_youtube_dl( self, site, page, self._ytdlp_proxy_endpoints ) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 7fab1f7..ae756a0 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -43,39 +43,6 @@ YTDLP_MAX_REDIRECTS = 5 logger = structlog.get_logger(logger_name=__name__) -def should_ytdlp(site, page, page_status, skip_av_seeds): - # called only after we've passed needs_browsing() check - - if page_status != 200: - logger.info("skipping ytdlp: non-200 page status", page_status=page_status) - return False - if site.skip_ytdlp: - logger.info("skipping ytdlp: site marked skip_ytdlp") - return False - - ytdlp_url = page.redirect_url if page.redirect_url else page.url - - if "chrome-error:" in ytdlp_url: - return False - - ytdlp_seed = ( - site["metadata"]["ait_seed_id"] - if "metadata" in site and "ait_seed_id" in site["metadata"] - else None - ) - - # TODO: develop UI and refactor - if ytdlp_seed: - if site.skip_ytdlp is None and ytdlp_seed in skip_av_seeds: - logger.info("skipping ytdlp: site in skip_av_seeds") - site.skip_ytdlp = True - return False - else: - site.skip_ytdlp = False - - return True - - def isyoutubehost(url): # split 1 splits scheme from url, split 2 splits path from hostname, split 3 splits query string on hostname return "youtube.com" in url.split("//")[-1].split("/")[0].split("?")[0]