Replace the boolean Site.skip_ytdlp flag with a tri-state YTDLPStatus enum.

model.py: adds YTDLPStatus (UNKNOWN, SKIP, CAPTURE); Site defaults
skip_ytdlp to YTDLPStatus.UNKNOWN when the key is absent.
ydl.py: should_ytdlp() returns False when the site is marked SKIP. When a
ytdlp_seed is present, a site still in UNKNOWN whose seed is listed in
skip_av_seeds is marked SKIP (and ytdlp is skipped); otherwise the site is
marked CAPTURE.

Review fixes folded into this revision:
  * "if ytdlp_seed" was missing its trailing colon (SyntaxError).
  * skip_ytdlp was compared against the strings "SKIP" / "UNKNOWN" although
    the patch stores YTDLPStatus members; comparisons now use the members.

NOTE(review): site documents saved before this change presumably still hold
True/False in skip_ytdlp; those values do not compare equal to any
YTDLPStatus member -- confirm whether a migration is needed.
NOTE(review): confirm that doublethink can persist Enum members; if it
cannot, store and compare the members' .value instead.

diff --git a/brozzler/model.py b/brozzler/model.py
index cebcf6c..6da2cb6 100644
--- a/brozzler/model.py
+++ b/brozzler/model.py
@@ -34,6 +34,7 @@ import urllib
 import uuid
 import yaml
 import zlib
+from enum import Enum
 from typing import Optional
 
 
@@ -191,6 +192,12 @@ class ElapsedMixIn(object):
         return dt
 
 
+class YTDLPStatus(Enum):
+    UNKNOWN = 0
+    SKIP = 1
+    CAPTURE = 2
+
+
 class Job(doublethink.Document, ElapsedMixIn):
     logger = logging.getLogger(__module__ + "." + __qualname__)
     table = "jobs"
@@ -236,7 +243,7 @@ class Site(doublethink.Document, ElapsedMixIn):
         if not "scope" in self:
             self.scope = {}
         if not "skip_ytdlp" in self:
-            self.skip_ytdlp = False
+            self.skip_ytdlp = YTDLPStatus.UNKNOWN
 
         # backward compatibility
         if "surt" in self.scope:
diff --git a/brozzler/ydl.py b/brozzler/ydl.py
index 3d217c0..c787cf0 100644
--- a/brozzler/ydl.py
+++ b/brozzler/ydl.py
@@ -53,10 +53,11 @@ def _timestamp4datetime(timestamp):
 
 def should_ytdlp(site, page, skip_av_seeds):
     # called only after we've passed needs_browsing() check
+    from .model import YTDLPStatus
     if page.status_code != 200:
         logging.info("skipping ytdlp: non-200 page status")
         return False
-    if site.skip_ytdlp:
+    if site.skip_ytdlp == YTDLPStatus.SKIP:
         logging.info("skipping ytdlp: site marked skip_ytdlp")
         return False
 
@@ -72,11 +73,14 @@ def should_ytdlp(site, page, skip_av_seeds):
     )
 
     # TODO: develop UI and refactor
-    if ytdlp_seed and ytdlp_seed in skip_av_seeds:
-        logging.info("skipping ytdlp: site in skip_av_seeds")
-        site.skip_ytdlp = True
-        return False
-
+    if ytdlp_seed:
+        if site.skip_ytdlp == YTDLPStatus.UNKNOWN and ytdlp_seed in skip_av_seeds:
+            logging.info("skipping ytdlp: site in skip_av_seeds")
+            site.skip_ytdlp = YTDLPStatus.SKIP
+            return False
+        else:
+            site.skip_ytdlp = YTDLPStatus.CAPTURE
+
     logging.info("checking containing page %s for seed %s", ytdlp_url, ytdlp_seed)
 
     if ytdlp_seed and "youtube.com/watch?v" in ytdlp_url: