From 02d26b0b9c15edc88f39d890ee4bca59aa674023 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Fri, 5 Apr 2024 17:09:22 -0700 Subject: [PATCH] pylint (partial) and other tidying --- brozzler/worker.py | 2 +- brozzler/ydl.py | 26 +++++++++++++++----------- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/brozzler/worker.py b/brozzler/worker.py index 63fe206..13b4fb7 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -244,7 +244,7 @@ class BrozzlerWorker: self.logger.info("brozzling {}".format(page)) ydl_fetches = None outlinks = set() - if enable_youtube_dl and ydl.should_ytdlp(self, page, site): + if enable_youtube_dl and ydl.should_ytdlp(page, site): try: ydl_fetches, outlinks = ydl.do_youtube_dl(self, site, page) except brozzler.ReachedLimit as e: diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 77532f2..c8698f1 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -50,10 +50,10 @@ def _timestamp4datetime(timestamp): int(timestamp[-2:]) ) -def should_ytdlp(worker, page, site): +def should_ytdlp(page, site): ytdlp_url = page.redirect_url if page.redirect_url else page.url ytdlp_seed = site["metadata"]["ait_seed_id"] - logging.info("checking containing page %r, site %r", ytdlp_url, ytdlp_seed) + logging.info("checking containing page %s for seed %s", ytdlp_url, ytdlp_seed) if ytdlp_seed and "youtube.com/watch?v" in ytdlp_url: logging.info("found youtube watch page %r", ytdlp_url) @@ -62,19 +62,23 @@ def should_ytdlp(worker, page, site): session = cluster.connect("video") containing_page_query = "SELECT * from videos where scope=%s and containing_page_url=%s LIMIT 1" future = session.execute_async(containing_page_query, [f"s:{ytdlp_seed}", ytdlp_url]) - record = None try: - record = future.result() - logging.info("record: %s", record) + rows = future.result() except ReadTimeout: - log.exception("Query timed out:") - if record and record.video_timestamp: - logging.info(f"video_timestamp: {record.video_timestamp}") - ytdlp_timestamp = datetime(*_timestamp4datetime(record.video_timestamp)) + logging.exception("Query timed out:") + + if len(rows.current_rows) == 0: + logging.info("no results returned from videos query") + return True + + for row in rows: + logging.info("video query found %r", row) + ytdlp_timestamp = datetime.datetime(*_timestamp4datetime(row.video_timestamp)) logging.info("ytdlp_timestamp: %s", ytdlp_timestamp) - time_diff = datetime.now() - ytdlp_timestamp + time_diff = datetime.datetime.now() - ytdlp_timestamp # TODO: make variable for timedelta - if time_diff < timedelta(days = 90): + if time_diff < datetime.timedelta(days = 90): + logging.info("skipping ytdlp for %s since there's a recent capture", row.containing_page_url) return False return True