From db17335ffb7fd3268a69da5ae84564d092f71e6d Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Mon, 30 Jun 2025 15:10:14 -0700 Subject: [PATCH] fix github ruff issues --- .github/workflows/setup/action.yml | 4 +++- brozzler/worker.py | 1 - brozzler/ydl.py | 32 +++++++++++++++++------------- pyproject.toml | 4 ++-- 4 files changed, 23 insertions(+), 18 deletions(-) diff --git a/.github/workflows/setup/action.yml b/.github/workflows/setup/action.yml index a0203f0..58b2d1f 100644 --- a/.github/workflows/setup/action.yml +++ b/.github/workflows/setup/action.yml @@ -25,5 +25,7 @@ runs: - name: Install pip dependencies run: | - uv sync --python ${{ inputs.python-version }} --extra rethinkdb --extra warcprox --extra yt-dlp + pip install .[rethinkdb,warcprox,yt-dlp,psycopg] + # setuptools required by rethinkdb==2.4.9 + pip install pytest setuptools shell: bash diff --git a/brozzler/worker.py b/brozzler/worker.py index 983f89d..7e2b254 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -39,7 +39,6 @@ from urllib3.exceptions import ProxyError, TimeoutError import brozzler import brozzler.browser from brozzler.model import VideoCaptureOptions -from brozzler.ydl import VideoDataClient from . import metrics diff --git a/brozzler/ydl.py b/brozzler/ydl.py index a5541f0..685e1e5 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -24,12 +24,14 @@ import tempfile import threading import time import urllib.request -from typing import List +from typing import Any, List, Optional import doublethink +import psycopg import structlog import urlcanon import yt_dlp +from psycopg_pool import ConnectionPool, PoolTimeout from yt_dlp.utils import ExtractorError, match_filter_func import brozzler @@ -38,7 +40,6 @@ from . import metrics thread_local = threading.local() - PROXY_ATTEMPTS = 4 YTDLP_WAIT = 10 YTDLP_MAX_REDIRECTS = 5 @@ -52,26 +53,29 @@ logger = structlog.get_logger(logger_name=__name__) class VideoDataClient: def __init__(self): if VIDEO_DATA_SOURCE and VIDEO_DATA_SOURCE.startswith("postgresql"): - import psycopg - from psycopg_pool import ConnectionPool - pool = ConnectionPool(VIDEO_DATA_SOURCE, min_size=1, max_size=9) pool.wait() logger.info("pg pool ready") - atexit.register(pool.close) + # atexit.register(pool.close) self.pool = pool def _execute_pg_query( self, query: str, row_factory=None, fetchone=False, fetchall=False ) -> Optional[Any]: - with self.pool.connection() as conn: - with conn.cursor(row_factory=row_factory) as cur: - cur.execute(query) - if fetchone: - return cur.fetchone() - if fetchall: - return cur.fetchall() + try: + with self.pool.connection() as conn: + with conn.cursor(row_factory=row_factory) as cur: + cur.execute(query) + if fetchone: + return cur.fetchone() + if fetchall: + return cur.fetchall() + except PoolTimeout as e: + logger.warn("hit PoolTimeout: %s", e) + self.pool.check() + except Exception as e: + logger.warn("postgres query failed: %s", e) return None def get_pg_video_captures(self, site=None, source=None) -> List[str]: @@ -79,7 +83,7 @@ class VideoDataClient: seed = site.metadata.ait_seed_id if site.metadata.ait_seed_id else None # TODO: generalize, maybe make variable? - containing_page_timestamp_pattern = "2025%" # for future pre-dup additions + # containing_page_timestamp_pattern = "2025%" # for future pre-dup additions if source == "youtube": containing_page_url_pattern = "http://youtube.com/watch%" # yes, video data canonicalization uses "http" diff --git a/pyproject.toml b/pyproject.toml index e8ac6af..4d7e0c1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "brozzler" -version = "1.7.0" +version = "1.7.1" authors = [ { name="Noah Levitt", email="nlevitt@archive.org" }, ] @@ -40,7 +40,7 @@ license = "Apache-2.0" [project.optional-dependencies] yt-dlp = ["yt-dlp[default,curl-cffi]>=2024.7.25"] -psycopg = ["psycopg[binary]>=3.2.6"] +psycopg = ["psycopg[binary,pool]>=3.2.6"] dashboard = ["flask>=1.0", "gunicorn>=19.8.1"] warcprox = ["warcprox>=2.4.31"] rethinkdb = [