fix GitHub ruff issues

This commit is contained in:
Barbara Miller 2025-06-30 15:10:14 -07:00
parent 7d58a9ae3b
commit db17335ffb
4 changed files with 23 additions and 18 deletions

View file

@ -25,5 +25,7 @@ runs:
- name: Install pip dependencies
run: |
uv sync --python ${{ inputs.python-version }} --extra rethinkdb --extra warcprox --extra yt-dlp
pip install .[rethinkdb,warcprox,yt-dlp,psycopg]
# setuptools required by rethinkdb==2.4.9
pip install pytest setuptools
shell: bash

View file

@ -39,7 +39,6 @@ from urllib3.exceptions import ProxyError, TimeoutError
import brozzler
import brozzler.browser
from brozzler.model import VideoCaptureOptions
from brozzler.ydl import VideoDataClient
from . import metrics

View file

@ -24,12 +24,14 @@ import tempfile
import threading
import time
import urllib.request
from typing import List
from typing import Any, List, Optional
import doublethink
import psycopg
import structlog
import urlcanon
import yt_dlp
from psycopg_pool import ConnectionPool, PoolTimeout
from yt_dlp.utils import ExtractorError, match_filter_func
import brozzler
@ -38,7 +40,6 @@ from . import metrics
thread_local = threading.local()
PROXY_ATTEMPTS = 4
YTDLP_WAIT = 10
YTDLP_MAX_REDIRECTS = 5
@ -52,26 +53,29 @@ logger = structlog.get_logger(logger_name=__name__)
class VideoDataClient:
def __init__(self):
if VIDEO_DATA_SOURCE and VIDEO_DATA_SOURCE.startswith("postgresql"):
import psycopg
from psycopg_pool import ConnectionPool
pool = ConnectionPool(VIDEO_DATA_SOURCE, min_size=1, max_size=9)
pool.wait()
logger.info("pg pool ready")
atexit.register(pool.close)
# atexit.register(pool.close)
self.pool = pool
def _execute_pg_query(
self, query: str, row_factory=None, fetchone=False, fetchall=False
) -> Optional[Any]:
with self.pool.connection() as conn:
with conn.cursor(row_factory=row_factory) as cur:
cur.execute(query)
if fetchone:
return cur.fetchone()
if fetchall:
return cur.fetchall()
try:
with self.pool.connection() as conn:
with conn.cursor(row_factory=row_factory) as cur:
cur.execute(query)
if fetchone:
return cur.fetchone()
if fetchall:
return cur.fetchall()
except PoolTimeout as e:
logger.warn("hit PoolTimeout: %s", e)
self.pool.check()
except Exception as e:
logger.warn("postgres query failed: %s", e)
return None
def get_pg_video_captures(self, site=None, source=None) -> List[str]:
@ -79,7 +83,7 @@ class VideoDataClient:
seed = site.metadata.ait_seed_id if site.metadata.ait_seed_id else None
# TODO: generalize, maybe make variable?
containing_page_timestamp_pattern = "2025%" # for future pre-dup additions
# containing_page_timestamp_pattern = "2025%" # for future pre-dup additions
if source == "youtube":
containing_page_url_pattern = "http://youtube.com/watch%" # yes, video data canonicalization uses "http"

View file

@ -1,6 +1,6 @@
[project]
name = "brozzler"
version = "1.7.0"
version = "1.7.1"
authors = [
{ name="Noah Levitt", email="nlevitt@archive.org" },
]
@ -40,7 +40,7 @@ license = "Apache-2.0"
[project.optional-dependencies]
yt-dlp = ["yt-dlp[default,curl-cffi]>=2024.7.25"]
psycopg = ["psycopg[binary]>=3.2.6"]
psycopg = ["psycopg[binary,pool]>=3.2.6"]
dashboard = ["flask>=1.0", "gunicorn>=19.8.1"]
warcprox = ["warcprox>=2.4.31"]
rethinkdb = [