ci: set up a yt-dlp test script

This runs every time we get a new yt-dlp version: we test whether this script is able to download at least 3/5 of a set of videos we've defined. If it succeeds, we automatically merge the new yt-dlp version into the qa branch so that we can test further.
2025-07-21 14:09:00 -04:00 · 2025-05-22 11:23:04 -07:00 · 2025-05-22 11:23:04 -07:00 · 794f7dd98d
commit 794f7dd98d
parent b4d2726e54
2 changed files with 85 additions and 0 deletions
--- a/.github/workflows/dependabot.yml
+++ b/.github/workflows/dependabot.yml
@ -9,6 +9,22 @@ jobs:
    runs-on: ubuntu-latest
    if: github.event.pull_request.user.login == 'dependabot[bot]' && github.repository == 'internetarchive/brozzler'
    steps:
+      - uses: actions/checkout@v4
+      - name: Install uv
+        uses: astral-sh/setup-uv@v5
+      - name: Test new yt-dlp
+        run: |
+          set -euo pipefail
+
+          uv sync --extra yt-dlp --extra rethinkdb --extra warcprox --python 3.12
+
+          # Warcprox has to be running to get video capture results
+          .venv/bin/warcprox &
+          warcprox_pid=$!
+          # Stop warcprox however this step ends: with `set -e`, a failing
+          # test script would exit before a trailing `kill` could run. The
+          # trap does not change the step's exit status.
+          trap 'kill "$warcprox_pid" 2>/dev/null || true' EXIT
+
+          uv run scripts/ytdlp_test.py
      - name: Dependabot metadata
        id: metadata
        uses: dependabot/fetch-metadata@d7267f607e9d3fb96fc2fbe83e0af444713e90b7
--- a/scripts/ytdlp_test.py
+++ b/scripts/ytdlp_test.py
@ -0,0 +1,69 @@
+import math
+import sys
+
+import brozzler
+
+# Chrome executable suggested by brozzler; passed as chrome_exe to each
+# brozzler.Browser opened in brozzle_page().
+CHROME_EXE = brozzler.suggest_default_chrome_exe()
+
+
+def brozzle_page(worker, page) -> bool:
+    """Brozzle *page* in a fresh browser and report whether a video was captured.
+
+    Args:
+        worker: Object whose ``brozzle_page(browser, site, page)`` method
+            performs the capture.
+        page: Page to capture; ``videos`` is read from it afterwards.
+
+    Returns:
+        True when the first entry of ``page.videos`` has a 2xx
+        ``response_code`` and a positive ``content-length``; False when no
+        video was recorded, the first one does not qualify, or the capture
+        attempt raised.
+    """
+    site = brozzler.Site(None, {})
+
+    try:
+        with brozzler.Browser(chrome_exe=CHROME_EXE) as browser:
+            worker.brozzle_page(browser, site, page)
+    except Exception as exc:
+        # The script tolerates some failed captures, so one page blowing up
+        # must count as a failure instead of aborting the whole run.
+        print(f"Error brozzling {page.url}: {exc!r}", file=sys.stderr)
+        return False
+
+    # This gets assigned after a video is captured; if an
+    # exception was raised by yt-dlp, it never gets assigned.
+    if "videos" not in page or not page.videos:
+        return False
+
+    # Only the first recorded video is inspected.
+    first_video = page.videos[0]
+    return (
+        200 <= first_video["response_code"] < 300
+        and first_video["content-length"] > 0
+    )
+
+
+# Pages to try, one per platform or embedding style.
+videos = [
+    # Short YouTube video
+    "https://www.youtube.com/watch?v=AdtZtvlFi9o",
+    # Long YouTube video (former livestream we've had trouble capturing)
+    "https://www.youtube.com/watch?v=v4f6InE9X_c",
+    # YouTube Short
+    "https://www.youtube.com/shorts/ee_lH4qlfzc",
+    # Vimeo
+    "https://vimeo.com/175568834",
+    # Instagram
+    "https://www.instagram.com/reel/DFZMmHONL8K/",
+    # Audio in a webpage
+    "https://www.woxx.lu/am-bistro-mat-der-woxx-308-grenzenlose-fitness/",
+    # Video in a webpage
+    "https://play.rtl.lu/shows/lb/eurovision/episodes/r/3414779",
+    # TikTok
+    "https://www.tiktok.com/@cbcnews/video/7498842317630033157",
+    # Twitter
+    "https://x.com/NationalZoo/status/690915532539838464",
+    # Facebook
+    "https://www.facebook.com/100064323443815/videos/1421958299004555",
+]
+
+# The worker is configured with a proxy at localhost:8000.
+worker = brozzler.BrozzlerWorker(None, proxy="localhost:8000")
+
+# Pass mark: three quarters of the list, rounded down, but never below one.
+min_successes = max(1, math.floor(len(videos) * 0.75))
+
+# Try each URL in order and count the captures that brozzle_page accepts.
+successes = sum(
+    1 for url in videos if brozzle_page(worker, brozzler.Page(None, {"url": url}))
+)
+
+if successes < min_successes:
+    print(f"Failure: {successes}/{len(videos)} captures succeeded.")
+    sys.exit(1)
+
+print(f"Success! {successes}/{len(videos)} captures succeeded.")