From faa06b449d4f258d95944ef1665342798beeed2d Mon Sep 17 00:00:00 2001
From: Barbara Miller
Date: Tue, 2 Apr 2024 16:37:07 -0700
Subject: [PATCH 01/13] run yt-dlp after browse_page

---
 brozzler/worker.py | 55 ++++++++++------------------------------
 brozzler/ydl.py    | 63 +++++++++++++++-------------------------------
 2 files changed, 33 insertions(+), 85 deletions(-)

diff --git a/brozzler/worker.py b/brozzler/worker.py
index 86977cf..c03d21c 100644
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@@ -3,7 +3,7 @@ brozzler/worker.py - BrozzlerWorker brozzles pages from the frontier, meaning
 it runs yt-dlp on them, browses them and runs behaviors if appropriate,
 scopes and adds outlinks to the frontier
 
-Copyright (C) 2014-2023 Internet Archive
+Copyright (C) 2014-2024 Internet Archive
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -242,11 +242,19 @@ class BrozzlerWorker:
         enable_youtube_dl=True,
     ):
         self.logger.info("brozzling {}".format(page))
-        ydl_fetches = None
         outlinks = set()
-        if enable_youtube_dl and not page.url.lower().endswith(".pdf"):
+
+        try:
+            browser_outlinks = self._browse_page(
+                browser, site, page, on_screenshot, on_request
+            )
+            outlinks.update(browser_outlinks)
+        except brozzler.PageInterstitialShown:
+            self.logger.info("page interstitial shown (http auth): %s", page)
+
+        if enable_youtube_dl and ydl.should_ytdlp(page):
             try:
-                ydl_fetches, outlinks = ydl.do_youtube_dl(self, site, page)
+                ydl_outlinks = ydl.do_youtube_dl(self, site, page)
             except brozzler.ReachedLimit as e:
                 raise
             except brozzler.ShutdownRequested:
@@ -271,22 +279,7 @@ class BrozzlerWorker:
                         "youtube_dl raised exception on %s", page, exc_info=True
                     )
 
-        if self._needs_browsing(page, ydl_fetches):
-            self.logger.info("needs browsing: %s", page)
-            try:
-                browser_outlinks = self._browse_page(
-                    browser, site, page, on_screenshot, on_request
-                )
-                outlinks.update(browser_outlinks)
-            except brozzler.PageInterstitialShown:
-                self.logger.info("page interstitial shown (http auth): %s", page)
-        else:
-            if not self._already_fetched(page, ydl_fetches):
-                self.logger.info("needs fetch: %s", page)
-                self._fetch_url(site, page=page)
-            else:
-                self.logger.info("already fetched: %s", page)
-
+        outlinks.update(ydl_outlinks)
         return outlinks
 
     def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None):
@@ -415,28 +408,6 @@ class BrozzlerWorker:
         except requests.exceptions.ProxyError as e:
             raise brozzler.ProxyError("proxy error fetching %s" % url) from e
 
-    def _needs_browsing(self, page, ydl_fetches):
-        if ydl_fetches:
-            final_bounces = ydl.final_bounces(ydl_fetches, page.url)
-            if not final_bounces:
-                return True
-            for txn in final_bounces:
-                if txn["response_headers"].get_content_type() in [
-                    "text/html",
-                    "application/xhtml+xml",
-                ]:
-                    return True
-            return False
-        else:
-            return True
-
-    def _already_fetched(self, page, ydl_fetches):
-        if ydl_fetches:
-            for fetch in ydl.final_bounces(ydl_fetches, page.url):
-                if fetch["method"] == "GET" and fetch["response_code"] == 200:
-                    return True
-        return False
-
     def brozzle_site(self, browser, site):
         try:
             site.last_claimed_by = "%s:%s" % (socket.gethostname(), browser.chrome.port)
diff --git a/brozzler/ydl.py b/brozzler/ydl.py
index 4281d4a..a2d0405 100644
--- a/brozzler/ydl.py
+++ b/brozzler/ydl.py
@@ -1,7 +1,7 @@
 """
 brozzler/ydl.py - youtube-dl / yt-dlp support for brozzler
 
-Copyright (C) 2023 Internet Archive
+Copyright (C) 2024 Internet Archive
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -31,6 +31,20 @@ import threading
 
 thread_local = threading.local()
 
+def should_ytdlp(page):
+    skip_url_types = ['pdf', 'jpg', 'jpeg', 'png', 'gif', 'mp4', 'mpeg']
+    if page.redirect_url:
+        ytdlp_url = page.redirect_url
+    else:
+        ytdlp_url = page.url
+
+    for t in skip_url_types:
+        if t in ytdlp_url:
+            logging.warning("skipping yt-dlp for %s due to unsupported guessed content type", ytdlp_url)
+            return False
+
+    return True
+
 
 class ExtraHeaderAdder(urllib.request.BaseHandler):
     def __init__(self, extra_headers):
@@ -67,35 +81,6 @@ class YoutubeDLSpy(urllib.request.BaseHandler):
         self.fetches = []
 
 
-def final_bounces(fetches, url):
-    """
-    Resolves redirect chains in `fetches` and returns a list of fetches
-    representing the final redirect destinations of the given url. There could
-    be more than one if for example youtube-dl hit the same url with HEAD and
-    then GET requests.
-    """
-    redirects = {}
-    for fetch in fetches:
-        # XXX check http status 301,302,303,307? check for "uri" header
-        # as well as "location"? see urllib.request.HTTPRedirectHandler
-        if "location" in fetch["response_headers"]:
-            redirects[fetch["url"]] = fetch
-
-    final_url = url
-    while final_url in redirects:
-        fetch = redirects.pop(final_url)
-        final_url = urllib.parse.urljoin(
-            fetch["url"], fetch["response_headers"]["location"]
-        )
-
-    final_bounces = []
-    for fetch in fetches:
-        if fetch["url"] == final_url:
-            final_bounces.append(fetch)
-
-    return final_bounces
-
-
 def _build_youtube_dl(worker, destdir, site, page):
     """
     Builds a yt-dlp `yt_dlp.YoutubeDL` for brozzling `site` with `worker`.
@@ -183,8 +168,8 @@ def _build_youtube_dl(worker, destdir, site, page):
             else:
                 url = info_dict.get("url", "")
 
-            # skip urls ending .m3u8, to avoid duplicates handled by FixupM3u8
-            if url.endswith(".m3u8") or url == "":
+            # skip urls containing .m3u8, to avoid duplicates handled by FixupM3u8
+            if url == "" or ".m3u8" in url:
                 return
 
             size = os.path.getsize(info_dict["filepath"])
@@ -408,15 +393,7 @@ def do_youtube_dl(worker, site, page):
         page (brozzler.Page): the page we are brozzling
 
     Returns:
-        tuple with two entries:
-            `list` of `dict`: with info about urls fetched:
-                [{
-                    'url': ...,
-                    'method': ...,
-                    'response_code': ...,
-                    'response_headers': ...,
-                }, ...]
-            `list` of `str`: outlink urls
+        `list` of `str`: outlink urls
     """
     with tempfile.TemporaryDirectory(prefix="brzl-ydl-") as tempdir:
         ydl = _build_youtube_dl(worker, tempdir, site, page)
@@ -431,5 +408,5 @@ def do_youtube_dl(worker, site, page):
             "https://www.youtube.com/watch?v=%s" % e["id"]
             for e in ie_result.get("entries_no_dl", [])
         }
-        # any outlinks for other cases?
-        return ydl.fetch_spy.fetches, outlinks
+        # any outlinks for other cases? soundcloud, maybe?
+        return outlinks

From 1bc9a544ef46ec89fb999cb5747a92733dd4bf69 Mon Sep 17 00:00:00 2001
From: Barbara Miller
Date: Thu, 4 Apr 2024 12:24:45 -0700
Subject: [PATCH 02/13] is_html_maybe

---
 brozzler/ydl.py | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/brozzler/ydl.py b/brozzler/ydl.py
index a2d0405..557c107 100644
--- a/brozzler/ydl.py
+++ b/brozzler/ydl.py
@@ -21,6 +21,7 @@ import yt_dlp
 from yt_dlp.utils import match_filter_func
 import brozzler
 import urllib.request
+from urllib.parse import urlparse
 import tempfile
 import urlcanon
 import os
@@ -31,17 +32,24 @@ import threading
 
 thread_local = threading.local()
 
-def should_ytdlp(page):
-    skip_url_types = ['pdf', 'jpg', 'jpeg', 'png', 'gif', 'mp4', 'mpeg']
-    if page.redirect_url:
-        ytdlp_url = page.redirect_url
-    else:
-        ytdlp_url = page.url
+def is_html_maybe(url):
+    skip_url_exts = ['pdf', 'jpg', 'jpeg', 'png', 'gif', 'mp4', 'mpeg']
 
-    for t in skip_url_types:
-        if t in ytdlp_url:
-            logging.warning("skipping yt-dlp for %s due to unsupported guessed content type", ytdlp_url)
+    parsed_url = urlparse(url)
+    base_url, ext = os.path.splitext(parsed_url.path)
+    ext = ext[1:]
+    for skip in skip_url_exts:
+        if ext.startswith(skip):
             return False
+    return True
+
+
+def should_ytdlp(page):
+    ytdlp_url = page.redirect_url if page.redirect_url else page.url
+
+    if not is_html_maybe(ytdlp_url):
+        logging.warning("skipping yt-dlp for %s due to unsupported extension", ytdlp_url)
+        return False
 
     return True

From 0b2650963788c8eafda6bdffa0c6132cde220b1a Mon Sep 17 00:00:00 2001
From: Barbara Miller
Date: Wed, 10 Apr 2024 13:41:03 -0700
Subject: [PATCH 03/13] more ytdlp_url

---
 brozzler/ydl.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/brozzler/ydl.py b/brozzler/ydl.py
index 557c107..1e8ae30 100644
--- a/brozzler/ydl.py
+++ b/brozzler/ydl.py
@@ -337,8 +337,9 @@ def _remember_videos(page, fetches, pushed_videos=None):
 
 
 def _try_youtube_dl(worker, ydl, site, page):
+    ytdlp_url = page.redirect_url if page.redirect_url else page.url
     try:
-        logging.info("trying yt-dlp on %s", page)
+        logging.info("trying yt-dlp on %s", ytdlp_url)
         with brozzler.thread_accept_exceptions():
             # we do whatwg canonicalization here to avoid 
Date: Wed, 10 Apr 2024 13:42:10 -0700
Subject: [PATCH 04/13] mv append ytdlp_outlinks

---
 brozzler/worker.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/brozzler/worker.py b/brozzler/worker.py
index c03d21c..ef29434 100644
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@@ -255,6 +255,7 @@ class BrozzlerWorker:
         if enable_youtube_dl and ydl.should_ytdlp(page):
             try:
                 ydl_outlinks = ydl.do_youtube_dl(self, site, page)
+                outlinks.update(ydl_outlinks)
             except brozzler.ReachedLimit as e:
                 raise
             except brozzler.ShutdownRequested:
                 raise
@@ -278,8 +279,6 @@ class BrozzlerWorker:
                     self.logger.error(
                         "youtube_dl raised exception on %s", page, exc_info=True
                     )
-
-        outlinks.update(ydl_outlinks)
         return outlinks
 
     def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None):

From 737770e3ba056423d61ee81ca6b660336884b3a0 Mon Sep 17 00:00:00 2001
From: Barbara Miller
Date: Wed, 17 Apr 2024 10:48:03 -0700
Subject: [PATCH 05/13] locally black'd, to avoid github formatting error?

---
 brozzler/ydl.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/brozzler/ydl.py b/brozzler/ydl.py
index 1e8ae30..627da6d 100644
--- a/brozzler/ydl.py
+++ b/brozzler/ydl.py
@@ -32,8 +32,9 @@ import threading
 thread_local = threading.local()
 
+
 def is_html_maybe(url):
-    skip_url_exts = ['pdf', 'jpg', 'jpeg', 'png', 'gif', 'mp4', 'mpeg']
+    skip_url_exts = ["pdf", "jpg", "jpeg", "png", "gif", "mp4", "mpeg"]
 
     parsed_url = urlparse(url)
     base_url, ext = os.path.splitext(parsed_url.path)
     ext = ext[1:]
@@ -48,7 +49,9 @@ def should_ytdlp(page):
     ytdlp_url = page.redirect_url if page.redirect_url else page.url
 
     if not is_html_maybe(ytdlp_url):
-        logging.warning("skipping yt-dlp for %s due to unsupported extension", ytdlp_url)
+        logging.warning(
+            "skipping yt-dlp for %s due to unsupported extension", ytdlp_url
+        )
         return False
 
     return True

From f2c89d1c18cb0e060549dac07bc7be9e489f6621 Mon Sep 17 00:00:00 2001
From: Barbara Miller
Date: Wed, 17 Apr 2024 17:23:36 -0700
Subject: [PATCH 06/13] skip more exts, plus chrome-error

---
 brozzler/ydl.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/brozzler/ydl.py b/brozzler/ydl.py
index 627da6d..635839b 100644
--- a/brozzler/ydl.py
+++ b/brozzler/ydl.py
@@ -34,7 +34,10 @@ thread_local = threading.local()
 
 
 def is_html_maybe(url):
-    skip_url_exts = ["pdf", "jpg", "jpeg", "png", "gif", "mp4", "mpeg"]
+    if "chrome-error:" in url:
+        return False
+
+    skip_url_exts = ["pdf", "jpg", "jpeg", "png", "gif", "mp3", "mp4", "mpeg", "css", "js"]
 
     parsed_url = urlparse(url)
     base_url, ext = os.path.splitext(parsed_url.path)

From 700c80ba903250164d128a2cc10f8a549b4b4a7a Mon Sep 17 00:00:00 2001
From: Barbara Miller
Date: Thu, 18 Apr 2024 15:41:00 -0700
Subject: [PATCH 07/13] use page headers to browse or fetch

---
 brozzler/worker.py | 112 ++++++++++++++++++++++++++++++++-------------
 brozzler/ydl.py    |  22 +--------
 2 files changed, 80 insertions(+), 54 deletions(-)

diff --git a/brozzler/worker.py b/brozzler/worker.py
index ef29434..259ccd6 100644
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@@ -244,43 +244,89 @@ class BrozzlerWorker:
         self.logger.info("brozzling {}".format(page))
         outlinks = set()
 
-        try:
-            browser_outlinks = self._browse_page(
-                browser, site, page, on_screenshot, on_request
-            )
-            outlinks.update(browser_outlinks)
-        except brozzler.PageInterstitialShown:
-            self.logger.info("page interstitial shown (http auth): %s", page)
+        self._get_page_headers(page)
 
-        if enable_youtube_dl and ydl.should_ytdlp(page):
+        if self._needs_browsing(page):
+            self.logger.info("needs browsing: %s", page)
             try:
-                ydl_outlinks = ydl.do_youtube_dl(self, site, page)
-                outlinks.update(ydl_outlinks)
-            except brozzler.ReachedLimit as e:
-                raise
-            except brozzler.ShutdownRequested:
-                raise
-            except brozzler.ProxyError:
-                raise
-            except Exception as e:
-                if (
-                    hasattr(e, "exc_info")
-                    and len(e.exc_info) >= 2
-                    and hasattr(e.exc_info[1], "code")
-                    and e.exc_info[1].code == 430
-                ):
-                    self.logger.info(
-                        "youtube-dl got %s %s processing %s",
-                        e.exc_info[1].code,
-                        e.exc_info[1].msg,
-                        page.url,
-                    )
-                else:
-                    self.logger.error(
-                        "youtube_dl raised exception on %s", page, exc_info=True
-                    )
+                browser_outlinks = self._browse_page(
+                    browser, site, page, on_screenshot, on_request
+                )
+                outlinks.update(browser_outlinks)
+            except brozzler.PageInterstitialShown:
+                self.logger.info("page interstitial shown (http auth): %s", page)
+
+            if enable_youtube_dl and ydl.should_ytdlp(page):
+                try:
+                    ydl_outlinks = ydl.do_youtube_dl(self, site, page)
+                    outlinks.update(ydl_outlinks)
+                except brozzler.ReachedLimit as e:
+                    raise
+                except brozzler.ShutdownRequested:
+                    raise
+                except brozzler.ProxyError:
+                    raise
+                except Exception as e:
+                    if (
+                        hasattr(e, "exc_info")
+                        and len(e.exc_info) >= 2
+                        and hasattr(e.exc_info[1], "code")
+                        and e.exc_info[1].code == 430
+                    ):
+                        self.logger.info(
+                            "youtube-dl got %s %s processing %s",
+                            e.exc_info[1].code,
+                            e.exc_info[1].msg,
+                            page.url,
+                        )
+                    else:
+                        self.logger.error(
+                            "youtube_dl raised exception on %s", page, exc_info=True
+                        )
+        else:
+            self.logger.info("needs fetch: %s", page)
+            self._fetch_url(site, page=page)
         return outlinks
 
+    def _get_page_headers(self, page):
+        with requests.get(page.url, stream=True) as r:
+            content_type_header = content_length_header = last_modified_header = None
+            if "Content-Type" in r.headers:
+                content_type_header = "Content-Type"
+            elif "content-length" in r.headers:
+                content_type_header = "content-length"
+            elif "CONTENT-LENGTH" in r.headers:
+                content_type_header = "CONTENT-LENGTH"
+            if content_type_header:
+                page.content_type = r.headers[content_type_header]
+                self.logger.info("url %s content_type is %s", page.url, page.content_type)
+
+            if "Content-Length" in r.headers:
+                content_length_header = "Content-Length"
+            elif "content-length" in r.headers:
+                content_length_header = "content-length"
+            elif "CONTENT-LENGTH" in r.headers:
+                content_length_header = "CONTENT-LENGTH"
+            if content_length_header:
+                page.content_length = int(r.headers[content_length_header])
+                self.logger.info("url %s content_length is %s", page.url, page.content_length)
+
+            if "Last-Modified" in r.headers:
+                last_modified_header = "Last-Modified"
+            elif "Last-Modified" in r.headers:
+                last_modified_header = "Last-Modified"
+            elif "LAST-MODIFIED" in r.headers:
+                last_modified_header = "LAST-MODIFIED"
+            if last_modified_header:
+                page.last_modified = r.headers[last_modified_header]
+                self.logger.info("url %s last_modified is %s", page.url, page.last_modified)
+
+    def _needs_browsing(self, page):
+        if page.content_type and "html" not in page.content_type:
+            return False
+        return True
+
+
     def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None):
         def _on_screenshot(screenshot_jpeg):
             if on_screenshot:
diff --git a/brozzler/ydl.py b/brozzler/ydl.py
index 635839b..af0c313 100644
--- a/brozzler/ydl.py
+++ b/brozzler/ydl.py
@@ -21,7 +21,6 @@ import yt_dlp
 from yt_dlp.utils import match_filter_func
 import brozzler
 import urllib.request
-from urllib.parse import urlparse
 import tempfile
 import urlcanon
 import os
@@ -32,29 +31,10 @@ import threading
 
 thread_local = threading.local()
 
-
-def is_html_maybe(url):
-    if "chrome-error:" in url:
-        return False
-
-    skip_url_exts = ["pdf", "jpg", "jpeg", "png", "gif", "mp3", "mp4", "mpeg", "css", "js"]
-
-    parsed_url = urlparse(url)
-    base_url, ext = os.path.splitext(parsed_url.path)
-    ext = ext[1:]
-    for skip in skip_url_exts:
-        if ext.startswith(skip):
-            return False
-    return True
-
-
 def should_ytdlp(page):
     ytdlp_url = page.redirect_url if page.redirect_url else page.url
 
-    if not is_html_maybe(ytdlp_url):
-        logging.warning(
-            "skipping yt-dlp for %s due to unsupported extension", ytdlp_url
-        )
+    if "chrome-error:" in ytdlp_url:
         return False
 
     return True

From 423f05b8417beeb2018bd48d955ac4fb4e770e96 Mon Sep 17 00:00:00 2001
From: Barbara Miller
Date: Thu, 18 Apr 2024 15:44:35 -0700
Subject: [PATCH 08/13] black'd

---
 brozzler/worker.py | 13 +++++++++----
 brozzler/ydl.py    |  1 +
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/brozzler/worker.py b/brozzler/worker.py
index 259ccd6..e9015fd 100644
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@@ -299,7 +299,9 @@ class BrozzlerWorker:
                 content_type_header = "CONTENT-LENGTH"
             if content_type_header:
                 page.content_type = r.headers[content_type_header]
-                self.logger.info("url %s content_type is %s", page.url, page.content_type)
+                self.logger.info(
+                    "url %s content_type is %s", page.url, page.content_type
+                )
 
             if "Content-Length" in r.headers:
                 content_length_header = "Content-Length"
@@ -309,7 +311,9 @@ class BrozzlerWorker:
                 content_length_header = "CONTENT-LENGTH"
             if content_length_header:
                 page.content_length = int(r.headers[content_length_header])
-                self.logger.info("url %s content_length is %s", page.url, page.content_length)
+                self.logger.info(
+                    "url %s content_length is %s", page.url, page.content_length
+                )
 
             if "Last-Modified" in r.headers:
                 last_modified_header = "Last-Modified"
@@ -319,14 +323,15 @@ class BrozzlerWorker:
                 last_modified_header = "LAST-MODIFIED"
             if last_modified_header:
                 page.last_modified = r.headers[last_modified_header]
-                self.logger.info("url %s last_modified is %s", page.url, page.last_modified)
+                self.logger.info(
+                    "url %s last_modified is %s", page.url, page.last_modified
+                )
 
     def _needs_browsing(self, page):
         if page.content_type and "html" not in page.content_type:
             return False
         return True
 
-
     def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None):
         def _on_screenshot(screenshot_jpeg):
             if on_screenshot:
diff --git a/brozzler/ydl.py b/brozzler/ydl.py
index af0c313..3ac4bb2 100644
--- a/brozzler/ydl.py
+++ b/brozzler/ydl.py
@@ -31,6 +31,7 @@ import threading
 
 thread_local = threading.local()
 
+
 def should_ytdlp(page):
     ytdlp_url = page.redirect_url if page.redirect_url else page.url
 

From 12e49bf29e2ffc2b3e881f82ed446a0913d8545f Mon Sep 17 00:00:00 2001
From: Barbara Miller
Date: Thu, 18 Apr 2024 16:19:13 -0700
Subject: [PATCH 09/13] add / check status code for yt-dlp

---
 brozzler/worker.py | 2 ++
 brozzler/ydl.py    | 3 +++
 2 files changed, 5 insertions(+)

diff --git a/brozzler/worker.py b/brozzler/worker.py
index e9015fd..b2f5a1a 100644
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@@ -253,6 +253,8 @@ class BrozzlerWorker:
                     browser, site, page, on_screenshot, on_request
                 )
                 outlinks.update(browser_outlinks)
+                page.status_code = browser.websock_thread.page_status
+                self.logger.info("url %s status code %s", page.url, page.status_code)
             except brozzler.PageInterstitialShown:
                 self.logger.info("page interstitial shown (http auth): %s", page)
 
diff --git a/brozzler/ydl.py b/brozzler/ydl.py
index 3ac4bb2..a5a7e89 100644
--- a/brozzler/ydl.py
+++ b/brozzler/ydl.py
@@ -33,6 +33,9 @@ thread_local = threading.local()
 
 
 def should_ytdlp(page):
+    if page.status_code != 200:
+        return False
+
     ytdlp_url = page.redirect_url if page.redirect_url else page.url
 
     if "chrome-error:" in ytdlp_url:
         return False

From 5cc056cc7ba993b30da0209d70518250d62c4cdc Mon Sep 17 00:00:00 2001
From: Barbara Miller
Date: Fri, 19 Apr 2024 12:44:45 -0700
Subject: [PATCH 10/13] use requests' CaseInsensitiveDict

---
 brozzler/worker.py | 40 +++++++++++-----------------------------
 1 file changed, 11 insertions(+), 29 deletions(-)

diff --git a/brozzler/worker.py b/brozzler/worker.py
index b2f5a1a..2772db3 100644
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@@ -258,7 +258,7 @@ class BrozzlerWorker:
             except brozzler.PageInterstitialShown:
                 self.logger.info("page interstitial shown (http auth): %s", page)
 
-            if enable_youtube_dl and ydl.should_ytdlp(page):
+            if enable_youtube_dl and ydl.should_ytdlp(page, site):
                 try:
                     ydl_outlinks = ydl.do_youtube_dl(self, site, page)
                     outlinks.update(ydl_outlinks)
@@ -291,42 +291,24 @@ class BrozzlerWorker:
         return outlinks
 
     def _get_page_headers(self, page):
+        page.content_type = page.content_length = page.last_modified = None
         with requests.get(page.url, stream=True) as r:
-            content_type_header = content_length_header = last_modified_header = None
-            if "Content-Type" in r.headers:
-                content_type_header = "Content-Type"
-            elif "content-length" in r.headers:
-                content_type_header = "content-length"
-            elif "CONTENT-LENGTH" in r.headers:
-                content_type_header = "CONTENT-LENGTH"
-            if content_type_header:
-                page.content_type = r.headers[content_type_header]
+            if "content-type" in r.headers:
+                page.content_type = r.headers["content-type"]
                 self.logger.info(
-                    "url %s content_type is %s", page.url, page.content_type
+                    "content_type: %s for url %s", page.content_type, page.url
                 )
 
-            if "Content-Length" in r.headers:
-                content_length_header = "Content-Length"
-            elif "content-length" in r.headers:
-                content_length_header = "content-length"
-            elif "CONTENT-LENGTH" in r.headers:
-                content_length_header = "CONTENT-LENGTH"
-            if content_length_header:
-                page.content_length = int(r.headers[content_length_header])
+            if "content-length" in r.headers:
+                page.content_length = int(r.headers["content-length"])
                 self.logger.info(
-                    "url %s content_length is %s", page.url, page.content_length
+                    "content_length: %s for url %s", page.content_length, page.url
                 )
 
-            if "Last-Modified" in r.headers:
-                last_modified_header = "Last-Modified"
-            elif "Last-Modified" in r.headers:
-                last_modified_header = "Last-Modified"
-            elif "LAST-MODIFIED" in r.headers:
-                last_modified_header = "LAST-MODIFIED"
-            if last_modified_header:
-                page.last_modified = r.headers[last_modified_header]
+            if "last-modified" in r.headers:
+                page.last_modified = r.headers["last-modified"]
                 self.logger.info(
-                    "url %s last_modified is %s", page.url, page.last_modified
+                    "last_modified: %s for url %s", page.last_modified, page.url
                 )
 
     def _needs_browsing(self, page):

From 487a7009f08721f4d7c909e2d8821fd800d94417 Mon Sep 17 00:00:00 2001
From: Barbara Miller
Date: Tue, 23 Apr 2024 12:41:38 -0700
Subject: [PATCH 11/13] rm remaining spy/fetch

---
 brozzler/ydl.py | 81 +++++++++----------------------------------
 1 file changed, 15 insertions(+), 66 deletions(-)

diff --git a/brozzler/ydl.py b/brozzler/ydl.py
index a5a7e89..4c4b7ad 100644
--- a/brozzler/ydl.py
+++ b/brozzler/ydl.py
@@ -32,7 +32,7 @@ import threading
 
 thread_local = threading.local()
 
 
-def should_ytdlp(page):
+def should_ytdlp(page, site):
     if page.status_code != 200:
         return False
 
@@ -57,35 +57,12 @@ class ExtraHeaderAdder(urllib.request.BaseHandler):
         return req
 
 
-class YoutubeDLSpy(urllib.request.BaseHandler):
-    logger = logging.getLogger(__module__ + "." + __qualname__)
-
-    def __init__(self):
-        self.reset()
-
-    def _http_response(self, request, response):
-        fetch = {
-            "url": request.full_url,
-            "method": request.get_method(),
-            "response_code": response.code,
-            "response_headers": response.headers,
-        }
-        self.fetches.append(fetch)
-        return response
-
-    http_response = https_response = _http_response
-
-    def reset(self):
-        self.fetches = []
-
-
 def _build_youtube_dl(worker, destdir, site, page):
     """
     Builds a yt-dlp `yt_dlp.YoutubeDL` for brozzling `site` with `worker`.
 
     The `YoutubeDL` instance does a few special brozzler-specific things:
 
-    - keeps track of urls fetched using a `YoutubeDLSpy`
     - periodically updates `site.last_claimed` in rethinkdb
     - pushes captured video to warcprox using a WARCPROX_WRITE_RECORD request
     - some logging
@@ -94,6 +71,7 @@ def _build_youtube_dl(worker, destdir, site, page):
         worker (brozzler.BrozzlerWorker): the calling brozzler worker
         destdir (str): where to save downloaded videos
         site (brozzler.Site): the site we are brozzling
+        page (brozzler.Page): the page we are brozzling
 
     Returns:
         a yt-dlp `yt_dlp.YoutubeDL` instance
@@ -260,7 +238,7 @@ def _build_youtube_dl(worker, destdir, site, page):
         "match_filter": match_filter_func("!is_live"),
         "extractor_args": {"youtube": {"skip": ["dash", "hls"]}},
         # --cache-dir local or..
-        # this looked like a problem with nsf-mounted homedir, shouldn't be a problem for brozzler on focal?
+        # this looked like a problem with nsf-mounted homedir, maybe not a problem for brozzler on focal?
         "cache_dir": "/home/archiveit",
         "logger": logging.getLogger("yt_dlp"),
         "verbose": False,
@@ -274,56 +252,27 @@ def _build_youtube_dl(worker, destdir, site, page):
     ydl = _YoutubeDL(ydl_opts)
     if site.extra_headers():
         ydl._opener.add_handler(ExtraHeaderAdder(site.extra_headers(page)))
-    ydl.fetch_spy = YoutubeDLSpy()
     ydl.pushed_videos = []
-    ydl._opener.add_handler(ydl.fetch_spy)
+
     return ydl
 
 
-def _remember_videos(page, fetches, pushed_videos=None):
+def _remember_videos(page, pushed_videos=None):
     """
     Saves info about videos captured by yt-dlp in `page.videos`.
     """
     if not "videos" in page:
         page.videos = []
-    for fetch in fetches or []:
-        content_type = fetch["response_headers"].get_content_type()
-        if (
-            content_type.startswith("video/")
-            # skip manifests of DASH segmented video -
-            # see https://github.com/internetarchive/brozzler/pull/70
-            and content_type != "video/vnd.mpeg.dash.mpd"
-            and fetch["method"] == "GET"
-            and fetch["response_code"] in (200, 206)
-        ):
-            video = {
-                "blame": "youtube-dl",
-                "url": fetch["url"],
-                "response_code": fetch["response_code"],
-                "content-type": content_type,
-            }
-            if "content-length" in fetch["response_headers"]:
-                video["content-length"] = int(
-                    fetch["response_headers"]["content-length"]
-                )
-            if "content-range" in fetch["response_headers"]:
-                # skip chunked youtube video
-                if "googlevideo.com/videoplayback" in fetch["url"]:
-                    continue
-                video["content-range"] = fetch["response_headers"]["content-range"]
-            logging.debug("embedded video %s", video)
-            page.videos.append(video)
     for pushed_video in pushed_videos or []:
-        if pushed_video["content-type"].startswith("video/"):
-            video = {
-                "blame": "youtube-dl",
-                "url": pushed_video["url"],
-                "response_code": pushed_video["response_code"],
-                "content-type": pushed_video["content-type"],
-                "content-length": pushed_video["content-length"],
-            }
-            logging.debug("embedded video %s", video)
-            page.videos.append(video)
+        video = {
+            "blame": "youtube-dl",
+            "url": pushed_video["url"],
+            "response_code": pushed_video["response_code"],
+            "content-type": pushed_video["content-type"],
+            "content-length": pushed_video["content-length"],
+        }
+        logging.debug("embedded video %s", video)
+        page.videos.append(video)
 
 
 def _try_youtube_dl(worker, ydl, site, page):
@@ -339,7 +288,7 @@ def _try_youtube_dl(worker, ydl, site, page):
             ie_result = ydl.sanitize_info(
                 ydl.extract_info(str(urlcanon.whatwg(ytdlp_url)))
             )
-        _remember_videos(page, ydl.fetch_spy.fetches, ydl.pushed_videos)
+        _remember_videos(page, ydl.pushed_videos)
         if worker._using_warcprox(site):
             info_json = json.dumps(ie_result, sort_keys=True, indent=4)
             logging.info(
From 7764c3f6d71b3a517c641594dd88f70b8d9c649f Mon Sep 17 00:00:00 2001
From: Barbara Miller
Date: Tue, 23 Apr 2024 16:00:18 -0700
Subject: [PATCH 12/13] add comment

---
 brozzler/ydl.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/brozzler/ydl.py b/brozzler/ydl.py
index 4c4b7ad..9caf662 100644
--- a/brozzler/ydl.py
+++ b/brozzler/ydl.py
@@ -33,6 +33,7 @@ thread_local = threading.local()
 
 
 def should_ytdlp(page, site):
+    # called only after we've passed needs_browsing() check
     if page.status_code != 200:
         return False

From 2d183c7d0cb4c14ff2432b4849efc8aa067dc4dc Mon Sep 17 00:00:00 2001
From: Barbara Miller
Date: Tue, 23 Apr 2024 16:01:35 -0700
Subject: [PATCH 13/13] if not self._needs_browsing; _get_page_headers comment

---
 brozzler/worker.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/brozzler/worker.py b/brozzler/worker.py
index 2772db3..2bad677 100644
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@@ -246,7 +246,10 @@ class BrozzlerWorker:
 
         self._get_page_headers(page)
 
-        if self._needs_browsing(page):
+        if not self._needs_browsing(page):
+            self.logger.info("needs fetch: %s", page)
+            self._fetch_url(site, page=page)
+        else:
             self.logger.info("needs browsing: %s", page)
             try:
                 browser_outlinks = self._browse_page(
@@ -285,13 +288,12 @@ class BrozzlerWorker:
                         self.logger.error(
                             "youtube_dl raised exception on %s", page, exc_info=True
                         )
-        else:
-            self.logger.info("needs fetch: %s", page)
-            self._fetch_url(site, page=page)
         return outlinks
 
     def _get_page_headers(self, page):
         page.content_type = page.content_length = page.last_modified = None
+        # bypassing warcprox, requests' stream=True defers downloading the body of the response
+        # see https://docs.python-requests.org/en/latest/user/advanced/#body-content-workflow
        with requests.get(page.url, stream=True) as r:
            if "content-type" in r.headers:
                page.content_type = r.headers["content-type"]