From faa06b449d4f258d95944ef1665342798beeed2d Mon Sep 17 00:00:00 2001
From: Barbara Miller
Date: Tue, 2 Apr 2024 16:37:07 -0700
Subject: [PATCH 01/13] run yt-dlp after browse_page

---
 brozzler/worker.py | 55 ++++++++++------------------------------
 brozzler/ydl.py    | 63 +++++++++++++++-------------------------------
 2 files changed, 33 insertions(+), 85 deletions(-)

diff --git a/brozzler/worker.py b/brozzler/worker.py
index 86977cf..c03d21c 100644
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@@ -3,7 +3,7 @@ brozzler/worker.py - BrozzlerWorker brozzles pages from the frontier, meaning
 it runs yt-dlp on them, browses them and runs behaviors if appropriate,
 scopes and adds outlinks to the frontier
 
-Copyright (C) 2014-2023 Internet Archive
+Copyright (C) 2014-2024 Internet Archive
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -242,11 +242,19 @@ class BrozzlerWorker:
         enable_youtube_dl=True,
     ):
         self.logger.info("brozzling {}".format(page))
-        ydl_fetches = None
         outlinks = set()
-        if enable_youtube_dl and not page.url.lower().endswith(".pdf"):
+
+        try:
+            browser_outlinks = self._browse_page(
+                browser, site, page, on_screenshot, on_request
+            )
+            outlinks.update(browser_outlinks)
+        except brozzler.PageInterstitialShown:
+            self.logger.info("page interstitial shown (http auth): %s", page)
+
+        if enable_youtube_dl and ydl.should_ytdlp(page):
             try:
-                ydl_fetches, outlinks = ydl.do_youtube_dl(self, site, page)
+                ydl_outlinks = ydl.do_youtube_dl(self, site, page)
             except brozzler.ReachedLimit as e:
                 raise
             except brozzler.ShutdownRequested:
@@ -271,22 +279,7 @@ class BrozzlerWorker:
                         "youtube_dl raised exception on %s", page, exc_info=True
                     )
 
-        if self._needs_browsing(page, ydl_fetches):
-            self.logger.info("needs browsing: %s", page)
-            try:
-                browser_outlinks = self._browse_page(
-                    browser, site, page, on_screenshot, on_request
-                )
-                outlinks.update(browser_outlinks)
-            except brozzler.PageInterstitialShown:
-                self.logger.info("page interstitial shown (http auth): %s", page)
-        else:
-            if not self._already_fetched(page, ydl_fetches):
-                self.logger.info("needs fetch: %s", page)
-                self._fetch_url(site, page=page)
-            else:
-                self.logger.info("already fetched: %s", page)
-
+        outlinks.update(ydl_outlinks)
         return outlinks
 
     def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None):
@@ -415,28 +408,6 @@ class BrozzlerWorker:
         except requests.exceptions.ProxyError as e:
             raise brozzler.ProxyError("proxy error fetching %s" % url) from e
 
-    def _needs_browsing(self, page, ydl_fetches):
-        if ydl_fetches:
-            final_bounces = ydl.final_bounces(ydl_fetches, page.url)
-            if not final_bounces:
-                return True
-            for txn in final_bounces:
-                if txn["response_headers"].get_content_type() in [
-                    "text/html",
-                    "application/xhtml+xml",
-                ]:
-                    return True
-            return False
-        else:
-            return True
-
-    def _already_fetched(self, page, ydl_fetches):
-        if ydl_fetches:
-            for fetch in ydl.final_bounces(ydl_fetches, page.url):
-                if fetch["method"] == "GET" and fetch["response_code"] == 200:
-                    return True
-        return False
-
     def brozzle_site(self, browser, site):
         try:
             site.last_claimed_by = "%s:%s" % (socket.gethostname(), browser.chrome.port)
diff --git a/brozzler/ydl.py b/brozzler/ydl.py
index 4281d4a..a2d0405 100644
--- a/brozzler/ydl.py
+++ b/brozzler/ydl.py
@@ -1,7 +1,7 @@
 """
 brozzler/ydl.py - youtube-dl / yt-dlp support for brozzler
 
-Copyright (C) 2023 Internet Archive
+Copyright (C) 2024 Internet Archive
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -31,6 +31,20 @@ import threading
 
 thread_local = threading.local()
 
+def should_ytdlp(page):
+    skip_url_types = ['pdf', 'jpg', 'jpeg', 'png', 'gif', 'mp4', 'mpeg']
+    if page.redirect_url:
+        ytdlp_url = page.redirect_url
+    else:
+        ytdlp_url = page.url
+
+    for t in skip_url_types:
+        if t in ytdlp_url:
+            logging.warning("skipping yt-dlp for %s due to unsupported guessed content type", ytdlp_url)
+            return False
+
+    return True
+
 
 class ExtraHeaderAdder(urllib.request.BaseHandler):
     def __init__(self, extra_headers):
@@ -67,35 +81,6 @@ class YoutubeDLSpy(urllib.request.BaseHandler):
         self.fetches = []
 
 
-def final_bounces(fetches, url):
-    """
-    Resolves redirect chains in `fetches` and returns a list of fetches
-    representing the final redirect destinations of the given url. There could
-    be more than one if for example youtube-dl hit the same url with HEAD and
-    then GET requests.
-    """
-    redirects = {}
-    for fetch in fetches:
-        # XXX check http status 301,302,303,307? check for "uri" header
-        # as well as "location"? see urllib.request.HTTPRedirectHandler
-        if "location" in fetch["response_headers"]:
-            redirects[fetch["url"]] = fetch
-
-    final_url = url
-    while final_url in redirects:
-        fetch = redirects.pop(final_url)
-        final_url = urllib.parse.urljoin(
-            fetch["url"], fetch["response_headers"]["location"]
-        )
-
-    final_bounces = []
-    for fetch in fetches:
-        if fetch["url"] == final_url:
-            final_bounces.append(fetch)
-
-    return final_bounces
-
-
 def _build_youtube_dl(worker, destdir, site, page):
     """
     Builds a yt-dlp `yt_dlp.YoutubeDL` for brozzling `site` with `worker`.
@@ -183,8 +168,8 @@ def _build_youtube_dl(worker, destdir, site, page):
             else:
                 url = info_dict.get("url", "")
 
-            # skip urls ending .m3u8, to avoid duplicates handled by FixupM3u8
-            if url.endswith(".m3u8") or url == "":
+            # skip urls containing .m3u8, to avoid duplicates handled by FixupM3u8
+            if url == "" or ".m3u8" in url:
                 return
 
             size = os.path.getsize(info_dict["filepath"])
@@ -408,15 +393,7 @@ def do_youtube_dl(worker, site, page):
         page (brozzler.Page): the page we are brozzling
 
     Returns:
-        tuple with two entries:
-            `list` of `dict`: with info about urls fetched:
-                [{
-                    'url': ...,
-                    'method': ...,
-                    'response_code': ...,
-                    'response_headers': ...,
-                }, ...]
-            `list` of `str`: outlink urls
+        `list` of `str`: outlink urls
     """
     with tempfile.TemporaryDirectory(prefix="brzl-ydl-") as tempdir:
         ydl = _build_youtube_dl(worker, tempdir, site, page)
@@ -431,5 +408,5 @@ def do_youtube_dl(worker, site, page):
             "https://www.youtube.com/watch?v=%s" % e["id"]
             for e in ie_result.get("entries_no_dl", [])
         }
-        # any outlinks for other cases?
-        return ydl.fetch_spy.fetches, outlinks
+        # any outlinks for other cases? soundcloud, maybe?
+        return outlinks

From 1bc9a544ef46ec89fb999cb5747a92733dd4bf69 Mon Sep 17 00:00:00 2001
From: Barbara Miller
Date: Thu, 4 Apr 2024 12:24:45 -0700
Subject: [PATCH 02/13] is_html_maybe

---
 brozzler/ydl.py | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/brozzler/ydl.py b/brozzler/ydl.py
index a2d0405..557c107 100644
--- a/brozzler/ydl.py
+++ b/brozzler/ydl.py
@@ -21,6 +21,7 @@ import yt_dlp
 from yt_dlp.utils import match_filter_func
 import brozzler
 import urllib.request
+from urllib.parse import urlparse
 import tempfile
 import urlcanon
 import os
@@ -31,17 +32,24 @@ import threading
 
 thread_local = threading.local()
 
-def should_ytdlp(page):
-    skip_url_types = ['pdf', 'jpg', 'jpeg', 'png', 'gif', 'mp4', 'mpeg']
-    if page.redirect_url:
-        ytdlp_url = page.redirect_url
-    else:
-        ytdlp_url = page.url
+def is_html_maybe(url):
+    skip_url_exts = ['pdf', 'jpg', 'jpeg', 'png', 'gif', 'mp4', 'mpeg']
 
-    for t in skip_url_types:
-        if t in ytdlp_url:
-            logging.warning("skipping yt-dlp for %s due to unsupported guessed content type", ytdlp_url)
+    parsed_url = urlparse(url)
+    base_url, ext = os.path.splitext(parsed_url.path)
+    ext = ext[1:]
+    for skip in skip_url_exts:
+        if ext.startswith(skip):
             return False
+    return True
+
+
+def should_ytdlp(page):
+    ytdlp_url = page.redirect_url if page.redirect_url else page.url
+
+    if not is_html_maybe(ytdlp_url):
+        logging.warning("skipping yt-dlp for %s due to unsupported extension", ytdlp_url)
+        return False
 
     return True

From 0b2650963788c8eafda6bdffa0c6132cde220b1a Mon Sep 17 00:00:00 2001
From: Barbara Miller
Date: Wed, 10 Apr 2024 13:41:03 -0700
Subject: [PATCH 03/13] more ytdlp_url

---
 brozzler/ydl.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/brozzler/ydl.py b/brozzler/ydl.py
index 557c107..1e8ae30 100644
--- a/brozzler/ydl.py
+++ b/brozzler/ydl.py
@@ -337,8 +337,9 @@ def _remember_videos(page, fetches, pushed_videos=None):
 
 
 def _try_youtube_dl(worker, ydl, site, page):
+    ytdlp_url = page.redirect_url if page.redirect_url else page.url
     try:
-        logging.info("trying yt-dlp on %s", page)
+        logging.info("trying yt-dlp on %s", ytdlp_url)
         with brozzler.thread_accept_exceptions():
             # we do whatwg canonicalization here to avoid 
Date: Wed, 10 Apr 2024 13:42:10 -0700
Subject: [PATCH 04/13] mv append ytdlp_outlinks

---
 brozzler/worker.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/brozzler/worker.py b/brozzler/worker.py
index c03d21c..ef29434 100644
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@@ -255,6 +255,7 @@ class BrozzlerWorker:
         if enable_youtube_dl and ydl.should_ytdlp(page):
             try:
                 ydl_outlinks = ydl.do_youtube_dl(self, site, page)
+                outlinks.update(ydl_outlinks)
             except brozzler.ReachedLimit as e:
                 raise
             except brozzler.ShutdownRequested:
                 raise
@@ -278,8 +279,6 @@ class BrozzlerWorker:
                     self.logger.error(
                         "youtube_dl raised exception on %s", page, exc_info=True
                     )
-
-        outlinks.update(ydl_outlinks)
         return outlinks
 
     def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None):

From 737770e3ba056423d61ee81ca6b660336884b3a0 Mon Sep 17 00:00:00 2001
From: Barbara Miller
Date: Wed, 17 Apr 2024 10:48:03 -0700
Subject: [PATCH 05/13] locally black'd, to avoid github formatting error?

---
 brozzler/ydl.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/brozzler/ydl.py b/brozzler/ydl.py
index 1e8ae30..627da6d 100644
--- a/brozzler/ydl.py
+++ b/brozzler/ydl.py
@@ -32,8 +32,9 @@ import threading
 thread_local = threading.local()
 
+
 def is_html_maybe(url):
-    skip_url_exts = ['pdf', 'jpg', 'jpeg', 'png', 'gif', 'mp4', 'mpeg']
+    skip_url_exts = ["pdf", "jpg", "jpeg", "png", "gif", "mp4", "mpeg"]
 
     parsed_url = urlparse(url)
     base_url, ext = os.path.splitext(parsed_url.path)
     ext = ext[1:]
@@ -48,7 +49,9 @@ def should_ytdlp(page):
     ytdlp_url = page.redirect_url if page.redirect_url else page.url
 
     if not is_html_maybe(ytdlp_url):
-        logging.warning("skipping yt-dlp for %s due to unsupported extension", ytdlp_url)
+        logging.warning(
+            "skipping yt-dlp for %s due to unsupported extension", ytdlp_url
+        )
         return False
 
     return True

From f2c89d1c18cb0e060549dac07bc7be9e489f6621 Mon Sep 17 00:00:00 2001
From: Barbara Miller
Date: Wed, 17 Apr 2024 17:23:36 -0700
Subject: [PATCH 06/13] skip more exts, plus chrome-error

---
 brozzler/ydl.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/brozzler/ydl.py b/brozzler/ydl.py
index 627da6d..635839b 100644
--- a/brozzler/ydl.py
+++ b/brozzler/ydl.py
@@ -34,7 +34,10 @@ thread_local = threading.local()
 
 
 def is_html_maybe(url):
-    skip_url_exts = ["pdf", "jpg", "jpeg", "png", "gif", "mp4", "mpeg"]
+    if "chrome-error:" in url:
+        return False
+
+    skip_url_exts = ["pdf", "jpg", "jpeg", "png", "gif", "mp3", "mp4", "mpeg", "css", "js"]
 
     parsed_url = urlparse(url)
     base_url, ext = os.path.splitext(parsed_url.path)

From 700c80ba903250164d128a2cc10f8a549b4b4a7a Mon Sep 17 00:00:00 2001
From: Barbara Miller
Date: Thu, 18 Apr 2024 15:41:00 -0700
Subject: [PATCH 07/13] use page headers to browse or fetch

---
 brozzler/worker.py | 112 ++++++++++++++++++++++++++++++++-------------
 brozzler/ydl.py    |  22 +--------
 2 files changed, 80 insertions(+), 54 deletions(-)

diff --git a/brozzler/worker.py b/brozzler/worker.py
index ef29434..259ccd6 100644
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@@ -244,43 +244,89 @@ class BrozzlerWorker:
         self.logger.info("brozzling {}".format(page))
         outlinks = set()
 
-        try:
-            browser_outlinks = self._browse_page(
-                browser, site, page, on_screenshot, on_request
-            )
-            outlinks.update(browser_outlinks)
-        except brozzler.PageInterstitialShown:
-            self.logger.info("page interstitial shown (http auth): %s", page)
+        self._get_page_headers(page)
 
-        if enable_youtube_dl and ydl.should_ytdlp(page):
+        if self._needs_browsing(page):
+            self.logger.info("needs browsing: %s", page)
             try:
-                ydl_outlinks = ydl.do_youtube_dl(self, site, page)
-                outlinks.update(ydl_outlinks)
-            except brozzler.ReachedLimit as e:
-                raise
-            except brozzler.ShutdownRequested:
-                raise
-            except brozzler.ProxyError:
-                raise
-            except Exception as e:
-                if (
-                    hasattr(e, "exc_info")
-                    and len(e.exc_info) >= 2
-                    and hasattr(e.exc_info[1], "code")
-                    and e.exc_info[1].code == 430
-                ):
-                    self.logger.info(
-                        "youtube-dl got %s %s processing %s",
-                        e.exc_info[1].code,
-                        e.exc_info[1].msg,
-                        page.url,
-                    )
-                else:
-                    self.logger.error(
-                        "youtube_dl raised exception on %s", page, exc_info=True
-                    )
+                browser_outlinks = self._browse_page(
+                    browser, site, page, on_screenshot, on_request
+                )
+                outlinks.update(browser_outlinks)
+            except brozzler.PageInterstitialShown:
+                self.logger.info("page interstitial shown (http auth): %s", page)
+
+            if enable_youtube_dl and ydl.should_ytdlp(page):
+                try:
+                    ydl_outlinks = ydl.do_youtube_dl(self, site, page)
+                    outlinks.update(ydl_outlinks)
+                except brozzler.ReachedLimit as e:
+                    raise
+                except brozzler.ShutdownRequested:
+                    raise
+                except brozzler.ProxyError:
+                    raise
+                except Exception as e:
+                    if (
+                        hasattr(e, "exc_info")
+                        and len(e.exc_info) >= 2
+                        and hasattr(e.exc_info[1], "code")
+                        and e.exc_info[1].code == 430
+                    ):
+                        self.logger.info(
+                            "youtube-dl got %s %s processing %s",
+                            e.exc_info[1].code,
+                            e.exc_info[1].msg,
+                            page.url,
+                        )
+                    else:
+                        self.logger.error(
+                            "youtube_dl raised exception on %s", page, exc_info=True
+                        )
+        else:
+            self.logger.info("needs fetch: %s", page)
+            self._fetch_url(site, page=page)
         return outlinks
 
+    def _get_page_headers(self, page):
+        with requests.get(page.url, stream=True) as r:
+            content_type_header = content_length_header = last_modified_header = None
+            if "Content-Type" in r.headers:
+                content_type_header = "Content-Type"
+            elif "content-length" in r.headers:
+                content_type_header = "content-length"
+            elif "CONTENT-LENGTH" in r.headers:
+                content_type_header = "CONTENT-LENGTH"
+            if content_type_header:
+                page.content_type = r.headers[content_type_header]
+                self.logger.info("url %s content_type is %s", page.url, page.content_type)
+
+            if "Content-Length" in r.headers:
+                content_length_header = "Content-Length"
+            elif "content-length" in r.headers:
+                content_length_header = "content-length"
+            elif "CONTENT-LENGTH" in r.headers:
+                content_length_header = "CONTENT-LENGTH"
+            if content_length_header:
+                page.content_length = int(r.headers[content_length_header])
+                self.logger.info("url %s content_length is %s", page.url, page.content_length)
+
+            if "Last-Modified" in r.headers:
+                last_modified_header = "Last-Modified"
+            elif "Last-Modified" in r.headers:
+                last_modified_header = "Last-Modified"
+            elif "LAST-MODIFIED" in r.headers:
+                last_modified_header = "LAST-MODIFIED"
+            if last_modified_header:
+                page.last_modified = r.headers[last_modified_header]
+                self.logger.info("url %s last_modified is %s", page.url, page.last_modified)
+
+    def _needs_browsing(self, page):
+        if page.content_type and "html" not in page.content_type:
+            return False
+        return True
+
+
     def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None):
         def _on_screenshot(screenshot_jpeg):
             if on_screenshot:
diff --git a/brozzler/ydl.py b/brozzler/ydl.py
index 635839b..af0c313 100644
--- a/brozzler/ydl.py
+++ b/brozzler/ydl.py
@@ -21,7 +21,6 @@ import yt_dlp
 from yt_dlp.utils import match_filter_func
 import brozzler
 import urllib.request
-from urllib.parse import urlparse
 import tempfile
 import urlcanon
 import os
@@ -32,29 +31,10 @@ import threading
 
 thread_local = threading.local()
 
-
-def is_html_maybe(url):
-    if "chrome-error:" in url:
-        return False
-
-    skip_url_exts = ["pdf", "jpg", "jpeg", "png", "gif", "mp3", "mp4", "mpeg", "css", "js"]
-
-    parsed_url = urlparse(url)
-    base_url, ext = os.path.splitext(parsed_url.path)
-    ext = ext[1:]
-    for skip in skip_url_exts:
-        if ext.startswith(skip):
-            return False
-    return True
-
-
 def should_ytdlp(page):
     ytdlp_url = page.redirect_url if page.redirect_url else page.url
 
-    if not is_html_maybe(ytdlp_url):
-        logging.warning(
-            "skipping yt-dlp for %s due to unsupported extension", ytdlp_url
-        )
+    if "chrome-error:" in ytdlp_url:
         return False
 
     return True

From 423f05b8417beeb2018bd48d955ac4fb4e770e96 Mon Sep 17 00:00:00 2001
From: Barbara Miller
Date: Thu, 18 Apr 2024 15:44:35 -0700
Subject: [PATCH 08/13] black'd

---
 brozzler/worker.py | 13 +++++++++----
 brozzler/ydl.py    |  1 +
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/brozzler/worker.py b/brozzler/worker.py
index 259ccd6..e9015fd 100644
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@@ -299,7 +299,9 @@ class BrozzlerWorker:
                 content_type_header = "CONTENT-LENGTH"
             if content_type_header:
                 page.content_type = r.headers[content_type_header]
-                self.logger.info("url %s content_type is %s", page.url, page.content_type)
+                self.logger.info(
+                    "url %s content_type is %s", page.url, page.content_type
+                )
 
             if "Content-Length" in r.headers:
                 content_length_header = "Content-Length"
@@ -309,7 +311,9 @@ class BrozzlerWorker:
                 content_length_header = "CONTENT-LENGTH"
             if content_length_header:
                 page.content_length = int(r.headers[content_length_header])
-                self.logger.info("url %s content_length is %s", page.url, page.content_length)
+                self.logger.info(
+                    "url %s content_length is %s", page.url, page.content_length
+                )
 
             if "Last-Modified" in r.headers:
                 last_modified_header = "Last-Modified"
@@ -319,14 +323,15 @@ class BrozzlerWorker:
                 last_modified_header = "LAST-MODIFIED"
             if last_modified_header:
                 page.last_modified = r.headers[last_modified_header]
-                self.logger.info("url %s last_modified is %s", page.url, page.last_modified)
+                self.logger.info(
+                    "url %s last_modified is %s", page.url, page.last_modified
+                )
 
     def _needs_browsing(self, page):
         if page.content_type and "html" not in page.content_type:
             return False
         return True
 
-
     def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None):
         def _on_screenshot(screenshot_jpeg):
             if on_screenshot:
diff --git a/brozzler/ydl.py b/brozzler/ydl.py
index af0c313..3ac4bb2 100644
--- a/brozzler/ydl.py
+++ b/brozzler/ydl.py
@@ -31,6 +31,7 @@ import threading
 
 thread_local = threading.local()
 
+
 def should_ytdlp(page):
     ytdlp_url = page.redirect_url if page.redirect_url else page.url
 

From 12e49bf29e2ffc2b3e881f82ed446a0913d8545f Mon Sep 17 00:00:00 2001
From: Barbara Miller
Date: Thu, 18 Apr 2024 16:19:13 -0700
Subject: [PATCH 09/13] add / check status code for yt-dlp

---
 brozzler/worker.py | 2 ++
 brozzler/ydl.py    | 3 +++
 2 files changed, 5 insertions(+)

diff --git a/brozzler/worker.py b/brozzler/worker.py
index e9015fd..b2f5a1a 100644
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@@ -253,6 +253,8 @@ class BrozzlerWorker:
                     browser, site, page, on_screenshot, on_request
                 )
                 outlinks.update(browser_outlinks)
+                page.status_code = browser.websock_thread.page_status
+                self.logger.info("url %s status code %s", page.url, page.status_code)
             except brozzler.PageInterstitialShown:
                 self.logger.info("page interstitial shown (http auth): %s", page)
 
diff --git a/brozzler/ydl.py b/brozzler/ydl.py
index 3ac4bb2..a5a7e89 100644
--- a/brozzler/ydl.py
+++ b/brozzler/ydl.py
@@ -33,6 +33,9 @@ thread_local = threading.local()
 
 
 def should_ytdlp(page):
+    if page.status_code != 200:
+        return False
+
     ytdlp_url = page.redirect_url if page.redirect_url else page.url
 
     if "chrome-error:" in ytdlp_url:
         return False

From 5cc056cc7ba993b30da0209d70518250d62c4cdc Mon Sep 17 00:00:00 2001
From: Barbara Miller
Date: Fri, 19 Apr 2024 12:44:45 -0700
Subject: [PATCH 10/13] use requests' CaseInsensitiveDict

---
 brozzler/worker.py | 40 +++++++++++-----------------------------
 1 file changed, 11 insertions(+), 29 deletions(-)

diff --git a/brozzler/worker.py b/brozzler/worker.py
index b2f5a1a..2772db3 100644
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@@ -258,7 +258,7 @@ class BrozzlerWorker:
             except brozzler.PageInterstitialShown:
                 self.logger.info("page interstitial shown (http auth): %s", page)
 
-            if enable_youtube_dl and ydl.should_ytdlp(page):
+            if enable_youtube_dl and ydl.should_ytdlp(page, site):
                 try:
                     ydl_outlinks = ydl.do_youtube_dl(self, site, page)
                     outlinks.update(ydl_outlinks)
@@ -291,42 +291,24 @@ class BrozzlerWorker:
         return outlinks
 
     def _get_page_headers(self, page):
+        page.content_type = page.content_length = page.last_modified = None
         with requests.get(page.url, stream=True) as r:
-            content_type_header = content_length_header = last_modified_header = None
-            if "Content-Type" in r.headers:
-                content_type_header = "Content-Type"
-            elif "content-length" in r.headers:
-                content_type_header = "content-length"
-            elif "CONTENT-LENGTH" in r.headers:
-                content_type_header = "CONTENT-LENGTH"
-            if content_type_header:
-                page.content_type = r.headers[content_type_header]
+            if "content-type" in r.headers:
+                page.content_type = r.headers["content-type"]
                 self.logger.info(
-                    "url %s content_type is %s", page.url, page.content_type
+                    "content_type: %s for url %s", page.content_type, page.url
                 )
 
-            if "Content-Length" in r.headers:
-                content_length_header = "Content-Length"
-            elif "content-length" in r.headers:
-                content_length_header = "content-length"
-            elif "CONTENT-LENGTH" in r.headers:
-                content_length_header = "CONTENT-LENGTH"
-            if content_length_header:
-                page.content_length = int(r.headers[content_length_header])
+            if "content-length" in r.headers:
+                page.content_length = int(r.headers["content-length"])
                 self.logger.info(
-                    "url %s content_length is %s", page.url, page.content_length
+                    "content_length: %s for url %s", page.content_length, page.url
                 )
 
-            if "Last-Modified" in r.headers:
-                last_modified_header = "Last-Modified"
-            elif "Last-Modified" in r.headers:
-                last_modified_header = "Last-Modified"
-            elif "LAST-MODIFIED" in r.headers:
-                last_modified_header = "LAST-MODIFIED"
-            if last_modified_header:
-                page.last_modified = r.headers[last_modified_header]
+            if "last-modified" in r.headers:
+                page.last_modified = r.headers["last-modified"]
                 self.logger.info(
-                    "url %s last_modified is %s", page.url, page.last_modified
+                    "last_modified: %s for url %s", page.last_modified, page.url
                 )
 
     def _needs_browsing(self, page):

From 487a7009f08721f4d7c909e2d8821fd800d94417 Mon Sep 17 00:00:00 2001
From: Barbara Miller
Date: Tue, 23 Apr 2024 12:41:38 -0700
Subject: [PATCH 11/13] rm remaining spy/fetch

---
 brozzler/ydl.py | 81 +++++++++----------------------------------
 1 file changed, 15 insertions(+), 66 deletions(-)

diff --git a/brozzler/ydl.py b/brozzler/ydl.py
index a5a7e89..4c4b7ad 100644
--- a/brozzler/ydl.py
+++ b/brozzler/ydl.py
@@ -32,7 +32,7 @@ import threading
 
 thread_local = threading.local()
 
 
-def should_ytdlp(page):
+def should_ytdlp(page, site):
     if page.status_code != 200:
         return False
 
@@ -57,35 +57,12 @@ class ExtraHeaderAdder(urllib.request.BaseHandler):
         return req
 
 
-class YoutubeDLSpy(urllib.request.BaseHandler):
-    logger = logging.getLogger(__module__ + "." + __qualname__)
-
-    def __init__(self):
-        self.reset()
-
-    def _http_response(self, request, response):
-        fetch = {
-            "url": request.full_url,
-            "method": request.get_method(),
-            "response_code": response.code,
-            "response_headers": response.headers,
-        }
-        self.fetches.append(fetch)
-        return response
-
-    http_response = https_response = _http_response
-
-    def reset(self):
-        self.fetches = []
-
-
 def _build_youtube_dl(worker, destdir, site, page):
     """
     Builds a yt-dlp `yt_dlp.YoutubeDL` for brozzling `site` with `worker`.
 
     The `YoutubeDL` instance does a few special brozzler-specific things:
 
-    - keeps track of urls fetched using a `YoutubeDLSpy`
     - periodically updates `site.last_claimed` in rethinkdb
     - pushes captured video to warcprox using a WARCPROX_WRITE_RECORD request
     - some logging
@@ -94,6 +71,7 @@ def _build_youtube_dl(worker, destdir, site, page):
         worker (brozzler.BrozzlerWorker): the calling brozzler worker
         destdir (str): where to save downloaded videos
         site (brozzler.Site): the site we are brozzling
+        page (brozzler.Page): the page we are brozzling
 
     Returns:
         a yt-dlp `yt_dlp.YoutubeDL` instance
@@ -260,7 +238,7 @@ def _build_youtube_dl(worker, destdir, site, page):
         "match_filter": match_filter_func("!is_live"),
         "extractor_args": {"youtube": {"skip": ["dash", "hls"]}},
         # --cache-dir local or..
-        # this looked like a problem with nsf-mounted homedir, shouldn't be a problem for brozzler on focal?
+        # this looked like a problem with nsf-mounted homedir, maybe not a problem for brozzler on focal?
         "cache_dir": "/home/archiveit",
         "logger": logging.getLogger("yt_dlp"),
         "verbose": False,
@@ -274,56 +252,27 @@ def _build_youtube_dl(worker, destdir, site, page):
     ydl = _YoutubeDL(ydl_opts)
     if site.extra_headers():
         ydl._opener.add_handler(ExtraHeaderAdder(site.extra_headers(page)))
-    ydl.fetch_spy = YoutubeDLSpy()
     ydl.pushed_videos = []
-    ydl._opener.add_handler(ydl.fetch_spy)
+
     return ydl
 
 
-def _remember_videos(page, fetches, pushed_videos=None):
+def _remember_videos(page, pushed_videos=None):
     """
     Saves info about videos captured by yt-dlp in `page.videos`.
     """
     if not "videos" in page:
         page.videos = []
-    for fetch in fetches or []:
-        content_type = fetch["response_headers"].get_content_type()
-        if (
-            content_type.startswith("video/")
-            # skip manifests of DASH segmented video -
-            # see https://github.com/internetarchive/brozzler/pull/70
-            and content_type != "video/vnd.mpeg.dash.mpd"
-            and fetch["method"] == "GET"
-            and fetch["response_code"] in (200, 206)
-        ):
-            video = {
-                "blame": "youtube-dl",
-                "url": fetch["url"],
-                "response_code": fetch["response_code"],
-                "content-type": content_type,
-            }
-            if "content-length" in fetch["response_headers"]:
-                video["content-length"] = int(
-                    fetch["response_headers"]["content-length"]
-                )
-            if "content-range" in fetch["response_headers"]:
-                # skip chunked youtube video
-                if "googlevideo.com/videoplayback" in fetch["url"]:
-                    continue
-                video["content-range"] = fetch["response_headers"]["content-range"]
-            logging.debug("embedded video %s", video)
-            page.videos.append(video)
     for pushed_video in pushed_videos or []:
-        if pushed_video["content-type"].startswith("video/"):
-            video = {
-                "blame": "youtube-dl",
-                "url": pushed_video["url"],
-                "response_code": pushed_video["response_code"],
-                "content-type": pushed_video["content-type"],
-                "content-length": pushed_video["content-length"],
-            }
-            logging.debug("embedded video %s", video)
-            page.videos.append(video)
+        video = {
+            "blame": "youtube-dl",
+            "url": pushed_video["url"],
+            "response_code": pushed_video["response_code"],
+            "content-type": pushed_video["content-type"],
+            "content-length": pushed_video["content-length"],
+        }
+        logging.debug("embedded video %s", video)
+        page.videos.append(video)
 
 
 def _try_youtube_dl(worker, ydl, site, page):
@@ -339,7 +288,7 @@ def _try_youtube_dl(worker, ydl, site, page):
             ie_result = ydl.sanitize_info(
                 ydl.extract_info(str(urlcanon.whatwg(ytdlp_url)))
             )
-        _remember_videos(page, ydl.fetch_spy.fetches, ydl.pushed_videos)
+        _remember_videos(page, ydl.pushed_videos)
         if worker._using_warcprox(site):
             info_json = json.dumps(ie_result, sort_keys=True, indent=4)
             logging.info(
From 7764c3f6d71b3a517c641594dd88f70b8d9c649f Mon Sep 17 00:00:00 2001
From: Barbara Miller
Date: Tue, 23 Apr 2024 16:00:18 -0700
Subject: [PATCH 12/13] add comment

---
 brozzler/ydl.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/brozzler/ydl.py b/brozzler/ydl.py
index 4c4b7ad..9caf662 100644
--- a/brozzler/ydl.py
+++ b/brozzler/ydl.py
@@ -33,6 +33,7 @@ thread_local = threading.local()
 
 
 def should_ytdlp(page, site):
+    # called only after we've passed needs_browsing() check
     if page.status_code != 200:
         return False

From 2d183c7d0cb4c14ff2432b4849efc8aa067dc4dc Mon Sep 17 00:00:00 2001
From: Barbara Miller
Date: Tue, 23 Apr 2024 16:01:35 -0700
Subject: [PATCH 13/13] if not self._needs_browsing; _get_page_headers comment

---
 brozzler/worker.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/brozzler/worker.py b/brozzler/worker.py
index 2772db3..2bad677 100644
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@@ -246,7 +246,10 @@ class BrozzlerWorker:
 
         self._get_page_headers(page)
 
-        if self._needs_browsing(page):
+        if not self._needs_browsing(page):
+            self.logger.info("needs fetch: %s", page)
+            self._fetch_url(site, page=page)
+        else:
             self.logger.info("needs browsing: %s", page)
             try:
                 browser_outlinks = self._browse_page(
@@ -285,13 +288,12 @@ class BrozzlerWorker:
                         self.logger.error(
                             "youtube_dl raised exception on %s", page, exc_info=True
                         )
-        else:
-            self.logger.info("needs fetch: %s", page)
-            self._fetch_url(site, page=page)
         return outlinks
 
     def _get_page_headers(self, page):
         page.content_type = page.content_length = page.last_modified = None
+        # bypassing warcprox, requests' stream=True defers downloading the body of the response
+        # see https://docs.python-requests.org/en/latest/user/advanced/#body-content-workflow
        with requests.get(page.url, stream=True) as r:
            if "content-type" in r.headers:
                page.content_type = r.headers["content-type"]