From faa06b449d4f258d95944ef1665342798beeed2d Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 2 Apr 2024 16:37:07 -0700 Subject: [PATCH] run yt-dlp after browse_page --- brozzler/worker.py | 55 ++++++++++------------------------------ brozzler/ydl.py | 63 +++++++++++++++------------------------------- 2 files changed, 33 insertions(+), 85 deletions(-) diff --git a/brozzler/worker.py b/brozzler/worker.py index 86977cf..c03d21c 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -3,7 +3,7 @@ brozzler/worker.py - BrozzlerWorker brozzles pages from the frontier, meaning it runs yt-dlp on them, browses them and runs behaviors if appropriate, scopes and adds outlinks to the frontier -Copyright (C) 2014-2023 Internet Archive +Copyright (C) 2014-2024 Internet Archive Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -242,11 +242,19 @@ class BrozzlerWorker: enable_youtube_dl=True, ): self.logger.info("brozzling {}".format(page)) - ydl_fetches = None outlinks = set() - if enable_youtube_dl and not page.url.lower().endswith(".pdf"): + + try: + browser_outlinks = self._browse_page( + browser, site, page, on_screenshot, on_request + ) + outlinks.update(browser_outlinks) + except brozzler.PageInterstitialShown: + self.logger.info("page interstitial shown (http auth): %s", page) + + if enable_youtube_dl and ydl.should_ytdlp(page): try: - ydl_fetches, outlinks = ydl.do_youtube_dl(self, site, page) + ydl_outlinks = ydl.do_youtube_dl(self, site, page) except brozzler.ReachedLimit as e: raise except brozzler.ShutdownRequested: @@ -271,22 +279,7 @@ class BrozzlerWorker: "youtube_dl raised exception on %s", page, exc_info=True ) - if self._needs_browsing(page, ydl_fetches): - self.logger.info("needs browsing: %s", page) - try: - browser_outlinks = self._browse_page( - browser, site, page, on_screenshot, on_request - ) - outlinks.update(browser_outlinks) - except brozzler.PageInterstitialShown: - self.logger.info("page interstitial shown (http auth): %s", page) - else: - if not self._already_fetched(page, ydl_fetches): - self.logger.info("needs fetch: %s", page) - self._fetch_url(site, page=page) - else: - self.logger.info("already fetched: %s", page) - + outlinks.update(ydl_outlinks) return outlinks def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None): @@ -415,28 +408,6 @@ class BrozzlerWorker: except requests.exceptions.ProxyError as e: raise brozzler.ProxyError("proxy error fetching %s" % url) from e - def _needs_browsing(self, page, ydl_fetches): - if ydl_fetches: - final_bounces = ydl.final_bounces(ydl_fetches, page.url) - if not final_bounces: - return True - for txn in final_bounces: - if txn["response_headers"].get_content_type() in [ - "text/html", - "application/xhtml+xml", - ]: - return True - return False - else: - return True - - def _already_fetched(self, page, ydl_fetches): - if ydl_fetches: - for fetch in ydl.final_bounces(ydl_fetches, page.url): - if fetch["method"] == "GET" and fetch["response_code"] == 200: - return True - return False - def brozzle_site(self, browser, site): try: site.last_claimed_by = "%s:%s" % (socket.gethostname(), browser.chrome.port) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 4281d4a..a2d0405 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -1,7 +1,7 @@ """ brozzler/ydl.py - youtube-dl / yt-dlp support for brozzler -Copyright (C) 2023 Internet Archive +Copyright (C) 2024 Internet Archive Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -31,6 +31,20 @@ import threading thread_local = threading.local() +def should_ytdlp(page): + skip_url_types = ['pdf', 'jpg', 'jpeg', 'png', 'gif', 'mp4', 'mpeg'] + if page.redirect_url: + ytdlp_url = page.redirect_url + else: + ytdlp_url = page.url + + for t in skip_url_types: + if t in ytdlp_url: + logging.warning("skipping yt-dlp for %s due to unsupported guessed content type", ytdlp_url) + return False + + return True + class ExtraHeaderAdder(urllib.request.BaseHandler): def __init__(self, extra_headers): @@ -67,35 +81,6 @@ class YoutubeDLSpy(urllib.request.BaseHandler): self.fetches = [] -def final_bounces(fetches, url): - """ - Resolves redirect chains in `fetches` and returns a list of fetches - representing the final redirect destinations of the given url. There could - be more than one if for example youtube-dl hit the same url with HEAD and - then GET requests. - """ - redirects = {} - for fetch in fetches: - # XXX check http status 301,302,303,307? check for "uri" header - # as well as "location"? see urllib.request.HTTPRedirectHandler - if "location" in fetch["response_headers"]: - redirects[fetch["url"]] = fetch - - final_url = url - while final_url in redirects: - fetch = redirects.pop(final_url) - final_url = urllib.parse.urljoin( - fetch["url"], fetch["response_headers"]["location"] - ) - - final_bounces = [] - for fetch in fetches: - if fetch["url"] == final_url: - final_bounces.append(fetch) - - return final_bounces - - def _build_youtube_dl(worker, destdir, site, page): """ Builds a yt-dlp `yt_dlp.YoutubeDL` for brozzling `site` with `worker`. @@ -183,8 +168,8 @@ def _build_youtube_dl(worker, destdir, site, page): else: url = info_dict.get("url", "") - # skip urls ending .m3u8, to avoid duplicates handled by FixupM3u8 - if url.endswith(".m3u8") or url == "": + # skip urls containing .m3u8, to avoid duplicates handled by FixupM3u8 + if url == "" or ".m3u8" in url: return size = os.path.getsize(info_dict["filepath"]) @@ -408,15 +393,7 @@ def do_youtube_dl(worker, site, page): page (brozzler.Page): the page we are brozzling Returns: - tuple with two entries: - `list` of `dict`: with info about urls fetched: - [{ - 'url': ..., - 'method': ..., - 'response_code': ..., - 'response_headers': ..., - }, ...] - `list` of `str`: outlink urls + `list` of `str`: outlink urls """ with tempfile.TemporaryDirectory(prefix="brzl-ydl-") as tempdir: ydl = _build_youtube_dl(worker, tempdir, site, page) @@ -431,5 +408,5 @@ def do_youtube_dl(worker, site, page): "https://www.youtube.com/watch?v=%s" % e["id"] for e in ie_result.get("entries_no_dl", []) } - # any outlinks for other cases? - return ydl.fetch_spy.fetches, outlinks + # any outlinks for other cases? soundcloud, maybe? + return outlinks