mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-23 16:19:49 -05:00
run yt-dlp after browse_page
This commit is contained in:
parent
376f3139ce
commit
faa06b449d
@ -3,7 +3,7 @@ brozzler/worker.py - BrozzlerWorker brozzles pages from the frontier, meaning
|
||||
it runs yt-dlp on them, browses them and runs behaviors if appropriate,
|
||||
scopes and adds outlinks to the frontier
|
||||
|
||||
Copyright (C) 2014-2023 Internet Archive
|
||||
Copyright (C) 2014-2024 Internet Archive
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
@ -242,11 +242,19 @@ class BrozzlerWorker:
|
||||
enable_youtube_dl=True,
|
||||
):
|
||||
self.logger.info("brozzling {}".format(page))
|
||||
ydl_fetches = None
|
||||
outlinks = set()
|
||||
if enable_youtube_dl and not page.url.lower().endswith(".pdf"):
|
||||
|
||||
try:
|
||||
browser_outlinks = self._browse_page(
|
||||
browser, site, page, on_screenshot, on_request
|
||||
)
|
||||
outlinks.update(browser_outlinks)
|
||||
except brozzler.PageInterstitialShown:
|
||||
self.logger.info("page interstitial shown (http auth): %s", page)
|
||||
|
||||
if enable_youtube_dl and ydl.should_ytdlp(page):
|
||||
try:
|
||||
ydl_fetches, outlinks = ydl.do_youtube_dl(self, site, page)
|
||||
ydl_outlinks = ydl.do_youtube_dl(self, site, page)
|
||||
except brozzler.ReachedLimit as e:
|
||||
raise
|
||||
except brozzler.ShutdownRequested:
|
||||
@ -271,22 +279,7 @@ class BrozzlerWorker:
|
||||
"youtube_dl raised exception on %s", page, exc_info=True
|
||||
)
|
||||
|
||||
if self._needs_browsing(page, ydl_fetches):
|
||||
self.logger.info("needs browsing: %s", page)
|
||||
try:
|
||||
browser_outlinks = self._browse_page(
|
||||
browser, site, page, on_screenshot, on_request
|
||||
)
|
||||
outlinks.update(browser_outlinks)
|
||||
except brozzler.PageInterstitialShown:
|
||||
self.logger.info("page interstitial shown (http auth): %s", page)
|
||||
else:
|
||||
if not self._already_fetched(page, ydl_fetches):
|
||||
self.logger.info("needs fetch: %s", page)
|
||||
self._fetch_url(site, page=page)
|
||||
else:
|
||||
self.logger.info("already fetched: %s", page)
|
||||
|
||||
outlinks.update(ydl_outlinks)
|
||||
return outlinks
|
||||
|
||||
def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None):
|
||||
@ -415,28 +408,6 @@ class BrozzlerWorker:
|
||||
except requests.exceptions.ProxyError as e:
|
||||
raise brozzler.ProxyError("proxy error fetching %s" % url) from e
|
||||
|
||||
def _needs_browsing(self, page, ydl_fetches):
|
||||
if ydl_fetches:
|
||||
final_bounces = ydl.final_bounces(ydl_fetches, page.url)
|
||||
if not final_bounces:
|
||||
return True
|
||||
for txn in final_bounces:
|
||||
if txn["response_headers"].get_content_type() in [
|
||||
"text/html",
|
||||
"application/xhtml+xml",
|
||||
]:
|
||||
return True
|
||||
return False
|
||||
else:
|
||||
return True
|
||||
|
||||
def _already_fetched(self, page, ydl_fetches):
|
||||
if ydl_fetches:
|
||||
for fetch in ydl.final_bounces(ydl_fetches, page.url):
|
||||
if fetch["method"] == "GET" and fetch["response_code"] == 200:
|
||||
return True
|
||||
return False
|
||||
|
||||
def brozzle_site(self, browser, site):
|
||||
try:
|
||||
site.last_claimed_by = "%s:%s" % (socket.gethostname(), browser.chrome.port)
|
||||
|
@ -1,7 +1,7 @@
|
||||
"""
|
||||
brozzler/ydl.py - youtube-dl / yt-dlp support for brozzler
|
||||
|
||||
Copyright (C) 2023 Internet Archive
|
||||
Copyright (C) 2024 Internet Archive
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
@ -31,6 +31,20 @@ import threading
|
||||
|
||||
thread_local = threading.local()
|
||||
|
||||
def should_ytdlp(page):
    """
    Guesses from the url whether it is worth running yt-dlp on `page`.

    The url examined is `page.redirect_url` when set, otherwise `page.url`.
    The decision is based on the file extension of the url *path* only
    (compared case-insensitively), so that text elsewhere in the url, such
    as a hostname or a path segment like "/gifts/", does not cause a false
    skip.

    Args:
        page: object with `url` and `redirect_url` attributes

    Returns:
        False if the url path ends in one of the skipped extensions
        (a warning is logged in that case), True otherwise
    """
    # function-scope imports keep this block self-contained regardless of
    # what the module already imports
    import posixpath
    import urllib.parse

    skip_extensions = {".pdf", ".jpg", ".jpeg", ".png", ".gif", ".mp4", ".mpeg"}

    ytdlp_url = page.redirect_url or page.url

    try:
        url_path = urllib.parse.urlsplit(ytdlp_url).path
    except ValueError:
        # unparseable url: no extension to go on, so don't skip yt-dlp
        return True

    extension = posixpath.splitext(url_path)[1].lower()
    if extension in skip_extensions:
        logging.warning(
            "skipping yt-dlp for %s due to unsupported guessed content type",
            ytdlp_url,
        )
        return False

    return True
|
||||
|
||||
|
||||
class ExtraHeaderAdder(urllib.request.BaseHandler):
|
||||
def __init__(self, extra_headers):
|
||||
@ -67,35 +81,6 @@ class YoutubeDLSpy(urllib.request.BaseHandler):
|
||||
self.fetches = []
|
||||
|
||||
|
||||
def final_bounces(fetches, url):
    """
    Follows the redirect chain recorded in `fetches`, starting at `url`, and
    returns the list of fetches whose url is the end of that chain. More
    than one fetch can match, for example when the same url was requested
    with HEAD and then with GET.
    """
    # XXX check http status 301,302,303,307? check for "uri" header
    # as well as "location"? see urllib.request.HTTPRedirectHandler
    # If a url was fetched more than once, the last redirecting fetch wins.
    redirect_by_url = {
        hop["url"]: hop for hop in fetches if "location" in hop["response_headers"]
    }

    # Popping each hop as it is followed ensures a redirect loop terminates.
    target = url
    while target in redirect_by_url:
        hop = redirect_by_url.pop(target)
        target = urllib.parse.urljoin(hop["url"], hop["response_headers"]["location"])

    return [hop for hop in fetches if hop["url"] == target]
|
||||
|
||||
|
||||
def _build_youtube_dl(worker, destdir, site, page):
|
||||
"""
|
||||
Builds a yt-dlp `yt_dlp.YoutubeDL` for brozzling `site` with `worker`.
|
||||
@ -183,8 +168,8 @@ def _build_youtube_dl(worker, destdir, site, page):
|
||||
else:
|
||||
url = info_dict.get("url", "")
|
||||
|
||||
# skip urls ending .m3u8, to avoid duplicates handled by FixupM3u8
|
||||
if url.endswith(".m3u8") or url == "":
|
||||
# skip urls containing .m3u8, to avoid duplicates handled by FixupM3u8
|
||||
if url == "" or ".m3u8" in url:
|
||||
return
|
||||
|
||||
size = os.path.getsize(info_dict["filepath"])
|
||||
@ -408,15 +393,7 @@ def do_youtube_dl(worker, site, page):
|
||||
page (brozzler.Page): the page we are brozzling
|
||||
|
||||
Returns:
|
||||
tuple with two entries:
|
||||
`list` of `dict`: with info about urls fetched:
|
||||
[{
|
||||
'url': ...,
|
||||
'method': ...,
|
||||
'response_code': ...,
|
||||
'response_headers': ...,
|
||||
}, ...]
|
||||
`list` of `str`: outlink urls
|
||||
`list` of `str`: outlink urls
|
||||
"""
|
||||
with tempfile.TemporaryDirectory(prefix="brzl-ydl-") as tempdir:
|
||||
ydl = _build_youtube_dl(worker, tempdir, site, page)
|
||||
@ -431,5 +408,5 @@ def do_youtube_dl(worker, site, page):
|
||||
"https://www.youtube.com/watch?v=%s" % e["id"]
|
||||
for e in ie_result.get("entries_no_dl", [])
|
||||
}
|
||||
# any outlinks for other cases?
|
||||
return ydl.fetch_spy.fetches, outlinks
|
||||
# any outlinks for other cases? soundcloud, maybe?
|
||||
return outlinks
|
||||
|
Loading…
x
Reference in New Issue
Block a user