mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-08-06 21:44:29 -04:00
Merge branch 'ytdlp_last' into qa
This commit is contained in:
commit
ff8823d3d4
1 changed files with 17 additions and 9 deletions
|
@ -21,6 +21,7 @@ import yt_dlp
|
||||||
from yt_dlp.utils import match_filter_func
|
from yt_dlp.utils import match_filter_func
|
||||||
import brozzler
|
import brozzler
|
||||||
import urllib.request
|
import urllib.request
|
||||||
|
from urllib.parse import urlparse
|
||||||
import tempfile
|
import tempfile
|
||||||
import urlcanon
|
import urlcanon
|
||||||
import os
|
import os
|
||||||
|
@ -31,17 +32,24 @@ import threading
|
||||||
|
|
||||||
thread_local = threading.local()
|
thread_local = threading.local()
|
||||||
|
|
||||||
def should_ytdlp(page):
|
def is_html_maybe(url):
|
||||||
skip_url_types = ['pdf', 'jpg', 'jpeg', 'png', 'gif', 'mp4', 'mpeg']
|
skip_url_exts = ['pdf', 'jpg', 'jpeg', 'png', 'gif', 'mp4', 'mpeg']
|
||||||
if page.redirect_url:
|
|
||||||
ytdlp_url = page.redirect_url
|
|
||||||
else:
|
|
||||||
ytdlp_url = page.url
|
|
||||||
|
|
||||||
for t in skip_url_types:
|
parsed_url = urlparse(url)
|
||||||
if t in ytdlp_url:
|
base_url, ext = os.path.splitext(parsed_url.path)
|
||||||
logging.warning("skipping yt-dlp for %s due to unsupported guessed content type", ytdlp_url)
|
ext = ext[1:]
|
||||||
|
for skip in skip_url_exts:
|
||||||
|
if ext.startswith(skip):
|
||||||
return False
|
return False
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
def should_ytdlp(page):
|
||||||
|
ytdlp_url = page.redirect_url if page.redirect_url else page.url
|
||||||
|
|
||||||
|
if not is_html_maybe(ytdlp_url):
|
||||||
|
logging.warning("skipping yt-dlp for %s due to unsupported extension", ytdlp_url)
|
||||||
|
return False
|
||||||
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue