mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-07-22 14:30:53 -04:00
Merge remote-tracking branch 'upstream/adam/patch-yt-dlp-infinite-loop-bug' into qa
This commit is contained in:
commit
28009b5844
2 changed files with 32 additions and 2 deletions
|
@ -18,7 +18,7 @@ limitations under the License.
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
import yt_dlp
|
import yt_dlp
|
||||||
from yt_dlp.utils import match_filter_func
|
from yt_dlp.utils import match_filter_func, ExtractorError
|
||||||
import brozzler
|
import brozzler
|
||||||
from brozzler.model import VideoCaptureOptions
|
from brozzler.model import VideoCaptureOptions
|
||||||
import urllib.request
|
import urllib.request
|
||||||
|
@ -116,6 +116,31 @@ def _build_youtube_dl(worker, destdir, site, page):
|
||||||
a yt-dlp `yt_dlp.YoutubeDL` instance
|
a yt-dlp `yt_dlp.YoutubeDL` instance
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
# Custom GenericIE to handle redirect loops with shared state
|
||||||
|
class CustomGenericIE(yt_dlp.extractor.generic.GenericIE):
    """Generic extractor that refuses to extract the same URL twice.

    Each URL handed to `_real_extract` is recorded in a class-level set.
    If the same URL is seen again, extraction is aborted with an
    `ExtractorError` instead of being delegated to the parent extractor.

    The error message prefix "Redirect loop detected" is matched by the
    error handling in `_try_youtube_dl`, so keep it unchanged.
    """

    logger = logging.getLogger(__module__ + "." + __qualname__)
    # Class attribute on purpose: every instance of this class reads and
    # updates the same set. Nothing in this class ever removes entries.
    shared_visited_urls = set()

    def __init__(self, *args, **kwargs):
        """Initialize the parent extractor and log that this subclass is in use."""
        super().__init__(*args, **kwargs)
        self.logger.info("[CustomGenericIE] Initialized")

    def _real_extract(self, url):
        """Extract `url` via the parent extractor unless it was already seen.

        Args:
            url: the URL the extractor was asked to handle.

        Returns:
            Whatever the parent class's `_real_extract` returns for `url`.

        Raises:
            ExtractorError: if `url` is already in `shared_visited_urls`.
        """
        if url in self.shared_visited_urls:
            # Pass the URL as a logging argument so it is actually
            # interpolated (the message previously contained a literal
            # "{url}" because the string was not an f-string).
            self.logger.error("Redirect loop detected for URL: %s", url)
            raise ExtractorError(
                f"Redirect loop detected for URL: {url}",
                # Flags this as an anticipated failure rather than an
                # extractor bug (see yt-dlp's ExtractorError docs).
                expected=True,
            )

        self.shared_visited_urls.add(url)
        self.logger.info("[CustomGenericIE] Extracting URL: %s", url)
        return super()._real_extract(url)
|
||||||
|
|
||||||
|
yt_dlp.extractor.generic.GenericIE = CustomGenericIE
|
||||||
|
|
||||||
class _YoutubeDL(yt_dlp.YoutubeDL):
|
class _YoutubeDL(yt_dlp.YoutubeDL):
|
||||||
logger = logging.getLogger(__module__ + "." + __qualname__)
|
logger = logging.getLogger(__module__ + "." + __qualname__)
|
||||||
|
|
||||||
|
@ -367,6 +392,11 @@ def _try_youtube_dl(worker, ydl, site, page):
|
||||||
and e.exc_info[1].code == 420
|
and e.exc_info[1].code == 420
|
||||||
):
|
):
|
||||||
raise brozzler.ReachedLimit(e.exc_info[1])
|
raise brozzler.ReachedLimit(e.exc_info[1])
|
||||||
|
elif (
|
||||||
|
isinstance(e, yt_dlp.utils.DownloadError)
|
||||||
|
and "Redirect loop detected" in e.msg
|
||||||
|
):
|
||||||
|
raise brozzler.VideoExtractorError(e.msg)
|
||||||
else:
|
else:
|
||||||
# todo: other errors to handle separately?
|
# todo: other errors to handle separately?
|
||||||
# OSError('Tunnel connection failed: 464 Host Not Allowed') (caused by ProxyError...)
|
# OSError('Tunnel connection failed: 464 Host Not Allowed') (caused by ProxyError...)
|
||||||
|
|
2
setup.py
2
setup.py
|
@ -34,7 +34,7 @@ def find_package_data(package):
|
||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name="brozzler",
|
name="brozzler",
|
||||||
version="1.6.5.a",
|
version="1.6.6.a",
|
||||||
description="Distributed web crawling with browsers",
|
description="Distributed web crawling with browsers",
|
||||||
url="https://github.com/internetarchive/brozzler",
|
url="https://github.com/internetarchive/brozzler",
|
||||||
author="Noah Levitt",
|
author="Noah Levitt",
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue