From 27bdfb65d25118144701bc0bd610e79f35654edc Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Mon, 11 Jun 2018 11:50:22 -0700
Subject: [PATCH] monkey-patch youtube-dl to short-circuit video extraction
 using generic extractor in case of very large url (more than 20 mb) that
 youtube-dl interprets as html, to avoid spinning forever here:

Traceback (most recent call first):
  File "/opt/brozzler-ve3/lib/python3.5/re.py", line 213, in findall
    return _compile(pattern, flags).findall(string)
  File "/opt/brozzler-ve3/lib/python3.5/site-packages/youtube_dl/extractor/generic.py", line 2878, in _real_extract
    'uploader': video_uploader,
  File "/opt/brozzler-ve3/lib/python3.5/site-packages/youtube_dl/extractor/common.py", line 503, in extract
    ie_result = self._real_extract(url)
  File "/opt/brozzler-ve3/lib/python3.5/site-packages/youtube_dl/YoutubeDL.py", line 792, in extract_info
    ie_result = ie.extract(url)
  File "/opt/brozzler-ve3/lib/python3.5/site-packages/brozzler/worker.py", line 302, in _try_youtube_dl
    info = ydl.extract_info(str(urlcanon.whatwg(page.url)))
  File "/opt/brozzler-ve3/lib/python3.5/site-packages/brozzler/worker.py", line 361, in brozzle_page
    self._try_youtube_dl(ydl, site, page)
---
 brozzler/worker.py | 15 +++++++++++++++
 setup.py           |  2 +-
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/brozzler/worker.py b/brozzler/worker.py
index 872a3f3..5dbb330 100644
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@@ -39,6 +39,21 @@ import rethinkdb as r
 import datetime
 import urllib.parse
 
+# Monkey-patch youtube-dl's generic extractor: on content larger than
+# 20000000 characters its re.findall() calls in GenericIE._real_extract spin
+# seemingly forever, so hand the extractor an empty document instead.
+_orig_webpage_read_content = youtube_dl.extractor.generic.GenericIE._webpage_read_content
+def _webpage_read_content(self, *args, **kwargs):
+    '''Return the page content, or '' if it exceeds 20000000 characters.'''
+    content = _orig_webpage_read_content(self, *args, **kwargs)
+    if len(content) > 20000000:
+        logging.warning(
+                'bypassing youtube-dl extraction because content is '
+                'too large (%s characters)', len(content))
+        return ''
+    return content
+youtube_dl.extractor.generic.GenericIE._webpage_read_content = _webpage_read_content
+
 class ExtraHeaderAdder(urllib.request.BaseHandler):
     def __init__(self, extra_headers):
         self.extra_headers = extra_headers
diff --git a/setup.py b/setup.py
index d7ef0f7..d37da36 100644
--- a/setup.py
+++ b/setup.py
@@ -32,7 +32,7 @@ def find_package_data(package):
 
 setuptools.setup(
         name='brozzler',
-        version='1.1b13.dev290',
+        version='1.1b13.dev291',
         description='Distributed web crawling with browsers',
         url='https://github.com/internetarchive/brozzler',
         author='Noah Levitt',