skip downloading videos from youtube playlists

We expect to capture videos from their individual watch pages. Having youtube-dl process the thousands of videos in a playlist before the page is ever opened in the browser is often not the desired behavior, and it is a problem for crawling.
2025-12-01 20:54:58 -05:00 · 2018-10-11 15:46:30 -07:00 · 2018-10-11 15:46:30 -07:00 · 82cf5c6dbb
commit 82cf5c6dbb
parent e406e42312
2 changed files with 20 additions and 1 deletions
--- a/brozzler/ydl.py
+++ b/brozzler/ydl.py
@@ -127,12 +127,31 @@ def _build_youtube_dl(worker, destdir, site):
    class _YoutubeDL(youtube_dl.YoutubeDL):
        logger = logging.getLogger(__module__ + "." + __qualname__)

+        def urlopen(self, req):
+            try:
+                url = req.full_url
+            except AttributeError:
+                url = req
+            self.logger.debug('fetching %r', url)
+            return super().urlopen(req)
+
+        # def _match_entry(self, info_dict, incomplete):
+        #     return super()._match_entry(info_dict, incomplete)
+
        def add_default_extra_info(self, ie_result, ie, url):
            # hook in some logging
            super().add_default_extra_info(ie_result, ie, url)
            if ie_result.get('_type') == 'playlist':
                self.logger.info(
                        'extractor %r found playlist in %s', ie.IE_NAME, url)
+                if ie.IE_NAME == 'youtube:playlist':
+                    self.logger.info(
+                            'setting skip_download because this is a youtube '
+                            'playlist and we expect to capture videos from '
+                            'individual watch pages')
+                    # XXX good enuf? still fetches metadata for each video
+                    # if we want to not do that, implement self._match_entry()
+                    self.params['skip_download'] = True
            else:
                self.logger.info(
                        'extractor %r found a video in %s', ie.IE_NAME, url)
--- a/setup.py
+++ b/setup.py
@@ -32,7 +32,7 @@ def find_package_data(package):

 setuptools.setup(
        name='brozzler',
-        version='1.5.dev308',
+        version='1.5.dev309',
        description='Distributed web crawling with browsers',
        url='https://github.com/internetarchive/brozzler',
        author='Noah Levitt',