skip downloading videos from youtube playlists

We expect to capture videos from individual watch pages. Processing thousands of videos with youtube-dl before the page is ever opened in the browser is often not the desired behavior; handling them is a crawling problem.
2025-11-24 01:23:28 -05:00 · 2018-10-11 15:46:30 -07:00 · 2018-10-11 15:46:30 -07:00 · 82cf5c6dbb
commit 82cf5c6dbb
parent e406e42312
2 changed files with 20 additions and 1 deletions
--- a/brozzler/ydl.py
+++ b/brozzler/ydl.py
@ -127,12 +127,31 @@ def _build_youtube_dl(worker, destdir, site):
    class _YoutubeDL(youtube_dl.YoutubeDL):
        logger = logging.getLogger(__module__ + "." + __qualname__)
        def urlopen(self, req):
            '''
            Log the url about to be fetched, then delegate to the parent
            class.

            `req` may be an object exposing a `full_url` attribute; anything
            without that attribute is logged as-is (presumably a plain url
            string -- confirm against callers).
            '''
            target = getattr(req, 'full_url', req)
            self.logger.debug('fetching %r', target)
            return super().urlopen(req)
        # def _match_entry(self, info_dict, incomplete):
        #     return super()._match_entry(info_dict, incomplete)
        def add_default_extra_info(self, ie_result, ie, url):
            '''
            Hook into extraction to log what the extractor found, and turn
            off downloading when the result is a youtube playlist.

            Runs the parent implementation first. If `ie_result` is not a
            playlist, logs that a video was found. If it is a playlist, logs
            that, and when the extractor is 'youtube:playlist' additionally
            sets `self.params['skip_download']` to True.
            '''
            super().add_default_extra_info(ie_result, ie, url)

            found_playlist = ie_result.get('_type') == 'playlist'
            if not found_playlist:
                self.logger.info(
                        'extractor %r found a video in %s', ie.IE_NAME, url)
                return

            extractor_name = ie.IE_NAME
            self.logger.info(
                    'extractor %r found playlist in %s', extractor_name, url)
            if extractor_name != 'youtube:playlist':
                return

            self.logger.info(
                    'setting skip_download because this is a youtube '
                    'playlist and we expect to capture videos from '
                    'individual watch pages')
            # XXX is this good enough? metadata is still fetched for every
            # video in the playlist; to avoid that too, implement
            # self._match_entry()
            self.params['skip_download'] = True
--- a/setup.py
+++ b/setup.py
@ -32,7 +32,7 @@ def find_package_data(package):
 setuptools.setup(
        name='brozzler',
-        version='1.5.dev308',
+        version='1.5.dev309',
        description='Distributed web crawling with browsers',
        url='https://github.com/internetarchive/brozzler',
        author='Noah Levitt',