skip downloading videos from youtube playlists

because we expect to capture videos from individual watch pages;
processing what is often thousands of videos with youtube-dl before the
page is ever opened in the browser is not desired behavior, and is a
crawling problem
This commit is contained in:
Noah Levitt 2018-10-11 15:46:30 -07:00
parent e406e42312
commit 82cf5c6dbb
2 changed files with 20 additions and 1 deletions

View File

@ -127,12 +127,31 @@ def _build_youtube_dl(worker, destdir, site):
class _YoutubeDL(youtube_dl.YoutubeDL):
logger = logging.getLogger(__module__ + "." + __qualname__)
def urlopen(self, req):
    '''
    Log the URL that is about to be fetched, then hand the request to
    the parent class's `urlopen` and return whatever it returns.

    `req` is either an object exposing a `full_url` attribute or, when
    it has no such attribute, is itself logged as the url (presumably a
    plain url string -- confirm against youtube-dl callers).
    '''
    # fall back to the request itself when there is no `full_url`
    target = getattr(req, 'full_url', req)
    self.logger.debug('fetching %r', target)
    return super().urlopen(req)
# def _match_entry(self, info_dict, incomplete):
# return super()._match_entry(info_dict, incomplete)
def add_default_extra_info(self, ie_result, ie, url):
    '''
    Hook into the parent's `add_default_extra_info` to log what the
    extractor found, and to switch off downloading for youtube
    playlists.

    The parent implementation is always called first. Afterwards:

    - if `ie_result['_type']` is not 'playlist', log that a video was
      found and do nothing else
    - otherwise log that a playlist was found, and if the extractor's
      `IE_NAME` is 'youtube:playlist', set
      `self.params['skip_download']` to True
    '''
    super().add_default_extra_info(ie_result, ie, url)

    is_playlist = ie_result.get('_type') == 'playlist'
    if not is_playlist:
        self.logger.info(
                'extractor %r found a video in %s', ie.IE_NAME, url)
        return

    self.logger.info(
            'extractor %r found playlist in %s', ie.IE_NAME, url)

    if ie.IE_NAME == 'youtube:playlist':
        self.logger.info(
                'setting skip_download because this is a youtube '
                'playlist and we expect to capture videos from '
                'individual watch pages')
        # XXX is this good enough? metadata is still fetched for each
        # video in the playlist; avoiding that as well would mean
        # implementing self._match_entry()
        self.params['skip_download'] = True

View File

@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup(
name='brozzler',
version='1.5.dev308',
version='1.5.dev309',
description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler',
author='Noah Levitt',