mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-06-19 12:24:20 -04:00
skip downloading videos from youtube playlists
because we expect to capture videos from individual watch pages, and because processing thousands of videos with youtube-dl before the page is ever opened in the browser is often undesirable and causes crawling problems
This commit is contained in:
parent
e406e42312
commit
82cf5c6dbb
2 changed files with 20 additions and 1 deletions
|
@ -127,12 +127,31 @@ def _build_youtube_dl(worker, destdir, site):
|
||||||
class _YoutubeDL(youtube_dl.YoutubeDL):
|
class _YoutubeDL(youtube_dl.YoutubeDL):
|
||||||
logger = logging.getLogger(__module__ + "." + __qualname__)
|
logger = logging.getLogger(__module__ + "." + __qualname__)
|
||||||
|
|
||||||
|
def urlopen(self, req):
|
||||||
|
try:
|
||||||
|
url = req.full_url
|
||||||
|
except AttributeError:
|
||||||
|
url = req
|
||||||
|
self.logger.debug('fetching %r', url)
|
||||||
|
return super().urlopen(req)
|
||||||
|
|
||||||
|
# def _match_entry(self, info_dict, incomplete):
|
||||||
|
# return super()._match_entry(info_dict, incomplete)
|
||||||
|
|
||||||
def add_default_extra_info(self, ie_result, ie, url):
|
def add_default_extra_info(self, ie_result, ie, url):
|
||||||
# hook in some logging
|
# hook in some logging
|
||||||
super().add_default_extra_info(ie_result, ie, url)
|
super().add_default_extra_info(ie_result, ie, url)
|
||||||
if ie_result.get('_type') == 'playlist':
|
if ie_result.get('_type') == 'playlist':
|
||||||
self.logger.info(
|
self.logger.info(
|
||||||
'extractor %r found playlist in %s', ie.IE_NAME, url)
|
'extractor %r found playlist in %s', ie.IE_NAME, url)
|
||||||
|
if ie.IE_NAME == 'youtube:playlist':
|
||||||
|
self.logger.info(
|
||||||
|
'setting skip_download because this is a youtube '
|
||||||
|
'playlist and we expect to capture videos from '
|
||||||
|
'individual watch pages')
|
||||||
|
# XXX good enuf? still fetches metadata for each video
|
||||||
|
# if we want to not do that, implement self._match_entry()
|
||||||
|
self.params['skip_download'] = True
|
||||||
else:
|
else:
|
||||||
self.logger.info(
|
self.logger.info(
|
||||||
'extractor %r found a video in %s', ie.IE_NAME, url)
|
'extractor %r found a video in %s', ie.IE_NAME, url)
|
||||||
|
|
2
setup.py
2
setup.py
|
@ -32,7 +32,7 @@ def find_package_data(package):
|
||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='brozzler',
|
name='brozzler',
|
||||||
version='1.5.dev308',
|
version='1.5.dev309',
|
||||||
description='Distributed web crawling with browsers',
|
description='Distributed web crawling with browsers',
|
||||||
url='https://github.com/internetarchive/brozzler',
|
url='https://github.com/internetarchive/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue