watch pages as outlinks from youtube-dl playlists

and bypass downloading metadata about individual videos, as well as the
videos themselves (for youtube playlists), because even just the
metadata can take many minutes or hours when there are thousands of videos.
This commit is contained in:
Noah Levitt 2018-10-12 00:41:16 -07:00
parent 9211fb45ec
commit 8f9077fbf3
2 changed files with 39 additions and 15 deletions

View File

@ -186,9 +186,10 @@ class BrozzlerWorker:
on_request=None, enable_youtube_dl=True):
self.logger.info("brozzling {}".format(page))
ydl_fetches = None
ydl_outlinks = []
if enable_youtube_dl:
try:
ydl_fetches = ydl.do_youtube_dl(self, site, page)
ydl_fetches, ydl_outlinks = ydl.do_youtube_dl(self, site, page)
except brozzler.ReachedLimit as e:
raise
except brozzler.ShutdownRequested:
@ -207,18 +208,22 @@ class BrozzlerWorker:
'youtube_dl raised exception on %s', page,
exc_info=True)
browser_outlinks = []
if self._needs_browsing(page, ydl_fetches):
self.logger.info('needs browsing: %s', page)
outlinks = self._browse_page(browser, site, page, on_screenshot,
on_request)
return outlinks
browser_outlinks = self._browse_page(
browser, site, page, on_screenshot, on_request)
else:
if not self._already_fetched(page, ydl_fetches):
self.logger.info('needs fetch: %s', page)
self._fetch_url(site, page)
else:
self.logger.info('already fetched: %s', page)
return []
outlinks = set()
outlinks.update(ydl_outlinks)
outlinks.update(browser_outlinks)
return list(outlinks)
def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None):
def _on_screenshot(screenshot_png):

View File

@ -151,7 +151,15 @@ def _build_youtube_dl(worker, destdir, site):
return super().urlopen(req)
# def _match_entry(self, info_dict, incomplete):
# return super()._match_entry(info_dict, incomplete)
# if self.dl_disabled:
# return 'Downloading disabled (probably youtube playlist)'
# def extract_info(self, *args, **kwargs):
# self.dl_disabled = False
# try:
# return super().extract_info(*args, **kwargs)
# finally:
# self.dl_disabled = False
def add_default_extra_info(self, ie_result, ie, url):
# hook in some logging
@ -160,13 +168,19 @@ def _build_youtube_dl(worker, destdir, site):
self.logger.info(
'extractor %r found playlist in %s', ie.IE_NAME, url)
if ie.IE_NAME == 'youtube:playlist':
# At this point ie_result['entries'] is an iterator that
# will fetch more metadata from youtube to list all the
# videos. We unroll that iterator here partly because
# otherwise `process_ie_result()` will clobber it, and we
# use it later to extract the watch pages as outlinks.
ie_result['entries_no_dl'] = list(ie_result['entries'])
ie_result['entries'] = []
self.logger.info(
'setting skip_download because this is a youtube '
'playlist and we expect to capture videos from '
'individual watch pages')
# XXX good enuf? still fetches metadata for each video
# if we want to not do that, implement self._match_entry()
self.params['skip_download'] = True
'playlist (%s entries) and we expect to capture '
'videos from individual watch pages',
len(ie_result['entries_no_dl']))
# self.dl_disabled = True
else:
self.logger.info(
'extractor %r found a video in %s', ie.IE_NAME, url)
@ -334,11 +348,12 @@ def _try_youtube_dl(worker, ydl, site, page):
content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
payload=info_json.encode("utf-8"),
extra_headers=site.extra_headers())
return ie_result
except brozzler.ShutdownRequested as e:
raise
except BaseException as e:
except Exception as e:
if hasattr(e, "exc_info") and e.exc_info[0] == youtube_dl.utils.UnsupportedError:
pass
return None
elif (hasattr(e, "exc_info")
and e.exc_info[0] == urllib.error.HTTPError
and hasattr(e.exc_info[1], "code")
@ -376,5 +391,9 @@ def do_youtube_dl(worker, site, page):
'''
with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
ydl = _build_youtube_dl(worker, tempdir, site)
_try_youtube_dl(worker, ydl, site, page)
return ydl.fetch_spy.fetches
ie_result = _try_youtube_dl(worker, ydl, site, page)
outlinks = []
if ie_result['extractor'] == 'youtube:playlist':
outlinks = ['https://www.youtube.com/watch?v=%s' % e['id']
for e in ie_result.get('entries_no_dl', [])]
return ydl.fetch_spy.fetches, outlinks