watch pages as outlinks from youtube-dl playlists

and bypass downloading metadata about individual videos as well as the
videos themselves (for youtube playlists), because even just the
metadata can take many minutes or hours in the case of thousands of videos.
This commit is contained in:
Noah Levitt 2018-10-12 00:41:16 -07:00
parent 9211fb45ec
commit 8f9077fbf3
2 changed files with 39 additions and 15 deletions

View file

@@ -186,9 +186,10 @@ class BrozzlerWorker:
on_request=None, enable_youtube_dl=True): on_request=None, enable_youtube_dl=True):
self.logger.info("brozzling {}".format(page)) self.logger.info("brozzling {}".format(page))
ydl_fetches = None ydl_fetches = None
ydl_outlinks = []
if enable_youtube_dl: if enable_youtube_dl:
try: try:
ydl_fetches = ydl.do_youtube_dl(self, site, page) ydl_fetches, ydl_outlinks = ydl.do_youtube_dl(self, site, page)
except brozzler.ReachedLimit as e: except brozzler.ReachedLimit as e:
raise raise
except brozzler.ShutdownRequested: except brozzler.ShutdownRequested:
@@ -207,18 +208,22 @@ class BrozzlerWorker:
'youtube_dl raised exception on %s', page, 'youtube_dl raised exception on %s', page,
exc_info=True) exc_info=True)
browser_outlinks = []
if self._needs_browsing(page, ydl_fetches): if self._needs_browsing(page, ydl_fetches):
self.logger.info('needs browsing: %s', page) self.logger.info('needs browsing: %s', page)
outlinks = self._browse_page(browser, site, page, on_screenshot, browser_outlinks = self._browse_page(
on_request) browser, site, page, on_screenshot, on_request)
return outlinks
else: else:
if not self._already_fetched(page, ydl_fetches): if not self._already_fetched(page, ydl_fetches):
self.logger.info('needs fetch: %s', page) self.logger.info('needs fetch: %s', page)
self._fetch_url(site, page) self._fetch_url(site, page)
else: else:
self.logger.info('already fetched: %s', page) self.logger.info('already fetched: %s', page)
return []
outlinks = set()
outlinks.update(ydl_outlinks)
outlinks.update(browser_outlinks)
return list(outlinks)
def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None): def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None):
def _on_screenshot(screenshot_png): def _on_screenshot(screenshot_png):

View file

@@ -151,7 +151,15 @@ def _build_youtube_dl(worker, destdir, site):
return super().urlopen(req) return super().urlopen(req)
# def _match_entry(self, info_dict, incomplete): # def _match_entry(self, info_dict, incomplete):
# return super()._match_entry(info_dict, incomplete) # if self.dl_disabled:
# return 'Downloading disabled (probably youtube playlist)'
# def extract_info(self, *args, **kwargs):
# self.dl_disabled = False
# try:
# return super().extract_info(*args, **kwargs)
# finally:
# self.dl_disabled = False
def add_default_extra_info(self, ie_result, ie, url): def add_default_extra_info(self, ie_result, ie, url):
# hook in some logging # hook in some logging
@@ -160,13 +168,19 @@ def _build_youtube_dl(worker, destdir, site):
self.logger.info( self.logger.info(
'extractor %r found playlist in %s', ie.IE_NAME, url) 'extractor %r found playlist in %s', ie.IE_NAME, url)
if ie.IE_NAME == 'youtube:playlist': if ie.IE_NAME == 'youtube:playlist':
# At this point ie_result['entries'] is an iterator that
# will fetch more metadata from youtube to list all the
# videos. We unroll that iterator here partly because
# otherwise `process_ie_result()` will clobber it, and we
# use it later to extract the watch pages as outlinks.
ie_result['entries_no_dl'] = list(ie_result['entries'])
ie_result['entries'] = []
self.logger.info( self.logger.info(
'setting skip_download because this is a youtube ' 'setting skip_download because this is a youtube '
'playlist and we expect to capture videos from ' 'playlist (%s entries) and we expect to capture '
'individual watch pages') 'videos from individual watch pages',
# XXX good enuf? still fetches metadata for each video len(ie_result['entries_no_dl']))
# if we want to not do that, implement self._match_entry() # self.dl_disabled = True
self.params['skip_download'] = True
else: else:
self.logger.info( self.logger.info(
'extractor %r found a video in %s', ie.IE_NAME, url) 'extractor %r found a video in %s', ie.IE_NAME, url)
@@ -334,11 +348,12 @@ def _try_youtube_dl(worker, ydl, site, page):
content_type="application/vnd.youtube-dl_formats+json;charset=utf-8", content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
payload=info_json.encode("utf-8"), payload=info_json.encode("utf-8"),
extra_headers=site.extra_headers()) extra_headers=site.extra_headers())
return ie_result
except brozzler.ShutdownRequested as e: except brozzler.ShutdownRequested as e:
raise raise
except BaseException as e: except Exception as e:
if hasattr(e, "exc_info") and e.exc_info[0] == youtube_dl.utils.UnsupportedError: if hasattr(e, "exc_info") and e.exc_info[0] == youtube_dl.utils.UnsupportedError:
pass return None
elif (hasattr(e, "exc_info") elif (hasattr(e, "exc_info")
and e.exc_info[0] == urllib.error.HTTPError and e.exc_info[0] == urllib.error.HTTPError
and hasattr(e.exc_info[1], "code") and hasattr(e.exc_info[1], "code")
@@ -376,5 +391,9 @@ def do_youtube_dl(worker, site, page):
''' '''
with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir: with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
ydl = _build_youtube_dl(worker, tempdir, site) ydl = _build_youtube_dl(worker, tempdir, site)
_try_youtube_dl(worker, ydl, site, page) ie_result = _try_youtube_dl(worker, ydl, site, page)
return ydl.fetch_spy.fetches outlinks = []
if ie_result['extractor'] == 'youtube:playlist':
outlinks = ['https://www.youtube.com/watch?v=%s' % e['id']
for e in ie_result.get('entries_no_dl', [])]
return ydl.fetch_spy.fetches, outlinks