mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-19 23:35:54 -04:00
watch pages as outlinks from youtube-dl playlists
and bypass downloading metadata about individual videos as well as the videos themselves (for YouTube playlists), because even just the metadata can take many minutes or hours in the case of playlists with thousands of videos
This commit is contained in:
parent
9211fb45ec
commit
8f9077fbf3
@ -186,9 +186,10 @@ class BrozzlerWorker:
|
||||
on_request=None, enable_youtube_dl=True):
|
||||
self.logger.info("brozzling {}".format(page))
|
||||
ydl_fetches = None
|
||||
ydl_outlinks = []
|
||||
if enable_youtube_dl:
|
||||
try:
|
||||
ydl_fetches = ydl.do_youtube_dl(self, site, page)
|
||||
ydl_fetches, ydl_outlinks = ydl.do_youtube_dl(self, site, page)
|
||||
except brozzler.ReachedLimit as e:
|
||||
raise
|
||||
except brozzler.ShutdownRequested:
|
||||
@ -207,18 +208,22 @@ class BrozzlerWorker:
|
||||
'youtube_dl raised exception on %s', page,
|
||||
exc_info=True)
|
||||
|
||||
browser_outlinks = []
|
||||
if self._needs_browsing(page, ydl_fetches):
|
||||
self.logger.info('needs browsing: %s', page)
|
||||
outlinks = self._browse_page(browser, site, page, on_screenshot,
|
||||
on_request)
|
||||
return outlinks
|
||||
browser_outlinks = self._browse_page(
|
||||
browser, site, page, on_screenshot, on_request)
|
||||
else:
|
||||
if not self._already_fetched(page, ydl_fetches):
|
||||
self.logger.info('needs fetch: %s', page)
|
||||
self._fetch_url(site, page)
|
||||
else:
|
||||
self.logger.info('already fetched: %s', page)
|
||||
return []
|
||||
|
||||
outlinks = set()
|
||||
outlinks.update(ydl_outlinks)
|
||||
outlinks.update(browser_outlinks)
|
||||
return list(outlinks)
|
||||
|
||||
def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None):
|
||||
def _on_screenshot(screenshot_png):
|
||||
|
@ -151,7 +151,15 @@ def _build_youtube_dl(worker, destdir, site):
|
||||
return super().urlopen(req)
|
||||
|
||||
# def _match_entry(self, info_dict, incomplete):
|
||||
# return super()._match_entry(info_dict, incomplete)
|
||||
# if self.dl_disabled:
|
||||
# return 'Downloading disabled (probably youtube playlist)'
|
||||
|
||||
# def extract_info(self, *args, **kwargs):
|
||||
# self.dl_disabled = False
|
||||
# try:
|
||||
# return super().extract_info(*args, **kwargs)
|
||||
# finally:
|
||||
# self.dl_disabled = False
|
||||
|
||||
def add_default_extra_info(self, ie_result, ie, url):
|
||||
# hook in some logging
|
||||
@ -160,13 +168,19 @@ def _build_youtube_dl(worker, destdir, site):
|
||||
self.logger.info(
|
||||
'extractor %r found playlist in %s', ie.IE_NAME, url)
|
||||
if ie.IE_NAME == 'youtube:playlist':
|
||||
# At this point ie_result['entries'] is an iterator that
|
||||
# will fetch more metadata from youtube to list all the
|
||||
# videos. We unroll that iterator here partly because
|
||||
# otherwise `process_ie_result()` will clobber it, and we
|
||||
# use it later to extract the watch pages as outlinks.
|
||||
ie_result['entries_no_dl'] = list(ie_result['entries'])
|
||||
ie_result['entries'] = []
|
||||
self.logger.info(
|
||||
'setting skip_download because this is a youtube '
|
||||
'playlist and we expect to capture videos from '
|
||||
'individual watch pages')
|
||||
# XXX good enuf? still fetches metadata for each video
|
||||
# if we want to not do that, implement self._match_entry()
|
||||
self.params['skip_download'] = True
|
||||
'playlist (%s entries) and we expect to capture '
|
||||
'videos from individual watch pages',
|
||||
len(ie_result['entries_no_dl']))
|
||||
# self.dl_disabled = True
|
||||
else:
|
||||
self.logger.info(
|
||||
'extractor %r found a video in %s', ie.IE_NAME, url)
|
||||
@ -334,11 +348,12 @@ def _try_youtube_dl(worker, ydl, site, page):
|
||||
content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
|
||||
payload=info_json.encode("utf-8"),
|
||||
extra_headers=site.extra_headers())
|
||||
return ie_result
|
||||
except brozzler.ShutdownRequested as e:
|
||||
raise
|
||||
except BaseException as e:
|
||||
except Exception as e:
|
||||
if hasattr(e, "exc_info") and e.exc_info[0] == youtube_dl.utils.UnsupportedError:
|
||||
pass
|
||||
return None
|
||||
elif (hasattr(e, "exc_info")
|
||||
and e.exc_info[0] == urllib.error.HTTPError
|
||||
and hasattr(e.exc_info[1], "code")
|
||||
@ -376,5 +391,9 @@ def do_youtube_dl(worker, site, page):
|
||||
'''
|
||||
with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
|
||||
ydl = _build_youtube_dl(worker, tempdir, site)
|
||||
_try_youtube_dl(worker, ydl, site, page)
|
||||
return ydl.fetch_spy.fetches
|
||||
ie_result = _try_youtube_dl(worker, ydl, site, page)
|
||||
outlinks = []
|
||||
if ie_result['extractor'] == 'youtube:playlist':
|
||||
outlinks = ['https://www.youtube.com/watch?v=%s' % e['id']
|
||||
for e in ie_result.get('entries_no_dl', [])]
|
||||
return ydl.fetch_spy.fetches, outlinks
|
||||
|
Loading…
x
Reference in New Issue
Block a user