watch pages as outlinks from youtube-dl playlists

and bypass downloading metadata about individual videos as well as the
videos themselves (for youtube playlists), because even just the
metadata can take many minutes or hours in the case of thousands of videos.
This commit is contained in:
Noah Levitt 2018-10-12 00:41:16 -07:00
parent 9211fb45ec
commit 8f9077fbf3
2 changed files with 39 additions and 15 deletions

View file

@@ -186,9 +186,10 @@ class BrozzlerWorker:
on_request=None, enable_youtube_dl=True): on_request=None, enable_youtube_dl=True):
self.logger.info("brozzling {}".format(page)) self.logger.info("brozzling {}".format(page))
ydl_fetches = None ydl_fetches = None
ydl_outlinks = []
if enable_youtube_dl: if enable_youtube_dl:
try: try:
ydl_fetches = ydl.do_youtube_dl(self, site, page) ydl_fetches, ydl_outlinks = ydl.do_youtube_dl(self, site, page)
except brozzler.ReachedLimit as e: except brozzler.ReachedLimit as e:
raise raise
except brozzler.ShutdownRequested: except brozzler.ShutdownRequested:
@@ -207,18 +208,22 @@ class BrozzlerWorker:
'youtube_dl raised exception on %s', page, 'youtube_dl raised exception on %s', page,
exc_info=True) exc_info=True)
browser_outlinks = []
if self._needs_browsing(page, ydl_fetches): if self._needs_browsing(page, ydl_fetches):
self.logger.info('needs browsing: %s', page) self.logger.info('needs browsing: %s', page)
outlinks = self._browse_page(browser, site, page, on_screenshot, browser_outlinks = self._browse_page(
on_request) browser, site, page, on_screenshot, on_request)
return outlinks
else: else:
if not self._already_fetched(page, ydl_fetches): if not self._already_fetched(page, ydl_fetches):
self.logger.info('needs fetch: %s', page) self.logger.info('needs fetch: %s', page)
self._fetch_url(site, page) self._fetch_url(site, page)
else: else:
self.logger.info('already fetched: %s', page) self.logger.info('already fetched: %s', page)
return []
outlinks = set()
outlinks.update(ydl_outlinks)
outlinks.update(browser_outlinks)
return list(outlinks)
def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None): def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None):
def _on_screenshot(screenshot_png): def _on_screenshot(screenshot_png):

View file

@@ -151,7 +151,15 @@ def _build_youtube_dl(worker, destdir, site):
return super().urlopen(req) return super().urlopen(req)
# def _match_entry(self, info_dict, incomplete): # def _match_entry(self, info_dict, incomplete):
# return super()._match_entry(info_dict, incomplete) # if self.dl_disabled:
# return 'Downloading disabled (probably youtube playlist)'
# def extract_info(self, *args, **kwargs):
# self.dl_disabled = False
# try:
# return super().extract_info(*args, **kwargs)
# finally:
# self.dl_disabled = False
def add_default_extra_info(self, ie_result, ie, url): def add_default_extra_info(self, ie_result, ie, url):
# hook in some logging # hook in some logging
@@ -160,13 +168,19 @@ def _build_youtube_dl(worker, destdir, site):
self.logger.info( self.logger.info(
'extractor %r found playlist in %s', ie.IE_NAME, url) 'extractor %r found playlist in %s', ie.IE_NAME, url)
if ie.IE_NAME == 'youtube:playlist': if ie.IE_NAME == 'youtube:playlist':
# At this point ie_result['entries'] is an iterator that
# will fetch more metadata from youtube to list all the
# videos. We unroll that iterator here partly because
# otherwise `process_ie_result()` will clobber it, and we
# use it later to extract the watch pages as outlinks.
ie_result['entries_no_dl'] = list(ie_result['entries'])
ie_result['entries'] = []
self.logger.info( self.logger.info(
'setting skip_download because this is a youtube ' 'setting skip_download because this is a youtube '
'playlist and we expect to capture videos from ' 'playlist (%s entries) and we expect to capture '
'individual watch pages') 'videos from individual watch pages',
# XXX good enuf? still fetches metadata for each video len(ie_result['entries_no_dl']))
# if we want to not do that, implement self._match_entry() # self.dl_disabled = True
self.params['skip_download'] = True
else: else:
self.logger.info( self.logger.info(
'extractor %r found a video in %s', ie.IE_NAME, url) 'extractor %r found a video in %s', ie.IE_NAME, url)
@@ -334,11 +348,12 @@ def _try_youtube_dl(worker, ydl, site, page):
content_type="application/vnd.youtube-dl_formats+json;charset=utf-8", content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
payload=info_json.encode("utf-8"), payload=info_json.encode("utf-8"),
extra_headers=site.extra_headers()) extra_headers=site.extra_headers())
return ie_result
except brozzler.ShutdownRequested as e: except brozzler.ShutdownRequested as e:
raise raise
except BaseException as e: except Exception as e:
if hasattr(e, "exc_info") and e.exc_info[0] == youtube_dl.utils.UnsupportedError: if hasattr(e, "exc_info") and e.exc_info[0] == youtube_dl.utils.UnsupportedError:
pass return None
elif (hasattr(e, "exc_info") elif (hasattr(e, "exc_info")
and e.exc_info[0] == urllib.error.HTTPError and e.exc_info[0] == urllib.error.HTTPError
and hasattr(e.exc_info[1], "code") and hasattr(e.exc_info[1], "code")
@@ -376,5 +391,9 @@ def do_youtube_dl(worker, site, page):
''' '''
with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir: with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
ydl = _build_youtube_dl(worker, tempdir, site) ydl = _build_youtube_dl(worker, tempdir, site)
_try_youtube_dl(worker, ydl, site, page) ie_result = _try_youtube_dl(worker, ydl, site, page)
return ydl.fetch_spy.fetches outlinks = []
if ie_result['extractor'] == 'youtube:playlist':
outlinks = ['https://www.youtube.com/watch?v=%s' % e['id']
for e in ie_result.get('entries_no_dl', [])]
return ydl.fetch_spy.fetches, outlinks