diff --git a/brozzler/worker.py b/brozzler/worker.py index a37e3f0..cf36210 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -186,9 +186,10 @@ class BrozzlerWorker: on_request=None, enable_youtube_dl=True): self.logger.info("brozzling {}".format(page)) ydl_fetches = None + ydl_outlinks = [] if enable_youtube_dl: try: - ydl_fetches = ydl.do_youtube_dl(self, site, page) + ydl_fetches, ydl_outlinks = ydl.do_youtube_dl(self, site, page) except brozzler.ReachedLimit as e: raise except brozzler.ShutdownRequested: @@ -207,18 +208,22 @@ class BrozzlerWorker: 'youtube_dl raised exception on %s', page, exc_info=True) + browser_outlinks = [] if self._needs_browsing(page, ydl_fetches): self.logger.info('needs browsing: %s', page) - outlinks = self._browse_page(browser, site, page, on_screenshot, - on_request) - return outlinks + browser_outlinks = self._browse_page( + browser, site, page, on_screenshot, on_request) else: if not self._already_fetched(page, ydl_fetches): self.logger.info('needs fetch: %s', page) self._fetch_url(site, page) else: self.logger.info('already fetched: %s', page) - return [] + + outlinks = set() + outlinks.update(ydl_outlinks) + outlinks.update(browser_outlinks) + return list(outlinks) def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None): def _on_screenshot(screenshot_png): diff --git a/brozzler/ydl.py b/brozzler/ydl.py index e0f5a86..cfdf43e 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -151,7 +151,15 @@ def _build_youtube_dl(worker, destdir, site): return super().urlopen(req) # def _match_entry(self, info_dict, incomplete): - # return super()._match_entry(info_dict, incomplete) + # if self.dl_disabled: + # return 'Downloading disabled (probably youtube playlist)' + + # def extract_info(self, *args, **kwargs): + # self.dl_disabled = False + # try: + # return super().extract_info(*args, **kwargs) + # finally: + # self.dl_disabled = False def add_default_extra_info(self, ie_result, ie, url): # hook in some logging @@ -160,13 +168,19 @@ def _build_youtube_dl(worker, destdir, site): self.logger.info( 'extractor %r found playlist in %s', ie.IE_NAME, url) if ie.IE_NAME == 'youtube:playlist': + # At this point ie_result['entries'] is an iterator that + # will fetch more metadata from youtube to list all the + # videos. We unroll that iterator here partly because + # otherwise `process_ie_result()` will clobber it, and we + # use it later to extract the watch pages as outlinks. + ie_result['entries_no_dl'] = list(ie_result['entries']) + ie_result['entries'] = [] self.logger.info( 'setting skip_download because this is a youtube ' - 'playlist and we expect to capture videos from ' - 'individual watch pages') - # XXX good enuf? still fetches metadata for each video - # if we want to not do that, implement self._match_entry() - self.params['skip_download'] = True + 'playlist (%s entries) and we expect to capture ' + 'videos from individual watch pages', + len(ie_result['entries_no_dl'])) + # self.dl_disabled = True else: self.logger.info( 'extractor %r found a video in %s', ie.IE_NAME, url) @@ -334,11 +348,12 @@ def _try_youtube_dl(worker, ydl, site, page): content_type="application/vnd.youtube-dl_formats+json;charset=utf-8", payload=info_json.encode("utf-8"), extra_headers=site.extra_headers()) + return ie_result except brozzler.ShutdownRequested as e: raise - except BaseException as e: + except Exception as e: if hasattr(e, "exc_info") and e.exc_info[0] == youtube_dl.utils.UnsupportedError: - pass + return None elif (hasattr(e, "exc_info") and e.exc_info[0] == urllib.error.HTTPError and hasattr(e.exc_info[1], "code") @@ -376,5 +391,9 @@ def do_youtube_dl(worker, site, page): ''' with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir: ydl = _build_youtube_dl(worker, tempdir, site) - _try_youtube_dl(worker, ydl, site, page) - return ydl.fetch_spy.fetches + ie_result = _try_youtube_dl(worker, ydl, site, page) + outlinks = [] + if ie_result['extractor'] == 'youtube:playlist': + outlinks = ['https://www.youtube.com/watch?v=%s' % e['id'] + for e in ie_result.get('entries_no_dl', [])] + return ydl.fetch_spy.fetches, outlinks