From e5536182dc8055c6b6352b184a15b508f32e1388 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 11 Oct 2018 23:28:34 -0700 Subject: [PATCH 1/6] use a thread-local callback in monkey-patched finish_frag_download, instead of locking around monkey-patching, to allow different threads to youtube-dl concurrently, but still not interfere with each other --- brozzler/ydl.py | 50 ++++++++++++++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 19 deletions(-) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index d67856c..4c16e17 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -30,7 +30,21 @@ import doublethink import datetime import threading -global_ydl_lock = threading.Lock() +thread_local = threading.local() +_orig__finish_frag_download = youtube_dl.downloader.fragment.FragmentFD._finish_frag_download +def _finish_frag_download(ffd_self, ctx): + ''' + We monkey-patch this youtube-dl internal method `_finish_frag_download()` + because it gets called after downloading the last segment of a segmented + video, which is a good time to upload the stitched-up video that youtube-dl + creates for us to warcprox. We have it call a thread-local callback + since different threads may be youtube-dl'ing at the same time. + ''' + result = _orig__finish_frag_download(ffd_self, ctx) + if hasattr(thread_local, 'finish_frag_download_callback'): + thread_local.finish_frag_download_callback(ffd_self, ctx) + return result +youtube_dl.downloader.fragment.FragmentFD._finish_frag_download = _finish_frag_download _orig_webpage_read_content = youtube_dl.extractor.generic.GenericIE._webpage_read_content def _webpage_read_content(self, *args, **kwargs): @@ -111,9 +125,10 @@ def _build_youtube_dl(worker, destdir, site): - keeps track of urls fetched using a `YoutubeDLSpy` - periodically updates `site.last_claimed` in rethinkdb - - if brozzling through warcprox and downloading fragmented (DASH) videos, - pushes the stitched together video to warcprox using a - WARCPROX_WRITE_RECORD request + - if brozzling through warcprox and downloading segmented videos (e.g. + HLS), pushes the stitched-up video created by youtube-dl to warcprox + using a WARCPROX_WRITE_RECORD request + - adds some logging Args: worker (brozzler.BrozzlerWorker): the calling brozzler worker @@ -199,21 +214,18 @@ def _build_youtube_dl(worker, destdir, site): }) def process_info(self, info_dict): - # lock this section to prevent race condition between threads that - # want to monkey patch _finish_frag_download() at the same time - with global_ydl_lock: - _orig__finish_frag_download = youtube_dl.downloader.fragment.FragmentFD._finish_frag_download - - def _finish_frag_download(ffd_self, ctx): - _orig__finish_frag_download(ffd_self, ctx) - if worker._using_warcprox(site): - self._push_stitched_up_vid_to_warcprox(site, info_dict, ctx) - - try: - youtube_dl.downloader.fragment.FragmentFD._finish_frag_download = _finish_frag_download - return super().process_info(info_dict) - finally: - youtube_dl.downloader.fragment.FragmentFD._finish_frag_download = _orig__finish_frag_download + ''' + See comment above on `_finish_frag_download()` + ''' + def ffd_callback(ffd_self, ctx): + logging.info('%s') + if worker._using_warcprox(site): + self._push_stitched_up_vid_to_warcprox(site, info_dict, ctx) + try: + thread_local.finish_frag_download_callback = ffd_callback + return super().process_info(info_dict) + finally: + delattr(thread_local, 'finish_frag_download_callback') def maybe_heartbeat_site_last_claimed(*args, **kwargs): # in case youtube-dl takes a long time, heartbeat site.last_claimed From 9211fb45ec90c49489e51c0a64e6f9ac1177e9b0 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 12 Oct 2018 00:39:37 -0700 Subject: [PATCH 2/6] silence youtube-dl's logging, use only our own because youtube-dl's can be annoyingly verbose, confusing, doesn't tell us the things we're interested in, and doesn't tell us where the messages originate --- brozzler/ydl.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 4c16e17..e0f5a86 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -244,19 +244,23 @@ def _build_youtube_dl(worker, destdir, site): ydl_opts = { "outtmpl": "{}/ydl%(autonumber)s.out".format(destdir), - "verbose": False, "retries": 1, - "logger": logging.getLogger("youtube_dl"), "nocheckcertificate": True, "hls_prefer_native": True, "noprogress": True, "nopart": True, "no_color": True, "progress_hooks": [maybe_heartbeat_site_last_claimed], + # https://github.com/rg3/youtube-dl/blob/master/README.md#format-selection # "best: Select the best quality format represented by a single # file with video and audio." "format": "best/bestvideo+bestaudio", + + ### we do our own logging + # "logger": logging.getLogger("youtube_dl"), + "verbose": False, + "quiet": True, } if worker._proxy_for(site): ydl_opts["proxy"] = "http://{}".format(worker._proxy_for(site)) From 8f9077fbf34c90d60c545fe97bbb9b5014f39364 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 12 Oct 2018 00:41:16 -0700 Subject: [PATCH 3/6] watch pages as outlinks from youtube-dl playlists and bypass downloading metadata about individual videos as well as the videos themselves (for youtube playlists), because even just the metadata can take many minutes or hours in case of thousands of videos --- brozzler/worker.py | 15 ++++++++++----- brozzler/ydl.py | 39 +++++++++++++++++++++++++++++---------- 2 files changed, 39 insertions(+), 15 deletions(-) diff --git a/brozzler/worker.py b/brozzler/worker.py index a37e3f0..cf36210 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -186,9 +186,10 @@ class BrozzlerWorker: on_request=None, enable_youtube_dl=True): self.logger.info("brozzling {}".format(page)) ydl_fetches = None + ydl_outlinks = [] if enable_youtube_dl: try: - ydl_fetches = ydl.do_youtube_dl(self, site, page) + ydl_fetches, ydl_outlinks = ydl.do_youtube_dl(self, site, page) except brozzler.ReachedLimit as e: raise except brozzler.ShutdownRequested: @@ -207,18 +208,22 @@ class BrozzlerWorker: 'youtube_dl raised exception on %s', page, exc_info=True) + browser_outlinks = [] if self._needs_browsing(page, ydl_fetches): self.logger.info('needs browsing: %s', page) - outlinks = self._browse_page(browser, site, page, on_screenshot, - on_request) - return outlinks + browser_outlinks = self._browse_page( + browser, site, page, on_screenshot, on_request) else: if not self._already_fetched(page, ydl_fetches): self.logger.info('needs fetch: %s', page) self._fetch_url(site, page) else: self.logger.info('already fetched: %s', page) - return [] + + outlinks = set() + outlinks.update(ydl_outlinks) + outlinks.update(browser_outlinks) + return list(outlinks) def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None): def _on_screenshot(screenshot_png): diff --git a/brozzler/ydl.py b/brozzler/ydl.py index e0f5a86..cfdf43e 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -151,7 +151,15 @@ def _build_youtube_dl(worker, destdir, site): return super().urlopen(req) # def _match_entry(self, info_dict, incomplete): - # return super()._match_entry(info_dict, incomplete) + # if self.dl_disabled: + # return 'Downloading disabled (probably youtube playlist)' + + # def extract_info(self, *args, **kwargs): + # self.dl_disabled = False + # try: + # return super().extract_info(*args, **kwargs) + # finally: + # self.dl_disabled = False def add_default_extra_info(self, ie_result, ie, url): # hook in some logging @@ -160,13 +168,19 @@ def _build_youtube_dl(worker, destdir, site): self.logger.info( 'extractor %r found playlist in %s', ie.IE_NAME, url) if ie.IE_NAME == 'youtube:playlist': + # At this point ie_result['entries'] is an iterator that + # will fetch more metadata from youtube to list all the + # videos. We unroll that iterator here partly because + # otherwise `process_ie_result()` will clobber it, and we + # use it later to extract the watch pages as outlinks. + ie_result['entries_no_dl'] = list(ie_result['entries']) + ie_result['entries'] = [] self.logger.info( 'setting skip_download because this is a youtube ' - 'playlist and we expect to capture videos from ' - 'individual watch pages') - # XXX good enuf? still fetches metadata for each video - # if we want to not do that, implement self._match_entry() - self.params['skip_download'] = True + 'playlist (%s entries) and we expect to capture ' + 'videos from individual watch pages', + len(ie_result['entries_no_dl'])) + # self.dl_disabled = True else: self.logger.info( 'extractor %r found a video in %s', ie.IE_NAME, url) @@ -334,11 +348,12 @@ def _try_youtube_dl(worker, ydl, site, page): content_type="application/vnd.youtube-dl_formats+json;charset=utf-8", payload=info_json.encode("utf-8"), extra_headers=site.extra_headers()) + return ie_result except brozzler.ShutdownRequested as e: raise - except BaseException as e: + except Exception as e: if hasattr(e, "exc_info") and e.exc_info[0] == youtube_dl.utils.UnsupportedError: - pass + return None elif (hasattr(e, "exc_info") and e.exc_info[0] == urllib.error.HTTPError and hasattr(e.exc_info[1], "code") @@ -376,5 +391,9 @@ def do_youtube_dl(worker, site, page): ''' with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir: ydl = _build_youtube_dl(worker, tempdir, site) - _try_youtube_dl(worker, ydl, site, page) - return ydl.fetch_spy.fetches + ie_result = _try_youtube_dl(worker, ydl, site, page) + outlinks = [] + if ie_result['extractor'] == 'youtube:playlist': + outlinks = ['https://www.youtube.com/watch?v=%s' % e['id'] + for e in ie_result.get('entries_no_dl', [])] + return ydl.fetch_spy.fetches, outlinks From 054ba6d7a038f05d0854cc2d216baed3e9888ff5 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 12 Oct 2018 00:48:38 -0700 Subject: [PATCH 4/6] tidy up some comments and docs --- brozzler/ydl.py | 41 +++++++++++++++-------------------------- 1 file changed, 15 insertions(+), 26 deletions(-) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index cfdf43e..f1b57cb 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -1,8 +1,6 @@ ''' brozzler/ydl.py - youtube-dl support for brozzler -This code was extracted from worker.py and - Copyright (C) 2018 Internet Archive Licensed under the Apache License, Version 2.0 (the "License"); @@ -128,7 +126,7 @@ def _build_youtube_dl(worker, destdir, site): - if brozzling through warcprox and downloading segmented videos (e.g. HLS), pushes the stitched-up video created by youtube-dl to warcprox using a WARCPROX_WRITE_RECORD request - - adds some logging + - some logging Args: worker (brozzler.BrozzlerWorker): the calling brozzler worker @@ -150,17 +148,6 @@ def _build_youtube_dl(worker, destdir, site): self.logger.debug('fetching %r', url) return super().urlopen(req) - # def _match_entry(self, info_dict, incomplete): - # if self.dl_disabled: - # return 'Downloading disabled (probably youtube playlist)' - - # def extract_info(self, *args, **kwargs): - # self.dl_disabled = False - # try: - # return super().extract_info(*args, **kwargs) - # finally: - # self.dl_disabled = False - def add_default_extra_info(self, ie_result, ie, url): # hook in some logging super().add_default_extra_info(ie_result, ie, url) @@ -176,11 +163,10 @@ def _build_youtube_dl(worker, destdir, site): ie_result['entries_no_dl'] = list(ie_result['entries']) ie_result['entries'] = [] self.logger.info( - 'setting skip_download because this is a youtube ' - 'playlist (%s entries) and we expect to capture ' - 'videos from individual watch pages', + 'not downoading %s videos from this youtube ' + 'playlist because we expect to capture them from ' + 'individual watch pages', len(ie_result['entries_no_dl'])) - # self.dl_disabled = True else: self.logger.info( 'extractor %r found a video in %s', ie.IE_NAME, url) @@ -380,20 +366,23 @@ def do_youtube_dl(worker, site, page): page (brozzler.Page): the page we are brozzling Returns: - `list` of `dict`: with info about urls fetched: - - [{ - 'url': ..., - 'method': ..., - 'response_code': ..., - 'response_headers': ..., - }, ...] + tuple with two entries: + `list` of `dict`: with info about urls fetched: + [{ + 'url': ..., + 'method': ..., + 'response_code': ..., + 'response_headers': ..., + }, ...] + `list` of `str`: outlink urls ''' with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir: ydl = _build_youtube_dl(worker, tempdir, site) ie_result = _try_youtube_dl(worker, ydl, site, page) outlinks = [] if ie_result['extractor'] == 'youtube:playlist': + # youtube watch pages as outlinks outlinks = ['https://www.youtube.com/watch?v=%s' % e['id'] for e in ie_result.get('entries_no_dl', [])] + # any outlinks for other cases? return ydl.fetch_spy.fetches, outlinks From 7497b7e5ac8b1f2ca159e7fd4ddaaed64464c857 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 12 Oct 2018 11:03:54 -0700 Subject: [PATCH 5/6] tests expect outlinks to be a set --- brozzler/worker.py | 11 ++++------- brozzler/ydl.py | 8 ++++---- tests/test_brozzling.py | 2 +- 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/brozzler/worker.py b/brozzler/worker.py index cf36210..4aad2a3 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -186,10 +186,10 @@ class BrozzlerWorker: on_request=None, enable_youtube_dl=True): self.logger.info("brozzling {}".format(page)) ydl_fetches = None - ydl_outlinks = [] + outlinks = set() if enable_youtube_dl: try: - ydl_fetches, ydl_outlinks = ydl.do_youtube_dl(self, site, page) + ydl_fetches, outlinks = ydl.do_youtube_dl(self, site, page) except brozzler.ReachedLimit as e: raise except brozzler.ShutdownRequested: @@ -208,11 +208,11 @@ class BrozzlerWorker: 'youtube_dl raised exception on %s', page, exc_info=True) - browser_outlinks = [] if self._needs_browsing(page, ydl_fetches): self.logger.info('needs browsing: %s', page) browser_outlinks = self._browse_page( browser, site, page, on_screenshot, on_request) + outlinks.update(browser_outlinks) else: if not self._already_fetched(page, ydl_fetches): self.logger.info('needs fetch: %s', page) @@ -220,10 +220,7 @@ class BrozzlerWorker: else: self.logger.info('already fetched: %s', page) - outlinks = set() - outlinks.update(ydl_outlinks) - outlinks.update(browser_outlinks) - return list(outlinks) + return outlinks def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None): def _on_screenshot(screenshot_png): diff --git a/brozzler/ydl.py b/brozzler/ydl.py index f1b57cb..4f9169c 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -379,10 +379,10 @@ def do_youtube_dl(worker, site, page): with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir: ydl = _build_youtube_dl(worker, tempdir, site) ie_result = _try_youtube_dl(worker, ydl, site, page) - outlinks = [] - if ie_result['extractor'] == 'youtube:playlist': + outlinks = set() + if ie_result and ie_result.get('extractor') == 'youtube:playlist': # youtube watch pages as outlinks - outlinks = ['https://www.youtube.com/watch?v=%s' % e['id'] - for e in ie_result.get('entries_no_dl', [])] + outlinks = {'https://www.youtube.com/watch?v=%s' % e['id'] + for e in ie_result.get('entries_no_dl', [])} # any outlinks for other cases? return ydl.fetch_spy.fetches, outlinks diff --git a/tests/test_brozzling.py b/tests/test_brozzling.py index 8c2c9c8..686c8f4 100644 --- a/tests/test_brozzling.py +++ b/tests/test_brozzling.py @@ -2,7 +2,7 @@ ''' test_brozzling.py - XXX explain -Copyright (C) 2016-2017 Internet Archive +Copyright (C) 2016-2018 Internet Archive Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. From 65fad5e8bf32e2d8080097e2c78a64f579a173a1 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 12 Oct 2018 11:35:47 -0700 Subject: [PATCH 6/6] remove stray bad logging line --- brozzler/ydl.py | 1 - 1 file changed, 1 deletion(-) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 4f9169c..57550e5 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -218,7 +218,6 @@ def _build_youtube_dl(worker, destdir, site): See comment above on `_finish_frag_download()` ''' def ffd_callback(ffd_self, ctx): - logging.info('%s') if worker._using_warcprox(site): self._push_stitched_up_vid_to_warcprox(site, info_dict, ctx) try: