update for m3u8s, better naming

Barbara Miller 2023-09-18 14:51:14 -07:00
parent 75e0555d43
commit eef7173d72


@@ -99,9 +99,7 @@ def _build_youtube_dl(worker, destdir, site, page):
- keeps track of urls fetched using a `YoutubeDLSpy`
- periodically updates `site.last_claimed` in rethinkdb
- if brozzling through warcprox and downloading segmented videos (e.g.
HLS), pushes the stitched-up video created by yt-dlp/ffmpeg to warcprox
using a WARCPROX_WRITE_RECORD request
- pushes captured video to warcprox using a WARCPROX_WRITE_RECORD request
- some logging
Args:
@@ -153,7 +151,7 @@ def _build_youtube_dl(worker, destdir, site, page):
self.logger.info(
'extractor %r found a download in %s', ie.IE_NAME, url)
def _push_stitched_up_vid_to_warcprox(self, site, info_dict, postprocessor):
def _push_video_to_warcprox(self, site, info_dict, postprocessor):
# 220211 update: does yt-dlp supply content-type? no, not as such
# XXX Don't know how to get the right content-type. Youtube-dl
# doesn't supply it. Sometimes (with --hls-prefer-native)
@@ -172,20 +170,21 @@ def _build_youtube_dl(worker, destdir, site, page):
'guessing mimetype %s because %r', mimetype, e)
# youtube watch page postprocessor is MoveFiles
# but current yt-dlp duplicates capture if we handle FixupM3u8!!!
# we'll ignore postprocessor for now...
'''
if postprocessor == 'FixupM3u8':
url = 'youtube-dl:%05d:%s' % (
info_dict.get('playlist_index') or 1,
info_dict['webpage_url'])
else:
'''
url = info_dict.get('url')
url = info_dict.get('url')
# skip urls ending .m3u8, to avoid duplicates handled by FixupM3u*
if url.endswith('.m3u8'):
return
size = os.path.getsize(info_dict['filepath'])
self.logger.info(
'pushing %r video stitched-up as %s (%s bytes) to '
'pushing %r video as %s (%s bytes) to '
'warcprox at %s with url %s', info_dict['format'],
mimetype, size, worker._proxy_for(site), url)
with open(info_dict['filepath'], 'rb') as f:
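(Not part of the patch.) For context on the two decisions in this hunk, a minimal standalone sketch: guessing a content-type when yt-dlp does not supply one, and skipping HLS playlist manifests so the FixupM3u* path is not captured twice. The guessing strategy shown (stdlib mimetypes) and the file/URL values are illustrative assumptions, not the code above.

import mimetypes

def guess_mimetype(filepath):
    # yt-dlp does not report a content-type, so guess from the filename;
    # fall back to a generic type if nothing matches.
    mimetype, _ = mimetypes.guess_type(filepath)
    return mimetype or 'application/octet-stream'

def should_push(url):
    # Skip HLS playlist manifests; pushing a .m3u8 would duplicate captures
    # already produced via the FixupM3u* postprocessing path.
    return bool(url) and not url.endswith('.m3u8')

print(guess_mimetype('/tmp/example.mp4'))               # video/mp4
print(should_push('https://example.com/clip.mp4'))      # True
print(should_push('https://example.com/master.m3u8'))   # False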
@@ -198,7 +197,7 @@ def _build_youtube_dl(worker, destdir, site, page):
warc_type='resource', content_type=mimetype, payload=f,
extra_headers=extra_headers)
# consulted by _remember_videos()
ydl.stitch_ups.append({
ydl.pushed_videos.append({
'url': url,
'response_code': response.code,
'content-type': mimetype,
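(Not part of the patch.) A rough sketch of what the WARCPROX_WRITE_RECORD push above amounts to, sent directly to a warcprox instance with http.client. The proxy address, target URL, payload path, and header names are assumptions for illustration; the real code goes through the worker's helper with warc_type='resource' as shown.

import http.client

with open('/tmp/example.mp4', 'rb') as f:
    payload = f.read()

conn = http.client.HTTPConnection('127.0.0.1', 8000)   # assumed warcprox address
conn.request(
    'WARCPROX_WRITE_RECORD',                 # custom method warcprox accepts
    'https://example.com/media/clip.mp4',    # url the resource record is filed under
    body=payload,
    headers={
        'Content-Type': 'video/mp4',
        'WARC-Type': 'resource',             # mirrors warc_type='resource' above
    })
response = conn.getresponse()
print(response.status, response.reason)
conn.close()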
@@ -225,7 +224,7 @@ def _build_youtube_dl(worker, destdir, site, page):
worker.logger.info('[ydl_postprocess_hook] Finished postprocessing')
worker.logger.info('[ydl_postprocess_hook] postprocessor: {}'.format(d['postprocessor']))
if worker._using_warcprox(site):
_YoutubeDL._push_stitched_up_vid_to_warcprox(_YoutubeDL, site, d['info_dict'], d['postprocessor'])
_YoutubeDL._push_video_to_warcprox(_YoutubeDL, site, d['info_dict'], d['postprocessor'])
# default socket_timeout is 20 -- we hit it often when cluster is busy
ydl_opts = {
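(Not part of the patch.) For orientation, a minimal standalone yt-dlp example wiring a postprocessor hook into ydl_opts alongside a larger socket_timeout, without the worker/site plumbing above. The option names and hook-dict keys are yt-dlp's documented ones; the timeout value and URL are made up.

import yt_dlp

def ydl_postprocess_hook(d):
    # yt-dlp calls postprocessor hooks with 'status' ('started'/'finished'),
    # the postprocessor name (e.g. 'MoveFiles', 'FixupM3u8'), and 'info_dict'.
    if d['status'] == 'finished':
        print('finished %s for %s' % (
            d['postprocessor'], d['info_dict'].get('webpage_url')))

ydl_opts = {
    'postprocessor_hooks': [ydl_postprocess_hook],
    'socket_timeout': 40,   # bumped past the default of 20 for a busy cluster
}
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    ydl.download(['https://example.com/watch?v=abc'])   # made-up URL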
@@ -274,11 +273,11 @@ def _build_youtube_dl(worker, destdir, site, page):
if site.extra_headers():
ydl._opener.add_handler(ExtraHeaderAdder(site.extra_headers(page)))
ydl.fetch_spy = YoutubeDLSpy()
ydl.stitch_ups = []
ydl.pushed_videos = []
ydl._opener.add_handler(ydl.fetch_spy)
return ydl
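(Not part of the patch.) The fetch spy registered above is a urllib opener handler; roughly, a minimal stand-in for YoutubeDLSpy looks like this (field names are illustrative, not necessarily brozzler's):

import urllib.request

class FetchSpy(urllib.request.BaseHandler):
    # Records every http/https response that passes through the opener.
    def __init__(self):
        self.fetches = []

    def http_response(self, request, response):
        self.fetches.append({
            'url': request.full_url,
            'status': response.status,
            'response_headers': dict(response.headers),
        })
        return response   # pass the response through unchanged

    https_response = http_response

opener = urllib.request.build_opener()
spy = FetchSpy()
opener.add_handler(spy)
opener.open('http://example.com/')   # illustrative fetch
print(spy.fetches)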
def _remember_videos(page, fetches, stitch_ups=None):
def _remember_videos(page, fetches, pushed_videos=None):
'''
Saves info about videos captured by yt-dlp in `page.videos`.
'''
@@ -309,14 +308,14 @@ def _remember_videos(page, fetches, stitch_ups=None):
'response_headers']['content-range']
logging.debug('embedded video %s', video)
page.videos.append(video)
for stitch_up in stitch_ups or []:
if stitch_up['content-type'].startswith('video/'):
for pushed_video in pushed_videos or []:
if pushed_video['content-type'].startswith('video/'):
video = {
'blame': 'youtube-dl',
'url': stitch_up['url'],
'response_code': stitch_up['response_code'],
'content-type': stitch_up['content-type'],
'content-length': stitch_up['content-length'],
'url': pushed_video['url'],
'response_code': pushed_video['response_code'],
'content-type': pushed_video['content-type'],
'content-length': pushed_video['content-length'],
}
logging.debug('embedded video %s', video)
page.videos.append(video)
@@ -331,7 +330,7 @@ def _try_youtube_dl(worker, ydl, site, page):
# needs automated test
# and yt-dlp needs sanitize_info for extract_info
ie_result = ydl.sanitize_info(ydl.extract_info(str(urlcanon.whatwg(page.url))))
_remember_videos(page, ydl.fetch_spy.fetches, ydl.stitch_ups)
_remember_videos(page, ydl.fetch_spy.fetches, ydl.pushed_videos)
if worker._using_warcprox(site):
info_json = json.dumps(ie_result, sort_keys=True, indent=4)
logging.info(
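(Not part of the patch.) Closing out, a rough standalone illustration of the canonicalize / extract / sanitize / dump chain in this last hunk, outside the worker context; the URL is made up and only metadata is fetched, no download.

import json
import urlcanon
import yt_dlp

url = str(urlcanon.whatwg('HTTP://Example.COM:80/watch?v=abc'))
with yt_dlp.YoutubeDL({'quiet': True}) as ydl:
    # sanitize_info makes the extract_info result json-serializable
    ie_result = ydl.sanitize_info(ydl.extract_info(url, download=False))
info_json = json.dumps(ie_result, sort_keys=True, indent=4)
print(info_json[:500])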