tidy up some comments and docs

2025-12-10 14:25:34 -05:00 · 2018-10-12 00:48:38 -07:00 · 2018-10-12 00:48:38 -07:00 · 054ba6d7a0
commit 054ba6d7a0
parent 8f9077fbf3
1 changed files with 15 additions and 26 deletions
--- a/brozzler/ydl.py
+++ b/brozzler/ydl.py
@ -1,8 +1,6 @@
 '''
 brozzler/ydl.py - youtube-dl support for brozzler
 This code was extracted from worker.py.
 Copyright (C) 2018 Internet Archive
 Licensed under the Apache License, Version 2.0 (the "License");
@ -128,7 +126,7 @@ def _build_youtube_dl(worker, destdir, site):
    - if brozzling through warcprox and downloading segmented videos (e.g.
      HLS), pushes the stitched-up video created by youtube-dl to warcprox
      using a WARCPROX_WRITE_RECORD request
-    - adds some logging
+    - some logging
    Args:
        worker (brozzler.BrozzlerWorker): the calling brozzler worker
@ -150,17 +148,6 @@ def _build_youtube_dl(worker, destdir, site):
            self.logger.debug('fetching %r', url)
            return super().urlopen(req)
        # def _match_entry(self, info_dict, incomplete):
        #     if self.dl_disabled:
        #         return 'Downloading disabled (probably youtube playlist)'
        # def extract_info(self, *args, **kwargs):
        #     self.dl_disabled = False
        #     try:
        #         return super().extract_info(*args, **kwargs)
        #     finally:
        #         self.dl_disabled = False
        def add_default_extra_info(self, ie_result, ie, url):
            # hook in some logging
            super().add_default_extra_info(ie_result, ie, url)
@ -176,11 +163,10 @@ def _build_youtube_dl(worker, destdir, site):
                    ie_result['entries_no_dl'] = list(ie_result['entries'])
                    ie_result['entries'] = []
                    self.logger.info(
-                            'setting skip_download because this is a youtube '
+                            'not downloading %s videos from this youtube '
-                            'playlist (%s entries) and we expect to capture '
+                            'playlist because we expect to capture them from '
-                            'videos from individual watch pages',
+                            'individual watch pages',
                            len(ie_result['entries_no_dl']))
                    # self.dl_disabled = True
            else:
                self.logger.info(
                        'extractor %r found a video in %s', ie.IE_NAME, url)
@ -380,20 +366,23 @@ def do_youtube_dl(worker, site, page):
        page (brozzler.Page): the page we are brozzling
    Returns:
-        `list` of `dict`: with info about urls fetched:
+        tuple with two entries:
-
+            `list` of `dict`: with info about urls fetched:
-            [{
+                [{
-                'url': ...,
+                    'url': ...,
-                'method': ...,
+                    'method': ...,
-                'response_code': ...,
+                    'response_code': ...,
-                'response_headers': ...,
+                    'response_headers': ...,
-            }, ...]
+                }, ...]
            `list` of `str`: outlink urls
    '''
    with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
        ydl = _build_youtube_dl(worker, tempdir, site)
        ie_result = _try_youtube_dl(worker, ydl, site, page)
        outlinks = []
        if ie_result['extractor'] == 'youtube:playlist':
            # youtube watch pages as outlinks
            outlinks = ['https://www.youtube.com/watch?v=%s' % e['id']
                        for e in ie_result.get('entries_no_dl', [])]
        # any outlinks for other cases?
        return ydl.fetch_spy.fetches, outlinks