From 054ba6d7a038f05d0854cc2d216baed3e9888ff5 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 12 Oct 2018 00:48:38 -0700 Subject: [PATCH] tidy up some comments and docs --- brozzler/ydl.py | 41 +++++++++++++++-------------------------- 1 file changed, 15 insertions(+), 26 deletions(-) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index cfdf43e..f1b57cb 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -1,8 +1,6 @@ ''' brozzler/ydl.py - youtube-dl support for brozzler -This code was extracted from worker.py and - Copyright (C) 2018 Internet Archive Licensed under the Apache License, Version 2.0 (the "License"); @@ -128,7 +126,7 @@ def _build_youtube_dl(worker, destdir, site): - if brozzling through warcprox and downloading segmented videos (e.g. HLS), pushes the stitched-up video created by youtube-dl to warcprox using a WARCPROX_WRITE_RECORD request - - adds some logging + - some logging Args: worker (brozzler.BrozzlerWorker): the calling brozzler worker @@ -150,17 +148,6 @@ def _build_youtube_dl(worker, destdir, site): self.logger.debug('fetching %r', url) return super().urlopen(req) - # def _match_entry(self, info_dict, incomplete): - # if self.dl_disabled: - # return 'Downloading disabled (probably youtube playlist)' - - # def extract_info(self, *args, **kwargs): - # self.dl_disabled = False - # try: - # return super().extract_info(*args, **kwargs) - # finally: - # self.dl_disabled = False - def add_default_extra_info(self, ie_result, ie, url): # hook in some logging super().add_default_extra_info(ie_result, ie, url) @@ -176,11 +163,10 @@ def _build_youtube_dl(worker, destdir, site): ie_result['entries_no_dl'] = list(ie_result['entries']) ie_result['entries'] = [] self.logger.info( - 'setting skip_download because this is a youtube ' - 'playlist (%s entries) and we expect to capture ' - 'videos from individual watch pages', + 'not downoading %s videos from this youtube ' + 'playlist because we expect to capture them from ' + 'individual watch pages', len(ie_result['entries_no_dl'])) - # self.dl_disabled = True else: self.logger.info( 'extractor %r found a video in %s', ie.IE_NAME, url) @@ -380,20 +366,23 @@ def do_youtube_dl(worker, site, page): page (brozzler.Page): the page we are brozzling Returns: - `list` of `dict`: with info about urls fetched: - - [{ - 'url': ..., - 'method': ..., - 'response_code': ..., - 'response_headers': ..., - }, ...] + tuple with two entries: + `list` of `dict`: with info about urls fetched: + [{ + 'url': ..., + 'method': ..., + 'response_code': ..., + 'response_headers': ..., + }, ...] + `list` of `str`: outlink urls ''' with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir: ydl = _build_youtube_dl(worker, tempdir, site) ie_result = _try_youtube_dl(worker, ydl, site, page) outlinks = [] if ie_result['extractor'] == 'youtube:playlist': + # youtube watch pages as outlinks outlinks = ['https://www.youtube.com/watch?v=%s' % e['id'] for e in ie_result.get('entries_no_dl', [])] + # any outlinks for other cases? return ydl.fetch_spy.fetches, outlinks