diff --git a/brozzler/worker.py b/brozzler/worker.py index cf36210..4aad2a3 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -186,10 +186,10 @@ class BrozzlerWorker: on_request=None, enable_youtube_dl=True): self.logger.info("brozzling {}".format(page)) ydl_fetches = None - ydl_outlinks = [] + outlinks = set() if enable_youtube_dl: try: - ydl_fetches, ydl_outlinks = ydl.do_youtube_dl(self, site, page) + ydl_fetches, outlinks = ydl.do_youtube_dl(self, site, page) except brozzler.ReachedLimit as e: raise except brozzler.ShutdownRequested: @@ -208,11 +208,11 @@ class BrozzlerWorker: 'youtube_dl raised exception on %s', page, exc_info=True) - browser_outlinks = [] if self._needs_browsing(page, ydl_fetches): self.logger.info('needs browsing: %s', page) browser_outlinks = self._browse_page( browser, site, page, on_screenshot, on_request) + outlinks.update(browser_outlinks) else: if not self._already_fetched(page, ydl_fetches): self.logger.info('needs fetch: %s', page) @@ -220,10 +220,7 @@ class BrozzlerWorker: else: self.logger.info('already fetched: %s', page) - outlinks = set() - outlinks.update(ydl_outlinks) - outlinks.update(browser_outlinks) - return list(outlinks) + return outlinks def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None): def _on_screenshot(screenshot_png): diff --git a/brozzler/ydl.py b/brozzler/ydl.py index f1b57cb..4f9169c 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -379,10 +379,10 @@ def do_youtube_dl(worker, site, page): with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir: ydl = _build_youtube_dl(worker, tempdir, site) ie_result = _try_youtube_dl(worker, ydl, site, page) - outlinks = [] - if ie_result['extractor'] == 'youtube:playlist': + outlinks = set() + if ie_result and ie_result.get('extractor') == 'youtube:playlist': # youtube watch pages as outlinks - outlinks = ['https://www.youtube.com/watch?v=%s' % e['id'] - for e in ie_result.get('entries_no_dl', [])] + outlinks = {'https://www.youtube.com/watch?v=%s' % e['id'] + for e in ie_result.get('entries_no_dl', [])} # any outlinks for other cases? return ydl.fetch_spy.fetches, outlinks diff --git a/tests/test_brozzling.py b/tests/test_brozzling.py index 8c2c9c8..686c8f4 100644 --- a/tests/test_brozzling.py +++ b/tests/test_brozzling.py @@ -2,7 +2,7 @@ ''' test_brozzling.py - XXX explain -Copyright (C) 2016-2017 Internet Archive +Copyright (C) 2016-2018 Internet Archive Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.