mirror of https://github.com/internetarchive/brozzler.git (synced 2025-02-24 00:29:53 -05:00)
tests expect outlinks to be a set
This commit is contained in:
parent 054ba6d7a0
commit 7497b7e5ac
@ -186,10 +186,10 @@ class BrozzlerWorker:
|
||||
on_request=None, enable_youtube_dl=True):
|
||||
self.logger.info("brozzling {}".format(page))
|
||||
ydl_fetches = None
|
||||
ydl_outlinks = []
|
||||
outlinks = set()
|
||||
if enable_youtube_dl:
|
||||
try:
|
||||
ydl_fetches, ydl_outlinks = ydl.do_youtube_dl(self, site, page)
|
||||
ydl_fetches, outlinks = ydl.do_youtube_dl(self, site, page)
|
||||
except brozzler.ReachedLimit as e:
|
||||
raise
|
||||
except brozzler.ShutdownRequested:
|
||||
@ -208,11 +208,11 @@ class BrozzlerWorker:
|
||||
'youtube_dl raised exception on %s', page,
|
||||
exc_info=True)
|
||||
|
||||
browser_outlinks = []
|
||||
if self._needs_browsing(page, ydl_fetches):
|
||||
self.logger.info('needs browsing: %s', page)
|
||||
browser_outlinks = self._browse_page(
|
||||
browser, site, page, on_screenshot, on_request)
|
||||
outlinks.update(browser_outlinks)
|
||||
else:
|
||||
if not self._already_fetched(page, ydl_fetches):
|
||||
self.logger.info('needs fetch: %s', page)
|
||||
@ -220,10 +220,7 @@ class BrozzlerWorker:
|
||||
else:
|
||||
self.logger.info('already fetched: %s', page)
|
||||
|
||||
outlinks = set()
|
||||
outlinks.update(ydl_outlinks)
|
||||
outlinks.update(browser_outlinks)
|
||||
return list(outlinks)
|
||||
return outlinks
|
||||
|
||||
def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None):
|
||||
def _on_screenshot(screenshot_png):
|
||||
|
@ -379,10 +379,10 @@ def do_youtube_dl(worker, site, page):
|
||||
with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
|
||||
ydl = _build_youtube_dl(worker, tempdir, site)
|
||||
ie_result = _try_youtube_dl(worker, ydl, site, page)
|
||||
outlinks = []
|
||||
if ie_result['extractor'] == 'youtube:playlist':
|
||||
outlinks = set()
|
||||
if ie_result and ie_result.get('extractor') == 'youtube:playlist':
|
||||
# youtube watch pages as outlinks
|
||||
outlinks = ['https://www.youtube.com/watch?v=%s' % e['id']
|
||||
for e in ie_result.get('entries_no_dl', [])]
|
||||
outlinks = {'https://www.youtube.com/watch?v=%s' % e['id']
|
||||
for e in ie_result.get('entries_no_dl', [])}
|
||||
# any outlinks for other cases?
|
||||
return ydl.fetch_spy.fetches, outlinks
|
||||
|
@ -2,7 +2,7 @@
|
||||
'''
|
||||
test_brozzling.py - XXX explain
|
||||
|
||||
Copyright (C) 2016-2017 Internet Archive
|
||||
Copyright (C) 2016-2018 Internet Archive
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
|
Loading…
x
Reference in New Issue
Block a user