mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 08:39:59 -05:00
tests expect outlinks to be a set
This commit is contained in:
parent
054ba6d7a0
commit
7497b7e5ac
@ -186,10 +186,10 @@ class BrozzlerWorker:
|
|||||||
on_request=None, enable_youtube_dl=True):
|
on_request=None, enable_youtube_dl=True):
|
||||||
self.logger.info("brozzling {}".format(page))
|
self.logger.info("brozzling {}".format(page))
|
||||||
ydl_fetches = None
|
ydl_fetches = None
|
||||||
ydl_outlinks = []
|
outlinks = set()
|
||||||
if enable_youtube_dl:
|
if enable_youtube_dl:
|
||||||
try:
|
try:
|
||||||
ydl_fetches, ydl_outlinks = ydl.do_youtube_dl(self, site, page)
|
ydl_fetches, outlinks = ydl.do_youtube_dl(self, site, page)
|
||||||
except brozzler.ReachedLimit as e:
|
except brozzler.ReachedLimit as e:
|
||||||
raise
|
raise
|
||||||
except brozzler.ShutdownRequested:
|
except brozzler.ShutdownRequested:
|
||||||
@ -208,11 +208,11 @@ class BrozzlerWorker:
|
|||||||
'youtube_dl raised exception on %s', page,
|
'youtube_dl raised exception on %s', page,
|
||||||
exc_info=True)
|
exc_info=True)
|
||||||
|
|
||||||
browser_outlinks = []
|
|
||||||
if self._needs_browsing(page, ydl_fetches):
|
if self._needs_browsing(page, ydl_fetches):
|
||||||
self.logger.info('needs browsing: %s', page)
|
self.logger.info('needs browsing: %s', page)
|
||||||
browser_outlinks = self._browse_page(
|
browser_outlinks = self._browse_page(
|
||||||
browser, site, page, on_screenshot, on_request)
|
browser, site, page, on_screenshot, on_request)
|
||||||
|
outlinks.update(browser_outlinks)
|
||||||
else:
|
else:
|
||||||
if not self._already_fetched(page, ydl_fetches):
|
if not self._already_fetched(page, ydl_fetches):
|
||||||
self.logger.info('needs fetch: %s', page)
|
self.logger.info('needs fetch: %s', page)
|
||||||
@ -220,10 +220,7 @@ class BrozzlerWorker:
|
|||||||
else:
|
else:
|
||||||
self.logger.info('already fetched: %s', page)
|
self.logger.info('already fetched: %s', page)
|
||||||
|
|
||||||
outlinks = set()
|
return outlinks
|
||||||
outlinks.update(ydl_outlinks)
|
|
||||||
outlinks.update(browser_outlinks)
|
|
||||||
return list(outlinks)
|
|
||||||
|
|
||||||
def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None):
|
def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None):
|
||||||
def _on_screenshot(screenshot_png):
|
def _on_screenshot(screenshot_png):
|
||||||
|
@ -379,10 +379,10 @@ def do_youtube_dl(worker, site, page):
|
|||||||
with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
|
with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
|
||||||
ydl = _build_youtube_dl(worker, tempdir, site)
|
ydl = _build_youtube_dl(worker, tempdir, site)
|
||||||
ie_result = _try_youtube_dl(worker, ydl, site, page)
|
ie_result = _try_youtube_dl(worker, ydl, site, page)
|
||||||
outlinks = []
|
outlinks = set()
|
||||||
if ie_result['extractor'] == 'youtube:playlist':
|
if ie_result and ie_result.get('extractor') == 'youtube:playlist':
|
||||||
# youtube watch pages as outlinks
|
# youtube watch pages as outlinks
|
||||||
outlinks = ['https://www.youtube.com/watch?v=%s' % e['id']
|
outlinks = {'https://www.youtube.com/watch?v=%s' % e['id']
|
||||||
for e in ie_result.get('entries_no_dl', [])]
|
for e in ie_result.get('entries_no_dl', [])}
|
||||||
# any outlinks for other cases?
|
# any outlinks for other cases?
|
||||||
return ydl.fetch_spy.fetches, outlinks
|
return ydl.fetch_spy.fetches, outlinks
|
||||||
|
@ -2,7 +2,7 @@
|
|||||||
'''
|
'''
|
||||||
test_brozzling.py - XXX explain
|
test_brozzling.py - XXX explain
|
||||||
|
|
||||||
Copyright (C) 2016-2017 Internet Archive
|
Copyright (C) 2016-2018 Internet Archive
|
||||||
|
|
||||||
Licensed under the Apache License, Version 2.0 (the "License");
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
you may not use this file except in compliance with the License.
|
you may not use this file except in compliance with the License.
|
||||||
|
Loading…
x
Reference in New Issue
Block a user