mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-20 23:56:34 -04:00
run ydl after browsing page
This commit is contained in:
parent
a4195e1a83
commit
b965c1fdf6
@ -3,7 +3,7 @@ brozzler/worker.py - BrozzlerWorker brozzles pages from the frontier, meaning
|
||||
it browses them and runs behaviors if appropriate, runs youtube-dl on them,
|
||||
scopes and adds outlinks to the frontier
|
||||
|
||||
Copyright (C) 2014-2018 Internet Archive
|
||||
Copyright (C) 2014-2022 Internet Archive
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
@ -193,6 +193,22 @@ class BrozzlerWorker:
|
||||
self.logger.info("brozzling {}".format(page))
|
||||
ydl_fetches = None
|
||||
outlinks = set()
|
||||
|
||||
if self._needs_browsing(page, ydl_fetches):
|
||||
self.logger.info('needs browsing: %s', page)
|
||||
try:
|
||||
browser_outlinks = self._browse_page(
|
||||
browser, site, page, on_screenshot, on_request)
|
||||
outlinks.update(browser_outlinks)
|
||||
except brozzler.PageInterstitialShown:
|
||||
self.logger.info('page interstitial shown (http auth): %s', page)
|
||||
else:
|
||||
if not self._already_fetched(page, ydl_fetches):
|
||||
self.logger.info('needs fetch: %s', page)
|
||||
self._fetch_url(site, page=page)
|
||||
else:
|
||||
self.logger.info('already fetched: %s', page)
|
||||
|
||||
if enable_youtube_dl:
|
||||
try:
|
||||
ydl_fetches, outlinks = ydl.do_youtube_dl(self, site, page)
|
||||
@ -214,21 +230,6 @@ class BrozzlerWorker:
|
||||
'youtube_dl raised exception on %s', page,
|
||||
exc_info=True)
|
||||
|
||||
if self._needs_browsing(page, ydl_fetches):
|
||||
self.logger.info('needs browsing: %s', page)
|
||||
try:
|
||||
browser_outlinks = self._browse_page(
|
||||
browser, site, page, on_screenshot, on_request)
|
||||
outlinks.update(browser_outlinks)
|
||||
except brozzler.PageInterstitialShown:
|
||||
self.logger.info('page interstitial shown (http auth): %s', page)
|
||||
else:
|
||||
if not self._already_fetched(page, ydl_fetches):
|
||||
self.logger.info('needs fetch: %s', page)
|
||||
self._fetch_url(site, page=page)
|
||||
else:
|
||||
self.logger.info('already fetched: %s', page)
|
||||
|
||||
return outlinks
|
||||
|
||||
def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None):
|
||||
|
Loading…
x
Reference in New Issue
Block a user