mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 16:49:56 -05:00
WT-2950 exclude video file types if site has disabled video capture
This commit is contained in:
parent
c3a92b102f
commit
c722549697
@ -248,7 +248,10 @@ class BrozzlerWorker:
|
|||||||
|
|
||||||
if not self._needs_browsing(page_headers):
|
if not self._needs_browsing(page_headers):
|
||||||
self.logger.info("needs fetch: %s", page)
|
self.logger.info("needs fetch: %s", page)
|
||||||
self._fetch_url(site, page=page)
|
if self._is_video_type(page_headers) and site.video_capture != "ENABLE_VIDEO_CAPTURE":
|
||||||
|
self.logger.info("skipping video content: video capture disabled for site")
|
||||||
|
else:
|
||||||
|
self._fetch_url(site, page=page)
|
||||||
else:
|
else:
|
||||||
self.logger.info("needs browsing: %s", page)
|
self.logger.info("needs browsing: %s", page)
|
||||||
try:
|
try:
|
||||||
@ -301,13 +304,11 @@ class BrozzlerWorker:
|
|||||||
self.logger.warning("Failed to get headers for %s: %s", page.url, e)
|
self.logger.warning("Failed to get headers for %s: %s", page.url, e)
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
def _needs_browsing(self, page_headers):
|
def _needs_browsing(self, page_headers) -> bool:
|
||||||
if (
|
return not bool("content-type" in page_headers and "html" not in page_headers["content-type"])
|
||||||
"content-type" in page_headers
|
|
||||||
and "html" not in page_headers["content-type"]
|
def _is_video_type(self, page_headers) -> bool:
|
||||||
):
|
return ("content-type" in page_headers and "video" in page_headers["content-type"])
|
||||||
return False
|
|
||||||
return True
|
|
||||||
|
|
||||||
def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None):
|
def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None):
|
||||||
def _on_screenshot(screenshot_jpeg):
|
def _on_screenshot(screenshot_jpeg):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user