Merge branch 'ytdlp_last' into qa

This commit is contained in:
Barbara Miller 2024-04-18 16:24:10 -07:00
commit 55517fb432
2 changed files with 93 additions and 45 deletions

View File

@ -244,43 +244,96 @@ class BrozzlerWorker:
self.logger.info("brozzling {}".format(page))
outlinks = set()
try:
browser_outlinks = self._browse_page(
browser, site, page, on_screenshot, on_request
)
outlinks.update(browser_outlinks)
except brozzler.PageInterstitialShown:
self.logger.info("page interstitial shown (http auth): %s", page)
self._get_page_headers(page)
if enable_youtube_dl and ydl.should_ytdlp(page, site):
if self._needs_browsing(page):
self.logger.info("needs browsing: %s", page)
try:
ydl_outlinks = ydl.do_youtube_dl(self, site, page)
outlinks.update(ydl_outlinks)
except brozzler.ReachedLimit as e:
raise
except brozzler.ShutdownRequested:
raise
except brozzler.ProxyError:
raise
except Exception as e:
if (
hasattr(e, "exc_info")
and len(e.exc_info) >= 2
and hasattr(e.exc_info[1], "code")
and e.exc_info[1].code == 430
):
self.logger.info(
"youtube-dl got %s %s processing %s",
e.exc_info[1].code,
e.exc_info[1].msg,
page.url,
)
else:
self.logger.error(
"youtube_dl raised exception on %s", page, exc_info=True
)
browser_outlinks = self._browse_page(
browser, site, page, on_screenshot, on_request
)
outlinks.update(browser_outlinks)
page.status_code = browser.websock_thread.page_status
self.logger.info("url %s status code %s", page.url, page.status_code)
except brozzler.PageInterstitialShown:
self.logger.info("page interstitial shown (http auth): %s", page)
if enable_youtube_dl and ydl.should_ytdlp(page):
try:
ydl_outlinks = ydl.do_youtube_dl(self, site, page)
outlinks.update(ydl_outlinks)
except brozzler.ReachedLimit as e:
raise
except brozzler.ShutdownRequested:
raise
except brozzler.ProxyError:
raise
except Exception as e:
if (
hasattr(e, "exc_info")
and len(e.exc_info) >= 2
and hasattr(e.exc_info[1], "code")
and e.exc_info[1].code == 430
):
self.logger.info(
"youtube-dl got %s %s processing %s",
e.exc_info[1].code,
e.exc_info[1].msg,
page.url,
)
else:
self.logger.error(
"youtube_dl raised exception on %s", page, exc_info=True
)
else:
self.logger.info("needs fetch: %s", page)
self._fetch_url(site, page=page)
return outlinks
def _get_page_headers(self, page):
    """Fetch the response headers for ``page.url`` and record them on ``page``.

    Issues a streaming GET and copies the Content-Type, Content-Length and
    Last-Modified response headers, when present, onto ``page.content_type``,
    ``page.content_length`` (converted to ``int``) and ``page.last_modified``.
    Attributes whose header is absent are left untouched.

    Args:
        page: object exposing a ``url`` attribute; receives the header
            values as attributes.
    """
    # NOTE(review): no timeout is passed to requests.get, so an unresponsive
    # server can block this call indefinitely -- consider adding one.
    with requests.get(page.url, stream=True) as r:
        # r.headers is a case-insensitive mapping, so a single lookup per
        # header covers every casing the server might send.
        content_type = r.headers.get("Content-Type")
        if content_type is not None:
            page.content_type = content_type
            self.logger.info(
                "url %s content_type is %s", page.url, page.content_type
            )

        content_length = r.headers.get("Content-Length")
        if content_length is not None:
            try:
                page.content_length = int(content_length)
            except ValueError:
                # A malformed header should not abort processing of the page.
                self.logger.warning(
                    "url %s has unparseable content_length %r",
                    page.url,
                    content_length,
                )
            else:
                self.logger.info(
                    "url %s content_length is %s", page.url, page.content_length
                )

        last_modified = r.headers.get("Last-Modified")
        if last_modified is not None:
            page.last_modified = last_modified
            self.logger.info(
                "url %s last_modified is %s", page.url, page.last_modified
            )
def _needs_browsing(self, page):
if page.content_type and "html" not in page.content_type:
return False
return True
def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None):
def _on_screenshot(screenshot_jpeg):
if on_screenshot:

View File

@ -21,7 +21,6 @@ import yt_dlp
from yt_dlp.utils import match_filter_func
import brozzler
import urllib.request
from urllib.parse import urlparse
import tempfile
import urlcanon
import os
@ -35,17 +34,6 @@ import threading
thread_local = threading.local()
def is_html_maybe(url):
    """Guess from the URL path's file extension whether it may be HTML.

    Returns ``False`` when the extension of the URL's path component starts
    with one of a fixed set of known non-HTML extensions (documents, images,
    video), and ``True`` otherwise, including when the path has no
    extension. Only the path is examined; the query string and fragment are
    ignored. Matching is case-insensitive, so ``.PDF`` is treated like
    ``.pdf``.

    Args:
        url: URL string to inspect.

    Returns:
        bool: ``False`` if the URL looks like a non-HTML resource, else
        ``True``.
    """
    skip_url_exts = ("pdf", "jpg", "jpeg", "png", "gif", "mp4", "mpeg")
    _, ext = os.path.splitext(urlparse(url).path)
    # ext includes the leading dot (or is empty); strip it and normalise case
    # so upper/mixed-case extensions are recognised too.
    return not ext[1:].lower().startswith(skip_url_exts)
def _timestamp4datetime(timestamp):
"""split `timestamp` into a tuple of 6 integers.
@ -63,7 +51,14 @@ def _timestamp4datetime(timestamp):
)
def should_ytdlp(page, site):
if page.status_code != 200:
return False
ytdlp_url = page.redirect_url if page.redirect_url else page.url
if "chrome-error:" in ytdlp_url:
return False
ytdlp_seed = site["metadata"]["ait_seed_id"]
logging.info("checking containing page %s for seed %s", ytdlp_url, ytdlp_seed)