Merge branch 'ytdlp_last' into qa

This commit is contained in:
Barbara Miller 2024-04-23 16:18:30 -07:00
commit b09b3b5438
2 changed files with 7 additions and 4 deletions

View File

@@ -247,7 +247,10 @@ class BrozzlerWorker:
self._get_page_headers(page) self._get_page_headers(page)
if self._needs_browsing(page): if not self._needs_browsing(page):
self.logger.info("needs fetch: %s", page)
self._fetch_url(site, page=page)
else:
self.logger.info("needs browsing: %s", page) self.logger.info("needs browsing: %s", page)
try: try:
browser_outlinks = self._browse_page( browser_outlinks = self._browse_page(
@@ -286,13 +289,12 @@ class BrozzlerWorker:
self.logger.error( self.logger.error(
"youtube_dl raised exception on %s", page, exc_info=True "youtube_dl raised exception on %s", page, exc_info=True
) )
else:
self.logger.info("needs fetch: %s", page)
self._fetch_url(site, page=page)
return outlinks return outlinks
def _get_page_headers(self, page): def _get_page_headers(self, page):
page.content_type = page.content_length = page.last_modified = None page.content_type = page.content_length = page.last_modified = None
# bypassing warcprox, requests' stream=True defers downloading the body of the response
# see https://docs.python-requests.org/en/latest/user/advanced/#body-content-workflow
with requests.get(page.url, stream=True) as r: with requests.get(page.url, stream=True) as r:
if "content-type" in r.headers: if "content-type" in r.headers:
page.content_type = r.headers["content-type"] page.content_type = r.headers["content-type"]

View File

@@ -52,6 +52,7 @@ def _timestamp4datetime(timestamp):
) )
def should_ytdlp(page, site): def should_ytdlp(page, site):
# called only after we've passed needs_browsing() check
if page.status_code != 200: if page.status_code != 200:
return False return False