Merge branch 'ytdlp_last' into qa

This commit is contained in:
Barbara Miller 2024-04-19 13:24:01 -07:00
commit b3ef8d87c2

View File

@@ -291,42 +291,24 @@ class BrozzlerWorker:
return outlinks return outlinks
def _get_page_headers(self, page): def _get_page_headers(self, page):
page.content_type = page.content_length = page.last_modified = None
with requests.get(page.url, stream=True) as r: with requests.get(page.url, stream=True) as r:
content_type_header = content_length_header = last_modified_header = None if "content-type" in r.headers:
if "Content-Type" in r.headers: page.content_type = r.headers["content-type"]
content_type_header = "Content-Type"
elif "content-length" in r.headers:
content_type_header = "content-length"
elif "CONTENT-LENGTH" in r.headers:
content_type_header = "CONTENT-LENGTH"
if content_type_header:
page.content_type = r.headers[content_type_header]
self.logger.info( self.logger.info(
"url %s content_type is %s", page.url, page.content_type "content_type: %s for url %s", page.content_type, page.url
) )
if "Content-Length" in r.headers: if "content-length" in r.headers:
content_length_header = "Content-Length" page.content_length = int(r.headers["content-length"])
elif "content-length" in r.headers:
content_length_header = "content-length"
elif "CONTENT-LENGTH" in r.headers:
content_length_header = "CONTENT-LENGTH"
if content_length_header:
page.content_length = int(r.headers[content_length_header])
self.logger.info( self.logger.info(
"url %s content_length is %s", page.url, page.content_length "content_length: %s for url %s", page.content_length, page.url
) )
if "Last-Modified" in r.headers: if "last-modified" in r.headers:
last_modified_header = "Last-Modified" page.last_modified = r.headers["last-modified"]
elif "Last-Modified" in r.headers:
last_modified_header = "Last-Modified"
elif "LAST-MODIFIED" in r.headers:
last_modified_header = "LAST-MODIFIED"
if last_modified_header:
page.last_modified = r.headers[last_modified_header]
self.logger.info( self.logger.info(
"url %s last_modified is %s", page.url, page.last_modified "last_modified: %s for url %s", page.last_modified, page.url
) )
def _needs_browsing(self, page): def _needs_browsing(self, page):