mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-25 00:59:52 -05:00
Merge branch 'ytdlp_last' into qa
This commit is contained in:
commit
b3ef8d87c2
@ -291,42 +291,24 @@ class BrozzlerWorker:
|
||||
return outlinks
|
||||
|
||||
def _get_page_headers(self, page):
|
||||
page.content_type = page.content_length = page.last_modified = None
|
||||
with requests.get(page.url, stream=True) as r:
|
||||
content_type_header = content_length_header = last_modified_header = None
|
||||
if "Content-Type" in r.headers:
|
||||
content_type_header = "Content-Type"
|
||||
elif "content-length" in r.headers:
|
||||
content_type_header = "content-length"
|
||||
elif "CONTENT-LENGTH" in r.headers:
|
||||
content_type_header = "CONTENT-LENGTH"
|
||||
if content_type_header:
|
||||
page.content_type = r.headers[content_type_header]
|
||||
if "content-type" in r.headers:
|
||||
page.content_type = r.headers["content-type"]
|
||||
self.logger.info(
|
||||
"url %s content_type is %s", page.url, page.content_type
|
||||
"content_type: %s for url %s", page.content_type, page.url
|
||||
)
|
||||
|
||||
if "Content-Length" in r.headers:
|
||||
content_length_header = "Content-Length"
|
||||
elif "content-length" in r.headers:
|
||||
content_length_header = "content-length"
|
||||
elif "CONTENT-LENGTH" in r.headers:
|
||||
content_length_header = "CONTENT-LENGTH"
|
||||
if content_length_header:
|
||||
page.content_length = int(r.headers[content_length_header])
|
||||
if "content-length" in r.headers:
|
||||
page.content_length = int(r.headers["content-length"])
|
||||
self.logger.info(
|
||||
"url %s content_length is %s", page.url, page.content_length
|
||||
"content_length: %s for url %s", page.content_length, page.url
|
||||
)
|
||||
|
||||
if "Last-Modified" in r.headers:
|
||||
last_modified_header = "Last-Modified"
|
||||
elif "Last-Modified" in r.headers:
|
||||
last_modified_header = "Last-Modified"
|
||||
elif "LAST-MODIFIED" in r.headers:
|
||||
last_modified_header = "LAST-MODIFIED"
|
||||
if last_modified_header:
|
||||
page.last_modified = r.headers[last_modified_header]
|
||||
if "last-modified" in r.headers:
|
||||
page.last_modified = r.headers["last-modified"]
|
||||
self.logger.info(
|
||||
"url %s last_modified is %s", page.url, page.last_modified
|
||||
"last_modified: %s for url %s", page.last_modified, page.url
|
||||
)
|
||||
|
||||
def _needs_browsing(self, page):
|
||||
|
Loading…
x
Reference in New Issue
Block a user