mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-25 00:59:52 -05:00
Merge branch 'ytdlp_last' into qa
This commit is contained in:
commit
b3ef8d87c2
@ -291,42 +291,24 @@ class BrozzlerWorker:
|
|||||||
return outlinks
|
return outlinks
|
||||||
|
|
||||||
def _get_page_headers(self, page):
|
def _get_page_headers(self, page):
|
||||||
|
page.content_type = page.content_length = page.last_modified = None
|
||||||
with requests.get(page.url, stream=True) as r:
|
with requests.get(page.url, stream=True) as r:
|
||||||
content_type_header = content_length_header = last_modified_header = None
|
if "content-type" in r.headers:
|
||||||
if "Content-Type" in r.headers:
|
page.content_type = r.headers["content-type"]
|
||||||
content_type_header = "Content-Type"
|
|
||||||
elif "content-length" in r.headers:
|
|
||||||
content_type_header = "content-length"
|
|
||||||
elif "CONTENT-LENGTH" in r.headers:
|
|
||||||
content_type_header = "CONTENT-LENGTH"
|
|
||||||
if content_type_header:
|
|
||||||
page.content_type = r.headers[content_type_header]
|
|
||||||
self.logger.info(
|
self.logger.info(
|
||||||
"url %s content_type is %s", page.url, page.content_type
|
"content_type: %s for url %s", page.content_type, page.url
|
||||||
)
|
)
|
||||||
|
|
||||||
if "Content-Length" in r.headers:
|
if "content-length" in r.headers:
|
||||||
content_length_header = "Content-Length"
|
page.content_length = int(r.headers["content-length"])
|
||||||
elif "content-length" in r.headers:
|
|
||||||
content_length_header = "content-length"
|
|
||||||
elif "CONTENT-LENGTH" in r.headers:
|
|
||||||
content_length_header = "CONTENT-LENGTH"
|
|
||||||
if content_length_header:
|
|
||||||
page.content_length = int(r.headers[content_length_header])
|
|
||||||
self.logger.info(
|
self.logger.info(
|
||||||
"url %s content_length is %s", page.url, page.content_length
|
"content_length: %s for url %s", page.content_length, page.url
|
||||||
)
|
)
|
||||||
|
|
||||||
if "Last-Modified" in r.headers:
|
if "last-modified" in r.headers:
|
||||||
last_modified_header = "Last-Modified"
|
page.last_modified = r.headers["last-modified"]
|
||||||
elif "Last-Modified" in r.headers:
|
|
||||||
last_modified_header = "Last-Modified"
|
|
||||||
elif "LAST-MODIFIED" in r.headers:
|
|
||||||
last_modified_header = "LAST-MODIFIED"
|
|
||||||
if last_modified_header:
|
|
||||||
page.last_modified = r.headers[last_modified_header]
|
|
||||||
self.logger.info(
|
self.logger.info(
|
||||||
"url %s last_modified is %s", page.url, page.last_modified
|
"last_modified: %s for url %s", page.last_modified, page.url
|
||||||
)
|
)
|
||||||
|
|
||||||
def _needs_browsing(self, page):
|
def _needs_browsing(self, page):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user