mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 16:49:56 -05:00
Merge branch 'ytdlp_last' into qa
This commit is contained in:
commit
b09b3b5438
@ -247,7 +247,10 @@ class BrozzlerWorker:
|
|||||||
|
|
||||||
self._get_page_headers(page)
|
self._get_page_headers(page)
|
||||||
|
|
||||||
if self._needs_browsing(page):
|
if not self._needs_browsing(page):
|
||||||
|
self.logger.info("needs fetch: %s", page)
|
||||||
|
self._fetch_url(site, page=page)
|
||||||
|
else:
|
||||||
self.logger.info("needs browsing: %s", page)
|
self.logger.info("needs browsing: %s", page)
|
||||||
try:
|
try:
|
||||||
browser_outlinks = self._browse_page(
|
browser_outlinks = self._browse_page(
|
||||||
@ -286,13 +289,12 @@ class BrozzlerWorker:
|
|||||||
self.logger.error(
|
self.logger.error(
|
||||||
"youtube_dl raised exception on %s", page, exc_info=True
|
"youtube_dl raised exception on %s", page, exc_info=True
|
||||||
)
|
)
|
||||||
else:
|
|
||||||
self.logger.info("needs fetch: %s", page)
|
|
||||||
self._fetch_url(site, page=page)
|
|
||||||
return outlinks
|
return outlinks
|
||||||
|
|
||||||
def _get_page_headers(self, page):
|
def _get_page_headers(self, page):
|
||||||
page.content_type = page.content_length = page.last_modified = None
|
page.content_type = page.content_length = page.last_modified = None
|
||||||
|
# bypassing warcprox, requests' stream=True defers downloading the body of the response
|
||||||
|
# see https://docs.python-requests.org/en/latest/user/advanced/#body-content-workflow
|
||||||
with requests.get(page.url, stream=True) as r:
|
with requests.get(page.url, stream=True) as r:
|
||||||
if "content-type" in r.headers:
|
if "content-type" in r.headers:
|
||||||
page.content_type = r.headers["content-type"]
|
page.content_type = r.headers["content-type"]
|
||||||
|
@ -52,6 +52,7 @@ def _timestamp4datetime(timestamp):
|
|||||||
)
|
)
|
||||||
|
|
||||||
def should_ytdlp(page, site):
|
def should_ytdlp(page, site):
|
||||||
|
# called only after we've passed needs_browsing() check
|
||||||
if page.status_code != 200:
|
if page.status_code != 200:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user