mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 00:29:53 -05:00
Merge pull request #280 from galgeek/ytdlp_last_refined_really
refine April ytdlp_last update
This commit is contained in:
commit
4f36684c7c
@ -246,9 +246,9 @@ class BrozzlerWorker:
|
|||||||
self.logger.info("brozzling {}".format(page))
|
self.logger.info("brozzling {}".format(page))
|
||||||
outlinks = set()
|
outlinks = set()
|
||||||
|
|
||||||
self._get_page_headers(page)
|
page_headers = self._get_page_headers(page)
|
||||||
|
|
||||||
if not self._needs_browsing(page):
|
if not self._needs_browsing(page_headers):
|
||||||
self.logger.info("needs fetch: %s", page)
|
self.logger.info("needs fetch: %s", page)
|
||||||
self._fetch_url(site, page=page)
|
self._fetch_url(site, page=page)
|
||||||
else:
|
else:
|
||||||
@ -258,12 +258,12 @@ class BrozzlerWorker:
|
|||||||
browser, site, page, on_screenshot, on_request
|
browser, site, page, on_screenshot, on_request
|
||||||
)
|
)
|
||||||
outlinks.update(browser_outlinks)
|
outlinks.update(browser_outlinks)
|
||||||
page.status_code = browser.websock_thread.page_status
|
|
||||||
self.logger.info("url %s status code %s", page.url, page.status_code)
|
|
||||||
except brozzler.PageInterstitialShown:
|
except brozzler.PageInterstitialShown:
|
||||||
self.logger.info("page interstitial shown (http auth): %s", page)
|
self.logger.info("page interstitial shown (http auth): %s", page)
|
||||||
|
|
||||||
if enable_youtube_dl and ydl.should_ytdlp(site, page, self._skip_av_seeds):
|
if enable_youtube_dl and ydl.should_ytdlp(
|
||||||
|
site, page, browser.websock_thread.page_status, self._skip_av_seeds
|
||||||
|
):
|
||||||
try:
|
try:
|
||||||
ydl_outlinks = ydl.do_youtube_dl(self, site, page)
|
ydl_outlinks = ydl.do_youtube_dl(self, site, page)
|
||||||
outlinks.update(ydl_outlinks)
|
outlinks.update(ydl_outlinks)
|
||||||
@ -293,30 +293,17 @@ class BrozzlerWorker:
|
|||||||
return outlinks
|
return outlinks
|
||||||
|
|
||||||
def _get_page_headers(self, page):
|
def _get_page_headers(self, page):
|
||||||
page.content_type = page.content_length = page.last_modified = None
|
|
||||||
# bypassing warcprox, requests' stream=True defers downloading the body of the response
|
# bypassing warcprox, requests' stream=True defers downloading the body of the response
|
||||||
# see https://docs.python-requests.org/en/latest/user/advanced/#body-content-workflow
|
# see https://docs.python-requests.org/en/latest/user/advanced/#body-content-workflow
|
||||||
with requests.get(page.url, stream=True) as r:
|
with requests.get(page.url, stream=True) as r:
|
||||||
if "content-type" in r.headers:
|
page_headers = r.headers
|
||||||
page.content_type = r.headers["content-type"]
|
return page_headers
|
||||||
self.logger.info(
|
|
||||||
"content_type: %s for url %s", page.content_type, page.url
|
|
||||||
)
|
|
||||||
|
|
||||||
if "content-length" in r.headers:
|
def _needs_browsing(self, page_headers):
|
||||||
page.content_length = int(r.headers["content-length"])
|
if (
|
||||||
self.logger.info(
|
"content-type" in page_headers
|
||||||
"content_length: %s for url %s", page.content_length, page.url
|
and "html" not in page_headers["content-type"]
|
||||||
)
|
):
|
||||||
|
|
||||||
if "last-modified" in r.headers:
|
|
||||||
page.last_modified = r.headers["last-modified"]
|
|
||||||
self.logger.info(
|
|
||||||
"last_modified: %s for url %s", page.last_modified, page.url
|
|
||||||
)
|
|
||||||
|
|
||||||
def _needs_browsing(self, page):
|
|
||||||
if page.content_type and "html" not in page.content_type:
|
|
||||||
return False
|
return False
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
@ -32,11 +32,11 @@ import threading
|
|||||||
thread_local = threading.local()
|
thread_local = threading.local()
|
||||||
|
|
||||||
|
|
||||||
def should_ytdlp(site, page, skip_av_seeds):
|
def should_ytdlp(site, page, page_status, skip_av_seeds):
|
||||||
# called only after we've passed needs_browsing() check
|
# called only after we've passed needs_browsing() check
|
||||||
|
|
||||||
if page.status_code != 200:
|
if page_status != 200:
|
||||||
logging.info("skipping ytdlp: non-200 page status")
|
logging.info("skipping ytdlp: non-200 page status %s", page_status)
|
||||||
return False
|
return False
|
||||||
if site.skip_ytdlp:
|
if site.skip_ytdlp:
|
||||||
logging.info("skipping ytdlp: site marked skip_ytdlp")
|
logging.info("skipping ytdlp: site marked skip_ytdlp")
|
||||||
|
2
setup.py
2
setup.py
@ -34,7 +34,7 @@ def find_package_data(package):
|
|||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name="brozzler",
|
name="brozzler",
|
||||||
version="1.5.50",
|
version="1.5.51",
|
||||||
description="Distributed web crawling with browsers",
|
description="Distributed web crawling with browsers",
|
||||||
url="https://github.com/internetarchive/brozzler",
|
url="https://github.com/internetarchive/brozzler",
|
||||||
author="Noah Levitt",
|
author="Noah Levitt",
|
||||||
|
Loading…
x
Reference in New Issue
Block a user