mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-08-01 02:56:10 -04:00
if not self._needs_browsing; _get_page_headers comment
This commit is contained in:
parent
7764c3f6d7
commit
2d183c7d0c
1 changed file with 6 additions and 4 deletions
|
@@ -246,7 +246,10 @@ class BrozzlerWorker:
|
|||
|
||||
self._get_page_headers(page)
|
||||
|
||||
if self._needs_browsing(page):
|
||||
if not self._needs_browsing(page):
|
||||
self.logger.info("needs fetch: %s", page)
|
||||
self._fetch_url(site, page=page)
|
||||
else:
|
||||
self.logger.info("needs browsing: %s", page)
|
||||
try:
|
||||
browser_outlinks = self._browse_page(
|
||||
|
@@ -285,13 +288,12 @@ class BrozzlerWorker:
|
|||
self.logger.error(
|
||||
"youtube_dl raised exception on %s", page, exc_info=True
|
||||
)
|
||||
else:
|
||||
self.logger.info("needs fetch: %s", page)
|
||||
self._fetch_url(site, page=page)
|
||||
return outlinks
|
||||
|
||||
def _get_page_headers(self, page):
|
||||
page.content_type = page.content_length = page.last_modified = None
|
||||
# bypassing warcprox, requests' stream=True defers downloading the body of the response
|
||||
# see https://docs.python-requests.org/en/latest/user/advanced/#body-content-workflow
|
||||
with requests.get(page.url, stream=True) as r:
|
||||
if "content-type" in r.headers:
|
||||
page.content_type = r.headers["content-type"]
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue