From 7764c3f6d71b3a517c641594dd88f70b8d9c649f Mon Sep 17 00:00:00 2001
From: Barbara Miller
Date: Tue, 23 Apr 2024 16:00:18 -0700
Subject: [PATCH 1/2] add comment

---
NOTE(review): line breaks and indentation in this series were reconstructed
from a whitespace-collapsed copy. Hunk line counts match the @@ headers, but
verify the context-line indentation against brozzler/ydl.py before applying.

 brozzler/ydl.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/brozzler/ydl.py b/brozzler/ydl.py
index 4c4b7ad..9caf662 100644
--- a/brozzler/ydl.py
+++ b/brozzler/ydl.py
@@ -33,6 +33,7 @@ thread_local = threading.local()
 
 
 def should_ytdlp(page, site):
+    # called only after we've passed needs_browsing() check
     if page.status_code != 200:
         return False
 

From 2d183c7d0cb4c14ff2432b4849efc8aa067dc4dc Mon Sep 17 00:00:00 2001
From: Barbara Miller
Date: Tue, 23 Apr 2024 16:01:35 -0700
Subject: [PATCH 2/2] if not self._needs_browsing; _get_page_headers comment

---
NOTE(review): line breaks and indentation in this series were reconstructed
from a whitespace-collapsed copy. The nesting depth of the logger.error
context lines in the second hunk is a best guess -- TODO confirm against
brozzler/worker.py before applying.

 brozzler/worker.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/brozzler/worker.py b/brozzler/worker.py
index 2772db3..2bad677 100644
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@@ -246,7 +246,10 @@ class BrozzlerWorker:
 
         self._get_page_headers(page)
 
-        if self._needs_browsing(page):
+        if not self._needs_browsing(page):
+            self.logger.info("needs fetch: %s", page)
+            self._fetch_url(site, page=page)
+        else:
             self.logger.info("needs browsing: %s", page)
             try:
                 browser_outlinks = self._browse_page(
@@ -285,13 +288,12 @@ class BrozzlerWorker:
                         self.logger.error(
                             "youtube_dl raised exception on %s", page, exc_info=True
                         )
-        else:
-            self.logger.info("needs fetch: %s", page)
-            self._fetch_url(site, page=page)
         return outlinks
 
     def _get_page_headers(self, page):
         page.content_type = page.content_length = page.last_modified = None
+        # bypassing warcprox, requests' stream=True defers downloading the body of the response
+        # see https://docs.python-requests.org/en/latest/user/advanced/#body-content-workflow
         with requests.get(page.url, stream=True) as r:
             if "content-type" in r.headers:
                 page.content_type = r.headers["content-type"]