diff --git a/brozzler/worker.py b/brozzler/worker.py index faa7081..8416ec0 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -249,15 +249,15 @@ class BrozzlerWorker: if not self._needs_browsing(page_headers): self.logger.info("needs fetch: %s", page) - if site.video_capture in [ + if site.pdfs_only and not self._is_pdf(page_headers): + self.logger.info("skipping non-PDF content: PDFs only option enabled") + elif site.video_capture in [ VideoCaptureOptions.DISABLE_VIDEO_CAPTURE.value, VideoCaptureOptions.BLOCK_VIDEO_MIME_TYPES.value, ] and self._is_video_type(page_headers): self.logger.info( "skipping video content: video MIME type capture disabled for site" ) - elif site.pdfs_only and not self._is_pdf(page_headers): - self.logger.info("skipping non-PDF content: PDFs only option enabled") else: self._fetch_url(site, page=page) else: diff --git a/job-conf.rst b/job-conf.rst index d0428ca..5378bac 100644 --- a/job-conf.rst +++ b/job-conf.rst @@ -114,10 +114,12 @@ seeds starving out other jobs. +=========+==========+===========+ | boolean | no | ``false`` | +---------+----------+-----------+ -Limits capture to PDFs based on MIME type. This value will only impact -processing of outlinks within Brozzler. Fully limiting a crawl to only PDFs -requires an additional entry in the Warcprox-Meta header ``mime-type-filters`` -key. +Limits capture to PDFs based on the MIME type set in the HTTP response's +Content-Type header. This value only impacts processing of outlinks within +Brozzler. + +**Note:** Fully limiting a crawl to only PDFs also requires an additional +entry in the Warcprox-Meta header ``mime-type-filters`` key. ``seeds`` ~~~~~~~~~