mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 00:29:53 -05:00
WT2590 addressing PR feedback
This commit is contained in:
parent
12db06ae8c
commit
36b17d2a66
@ -249,15 +249,15 @@ class BrozzlerWorker:
|
||||
|
||||
if not self._needs_browsing(page_headers):
|
||||
self.logger.info("needs fetch: %s", page)
|
||||
if site.video_capture in [
|
||||
if site.pdfs_only and not self._is_pdf(page_headers):
|
||||
self.logger.info("skipping non-PDF content: PDFs only option enabled")
|
||||
elif site.video_capture in [
|
||||
VideoCaptureOptions.DISABLE_VIDEO_CAPTURE.value,
|
||||
VideoCaptureOptions.BLOCK_VIDEO_MIME_TYPES.value,
|
||||
] and self._is_video_type(page_headers):
|
||||
self.logger.info(
|
||||
"skipping video content: video MIME type capture disabled for site"
|
||||
)
|
||||
elif site.pdfs_only and not self._is_pdf(page_headers):
|
||||
self.logger.info("skipping non-PDF content: PDFs only option enabled")
|
||||
else:
|
||||
self._fetch_url(site, page=page)
|
||||
else:
|
||||
|
10
job-conf.rst
10
job-conf.rst
@ -114,10 +114,12 @@ seeds starving out other jobs.
|
||||
+=========+==========+===========+
|
||||
| boolean | no | ``false`` |
|
||||
+---------+----------+-----------+
|
||||
Limits capture to PDFs based on MIME type. This value will only impact
|
||||
processing of outlinks within Brozzler. Fully limiting a crawl to only PDFs
|
||||
requires an additional entry in the Warcprox-Meta header ``mime-type-filters``
|
||||
key.
|
||||
Limits capture to PDFs based on the MIME type set in the HTTP response's
|
||||
Content-Type header. This value only impacts processing of outlinks within
|
||||
Brozzler.
|
||||
|
||||
*Note: Fully limiting a crawl to only PDFs requires an additional
|
||||
entry in the Warcprox-Meta header* ``mime-type-filters`` *key.*
|
||||
|
||||
``seeds``
|
||||
~~~~~~~~~
|
||||
|
Loading…
x
Reference in New Issue
Block a user