WT-2950 documentation + better conf handling + linting

This commit is contained in:
Gretchen Miller 2024-09-20 15:21:21 -07:00
parent dca9630982
commit 41aab1a0b0
3 changed files with 43 additions and 9 deletions

View File

@ -101,6 +101,8 @@ def new_job(frontier, job_conf):
job.id = job_conf["id"]
if "max_claimed_sites" in job_conf:
job.max_claimed_sites = job_conf["max_claimed_sites"]
if "pdfs_only" in job_conf:
job.pdfs_only = job_conf["pdfs_only"]
job.save()
sites = []
@ -199,6 +201,8 @@ class Job(doublethink.Document, ElapsedMixIn):
def populate_defaults(self):
if not "status" in self:
self.status = "ACTIVE"
if "pdfs_only" not in self:
self.pdfs_only = False
if not "starts_and_stops" in self:
if self.get("started"): # backward compatibility
self.starts_and_stops = [
@ -224,16 +228,17 @@ class Job(doublethink.Document, ElapsedMixIn):
class VideoCaptureOptions(Enum):
"""
Enumeration of possible values for the `video_capture` config key.
- ENABLE_VIDEO_CAPTURE: All video is captured.
- DISABLE_VIDEO_CAPTURE: No video is captured.
- ENABLE_VIDEO_CAPTURE (default): All video is captured.
- DISABLE_VIDEO_CAPTURE: No video is captured. This is effectively a
combination of the next two values.
- BLOCK_VIDEO_MIME_TYPES: Any response with a Content-Type header
containing the word "video" is not captured.
- DISABLE_YTDLP_CAPTURE: Video capture via yt-dlp is disabled.
Note: Ensuring full video MIME type blocking requires an entry in the
Warcprox-Meta header `mime-type-filters` key to fully block videos by
MIME type.
Note: Ensuring full video MIME type blocking requires an additional entry in the
Warcprox-Meta header `mime-type-filters` key.
"""
ENABLE_VIDEO_CAPTURE = "ENABLE_VIDEO_CAPTURE"
DISABLE_VIDEO_CAPTURE = "DISABLE_VIDEO_CAPTURE"
BLOCK_VIDEO_MIME_TYPES = "BLOCK_VIDEO_MIME_TYPES"

View File

@ -257,9 +257,7 @@ class BrozzlerWorker:
"skipping video content: video MIME type capture disabled for site"
)
elif site.pdfs_only and not self._is_pdf(page_headers):
self.logger.info(
"skipping non-PDF content: PDFs only option enabled"
)
self.logger.info("skipping non-PDF content: PDFs only option enabled")
else:
self._fetch_url(site, page=page)
else:
@ -334,7 +332,8 @@ class BrozzlerWorker:
Determines if the page's Content-Type header specifies that it is a PDF.
"""
return (
"content-type" in page_headers and "application/pdf" in page_headers["content-type"]
"content-type" in page_headers
and "application/pdf" in page_headers["content-type"]
)
def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None):

View File

@ -107,6 +107,18 @@ Puts a cap on the number of sites belonging to a given job that can be brozzled
simultaneously across the cluster. Addresses the problem of a job with many
seeds starving out other jobs.
``pdfs_only``
~~~~~~~~~~~~~~~~~~~~~
+---------+----------+-----------+
| type | required | default |
+=========+==========+===========+
| boolean | no | ``false`` |
+---------+----------+-----------+
Limits capture to PDFs based on MIME type. This value will only impact
processing of outlinks within Brozzler. Fully limiting a crawl to only PDFs
requires an additional entry in the Warcprox-Meta header ``mime-type-filters``
key to block non-PDF content by MIME type.
``seeds``
~~~~~~~~~
+------------------------+----------+---------+
@ -158,6 +170,24 @@ other fields like checkboxes and/or hidden fields, brozzler will leave
the default values in place. Brozzler submits login forms after page load.
Then brozzling proceeds as usual.
``video_capture``
~~~~~~~~~~~~~~~~~
+--------+----------+--------------------------+
| type | required | default |
+========+==========+==========================+
| string | no | ``ENABLE_VIDEO_CAPTURE`` |
+--------+----------+--------------------------+
Determines the level of video capture for the seed. This is an enumeration with four possible values:
* ENABLE_VIDEO_CAPTURE (default): All video is captured.
* DISABLE_VIDEO_CAPTURE: No video is captured. This is effectively a
combination of the next two values.
* BLOCK_VIDEO_MIME_TYPES: Any response with a Content-Type header containing
the word "video" is not captured.
* DISABLE_YTDLP_CAPTURE: Video capture via yt-dlp is disabled.
**Note:** Ensuring full video MIME type blocking requires an additional entry in
the Warcprox-Meta header ``mime-type-filters`` key.
Seed-level / top-level settings
-------------------------------
These are seed settings that can also be specified at the top level, in which