mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-20 23:56:34 -04:00
WT-2950 documentation + better conf handling + linting
This commit is contained in:
parent
dca9630982
commit
41aab1a0b0
@ -101,6 +101,8 @@ def new_job(frontier, job_conf):
|
||||
job.id = job_conf["id"]
|
||||
if "max_claimed_sites" in job_conf:
|
||||
job.max_claimed_sites = job_conf["max_claimed_sites"]
|
||||
if "pdfs_only" in job_conf:
|
||||
job.pdfs_only = job_conf["pdfs_only"]
|
||||
job.save()
|
||||
|
||||
sites = []
|
||||
@ -199,6 +201,8 @@ class Job(doublethink.Document, ElapsedMixIn):
|
||||
def populate_defaults(self):
|
||||
if not "status" in self:
|
||||
self.status = "ACTIVE"
|
||||
if "pdfs_only" not in self:
|
||||
self.pdfs_only = False
|
||||
if not "starts_and_stops" in self:
|
||||
if self.get("started"): # backward compatibility
|
||||
self.starts_and_stops = [
|
||||
@ -224,16 +228,17 @@ class Job(doublethink.Document, ElapsedMixIn):
|
||||
class VideoCaptureOptions(Enum):
|
||||
"""
|
||||
Enumeration of possible values for the `video_capture` config key.
|
||||
- ENABLE_VIDEO_CAPTURE: All video is captured.
|
||||
- DISABLE_VIDEO_CAPTURE: No video is captured.
|
||||
- ENABLE_VIDEO_CAPTURE (default): All video is captured.
|
||||
- DISABLE_VIDEO_CAPTURE: No video is captured. This is effectively a
|
||||
combination of the next two values.
|
||||
- BLOCK_VIDEO_MIME_TYPES: Any response with a Content-Type header
|
||||
containing the word "video" is not captured.
|
||||
- DISABLE_YTDLP_CAPTURE: Video capture via yt-dlp is disabled.
|
||||
|
||||
Note: Ensuring full video MIME type blocking requires an entry in the
|
||||
Warcprox-Meta header `mime-type-filters` key to fully block videos by
|
||||
MIME type.
|
||||
Note: Ensuring full video MIME type blocking requires an additional entry in the
|
||||
Warcprox-Meta header `mime-type-filters` key.
|
||||
"""
|
||||
|
||||
ENABLE_VIDEO_CAPTURE = "ENABLE_VIDEO_CAPTURE"
|
||||
DISABLE_VIDEO_CAPTURE = "DISABLE_VIDEO_CAPTURE"
|
||||
BLOCK_VIDEO_MIME_TYPES = "BLOCK_VIDEO_MIME_TYPES"
|
||||
|
@ -257,9 +257,7 @@ class BrozzlerWorker:
|
||||
"skipping video content: video MIME type capture disabled for site"
|
||||
)
|
||||
elif site.pdfs_only and not self._is_pdf(page_headers):
|
||||
self.logger.info(
|
||||
"skipping non-PDF content: PDFs only option enabled"
|
||||
)
|
||||
self.logger.info("skipping non-PDF content: PDFs only option enabled")
|
||||
else:
|
||||
self._fetch_url(site, page=page)
|
||||
else:
|
||||
@ -334,7 +332,8 @@ class BrozzlerWorker:
|
||||
Determines if the page's Content-Type header specifies that it is a PDF.
|
||||
"""
|
||||
return (
|
||||
"content-type" in page_headers and "application/pdf" in page_headers["content-type"]
|
||||
"content-type" in page_headers
|
||||
and "application/pdf" in page_headers["content-type"]
|
||||
)
|
||||
|
||||
def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None):
|
||||
|
30
job-conf.rst
30
job-conf.rst
@ -107,6 +107,18 @@ Puts a cap on the number of sites belonging to a given job that can be brozzled
|
||||
simultaneously across the cluster. Addresses the problem of a job with many
|
||||
seeds starving out other jobs.
|
||||
|
||||
``pdfs_only``
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
+---------+----------+-----------+
|
||||
| type | required | default |
|
||||
+=========+==========+===========+
|
||||
| boolean | no | ``false`` |
|
||||
+---------+----------+-----------+
|
||||
Limits capture to PDFs based on MIME type. This value will only impact
|
||||
processing of outlinks within Brozzler. Fully limiting a crawl to only PDFs
|
||||
requires an additional entry in the Warcprox-Meta header ``mime-type-filters``
|
||||
key to fully block non-PDF content by MIME type.
|
||||
|
||||
``seeds``
|
||||
~~~~~~~~~
|
||||
+------------------------+----------+---------+
|
||||
@ -158,6 +170,24 @@ other fields like checkboxes and/or hidden fields, brozzler will leave
|
||||
the default values in place. Brozzler submits login forms after page load.
|
||||
Then brozzling proceeds as usual.
|
||||
|
||||
``video_capture``
|
||||
~~~~~~~~~~~~~~~~~
|
||||
+--------+----------+--------------------------+
|
||||
| type | required | default |
|
||||
+========+==========+==========================+
|
||||
| string | no | ``ENABLE_VIDEO_CAPTURE`` |
|
||||
+--------+----------+--------------------------+
|
||||
Determines the level of video capture for the seed. This is an enumeration with four possible values:
|
||||
* ENABLE_VIDEO_CAPTURE (default): All video is captured.
|
||||
* DISABLE_VIDEO_CAPTURE: No video is captured. This is effectively a
|
||||
combination of the next two values.
|
||||
* BLOCK_VIDEO_MIME_TYPES: Any response with a Content-Type header containing
|
||||
the word "video" is not captured.
|
||||
* DISABLE_YTDLP_CAPTURE: Video capture via yt-dlp is disabled.
|
||||
|
||||
*Note: Ensuring full video MIME type blocking requires an additional entry in
|
||||
the Warcprox-Meta header `mime-type-filters` key.*
|
||||
|
||||
Seed-level / top-level settings
|
||||
-------------------------------
|
||||
These are seed settings that can also be specified at the top level, in which
|
||||
|
Loading…
x
Reference in New Issue
Block a user