diff --git a/brozzler/model.py b/brozzler/model.py index 68ba037..fbdd6c7 100644 --- a/brozzler/model.py +++ b/brozzler/model.py @@ -101,6 +101,8 @@ def new_job(frontier, job_conf): job.id = job_conf["id"] if "max_claimed_sites" in job_conf: job.max_claimed_sites = job_conf["max_claimed_sites"] + if "pdfs_only" in job_conf: + job.pdfs_only = job_conf["pdfs_only"] job.save() sites = [] @@ -199,6 +201,8 @@ class Job(doublethink.Document, ElapsedMixIn): def populate_defaults(self): if not "status" in self: self.status = "ACTIVE" + if "pdfs_only" not in self: + self.pdfs_only = False if not "starts_and_stops" in self: if self.get("started"): # backward compatibility self.starts_and_stops = [ @@ -224,16 +228,17 @@ class Job(doublethink.Document, ElapsedMixIn): class VideoCaptureOptions(Enum): """ Enumeration of possible values for the `video_capture` config key. - - ENABLE_VIDEO_CAPTURE: All video is captured. - - DISABLE_VIDEO_CAPTURE: No video is captured. + - ENABLE_VIDEO_CAPTURE (default): All video is captured. + - DISABLE_VIDEO_CAPTURE: No video is captured. This is effectively a + combination of the next two values. - BLOCK_VIDEO_MIME_TYPES: Any response with a Content-Type header containing the word "video" is not captured. - DISABLE_YTDLP_CAPTURE: Video capture via yt-dlp is disabled. - Note: Ensuring full video MIME type blocking requires an entry in the - Warcprox-Meta header `mime-type-filters` key to fully block videos by - MIME type. + Note: Ensuring full video MIME type blocking requires an additional entry in the + Warcprox-Meta header `mime-type-filters` key. 
""" + ENABLE_VIDEO_CAPTURE = "ENABLE_VIDEO_CAPTURE" DISABLE_VIDEO_CAPTURE = "DISABLE_VIDEO_CAPTURE" BLOCK_VIDEO_MIME_TYPES = "BLOCK_VIDEO_MIME_TYPES" diff --git a/brozzler/worker.py b/brozzler/worker.py index ae6c750..89179e7 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -257,9 +257,7 @@ class BrozzlerWorker: "skipping video content: video MIME type capture disabled for site" ) elif site.pdfs_only and not self._is_pdf(page_headers): - self.logger.info( - "skipping non-PDF content: PDFs only option enabled" - ) + self.logger.info("skipping non-PDF content: PDFs only option enabled") else: self._fetch_url(site, page=page) else: @@ -334,7 +332,8 @@ class BrozzlerWorker: Determinse if the page's Content-Type header specifies that it is a PDF. """ return ( - "content-type" in page_headers and "application/pdf" in page_headers["content-type"] + "content-type" in page_headers + and "application/pdf" in page_headers["content-type"] ) def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None): diff --git a/job-conf.rst b/job-conf.rst index 08707b6..be5a7ae 100644 --- a/job-conf.rst +++ b/job-conf.rst @@ -107,6 +107,18 @@ Puts a cap on the number of sites belonging to a given job that can be brozzled simultaneously across the cluster. Addresses the problem of a job with many seeds starving out other jobs. +``pdfs_only`` +~~~~~~~~~~~~~~~~~~~~~ ++---------+----------+-----------+ +| type | required | default | ++=========+==========+===========+ +| boolean | no | ``false`` | ++---------+----------+-----------+ +Limits capture to PDFs based on MIME type. This value will only impact +processing of outlinks within Brozzler. Fully limiting a crawl to only PDFs +requires an additional entry in the Warcprox-Meta header ``mime-type-filters`` +key to fully block non-PDF content by MIME type. 
+ ``seeds`` ~~~~~~~~~ +------------------------+----------+---------+ @@ -158,6 +170,24 @@ other fields like checkboxes and/or hidden fields, brozzler will leave the default values in place. Brozzler submits login forms after page load. Then brozzling proceeds as usual. +``video_capture`` +~~~~~~~~~~~~~~~~~ ++--------+----------+--------------------------+ +| type | required | default | ++========+==========+==========================+ +| string | no | ``ENABLE_VIDEO_CAPTURE`` | ++--------+----------+--------------------------+ +Determines the level of video capture for the seed. This is an enumeration with four possible values: +* ENABLE_VIDEO_CAPTURE (default): All video is captured. +* DISABLE_VIDEO_CAPTURE: No video is captured. This is effectively a + combination of the next two values. +* BLOCK_VIDEO_MIME_TYPES: Any response with a Content-Type header containing + the word "video" is not captured. +* DISABLE_YTDLP_CAPTURE: Video capture via yt-dlp is disabled. + +*Note: Ensuring full video MIME type blocking requires an additional entry in +the Warcprox-Meta header `mime-type-filters` key.* + Seed-level / top-level settings ------------------------------- These are seed settings that can also be specified at the top level, in which