Merge pull request #108 from nlevitt/docs

Docs
2025-07-30 01:58:51 -04:00 · 2018-05-31 14:15:12 -07:00 · 2018-05-31 14:15:12 -07:00 · b41ccd7e6b
commit b41ccd7e6b
parent aef4c40993 62bb540a11
6 changed files with 56 additions and 22 deletions
--- a/brozzler/dashboard/__init__.py
+++ b/brozzler/dashboard/__init__.py
@@ -24,7 +24,7 @@ try:
 except ImportError as e:
    logging.critical(
            '%s: %s\n\nYou might need to run "pip install '
-            'brozzler[dashboard]".\nSee README.rst for more information.',
+            'brozzler[dashboard]".\nSee readme.rst for more information.',
            type(e).__name__, e)
    sys.exit(1)
 import doublethink
--- a/brozzler/easy.py
+++ b/brozzler/easy.py
@@ -31,7 +31,7 @@ try:
 except ImportError as e:
    logging.critical(
            '%s: %s\n\nYou might need to run "pip install '
-            'brozzler[easy]".\nSee README.rst for more information.',
+            'brozzler[easy]".\nSee readme.rst for more information.',
            type(e).__name__, e)
    sys.exit(1)
 import argparse
--- a/brozzler/pywb.py
+++ b/brozzler/pywb.py
@@ -31,7 +31,7 @@ try:
 except ImportError as e:
    logging.critical(
            '%s: %s\n\nYou might need to run "pip install '
-            'brozzler[easy]".\nSee README.rst for more information.',
+            'brozzler[easy]".\nSee readme.rst for more information.',
            type(e).__name__, e)
    sys.exit(1)
 import doublethink
@@ -270,7 +270,7 @@ Run pywb like so:

    $ PYWB_CONFIG_FILE=pywb.yml brozzler-wayback

-See README.rst for more information.
+See readme.rst for more information.
 '''

 # copied and pasted from cdxdomainspecific.py, only changes are commented as
--- a/job-conf.rst
+++ b/job-conf.rst
@@ -224,11 +224,11 @@ contact the operator if the crawl is causing problems.
 +============+==========+===========+
 | dictionary | no       | ``false`` |
 +------------+----------+-----------+
-Specifies the Warcprox-Meta header to send with every request, if ``proxy`` is
-configured. The value of the Warcprox-Meta header is a json blob. It is used to
-pass settings and information to warcprox. Warcprox does not forward the header
-on to the remote site. See the warcprox docs for more information (XXX not yet
-written).
+Specifies the ``Warcprox-Meta`` header to send with every request, if ``proxy``
+is configured. The value of the ``Warcprox-Meta`` header is a json blob. It is
+used to pass settings and information to warcprox. Warcprox does not forward
+the header on to the remote site. For full documentation on ``Warcprox-Meta``
+see https://github.com/internetarchive/warcprox/blob/master/api.rst#warcprox-meta-http-request-header

 Brozzler takes the configured value of ``warcprox_meta``, converts it to
 json and populates the Warcprox-Meta header with that value. For example::
@@ -457,5 +457,47 @@ Matches if the canonicalized url in SURT [2]_ form starts with ``surt``.
 Matches if the full canonicalized parent url matches ``regex``. The parent url
 is the url of the page in which the link was found.

+Using ``warcprox_meta``
+=======================
+``warcprox_meta`` deserves some more discussion. It plays a very important role
+in brozzler job configuration. ``warcprox_meta`` is the way you set the
+filenames of the warcs for your crawl. For example, if each seed should have a
+different warc name prefix, you might have a job configured this way::
+
+    seeds:
+    - url: https://example.com/
+      warcprox_meta:
+        warc-prefix: seed1
+    - url: https://archive.org/
+      warcprox_meta:
+        warc-prefix: seed2
+
+``warcprox_meta`` is also the way to put limits on the size of the crawl job.
+For example, this configuration will stop the crawl after about 100 MB of novel
+content has been crawled::
+
+    seeds:
+    - url: https://example.com/
+    - url: https://archive.org/
+    warcprox_meta:
+      stats:
+        buckets:
+        - my-job
+      limits:
+        my-job/new/wire_bytes: 100000000
+
+To prevent any urls from a host from being captured, it's not sufficient to use
+a ``scope`` rule as described above. That kind of scoping only applies to
+navigational links discovered in crawled pages. To make absolutely sure no url
+from a given host is fetched, not even (say) an image embedded in a page, use
+``warcprox_meta`` like so::
+
+    warcprox_meta:
+      blocks:
+      - domain: spammy.com
+
+For complete documentation on the ``Warcprox-Meta`` request header, see
+https://github.com/internetarchive/warcprox/blob/master/api.rst#warcprox-meta-http-request-header
+
 .. [1] SSURT is described at https://github.com/iipc/urlcanon/blob/master/ssurt.rst
 .. [2] SURT is described at http://crawler.archive.org/articles/user_manual/glossary.html
--- a/readme.rst
+++ b/readme.rst
@@ -69,27 +69,19 @@ does not take advantage of brozzler's distributed nature.*
 Installation and Usage
 ----------------------

-To install brozzler only:
-
-::
+To install brozzler only::

    pip install brozzler  # in a virtualenv if desired

-Launch one or more workers:
-
-::
+Launch one or more workers::

    brozzler-worker --warcprox-auto

-Submit jobs:
-
-::
+Submit jobs::

    brozzler-new-job myjob.yaml

-Submit sites not tied to a job:
-
-::
+Submit sites not tied to a job::

    brozzler-new-site --time-limit=600 http://example.com/

--- a/setup.py
+++ b/setup.py
@@ -37,7 +37,7 @@ setuptools.setup(
        url='https://github.com/internetarchive/brozzler',
        author='Noah Levitt',
        author_email='nlevitt@archive.org',
-        long_description=open('README.rst', mode='rb').read().decode('UTF-8'),
+        long_description=open('readme.rst', mode='rb').read().decode('UTF-8'),
        license='Apache License 2.0',
        packages=['brozzler', 'brozzler.dashboard'],
        package_data={