diff --git a/brozzler/dashboard/__init__.py b/brozzler/dashboard/__init__.py index 54e74ec..36251cd 100644 --- a/brozzler/dashboard/__init__.py +++ b/brozzler/dashboard/__init__.py @@ -24,7 +24,7 @@ try: except ImportError as e: logging.critical( '%s: %s\n\nYou might need to run "pip install ' - 'brozzler[dashboard]".\nSee README.rst for more information.', + 'brozzler[dashboard]".\nSee readme.rst for more information.', type(e).__name__, e) sys.exit(1) import doublethink diff --git a/brozzler/easy.py b/brozzler/easy.py index 83cf1ba..d4ccd5a 100644 --- a/brozzler/easy.py +++ b/brozzler/easy.py @@ -31,7 +31,7 @@ try: except ImportError as e: logging.critical( '%s: %s\n\nYou might need to run "pip install ' - 'brozzler[easy]".\nSee README.rst for more information.', + 'brozzler[easy]".\nSee readme.rst for more information.', type(e).__name__, e) sys.exit(1) import argparse diff --git a/brozzler/pywb.py b/brozzler/pywb.py index 5932f0b..ff26653 100644 --- a/brozzler/pywb.py +++ b/brozzler/pywb.py @@ -31,7 +31,7 @@ try: except ImportError as e: logging.critical( '%s: %s\n\nYou might need to run "pip install ' - 'brozzler[easy]".\nSee README.rst for more information.', + 'brozzler[easy]".\nSee readme.rst for more information.', type(e).__name__, e) sys.exit(1) import doublethink @@ -270,7 +270,7 @@ Run pywb like so: $ PYWB_CONFIG_FILE=pywb.yml brozzler-wayback -See README.rst for more information. +See readme.rst for more information. ''' # copied and pasted from cdxdomainspecific.py, only changes are commented as diff --git a/job-conf.rst b/job-conf.rst index 1fa5bc6..403e821 100644 --- a/job-conf.rst +++ b/job-conf.rst @@ -224,11 +224,11 @@ contact the operator if the crawl is causing problems. +============+==========+===========+ | dictionary | no | ``false`` | +------------+----------+-----------+ -Specifies the Warcprox-Meta header to send with every request, if ``proxy`` is -configured. The value of the Warcprox-Meta header is a json blob. 
It is used to -pass settings and information to warcprox. Warcprox does not forward the header -on to the remote site. See the warcprox docs for more information (XXX not yet -written). +Specifies the ``Warcprox-Meta`` header to send with every request, if ``proxy`` +is configured. The value of the ``Warcprox-Meta`` header is a json blob. It is +used to pass settings and information to warcprox. Warcprox does not forward +the header on to the remote site. For full documentation on the ``Warcprox-Meta`` +header, see https://github.com/internetarchive/warcprox/blob/master/api.rst#warcprox-meta-http-request-header Brozzler takes the configured value of ``warcprox_meta``, converts it to json and populates the Warcprox-Meta header with that value. For example:: @@ -457,5 +457,47 @@ Matches if the canonicalized url in SURT [2]_ form starts with ``surt``. Matches if the full canonicalized parent url matches ``regex``. The parent url is the url of the page in which the link was found. +Using ``warcprox_meta`` +======================= +``warcprox_meta`` deserves some more discussion. It plays a very important role +in brozzler job configuration. ``warcprox_meta`` is the way you set the +filenames of the warcs for your crawl. For example, if each seed should have a +different warc name prefix, you might have a job configured this way:: + + seeds: + - url: https://example.com/ + warcprox_meta: + warc-prefix: seed1 + - url: https://archive.org/ + warcprox_meta: + warc-prefix: seed2 + +``warcprox_meta`` is also the way to put limits on the size of the crawl job. +For example, this configuration will stop the crawl after about 100 MB of novel +content has been crawled:: + + seeds: + - url: https://example.com/ + - url: https://archive.org/ + warcprox_meta: + stats: + buckets: + - my-job + limits: + my-job/new/wire_bytes: 100000000 + +To prevent any urls from a host from being captured, it's not sufficient to use +a ``scope`` rule as described above. 
That kind of scoping only applies to +navigational links discovered in crawled pages. To make absolutely sure no url +from a given host is fetched, not even (say) an image embedded in a page, use +``warcprox_meta`` like so:: + + warcprox_meta: + blocks: + - domain: spammy.com + +For complete documentation on the ``warcprox-meta`` request header, see +https://github.com/internetarchive/warcprox/blob/master/api.rst#warcprox-meta-http-request-header + .. [1] SSURT is described at https://github.com/iipc/urlcanon/blob/master/ssurt.rst .. [2] SURT is described at http://crawler.archive.org/articles/user_manual/glossary.html diff --git a/README.rst b/readme.rst similarity index 98% rename from README.rst rename to readme.rst index 19cdf60..6aeb5e9 100644 --- a/README.rst +++ b/readme.rst @@ -69,27 +69,19 @@ does not take advantage of brozzler's distributed nature.* Installation and Usage ---------------------- -To install brozzler only: - -:: +To install brozzler only:: pip install brozzler # in a virtualenv if desired -Launch one or more workers: - -:: +Launch one or more workers:: brozzler-worker --warcprox-auto -Submit jobs: - -:: +Submit jobs:: brozzler-new-job myjob.yaml -Submit sites not tied to a job: - -:: +Submit sites not tied to a job:: brozzler-new-site --time-limit=600 http://example.com/ diff --git a/setup.py b/setup.py index 28168b3..d7ef0f7 100644 --- a/setup.py +++ b/setup.py @@ -37,7 +37,7 @@ setuptools.setup( url='https://github.com/internetarchive/brozzler', author='Noah Levitt', author_email='nlevitt@archive.org', - long_description=open('README.rst', mode='rb').read().decode('UTF-8'), + long_description=open('readme.rst', mode='rb').read().decode('UTF-8'), license='Apache License 2.0', packages=['brozzler', 'brozzler.dashboard'], package_data={