From a00b5a7fd52f9405ed7a038197c473344bac92e7 Mon Sep 17 00:00:00 2001
From: Noah Levitt <nlevitt@archive.org>
Date: Wed, 30 May 2018 18:06:39 -0700
Subject: [PATCH 1/2] explain brozzler use of warcprox_meta

---
 job-conf.rst |  52 ++++++++++--
 readme.rst   | 218 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 265 insertions(+), 5 deletions(-)
 create mode 100644 readme.rst

diff --git a/job-conf.rst b/job-conf.rst
index 1fa5bc6..403e821 100644
--- a/job-conf.rst
+++ b/job-conf.rst
@@ -224,11 +224,11 @@ contact the operator if the crawl is causing problems.
 +============+==========+===========+
 | dictionary | no       | ``false`` |
 +------------+----------+-----------+
-Specifies the Warcprox-Meta header to send with every request, if ``proxy`` is
-configured. The value of the Warcprox-Meta header is a json blob. It is used to
-pass settings and information to warcprox. Warcprox does not forward the header
-on to the remote site. See the warcprox docs for more information (XXX not yet
-written).
+Specifies the ``Warcprox-Meta`` header to send with every request, if ``proxy``
+is configured. The value of the ``Warcprox-Meta`` header is a json blob. It is
+used to pass settings and information to warcprox. Warcprox does not forward
+the header on to the remote site. For full documentation on ``warcprox-meta``
+see https://github.com/internetarchive/warcprox/blob/master/api.rst#warcprox-meta-http-request-header
 
 Brozzler takes the configured value of ``warcprox_meta``, converts it to
 json and populates the Warcprox-Meta header with that value. For example::
@@ -457,5 +457,47 @@ Matches if the canonicalized url in SURT [2]_ form starts with ``surt``.
 Matches if the full canonicalized parent url matches ``regex``. The parent url
 is the url of the page in which the link was found.
 
+Using ``warcprox_meta``
+=======================
+``warcprox_meta`` deserves some more discussion. It plays a very important role
+in brozzler job configuration. ``warcprox_meta`` is the way you set the
+filenames of the warcs for your crawl. For example, if each seed should have a
+different warc name prefix, you might have a job configured this way::
+
+    seeds:
+    - url: https://example.com/
+      warcprox_meta:
+        warc-prefix: seed1
+    - url: https://archive.org/
+      warcprox_meta:
+        warc-prefix: seed2
+
+``warcprox_meta`` is also the way to put limits on the size of the crawl job.
+For example, this configuration will stop the crawl after about 100 MB of novel
+content has been crawled::
+
+    seeds:
+    - url: https://example.com/
+    - url: https://archive.org/
+    warcprox_meta:
+      stats:
+        buckets:
+        - my-job
+      limits:
+        my-job/new/wire_bytes: 100000000
+
+To prevent any urls from a host from being captured, it's not sufficient to use
+a ``scope`` rule as described above. That kind of scoping only applies to
+navigational links discovered in crawled pages. To make absolutely sure no url
+from a given host is fetched, not even (say) an image embedded in a page, use
+``warcprox_meta`` like so::
+
+    warcprox_meta:
+      blocks:
+      - domain: spammy.com
+
+For complete documentation on the ``warcprox-meta`` request header, see
+https://github.com/internetarchive/warcprox/blob/master/api.rst#warcprox-meta-http-request-header
+
 .. [1] SSURT is described at https://github.com/iipc/urlcanon/blob/master/ssurt.rst
 .. [2] SURT is described at http://crawler.archive.org/articles/user_manual/glossary.html
diff --git a/readme.rst b/readme.rst
new file mode 100644
index 0000000..6aeb5e9
--- /dev/null
+++ b/readme.rst
@@ -0,0 +1,218 @@
+.. image:: https://travis-ci.org/internetarchive/brozzler.svg?branch=master
+    :target: https://travis-ci.org/internetarchive/brozzler
+
+.. |logo| image:: https://cdn.rawgit.com/internetarchive/brozzler/1.1b5/brozzler/webconsole/static/brozzler.svg
+   :width: 60px
+
+|logo| brozzler
+===============
+"browser" \| "crawler" = "brozzler"
+
+Brozzler is a distributed web crawler (爬虫) that uses a real browser (chrome
+or chromium) to fetch pages and embedded urls and to extract links. It also
+uses `youtube-dl <https://github.com/rg3/youtube-dl>`_ to enhance media
+capture capabilities.
+
+Brozzler is designed to work in conjunction with
+`warcprox <https://github.com/internetarchive/warcprox>`_ for web
+archiving.
+
+Requirements
+------------
+
+- Python 3.4 or later
+- RethinkDB deployment
+- Chromium or Google Chrome >= version 64
+
+Worth noting is that the browser requires a graphical environment to run. You
+already have this on your laptop, but on a server it will probably require
+deploying some additional infrastructure (typically X11; note that Xvfb does
+not support screenshots; Xvnc4, from package vnc4server, does). The vagrant
+configuration in the brozzler repository (still a work in progress) has an
+example setup.
+
+Getting Started
+---------------
+
+The easiest way to get started with brozzler for web archiving is with
+``brozzler-easy``. Brozzler-easy runs brozzler-worker, warcprox,
+`pywb <https://github.com/ikreymer/pywb>`_, and brozzler-dashboard, configured
+to work with each other, in a single process.
+
+Mac instructions:
+
+::
+
+    # install and start rethinkdb
+    brew install rethinkdb
+    # no brew? try rethinkdb's installer: https://www.rethinkdb.com/docs/install/osx/
+    rethinkdb >>rethinkdb.log 2>&1 &
+
+    # install brozzler with special dependencies pywb and warcprox
+    pip install brozzler[easy]  # in a virtualenv if desired
+
+    # queue a site to crawl
+    brozzler-new-site http://example.com/
+
+    # or a job
+    brozzler-new-job job1.yml
+
+    # start brozzler-easy
+    brozzler-easy
+
+At this point brozzler-easy will start brozzling your site. Results will be
+immediately available for playback in pywb at http://localhost:8880/brozzler/.
+
+*Brozzler-easy demonstrates the full brozzler archival crawling workflow, but
+does not take advantage of brozzler's distributed nature.*
+
+Installation and Usage
+----------------------
+
+To install brozzler only::
+
+    pip install brozzler  # in a virtualenv if desired
+
+Launch one or more workers::
+
+    brozzler-worker --warcprox-auto
+
+Submit jobs::
+
+    brozzler-new-job myjob.yaml
+
+Submit sites not tied to a job::
+
+    brozzler-new-site --time-limit=600 http://example.com/
+
+Job Configuration
+-----------------
+
+Jobs are defined using yaml files. Options may be specified either at the
+top level or on individual seeds. At least one seed url must be specified;
+everything else is optional. For details, see `<job-conf.rst>`_.
+
+::
+
+    id: myjob
+    time_limit: 60 # seconds
+    proxy: 127.0.0.1:8000 # point at warcprox for archiving
+    ignore_robots: false
+    warcprox_meta: null
+    metadata: {}
+    seeds:
+      - url: http://one.example.org/
+      - url: http://two.example.org/
+        time_limit: 30
+      - url: http://three.example.org/
+        time_limit: 10
+        ignore_robots: true
+        scope:
+          surt: http://(org,example,
+
+Brozzler Dashboard
+------------------
+
+Brozzler comes with a rudimentary web application for viewing crawl job status.
+To install brozzler with the dependencies required to run this app, run
+
+::
+
+    pip install brozzler[dashboard]
+
+
+To start the app, run
+
+::
+
+    brozzler-dashboard
+
+See ``brozzler-dashboard --help`` for configuration options.
+
+Brozzler Wayback
+----------------
+
+Brozzler comes with a customized version of
+`pywb <https://github.com/ikreymer/pywb>`_ which supports using the rethinkdb
+"captures" table (populated by warcprox) as its index.
+
+To use, first install dependencies.
+
+::
+
+    pip install brozzler[easy]
+
+Write a configuration file pywb.yml.
+
+::
+
+    # 'archive_paths' should point to the output directory of warcprox
+    archive_paths: warcs/  # pywb will fail without a trailing slash
+    collections:
+      brozzler:
+        index_paths: !!python/object:brozzler.pywb.RethinkCDXSource
+          db: brozzler
+          table: captures
+          servers:
+          - localhost
+    enable_auto_colls: false
+    enable_cdx_api: true
+    framed_replay: true
+    port: 8880
+
+Run pywb like so:
+
+::
+
+    $ PYWB_CONFIG_FILE=pywb.yml brozzler-wayback
+
+Then browse http://localhost:8880/brozzler/.
+
+
+Headless Chrome (experimental)
+--------------------------------
+
+`Headless Chromium <https://chromium.googlesource.com/chromium/src/+/master/headless/README.md>`_
+is now available in stable Chrome releases for 64-bit Linux and may be
+used to run the browser without a visible window or X11 at all.
+
+To try this out, create a wrapper script like ~/bin/chrome-headless.sh:
+
+::
+
+    #!/bin/bash
+    exec /opt/google/chrome/chrome --headless --disable-gpu "$@"
+
+Run brozzler passing the path to the wrapper script as the ``--chrome-exe``
+option:
+
+::
+
+    chmod +x ~/bin/chrome-headless.sh
+    brozzler-worker --chrome-exe ~/bin/chrome-headless.sh
+
+Beware: Chrome's headless mode is still very new and has a number of
+`unresolved issues <https://bugs.chromium.org/p/chromium/issues/list?can=2&q=Proj%3DHeadless>`_.
+You may experience hangs or crashes with some types of content. Brozzler
+has not had much testing with it. For the moment we recommend using
+Chrome's regular mode instead.
+
+License
+-------
+
+Copyright 2015-2018 Internet Archive
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may
+not use this software except in compliance with the License. You may
+obtain a copy of the License at
+
+::
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+

From 62bb540a11273dc99f430bfdd1e115963e66cbcc Mon Sep 17 00:00:00 2001
From: Noah Levitt <nlevitt@archive.org>
Date: Thu, 31 May 2018 18:46:37 +0000
Subject: [PATCH 2/2] lowercase readme.rst

---
 README.rst                     | 226 ---------------------------------
 brozzler/dashboard/__init__.py |   2 +-
 brozzler/easy.py               |   2 +-
 brozzler/pywb.py               |   4 +-
 setup.py                       |   2 +-
 5 files changed, 5 insertions(+), 231 deletions(-)
 delete mode 100644 README.rst

diff --git a/README.rst b/README.rst
deleted file mode 100644
index 19cdf60..0000000
--- a/README.rst
+++ /dev/null
@@ -1,226 +0,0 @@
-.. image:: https://travis-ci.org/internetarchive/brozzler.svg?branch=master
-    :target: https://travis-ci.org/internetarchive/brozzler
-
-.. |logo| image:: https://cdn.rawgit.com/internetarchive/brozzler/1.1b5/brozzler/webconsole/static/brozzler.svg
-   :width: 60px
-
-|logo| brozzler
-===============
-"browser" \| "crawler" = "brozzler"
-
-Brozzler is a distributed web crawler (爬虫) that uses a real browser (chrome
-or chromium) to fetch pages and embedded urls and to extract links. It also
-uses `youtube-dl <https://github.com/rg3/youtube-dl>`_ to enhance media
-capture capabilities.
-
-Brozzler is designed to work in conjunction with
-`warcprox <https://github.com/internetarchive/warcprox>`_ for web
-archiving.
-
-Requirements
-------------
-
-- Python 3.4 or later
-- RethinkDB deployment
-- Chromium or Google Chrome >= version 64
-
-Worth noting is that the browser requires a graphical environment to run. You
-already have this on your laptop, but on a server it will probably require
-deploying some additional infrastructure (typically X11; note that Xvfb does
-not support screenshots; Xvnc4, from package vnc4server, does). The vagrant
-configuration in the brozzler repository (still a work in progress) has an
-example setup.
-
-Getting Started
----------------
-
-The easiest way to get started with brozzler for web archiving is with
-``brozzler-easy``. Brozzler-easy runs brozzler-worker, warcprox,
-`pywb <https://github.com/ikreymer/pywb>`_, and brozzler-dashboard, configured
-to work with each other, in a single process.
-
-Mac instructions:
-
-::
-
-    # install and start rethinkdb
-    brew install rethinkdb
-    # no brew? try rethinkdb's installer: https://www.rethinkdb.com/docs/install/osx/
-    rethinkdb &>>rethinkdb.log &
-
-    # install brozzler with special dependencies pywb and warcprox
-    pip install brozzler[easy]  # in a virtualenv if desired
-
-    # queue a site to crawl
-    brozzler-new-site http://example.com/
-
-    # or a job
-    brozzler-new-job job1.yml
-
-    # start brozzler-easy
-    brozzler-easy
-
-At this point brozzler-easy will start brozzling your site. Results will be
-immediately available for playback in pywb at http://localhost:8880/brozzler/.
-
-*Brozzler-easy demonstrates the full brozzler archival crawling workflow, but
-does not take advantage of brozzler's distributed nature.*
-
-Installation and Usage
-----------------------
-
-To install brozzler only:
-
-::
-
-    pip install brozzler  # in a virtualenv if desired
-
-Launch one or more workers:
-
-::
-
-    brozzler-worker --warcprox-auto
-
-Submit jobs:
-
-::
-
-    brozzler-new-job myjob.yaml
-
-Submit sites not tied to a job:
-
-::
-
-    brozzler-new-site --time-limit=600 http://example.com/
-
-Job Configuration
------------------
-
-Jobs are defined using yaml files. Options may be specified either at the
-top-level or on individual seeds. At least one seed url must be specified,
-everything else is optional. For details, see `<job-conf.rst>`_.
-
-::
-
-    id: myjob
-    time_limit: 60 # seconds
-    proxy: 127.0.0.1:8000 # point at warcprox for archiving
-    ignore_robots: false
-    warcprox_meta: null
-    metadata: {}
-    seeds:
-      - url: http://one.example.org/
-      - url: http://two.example.org/
-        time_limit: 30
-      - url: http://three.example.org/
-        time_limit: 10
-        ignore_robots: true
-        scope:
-          surt: http://(org,example,
-
-Brozzler Dashboard
-------------------
-
-Brozzler comes with a rudimentary web application for viewing crawl job status.
-To install the brozzler with dependencies required to run this app, run
-
-::
-
-    pip install brozzler[dashboard]
-
-
-To start the app, run
-
-::
-
-    brozzler-dashboard
-
-See ``brozzler-dashboard --help`` for configuration options.
-
-Brozzler Wayback
-----------------
-
-Brozzler comes with a customized version of
-`pywb <https://github.com/ikreymer/pywb>`_ which supports using the rethinkdb
-"captures" table (populated by warcprox) as its index.
-
-To use, first install dependencies.
-
-::
-
-    pip install brozzler[easy]
-
-Write a configuration file pywb.yml.
-
-::
-
-    # 'archive_paths' should point to the output directory of warcprox
-    archive_paths: warcs/  # pywb will fail without a trailing slash
-    collections:
-      brozzler:
-        index_paths: !!python/object:brozzler.pywb.RethinkCDXSource
-          db: brozzler
-          table: captures
-          servers:
-          - localhost
-    enable_auto_colls: false
-    enable_cdx_api: true
-    framed_replay: true
-    port: 8880
-
-Run pywb like so:
-
-::
-
-    $ PYWB_CONFIG_FILE=pywb.yml brozzler-wayback
-
-Then browse http://localhost:8880/brozzler/.
-
-
-Headless Chrome (experimental)
---------------------------------
-
-`Headless Chromium <https://chromium.googlesource.com/chromium/src/+/master/headless/README.md>`_
-is now available in stable Chrome releases for 64-bit Linux and may be
-used to run the browser without a visible window or X11 at all.
-
-To try this out, create a wrapper script like ~/bin/chrome-headless.sh:
-
-::
-
-    #!/bin/bash
-    exec /opt/google/chrome/chrome --headless --disable-gpu "$@"
-
-Run brozzler passing the path to the wrapper script as the ``--chrome-exe``
-option:
-
-::
-
-    chmod +x ~/bin/chrome-headless.sh
-    brozzler-worker --chrome-exe ~/bin/chrome-headless.sh
-
-Beware: Chrome's headless mode is still very new and has a number of
-`unresolved issues. <https://bugs.chromium.org/p/chromium/issues/list?can=2&q=Proj%3DHeadless>`_
-You may experience hangs or crashes with some types of content. Brozzler
-has not had much testing with it. For the moment we recommend using
-Chrome's regular mode instead.
-
-License
--------
-
-Copyright 2015-2018 Internet Archive
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may
-not use this software except in compliance with the License. You may
-obtain a copy of the License at
-
-::
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-
diff --git a/brozzler/dashboard/__init__.py b/brozzler/dashboard/__init__.py
index 54e74ec..36251cd 100644
--- a/brozzler/dashboard/__init__.py
+++ b/brozzler/dashboard/__init__.py
@@ -24,7 +24,7 @@ try:
 except ImportError as e:
     logging.critical(
             '%s: %s\n\nYou might need to run "pip install '
-            'brozzler[dashboard]".\nSee README.rst for more information.',
+            'brozzler[dashboard]".\nSee readme.rst for more information.',
             type(e).__name__, e)
     sys.exit(1)
 import doublethink
diff --git a/brozzler/easy.py b/brozzler/easy.py
index 83cf1ba..d4ccd5a 100644
--- a/brozzler/easy.py
+++ b/brozzler/easy.py
@@ -31,7 +31,7 @@ try:
 except ImportError as e:
     logging.critical(
             '%s: %s\n\nYou might need to run "pip install '
-            'brozzler[easy]".\nSee README.rst for more information.',
+            'brozzler[easy]".\nSee readme.rst for more information.',
             type(e).__name__, e)
     sys.exit(1)
 import argparse
diff --git a/brozzler/pywb.py b/brozzler/pywb.py
index 5932f0b..ff26653 100644
--- a/brozzler/pywb.py
+++ b/brozzler/pywb.py
@@ -31,7 +31,7 @@ try:
 except ImportError as e:
     logging.critical(
             '%s: %s\n\nYou might need to run "pip install '
-            'brozzler[easy]".\nSee README.rst for more information.',
+            'brozzler[easy]".\nSee readme.rst for more information.',
             type(e).__name__, e)
     sys.exit(1)
 import doublethink
@@ -270,7 +270,7 @@ Run pywb like so:
 
     $ PYWB_CONFIG_FILE=pywb.yml brozzler-wayback
 
-See README.rst for more information.
+See readme.rst for more information.
 '''
 
 # copied and pasted from cdxdomainspecific.py, only changes are commented as
diff --git a/setup.py b/setup.py
index 28168b3..d7ef0f7 100644
--- a/setup.py
+++ b/setup.py
@@ -37,7 +37,7 @@ setuptools.setup(
         url='https://github.com/internetarchive/brozzler',
         author='Noah Levitt',
         author_email='nlevitt@archive.org',
-        long_description=open('README.rst', mode='rb').read().decode('UTF-8'),
+        long_description=open('readme.rst', mode='rb').read().decode('UTF-8'),
         license='Apache License 2.0',
         packages=['brozzler', 'brozzler.dashboard'],
         package_data={