From a00b5a7fd52f9405ed7a038197c473344bac92e7 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 30 May 2018 18:06:39 -0700 Subject: [PATCH 1/2] explain brozzler use of warcprox_meta --- job-conf.rst | 52 ++++++++++-- readme.rst | 218 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 265 insertions(+), 5 deletions(-) create mode 100644 readme.rst diff --git a/job-conf.rst b/job-conf.rst index 1fa5bc6..403e821 100644 --- a/job-conf.rst +++ b/job-conf.rst @@ -224,11 +224,11 @@ contact the operator if the crawl is causing problems. +============+==========+===========+ | dictionary | no | ``false`` | +------------+----------+-----------+ -Specifies the Warcprox-Meta header to send with every request, if ``proxy`` is -configured. The value of the Warcprox-Meta header is a json blob. It is used to -pass settings and information to warcprox. Warcprox does not forward the header -on to the remote site. See the warcprox docs for more information (XXX not yet -written). +Specifies the ``Warcprox-Meta`` header to send with every request, if ``proxy`` +is configured. The value of the ``Warcprox-Meta`` header is a json blob. It is +used to pass settings and information to warcprox. Warcprox does not forward +the header on to the remote site. For full documentation on ``warcprox-meta`` +see https://github.com/internetarchive/warcprox/blob/master/api.rst#warcprox-meta-http-request-header Brozzler takes the configured value of ``warcprox_meta``, converts it to json and populates the Warcprox-Meta header with that value. For example:: @@ -457,5 +457,47 @@ Matches if the canonicalized url in SURT [2]_ form starts with ``surt``. Matches if the full canonicalized parent url matches ``regex``. The parent url is the url of the page in which the link was found. +Using ``warcprox_meta`` +======================= +``warcprox_meta`` deserves some more discussion. It plays a very important role +in brozzler job configuration. 
``warcprox_meta`` is the way you set the +filenames of the warcs for your crawl. For example, if each seed should have a +different warc name prefix, you might have a job configured this way:: + + seeds: + - url: https://example.com/ + warcprox_meta: + warc-prefix: seed1 + - url: https://archive.org/ + warcprox_meta: + warc-prefix: seed2 + +``warcprox_meta`` is also the way to put limits on the size of the crawl job. +For example, this configuration will stop the crawl after about 100 MB of novel +content has been crawled:: + + seeds: + - url: https://example.com/ + - url: https://archive.org/ + warcprox_meta: + stats: + buckets: + - my-job + limits: + my-job/new/wire_bytes: 100000000 + +To prevent any urls from a host from being captured, it's not sufficient to use +a ``scope`` rule as described above. That kind of scoping only applies to +navigational links discovered in crawled pages. To make absolutely sure no url +from a given host is fetched, not even (say) an image embedded in a page, use +``warcprox_meta`` like so:: + + warcprox_meta: + blocks: + - domain: spammy.com + +For complete documentation on the ``warcprox-meta`` request header, see +https://github.com/internetarchive/warcprox/blob/master/api.rst#warcprox-meta-http-request-header + .. [1] SSURT is described at https://github.com/iipc/urlcanon/blob/master/ssurt.rst .. [2] SURT is described at http://crawler.archive.org/articles/user_manual/glossary.html diff --git a/readme.rst b/readme.rst new file mode 100644 index 0000000..6aeb5e9 --- /dev/null +++ b/readme.rst @@ -0,0 +1,218 @@ +.. image:: https://travis-ci.org/internetarchive/brozzler.svg?branch=master + :target: https://travis-ci.org/internetarchive/brozzler + +.. 
|logo| image:: https://cdn.rawgit.com/internetarchive/brozzler/1.1b5/brozzler/webconsole/static/brozzler.svg + :width: 60px + +|logo| brozzler +=============== +"browser" \| "crawler" = "brozzler" + +Brozzler is a distributed web crawler (爬虫) that uses a real browser (chrome +or chromium) to fetch pages and embedded urls and to extract links. It also +uses `youtube-dl <https://github.com/rg3/youtube-dl>`_ to enhance media +capture capabilities. + +Brozzler is designed to work in conjunction with +`warcprox <https://github.com/internetarchive/warcprox>`_ for web +archiving. + +Requirements +------------ + +- Python 3.4 or later +- RethinkDB deployment +- Chromium or Google Chrome >= version 64 + +Worth noting is that the browser requires a graphical environment to run. You +already have this on your laptop, but on a server it will probably require +deploying some additional infrastructure (typically X11; note that Xvfb does +not support screenshots; Xvnc4, from package vnc4server, does). The vagrant +configuration in the brozzler repository (still a work in progress) has an +example setup. + +Getting Started +--------------- + +The easiest way to get started with brozzler for web archiving is with +``brozzler-easy``. Brozzler-easy runs brozzler-worker, warcprox, +`pywb <https://github.com/ikreymer/pywb>`_, and brozzler-dashboard, configured +to work with each other, in a single process. + +Mac instructions: + +:: + + # install and start rethinkdb + brew install rethinkdb + # no brew? try rethinkdb's installer: https://www.rethinkdb.com/docs/install/osx/ + rethinkdb &>>rethinkdb.log & + + # install brozzler with special dependencies pywb and warcprox + pip install brozzler[easy] # in a virtualenv if desired + + # queue a site to crawl + brozzler-new-site http://example.com/ + + # or a job + brozzler-new-job job1.yml + + # start brozzler-easy + brozzler-easy + +At this point brozzler-easy will start brozzling your site. Results will be +immediately available for playback in pywb at http://localhost:8880/brozzler/. 
+ +*Brozzler-easy demonstrates the full brozzler archival crawling workflow, but +does not take advantage of brozzler's distributed nature.* + +Installation and Usage +---------------------- + +To install brozzler only:: + + pip install brozzler # in a virtualenv if desired + +Launch one or more workers:: + + brozzler-worker --warcprox-auto + +Submit jobs:: + + brozzler-new-job myjob.yaml + +Submit sites not tied to a job:: + + brozzler-new-site --time-limit=600 http://example.com/ + +Job Configuration +----------------- + +Jobs are defined using yaml files. Options may be specified either at the +top-level or on individual seeds. At least one seed url must be specified, +everything else is optional. For details, see `<job-conf.rst>`_. + +:: + + id: myjob + time_limit: 60 # seconds + proxy: 127.0.0.1:8000 # point at warcprox for archiving + ignore_robots: false + warcprox_meta: null + metadata: {} + seeds: + - url: http://one.example.org/ + - url: http://two.example.org/ + time_limit: 30 + - url: http://three.example.org/ + time_limit: 10 + ignore_robots: true + scope: + surt: http://(org,example, + +Brozzler Dashboard +------------------ + +Brozzler comes with a rudimentary web application for viewing crawl job status. +To install brozzler with the dependencies required to run this app, run + +:: + + pip install brozzler[dashboard] + + +To start the app, run + +:: + + brozzler-dashboard + +See ``brozzler-dashboard --help`` for configuration options. + +Brozzler Wayback +---------------- + +Brozzler comes with a customized version of +`pywb <https://github.com/ikreymer/pywb>`_ which supports using the rethinkdb +"captures" table (populated by warcprox) as its index. + +To use, first install dependencies. + +:: + + pip install brozzler[easy] + +Write a configuration file pywb.yml. 
+ +:: + + # 'archive_paths' should point to the output directory of warcprox + archive_paths: warcs/ # pywb will fail without a trailing slash + collections: + brozzler: + index_paths: !!python/object:brozzler.pywb.RethinkCDXSource + db: brozzler + table: captures + servers: + - localhost + enable_auto_colls: false + enable_cdx_api: true + framed_replay: true + port: 8880 + +Run pywb like so: + +:: + + $ PYWB_CONFIG_FILE=pywb.yml brozzler-wayback + +Then browse http://localhost:8880/brozzler/. + + +Headless Chrome (experimental) +-------------------------------- + +`Headless Chromium <https://chromium.googlesource.com/chromium/src/+/master/headless/README.md>`_ +is now available in stable Chrome releases for 64-bit Linux and may be +used to run the browser without a visible window or X11 at all. + +To try this out, create a wrapper script like ~/bin/chrome-headless.sh: + +:: + + #!/bin/bash + exec /opt/google/chrome/chrome --headless --disable-gpu "$@" + +Run brozzler passing the path to the wrapper script as the ``--chrome-exe`` +option: + +:: + + chmod +x ~/bin/chrome-headless.sh + brozzler-worker --chrome-exe ~/bin/chrome-headless.sh + +Beware: Chrome's headless mode is still very new and has a number of +`unresolved issues. <https://bugs.chromium.org/p/chromium/issues/list?can=2&q=Proj%3DHeadless>`_ +You may experience hangs or crashes with some types of content. Brozzler +has not had much testing with it. For the moment we recommend using +Chrome's regular mode instead. + +License +------- + +Copyright 2015-2018 Internet Archive + +Licensed under the Apache License, Version 2.0 (the "License"); you may +not use this software except in compliance with the License. You may +obtain a copy of the License at + +:: + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ From 62bb540a11273dc99f430bfdd1e115963e66cbcc Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 31 May 2018 18:46:37 +0000 Subject: [PATCH 2/2] lowercase readme.rst --- README.rst | 226 --------------------------------- brozzler/dashboard/__init__.py | 2 +- brozzler/easy.py | 2 +- brozzler/pywb.py | 4 +- setup.py | 2 +- 5 files changed, 5 insertions(+), 231 deletions(-) delete mode 100644 README.rst diff --git a/README.rst b/README.rst deleted file mode 100644 index 19cdf60..0000000 --- a/README.rst +++ /dev/null @@ -1,226 +0,0 @@ -.. image:: https://travis-ci.org/internetarchive/brozzler.svg?branch=master - :target: https://travis-ci.org/internetarchive/brozzler - -.. |logo| image:: https://cdn.rawgit.com/internetarchive/brozzler/1.1b5/brozzler/webconsole/static/brozzler.svg - :width: 60px - -|logo| brozzler -=============== -"browser" \| "crawler" = "brozzler" - -Brozzler is a distributed web crawler (爬虫) that uses a real browser (chrome -or chromium) to fetch pages and embedded urls and to extract links. It also -uses `youtube-dl <https://github.com/rg3/youtube-dl>`_ to enhance media -capture capabilities. - -Brozzler is designed to work in conjunction with -`warcprox <https://github.com/internetarchive/warcprox>`_ for web -archiving. - -Requirements ------------- - -- Python 3.4 or later -- RethinkDB deployment -- Chromium or Google Chrome >= version 64 - -Worth noting is that the browser requires a graphical environment to run. You -already have this on your laptop, but on a server it will probably require -deploying some additional infrastructure (typically X11; note that Xvfb does -not support screenshots; Xvnc4, from package vnc4server, does). The vagrant -configuration in the brozzler repository (still a work in progress) has an -example setup. - -Getting Started ---------------- - -The easiest way to get started with brozzler for web archiving is with -``brozzler-easy``. Brozzler-easy runs brozzler-worker, warcprox, -`pywb <https://github.com/ikreymer/pywb>`_, and brozzler-dashboard, configured -to work with each other, in a single process. 
- -Mac instructions: - -:: - - # install and start rethinkdb - brew install rethinkdb - # no brew? try rethinkdb's installer: https://www.rethinkdb.com/docs/install/osx/ - rethinkdb &>>rethinkdb.log & - - # install brozzler with special dependencies pywb and warcprox - pip install brozzler[easy] # in a virtualenv if desired - - # queue a site to crawl - brozzler-new-site http://example.com/ - - # or a job - brozzler-new-job job1.yml - - # start brozzler-easy - brozzler-easy - -At this point brozzler-easy will start brozzling your site. Results will be -immediately available for playback in pywb at http://localhost:8880/brozzler/. - -*Brozzler-easy demonstrates the full brozzler archival crawling workflow, but -does not take advantage of brozzler's distributed nature.* - -Installation and Usage ----------------------- - -To install brozzler only: - -:: - - pip install brozzler # in a virtualenv if desired - -Launch one or more workers: - -:: - - brozzler-worker --warcprox-auto - -Submit jobs: - -:: - - brozzler-new-job myjob.yaml - -Submit sites not tied to a job: - -:: - - brozzler-new-site --time-limit=600 http://example.com/ - -Job Configuration ------------------ - -Jobs are defined using yaml files. Options may be specified either at the -top-level or on individual seeds. At least one seed url must be specified, -everything else is optional. For details, see `<job-conf.rst>`_. - -:: - - id: myjob - time_limit: 60 # seconds - proxy: 127.0.0.1:8000 # point at warcprox for archiving - ignore_robots: false - warcprox_meta: null - metadata: {} - seeds: - - url: http://one.example.org/ - - url: http://two.example.org/ - time_limit: 30 - - url: http://three.example.org/ - time_limit: 10 - ignore_robots: true - scope: - surt: http://(org,example, - -Brozzler Dashboard ------------------- - -Brozzler comes with a rudimentary web application for viewing crawl job status. 
-To install the brozzler with dependencies required to run this app, run - -:: - - pip install brozzler[dashboard] - - -To start the app, run - -:: - - brozzler-dashboard - -See ``brozzler-dashboard --help`` for configuration options. - -Brozzler Wayback ----------------- - -Brozzler comes with a customized version of -`pywb <https://github.com/ikreymer/pywb>`_ which supports using the rethinkdb -"captures" table (populated by warcprox) as its index. - -To use, first install dependencies. - -:: - - pip install brozzler[easy] - -Write a configuration file pywb.yml. - -:: - - # 'archive_paths' should point to the output directory of warcprox - archive_paths: warcs/ # pywb will fail without a trailing slash - collections: - brozzler: - index_paths: !!python/object:brozzler.pywb.RethinkCDXSource - db: brozzler - table: captures - servers: - - localhost - enable_auto_colls: false - enable_cdx_api: true - framed_replay: true - port: 8880 - -Run pywb like so: - -:: - - $ PYWB_CONFIG_FILE=pywb.yml brozzler-wayback - -Then browse http://localhost:8880/brozzler/. - - -Headless Chrome (experimental) --------------------------------- - -`Headless Chromium <https://chromium.googlesource.com/chromium/src/+/master/headless/README.md>`_ -is now available in stable Chrome releases for 64-bit Linux and may be -used to run the browser without a visible window or X11 at all. - -To try this out, create a wrapper script like ~/bin/chrome-headless.sh: - -:: - - #!/bin/bash - exec /opt/google/chrome/chrome --headless --disable-gpu "$@" - -Run brozzler passing the path to the wrapper script as the ``--chrome-exe`` -option: - -:: - - chmod +x ~/bin/chrome-headless.sh - brozzler-worker --chrome-exe ~/bin/chrome-headless.sh - -Beware: Chrome's headless mode is still very new and has a number of -`unresolved issues. <https://bugs.chromium.org/p/chromium/issues/list?can=2&q=Proj%3DHeadless>`_ -You may experience hangs or crashes with some types of content. Brozzler -has not had much testing with it. For the moment we recommend using -Chrome's regular mode instead. 
- -License -------- - -Copyright 2015-2018 Internet Archive - -Licensed under the Apache License, Version 2.0 (the "License"); you may -not use this software except in compliance with the License. You may -obtain a copy of the License at - -:: - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. - diff --git a/brozzler/dashboard/__init__.py b/brozzler/dashboard/__init__.py index 54e74ec..36251cd 100644 --- a/brozzler/dashboard/__init__.py +++ b/brozzler/dashboard/__init__.py @@ -24,7 +24,7 @@ try: except ImportError as e: logging.critical( '%s: %s\n\nYou might need to run "pip install ' - 'brozzler[dashboard]".\nSee README.rst for more information.', + 'brozzler[dashboard]".\nSee readme.rst for more information.', type(e).__name__, e) sys.exit(1) import doublethink diff --git a/brozzler/easy.py b/brozzler/easy.py index 83cf1ba..d4ccd5a 100644 --- a/brozzler/easy.py +++ b/brozzler/easy.py @@ -31,7 +31,7 @@ try: except ImportError as e: logging.critical( '%s: %s\n\nYou might need to run "pip install ' - 'brozzler[easy]".\nSee README.rst for more information.', + 'brozzler[easy]".\nSee readme.rst for more information.', type(e).__name__, e) sys.exit(1) import argparse diff --git a/brozzler/pywb.py b/brozzler/pywb.py index 5932f0b..ff26653 100644 --- a/brozzler/pywb.py +++ b/brozzler/pywb.py @@ -31,7 +31,7 @@ try: except ImportError as e: logging.critical( '%s: %s\n\nYou might need to run "pip install ' - 'brozzler[easy]".\nSee README.rst for more information.', + 'brozzler[easy]".\nSee readme.rst for more information.', type(e).__name__, e) sys.exit(1) import doublethink @@ -270,7 +270,7 @@ Run pywb like so: $ PYWB_CONFIG_FILE=pywb.yml brozzler-wayback -See 
README.rst for more information. +See readme.rst for more information. ''' # copied and pasted from cdxdomainspecific.py, only changes are commented as diff --git a/setup.py b/setup.py index 28168b3..d7ef0f7 100644 --- a/setup.py +++ b/setup.py @@ -37,7 +37,7 @@ setuptools.setup( url='https://github.com/internetarchive/brozzler', author='Noah Levitt', author_email='nlevitt@archive.org', - long_description=open('README.rst', mode='rb').read().decode('UTF-8'), + long_description=open('readme.rst', mode='rb').read().decode('UTF-8'), license='Apache License 2.0', packages=['brozzler', 'brozzler.dashboard'], package_data={