mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 00:29:53 -05:00
commit
b41ccd7e6b
@ -24,7 +24,7 @@ try:
|
||||
except ImportError as e:
|
||||
logging.critical(
|
||||
'%s: %s\n\nYou might need to run "pip install '
|
||||
'brozzler[dashboard]".\nSee README.rst for more information.',
|
||||
'brozzler[dashboard]".\nSee readme.rst for more information.',
|
||||
type(e).__name__, e)
|
||||
sys.exit(1)
|
||||
import doublethink
|
||||
|
@ -31,7 +31,7 @@ try:
|
||||
except ImportError as e:
|
||||
logging.critical(
|
||||
'%s: %s\n\nYou might need to run "pip install '
|
||||
'brozzler[easy]".\nSee README.rst for more information.',
|
||||
'brozzler[easy]".\nSee readme.rst for more information.',
|
||||
type(e).__name__, e)
|
||||
sys.exit(1)
|
||||
import argparse
|
||||
|
@ -31,7 +31,7 @@ try:
|
||||
except ImportError as e:
|
||||
logging.critical(
|
||||
'%s: %s\n\nYou might need to run "pip install '
|
||||
'brozzler[easy]".\nSee README.rst for more information.',
|
||||
'brozzler[easy]".\nSee readme.rst for more information.',
|
||||
type(e).__name__, e)
|
||||
sys.exit(1)
|
||||
import doublethink
|
||||
@ -270,7 +270,7 @@ Run pywb like so:
|
||||
|
||||
$ PYWB_CONFIG_FILE=pywb.yml brozzler-wayback
|
||||
|
||||
See README.rst for more information.
|
||||
See readme.rst for more information.
|
||||
'''
|
||||
|
||||
# copied and pasted from cdxdomainspecific.py, only changes are commented as
|
||||
|
52
job-conf.rst
52
job-conf.rst
@ -224,11 +224,11 @@ contact the operator if the crawl is causing problems.
|
||||
+============+==========+===========+
|
||||
| dictionary | no | ``false`` |
|
||||
+------------+----------+-----------+
|
||||
Specifies the Warcprox-Meta header to send with every request, if ``proxy`` is
|
||||
configured. The value of the Warcprox-Meta header is a json blob. It is used to
|
||||
pass settings and information to warcprox. Warcprox does not forward the header
|
||||
on to the remote site. See the warcprox docs for more information (XXX not yet
|
||||
written).
|
||||
Specifies the ``Warcprox-Meta`` header to send with every request, if ``proxy``
|
||||
is configured. The value of the ``Warcprox-Meta`` header is a JSON blob. It is
|
||||
used to pass settings and information to warcprox. Warcprox does not forward
|
||||
the header on to the remote site. For full documentation on ``warcprox-meta``
|
||||
see https://github.com/internetarchive/warcprox/blob/master/api.rst#warcprox-meta-http-request-header
|
||||
|
||||
Brozzler takes the configured value of ``warcprox_meta``, converts it to
|
||||
JSON and populates the ``Warcprox-Meta`` header with that value. For example::
|
||||
@ -457,5 +457,47 @@ Matches if the canonicalized url in SURT [2]_ form starts with ``surt``.
|
||||
Matches if the full canonicalized parent url matches ``regex``. The parent url
|
||||
is the url of the page in which the link was found.
|
||||
|
||||
Using ``warcprox_meta``
|
||||
=======================
|
||||
``warcprox_meta`` deserves some more discussion. It plays a very important role
|
||||
in brozzler job configuration. ``warcprox_meta`` is the way you set the
|
||||
filenames of the WARC files for your crawl. For example, if each seed should have a
|
||||
different warc name prefix, you might have a job configured this way::
|
||||
|
||||
seeds:
|
||||
- url: https://example.com/
|
||||
warcprox_meta:
|
||||
warc-prefix: seed1
|
||||
- url: https://archive.org/
|
||||
warcprox_meta:
|
||||
warc-prefix: seed2
|
||||
|
||||
``warcprox_meta`` is also the way to put limits on the size of the crawl job.
|
||||
For example, this configuration will stop the crawl after about 100 MB of novel
|
||||
content has been crawled::
|
||||
|
||||
seeds:
|
||||
- url: https://example.com/
|
||||
- url: https://archive.org/
|
||||
warcprox_meta:
|
||||
stats:
|
||||
buckets:
|
||||
- my-job
|
||||
limits:
|
||||
my-job/new/wire_bytes: 100000000
|
||||
|
||||
To prevent any URLs from a host from being captured, it's not sufficient to use
|
||||
a ``scope`` rule as described above. That kind of scoping only applies to
|
||||
navigational links discovered in crawled pages. To make absolutely sure no URL
|
||||
from a given host is fetched, not even (say) an image embedded in a page, use
|
||||
``warcprox_meta`` like so::
|
||||
|
||||
warcprox_meta:
|
||||
blocks:
|
||||
- domain: spammy.com
|
||||
|
||||
For complete documentation on the ``warcprox-meta`` request header, see
|
||||
https://github.com/internetarchive/warcprox/blob/master/api.rst#warcprox-meta-http-request-header
|
||||
|
||||
.. [1] SSURT is described at https://github.com/iipc/urlcanon/blob/master/ssurt.rst
|
||||
.. [2] SURT is described at http://crawler.archive.org/articles/user_manual/glossary.html
|
||||
|
@ -69,27 +69,19 @@ does not take advantage of brozzler's distributed nature.*
|
||||
Installation and Usage
|
||||
----------------------
|
||||
|
||||
To install brozzler only:
|
||||
|
||||
::
|
||||
To install brozzler only::
|
||||
|
||||
pip install brozzler # in a virtualenv if desired
|
||||
|
||||
Launch one or more workers:
|
||||
|
||||
::
|
||||
Launch one or more workers::
|
||||
|
||||
brozzler-worker --warcprox-auto
|
||||
|
||||
Submit jobs:
|
||||
|
||||
::
|
||||
Submit jobs::
|
||||
|
||||
brozzler-new-job myjob.yaml
|
||||
|
||||
Submit sites not tied to a job:
|
||||
|
||||
::
|
||||
Submit sites not tied to a job::
|
||||
|
||||
brozzler-new-site --time-limit=600 http://example.com/
|
||||
|
2
setup.py
2
setup.py
@ -37,7 +37,7 @@ setuptools.setup(
|
||||
url='https://github.com/internetarchive/brozzler',
|
||||
author='Noah Levitt',
|
||||
author_email='nlevitt@archive.org',
|
||||
long_description=open('README.rst', mode='rb').read().decode('UTF-8'),
|
||||
long_description=open('readme.rst', mode='rb').read().decode('UTF-8'),
|
||||
license='Apache License 2.0',
|
||||
packages=['brozzler', 'brozzler.dashboard'],
|
||||
package_data={
|
||||
|
Loading…
x
Reference in New Issue
Block a user