mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-07-30 18:18:44 -04:00
commit
b41ccd7e6b
6 changed files with 56 additions and 22 deletions
|
@ -24,7 +24,7 @@ try:
|
||||||
except ImportError as e:
|
except ImportError as e:
|
||||||
logging.critical(
|
logging.critical(
|
||||||
'%s: %s\n\nYou might need to run "pip install '
|
'%s: %s\n\nYou might need to run "pip install '
|
||||||
'brozzler[dashboard]".\nSee README.rst for more information.',
|
'brozzler[dashboard]".\nSee readme.rst for more information.',
|
||||||
type(e).__name__, e)
|
type(e).__name__, e)
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
import doublethink
|
import doublethink
|
||||||
|
|
|
@ -31,7 +31,7 @@ try:
|
||||||
except ImportError as e:
|
except ImportError as e:
|
||||||
logging.critical(
|
logging.critical(
|
||||||
'%s: %s\n\nYou might need to run "pip install '
|
'%s: %s\n\nYou might need to run "pip install '
|
||||||
'brozzler[easy]".\nSee README.rst for more information.',
|
'brozzler[easy]".\nSee readme.rst for more information.',
|
||||||
type(e).__name__, e)
|
type(e).__name__, e)
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
import argparse
|
import argparse
|
||||||
|
|
|
@ -31,7 +31,7 @@ try:
|
||||||
except ImportError as e:
|
except ImportError as e:
|
||||||
logging.critical(
|
logging.critical(
|
||||||
'%s: %s\n\nYou might need to run "pip install '
|
'%s: %s\n\nYou might need to run "pip install '
|
||||||
'brozzler[easy]".\nSee README.rst for more information.',
|
'brozzler[easy]".\nSee readme.rst for more information.',
|
||||||
type(e).__name__, e)
|
type(e).__name__, e)
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
import doublethink
|
import doublethink
|
||||||
|
@ -270,7 +270,7 @@ Run pywb like so:
|
||||||
|
|
||||||
$ PYWB_CONFIG_FILE=pywb.yml brozzler-wayback
|
$ PYWB_CONFIG_FILE=pywb.yml brozzler-wayback
|
||||||
|
|
||||||
See README.rst for more information.
|
See readme.rst for more information.
|
||||||
'''
|
'''
|
||||||
|
|
||||||
# copied and pasted from cdxdomainspecific.py, only changes are commented as
|
# copied and pasted from cdxdomainspecific.py, only changes are commented as
|
||||||
|
|
52
job-conf.rst
52
job-conf.rst
|
@ -224,11 +224,11 @@ contact the operator if the crawl is causing problems.
|
||||||
+============+==========+===========+
|
+============+==========+===========+
|
||||||
| dictionary | no | ``false`` |
|
| dictionary | no | ``false`` |
|
||||||
+------------+----------+-----------+
|
+------------+----------+-----------+
|
||||||
Specifies the Warcprox-Meta header to send with every request, if ``proxy`` is
|
Specifies the ``Warcprox-Meta`` header to send with every request, if ``proxy``
|
||||||
configured. The value of the Warcprox-Meta header is a json blob. It is used to
|
is configured. The value of the ``Warcprox-Meta`` header is a json blob. It is
|
||||||
pass settings and information to warcprox. Warcprox does not forward the header
|
used to pass settings and information to warcprox. Warcprox does not forward
|
||||||
on to the remote site. See the warcprox docs for more information (XXX not yet
|
the header on to the remote site. For full documentation on ``Warcprox-Meta``
|
||||||
written).
|
see https://github.com/internetarchive/warcprox/blob/master/api.rst#warcprox-meta-http-request-header
|
||||||
|
|
||||||
Brozzler takes the configured value of ``warcprox_meta``, converts it to
|
Brozzler takes the configured value of ``warcprox_meta``, converts it to
|
||||||
json and populates the Warcprox-Meta header with that value. For example::
|
json and populates the Warcprox-Meta header with that value. For example::
|
||||||
|
@ -457,5 +457,47 @@ Matches if the canonicalized url in SURT [2]_ form starts with ``surt``.
|
||||||
Matches if the full canonicalized parent url matches ``regex``. The parent url
|
Matches if the full canonicalized parent url matches ``regex``. The parent url
|
||||||
is the url of the page in which the link was found.
|
is the url of the page in which the link was found.
|
||||||
|
|
||||||
|
Using ``warcprox_meta``
|
||||||
|
=======================
|
||||||
|
``warcprox_meta`` deserves some more discussion. It plays a very important role
|
||||||
|
in brozzler job configuration. ``warcprox_meta`` is the way you set the
|
||||||
|
filenames of the warcs for your crawl. For example, if each seed should have a
|
||||||
|
different warc name prefix, you might have a job configured this way::
|
||||||
|
|
||||||
|
seeds:
|
||||||
|
- url: https://example.com/
|
||||||
|
warcprox_meta:
|
||||||
|
warc-prefix: seed1
|
||||||
|
- url: https://archive.org/
|
||||||
|
warcprox_meta:
|
||||||
|
warc-prefix: seed2
|
||||||
|
|
||||||
|
``warcprox_meta`` is also the way to put limits on the size of the crawl job.
|
||||||
|
For example, this configuration will stop the crawl after about 100 MB of novel
|
||||||
|
content has been crawled::
|
||||||
|
|
||||||
|
seeds:
|
||||||
|
- url: https://example.com/
|
||||||
|
- url: https://archive.org/
|
||||||
|
warcprox_meta:
|
||||||
|
stats:
|
||||||
|
buckets:
|
||||||
|
- my-job
|
||||||
|
limits:
|
||||||
|
my-job/new/wire_bytes: 100000000
|
||||||
|
|
||||||
|
To prevent any url on a given host from being captured, it's not sufficient to use
|
||||||
|
a ``scope`` rule as described above. That kind of scoping only applies to
|
||||||
|
navigational links discovered in crawled pages. To make absolutely sure no url
|
||||||
|
from a given host is fetched, not even (say) an image embedded in a page, use
|
||||||
|
``warcprox_meta`` like so::
|
||||||
|
|
||||||
|
warcprox_meta:
|
||||||
|
blocks:
|
||||||
|
- domain: spammy.com
|
||||||
|
|
||||||
|
For complete documentation on the ``Warcprox-Meta`` request header, see
|
||||||
|
https://github.com/internetarchive/warcprox/blob/master/api.rst#warcprox-meta-http-request-header
|
||||||
|
|
||||||
.. [1] SSURT is described at https://github.com/iipc/urlcanon/blob/master/ssurt.rst
|
.. [1] SSURT is described at https://github.com/iipc/urlcanon/blob/master/ssurt.rst
|
||||||
.. [2] SURT is described at http://crawler.archive.org/articles/user_manual/glossary.html
|
.. [2] SURT is described at http://crawler.archive.org/articles/user_manual/glossary.html
|
||||||
|
|
|
@ -69,27 +69,19 @@ does not take advantage of brozzler's distributed nature.*
|
||||||
Installation and Usage
|
Installation and Usage
|
||||||
----------------------
|
----------------------
|
||||||
|
|
||||||
To install brozzler only:
|
To install brozzler only::
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
pip install brozzler # in a virtualenv if desired
|
pip install brozzler # in a virtualenv if desired
|
||||||
|
|
||||||
Launch one or more workers:
|
Launch one or more workers::
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
brozzler-worker --warcprox-auto
|
brozzler-worker --warcprox-auto
|
||||||
|
|
||||||
Submit jobs:
|
Submit jobs::
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
brozzler-new-job myjob.yaml
|
brozzler-new-job myjob.yaml
|
||||||
|
|
||||||
Submit sites not tied to a job:
|
Submit sites not tied to a job::
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
brozzler-new-site --time-limit=600 http://example.com/
|
brozzler-new-site --time-limit=600 http://example.com/
|
||||||
|
|
2
setup.py
2
setup.py
|
@ -37,7 +37,7 @@ setuptools.setup(
|
||||||
url='https://github.com/internetarchive/brozzler',
|
url='https://github.com/internetarchive/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
author_email='nlevitt@archive.org',
|
author_email='nlevitt@archive.org',
|
||||||
long_description=open('README.rst', mode='rb').read().decode('UTF-8'),
|
long_description=open('readme.rst', mode='rb').read().decode('UTF-8'),
|
||||||
license='Apache License 2.0',
|
license='Apache License 2.0',
|
||||||
packages=['brozzler', 'brozzler.dashboard'],
|
packages=['brozzler', 'brozzler.dashboard'],
|
||||||
package_data={
|
package_data={
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue