Mirror of https://github.com/internetarchive/brozzler.git

Merge branch 'master' into qa

* master:
  expose more brozzle-page args
  update pillow dependency to get rid of github vul-
  more readme edits
  reformat readme to 80 columns
  Copy edits to job-conf readme
  bump up heartbeat interval (see comment)
  Copy edits
  back to dev version
  version 1.3 (messed up 1.2)
  setuptools wants README not readme
  back to dev version number
  version 1.2
  bump dev version after merge
  is test_time_limit is failing because of timing?

commit 418a3ef20c
9 changed files with 130 additions and 139 deletions
@@ -1,21 +1,20 @@
 .. image:: https://travis-ci.org/internetarchive/brozzler.svg?branch=master
     :target: https://travis-ci.org/internetarchive/brozzler

-.. |logo| image:: https://cdn.rawgit.com/internetarchive/brozzler/1.1b5/brozzler/webconsole/static/brozzler.svg
+.. |logo| image:: https://cdn.rawgit.com/internetarchive/brozzler/1.1b12/brozzler/dashboard/static/brozzler.svg
     :width: 60px

 |logo| brozzler
 ===============
 "browser" \| "crawler" = "brozzler"

-Brozzler is a distributed web crawler (爬虫) that uses a real browser (chrome
-or chromium) to fetch pages and embedded urls and to extract links. It also
-uses `youtube-dl <https://github.com/rg3/youtube-dl>`_ to enhance media
-capture capabilities.
+Brozzler is a distributed web crawler (爬虫) that uses a real browser (Chrome
+or Chromium) to fetch pages and embedded URLs and to extract links. It employs
+`youtube-dl <https://github.com/rg3/youtube-dl>`_ to enhance media capture
+capabilities and `rethinkdb <https://github.com/rethinkdb/rethinkdb>`_ to
+manage crawl state.

-Brozzler is designed to work in conjunction with
-`warcprox <https://github.com/internetarchive/warcprox>`_ for web
-archiving.
+Brozzler is designed to work in conjunction with warcprox for web archiving.

 Requirements
 ------------
@@ -24,20 +23,21 @@ Requirements
 - RethinkDB deployment
 - Chromium or Google Chrome >= version 64

-Worth noting is that the browser requires a graphical environment to run. You
-already have this on your laptop, but on a server it will probably require
-deploying some additional infrastructure (typically X11; note that Xvfb does
-not support screenshots; Xvnc4, from package vnc4server, does). The vagrant
-configuration in the brozzler repository (still a work in progress) has an
-example setup.
+Note: The browser requires a graphical environment to run. When brozzler is run
+on a server, this may require deploying some additional infrastructure,
+typically X11. Xvnc4 and Xvfb are X11 variants that are suitable for use on a
+server, because they don't display anything to a physical screen. The `vagrant
+configuration <vagrant/>`_ in the brozzler repository has an example setup
+using Xvnc4. (When last tested, chromium on Xvfb did not support screenshots,
+so Xvnc4 is preferred at this time.)

 Getting Started
 ---------------

 The easiest way to get started with brozzler for web archiving is with
-``brozzler-easy``. Brozzler-easy runs brozzler-worker, warcprox,
-`pywb <https://github.com/ikreymer/pywb>`_, and brozzler-dashboard, configured
-to work with each other, in a single process.
+``brozzler-easy``. Brozzler-easy runs brozzler-worker, warcprox, brozzler
+wayback, and brozzler-dashboard, configured to work with each other in a single
+process.

 Mac instructions:

@@ -60,7 +60,7 @@ Mac instructions:
     # start brozzler-easy
     brozzler-easy

-At this point brozzler-easy will start brozzling your site. Results will be
+At this point brozzler-easy will start archiving your site. Results will be
 immediately available for playback in pywb at http://localhost:8880/brozzler/.

 *Brozzler-easy demonstrates the full brozzler archival crawling workflow, but
@@ -88,9 +88,9 @@ Submit sites not tied to a job::
 Job Configuration
 -----------------

-Jobs are defined using yaml files. Options may be specified either at the
-top-level or on individual seeds. At least one seed url must be specified,
-everything else is optional. For details, see `<job-conf.rst>`_.
+Brozzler jobs are defined using YAML files. Options may be specified either at
+the top-level or on individual seeds. At least one seed URL must be specified;
+everything else is optional. For details, see `<job-conf.rst>`_.

 ::

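For orientation, a minimal job file along the lines this section describes might look like the following sketch (the job id and seed URL are placeholders; see job-conf.rst for the full set of options)::

    id: my-job
    seeds:
    - url: https://example.org/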
@@ -127,13 +127,15 @@ To start the app, run

     brozzler-dashboard

+At this point Brozzler Dashboard will be accessible at http://localhost:8000/.
+
 See ``brozzler-dashboard --help`` for configuration options.

 Brozzler Wayback
 ----------------

-Brozzler comes with a customized version of
-`pywb <https://github.com/ikreymer/pywb>`_ which supports using the rethinkdb
+Brozzler comes with a customized version of `pywb
+<https://github.com/ikreymer/pywb>`_, which supports using the rethinkdb
 "captures" table (populated by warcprox) as its index.

 To use, first install dependencies.
@@ -168,34 +170,11 @@ Run pywb like so:

 Then browse http://localhost:8880/brozzler/.


 Headless Chrome (experimental)
---------------------------------
+------------------------------

-`Headless Chromium <https://chromium.googlesource.com/chromium/src/+/master/headless/README.md>`_
-is now available in stable Chrome releases for 64-bit Linux and may be
-used to run the browser without a visible window or X11 at all.
-
-To try this out, create a wrapper script like ~/bin/chrome-headless.sh:
-
-::
-
-    #!/bin/bash
-    exec /opt/google/chrome/chrome --headless --disable-gpu "$@"
-
-Run brozzler passing the path to the wrapper script as the ``--chrome-exe``
-option:
-
-::
-
-    chmod +x ~/bin/chrome-headless.sh
-    brozzler-worker --chrome-exe ~/bin/chrome-headless.sh
-
-Beware: Chrome's headless mode is still very new and has a number of
-`unresolved issues. <https://bugs.chromium.org/p/chromium/issues/list?can=2&q=Proj%3DHeadless>`_
-You may experience hangs or crashes with some types of content. Brozzler
-has not had much testing with it. For the moment we recommend using
-Chrome's regular mode instead.
+Brozzler is known to work nominally with Chrome/Chromium in headless mode, but
+this has not yet been extensively tested.

 License
 -------
@@ -156,13 +156,12 @@ def brozzle_page(argv=None):
             '--proxy', dest='proxy', default=None, help='http proxy')
     arg_parser.add_argument(
             '--skip-extract-outlinks', dest='skip_extract_outlinks',
-            action='store_true', help=argparse.SUPPRESS)
+            action='store_true')
     arg_parser.add_argument(
             '--skip-visit-hashtags', dest='skip_visit_hashtags',
-            action='store_true', help=argparse.SUPPRESS)
+            action='store_true')
     arg_parser.add_argument(
-            '--skip-youtube-dl', dest='skip_youtube_dl',
-            action='store_true', help=argparse.SUPPRESS)
+            '--skip-youtube-dl', dest='skip_youtube_dl', action='store_true')
     add_common_options(arg_parser, argv)

     args = arg_parser.parse_args(args=argv[1:])
@@ -24,7 +24,7 @@ try:
 except ImportError as e:
     logging.critical(
             '%s: %s\n\nYou might need to run "pip install '
-            'brozzler[dashboard]".\nSee readme.rst for more information.',
+            'brozzler[dashboard]".\nSee README.rst for more information.',
             type(e).__name__, e)
     sys.exit(1)
 import doublethink
@@ -31,7 +31,7 @@ try:
 except ImportError as e:
     logging.critical(
             '%s: %s\n\nYou might need to run "pip install '
-            'brozzler[easy]".\nSee readme.rst for more information.',
+            'brozzler[easy]".\nSee README.rst for more information.',
             type(e).__name__, e)
     sys.exit(1)
 import argparse
@@ -31,7 +31,7 @@ try:
 except ImportError as e:
     logging.critical(
             '%s: %s\n\nYou might need to run "pip install '
-            'brozzler[easy]".\nSee readme.rst for more information.',
+            'brozzler[easy]".\nSee README.rst for more information.',
             type(e).__name__, e)
     sys.exit(1)
 import doublethink
@@ -270,7 +270,7 @@ Run pywb like so:

     $ PYWB_CONFIG_FILE=pywb.yml brozzler-wayback

-See readme.rst for more information.
+See README.rst for more information.
 '''

 # copied and pasted from cdxdomainspecific.py, only changes are commented as
@@ -113,7 +113,11 @@ class YoutubeDLSpy(urllib.request.BaseHandler):
 class BrozzlerWorker:
     logger = logging.getLogger(__module__ + "." + __qualname__)

-    HEARTBEAT_INTERVAL = 20.0
+    # 3⅓ min heartbeat interval => 10 min ttl
+    # This is kind of a long time, because `frontier.claim_sites()`, which runs
+    # in the same thread as the heartbeats, can take a while on a busy brozzler
+    # cluster with slow rethinkdb.
+    HEARTBEAT_INTERVAL = 200.0
     SITE_SESSION_MINUTES = 15

     def __init__(
@@ -347,7 +351,8 @@ class BrozzlerWorker:
             raise

     def full_and_thumb_jpegs(self, large_png):
-        img = PIL.Image.open(io.BytesIO(large_png))
+        # these screenshots never have any alpha (right?)
+        img = PIL.Image.open(io.BytesIO(large_png)).convert('RGB')

         out = io.BytesIO()
         img.save(out, "jpeg", quality=95)
--- a/job-conf.rst
+++ b/job-conf.rst
@@ -1,8 +1,9 @@
 Brozzler Job Configuration
 **************************

-Jobs are defined using yaml files. At least one seed url must be specified,
-everything else is optional.
+Jobs are used to brozzle multiple seeds and/or apply settings and scope rules,
+as defined using YAML files. At least one seed URL must be specified.
+All other configurations are optional.

 .. contents::

@@ -42,9 +43,8 @@ How inheritance works

 Most of the settings that apply to seeds can also be specified at the top
 level, in which case all seeds inherit those settings. If an option is
-specified both at the top level and at seed level, the results are merged with
-the seed-level value taking precedence in case of conflicts. It's probably
-easiest to make sense of this by way of an example.
+specified both at the top level and at the seed level, the results are merged.
+In cases of conflict, the seed-level value takes precedence.

 In the example yaml above, ``warcprox_meta`` is specified at the top level and
 at the seed level for the seed http://one.example.org/. At the top level we
@@ -74,7 +74,7 @@ be::
        - job1-stats
        - job1-seed1-stats

-Notice that:
+In this example:

 - There is a collision on ``warc-prefix`` and the seed-level value wins.
 - Since ``buckets`` is a list, the merged result includes all the values from
@@ -120,8 +120,8 @@ specify any seed settings.

 Seed-level-only settings
 ------------------------
-These settings can be specified only at the seed level, unlike most seed
-settings, which can also be specified at the top level.
+These settings can be specified only at the seed level, unlike the settings
+below, which can also be specified at the top level.

 ``url``
 ~~~~~~~
@@ -130,7 +130,7 @@ settings, which can also be specified at the top level.
 +========+==========+=========+
 | string | yes      | *n/a*   |
 +--------+----------+---------+
-The seed url. Crawling starts here.
+The seed URL. Brozzling starts here.

 ``username``
 ~~~~~~~~~~~~
@@ -153,14 +153,14 @@ If set, used to populate automatically detected login forms. If ``username``
 and ``password`` are configured for a seed, brozzler will look for a login form
 on each page it crawls for that seed. A form that has a single text or email
 field (the username), a single password field (``<input type="password">``),
-and has ``method="POST"`` is considered to be a login form. The form may have
-other fields like checkboxes and hidden fields. For these, brozzler will leave
+and has ``method="POST"`` is considered to be a login form. When forms have
+other fields like checkboxes and/or hidden fields, brozzler will leave
 the default values in place. Brozzler submits login forms after page load.
 Then brozzling proceeds as usual.

 Seed-level / top-level settings
 -------------------------------
-These are seed settings that can also be speficied at the top level, in which
+These are seed settings that can also be specified at the top level, in which
 case they are inherited by all seeds.

 ``metadata``
@@ -170,8 +170,9 @@ case they are inherited by all seeds.
 +============+==========+=========+
 | dictionary | no       | *none*  |
 +------------+----------+---------+
-Arbitrary information about the crawl job or site. Merely informative, not used
-by brozzler for anything. Could be of use to some external process.
+Information about the crawl job or site. Could be useful for external
+descriptive or informative metadata, but not used by brozzler in the course of
+archiving.

 ``time_limit``
 ~~~~~~~~~~~~~~
@@ -202,8 +203,9 @@ warcprox for archival crawling.
 +=========+==========+===========+
 | boolean | no       | ``false`` |
 +---------+----------+-----------+
-If set to ``true``, brozzler will happily crawl pages that would otherwise be
-blocked by robots.txt rules.
+If set to ``true``, brozzler will fetch pages that would otherwise be blocked
+by `robots.txt rules
+<https://en.wikipedia.org/wiki/Robots_exclusion_standard>`_.

 ``user_agent``
 ~~~~~~~~~~~~~~
@@ -213,9 +215,9 @@ blocked by robots.txt rules.
 | string  | no       | *none*  |
 +---------+----------+---------+
 The ``User-Agent`` header brozzler will send to identify itself to web servers.
-It's good ettiquette to include a project URL with a notice to webmasters that
-explains why you're crawling, how to block the crawler robots.txt and how to
-contact the operator if the crawl is causing problems.
+It is good etiquette to include a project URL with a notice to webmasters that
+explains why you are crawling, how to block the crawler via robots.txt, and how
+to contact the operator if the crawl is causing problems.

 ``warcprox_meta``
 ~~~~~~~~~~~~~~~~~
@@ -227,8 +229,9 @@ contact the operator if the crawl is causing problems.
 Specifies the ``Warcprox-Meta`` header to send with every request, if ``proxy``
 is configured. The value of the ``Warcprox-Meta`` header is a json blob. It is
 used to pass settings and information to warcprox. Warcprox does not forward
-the header on to the remote site. For full documentation on ``warcprox-meta``
-see https://github.com/internetarchive/warcprox/blob/master/api.rst#warcprox-meta-http-request-header
+the header on to the remote site. For further explanation of this field and
+its uses see
+https://github.com/internetarchive/warcprox/blob/master/api.rst

 Brozzler takes the configured value of ``warcprox_meta``, converts it to
 json and populates the Warcprox-Meta header with that value. For example::
@@ -256,8 +259,8 @@ Scope specificaion for the seed. See the "Scoping" section which follows.
 Scoping
 =======

-The scope of a seed determines which links are scheduled for crawling and which
-are not. Example::
+The scope of a seed determines which links are scheduled for crawling ("in
+scope") and which are not. For example::

    scope:
      accepts:
@@ -288,71 +291,69 @@ then the scope rule as a whole matches. For example::
    - domain: youngscholars.unimelb.edu.au
      substring: wp-login.php?action=logout

-This rule applies if the domain of the url is "youngscholars.unimelb.edu.au" or
+This rule applies if the domain of the URL is "youngscholars.unimelb.edu.au" or
 a subdomain, and the string "wp-login.php?action=logout" is found somewhere in
-the url.
+the URL.

-Brozzler applies these logical steps to decide whether a url is in or out of
+Brozzler applies these logical steps to decide whether a URL is in or out of
 scope:

-1. If the number of hops from seed is greater than ``max_hops``, the url is
+1. If the number of hops from seed is greater than ``max_hops``, the URL is
    **out of scope**.
-2. Otherwise, if any ``block`` rule matches, the url is **out of scope**.
-3. Otherwise, if any ``accept`` rule matches, the url is **in scope**.
-4. Otherwise, if the url is at most ``max_hops_off`` hops from the last page
-   that was in scope thanks to an ``accept`` rule, the url is **in scope**.
+2. Otherwise, if any ``block`` rule matches, the URL is **out of scope**.
+3. Otherwise, if any ``accept`` rule matches, the URL is **in scope**.
+4. Otherwise, if the URL is at most ``max_hops_off`` hops from the last page
+   that was in scope because of an ``accept`` rule, the url is **in scope**.
 5. Otherwise (no rules match), the url is **out of scope**.

-Notably, ``block`` rules take precedence over ``accept`` rules.
+In cases of conflict, ``block`` rules take precedence over ``accept`` rules.

-It may also be helpful to think about a list of scope rules as a boolean
-expression. For example::
+Scope rules may be conceived as a boolean expression. For example::

    blocks:
    - domain: youngscholars.unimelb.edu.au
      substring: wp-login.php?action=logout
    - domain: malware.us

-means block the url IF::
+means block the URL IF::

    ("domain: youngscholars.unimelb.edu.au" AND "substring: wp-login.php?action=logout") OR "domain: malware.us"

-Automatic scoping based on seed urls
+Automatic scoping based on seed URLs
 ------------------------------------
-Brozzler usually generates an ``accept`` scope rule based on the seed url. It
+Brozzler usually generates an ``accept`` scope rule based on the seed URL. It
 does this to fulfill the usual expectation that everything "under" the seed
 will be crawled.

-To generate the rule, brozzler canonicalizes the seed url using the `urlcanon
+To generate the rule, brozzler canonicalizes the seed URL using the `urlcanon
 <https://github.com/iipc/urlcanon>`_ library's "semantic" canonicalizer, then
 removes the query string if any, and finally serializes the result in SSURT
-[1]_ form. For example, a seed url of
+[1]_ form. For example, a seed URL of
 ``https://www.EXAMPLE.com:443/foo//bar?a=b&c=d#fdiap`` becomes
 ``com,example,www,//https:/foo/bar?a=b&c=d``.

-If the url in the browser location bar at the end of brozzling the seed page
-differs from the seed url, brozzler automatically adds a second ``accept`` rule
-to ensure the site is in scope, as if the new url were the original seed url.
-It does this so that, for example, if ``http://example.com/`` redirects to
-``http://www.example.com/``, the rest of the ``www.example.com`` is in scope.
-
-Brozzler derives its general approach to the seed surt from Heritrix, but
-differs in a few respects.
+Brozzler derives its general approach to the seed surt from `heritrix
+<https://github.com/internetarchive/heritrix3>`_, but differs in a few respects.

 1. Unlike heritrix, brozzler does not strip the path segment after the last
    slash.
 2. Canonicalization does not attempt to match heritrix exactly, though it
    usually does match.
-3. When generating a surt for an https url, heritrix changes the scheme to
-   http. For example, the heritrix surt for ``https://www.example.com/`` is
+3. When generating a SURT for an HTTPS URL, heritrix changes the scheme to
+   HTTP. For example, the heritrix SURT for ``https://www.example.com/`` is
    ``http://(com,example,www,)`` and this means that all of
    ``http://www.example.com/*`` and ``https://www.example.com/*`` are in
-   scope. It also means that a manually specified surt with scheme "https" does
+   scope. It also means that a manually specified SURT with scheme "https" does
    not match anything. Brozzler does no scheme munging.
-4. Brozzler identifies seed "redirects" by retrieving the url from the
+4. Brozzler identifies seed "redirects" by retrieving the URL from the
    browser's location bar at the end of brozzling the seed page, whereas
-   heritrix follows http 3xx redirects.
-5. Brozzler uses ssurt instead of surt.
+   heritrix follows HTTP 3XX redirects. If the URL in the browser
+   location bar at the end of brozzling the seed page differs from the seed
+   URL, brozzler automatically adds a second ``accept`` rule to ensure the
+   site is in scope, as if the new URL were the original seed URL. For example,
+   if ``http://example.com/`` redirects to ``http://www.example.com/``, the
+   rest of the ``www.example.com`` is in scope.
+5. Brozzler uses SSURT instead of SURT.
 6. There is currently no brozzler option to disable the automatically generated
    ``accept`` rules.

@@ -366,9 +367,9 @@ Scope settings
 +======+==========+=========+
 | list | no       | *none*  |
 +------+----------+---------+
-List of scope rules. If any of the rules match, and the url is within
-``max_hops`` from seed, and none of the ``block`` rules apply, the url is in
-scope.
+List of scope rules. If any of the rules match, the URL is within
+``max_hops`` from seed, and none of the ``block`` rules apply, then the URL is
+in scope and brozzled.

 ``blocks``
 ~~~~~~~~~~~
@@ -377,7 +378,8 @@ scope.
 +======+==========+=========+
 | list | no       | *none*  |
 +------+----------+---------+
-List of scope rules. If any of the rules match, the url is deemed out of scope.
+List of scope rules. If any of the rules match, then the URL is deemed out
+of scope and NOT brozzled.

 ``max_hops``
 ~~~~~~~~~~~~
|
@ -395,8 +397,8 @@ Maximum number of hops from seed.
|
||||||
+========+==========+=========+
|
+========+==========+=========+
|
||||||
| number | no | 0 |
|
| number | no | 0 |
|
||||||
+--------+----------+---------+
|
+--------+----------+---------+
|
||||||
Expands the scope to include urls up to this many hops from the last page that
|
Expands the scope to include URLs up to this many hops from the last page that
|
||||||
was in scope thanks to an ``accept`` rule.
|
was in scope because of an ``accept`` rule.
|
||||||
|
|
||||||
Scope rule conditions
|
Scope rule conditions
|
||||||
---------------------
|
---------------------
|
||||||
|
@@ -408,7 +410,7 @@ Scope rule conditions
 +========+==========+=========+
 | string | no       | *none*  |
 +--------+----------+---------+
-Matches if the host part of the canonicalized url is ``domain`` or a
+Matches if the host part of the canonicalized URL is ``domain`` or a
 subdomain.

 ``substring``
@@ -418,7 +420,7 @@ subdomain.
 +========+==========+=========+
 | string | no       | *none*  |
 +--------+----------+---------+
-Matches if ``substring`` is found anywhere in the canonicalized url.
+Matches if the ``substring`` value is found anywhere in the canonicalized URL.

 ``regex``
 ~~~~~~~~~
@@ -427,7 +429,7 @@ Matches if ``substring`` is found anywhere in the canonicalized url.
 +========+==========+=========+
 | string | no       | *none*  |
 +--------+----------+---------+
-Matches if the full canonicalized url matches ``regex``.
+Matches if the full canonicalized URL matches the ``regex`` regular expression.

 ``ssurt``
 ~~~~~~~~~
@@ -436,7 +438,8 @@ Matches if the full canonicalized url matches ``regex``.
 +========+==========+=========+
 | string | no       | *none*  |
 +--------+----------+---------+
-Matches if the canonicalized url in SSURT [1]_ form starts with ``ssurt``.
+Matches if the canonicalized URL in SSURT [1]_ form starts with the ``ssurt``
+value.

 ``surt``
 ~~~~~~~~
@@ -445,7 +448,8 @@ Matches if the canonicalized url in SSURT [1]_ form starts with ``ssurt``.
 +========+==========+=========+
 | string | no       | *none*  |
 +--------+----------+---------+
-Matches if the canonicalized url in SURT [2]_ form starts with ``surt``.
+Matches if the canonicalized URL in SURT [2]_ form starts with the ``surt``
+value.

 ``parent_url_regex``
 ~~~~~~~~~~~~~~~~~~~~
@@ -454,15 +458,15 @@ Matches if the canonicalized url in SURT [2]_ form starts with ``surt``.
 +========+==========+=========+
 | string | no       | *none*  |
 +--------+----------+---------+
-Matches if the full canonicalized parent url matches ``regex``. The parent url
-is the url of the page in which the link was found.
+Matches if the full canonicalized parent URL matches a regular expression.
+The parent URL is the URL of the page in which a link is found.

 Using ``warcprox_meta``
 =======================
-``warcprox_meta`` deserves some more discussion. It plays a very important role
-in brozzler job configuration. ``warcprox_meta`` is the way you set the
-filenames of the warcs for your crawl. For example, if each seed should have a
-different warc name prefix, you might have a job configured this way::
+``warcprox_meta`` plays a very important role in brozzler job configuration.
+It sets the filenames of the WARC files created by a job. For example, if each
+seed should have a different WARC filename prefix, you might configure a job
+this way::

    seeds:
    - url: https://example.com/
@@ -472,9 +476,9 @@ different warc name prefix, you might have a job configured this way::
      warcprox_meta:
        warc-prefix: seed2

-``warcprox_meta`` is also the way to put limits on the size of the crawl job.
-For example, this configuration will stop the crawl after about 100 MB of novel
-content has been crawled::
+``warcprox_meta`` may also be used to limit the size of the job. For example,
+this configuration will stop the crawl after about 100 MB of novel content has
+been archived::

    seeds:
    - url: https://example.com/
@@ -486,10 +490,10 @@ content has been crawled::
        limits:
          my-job/new/wire_bytes: 100000000

-To prevent any urls from a host from being captured, it's not sufficient to use
+To prevent any URLs from a host from being captured, it is not sufficient to use
 a ``scope`` rule as described above. That kind of scoping only applies to
-navigational links discovered in crawled pages. To make absolutely sure no url
-from a given host is fetched, not even (say) an image embedded in a page, use
+navigational links discovered in crawled pages. To make absolutely sure that no
+URL from a given host is fetched, not even an image embedded in a page, use
 ``warcprox_meta`` like so::

    warcprox_meta:
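The literal block the text introduces continues beyond this hunk. As a hedged sketch of what such a host-blocking rule can look like (the rule syntax is warcprox's, documented in its api.rst; the domain is a placeholder)::

    warcprox_meta:
        blocks:
        - domain: malware.example.com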
--- a/setup.py
+++ b/setup.py
@@ -32,12 +32,12 @@ def find_package_data(package):

 setuptools.setup(
         name='brozzler',
-        version='1.1b13.dev291',
+        version='1.4.dev297',
         description='Distributed web crawling with browsers',
         url='https://github.com/internetarchive/brozzler',
         author='Noah Levitt',
         author_email='nlevitt@archive.org',
-        long_description=open('readme.rst', mode='rb').read().decode('UTF-8'),
+        long_description=open('README.rst', mode='rb').read().decode('UTF-8'),
         license='Apache License 2.0',
         packages=['brozzler', 'brozzler.dashboard'],
         package_data={
@@ -68,7 +68,7 @@ setuptools.setup(
             'reppy==0.3.4',
             'requests',
             'websocket-client!=0.39.0',
-            'pillow==3.3.0',
+            'pillow>=5.2.0',
             'urlcanon>=0.1.dev23',
             'doublethink>=0.2.0.dev88',
             'rethinkdb>=2.3,<2.4',
@@ -769,7 +769,7 @@ def test_time_limit(httpd):
     rr = doublethink.Rethinker('localhost', db='brozzler')
     frontier = brozzler.RethinkDbFrontier(rr)

-    # create a new job with three sites that could be crawled forever
+    # create a new job with one seed that could be crawled forever
     job_conf = {'seeds': [{
         'url': 'http://localhost:%s/infinite/foo/' % httpd.server_port,
         'time_limit': 20}]}
@@ -789,6 +789,10 @@ def test_time_limit(httpd):
     assert sites[0].status == 'FINISHED_TIME_LIMIT'

     # all sites finished so job should be finished too
-    job.refresh()
+    start = time.time()
+    job.refresh()
+    while not job.status == 'FINISHED' and time.time() - start < 10:
+        time.sleep(0.5)
+        job.refresh()
     assert job.status == 'FINISHED'
