mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-08-08 06:22:23 -04:00
Merge branch 'requestIntercepted' into qa
This commit is contained in:
commit
533a5e74ee
7 changed files with 48 additions and 38 deletions
|
@ -1,8 +1,10 @@
|
||||||
|
dist: xenial
|
||||||
language: python
|
language: python
|
||||||
python:
|
python:
|
||||||
- 3.4
|
- 3.4
|
||||||
- 3.5
|
- 3.5
|
||||||
- 3.6
|
- 3.6
|
||||||
|
- 3.7
|
||||||
sudo: required
|
sudo: required
|
||||||
before_install:
|
before_install:
|
||||||
- sudo pip install --upgrade setuptools pip
|
- sudo pip install --upgrade setuptools pip
|
||||||
|
|
|
@ -106,7 +106,7 @@ def behaviors(behaviors_dir=None):
|
||||||
d = behaviors_dir or os.path.dirname(__file__)
|
d = behaviors_dir or os.path.dirname(__file__)
|
||||||
behaviors_yaml = os.path.join(d, 'behaviors.yaml')
|
behaviors_yaml = os.path.join(d, 'behaviors.yaml')
|
||||||
with open(behaviors_yaml) as fin:
|
with open(behaviors_yaml) as fin:
|
||||||
_behaviors = yaml.load(fin)
|
_behaviors = yaml.safe_load(fin)
|
||||||
return _behaviors
|
return _behaviors
|
||||||
|
|
||||||
def behavior_script(url, template_parameters=None, behaviors_dir=None):
|
def behavior_script(url, template_parameters=None, behaviors_dir=None):
|
||||||
|
|
|
@ -241,6 +241,13 @@ class WebsockReceiverThread(threading.Thread):
|
||||||
elif message['method'] == 'Network.requestWillBeSent':
|
elif message['method'] == 'Network.requestWillBeSent':
|
||||||
if self.on_request:
|
if self.on_request:
|
||||||
self.on_request(message)
|
self.on_request(message)
|
||||||
|
elif message['method'] == 'Network.requestIntercepted':
|
||||||
|
if 'params' in message and 'authChallenge' in message['params']:
|
||||||
|
auth_challenge = message['params']['authChallenge']
|
||||||
|
self.logger.info('Network.requestIntercepted AuthChallenge %s %s',
|
||||||
|
auth_challenge['scheme'], auth_challenge['origin'])
|
||||||
|
else:
|
||||||
|
self.logger.info('Network.requestIntercepted non-AuthChallenge')
|
||||||
elif message['method'] == 'Page.interstitialShown':
|
elif message['method'] == 'Page.interstitialShown':
|
||||||
# AITFIVE-1529: handle http auth
|
# AITFIVE-1529: handle http auth
|
||||||
# we should kill the browser when we receive Page.interstitialShown and
|
# we should kill the browser when we receive Page.interstitialShown and
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
brozzler/models.py - model classes representing jobs, sites, and pages, with
|
brozzler/models.py - model classes representing jobs, sites, and pages, with
|
||||||
related logic
|
related logic
|
||||||
|
|
||||||
Copyright (C) 2014-2018 Internet Archive
|
Copyright (C) 2014-2019 Internet Archive
|
||||||
|
|
||||||
Licensed under the Apache License, Version 2.0 (the "License");
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
you may not use this file except in compliance with the License.
|
you may not use this file except in compliance with the License.
|
||||||
|
@ -35,7 +35,7 @@ import yaml
|
||||||
def load_schema():
|
def load_schema():
|
||||||
schema_file = os.path.join(os.path.dirname(__file__), 'job_schema.yaml')
|
schema_file = os.path.join(os.path.dirname(__file__), 'job_schema.yaml')
|
||||||
with open(schema_file) as f:
|
with open(schema_file) as f:
|
||||||
return yaml.load(f)
|
return yaml.safe_load(f)
|
||||||
|
|
||||||
class JobValidator(cerberus.Validator):
|
class JobValidator(cerberus.Validator):
|
||||||
def _validate_type_url(self, value):
|
def _validate_type_url(self, value):
|
||||||
|
@ -48,7 +48,7 @@ class InvalidJobConf(Exception):
|
||||||
|
|
||||||
def validate_conf(job_conf, schema=load_schema()):
|
def validate_conf(job_conf, schema=load_schema()):
|
||||||
v = JobValidator(schema)
|
v = JobValidator(schema)
|
||||||
if not v.validate(job_conf):
|
if not v.validate(job_conf, normalize=False):
|
||||||
raise InvalidJobConf(v.errors)
|
raise InvalidJobConf(v.errors)
|
||||||
|
|
||||||
def merge(a, b):
|
def merge(a, b):
|
||||||
|
@ -68,7 +68,7 @@ def new_job_file(frontier, job_conf_file):
|
||||||
'''Returns new Job.'''
|
'''Returns new Job.'''
|
||||||
logging.info("loading %s", job_conf_file)
|
logging.info("loading %s", job_conf_file)
|
||||||
with open(job_conf_file) as f:
|
with open(job_conf_file) as f:
|
||||||
job_conf = yaml.load(f)
|
job_conf = yaml.safe_load(f)
|
||||||
return new_job(frontier, job_conf)
|
return new_job(frontier, job_conf)
|
||||||
|
|
||||||
def new_job(frontier, job_conf):
|
def new_job(frontier, job_conf):
|
||||||
|
|
58
job-conf.rst
58
job-conf.rst
|
@ -1,8 +1,8 @@
|
||||||
Brozzler Job Configuration
|
Brozzler Job Configuration
|
||||||
**************************
|
**************************
|
||||||
|
|
||||||
Jobs are used to brozzle multiple seeds and/or apply settings and scope rules,
|
Jobs are used to brozzle multiple seeds and/or apply settings and scope rules,
|
||||||
as defined by YAML files. At least one seed URL must be specified.
|
as defined by YAML files. At least one seed URL must be specified.
|
||||||
All other configurations are optional.
|
All other configurations are optional.
|
||||||
|
|
||||||
.. contents::
|
.. contents::
|
||||||
|
@ -43,7 +43,7 @@ How inheritance works
|
||||||
|
|
||||||
Most of the settings that apply to seeds can also be specified at the top
|
Most of the settings that apply to seeds can also be specified at the top
|
||||||
level, in which case all seeds inherit those settings. If an option is
|
level, in which case all seeds inherit those settings. If an option is
|
||||||
specified both at the top level and at the seed level, the results are merged.
|
specified both at the top level and at the seed level, the results are merged.
|
||||||
In cases of conflict, the seed-level value takes precedence.
|
In cases of conflict, the seed-level value takes precedence.
|
||||||
|
|
||||||
In the example yaml above, ``warcprox_meta`` is specified at the top level and
|
In the example yaml above, ``warcprox_meta`` is specified at the top level and
|
||||||
|
@ -170,8 +170,8 @@ case they are inherited by all seeds.
|
||||||
+============+==========+=========+
|
+============+==========+=========+
|
||||||
| dictionary | no | *none* |
|
| dictionary | no | *none* |
|
||||||
+------------+----------+---------+
|
+------------+----------+---------+
|
||||||
Information about the crawl job or site. Could be useful for external
|
Information about the crawl job or site. Could be useful for external
|
||||||
descriptive or informative metadata, but not used by brozzler in the course of
|
descriptive or informative metadata, but not used by brozzler in the course of
|
||||||
archiving.
|
archiving.
|
||||||
|
|
||||||
``time_limit``
|
``time_limit``
|
||||||
|
@ -203,8 +203,8 @@ warcprox for archival crawling.
|
||||||
+=========+==========+===========+
|
+=========+==========+===========+
|
||||||
| boolean | no | ``false`` |
|
| boolean | no | ``false`` |
|
||||||
+---------+----------+-----------+
|
+---------+----------+-----------+
|
||||||
If set to ``true``, brozzler will fetch pages that would otherwise be blocked
|
If set to ``true``, brozzler will fetch pages that would otherwise be blocked
|
||||||
by `robots.txt rules
|
by `robots.txt rules
|
||||||
<https://en.wikipedia.org/wiki/Robots_exclusion_standard>`_.
|
<https://en.wikipedia.org/wiki/Robots_exclusion_standard>`_.
|
||||||
|
|
||||||
``user_agent``
|
``user_agent``
|
||||||
|
@ -216,7 +216,7 @@ by `robots.txt rules
|
||||||
+---------+----------+---------+
|
+---------+----------+---------+
|
||||||
The ``User-Agent`` header brozzler will send to identify itself to web servers.
|
The ``User-Agent`` header brozzler will send to identify itself to web servers.
|
||||||
It is good etiquette to include a project URL with a notice to webmasters that
|
It is good etiquette to include a project URL with a notice to webmasters that
|
||||||
explains why you are crawling, how to block the crawler via robots.txt, and how
|
explains why you are crawling, how to block the crawler via robots.txt, and how
|
||||||
to contact the operator if the crawl is causing problems.
|
to contact the operator if the crawl is causing problems.
|
||||||
|
|
||||||
``warcprox_meta``
|
``warcprox_meta``
|
||||||
|
@ -229,8 +229,8 @@ to contact the operator if the crawl is causing problems.
|
||||||
Specifies the ``Warcprox-Meta`` header to send with every request, if ``proxy``
|
Specifies the ``Warcprox-Meta`` header to send with every request, if ``proxy``
|
||||||
is configured. The value of the ``Warcprox-Meta`` header is a json blob. It is
|
is configured. The value of the ``Warcprox-Meta`` header is a json blob. It is
|
||||||
used to pass settings and information to warcprox. Warcprox does not forward
|
used to pass settings and information to warcprox. Warcprox does not forward
|
||||||
the header on to the remote site. For further explanation of this field and
|
the header on to the remote site. For further explanation of this field and
|
||||||
its uses see
|
its uses see
|
||||||
https://github.com/internetarchive/warcprox/blob/master/api.rst
|
https://github.com/internetarchive/warcprox/blob/master/api.rst
|
||||||
|
|
||||||
Brozzler takes the configured value of ``warcprox_meta``, converts it to
|
Brozzler takes the configured value of ``warcprox_meta``, converts it to
|
||||||
|
@ -259,7 +259,7 @@ Scope specification for the seed. See the "Scoping" section which follows.
|
||||||
Scoping
|
Scoping
|
||||||
=======
|
=======
|
||||||
|
|
||||||
The scope of a seed determines which links are scheduled for crawling ("in
|
The scope of a seed determines which links are scheduled for crawling ("in
|
||||||
scope") and which are not. For example::
|
scope") and which are not. For example::
|
||||||
|
|
||||||
scope:
|
scope:
|
||||||
|
@ -330,9 +330,9 @@ To generate the rule, brozzler canonicalizes the seed URL using the `urlcanon
|
||||||
removes the query string if any, and finally serializes the result in SSURT
|
removes the query string if any, and finally serializes the result in SSURT
|
||||||
[1]_ form. For example, a seed URL of
|
[1]_ form. For example, a seed URL of
|
||||||
``https://www.EXAMPLE.com:443/foo//bar?a=b&c=d#fdiap`` becomes
|
``https://www.EXAMPLE.com:443/foo//bar?a=b&c=d#fdiap`` becomes
|
||||||
``com,example,www,//https:/foo/bar?a=b&c=d``.
|
``com,example,www,//https:/foo/bar``.
|
||||||
|
|
||||||
Brozzler derives its general approach to the seed surt from `heritrix
|
Brozzler derives its general approach to the seed surt from `heritrix
|
||||||
<https://github.com/internetarchive/heritrix3>`_, but differs in a few respects.
|
<https://github.com/internetarchive/heritrix3>`_, but differs in a few respects.
|
||||||
|
|
||||||
1. Unlike heritrix, brozzler does not strip the path segment after the last
|
1. Unlike heritrix, brozzler does not strip the path segment after the last
|
||||||
|
@ -347,11 +347,11 @@ Brozzler derives its general approach to the seed surt from `heritrix
|
||||||
not match anything. Brozzler does no scheme munging.
|
not match anything. Brozzler does no scheme munging.
|
||||||
4. Brozzler identifies seed "redirects" by retrieving the URL from the
|
4. Brozzler identifies seed "redirects" by retrieving the URL from the
|
||||||
browser's location bar at the end of brozzling the seed page, whereas
|
browser's location bar at the end of brozzling the seed page, whereas
|
||||||
heritrix follows HTTP 3XX redirects. If the URL in the browser
|
heritrix follows HTTP 3XX redirects. If the URL in the browser
|
||||||
location bar at the end of brozzling the seed page differs from the seed
|
location bar at the end of brozzling the seed page differs from the seed
|
||||||
URL, brozzler automatically adds a second ``accept`` rule to ensure the
|
URL, brozzler automatically adds a second ``accept`` rule to ensure the
|
||||||
site is in scope, as if the new URL were the original seed URL. For example,
|
site is in scope, as if the new URL were the original seed URL. For example,
|
||||||
if ``http://example.com/`` redirects to ``http://www.example.com/``, the
|
if ``http://example.com/`` redirects to ``http://www.example.com/``, the
|
||||||
rest of ``www.example.com`` is in scope.
|
rest of ``www.example.com`` is in scope.
|
||||||
5. Brozzler uses SSURT instead of SURT.
|
5. Brozzler uses SSURT instead of SURT.
|
||||||
6. There is currently no brozzler option to disable the automatically generated
|
6. There is currently no brozzler option to disable the automatically generated
|
||||||
|
@ -368,7 +368,7 @@ Scope settings
|
||||||
| list | no | *none* |
|
| list | no | *none* |
|
||||||
+------+----------+---------+
|
+------+----------+---------+
|
||||||
List of scope rules. If any of the rules match, the URL is within
|
List of scope rules. If any of the rules match, the URL is within
|
||||||
``max_hops`` from seed, and none of the ``block`` rules apply, then the URL is
|
``max_hops`` from seed, and none of the ``block`` rules apply, then the URL is
|
||||||
in scope and brozzled.
|
in scope and brozzled.
|
||||||
|
|
||||||
``blocks``
|
``blocks``
|
||||||
|
@ -378,7 +378,7 @@ in scope and brozzled.
|
||||||
+======+==========+=========+
|
+======+==========+=========+
|
||||||
| list | no | *none* |
|
| list | no | *none* |
|
||||||
+------+----------+---------+
|
+------+----------+---------+
|
||||||
List of scope rules. If any of the rules match, then the URL is deemed out
|
List of scope rules. If any of the rules match, then the URL is deemed out
|
||||||
of scope and NOT brozzled.
|
of scope and NOT brozzled.
|
||||||
|
|
||||||
``max_hops``
|
``max_hops``
|
||||||
|
@ -438,7 +438,7 @@ Matches if the full canonicalized URL matches a regular expression.
|
||||||
+========+==========+=========+
|
+========+==========+=========+
|
||||||
| string | no | *none* |
|
| string | no | *none* |
|
||||||
+--------+----------+---------+
|
+--------+----------+---------+
|
||||||
Matches if the canonicalized URL in SSURT [1]_ form starts with the ``ssurt``
|
Matches if the canonicalized URL in SSURT [1]_ form starts with the ``ssurt``
|
||||||
value.
|
value.
|
||||||
|
|
||||||
``surt``
|
``surt``
|
||||||
|
@ -448,7 +448,7 @@ value.
|
||||||
+========+==========+=========+
|
+========+==========+=========+
|
||||||
| string | no | *none* |
|
| string | no | *none* |
|
||||||
+--------+----------+---------+
|
+--------+----------+---------+
|
||||||
Matches if the canonicalized URL in SURT [2]_ form starts with the ``surt``
|
Matches if the canonicalized URL in SURT [2]_ form starts with the ``surt``
|
||||||
value.
|
value.
|
||||||
|
|
||||||
``parent_url_regex``
|
``parent_url_regex``
|
||||||
|
@ -458,14 +458,14 @@ value.
|
||||||
+========+==========+=========+
|
+========+==========+=========+
|
||||||
| string | no | *none* |
|
| string | no | *none* |
|
||||||
+--------+----------+---------+
|
+--------+----------+---------+
|
||||||
Matches if the full canonicalized parent URL matches a regular expression.
|
Matches if the full canonicalized parent URL matches a regular expression.
|
||||||
The parent URL is the URL of the page in which a link is found.
|
The parent URL is the URL of the page in which a link is found.
|
||||||
|
|
||||||
Using ``warcprox_meta``
|
Using ``warcprox_meta``
|
||||||
=======================
|
=======================
|
||||||
``warcprox_meta`` plays a very important role in brozzler job configuration.
|
``warcprox_meta`` plays a very important role in brozzler job configuration.
|
||||||
It sets the filenames of the WARC files created by a job. For example, if each
|
It sets the filenames of the WARC files created by a job. For example, if each
|
||||||
seed should have a different WARC filename prefix, you might configure a job
|
seed should have a different WARC filename prefix, you might configure a job
|
||||||
this way::
|
this way::
|
||||||
|
|
||||||
seeds:
|
seeds:
|
||||||
|
@ -476,8 +476,8 @@ this way::
|
||||||
warcprox_meta:
|
warcprox_meta:
|
||||||
warc-prefix: seed2
|
warc-prefix: seed2
|
||||||
|
|
||||||
``warcprox_meta`` may also be used to limit the size of the job. For example,
|
``warcprox_meta`` may also be used to limit the size of the job. For example,
|
||||||
this configuration will stop the crawl after about 100 MB of novel content has
|
this configuration will stop the crawl after about 100 MB of novel content has
|
||||||
been archived::
|
been archived::
|
||||||
|
|
||||||
seeds:
|
seeds:
|
||||||
|
@ -492,7 +492,7 @@ been archived::
|
||||||
|
|
||||||
To prevent any URLs from a host from being captured, it is not sufficient to use
|
To prevent any URLs from a host from being captured, it is not sufficient to use
|
||||||
a ``scope`` rule as described above. That kind of scoping only applies to
|
a ``scope`` rule as described above. That kind of scoping only applies to
|
||||||
navigational links discovered in crawled pages. To make absolutely sure that no
|
navigational links discovered in crawled pages. To make absolutely sure that no
|
||||||
url from a given host is fetched--not even an image embedded in a page--use
|
url from a given host is fetched--not even an image embedded in a page--use
|
||||||
``warcprox_meta`` like so::
|
``warcprox_meta`` like so::
|
||||||
|
|
||||||
|
|
7
setup.py
7
setup.py
|
@ -2,7 +2,7 @@
|
||||||
'''
|
'''
|
||||||
setup.py - brozzler setup script
|
setup.py - brozzler setup script
|
||||||
|
|
||||||
Copyright (C) 2014-2018 Internet Archive
|
Copyright (C) 2014-2019 Internet Archive
|
||||||
|
|
||||||
Licensed under the Apache License, Version 2.0 (the "License");
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
you may not use this file except in compliance with the License.
|
you may not use this file except in compliance with the License.
|
||||||
|
@ -32,7 +32,7 @@ def find_package_data(package):
|
||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='brozzler',
|
name='brozzler',
|
||||||
version='1.5.0',
|
version='1.5.4',
|
||||||
description='Distributed web crawling with browsers',
|
description='Distributed web crawling with browsers',
|
||||||
url='https://github.com/internetarchive/brozzler',
|
url='https://github.com/internetarchive/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
@ -72,7 +72,7 @@ setuptools.setup(
|
||||||
'pillow>=5.2.0',
|
'pillow>=5.2.0',
|
||||||
'urlcanon>=0.1.dev23',
|
'urlcanon>=0.1.dev23',
|
||||||
'doublethink>=0.2.0',
|
'doublethink>=0.2.0',
|
||||||
'rethinkdb>=2.3',
|
'rethinkdb>=2.3,<2.4',
|
||||||
'cerberus>=1.0.1',
|
'cerberus>=1.0.1',
|
||||||
'jinja2>=2.10',
|
'jinja2>=2.10',
|
||||||
'cryptography>=2.3',
|
'cryptography>=2.3',
|
||||||
|
@ -98,6 +98,7 @@ setuptools.setup(
|
||||||
'Programming Language :: Python :: 3.4',
|
'Programming Language :: Python :: 3.4',
|
||||||
'Programming Language :: Python :: 3.5',
|
'Programming Language :: Python :: 3.5',
|
||||||
'Programming Language :: Python :: 3.6',
|
'Programming Language :: Python :: 3.6',
|
||||||
|
'Programming Language :: Python :: 3.7',
|
||||||
'Topic :: Internet :: WWW/HTTP',
|
'Topic :: Internet :: WWW/HTTP',
|
||||||
'Topic :: System :: Archiving',
|
'Topic :: System :: Archiving',
|
||||||
])
|
])
|
||||||
|
|
|
@ -151,7 +151,7 @@ def test_robots_connection_failure():
|
||||||
assert brozzler.is_permitted_by_robots(site, url)
|
assert brozzler.is_permitted_by_robots(site, url)
|
||||||
|
|
||||||
def test_scoping():
|
def test_scoping():
|
||||||
test_scope = yaml.load('''
|
test_scope = yaml.safe_load('''
|
||||||
max_hops: 100
|
max_hops: 100
|
||||||
accepts:
|
accepts:
|
||||||
- url_match: REGEX_MATCH
|
- url_match: REGEX_MATCH
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue