Merge branch 'requestIntercepted' into qa

This commit is contained in:
Barbara Miller 2019-05-14 12:00:23 -07:00
commit 533a5e74ee
7 changed files with 48 additions and 38 deletions

View File

@ -1,8 +1,10 @@
dist: xenial
language: python
python:
- 3.4
- 3.5
- 3.6
- 3.7
sudo: required
before_install:
- sudo pip install --upgrade setuptools pip

View File

@ -106,7 +106,7 @@ def behaviors(behaviors_dir=None):
d = behaviors_dir or os.path.dirname(__file__)
behaviors_yaml = os.path.join(d, 'behaviors.yaml')
with open(behaviors_yaml) as fin:
_behaviors = yaml.load(fin)
_behaviors = yaml.safe_load(fin)
return _behaviors
def behavior_script(url, template_parameters=None, behaviors_dir=None):

View File

@ -241,6 +241,13 @@ class WebsockReceiverThread(threading.Thread):
elif message['method'] == 'Network.requestWillBeSent':
if self.on_request:
self.on_request(message)
elif message['method'] == 'Network.requestIntercepted':
if 'params' in message and 'authChallenge' in message['params']:
auth_challenge = message['params']['authChallenge']
self.logger.info('Network.requestIntercepted AuthChallenge %s %s',
auth_challenge['scheme'], auth_challenge['origin'])
else:
self.logger.info('Network.requestIntercepted non-AuthChallenge')
elif message['method'] == 'Page.interstitialShown':
# AITFIVE-1529: handle http auth
# we should kill the browser when we receive Page.interstitialShown and

View File

@ -2,7 +2,7 @@
brozzler/models.py - model classes representing jobs, sites, and pages, with
related logic
Copyright (C) 2014-2018 Internet Archive
Copyright (C) 2014-2019 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@ -35,7 +35,7 @@ import yaml
def load_schema():
schema_file = os.path.join(os.path.dirname(__file__), 'job_schema.yaml')
with open(schema_file) as f:
return yaml.load(f)
return yaml.safe_load(f)
class JobValidator(cerberus.Validator):
def _validate_type_url(self, value):
@ -48,7 +48,7 @@ class InvalidJobConf(Exception):
def validate_conf(job_conf, schema=load_schema()):
v = JobValidator(schema)
if not v.validate(job_conf):
if not v.validate(job_conf, normalize=False):
raise InvalidJobConf(v.errors)
def merge(a, b):
@ -68,7 +68,7 @@ def new_job_file(frontier, job_conf_file):
'''Returns new Job.'''
logging.info("loading %s", job_conf_file)
with open(job_conf_file) as f:
job_conf = yaml.load(f)
job_conf = yaml.safe_load(f)
return new_job(frontier, job_conf)
def new_job(frontier, job_conf):

View File

@ -1,8 +1,8 @@
Brozzler Job Configuration
**************************
Jobs are used to brozzle multiple seeds and/or apply settings and scope rules,
as defined using YAML files. At least one seed URL must be specified.
Jobs are used to brozzle multiple seeds and/or apply settings and scope rules,
as defined using YAML files. At least one seed URL must be specified.
All other configurations are optional.
.. contents::
@ -43,7 +43,7 @@ How inheritance works
Most of the settings that apply to seeds can also be specified at the top
level, in which case all seeds inherit those settings. If an option is
specified both at the top level and at the seed level, the results are merged.
specified both at the top level and at the seed level, the results are merged.
In cases of conflict, the seed-level value takes precedence.
In the example yaml above, ``warcprox_meta`` is specified at the top level and
@ -170,8 +170,8 @@ case they are inherited by all seeds.
+============+==========+=========+
| dictionary | no | *none* |
+------------+----------+---------+
Information about the crawl job or site. Could be useful for external
descriptive or informative metadata, but not used by brozzler in the course of
Information about the crawl job or site. Could be useful for external
descriptive or informative metadata, but not used by brozzler in the course of
archiving.
``time_limit``
@ -203,8 +203,8 @@ warcprox for archival crawling.
+=========+==========+===========+
| boolean | no | ``false`` |
+---------+----------+-----------+
If set to ``true``, brozzler will fetch pages that would otherwise be blocked
by `robots.txt rules
If set to ``true``, brozzler will fetch pages that would otherwise be blocked
by `robots.txt rules
<https://en.wikipedia.org/wiki/Robots_exclusion_standard>`_.
``user_agent``
@ -216,7 +216,7 @@ by `robots.txt rules
+---------+----------+---------+
The ``User-Agent`` header brozzler will send to identify itself to web servers.
It is good etiquette to include a project URL with a notice to webmasters that
explains why you are crawling, how to block the crawler via robots.txt, and how
explains why you are crawling, how to block the crawler via robots.txt, and how
to contact the operator if the crawl is causing problems.
``warcprox_meta``
@ -229,8 +229,8 @@ to contact the operator if the crawl is causing problems.
Specifies the ``Warcprox-Meta`` header to send with every request, if ``proxy``
is configured. The value of the ``Warcprox-Meta`` header is a json blob. It is
used to pass settings and information to warcprox. Warcprox does not forward
the header on to the remote site. For further explanation of this field and
its uses see
the header on to the remote site. For further explanation of this field and
its uses see
https://github.com/internetarchive/warcprox/blob/master/api.rst
Brozzler takes the configured value of ``warcprox_meta``, converts it to
@ -259,7 +259,7 @@ Scope specification for the seed. See the "Scoping" section which follows.
Scoping
=======
The scope of a seed determines which links are scheduled for crawling ("in
The scope of a seed determines which links are scheduled for crawling ("in
scope") and which are not. For example::
scope:
@ -330,9 +330,9 @@ To generate the rule, brozzler canonicalizes the seed URL using the `urlcanon
removes the query string if any, and finally serializes the result in SSURT
[1]_ form. For example, a seed URL of
``https://www.EXAMPLE.com:443/foo//bar?a=b&c=d#fdiap`` becomes
``com,example,www,//https:/foo/bar?a=b&c=d``.
``com,example,www,//https:/foo/bar``.
Brozzler derives its general approach to the seed surt from `heritrix
Brozzler derives its general approach to the seed surt from `heritrix
<https://github.com/internetarchive/heritrix3>`_, but differs in a few respects.
1. Unlike heritrix, brozzler does not strip the path segment after the last
@ -347,11 +347,11 @@ Brozzler derives its general approach to the seed surt from `heritrix
not match anything. Brozzler does no scheme munging.
4. Brozzler identifies seed "redirects" by retrieving the URL from the
browser's location bar at the end of brozzling the seed page, whereas
heritrix follows HTTP 3XX redirects. If the URL in the browser
location bar at the end of brozzling the seed page differs from the seed
URL, brozzler automatically adds a second ``accept`` rule to ensure the
site is in scope, as if the new URL were the original seed URL. For example,
if ``http://example.com/`` redirects to ``http://www.example.com/``, the
heritrix follows HTTP 3XX redirects. If the URL in the browser
location bar at the end of brozzling the seed page differs from the seed
URL, brozzler automatically adds a second ``accept`` rule to ensure the
site is in scope, as if the new URL were the original seed URL. For example,
if ``http://example.com/`` redirects to ``http://www.example.com/``, the
rest of the ``www.example.com`` is in scope.
5. Brozzler uses SSURT instead of SURT.
6. There is currently no brozzler option to disable the automatically generated
@ -368,7 +368,7 @@ Scope settings
| list | no | *none* |
+------+----------+---------+
List of scope rules. If any of the rules match, the URL is within
``max_hops`` from seed, and none of the ``block`` rules apply, then the URL is
``max_hops`` from seed, and none of the ``block`` rules apply, then the URL is
in scope and brozzled.
``blocks``
@ -378,7 +378,7 @@ in scope and brozzled.
+======+==========+=========+
| list | no | *none* |
+------+----------+---------+
List of scope rules. If any of the rules match, then the URL is deemed out
List of scope rules. If any of the rules match, then the URL is deemed out
of scope and NOT brozzled.
``max_hops``
@ -438,7 +438,7 @@ Matches if the full canonicalized URL matches a regular expression.
+========+==========+=========+
| string | no | *none* |
+--------+----------+---------+
Matches if the canonicalized URL in SSURT [1]_ form starts with the ``ssurt``
Matches if the canonicalized URL in SSURT [1]_ form starts with the ``ssurt``
value.
``surt``
@ -448,7 +448,7 @@ value.
+========+==========+=========+
| string | no | *none* |
+--------+----------+---------+
Matches if the canonicalized URL in SURT [2]_ form starts with the ``surt``
Matches if the canonicalized URL in SURT [2]_ form starts with the ``surt``
value.
``parent_url_regex``
@ -458,14 +458,14 @@ value.
+========+==========+=========+
| string | no | *none* |
+--------+----------+---------+
Matches if the full canonicalized parent URL matches a regular expression.
Matches if the full canonicalized parent URL matches a regular expression.
The parent URL is the URL of the page in which a link is found.
Using ``warcprox_meta``
=======================
``warcprox_meta`` plays a very important role in brozzler job configuration.
It sets the filenames of the WARC files created by a job. For example, if each
seed should have a different WARC filename prefix, you might configure a job
``warcprox_meta`` plays a very important role in brozzler job configuration.
It sets the filenames of the WARC files created by a job. For example, if each
seed should have a different WARC filename prefix, you might configure a job
this way::
seeds:
@ -476,8 +476,8 @@ this way::
warcprox_meta:
warc-prefix: seed2
``warcprox_meta`` may also be used to limit the size of the job. For example,
this configuration will stop the crawl after about 100 MB of novel content has
``warcprox_meta`` may also be used to limit the size of the job. For example,
this configuration will stop the crawl after about 100 MB of novel content has
been archived::
seeds:
@ -492,7 +492,7 @@ been archived::
To prevent any URLs from a host from being captured, it is not sufficient to use
a ``scope`` rule as described above. That kind of scoping only applies to
navigational links discovered in crawled pages. To make absolutely sure that no
navigational links discovered in crawled pages. To make absolutely sure that no
url from a given host is fetched--not even an image embedded in a page--use
``warcprox_meta`` like so::

View File

@ -2,7 +2,7 @@
'''
setup.py - brozzler setup script
Copyright (C) 2014-2018 Internet Archive
Copyright (C) 2014-2019 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup(
name='brozzler',
version='1.5.0',
version='1.5.4',
description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler',
author='Noah Levitt',
@ -72,7 +72,7 @@ setuptools.setup(
'pillow>=5.2.0',
'urlcanon>=0.1.dev23',
'doublethink>=0.2.0',
'rethinkdb>=2.3',
'rethinkdb>=2.3,<2.4',
'cerberus>=1.0.1',
'jinja2>=2.10',
'cryptography>=2.3',
@ -98,6 +98,7 @@ setuptools.setup(
'Programming Language :: Python :: 3.4',
'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
'Topic :: Internet :: WWW/HTTP',
'Topic :: System :: Archiving',
])

View File

@ -151,7 +151,7 @@ def test_robots_connection_failure():
assert brozzler.is_permitted_by_robots(site, url)
def test_scoping():
test_scope = yaml.load('''
test_scope = yaml.safe_load('''
max_hops: 100
accepts:
- url_match: REGEX_MATCH