diff --git a/.travis.yml b/.travis.yml
index 59bd04d..318ad3b 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,8 +1,10 @@
+dist: xenial
 language: python
 python:
 - 3.4
 - 3.5
 - 3.6
+- 3.7
 sudo: required
 before_install:
 - sudo pip install --upgrade setuptools pip
diff --git a/brozzler/__init__.py b/brozzler/__init__.py
index 6321203..1e8c9fe 100644
--- a/brozzler/__init__.py
+++ b/brozzler/__init__.py
@@ -106,7 +106,7 @@ def behaviors(behaviors_dir=None):
         d = behaviors_dir or os.path.dirname(__file__)
         behaviors_yaml = os.path.join(d, 'behaviors.yaml')
         with open(behaviors_yaml) as fin:
-            _behaviors = yaml.load(fin)
+            _behaviors = yaml.safe_load(fin)
     return _behaviors
 
 def behavior_script(url, template_parameters=None, behaviors_dir=None):
diff --git a/brozzler/browser.py b/brozzler/browser.py
index 0548b1e..9fdc6a7 100644
--- a/brozzler/browser.py
+++ b/brozzler/browser.py
@@ -241,6 +241,13 @@ class WebsockReceiverThread(threading.Thread):
             elif message['method'] == 'Network.requestWillBeSent':
                 if self.on_request:
                     self.on_request(message)
+            elif message['method'] == 'Network.requestIntercepted':
+                if 'params' in message and 'authChallenge' in message['params']:
+                    auth_challenge = message['params']['authChallenge']
+                    self.logger.info('Network.requestIntercepted AuthChallenge %s %s',
+                            auth_challenge['scheme'], auth_challenge['origin'])
+                else:
+                    self.logger.info('Network.requestIntercepted non-AuthChallenge')
             elif message['method'] == 'Page.interstitialShown':
                 # AITFIVE-1529: handle http auth
                 # we should kill the browser when we receive Page.interstitialShown and
diff --git a/brozzler/model.py b/brozzler/model.py
index 9832a40..77dae70 100644
--- a/brozzler/model.py
+++ b/brozzler/model.py
@@ -2,7 +2,7 @@
 brozzler/models.py - model classes representing jobs, sites, and pages, with
 related logic
 
-Copyright (C) 2014-2018 Internet Archive
+Copyright (C) 2014-2019 Internet Archive
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -35,7 +35,7 @@ import yaml
 def load_schema():
     schema_file = os.path.join(os.path.dirname(__file__), 'job_schema.yaml')
     with open(schema_file) as f:
-        return yaml.load(f)
+        return yaml.safe_load(f)
 
 class JobValidator(cerberus.Validator):
     def _validate_type_url(self, value):
@@ -48,7 +48,7 @@ class InvalidJobConf(Exception):
 
 def validate_conf(job_conf, schema=load_schema()):
     v = JobValidator(schema)
-    if not v.validate(job_conf):
+    if not v.validate(job_conf, normalize=False):
         raise InvalidJobConf(v.errors)
 
 def merge(a, b):
@@ -68,7 +68,7 @@ def new_job_file(frontier, job_conf_file):
     '''Returns new Job.'''
     logging.info("loading %s", job_conf_file)
     with open(job_conf_file) as f:
-        job_conf = yaml.load(f)
+        job_conf = yaml.safe_load(f)
     return new_job(frontier, job_conf)
 
 def new_job(frontier, job_conf):
diff --git a/job-conf.rst b/job-conf.rst
index 5d1eb0a..fb32513 100644
--- a/job-conf.rst
+++ b/job-conf.rst
@@ -1,8 +1,8 @@
 Brozzler Job Configuration
 **************************
 
-Jobs are used to brozzle multiple seeds and/or apply settings and scope rules,
-as defined byusing YAML files. At least one seed URL must be specified.
+Jobs are used to brozzle multiple seeds and/or apply settings and scope rules,
+as defined byusing YAML files. At least one seed URL must be specified.
 All other configurartions are optional.
 
 .. contents::
@@ -43,7 +43,7 @@ How inheritance works
 
 Most of the settings that apply to seeds can also be specified at the top
 level, in which case all seeds inherit those settings. If an option is
-specified both at the top level and at the seed level, the results are merged.
+specified both at the top level and at the seed level, the results are merged.
 In cases of coflict, the seed-level value takes precedence.
 
 In the example yaml above, ``warcprox_meta`` is specified at the top level and
@@ -170,8 +170,8 @@ case they are inherited by all seeds.
 +============+==========+=========+
 | dictionary | no       | *none*  |
 +------------+----------+---------+
-Information about the crawl job or site. Could be useful for external
-descriptive or informative metadata, but not used by brozzler in the course of
+Information about the crawl job or site. Could be useful for external
+descriptive or informative metadata, but not used by brozzler in the course of
 archiving.
 
 ``time_limit``
@@ -203,8 +203,8 @@ warcprox for archival crawling.
 +=========+==========+===========+
 | boolean | no       | ``false`` |
 +---------+----------+-----------+
-If set to ``true``, brozzler will fetch pages that would otherwise be blocked
-by `robots.txt rules
+If set to ``true``, brozzler will fetch pages that would otherwise be blocked
+by `robots.txt rules
 `_.
 
 ``user_agent``
@@ -216,7 +216,7 @@ by `robots.txt rules
 +---------+----------+---------+
 The ``User-Agent`` header brozzler will send to identify itself to web servers.
 It is good ettiquette to include a project URL with a notice to webmasters that
-explains why you are crawling, how to block the crawler via robots.txt, and how
+explains why you are crawling, how to block the crawler via robots.txt, and how
 to contact the operator if the crawl is causing problems.
 
 ``warcprox_meta``
@@ -229,8 +229,8 @@ to contact the operator if the crawl is causing problems.
 Specifies the ``Warcprox-Meta`` header to send with every request, if ``proxy``
 is configured. The value of the ``Warcprox-Meta`` header is a json blob. It is
 used to pass settings and information to warcprox. Warcprox does not forward
-the header on to the remote site. For further explanation of this field and
-its uses see
+the header on to the remote site. For further explanation of this field and
+its uses see
 https://github.com/internetarchive/warcprox/blob/master/api.rst
 
 Brozzler takes the configured value of ``warcprox_meta``, converts it to
@@ -259,7 +259,7 @@ Scope specificaion for the seed. See the "Scoping" section which follows.
 
 Scoping
 =======
-The scope of a seed determines which links are scheduled for crawling ("in
+The scope of a seed determines which links are scheduled for crawling ("in
 scope") and which are not. For example::
 
     scope:
@@ -330,9 +330,9 @@ To generate the rule, brozzler canonicalizes the seed URL using the `urlcanon
 removes the query string if any, and finally serializes the result in SSURT
 [1]_ form. For example, a seed URL of
 ``https://www.EXAMPLE.com:443/foo//bar?a=b&c=d#fdiap`` becomes
-``com,example,www,//https:/foo/bar?a=b&c=d``.
+``com,example,www,//https:/foo/bar``.
 
-Brozzler derives its general approach to the seed surt from `heritrix
+Brozzler derives its general approach to the seed surt from `heritrix
 `_, but differs in a few respects.
 
 1. Unlike heritrix, brozzler does not strip the path segment after the last
@@ -347,11 +347,11 @@ Brozzler derives its general approach to the seed surt from `heritrix
    not match anything. Brozzler does no scheme munging.
 4. Brozzler identifies seed "redirects" by retrieving the URL from the
    browser's location bar at the end of brozzling the seed page, whereas
-   heritrix follows HTTP 3XX redirects. If the URL in the browser
-   location bar at the end of brozzling the seed page differs from the seed
-   URL, brozzler automatically adds a second ``accept`` rule to ensure the
-   site is in scope, as if the new URL were the original seed URL. For example,
-   if ``http://example.com/`` redirects to ``http://www.example.com/``, the
+   heritrix follows HTTP 3XX redirects. If the URL in the browser
+   location bar at the end of brozzling the seed page differs from the seed
+   URL, brozzler automatically adds a second ``accept`` rule to ensure the
+   site is in scope, as if the new URL were the original seed URL. For example,
+   if ``http://example.com/`` redirects to ``http://www.example.com/``, the
    rest of the ``www.example.com`` is in scope.
 5. Brozzler uses SSURT instead of SURT.
 6. There is currently no brozzler option to disable the automatically generated
@@ -368,7 +368,7 @@ Scope settings
 | list | no       | *none*  |
 +------+----------+---------+
 List of scope rules. If any of the rules match, the URL is within
-``max_hops`` from seed, and none of the ``block`` rules apply, then the URL is
+``max_hops`` from seed, and none of the ``block`` rules apply, then the URL is
 in scope and brozzled.
 
 ``blocks``
@@ -378,7 +378,7 @@ in scope and brozzled.
 +======+==========+=========+
 | list | no       | *none*  |
 +------+----------+---------+
-List of scope rules. If any of the rules match, then the URL is deemed out
+List of scope rules. If any of the rules match, then the URL is deemed out
 of scope and NOT brozzled.
 
 ``max_hops``
@@ -438,7 +438,7 @@ Matches if the full canonicalized URL matches a regular expression.
 +========+==========+=========+
 | string | no       | *none*  |
 +--------+----------+---------+
-Matches if the canonicalized URL in SSURT [1]_ form starts with the ``ssurt``
+Matches if the canonicalized URL in SSURT [1]_ form starts with the ``ssurt``
 value.
 
 ``surt``
@@ -448,7 +448,7 @@ value.
 +========+==========+=========+
 | string | no       | *none*  |
 +--------+----------+---------+
-Matches if the canonicalized URL in SURT [2]_ form starts with the ``surt``
+Matches if the canonicalized URL in SURT [2]_ form starts with the ``surt``
 value.
 
 ``parent_url_regex``
@@ -458,14 +458,14 @@ value.
 +========+==========+=========+
 | string | no       | *none*  |
 +--------+----------+---------+
-Matches if the full canonicalized parent URL matches a regular expression.
+Matches if the full canonicalized parent URL matches a regular expression.
 The parent URL is the URL of the page in which a link is found.
 
 Using ``warcprox_meta``
 =======================
-``warcprox_meta`` plays a very important role in brozzler job configuration.
-It sets the filenames of the WARC files created by a job. For example, if each
-seed should have a different WARC filename prefix, you might configure a job
+``warcprox_meta`` plays a very important role in brozzler job configuration.
+It sets the filenames of the WARC files created by a job. For example, if each
+seed should have a different WARC filename prefix, you might configure a job
 this way::
 
     seeds:
     - url: https://example.com/
       warcprox_meta:
         warc-prefix: seed1
     - url: https://archive.org/
       warcprox_meta:
         warc-prefix: seed2
 
-``warcprox_meta`` may also be used to limit the size of the job. For example,
-this configuration will stop the crawl after about 100 MB of novel content has
+``warcprox_meta`` may also be used to limit the size of the job. For example,
+this configuration will stop the crawl after about 100 MB of novel content has
 been archived::
 
     seeds:
     - url: https://example.com/
       warcprox_meta:
         warc-prefix: job1
         limits:
         - field: total.wire_bytes
           value: 100000000
           action: TEARDOWN
 
@@ -492,7 +492,7 @@ been archived::
 
 To prevent any URLs from a host from being captured, it is not sufficient to
 use a ``scope`` rule as described above. That kind of scoping only applies to
-navigational links discovered in crawled pages. To make absolutely sure that no
+navigational links discovered in crawled pages. To make absolutely sure that no
 url from a given host is fetched--not even an image embedded in a page--use
 ``warcprox_meta`` like so::
 
diff --git a/setup.py b/setup.py
index d5c54af..7139950 100755
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@
 '''
 setup.py - brozzler setup script
 
-Copyright (C) 2014-2018 Internet Archive
+Copyright (C) 2014-2019 Internet Archive
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -32,7 +32,7 @@ def find_package_data(package):
 
 setuptools.setup(
         name='brozzler',
-        version='1.5.0',
+        version='1.5.4',
         description='Distributed web crawling with browsers',
         url='https://github.com/internetarchive/brozzler',
         author='Noah Levitt',
@@ -72,7 +72,7 @@ setuptools.setup(
             'pillow>=5.2.0',
             'urlcanon>=0.1.dev23',
             'doublethink>=0.2.0',
-            'rethinkdb>=2.3',
+            'rethinkdb>=2.3,<2.4',
             'cerberus>=1.0.1',
             'jinja2>=2.10',
             'cryptography>=2.3',
@@ -98,6 +98,7 @@ setuptools.setup(
             'Programming Language :: Python :: 3.4',
             'Programming Language :: Python :: 3.5',
             'Programming Language :: Python :: 3.6',
+            'Programming Language :: Python :: 3.7',
             'Topic :: Internet :: WWW/HTTP',
             'Topic :: System :: Archiving',
         ])
diff --git a/tests/test_units.py b/tests/test_units.py
index 1d62bc6..88e4450 100644
--- a/tests/test_units.py
+++ b/tests/test_units.py
@@ -151,7 +151,7 @@ def test_robots_connection_failure():
     assert brozzler.is_permitted_by_robots(site, url)
 
 def test_scoping():
-    test_scope = yaml.load('''
+    test_scope = yaml.safe_load('''
 max_hops: 100
 accepts:
 - url_match: REGEX_MATCH
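A quick, hypothetical illustration of the recurring change in this patch (``yaml.safe_load`` in place of ``yaml.load``, and ``normalize=False`` passed to the cerberus validator inside ``validate_conf``): loading and validating a small job configuration with the updated ``brozzler.model`` helpers. This is a sketch, not part of the patch; the YAML snippet is made up, based on the seed examples in job-conf.rst::

    # Sketch only: parse a job configuration with yaml.safe_load() and check it
    # with brozzler.model.validate_conf(), which now runs the cerberus validator
    # with normalize=False. The YAML content below is a hypothetical example.
    import yaml
    import brozzler.model

    job_conf = yaml.safe_load('''
    seeds:
    - url: https://example.com/
      warcprox_meta:
        warc-prefix: seed1
    ''')

    try:
        brozzler.model.validate_conf(job_conf)
        print('job configuration is valid')
    except brozzler.model.InvalidJobConf as e:
        print('invalid job configuration: %s' % e)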
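Similarly, job-conf.rst describes the ``Warcprox-Meta`` header as a JSON blob built from the configured ``warcprox_meta`` dictionary and sent with every proxied request. The following minimal sketch only illustrates that relationship; the dictionary value is taken from the warc-prefix example above, and the actual header assembly inside brozzler and warcprox may differ::

    # Sketch only: the Warcprox-Meta request header carries the configured
    # warcprox_meta dictionary serialized as JSON; warcprox interprets it and
    # does not forward it to the remote site.
    import json

    warcprox_meta = {'warc-prefix': 'seed1'}
    headers = {'Warcprox-Meta': json.dumps(warcprox_meta)}
    print(headers)  # {'Warcprox-Meta': '{"warc-prefix": "seed1"}'}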