mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-08-08 06:22:23 -04:00
Merge branch 'requestIntercepted' into qa
This commit is contained in:
commit
533a5e74ee
7 changed files with 48 additions and 38 deletions
|
@@ -1,8 +1,10 @@
|
|||
dist: xenial
|
||||
language: python
|
||||
python:
|
||||
- 3.4
|
||||
- 3.5
|
||||
- 3.6
|
||||
- 3.7
|
||||
sudo: required
|
||||
before_install:
|
||||
- sudo pip install --upgrade setuptools pip
|
||||
|
|
|
@@ -106,7 +106,7 @@ def behaviors(behaviors_dir=None):
|
|||
d = behaviors_dir or os.path.dirname(__file__)
|
||||
behaviors_yaml = os.path.join(d, 'behaviors.yaml')
|
||||
with open(behaviors_yaml) as fin:
|
||||
_behaviors = yaml.load(fin)
|
||||
_behaviors = yaml.safe_load(fin)
|
||||
return _behaviors
|
||||
|
||||
def behavior_script(url, template_parameters=None, behaviors_dir=None):
|
||||
|
|
|
@@ -241,6 +241,13 @@ class WebsockReceiverThread(threading.Thread):
|
|||
elif message['method'] == 'Network.requestWillBeSent':
|
||||
if self.on_request:
|
||||
self.on_request(message)
|
||||
elif message['method'] == 'Network.requestIntercepted':
|
||||
if 'params' in message and 'authChallenge' in message['params']:
|
||||
auth_challenge = message['params']['authChallenge']
|
||||
self.logger.info('Network.requestIntercepted AuthChallenge %s %s',
|
||||
auth_challenge['scheme'], auth_challenge['origin'])
|
||||
else:
|
||||
self.logger.info('Network.requestIntercepted non-AuthChallenge')
|
||||
elif message['method'] == 'Page.interstitialShown':
|
||||
# AITFIVE-1529: handle http auth
|
||||
# we should kill the browser when we receive Page.interstitialShown and
|
||||
|
|
|
@@ -2,7 +2,7 @@
|
|||
brozzler/models.py - model classes representing jobs, sites, and pages, with
|
||||
related logic
|
||||
|
||||
Copyright (C) 2014-2018 Internet Archive
|
||||
Copyright (C) 2014-2019 Internet Archive
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
|
@@ -35,7 +35,7 @@ import yaml
|
|||
def load_schema():
|
||||
schema_file = os.path.join(os.path.dirname(__file__), 'job_schema.yaml')
|
||||
with open(schema_file) as f:
|
||||
return yaml.load(f)
|
||||
return yaml.safe_load(f)
|
||||
|
||||
class JobValidator(cerberus.Validator):
|
||||
def _validate_type_url(self, value):
|
||||
|
@@ -48,7 +48,7 @@ class InvalidJobConf(Exception):
|
|||
|
||||
def validate_conf(job_conf, schema=load_schema()):
|
||||
v = JobValidator(schema)
|
||||
if not v.validate(job_conf):
|
||||
if not v.validate(job_conf, normalize=False):
|
||||
raise InvalidJobConf(v.errors)
|
||||
|
||||
def merge(a, b):
|
||||
|
@@ -68,7 +68,7 @@ def new_job_file(frontier, job_conf_file):
|
|||
'''Returns new Job.'''
|
||||
logging.info("loading %s", job_conf_file)
|
||||
with open(job_conf_file) as f:
|
||||
job_conf = yaml.load(f)
|
||||
job_conf = yaml.safe_load(f)
|
||||
return new_job(frontier, job_conf)
|
||||
|
||||
def new_job(frontier, job_conf):
|
||||
|
|
|
@@ -330,7 +330,7 @@ To generate the rule, brozzler canonicalizes the seed URL using the `urlcanon
|
|||
removes the query string if any, and finally serializes the result in SSURT
|
||||
[1]_ form. For example, a seed URL of
|
||||
``https://www.EXAMPLE.com:443/foo//bar?a=b&c=d#fdiap`` becomes
|
||||
``com,example,www,//https:/foo/bar?a=b&c=d``.
|
||||
``com,example,www,//https:/foo/bar``.
|
||||
|
||||
Brozzler derives its general approach to the seed surt from `heritrix
|
||||
<https://github.com/internetarchive/heritrix3>`_, but differs in a few respects.
|
||||
|
|
7
setup.py
7
setup.py
|
@@ -2,7 +2,7 @@
|
|||
'''
|
||||
setup.py - brozzler setup script
|
||||
|
||||
Copyright (C) 2014-2018 Internet Archive
|
||||
Copyright (C) 2014-2019 Internet Archive
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
|
@@ -32,7 +32,7 @@ def find_package_data(package):
|
|||
|
||||
setuptools.setup(
|
||||
name='brozzler',
|
||||
version='1.5.0',
|
||||
version='1.5.4',
|
||||
description='Distributed web crawling with browsers',
|
||||
url='https://github.com/internetarchive/brozzler',
|
||||
author='Noah Levitt',
|
||||
|
@@ -72,7 +72,7 @@ setuptools.setup(
|
|||
'pillow>=5.2.0',
|
||||
'urlcanon>=0.1.dev23',
|
||||
'doublethink>=0.2.0',
|
||||
'rethinkdb>=2.3',
|
||||
'rethinkdb>=2.3,<2.4',
|
||||
'cerberus>=1.0.1',
|
||||
'jinja2>=2.10',
|
||||
'cryptography>=2.3',
|
||||
|
@@ -98,6 +98,7 @@ setuptools.setup(
|
|||
'Programming Language :: Python :: 3.4',
|
||||
'Programming Language :: Python :: 3.5',
|
||||
'Programming Language :: Python :: 3.6',
|
||||
'Programming Language :: Python :: 3.7',
|
||||
'Topic :: Internet :: WWW/HTTP',
|
||||
'Topic :: System :: Archiving',
|
||||
])
|
||||
|
|
|
@@ -151,7 +151,7 @@ def test_robots_connection_failure():
|
|||
assert brozzler.is_permitted_by_robots(site, url)
|
||||
|
||||
def test_scoping():
|
||||
test_scope = yaml.load('''
|
||||
test_scope = yaml.safe_load('''
|
||||
max_hops: 100
|
||||
accepts:
|
||||
- url_match: REGEX_MATCH
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue