mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-20 23:56:34 -04:00
Merge branch 'master' into qa
* master:
  - pass behavior_parameters from job configuration into Site objects
  - add --behavior-parameters argument to brozzler-new-site
  - fix bug in final_bounces (not sure what I was thinking)
  - restore accidentally removed functionality handling page redirects and friends
  - cat logs on travis-ci failure
  - reppy 0.4.1 has a significantly different api apparently, so for now let's go back to 0.3.4
  - still trying to get installation of pip to work on travis-ci
  - update for reppy api change and pin to current version of reppy
  - tweaks to ansible config to try to get the deployment to run on travis-ci
This commit is contained in:
commit
0eb07c9ca2
@ -12,3 +12,8 @@ install:
|
||||
- pip install $TRAVIS_BUILD_DIR pytest
|
||||
script:
|
||||
- py.test -v -s tests
|
||||
after_failure:
|
||||
- sudo cat /var/log/upstart/warcprox.log
|
||||
- sudo cat /var/log/upstart/brozzler-worker.log
|
||||
- sudo cat /var/log/upstart/pywb.log
|
||||
|
||||
|
@ -31,6 +31,7 @@
|
||||
- libjpeg-turbo8-dev
|
||||
- zlib1g-dev
|
||||
- gcc
|
||||
- g++
|
||||
- libpython3.4-dev
|
||||
- adobe-flashplugin
|
||||
- name: install Xvnc upstart config /etc/init/Xvnc.conf
|
||||
|
@ -8,16 +8,28 @@
|
||||
with_items:
|
||||
- python-setuptools
|
||||
- python3-setuptools
|
||||
- name: download pip-8.1.2.tar.gz
|
||||
- name: download pip-9.0.1.tar.gz
|
||||
get_url:
|
||||
url: https://pypi.python.org/packages/e7/a8/7556133689add8d1a54c0b14aeff0acb03c64707ce100ecd53934da1aa13/pip-8.1.2.tar.gz
|
||||
url: https://pypi.python.org/packages/11/b6/abcb525026a4be042b486df43905d6893fb04f05aac21c32c638e939e447/pip-9.0.1.tar.gz
|
||||
dest: /tmp
|
||||
checksum: sha1:1c13c247967ec5bee6de5fd104c5d78ba30951c7
|
||||
- name: extract pip-8.1.2.tar.gz
|
||||
unarchive: src=/tmp/pip-8.1.2.tar.gz dest=/tmp copy=no
|
||||
- name: run "python3 setup.py install" in /tmp/pip-8.1.2
|
||||
command: python3 setup.py install chdir=/tmp/pip-8.1.2
|
||||
creates=/usr/local/lib/python2.7/dist-packages/pip-8.1.2-py2.7.egg/pip/__init__.py
|
||||
checksum: sha1:57ff41e99cb01b6a1c2b0999161589b726f0ec8b
|
||||
- name: extract pip-9.0.1.tar.gz
|
||||
unarchive: src=/tmp/pip-9.0.1.tar.gz dest=/tmp copy=no
|
||||
|
||||
# this clause is a workaround for travis-ci, which only wants to install in /usr
|
||||
# see https://travis-ci.org/internetarchive/brozzler/builds/174338601
|
||||
# but it complains that /usr/lib/python3.4/site-packages doesn't exist
|
||||
# see https://travis-ci.org/internetarchive/brozzler/builds/174094831
|
||||
- file: path={{item}} state=directory
|
||||
with_items:
|
||||
- /usr/lib/python3.4/site-packages
|
||||
- /usr/lib/python3.4/dist-packages
|
||||
become: true
|
||||
|
||||
- name: run "python3 setup.py install" in /tmp/pip-9.0.1
|
||||
command: python3 setup.py install
|
||||
chdir=/tmp/pip-9.0.1
|
||||
creates=/usr/local/lib/python3.4/dist-packages/pip-9.0.1-py3.4.egg/pip/__init__.py
|
||||
become: true
|
||||
- name: run "pip install virtualenv"
|
||||
command: pip install virtualenv
|
||||
|
@ -344,8 +344,6 @@ __brzl_compileOutlinks(window).join(' ');
|
||||
def callback(message):
|
||||
if timer:
|
||||
timer.cancel()
|
||||
if message["id"] in self._waiting_on_result_messages:
|
||||
del self._waiting_on_result_messages[message["id"]]
|
||||
if "callback" in chain[0]:
|
||||
chain[0]["callback"](message)
|
||||
self._chain_chrome_messages(chain[1:])
|
||||
@ -506,12 +504,19 @@ __brzl_compileOutlinks(window).join(' ');
|
||||
self.on_response(message)
|
||||
|
||||
def _page_load_event_fired(self, message):
|
||||
def page_url_after_load_event(message):
|
||||
if message["result"]["result"]["value"] != self.url:
|
||||
if self.on_url_change:
|
||||
self.on_url_change(message["result"]["result"]["value"])
|
||||
msg_id = self.send_to_chrome(
|
||||
method="Runtime.evaluate",
|
||||
params={"expression":"document.URL"})
|
||||
self._waiting_on_result_messages[msg_id] = page_url_after_load_event
|
||||
|
||||
self.logger.info("Page.loadEventFired, moving on to starting behaviors url={}".format(self.url))
|
||||
self._behavior = Behavior(self.url, self)
|
||||
self._behavior.start(self.behavior_parameters)
|
||||
|
||||
self._waiting_on_document_url_msg_id = self.send_to_chrome(method="Runtime.evaluate", params={"expression":"document.URL"})
|
||||
|
||||
def _console_message_added(self, message):
|
||||
self.logger.debug("%s console.%s %s", self._websock.url,
|
||||
message["params"]["message"]["level"],
|
||||
@ -553,6 +558,7 @@ __brzl_compileOutlinks(window).join(' ');
|
||||
elif "result" in message:
|
||||
if message["id"] in self._waiting_on_result_messages:
|
||||
callback = self._waiting_on_result_messages[message["id"]]
|
||||
del self._waiting_on_result_messages[message["id"]]
|
||||
self.logger.debug(
|
||||
"received result for message id=%s, calling %s",
|
||||
message["id"], callback)
|
||||
|
@ -224,6 +224,12 @@ def brozzler_new_site():
|
||||
'Warcprox-Meta http request header to send with each request; '
|
||||
'must be a json blob, ignored unless warcprox features are '
|
||||
'enabled'))
|
||||
arg_parser.add_argument(
|
||||
'--behavior-parameters', dest='behavior_parameters',
|
||||
default=None, help=(
|
||||
'json blob of parameters to populate the javascript behavior '
|
||||
'template, e.g. {"parameter_username":"x",'
|
||||
'"parameter_password":"y"}'))
|
||||
_add_common_options(arg_parser)
|
||||
|
||||
args = arg_parser.parse_args(args=sys.argv[1:])
|
||||
@ -234,8 +240,10 @@ def brozzler_new_site():
|
||||
time_limit=int(args.time_limit) if args.time_limit else None,
|
||||
ignore_robots=args.ignore_robots,
|
||||
enable_warcprox_features=args.enable_warcprox_features,
|
||||
warcprox_meta=(
|
||||
json.loads(args.warcprox_meta) if args.warcprox_meta else None))
|
||||
warcprox_meta=json.loads(
|
||||
args.warcprox_meta) if args.warcprox_meta else None,
|
||||
behavior_parameters=json.loads(
|
||||
args.behavior_parameters) if args.behavior_parameters else None)
|
||||
|
||||
r = rethinkstuff.Rethinker(
|
||||
args.rethinkdb_servers.split(","), args.rethinkdb_db)
|
||||
|
@ -88,7 +88,8 @@ def new_job(frontier, job_conf):
|
||||
warcprox_meta=merged_conf.get("warcprox_meta"),
|
||||
metadata=merged_conf.get("metadata"),
|
||||
remember_outlinks=merged_conf.get("remember_outlinks"),
|
||||
user_agent=merged_conf.get("user_agent"))
|
||||
user_agent=merged_conf.get("user_agent"),
|
||||
behavior_parameters=merged_conf.get("behavior_parameters"))
|
||||
sites.append(site)
|
||||
|
||||
# insert all the sites into database before the job
|
||||
|
@ -1,20 +1,20 @@
|
||||
#
|
||||
# brozzler/robots.py - robots.txt support
|
||||
#
|
||||
# Copyright (C) 2014-2016 Internet Archive
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
'''
|
||||
brozzler/robots.py - robots.txt support
|
||||
|
||||
Copyright (C) 2014-2016 Internet Archive
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
'''
|
||||
|
||||
import json
|
||||
import logging
|
||||
|
@ -48,6 +48,7 @@ class ExtraHeaderAdder(urllib.request.BaseHandler):
|
||||
|
||||
class YoutubeDLSpy(urllib.request.BaseHandler):
|
||||
Transaction = collections.namedtuple('Transaction', ['request', 'response'])
|
||||
logger = logging.getLogger(__module__ + "." + __qualname__)
|
||||
|
||||
def __init__(self):
|
||||
self.reset()
|
||||
@ -62,17 +63,17 @@ class YoutubeDLSpy(urllib.request.BaseHandler):
|
||||
self.transactions = []
|
||||
|
||||
def final_bounces(self, url):
|
||||
"""Resolves redirect chains in self.transactions, returns a list of
|
||||
"""
|
||||
Resolves redirect chains in self.transactions, returns a list of
|
||||
Transaction representing the final redirect destinations of the given
|
||||
url. There could be more than one if for example youtube-dl hit the
|
||||
same url with HEAD and then GET requests."""
|
||||
same url with HEAD and then GET requests.
|
||||
"""
|
||||
redirects = {}
|
||||
for txn in self.transactions:
|
||||
# XXX check http status 301,302,303,307? check for "uri" header
|
||||
# as well as "location"? see urllib.request.HTTPRedirectHandler
|
||||
if ((txn.request.full_url == url
|
||||
or txn.request.full_url in redirects)
|
||||
and 'location' in txn.response.headers):
|
||||
if 'location' in txn.response.headers:
|
||||
redirects[txn.request.full_url] = txn
|
||||
|
||||
final_url = url
|
||||
|
4
setup.py
4
setup.py
@ -32,7 +32,7 @@ def find_package_data(package):
|
||||
|
||||
setuptools.setup(
|
||||
name='brozzler',
|
||||
version='1.1b7.dev113',
|
||||
version='1.1b7.dev121',
|
||||
description='Distributed web crawling with browsers',
|
||||
url='https://github.com/internetarchive/brozzler',
|
||||
author='Noah Levitt',
|
||||
@ -59,7 +59,7 @@ setuptools.setup(
|
||||
install_requires=[
|
||||
'PyYAML',
|
||||
'youtube-dl',
|
||||
'reppy',
|
||||
'reppy==0.3.4',
|
||||
'requests',
|
||||
'websocket-client',
|
||||
'pillow==3.3.0',
|
||||
|
Loading…
x
Reference in New Issue
Block a user