Merge branch 'master' into qa

* master:
  pass behavior_parameters from job configuration into Site objects
  add --behavior-parameters argument to brozzler-new-site
  fix bug in final_bounces (not sure what I was thinking)
  restore accidentally removed functionality handling page redirects and friends
  cat logs on travis-ci failure
  reppy 0.4.1 has a significantly different api apparently, so for now let's go back to 0.3.4
  still trying to get installation of pip to work on travis-ci
  update for reppy api change and pin to current version of reppy
  tweaks to ansible config to try to get the deployment to run on travis-ci
This commit is contained in:
Noah Levitt 2016-11-09 13:43:24 -08:00
commit 0eb07c9ca2
9 changed files with 73 additions and 39 deletions

View File

@ -12,3 +12,8 @@ install:
- pip install $TRAVIS_BUILD_DIR pytest
script:
- py.test -v -s tests
after_failure:
- sudo cat /var/log/upstart/warcprox.log
- sudo cat /var/log/upstart/brozzler-worker.log
- sudo cat /var/log/upstart/pywb.log

View File

@ -31,6 +31,7 @@
- libjpeg-turbo8-dev
- zlib1g-dev
- gcc
- g++
- libpython3.4-dev
- adobe-flashplugin
- name: install Xvnc upstart config /etc/init/Xvnc.conf

View File

@ -8,16 +8,28 @@
with_items:
- python-setuptools
- python3-setuptools
- name: download pip-8.1.2.tar.gz
- name: download pip-9.0.1.tar.gz
get_url:
url: https://pypi.python.org/packages/e7/a8/7556133689add8d1a54c0b14aeff0acb03c64707ce100ecd53934da1aa13/pip-8.1.2.tar.gz
url: https://pypi.python.org/packages/11/b6/abcb525026a4be042b486df43905d6893fb04f05aac21c32c638e939e447/pip-9.0.1.tar.gz
dest: /tmp
checksum: sha1:1c13c247967ec5bee6de5fd104c5d78ba30951c7
- name: extract pip-8.1.2.tar.gz
unarchive: src=/tmp/pip-8.1.2.tar.gz dest=/tmp copy=no
- name: run "python3 setup.py install" in /tmp/pip-8.1.2
command: python3 setup.py install chdir=/tmp/pip-8.1.2
creates=/usr/local/lib/python2.7/dist-packages/pip-8.1.2-py2.7.egg/pip/__init__.py
checksum: sha1:57ff41e99cb01b6a1c2b0999161589b726f0ec8b
- name: extract pip-9.0.1.tar.gz
unarchive: src=/tmp/pip-9.0.1.tar.gz dest=/tmp copy=no
# this clause is a workaround for travis-ci, which only wants to install in /usr
# see https://travis-ci.org/internetarchive/brozzler/builds/174338601
# but it complains that /usr/lib/python3.4/site-packages doesn't exist
# see https://travis-ci.org/internetarchive/brozzler/builds/174094831
- file: path={{item}} state=directory
with_items:
- /usr/lib/python3.4/site-packages
- /usr/lib/python3.4/dist-packages
become: true
- name: run "python3 setup.py install" in /tmp/pip-9.0.1
command: python3 setup.py install
chdir=/tmp/pip-9.0.1
creates=/usr/local/lib/python3.4/dist-packages/pip-9.0.1-py3.4.egg/pip/__init__.py
become: true
- name: run "pip install virtualenv"
command: pip install virtualenv

View File

@ -344,8 +344,6 @@ __brzl_compileOutlinks(window).join(' ');
def callback(message):
if timer:
timer.cancel()
if message["id"] in self._waiting_on_result_messages:
del self._waiting_on_result_messages[message["id"]]
if "callback" in chain[0]:
chain[0]["callback"](message)
self._chain_chrome_messages(chain[1:])
@ -506,12 +504,19 @@ __brzl_compileOutlinks(window).join(' ');
self.on_response(message)
def _page_load_event_fired(self, message):
def page_url_after_load_event(message):
if message["result"]["result"]["value"] != self.url:
if self.on_url_change:
self.on_url_change(message["result"]["result"]["value"])
msg_id = self.send_to_chrome(
method="Runtime.evaluate",
params={"expression":"document.URL"})
self._waiting_on_result_messages[msg_id] = page_url_after_load_event
self.logger.info("Page.loadEventFired, moving on to starting behaviors url={}".format(self.url))
self._behavior = Behavior(self.url, self)
self._behavior.start(self.behavior_parameters)
self._waiting_on_document_url_msg_id = self.send_to_chrome(method="Runtime.evaluate", params={"expression":"document.URL"})
def _console_message_added(self, message):
self.logger.debug("%s console.%s %s", self._websock.url,
message["params"]["message"]["level"],
@ -553,6 +558,7 @@ __brzl_compileOutlinks(window).join(' ');
elif "result" in message:
if message["id"] in self._waiting_on_result_messages:
callback = self._waiting_on_result_messages[message["id"]]
del self._waiting_on_result_messages[message["id"]]
self.logger.debug(
"received result for message id=%s, calling %s",
message["id"], callback)

View File

@ -224,6 +224,12 @@ def brozzler_new_site():
'Warcprox-Meta http request header to send with each request; '
'must be a json blob, ignored unless warcprox features are '
'enabled'))
arg_parser.add_argument(
'--behavior-parameters', dest='behavior_parameters',
default=None, help=(
'json blob of parameters to populate the javascript behavior '
'template, e.g. {"parameter_username":"x",'
'"parameter_password":"y"}'))
_add_common_options(arg_parser)
args = arg_parser.parse_args(args=sys.argv[1:])
@ -234,8 +240,10 @@ def brozzler_new_site():
time_limit=int(args.time_limit) if args.time_limit else None,
ignore_robots=args.ignore_robots,
enable_warcprox_features=args.enable_warcprox_features,
warcprox_meta=(
json.loads(args.warcprox_meta) if args.warcprox_meta else None))
warcprox_meta=json.loads(
args.warcprox_meta) if args.warcprox_meta else None,
behavior_parameters=json.loads(
args.behavior_parameters) if args.behavior_parameters else None)
r = rethinkstuff.Rethinker(
args.rethinkdb_servers.split(","), args.rethinkdb_db)

View File

@ -88,7 +88,8 @@ def new_job(frontier, job_conf):
warcprox_meta=merged_conf.get("warcprox_meta"),
metadata=merged_conf.get("metadata"),
remember_outlinks=merged_conf.get("remember_outlinks"),
user_agent=merged_conf.get("user_agent"))
user_agent=merged_conf.get("user_agent"),
behavior_parameters=merged_conf.get("behavior_parameters"))
sites.append(site)
# insert all the sites into database before the job

View File

@ -1,20 +1,20 @@
#
# brozzler/robots.py - robots.txt support
#
# Copyright (C) 2014-2016 Internet Archive
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
'''
brozzler/robots.py - robots.txt support
Copyright (C) 2014-2016 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
import json
import logging

View File

@ -48,6 +48,7 @@ class ExtraHeaderAdder(urllib.request.BaseHandler):
class YoutubeDLSpy(urllib.request.BaseHandler):
Transaction = collections.namedtuple('Transaction', ['request', 'response'])
logger = logging.getLogger(__module__ + "." + __qualname__)
def __init__(self):
self.reset()
@ -62,17 +63,17 @@ class YoutubeDLSpy(urllib.request.BaseHandler):
self.transactions = []
def final_bounces(self, url):
"""Resolves redirect chains in self.transactions, returns a list of
"""
Resolves redirect chains in self.transactions, returns a list of
Transaction representing the final redirect destinations of the given
url. There could be more than one if for example youtube-dl hit the
same url with HEAD and then GET requests."""
same url with HEAD and then GET requests.
"""
redirects = {}
for txn in self.transactions:
# XXX check http status 301,302,303,307? check for "uri" header
# as well as "location"? see urllib.request.HTTPRedirectHandler
if ((txn.request.full_url == url
or txn.request.full_url in redirects)
and 'location' in txn.response.headers):
if 'location' in txn.response.headers:
redirects[txn.request.full_url] = txn
final_url = url

View File

@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup(
name='brozzler',
version='1.1b7.dev113',
version='1.1b7.dev121',
description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler',
author='Noah Levitt',
@ -59,7 +59,7 @@ setuptools.setup(
install_requires=[
'PyYAML',
'youtube-dl',
'reppy',
'reppy==0.3.4',
'requests',
'websocket-client',
'pillow==3.3.0',