Merge branch 'master' into qa

* master:
  fix exception happening now that we have binary data in rethinkdb (the cookie db) "TypeError: <binary, 7168 bytes, '53 51 4c 69 74 65...'> is not JSON serializable"
  dev version number again
  another version for pypi
  avoid "Uncaught RangeError: Maximum call stack size exceeded" compiling outlinks
  back to a dev version number
  bump version to 1.1b4 for pypi upload
  logging tweak
  install brozzler.webconsole package
This commit is contained in:
Noah Levitt 2016-08-05 17:13:09 -07:00
commit 10c90431e6
3 changed files with 35 additions and 12 deletions

View file

@ -301,17 +301,19 @@ class Browser:
self._behavior = None self._behavior = None
OUTLINKS_JS = """ OUTLINKS_JS = """
var compileOutlinks = function(frame) { var __brzl_framesDone = new Set();
var __brzl_compileOutlinks = function(frame) {
__brzl_framesDone.add(frame);
var outlinks = Array.prototype.slice.call( var outlinks = Array.prototype.slice.call(
frame.document.querySelectorAll('a[href]')); frame.document.querySelectorAll('a[href]'));
for (var i = 0; i < frame.frames.length; i++) { for (var i = 0; i < frame.frames.length; i++) {
if (frame.frames[i]) { // sometimes undefined (why?) if (frame.frames[i] && !__brzl_framesDone.has(frame.frames[i])) {
outlinks = outlinks.concat(compileOutlinks(frame.frames[i])); outlinks = outlinks.concat(__brzl_compileOutlinks(frame.frames[i]));
} }
} }
return outlinks; return outlinks;
} }
compileOutlinks(window).join(' '); __brzl_compileOutlinks(window).join(' ');
""" """
def _chain_chrome_messages(self, chain): def _chain_chrome_messages(self, chain):
@ -540,7 +542,7 @@ compileOutlinks(window).join(' ');
elif "result" in message: elif "result" in message:
if message["id"] in self._waiting_on_result_messages: if message["id"] in self._waiting_on_result_messages:
callback = self._waiting_on_result_messages[message["id"]] callback = self._waiting_on_result_messages[message["id"]]
self.logger.info( self.logger.debug(
"received result for message id=%s, calling %s", "received result for message id=%s, calling %s",
message["id"], callback) message["id"], callback)
callback(message) callback(message)

View file

@ -34,6 +34,7 @@ import os
import importlib import importlib
import rethinkdb import rethinkdb
import yaml import yaml
import base64
# flask does its own logging config # flask does its own logging config
# logging.basicConfig( # logging.basicConfig(
@ -120,8 +121,10 @@ def page_yaml(page_id):
@app.route("/api/sites/<site_id>") @app.route("/api/sites/<site_id>")
@app.route("/api/site/<site_id>") @app.route("/api/site/<site_id>")
def site(site_id): def site(site_id):
site_ = r.table("sites").get(site_id).run() s = r.table("sites").get(site_id).run()
return flask.jsonify(site_) if "cookie_db" in s:
s["cookie_db"] = base64.b64encode(s["cookie_db"]).decode("ascii")
return flask.jsonify(s)
@app.route("/api/sites/<site_id>/yaml") @app.route("/api/sites/<site_id>/yaml")
@app.route("/api/site/<site_id>/yaml") @app.route("/api/site/<site_id>/yaml")
@ -139,8 +142,12 @@ def stats(bucket):
@app.route("/api/jobs/<int:job_id>/sites") @app.route("/api/jobs/<int:job_id>/sites")
@app.route("/api/job/<int:job_id>/sites") @app.route("/api/job/<int:job_id>/sites")
def sites(job_id): def sites(job_id):
sites_ = r.table("sites").get_all(job_id, index="job_id").run() sites_ = list(r.table("sites").get_all(job_id, index="job_id").run())
return flask.jsonify(sites=list(sites_)) # TypeError: <binary, 7168 bytes, '53 51 4c 69 74 65...'> is not JSON serializable
for s in sites_:
if "cookie_db" in s:
s["cookie_db"] = base64.b64encode(s["cookie_db"]).decode("ascii")
return flask.jsonify(sites=sites_)
@app.route("/api/jobs/<int:job_id>") @app.route("/api/jobs/<int:job_id>")
@app.route("/api/job/<int:job_id>") @app.route("/api/job/<int:job_id>")

View file

@ -18,18 +18,32 @@ limitations under the License.
''' '''
import setuptools import setuptools
import os
def find_package_data(package):
    """Collect data files that live under a package's directory tree.

    Walks the directory corresponding to the dotted name ``package``
    (resolved relative to the current working directory) and gathers every
    file found in a directory that does not contain an ``__init__.py``.
    Directories that do contain ``__init__.py`` are treated as python
    packages and their own files are left out, but the walk still descends
    into them.

    Args:
        package: dotted package name, e.g. ``'brozzler.webconsole'``.

    Returns:
        List of file paths relative to the package directory, in the form
        expected for a ``package_data`` entry.
    """
    pkg_root = os.path.join(*package.split('.'))
    pkg_data = []
    for dirpath, dirnames, filenames in os.walk(pkg_root):
        if os.path.exists(os.path.join(dirpath, '__init__.py')):
            # a python package directory, not a data directory
            continue
        relpath = os.path.relpath(dirpath, pkg_root)
        if relpath == os.curdir:
            # The package root itself has no __init__.py; its files are
            # already relative to the package, so there is no prefix to
            # join (joining zero path components raises TypeError).
            pkg_data.extend(filenames)
        else:
            pkg_data.extend(os.path.join(relpath, f) for f in filenames)
    return pkg_data
setuptools.setup( setuptools.setup(
name='brozzler', name='brozzler',
version='1.1b4.dev63', version='1.1b6.dev69',
description='Distributed web crawling with browsers', description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler', url='https://github.com/internetarchive/brozzler',
author='Noah Levitt', author='Noah Levitt',
author_email='nlevitt@archive.org', author_email='nlevitt@archive.org',
long_description=open('README.rst', encoding='UTF-8').read(), long_description=open('README.rst', encoding='UTF-8').read(),
license='Apache License 2.0', license='Apache License 2.0',
packages=['brozzler'], packages=['brozzler', 'brozzler.webconsole'],
package_data={'brozzler': ['behaviors.d/*.js*', 'behaviors.yaml']}, package_data={
'brozzler': ['behaviors.d/*.js*', 'behaviors.yaml'],
'brozzler.webconsole': find_package_data('brozzler.webconsole'),
},
entry_points={ entry_points={
'console_scripts': [ 'console_scripts': [
'brozzle-page=brozzler.cli:brozzle_page', 'brozzle-page=brozzler.cli:brozzle_page',