Merge branch 'master' into qa

* master:
  fix exception happening now that we have binary data in rethinkdb (the cookie db) "TypeError: <binary, 7168 bytes, '53 51 4c 69 74 65...'> is not JSON serializable"
  dev version number again
  another version for pypi
  avoid "Uncaught RangeError: Maximum call stack size exceeded" compiling outlinks
  back to a dev version number
  bump version to 1.1b4 for pypi upload
  logging tweak
  install brozzler.webconsole package
This commit is contained in:
Noah Levitt 2016-08-05 17:13:09 -07:00
commit 10c90431e6
3 changed files with 35 additions and 12 deletions

View File

@ -301,17 +301,19 @@ class Browser:
self._behavior = None
OUTLINKS_JS = """
var compileOutlinks = function(frame) {
var __brzl_framesDone = new Set();
var __brzl_compileOutlinks = function(frame) {
__brzl_framesDone.add(frame);
var outlinks = Array.prototype.slice.call(
frame.document.querySelectorAll('a[href]'));
for (var i = 0; i < frame.frames.length; i++) {
if (frame.frames[i]) { // sometimes undefined (why?)
outlinks = outlinks.concat(compileOutlinks(frame.frames[i]));
if (frame.frames[i] && !__brzl_framesDone.has(frame.frames[i])) {
outlinks = outlinks.concat(__brzl_compileOutlinks(frame.frames[i]));
}
}
return outlinks;
}
compileOutlinks(window).join(' ');
__brzl_compileOutlinks(window).join(' ');
"""
def _chain_chrome_messages(self, chain):
@ -540,7 +542,7 @@ compileOutlinks(window).join(' ');
elif "result" in message:
if message["id"] in self._waiting_on_result_messages:
callback = self._waiting_on_result_messages[message["id"]]
self.logger.info(
self.logger.debug(
"received result for message id=%s, calling %s",
message["id"], callback)
callback(message)

View File

@ -34,6 +34,7 @@ import os
import importlib
import rethinkdb
import yaml
import base64
# flask does its own logging config
# logging.basicConfig(
@ -120,8 +121,10 @@ def page_yaml(page_id):
@app.route("/api/sites/<site_id>")
@app.route("/api/site/<site_id>")
def site(site_id):
site_ = r.table("sites").get(site_id).run()
return flask.jsonify(site_)
s = r.table("sites").get(site_id).run()
if "cookie_db" in s:
s["cookie_db"] = base64.b64encode(s["cookie_db"]).decode("ascii")
return flask.jsonify(s)
@app.route("/api/sites/<site_id>/yaml")
@app.route("/api/site/<site_id>/yaml")
@ -139,8 +142,12 @@ def stats(bucket):
@app.route("/api/jobs/<int:job_id>/sites")
@app.route("/api/job/<int:job_id>/sites")
def sites(job_id):
sites_ = r.table("sites").get_all(job_id, index="job_id").run()
return flask.jsonify(sites=list(sites_))
sites_ = list(r.table("sites").get_all(job_id, index="job_id").run())
# TypeError: <binary, 7168 bytes, '53 51 4c 69 74 65...'> is not JSON serializable
for s in sites_:
if "cookie_db" in s:
s["cookie_db"] = base64.b64encode(s["cookie_db"]).decode("ascii")
return flask.jsonify(sites=sites_)
@app.route("/api/jobs/<int:job_id>")
@app.route("/api/job/<int:job_id>")

View File

@ -18,18 +18,32 @@ limitations under the License.
'''
import setuptools
import os
def find_package_data(package):
    '''
    Lists the data files that live under a package directory, in the form
    expected for a setuptools `package_data` entry.

    The package directory is derived from the dotted package name and is
    resolved relative to the current working directory. Every file found in
    a directory that does not contain an `__init__.py` is collected.

    Args:
        package: dotted package name, e.g. 'brozzler.webconsole'

    Returns:
        list of file paths, each relative to the package directory; empty
        if the package directory does not exist
    '''
    pkg_data = []
    depth = len(package.split('.'))
    path = os.path.join(*package.split('.'))
    for dirpath, dirnames, filenames in os.walk(path):
        if not os.path.exists(os.path.join(dirpath, '__init__.py')):
            # Path components below the package directory. This is empty
            # when dirpath is the package directory itself, so the filename
            # is joined in the same call rather than calling os.path.join()
            # with no arguments (which raises TypeError).
            relparts = dirpath.split(os.sep)[depth:]
            pkg_data.extend(os.path.join(*relparts, f) for f in filenames)
    return pkg_data
setuptools.setup(
name='brozzler',
version='1.1b4.dev63',
version='1.1b6.dev69',
description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler',
author='Noah Levitt',
author_email='nlevitt@archive.org',
long_description=open('README.rst', encoding='UTF-8').read(),
license='Apache License 2.0',
packages=['brozzler'],
package_data={'brozzler': ['behaviors.d/*.js*', 'behaviors.yaml']},
packages=['brozzler', 'brozzler.webconsole'],
package_data={
'brozzler': ['behaviors.d/*.js*', 'behaviors.yaml'],
'brozzler.webconsole': find_package_data('brozzler.webconsole'),
},
entry_points={
'console_scripts': [
'brozzle-page=brozzler.cli:brozzle_page',