Summary of this patch (free text before the first "diff --git" header):

  * brozzler/browser.py: the outlink-collecting JavaScript now records every
    frame it has visited in a Set and skips frames already seen, and its
    identifiers gain a __brzl_ prefix. The "received result" log message is
    lowered from info to debug.
  * brozzler/webconsole/__init__.py: the site and sites API endpoints
    base64-encode the "cookie_db" field before handing documents to
    flask.jsonify. site() only touches the field when a document was found.
  * setup.py: brozzler.webconsole is packaged together with the files that
    live in its non-package subdirectories; the version is bumped.

NOTE(review): line breaks and indentation were reconstructed from a
whitespace-collapsed copy; apply with whitespace tolerance and confirm
against the target tree.

diff --git a/brozzler/browser.py b/brozzler/browser.py
index 77b845b..55212c7 100644
--- a/brozzler/browser.py
+++ b/brozzler/browser.py
@@ -301,17 +301,19 @@ class Browser:
         self._behavior = None
 
     OUTLINKS_JS = """
-var compileOutlinks = function(frame) {
+var __brzl_framesDone = new Set();
+var __brzl_compileOutlinks = function(frame) {
+    __brzl_framesDone.add(frame);
     var outlinks = Array.prototype.slice.call(
             frame.document.querySelectorAll('a[href]'));
     for (var i = 0; i < frame.frames.length; i++) {
-        if (frame.frames[i]) { // sometimes undefined (why?)
-            outlinks = outlinks.concat(compileOutlinks(frame.frames[i]));
+        if (frame.frames[i] && !__brzl_framesDone.has(frame.frames[i])) {
+            outlinks = outlinks.concat(__brzl_compileOutlinks(frame.frames[i]));
         }
     }
     return outlinks;
 }
-compileOutlinks(window).join(' ');
+__brzl_compileOutlinks(window).join(' ');
     """
 
     def _chain_chrome_messages(self, chain):
@@ -540,7 +542,7 @@ compileOutlinks(window).join(' ');
         elif "result" in message:
             if message["id"] in self._waiting_on_result_messages:
                 callback = self._waiting_on_result_messages[message["id"]]
-                self.logger.info(
+                self.logger.debug(
                         "received result for message id=%s, calling %s",
                         message["id"], callback)
                 callback(message)
diff --git a/brozzler/webconsole/__init__.py b/brozzler/webconsole/__init__.py
index 43f306c..2044110 100644
--- a/brozzler/webconsole/__init__.py
+++ b/brozzler/webconsole/__init__.py
@@ -34,6 +34,7 @@ import os
 import importlib
 import rethinkdb
 import yaml
+import base64
 
 # flask does its own logging config
 # logging.basicConfig(
@@ -120,8 +121,10 @@ def page_yaml(page_id):
 @app.route("/api/sites/<site_id>")
 @app.route("/api/site/<site_id>")
 def site(site_id):
-    site_ = r.table("sites").get(site_id).run()
-    return flask.jsonify(site_)
+    s = r.table("sites").get(site_id).run()
+    if s and "cookie_db" in s:
+        s["cookie_db"] = base64.b64encode(s["cookie_db"]).decode("ascii")
+    return flask.jsonify(s)
 
 @app.route("/api/sites/<site_id>/yaml")
 @app.route("/api/site/<site_id>/yaml")
@@ -139,8 +142,12 @@ def stats(bucket):
 @app.route("/api/jobs/<job_id>/sites")
 @app.route("/api/job/<job_id>/sites")
 def sites(job_id):
-    sites_ = r.table("sites").get_all(job_id, index="job_id").run()
-    return flask.jsonify(sites=list(sites_))
+    sites_ = list(r.table("sites").get_all(job_id, index="job_id").run())
+    # raw cookie_db bytes are not JSON serializable (TypeError); base64 them
+    for s in sites_:
+        if "cookie_db" in s:
+            s["cookie_db"] = base64.b64encode(s["cookie_db"]).decode("ascii")
+    return flask.jsonify(sites=sites_)
 
 @app.route("/api/jobs/<job_id>")
 @app.route("/api/job/<job_id>")
diff --git a/setup.py b/setup.py
index e0aa2f5..ba8484e 100644
--- a/setup.py
+++ b/setup.py
@@ -18,18 +18,32 @@ limitations under the License.
 '''
 
 import setuptools
+import os
+
+def find_package_data(package):
+    pkg_data = []
+    depth = len(package.split('.'))
+    path = os.path.join(*package.split('.'))
+    for dirpath, dirnames, filenames in os.walk(path):
+        if not os.path.exists(os.path.join(dirpath, '__init__.py')):
+            relpath = os.path.join(*dirpath.split(os.sep)[depth:])
+            pkg_data.extend(os.path.join(relpath, f) for f in filenames)
+    return pkg_data
 
 setuptools.setup(
         name='brozzler',
-        version='1.1b4.dev63',
+        version='1.1b6.dev69',
         description='Distributed web crawling with browsers',
         url='https://github.com/internetarchive/brozzler',
         author='Noah Levitt',
         author_email='nlevitt@archive.org',
         long_description=open('README.rst', encoding='UTF-8').read(),
         license='Apache License 2.0',
-        packages=['brozzler'],
-        package_data={'brozzler': ['behaviors.d/*.js*', 'behaviors.yaml']},
+        packages=['brozzler', 'brozzler.webconsole'],
+        package_data={
+            'brozzler': ['behaviors.d/*.js*', 'behaviors.yaml'],
+            'brozzler.webconsole': find_package_data('brozzler.webconsole'),
+        },
         entry_points={
             'console_scripts': [
                 'brozzle-page=brozzler.cli:brozzle_page',