From 0e096dd4e4d618332844986f09fe91c58ac50ae6 Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Mon, 3 Oct 2016 15:03:08 -0700
Subject: [PATCH] don't try to read the browser's cookie database if the browser hasn't been started (which can happen if the page is simply fetched rather than browsed because it's not html)

---
 brozzler/browser.py | 2 +-
 brozzler/worker.py  | 3 ++-
 setup.py            | 2 +-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/brozzler/browser.py b/brozzler/browser.py
index 92e3479..17d7153 100644
--- a/brozzler/browser.py
+++ b/brozzler/browser.py
@@ -185,7 +185,7 @@ class Browser:
         cookie_location = os.path.join(
                 self._work_dir.name, "chrome-user-data", "Default", "Cookies")
         self.logger.debug(
-                "marking cookies persistent then reading file into memory: %s ",
+                "marking cookies persistent then reading file into memory: %s",
                 cookie_location)
         try:
             with sqlite3.connect(cookie_location) as conn:
diff --git a/brozzler/worker.py b/brozzler/worker.py
index 0d777e3..38e7ecb 100644
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@@ -312,7 +312,8 @@ class BrozzlerWorker:
                 page = self._frontier.claim_page(site, "%s:%s" % (
                     socket.gethostname(), browser.chrome_port))
                 outlinks = self.brozzle_page(browser, site, page)
-                site.cookie_db=browser.persist_and_read_cookie_db()
+                if browser.is_running():
+                    site.cookie_db = browser.persist_and_read_cookie_db()
                 self._frontier.completed_page(site, page)
                 self._frontier.scope_and_schedule_outlinks(
                         site, page, outlinks)
diff --git a/setup.py b/setup.py
index 912a829..6447309 100644
--- a/setup.py
+++ b/setup.py
@@ -32,7 +32,7 @@ def find_package_data(package):
 
 setuptools.setup(
         name='brozzler',
-        version='1.1b6.dev87',
+        version='1.1b6.dev88',
         description='Distributed web crawling with browsers',
         url='https://github.com/internetarchive/brozzler',
         author='Noah Levitt',