diff --git a/brozzler/browser.py b/brozzler/browser.py index 4db78ed..474afd7 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -131,13 +131,18 @@ class Browser: # these can raise exceptions self.chrome_port = self._find_available_port() self._work_dir = tempfile.TemporaryDirectory() - self._chrome_instance = Chrome(port=self.chrome_port, - executable=self.chrome_exe, + self._chrome_instance = Chrome( + port=self.chrome_port, executable=self.chrome_exe, user_home_dir=self._work_dir.name, - user_data_dir=os.sep.join([self._work_dir.name, "chrome-user-data"]), + user_data_dir=os.sep.join([ + self._work_dir.name, "chrome-user-data"]), ignore_cert_errors=self.ignore_cert_errors, proxy=proxy or self.proxy) - self._websocket_url = self._chrome_instance.start() + try: + self._websocket_url = self._chrome_instance.start() + except: + self._chrome_instance = None + raise def stop(self): try: diff --git a/brozzler/worker.py b/brozzler/worker.py index 74701d2..9d1b6c0 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -288,9 +288,8 @@ class BrozzlerWorker: while (not self._shutdown_requested.is_set() and time.time() - start < 7 * 60): self._frontier.honor_stop_request(site.job_id) - page = self._frontier.claim_page(site, - "{}:{}".format( - socket.gethostname(), browser.chrome_port)) + page = self._frontier.claim_page(site, "%s:%s" % ( + socket.gethostname(), browser.chrome_port)) outlinks = self.brozzle_page(browser, site, page) self._frontier.completed_page(site, page) self._frontier.scope_and_schedule_outlinks(site, page, outlinks) @@ -338,7 +337,11 @@ class BrozzlerWorker: try: latest_state = None while not self._shutdown_requested.is_set(): - if self._service_registry and (not hasattr(self, "status_info") or (rethinkstuff.utcnow() - self.status_info["last_heartbeat"]).total_seconds() > self.HEARTBEAT_INTERVAL): + if self._service_registry and ( + not hasattr(self, "status_info") + or (rethinkstuff.utcnow() - + self.status_info["last_heartbeat"]).total_seconds() + > self.HEARTBEAT_INTERVAL): self._service_heartbeat() try: @@ -358,7 +361,8 @@ class BrozzlerWorker: raise except brozzler.browser.NoBrowsersAvailable: if latest_state != "browsers-busy": - self.logger.info("all %s browsers are busy", self._max_browsers) + self.logger.info( + "all %s browsers are busy", self._max_browsers) latest_state = "browsers-busy" except brozzler.NothingToClaim: if latest_state != "no-unclaimed-sites": diff --git a/setup.py b/setup.py index 8fc5085..6d8f422 100644 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ import setuptools setuptools.setup( name='brozzler', - version='1.1.dev35', + version='1.1.dev36', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt',