Set Browser._chrome_instance=None if _chrome_instance.start() raises an exception, to avoid an endless loop after one failure

This commit is contained in:
Noah Levitt 2016-06-29 19:47:25 -05:00
parent 2e687b65fb
commit 79beddfc44
3 changed files with 19 additions and 10 deletions

View File

@ -131,13 +131,18 @@ class Browser:
# these can raise exceptions
self.chrome_port = self._find_available_port()
self._work_dir = tempfile.TemporaryDirectory()
self._chrome_instance = Chrome(port=self.chrome_port,
executable=self.chrome_exe,
self._chrome_instance = Chrome(
port=self.chrome_port, executable=self.chrome_exe,
user_home_dir=self._work_dir.name,
user_data_dir=os.sep.join([self._work_dir.name, "chrome-user-data"]),
user_data_dir=os.sep.join([
self._work_dir.name, "chrome-user-data"]),
ignore_cert_errors=self.ignore_cert_errors,
proxy=proxy or self.proxy)
try:
self._websocket_url = self._chrome_instance.start()
except:
self._chrome_instance = None
raise
def stop(self):
try:

View File

@ -288,8 +288,7 @@ class BrozzlerWorker:
while (not self._shutdown_requested.is_set()
and time.time() - start < 7 * 60):
self._frontier.honor_stop_request(site.job_id)
page = self._frontier.claim_page(site,
"{}:{}".format(
page = self._frontier.claim_page(site, "%s:%s" % (
socket.gethostname(), browser.chrome_port))
outlinks = self.brozzle_page(browser, site, page)
self._frontier.completed_page(site, page)
@ -338,7 +337,11 @@ class BrozzlerWorker:
try:
latest_state = None
while not self._shutdown_requested.is_set():
if self._service_registry and (not hasattr(self, "status_info") or (rethinkstuff.utcnow() - self.status_info["last_heartbeat"]).total_seconds() > self.HEARTBEAT_INTERVAL):
if self._service_registry and (
not hasattr(self, "status_info")
or (rethinkstuff.utcnow() -
self.status_info["last_heartbeat"]).total_seconds()
> self.HEARTBEAT_INTERVAL):
self._service_heartbeat()
try:
@ -358,7 +361,8 @@ class BrozzlerWorker:
raise
except brozzler.browser.NoBrowsersAvailable:
if latest_state != "browsers-busy":
self.logger.info("all %s browsers are busy", self._max_browsers)
self.logger.info(
"all %s browsers are busy", self._max_browsers)
latest_state = "browsers-busy"
except brozzler.NothingToClaim:
if latest_state != "no-unclaimed-sites":

View File

@ -21,7 +21,7 @@ import setuptools
setuptools.setup(
name='brozzler',
version='1.1.dev35',
version='1.1.dev36',
description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler',
author='Noah Levitt',