improving shutdown process

This commit is contained in:
Noah Levitt 2016-12-14 14:49:41 -08:00
parent f23f928c16
commit 5fa96b6438
5 changed files with 214 additions and 152 deletions

View file

@ -68,7 +68,7 @@ def fixup(url):
Does rudimentary canonicalization, such as converting IDN to punycode. Does rudimentary canonicalization, such as converting IDN to punycode.
''' '''
import surt import surt
hurl = _surt.handyurl.parse(url) hurl = surt.handyurl.parse(url)
# handyurl.parse() already lowercases the scheme via urlsplit # handyurl.parse() already lowercases the scheme via urlsplit
if hurl.host: if hurl.host:
hurl.host = hurl.host.encode('idna').decode('ascii').lower() hurl.host = hurl.host.encode('idna').decode('ascii').lower()
@ -115,22 +115,60 @@ def behavior_script(url, template_parameters=None):
logging.info( logging.info(
'using behavior %s for %s', 'using behavior %s for %s',
behavior['behavior_js'], url) behavior['behavior_js'], url)
return behavior['script']
elif 'behavior_js_template' in behavior: elif 'behavior_js_template' in behavior:
parameters = dict() parameters = dict()
if 'default_parameters' in behavior: if 'default_parameters' in behavior:
parameters.update(behavior['default_parameters']) parameters.update(behavior['default_parameters'])
if template_parameters: if template_parameters:
parameters.update(template_parameters) parameters.update(template_parameters)
javascript = behavior['template'].safe_substitute(parameters) script = behavior['template'].safe_substitute(parameters)
logging.info( logging.info(
'using template=%s populated with parameters=%s for %s', 'using template=%s populated with parameters=%s for %s',
repr(behavior['behavior_js_template']), parameters, url) repr(behavior['behavior_js_template']), parameters, url)
return script
return behavior['script']
return None return None
def thread_raise(thread, exctype):
'''
Raises the exception exctype in the thread.
Adapted from http://tomerfiliba.com/recipes/Thread2/ which explains:
"The exception will be raised only when executing python bytecode. If your
thread calls a native/built-in blocking function, the exception will be
raised only when execution returns to the python code."
'''
import ctypes, inspect, threading
if not thread.is_alive():
raise threading.ThreadError('thread %s is not running' % thread)
if not inspect.isclass(exctype):
raise TypeError(
'cannot raise %s, only exception types can be raised (not '
'instances)' % exc_type)
res = ctypes.pythonapi.PyThreadState_SetAsyncExc(
ctypes.c_long(thread.ident), ctypes.py_object(exctype))
if res == 0:
raise ValueError('invalid thread id? thread.ident=%s' % thread.ident)
elif res != 1:
# if it returns a number greater than one, you're in trouble,
# and you should call it again with exc=NULL to revert the effect
ctypes.pythonapi.PyThreadState_SetAsyncExc(thread.ident, 0)
raise SystemError('PyThreadState_SetAsyncExc failed')
def sleep(duration):
'''
Sleeps for duration seconds in increments of 0.5 seconds.
Use this so that the sleep can be interrupted by thread_raise().
'''
import time
start = time.time()
while True:
elapsed = time.time() - start
if elapsed >= duration:
break
time.sleep(min(duration - elapsed, 0.5))
from brozzler.site import Page, Site from brozzler.site import Page, Site
from brozzler.worker import BrozzlerWorker from brozzler.worker import BrozzlerWorker
from brozzler.robots import is_permitted_by_robots from brozzler.robots import is_permitted_by_robots

View file

@ -29,6 +29,7 @@ from requests.structures import CaseInsensitiveDict
import datetime import datetime
import base64 import base64
from brozzler.chrome import Chrome from brozzler.chrome import Chrome
import surt
class BrowsingException(Exception): class BrowsingException(Exception):
pass pass
@ -86,8 +87,10 @@ class BrowserPool:
self._in_use.remove(browser) self._in_use.remove(browser)
def shutdown_now(self): def shutdown_now(self):
self.logger.info('shutting down browser pool')
with self._lock:
for browser in self._in_use: for browser in self._in_use:
browser.abort_browse_page() browser.stop()
def num_available(self): def num_available(self):
return len(self._available) return len(self._available)
@ -137,8 +140,9 @@ class Browser:
''' '''
try: try:
if self.is_running(): if self.is_running():
self.chrome.stop() self._browser_controller.stop()
self.websocket_url = None self.websocket_url = None
self.chrome.stop()
except: except:
self.logger.error('problem stopping', exc_info=True) self.logger.error('problem stopping', exc_info=True)
@ -191,6 +195,7 @@ class Browser:
if self.is_browsing: if self.is_browsing:
raise BrowsingException('browser is already busy browsing a page') raise BrowsingException('browser is already busy browsing a page')
self.is_browsing = True self.is_browsing = True
try:
self._browser_controller.navigate_to_page(page_url, timeout=300) self._browser_controller.navigate_to_page(page_url, timeout=300)
## if login_credentials: ## if login_credentials:
## self._browser_controller.try_login(login_credentials) (5 min?) ## self._browser_controller.try_login(login_credentials) (5 min?)
@ -209,9 +214,22 @@ class Browser:
## run behavior (3 min) ## run behavior (3 min)
## outlinks += retrieve_outlinks (60 sec) ## outlinks += retrieve_outlinks (60 sec)
final_page_url = self._browser_controller.url() final_page_url = self._browser_controller.url()
self.is_browsing = False
return final_page_url, outlinks return final_page_url, outlinks
except brozzler.ShutdownRequested:
self.logger.info('shutdown requested')
raise
except websocket.WebSocketConnectionClosedException as e:
# import pdb; pdb.set_trace()
raise BrowsingException(e)
# if not self.is_running():
# logging.info('appears shutdown was requested')
# return None, None
# else:
# raise BrowsingException(
# "websocket closed, did chrome die? %s" % (
# self.websocket_url))
finally:
self.is_browsing = False
class Counter: class Counter:
def __init__(self): def __init__(self):
@ -235,7 +253,6 @@ class BrowserController:
self._command_id = Counter() self._command_id = Counter()
self._websock_thread = None self._websock_thread = None
self._websock_open = None self._websock_open = None
self._got_page_load_event = None
self._result_messages = {} self._result_messages = {}
def _wait_for(self, callback, timeout=None): def _wait_for(self, callback, timeout=None):
@ -244,6 +261,7 @@ class BrowserController:
''' '''
start = time.time() start = time.time()
while True: while True:
brozzler.sleep(0.5)
if callback(): if callback():
return return
elapsed = time.time() - start elapsed = time.time() - start
@ -251,7 +269,6 @@ class BrowserController:
raise BrowsingTimeout( raise BrowsingTimeout(
'timed out after %.1fs waiting for: %s' % ( 'timed out after %.1fs waiting for: %s' % (
elapsed, callback)) elapsed, callback))
time.sleep(0.5)
def __enter__(self): def __enter__(self):
self.start() self.start()
@ -262,18 +279,28 @@ class BrowserController:
def start(self): def start(self):
if not self._websock_thread: if not self._websock_thread:
calling_thread = threading.current_thread()
def on_open(websock): def on_open(websock):
self._websock_open = datetime.datetime.utcnow() self._websock_open = datetime.datetime.utcnow()
def on_error(websock, e):
'''
Raises BrowsingException in the thread that called start()
'''
self.logger.error(
'exception from websocket receiver thread', exc_info=1)
brozzler.thread_raise(calling_thread, BrowsingException)
# open websocket, start thread that receives messages # open websocket, start thread that receives messages
self._websock = websocket.WebSocketApp( self._websock = websocket.WebSocketApp(
self.websocket_url, on_open=on_open, self.websocket_url, on_open=on_open,
on_message=self._on_message) on_message=self._on_message, on_error=on_error)
thread_name = 'WebsockThread:{}-{:%Y%m%d%H%M%S}'.format( thread_name = 'WebsockThread:{}-{:%Y%m%d%H%M%S}'.format(
self.websocket_url, datetime.datetime.utcnow()) surt.handyurl.parse(self.websocket_url).port,
datetime.datetime.utcnow())
self._websock_thread = threading.Thread( self._websock_thread = threading.Thread(
target=self._websock.run_forever, name=thread_name, target=self._websock.run_forever, name=thread_name,
kwargs={'ping_timeout': 0.5}) daemon=True)
self._websock_thread.start() self._websock_thread.start()
self._wait_for(lambda: self._websock_open, timeout=10) self._wait_for(lambda: self._websock_open, timeout=10)
@ -296,6 +323,7 @@ class BrowserController:
if self._websock_thread: if self._websock_thread:
if (self._websock and self._websock.sock if (self._websock and self._websock.sock
and self._websock.sock.connected): and self._websock.sock.connected):
self.logger.info('shutting down websocket connection')
try: try:
self._websock.close() self._websock.close()
except BaseException as e: except BaseException as e:
@ -303,6 +331,7 @@ class BrowserController:
'exception closing websocket %s - %s', 'exception closing websocket %s - %s',
self._websock, e) self._websock, e)
if self._websock_thread != threading.current_thread():
self._websock_thread.join(timeout=30) self._websock_thread.join(timeout=30)
if self._websock_thread.is_alive(): if self._websock_thread.is_alive():
self.logger.error( self.logger.error(
@ -323,18 +352,28 @@ class BrowserController:
self.logger.error( self.logger.error(
'uncaught exception in _handle_message message=%s', 'uncaught exception in _handle_message message=%s',
message, exc_info=True) message, exc_info=True)
self.abort = True
def _handle_message(self, websock, json_message): def _handle_message(self, websock, json_message):
message = json.loads(json_message) message = json.loads(json_message)
if 'method' in message: if 'method' in message:
if message['method'] == 'Page.loadEventFired': if message['method'] == 'Page.loadEventFired':
self._got_page_load_event = datetime.datetime.utcnow() self._got_page_load_event = datetime.datetime.utcnow()
elif message["method"] == "Debugger.paused": elif message['method'] == 'Debugger.paused':
self._debugger_paused(message) self._debugger_paused(message)
elif message['method'] == 'Console.messageAdded':
self.logger.debug(
'%s console.%s %s', self._websock.url,
message['params']['message']['level'],
message['params']['message']['text'])
else:
self.logger.debug("%s %s", message["method"], json_message)
elif 'result' in message: elif 'result' in message:
if message['id'] in self._result_messages: if message['id'] in self._result_messages:
self._result_messages[message['id']] = message self._result_messages[message['id']] = message
else:
self.logger.debug("%s", json_message)
else:
self.logger.debug("%s", json_message)
def _debugger_paused(self, message): def _debugger_paused(self, message):
# we hit the breakpoint set in start(), get rid of google analytics # we hit the breakpoint set in start(), get rid of google analytics
@ -363,12 +402,6 @@ class BrowserController:
self, page_url, extra_headers=None, user_agent=None, timeout=300): self, page_url, extra_headers=None, user_agent=None, timeout=300):
''' '''
''' '''
# navigate to about:blank here to avoid situation where we navigate to
# the same page that we're currently on, perhaps with a different
# #fragment, which prevents Page.loadEventFired from happening
self.send_to_chrome(
method='Page.navigate', params={'url': 'about:blank'})
headers = extra_headers or {} headers = extra_headers or {}
headers['Accept-Encoding'] = 'identity' headers['Accept-Encoding'] = 'identity'
self.send_to_chrome( self.send_to_chrome(
@ -382,9 +415,9 @@ class BrowserController:
# navigate to the page! # navigate to the page!
self.logger.info('navigating to page %s', page_url) self.logger.info('navigating to page %s', page_url)
self._got_page_load_event = None
self.send_to_chrome(method='Page.navigate', params={'url': page_url}) self.send_to_chrome(method='Page.navigate', params={'url': page_url})
self._wait_for(lambda: self._got_page_load_event, timeout=timeout)
self._wait_for(lambda:self._got_page_load_event, timeout=timeout)
OUTLINKS_JS = r''' OUTLINKS_JS = r'''
var __brzl_framesDone = new Set(); var __brzl_framesDone = new Set();
@ -464,7 +497,7 @@ __brzl_compileOutlinks(window).join('\n');
'behavior reached hard timeout after %.1fs', elapsed) 'behavior reached hard timeout after %.1fs', elapsed)
return return
time.sleep(7) brozzler.sleep(7)
self._result_messages[self._command_id.peek_next()] = None self._result_messages[self._command_id.peek_next()] = None
msg_id = self.send_to_chrome( msg_id = self.send_to_chrome(

View file

@ -34,9 +34,7 @@ import tempfile
class Chrome: class Chrome:
logger = logging.getLogger(__module__ + '.' + __qualname__) logger = logging.getLogger(__module__ + '.' + __qualname__)
def __init__( def __init__(self, chrome_exe, port=9222, ignore_cert_errors=False):
self, chrome_exe, port=9222, ignore_cert_errors=False, proxy=None,
cookie_db=None):
''' '''
Initializes instance of this class. Initializes instance of this class.
@ -47,17 +45,10 @@ class Chrome:
port: chrome debugging protocol port (default 9222) port: chrome debugging protocol port (default 9222)
ignore_cert_errors: configure chrome to accept all certs (default ignore_cert_errors: configure chrome to accept all certs (default
False) False)
proxy: http proxy 'host:port' (default None)
cookie_db: raw bytes of chrome/chromium sqlite3 cookies database,
which, if supplied, will be written to
{chrome_user_data_dir}/Default/Cookies before running the
browser (default None)
''' '''
self.port = port self.port = port
self.chrome_exe = chrome_exe self.chrome_exe = chrome_exe
self.proxy = proxy
self.ignore_cert_errors = ignore_cert_errors self.ignore_cert_errors = ignore_cert_errors
self.cookie_db = cookie_db
self._shutdown = threading.Event() self._shutdown = threading.Event()
def __enter__(self): def __enter__(self):
@ -87,17 +78,15 @@ class Chrome:
return default_port return default_port
def _init_cookie_db(self): def _init_cookie_db(self, cookie_db):
if self.cookie_db is not None:
cookie_dir = os.path.join(self._chrome_user_data_dir, 'Default') cookie_dir = os.path.join(self._chrome_user_data_dir, 'Default')
cookie_location = os.path.join(cookie_dir, 'Cookies') cookie_location = os.path.join(cookie_dir, 'Cookies')
self.logger.debug( self.logger.debug('cookie DB provided, writing to %s', cookie_location)
'cookie DB provided, writing to %s', cookie_location)
os.makedirs(cookie_dir, exist_ok=True) os.makedirs(cookie_dir, exist_ok=True)
try: try:
with open(cookie_location, 'wb') as cookie_file: with open(cookie_location, 'wb') as cookie_file:
cookie_file.write(self.cookie_db) cookie_file.write(cookie_db)
except OSError: except OSError:
self.logger.error( self.logger.error(
'exception writing cookie file at %s', 'exception writing cookie file at %s',
@ -126,15 +115,26 @@ class Chrome:
cookie_location, exc_info=True) cookie_location, exc_info=True)
return cookie_db return cookie_db
def start(self): def start(self, proxy=None, cookie_db=None):
''' '''
Returns websocket url to chrome window with about:blank loaded. Starts chrome/chromium process.
Args:
proxy: http proxy 'host:port' (default None)
cookie_db: raw bytes of chrome/chromium sqlite3 cookies database,
which, if supplied, will be written to
{chrome_user_data_dir}/Default/Cookies before running the
browser (default None)
Returns:
websocket url to chrome window with about:blank loaded
''' '''
# these can raise exceptions # these can raise exceptions
self._home_tmpdir = tempfile.TemporaryDirectory() self._home_tmpdir = tempfile.TemporaryDirectory()
self._chrome_user_data_dir = os.path.join( self._chrome_user_data_dir = os.path.join(
self._home_tmpdir.name, 'chrome-user-data') self._home_tmpdir.name, 'chrome-user-data')
self._init_cookie_db() if cookie_db:
self._init_cookie_db(cookie_db)
new_env = os.environ.copy() new_env = os.environ.copy()
new_env['HOME'] = self._home_tmpdir.name new_env['HOME'] = self._home_tmpdir.name
@ -151,8 +151,8 @@ class Chrome:
'--disable-extensions', '--disable-save-password-bubble'] '--disable-extensions', '--disable-save-password-bubble']
if self.ignore_cert_errors: if self.ignore_cert_errors:
chrome_args.append('--ignore-certificate-errors') chrome_args.append('--ignore-certificate-errors')
if self.proxy: if proxy:
chrome_args.append('--proxy-server=%s' % self.proxy) chrome_args.append('--proxy-server=%s' % proxy)
chrome_args.append('about:blank') chrome_args.append('about:blank')
self.logger.info( self.logger.info(
'running: %s', repr(subprocess.list2cmdline(chrome_args))) 'running: %s', repr(subprocess.list2cmdline(chrome_args)))
@ -162,7 +162,8 @@ class Chrome:
stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=0) stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=0)
self._out_reader_thread = threading.Thread( self._out_reader_thread = threading.Thread(
target=self._read_stderr_stdout, target=self._read_stderr_stdout,
name='ChromeOutReaderThread(pid=%s)' % self.chrome_process.pid) name='ChromeOutReaderThread(pid=%s)' % self.chrome_process.pid,
daemon=True)
self._out_reader_thread.start() self._out_reader_thread.start()
self.logger.info('chrome running, pid %s' % self.chrome_process.pid) self.logger.info('chrome running, pid %s' % self.chrome_process.pid)

View file

@ -159,8 +159,8 @@ def brozzle_page():
f.write(screenshot_png) f.write(screenshot_png)
logging.info('wrote screenshot to %s', filename) logging.info('wrote screenshot to %s', filename)
browser = brozzler.Browser(chrome_exe=args.chrome_exe, proxy=site.proxy) browser = brozzler.Browser(chrome_exe=args.chrome_exe)
browser.start() browser.start(proxy=site.proxy)
try: try:
outlinks = worker.brozzle_page( outlinks = worker.brozzle_page(
browser, site, page, on_screenshot=on_screenshot) browser, site, page, on_screenshot=on_screenshot)
@ -276,9 +276,7 @@ def brozzler_worker():
def sigint(signum, frame): def sigint(signum, frame):
raise brozzler.ShutdownRequested('shutdown requested (caught SIGINT)') raise brozzler.ShutdownRequested('shutdown requested (caught SIGINT)')
# do not print in signal handler to avoid RuntimeError: reentrant call def dump_state(signum, frame):
state_dump_msgs = []
def queue_state_dump(signum, frame):
signal.signal(signal.SIGQUIT, signal.SIG_IGN) signal.signal(signal.SIGQUIT, signal.SIG_IGN)
try: try:
state_strs = [] state_strs = []
@ -291,15 +289,15 @@ def brozzler_worker():
state_strs.append('<???:thread:ident=%s>' % ident) state_strs.append('<???:thread:ident=%s>' % ident)
stack = traceback.format_stack(frames[ident]) stack = traceback.format_stack(frames[ident])
state_strs.append(''.join(stack)) state_strs.append(''.join(stack))
state_dump_msgs.append( logging.info(
'dumping state (caught signal %s)\n%s' % ( 'dumping state (caught signal %s)\n%s' % (
signum, '\n'.join(state_strs))) signum, '\n'.join(state_strs)))
except BaseException as e: except BaseException as e:
state_dump_msgs.append('exception dumping state: %s' % e) logging.error('exception dumping state: %s' % e)
finally: finally:
signal.signal(signal.SIGQUIT, queue_state_dump) signal.signal(signal.SIGQUIT, dump_state)
signal.signal(signal.SIGQUIT, queue_state_dump) signal.signal(signal.SIGQUIT, dump_state)
signal.signal(signal.SIGTERM, sigterm) signal.signal(signal.SIGTERM, sigterm)
signal.signal(signal.SIGINT, sigint) signal.signal(signal.SIGINT, sigint)
@ -311,17 +309,7 @@ def brozzler_worker():
frontier, service_registry, max_browsers=int(args.max_browsers), frontier, service_registry, max_browsers=int(args.max_browsers),
chrome_exe=args.chrome_exe) chrome_exe=args.chrome_exe)
worker.start() worker.run()
try:
while worker.is_alive():
while state_dump_msgs:
logging.warn(state_dump_msgs.pop(0))
time.sleep(0.5)
logging.critical('worker thread has died, shutting down')
except brozzler.ShutdownRequested as e:
pass
finally:
worker.shutdown_now()
logging.info('brozzler-worker is all done, exiting') logging.info('brozzler-worker is all done, exiting')

View file

@ -104,12 +104,10 @@ class BrozzlerWorker:
self._default_proxy = proxy self._default_proxy = proxy
self._default_enable_warcprox_features = enable_warcprox_features self._default_enable_warcprox_features = enable_warcprox_features
self._browser_pool = brozzler.browser.BrowserPool(max_browsers, self._browser_pool = brozzler.browser.BrowserPool(
chrome_exe=chrome_exe, ignore_cert_errors=True) max_browsers, chrome_exe=chrome_exe, ignore_cert_errors=True)
self._shutdown_requested = threading.Event()
self._thread = None
self._start_stop_lock = threading.Lock()
self._browsing_threads = set() self._browsing_threads = set()
self._browsing_threads_lock = threading.Lock()
def _proxy(self, site): def _proxy(self, site):
if site.proxy: if site.proxy:
@ -203,6 +201,8 @@ class BrozzlerWorker:
content_type="application/vnd.youtube-dl_formats+json;charset=utf-8", content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
payload=info_json.encode("utf-8"), payload=info_json.encode("utf-8"),
extra_headers=site.extra_headers()) extra_headers=site.extra_headers())
except brozzler.ShutdownRequested as e:
raise
except BaseException as e: except BaseException as e:
if hasattr(e, "exc_info") and e.exc_info[0] == youtube_dl.utils.UnsupportedError: if hasattr(e, "exc_info") and e.exc_info[0] == youtube_dl.utils.UnsupportedError:
pass pass
@ -257,6 +257,8 @@ class BrozzlerWorker:
self._try_youtube_dl(ydl, site, page) self._try_youtube_dl(ydl, site, page)
except brozzler.ReachedLimit as e: except brozzler.ReachedLimit as e:
raise raise
except brozzler.ShutdownRequested:
raise
except Exception as e: except Exception as e:
if (hasattr(e, 'exc_info') and len(e.exc_info) >= 2 if (hasattr(e, 'exc_info') and len(e.exc_info) >= 2
and hasattr(e.exc_info[1], 'code') and hasattr(e.exc_info[1], 'code')
@ -323,11 +325,10 @@ class BrozzlerWorker:
start = time.time() start = time.time()
page = None page = None
try: try:
while (not self._shutdown_requested.is_set() while time.time() - start < 7 * 60:
and time.time() - start < 7 * 60):
self._frontier.honor_stop_request(site.job_id) self._frontier.honor_stop_request(site.job_id)
page = self._frontier.claim_page(site, "%s:%s" % ( page = self._frontier.claim_page(site, "%s:%s" % (
socket.gethostname(), browser.chrome_port)) socket.gethostname(), browser.chrome.port))
if (page.needs_robots_check and if (page.needs_robots_check and
not brozzler.is_permitted_by_robots(site, page.url)): not brozzler.is_permitted_by_robots(site, page.url)):
@ -341,22 +342,23 @@ class BrozzlerWorker:
self._frontier.completed_page(site, page) self._frontier.completed_page(site, page)
page = None page = None
except brozzler.ShutdownRequested:
self.logger.info("shutdown requested")
except brozzler.NothingToClaim: except brozzler.NothingToClaim:
self.logger.info("no pages left for site %s", site) self.logger.info("no pages left for site %s", site)
except brozzler.ReachedLimit as e: except brozzler.ReachedLimit as e:
self._frontier.reached_limit(site, e) self._frontier.reached_limit(site, e)
except brozzler.CrawlJobStopped: except brozzler.CrawlJobStopped:
self._frontier.finished(site, "FINISHED_STOP_REQUESTED") self._frontier.finished(site, "FINISHED_STOP_REQUESTED")
except brozzler.browser.BrowsingAborted: # except brozzler.browser.BrowsingAborted:
self.logger.info("{} shut down".format(browser)) # self.logger.info("{} shut down".format(browser))
except: except:
self.logger.critical("unexpected exception", exc_info=True) self.logger.critical("unexpected exception", exc_info=True)
finally: finally:
self.logger.info("finished session brozzling site, stopping "
"browser and disclaiming site")
browser.stop() browser.stop()
self._frontier.disclaim_site(site, page) self._frontier.disclaim_site(site, page)
self._browser_pool.release(browser) self._browser_pool.release(browser)
with self._browsing_threads_lock:
self._browsing_threads.remove(threading.current_thread()) self._browsing_threads.remove(threading.current_thread())
def _service_heartbeat(self): def _service_heartbeat(self):
@ -381,17 +383,24 @@ class BrozzlerWorker:
"failed to send heartbeat and update service registry " "failed to send heartbeat and update service registry "
"with info %s: %s", status_info, e) "with info %s: %s", status_info, e)
def _service_heartbeat_if_due(self):
'''Sends service registry heartbeat if due'''
due = False
if self._service_registry:
if not hasattr(self, "status_info"):
due = True
else:
d = rethinkstuff.utcnow() - self.status_info["last_heartbeat"]
due = d.total_seconds() > self.HEARTBEAT_INTERVAL
if due:
self._service_heartbeat()
def run(self): def run(self):
try: try:
latest_state = None latest_state = None
while not self._shutdown_requested.is_set(): while True:
if self._service_registry and ( self._service_heartbeat_if_due()
not hasattr(self, "status_info")
or (rethinkstuff.utcnow() -
self.status_info["last_heartbeat"]).total_seconds()
> self.HEARTBEAT_INTERVAL):
self._service_heartbeat()
try: try:
browser = self._browser_pool.acquire() browser = self._browser_pool.acquire()
try: try:
@ -401,11 +410,12 @@ class BrozzlerWorker:
"brozzling site (proxy=%s) %s", "brozzling site (proxy=%s) %s",
repr(self._proxy(site)), site) repr(self._proxy(site)), site)
th = threading.Thread( th = threading.Thread(
target=lambda: self._brozzle_site( target=self._brozzle_site, args=(browser, site),
browser, site), name="BrozzlingThread:%s" % site.seed,
name="BrozzlingThread:%s" % site.seed) daemon=True)
th.start() with self._browsing_threads_lock:
self._browsing_threads.add(th) self._browsing_threads.add(th)
th.start()
except: except:
self._browser_pool.release(browser) self._browser_pool.release(browser)
raise raise
@ -419,6 +429,8 @@ class BrozzlerWorker:
self.logger.info("no unclaimed sites to browse") self.logger.info("no unclaimed sites to browse")
latest_state = "no-unclaimed-sites" latest_state = "no-unclaimed-sites"
time.sleep(0.5) time.sleep(0.5)
except brozzler.ShutdownRequested:
self.logger.info("shutdown requested")
except: except:
self.logger.critical( self.logger.critical(
"thread exiting due to unexpected exception", "thread exiting due to unexpected exception",
@ -432,26 +444,16 @@ class BrozzlerWorker:
"failed to unregister from service registry", "failed to unregister from service registry",
exc_info=True) exc_info=True)
def start(self): self.logger.info(
with self._start_stop_lock: 'shutting down %s brozzling threads',
if self._thread: len(self._browsing_threads))
self.logger.warn( with self._browsing_threads_lock:
'ignoring start request because self._thread is ' for th in self._browsing_threads:
'not None') if th.is_alive():
return brozzler.thread_raise(th, brozzler.ShutdownRequested)
self._thread = threading.Thread(
target=self.run, name="BrozzlerWorker")
self._thread.start()
def shutdown_now(self):
with self._start_stop_lock:
self.logger.info("brozzler worker shutting down")
self._shutdown_requested.set()
self._browser_pool.shutdown_now() self._browser_pool.shutdown_now()
while self._browsing_threads: # copy to avoid "RuntimeError: Set changed size during iteration"
time.sleep(0.5) thredz = set(self._browsing_threads)
self._thread = None for th in thredz:
th.join()
def is_alive(self):
return self._thread and self._thread.is_alive()