get rid of --browser-wait and --routing-key in favor of sensible defaults, some other tweaks

This commit is contained in:
Noah Levitt 2014-06-11 10:58:08 -07:00
parent a78e60f1da
commit 025db91dea
3 changed files with 75 additions and 75 deletions

View File

@ -19,8 +19,6 @@ if __name__=="__main__":
arg_parser = argparse.ArgumentParser(prog=os.path.basename(__file__),
description='umbra - browser automation tool communicating via AMQP',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
arg_parser.add_argument('-w', '--browser-wait', dest='browser_wait', default='60',
help='Seconds to wait for browser initialization')
arg_parser.add_argument('-e', '--executable', dest='chrome_exe', default='chromium-browser',
help='Executable to use to invoke chrome')
arg_parser.add_argument('-u', '--url', dest='amqp_url', default='amqp://guest:guest@localhost:5672/%2f',
@ -29,9 +27,7 @@ if __name__=="__main__":
help='AMQP exchange name')
arg_parser.add_argument('--queue', dest='amqp_queue', default='urls',
help='AMQP queue to consume urls from')
arg_parser.add_argument('--routing-key', dest='amqp_routing_key', default='url',
help='AMQP routing key to bind to the AMQP queue')
arg_parser.add_argument('-n', '--max-browsers', dest='max_browsers', default='3',
arg_parser.add_argument('-n', '--max-browsers', dest='max_browsers', default='1',
help='Max number of chrome instances simultaneously browsing pages')
arg_parser.add_argument('-v', '--verbose', dest='log_level',
action="store_const", default=logging.INFO, const=logging.DEBUG)
@ -44,10 +40,9 @@ if __name__=="__main__":
logging.info("umbra {} starting up".format(umbra.version))
controller = umbra.Umbra(args.amqp_url, args.chrome_exe, args.browser_wait,
controller = umbra.Umbra(args.amqp_url, args.chrome_exe,
max_active_browsers=int(args.max_browsers),
exchange_name=args.amqp_exchange, queue_name=args.amqp_queue,
routing_key=args.amqp_routing_key)
exchange_name=args.amqp_exchange, queue_name=args.amqp_queue)
class ShutdownRequested(Exception):
pass
@ -77,10 +72,15 @@ if __name__=="__main__":
th.join()
except BaseException as e:
logging.warn("caught exception {}".format(e))
controller.shutdown_now()
for th in threading.enumerate():
if th != threading.current_thread():
th.join()
for i in range(6,0,-1):
controller.shutdown_now()
try:
for th in threading.enumerate():
if th != threading.current_thread():
th.join()
break # if we get here, we're done, all threads finished
except:
logging.warn("caught exception {}".format(e))
logging.info("all finished, exiting")

View File

@ -20,12 +20,12 @@ class BrowserPool:
BASE_PORT = 9200
def __init__(self, size=3, chrome_exe='chromium-browser', chrome_wait=60):
def __init__(self, size=3, chrome_exe='chromium-browser'):
self._available = set()
self._in_use = set()
for i in range(0, size):
browser = Browser(BrowserPool.BASE_PORT + i, chrome_exe, chrome_wait)
browser = Browser(BrowserPool.BASE_PORT + i, chrome_exe)
self._available.add(browser)
self._lock = threading.Lock()
@ -61,11 +61,10 @@ class Browser:
HARD_TIMEOUT_SECONDS = 20 * 60
def __init__(self, chrome_port=9222, chrome_exe='chromium-browser', chrome_wait=60):
def __init__(self, chrome_port=9222, chrome_exe='chromium-browser'):
self.command_id = itertools.count(1)
self.chrome_port = chrome_port
self.chrome_exe = chrome_exe
self.chrome_wait = chrome_wait
self._behavior = None
self._websock = None
self._abort_browse_page = False
@ -84,8 +83,7 @@ class Browser:
# these can raise exceptions
self._work_dir = tempfile.TemporaryDirectory()
self._chrome_instance = Chrome(self.chrome_port, self.chrome_exe,
self.chrome_wait, self._work_dir.name,
os.sep.join([self._work_dir.name, "chrome-user-data"]))
self._work_dir.name, os.sep.join([self._work_dir.name, "chrome-user-data"]))
self._websocket_url = self._chrome_instance.start()
def stop(self):
@ -119,7 +117,7 @@ class Browser:
while True:
time.sleep(0.5)
if not self._websock or not self._websock.sock or not self._websock.sock.connected:
raise BrowsingException("websocket closed, did chrome die? {}".format(self._websock))
raise BrowsingException("websocket closed, did chrome die? {}".format(self._websocket_url))
elif time.time() - start > Browser.HARD_TIMEOUT_SECONDS:
self.logger.info("finished browsing page, reached hard timeout of {} seconds url={}".format(Browser.HARD_TIMEOUT_SECONDS, self.url))
return
@ -209,10 +207,9 @@ class Browser:
class Chrome:
logger = logging.getLogger(__module__ + "." + __qualname__)
def __init__(self, port, executable, browser_wait, user_home_dir, user_data_dir):
def __init__(self, port, executable, user_home_dir, user_data_dir):
self.port = port
self.executable = executable
self.browser_wait = browser_wait
self.user_home_dir = user_home_dir
self.user_data_dir = user_data_dir
@ -225,6 +222,7 @@ class Chrome:
# returns websocket url to chrome window with about:blank loaded
def start(self):
timeout_sec = 60
new_env = os.environ.copy()
new_env["HOME"] = self.user_home_dir
chrome_args = [self.executable,
@ -257,7 +255,7 @@ class Chrome:
except:
pass
finally:
if time.time() - start > float(self.browser_wait):
if time.time() - start > timeout_sec:
raise Exception("failed to retrieve {} after {} seconds".format(json_url, time.time() - start))
else:
time.sleep(0.5)

View File

@ -45,16 +45,13 @@ class AmqpBrowserController:
logger = logging.getLogger(__module__ + "." + __qualname__)
def __init__(self, amqp_url='amqp://guest:guest@localhost:5672/%2f',
chrome_exe='chromium-browser', browser_wait=60,
max_active_browsers=1, queue_name='urls', routing_key='url',
exchange_name='umbra'):
chrome_exe='chromium-browser', max_active_browsers=1,
queue_name='urls', exchange_name='umbra'):
self.amqp_url = amqp_url
self.queue_name = queue_name
self.routing_key = routing_key
self.exchange_name = exchange_name
self._browser_pool = BrowserPool(size=max_active_browsers,
chrome_exe=chrome_exe, chrome_wait=browser_wait)
self._browser_pool = BrowserPool(size=max_active_browsers, chrome_exe=chrome_exe)
def start(self):
self._browsing_threads = set()
@ -83,6 +80,50 @@ class AmqpBrowserController:
self._browser_pool.shutdown_now()
self._consumer_thread.join()
def _wait_for_and_browse_urls(self, conn, consumer, timeout):
    """
    Hand incoming url messages to browsers until stopped or timed out.

    Repeatedly acquires a browser from the pool, starts it, installs a
    consumer callback bound to that browser and drains events from
    ``conn`` until one call to ``drain_events`` returns.  The callback
    passes the message to ``self._start_browsing_page``, which takes over
    responsibility for the browser.

    A browser that was acquired but never handed to
    ``_start_browsing_page`` (startup failure, stop/timeout reached,
    callback error, or an event that was not a url message) is stopped
    and released back to the pool so it cannot leak.

    Args:
        conn: connection object providing ``drain_events(timeout=...)``.
        consumer: consumer object providing ``qos(prefetch_count=...)``
            and a writable ``callbacks`` attribute.
        timeout: number of seconds after which this method returns.
    """
    start = time.time()
    consumer.qos(prefetch_count=1)

    def should_stop():
        return self._consumer_stop.is_set() or time.time() - start >= timeout

    while not should_stop():
        # Keep the "pool is empty" KeyError separate from everything else,
        # so a KeyError raised later (e.g. a message body missing a key)
        # is not mistaken for "no browsers available".
        try:
            browser = self._browser_pool.acquire()  # raises KeyError if none available
        except KeyError:
            # no browsers available
            time.sleep(0.5)
            continue

        # Set by the callback once _start_browsing_page has accepted the
        # browser; from then on this method must not stop or release it.
        handed_off = False
        try:
            browser.start()

            def callback(body, message):
                nonlocal handed_off
                self._start_browsing_page(browser, message, body['clientId'], body['url'], body['metadata'])
                handed_off = True

            consumer.callbacks = [callback]

            while True:
                try:
                    conn.drain_events(timeout=0.5)
                    break  # out of "while True" to acquire another browser
                except socket.timeout:
                    pass

                if should_stop():
                    break
        except Exception:
            self.logger.critical("problem with browser initialization", exc_info=True)
            time.sleep(0.5)
        finally:
            consumer.callbacks = None
            if not handed_off:
                # Nobody else owns this browser, so give it back here.
                try:
                    browser.stop()
                except Exception:
                    self.logger.warning("problem stopping unused browser", exc_info=True)
                self._browser_pool.release(browser)
def _wait_for_active_browsers(self, poll_interval=0.5):
    """
    Block until the set of active browsing threads is empty.

    Polls ``self._browsing_threads`` under ``self._browsing_threads_lock``.
    The sleep happens outside the lock so that other threads can take
    the lock in between polls.

    Args:
        poll_interval: seconds to sleep between polls (default 0.5).
    """
    self.logger.info("waiting for browsing threads to finish")
    while True:
        with self._browsing_threads_lock:
            if not self._browsing_threads:
                break
        time.sleep(poll_interval)
    self.logger.info("active browsing threads finished")
def _consume_amqp(self):
# XXX https://webarchive.jira.com/browse/ARI-3811
# After running for some amount of time (3 weeks in the latest case),
@ -92,65 +133,24 @@ class AmqpBrowserController:
# reopen the connection every 15 minutes
RECONNECT_AFTER_SECONDS = 15 * 60
url_queue = kombu.Queue(self.queue_name, routing_key=self.routing_key,
exchange=self._exchange)
url_queue = kombu.Queue(self.queue_name, exchange=self._exchange)
while not self._consumer_stop.is_set():
try:
self.logger.info("connecting to amqp exchange={} at {}".format(self._exchange.name, self.amqp_url))
with kombu.Connection(self.amqp_url) as conn:
conn_opened = time.time()
with conn.Consumer(url_queue) as consumer:
consumer.qos(prefetch_count=1)
browser = None
while not self._consumer_stop.is_set() and time.time() - conn_opened < RECONNECT_AFTER_SECONDS:
try:
browser = self._browser_pool.acquire() # raises KeyError if none available
browser.start()
consumer.callbacks = [self._make_callback(browser)]
while True:
try:
conn.drain_events(timeout=0.5)
break # out of "while True" to acquire another browser
except socket.timeout:
pass
if self._consumer_stop.is_set() or time.time() - conn_opened >= RECONNECT_AFTER_SECONDS:
browser.stop()
self._browser_pool.release(browser)
break
except KeyError:
# no browsers available
time.sleep(0.5)
except:
self.logger.critical("problem with browser initialization", exc_info=True)
time.sleep(0.5)
finally:
consumer.callbacks = None
self._wait_for_and_browse_urls(conn, consumer, timeout=RECONNECT_AFTER_SECONDS)
# need to wait for browsers to finish here, before closing
# the amqp connection, because they use it to do
# message.ack() after they finish browsing a page
self.logger.info("waiting for browsing threads to finish")
while True:
with self._browsing_threads_lock:
if len(self._browsing_threads) == 0:
break
time.sleep(0.5)
self.logger.info("browsing threads finished")
self._wait_for_active_browsers()
except BaseException as e:
self.logger.error("caught exception {}".format(e), exc_info=True)
time.sleep(0.5)
self.logger.error("attempting to reopen amqp connection")
# Build a message callback bound to `browser`: the returned
# callback(body, message) forwards the message, together with the body's
# 'clientId', 'url' and 'metadata' entries, to self._start_browsing_page.
def _make_callback(self, browser):
def callback(body, message):
self._start_browsing_page(browser, message, body['clientId'], body['url'], body['metadata'])
return callback
def _start_browsing_page(self, browser, message, client_id, url, parent_url_metadata):
def on_request(chrome_msg):
payload = chrome_msg['params']['request']
@ -161,7 +161,7 @@ class AmqpBrowserController:
publish = self._producer_conn.ensure(self._producer, self._producer.publish)
publish(payload, exchange=self._exchange, routing_key=client_id)
def browse_page_async():
def browse_page_sync():
self.logger.info('browser={} client_id={} url={}'.format(browser, client_id, url))
try:
browser.browse_page(url, on_request=on_request)
@ -176,13 +176,15 @@ class AmqpBrowserController:
browser.stop()
self._browser_pool.release(browser)
def browse_thread_run_then_cleanup():
browse_page_sync()
with self._browsing_threads_lock:
self._browsing_threads.remove(threading.current_thread())
import random
threadName = "BrowsingThread{}-{}".format(browser.chrome_port,
thread_name = "BrowsingThread{}-{}".format(browser.chrome_port,
''.join((random.choice('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789') for _ in range(6))))
th = threading.Thread(target=browse_page_async, name=threadName)
th = threading.Thread(target=browse_thread_run_then_cleanup, name=thread_name)
with self._browsing_threads_lock:
self._browsing_threads.add(th)
th.start()