Make umbra amenable to clustering by using a pool of n browsers and removing the browser-clientId affinity (which is not useful currently, since we start a fresh browser instance for each page browsed). Also set prefetch_count=1 on the amqp consumers so that incoming urls are distributed round-robin among umbra instances.

This commit is contained in:
Noah Levitt 2014-05-23 21:59:34 -07:00
parent 8d269f4c56
commit 2c4ba005b5
4 changed files with 94 additions and 70 deletions

View File

@ -37,6 +37,7 @@ queue = Queue(args.amqp_queue, exchange=exchange)
try:
with Connection(args.amqp_url) as conn:
with conn.Consumer(queue, callbacks=[print_and_maybe_ack]) as consumer:
consumer.qos(prefetch_count=1)
while True:
try:
conn.drain_events(timeout=0.5)

View File

@ -8,7 +8,7 @@ import umbra
import sys
import signal
import os
import umbra.controller
import umbra
if __name__=="__main__":
import faulthandler
@ -38,18 +38,19 @@ if __name__=="__main__":
logging.basicConfig(stream=sys.stdout, level=args.log_level,
format='%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s')
controller = umbra.controller.AmqpBrowserController(args.amqp_url,
args.chrome_exe, args.browser_wait,
umbra = umbra.Umbra(args.amqp_url, args.chrome_exe, args.browser_wait,
max_active_browsers=int(args.max_browsers),
exchange_name=args.amqp_exchange, queue_name=args.amqp_queue,
routing_key=args.amqp_routing_key)
umbra.start()
try:
while True:
time.sleep(0.5)
except:
pass
finally:
controller.shutdown()
umbra.shutdown()

View File

@ -12,8 +12,44 @@ import subprocess
import signal
import tempfile
import os
import socket
from umbra.behaviors import Behavior
class BrowserPool:
    """Fixed-size pool of Browser objects, each tied to its own local port.

    While a browser is idle in the pool, the pool keeps a socket bound to
    that browser's port on 127.0.0.1 so that nothing else claims the port.
    The socket is closed when the browser is handed out by acquire() and
    the port is bound again when the browser comes back through release().
    """

    def __init__(self, size=3, chrome_exe='chromium-browser', chrome_wait=60):
        """Create `size` browsers, each assigned a port chosen by the OS.

        `chrome_exe` and `chrome_wait` are passed through to Browser.
        """
        # Create the lock first so the pool never exists without it.
        self._lock = threading.Lock()
        # Set of (browser, port_holder) pairs; port_holder is the socket
        # reserving the browser's port, or None if it could not be bound.
        self._available = set()
        for _ in range(size):
            port_holder = self._grab_random_port()
            try:
                browser = Browser(port_holder.getsockname()[1], chrome_exe,
                                  chrome_wait)
            except BaseException:
                # Do not leak the reserved port if the browser object
                # cannot be constructed.
                port_holder.close()
                raise
            self._available.add((browser, port_holder))

    def _grab_random_port(self):
        """Returns socket bound to some port."""
        return self._hold_port(0)

    def _hold_port(self, port):
        """Returns socket bound to supplied port.

        Raises OSError if the port cannot be bound; the socket is closed
        in that case.
        """
        sock = socket.socket()
        try:
            sock.bind(('127.0.0.1', port))
        except BaseException:
            sock.close()
            raise
        return sock

    def acquire(self):
        """Returns browser from pool if available, raises KeyError otherwise."""
        with self._lock:
            browser, port_holder = self._available.pop()
        # Free the port so the browser itself can listen on it.
        if port_holder is not None:
            port_holder.close()
        return browser

    def release(self, browser):
        """Return `browser` to the pool and try to reserve its port again.

        If the port cannot be bound right now, the browser is still put
        back (without a reservation) instead of being lost from the pool.

        Raises ValueError if `browser` is already in the pool.
        """
        with self._lock:
            if any(pooled is browser for pooled, _ in self._available):
                raise ValueError(
                    'browser {} is already in the pool'.format(browser))
            try:
                port_holder = self._hold_port(browser.chrome_port)
            except OSError as e:
                import logging
                logging.getLogger(__name__).warning(
                    'could not reserve port %s for idle browser: %s',
                    browser.chrome_port, e)
                port_holder = None
            self._available.add((browser, port_holder))
class Browser:
"""Runs chrome/chromium to synchronously browse one page at a time using
worker.browse_page(). Currently the implementation starts up a new instance
@ -143,11 +179,6 @@ class Chrome:
self.browser_wait = browser_wait
self.user_data_dir = user_data_dir
def fetch_debugging_json():
raw_json = urllib.request.urlopen("http://localhost:%s/json" % self.port).read()
json = raw_json.decode('utf-8')
return json.loads(json)
# returns websocket url to chrome window with about:blank loaded
def __enter__(self):
chrome_args = [self.executable,

View File

@ -5,7 +5,7 @@ import logging
import time
import threading
import kombu
from umbra.browser import Browser
from umbra.browser import BrowserPool
class AmqpBrowserController:
"""
@ -47,36 +47,35 @@ class AmqpBrowserController:
chrome_exe='chromium-browser', browser_wait=60,
max_active_browsers=1, queue_name='urls', routing_key='url',
exchange_name='umbra'):
self.amqp_url = amqp_url
self.chrome_exe = chrome_exe
self.browser_wait = browser_wait
self.max_active_browsers = max_active_browsers
self.queue_name = queue_name
self.routing_key = routing_key
self.exchange_name = exchange_name
self._exchange = kombu.Exchange(name=self.exchange_name, type='direct', durable=True)
self.producer = None
self.producer_lock = threading.Lock()
with self.producer_lock:
self.producer_conn = kombu.Connection(self.amqp_url)
self.producer = self.producer_conn.Producer(serializer='json')
self._browser_pool = BrowserPool(size=max_active_browsers,
chrome_exe=chrome_exe, chrome_wait=browser_wait)
self.browsers = {}
self.browsers_lock = threading.Lock()
self.num_active_browsers = 0
self.amqp_thread = threading.Thread(target=self._consume_amqp)
self.amqp_stop = threading.Event()
self.amqp_thread.start()
def start(self):
    """Set up the amqp producer and launch the consumer thread.

    Creates the durable direct exchange object, opens a connection with a
    json-serializing producer (both assigned while holding
    ``_producer_lock``), then starts a thread running ``_consume_amqp``
    together with the event used to ask that thread to stop.
    """
    self._exchange = kombu.Exchange(
        name=self.exchange_name, type='direct', durable=True)

    self._producer = None
    self._producer_lock = threading.Lock()
    with self._producer_lock:
        connection = kombu.Connection(self.amqp_url)
        self._producer_conn = connection
        self._producer = connection.Producer(serializer='json')

    consumer_thread = threading.Thread(target=self._consume_amqp)
    self._amqp_thread = consumer_thread
    self._amqp_stop = threading.Event()
    consumer_thread.start()
def shutdown(self):
self.logger.info("shutting down amqp consumer {}".format(self.amqp_url))
self.amqp_stop.set()
self.amqp_thread.join()
with self.producer_lock:
self.producer_conn.close()
self.producer_conn = None
self._amqp_stop.set()
self._amqp_thread.join()
with self._producer_lock:
self._producer_conn.close()
self._producer_conn = None
def _consume_amqp(self):
# XXX https://webarchive.jira.com/browse/ARI-3811
@ -87,66 +86,58 @@ class AmqpBrowserController:
# reopen the connection every 15 minutes
RECONNECT_AFTER_SECONDS = 15 * 60
while not self.amqp_stop.is_set():
browser = None
while not self._amqp_stop.is_set():
try:
url_queue = kombu.Queue(self.queue_name, routing_key=self.routing_key, exchange=self._exchange)
self.logger.info("connecting to amqp exchange={} at {}".format(self._exchange.name, self.amqp_url))
with kombu.Connection(self.amqp_url) as conn:
conn_opened = time.time()
with conn.Consumer(url_queue, callbacks=[self._browse_page_requested]) as consumer:
import socket
while (not self.amqp_stop.is_set() and time.time() - conn_opened < RECONNECT_AFTER_SECONDS):
with conn.Consumer(url_queue) as consumer:
consumer.qos(prefetch_count=1)
while (not self._amqp_stop.is_set() and time.time() - conn_opened < RECONNECT_AFTER_SECONDS):
import socket
try:
if self.num_active_browsers < self.max_active_browsers:
conn.drain_events(timeout=0.5)
else:
time.sleep(0.5)
except socket.timeout:
browser = self._browser_pool.acquire()
consumer.callbacks = [self._make_callback(browser)]
conn.drain_events(timeout=0.5)
consumer.callbacks = None
except KeyError:
# no browsers available
pass
except socket.timeout:
# no urls in the queue
self._browser_pool.release(browser)
except BaseException as e:
self.logger.error("amqp exception {}".format(e))
time.sleep(0.5)
self.logger.error("attempting to reopen amqp connection")
def _browse_page_requested(self, body, message):
def _make_callback(self, browser):
def callback(body, message):
self._browse_page(browser, body['clientId'], body['url'], body['metadata'])
message.ack()
return callback
def _browse_page(self, browser, client_id, url, parent_url_metadata):
"""Kombu Consumer callback. Provisions a Browser and
asynchronously asks it to browse the requested url."""
client_id = body['clientId']
def on_request(chrome_msg):
payload = chrome_msg['params']['request']
payload['parentUrl'] = body['url']
payload['parentUrlMetadata'] = body['metadata']
payload['parentUrl'] = url
payload['parentUrlMetadata'] = parent_url_metadata
self.logger.debug('sending to amqp exchange={} routing_key={} payload={}'.format(self.exchange_name, client_id, payload))
with self.producer_lock:
publish = self.producer_conn.ensure(self.producer, self.producer.publish)
with self._producer_lock:
publish = self._producer_conn.ensure(self._producer, self._producer.publish)
publish(payload, exchange=self._exchange, routing_key=client_id)
with self.browsers_lock:
if client_id in self.browsers:
browser = self.browsers[client_id]
else:
# XXX should reuse ports
port = 9222 + len(self.browsers)
browser = Browser(chrome_port=port, chrome_exe=self.chrome_exe,
chrome_wait=self.browser_wait)
self.browsers[client_id] = browser
def browse_page_async():
self.logger.info('client_id={} body={}'.format(client_id, body))
while True:
with self.browsers_lock:
if self.num_active_browsers < self.max_active_browsers:
self.num_active_browsers += 1
break
time.sleep(0.5)
browser.browse_page(body['url'], on_request=on_request)
with self.browsers_lock:
self.num_active_browsers -= 1
self.logger.info('browser={} client_id={} url={}'.format(browser, client_id, url))
browser.browse_page(url, on_request=on_request)
self._browser_pool.release(browser)
threading.Thread(target=browse_page_async).start()
message.ack()