mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 08:39:59 -05:00
get rid of --browser-wait and --routing-key in favor of sensible defaults, some other tweaks
This commit is contained in:
parent
a78e60f1da
commit
025db91dea
16
bin/umbra
16
bin/umbra
@ -19,8 +19,6 @@ if __name__=="__main__":
|
|||||||
arg_parser = argparse.ArgumentParser(prog=os.path.basename(__file__),
|
arg_parser = argparse.ArgumentParser(prog=os.path.basename(__file__),
|
||||||
description='umbra - browser automation tool communicating via AMQP',
|
description='umbra - browser automation tool communicating via AMQP',
|
||||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
||||||
arg_parser.add_argument('-w', '--browser-wait', dest='browser_wait', default='60',
|
|
||||||
help='Seconds to wait for browser initialization')
|
|
||||||
arg_parser.add_argument('-e', '--executable', dest='chrome_exe', default='chromium-browser',
|
arg_parser.add_argument('-e', '--executable', dest='chrome_exe', default='chromium-browser',
|
||||||
help='Executable to use to invoke chrome')
|
help='Executable to use to invoke chrome')
|
||||||
arg_parser.add_argument('-u', '--url', dest='amqp_url', default='amqp://guest:guest@localhost:5672/%2f',
|
arg_parser.add_argument('-u', '--url', dest='amqp_url', default='amqp://guest:guest@localhost:5672/%2f',
|
||||||
@ -29,9 +27,7 @@ if __name__=="__main__":
|
|||||||
help='AMQP exchange name')
|
help='AMQP exchange name')
|
||||||
arg_parser.add_argument('--queue', dest='amqp_queue', default='urls',
|
arg_parser.add_argument('--queue', dest='amqp_queue', default='urls',
|
||||||
help='AMQP queue to consume urls from')
|
help='AMQP queue to consume urls from')
|
||||||
arg_parser.add_argument('--routing-key', dest='amqp_routing_key', default='url',
|
arg_parser.add_argument('-n', '--max-browsers', dest='max_browsers', default='1',
|
||||||
help='AMQP routing key to bind to the AMQP queue')
|
|
||||||
arg_parser.add_argument('-n', '--max-browsers', dest='max_browsers', default='3',
|
|
||||||
help='Max number of chrome instances simultaneously browsing pages')
|
help='Max number of chrome instances simultaneously browsing pages')
|
||||||
arg_parser.add_argument('-v', '--verbose', dest='log_level',
|
arg_parser.add_argument('-v', '--verbose', dest='log_level',
|
||||||
action="store_const", default=logging.INFO, const=logging.DEBUG)
|
action="store_const", default=logging.INFO, const=logging.DEBUG)
|
||||||
@ -44,10 +40,9 @@ if __name__=="__main__":
|
|||||||
|
|
||||||
logging.info("umbra {} starting up".format(umbra.version))
|
logging.info("umbra {} starting up".format(umbra.version))
|
||||||
|
|
||||||
controller = umbra.Umbra(args.amqp_url, args.chrome_exe, args.browser_wait,
|
controller = umbra.Umbra(args.amqp_url, args.chrome_exe,
|
||||||
max_active_browsers=int(args.max_browsers),
|
max_active_browsers=int(args.max_browsers),
|
||||||
exchange_name=args.amqp_exchange, queue_name=args.amqp_queue,
|
exchange_name=args.amqp_exchange, queue_name=args.amqp_queue)
|
||||||
routing_key=args.amqp_routing_key)
|
|
||||||
|
|
||||||
class ShutdownRequested(Exception):
|
class ShutdownRequested(Exception):
|
||||||
pass
|
pass
|
||||||
@ -77,10 +72,15 @@ if __name__=="__main__":
|
|||||||
th.join()
|
th.join()
|
||||||
except BaseException as e:
|
except BaseException as e:
|
||||||
logging.warn("caught exception {}".format(e))
|
logging.warn("caught exception {}".format(e))
|
||||||
|
for i in range(6,0,-1):
|
||||||
controller.shutdown_now()
|
controller.shutdown_now()
|
||||||
|
try:
|
||||||
for th in threading.enumerate():
|
for th in threading.enumerate():
|
||||||
if th != threading.current_thread():
|
if th != threading.current_thread():
|
||||||
th.join()
|
th.join()
|
||||||
|
break # if we get here, we're done, all threads finished
|
||||||
|
except:
|
||||||
|
logging.warn("caught exception {}".format(e))
|
||||||
|
|
||||||
logging.info("all finished, exiting")
|
logging.info("all finished, exiting")
|
||||||
|
|
||||||
|
@ -20,12 +20,12 @@ class BrowserPool:
|
|||||||
|
|
||||||
BASE_PORT = 9200
|
BASE_PORT = 9200
|
||||||
|
|
||||||
def __init__(self, size=3, chrome_exe='chromium-browser', chrome_wait=60):
|
def __init__(self, size=3, chrome_exe='chromium-browser'):
|
||||||
self._available = set()
|
self._available = set()
|
||||||
self._in_use = set()
|
self._in_use = set()
|
||||||
|
|
||||||
for i in range(0, size):
|
for i in range(0, size):
|
||||||
browser = Browser(BrowserPool.BASE_PORT + i, chrome_exe, chrome_wait)
|
browser = Browser(BrowserPool.BASE_PORT + i, chrome_exe)
|
||||||
self._available.add(browser)
|
self._available.add(browser)
|
||||||
|
|
||||||
self._lock = threading.Lock()
|
self._lock = threading.Lock()
|
||||||
@ -61,11 +61,10 @@ class Browser:
|
|||||||
|
|
||||||
HARD_TIMEOUT_SECONDS = 20 * 60
|
HARD_TIMEOUT_SECONDS = 20 * 60
|
||||||
|
|
||||||
def __init__(self, chrome_port=9222, chrome_exe='chromium-browser', chrome_wait=60):
|
def __init__(self, chrome_port=9222, chrome_exe='chromium-browser'):
|
||||||
self.command_id = itertools.count(1)
|
self.command_id = itertools.count(1)
|
||||||
self.chrome_port = chrome_port
|
self.chrome_port = chrome_port
|
||||||
self.chrome_exe = chrome_exe
|
self.chrome_exe = chrome_exe
|
||||||
self.chrome_wait = chrome_wait
|
|
||||||
self._behavior = None
|
self._behavior = None
|
||||||
self._websock = None
|
self._websock = None
|
||||||
self._abort_browse_page = False
|
self._abort_browse_page = False
|
||||||
@ -84,8 +83,7 @@ class Browser:
|
|||||||
# these can raise exceptions
|
# these can raise exceptions
|
||||||
self._work_dir = tempfile.TemporaryDirectory()
|
self._work_dir = tempfile.TemporaryDirectory()
|
||||||
self._chrome_instance = Chrome(self.chrome_port, self.chrome_exe,
|
self._chrome_instance = Chrome(self.chrome_port, self.chrome_exe,
|
||||||
self.chrome_wait, self._work_dir.name,
|
self._work_dir.name, os.sep.join([self._work_dir.name, "chrome-user-data"]))
|
||||||
os.sep.join([self._work_dir.name, "chrome-user-data"]))
|
|
||||||
self._websocket_url = self._chrome_instance.start()
|
self._websocket_url = self._chrome_instance.start()
|
||||||
|
|
||||||
def stop(self):
|
def stop(self):
|
||||||
@ -119,7 +117,7 @@ class Browser:
|
|||||||
while True:
|
while True:
|
||||||
time.sleep(0.5)
|
time.sleep(0.5)
|
||||||
if not self._websock or not self._websock.sock or not self._websock.sock.connected:
|
if not self._websock or not self._websock.sock or not self._websock.sock.connected:
|
||||||
raise BrowsingException("websocket closed, did chrome die? {}".format(self._websock))
|
raise BrowsingException("websocket closed, did chrome die? {}".format(self._websocket_url))
|
||||||
elif time.time() - start > Browser.HARD_TIMEOUT_SECONDS:
|
elif time.time() - start > Browser.HARD_TIMEOUT_SECONDS:
|
||||||
self.logger.info("finished browsing page, reached hard timeout of {} seconds url={}".format(Browser.HARD_TIMEOUT_SECONDS, self.url))
|
self.logger.info("finished browsing page, reached hard timeout of {} seconds url={}".format(Browser.HARD_TIMEOUT_SECONDS, self.url))
|
||||||
return
|
return
|
||||||
@ -209,10 +207,9 @@ class Browser:
|
|||||||
class Chrome:
|
class Chrome:
|
||||||
logger = logging.getLogger(__module__ + "." + __qualname__)
|
logger = logging.getLogger(__module__ + "." + __qualname__)
|
||||||
|
|
||||||
def __init__(self, port, executable, browser_wait, user_home_dir, user_data_dir):
|
def __init__(self, port, executable, user_home_dir, user_data_dir):
|
||||||
self.port = port
|
self.port = port
|
||||||
self.executable = executable
|
self.executable = executable
|
||||||
self.browser_wait = browser_wait
|
|
||||||
self.user_home_dir = user_home_dir
|
self.user_home_dir = user_home_dir
|
||||||
self.user_data_dir = user_data_dir
|
self.user_data_dir = user_data_dir
|
||||||
|
|
||||||
@ -225,6 +222,7 @@ class Chrome:
|
|||||||
|
|
||||||
# returns websocket url to chrome window with about:blank loaded
|
# returns websocket url to chrome window with about:blank loaded
|
||||||
def start(self):
|
def start(self):
|
||||||
|
timeout_sec = 60
|
||||||
new_env = os.environ.copy()
|
new_env = os.environ.copy()
|
||||||
new_env["HOME"] = self.user_home_dir
|
new_env["HOME"] = self.user_home_dir
|
||||||
chrome_args = [self.executable,
|
chrome_args = [self.executable,
|
||||||
@ -257,7 +255,7 @@ class Chrome:
|
|||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
finally:
|
finally:
|
||||||
if time.time() - start > float(self.browser_wait):
|
if time.time() - start > timeout_sec:
|
||||||
raise Exception("failed to retrieve {} after {} seconds".format(json_url, time.time() - start))
|
raise Exception("failed to retrieve {} after {} seconds".format(json_url, time.time() - start))
|
||||||
else:
|
else:
|
||||||
time.sleep(0.5)
|
time.sleep(0.5)
|
||||||
|
@ -45,16 +45,13 @@ class AmqpBrowserController:
|
|||||||
logger = logging.getLogger(__module__ + "." + __qualname__)
|
logger = logging.getLogger(__module__ + "." + __qualname__)
|
||||||
|
|
||||||
def __init__(self, amqp_url='amqp://guest:guest@localhost:5672/%2f',
|
def __init__(self, amqp_url='amqp://guest:guest@localhost:5672/%2f',
|
||||||
chrome_exe='chromium-browser', browser_wait=60,
|
chrome_exe='chromium-browser', max_active_browsers=1,
|
||||||
max_active_browsers=1, queue_name='urls', routing_key='url',
|
queue_name='urls', exchange_name='umbra'):
|
||||||
exchange_name='umbra'):
|
|
||||||
self.amqp_url = amqp_url
|
self.amqp_url = amqp_url
|
||||||
self.queue_name = queue_name
|
self.queue_name = queue_name
|
||||||
self.routing_key = routing_key
|
|
||||||
self.exchange_name = exchange_name
|
self.exchange_name = exchange_name
|
||||||
|
|
||||||
self._browser_pool = BrowserPool(size=max_active_browsers,
|
self._browser_pool = BrowserPool(size=max_active_browsers, chrome_exe=chrome_exe)
|
||||||
chrome_exe=chrome_exe, chrome_wait=browser_wait)
|
|
||||||
|
|
||||||
def start(self):
|
def start(self):
|
||||||
self._browsing_threads = set()
|
self._browsing_threads = set()
|
||||||
@ -83,31 +80,19 @@ class AmqpBrowserController:
|
|||||||
self._browser_pool.shutdown_now()
|
self._browser_pool.shutdown_now()
|
||||||
self._consumer_thread.join()
|
self._consumer_thread.join()
|
||||||
|
|
||||||
def _consume_amqp(self):
|
def _wait_for_and_browse_urls(self, conn, consumer, timeout):
|
||||||
# XXX https://webarchive.jira.com/browse/ARI-3811
|
start = time.time()
|
||||||
# After running for some amount of time (3 weeks in the latest case),
|
|
||||||
# consumer looks normal but doesn't consume any messages. Not clear if
|
|
||||||
# it's hanging in drain_events() or not. As a temporary measure for
|
|
||||||
# mitigation (if it works) or debugging (if it doesn't work), close and
|
|
||||||
# reopen the connection every 15 minutes
|
|
||||||
RECONNECT_AFTER_SECONDS = 15 * 60
|
|
||||||
|
|
||||||
url_queue = kombu.Queue(self.queue_name, routing_key=self.routing_key,
|
|
||||||
exchange=self._exchange)
|
|
||||||
|
|
||||||
while not self._consumer_stop.is_set():
|
|
||||||
try:
|
|
||||||
self.logger.info("connecting to amqp exchange={} at {}".format(self._exchange.name, self.amqp_url))
|
|
||||||
with kombu.Connection(self.amqp_url) as conn:
|
|
||||||
conn_opened = time.time()
|
|
||||||
with conn.Consumer(url_queue) as consumer:
|
|
||||||
consumer.qos(prefetch_count=1)
|
|
||||||
browser = None
|
browser = None
|
||||||
while not self._consumer_stop.is_set() and time.time() - conn_opened < RECONNECT_AFTER_SECONDS:
|
consumer.qos(prefetch_count=1)
|
||||||
|
|
||||||
|
while not self._consumer_stop.is_set() and time.time() - start < timeout:
|
||||||
try:
|
try:
|
||||||
browser = self._browser_pool.acquire() # raises KeyError if none available
|
browser = self._browser_pool.acquire() # raises KeyError if none available
|
||||||
browser.start()
|
browser.start()
|
||||||
consumer.callbacks = [self._make_callback(browser)]
|
|
||||||
|
def callback(body, message):
|
||||||
|
self._start_browsing_page(browser, message, body['clientId'], body['url'], body['metadata'])
|
||||||
|
consumer.callbacks = [callback]
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
try:
|
try:
|
||||||
@ -116,7 +101,7 @@ class AmqpBrowserController:
|
|||||||
except socket.timeout:
|
except socket.timeout:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
if self._consumer_stop.is_set() or time.time() - conn_opened >= RECONNECT_AFTER_SECONDS:
|
if self._consumer_stop.is_set() or time.time() - start >= timeout:
|
||||||
browser.stop()
|
browser.stop()
|
||||||
self._browser_pool.release(browser)
|
self._browser_pool.release(browser)
|
||||||
break
|
break
|
||||||
@ -130,27 +115,42 @@ class AmqpBrowserController:
|
|||||||
finally:
|
finally:
|
||||||
consumer.callbacks = None
|
consumer.callbacks = None
|
||||||
|
|
||||||
# need to wait for browsers to finish here, before closing
|
def _wait_for_active_browsers(self):
|
||||||
# the amqp connection, because they use it to do
|
|
||||||
# message.ack() after they finish browsing a page
|
|
||||||
self.logger.info("waiting for browsing threads to finish")
|
self.logger.info("waiting for browsing threads to finish")
|
||||||
while True:
|
while True:
|
||||||
with self._browsing_threads_lock:
|
with self._browsing_threads_lock:
|
||||||
if len(self._browsing_threads) == 0:
|
if len(self._browsing_threads) == 0:
|
||||||
break
|
break
|
||||||
time.sleep(0.5)
|
time.sleep(0.5)
|
||||||
self.logger.info("browsing threads finished")
|
self.logger.info("active browsing threads finished")
|
||||||
|
|
||||||
|
def _consume_amqp(self):
|
||||||
|
# XXX https://webarchive.jira.com/browse/ARI-3811
|
||||||
|
# After running for some amount of time (3 weeks in the latest case),
|
||||||
|
# consumer looks normal but doesn't consume any messages. Not clear if
|
||||||
|
# it's hanging in drain_events() or not. As a temporary measure for
|
||||||
|
# mitigation (if it works) or debugging (if it doesn't work), close and
|
||||||
|
# reopen the connection every 15 minutes
|
||||||
|
RECONNECT_AFTER_SECONDS = 15 * 60
|
||||||
|
|
||||||
|
url_queue = kombu.Queue(self.queue_name, exchange=self._exchange)
|
||||||
|
|
||||||
|
while not self._consumer_stop.is_set():
|
||||||
|
try:
|
||||||
|
self.logger.info("connecting to amqp exchange={} at {}".format(self._exchange.name, self.amqp_url))
|
||||||
|
with kombu.Connection(self.amqp_url) as conn:
|
||||||
|
with conn.Consumer(url_queue) as consumer:
|
||||||
|
self._wait_for_and_browse_urls(conn, consumer, timeout=RECONNECT_AFTER_SECONDS)
|
||||||
|
|
||||||
|
# need to wait for browsers to finish here, before closing
|
||||||
|
# the amqp connection, because they use it to do
|
||||||
|
# message.ack() after they finish browsing a page
|
||||||
|
self._wait_for_active_browsers()
|
||||||
except BaseException as e:
|
except BaseException as e:
|
||||||
self.logger.error("caught exception {}".format(e), exc_info=True)
|
self.logger.error("caught exception {}".format(e), exc_info=True)
|
||||||
time.sleep(0.5)
|
time.sleep(0.5)
|
||||||
self.logger.error("attempting to reopen amqp connection")
|
self.logger.error("attempting to reopen amqp connection")
|
||||||
|
|
||||||
def _make_callback(self, browser):
|
|
||||||
def callback(body, message):
|
|
||||||
self._start_browsing_page(browser, message, body['clientId'], body['url'], body['metadata'])
|
|
||||||
return callback
|
|
||||||
|
|
||||||
def _start_browsing_page(self, browser, message, client_id, url, parent_url_metadata):
|
def _start_browsing_page(self, browser, message, client_id, url, parent_url_metadata):
|
||||||
def on_request(chrome_msg):
|
def on_request(chrome_msg):
|
||||||
payload = chrome_msg['params']['request']
|
payload = chrome_msg['params']['request']
|
||||||
@ -161,7 +161,7 @@ class AmqpBrowserController:
|
|||||||
publish = self._producer_conn.ensure(self._producer, self._producer.publish)
|
publish = self._producer_conn.ensure(self._producer, self._producer.publish)
|
||||||
publish(payload, exchange=self._exchange, routing_key=client_id)
|
publish(payload, exchange=self._exchange, routing_key=client_id)
|
||||||
|
|
||||||
def browse_page_async():
|
def browse_page_sync():
|
||||||
self.logger.info('browser={} client_id={} url={}'.format(browser, client_id, url))
|
self.logger.info('browser={} client_id={} url={}'.format(browser, client_id, url))
|
||||||
try:
|
try:
|
||||||
browser.browse_page(url, on_request=on_request)
|
browser.browse_page(url, on_request=on_request)
|
||||||
@ -176,13 +176,15 @@ class AmqpBrowserController:
|
|||||||
browser.stop()
|
browser.stop()
|
||||||
self._browser_pool.release(browser)
|
self._browser_pool.release(browser)
|
||||||
|
|
||||||
|
def browse_thread_run_then_cleanup():
|
||||||
|
browse_page_sync()
|
||||||
with self._browsing_threads_lock:
|
with self._browsing_threads_lock:
|
||||||
self._browsing_threads.remove(threading.current_thread())
|
self._browsing_threads.remove(threading.current_thread())
|
||||||
|
|
||||||
import random
|
import random
|
||||||
threadName = "BrowsingThread{}-{}".format(browser.chrome_port,
|
thread_name = "BrowsingThread{}-{}".format(browser.chrome_port,
|
||||||
''.join((random.choice('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789') for _ in range(6))))
|
''.join((random.choice('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789') for _ in range(6))))
|
||||||
th = threading.Thread(target=browse_page_async, name=threadName)
|
th = threading.Thread(target=browse_thread_run_then_cleanup, name=thread_name)
|
||||||
with self._browsing_threads_lock:
|
with self._browsing_threads_lock:
|
||||||
self._browsing_threads.add(th)
|
self._browsing_threads.add(th)
|
||||||
th.start()
|
th.start()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user