diff --git a/brozzler/browser.py b/brozzler/browser.py index 41d9ae7..27b2b23 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -30,6 +30,7 @@ import datetime import base64 from brozzler.chrome import Chrome import surt +import socket class BrowsingException(Exception): pass @@ -41,10 +42,12 @@ class BrowsingTimeout(BrowsingException): pass class BrowserPool: + ''' + Manages pool of browsers. Automatically chooses available port for the + debugging protocol. + ''' logger = logging.getLogger(__module__ + '.' + __qualname__) - BASE_PORT = 9200 - def __init__(self, size=3, **kwargs): ''' Initializes the pool. @@ -54,13 +57,8 @@ class BrowserPool: **kwargs: arguments for Browser(...) ''' self.size = size - self._available = set() + self.kwargs = kwargs self._in_use = set() - - for i in range(0, size): - browser = Browser(port=BrowserPool.BASE_PORT + i, **kwargs) - self._available.add(browser) - self._lock = threading.Lock() def acquire(self): @@ -74,16 +72,22 @@ class BrowserPool: NoBrowsersAvailable if none available ''' with self._lock: - try: - browser = self._available.pop() - except KeyError: + if len(self._in_use) >= self.size: raise NoBrowsersAvailable + + # choose available port + sock = socket.socket() + sock.bind(('0.0.0.0', 0)) + port = sock.getsockname()[1] + sock.close() + + browser = Browser(port=port, **self.kwargs) self._in_use.add(browser) return browser def release(self, browser): + browser.stop() # make sure with self._lock: - self._available.add(browser) self._in_use.remove(browser) def shutdown_now(self): @@ -91,13 +95,12 @@ class BrowserPool: 'shutting down browser pool (%s browsers in use)', len(self._in_use)) with self._lock: - for browser in self._available: - browser.stop() for browser in self._in_use: browser.stop() + self._in_use.clear() def num_available(self): - return len(self._available) + return self.size - len(self._in_use) def num_in_use(self): return len(self._in_use) diff --git a/brozzler/chrome.py b/brozzler/chrome.py index 8924950..87316cf 100644 --- a/brozzler/chrome.py +++ b/brozzler/chrome.py @@ -28,7 +28,6 @@ import re import signal import sqlite3 import json -import psutil import tempfile class Chrome: @@ -61,24 +60,6 @@ class Chrome: def __exit__(self, *args): self.stop() - def _find_available_port(self, default_port=9200): - try: - conns = psutil.net_connections(kind='tcp') - except psutil.AccessDenied: - return default_port - - if not any(conn.laddr[1] == default_port for conn in conns): - return default_port - - for p in range(9999,8999,-1): - if not any(conn.laddr[1] == p for conn in conns): - self.logger.warn( - 'port %s already in use, using %s instead', - default_port, p) - return p - - return default_port - def _init_cookie_db(self, cookie_db): cookie_dir = os.path.join(self._chrome_user_data_dir, 'Default') cookie_location = os.path.join(cookie_dir, 'Cookies') @@ -140,7 +121,6 @@ class Chrome: new_env = os.environ.copy() new_env['HOME'] = self._home_tmpdir.name - self.port = self._find_available_port(self.port) chrome_args = [ self.chrome_exe, '--remote-debugging-port=%s' % self.port, diff --git a/setup.py b/setup.py index 7869965..809a848 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.1b9.dev192', + version='1.1b9.dev193', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', @@ -71,7 +71,6 @@ setuptools.setup( 'surt>=0.3.0', 'rethinkstuff>=0.1.5', 'rethinkdb>=2.3,<2.4', - 'psutil==4.3.0', 'cerberus==1.0.1', 'jinja2', ], diff --git a/tests/test_units.py b/tests/test_units.py index fc24a99..c5c8869 100644 --- a/tests/test_units.py +++ b/tests/test_units.py @@ -25,7 +25,6 @@ import brozzler import brozzler.chrome import socket import logging -import psutil import yaml @pytest.fixture(scope='module') @@ -57,23 +56,6 @@ def test_robots(httpd): site = brozzler.Site(seed=url, user_agent='im/a bAdBOt/uh huh') assert not brozzler.is_permitted_by_robots(site, url) -def test_find_available_port(): - x = brozzler.chrome.Chrome(None, None) - try: - psutil.net_connections(kind='tcp') - except psutil.AccessDenied: - logging.warn( - 'skipping _find_available_port() test because ' - 'psutil.net_connections(kind="tcp") raised AccessDenied') - return - assert x._find_available_port(9800) == 9800 - sock = socket.socket() - sock.bind(('localhost', 9800)) - sock.listen(0) - assert x._find_available_port(9800) >= 9990 - sock.close() - assert x._find_available_port(9800) == 9800 - def test_scoping(): test_scope = yaml.load(''' max_hops: 100