diff --git a/brozzler/browser.py b/brozzler/browser.py
index 62ef813..a8cdd1f 100644
--- a/brozzler/browser.py
+++ b/brozzler/browser.py
@@ -30,7 +30,6 @@ from brozzler.chrome import Chrome
 from brozzler.behaviors import Behavior
 from requests.structures import CaseInsensitiveDict
 import base64
-import psutil
 import sqlite3
 import datetime
 
@@ -103,7 +102,9 @@ class Browser:
 
     HARD_TIMEOUT_SECONDS = 20 * 60
 
-    def __init__(self, chrome_port=9222, chrome_exe='chromium-browser', proxy=None, ignore_cert_errors=False):
+    def __init__(
+            self, chrome_port=9222, chrome_exe='chromium-browser', proxy=None,
+            ignore_cert_errors=False):
         self.command_id = itertools.count(1)
         self.chrome_port = chrome_port
         self.chrome_exe = chrome_exe
@@ -130,7 +131,6 @@ class Browser:
     def start(self, proxy=None, cookie_db=None):
         if not self._chrome_instance:
             # these can raise exceptions
-            self.chrome_port = self._find_available_port()
             self._work_dir = tempfile.TemporaryDirectory()
             if cookie_db is not None:
                 cookie_dir = os.path.join(
@@ -199,23 +199,6 @@ class Browser:
                     cookie_location, exc_info=True)
         return cookie_db
 
-    def _find_available_port(self):
-        port_available = False
-        port = self.chrome_port
-
-        try:
-            conns = psutil.net_connections(kind="tcp")
-        except psutil.AccessDenied:
-            return port
-
-        for p in range(port, 65535):
-            if any(connection.laddr[1] == p for connection in conns):
-                self.logger.warn("port %s already open, will try %s", p, p+1)
-            else:
-                port = p
-                break
-        return port
-
     def is_running(self):
         return bool(self._websocket_url)
 
diff --git a/brozzler/chrome.py b/brozzler/chrome.py
index 9b4d816..96ae9cd 100644
--- a/brozzler/chrome.py
+++ b/brozzler/chrome.py
@@ -29,6 +29,7 @@ import signal
 import sqlite3
 import datetime
 import json
+import psutil
 
 class Chrome:
     logger = logging.getLogger(__module__ + '.' + __qualname__)
@@ -53,6 +54,34 @@ class Chrome:
     def __exit__(self, *args):
         self.stop()
 
+    def _find_available_port(self, default_port=9200):
+        '''
+        Returns `default_port` if no tcp socket is using it, otherwise the
+        highest unused port in the range 9999 down to 9000.
+
+        Falls back to `default_port` if psutil is denied access to the
+        list of connections, or if every port in that range is in use.
+        '''
+        try:
+            conns = psutil.net_connections(kind='tcp')
+        except psutil.AccessDenied:
+            return default_port
+
+        # collect the local ports once instead of rescanning conns per port
+        ports_in_use = {conn.laddr[1] for conn in conns if conn.laddr}
+
+        if default_port not in ports_in_use:
+            return default_port
+
+        for p in range(9999, 8999, -1):
+            if p not in ports_in_use:
+                self.logger.warning(
+                        'port %s already in use, using %s instead',
+                        default_port, p)
+                return p
+
+        return default_port
+
     def start(self):
         '''
         Returns websocket url to chrome window with about:blank loaded.
@@ -60,6 +89,7 @@ class Chrome:
         timeout_sec = 600
         new_env = os.environ.copy()
         new_env['HOME'] = self.user_home_dir
+        self.port = self._find_available_port(self.port)
         chrome_args = [
                 self.executable, '--use-mock-keychain', # mac thing
                 '--user-data-dir=%s' % self.user_data_dir,
diff --git a/setup.py b/setup.py
index 545f560..08748a2 100644
--- a/setup.py
+++ b/setup.py
@@ -32,7 +32,7 @@ def find_package_data(package):
 
 setuptools.setup(
         name='brozzler',
-        version='1.1b8.dev131',
+        version='1.1b8.dev132',
         description='Distributed web crawling with browsers',
         url='https://github.com/internetarchive/brozzler',
         author='Noah Levitt',
diff --git a/tests/test_units.py b/tests/test_units.py
index 2fee049..acd5fd0 100644
--- a/tests/test_units.py
+++ b/tests/test_units.py
@@ -22,6 +22,10 @@ import http.server
 import threading
 import os
 import brozzler
+import brozzler.chrome
+import socket
+import logging
+import psutil
 
 @pytest.fixture(scope='module')
 def httpd(request):
@@ -52,3 +56,26 @@ def test_robots(httpd):
     site = brozzler.Site(seed=url, user_agent='im/a bAdBOt/uh huh')
     assert not brozzler.is_permitted_by_robots(site, url)
 
+def test_find_available_port():
+    try:
+        psutil.net_connections(kind='tcp')
+    except psutil.AccessDenied:
+        logging.warning(
+                'skipping _find_available_port() test because '
+                'psutil.net_connections(kind="tcp") raised AccessDenied')
+        return
+    # bare instance (skips Chrome.__init__); unlike None it still provides
+    # the class-level logger that _find_available_port() logs to
+    chrome = brozzler.chrome.Chrome.__new__(brozzler.chrome.Chrome)
+    assert chrome._find_available_port(9800) == 9800
+    sock = socket.socket()
+    try:
+        sock.bind(('localhost', 9800))
+        # a socket that is only bound may not be reported by
+        # psutil.net_connections() on every platform, so listen as well
+        sock.listen(0)
+        assert chrome._find_available_port(9800) == 9999
+    finally:
+        sock.close()
+    assert chrome._find_available_port(9800) == 9800
+