Move _find_available_port to chrome.py, changing the way it works so that browser:9200 doesn't get stuck at 9201 forever (which in turn pushes 9201 to 9202, etc.), and add a unit test

This commit is contained in:
Noah Levitt 2016-12-06 17:12:20 -08:00
parent 74009852d6
commit ce03381b92
4 changed files with 42 additions and 21 deletions

View File

@ -30,7 +30,6 @@ from brozzler.chrome import Chrome
from brozzler.behaviors import Behavior from brozzler.behaviors import Behavior
from requests.structures import CaseInsensitiveDict from requests.structures import CaseInsensitiveDict
import base64 import base64
import psutil
import sqlite3 import sqlite3
import datetime import datetime
@ -103,7 +102,9 @@ class Browser:
HARD_TIMEOUT_SECONDS = 20 * 60 HARD_TIMEOUT_SECONDS = 20 * 60
def __init__(self, chrome_port=9222, chrome_exe='chromium-browser', proxy=None, ignore_cert_errors=False): def __init__(
self, chrome_port=9222, chrome_exe='chromium-browser', proxy=None,
ignore_cert_errors=False):
self.command_id = itertools.count(1) self.command_id = itertools.count(1)
self.chrome_port = chrome_port self.chrome_port = chrome_port
self.chrome_exe = chrome_exe self.chrome_exe = chrome_exe
@ -130,7 +131,6 @@ class Browser:
def start(self, proxy=None, cookie_db=None): def start(self, proxy=None, cookie_db=None):
if not self._chrome_instance: if not self._chrome_instance:
# these can raise exceptions # these can raise exceptions
self.chrome_port = self._find_available_port()
self._work_dir = tempfile.TemporaryDirectory() self._work_dir = tempfile.TemporaryDirectory()
if cookie_db is not None: if cookie_db is not None:
cookie_dir = os.path.join( cookie_dir = os.path.join(
@ -199,23 +199,6 @@ class Browser:
cookie_location, exc_info=True) cookie_location, exc_info=True)
return cookie_db return cookie_db
def _find_available_port(self):
    '''
    Returns the lowest tcp port, starting at `self.chrome_port`, that no
    local tcp socket is currently using.

    Falls back to `self.chrome_port` unchanged if psutil is not permitted
    to list connections (`psutil.AccessDenied`) or if every port from
    `self.chrome_port` through 65535 is taken.
    '''
    port = self.chrome_port
    try:
        conns = psutil.net_connections(kind="tcp")
    except psutil.AccessDenied:
        return port

    # Collect local ports once instead of rescanning the connection list
    # for every candidate; entries with an empty laddr carry no port.
    ports_in_use = {conn.laddr[1] for conn in conns if conn.laddr}

    # 65535 is the highest valid tcp port, so the range has to end at 65536
    for p in range(port, 65536):
        if p not in ports_in_use:
            return p
        self.logger.warning(
                "port %s already open, will try %s", p, p+1)

    return port
def is_running(self): def is_running(self):
return bool(self._websocket_url) return bool(self._websocket_url)

View File

@ -29,6 +29,7 @@ import signal
import sqlite3 import sqlite3
import datetime import datetime
import json import json
import psutil
class Chrome: class Chrome:
logger = logging.getLogger(__module__ + '.' + __qualname__) logger = logging.getLogger(__module__ + '.' + __qualname__)
@ -53,6 +54,24 @@ class Chrome:
def __exit__(self, *args): def __exit__(self, *args):
self.stop() self.stop()
def _find_available_port(self, default_port=9200):
    '''
    Chooses the tcp port to use, preferring `default_port`.

    Returns `default_port` if no local tcp socket is using it. Otherwise
    scans from 9999 down to 9000, logs a warning, and returns the first
    port that is not in use.

    Also returns `default_port` if psutil is not permitted to list
    connections (`psutil.AccessDenied`), or if every port in 9000-9999 is
    taken.
    '''
    try:
        conns = psutil.net_connections(kind='tcp')
    except psutil.AccessDenied:
        return default_port

    # Collect local ports once instead of rescanning the connection list
    # for every candidate; entries with an empty laddr carry no port.
    ports_in_use = {conn.laddr[1] for conn in conns if conn.laddr}

    # Keep the requested port whenever it is free; only look for an
    # alternative when something else already holds it.
    if default_port not in ports_in_use:
        return default_port

    for p in range(9999, 8999, -1):
        if p not in ports_in_use:
            self.logger.warning(
                    'port %s already in use, using %s instead',
                    default_port, p)
            return p

    return default_port
def start(self): def start(self):
''' '''
Returns websocket url to chrome window with about:blank loaded. Returns websocket url to chrome window with about:blank loaded.
@ -60,6 +79,7 @@ class Chrome:
timeout_sec = 600 timeout_sec = 600
new_env = os.environ.copy() new_env = os.environ.copy()
new_env['HOME'] = self.user_home_dir new_env['HOME'] = self.user_home_dir
self.port = self._find_available_port(self.port)
chrome_args = [ chrome_args = [
self.executable, '--use-mock-keychain', # mac thing self.executable, '--use-mock-keychain', # mac thing
'--user-data-dir=%s' % self.user_data_dir, '--user-data-dir=%s' % self.user_data_dir,

View File

@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup( setuptools.setup(
name='brozzler', name='brozzler',
version='1.1b8.dev131', version='1.1b8.dev132',
description='Distributed web crawling with browsers', description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler', url='https://github.com/internetarchive/brozzler',
author='Noah Levitt', author='Noah Levitt',

View File

@ -22,6 +22,9 @@ import http.server
import threading import threading
import os import os
import brozzler import brozzler
import brozzler.chrome
import socket
import logging
@pytest.fixture(scope='module') @pytest.fixture(scope='module')
def httpd(request): def httpd(request):
@ -52,3 +55,18 @@ def test_robots(httpd):
site = brozzler.Site(seed=url, user_agent='im/a bAdBOt/uh huh') site = brozzler.Site(seed=url, user_agent='im/a bAdBOt/uh huh')
assert not brozzler.is_permitted_by_robots(site, url) assert not brozzler.is_permitted_by_robots(site, url)
def test_find_available_port():
    '''
    Checks that Chrome._find_available_port() keeps the requested port
    while it is free, switches to 9999 while the requested port is held by
    another socket, and goes back to the requested port once it is
    released.

    Assumes ports 9800 and 9999 are otherwise unused on the test host.
    '''
    # imported here so the test does not rely on a module-level import
    import psutil

    try:
        psutil.net_connections(kind='tcp')
    except psutil.AccessDenied:
        logging.warning(
                'skipping _find_available_port() test because '
                'psutil.net_connections(kind="tcp") raised AccessDenied')
        return

    # The method logs through self.logger when it has to pick another
    # port, so it needs a real instance; __new__ skips __init__ and the
    # class-level logger is still available.
    chrome = brozzler.chrome.Chrome.__new__(brozzler.chrome.Chrome)

    assert chrome._find_available_port(9800) == 9800

    sock = socket.socket()
    try:
        sock.bind(('localhost', 9800))
        # NOTE(review): presumably a socket that is bound but not listening
        # is not reported by psutil on every platform -- listen so the port
        # reliably shows up as in use; confirm on the target OS
        sock.listen(0)
        assert chrome._find_available_port(9800) == 9999
    finally:
        # release the port even if the assertion above fails
        sock.close()

    assert chrome._find_available_port(9800) == 9800