Merge branch 'master' into qa

* master:
  let the OS pick an available port, to avoid what appear to be timing issues causing multiple browsers to choose the same port
This commit is contained in:
Noah Levitt 2017-02-22 12:44:27 -08:00
commit cb75bb6e04
4 changed files with 19 additions and 55 deletions

View File

@ -30,6 +30,7 @@ import datetime
import base64
from brozzler.chrome import Chrome
import surt
import socket
class BrowsingException(Exception):
    '''
    Plain `Exception` subclass with no extra behavior; `BrowsingTimeout`
    in this module derives from it.
    '''
@ -41,10 +42,12 @@ class BrowsingTimeout(BrowsingException):
pass
class BrowserPool:
'''
Manages pool of browsers. Automatically chooses available port for the
debugging protocol.
'''
logger = logging.getLogger(__module__ + '.' + __qualname__)
BASE_PORT = 9200
def __init__(self, size=3, **kwargs):
'''
Initializes the pool.
@ -54,13 +57,8 @@ class BrowserPool:
**kwargs: arguments for Browser(...)
'''
self.size = size
self._available = set()
self.kwargs = kwargs
self._in_use = set()
for i in range(0, size):
browser = Browser(port=BrowserPool.BASE_PORT + i, **kwargs)
self._available.add(browser)
self._lock = threading.Lock()
def acquire(self):
@ -74,16 +72,22 @@ class BrowserPool:
NoBrowsersAvailable if none available
'''
with self._lock:
try:
browser = self._available.pop()
except KeyError:
if len(self._in_use) >= self.size:
raise NoBrowsersAvailable
# choose available port
sock = socket.socket()
sock.bind(('0.0.0.0', 0))
port = sock.getsockname()[1]
sock.close()
browser = Browser(port=port, **self.kwargs)
self._in_use.add(browser)
return browser
def release(self, browser):
browser.stop() # make sure
with self._lock:
self._available.add(browser)
self._in_use.remove(browser)
def shutdown_now(self):
@ -91,13 +95,12 @@ class BrowserPool:
'shutting down browser pool (%s browsers in use)',
len(self._in_use))
with self._lock:
for browser in self._available:
browser.stop()
for browser in self._in_use:
browser.stop()
self._in_use.clear()
def num_available(self):
return len(self._available)
return self.size - len(self._in_use)
def num_in_use(self):
    '''
    Returns the number of entries currently held in `self._in_use`.
    '''
    busy = self._in_use
    return len(busy)

View File

@ -28,7 +28,6 @@ import re
import signal
import sqlite3
import json
import psutil
import tempfile
class Chrome:
@ -61,24 +60,6 @@ class Chrome:
def __exit__(self, *args):
    '''
    Context-manager exit hook: calls `self.stop()`.

    The exception details passed in `*args` are ignored and `None` is
    returned, so an exception raised inside the `with` body propagates.
    '''
    self.stop()
    return None
def _find_available_port(self, default_port=9200):
    '''
    Picks a port that is not the local port of any TCP connection reported
    by `psutil.net_connections(kind='tcp')`.

    Args:
        default_port: preferred port; returned unchanged when it is not in
            use, when psutil raises AccessDenied, or when every port in
            the fallback range is in use

    Returns:
        `default_port` when possible, otherwise the highest port in the
        range 9999 down to 9000 that is not in use
    '''
    try:
        conns = psutil.net_connections(kind='tcp')
    except psutil.AccessDenied:
        # connection table can't be inspected; keep the requested port
        return default_port

    # Gather the local ports once so that each candidate below is a set
    # lookup instead of another pass over every connection.
    ports_in_use = {conn.laddr[1] for conn in conns}

    if default_port not in ports_in_use:
        return default_port

    for p in range(9999, 8999, -1):
        if p not in ports_in_use:
            # Logger.warn() is a deprecated alias of Logger.warning()
            self.logger.warning(
                    'port %s already in use, using %s instead',
                    default_port, p)
            return p

    # no free port in the fallback range; fall back to the requested one
    return default_port
def _init_cookie_db(self, cookie_db):
cookie_dir = os.path.join(self._chrome_user_data_dir, 'Default')
cookie_location = os.path.join(cookie_dir, 'Cookies')
@ -140,7 +121,6 @@ class Chrome:
new_env = os.environ.copy()
new_env['HOME'] = self._home_tmpdir.name
self.port = self._find_available_port(self.port)
chrome_args = [
self.chrome_exe,
'--remote-debugging-port=%s' % self.port,

View File

@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup(
name='brozzler',
version='1.1b9.dev192',
version='1.1b9.dev193',
description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler',
author='Noah Levitt',
@ -71,7 +71,6 @@ setuptools.setup(
'surt>=0.3.0',
'rethinkstuff>=0.1.5',
'rethinkdb>=2.3,<2.4',
'psutil==4.3.0',
'cerberus==1.0.1',
'jinja2',
],

View File

@ -25,7 +25,6 @@ import brozzler
import brozzler.chrome
import socket
import logging
import psutil
import yaml
@pytest.fixture(scope='module')
@ -57,23 +56,6 @@ def test_robots(httpd):
site = brozzler.Site(seed=url, user_agent='im/a bAdBOt/uh huh')
assert not brozzler.is_permitted_by_robots(site, url)
def test_find_available_port():
    '''
    Checks Chrome._find_available_port(9800): it returns 9800 while that
    port is unbound, a port >= 9990 while a socket is listening on 9800,
    and 9800 again once that socket is closed.

    Returns early (after logging a warning) when psutil is not permitted
    to list TCP connections.
    '''
    x = brozzler.chrome.Chrome(None, None)
    try:
        psutil.net_connections(kind='tcp')
    except psutil.AccessDenied:
        # logging.warn() is a deprecated alias of logging.warning()
        logging.warning(
                'skipping _find_available_port() test because '
                'psutil.net_connections(kind="tcp") raised AccessDenied')
        return

    assert x._find_available_port(9800) == 9800

    # The with-block closes the socket even if the assertion inside fails,
    # so a failing run cannot leave port 9800 bound.
    with socket.socket() as sock:
        sock.bind(('localhost', 9800))
        sock.listen(0)
        assert x._find_available_port(9800) >= 9990

    assert x._find_available_port(9800) == 9800
def test_scoping():
test_scope = yaml.load('''
max_hops: 100