mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-06-20 04:44:12 -04:00
let the OS pick an available port, to avoid what appear to be timing issues causing multiple browsers to choose the same port
This commit is contained in:
parent
3c4ab834da
commit
2398031010
4 changed files with 19 additions and 55 deletions
|
@ -30,6 +30,7 @@ import datetime
|
||||||
import base64
|
import base64
|
||||||
from brozzler.chrome import Chrome
|
from brozzler.chrome import Chrome
|
||||||
import surt
|
import surt
|
||||||
|
import socket
|
||||||
|
|
||||||
class BrowsingException(Exception):
|
class BrowsingException(Exception):
|
||||||
pass
|
pass
|
||||||
|
@ -41,10 +42,12 @@ class BrowsingTimeout(BrowsingException):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
class BrowserPool:
|
class BrowserPool:
|
||||||
|
'''
|
||||||
|
Manages pool of browsers. Automatically chooses available port for the
|
||||||
|
debugging protocol.
|
||||||
|
'''
|
||||||
logger = logging.getLogger(__module__ + '.' + __qualname__)
|
logger = logging.getLogger(__module__ + '.' + __qualname__)
|
||||||
|
|
||||||
BASE_PORT = 9200
|
|
||||||
|
|
||||||
def __init__(self, size=3, **kwargs):
|
def __init__(self, size=3, **kwargs):
|
||||||
'''
|
'''
|
||||||
Initializes the pool.
|
Initializes the pool.
|
||||||
|
@ -54,13 +57,8 @@ class BrowserPool:
|
||||||
**kwargs: arguments for Browser(...)
|
**kwargs: arguments for Browser(...)
|
||||||
'''
|
'''
|
||||||
self.size = size
|
self.size = size
|
||||||
self._available = set()
|
self.kwargs = kwargs
|
||||||
self._in_use = set()
|
self._in_use = set()
|
||||||
|
|
||||||
for i in range(0, size):
|
|
||||||
browser = Browser(port=BrowserPool.BASE_PORT + i, **kwargs)
|
|
||||||
self._available.add(browser)
|
|
||||||
|
|
||||||
self._lock = threading.Lock()
|
self._lock = threading.Lock()
|
||||||
|
|
||||||
def acquire(self):
|
def acquire(self):
|
||||||
|
@ -74,16 +72,22 @@ class BrowserPool:
|
||||||
NoBrowsersAvailable if none available
|
NoBrowsersAvailable if none available
|
||||||
'''
|
'''
|
||||||
with self._lock:
|
with self._lock:
|
||||||
try:
|
if len(self._in_use) >= self.size:
|
||||||
browser = self._available.pop()
|
|
||||||
except KeyError:
|
|
||||||
raise NoBrowsersAvailable
|
raise NoBrowsersAvailable
|
||||||
|
|
||||||
|
# choose available port
|
||||||
|
sock = socket.socket()
|
||||||
|
sock.bind(('0.0.0.0', 0))
|
||||||
|
port = sock.getsockname()[1]
|
||||||
|
sock.close()
|
||||||
|
|
||||||
|
browser = Browser(port=port, **self.kwargs)
|
||||||
self._in_use.add(browser)
|
self._in_use.add(browser)
|
||||||
return browser
|
return browser
|
||||||
|
|
||||||
def release(self, browser):
|
def release(self, browser):
|
||||||
|
browser.stop() # make sure
|
||||||
with self._lock:
|
with self._lock:
|
||||||
self._available.add(browser)
|
|
||||||
self._in_use.remove(browser)
|
self._in_use.remove(browser)
|
||||||
|
|
||||||
def shutdown_now(self):
|
def shutdown_now(self):
|
||||||
|
@ -91,13 +95,12 @@ class BrowserPool:
|
||||||
'shutting down browser pool (%s browsers in use)',
|
'shutting down browser pool (%s browsers in use)',
|
||||||
len(self._in_use))
|
len(self._in_use))
|
||||||
with self._lock:
|
with self._lock:
|
||||||
for browser in self._available:
|
|
||||||
browser.stop()
|
|
||||||
for browser in self._in_use:
|
for browser in self._in_use:
|
||||||
browser.stop()
|
browser.stop()
|
||||||
|
self._in_use.clear()
|
||||||
|
|
||||||
def num_available(self):
|
def num_available(self):
|
||||||
return len(self._available)
|
return self.size - len(self._in_use)
|
||||||
|
|
||||||
def num_in_use(self):
|
def num_in_use(self):
|
||||||
return len(self._in_use)
|
return len(self._in_use)
|
||||||
|
|
|
@ -28,7 +28,6 @@ import re
|
||||||
import signal
|
import signal
|
||||||
import sqlite3
|
import sqlite3
|
||||||
import json
|
import json
|
||||||
import psutil
|
|
||||||
import tempfile
|
import tempfile
|
||||||
|
|
||||||
class Chrome:
|
class Chrome:
|
||||||
|
@ -61,24 +60,6 @@ class Chrome:
|
||||||
def __exit__(self, *args):
|
def __exit__(self, *args):
|
||||||
self.stop()
|
self.stop()
|
||||||
|
|
||||||
def _find_available_port(self, default_port=9200):
|
|
||||||
try:
|
|
||||||
conns = psutil.net_connections(kind='tcp')
|
|
||||||
except psutil.AccessDenied:
|
|
||||||
return default_port
|
|
||||||
|
|
||||||
if not any(conn.laddr[1] == default_port for conn in conns):
|
|
||||||
return default_port
|
|
||||||
|
|
||||||
for p in range(9999,8999,-1):
|
|
||||||
if not any(conn.laddr[1] == p for conn in conns):
|
|
||||||
self.logger.warn(
|
|
||||||
'port %s already in use, using %s instead',
|
|
||||||
default_port, p)
|
|
||||||
return p
|
|
||||||
|
|
||||||
return default_port
|
|
||||||
|
|
||||||
def _init_cookie_db(self, cookie_db):
|
def _init_cookie_db(self, cookie_db):
|
||||||
cookie_dir = os.path.join(self._chrome_user_data_dir, 'Default')
|
cookie_dir = os.path.join(self._chrome_user_data_dir, 'Default')
|
||||||
cookie_location = os.path.join(cookie_dir, 'Cookies')
|
cookie_location = os.path.join(cookie_dir, 'Cookies')
|
||||||
|
@ -140,7 +121,6 @@ class Chrome:
|
||||||
|
|
||||||
new_env = os.environ.copy()
|
new_env = os.environ.copy()
|
||||||
new_env['HOME'] = self._home_tmpdir.name
|
new_env['HOME'] = self._home_tmpdir.name
|
||||||
self.port = self._find_available_port(self.port)
|
|
||||||
chrome_args = [
|
chrome_args = [
|
||||||
self.chrome_exe,
|
self.chrome_exe,
|
||||||
'--remote-debugging-port=%s' % self.port,
|
'--remote-debugging-port=%s' % self.port,
|
||||||
|
|
3
setup.py
3
setup.py
|
@ -32,7 +32,7 @@ def find_package_data(package):
|
||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='brozzler',
|
name='brozzler',
|
||||||
version='1.1b9.dev192',
|
version='1.1b9.dev193',
|
||||||
description='Distributed web crawling with browsers',
|
description='Distributed web crawling with browsers',
|
||||||
url='https://github.com/internetarchive/brozzler',
|
url='https://github.com/internetarchive/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
@ -71,7 +71,6 @@ setuptools.setup(
|
||||||
'surt>=0.3.0',
|
'surt>=0.3.0',
|
||||||
'rethinkstuff>=0.1.5',
|
'rethinkstuff>=0.1.5',
|
||||||
'rethinkdb>=2.3,<2.4',
|
'rethinkdb>=2.3,<2.4',
|
||||||
'psutil==4.3.0',
|
|
||||||
'cerberus==1.0.1',
|
'cerberus==1.0.1',
|
||||||
'jinja2',
|
'jinja2',
|
||||||
],
|
],
|
||||||
|
|
|
@ -25,7 +25,6 @@ import brozzler
|
||||||
import brozzler.chrome
|
import brozzler.chrome
|
||||||
import socket
|
import socket
|
||||||
import logging
|
import logging
|
||||||
import psutil
|
|
||||||
import yaml
|
import yaml
|
||||||
|
|
||||||
@pytest.fixture(scope='module')
|
@pytest.fixture(scope='module')
|
||||||
|
@ -57,23 +56,6 @@ def test_robots(httpd):
|
||||||
site = brozzler.Site(seed=url, user_agent='im/a bAdBOt/uh huh')
|
site = brozzler.Site(seed=url, user_agent='im/a bAdBOt/uh huh')
|
||||||
assert not brozzler.is_permitted_by_robots(site, url)
|
assert not brozzler.is_permitted_by_robots(site, url)
|
||||||
|
|
||||||
def test_find_available_port():
|
|
||||||
x = brozzler.chrome.Chrome(None, None)
|
|
||||||
try:
|
|
||||||
psutil.net_connections(kind='tcp')
|
|
||||||
except psutil.AccessDenied:
|
|
||||||
logging.warn(
|
|
||||||
'skipping _find_available_port() test because '
|
|
||||||
'psutil.net_connections(kind="tcp") raised AccessDenied')
|
|
||||||
return
|
|
||||||
assert x._find_available_port(9800) == 9800
|
|
||||||
sock = socket.socket()
|
|
||||||
sock.bind(('localhost', 9800))
|
|
||||||
sock.listen(0)
|
|
||||||
assert x._find_available_port(9800) >= 9990
|
|
||||||
sock.close()
|
|
||||||
assert x._find_available_port(9800) == 9800
|
|
||||||
|
|
||||||
def test_scoping():
|
def test_scoping():
|
||||||
test_scope = yaml.load('''
|
test_scope = yaml.load('''
|
||||||
max_hops: 100
|
max_hops: 100
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue