Mirror of https://github.com/internetarchive/brozzler.git (synced 2025-04-21 08:06:27 -04:00)
Merge branch 'master' into qa
* master: let the OS pick an available port, to avoid what appear to be timing issues causing multiple browsers to choose the same port
This commit is contained in:
commit
cb75bb6e04
@@ -30,6 +30,7 @@ import datetime
|
||||
import base64
|
||||
from brozzler.chrome import Chrome
|
||||
import surt
|
||||
import socket
|
||||
|
||||
class BrowsingException(Exception):
|
||||
pass
|
||||
@@ -41,10 +42,12 @@ class BrowsingTimeout(BrowsingException):
|
||||
pass
|
||||
|
||||
class BrowserPool:
|
||||
'''
|
||||
Manages pool of browsers. Automatically chooses available port for the
|
||||
debugging protocol.
|
||||
'''
|
||||
logger = logging.getLogger(__module__ + '.' + __qualname__)
|
||||
|
||||
BASE_PORT = 9200
|
||||
|
||||
def __init__(self, size=3, **kwargs):
|
||||
'''
|
||||
Initializes the pool.
|
||||
@@ -54,13 +57,8 @@ class BrowserPool:
|
||||
**kwargs: arguments for Browser(...)
|
||||
'''
|
||||
self.size = size
|
||||
self._available = set()
|
||||
self.kwargs = kwargs
|
||||
self._in_use = set()
|
||||
|
||||
for i in range(0, size):
|
||||
browser = Browser(port=BrowserPool.BASE_PORT + i, **kwargs)
|
||||
self._available.add(browser)
|
||||
|
||||
self._lock = threading.Lock()
|
||||
|
||||
def acquire(self):
|
||||
@@ -74,16 +72,22 @@ class BrowserPool:
|
||||
NoBrowsersAvailable if none available
|
||||
'''
|
||||
with self._lock:
|
||||
try:
|
||||
browser = self._available.pop()
|
||||
except KeyError:
|
||||
if len(self._in_use) >= self.size:
|
||||
raise NoBrowsersAvailable
|
||||
|
||||
# choose available port
|
||||
sock = socket.socket()
|
||||
sock.bind(('0.0.0.0', 0))
|
||||
port = sock.getsockname()[1]
|
||||
sock.close()
|
||||
|
||||
browser = Browser(port=port, **self.kwargs)
|
||||
self._in_use.add(browser)
|
||||
return browser
|
||||
|
||||
def release(self, browser):
|
||||
browser.stop() # make sure
|
||||
with self._lock:
|
||||
self._available.add(browser)
|
||||
self._in_use.remove(browser)
|
||||
|
||||
def shutdown_now(self):
|
||||
@@ -91,13 +95,12 @@ class BrowserPool:
|
||||
'shutting down browser pool (%s browsers in use)',
|
||||
len(self._in_use))
|
||||
with self._lock:
|
||||
for browser in self._available:
|
||||
browser.stop()
|
||||
for browser in self._in_use:
|
||||
browser.stop()
|
||||
self._in_use.clear()
|
||||
|
||||
def num_available(self):
|
||||
return len(self._available)
|
||||
return self.size - len(self._in_use)
|
||||
|
||||
def num_in_use(self):
|
||||
return len(self._in_use)
|
||||
|
@@ -28,7 +28,6 @@ import re
|
||||
import signal
|
||||
import sqlite3
|
||||
import json
|
||||
import psutil
|
||||
import tempfile
|
||||
|
||||
class Chrome:
|
||||
@@ -61,24 +60,6 @@ class Chrome:
|
||||
def __exit__(self, *args):
|
||||
self.stop()
|
||||
|
||||
def _find_available_port(self, default_port=9200):
|
||||
try:
|
||||
conns = psutil.net_connections(kind='tcp')
|
||||
except psutil.AccessDenied:
|
||||
return default_port
|
||||
|
||||
if not any(conn.laddr[1] == default_port for conn in conns):
|
||||
return default_port
|
||||
|
||||
for p in range(9999,8999,-1):
|
||||
if not any(conn.laddr[1] == p for conn in conns):
|
||||
self.logger.warn(
|
||||
'port %s already in use, using %s instead',
|
||||
default_port, p)
|
||||
return p
|
||||
|
||||
return default_port
|
||||
|
||||
def _init_cookie_db(self, cookie_db):
|
||||
cookie_dir = os.path.join(self._chrome_user_data_dir, 'Default')
|
||||
cookie_location = os.path.join(cookie_dir, 'Cookies')
|
||||
@@ -140,7 +121,6 @@ class Chrome:
|
||||
|
||||
new_env = os.environ.copy()
|
||||
new_env['HOME'] = self._home_tmpdir.name
|
||||
self.port = self._find_available_port(self.port)
|
||||
chrome_args = [
|
||||
self.chrome_exe,
|
||||
'--remote-debugging-port=%s' % self.port,
|
||||
|
3
setup.py
3
setup.py
@@ -32,7 +32,7 @@ def find_package_data(package):
|
||||
|
||||
setuptools.setup(
|
||||
name='brozzler',
|
||||
version='1.1b9.dev192',
|
||||
version='1.1b9.dev193',
|
||||
description='Distributed web crawling with browsers',
|
||||
url='https://github.com/internetarchive/brozzler',
|
||||
author='Noah Levitt',
|
||||
@@ -71,7 +71,6 @@ setuptools.setup(
|
||||
'surt>=0.3.0',
|
||||
'rethinkstuff>=0.1.5',
|
||||
'rethinkdb>=2.3,<2.4',
|
||||
'psutil==4.3.0',
|
||||
'cerberus==1.0.1',
|
||||
'jinja2',
|
||||
],
|
||||
|
@@ -25,7 +25,6 @@ import brozzler
|
||||
import brozzler.chrome
|
||||
import socket
|
||||
import logging
|
||||
import psutil
|
||||
import yaml
|
||||
|
||||
@pytest.fixture(scope='module')
|
||||
@@ -57,23 +56,6 @@ def test_robots(httpd):
|
||||
site = brozzler.Site(seed=url, user_agent='im/a bAdBOt/uh huh')
|
||||
assert not brozzler.is_permitted_by_robots(site, url)
|
||||
|
||||
def test_find_available_port():
|
||||
x = brozzler.chrome.Chrome(None, None)
|
||||
try:
|
||||
psutil.net_connections(kind='tcp')
|
||||
except psutil.AccessDenied:
|
||||
logging.warn(
|
||||
'skipping _find_available_port() test because '
|
||||
'psutil.net_connections(kind="tcp") raised AccessDenied')
|
||||
return
|
||||
assert x._find_available_port(9800) == 9800
|
||||
sock = socket.socket()
|
||||
sock.bind(('localhost', 9800))
|
||||
sock.listen(0)
|
||||
assert x._find_available_port(9800) >= 9990
|
||||
sock.close()
|
||||
assert x._find_available_port(9800) == 9800
|
||||
|
||||
def test_scoping():
|
||||
test_scope = yaml.load('''
|
||||
max_hops: 100
|
||||
|
Loading…
x
Reference in New Issue
Block a user