mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 08:39:59 -05:00
move _find_available_port to chrome.py, changing the way it works so that browser:9200 doesn't get stuck at 9201 forever (which pushes 9201 to 9202, etc.), and add a unit test
This commit is contained in:
parent
74009852d6
commit
ce03381b92
@ -30,7 +30,6 @@ from brozzler.chrome import Chrome
|
|||||||
from brozzler.behaviors import Behavior
|
from brozzler.behaviors import Behavior
|
||||||
from requests.structures import CaseInsensitiveDict
|
from requests.structures import CaseInsensitiveDict
|
||||||
import base64
|
import base64
|
||||||
import psutil
|
|
||||||
import sqlite3
|
import sqlite3
|
||||||
import datetime
|
import datetime
|
||||||
|
|
||||||
@ -103,7 +102,9 @@ class Browser:
|
|||||||
|
|
||||||
HARD_TIMEOUT_SECONDS = 20 * 60
|
HARD_TIMEOUT_SECONDS = 20 * 60
|
||||||
|
|
||||||
def __init__(self, chrome_port=9222, chrome_exe='chromium-browser', proxy=None, ignore_cert_errors=False):
|
def __init__(
|
||||||
|
self, chrome_port=9222, chrome_exe='chromium-browser', proxy=None,
|
||||||
|
ignore_cert_errors=False):
|
||||||
self.command_id = itertools.count(1)
|
self.command_id = itertools.count(1)
|
||||||
self.chrome_port = chrome_port
|
self.chrome_port = chrome_port
|
||||||
self.chrome_exe = chrome_exe
|
self.chrome_exe = chrome_exe
|
||||||
@ -130,7 +131,6 @@ class Browser:
|
|||||||
def start(self, proxy=None, cookie_db=None):
|
def start(self, proxy=None, cookie_db=None):
|
||||||
if not self._chrome_instance:
|
if not self._chrome_instance:
|
||||||
# these can raise exceptions
|
# these can raise exceptions
|
||||||
self.chrome_port = self._find_available_port()
|
|
||||||
self._work_dir = tempfile.TemporaryDirectory()
|
self._work_dir = tempfile.TemporaryDirectory()
|
||||||
if cookie_db is not None:
|
if cookie_db is not None:
|
||||||
cookie_dir = os.path.join(
|
cookie_dir = os.path.join(
|
||||||
@ -199,23 +199,6 @@ class Browser:
|
|||||||
cookie_location, exc_info=True)
|
cookie_location, exc_info=True)
|
||||||
return cookie_db
|
return cookie_db
|
||||||
|
|
||||||
def _find_available_port(self):
|
|
||||||
port_available = False
|
|
||||||
port = self.chrome_port
|
|
||||||
|
|
||||||
try:
|
|
||||||
conns = psutil.net_connections(kind="tcp")
|
|
||||||
except psutil.AccessDenied:
|
|
||||||
return port
|
|
||||||
|
|
||||||
for p in range(port, 65535):
|
|
||||||
if any(connection.laddr[1] == p for connection in conns):
|
|
||||||
self.logger.warn("port %s already open, will try %s", p, p+1)
|
|
||||||
else:
|
|
||||||
port = p
|
|
||||||
break
|
|
||||||
return port
|
|
||||||
|
|
||||||
def is_running(self):
|
def is_running(self):
|
||||||
return bool(self._websocket_url)
|
return bool(self._websocket_url)
|
||||||
|
|
||||||
|
@ -29,6 +29,7 @@ import signal
|
|||||||
import sqlite3
|
import sqlite3
|
||||||
import datetime
|
import datetime
|
||||||
import json
|
import json
|
||||||
|
import psutil
|
||||||
|
|
||||||
class Chrome:
|
class Chrome:
|
||||||
logger = logging.getLogger(__module__ + '.' + __qualname__)
|
logger = logging.getLogger(__module__ + '.' + __qualname__)
|
||||||
@ -53,6 +54,24 @@ class Chrome:
|
|||||||
def __exit__(self, *args):
|
def __exit__(self, *args):
|
||||||
self.stop()
|
self.stop()
|
||||||
|
|
||||||
|
def _find_available_port(self, default_port=9200):
    '''
    Picks a tcp port that no local socket is currently using.

    Returns `default_port` if nothing is using it. Otherwise scans from
    9999 down to 9000, logs a warning, and returns the first port found
    that is not in use.

    Falls back to `default_port` without checking anything if psutil is
    denied access to the connection table, and also if every port in the
    9000-9999 range is taken.
    '''
    try:
        conns = psutil.net_connections(kind='tcp')
    except psutil.AccessDenied:
        # can't inspect the connection table, so optimistically hand back
        # the requested port
        return default_port

    # collect local ports once so each candidate check is a set lookup
    # instead of another pass over every connection
    ports_in_use = {conn.laddr[1] for conn in conns}

    # the preferred port wins whenever it is free (the check was previously
    # inverted, returning default_port only when it was already taken)
    if default_port not in ports_in_use:
        return default_port

    # scan downward from 9999 so the fallback does not land on the
    # neighbouring port that another browser probably wants as its default
    for p in range(9999, 8999, -1):
        if p not in ports_in_use:
            self.logger.warning(
                    'port %s already in use, using %s instead',
                    default_port, p)
            return p

    # nothing free in the scan range; let the caller try the default anyway
    return default_port
|
||||||
|
|
||||||
def start(self):
|
def start(self):
|
||||||
'''
|
'''
|
||||||
Returns websocket url to chrome window with about:blank loaded.
|
Returns websocket url to chrome window with about:blank loaded.
|
||||||
@ -60,6 +79,7 @@ class Chrome:
|
|||||||
timeout_sec = 600
|
timeout_sec = 600
|
||||||
new_env = os.environ.copy()
|
new_env = os.environ.copy()
|
||||||
new_env['HOME'] = self.user_home_dir
|
new_env['HOME'] = self.user_home_dir
|
||||||
|
self.port = self._find_available_port(self.port)
|
||||||
chrome_args = [
|
chrome_args = [
|
||||||
self.executable, '--use-mock-keychain', # mac thing
|
self.executable, '--use-mock-keychain', # mac thing
|
||||||
'--user-data-dir=%s' % self.user_data_dir,
|
'--user-data-dir=%s' % self.user_data_dir,
|
||||||
|
2
setup.py
2
setup.py
@ -32,7 +32,7 @@ def find_package_data(package):
|
|||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='brozzler',
|
name='brozzler',
|
||||||
version='1.1b8.dev131',
|
version='1.1b8.dev132',
|
||||||
description='Distributed web crawling with browsers',
|
description='Distributed web crawling with browsers',
|
||||||
url='https://github.com/internetarchive/brozzler',
|
url='https://github.com/internetarchive/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
@ -22,6 +22,9 @@ import http.server
|
|||||||
import threading
|
import threading
|
||||||
import os
|
import os
|
||||||
import brozzler
|
import brozzler
|
||||||
|
import brozzler.chrome
|
||||||
|
import socket
|
||||||
|
import logging
|
||||||
|
|
||||||
@pytest.fixture(scope='module')
|
@pytest.fixture(scope='module')
|
||||||
def httpd(request):
|
def httpd(request):
|
||||||
@ -52,3 +55,18 @@ def test_robots(httpd):
|
|||||||
site = brozzler.Site(seed=url, user_agent='im/a bAdBOt/uh huh')
|
site = brozzler.Site(seed=url, user_agent='im/a bAdBOt/uh huh')
|
||||||
assert not brozzler.is_permitted_by_robots(site, url)
|
assert not brozzler.is_permitted_by_robots(site, url)
|
||||||
|
|
||||||
|
def test_find_available_port():
    '''
    Checks that Chrome._find_available_port() returns the requested port
    when it is free, and 9999 while the requested port is occupied.
    '''
    # imported locally because the module-level imports visible in this
    # file do not include psutil
    import psutil

    try:
        psutil.net_connections(kind='tcp')
    except psutil.AccessDenied:
        logging.warning(
                'skipping _find_available_port() test because '
                'psutil.net_connections(kind="tcp") raised AccessDenied')
        return

    # _find_available_port() reads self.logger when it has to pick a
    # different port, so it needs a real instance rather than None;
    # __new__ skips __init__, and logger is a class attribute of Chrome
    chrome = brozzler.chrome.Chrome.__new__(brozzler.chrome.Chrome)

    assert chrome._find_available_port(9800) == 9800

    sock = socket.socket()
    try:
        sock.bind(('localhost', 9800))
        # NOTE(review): a socket that is bound but not listening may be
        # absent from the connection table on some platforms, so listen
        # explicitly -- confirm against psutil's per-platform behaviour
        sock.listen(0)
        assert chrome._find_available_port(9800) == 9999
    finally:
        # release the port even when the assertion above fails
        sock.close()

    assert chrome._find_available_port(9800) == 9800
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user