mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-06-20 04:44:12 -04:00
split Chrome class into its own module
This commit is contained in:
parent
3c43fdaced
commit
74009852d6
3 changed files with 219 additions and 179 deletions
|
@ -1,6 +1,5 @@
|
||||||
'''
|
'''
|
||||||
brozzler/browser.py - classes responsible for running web browsers
|
brozzler/browser.py - manages the browsers for brozzler
|
||||||
(chromium/chromium) and browsing web pages in them
|
|
||||||
|
|
||||||
Copyright (C) 2014-2016 Internet Archive
|
Copyright (C) 2014-2016 Internet Archive
|
||||||
|
|
||||||
|
@ -19,23 +18,19 @@ limitations under the License.
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
import json
|
import json
|
||||||
import urllib.request
|
|
||||||
import itertools
|
import itertools
|
||||||
import websocket
|
import websocket
|
||||||
import time
|
import time
|
||||||
import threading
|
import threading
|
||||||
import subprocess
|
|
||||||
import tempfile
|
import tempfile
|
||||||
import os
|
import os
|
||||||
import random
|
import random
|
||||||
import brozzler
|
import brozzler
|
||||||
|
from brozzler.chrome import Chrome
|
||||||
from brozzler.behaviors import Behavior
|
from brozzler.behaviors import Behavior
|
||||||
from requests.structures import CaseInsensitiveDict
|
from requests.structures import CaseInsensitiveDict
|
||||||
import select
|
|
||||||
import re
|
|
||||||
import base64
|
import base64
|
||||||
import psutil
|
import psutil
|
||||||
import signal
|
|
||||||
import sqlite3
|
import sqlite3
|
||||||
import datetime
|
import datetime
|
||||||
|
|
||||||
|
@ -233,7 +228,7 @@ class Browser:
|
||||||
on_request=None, on_response=None, on_screenshot=None,
|
on_request=None, on_response=None, on_screenshot=None,
|
||||||
on_url_change=None):
|
on_url_change=None):
|
||||||
"""
|
"""
|
||||||
Synchronously loads a page, takes a screenshot, and runs behaviors.
|
Synchronously loads a page, runs behaviors, and takes a screenshot.
|
||||||
|
|
||||||
Raises BrowsingException if browsing the page fails in a non-critical
|
Raises BrowsingException if browsing the page fails in a non-critical
|
||||||
way.
|
way.
|
||||||
|
@ -263,10 +258,10 @@ class Browser:
|
||||||
self._websocket_url, on_open=self._visit_page,
|
self._websocket_url, on_open=self._visit_page,
|
||||||
on_message=self._wrap_handle_message)
|
on_message=self._wrap_handle_message)
|
||||||
|
|
||||||
threadName = "WebsockThread:{}-{:%Y%m%d%H%M%S}".format(
|
thread_name = "WebsockThread:{}-{:%Y%m%d%H%M%S}".format(
|
||||||
self.chrome_port, datetime.datetime.utcnow())
|
self.chrome_port, datetime.datetime.utcnow())
|
||||||
websock_thread = threading.Thread(
|
websock_thread = threading.Thread(
|
||||||
target=self._websock.run_forever, name=threadName,
|
target=self._websock.run_forever, name=thread_name,
|
||||||
kwargs={'ping_timeout':0.5})
|
kwargs={'ping_timeout':0.5})
|
||||||
websock_thread.start()
|
websock_thread.start()
|
||||||
self._start = time.time()
|
self._start = time.time()
|
||||||
|
@ -570,171 +565,3 @@ __brzl_compileOutlinks(window).join('\n');
|
||||||
# else:
|
# else:
|
||||||
# self.logger.debug("%s", json_message)
|
# self.logger.debug("%s", json_message)
|
||||||
|
|
||||||
class Chrome:
|
|
||||||
logger = logging.getLogger(__module__ + "." + __qualname__)
|
|
||||||
|
|
||||||
def __init__(self, port, executable, user_home_dir, user_data_dir, proxy=None, ignore_cert_errors=False):
|
|
||||||
self.port = port
|
|
||||||
self.executable = executable
|
|
||||||
self.user_home_dir = user_home_dir
|
|
||||||
self.user_data_dir = user_data_dir
|
|
||||||
self.proxy = proxy
|
|
||||||
self.ignore_cert_errors = ignore_cert_errors
|
|
||||||
self._shutdown = threading.Event()
|
|
||||||
|
|
||||||
def __enter__(self):
|
|
||||||
'''
|
|
||||||
Returns websocket url to chrome window with about:blank loaded.
|
|
||||||
'''
|
|
||||||
return self.start()
|
|
||||||
|
|
||||||
def __exit__(self, *args):
|
|
||||||
self.stop()
|
|
||||||
|
|
||||||
def start(self):
|
|
||||||
'''
|
|
||||||
Returns websocket url to chrome window with about:blank loaded.
|
|
||||||
'''
|
|
||||||
timeout_sec = 600
|
|
||||||
new_env = os.environ.copy()
|
|
||||||
new_env["HOME"] = self.user_home_dir
|
|
||||||
chrome_args = [
|
|
||||||
self.executable, "--use-mock-keychain", # mac thing
|
|
||||||
"--user-data-dir={}".format(self.user_data_dir),
|
|
||||||
"--remote-debugging-port={}".format(self.port),
|
|
||||||
"--disable-web-sockets", "--disable-cache",
|
|
||||||
"--window-size=1100,900", "--no-default-browser-check",
|
|
||||||
"--disable-first-run-ui", "--no-first-run",
|
|
||||||
"--homepage=about:blank", "--disable-direct-npapi-requests",
|
|
||||||
"--disable-web-security", "--disable-notifications",
|
|
||||||
"--disable-extensions",
|
|
||||||
"--disable-save-password-bubble"]
|
|
||||||
if self.ignore_cert_errors:
|
|
||||||
chrome_args.append("--ignore-certificate-errors")
|
|
||||||
if self.proxy:
|
|
||||||
chrome_args.append("--proxy-server={}".format(self.proxy))
|
|
||||||
chrome_args.append("about:blank")
|
|
||||||
self.logger.info("running: {}".format(" ".join(chrome_args)))
|
|
||||||
# start_new_session - new process group so we can kill the whole group
|
|
||||||
self.chrome_process = subprocess.Popen(chrome_args, env=new_env,
|
|
||||||
stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=0,
|
|
||||||
start_new_session=True)
|
|
||||||
self._out_reader_thread = threading.Thread(target=self._read_stderr_stdout,
|
|
||||||
name="ChromeOutReaderThread(pid={})".format(self.chrome_process.pid))
|
|
||||||
self._out_reader_thread.start()
|
|
||||||
self.logger.info("chrome running, pid {}".format(self.chrome_process.pid))
|
|
||||||
self._start = time.time() # member variable just so that kill -QUIT reports it
|
|
||||||
|
|
||||||
json_url = "http://localhost:%s/json" % self.port
|
|
||||||
|
|
||||||
while True:
|
|
||||||
try:
|
|
||||||
raw_json = urllib.request.urlopen(json_url, timeout=30).read()
|
|
||||||
all_debug_info = json.loads(raw_json.decode('utf-8'))
|
|
||||||
debug_info = [x for x in all_debug_info if x['url'] == 'about:blank']
|
|
||||||
|
|
||||||
if debug_info and 'webSocketDebuggerUrl' in debug_info[0]:
|
|
||||||
self.logger.debug("{} returned {}".format(json_url, raw_json))
|
|
||||||
url = debug_info[0]['webSocketDebuggerUrl']
|
|
||||||
self.logger.info('got chrome window websocket debug url {} from {}'.format(url, json_url))
|
|
||||||
return url
|
|
||||||
except BaseException as e:
|
|
||||||
if int(time.time() - self._start) % 10 == 5:
|
|
||||||
self.logger.warn("problem with %s (will keep trying until timeout of %d seconds): %s", json_url, timeout_sec, e)
|
|
||||||
pass
|
|
||||||
finally:
|
|
||||||
if time.time() - self._start > timeout_sec:
|
|
||||||
self.logger.error("killing chrome, failed to retrieve %s after %s seconds", json_url, time.time() - self._start)
|
|
||||||
self.stop()
|
|
||||||
raise Exception("killed chrome, failed to retrieve {} after {} seconds".format(json_url, time.time() - self._start))
|
|
||||||
else:
|
|
||||||
time.sleep(0.5)
|
|
||||||
|
|
||||||
def _read_stderr_stdout(self):
|
|
||||||
# XXX select doesn't work on windows
|
|
||||||
def readline_nonblock(f):
|
|
||||||
buf = b""
|
|
||||||
while not self._shutdown.is_set() and (
|
|
||||||
len(buf) == 0 or buf[-1] != 0xa) and select.select(
|
|
||||||
[f],[],[],0.5)[0]:
|
|
||||||
buf += f.read(1)
|
|
||||||
return buf
|
|
||||||
|
|
||||||
try:
|
|
||||||
while not self._shutdown.is_set():
|
|
||||||
buf = readline_nonblock(self.chrome_process.stdout)
|
|
||||||
if buf:
|
|
||||||
if re.search(
|
|
||||||
b"Xlib: extension|"
|
|
||||||
b"CERT_PKIXVerifyCert for [^ ]* failed|"
|
|
||||||
b"^ALSA lib|ERROR:gl_surface_glx.cc|"
|
|
||||||
b"ERROR:gpu_child_thread.cc", buf):
|
|
||||||
logging.log(
|
|
||||||
brozzler.TRACE, "chrome pid %s STDOUT %s",
|
|
||||||
self.chrome_process.pid, buf)
|
|
||||||
else:
|
|
||||||
logging.debug(
|
|
||||||
"chrome pid %s STDOUT %s",
|
|
||||||
self.chrome_process.pid, buf)
|
|
||||||
|
|
||||||
buf = readline_nonblock(self.chrome_process.stderr)
|
|
||||||
if buf:
|
|
||||||
if re.search(
|
|
||||||
b"Xlib: extension|"
|
|
||||||
b"CERT_PKIXVerifyCert for [^ ]* failed|"
|
|
||||||
b"^ALSA lib|ERROR:gl_surface_glx.cc|"
|
|
||||||
b"ERROR:gpu_child_thread.cc", buf):
|
|
||||||
logging.log(
|
|
||||||
brozzler.TRACE, "chrome pid %s STDOUT %s",
|
|
||||||
self.chrome_process.pid, buf)
|
|
||||||
else:
|
|
||||||
logging.debug(
|
|
||||||
"chrome pid %s STDERR %s",
|
|
||||||
self.chrome_process.pid, buf)
|
|
||||||
except:
|
|
||||||
logging.error("unexpected exception", exc_info=True)
|
|
||||||
|
|
||||||
def stop(self):
|
|
||||||
if not self.chrome_process or self._shutdown.is_set():
|
|
||||||
return
|
|
||||||
|
|
||||||
timeout_sec = 300
|
|
||||||
self._shutdown.set()
|
|
||||||
self.logger.info("terminating chrome pgid %s" % self.chrome_process.pid)
|
|
||||||
|
|
||||||
os.killpg(self.chrome_process.pid, signal.SIGTERM)
|
|
||||||
first_sigterm = time.time()
|
|
||||||
|
|
||||||
try:
|
|
||||||
while time.time() - first_sigterm < timeout_sec:
|
|
||||||
time.sleep(0.5)
|
|
||||||
|
|
||||||
status = self.chrome_process.poll()
|
|
||||||
if status is not None:
|
|
||||||
if status == 0:
|
|
||||||
self.logger.info(
|
|
||||||
"chrome pid %s exited normally",
|
|
||||||
self.chrome_process.pid)
|
|
||||||
else:
|
|
||||||
self.logger.warn(
|
|
||||||
"chrome pid %s exited with nonzero status %s",
|
|
||||||
self.chrome_process.pid, status)
|
|
||||||
|
|
||||||
# XXX I would like to forcefully kill the process group
|
|
||||||
# here to guarantee no orphaned chromium subprocesses hang
|
|
||||||
# around, but there's a chance I suppose that some other
|
|
||||||
# process could have started with the same pgid
|
|
||||||
return
|
|
||||||
|
|
||||||
self.logger.warn(
|
|
||||||
"chrome pid %s still alive %.1f seconds after sending "
|
|
||||||
"SIGTERM, sending SIGKILL", self.chrome_process.pid,
|
|
||||||
time.time() - first_sigterm)
|
|
||||||
os.killpg(self.chrome_process.pid, signal.SIGKILL)
|
|
||||||
status = self.chrome_process.wait()
|
|
||||||
self.logger.warn(
|
|
||||||
"chrome pid %s reaped (status=%s) after killing with "
|
|
||||||
"SIGKILL", self.chrome_process.pid, status)
|
|
||||||
finally:
|
|
||||||
self._out_reader_thread.join()
|
|
||||||
self.chrome_process = None
|
|
||||||
|
|
213
brozzler/chrome.py
Normal file
213
brozzler/chrome.py
Normal file
|
@ -0,0 +1,213 @@
|
||||||
|
'''
|
||||||
|
brozzler/chrome.py - manages the chrome/chromium browser for brozzler
|
||||||
|
|
||||||
|
Copyright (C) 2014-2016 Internet Archive
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
'''
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import urllib.request
|
||||||
|
import time
|
||||||
|
import threading
|
||||||
|
import subprocess
|
||||||
|
import os
|
||||||
|
import brozzler
|
||||||
|
import select
|
||||||
|
import re
|
||||||
|
import signal
|
||||||
|
import sqlite3
|
||||||
|
import datetime
|
||||||
|
import json
|
||||||
|
|
||||||
|
class Chrome:
    '''
    Runs a chrome/chromium subprocess with remote debugging enabled.

    Usable as a context manager: entering starts chrome and yields the
    websocket debugger url of its about:blank window, exiting stops chrome.
    '''
    logger = logging.getLogger(__module__ + '.' + __qualname__)

    # chrome output matching this pattern is logged at brozzler.TRACE level
    # instead of DEBUG; compiled once rather than on every line read
    _QUIET_OUTPUT_RE = re.compile(
            b'Xlib: extension|'
            b'CERT_PKIXVerifyCert for [^ ]* failed|'
            b'^ALSA lib|ERROR:gl_surface_glx.cc|'
            b'ERROR:gpu_child_thread.cc')

    def __init__(
            self, port, executable, user_home_dir, user_data_dir, proxy=None,
            ignore_cert_errors=False):
        '''
        Stores the launch settings; chrome is not started until start().

        Args:
            port: value for chrome's --remote-debugging-port
            executable: chrome/chromium executable to run
            user_home_dir: value of HOME in the chrome process environment
            user_data_dir: value for chrome's --user-data-dir
            proxy: optional value for chrome's --proxy-server
            ignore_cert_errors: if true, pass --ignore-certificate-errors
        '''
        self.port = port
        self.executable = executable
        self.user_home_dir = user_home_dir
        self.user_data_dir = user_data_dir
        self.proxy = proxy
        self.ignore_cert_errors = ignore_cert_errors
        self._shutdown = threading.Event()
        # initialized here so that stop() before start() is a harmless no-op
        # instead of an AttributeError
        self.chrome_process = None
        self._out_reader_thread = None

    def __enter__(self):
        '''
        Returns websocket url to chrome window with about:blank loaded.
        '''
        return self.start()

    def __exit__(self, *args):
        self.stop()

    def start(self):
        '''
        Launches chrome and polls its debugging endpoint until it is ready.

        Returns websocket url to chrome window with about:blank loaded.

        Raises:
            Exception: if the url could not be retrieved within 600 seconds;
                chrome is stopped before raising.
        '''
        timeout_sec = 600
        new_env = os.environ.copy()
        new_env['HOME'] = self.user_home_dir
        chrome_args = [
                self.executable, '--use-mock-keychain', # mac thing
                '--user-data-dir=%s' % self.user_data_dir,
                '--remote-debugging-port=%s' % self.port,
                '--disable-web-sockets', '--disable-cache',
                '--window-size=1100,900', '--no-default-browser-check',
                '--disable-first-run-ui', '--no-first-run',
                '--homepage=about:blank', '--disable-direct-npapi-requests',
                '--disable-web-security', '--disable-notifications',
                '--disable-extensions', '--disable-save-password-bubble']
        if self.ignore_cert_errors:
            chrome_args.append('--ignore-certificate-errors')
        if self.proxy:
            chrome_args.append('--proxy-server=%s' % self.proxy)
        chrome_args.append('about:blank')
        self.logger.info(
                'running: %r', subprocess.list2cmdline(chrome_args))

        # a previous stop() leaves the event set; clear it so that the output
        # reader thread actually runs and the next stop() is not skipped
        self._shutdown.clear()

        # start_new_session - new process group so we can kill the whole group
        self.chrome_process = subprocess.Popen(
                chrome_args, env=new_env, start_new_session=True,
                stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=0)
        self._out_reader_thread = threading.Thread(
                target=self._read_stderr_stdout,
                name='ChromeOutReaderThread(pid=%s)' % self.chrome_process.pid)
        self._out_reader_thread.start()
        self.logger.info('chrome running, pid %s', self.chrome_process.pid)
        # make this a member variable so that kill -QUIT reports it
        self._start = time.time()

        json_url = 'http://localhost:%s/json' % self.port

        while True:
            try:
                with urllib.request.urlopen(json_url, timeout=30) as response:
                    raw_json = response.read()
                all_debug_info = json.loads(raw_json.decode('utf-8'))
                debug_info = [x for x in all_debug_info
                              if x['url'] == 'about:blank']

                if debug_info and 'webSocketDebuggerUrl' in debug_info[0]:
                    self.logger.debug('%s returned %s', json_url, raw_json)
                    url = debug_info[0]['webSocketDebuggerUrl']
                    self.logger.info(
                            'got chrome window websocket debug url %s from %s',
                            url, json_url)
                    return url
            except Exception as e:
                # Exception rather than BaseException, so KeyboardInterrupt
                # and SystemExit are not swallowed by the retry loop; only
                # log during one second out of every ten to limit the noise
                if int(time.time() - self._start) % 10 == 5:
                    self.logger.warning(
                            'problem with %s (will keep trying until timeout '
                            'of %d seconds): %s', json_url, timeout_sec, e)

            # checked after (not in a finally around) the attempt, so that a
            # successful return is neither delayed nor overridden
            elapsed = time.time() - self._start
            if elapsed > timeout_sec:
                self.logger.error(
                        'killing chrome, failed to retrieve %s after %s '
                        'seconds', json_url, elapsed)
                self.stop()
                raise Exception(
                        'killed chrome, failed to retrieve %s after %s '
                        'seconds' % (json_url, elapsed))
            time.sleep(0.5)

    def _readline_nonblock(self, f):
        '''
        Reads at most one line from pipe `f`, a byte at a time, giving up
        when no data arrives within 0.5 seconds or shutdown is requested.

        Returns a tuple (buf, eof): the bytes read so far (possibly empty or
        an incomplete line) and whether end of file was reached.
        '''
        # XXX select doesn't work on windows
        buf = b''
        while not self._shutdown.is_set() and not buf.endswith(b'\n'):
            if not select.select([f], [], [], 0.5)[0]:
                break
            byte = f.read(1)
            if not byte:
                # at EOF select() reports the pipe readable forever, so the
                # caller must stop polling it to avoid a busy loop
                return buf, True
            buf += byte
        return buf, False

    def _log_chrome_output(self, pid, label, buf):
        '''
        Logs `buf`, a chunk of chrome output read from the stream named
        `label` ('STDOUT' or 'STDERR'), at TRACE level if it matches the
        known-noise pattern and at DEBUG level otherwise.
        '''
        if self._QUIET_OUTPUT_RE.search(buf):
            self.logger.log(
                    brozzler.TRACE, 'chrome pid %s %s %s', pid, label, buf)
        else:
            self.logger.debug('chrome pid %s %s %s', pid, label, buf)

    def _read_stderr_stdout(self):
        '''
        Body of the output reader thread: alternately reads chrome's stdout
        and stderr and logs what it reads, until shutdown is requested or
        both pipes have reached end of file.
        '''
        proc = self.chrome_process
        pipes = [('STDOUT', proc.stdout), ('STDERR', proc.stderr)]
        try:
            while pipes and not self._shutdown.is_set():
                for entry in list(pipes):
                    label, pipe = entry
                    buf, eof = self._readline_nonblock(pipe)
                    if buf:
                        self._log_chrome_output(proc.pid, label, buf)
                    if eof:
                        pipes.remove(entry)
        except Exception:
            self.logger.error('unexpected exception', exc_info=True)

    def stop(self):
        '''
        Terminates the chrome process group and waits for chrome to exit.

        Sends SIGTERM, polls for up to 300 seconds, then sends SIGKILL and
        reaps the process. Does nothing if chrome is not running or a stop is
        already in progress. Always joins the output reader thread and closes
        the output pipes before returning.
        '''
        if not self.chrome_process or self._shutdown.is_set():
            return

        timeout_sec = 300
        self._shutdown.set()
        self.logger.info(
                'terminating chrome pgid %s', self.chrome_process.pid)

        try:
            # inside the try so that cleanup below still happens if signalling
            # the process group fails
            os.killpg(self.chrome_process.pid, signal.SIGTERM)
            first_sigterm = time.time()

            while time.time() - first_sigterm < timeout_sec:
                time.sleep(0.5)

                status = self.chrome_process.poll()
                if status is not None:
                    if status == 0:
                        self.logger.info(
                                'chrome pid %s exited normally',
                                self.chrome_process.pid)
                    else:
                        self.logger.warning(
                                'chrome pid %s exited with nonzero status %s',
                                self.chrome_process.pid, status)

                    # XXX I would like to forcefully kill the process group
                    # here to guarantee no orphaned chromium subprocesses hang
                    # around, but there's a chance I suppose that some other
                    # process could have started with the same pgid
                    return

            self.logger.warning(
                    'chrome pid %s still alive %.1f seconds after sending '
                    'SIGTERM, sending SIGKILL', self.chrome_process.pid,
                    time.time() - first_sigterm)
            os.killpg(self.chrome_process.pid, signal.SIGKILL)
            status = self.chrome_process.wait()
            self.logger.warning(
                    'chrome pid %s reaped (status=%s) after killing with '
                    'SIGKILL', self.chrome_process.pid, status)
        finally:
            self._out_reader_thread.join()
            # the reader thread is done, release the pipe file descriptors
            for pipe in (self.chrome_process.stdout,
                         self.chrome_process.stderr):
                if pipe is not None:
                    pipe.close()
            self.chrome_process = None
|
2
setup.py
2
setup.py
|
@ -32,7 +32,7 @@ def find_package_data(package):
|
||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='brozzler',
|
name='brozzler',
|
||||||
version='1.1b8.dev130',
|
version='1.1b8.dev131',
|
||||||
description='Distributed web crawling with browsers',
|
description='Distributed web crawling with browsers',
|
||||||
url='https://github.com/internetarchive/brozzler',
|
url='https://github.com/internetarchive/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue