move cookie db management code into chrome.py

This commit is contained in:
Noah Levitt 2016-12-06 18:04:51 -08:00
parent ce03381b92
commit d3063fbd2b
3 changed files with 74 additions and 66 deletions

View File

@ -22,7 +22,6 @@ import itertools
import websocket import websocket
import time import time
import threading import threading
import tempfile
import os import os
import random import random
import brozzler import brozzler
@ -130,31 +129,10 @@ class Browser:
def start(self, proxy=None, cookie_db=None): def start(self, proxy=None, cookie_db=None):
if not self._chrome_instance: if not self._chrome_instance:
# these can raise exceptions
self._work_dir = tempfile.TemporaryDirectory()
if cookie_db is not None:
cookie_dir = os.path.join(
self._work_dir.name, "chrome-user-data", "Default")
cookie_location = os.path.join(cookie_dir, "Cookies")
self.logger.debug(
"cookie DB provided, writing to %s", cookie_location)
os.makedirs(cookie_dir, exist_ok=True)
try:
with open(cookie_location, 'wb') as cookie_file:
cookie_file.write(cookie_db)
except OSError:
self.logger.error(
"exception writing cookie file at %s",
cookie_location, exc_info=True)
self._chrome_instance = Chrome( self._chrome_instance = Chrome(
port=self.chrome_port, executable=self.chrome_exe, port=self.chrome_port, executable=self.chrome_exe,
user_home_dir=self._work_dir.name,
user_data_dir=os.sep.join([
self._work_dir.name, "chrome-user-data"]),
ignore_cert_errors=self.ignore_cert_errors, ignore_cert_errors=self.ignore_cert_errors,
proxy=proxy or self.proxy) proxy=proxy or self.proxy, cookie_db=None)
try: try:
self._websocket_url = self._chrome_instance.start() self._websocket_url = self._chrome_instance.start()
except: except:
@ -166,45 +144,22 @@ class Browser:
if self.is_running(): if self.is_running():
self._chrome_instance.stop() self._chrome_instance.stop()
self._chrome_instance = None self._chrome_instance = None
try:
self._work_dir.cleanup()
except:
self.logger.error("exception deleting %s", self._work_dir,
exc_info=True)
self._work_dir = None
self._websocket_url = None self._websocket_url = None
except: except:
self.logger.error("problem stopping", exc_info=True) self.logger.error("problem stopping", exc_info=True)
def persist_and_read_cookie_db(self):
    '''
    Marks every cookie persistent, then returns the raw cookie database.

    Runs "UPDATE cookies SET persistent = 1" against the sqlite file at
    <work dir>/chrome-user-data/Default/Cookies, then reads that file into
    memory. A failure at either step is logged, not raised.

    Returns:
        bytes contents of the cookie db file, or None if the file could
        not be read
    '''
    cookie_location = os.path.join(
            self._work_dir.name, "chrome-user-data", "Default", "Cookies")
    self.logger.debug(
            "marking cookies persistent then reading file into memory: %s",
            cookie_location)
    try:
        conn = sqlite3.connect(cookie_location)
        try:
            # `with conn` only commits or rolls back the transaction; it
            # does not close the connection, hence the explicit close()
            with conn:
                conn.execute("UPDATE cookies SET persistent = 1")
        finally:
            conn.close()
    except sqlite3.Error:
        self.logger.error("exception updating cookie DB", exc_info=True)

    # still attempt the read if the update failed
    cookie_db = None
    try:
        with open(cookie_location, "rb") as cookie_file:
            cookie_db = cookie_file.read()
    except OSError:
        self.logger.error(
                "exception reading from cookie DB file %s",
                cookie_location, exc_info=True)
    return cookie_db
def is_running(self): def is_running(self):
return bool(self._websocket_url) return bool(self._websocket_url)
def abort_browse_page(self): def abort_browse_page(self):
self._abort_browse_page = True self._abort_browse_page = True
def persist_and_read_cookie_db(self):
    '''
    Delegates to the chrome instance's persist_and_read_cookie_db().

    Returns None when no chrome instance is set.
    '''
    chrome = self._chrome_instance
    if not chrome:
        return None
    return chrome.persist_and_read_cookie_db()
def browse_page( def browse_page(
self, url, extra_headers=None, behavior_parameters=None, self, url, extra_headers=None, behavior_parameters=None,
user_agent=None, user_agent=None,

View File

@ -27,22 +27,21 @@ import select
import re import re
import signal import signal
import sqlite3 import sqlite3
import datetime
import json import json
import psutil import psutil
import tempfile
class Chrome: class Chrome:
logger = logging.getLogger(__module__ + '.' + __qualname__) logger = logging.getLogger(__module__ + '.' + __qualname__)
def __init__( def __init__(
self, port, executable, user_home_dir, user_data_dir, proxy=None, self, port, executable, proxy=None, ignore_cert_errors=False,
ignore_cert_errors=False): cookie_db=None):
self.port = port self.port = port
self.executable = executable self.executable = executable
self.user_home_dir = user_home_dir
self.user_data_dir = user_data_dir
self.proxy = proxy self.proxy = proxy
self.ignore_cert_errors = ignore_cert_errors self.ignore_cert_errors = ignore_cert_errors
self.cookie_db = cookie_db
self._shutdown = threading.Event() self._shutdown = threading.Event()
def __enter__(self): def __enter__(self):
@ -72,17 +71,61 @@ class Chrome:
return default_port return default_port
def _init_cookie_db(self):
    '''
    Writes self.cookie_db, when it is not None, to the "Cookies" file in
    the "Default" directory under the chrome user data dir, creating that
    directory if needed. A failed write is logged, not raised.
    '''
    if self.cookie_db is None:
        return
    profile_dir = os.path.join(self._chrome_user_data_dir, 'Default')
    cookies_path = os.path.join(profile_dir, 'Cookies')
    self.logger.debug(
            'cookie DB provided, writing to %s', cookies_path)
    # not guarded: an OSError from makedirs propagates to the caller
    os.makedirs(profile_dir, exist_ok=True)
    try:
        with open(cookies_path, 'wb') as out:
            out.write(self.cookie_db)
    except OSError:
        self.logger.error(
                'exception writing cookie file at %s',
                cookies_path, exc_info=True)
def persist_and_read_cookie_db(self):
    '''
    Marks every cookie persistent, then returns the raw cookie database.

    Runs 'UPDATE cookies SET persistent = 1' against the sqlite file at
    <chrome user data dir>/Default/Cookies, then reads that file into
    memory. A failure at either step is logged, not raised.

    Returns:
        bytes contents of the cookie db file, or None if the file could
        not be read
    '''
    cookie_location = os.path.join(
            self._chrome_user_data_dir, 'Default', 'Cookies')
    self.logger.debug(
            'marking cookies persistent then reading file into memory: %s',
            cookie_location)
    try:
        conn = sqlite3.connect(cookie_location)
        try:
            # `with conn` only commits or rolls back the transaction; it
            # does not close the connection, hence the explicit close()
            with conn:
                conn.execute('UPDATE cookies SET persistent = 1')
        finally:
            conn.close()
    except sqlite3.Error:
        self.logger.error('exception updating cookie DB', exc_info=True)

    # still attempt the read if the update failed
    cookie_db = None
    try:
        with open(cookie_location, 'rb') as cookie_file:
            cookie_db = cookie_file.read()
    except OSError:
        self.logger.error(
                'exception reading from cookie DB file %s',
                cookie_location, exc_info=True)
    return cookie_db
def start(self): def start(self):
''' '''
Returns websocket url to chrome window with about:blank loaded. Returns websocket url to chrome window with about:blank loaded.
''' '''
timeout_sec = 600 # these can raise exceptions
self._home_tmpdir = tempfile.TemporaryDirectory()
self._chrome_user_data_dir = os.path.join(
self._home_tmpdir.name, 'chrome-user-data'),
self._init_cookie_db()
new_env = os.environ.copy() new_env = os.environ.copy()
new_env['HOME'] = self.user_home_dir new_env['HOME'] = self._home_tmpdir.name
self.port = self._find_available_port(self.port) self.port = self._find_available_port(self.port)
chrome_args = [ chrome_args = [
self.executable, '--use-mock-keychain', # mac thing self.executable, '--use-mock-keychain', # mac thing
'--user-data-dir=%s' % self.user_data_dir, '--user-data-dir=%s' % self._chrome_user_data_dir,
'--remote-debugging-port=%s' % self.port, '--remote-debugging-port=%s' % self.port,
'--disable-web-sockets', '--disable-cache', '--disable-web-sockets', '--disable-cache',
'--window-size=1100,900', '--no-default-browser-check', '--window-size=1100,900', '--no-default-browser-check',
@ -96,7 +139,7 @@ class Chrome:
chrome_args.append('--proxy-server=%s' % self.proxy) chrome_args.append('--proxy-server=%s' % self.proxy)
chrome_args.append('about:blank') chrome_args.append('about:blank')
self.logger.info( self.logger.info(
'running: %s' % repr(subprocess.list2cmdline(chrome_args))) 'running: %s', repr(subprocess.list2cmdline(chrome_args)))
# start_new_session - new process group so we can kill the whole group # start_new_session - new process group so we can kill the whole group
self.chrome_process = subprocess.Popen( self.chrome_process = subprocess.Popen(
chrome_args, env=new_env, start_new_session=True, chrome_args, env=new_env, start_new_session=True,
@ -106,11 +149,14 @@ class Chrome:
name='ChromeOutReaderThread(pid=%s)' % self.chrome_process.pid) name='ChromeOutReaderThread(pid=%s)' % self.chrome_process.pid)
self._out_reader_thread.start() self._out_reader_thread.start()
self.logger.info('chrome running, pid %s' % self.chrome_process.pid) self.logger.info('chrome running, pid %s' % self.chrome_process.pid)
return self._websocket_url()
def _websocket_url(self):
timeout_sec = 600
json_url = 'http://localhost:%s/json' % self.port
# make this a member variable so that kill -QUIT reports it # make this a member variable so that kill -QUIT reports it
self._start = time.time() self._start = time.time()
json_url = 'http://localhost:%s/json' % self.port
while True: while True:
try: try:
raw_json = urllib.request.urlopen(json_url, timeout=30).read() raw_json = urllib.request.urlopen(json_url, timeout=30).read()
@ -134,7 +180,7 @@ class Chrome:
finally: finally:
if time.time() - self._start > timeout_sec: if time.time() - self._start > timeout_sec:
self.logger.error( self.logger.error(
'killing chrome, failed to retrieve %s after %s ' 'killing chrome, failed to retrieve %s after % '
'seconds', json_url, time.time() - self._start) 'seconds', json_url, time.time() - self._start)
self.stop() self.stop()
raise Exception( raise Exception(
@ -228,6 +274,13 @@ class Chrome:
self.logger.warn( self.logger.warn(
'chrome pid %s reaped (status=%s) after killing with ' 'chrome pid %s reaped (status=%s) after killing with '
'SIGKILL', self.chrome_process.pid, status) 'SIGKILL', self.chrome_process.pid, status)
try:
self._home_tmpdir.cleanup()
except:
self.logger.error(
"exception deleting %s", self._home_tmpdir,
exc_info=True)
finally: finally:
self._out_reader_thread.join() self._out_reader_thread.join()
self.chrome_process = None self.chrome_process = None

View File

@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup( setuptools.setup(
name='brozzler', name='brozzler',
version='1.1b8.dev132', version='1.1b8.dev133',
description='Distributed web crawling with browsers', description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler', url='https://github.com/internetarchive/brozzler',
author='Noah Levitt', author='Noah Levitt',