From f6ffb4acea2e68e648d07f44c4261f9fd90a2f10 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Thu, 10 Jan 2019 16:11:24 -0800 Subject: [PATCH 1/6] update (C) --- brozzler/behaviors.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/brozzler/behaviors.yaml b/brozzler/behaviors.yaml index 1ba1a92..e792e69 100644 --- a/brozzler/behaviors.yaml +++ b/brozzler/behaviors.yaml @@ -1,7 +1,7 @@ # # brozzler/behaviors.yaml - behavior configuration # -# Copyright (C) 2014-2018 Internet Archive +# Copyright (C) 2014-2019 Internet Archive # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From c288c9ae98dab6eff740ab18cc25bea09342773f Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Wed, 6 Feb 2019 16:22:10 +0000 Subject: [PATCH 2/6] Add disk cache options to Chrome Add `Chrome` options `disk_cache` and `disk_cache_size` which add chromium options `--disk-cache-dir=` and `--disk-cache-size=N` (bytes). The default is to use `--disable-cache` (no disk caching). There are two ways to use the new vars, if you just use `Chrome(disk_cache=True)` the chromium cli option `--disable-cache` is NOT used and chromium writes disk cache inside profile dir. If you use `Chrome(disk_cache='/tmp/custom_dir', disk_cache_size=10000)` chromium will use `--disk-cache-dir=/tmp/custom_dir --disk-cache-size=10000`. --- brozzler/chrome.py | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/brozzler/chrome.py b/brozzler/chrome.py index 7423328..0036c1b 100644 --- a/brozzler/chrome.py +++ b/brozzler/chrome.py @@ -62,7 +62,8 @@ def check_version(chrome_exe): class Chrome: logger = logging.getLogger(__module__ + '.' + __qualname__) - def __init__(self, chrome_exe, port=9222, ignore_cert_errors=False): + def __init__(self, chrome_exe, port=9222, ignore_cert_errors=False, + disk_cache=None, disk_cache_size=None): ''' Initializes instance of this class. 
@@ -79,6 +80,8 @@ class Chrome: self.ignore_cert_errors = ignore_cert_errors self._shutdown = threading.Event() self.chrome_process = None + self.disk_cache = disk_cache + self.disk_cache_size = disk_cache_size def __enter__(self): ''' @@ -134,7 +137,8 @@ class Chrome: cookie_location, exc_info=True) return cookie_db - def start(self, proxy=None, cookie_db=None): + def start(self, proxy=None, cookie_db=None, disk_cache=None, + disk_cache_size=None): ''' Starts chrome/chromium process. @@ -144,7 +148,12 @@ class Chrome: which, if supplied, will be written to {chrome_user_data_dir}/Default/Cookies before running the browser (default None) - + disk_cache: use disk cache. If True, use default cache location inside + `self._home_tmpdir`. If its a string, try to use that path for + disk cache (default None) + disk_cache_size: Forces the maximum disk space to be used by the disk + cache, in bytes. Used only when `cache` is a disk path. + (default None) Returns: websocket url to chrome window with about:blank loaded ''' @@ -154,6 +163,10 @@ class Chrome: self._home_tmpdir.name, 'chrome-user-data') if cookie_db: self._init_cookie_db(cookie_db) + if disk_cache: + self.disk_cache = disk_cache + if disk_cache_size: + self.disk_cache_size = disk_cache_size self._shutdown.clear() new_env = os.environ.copy() @@ -166,12 +179,22 @@ class Chrome: '--disable-background-networking', '--disable-renderer-backgrounding', '--disable-hang-monitor', '--disable-background-timer-throttling', '--mute-audio', - '--disable-web-sockets', '--disable-cache', + '--disable-web-sockets', '--window-size=1100,900', '--no-default-browser-check', '--disable-first-run-ui', '--no-first-run', '--homepage=about:blank', '--disable-direct-npapi-requests', '--disable-web-security', '--disable-notifications', '--disable-extensions', '--disable-save-password-bubble'] + + if self.disk_cache: + if isinstance(self.disk_cache, str): + chrome_args.append('--disk-cache-dir=%s' % self.disk_cache) + if 
self.disk_cache_size: + chrome_args.append('--disk-cache-size=%s' % + self.disk_cache_size) + else: + chrome_args.append('--disable-cache') + if self.ignore_cert_errors: chrome_args.append('--ignore-certificate-errors') if proxy: From 31e611771ea6c5608fadcc10f149b03ef8a77fa4 Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Thu, 7 Feb 2019 07:42:45 +0000 Subject: [PATCH 3/6] Improve disk cache options Remove `--disable-cache`, it's not used any more. Rename `disk_cache` to `disk_cache_dir` and use only path (str) argument. Decouple `--disk-cache-size` from `--disk-cache-dir` so it is possible to use either or both. --- brozzler/chrome.py | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/brozzler/chrome.py b/brozzler/chrome.py index 0036c1b..d83b7b1 100644 --- a/brozzler/chrome.py +++ b/brozzler/chrome.py @@ -63,7 +63,7 @@ class Chrome: logger = logging.getLogger(__module__ + '.' + __qualname__) def __init__(self, chrome_exe, port=9222, ignore_cert_errors=False, - disk_cache=None, disk_cache_size=None): + disk_cache_dir=None, disk_cache_size=None): ''' Initializes instance of this class. @@ -80,7 +80,7 @@ class Chrome: self.ignore_cert_errors = ignore_cert_errors self._shutdown = threading.Event() self.chrome_process = None - self.disk_cache = disk_cache + self.disk_cache_dir = disk_cache_dir self.disk_cache_size = disk_cache_size def __enter__(self): @@ -137,7 +137,7 @@ class Chrome: cookie_location, exc_info=True) return cookie_db - def start(self, proxy=None, cookie_db=None, disk_cache=None, + def start(self, proxy=None, cookie_db=None, disk_cache_dir=None, disk_cache_size=None): ''' Starts chrome/chromium process. @@ -148,9 +148,8 @@ class Chrome: which, if supplied, will be written to {chrome_user_data_dir}/Default/Cookies before running the browser (default None) - disk_cache: use disk cache. If True, use default cache location inside - `self._home_tmpdir`. 
If its a string, try to use that path for - disk cache (default None) + disk_cache_dir: use directory for disk cache. The default location + is inside `self._home_tmpdir` (default None). disk_cache_size: Forces the maximum disk space to be used by the disk cache, in bytes. Used only when `cache` is a disk path. (default None) @@ -163,8 +162,8 @@ class Chrome: self._home_tmpdir.name, 'chrome-user-data') if cookie_db: self._init_cookie_db(cookie_db) - if disk_cache: - self.disk_cache = disk_cache + if disk_cache_dir: + self.disk_cache_dir = disk_cache_dir if disk_cache_size: self.disk_cache_size = disk_cache_size self._shutdown.clear() @@ -186,15 +185,10 @@ class Chrome: '--disable-web-security', '--disable-notifications', '--disable-extensions', '--disable-save-password-bubble'] - if self.disk_cache: - if isinstance(self.disk_cache, str): - chrome_args.append('--disk-cache-dir=%s' % self.disk_cache) - if self.disk_cache_size: - chrome_args.append('--disk-cache-size=%s' % - self.disk_cache_size) - else: - chrome_args.append('--disable-cache') - + if self.disk_cache_dir: + chrome_args.append('--disk-cache-dir=%s' % self.disk_cache_dir) + if self.disk_cache_size: + chrome_args.append('--disk-cache-size=%s' % self.disk_cache_size) if self.ignore_cert_errors: chrome_args.append('--ignore-certificate-errors') if proxy: From adeca823dd28e6a613f87638395f99174442b80b Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Tue, 12 Feb 2019 07:21:44 +0000 Subject: [PATCH 4/6] Remove stale comment --- brozzler/chrome.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/brozzler/chrome.py b/brozzler/chrome.py index d83b7b1..cb69bda 100644 --- a/brozzler/chrome.py +++ b/brozzler/chrome.py @@ -151,8 +151,7 @@ class Chrome: disk_cache_dir: use directory for disk cache. The default location is inside `self._home_tmpdir` (default None). disk_cache_size: Forces the maximum disk space to be used by the disk - cache, in bytes. Used only when `cache` is a disk path. 
- (default None) + cache, in bytes. (default None) Returns: websocket url to chrome window with about:blank loaded ''' From 9c48a6fa11122ca0ba4df339a721ebf33b67ccdd Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Tue, 12 Feb 2019 20:59:08 +0000 Subject: [PATCH 5/6] Use disk cache params only on Chrome.start Use `disk_cache_dir` and `disk_cache_size` only on `Chrome.start` and not on `Chrome.__init__`. Drop `disk_cache_dir` and `disk_cache_size` class attributes. --- brozzler/chrome.py | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/brozzler/chrome.py b/brozzler/chrome.py index cb69bda..5928586 100644 --- a/brozzler/chrome.py +++ b/brozzler/chrome.py @@ -62,8 +62,7 @@ def check_version(chrome_exe): class Chrome: logger = logging.getLogger(__module__ + '.' + __qualname__) - def __init__(self, chrome_exe, port=9222, ignore_cert_errors=False, - disk_cache_dir=None, disk_cache_size=None): + def __init__(self, chrome_exe, port=9222, ignore_cert_errors=False): ''' Initializes instance of this class. 
@@ -80,8 +79,6 @@ class Chrome: self.ignore_cert_errors = ignore_cert_errors self._shutdown = threading.Event() self.chrome_process = None - self.disk_cache_dir = disk_cache_dir - self.disk_cache_size = disk_cache_size def __enter__(self): ''' @@ -161,10 +158,6 @@ class Chrome: self._home_tmpdir.name, 'chrome-user-data') if cookie_db: self._init_cookie_db(cookie_db) - if disk_cache_dir: - self.disk_cache_dir = disk_cache_dir - if disk_cache_size: - self.disk_cache_size = disk_cache_size self._shutdown.clear() new_env = os.environ.copy() @@ -184,10 +177,10 @@ class Chrome: '--disable-web-security', '--disable-notifications', '--disable-extensions', '--disable-save-password-bubble'] - if self.disk_cache_dir: - chrome_args.append('--disk-cache-dir=%s' % self.disk_cache_dir) - if self.disk_cache_size: - chrome_args.append('--disk-cache-size=%s' % self.disk_cache_size) + if disk_cache_dir: + chrome_args.append('--disk-cache-dir=%s' % disk_cache_dir) + if disk_cache_size: + chrome_args.append('--disk-cache-size=%s' % disk_cache_size) if self.ignore_cert_errors: chrome_args.append('--ignore-certificate-errors') if proxy: From 61274ae9940daedbb40345868257676d88de5795 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 14 Mar 2019 20:03:17 +0000 Subject: [PATCH 6/6] peg to working doublethink see: https://github.com/internetarchive/doublethink/commit/f7fc7da725c9b --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 0c610e4..d5c54af 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.5.dev320', + version='1.5.0', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', @@ -71,7 +71,7 @@ setuptools.setup( 'websocket-client>=0.39.0,<=0.48.0', 'pillow>=5.2.0', 'urlcanon>=0.1.dev23', - 'doublethink>=0.2.0.dev90', + 'doublethink>=0.2.0', 'rethinkdb>=2.3', 
'cerberus>=1.0.1', 'jinja2>=2.10',