From 36b4f80350122b9d9a2a641fd7a10601a7cf3798 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Thu, 25 Jun 2020 17:18:56 -0700 Subject: [PATCH 1/2] try SPN2 downloadThroughput limit --- brozzler/browser.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/brozzler/browser.py b/brozzler/browser.py index 6c210ff..a302237 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -1,7 +1,7 @@ ''' brozzler/browser.py - manages the browsers for brozzler -Copyright (C) 2014-2018 Internet Archive +Copyright (C) 2014-2020 Internet Archive Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -362,6 +362,11 @@ class Browser: self.send_to_chrome(method='ServiceWorker.enable') self.send_to_chrome(method='ServiceWorker.setForceUpdateOnPageLoad') + # traffic shaping used by SPN2 to aid warcprox resilience + # 4194304 bytes/second = 4MB/second + self.send_to_chrome(method='Network.emulateNetworkConditions', + params={'downloadThroughput': 4194304}) + # disable google analytics and amp analytics self.send_to_chrome( method='Network.setBlockedURLs', From 739d09294e57483ec9e3789589069547eb560327 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Fri, 26 Jun 2020 18:32:11 -0700 Subject: [PATCH 2/2] make configurable --- brozzler/browser.py | 17 +++++++++-------- brozzler/cli.py | 9 ++++++++- brozzler/worker.py | 6 ++++-- 3 files changed, 21 insertions(+), 11 deletions(-) mode change 100644 => 100755 brozzler/cli.py diff --git a/brozzler/browser.py b/brozzler/browser.py index a302237..34d3eec 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -362,11 +362,6 @@ class Browser: self.send_to_chrome(method='ServiceWorker.enable') self.send_to_chrome(method='ServiceWorker.setForceUpdateOnPageLoad') - # traffic shaping used by SPN2 to aid warcprox resilience - # 4194304 bytes/second = 4MB/second - self.send_to_chrome(method='Network.emulateNetworkConditions', - 
params={'downloadThroughput': 4194304}) - # disable google analytics and amp analytics self.send_to_chrome( method='Network.setBlockedURLs', @@ -428,7 +423,7 @@ class Browser: username=None, password=None, hashtags=None, screenshot_full_page=False, skip_extract_outlinks=False, skip_visit_hashtags=False, skip_youtube_dl=False, simpler404=False, - page_timeout=300, behavior_timeout=900): + page_timeout=300, behavior_timeout=900, download_throughput=-1): ''' Browses page in browser. @@ -493,7 +488,8 @@ class Browser: with brozzler.thread_accept_exceptions(): self.configure_browser( extra_headers=extra_headers, - user_agent=user_agent) + user_agent=user_agent, + download_throughput=download_throughput) self.navigate_to_page(page_url, timeout=page_timeout) if password: self.try_login(username, password, timeout=page_timeout) @@ -577,7 +573,7 @@ class Browser: # run behavior again with short timeout? # retrieve outlinks again and append to list? - def configure_browser(self, extra_headers=None, user_agent=None): + def configure_browser(self, extra_headers=None, user_agent=None, download_throughput=-1): headers = extra_headers or {} headers['Accept-Encoding'] = 'gzip' # avoid encodings br, sdch self.websock_thread.expect_result(self._command_id.peek()) @@ -591,6 +587,11 @@ class Browser: msg_id = self.send_to_chrome( method='Network.setUserAgentOverride', params={'userAgent': user_agent}) + if download_throughput > -1: + # traffic shaping already used by SPN2 to aid warcprox resilience + # parameter value as bytes/second, or -1 to disable (default) + msg_id = self.send_to_chrome(method='Network.emulateNetworkConditions', + params={'downloadThroughput': download_throughput}) def navigate_to_page(self, page_url, timeout=300): self.logger.info('navigating to page %s', page_url) diff --git a/brozzler/cli.py b/brozzler/cli.py old mode 100644 new mode 100755 index 347487f..fb973e3 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -153,6 +153,9 @@ def brozzle_page(argv=None): 
help='use this password to try to log in if a login form is found') arg_parser.add_argument( '--proxy', dest='proxy', default=None, help='http proxy') + arg_parser.add_argument( + '--browser_throughput', type=int, dest='download_throughput', default=-1, + help='Chrome DevTools downloadThroughput for Network.emulateNetworkConditions') arg_parser.add_argument( '--screenshot-full-page', dest='screenshot_full_page', action='store_true') @@ -185,7 +188,8 @@ def brozzle_page(argv=None): skip_visit_hashtags=args.skip_visit_hashtags, skip_youtube_dl=args.skip_youtube_dl, simpler404=args.simpler404, - screenshot_full_page=args.screenshot_full_page) + screenshot_full_page=args.screenshot_full_page, + download_throughput=args.download_throughput) def on_screenshot(screenshot_jpeg): OK_CHARS = string.ascii_letters + string.digits @@ -315,6 +319,9 @@ def brozzler_worker(argv=None): help='max number of chrome instances simultaneously browsing pages') arg_parser.add_argument( '--proxy', dest='proxy', default=None, help='http proxy') + arg_parser.add_argument( + '--browser_throughput', type=int, dest='download_throughput', default=-1, + help='Chrome DevTools downloadThroughput for Network.emulateNetworkConditions') arg_parser.add_argument( '--warcprox-auto', dest='warcprox_auto', action='store_true', help=( diff --git a/brozzler/worker.py b/brozzler/worker.py index c8279b2..6c88275 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -52,7 +52,7 @@ class BrozzlerWorker: chrome_exe="chromium-browser", warcprox_auto=False, proxy=None, skip_extract_outlinks=False, skip_visit_hashtags=False, skip_youtube_dl=False, simpler404=False, screenshot_full_page=False, - page_timeout=300, behavior_timeout=900): + page_timeout=300, behavior_timeout=900, download_throughput=-1): self._frontier = frontier self._service_registry = service_registry self._max_browsers = max_browsers @@ -68,6 +68,7 @@ class BrozzlerWorker: self._screenshot_full_page = screenshot_full_page self._page_timeout = 
page_timeout self._behavior_timeout = behavior_timeout + self._download_throughput = download_throughput self._browser_pool = brozzler.browser.BrowserPool( max_browsers, chrome_exe=chrome_exe, ignore_cert_errors=True) @@ -306,7 +307,8 @@ class BrozzlerWorker: simpler404=self._simpler404, screenshot_full_page=self._screenshot_full_page, page_timeout=self._page_timeout, - behavior_timeout=self._behavior_timeout) + behavior_timeout=self._behavior_timeout, + download_throughput=self._download_throughput) if final_page_url != page.url: page.note_redirect(final_page_url) return outlinks