Merge pull request #202 from galgeek/limit_downloadThroughput

configurable limit for Chromium download throughput
commit 1b9ebca13c
jkafader, 2020-07-23 14:14:20 -07:00, committed by GitHub
3 changed files with 22 additions and 7 deletions

brozzler/browser.py

@@ -1,7 +1,7 @@
'''
brozzler/browser.py - manages the browsers for brozzler
-Copyright (C) 2014-2018 Internet Archive
+Copyright (C) 2014-2020 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -423,7 +423,7 @@ class Browser:
username=None, password=None, hashtags=None,
screenshot_full_page=False, skip_extract_outlinks=False,
skip_visit_hashtags=False, skip_youtube_dl=False, simpler404=False,
-page_timeout=300, behavior_timeout=900):
+page_timeout=300, behavior_timeout=900, download_throughput=-1):
'''
Browses page in browser.
@@ -488,7 +488,8 @@
with brozzler.thread_accept_exceptions():
self.configure_browser(
extra_headers=extra_headers,
-user_agent=user_agent)
+user_agent=user_agent,
+download_throughput=download_throughput)
self.navigate_to_page(page_url, timeout=page_timeout)
if password:
self.try_login(username, password, timeout=page_timeout)
@@ -572,7 +573,7 @@
# run behavior again with short timeout?
# retrieve outlinks again and append to list?
-def configure_browser(self, extra_headers=None, user_agent=None):
+def configure_browser(self, extra_headers=None, user_agent=None, download_throughput=-1):
headers = extra_headers or {}
headers['Accept-Encoding'] = 'gzip' # avoid encodings br, sdch
self.websock_thread.expect_result(self._command_id.peek())
@@ -586,6 +587,11 @@
msg_id = self.send_to_chrome(
method='Network.setUserAgentOverride',
params={'userAgent': user_agent})
+if download_throughput > -1:
+# traffic shaping already used by SPN2 to aid warcprox resilience
+# parameter value as bytes/second, or -1 to disable (default)
+msg_id = self.send_to_chrome(method='Network.emulateNetworkConditions',
+params={'downloadThroughput': download_throughput})
def navigate_to_page(self, page_url, timeout=300):
self.logger.info('navigating to page %s', page_url)
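
For context on the DevTools call added above: the Chrome DevTools Protocol documents offline, latency, downloadThroughput and uploadThroughput as required parameters of Network.emulateNetworkConditions, with throughput given in bytes/second and -1 disabling throttling for that direction. A minimal sketch that supplies the full parameter set, assuming a send_to_chrome-style helper like the one in this file, could look like this:

def emulate_download_throughput(send_to_chrome, download_throughput=-1):
    # Sketch only, not part of this change. send_to_chrome is assumed to
    # behave like Browser.send_to_chrome above (method name + params dict).
    # Throughput values are bytes/second; -1 means "do not throttle".
    if download_throughput > -1:
        return send_to_chrome(
                method='Network.emulateNetworkConditions',
                params={
                    'offline': False,  # keep the network reachable
                    'latency': 0,  # add no artificial latency (ms)
                    'downloadThroughput': download_throughput,
                    'uploadThroughput': -1})  # leave uploads unthrottled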

brozzler/cli.py (Normal file → Executable file)

@@ -153,6 +153,9 @@ def brozzle_page(argv=None):
help='use this password to try to log in if a login form is found')
arg_parser.add_argument(
'--proxy', dest='proxy', default=None, help='http proxy')
+arg_parser.add_argument(
+'--browser_throughput', type=int, dest='download_throughput', default=-1,
+help='Chrome DevTools downloadThroughput for Network.emulateNetworkConditions')
arg_parser.add_argument(
'--screenshot-full-page', dest='screenshot_full_page',
action='store_true')
@@ -185,7 +188,8 @@
skip_visit_hashtags=args.skip_visit_hashtags,
skip_youtube_dl=args.skip_youtube_dl,
simpler404=args.simpler404,
-screenshot_full_page=args.screenshot_full_page)
+screenshot_full_page=args.screenshot_full_page,
+download_throughput=args.download_throughput)
def on_screenshot(screenshot_jpeg):
OK_CHARS = string.ascii_letters + string.digits
@@ -315,6 +319,9 @@ def brozzler_worker(argv=None):
help='max number of chrome instances simultaneously browsing pages')
arg_parser.add_argument(
'--proxy', dest='proxy', default=None, help='http proxy')
+arg_parser.add_argument(
+'--browser_throughput', type=int, dest='download_throughput', default=-1,
+help='Chrome DevTools downloadThroughput for Network.emulateNetworkConditions')
arg_parser.add_argument(
'--warcprox-auto', dest='warcprox_auto', action='store_true',
help=(
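
Both new --browser_throughput arguments store into download_throughput, so the same bytes-per-second value (per the comment in browser.py above; -1, the default, disables throttling) reaches both the one-off brozzle-page command and a long-running brozzler-worker. Assuming the usual console scripts, usage might look like:

brozzle-page --browser_throughput 500000 https://example.com/
brozzler-worker --browser_throughput 500000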

brozzler/worker.py

@@ -52,7 +52,7 @@ class BrozzlerWorker:
chrome_exe="chromium-browser", warcprox_auto=False, proxy=None,
skip_extract_outlinks=False, skip_visit_hashtags=False,
skip_youtube_dl=False, simpler404=False, screenshot_full_page=False,
-page_timeout=300, behavior_timeout=900):
+page_timeout=300, behavior_timeout=900, download_throughput=-1):
self._frontier = frontier
self._service_registry = service_registry
self._max_browsers = max_browsers
@@ -68,6 +68,7 @@
self._screenshot_full_page = screenshot_full_page
self._page_timeout = page_timeout
self._behavior_timeout = behavior_timeout
+self._download_throughput = download_throughput
self._browser_pool = brozzler.browser.BrowserPool(
max_browsers, chrome_exe=chrome_exe, ignore_cert_errors=True)
@@ -306,7 +307,8 @@
simpler404=self._simpler404,
screenshot_full_page=self._screenshot_full_page,
page_timeout=self._page_timeout,
-behavior_timeout=self._behavior_timeout)
+behavior_timeout=self._behavior_timeout,
+download_throughput=self._download_throughput)
if final_page_url != page.url:
page.note_redirect(final_page_url)
return outlinks
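
For programmatic use, the new keyword rides along with the existing BrozzlerWorker options and is simply forwarded to browse_page. A rough sketch, assuming a frontier and service registry have already been set up in the usual way (those two names are placeholders here):

# Sketch only: cap download throughput for every browser this worker runs.
# `frontier` and `service_registry` are assumed to exist already.
from brozzler.worker import BrozzlerWorker

worker = BrozzlerWorker(
        frontier, service_registry=service_registry, max_browsers=2,
        download_throughput=500000)  # bytes/second; default -1 = no throttling
worker.start()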