mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-08-14 09:15:54 -04:00
make configurable
This commit is contained in:
parent
754d024f7a
commit
4936ce09d7
3 changed files with 21 additions and 11 deletions
|
@ -362,11 +362,6 @@ class Browser:
|
||||||
self.send_to_chrome(method='ServiceWorker.enable')
|
self.send_to_chrome(method='ServiceWorker.enable')
|
||||||
self.send_to_chrome(method='ServiceWorker.setForceUpdateOnPageLoad')
|
self.send_to_chrome(method='ServiceWorker.setForceUpdateOnPageLoad')
|
||||||
|
|
||||||
# traffic shaping used by SPN2 to aid warcprox resilience
|
|
||||||
# 4194304 bytes/second = 4MB/second
|
|
||||||
self.send_to_chrome(method='Network.emulateNetworkConditions',
|
|
||||||
params={'downloadThroughput': 4194304})
|
|
||||||
|
|
||||||
# disable google analytics and amp analytics
|
# disable google analytics and amp analytics
|
||||||
self.send_to_chrome(
|
self.send_to_chrome(
|
||||||
method='Network.setBlockedURLs',
|
method='Network.setBlockedURLs',
|
||||||
|
@ -428,7 +423,7 @@ class Browser:
|
||||||
username=None, password=None, hashtags=None,
|
username=None, password=None, hashtags=None,
|
||||||
screenshot_full_page=False, skip_extract_outlinks=False,
|
screenshot_full_page=False, skip_extract_outlinks=False,
|
||||||
skip_visit_hashtags=False, skip_youtube_dl=False, simpler404=False,
|
skip_visit_hashtags=False, skip_youtube_dl=False, simpler404=False,
|
||||||
page_timeout=300, behavior_timeout=900):
|
page_timeout=300, behavior_timeout=900, download_throughput=-1):
|
||||||
'''
|
'''
|
||||||
Browses page in browser.
|
Browses page in browser.
|
||||||
|
|
||||||
|
@ -493,7 +488,8 @@ class Browser:
|
||||||
with brozzler.thread_accept_exceptions():
|
with brozzler.thread_accept_exceptions():
|
||||||
self.configure_browser(
|
self.configure_browser(
|
||||||
extra_headers=extra_headers,
|
extra_headers=extra_headers,
|
||||||
user_agent=user_agent)
|
user_agent=user_agent,
|
||||||
|
download_throughput=download_throughput)
|
||||||
self.navigate_to_page(page_url, timeout=page_timeout)
|
self.navigate_to_page(page_url, timeout=page_timeout)
|
||||||
if password:
|
if password:
|
||||||
self.try_login(username, password, timeout=page_timeout)
|
self.try_login(username, password, timeout=page_timeout)
|
||||||
|
@ -577,7 +573,7 @@ class Browser:
|
||||||
# run behavior again with short timeout?
|
# run behavior again with short timeout?
|
||||||
# retrieve outlinks again and append to list?
|
# retrieve outlinks again and append to list?
|
||||||
|
|
||||||
def configure_browser(self, extra_headers=None, user_agent=None):
|
def configure_browser(self, extra_headers=None, user_agent=None, download_throughput=-1):
|
||||||
headers = extra_headers or {}
|
headers = extra_headers or {}
|
||||||
headers['Accept-Encoding'] = 'gzip' # avoid encodings br, sdch
|
headers['Accept-Encoding'] = 'gzip' # avoid encodings br, sdch
|
||||||
self.websock_thread.expect_result(self._command_id.peek())
|
self.websock_thread.expect_result(self._command_id.peek())
|
||||||
|
@ -591,6 +587,11 @@ class Browser:
|
||||||
msg_id = self.send_to_chrome(
|
msg_id = self.send_to_chrome(
|
||||||
method='Network.setUserAgentOverride',
|
method='Network.setUserAgentOverride',
|
||||||
params={'userAgent': user_agent})
|
params={'userAgent': user_agent})
|
||||||
|
if download_throughput > -1:
|
||||||
|
# traffic shaping already used by SPN2 to aid warcprox resilience
|
||||||
|
# parameter value as bytes/second, or -1 to disable (default)
|
||||||
|
msg_id = self.send_to_chrome(method='Network.emulateNetworkConditions',
|
||||||
|
params={'downloadThroughput': download_throughput})
|
||||||
|
|
||||||
def navigate_to_page(self, page_url, timeout=300):
|
def navigate_to_page(self, page_url, timeout=300):
|
||||||
self.logger.info('navigating to page %s', page_url)
|
self.logger.info('navigating to page %s', page_url)
|
||||||
|
|
9
brozzler/cli.py
Normal file → Executable file
9
brozzler/cli.py
Normal file → Executable file
|
@ -153,6 +153,9 @@ def brozzle_page(argv=None):
|
||||||
help='use this password to try to log in if a login form is found')
|
help='use this password to try to log in if a login form is found')
|
||||||
arg_parser.add_argument(
|
arg_parser.add_argument(
|
||||||
'--proxy', dest='proxy', default=None, help='http proxy')
|
'--proxy', dest='proxy', default=None, help='http proxy')
|
||||||
|
arg_parser.add_argument(
|
||||||
|
'--browser_throughput', type=int, dest='download_throughput', default=-1,
|
||||||
|
help='Chrome DevTools downloadThroughput for Network.emulateNetworkConditions')
|
||||||
arg_parser.add_argument(
|
arg_parser.add_argument(
|
||||||
'--screenshot-full-page', dest='screenshot_full_page',
|
'--screenshot-full-page', dest='screenshot_full_page',
|
||||||
action='store_true')
|
action='store_true')
|
||||||
|
@ -185,7 +188,8 @@ def brozzle_page(argv=None):
|
||||||
skip_visit_hashtags=args.skip_visit_hashtags,
|
skip_visit_hashtags=args.skip_visit_hashtags,
|
||||||
skip_youtube_dl=args.skip_youtube_dl,
|
skip_youtube_dl=args.skip_youtube_dl,
|
||||||
simpler404=args.simpler404,
|
simpler404=args.simpler404,
|
||||||
screenshot_full_page=args.screenshot_full_page)
|
screenshot_full_page=args.screenshot_full_page,
|
||||||
|
download_throughput=args.download_throughput)
|
||||||
|
|
||||||
def on_screenshot(screenshot_jpeg):
|
def on_screenshot(screenshot_jpeg):
|
||||||
OK_CHARS = string.ascii_letters + string.digits
|
OK_CHARS = string.ascii_letters + string.digits
|
||||||
|
@ -315,6 +319,9 @@ def brozzler_worker(argv=None):
|
||||||
help='max number of chrome instances simultaneously browsing pages')
|
help='max number of chrome instances simultaneously browsing pages')
|
||||||
arg_parser.add_argument(
|
arg_parser.add_argument(
|
||||||
'--proxy', dest='proxy', default=None, help='http proxy')
|
'--proxy', dest='proxy', default=None, help='http proxy')
|
||||||
|
arg_parser.add_argument(
|
||||||
|
'--browser_throughput', type=int, dest='download_throughput', default=-1,
|
||||||
|
help='Chrome DevTools downloadThroughput for Network.emulateNetworkConditions')
|
||||||
arg_parser.add_argument(
|
arg_parser.add_argument(
|
||||||
'--warcprox-auto', dest='warcprox_auto', action='store_true',
|
'--warcprox-auto', dest='warcprox_auto', action='store_true',
|
||||||
help=(
|
help=(
|
||||||
|
|
|
@ -52,7 +52,7 @@ class BrozzlerWorker:
|
||||||
chrome_exe="chromium-browser", warcprox_auto=False, proxy=None,
|
chrome_exe="chromium-browser", warcprox_auto=False, proxy=None,
|
||||||
skip_extract_outlinks=False, skip_visit_hashtags=False,
|
skip_extract_outlinks=False, skip_visit_hashtags=False,
|
||||||
skip_youtube_dl=False, simpler404=False, screenshot_full_page=False,
|
skip_youtube_dl=False, simpler404=False, screenshot_full_page=False,
|
||||||
page_timeout=300, behavior_timeout=900):
|
page_timeout=300, behavior_timeout=900, download_throughput=-1):
|
||||||
self._frontier = frontier
|
self._frontier = frontier
|
||||||
self._service_registry = service_registry
|
self._service_registry = service_registry
|
||||||
self._max_browsers = max_browsers
|
self._max_browsers = max_browsers
|
||||||
|
@ -68,6 +68,7 @@ class BrozzlerWorker:
|
||||||
self._screenshot_full_page = screenshot_full_page
|
self._screenshot_full_page = screenshot_full_page
|
||||||
self._page_timeout = page_timeout
|
self._page_timeout = page_timeout
|
||||||
self._behavior_timeout = behavior_timeout
|
self._behavior_timeout = behavior_timeout
|
||||||
|
self._download_throughput = download_throughput
|
||||||
|
|
||||||
self._browser_pool = brozzler.browser.BrowserPool(
|
self._browser_pool = brozzler.browser.BrowserPool(
|
||||||
max_browsers, chrome_exe=chrome_exe, ignore_cert_errors=True)
|
max_browsers, chrome_exe=chrome_exe, ignore_cert_errors=True)
|
||||||
|
@ -306,7 +307,8 @@ class BrozzlerWorker:
|
||||||
simpler404=self._simpler404,
|
simpler404=self._simpler404,
|
||||||
screenshot_full_page=self._screenshot_full_page,
|
screenshot_full_page=self._screenshot_full_page,
|
||||||
page_timeout=self._page_timeout,
|
page_timeout=self._page_timeout,
|
||||||
behavior_timeout=self._behavior_timeout)
|
behavior_timeout=self._behavior_timeout,
|
||||||
|
download_throughput=self._download_throughput)
|
||||||
if final_page_url != page.url:
|
if final_page_url != page.url:
|
||||||
page.note_redirect(final_page_url)
|
page.note_redirect(final_page_url)
|
||||||
return outlinks
|
return outlinks
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue