From 0847d93d9ea4769d964b9e786eb6a173b5b69509 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Wed, 15 Mar 2023 14:15:18 -0700 Subject: [PATCH 1/5] add socket_timeout opt for yt-dlp --- brozzler/ydl.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index ffdd523..906c653 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -229,6 +229,7 @@ def _build_youtube_dl(worker, destdir, site, page): if d['postprocessor'] == 'FixupM3u8' and worker._using_warcprox(site): _YoutubeDL._push_stitched_up_vid_to_warcprox(_YoutubeDL, site, d['info_dict']) + # default socket_timeout is 20 -- we hit it often when cluster is busy ydl_opts = { "outtmpl": "{}/ydl%(autonumber)s.out".format(destdir), "retries": 1, @@ -237,6 +238,7 @@ def _build_youtube_dl(worker, destdir, site, page): "noprogress": True, "nopart": True, "no_color": True, + "socket_timeout": 40, "progress_hooks": [maybe_heartbeat_site_last_claimed], "postprocessor_hooks": [ydl_postprocess_hook], From 0d4ed6a8be068b3d2fc3df0ab3ac882f17433c07 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Wed, 15 Mar 2023 15:55:08 -0700 Subject: [PATCH 2/5] bump version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index a21687d..097e720 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.5.32', + version='1.5.33', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', From 7783f92ce23be4cfdebda072332dc8fea65535d9 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Wed, 26 Apr 2023 14:51:19 -0700 Subject: [PATCH 3/5] larger chrome window: 1400,900 --- brozzler/chrome.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/brozzler/chrome.py b/brozzler/chrome.py index f908da2..5ef7427 100644 --- a/brozzler/chrome.py +++ b/brozzler/chrome.py @@ -172,7 +172,7 @@ class Chrome: '--disable-renderer-backgrounding', '--disable-hang-monitor', '--disable-background-timer-throttling', '--mute-audio', '--disable-web-sockets', - '--window-size=1100,900', '--no-default-browser-check', + '--window-size=1400,900', '--no-default-browser-check', '--disable-first-run-ui', '--no-first-run', '--homepage=about:blank', '--disable-direct-npapi-requests', '--disable-web-security', '--disable-notifications', From 6d69105c7944a92d2e6420ea4d1adc54739974a7 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Fri, 28 Apr 2023 13:49:44 -0700 Subject: [PATCH 4/5] configurable window height & width --- brozzler/browser.py | 2 +- brozzler/chrome.py | 9 ++++++--- brozzler/cli.py | 18 ++++++++++++++++-- brozzler/worker.py | 11 ++++++++--- 4 files changed, 31 insertions(+), 9 deletions(-) diff --git a/brozzler/browser.py b/brozzler/browser.py index 0cda56e..581b72f 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -1,7 +1,7 @@ ''' brozzler/browser.py - manages the browsers for brozzler -Copyright (C) 2014-2020 Internet Archive +Copyright (C) 2014-2023 Internet Archive Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/brozzler/chrome.py b/brozzler/chrome.py index 5ef7427..ec712fe 100644 --- a/brozzler/chrome.py +++ b/brozzler/chrome.py @@ -1,7 +1,7 @@ ''' brozzler/chrome.py - manages the chrome/chromium browser for brozzler -Copyright (C) 2014-2020 Internet Archive +Copyright (C) 2014-2023 Internet Archive Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -135,7 +135,8 @@ class Chrome: return cookie_db def start(self, proxy=None, cookie_db=None, disk_cache_dir=None, - disk_cache_size=None, websocket_timeout=60): + disk_cache_size=None, websocket_timeout=60, + window_height=900, window_width=1400): ''' Starts chrome/chromium process. @@ -150,6 +151,7 @@ class Chrome: disk_cache_size: Forces the maximum disk space to be used by the disk cache, in bytes. (default None) websocket_timeout: websocket timeout, in seconds + window_height, window_width: window height and width, in pixels Returns: websocket url to chrome window with about:blank loaded ''' @@ -172,7 +174,8 @@ class Chrome: '--disable-renderer-backgrounding', '--disable-hang-monitor', '--disable-background-timer-throttling', '--mute-audio', '--disable-web-sockets', - '--window-size=1400,900', '--no-default-browser-check', + f'--window-size={window_width},{window_height}', + '--no-default-browser-check', '--disable-first-run-ui', '--no-first-run', '--homepage=about:blank', '--disable-direct-npapi-requests', '--disable-web-security', '--disable-notifications', diff --git a/brozzler/cli.py b/brozzler/cli.py index c146a48..a6d0aba 100755 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -2,7 +2,7 @@ ''' brozzler/cli.py - brozzler command line executables -Copyright (C) 2014-2019 Internet Archive +Copyright (C) 2014-2023 Internet Archive Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -156,6 +156,12 @@ def brozzle_page(argv=None): arg_parser.add_argument( '--browser_throughput', type=int, dest='download_throughput', default=-1, help='Chrome DevTools downloadThroughput for Network.emulateNetworkConditions') + arg_parser.add_argument( + '--browser_window_height', type=int, dest='window_height', default=900, + help='browser window height in pixels') + arg_parser.add_argument( + '--browser_window_width', type=int, dest='window_width', default=1400, + help='browser window width in pixels') arg_parser.add_argument( '--stealth', dest='stealth', action='store_true', help='Try to avoid web bot detection') @@ -193,6 +199,8 @@ def brozzle_page(argv=None): simpler404=args.simpler404, screenshot_full_page=args.screenshot_full_page, download_throughput=args.download_throughput, + window_height=args.window_height, + window_width=args.window_width, stealth=args.stealth) def on_screenshot(screenshot_jpeg): @@ -206,7 +214,7 @@ def brozzle_page(argv=None): browser = brozzler.Browser(chrome_exe=args.chrome_exe) try: - browser.start(proxy=args.proxy) + browser.start(proxy=args.proxy, window_height=args.window_height, window_width=args.window_width) outlinks = worker.brozzle_page( browser, site, page, on_screenshot=on_screenshot, enable_youtube_dl=not args.skip_youtube_dl) @@ -326,6 +334,12 @@ def brozzler_worker(argv=None): arg_parser.add_argument( '--browser_throughput', type=int, dest='download_throughput', default=-1, help='Chrome DevTools downloadThroughput for Network.emulateNetworkConditions') + arg_parser.add_argument( + '--browser_window_height', type=int, dest='window_height', default=900, + help='browser window height in pixels') + arg_parser.add_argument( + '--browser_window_width', type=int, dest='window_width', default=1400, + help='browser window width in pixels') arg_parser.add_argument( '--warcprox-auto', dest='warcprox_auto', action='store_true', help=( diff --git a/brozzler/worker.py b/brozzler/worker.py index 8aa6083..e8f7619 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -3,7 +3,7 @@ brozzler/worker.py - BrozzlerWorker brozzles pages from the frontier, meaning it runs youtube-dl on them, browses them and runs behaviors if appropriate, scopes and adds outlinks to the frontier -Copyright (C) 2014-2018 Internet Archive +Copyright (C) 2014-2023 Internet Archive Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -53,7 +53,8 @@ class BrozzlerWorker: skip_extract_outlinks=False, skip_visit_hashtags=False, skip_youtube_dl=False, simpler404=False, screenshot_full_page=False, page_timeout=300, behavior_timeout=900, extract_outlinks_timeout=60, - download_throughput=-1, stealth=False): + download_throughput=-1, stealth=False, + window_height=900, window_width=1400): self._frontier = frontier self._service_registry = service_registry self._max_browsers = max_browsers @@ -71,6 +72,8 @@ class BrozzlerWorker: self._behavior_timeout = behavior_timeout self._extract_outlinks_timeout = extract_outlinks_timeout self._download_throughput = download_throughput + self._window_height = window_height + self._window_width = window_width self._stealth = stealth self._browser_pool = brozzler.browser.BrowserPool( @@ -294,7 +297,9 @@ class BrozzlerWorker: if not browser.is_running(): browser.start( proxy=self._proxy_for(site), - cookie_db=site.get('cookie_db')) + cookie_db=site.get('cookie_db'), + window_height=self._window_height, + window_width=self._window_width) final_page_url, outlinks = browser.browse_page( page.url, extra_headers=site.extra_headers(page), behavior_parameters=site.get('behavior_parameters'), From b138b1e89b3be16da402c112da7055657dc089dc Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Sun, 30 Apr 2023 11:15:23 -0700 Subject: [PATCH 5/5] bump version, copyright date --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 097e720..98fdecc 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ ''' setup.py - brozzler setup script -Copyright (C) 2014-2022 Internet Archive +Copyright (C) 2014-2023 Internet Archive Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.5.33', + version='1.5.34', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt',