diff --git a/brozzler/chrome.py b/brozzler/chrome.py
index f3a4841..5a1a8aa 100644
--- a/brozzler/chrome.py
+++ b/brozzler/chrome.py
@@ -48,7 +48,7 @@ def check_version(chrome_exe):
     # Chromium 61.0.3163.100 Built on Ubuntu , running on Ubuntu 16.04
     cmd = [chrome_exe, "--version"]
     out = subprocess.check_output(cmd, timeout=60)
-    m = re.search(rb"(Chromium|Google Chrome) ([\d.]+)", out)
+    m = re.search(br'(Chromium|Google Chrome|Thorium) ([\d.]+)', out)
     if not m:
         sys.exit(
             "unable to parse browser version from output of "
diff --git a/brozzler/cli.py b/brozzler/cli.py
index 7523008..bb360c9 100755
--- a/brozzler/cli.py
+++ b/brozzler/cli.py
@@ -129,9 +129,9 @@ def configure_logging(args):
 def suggest_default_chrome_exe():
     # mac os x application executable paths
     for path in [
-        "/Applications/Chromium.app/Contents/MacOS/Chromium",
-        "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
-    ]:
+        '/Applications/Thorium.app/Contents/MacOS/Thorium',
+        '/Applications/Chromium.app/Contents/MacOS/Chromium',
+        '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome']:
         if os.path.exists(path):
             return path
 
diff --git a/brozzler/ydl.py b/brozzler/ydl.py
index 2120fbb..6934cc0 100644
--- a/brozzler/ydl.py
+++ b/brozzler/ydl.py
@@ -104,35 +104,12 @@ class ExtraHeaderAdder(urllib.request.BaseHandler):
         return req
 
 
-class YoutubeDLSpy(urllib.request.BaseHandler):
-    logger = logging.getLogger(__module__ + "." + __qualname__)
-
-    def __init__(self):
-        self.reset()
-
-    def _http_response(self, request, response):
-        fetch = {
-            "url": request.full_url,
-            "method": request.get_method(),
-            "response_code": response.code,
-            "response_headers": response.headers,
-        }
-        self.fetches.append(fetch)
-        return response
-
-    http_response = https_response = _http_response
-
-    def reset(self):
-        self.fetches = []
-
-
 def _build_youtube_dl(worker, destdir, site, page):
     """
     Builds a yt-dlp `yt_dlp.YoutubeDL` for brozzling `site` with `worker`.
 
     The `YoutubeDL` instance does a few special brozzler-specific things:
 
-    - keeps track of urls fetched using a `YoutubeDLSpy`
     - periodically updates `site.last_claimed` in rethinkdb
     - pushes captured video to warcprox using a WARCPROX_WRITE_RECORD request
     - some logging
@@ -141,6 +118,7 @@ def _build_youtube_dl(worker, destdir, site, page):
         worker (brozzler.BrozzlerWorker): the calling brozzler worker
         destdir (str): where to save downloaded videos
         site (brozzler.Site): the site we are brozzling
+        page (brozzler.Page): the page we are brozzling
 
     Returns:
         a yt-dlp `yt_dlp.YoutubeDL` instance
@@ -310,7 +288,7 @@ def _build_youtube_dl(worker, destdir, site, page):
         "match_filter": match_filter_func("!is_live"),
         "extractor_args": {"youtube": {"skip": ["dash", "hls"]}},
         # --cache-dir local or..
-        # this looked like a problem with nsf-mounted homedir, shouldn't be a problem for brozzler on focal?
+        # this looked like a problem with nsf-mounted homedir, maybe not a problem for brozzler on focal?
         "cache_dir": "/home/archiveit",
         "logger": logging.getLogger("yt_dlp"),
         "verbose": False,
@@ -324,56 +302,25 @@ def _build_youtube_dl(worker, destdir, site, page):
     ydl = _YoutubeDL(ydl_opts)
     if site.extra_headers():
         ydl._opener.add_handler(ExtraHeaderAdder(site.extra_headers(page)))
-    ydl.fetch_spy = YoutubeDLSpy()
     ydl.pushed_videos = []
-    ydl._opener.add_handler(ydl.fetch_spy)
     return ydl
 
-
-def _remember_videos(page, fetches, pushed_videos=None):
+def _remember_videos(page, pushed_videos=None):
     """
     Saves info about videos captured by yt-dlp in `page.videos`.
     """
     if not "videos" in page:
         page.videos = []
-    for fetch in fetches or []:
-        content_type = fetch["response_headers"].get_content_type()
-        if (
-            content_type.startswith("video/")
-            # skip manifests of DASH segmented video -
-            # see https://github.com/internetarchive/brozzler/pull/70
-            and content_type != "video/vnd.mpeg.dash.mpd"
-            and fetch["method"] == "GET"
-            and fetch["response_code"] in (200, 206)
-        ):
-            video = {
-                "blame": "youtube-dl",
-                "url": fetch["url"],
-                "response_code": fetch["response_code"],
-                "content-type": content_type,
-            }
-            if "content-length" in fetch["response_headers"]:
-                video["content-length"] = int(
-                    fetch["response_headers"]["content-length"]
-                )
-            if "content-range" in fetch["response_headers"]:
-                # skip chunked youtube video
-                if "googlevideo.com/videoplayback" in fetch["url"]:
-                    continue
-                video["content-range"] = fetch["response_headers"]["content-range"]
-            logging.debug("embedded video %s", video)
-            page.videos.append(video)
     for pushed_video in pushed_videos or []:
-        if pushed_video["content-type"].startswith("video/"):
-            video = {
-                "blame": "youtube-dl",
-                "url": pushed_video["url"],
-                "response_code": pushed_video["response_code"],
-                "content-type": pushed_video["content-type"],
-                "content-length": pushed_video["content-length"],
-            }
-            logging.debug("embedded video %s", video)
-            page.videos.append(video)
+        video = {
+            "blame": "youtube-dl",
+            "url": pushed_video["url"],
+            "response_code": pushed_video["response_code"],
+            "content-type": pushed_video["content-type"],
+            "content-length": pushed_video["content-length"],
+        }
+        logging.debug("pushed video %s", video)
+        page.videos.append(video)
 
 
 def _try_youtube_dl(worker, ydl, site, page):
@@ -389,7 +336,7 @@ def _try_youtube_dl(worker, ydl, site, page):
             ie_result = ydl.sanitize_info(
                 ydl.extract_info(str(urlcanon.whatwg(ytdlp_url)))
             )
-        _remember_videos(page, ydl.fetch_spy.fetches, ydl.pushed_videos)
+        _remember_videos(page, ydl.pushed_videos)
         if worker._using_warcprox(site):
            info_json = json.dumps(ie_result, sort_keys=True, indent=4)
            logging.info(