diff --git a/brozzler/__init__.py b/brozzler/__init__.py
index 7df8671..068b904 100644
--- a/brozzler/__init__.py
+++ b/brozzler/__init__.py
@@ -321,44 +321,134 @@ def _remove_query(url):
 # XXX chop off path after last slash??
 site_surt_canon = urlcanon.Canonicalizer(urlcanon.semantic.steps + [_remove_query])
 
-import doublethink
+
+def _mdfind(identifier):
+    """Return the path of the app bundle with the given macOS bundle id, or None."""
+    import subprocess
+
+    try:
+        result = subprocess.check_output(
+            ["mdfind", f"kMDItemCFBundleIdentifier == {identifier}"], text=True
+        )
+    # Just treat any errors as "couldn't find app"; OSError covers mdfind
+    # itself being missing or not executable
+    except (subprocess.CalledProcessError, OSError):
+        return None
+
+    # mdfind prints one path per match; use the first non-empty one
+    for line in result.splitlines():
+        if line:
+            return line
+    return None
+
+
+def _suggest_default_chrome_exe_mac():
+    """Return the path of a Chromium or Chrome executable on macOS, or None."""
+    import os
+
+    # Try Chromium first, then Chrome; return the first hit whose executable
+    # really exists so a later lookup cannot clobber an earlier usable one
+    for identifier, executable in [
+        ("org.chromium.Chromium", "Chromium"),
+        ("com.google.Chrome", "Google Chrome"),
+    ]:
+        bundle = _mdfind(identifier)
+        if bundle is not None:
+            path = f"{bundle}/Contents/MacOS/{executable}"
+            if os.path.exists(path):
+                return path
+
+    # Fall back to default paths if mdfind couldn't find it
+    # (mdfind might fail to find them even in their default paths
+    # if the system has Spotlight disabled.)
+    for path in [
+        "/Applications/Chromium.app/Contents/MacOS/Chromium",
+        "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
+    ]:
+        if os.path.exists(path):
+            return path
+    return None
+
+
+def suggest_default_chrome_exe():
+    """Return the path or command name of the browser executable to use by default."""
+    import shutil
+    import sys
+
+    # First ask mdfind, which lets us find it in non-default paths
+    if sys.platform == "darwin":
+        path = _suggest_default_chrome_exe_mac()
+        if path is not None:
+            return path
+
+    # "chromium-browser" is the executable on ubuntu trusty
+    # https://github.com/internetarchive/brozzler/pull/6/files uses "chromium"
+    # google chrome executable names taken from these packages:
+    # http://www.ubuntuupdates.org/ppa/google_chrome
+    for exe in [
+        "chromium-browser",
+        "chromium",
+        "google-chrome",
+        "google-chrome-stable",
+        "google-chrome-beta",
+        "google-chrome-unstable",
+    ]:
+        if shutil.which(exe):
+            return exe
+    return "chromium-browser"
+
+
 import datetime
 
-EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace(tzinfo=doublethink.UTC)
+EPOCH_UTC = datetime.datetime.fromtimestamp(0.0, tz=datetime.timezone.utc)
 
-# we could make this configurable if there's a good reason
-MAX_PAGE_FAILURES = 3
 
-from brozzler.worker import BrozzlerWorker
 from brozzler.robots import is_permitted_by_robots
-from brozzler.frontier import RethinkDbFrontier
 from brozzler.browser import Browser, BrowserPool, BrowsingException
-from brozzler.model import (
-    new_job,
-    new_job_file,
-    new_site,
-    Job,
-    Page,
-    Site,
-    InvalidJobConf,
-)
-from brozzler.cli import suggest_default_chrome_exe
 
 __all__ = [
-    "Page",
-    "Site",
-    "BrozzlerWorker",
     "is_permitted_by_robots",
-    "RethinkDbFrontier",
     "Browser",
     "BrowserPool",
     "BrowsingException",
-    "new_job",
-    "new_site",
-    "Job",
-    "new_job_file",
-    "InvalidJobConf",
     "sleep",
     "thread_accept_exceptions",
     "thread_raise",
+    "suggest_default_chrome_exe",
 ]
+
+try:
+    import doublethink
+
+    # All of these imports use doublethink for real and are unsafe
+    # to do if doublethink is unavailable.
+    from brozzler.worker import BrozzlerWorker
+    from brozzler.frontier import RethinkDbFrontier
+    from brozzler.model import (
+        new_job,
+        new_job_file,
+        new_site,
+        Job,
+        Page,
+        Site,
+        InvalidJobConf,
+    )
+
+    __all__.extend(
+        [
+            "Page",
+            "BrozzlerWorker",
+            "RethinkDbFrontier",
+            "Site",
+            "new_job",
+            "new_site",
+            "Job",
+            "new_job_file",
+            "InvalidJobConf",
+        ]
+    )
+except ImportError:
+    pass
+
+# we could make this configurable if there's a good reason
+MAX_PAGE_FAILURES = 3
diff --git a/brozzler/cli.py b/brozzler/cli.py
index 751f685..27e7e81 100755
--- a/brozzler/cli.py
+++ b/brozzler/cli.py
@@ -30,17 +30,17 @@ import doublethink
 import signal
 import string
 import structlog
-import subprocess
 import sys
 import threading
 import time
 import traceback
 import warnings
 import yaml
-import shutil
 import base64
 import rethinkdb as rdb
 
+from brozzler import suggest_default_chrome_exe
+
 r = rdb.RethinkDB()
 
 logger = structlog.get_logger(logger_name=__name__)
@@ -213,68 +213,6 @@ def configure_logging(args):
     )
 
 
-def mdfind(identifier):
-    try:
-        result = subprocess.check_output(
-            ["mdfind", f"kMDItemCFBundleIdentifier == {identifier}"], text=True
-        )
-    # Just treat any errors as "couldn't find app"
-    except subprocess.CalledProcessError:
-        return None
-
-    if result:
-        return result.rstrip("\n")
-
-
-def suggest_default_chrome_exe_mac():
-    path = None
-    # Try Chromium first, then Chrome
-    result = mdfind("org.chromium.Chromium")
-    if result is not None:
-        path = f"{result}/Contents/MacOS/Chromium"
-
-    result = mdfind("com.google.Chrome")
-    if result is not None:
-        path = f"{result}/Contents/MacOS/Google Chrome"
-
-    if path is not None and os.path.exists(path):
-        return path
-
-    # Fall back to default paths if mdfind couldn't find it
-    # (mdfind might fail to find them even in their default paths
-    # if the system has Spotlight disabled.)
-    for path in [
-        "/Applications/Chromium.app/Contents/MacOS/Chromium",
-        "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
-    ]:
-        if os.path.exists(path):
-            return path
-
-
-def suggest_default_chrome_exe():
-    # First ask mdfind, which lets us find it in non-default paths
-    if sys.platform == "darwin":
-        path = suggest_default_chrome_exe_mac()
-        if path is not None:
-            return path
-
-    # "chromium-browser" is the executable on ubuntu trusty
-    # https://github.com/internetarchive/brozzler/pull/6/files uses "chromium"
-    # google chrome executable names taken from these packages:
-    # http://www.ubuntuupdates.org/ppa/google_chrome
-    for exe in [
-        "chromium-browser",
-        "chromium",
-        "google-chrome",
-        "google-chrome-stable",
-        "google-chrome-beta",
-        "google-chrome-unstable",
-    ]:
-        if shutil.which(exe):
-            return exe
-    return "chromium-browser"
-
-
 class BetterArgumentDefaultsHelpFormatter(argparse.ArgumentDefaultsHelpFormatter):
     """
     Like argparse.ArgumentDefaultsHelpFormatter but omits the default value
@@ -455,7 +393,7 @@ def brozzle_page(argv=None):
             site,
             page,
             on_screenshot=on_screenshot,
-            enable_youtube_dl=not args.skip_youtube_dl,
+            enable_youtube_dl=not worker._skip_youtube_dl,
         )
         logger.info("outlinks", outlinks=sorted(outlinks))
     except brozzler.ReachedLimit as e:
diff --git a/brozzler/worker.py b/brozzler/worker.py
index b8befef..868ded4 100644
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@@ -39,7 +39,6 @@ import urlcanon
 from requests.structures import CaseInsensitiveDict
 import rethinkdb as rdb
 from . import metrics
-from . import ydl
 
 r = rdb.RethinkDB()
 
@@ -96,6 +95,16 @@ class BrozzlerWorker:
         self._skip_extract_outlinks = skip_extract_outlinks
         self._skip_visit_hashtags = skip_visit_hashtags
         self._skip_youtube_dl = skip_youtube_dl
+
+        # We definitely shouldn't ytdlp if the optional extra is missing
+        try:
+            import yt_dlp  # noqa: F401 -- imported only to probe availability
+        except ImportError:
+            self.logger.info(
+                "optional yt-dlp extra not installed; setting skip_youtube_dl to True"
+            )
+            self._skip_youtube_dl = True
+
         self._ytdlp_tmpdir = ytdlp_tmpdir
         self._simpler404 = simpler404
         self._screenshot_full_page = screenshot_full_page
@@ -260,6 +269,38 @@ class BrozzlerWorker:
         img.save(out, "jpeg", quality=95)
         return out.getbuffer()
 
+    def should_ytdlp(self, logger, site, page, page_status, skip_av_seeds):
+        # called only after we've passed needs_browsing() check
+
+        if page_status != 200:
+            logger.info("skipping ytdlp: non-200 page status", page_status=page_status)
+            return False
+        if site.skip_ytdlp:
+            logger.info("skipping ytdlp: site marked skip_ytdlp")
+            return False
+
+        ytdlp_url = page.redirect_url if page.redirect_url else page.url
+
+        if "chrome-error:" in ytdlp_url:
+            return False
+
+        ytdlp_seed = (
+            site["metadata"]["ait_seed_id"]
+            if "metadata" in site and "ait_seed_id" in site["metadata"]
+            else None
+        )
+
+        # TODO: develop UI and refactor
+        if ytdlp_seed:
+            if site.skip_ytdlp is None and ytdlp_seed in skip_av_seeds:
+                logger.info("skipping ytdlp: site in skip_av_seeds")
+                site.skip_ytdlp = True
+                return False
+            else:
+                site.skip_ytdlp = False
+
+        return True
+
     @metrics.brozzler_page_processing_duration_seconds.time()
     @metrics.brozzler_in_progress_pages.track_inprogress()
     def brozzle_page(
@@ -293,10 +334,12 @@ class BrozzlerWorker:
             except brozzler.PageInterstitialShown:
                 page_logger.info("page interstitial shown (http auth)")
 
-            if enable_youtube_dl and ydl.should_ytdlp(
-                site, page, status_code, self._skip_av_seeds
+            if enable_youtube_dl and self.should_ytdlp(
+                page_logger, site, page, status_code, self._skip_av_seeds
             ):
                 try:
+                    from . import ydl
+
                     ydl_outlinks = ydl.do_youtube_dl(
                         self, site, page, self._ytdlp_proxy_endpoints
                     )
diff --git a/brozzler/ydl.py b/brozzler/ydl.py
index 7fab1f7..ae756a0 100644
--- a/brozzler/ydl.py
+++ b/brozzler/ydl.py
@@ -43,39 +43,6 @@ YTDLP_MAX_REDIRECTS = 5
 logger = structlog.get_logger(logger_name=__name__)
 
 
-def should_ytdlp(site, page, page_status, skip_av_seeds):
-    # called only after we've passed needs_browsing() check
-
-    if page_status != 200:
-        logger.info("skipping ytdlp: non-200 page status", page_status=page_status)
-        return False
-    if site.skip_ytdlp:
-        logger.info("skipping ytdlp: site marked skip_ytdlp")
-        return False
-
-    ytdlp_url = page.redirect_url if page.redirect_url else page.url
-
-    if "chrome-error:" in ytdlp_url:
-        return False
-
-    ytdlp_seed = (
-        site["metadata"]["ait_seed_id"]
-        if "metadata" in site and "ait_seed_id" in site["metadata"]
-        else None
-    )
-
-    # TODO: develop UI and refactor
-    if ytdlp_seed:
-        if site.skip_ytdlp is None and ytdlp_seed in skip_av_seeds:
-            logger.info("skipping ytdlp: site in skip_av_seeds")
-            site.skip_ytdlp = True
-            return False
-        else:
-            site.skip_ytdlp = False
-
-    return True
-
-
 def isyoutubehost(url):
     # split 1 splits scheme from url, split 2 splits path from hostname, split 3 splits query string on hostname
     return "youtube.com" in url.split("//")[-1].split("/")[0].split("?")[0]