__init__.py: rework imports

Although doublethink is an optional dependency to allow brozzler to be
used as a library without it, in practice we had some mandatory import
statements that prevented brozzler from being imported without it.
This fixes that by gating off some of the imports and exports.

If doublethink is available, brozzler works as it does now. If it
isn't, we make a few changes:

* brozzler.worker, brozzler.frontier, brozzler.cli and brozzler.model
  reexports are disabled
* One brozzler.cli function, which is used outside brozzler's own cli,
  has been moved into brozzler's __init__.py. For compatibility, it's
  reexported from brozzler.cli.
This commit is contained in:
Misty De Méo 2025-03-04 13:18:42 -08:00
parent b45e5dc096
commit 7b896be372
2 changed files with 112 additions and 90 deletions

View File

@ -321,44 +321,128 @@ def _remove_query(url):
# XXX chop off path after last slash??
site_surt_canon = urlcanon.Canonicalizer(urlcanon.semantic.steps + [_remove_query])
import doublethink
import datetime
EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace(tzinfo=doublethink.UTC)
def _mdfind(identifier):
import subprocess
try:
result = subprocess.check_output(
["mdfind", f"kMDItemCFBundleIdentifier == {identifier}"], text=True
)
# Just treat any errors as "couldn't find app"
except subprocess.CalledProcessError:
return None
if result:
return result.rstrip("\n")
def _suggest_default_chrome_exe_mac():
    """
    Find a Chromium or Google Chrome executable on macOS.

    Asks `_mdfind` for each browser's app bundle, which lets us find
    installs in non-default locations, then falls back to the default
    /Applications paths.

    Returns the path of the first executable that exists on disk, or None
    if none was found.
    """
    import os

    # Try Chromium first, then Chrome. Each candidate is checked on its own
    # so that a Chrome hit cannot overwrite a valid Chromium hit, and a
    # stale hit for one browser does not hide a good hit for the other.
    for bundle_id, exe_relpath in (
        ("org.chromium.Chromium", "Contents/MacOS/Chromium"),
        ("com.google.Chrome", "Contents/MacOS/Google Chrome"),
    ):
        app_path = _mdfind(bundle_id)
        if app_path is None:
            continue
        path = f"{app_path}/{exe_relpath}"
        if os.path.exists(path):
            return path

    # Fall back to default paths if mdfind couldn't find it
    # (mdfind might fail to find them even in their default paths
    # if the system has Spotlight disabled.)
    for path in [
        "/Applications/Chromium.app/Contents/MacOS/Chromium",
        "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
    ]:
        if os.path.exists(path):
            return path
    return None
def suggest_default_chrome_exe():
    """
    Suggest a browser executable to use by default.

    On macOS the platform-specific lookup is consulted first. Otherwise
    (or if that lookup finds nothing) the first well-known executable name
    found by `shutil.which` is returned, with "chromium-browser" as the
    last resort.
    """
    import shutil
    import sys

    # First ask mdfind, which lets us find it in non-default paths
    if sys.platform == "darwin":
        mac_exe = _suggest_default_chrome_exe_mac()
        if mac_exe is not None:
            return mac_exe

    # "chromium-browser" is the executable on ubuntu trusty
    # https://github.com/internetarchive/brozzler/pull/6/files uses "chromium"
    # google chrome executable names taken from these packages:
    # http://www.ubuntuupdates.org/ppa/google_chrome
    candidates = (
        "chromium-browser",
        "chromium",
        "google-chrome",
        "google-chrome-stable",
        "google-chrome-beta",
        "google-chrome-unstable",
    )
    on_path = (name for name in candidates if shutil.which(name))
    return next(on_path, "chromium-browser")
# we could make this configurable if there's a good reason
MAX_PAGE_FAILURES = 3
from brozzler.worker import BrozzlerWorker
from brozzler.robots import is_permitted_by_robots
from brozzler.frontier import RethinkDbFrontier
from brozzler.browser import Browser, BrowserPool, BrowsingException
from brozzler.model import (
new_job,
new_job_file,
new_site,
Job,
Page,
Site,
InvalidJobConf,
)
from brozzler.cli import suggest_default_chrome_exe
__all__ = [
"Page",
"Site",
"BrozzlerWorker",
"is_permitted_by_robots",
"RethinkDbFrontier",
"Browser",
"BrowserPool",
"BrowsingException",
"new_job",
"new_site",
"Job",
"new_job_file",
"InvalidJobConf",
"sleep",
"thread_accept_exceptions",
"thread_raise",
"suggest_default_chrome_exe",
]
import datetime

try:
    import doublethink

    # Synchronize epoch with doublethink if available.
    # Constructed directly instead of via datetime.utcfromtimestamp(),
    # which is deprecated since Python 3.12; the resulting value is the
    # same: 1970-01-01T00:00:00 carrying doublethink's UTC tzinfo.
    EPOCH_UTC = datetime.datetime(1970, 1, 1, tzinfo=doublethink.UTC)

    # All of these imports use doublethink for real and are unsafe
    # to do if doublethink is unavailable.
    from brozzler.worker import BrozzlerWorker
    from brozzler.frontier import RethinkDbFrontier
    from brozzler.model import (
        new_job,
        new_job_file,
        new_site,
        Job,
        Page,
        Site,
        InvalidJobConf,
    )

    __all__.extend(
        [
            "Page",
            "BrozzlerWorker",
            "RethinkDbFrontier",
            "Site",
            "new_job",
            "new_site",
            "Job",
            "new_job_file",
            "InvalidJobConf",
        ]
    )
except ImportError:
    # NOTE(review): this handler also runs when doublethink imports fine
    # but one of the brozzler.* imports above raises ImportError; confirm
    # that silently dropping those reexports is intended in that case.
    EPOCH_UTC = datetime.datetime(1970, 1, 1, tzinfo=datetime.timezone.utc)
# we could make this configurable if there's a good reason
MAX_PAGE_FAILURES = 3

View File

@ -30,17 +30,17 @@ import doublethink
import signal
import string
import structlog
import subprocess
import sys
import threading
import time
import traceback
import warnings
import yaml
import shutil
import base64
import rethinkdb as rdb
from brozzler import suggest_default_chrome_exe
r = rdb.RethinkDB()
logger = structlog.get_logger(logger_name=__name__)
@ -213,68 +213,6 @@ def configure_logging(args):
)
def mdfind(identifier):
    """
    Look up an application bundle by its bundle identifier using `mdfind`.

    Returns the first path `mdfind` prints for the
    `kMDItemCFBundleIdentifier == <identifier>` query, or None if it prints
    nothing, exits with a non-zero status, or cannot be launched.
    """
    try:
        output = subprocess.check_output(
            ["mdfind", f"kMDItemCFBundleIdentifier == {identifier}"], text=True
        )
    # Just treat any errors as "couldn't find app", including the mdfind
    # executable being absent (OSError).
    except (subprocess.CalledProcessError, OSError):
        return None

    # One match per line: return only the first so the caller always gets a
    # single usable path even when several copies of the app are installed.
    matches = [line for line in output.split("\n") if line]
    return matches[0] if matches else None
def suggest_default_chrome_exe_mac():
    """
    Find a Chromium or Google Chrome executable on macOS.

    Queries `mdfind` for each browser's app bundle, then falls back to the
    default /Applications paths. Returns the first executable path that
    exists on disk, or None if none was found.
    """
    # Try Chromium first, then Chrome. Checking each candidate separately
    # keeps a Chrome result from overwriting a usable Chromium result.
    for bundle_id, exe_relpath in (
        ("org.chromium.Chromium", "Contents/MacOS/Chromium"),
        ("com.google.Chrome", "Contents/MacOS/Google Chrome"),
    ):
        app_path = mdfind(bundle_id)
        if app_path is None:
            continue
        path = f"{app_path}/{exe_relpath}"
        if os.path.exists(path):
            return path

    # Fall back to default paths if mdfind couldn't find it
    # (mdfind might fail to find them even in their default paths
    # if the system has Spotlight disabled.)
    for path in [
        "/Applications/Chromium.app/Contents/MacOS/Chromium",
        "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
    ]:
        if os.path.exists(path):
            return path
    return None
def suggest_default_chrome_exe():
    """
    Suggest a browser executable to use by default.

    On macOS the mdfind-based lookup is tried first. Otherwise (or if it
    finds nothing) the first well-known executable name found by
    `shutil.which` is returned, falling back to "chromium-browser".
    """
    # First ask mdfind, which lets us find it in non-default paths
    if sys.platform == "darwin":
        mac_exe = suggest_default_chrome_exe_mac()
        if mac_exe is not None:
            return mac_exe

    # "chromium-browser" is the executable on ubuntu trusty
    # https://github.com/internetarchive/brozzler/pull/6/files uses "chromium"
    # google chrome executable names taken from these packages:
    # http://www.ubuntuupdates.org/ppa/google_chrome
    known_names = (
        "chromium-browser",
        "chromium",
        "google-chrome",
        "google-chrome-stable",
        "google-chrome-beta",
        "google-chrome-unstable",
    )
    available = (name for name in known_names if shutil.which(name))
    return next(available, "chromium-browser")
class BetterArgumentDefaultsHelpFormatter(argparse.ArgumentDefaultsHelpFormatter):
"""
Like argparse.ArgumentDefaultsHelpFormatter but omits the default value