mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-20 15:55:49 -04:00
__init__.py: rework imports
Although doublethink is an optional dependency, so that brozzler can be used as a library without it, in practice some mandatory import statements prevented brozzler from being imported when doublethink was missing. This commit fixes that by gating off some of the imports and exports. If doublethink is available, brozzler works as it does now. If it isn't, we make a few changes:
* brozzler.worker, brozzler.cli and brozzler.model reexports are disabled
* One brozzler.cli function, which is used outside brozzler's own cli, has been moved into brozzler's __init__.py. For compatibility, it's reexported from brozzler.cli.
This commit is contained in:
parent
b45e5dc096
commit
7b896be372
@ -321,44 +321,128 @@ def _remove_query(url):
|
||||
# XXX chop off path after last slash??
|
||||
site_surt_canon = urlcanon.Canonicalizer(urlcanon.semantic.steps + [_remove_query])
|
||||
|
||||
import doublethink
|
||||
import datetime
|
||||
|
||||
# The Unix epoch as an aware datetime in doublethink's UTC timezone.
# Built with the constructor rather than datetime.utcfromtimestamp(), which
# is deprecated since Python 3.12; the resulting value is the same.
EPOCH_UTC = datetime.datetime(1970, 1, 1, tzinfo=doublethink.UTC)
|
||||
def _mdfind(identifier):
    """Look up an application bundle by its bundle identifier using ``mdfind``.

    Runs ``mdfind "kMDItemCFBundleIdentifier == <identifier>"`` and returns
    the first path it prints.

    Args:
        identifier: bundle identifier to search for, e.g.
            ``"org.chromium.Chromium"``.

    Returns:
        The first non-empty line of ``mdfind``'s output, or ``None`` if the
        command failed, could not be started, or printed nothing.
    """
    import subprocess

    try:
        output = subprocess.check_output(
            ["mdfind", f"kMDItemCFBundleIdentifier == {identifier}"], text=True
        )
    # Just treat any errors as "couldn't find app". OSError covers the case
    # where the mdfind executable itself is missing or cannot be run.
    except (subprocess.CalledProcessError, OSError):
        return None

    # Several bundles may match, one path per line; use the first one rather
    # than handing back a multi-line string that is not a usable path.
    for line in output.split("\n"):
        if line:
            return line
    return None
|
||||
|
||||
|
||||
def _suggest_default_chrome_exe_mac():
    """Find a Chromium or Google Chrome executable on macOS.

    Asks ``_mdfind`` for each browser's application bundle first, which lets
    us find installs in non-default locations, then falls back to the
    default ``/Applications`` paths.

    Returns:
        Path of the first executable that exists on disk, or ``None`` if
        none was found.
    """
    import os

    # Try Chromium first, then Chrome. Each hit is checked on its own so that
    # a Chrome result can neither override a usable Chromium one nor hide it
    # when the Chrome executable turns out to be missing.
    for identifier, executable in [
        ("org.chromium.Chromium", "Chromium"),
        ("com.google.Chrome", "Google Chrome"),
    ]:
        bundle = _mdfind(identifier)
        if bundle is None:
            continue
        path = f"{bundle}/Contents/MacOS/{executable}"
        if os.path.exists(path):
            return path

    # Fall back to default paths if mdfind couldn't find it
    # (mdfind might fail to find them even in their default paths
    # if the system has Spotlight disabled.)
    for path in [
        "/Applications/Chromium.app/Contents/MacOS/Chromium",
        "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
    ]:
        if os.path.exists(path):
            return path

    return None
|
||||
|
||||
|
||||
def suggest_default_chrome_exe():
    """Suggest a default chrome/chromium executable for this machine.

    Returns:
        On macOS, the path found by ``_suggest_default_chrome_exe_mac()``
        when it finds one. Otherwise the first well-known executable name
        that ``shutil.which`` can locate, or ``"chromium-browser"`` when
        none of them is found.
    """
    import shutil
    import sys

    # First ask mdfind, which lets us find it in non-default paths
    if sys.platform == "darwin":
        mac_path = _suggest_default_chrome_exe_mac()
        if mac_path is not None:
            return mac_path

    # "chromium-browser" is the executable on ubuntu trusty
    # https://github.com/internetarchive/brozzler/pull/6/files uses "chromium"
    # google chrome executable names taken from these packages:
    # http://www.ubuntuupdates.org/ppa/google_chrome
    candidates = (
        "chromium-browser",
        "chromium",
        "google-chrome",
        "google-chrome-stable",
        "google-chrome-beta",
        "google-chrome-unstable",
    )
    return next(
        (name for name in candidates if shutil.which(name)),
        "chromium-browser",
    )
|
||||
|
||||
# we could make this configurable if there's a good reason
|
||||
MAX_PAGE_FAILURES = 3
|
||||
|
||||
from brozzler.worker import BrozzlerWorker
|
||||
from brozzler.robots import is_permitted_by_robots
|
||||
from brozzler.frontier import RethinkDbFrontier
|
||||
from brozzler.browser import Browser, BrowserPool, BrowsingException
|
||||
from brozzler.model import (
|
||||
new_job,
|
||||
new_job_file,
|
||||
new_site,
|
||||
Job,
|
||||
Page,
|
||||
Site,
|
||||
InvalidJobConf,
|
||||
)
|
||||
from brozzler.cli import suggest_default_chrome_exe
|
||||
|
||||
# Names exported by "from brozzler import *".
__all__ = [
    "Page",
    "Site",
    "BrozzlerWorker",
    "is_permitted_by_robots",
    "RethinkDbFrontier",
    "Browser",
    "BrowserPool",
    "BrowsingException",
    "new_job",
    "new_site",
    "Job",
    "new_job_file",
    "InvalidJobConf",
    "sleep",
    "thread_accept_exceptions",
    "thread_raise",
    "suggest_default_chrome_exe",
]
|
||||
|
||||
import datetime
|
||||
|
||||
# doublethink is optional: when it can be imported, the doublethink-backed
# parts of brozzler are re-exported from the package; otherwise only
# EPOCH_UTC is defined here, using the standard library's UTC timezone.
# NOTE(review): the except clause also catches an ImportError raised from
# inside brozzler.worker/frontier/model themselves, which silently drops
# these exports -- confirm that this is intended.
try:
    import doublethink

    # Synchronize epoch with doublethink if available.
    # Built with the constructor rather than datetime.utcfromtimestamp(),
    # which is deprecated since Python 3.12; the value is the same.
    EPOCH_UTC = datetime.datetime(1970, 1, 1, tzinfo=doublethink.UTC)

    # All of these imports use doublethink for real and are unsafe
    # to do if doublethink is unavailable.
    from brozzler.worker import BrozzlerWorker
    from brozzler.frontier import RethinkDbFrontier
    from brozzler.model import (
        new_job,
        new_job_file,
        new_site,
        Job,
        Page,
        Site,
        InvalidJobConf,
    )

    __all__.extend(
        [
            "Page",
            "BrozzlerWorker",
            "RethinkDbFrontier",
            "Site",
            "new_job",
            "new_site",
            "Job",
            "new_job_file",
            "InvalidJobConf",
        ]
    )
except ImportError:
    EPOCH_UTC = datetime.datetime(1970, 1, 1, tzinfo=datetime.timezone.utc)
|
||||
|
||||
# we could make this configurable if there's a good reason
|
||||
MAX_PAGE_FAILURES = 3
|
||||
|
@ -30,17 +30,17 @@ import doublethink
|
||||
import signal
|
||||
import string
|
||||
import structlog
|
||||
import subprocess
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
import traceback
|
||||
import warnings
|
||||
import yaml
|
||||
import shutil
|
||||
import base64
|
||||
import rethinkdb as rdb
|
||||
|
||||
from brozzler import suggest_default_chrome_exe
|
||||
|
||||
r = rdb.RethinkDB()
|
||||
|
||||
logger = structlog.get_logger(logger_name=__name__)
|
||||
@ -213,68 +213,6 @@ def configure_logging(args):
|
||||
)
|
||||
|
||||
|
||||
def mdfind(identifier):
    """Look up an application bundle by its bundle identifier using ``mdfind``.

    Runs ``mdfind "kMDItemCFBundleIdentifier == <identifier>"`` and returns
    the first path it prints.

    Args:
        identifier: bundle identifier to search for, e.g.
            ``"org.chromium.Chromium"``.

    Returns:
        The first non-empty line of ``mdfind``'s output, or ``None`` if the
        command failed, could not be started, or printed nothing.
    """
    try:
        output = subprocess.check_output(
            ["mdfind", f"kMDItemCFBundleIdentifier == {identifier}"], text=True
        )
    # Just treat any errors as "couldn't find app". OSError covers the case
    # where the mdfind executable itself is missing or cannot be run.
    except (subprocess.CalledProcessError, OSError):
        return None

    # Several bundles may match, one path per line; use the first one rather
    # than handing back a multi-line string that is not a usable path.
    for line in output.split("\n"):
        if line:
            return line
    return None
|
||||
|
||||
|
||||
def suggest_default_chrome_exe_mac():
    """Find a Chromium or Google Chrome executable on macOS.

    Asks ``mdfind`` for each browser's application bundle first, which lets
    us find installs in non-default locations, then falls back to the
    default ``/Applications`` paths.

    Returns:
        Path of the first executable that exists on disk, or ``None`` if
        none was found.
    """
    # Try Chromium first, then Chrome. Each hit is checked on its own so that
    # a Chrome result can neither override a usable Chromium one nor hide it
    # when the Chrome executable turns out to be missing.
    for identifier, executable in [
        ("org.chromium.Chromium", "Chromium"),
        ("com.google.Chrome", "Google Chrome"),
    ]:
        bundle = mdfind(identifier)
        if bundle is None:
            continue
        path = f"{bundle}/Contents/MacOS/{executable}"
        if os.path.exists(path):
            return path

    # Fall back to default paths if mdfind couldn't find it
    # (mdfind might fail to find them even in their default paths
    # if the system has Spotlight disabled.)
    for path in [
        "/Applications/Chromium.app/Contents/MacOS/Chromium",
        "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
    ]:
        if os.path.exists(path):
            return path

    return None
|
||||
|
||||
|
||||
def suggest_default_chrome_exe():
    """Suggest a default chrome/chromium executable for this machine.

    Returns:
        On macOS, the path found by ``suggest_default_chrome_exe_mac()``
        when it finds one. Otherwise the first well-known executable name
        that ``shutil.which`` can locate, or ``"chromium-browser"`` when
        none of them is found.
    """
    # First ask mdfind, which lets us find it in non-default paths
    if sys.platform == "darwin":
        mac_path = suggest_default_chrome_exe_mac()
        if mac_path is not None:
            return mac_path

    # "chromium-browser" is the executable on ubuntu trusty
    # https://github.com/internetarchive/brozzler/pull/6/files uses "chromium"
    # google chrome executable names taken from these packages:
    # http://www.ubuntuupdates.org/ppa/google_chrome
    candidates = (
        "chromium-browser",
        "chromium",
        "google-chrome",
        "google-chrome-stable",
        "google-chrome-beta",
        "google-chrome-unstable",
    )
    return next(
        (name for name in candidates if shutil.which(name)),
        "chromium-browser",
    )
|
||||
|
||||
|
||||
class BetterArgumentDefaultsHelpFormatter(argparse.ArgumentDefaultsHelpFormatter):
|
||||
"""
|
||||
Like argparse.ArgumentDefaultsHelpFormatter but omits the default value
|
||||
|
Loading…
x
Reference in New Issue
Block a user