__init__.py: rework imports (#334)
Some checks are pending
Python Formatting Check / formatting (push) Waiting to run
Tests / Run tests (3.12) (push) Waiting to run
Tests / Run tests (3.8) (push) Waiting to run

* __init__.py: rework imports

Although doublethink is an optional dependency to allow brozzler to be
used as a library without it, in practice we had some mandatory import
statements that prevented brozzler from being imported without it.
This fixes that by gating off some of the imports and exports.

If doublethink is available, brozzler works as it does now. But if it
isn't, we make a few changes:

* brozzler.worker, brozzler.cli and brozzler.model reexports are
  disabled
* One brozzler.cli function, which is used outside brozzler's own cli,
  has been moved into brozzler's __init__.py. For compatibility, it's
  reexported from brozzler.cli.

* Make tz-aware datetime of the epoch with stdlib

* Only import yt-dlp if we're using it

* ydl: never try yt-dlp if the optional extra is missing

* cli: use worker's yt-dlp check

---------

Co-authored-by: Alex Dempsey <avdempsey@archive.org>
This commit is contained in:
Misty De Méo 2025-03-06 14:49:22 -08:00 committed by GitHub
parent 0f707dc02b
commit 21102ca95c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 156 additions and 126 deletions

View File

@ -321,44 +321,126 @@ def _remove_query(url):
# Canonicalizer made of urlcanon's semantic steps with the _remove_query
# step appended as the last step.
# XXX chop off path after last slash??
site_surt_canon = urlcanon.Canonicalizer(urlcanon.semantic.steps + [_remove_query])
import doublethink
def _mdfind(identifier):
    """
    Look up a macOS application bundle by its bundle identifier.

    Runs ``mdfind`` with a ``kMDItemCFBundleIdentifier`` query and returns the
    first path it prints.

    Args:
        identifier: bundle identifier to search for, e.g.
            ``"org.chromium.Chromium"``.

    Returns:
        The first non-empty line of ``mdfind``'s output, or ``None`` when
        nothing was found or ``mdfind`` could not be run.
    """
    import subprocess

    try:
        result = subprocess.check_output(
            ["mdfind", f"kMDItemCFBundleIdentifier == {identifier}"], text=True
        )
    # Just treat any errors as "couldn't find app". OSError covers a missing
    # or non-executable mdfind binary, which CalledProcessError does not.
    except (subprocess.CalledProcessError, OSError):
        return None

    # mdfind prints one path per line; several installed copies of the same
    # app produce several lines, and only a single line is usable as a path.
    for line in result.split("\n"):
        if line:
            return line
    return None
def _suggest_default_chrome_exe_mac():
    """
    Find a Chromium or Chrome executable on macOS.

    Returns:
        The path of the first executable that exists on disk, preferring
        Chromium over Chrome, or ``None`` if none was found.
    """
    import os

    # Try Chromium first, then Chrome. Each candidate is checked on its own,
    # so a Chrome result never replaces a usable Chromium result, and a
    # Chrome path that doesn't exist doesn't hide a Chromium path that does.
    for identifier, executable in [
        ("org.chromium.Chromium", "Chromium"),
        ("com.google.Chrome", "Google Chrome"),
    ]:
        result = _mdfind(identifier)
        if result is None:
            continue
        path = f"{result}/Contents/MacOS/{executable}"
        if os.path.exists(path):
            return path

    # Fall back to default paths if mdfind couldn't find it
    # (mdfind might fail to find them even in their default paths
    # if the system has Spotlight disabled.)
    for path in [
        "/Applications/Chromium.app/Contents/MacOS/Chromium",
        "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
    ]:
        if os.path.exists(path):
            return path

    return None
def suggest_default_chrome_exe():
    """
    Suggest a Chrome/Chromium executable to use by default.

    Returns:
        On macOS, the path found by _suggest_default_chrome_exe_mac() when
        there is one. Otherwise the first well-known executable name that
        shutil.which() resolves, or "chromium-browser" if none resolves.
    """
    import shutil
    import sys

    # First ask mdfind, which lets us find it in non-default paths
    if sys.platform == "darwin":
        mac_exe = _suggest_default_chrome_exe_mac()
        if mac_exe is not None:
            return mac_exe

    # "chromium-browser" is the executable on ubuntu trusty
    # https://github.com/internetarchive/brozzler/pull/6/files uses "chromium"
    # google chrome executable names taken from these packages:
    # http://www.ubuntuupdates.org/ppa/google_chrome
    candidates = (
        "chromium-browser",
        "chromium",
        "google-chrome",
        "google-chrome-stable",
        "google-chrome-beta",
        "google-chrome-unstable",
    )
    found = next((name for name in candidates if shutil.which(name)), None)
    return "chromium-browser" if found is None else found
import datetime

# Timezone-aware datetime of the Unix epoch, built with the standard library
# only so that it does not depend on the optional doublethink package.
EPOCH_UTC = datetime.datetime.fromtimestamp(0.0, tz=datetime.timezone.utc)

# we could make this configurable if there's a good reason
MAX_PAGE_FAILURES = 3

# These imports are done unconditionally; the ones that need doublethink are
# gated in the try block below.
from brozzler.robots import is_permitted_by_robots
from brozzler.browser import Browser, BrowserPool, BrowsingException

# Names exported whether or not doublethink is installed.
__all__ = [
    "is_permitted_by_robots",
    "Browser",
    "BrowserPool",
    "BrowsingException",
    "sleep",
    "thread_accept_exceptions",
    "thread_raise",
    "suggest_default_chrome_exe",
]

try:
    import doublethink

    # All of these imports use doublethink for real and are unsafe
    # to do if doublethink is unavailable.
    from brozzler.worker import BrozzlerWorker
    from brozzler.frontier import RethinkDbFrontier
    from brozzler.model import (
        new_job,
        new_job_file,
        new_site,
        Job,
        Page,
        Site,
        InvalidJobConf,
    )
except ImportError:
    # doublethink is optional; without it these names are simply not
    # re-exported from the package.
    pass
else:
    # Only advertise the gated names once every gated import has succeeded.
    __all__.extend(
        [
            "Page",
            "BrozzlerWorker",
            "RethinkDbFrontier",
            "Site",
            "new_job",
            "new_site",
            "Job",
            "new_job_file",
            "InvalidJobConf",
        ]
    )

View File

@ -30,17 +30,17 @@ import doublethink
import signal
import string
import structlog
import subprocess
import sys
import threading
import time
import traceback
import warnings
import yaml
import shutil
import base64
import rethinkdb as rdb
from brozzler import suggest_default_chrome_exe
# RethinkDB driver object, created once when this module is imported.
r = rdb.RethinkDB()
# Module-level structlog logger, tagged with this module's name.
logger = structlog.get_logger(logger_name=__name__)
@ -213,68 +213,6 @@ def configure_logging(args):
)
def mdfind(identifier):
    """
    Look up a macOS application bundle by its bundle identifier.

    Runs ``mdfind`` with a ``kMDItemCFBundleIdentifier`` query and returns the
    first path it prints.

    Args:
        identifier: bundle identifier to search for, e.g.
            ``"org.chromium.Chromium"``.

    Returns:
        The first non-empty line of ``mdfind``'s output, or ``None`` when
        nothing was found or ``mdfind`` could not be run.
    """
    try:
        result = subprocess.check_output(
            ["mdfind", f"kMDItemCFBundleIdentifier == {identifier}"], text=True
        )
    # Just treat any errors as "couldn't find app". OSError covers a missing
    # or non-executable mdfind binary, which CalledProcessError does not.
    except (subprocess.CalledProcessError, OSError):
        return None

    # mdfind prints one path per line; several installed copies of the same
    # app produce several lines, and only a single line is usable as a path.
    for line in result.split("\n"):
        if line:
            return line
    return None
def suggest_default_chrome_exe_mac():
    """
    Find a Chromium or Chrome executable on macOS.

    Returns:
        The path of the first executable that exists on disk, preferring
        Chromium over Chrome, or ``None`` if none was found.
    """
    # Try Chromium first, then Chrome. Each candidate is checked on its own,
    # so a Chrome result never replaces a usable Chromium result, and a
    # Chrome path that doesn't exist doesn't hide a Chromium path that does.
    for identifier, executable in [
        ("org.chromium.Chromium", "Chromium"),
        ("com.google.Chrome", "Google Chrome"),
    ]:
        result = mdfind(identifier)
        if result is None:
            continue
        path = f"{result}/Contents/MacOS/{executable}"
        if os.path.exists(path):
            return path

    # Fall back to default paths if mdfind couldn't find it
    # (mdfind might fail to find them even in their default paths
    # if the system has Spotlight disabled.)
    for path in [
        "/Applications/Chromium.app/Contents/MacOS/Chromium",
        "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
    ]:
        if os.path.exists(path):
            return path

    return None
def suggest_default_chrome_exe():
    """
    Suggest a Chrome/Chromium executable to use by default.

    Returns:
        On macOS, the path found by suggest_default_chrome_exe_mac() when
        there is one. Otherwise the first well-known executable name that
        shutil.which() resolves, or "chromium-browser" if none resolves.
    """
    # First ask mdfind, which lets us find it in non-default paths
    if sys.platform == "darwin":
        mac_exe = suggest_default_chrome_exe_mac()
        if mac_exe is not None:
            return mac_exe

    # "chromium-browser" is the executable on ubuntu trusty
    # https://github.com/internetarchive/brozzler/pull/6/files uses "chromium"
    # google chrome executable names taken from these packages:
    # http://www.ubuntuupdates.org/ppa/google_chrome
    well_known = (
        "chromium-browser",
        "chromium",
        "google-chrome",
        "google-chrome-stable",
        "google-chrome-beta",
        "google-chrome-unstable",
    )
    on_path = [name for name in well_known if shutil.which(name)]
    if on_path:
        return on_path[0]
    return "chromium-browser"
class BetterArgumentDefaultsHelpFormatter(argparse.ArgumentDefaultsHelpFormatter):
"""
Like argparse.ArgumentDefaultsHelpFormatter but omits the default value
@ -455,7 +393,7 @@ def brozzle_page(argv=None):
site,
page,
on_screenshot=on_screenshot,
enable_youtube_dl=not args.skip_youtube_dl,
enable_youtube_dl=not worker._skip_youtube_dl,
)
logger.info("outlinks", outlinks=sorted(outlinks))
except brozzler.ReachedLimit as e:

View File

@ -39,7 +39,6 @@ import urlcanon
from requests.structures import CaseInsensitiveDict
import rethinkdb as rdb
from . import metrics
from . import ydl
# RethinkDB driver object, created once when this module is imported.
r = rdb.RethinkDB()
@ -96,6 +95,16 @@ class BrozzlerWorker:
self._skip_extract_outlinks = skip_extract_outlinks
self._skip_visit_hashtags = skip_visit_hashtags
self._skip_youtube_dl = skip_youtube_dl
# We definitely shouldn't ytdlp if the optional extra is missing
try:
import yt_dlp
except ImportError:
self.logger.info(
"optional yt-dlp extra not installed; setting skip_youtube_dl to True"
)
self._skip_youtube_dl = True
self._ytdlp_tmpdir = ytdlp_tmpdir
self._simpler404 = simpler404
self._screenshot_full_page = screenshot_full_page
@ -260,6 +269,38 @@ class BrozzlerWorker:
img.save(out, "jpeg", quality=95)
return out.getbuffer()
def should_ytdlp(self, logger, site, page, page_status, skip_av_seeds):
    """
    Decide whether yt-dlp should be run for a page.

    Called only after we've passed the needs_browsing() check.

    Args:
        logger: logger used to report why yt-dlp is being skipped.
        site: the site being crawled; its ``skip_ytdlp`` attribute is read
            and, when the site has an ``ait_seed_id``, updated.
        page: the page just browsed; ``redirect_url`` is preferred over
            ``url`` when it is set.
        page_status: HTTP status of the page.
        skip_av_seeds: collection of seed ids for which yt-dlp is skipped.

    Returns:
        True if yt-dlp should run for this page, False otherwise.
    """
    if page_status != 200:
        logger.info("skipping ytdlp: non-200 page status", page_status=page_status)
        return False

    if site.skip_ytdlp:
        logger.info("skipping ytdlp: site marked skip_ytdlp")
        return False

    target_url = page.redirect_url or page.url
    if "chrome-error:" in target_url:
        return False

    seed_id = None
    if "metadata" in site and "ait_seed_id" in site["metadata"]:
        seed_id = site["metadata"]["ait_seed_id"]

    # TODO: develop UI and refactor
    if not seed_id:
        return True

    if site.skip_ytdlp is None and seed_id in skip_av_seeds:
        logger.info("skipping ytdlp: site in skip_av_seeds")
        site.skip_ytdlp = True
        return False

    # remember the decision on the site so the seed list isn't consulted again
    site.skip_ytdlp = False
    return True
@metrics.brozzler_page_processing_duration_seconds.time()
@metrics.brozzler_in_progress_pages.track_inprogress()
def brozzle_page(
@ -293,10 +334,12 @@ class BrozzlerWorker:
except brozzler.PageInterstitialShown:
page_logger.info("page interstitial shown (http auth)")
if enable_youtube_dl and ydl.should_ytdlp(
site, page, status_code, self._skip_av_seeds
if enable_youtube_dl and self.should_ytdlp(
page_logger, site, page, status_code, self._skip_av_seeds
):
try:
from . import ydl
ydl_outlinks = ydl.do_youtube_dl(
self, site, page, self._ytdlp_proxy_endpoints
)

View File

@ -43,39 +43,6 @@ YTDLP_MAX_REDIRECTS = 5
# Module-level structlog logger, tagged with this module's name.
logger = structlog.get_logger(logger_name=__name__)
def should_ytdlp(site, page, page_status, skip_av_seeds):
    """
    Decide whether yt-dlp should be run for a page.

    Called only after we've passed the needs_browsing() check.

    Args:
        site: the site being crawled; its ``skip_ytdlp`` attribute is read
            and, when the site has an ``ait_seed_id``, updated.
        page: the page just browsed; ``redirect_url`` is preferred over
            ``url`` when it is set.
        page_status: HTTP status of the page.
        skip_av_seeds: collection of seed ids for which yt-dlp is skipped.

    Returns:
        True if yt-dlp should run for this page, False otherwise.
    """
    if page_status != 200:
        logger.info("skipping ytdlp: non-200 page status", page_status=page_status)
        return False

    if site.skip_ytdlp:
        logger.info("skipping ytdlp: site marked skip_ytdlp")
        return False

    candidate_url = page.redirect_url or page.url
    if "chrome-error:" in candidate_url:
        return False

    has_seed = "metadata" in site and "ait_seed_id" in site["metadata"]
    seed = site["metadata"]["ait_seed_id"] if has_seed else None

    # TODO: develop UI and refactor
    if seed:
        skip_listed = site.skip_ytdlp is None and seed in skip_av_seeds
        if not skip_listed:
            site.skip_ytdlp = False
        else:
            logger.info("skipping ytdlp: site in skip_av_seeds")
            site.skip_ytdlp = True
            return False

    return True
def isyoutubehost(url):
    """Return True if "youtube.com" occurs in the host portion of *url*."""
    # keep what follows the last "//" separator (drops the scheme), then cut
    # the path off at the first "/" and the query string at the first "?"
    after_scheme = url.split("//")[-1]
    host, _, _ = after_scheme.partition("/")
    host = host.partition("?")[0]
    return "youtube.com" in host