mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-18 23:06:01 -04:00
__init__.py: rework imports (#334)
* __init__.py: rework imports

  Although doublethink is an optional dependency, so that brozzler can be used as a library without it, in practice we had some mandatory import statements that prevented brozzler from being imported without it. This fixes that by gating off some of the imports and exports. If doublethink is available, brozzler works as it does now. But if it isn't, we make a few changes:

  * The brozzler.worker, brozzler.cli and brozzler.model reexports are disabled.
  * One brozzler.cli function, which is used outside brozzler's own cli, has been moved into brozzler's __init__.py. For compatibility, it's reexported from brozzler.cli.

* Make tz-aware datetime of the epoch with stdlib
* Only import yt-dlp if we're using it
* ydl: never try if extra missing
* cli: use worker's yt-dlp check

---------

Co-authored-by: Alex Dempsey <avdempsey@archive.org>
This commit is contained in:
parent
0f707dc02b
commit
21102ca95c
@ -321,44 +321,126 @@ def _remove_query(url):
|
||||
# XXX chop off path after last slash??
|
||||
site_surt_canon = urlcanon.Canonicalizer(urlcanon.semantic.steps + [_remove_query])
|
||||
|
||||
import doublethink
|
||||
|
||||
def _mdfind(identifier):
    """
    Look up a macOS application bundle by its bundle identifier.

    Runs ``mdfind`` with a ``kMDItemCFBundleIdentifier == <identifier>``
    query and reads what it prints.

    Returns the first non-empty line of output (a single path), or None if
    mdfind printed nothing, exited with a nonzero status, or could not be run
    at all.
    """
    import subprocess

    try:
        output = subprocess.check_output(
            ["mdfind", f"kMDItemCFBundleIdentifier == {identifier}"], text=True
        )
    # Just treat any errors as "couldn't find app". OSError covers the mdfind
    # executable itself being missing or not runnable.
    except (subprocess.CalledProcessError, OSError):
        return None

    # mdfind prints one match per line and may report several copies of the
    # same app. Callers build a single filesystem path out of the result, so
    # hand back only the first match rather than a newline-joined string.
    for line in output.split("\n"):
        if line:
            return line
    return None
|
||||
|
||||
|
||||
def _suggest_default_chrome_exe_mac():
    """
    Find a Chromium or Google Chrome executable on macOS.

    Candidates are tried in order of preference, Chromium first, then Chrome:
    first the app bundles reported by _mdfind(), then the default install
    paths under /Applications.

    Returns the path of the first candidate executable that exists, or None
    if none of them exist.
    """
    import os

    # (bundle identifier, executable name inside <app>/Contents/MacOS)
    browsers = [
        ("org.chromium.Chromium", "Chromium"),
        ("com.google.Chrome", "Google Chrome"),
    ]

    # Check each mdfind hit as soon as we have it, so that a Chrome result
    # can't replace a usable Chromium one, and a stale hit for one browser
    # doesn't stop us from trying the other.
    for bundle_id, exe_name in browsers:
        app = _mdfind(bundle_id)
        if app is None:
            continue
        path = f"{app}/Contents/MacOS/{exe_name}"
        if os.path.exists(path):
            return path

    # Fall back to default paths if mdfind couldn't find it
    # (mdfind might fail to find them even in their default paths
    # if the system has Spotlight disabled.)
    for path in [
        "/Applications/Chromium.app/Contents/MacOS/Chromium",
        "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
    ]:
        if os.path.exists(path):
            return path

    return None
|
||||
|
||||
|
||||
def suggest_default_chrome_exe():
    """
    Suggest a chrome/chromium executable to use by default.

    On macOS, returns the path from _suggest_default_chrome_exe_mac() if it
    finds one. Otherwise returns the first of a list of well-known executable
    names that shutil.which() can find, or "chromium-browser" if it finds
    none of them.
    """
    import shutil
    import sys

    # First ask mdfind, which lets us find it in non-default paths
    if sys.platform == "darwin":
        mac_exe = _suggest_default_chrome_exe_mac()
        if mac_exe is not None:
            return mac_exe

    # "chromium-browser" is the executable on ubuntu trusty
    # https://github.com/internetarchive/brozzler/pull/6/files uses "chromium"
    # google chrome executable names taken from these packages:
    # http://www.ubuntuupdates.org/ppa/google_chrome
    well_known_names = (
        "chromium-browser",
        "chromium",
        "google-chrome",
        "google-chrome-stable",
        "google-chrome-beta",
        "google-chrome-unstable",
    )
    installed = (name for name in well_known_names if shutil.which(name))
    return next(installed, "chromium-browser")
|
||||
|
||||
|
||||
import datetime
|
||||
|
||||
EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace(tzinfo=doublethink.UTC)
|
||||
EPOCH_UTC = datetime.datetime.fromtimestamp(0.0, tz=datetime.timezone.utc)
|
||||
|
||||
# we could make this configurable if there's a good reason
|
||||
MAX_PAGE_FAILURES = 3
|
||||
|
||||
from brozzler.worker import BrozzlerWorker
|
||||
from brozzler.robots import is_permitted_by_robots
|
||||
from brozzler.frontier import RethinkDbFrontier
|
||||
from brozzler.browser import Browser, BrowserPool, BrowsingException
|
||||
from brozzler.model import (
|
||||
new_job,
|
||||
new_job_file,
|
||||
new_site,
|
||||
Job,
|
||||
Page,
|
||||
Site,
|
||||
InvalidJobConf,
|
||||
)
|
||||
from brozzler.cli import suggest_default_chrome_exe
|
||||
|
||||
__all__ = [
|
||||
"Page",
|
||||
"Site",
|
||||
"BrozzlerWorker",
|
||||
"is_permitted_by_robots",
|
||||
"RethinkDbFrontier",
|
||||
"Browser",
|
||||
"BrowserPool",
|
||||
"BrowsingException",
|
||||
"new_job",
|
||||
"new_site",
|
||||
"Job",
|
||||
"new_job_file",
|
||||
"InvalidJobConf",
|
||||
"sleep",
|
||||
"thread_accept_exceptions",
|
||||
"thread_raise",
|
||||
"suggest_default_chrome_exe",
|
||||
]
|
||||
|
||||
try:
|
||||
import doublethink
|
||||
|
||||
# All of these imports use doublethink for real and are unsafe
|
||||
# to do if doublethink is unavailable.
|
||||
from brozzler.worker import BrozzlerWorker
|
||||
from brozzler.frontier import RethinkDbFrontier
|
||||
from brozzler.model import (
|
||||
new_job,
|
||||
new_job_file,
|
||||
new_site,
|
||||
Job,
|
||||
Page,
|
||||
Site,
|
||||
InvalidJobConf,
|
||||
)
|
||||
|
||||
__all__.extend(
|
||||
[
|
||||
"Page",
|
||||
"BrozzlerWorker",
|
||||
"RethinkDbFrontier",
|
||||
"Site",
|
||||
"new_job",
|
||||
"new_site",
|
||||
"Job",
|
||||
"new_job_file",
|
||||
"InvalidJobConf",
|
||||
]
|
||||
)
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
# we could make this configurable if there's a good reason
|
||||
MAX_PAGE_FAILURES = 3
|
||||
|
@ -30,17 +30,17 @@ import doublethink
|
||||
import signal
|
||||
import string
|
||||
import structlog
|
||||
import subprocess
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
import traceback
|
||||
import warnings
|
||||
import yaml
|
||||
import shutil
|
||||
import base64
|
||||
import rethinkdb as rdb
|
||||
|
||||
from brozzler import suggest_default_chrome_exe
|
||||
|
||||
r = rdb.RethinkDB()
|
||||
|
||||
logger = structlog.get_logger(logger_name=__name__)
|
||||
@ -213,68 +213,6 @@ def configure_logging(args):
|
||||
)
|
||||
|
||||
|
||||
def mdfind(identifier):
    """
    Look up a macOS application bundle by its bundle identifier.

    Runs ``mdfind`` with a ``kMDItemCFBundleIdentifier == <identifier>``
    query and reads what it prints.

    Returns the first non-empty line of output (a single path), or None if
    mdfind printed nothing, exited with a nonzero status, or could not be run
    at all.
    """
    try:
        output = subprocess.check_output(
            ["mdfind", f"kMDItemCFBundleIdentifier == {identifier}"], text=True
        )
    # Just treat any errors as "couldn't find app". OSError covers the mdfind
    # executable itself being missing or not runnable.
    except (subprocess.CalledProcessError, OSError):
        return None

    # mdfind prints one match per line and may report several copies of the
    # same app. Callers build a single filesystem path out of the result, so
    # hand back only the first match rather than a newline-joined string.
    for line in output.split("\n"):
        if line:
            return line
    return None
|
||||
|
||||
|
||||
def suggest_default_chrome_exe_mac():
    """
    Find a Chromium or Google Chrome executable on macOS.

    Candidates are tried in order of preference, Chromium first, then Chrome:
    first the app bundles reported by mdfind(), then the default install
    paths under /Applications.

    Returns the path of the first candidate executable that exists, or None
    if none of them exist.
    """
    # (bundle identifier, executable name inside <app>/Contents/MacOS)
    browsers = [
        ("org.chromium.Chromium", "Chromium"),
        ("com.google.Chrome", "Google Chrome"),
    ]

    # Check each mdfind hit as soon as we have it, so that a Chrome result
    # can't replace a usable Chromium one, and a stale hit for one browser
    # doesn't stop us from trying the other.
    for bundle_id, exe_name in browsers:
        app = mdfind(bundle_id)
        if app is None:
            continue
        path = f"{app}/Contents/MacOS/{exe_name}"
        if os.path.exists(path):
            return path

    # Fall back to default paths if mdfind couldn't find it
    # (mdfind might fail to find them even in their default paths
    # if the system has Spotlight disabled.)
    for path in [
        "/Applications/Chromium.app/Contents/MacOS/Chromium",
        "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
    ]:
        if os.path.exists(path):
            return path

    return None
|
||||
|
||||
|
||||
def suggest_default_chrome_exe():
    """
    Suggest a chrome/chromium executable to use by default.

    On macOS, returns the path from suggest_default_chrome_exe_mac() if it
    finds one. Otherwise returns the first of a list of well-known executable
    names that shutil.which() can find, or "chromium-browser" if it finds
    none of them.
    """
    # First ask mdfind, which lets us find it in non-default paths
    on_mac = sys.platform == "darwin"
    if on_mac:
        mac_exe = suggest_default_chrome_exe_mac()
        if mac_exe is not None:
            return mac_exe

    # "chromium-browser" is the executable on ubuntu trusty
    # https://github.com/internetarchive/brozzler/pull/6/files uses "chromium"
    # google chrome executable names taken from these packages:
    # http://www.ubuntuupdates.org/ppa/google_chrome
    well_known_names = (
        "chromium-browser",
        "chromium",
        "google-chrome",
        "google-chrome-stable",
        "google-chrome-beta",
        "google-chrome-unstable",
    )
    installed = [name for name in well_known_names if shutil.which(name)]
    if installed:
        return installed[0]
    return "chromium-browser"
|
||||
|
||||
|
||||
class BetterArgumentDefaultsHelpFormatter(argparse.ArgumentDefaultsHelpFormatter):
|
||||
"""
|
||||
Like argparse.ArgumentDefaultsHelpFormatter but omits the default value
|
||||
@ -455,7 +393,7 @@ def brozzle_page(argv=None):
|
||||
site,
|
||||
page,
|
||||
on_screenshot=on_screenshot,
|
||||
enable_youtube_dl=not args.skip_youtube_dl,
|
||||
enable_youtube_dl=not worker._skip_youtube_dl,
|
||||
)
|
||||
logger.info("outlinks", outlinks=sorted(outlinks))
|
||||
except brozzler.ReachedLimit as e:
|
||||
|
@ -39,7 +39,6 @@ import urlcanon
|
||||
from requests.structures import CaseInsensitiveDict
|
||||
import rethinkdb as rdb
|
||||
from . import metrics
|
||||
from . import ydl
|
||||
|
||||
r = rdb.RethinkDB()
|
||||
|
||||
@ -96,6 +95,16 @@ class BrozzlerWorker:
|
||||
self._skip_extract_outlinks = skip_extract_outlinks
|
||||
self._skip_visit_hashtags = skip_visit_hashtags
|
||||
self._skip_youtube_dl = skip_youtube_dl
|
||||
|
||||
# We definitely shouldn't ytdlp if the optional extra is missing
|
||||
try:
|
||||
import yt_dlp
|
||||
except ImportError:
|
||||
self.logger.info(
|
||||
"optional yt-dlp extra not installed; setting skip_youtube_dl to True"
|
||||
)
|
||||
self._skip_youtube_dl = True
|
||||
|
||||
self._ytdlp_tmpdir = ytdlp_tmpdir
|
||||
self._simpler404 = simpler404
|
||||
self._screenshot_full_page = screenshot_full_page
|
||||
@ -260,6 +269,38 @@ class BrozzlerWorker:
|
||||
img.save(out, "jpeg", quality=95)
|
||||
return out.getbuffer()
|
||||
|
||||
def should_ytdlp(self, logger, site, page, page_status, skip_av_seeds):
    """
    Decide whether yt-dlp should be run for `page`.

    Called only after we've passed the needs_browsing() check.

    Returns False if `page_status` is not 200, if `site.skip_ytdlp` is
    truthy, if the page url (the redirect url, when there is one) contains
    "chrome-error:", or if the site's ait_seed_id is in `skip_av_seeds` while
    `site.skip_ytdlp` is still None. Returns True otherwise. Every False
    except the chrome-error one is logged through `logger`.

    Side effect: when the site metadata has a truthy ait_seed_id,
    `site.skip_ytdlp` is set to the opposite of the returned value.
    """
    if page_status != 200:
        logger.info("skipping ytdlp: non-200 page status", page_status=page_status)
        return False

    if site.skip_ytdlp:
        logger.info("skipping ytdlp: site marked skip_ytdlp")
        return False

    # prefer the url we were redirected to, when there is one
    ytdlp_url = page.redirect_url or page.url
    if "chrome-error:" in ytdlp_url:
        return False

    ytdlp_seed = None
    if "metadata" in site and "ait_seed_id" in site["metadata"]:
        ytdlp_seed = site["metadata"]["ait_seed_id"]

    # TODO: develop UI and refactor
    if not ytdlp_seed:
        return True

    if site.skip_ytdlp is None and ytdlp_seed in skip_av_seeds:
        logger.info("skipping ytdlp: site in skip_av_seeds")
        site.skip_ytdlp = True
        return False

    site.skip_ytdlp = False
    return True
|
||||
|
||||
@metrics.brozzler_page_processing_duration_seconds.time()
|
||||
@metrics.brozzler_in_progress_pages.track_inprogress()
|
||||
def brozzle_page(
|
||||
@ -293,10 +334,12 @@ class BrozzlerWorker:
|
||||
except brozzler.PageInterstitialShown:
|
||||
page_logger.info("page interstitial shown (http auth)")
|
||||
|
||||
if enable_youtube_dl and ydl.should_ytdlp(
|
||||
site, page, status_code, self._skip_av_seeds
|
||||
if enable_youtube_dl and self.should_ytdlp(
|
||||
page_logger, site, page, status_code, self._skip_av_seeds
|
||||
):
|
||||
try:
|
||||
from . import ydl
|
||||
|
||||
ydl_outlinks = ydl.do_youtube_dl(
|
||||
self, site, page, self._ytdlp_proxy_endpoints
|
||||
)
|
||||
|
@ -43,39 +43,6 @@ YTDLP_MAX_REDIRECTS = 5
|
||||
logger = structlog.get_logger(logger_name=__name__)
|
||||
|
||||
|
||||
def should_ytdlp(site, page, page_status, skip_av_seeds):
    """
    Decide whether yt-dlp should be run for `page`.

    Called only after we've passed the needs_browsing() check.

    Returns False if `page_status` is not 200, if `site.skip_ytdlp` is
    truthy, if the page url (the redirect url, when there is one) contains
    "chrome-error:", or if the site's ait_seed_id is in `skip_av_seeds` while
    `site.skip_ytdlp` is still None. Returns True otherwise. Every False
    except the chrome-error one is logged.

    Side effect: when the site metadata has a truthy ait_seed_id,
    `site.skip_ytdlp` is set to the opposite of the returned value.
    """
    if page_status != 200:
        logger.info("skipping ytdlp: non-200 page status", page_status=page_status)
        return False

    if site.skip_ytdlp:
        logger.info("skipping ytdlp: site marked skip_ytdlp")
        return False

    # prefer the url we were redirected to, when there is one
    if page.redirect_url:
        ytdlp_url = page.redirect_url
    else:
        ytdlp_url = page.url

    if "chrome-error:" in ytdlp_url:
        return False

    has_seed_id = "metadata" in site and "ait_seed_id" in site["metadata"]
    ytdlp_seed = site["metadata"]["ait_seed_id"] if has_seed_id else None

    # TODO: develop UI and refactor
    if ytdlp_seed:
        skip = site.skip_ytdlp is None and ytdlp_seed in skip_av_seeds
        if skip:
            logger.info("skipping ytdlp: site in skip_av_seeds")
        site.skip_ytdlp = skip
        return not skip

    return True
|
||||
|
||||
|
||||
def isyoutubehost(url):
    """
    Return True if the host part of `url` contains "youtube.com".

    `url` may be given with or without a scheme. Only the host is examined;
    a "youtube.com" appearing in the path or query string does not count.
    """
    # Strip the scheme. Only treat "//" as the scheme separator when nothing
    # before it already looks like path or query; otherwise a "//" further
    # along (e.g. another url embedded in the path or query string) would be
    # mistaken for the start of the host.
    before, sep, after = url.partition("//")
    if sep and "/" not in before and "?" not in before:
        url = after

    # the host ends at the first "/" (path) or "?" (query string)
    host = url.split("/", 1)[0].split("?", 1)[0]
    return "youtube.com" in host
|
||||
|
Loading…
x
Reference in New Issue
Block a user