mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-20 15:55:49 -04:00
Merge branch 'doublethink-optional' into qa
This commit is contained in:
commit
96459fe766
@ -321,44 +321,126 @@ def _remove_query(url):
|
||||
# XXX chop off path after last slash??
|
||||
site_surt_canon = urlcanon.Canonicalizer(urlcanon.semantic.steps + [_remove_query])
|
||||
|
||||
import doublethink
|
||||
|
||||
def _mdfind(identifier):
    """
    Look up an application bundle by bundle identifier using ``mdfind``.

    Runs ``mdfind`` with a ``kMDItemCFBundleIdentifier == <identifier>``
    query and returns the first path it prints.

    Returns the path as a string, or None when mdfind printed nothing,
    exited with an error, or could not be executed at all.
    """
    import subprocess

    try:
        result = subprocess.check_output(
            ["mdfind", f"kMDItemCFBundleIdentifier == {identifier}"], text=True
        )
    # Just treat any errors as "couldn't find app". OSError covers the case
    # where the mdfind executable itself is missing or not runnable.
    except (subprocess.CalledProcessError, OSError):
        return None

    # mdfind prints one match per line; when several bundles share the
    # identifier, use the first one rather than a multi-line pseudo-path.
    for line in result.split("\n"):
        if line:
            return line
    return None
|
||||
|
||||
|
||||
def _suggest_default_chrome_exe_mac():
    """
    Find a Chromium or Google Chrome executable on macOS.

    Asks ``_mdfind`` for each browser's bundle first, then falls back to the
    default /Applications locations.

    Returns the path of the first executable that exists on disk, or None
    if none of the candidates exist.
    """
    import os

    # Try Chromium first, then Chrome. Each candidate is checked on disk
    # before moving on, so a stale index entry for one browser cannot hide a
    # working install of the other.
    for bundle_id, exe_name in (
        ("org.chromium.Chromium", "Chromium"),
        ("com.google.Chrome", "Google Chrome"),
    ):
        bundle = _mdfind(bundle_id)
        if bundle is None:
            continue
        path = f"{bundle}/Contents/MacOS/{exe_name}"
        if os.path.exists(path):
            return path

    # Fall back to default paths if mdfind couldn't find it
    # (mdfind might fail to find them even in their default paths
    # if the system has Spotlight disabled.)
    for path in [
        "/Applications/Chromium.app/Contents/MacOS/Chromium",
        "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
    ]:
        if os.path.exists(path):
            return path

    return None
|
||||
|
||||
|
||||
def suggest_default_chrome_exe():
    """
    Suggest a Chrome/Chromium executable to launch.

    On macOS the result of ``_suggest_default_chrome_exe_mac()`` is returned
    when it is not None. Otherwise the first well-known executable name that
    ``shutil.which`` can resolve is returned, and "chromium-browser" is the
    final fallback.
    """
    import shutil
    import sys

    # First ask mdfind, which lets us find it in non-default paths
    if sys.platform == "darwin":
        mac_exe = _suggest_default_chrome_exe_mac()
        if mac_exe is not None:
            return mac_exe

    # "chromium-browser" is the executable on ubuntu trusty
    # https://github.com/internetarchive/brozzler/pull/6/files uses "chromium"
    # google chrome executable names taken from these packages:
    # http://www.ubuntuupdates.org/ppa/google_chrome
    candidates = (
        "chromium-browser",
        "chromium",
        "google-chrome",
        "google-chrome-stable",
        "google-chrome-beta",
        "google-chrome-unstable",
    )
    return next(
        (name for name in candidates if shutil.which(name)),
        "chromium-browser",
    )
|
||||
|
||||
|
||||
import datetime
|
||||
|
||||
EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace(tzinfo=doublethink.UTC)
|
||||
EPOCH_UTC = datetime.datetime.fromtimestamp(0.0, tz=datetime.timezone.utc)
|
||||
|
||||
# we could make this configurable if there's a good reason
|
||||
MAX_PAGE_FAILURES = 3
|
||||
|
||||
from brozzler.worker import BrozzlerWorker
|
||||
from brozzler.robots import is_permitted_by_robots
|
||||
from brozzler.frontier import RethinkDbFrontier
|
||||
from brozzler.browser import Browser, BrowserPool, BrowsingException
|
||||
from brozzler.model import (
|
||||
new_job,
|
||||
new_job_file,
|
||||
new_site,
|
||||
Job,
|
||||
Page,
|
||||
Site,
|
||||
InvalidJobConf,
|
||||
)
|
||||
from brozzler.cli import suggest_default_chrome_exe
|
||||
|
||||
__all__ = [
|
||||
"Page",
|
||||
"Site",
|
||||
"BrozzlerWorker",
|
||||
"is_permitted_by_robots",
|
||||
"RethinkDbFrontier",
|
||||
"Browser",
|
||||
"BrowserPool",
|
||||
"BrowsingException",
|
||||
"new_job",
|
||||
"new_site",
|
||||
"Job",
|
||||
"new_job_file",
|
||||
"InvalidJobConf",
|
||||
"sleep",
|
||||
"thread_accept_exceptions",
|
||||
"thread_raise",
|
||||
"suggest_default_chrome_exe",
|
||||
]
|
||||
|
||||
try:
|
||||
import doublethink
|
||||
|
||||
# All of these imports use doublethink for real and are unsafe
|
||||
# to do if doublethink is unavailable.
|
||||
from brozzler.worker import BrozzlerWorker
|
||||
from brozzler.frontier import RethinkDbFrontier
|
||||
from brozzler.model import (
|
||||
new_job,
|
||||
new_job_file,
|
||||
new_site,
|
||||
Job,
|
||||
Page,
|
||||
Site,
|
||||
InvalidJobConf,
|
||||
)
|
||||
|
||||
__all__.extend(
|
||||
[
|
||||
"Page",
|
||||
"BrozzlerWorker",
|
||||
"RethinkDbFrontier",
|
||||
"Site",
|
||||
"new_job",
|
||||
"new_site",
|
||||
"Job",
|
||||
"new_job_file",
|
||||
"InvalidJobConf",
|
||||
]
|
||||
)
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
# we could make this configurable if there's a good reason
|
||||
MAX_PAGE_FAILURES = 3
|
||||
|
@ -30,17 +30,17 @@ import doublethink
|
||||
import signal
|
||||
import string
|
||||
import structlog
|
||||
import subprocess
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
import traceback
|
||||
import warnings
|
||||
import yaml
|
||||
import shutil
|
||||
import base64
|
||||
import rethinkdb as rdb
|
||||
|
||||
from brozzler import suggest_default_chrome_exe
|
||||
|
||||
r = rdb.RethinkDB()
|
||||
|
||||
logger = structlog.get_logger(logger_name=__name__)
|
||||
@ -213,69 +213,6 @@ def configure_logging(args):
|
||||
)
|
||||
|
||||
|
||||
def mdfind(identifier):
    """
    Look up an application bundle by bundle identifier using ``mdfind``.

    Runs ``mdfind`` with a ``kMDItemCFBundleIdentifier == <identifier>``
    query and returns the first path it prints.

    Returns the path as a string, or None when mdfind printed nothing,
    exited with an error, or could not be executed at all.
    """
    try:
        result = subprocess.check_output(
            ["mdfind", f"kMDItemCFBundleIdentifier == {identifier}"], text=True
        )
    # Just treat any errors as "couldn't find app". OSError covers the case
    # where the mdfind executable itself is missing or not runnable.
    except (subprocess.CalledProcessError, OSError):
        return None

    # mdfind prints one match per line; when several bundles share the
    # identifier, use the first one rather than a multi-line pseudo-path.
    for line in result.split("\n"):
        if line:
            return line
    return None
|
||||
|
||||
|
||||
def suggest_default_chrome_exe_mac():
    """
    Find a Chromium-family browser executable on macOS.

    Asks ``mdfind`` for the Chromium and Google Chrome bundles first, then
    falls back to the default /Applications locations.

    Returns the path of the first executable that exists on disk, or None
    if none of the candidates exist.
    """
    # Try Chromium first, then Chrome. Each candidate is checked on disk
    # before moving on, so a stale index entry for one browser cannot hide a
    # working install of the other.
    for bundle_id, exe_name in (
        ("org.chromium.Chromium", "Chromium"),
        ("com.google.Chrome", "Google Chrome"),
    ):
        bundle = mdfind(bundle_id)
        if bundle is None:
            continue
        path = f"{bundle}/Contents/MacOS/{exe_name}"
        if os.path.exists(path):
            return path

    # Fall back to default paths if mdfind couldn't find it
    # (mdfind might fail to find them even in their default paths
    # if the system has Spotlight disabled.)
    for path in [
        "/Applications/Thorium.app/Contents/MacOS/Thorium",
        "/Applications/Chromium.app/Contents/MacOS/Chromium",
        "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
    ]:
        if os.path.exists(path):
            return path

    return None
|
||||
|
||||
|
||||
def suggest_default_chrome_exe():
    """
    Suggest a Chrome/Chromium executable to launch.

    On macOS the result of ``suggest_default_chrome_exe_mac()`` is returned
    when it is not None. Otherwise the first well-known executable name that
    ``shutil.which`` can resolve is returned, and "chromium-browser" is the
    final fallback.
    """
    # First ask mdfind, which lets us find it in non-default paths
    if sys.platform == "darwin":
        mac_exe = suggest_default_chrome_exe_mac()
        if mac_exe is not None:
            return mac_exe

    # "chromium-browser" is the executable on ubuntu trusty
    # https://github.com/internetarchive/brozzler/pull/6/files uses "chromium"
    # google chrome executable names taken from these packages:
    # http://www.ubuntuupdates.org/ppa/google_chrome
    candidates = (
        "chromium-browser",
        "chromium",
        "google-chrome",
        "google-chrome-stable",
        "google-chrome-beta",
        "google-chrome-unstable",
    )
    return next(
        (name for name in candidates if shutil.which(name)),
        "chromium-browser",
    )
|
||||
|
||||
|
||||
class BetterArgumentDefaultsHelpFormatter(argparse.ArgumentDefaultsHelpFormatter):
|
||||
"""
|
||||
Like argparse.ArgumentDefaultsHelpFormatter but omits the default value
|
||||
|
@ -40,7 +40,6 @@ import urlcanon
|
||||
from requests.structures import CaseInsensitiveDict
|
||||
import rethinkdb as rdb
|
||||
from . import metrics
|
||||
from . import ydl
|
||||
|
||||
r = rdb.RethinkDB()
|
||||
|
||||
@ -95,6 +94,16 @@ class BrozzlerWorker:
|
||||
self._skip_extract_outlinks = skip_extract_outlinks
|
||||
self._skip_visit_hashtags = skip_visit_hashtags
|
||||
self._skip_youtube_dl = skip_youtube_dl
|
||||
|
||||
# We definitely shouldn't ytdlp if the optional extra is missing
|
||||
try:
|
||||
import yt_dlp
|
||||
except ImportError:
|
||||
self.logger.info(
|
||||
"optional yt-dlp extra not installed; setting skip_youtube_dl to True"
|
||||
)
|
||||
self._skip_youtube_dl = True
|
||||
|
||||
self._ytdlp_tmpdir = ytdlp_tmpdir
|
||||
self._simpler404 = simpler404
|
||||
self._screenshot_full_page = screenshot_full_page
|
||||
@ -260,6 +269,38 @@ class BrozzlerWorker:
|
||||
img.save(out, "jpeg", quality=95)
|
||||
return out.getbuffer()
|
||||
|
||||
def should_ytdlp(self, logger, site, page, page_status, skip_av_seeds):
    """
    Decide whether yt-dlp should be run for a page.

    Returns False for a non-200 ``page_status``, for a site whose
    ``skip_ytdlp`` is truthy, for a "chrome-error:" url, and for a site whose
    ``ait_seed_id`` is listed in ``skip_av_seeds`` while ``skip_ytdlp`` is
    still None. Returns True otherwise.

    Side effect: when the site has a truthy ``ait_seed_id``,
    ``site.skip_ytdlp`` is set to True (seed listed, decision not yet made)
    or False (every other case that reaches that point).
    """
    # called only after we've passed needs_browsing() check
    if page_status != 200:
        logger.info("skipping ytdlp: non-200 page status", page_status=page_status)
        return False

    if site.skip_ytdlp:
        logger.info("skipping ytdlp: site marked skip_ytdlp")
        return False

    # Prefer the post-redirect url when one was recorded.
    ytdlp_url = page.redirect_url or page.url
    if "chrome-error:" in ytdlp_url:
        return False

    seed_id = None
    if "metadata" in site and "ait_seed_id" in site["metadata"]:
        seed_id = site["metadata"]["ait_seed_id"]

    # TODO: develop UI and refactor
    if not seed_id:
        return True

    if site.skip_ytdlp is None and seed_id in skip_av_seeds:
        logger.info("skipping ytdlp: site in skip_av_seeds")
        site.skip_ytdlp = True
        return False

    site.skip_ytdlp = False
    return True
|
||||
|
||||
@metrics.brozzler_page_processing_duration_seconds.time()
|
||||
@metrics.brozzler_in_progress_pages.track_inprogress()
|
||||
def brozzle_page(
|
||||
@ -303,8 +344,12 @@ class BrozzlerWorker:
|
||||
except brozzler.PageInterstitialShown:
|
||||
page_logger.info("page interstitial shown (http auth)")
|
||||
|
||||
if enable_youtube_dl and ydl.should_ytdlp(site, page, status_code):
|
||||
if enable_youtube_dl and self.should_ytdlp(
|
||||
page_logger, site, page, status_code, set()
|
||||
):
|
||||
try:
|
||||
from . import ydl
|
||||
|
||||
ydl_outlinks = ydl.do_youtube_dl(
|
||||
self, site, page, self._ytdlp_proxy_endpoints
|
||||
)
|
||||
|
@ -47,27 +47,6 @@ YTDLP_MAX_REDIRECTS = 5
|
||||
logger = structlog.get_logger(logger_name=__name__)
|
||||
|
||||
|
||||
def should_ytdlp(site, page, page_status):
    """
    Decide whether yt-dlp should be run for a page.

    Returns False for a non-200 ``page_status``, for a site whose
    ``video_capture`` equals the DISABLE_VIDEO_CAPTURE or
    DISABLE_YTDLP_CAPTURE option value, and for a "chrome-error:" url.
    Returns True otherwise.
    """
    # called only after we've passed needs_browsing() check
    if page_status != 200:
        logger.info("skipping ytdlp: non-200 page status %s", page_status)
        return False

    ytdlp_disabled_values = [
        VideoCaptureOptions.DISABLE_VIDEO_CAPTURE.value,
        VideoCaptureOptions.DISABLE_YTDLP_CAPTURE.value,
    ]
    if site.video_capture in ytdlp_disabled_values:
        logger.info("skipping ytdlp: site has video capture disabled")
        return False

    # Prefer the post-redirect url when one was recorded.
    ytdlp_url = page.redirect_url or page.url
    return "chrome-error:" not in ytdlp_url
|
||||
|
||||
|
||||
def isyoutubehost(url):
    """
    Report whether the host portion of ``url`` contains "youtube.com".

    The host is taken as the text after the last "//", up to the first "/"
    and the first "?". This is a substring test, not an exact host match.
    """
    # drop everything up to and including the last "//" (the scheme)
    after_scheme = url.split("//")[-1]
    # cut the path off the hostname
    host_and_query, _, _ = after_scheme.partition("/")
    # cut a query string attached directly to the hostname
    host, _, _ = host_and_query.partition("?")
    return "youtube.com" in host
|
||||
|
@ -25,3 +25,9 @@ Issues = "https://github.com/internetarchive/brozzler/issues"
|
||||
[build-system]
|
||||
requires = ["setuptools>=61.0"]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
||||
[dependency-groups]
|
||||
dev = [
|
||||
"pytest>=8.3.5",
|
||||
"ruff>=0.9.9"
|
||||
]
|
||||
|
Loading…
x
Reference in New Issue
Block a user