mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-20 15:55:49 -04:00
Merge branch 'master' into gretchen/easy-instructions-update
This commit is contained in:
commit
e77b4f1c63
4
.github/workflows/python-formatting.yml
vendored
4
.github/workflows/python-formatting.yml
vendored
@ -22,10 +22,10 @@ jobs:
|
||||
- name: Create virtual environment
|
||||
run: python -m venv venv
|
||||
|
||||
- name: Install black
|
||||
- name: Install ruff
|
||||
run: |
|
||||
./venv/bin/pip install --upgrade pip
|
||||
./venv/bin/pip install black
|
||||
./venv/bin/pip install ruff
|
||||
|
||||
- name: Run formatting check
|
||||
run: make ck-format
|
||||
|
4
Makefile
4
Makefile
@ -1,7 +1,7 @@
|
||||
.PHONY: format
|
||||
format:
|
||||
venv/bin/black -t py35 -t py36 -t py37 -t py38 -t py39 -t py310 -t py311 -t py312 .
|
||||
venv/bin/ruff format --target-version py37 .
|
||||
|
||||
.PHONY: ck-format
|
||||
ck-format:
|
||||
venv/bin/black --check .
|
||||
venv/bin/ruff format --check --target-version py37 .
|
||||
|
@ -367,7 +367,7 @@ class Chrome:
|
||||
os.killpg(self.chrome_process.pid, signal.SIGKILL)
|
||||
status = self.chrome_process.wait()
|
||||
pid_logger.warning(
|
||||
"chrome reaped after killing with " "SIGKILL",
|
||||
"chrome reaped after killing with SIGKILL",
|
||||
status=status,
|
||||
)
|
||||
|
||||
|
@ -74,6 +74,17 @@ def add_common_options(arg_parser, argv=None):
|
||||
const=logging.DEBUG,
|
||||
help=("very verbose logging"),
|
||||
)
|
||||
arg_parser.add_argument(
|
||||
"--syslogd-log-prefix",
|
||||
dest="syslogd_log_prefix",
|
||||
action="store_true",
|
||||
help="add syslogd log level prefix for journalctl filtering",
|
||||
)
|
||||
arg_parser.add_argument(
|
||||
"--worker-id",
|
||||
dest="worker_id",
|
||||
help="ID for this worker, displayed in logs if provided",
|
||||
)
|
||||
# arg_parser.add_argument(
|
||||
# '-s', '--silent', dest='log_level', action='store_const',
|
||||
# default=logging.INFO, const=logging.CRITICAL)
|
||||
@ -131,30 +142,58 @@ def decorate_logger_name(a, b, event_dict):
|
||||
return event_dict
|
||||
|
||||
|
||||
# https://manpages.debian.org/testing/libsystemd-dev/sd-daemon.3.en.html
|
||||
def _systemd_log_prefix(_, level, log):
    """Prepend an sd-daemon style ``<N>`` priority marker to a log line.

    systemd/journald interprets a leading ``<N>`` on a line written to
    stdout/stderr as that line's syslog priority, which is what enables
    filtering such as ``journalctl -p warning``.

    Args:
        _: Unused first positional argument (kept so the function matches
            the three-argument processor call signature).
        level: Name of the log level / logging method, e.g. ``"warning"``.
        log: The log line to prefix. It is interpolated into an f-string,
            so this is expected to be the already-rendered text.

    Returns:
        ``log`` prefixed with ``<N>`` when ``level`` is a known level name,
        otherwise ``log`` unchanged.
    """
    # Priorities follow sd-daemon(3): 2=crit, 3=err, 4=warning, 6=info,
    # 7=debug. Warnings previously used 3, which made them show up as
    # errors in journalctl priority filters.
    SYSLOG_MAP = {
        "critical": 2,
        "error": 3,
        "exception": 3,
        "warn": 4,
        "warning": 4,
        "info": 6,
        "debug": 7,
        "notset": 7,
    }

    prefix = SYSLOG_MAP.get(level)
    if prefix is None:
        # Unknown level name: leave the line untouched rather than guess.
        return log

    return f"<{prefix}>{log}"
|
||||
|
||||
|
||||
def configure_logging(args):
|
||||
processors = [
|
||||
structlog.contextvars.merge_contextvars,
|
||||
structlog.processors.add_log_level,
|
||||
structlog.processors.StackInfoRenderer(),
|
||||
structlog.dev.set_exc_info,
|
||||
structlog.processors.TimeStamper(fmt="%Y-%m-%d %H:%M:%S", utc=True),
|
||||
structlog.processors.CallsiteParameterAdder(
|
||||
[
|
||||
structlog.processors.CallsiteParameter.FILENAME,
|
||||
structlog.processors.CallsiteParameter.FUNC_NAME,
|
||||
structlog.processors.CallsiteParameter.LINENO,
|
||||
],
|
||||
),
|
||||
decorate_logger_name,
|
||||
structlog.dev.ConsoleRenderer(),
|
||||
]
|
||||
|
||||
if args.syslogd_log_prefix:
|
||||
processors.append(_systemd_log_prefix)
|
||||
|
||||
structlog.configure(
|
||||
processors=[
|
||||
structlog.contextvars.merge_contextvars,
|
||||
structlog.processors.add_log_level,
|
||||
structlog.processors.StackInfoRenderer(),
|
||||
structlog.dev.set_exc_info,
|
||||
structlog.processors.TimeStamper(fmt="%Y-%m-%d %H:%M:%S", utc=True),
|
||||
structlog.processors.CallsiteParameterAdder(
|
||||
[
|
||||
structlog.processors.CallsiteParameter.FILENAME,
|
||||
structlog.processors.CallsiteParameter.FUNC_NAME,
|
||||
structlog.processors.CallsiteParameter.LINENO,
|
||||
],
|
||||
),
|
||||
decorate_logger_name,
|
||||
structlog.dev.ConsoleRenderer(),
|
||||
],
|
||||
processors=processors,
|
||||
wrapper_class=structlog.make_filtering_bound_logger(args.log_level),
|
||||
context_class=dict,
|
||||
logger_factory=structlog.PrintLoggerFactory(),
|
||||
cache_logger_on_first_use=False,
|
||||
)
|
||||
|
||||
# Adds the worker ID to the global binding, if supplied
|
||||
if args.worker_id is not None:
|
||||
structlog.contextvars.bind_contextvars(worker_id=args.worker_id)
|
||||
|
||||
# We still configure logging for now because its handlers
|
||||
# are used for the gunicorn spawned by the brozzler dashboard.
|
||||
logging.basicConfig(
|
||||
@ -953,7 +992,7 @@ def brozzler_list_pages(argv=None):
|
||||
"--claimed",
|
||||
dest="claimed",
|
||||
action="store_true",
|
||||
help=("limit to pages that are currently claimed by a brozzler " "worker"),
|
||||
help=("limit to pages that are currently claimed by a brozzler worker"),
|
||||
)
|
||||
add_rethinkdb_options(arg_parser)
|
||||
add_common_options(arg_parser, argv)
|
||||
@ -1024,22 +1063,21 @@ def brozzler_purge(argv=None):
|
||||
dest="job",
|
||||
metavar="JOB_ID",
|
||||
help=(
|
||||
"purge crawl state from rethinkdb for a job, including all "
|
||||
"sites and pages"
|
||||
"purge crawl state from rethinkdb for a job, including all sites and pages"
|
||||
),
|
||||
)
|
||||
group.add_argument(
|
||||
"--site",
|
||||
dest="site",
|
||||
metavar="SITE_ID",
|
||||
help=("purge crawl state from rethinkdb for a site, including all " "pages"),
|
||||
help=("purge crawl state from rethinkdb for a site, including all pages"),
|
||||
)
|
||||
group.add_argument(
|
||||
"--finished-before",
|
||||
dest="finished_before",
|
||||
metavar="YYYY-MM-DD",
|
||||
help=(
|
||||
"purge crawl state from rethinkdb for a jobs that ended " "before this date"
|
||||
"purge crawl state from rethinkdb for a jobs that ended before this date"
|
||||
),
|
||||
)
|
||||
arg_parser.add_argument(
|
||||
|
@ -334,7 +334,7 @@ def main(argv=None):
|
||||
prog=os.path.basename(argv[0]),
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
description=(
|
||||
"brozzler-dashboard - web application for viewing brozzler " "crawl status"
|
||||
"brozzler-dashboard - web application for viewing brozzler crawl status"
|
||||
),
|
||||
epilog=(
|
||||
"brozzler-dashboard has no command line options, but can be "
|
||||
|
@ -81,8 +81,7 @@ def _build_arg_parser(argv=None):
|
||||
dest="cacert",
|
||||
default="./%s-warcprox-ca.pem" % socket.gethostname(),
|
||||
help=(
|
||||
"warcprox CA certificate file; if file does not exist, it "
|
||||
"will be created"
|
||||
"warcprox CA certificate file; if file does not exist, it will be created"
|
||||
),
|
||||
)
|
||||
arg_parser.add_argument(
|
||||
@ -95,7 +94,7 @@ def _build_arg_parser(argv=None):
|
||||
"--onion-tor-socks-proxy",
|
||||
dest="onion_tor_socks_proxy",
|
||||
default=None,
|
||||
help=("host:port of tor socks proxy, used only to connect to " ".onion sites"),
|
||||
help=("host:port of tor socks proxy, used only to connect to .onion sites"),
|
||||
)
|
||||
|
||||
# brozzler-worker args
|
||||
@ -112,7 +111,7 @@ def _build_arg_parser(argv=None):
|
||||
dest="max_browsers",
|
||||
type=int,
|
||||
default=1,
|
||||
help=("max number of chrome instances simultaneously " "browsing pages"),
|
||||
help=("max number of chrome instances simultaneously browsing pages"),
|
||||
)
|
||||
|
||||
# pywb args
|
||||
|
@ -447,8 +447,6 @@ def main(argv=sys.argv):
|
||||
wayback_cli = BrozzlerWaybackCli(
|
||||
args=argv[1:],
|
||||
default_port=8880,
|
||||
desc=(
|
||||
"brozzler-wayback - pywb wayback (monkey-patched for use " "with brozzler)"
|
||||
),
|
||||
desc=("brozzler-wayback - pywb wayback (monkey-patched for use with brozzler)"),
|
||||
)
|
||||
wayback_cli.run()
|
||||
|
@ -120,7 +120,7 @@ def is_permitted_by_robots(site, url, proxy=None):
|
||||
raise brozzler.ProxyError(e)
|
||||
else:
|
||||
structlog.get_logger(logger_name=__name__).warning(
|
||||
"returning true (permitted) after problem fetching " "robots.txt",
|
||||
"returning true (permitted) after problem fetching robots.txt",
|
||||
url=url,
|
||||
raised_exception=e,
|
||||
)
|
||||
|
@ -169,7 +169,7 @@ class BrozzlerWorker:
|
||||
svc = self._choose_warcprox()
|
||||
if svc is None:
|
||||
raise brozzler.ProxyError(
|
||||
"no available instances of warcprox in the service " "registry"
|
||||
"no available instances of warcprox in the service registry"
|
||||
)
|
||||
site.proxy = "%s:%s" % (svc["host"], svc["port"])
|
||||
site.save()
|
||||
@ -735,7 +735,7 @@ class BrozzlerWorker:
|
||||
with self._start_stop_lock:
|
||||
if self._thread:
|
||||
self.logger.warning(
|
||||
"ignoring start request because self._thread is " "not None"
|
||||
"ignoring start request because self._thread is not None"
|
||||
)
|
||||
return
|
||||
self._thread = threading.Thread(target=self.run, name="BrozzlerWorker")
|
||||
|
@ -434,7 +434,7 @@ def _try_youtube_dl(worker, ydl, site, page):
|
||||
if worker._using_warcprox(site):
|
||||
info_json = json.dumps(ie_result, sort_keys=True, indent=4)
|
||||
logger.info(
|
||||
"sending WARCPROX_WRITE_RECORD request to warcprox " "with yt-dlp json",
|
||||
"sending WARCPROX_WRITE_RECORD request to warcprox with yt-dlp json",
|
||||
url=ydl.url,
|
||||
)
|
||||
worker._warcprox_write_record(
|
||||
|
@ -25,3 +25,9 @@ Issues = "https://github.com/internetarchive/brozzler/issues"
|
||||
[build-system]
|
||||
requires = ["setuptools>=61.0"]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
||||
[dependency-groups]
|
||||
dev = [
|
||||
"pytest>=8.3.5",
|
||||
"ruff>=0.9.9"
|
||||
]
|
||||
|
@ -29,7 +29,9 @@ import json
|
||||
import threading
|
||||
import socket
|
||||
|
||||
args = argparse.Namespace()
|
||||
arg_parser = argparse.ArgumentParser()
|
||||
brozzler.cli.add_common_options(arg_parser)
|
||||
args = arg_parser.parse_args([])
|
||||
args.log_level = logging.INFO
|
||||
brozzler.cli.configure_logging(args)
|
||||
|
||||
|
@ -28,7 +28,9 @@ import pytest
|
||||
|
||||
import brozzler
|
||||
|
||||
args = argparse.Namespace()
|
||||
arg_parser = argparse.ArgumentParser()
|
||||
brozzler.cli.add_common_options(arg_parser)
|
||||
args = arg_parser.parse_args([])
|
||||
args.log_level = logging.INFO
|
||||
brozzler.cli.configure_logging(args)
|
||||
|
||||
|
@ -82,7 +82,7 @@ def main(argv=[]):
|
||||
os.chdir(os.path.dirname(__file__))
|
||||
|
||||
cmd = (
|
||||
"/opt/brozzler-ve3/bin/python /opt/brozzler-ve3/bin/brozzler-new-site " "%s %s"
|
||||
"/opt/brozzler-ve3/bin/python /opt/brozzler-ve3/bin/brozzler-new-site %s %s"
|
||||
) % (" ".join(options), args.seed)
|
||||
subprocess.call(["vagrant", "ssh", "--", cmd])
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user