black formatting

Misty De Méo 2025-02-19 09:13:10 -08:00
parent 3cdefc0779
commit 32b90f7029
11 changed files with 55 additions and 52 deletions
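
For context, black rewrites files to one deterministic style: 88-column lines, double quotes, and trailing commas in exploded calls. A minimal sketch of the kind of rewrite the hunks below show, using black's format_str API on a hypothetical input (not code from this repo):

import black

# Hypothetical input, not from this repo: the second line is 92 columns.
src = (
    "def f(self):\n"
    '    self.logger.info("raising pending exception",'
    " pending_exception=self.pending_exception)\n"
)

# black.format_str() applies the same rules as the `black` CLI.
print(black.format_str(src, mode=black.Mode()))
# def f(self):
#     self.logger.info(
#         "raising pending exception", pending_exception=self.pending_exception
#     )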

View File

@@ -184,7 +184,9 @@ class ThreadExceptionGate:
     def __enter__(self):
         assert self.thread == threading.current_thread()
         if self.pending_exception:
-            self.logger.info("raising pending exception", pending_exception=self.pending_exception)
+            self.logger.info(
+                "raising pending exception", pending_exception=self.pending_exception
+            )
             tmp = self.pending_exception
             self.pending_exception = None
             raise tmp
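
The hunk above shows black's first wrapping strategy: when a call overflows the 88-column limit, the arguments move to one shared continuation line, the closing parenthesis gets its own line, and no trailing comma is added (black only appends one once arguments go one per line). A stand-alone sketch with hypothetical names:

import structlog


class Gate:
    logger = structlog.get_logger()
    pending_exception = RuntimeError("example")  # hypothetical stand-in value

    def enter(self):
        # At this indent the one-line form would run past 88 columns, so
        # black wraps it; both arguments share the continuation line and
        # there is no trailing comma.
        self.logger.info(
            "raising pending exception", pending_exception=self.pending_exception
        )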

View File

@@ -717,9 +717,7 @@ class Browser:
                 # no links found
                 return frozenset()
             else:
-                self.logger.error(
-                    "problem extracting outlinks", message=message
-                )
+                self.logger.error("problem extracting outlinks", message=message)
                 return frozenset()

     def screenshot(self, full_page=False, timeout=45):
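
This hunk is the reverse rewrite: a call that was wrapped by hand but fits within 88 columns gets joined back onto one line. black normalizes in both directions, so wrapping survives only when the line is genuinely too long or carries a magic trailing comma. Sketch with hypothetical values:

import structlog

logger = structlog.get_logger()
message = "example failure detail"  # hypothetical value

# Fits comfortably under 88 columns, so black collapses the wrapped form.
logger.error("problem extracting outlinks", message=message)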

View File

@@ -268,7 +268,7 @@ class Chrome:
                 url_logger.warning(
                     "problem accessing url (will keep trying until timeout)",
                     timeout=timeout_sec,
-                    exc_info=True
+                    exc_info=True,
                 )
                 self._last_warning = time.time()
             finally:
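
Here only a trailing comma is added: once a call is exploded one-argument-per-line, black appends a comma after the last argument, and on later runs that "magic trailing comma" keeps the call exploded even if it would fit on one line again. Sketch with hypothetical values:

import structlog

url_logger = structlog.get_logger()
timeout_sec = 300  # hypothetical value

url_logger.warning(
    "problem accessing url (will keep trying until timeout)",
    timeout=timeout_sec,
    exc_info=True,  # the added comma also pins the exploded layout
)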

View File

@@ -45,6 +45,7 @@ r = rdb.RethinkDB()
 logger = structlog.get_logger()
+

 def add_common_options(arg_parser, argv=None):
     argv = argv or sys.argv
     arg_parser.add_argument(
@@ -119,12 +120,12 @@ def configure_logging(args):
             structlog.processors.StackInfoRenderer(),
             structlog.dev.set_exc_info,
             structlog.processors.TimeStamper(fmt="%Y-%m-%d %H:%M:%S", utc=False),
-            structlog.dev.ConsoleRenderer()
+            structlog.dev.ConsoleRenderer(),
         ],
         wrapper_class=structlog.make_filtering_bound_logger(args.log_level),
         context_class=dict,
         logger_factory=structlog.PrintLoggerFactory(),
-        cache_logger_on_first_use=False
+        cache_logger_on_first_use=False,
     )

     logging.basicConfig(
@@ -665,7 +666,9 @@ def brozzler_worker(argv=None):
             # make set from seed IDs in SKIP_AV_SEEDS_FILE
             with open(SKIP_AV_SEEDS_FILE) as skips:
                 skip_av_seeds = {int(l) for l in skips.readlines()}
-            logger.info("running with skip_av_seeds file", skip_av_seeds=SKIP_AV_SEEDS_FILE)
+            logger.info(
+                "running with skip_av_seeds file", skip_av_seeds=SKIP_AV_SEEDS_FILE
+            )
         except Exception as e:
             skip_av_seeds = set()
             logger.info("running with empty skip_av_seeds")
@@ -680,7 +683,7 @@ def brozzler_worker(argv=None):
             if ytdlp_proxy_endpoints:
                 logger.info(
                     "running with ytdlp proxy endpoints file",
-                    ytdlp_proxy_endpoints=YTDLP_PROXY_ENDPOINTS_FILE
+                    ytdlp_proxy_endpoints=YTDLP_PROXY_ENDPOINTS_FILE,
                 )
         except Exception as e:
             ytdlp_proxy_endpoints = []

View File

@@ -238,11 +238,14 @@ class BrozzlerEasyController:
         self.logger.info("starting brozzler-worker")
         self.brozzler_worker.start()

-        self.logger.info("starting pywb", address="%s:%s" % self.pywb_httpd.server_address)
+        self.logger.info(
+            "starting pywb", address="%s:%s" % self.pywb_httpd.server_address
+        )
         threading.Thread(target=self.pywb_httpd.serve_forever).start()

         self.logger.info(
-            "starting brozzler-dashboard", address="%s:%s" % self.dashboard_httpd.server_address
+            "starting brozzler-dashboard",
+            address="%s:%s" % self.dashboard_httpd.server_address,
         )
         threading.Thread(target=self.dashboard_httpd.serve_forever).start()

View File

@@ -50,9 +50,7 @@ class RethinkDbFrontier:
             self.rr.db_create(self.rr.dbname).run()
         tables = self.rr.table_list().run()
         if not "sites" in tables:
-            db_logger.info(
-                "creating rethinkdb table 'sites' in database"
-            )
+            db_logger.info("creating rethinkdb table 'sites' in database")
             self.rr.table_create(
                 "sites", shards=self.shards, replicas=self.replicas
             ).run()
@@ -61,9 +59,7 @@ class RethinkDbFrontier:
             ).run()
             self.rr.table("sites").index_create("job_id").run()
         if not "pages" in tables:
-            db_logger.info(
-                "creating rethinkdb table 'pages' in database"
-            )
+            db_logger.info("creating rethinkdb table 'pages' in database")
             self.rr.table_create(
                 "pages", shards=self.shards, replicas=self.replicas
             ).run()
@@ -83,9 +79,7 @@ class RethinkDbFrontier:
                 [r.row["site_id"], r.row["brozzle_count"], r.row["hops_from_seed"]],
             ).run()
         if not "jobs" in tables:
-            db_logger.info(
-                "creating rethinkdb table 'jobs' in database"
-            )
+            db_logger.info("creating rethinkdb table 'jobs' in database")
             self.rr.table_create(
                 "jobs", shards=self.shards, replicas=self.replicas
             ).run()

View File

@@ -38,6 +38,7 @@ from typing import Optional
 logger = structlog.get_logger()
+

 def load_schema():
     schema_file = os.path.join(os.path.dirname(__file__), "job_schema.yaml")
     with open(schema_file) as f:

View File

@@ -120,8 +120,7 @@ def is_permitted_by_robots(site, url, proxy=None):
             raise brozzler.ProxyError(e)
         else:
             structlog.get_logger().warning(
-                "returning true (permitted) after problem fetching "
-                "robots.txt",
+                "returning true (permitted) after problem fetching " "robots.txt",
                 url=url,
                 exception=e,
             )
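
Worth noting: black re-flowed the implicitly concatenated string onto one line but did not merge the two literals into one; joining adjacent literals is left to the author. The runtime value is identical either way, as this quick check shows:

# Adjacent string literals are concatenated by the Python compiler.
msg = "returning true (permitted) after problem fetching " "robots.txt"
assert msg == "returning true (permitted) after problem fetching robots.txt"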

View File

@@ -223,13 +223,11 @@ class BrozzlerWorker:
         request.type = "http"
         request.set_proxy(warcprox_address, "http")
         try:
             with urllib.request.urlopen(request, timeout=600) as response:
                 if response.getcode() != 204:
                     self.logger.warning(
-                        'got unexpected response on warcprox '
+                        "got unexpected response on warcprox "
                         "WARCPROX_WRITE_RECORD request (expected 204)",
                         code=response.getcode(),
                         reason=response.reason,
@@ -237,7 +235,7 @@ class BrozzlerWorker:
             return request, response
         except urllib.error.HTTPError as e:
             self.logger.warning(
-                'got unexpected response on warcprox '
+                "got unexpected response on warcprox "
                 "WARCPROX_WRITE_RECORD request (expected 204)",
                 code=e.getcode(),
                 reason=e.info(),
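
The two changes above are pure quote normalization: black rewrites single-quoted strings to double quotes, unless switching would force backslash escapes. Sketch:

# black's preferred form: double quotes.
part = "got unexpected response on warcprox "

# Left single-quoted, because the content itself contains double quotes.
quoted = 'a "double-quoted" fragment'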
@@ -326,9 +324,7 @@ class BrozzlerWorker:
                     url=page.url,
                 )
             else:
-                self.logger.exception(
-                    "youtube_dl raised exception", page=page
-                )
+                self.logger.exception("youtube_dl raised exception", page=page)
         return outlinks

     @metrics.brozzler_header_processing_duration_seconds.time()
@@ -581,9 +577,7 @@ class BrozzlerWorker:
                     page=page,
                 )
             else:
-                site_logger.exception(
-                    "unexpected exception", page=page
-                )
+                site_logger.exception("unexpected exception", page=page)
             if page:
                 # Calculate backoff in seconds based on number of failed attempts.
                 # Minimum of 60, max of 135 giving delays of 60, 90, 135, 135...
@@ -687,9 +681,7 @@ class BrozzlerWorker:
                 self._browser_pool.release(browsers[i])

     def run(self):
-        self.logger.warn(
-            "brozzler %s - brozzler-worker starting", brozzler.__version__
-        )
+        self.logger.warn("brozzler %s - brozzler-worker starting", brozzler.__version__)
         last_nothing_to_claim = 0
         try:
             while not self._shutdown.is_set():
@@ -698,7 +690,9 @@ class BrozzlerWorker:
                 try:
                     self._start_browsing_some_sites()
                 except brozzler.browser.NoBrowsersAvailable:
-                    self.logger.debug("all browsers are in use", max_browsers=self._max_browsers)
+                    self.logger.debug(
+                        "all browsers are in use", max_browsers=self._max_browsers
+                    )
                 except brozzler.NothingToClaim:
                     last_nothing_to_claim = time.time()
                     self.logger.debug(
@@ -709,9 +703,7 @@ class BrozzlerWorker:
             self.logger.warn("shutdown requested")
         except r.ReqlError as e:
-            self.logger.exception(
-                "caught rethinkdb exception, will try to proceed"
-            )
+            self.logger.exception("caught rethinkdb exception, will try to proceed")
         except brozzler.ShutdownRequested:
             self.logger.info("shutdown requested")
         except:
@@ -723,12 +715,11 @@ class BrozzlerWorker:
             try:
                 self._service_registry.unregister(self.status_info["id"])
             except:
-                self.logger.exception(
-                    "failed to unregister from service registry"
-                )
+                self.logger.exception("failed to unregister from service registry")

         self.logger.info(
-            "shutting down brozzling threads", thread_count=len(self._browsing_threads)
+            "shutting down brozzling threads",
+            thread_count=len(self._browsing_threads),
         )
         with self._browsing_threads_lock:
             for th in self._browsing_threads:

View File

@@ -42,6 +42,7 @@ YTDLP_MAX_REDIRECTS = 5
 logger = structlog.get_logger()
+

 def should_ytdlp(site, page, page_status, skip_av_seeds):
     # called only after we've passed needs_browsing() check
@@ -130,7 +131,9 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
             if result_type in ("url", "url_transparent"):
                 if "extraction_depth" in extra_info:
                     self.logger.info(
-                        f"Following redirect", redirect_url=ie_result['url'], extraction_depth=extra_info['extraction_depth']
+                        f"Following redirect",
+                        redirect_url=ie_result["url"],
+                        extraction_depth=extra_info["extraction_depth"],
                     )
                     extra_info["extraction_depth"] = 1 + extra_info.get(
                         "extraction_depth", 0
@@ -166,7 +169,7 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
                 except Exception as e:
                     extract_context.warning(
                         "failed to unroll entries ie_result['entries']?",
-                        exc_info=True
+                        exc_info=True,
                     )
                     ie_result["entries_no_dl"] = []
                     ie_result["entries"] = []
@@ -195,7 +198,11 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
                     mimetype = magic.from_file(info_dict["filepath"], mime=True)
                 except ImportError as e:
                     mimetype = "video/%s" % info_dict["ext"]
-                    self.logger.warning("guessing mimetype due to error", mimetype=mimetype, exc_info=True)
+                    self.logger.warning(
+                        "guessing mimetype due to error",
+                        mimetype=mimetype,
+                        exc_info=True,
+                    )

                 # youtube watch page postprocessor is MoveFiles
@@ -217,7 +224,7 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
                         format=info_dict["format"],
                         mimetype=mimetype,
                         size=size,
-                        warcprox=worker._proxy_for(site)
+                        warcprox=worker._proxy_for(site),
                     )
                     with open(info_dict["filepath"], "rb") as f:
                         # include content-length header to avoid chunked
@@ -268,7 +275,10 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):

     def ydl_postprocess_hook(d):
         if d["status"] == "finished":
-            worker.logger.info("[ydl_postprocess_hook] Finished postprocessing", postprocessor=d["postprocessor"])
+            worker.logger.info(
+                "[ydl_postprocess_hook] Finished postprocessing",
+                postprocessor=d["postprocessor"],
+            )
             is_youtube_host = isyoutubehost(d["info_dict"]["webpage_url"])
             metrics.brozzler_ydl_download_successes.labels(is_youtube_host).inc(1)
@@ -397,7 +407,10 @@ def _try_youtube_dl(worker, ydl, site, page):
             attempt += 1
             if attempt == max_attempts:
                 logger.warning(
-                    "Failed after %s attempt(s)", max_attempts, attempts=max_attempts, exc_info=True
+                    "Failed after %s attempt(s)",
+                    max_attempts,
+                    attempts=max_attempts,
+                    exc_info=True,
                 )
                 raise brozzler.VideoExtractorError(
                     "yt-dlp hit error extracting info for %s" % ydl.url
@@ -421,8 +434,7 @@ def _try_youtube_dl(worker, ydl, site, page):
         if worker._using_warcprox(site):
             info_json = json.dumps(ie_result, sort_keys=True, indent=4)
             logger.info(
-                "sending WARCPROX_WRITE_RECORD request to warcprox "
-                "with yt-dlp json",
+                "sending WARCPROX_WRITE_RECORD request to warcprox " "with yt-dlp json",
                 url=ydl.url,
             )
             worker._warcprox_write_record(

View File

@@ -76,7 +76,7 @@ setuptools.setup(
         "cryptography>=2.3",
         "python-magic>=0.4.15",
         "prometheus-client>=0.20.0",
-        "structlog>=25.1.0"
+        "structlog>=25.1.0",
     ],
     extras_require={
         "yt-dlp": ["yt-dlp>=2024.7.25"],