black formatting

Misty De Méo 2025-02-19 09:13:10 -08:00
parent 3cdefc0779
commit 32b90f7029
11 changed files with 55 additions and 52 deletions
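Every hunk below is mechanical output from the black formatter; no hand-written logic changes. The commit doesn't record how black was invoked (presumably a plain `black .` at the repository root), but as a hedged sketch, the same per-file verdict can be reproduced with black's public Python API, where format_file_contents raises NothingChanged for already-clean input. The path below is a placeholder, since this view doesn't show file names:

    import black

    # Ask black whether it would rewrite a file, the same check that
    # "black --check" performs. "some_module.py" is a stand-in path,
    # not a file from this commit.
    with open("some_module.py") as f:
        source = f.read()
    try:
        black.format_file_contents(source, fast=False, mode=black.Mode())
        print("black would reformat this file")
    except black.NothingChanged:
        print("already black-formatted")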

View file

@@ -184,7 +184,9 @@ class ThreadExceptionGate:
     def __enter__(self):
         assert self.thread == threading.current_thread()
         if self.pending_exception:
-            self.logger.info("raising pending exception", pending_exception=self.pending_exception)
+            self.logger.info(
+                "raising pending exception", pending_exception=self.pending_exception
+            )
             tmp = self.pending_exception
             self.pending_exception = None
             raise tmp
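This first hunk shows black's central rule: at its 12-space indent, the logging call runs past the default 88-column limit, so black wraps it and moves the arguments to their own indented line (no trailing comma, because the arguments still share one line). A minimal reproduction through black.format_str, wrapping the statement in a skeleton class so it sits at the same depth:

    import black

    # The over-long call at 12-space indent gets wrapped exactly as in the hunk.
    src = (
        "class ThreadExceptionGate:\n"
        "    def __enter__(self):\n"
        "        if self.pending_exception:\n"
        '            self.logger.info("raising pending exception", pending_exception=self.pending_exception)\n'
    )
    print(black.format_str(src, mode=black.Mode()), end="")
    # The last line comes back as:
    #             self.logger.info(
    #                 "raising pending exception", pending_exception=self.pending_exception
    #             )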

View file

@@ -717,9 +717,7 @@ class Browser:
                 # no links found
                 return frozenset()
         else:
-            self.logger.error(
-                "problem extracting outlinks", message=message
-            )
+            self.logger.error("problem extracting outlinks", message=message)
             return frozenset()
 
     def screenshot(self, full_page=False, timeout=45):

View file

@@ -268,7 +268,7 @@ class Chrome:
                 url_logger.warning(
                     "problem accessing url (will keep trying until timeout)",
                     timeout=timeout_sec,
-                    exc_info=True
+                    exc_info=True,
                 )
                 self._last_warning = time.time()
         finally:
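The only change in this hunk is a trailing comma. When a call has to stay in one-argument-per-line form, black appends a comma after the last argument, so adding another argument later touches a single line. Sketch of the rule:

    import black

    # Too long to collapse onto one line, so black keeps the call exploded
    # and adds the trailing comma after exc_info=True.
    src = (
        "url_logger.warning(\n"
        '    "problem accessing url (will keep trying until timeout)",\n'
        "    timeout=timeout_sec,\n"
        "    exc_info=True\n"
        ")\n"
    )
    print(black.format_str(src, mode=black.Mode()), end="")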

View file

@@ -45,6 +45,7 @@ r = rdb.RethinkDB()
 
 logger = structlog.get_logger()
 
+
 def add_common_options(arg_parser, argv=None):
     argv = argv or sys.argv
     arg_parser.add_argument(
@@ -119,12 +120,12 @@ def configure_logging(args):
             structlog.processors.StackInfoRenderer(),
             structlog.dev.set_exc_info,
             structlog.processors.TimeStamper(fmt="%Y-%m-%d %H:%M:%S", utc=False),
-            structlog.dev.ConsoleRenderer()
+            structlog.dev.ConsoleRenderer(),
         ],
         wrapper_class=structlog.make_filtering_bound_logger(args.log_level),
         context_class=dict,
         logger_factory=structlog.PrintLoggerFactory(),
-        cache_logger_on_first_use=False
+        cache_logger_on_first_use=False,
     )
 
     logging.basicConfig(
logging.basicConfig( logging.basicConfig(
@@ -665,7 +666,9 @@ def brozzler_worker(argv=None):
         # make set from seed IDs in SKIP_AV_SEEDS_FILE
         with open(SKIP_AV_SEEDS_FILE) as skips:
             skip_av_seeds = {int(l) for l in skips.readlines()}
-            logger.info("running with skip_av_seeds file", skip_av_seeds=SKIP_AV_SEEDS_FILE)
+            logger.info(
+                "running with skip_av_seeds file", skip_av_seeds=SKIP_AV_SEEDS_FILE
+            )
     except Exception as e:
         skip_av_seeds = set()
         logger.info("running with empty skip_av_seeds")
@@ -680,7 +683,7 @@ def brozzler_worker(argv=None):
         if ytdlp_proxy_endpoints:
             logger.info(
                 "running with ytdlp proxy endpoints file",
-                ytdlp_proxy_endpoints=YTDLP_PROXY_ENDPOINTS_FILE
+                ytdlp_proxy_endpoints=YTDLP_PROXY_ENDPOINTS_FILE,
             )
     except Exception as e:
         ytdlp_proxy_endpoints = []

View file

@@ -238,11 +238,14 @@ class BrozzlerEasyController:
         self.logger.info("starting brozzler-worker")
         self.brozzler_worker.start()
 
-        self.logger.info("starting pywb", address="%s:%s" % self.pywb_httpd.server_address)
+        self.logger.info(
+            "starting pywb", address="%s:%s" % self.pywb_httpd.server_address
+        )
         threading.Thread(target=self.pywb_httpd.serve_forever).start()
 
         self.logger.info(
-            "starting brozzler-dashboard", address="%s:%s" % self.dashboard_httpd.server_address
+            "starting brozzler-dashboard",
+            address="%s:%s" % self.dashboard_httpd.server_address,
         )
         threading.Thread(target=self.dashboard_httpd.serve_forever).start()
 

View file

@@ -50,9 +50,7 @@ class RethinkDbFrontier:
             self.rr.db_create(self.rr.dbname).run()
         tables = self.rr.table_list().run()
         if not "sites" in tables:
-            db_logger.info(
-                "creating rethinkdb table 'sites' in database"
-            )
+            db_logger.info("creating rethinkdb table 'sites' in database")
             self.rr.table_create(
                 "sites", shards=self.shards, replicas=self.replicas
             ).run()
@@ -61,9 +59,7 @@ class RethinkDbFrontier:
             ).run()
             self.rr.table("sites").index_create("job_id").run()
         if not "pages" in tables:
-            db_logger.info(
-                "creating rethinkdb table 'pages' in database"
-            )
+            db_logger.info("creating rethinkdb table 'pages' in database")
             self.rr.table_create(
                 "pages", shards=self.shards, replicas=self.replicas
             ).run()
@@ -83,9 +79,7 @@ class RethinkDbFrontier:
                 [r.row["site_id"], r.row["brozzle_count"], r.row["hops_from_seed"]],
             ).run()
         if not "jobs" in tables:
-            db_logger.info(
-                "creating rethinkdb table 'jobs' in database"
-            )
+            db_logger.info("creating rethinkdb table 'jobs' in database")
             self.rr.table_create(
                 "jobs", shards=self.shards, replicas=self.replicas
             ).run()
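These three hunks run the rule in the other direction: a call that was split across lines for no width reason is joined back onto one line, because the joined form fits within 88 columns and no trailing comma pins the multi-line layout. Sketch:

    import black

    # No magic trailing comma and the joined form fits, so black collapses it.
    src = "db_logger.info(\n    \"creating rethinkdb table 'sites' in database\"\n)\n"
    print(black.format_str(src, mode=black.Mode()), end="")
    # db_logger.info("creating rethinkdb table 'sites' in database")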

View file

@@ -38,6 +38,7 @@ from typing import Optional
 
 logger = structlog.get_logger()
 
+
 def load_schema():
     schema_file = os.path.join(os.path.dirname(__file__), "job_schema.yaml")
     with open(schema_file) as f:
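The one added line in this hunk is blank: black enforces PEP 8's two blank lines around module-level definitions, upgrading the single separating line (the identical one-line hunks in the other files above do the same). Sketch:

    import black

    # One blank line before a top-level def becomes two.
    src = "logger = structlog.get_logger()\n\ndef load_schema():\n    pass\n"
    print(black.format_str(src, mode=black.Mode()), end="")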

View file

@@ -120,8 +120,7 @@ def is_permitted_by_robots(site, url, proxy=None):
             raise brozzler.ProxyError(e)
         else:
             structlog.get_logger().warning(
-                "returning true (permitted) after problem fetching "
-                "robots.txt",
+                "returning true (permitted) after problem fetching " "robots.txt",
                 url=url,
                 exception=e,
             )
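The wrapped message here is two implicitly concatenated string literals. Black reflows them onto a single line once they fit within the limit, but it never fuses them into one literal, which is why the reformatted line still reads as two adjacent strings. Sketch (the trailing comma after exception=e keeps the call itself exploded):

    import black

    # Adjacent string literals are reflowed onto one line when they fit.
    src = (
        "structlog.get_logger().warning(\n"
        '    "returning true (permitted) after problem fetching "\n'
        '    "robots.txt",\n'
        "    url=url,\n"
        "    exception=e,\n"
        ")\n"
    )
    print(black.format_str(src, mode=black.Mode()), end="")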

View file

@@ -223,13 +223,11 @@ class BrozzlerWorker:
         request.type = "http"
         request.set_proxy(warcprox_address, "http")
 
         try:
-
             with urllib.request.urlopen(request, timeout=600) as response:
-
                 if response.getcode() != 204:
                     self.logger.warning(
-                        'got unexpected response on warcprox '
+                        "got unexpected response on warcprox "
                         "WARCPROX_WRITE_RECORD request (expected 204)",
                         code=response.getcode(),
                         reason=response.reason,
@@ -237,7 +235,7 @@ class BrozzlerWorker:
                     return request, response
         except urllib.error.HTTPError as e:
             self.logger.warning(
-                'got unexpected response on warcprox '
+                "got unexpected response on warcprox "
                 "WARCPROX_WRITE_RECORD request (expected 204)",
                 code=e.getcode(),
                 reason=e.info(),
@@ -326,9 +324,7 @@ class BrozzlerWorker:
                     url=page.url,
                 )
             else:
-                self.logger.exception(
-                    "youtube_dl raised exception", page=page
-                )
+                self.logger.exception("youtube_dl raised exception", page=page)
         return outlinks
 
     @metrics.brozzler_header_processing_duration_seconds.time()
@@ -581,9 +577,7 @@ class BrozzlerWorker:
                     page=page,
                 )
             else:
-                site_logger.exception(
-                    "unexpected exception", page=page
-                )
+                site_logger.exception("unexpected exception", page=page)
             if page:
                 # Calculate backoff in seconds based on number of failed attempts.
                 # Minimum of 60, max of 135 giving delays of 60, 90, 135, 135...
@@ -687,9 +681,7 @@ class BrozzlerWorker:
             self._browser_pool.release(browsers[i])
 
     def run(self):
-        self.logger.warn(
-            "brozzler %s - brozzler-worker starting", brozzler.__version__
-        )
+        self.logger.warn("brozzler %s - brozzler-worker starting", brozzler.__version__)
         last_nothing_to_claim = 0
         try:
             while not self._shutdown.is_set():
@@ -698,7 +690,9 @@ class BrozzlerWorker:
                 try:
                     self._start_browsing_some_sites()
                 except brozzler.browser.NoBrowsersAvailable:
-                    self.logger.debug("all browsers are in use", max_browsers=self._max_browsers)
+                    self.logger.debug(
+                        "all browsers are in use", max_browsers=self._max_browsers
+                    )
                 except brozzler.NothingToClaim:
                     last_nothing_to_claim = time.time()
                     self.logger.debug(
@@ -709,9 +703,7 @@ class BrozzlerWorker:
                     self.logger.warn("shutdown requested")
         except r.ReqlError as e:
-            self.logger.exception(
-                "caught rethinkdb exception, will try to proceed"
-            )
+            self.logger.exception("caught rethinkdb exception, will try to proceed")
         except brozzler.ShutdownRequested:
             self.logger.info("shutdown requested")
         except:
@@ -723,12 +715,11 @@ class BrozzlerWorker:
             try:
                 self._service_registry.unregister(self.status_info["id"])
             except:
-                self.logger.exception(
-                    "failed to unregister from service registry"
-                )
+                self.logger.exception("failed to unregister from service registry")
 
             self.logger.info(
-                "shutting down brozzling threads", thread_count=len(self._browsing_threads)
+                "shutting down brozzling threads",
+                thread_count=len(self._browsing_threads),
             )
             with self._browsing_threads_lock:
                 for th in self._browsing_threads:
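Several of the BrozzlerWorker hunks above change only quote style: black normalizes string literals to double quotes unless switching would force new escapes. Sketch:

    import black

    # The first literal flips to double quotes; the second keeps single quotes
    # because it contains double quotes that would otherwise need escaping.
    src = "a = 'got unexpected response'\nb = 'say \"hi\"'\n"
    print(black.format_str(src, mode=black.Mode()), end="")
    # a = "got unexpected response"
    # b = 'say "hi"'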

View file

@@ -42,6 +42,7 @@ YTDLP_MAX_REDIRECTS = 5
 
 logger = structlog.get_logger()
 
+
 def should_ytdlp(site, page, page_status, skip_av_seeds):
     # called only after we've passed needs_browsing() check
@@ -130,7 +131,9 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
             if result_type in ("url", "url_transparent"):
                 if "extraction_depth" in extra_info:
                     self.logger.info(
-                        f"Following redirect", redirect_url=ie_result['url'], extraction_depth=extra_info['extraction_depth']
+                        f"Following redirect",
+                        redirect_url=ie_result["url"],
+                        extraction_depth=extra_info["extraction_depth"],
                     )
                     extra_info["extraction_depth"] = 1 + extra_info.get(
                         "extraction_depth", 0
@@ -166,7 +169,7 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
                 except Exception as e:
                     extract_context.warning(
                         "failed to unroll entries ie_result['entries']?",
-                        exc_info=True
+                        exc_info=True,
                     )
                     ie_result["entries_no_dl"] = []
                     ie_result["entries"] = []
@@ -195,7 +198,11 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
                     mimetype = magic.from_file(info_dict["filepath"], mime=True)
                 except ImportError as e:
                     mimetype = "video/%s" % info_dict["ext"]
-                    self.logger.warning("guessing mimetype due to error", mimetype=mimetype, exc_info=True)
+                    self.logger.warning(
+                        "guessing mimetype due to error",
+                        mimetype=mimetype,
+                        exc_info=True,
+                    )
 
                 # youtube watch page postprocessor is MoveFiles
@@ -217,7 +224,7 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
                         format=info_dict["format"],
                         mimetype=mimetype,
                         size=size,
-                        warcprox=worker._proxy_for(site)
+                        warcprox=worker._proxy_for(site),
                     )
                     with open(info_dict["filepath"], "rb") as f:
                         # include content-length header to avoid chunked
@@ -268,7 +275,10 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
 
     def ydl_postprocess_hook(d):
         if d["status"] == "finished":
-            worker.logger.info("[ydl_postprocess_hook] Finished postprocessing", postprocessor=d["postprocessor"])
+            worker.logger.info(
+                "[ydl_postprocess_hook] Finished postprocessing",
+                postprocessor=d["postprocessor"],
+            )
             is_youtube_host = isyoutubehost(d["info_dict"]["webpage_url"])
 
             metrics.brozzler_ydl_download_successes.labels(is_youtube_host).inc(1)
@@ -397,7 +407,10 @@ def _try_youtube_dl(worker, ydl, site, page):
             attempt += 1
             if attempt == max_attempts:
                 logger.warning(
-                    "Failed after %s attempt(s)", max_attempts, attempts=max_attempts, exc_info=True
+                    "Failed after %s attempt(s)",
+                    max_attempts,
+                    attempts=max_attempts,
+                    exc_info=True,
                 )
                 raise brozzler.VideoExtractorError(
                     "yt-dlp hit error extracting info for %s" % ydl.url
@@ -421,8 +434,7 @@ def _try_youtube_dl(worker, ydl, site, page):
     if worker._using_warcprox(site):
         info_json = json.dumps(ie_result, sort_keys=True, indent=4)
         logger.info(
-            "sending WARCPROX_WRITE_RECORD request to warcprox "
-            "with yt-dlp json",
+            "sending WARCPROX_WRITE_RECORD request to warcprox " "with yt-dlp json",
             url=ydl.url,
         )
         worker._warcprox_write_record(

View file

@@ -76,7 +76,7 @@ setuptools.setup(
         "cryptography>=2.3",
         "python-magic>=0.4.15",
         "prometheus-client>=0.20.0",
-        "structlog>=25.1.0"
+        "structlog>=25.1.0",
     ],
     extras_require={
         "yt-dlp": ["yt-dlp>=2024.7.25"],