mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-20 23:56:34 -04:00
black formatting
This commit is contained in:
parent
3cdefc0779
commit
32b90f7029
@ -184,7 +184,9 @@ class ThreadExceptionGate:
|
||||
def __enter__(self):
|
||||
assert self.thread == threading.current_thread()
|
||||
if self.pending_exception:
|
||||
self.logger.info("raising pending exception", pending_exception=self.pending_exception)
|
||||
self.logger.info(
|
||||
"raising pending exception", pending_exception=self.pending_exception
|
||||
)
|
||||
tmp = self.pending_exception
|
||||
self.pending_exception = None
|
||||
raise tmp
|
||||
|
@ -717,9 +717,7 @@ class Browser:
|
||||
# no links found
|
||||
return frozenset()
|
||||
else:
|
||||
self.logger.error(
|
||||
"problem extracting outlinks", message=message
|
||||
)
|
||||
self.logger.error("problem extracting outlinks", message=message)
|
||||
return frozenset()
|
||||
|
||||
def screenshot(self, full_page=False, timeout=45):
|
||||
|
@ -268,7 +268,7 @@ class Chrome:
|
||||
url_logger.warning(
|
||||
"problem accessing url (will keep trying until timeout)",
|
||||
timeout=timeout_sec,
|
||||
exc_info=True
|
||||
exc_info=True,
|
||||
)
|
||||
self._last_warning = time.time()
|
||||
finally:
|
||||
|
@ -45,6 +45,7 @@ r = rdb.RethinkDB()
|
||||
|
||||
logger = structlog.get_logger()
|
||||
|
||||
|
||||
def add_common_options(arg_parser, argv=None):
|
||||
argv = argv or sys.argv
|
||||
arg_parser.add_argument(
|
||||
@ -119,12 +120,12 @@ def configure_logging(args):
|
||||
structlog.processors.StackInfoRenderer(),
|
||||
structlog.dev.set_exc_info,
|
||||
structlog.processors.TimeStamper(fmt="%Y-%m-%d %H:%M:%S", utc=False),
|
||||
structlog.dev.ConsoleRenderer()
|
||||
structlog.dev.ConsoleRenderer(),
|
||||
],
|
||||
wrapper_class=structlog.make_filtering_bound_logger(args.log_level),
|
||||
context_class=dict,
|
||||
logger_factory=structlog.PrintLoggerFactory(),
|
||||
cache_logger_on_first_use=False
|
||||
cache_logger_on_first_use=False,
|
||||
)
|
||||
|
||||
logging.basicConfig(
|
||||
@ -665,7 +666,9 @@ def brozzler_worker(argv=None):
|
||||
# make set from seed IDs in SKIP_AV_SEEDS_FILE
|
||||
with open(SKIP_AV_SEEDS_FILE) as skips:
|
||||
skip_av_seeds = {int(l) for l in skips.readlines()}
|
||||
logger.info("running with skip_av_seeds file", skip_av_seeds=SKIP_AV_SEEDS_FILE)
|
||||
logger.info(
|
||||
"running with skip_av_seeds file", skip_av_seeds=SKIP_AV_SEEDS_FILE
|
||||
)
|
||||
except Exception as e:
|
||||
skip_av_seeds = set()
|
||||
logger.info("running with empty skip_av_seeds")
|
||||
@ -680,7 +683,7 @@ def brozzler_worker(argv=None):
|
||||
if ytdlp_proxy_endpoints:
|
||||
logger.info(
|
||||
"running with ytdlp proxy endpoints file",
|
||||
ytdlp_proxy_endpoints=YTDLP_PROXY_ENDPOINTS_FILE
|
||||
ytdlp_proxy_endpoints=YTDLP_PROXY_ENDPOINTS_FILE,
|
||||
)
|
||||
except Exception as e:
|
||||
ytdlp_proxy_endpoints = []
|
||||
|
@ -238,11 +238,14 @@ class BrozzlerEasyController:
|
||||
self.logger.info("starting brozzler-worker")
|
||||
self.brozzler_worker.start()
|
||||
|
||||
self.logger.info("starting pywb", address="%s:%s" % self.pywb_httpd.server_address)
|
||||
self.logger.info(
|
||||
"starting pywb", address="%s:%s" % self.pywb_httpd.server_address
|
||||
)
|
||||
threading.Thread(target=self.pywb_httpd.serve_forever).start()
|
||||
|
||||
self.logger.info(
|
||||
"starting brozzler-dashboard", address="%s:%s" % self.dashboard_httpd.server_address
|
||||
"starting brozzler-dashboard",
|
||||
address="%s:%s" % self.dashboard_httpd.server_address,
|
||||
)
|
||||
threading.Thread(target=self.dashboard_httpd.serve_forever).start()
|
||||
|
||||
|
@ -50,9 +50,7 @@ class RethinkDbFrontier:
|
||||
self.rr.db_create(self.rr.dbname).run()
|
||||
tables = self.rr.table_list().run()
|
||||
if not "sites" in tables:
|
||||
db_logger.info(
|
||||
"creating rethinkdb table 'sites' in database"
|
||||
)
|
||||
db_logger.info("creating rethinkdb table 'sites' in database")
|
||||
self.rr.table_create(
|
||||
"sites", shards=self.shards, replicas=self.replicas
|
||||
).run()
|
||||
@ -61,9 +59,7 @@ class RethinkDbFrontier:
|
||||
).run()
|
||||
self.rr.table("sites").index_create("job_id").run()
|
||||
if not "pages" in tables:
|
||||
db_logger.info(
|
||||
"creating rethinkdb table 'pages' in database"
|
||||
)
|
||||
db_logger.info("creating rethinkdb table 'pages' in database")
|
||||
self.rr.table_create(
|
||||
"pages", shards=self.shards, replicas=self.replicas
|
||||
).run()
|
||||
@ -83,9 +79,7 @@ class RethinkDbFrontier:
|
||||
[r.row["site_id"], r.row["brozzle_count"], r.row["hops_from_seed"]],
|
||||
).run()
|
||||
if not "jobs" in tables:
|
||||
db_logger.info(
|
||||
"creating rethinkdb table 'jobs' in database"
|
||||
)
|
||||
db_logger.info("creating rethinkdb table 'jobs' in database")
|
||||
self.rr.table_create(
|
||||
"jobs", shards=self.shards, replicas=self.replicas
|
||||
).run()
|
||||
|
@ -38,6 +38,7 @@ from typing import Optional
|
||||
|
||||
logger = structlog.get_logger()
|
||||
|
||||
|
||||
def load_schema():
|
||||
schema_file = os.path.join(os.path.dirname(__file__), "job_schema.yaml")
|
||||
with open(schema_file) as f:
|
||||
|
@ -120,8 +120,7 @@ def is_permitted_by_robots(site, url, proxy=None):
|
||||
raise brozzler.ProxyError(e)
|
||||
else:
|
||||
structlog.get_logger().warning(
|
||||
"returning true (permitted) after problem fetching "
|
||||
"robots.txt",
|
||||
"returning true (permitted) after problem fetching " "robots.txt",
|
||||
url=url,
|
||||
exception=e,
|
||||
)
|
||||
|
@ -223,13 +223,11 @@ class BrozzlerWorker:
|
||||
request.type = "http"
|
||||
request.set_proxy(warcprox_address, "http")
|
||||
|
||||
|
||||
|
||||
try:
|
||||
with urllib.request.urlopen(request, timeout=600) as response:
|
||||
if response.getcode() != 204:
|
||||
self.logger.warning(
|
||||
'got unexpected response on warcprox '
|
||||
"got unexpected response on warcprox "
|
||||
"WARCPROX_WRITE_RECORD request (expected 204)",
|
||||
code=response.getcode(),
|
||||
reason=response.reason,
|
||||
@ -237,7 +235,7 @@ class BrozzlerWorker:
|
||||
return request, response
|
||||
except urllib.error.HTTPError as e:
|
||||
self.logger.warning(
|
||||
'got unexpected response on warcprox '
|
||||
"got unexpected response on warcprox "
|
||||
"WARCPROX_WRITE_RECORD request (expected 204)",
|
||||
code=e.getcode(),
|
||||
reason=e.info(),
|
||||
@ -326,9 +324,7 @@ class BrozzlerWorker:
|
||||
url=page.url,
|
||||
)
|
||||
else:
|
||||
self.logger.exception(
|
||||
"youtube_dl raised exception", page=page
|
||||
)
|
||||
self.logger.exception("youtube_dl raised exception", page=page)
|
||||
return outlinks
|
||||
|
||||
@metrics.brozzler_header_processing_duration_seconds.time()
|
||||
@ -581,9 +577,7 @@ class BrozzlerWorker:
|
||||
page=page,
|
||||
)
|
||||
else:
|
||||
site_logger.exception(
|
||||
"unexpected exception", page=page
|
||||
)
|
||||
site_logger.exception("unexpected exception", page=page)
|
||||
if page:
|
||||
# Calculate backoff in seconds based on number of failed attempts.
|
||||
# Minimum of 60, max of 135 giving delays of 60, 90, 135, 135...
|
||||
@ -687,9 +681,7 @@ class BrozzlerWorker:
|
||||
self._browser_pool.release(browsers[i])
|
||||
|
||||
def run(self):
|
||||
self.logger.warn(
|
||||
"brozzler %s - brozzler-worker starting", brozzler.__version__
|
||||
)
|
||||
self.logger.warn("brozzler %s - brozzler-worker starting", brozzler.__version__)
|
||||
last_nothing_to_claim = 0
|
||||
try:
|
||||
while not self._shutdown.is_set():
|
||||
@ -698,7 +690,9 @@ class BrozzlerWorker:
|
||||
try:
|
||||
self._start_browsing_some_sites()
|
||||
except brozzler.browser.NoBrowsersAvailable:
|
||||
self.logger.debug("all browsers are in use", max_browsers=self._max_browsers)
|
||||
self.logger.debug(
|
||||
"all browsers are in use", max_browsers=self._max_browsers
|
||||
)
|
||||
except brozzler.NothingToClaim:
|
||||
last_nothing_to_claim = time.time()
|
||||
self.logger.debug(
|
||||
@ -709,9 +703,7 @@ class BrozzlerWorker:
|
||||
|
||||
self.logger.warn("shutdown requested")
|
||||
except r.ReqlError as e:
|
||||
self.logger.exception(
|
||||
"caught rethinkdb exception, will try to proceed"
|
||||
)
|
||||
self.logger.exception("caught rethinkdb exception, will try to proceed")
|
||||
except brozzler.ShutdownRequested:
|
||||
self.logger.info("shutdown requested")
|
||||
except:
|
||||
@ -723,12 +715,11 @@ class BrozzlerWorker:
|
||||
try:
|
||||
self._service_registry.unregister(self.status_info["id"])
|
||||
except:
|
||||
self.logger.exception(
|
||||
"failed to unregister from service registry"
|
||||
)
|
||||
self.logger.exception("failed to unregister from service registry")
|
||||
|
||||
self.logger.info(
|
||||
"shutting down brozzling threads", thread_count=len(self._browsing_threads)
|
||||
"shutting down brozzling threads",
|
||||
thread_count=len(self._browsing_threads),
|
||||
)
|
||||
with self._browsing_threads_lock:
|
||||
for th in self._browsing_threads:
|
||||
|
@ -42,6 +42,7 @@ YTDLP_MAX_REDIRECTS = 5
|
||||
|
||||
logger = structlog.get_logger()
|
||||
|
||||
|
||||
def should_ytdlp(site, page, page_status, skip_av_seeds):
|
||||
# called only after we've passed needs_browsing() check
|
||||
|
||||
@ -130,7 +131,9 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
|
||||
if result_type in ("url", "url_transparent"):
|
||||
if "extraction_depth" in extra_info:
|
||||
self.logger.info(
|
||||
f"Following redirect", redirect_url=ie_result['url'], extraction_depth=extra_info['extraction_depth']
|
||||
f"Following redirect",
|
||||
redirect_url=ie_result["url"],
|
||||
extraction_depth=extra_info["extraction_depth"],
|
||||
)
|
||||
extra_info["extraction_depth"] = 1 + extra_info.get(
|
||||
"extraction_depth", 0
|
||||
@ -166,7 +169,7 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
|
||||
except Exception as e:
|
||||
extract_context.warning(
|
||||
"failed to unroll entries ie_result['entries']?",
|
||||
exc_info=True
|
||||
exc_info=True,
|
||||
)
|
||||
ie_result["entries_no_dl"] = []
|
||||
ie_result["entries"] = []
|
||||
@ -195,7 +198,11 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
|
||||
mimetype = magic.from_file(info_dict["filepath"], mime=True)
|
||||
except ImportError as e:
|
||||
mimetype = "video/%s" % info_dict["ext"]
|
||||
self.logger.warning("guessing mimetype due to error", mimetype=mimetype, exc_info=True)
|
||||
self.logger.warning(
|
||||
"guessing mimetype due to error",
|
||||
mimetype=mimetype,
|
||||
exc_info=True,
|
||||
)
|
||||
|
||||
# youtube watch page postprocessor is MoveFiles
|
||||
|
||||
@ -217,7 +224,7 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
|
||||
format=info_dict["format"],
|
||||
mimetype=mimetype,
|
||||
size=size,
|
||||
warcprox=worker._proxy_for(site)
|
||||
warcprox=worker._proxy_for(site),
|
||||
)
|
||||
with open(info_dict["filepath"], "rb") as f:
|
||||
# include content-length header to avoid chunked
|
||||
@ -268,7 +275,10 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
|
||||
|
||||
def ydl_postprocess_hook(d):
|
||||
if d["status"] == "finished":
|
||||
worker.logger.info("[ydl_postprocess_hook] Finished postprocessing", postprocessor=d["postprocessor"])
|
||||
worker.logger.info(
|
||||
"[ydl_postprocess_hook] Finished postprocessing",
|
||||
postprocessor=d["postprocessor"],
|
||||
)
|
||||
is_youtube_host = isyoutubehost(d["info_dict"]["webpage_url"])
|
||||
|
||||
metrics.brozzler_ydl_download_successes.labels(is_youtube_host).inc(1)
|
||||
@ -397,7 +407,10 @@ def _try_youtube_dl(worker, ydl, site, page):
|
||||
attempt += 1
|
||||
if attempt == max_attempts:
|
||||
logger.warning(
|
||||
"Failed after %s attempt(s)", max_attempts, attempts=max_attempts, exc_info=True
|
||||
"Failed after %s attempt(s)",
|
||||
max_attempts,
|
||||
attempts=max_attempts,
|
||||
exc_info=True,
|
||||
)
|
||||
raise brozzler.VideoExtractorError(
|
||||
"yt-dlp hit error extracting info for %s" % ydl.url
|
||||
@ -421,8 +434,7 @@ def _try_youtube_dl(worker, ydl, site, page):
|
||||
if worker._using_warcprox(site):
|
||||
info_json = json.dumps(ie_result, sort_keys=True, indent=4)
|
||||
logger.info(
|
||||
"sending WARCPROX_WRITE_RECORD request to warcprox "
|
||||
"with yt-dlp json",
|
||||
"sending WARCPROX_WRITE_RECORD request to warcprox " "with yt-dlp json",
|
||||
url=ydl.url,
|
||||
)
|
||||
worker._warcprox_write_record(
|
||||
|
Loading…
x
Reference in New Issue
Block a user