mirror of https://github.com/internetarchive/brozzler.git, synced 2025-04-18 14:56:02 -04:00

ruff linting fixes (#343)

* ruff linting fixes
* move imports back down to where they're re-exported

parent 6f011cc6c8
commit f64db214d4
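Most of the diff below is mechanical: the same few ruff rules are fixed over and over across the codebase. A condensed, hypothetical sketch of the recurring before/after patterns (the names and data here are illustrative, not brozzler code):

```python
record = {}

# E722: bare excepts are kept, but marked deliberate with a noqa comment.
try:
    record["status"]
except:  # noqa: E722
    pass

# E713: negated membership tests read better as `not in`.
if "status" not in record:  # was: if not "status" in record:
    record["status"] = "ACTIVE"

# E721 / E712: prefer isinstance() and `is` over type() / == comparisons.
value = True
assert isinstance(value, bool)  # was: type(value) == bool
claimed = False
assert claimed is False  # was: claimed == False

# E741: avoid the ambiguous single-letter name `l`.
words = [line.strip() for line in ["a\n", "b\n"]]  # was: [l for l in ...]

# F841: drop assignments whose result is never used.
sorted(words)  # was: result = sorted(words)
```

Imports that exist only to be re-exported or to probe for optional dependencies are kept and marked `# noqa: F401` rather than deleted, and module-level imports that must stay below other code get `# noqa: E402`.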
@@ -17,10 +17,13 @@ See the License for the specific language governing permissions and
limitations under the License.
"""

+import datetime
import logging
+import threading
from importlib.metadata import version as _version

import structlog
+import urlcanon

__version__ = _version("brozzler")
@@ -91,7 +94,7 @@ def _logging_handler_handle(self, record):
        finally:
            try:
                self.release()
-            except:
+            except:  # noqa: E722
                pass
    return rv
@@ -108,7 +111,6 @@ def behaviors(behaviors_dir=None):
        `js-templates/`. Defaults to brozzler dir.
    """
    import os
    import string

    import yaml
@@ -125,7 +127,6 @@ def behavior_script(url, template_parameters=None, behaviors_dir=None):
    """
    Returns the javascript behavior string populated with template_parameters.
    """
    import json
    import re

    logger = structlog.get_logger(logger_name=__name__)
@@ -194,8 +195,6 @@ class ThreadExceptionGate:
        return "<ThreadExceptionGate(%s)>" % self.thread


-import threading
-
_thread_exception_gates = {}
_thread_exception_gates_lock = threading.Lock()
@@ -225,7 +224,7 @@ def thread_exception_gate(thread=None):
        thread = threading.current_thread()

    with _thread_exception_gates_lock:
-        if not thread in _thread_exception_gates:
+        if thread not in _thread_exception_gates:
            _thread_exception_gates[thread] = ThreadExceptionGate(thread)

    return _thread_exception_gates[thread]
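The hunk above touches brozzler's per-thread exception-gate registry: a module-level dict guarded by a lock, where the E713 fix applies to the membership test. A minimal standalone sketch of the same pattern (`gate_for` and the `object()` placeholder are illustrative, not the real ThreadExceptionGate):

```python
import threading

_gates = {}
_gates_lock = threading.Lock()


def gate_for(thread=None):
    # default to the calling thread, as thread_exception_gate() does
    if thread is None:
        thread = threading.current_thread()
    with _gates_lock:
        if thread not in _gates:  # the E713-corrected membership test
            _gates[thread] = object()  # stands in for ThreadExceptionGate(thread)
        return _gates[thread]


assert gate_for() is gate_for()  # one gate per thread
```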
@@ -252,7 +251,6 @@ def thread_raise(thread, exctype):
    """
    import ctypes
    import inspect
-    import threading

    import structlog
@@ -322,9 +320,6 @@ def jinja2_environment(behaviors_dir=None):
    return _jinja2_env


-import urlcanon
-
-
def _remove_query(url):
    url.question_mark = b""
    url.query = b""
@@ -403,13 +398,10 @@ def suggest_default_chrome_exe():
    return "chromium-browser"


-import datetime
-
EPOCH_UTC = datetime.datetime.fromtimestamp(0.0, tz=datetime.timezone.utc)

-
-from brozzler.browser import Browser, BrowserPool, BrowsingException
-from brozzler.robots import is_permitted_by_robots
+from brozzler.browser import Browser, BrowserPool, BrowsingException  # noqa: E402
+from brozzler.robots import is_permitted_by_robots  # noqa: E402

__all__ = [
    "is_permitted_by_robots",
@@ -422,22 +414,25 @@ __all__ = [
    "suggest_default_chrome_exe",
]

+# TODO try using importlib.util.find_spec to test for dependency presence
+# rather than try/except on import.
+# See https://docs.astral.sh/ruff/rules/unused-import/#example
try:
-    import doublethink
+    import doublethink  # noqa: F401

    # All of these imports use doublethink for real and are unsafe
    # to do if doublethink is unavailable.
-    from brozzler.frontier import RethinkDbFrontier
+    from brozzler.frontier import RethinkDbFrontier  # noqa: F401
    from brozzler.model import (
-        InvalidJobConf,
-        Job,
-        Page,
-        Site,
-        new_job,
-        new_job_file,
-        new_site,
+        InvalidJobConf,  # noqa: F401
+        Job,  # noqa: F401
+        Page,  # noqa: F401
+        Site,  # noqa: F401
+        new_job,  # noqa: F401
+        new_job_file,  # noqa: F401
+        new_site,  # noqa: F401
    )
-    from brozzler.worker import BrozzlerWorker
+    from brozzler.worker import BrozzlerWorker  # noqa: F401

    __all__.extend(
        [
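The TODO added above points at importlib.util.find_spec as a way to test for a dependency without importing it, which would sidestep the unused-import warnings entirely. A minimal sketch of that approach, assuming the same doublethink dependency (the HAVE_DOUBLETHINK flag is made up for illustration):

```python
import importlib.util

# find_spec() returns None when the package is not installed,
# and it does not actually import the package.
HAVE_DOUBLETHINK = importlib.util.find_spec("doublethink") is not None

if HAVE_DOUBLETHINK:
    import doublethink  # noqa: F401
```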
@@ -18,7 +18,6 @@ limitations under the License.

import base64
import datetime
import itertools
import json
import logging
import socket
@@ -213,7 +212,7 @@ class WebsockReceiverThread(threading.Thread):
    def _on_message(self, websock, message):
        try:
            self._handle_message(websock, message)
-        except:
+        except:  # noqa: E722
            self.logger.exception(
                "uncaught exception in _handle_message",
                message=message,
@@ -430,7 +429,7 @@ class Browser:
            self.logger.info("shutting down websocket connection")
            try:
                self.websock.close()
-            except BaseException as e:
+            except BaseException:
                self.logger.exception(
                    "exception closing websocket", websocket=self.websock
                )
@@ -458,7 +457,7 @@ class Browser:
                )

                self.websock_url = None
-        except:
+        except:  # noqa: E722
            self.logger.exception("problem stopping")

    def is_running(self):
@@ -628,7 +627,7 @@ class Browser:
                jpeg_bytes = self.screenshot(full_page)
                on_screenshot(jpeg_bytes)
                return
-            except BrowsingTimeout as e:
+            except BrowsingTimeout:
                self.logger.exception("attempt %s/3", i + 1)

    def visit_hashtags(self, page_url, hashtags, outlinks):
@@ -807,12 +806,12 @@ class Browser:
                if (
                    msg
                    and "result" in msg
-                    and not ("exceptionDetails" in msg["result"])
+                    and "exceptionDetails" not in msg["result"]
                    and not (
                        "wasThrown" in msg["result"] and msg["result"]["wasThrown"]
                    )
                    and "result" in msg["result"]
-                    and type(msg["result"]["result"]["value"]) == bool
+                    and isinstance(msg["result"]["result"]["value"], bool)
                    and msg["result"]["result"]["value"]
                ):
                    self.logger.info("behavior decided it has finished")
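The long condition above walks a Chrome DevTools Protocol evaluate-result several dicts deep; the two fixes in it are E713 (`not in`) and E721 (isinstance). A hypothetical compaction of the same checks with dict.get(), shown only to make the logic easier to follow (`behavior_finished` is not brozzler's actual code):

```python
def behavior_finished(msg):
    # msg mirrors the shape of a CDP Runtime.evaluate response
    result = (msg or {}).get("result", {})
    if "exceptionDetails" in result or result.get("wasThrown"):
        return False
    value = result.get("result", {}).get("value")
    return isinstance(value, bool) and value


assert behavior_finished({"result": {"result": {"value": True}}}) is True
assert behavior_finished({"result": {"wasThrown": True}}) is False
```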
@@ -265,7 +265,7 @@ class Chrome:
                    return url
            except brozzler.ShutdownRequested:
                raise
-            except Exception as e:
+            except Exception:
                if time.time() - self._last_warning > 30:
                    url_logger.warning(
                        "problem accessing url (will keep trying until timeout)",
@@ -325,7 +325,7 @@ class Chrome:
                    self.logger.debug(
                        "chrome pid %s STDERR %s", self.chrome_process.pid, buf
                    )
-        except:
+        except:  # noqa: E722
            self.logger.exception("unexpected exception")

    def stop(self):
@@ -378,7 +378,7 @@ class Chrome:
            self.chrome_process.stderr.close()
        try:
            self._home_tmpdir.cleanup()
-        except:
+        except:  # noqa: E722
            self.logger.exception(
                "exception deleting self._home_tmpdir", tmpdir=self._home_tmpdir
            )
@@ -23,12 +23,10 @@ import datetime
import json
import logging
import os
import re
import signal
import string
import sys
import threading
import time
import traceback
import warnings
@@ -397,9 +395,9 @@ def brozzle_page(argv=None):
            enable_youtube_dl=not worker._skip_youtube_dl,
        )
        logger.info("outlinks", outlinks=sorted(outlinks))
-    except brozzler.ReachedLimit as e:
+    except brozzler.ReachedLimit:
        logger.exception("reached limit")
-    except brozzler.PageInterstitialShown as e:
+    except brozzler.PageInterstitialShown:
        logger.exception("page interstitial shown")
    finally:
        browser.stop()
@@ -661,7 +659,7 @@ def brozzler_worker(argv=None):
            logger.info(
                "dumping state (caught signal)\n%s", signal=signum, state=state_strs
            )
-        except BaseException as e:
+        except BaseException:
            logger.exception("exception dumping state")
        finally:
            signal.signal(signal.SIGQUIT, dump_state)
@@ -672,11 +670,11 @@ def brozzler_worker(argv=None):
        try:
            # make set from seed IDs in SKIP_AV_SEEDS_FILE
            with open(SKIP_AV_SEEDS_FILE) as skips:
-                skip_av_seeds = {int(l) for l in skips.readlines()}
+                skip_av_seeds = {int(line) for line in skips.readlines()}
            logger.info(
                "running with skip_av_seeds file", skip_av_seeds=SKIP_AV_SEEDS_FILE
            )
-        except Exception as e:
+        except Exception:
            skip_av_seeds = set()
            logger.info("running with empty skip_av_seeds")
        return skip_av_seeds
@@ -686,13 +684,13 @@ def brozzler_worker(argv=None):
        try:
            # make list from file
            with open(YTDLP_PROXY_ENDPOINTS_FILE) as endpoints:
-                ytdlp_proxy_endpoints = [l for l in endpoints.readlines()]
+                ytdlp_proxy_endpoints = [line for line in endpoints.readlines()]
            if ytdlp_proxy_endpoints:
                logger.info(
                    "running with ytdlp proxy endpoints file",
                    ytdlp_proxy_endpoints=YTDLP_PROXY_ENDPOINTS_FILE,
                )
-        except Exception as e:
+        except Exception:
            ytdlp_proxy_endpoints = []
            logger.info("running with empty proxy endpoints file")
        return ytdlp_proxy_endpoints
@@ -1032,7 +1030,7 @@ def brozzler_purge(argv=None):
    configure_logging(args)

    rr = rethinker(args)
-    frontier = brozzler.RethinkDbFrontier(rr)
+    brozzler.RethinkDbFrontier(rr)
    if args.job:
        try:
            job_id = int(args.job)
@@ -17,9 +17,14 @@ See the License for the specific language governing permissions and
limitations under the License.
"""

+import base64
+import os
import sys

+import doublethink
+import rethinkdb as rdb
import structlog
+import yaml

logger = structlog.get_logger(logger_name=__name__)
@@ -33,14 +38,6 @@ except ImportError as e:
        e,
    )
    sys.exit(1)
-import base64
-import importlib
-import json
-import os
-
-import doublethink
-import rethinkdb as rdb
-import yaml

r = rdb.RethinkDB()
@@ -285,6 +282,8 @@ def root(path):


try:
+    import logging
+
    import gunicorn.app.base
    import gunicorn.glogging
    from gunicorn.six import iteritems
@@ -18,10 +18,22 @@ See the License for the specific language governing permissions and
limitations under the License.
"""

+import argparse
+import os
+import signal
+import socket
+import socketserver
import sys
+import threading
+import time
+import traceback

+import doublethink
import structlog
+
+import brozzler
+import brozzler.cli

logger = structlog.get_logger(logger_name=__name__)

try:
@@ -42,19 +54,6 @@ except ImportError as e:
        exc_info=True,
    )
    sys.exit(1)
-import argparse
-import os
-import signal
-import socket
-import socketserver
-import threading
-import time
-import traceback
-
-import doublethink
-
-import brozzler
-import brozzler.cli


def _build_arg_parser(argv=None):
@@ -16,10 +16,6 @@ See the License for the specific language governing permissions and
limitations under the License.
"""

-import datetime
-import random
-import time
-
import doublethink
import rethinkdb as rdb
import structlog
@@ -47,11 +43,11 @@ class RethinkDbFrontier:
        db_logger = self.logger.bind(dbname=self.rr.dbname)

        dbs = self.rr.db_list().run()
-        if not self.rr.dbname in dbs:
+        if self.rr.dbname not in dbs:
            db_logger.info("creating rethinkdb database")
            self.rr.db_create(self.rr.dbname).run()
        tables = self.rr.table_list().run()
-        if not "sites" in tables:
+        if "sites" not in tables:
            db_logger.info("creating rethinkdb table 'sites' in database")
            self.rr.table_create(
                "sites", shards=self.shards, replicas=self.replicas
@@ -60,7 +56,7 @@ class RethinkDbFrontier:
            "sites_last_disclaimed", [r.row["status"], r.row["last_disclaimed"]]
        ).run()
        self.rr.table("sites").index_create("job_id").run()
-        if not "pages" in tables:
+        if "pages" not in tables:
            db_logger.info("creating rethinkdb table 'pages' in database")
            self.rr.table_create(
                "pages", shards=self.shards, replicas=self.replicas
@@ -80,7 +76,7 @@ class RethinkDbFrontier:
            "least_hops",
            [r.row["site_id"], r.row["brozzle_count"], r.row["hops_from_seed"]],
        ).run()
-        if not "jobs" in tables:
+        if "jobs" not in tables:
            db_logger.info("creating rethinkdb table 'jobs' in database")
            self.rr.table_create(
                "jobs", shards=self.shards, replicas=self.replicas
@@ -352,7 +348,6 @@ class RethinkDbFrontier:
        site.save()

    def _build_fresh_page(self, site, parent_page, url, hops_off=0):
        url_for_scoping = urlcanon.semantic(url)
        url_for_crawling = urlcanon.whatwg(url)
        hashtag = (url_for_crawling.hash_sign + url_for_crawling.fragment).decode(
            "utf-8"
@@ -461,8 +456,8 @@ class RethinkDbFrontier:
        # "rethinkdb.errors.ReqlDriverError: Query size (167883036) greater than maximum (134217727) in:"
        # there can be many pages and each one can be very large (many videos,
        # in and out of scope links, etc)
-        l = list(pages.values())
-        for batch in (l[i : i + 50] for i in range(0, len(l), 50)):
+        pages_list = list(pages.values())
+        for batch in (pages_list[i : i + 50] for i in range(0, len(pages_list), 50)):
            try:
                self.logger.debug("inserting/replacing batch of %s pages", len(batch))
                reql = self.rr.table("pages").insert(batch, conflict="replace")
@@ -471,8 +466,8 @@ class RethinkDbFrontier:
                    'conflict="replace")',
                    batch,
                )
-                result = reql.run()
-            except Exception as e:
+                reql.run()
+            except Exception:
                self.logger.exception(
                    "problem inserting/replacing batch of %s pages",
                    len(batch),
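The comments in these frontier hunks explain the batching: a single insert of every page can exceed RethinkDB's maximum query size (134217727 bytes, per the error quoted above), so pages are written in batches of 50. The slicing idiom as a standalone sketch (the numbers are illustrative):

```python
# Split a list into fixed-size batches, as the frontier code above
# does with pages before inserting them into RethinkDB.
pages_list = list(range(107))  # stand-in for list(pages.values())
batches = [pages_list[i : i + 50] for i in range(0, len(pages_list), 50)]

assert [len(b) for b in batches] == [50, 50, 7]  # last batch is the remainder
```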
@@ -19,12 +19,9 @@ limitations under the License.

import base64
import copy
import datetime
import hashlib
import json
import os
import re
import time
import urllib
import uuid
import zlib
|
||||
# debugged, I found it here. Maybe there's a better way to see it.
|
||||
value = validator._errors[0].info[0][0].info[0][0].value
|
||||
self.errors["bad value"] = value
|
||||
except:
|
||||
except: # noqa: E722
|
||||
value = None
|
||||
|
||||
|
||||
@@ -122,10 +119,10 @@ def new_job(frontier, job_conf):
    # rethinkdb.errors.ReqlDriverError: Query size (167883036) greater than maximum (134217727) in:
    for batch in (pages[i : i + 500] for i in range(0, len(pages), 500)):
        logger.info("inserting batch of %s pages", len(batch))
-        result = frontier.rr.table("pages").insert(batch).run()
+        frontier.rr.table("pages").insert(batch).run()
    for batch in (sites[i : i + 100] for i in range(0, len(sites), 100)):
        logger.info("inserting batch of %s sites", len(batch))
-        result = frontier.rr.table("sites").insert(batch).run()
+        frontier.rr.table("sites").insert(batch).run()
    logger.info("job fully started", job_id=job.id)

    return job
@@ -200,9 +197,9 @@ class Job(doublethink.Document, ElapsedMixIn):
    table = "jobs"

    def populate_defaults(self):
-        if not "status" in self:
+        if "status" not in self:
            self.status = "ACTIVE"
-        if not "starts_and_stops" in self:
+        if "starts_and_stops" not in self:
            if self.get("started"):  # backward compatibility
                self.starts_and_stops = [
                    {"start": self.get("started"), "stop": self.get("finished")}
@@ -229,28 +226,28 @@ class Site(doublethink.Document, ElapsedMixIn):
    table = "sites"

    def populate_defaults(self):
-        if not "status" in self:
+        if "status" not in self:
            self.status = "ACTIVE"
-        if not "claimed" in self:
+        if "claimed" not in self:
            self.claimed = False
-        if not "last_disclaimed" in self:
+        if "last_disclaimed" not in self:
            self.last_disclaimed = brozzler.EPOCH_UTC
-        if not "last_claimed" in self:
+        if "last_claimed" not in self:
            self.last_claimed = brozzler.EPOCH_UTC
-        if not "scope" in self:
+        if "scope" not in self:
            self.scope = {}
-        if not "skip_ytdlp" in self:
+        if "skip_ytdlp" not in self:
            self.skip_ytdlp = None

        # backward compatibility
        if "surt" in self.scope:
-            if not "accepts" in self.scope:
+            if "accepts" not in self.scope:
                self.scope["accepts"] = []
            self.scope["accepts"].append({"surt": self.scope["surt"]})
            del self.scope["surt"]

        # backward compatibility
-        if "max_hops_off_surt" in self.scope and not "max_hops_off" in self.scope:
+        if "max_hops_off_surt" in self.scope and "max_hops_off" not in self.scope:
            self.scope["max_hops_off"] = self.scope["max_hops_off_surt"]
        if "max_hops_off_surt" in self.scope:
            del self.scope["max_hops_off_surt"]
@@ -260,7 +257,7 @@ class Site(doublethink.Document, ElapsedMixIn):
                brozzler.site_surt_canon(self.seed).ssurt().decode("ascii")
            )

-        if not "starts_and_stops" in self:
+        if "starts_and_stops" not in self:
            if self.get("start_time"):  # backward compatibility
                self.starts_and_stops = [
                    {"start": self.get("start_time"), "stop": None}
|
||||
return 'Site({"id":"%s","seed":"%s",...})' % (self.id, self.seed)
|
||||
|
||||
def _accept_ssurt_if_not_redundant(self, ssurt):
|
||||
if not "accepts" in self.scope:
|
||||
if "accepts" not in self.scope:
|
||||
self.scope["accepts"] = []
|
||||
simple_rule_ssurts = (
|
||||
rule["ssurt"]
|
||||
@ -334,7 +331,7 @@ class Site(doublethink.Document, ElapsedMixIn):
|
||||
if not isinstance(url, urlcanon.ParsedUrl):
|
||||
url = urlcanon.semantic(url)
|
||||
|
||||
if not url.scheme in (b"http", b"https"):
|
||||
if url.scheme not in (b"http", b"https"):
|
||||
# XXX doesn't belong here maybe (where? worker ignores unknown
|
||||
# schemes?)
|
||||
return False
|
||||
@ -390,31 +387,31 @@ class Page(doublethink.Document):
|
||||
return hashlib.sha1(digest_this.encode("utf-8")).hexdigest()
|
||||
|
||||
def populate_defaults(self):
|
||||
if not "retry_after" in self:
|
||||
if "retry_after" not in self:
|
||||
self.retry_after = None
|
||||
if not "failed_attempts" in self:
|
||||
if "failed_attempts" not in self:
|
||||
self.failed_attempts = 0
|
||||
if not "hops_from_seed" in self:
|
||||
if "hops_from_seed" not in self:
|
||||
self.hops_from_seed = 0
|
||||
if not "hop_path" in self:
|
||||
if "hop_path" not in self:
|
||||
self.hop_path = None
|
||||
if not "via_page_url" in self:
|
||||
if "via_page_url" not in self:
|
||||
self.via_page_url = None
|
||||
if not "brozzle_count" in self:
|
||||
if "brozzle_count" not in self:
|
||||
self.brozzle_count = 0
|
||||
if not "claimed" in self:
|
||||
if "claimed" not in self:
|
||||
self.claimed = False
|
||||
if "hops_off_surt" in self and not "hops_off" in self:
|
||||
if "hops_off_surt" in self and "hops_off" not in self:
|
||||
self.hops_off = self.hops_off_surt
|
||||
if "hops_off_surt" in self:
|
||||
del self["hops_off_surt"]
|
||||
if not "hops_off" in self:
|
||||
if "hops_off" not in self:
|
||||
self.hops_off = 0
|
||||
if not "needs_robots_check" in self:
|
||||
if "needs_robots_check" not in self:
|
||||
self.needs_robots_check = False
|
||||
if not "priority" in self:
|
||||
if "priority" not in self:
|
||||
self.priority = self._calc_priority()
|
||||
if not "id" in self:
|
||||
if "id" not in self:
|
||||
self.id = self.compute_id(self.site_id, self.url)
|
||||
|
||||
def __str__(self):
|
||||
|
@@ -18,9 +18,16 @@ See the License for the specific language governing permissions and
limitations under the License.
"""

+import argparse
+import json
import sys

+import doublethink
+import rethinkdb as rdb
import structlog
+import urlcanon
+
+import brozzler

logger = structlog.get_logger(logger_name=__name__)
@@ -40,14 +47,7 @@ except ImportError as e:
        e,
    )
    sys.exit(1)
-import argparse
-import json
-
-import doublethink
-import rethinkdb as rdb
-import urlcanon

-import brozzler

r = rdb.RethinkDB()
@@ -137,7 +137,7 @@ class TheGoodUrlCanonicalizer(object):
            key = urlcanon.semantic(url).surt().decode("ascii")
            # logging.debug('%s -> %s', url, key)
            return key
-        except Exception as e:
+        except Exception:
            return url

    def replace_default_canonicalizer():
@@ -221,18 +221,9 @@ def support_in_progress_warcs():

class SomeWbUrl(pywb.rewrite.wburl.WbUrl):
    def __init__(self, orig_url):
        import re

-        import six
-        from pywb.rewrite.wburl import WbUrl
-        from pywb.utils.loaders import to_native_str
-        from six.moves.urllib.parse import (
-            quote,
-            quote_plus,
-            unquote_plus,
-            urlsplit,
-            urlunsplit,
-        )
+        from six.moves.urllib.parse import quote

        pywb.rewrite.wburl.BaseWbUrl.__init__(self)
@@ -320,7 +311,6 @@ def _fuzzy_query_call(self, query):
    urlkey = to_native_str(query.key, "utf-8")
    url = query.url
    filter_ = query.filters
    output = query.output

    for rule in self.rules.iter_matching(urlkey):
        m = rule.regex.search(urlkey)
@@ -71,7 +71,7 @@ _robots_caches = {}  # {site_id:reppy.cache.RobotsCache}


def _robots_cache(site, proxy=None):
-    if not site.id in _robots_caches:
+    if site.id not in _robots_caches:
        req_sesh = _SessionRaiseOn420()
        req_sesh.verify = False  # ignore cert errors
        if proxy:
@@ -21,9 +21,7 @@ limitations under the License.
import datetime
import io
import json
import random
import socket
import tempfile
import threading
import time
import urllib.request
@@ -99,9 +97,13 @@ class BrozzlerWorker:
        self._skip_visit_hashtags = skip_visit_hashtags
        self._skip_youtube_dl = skip_youtube_dl

+        # TODO try using importlib.util.find_spec to test for dependency
+        # presence rather than try/except on import.
+        # See https://docs.astral.sh/ruff/rules/unused-import/#example
+
        # We definitely shouldn't ytdlp if the optional extra is missing
        try:
-            import yt_dlp
+            import yt_dlp  # noqa: F401
        except ImportError:
            self.logger.info(
                "optional yt-dlp extra not installed; setting skip_youtube_dl to True"
@@ -200,7 +202,7 @@ class BrozzlerWorker:
            response = requests.get("http://%s/status" % self._proxy)
            status = json.loads(response.text)
            self._proxy_is_warcprox = status["role"] == "warcprox"
-        except Exception as e:
+        except Exception:
            self._proxy_is_warcprox = False
        self.logger.info(
            "%s %s warcprox",
@@ -348,13 +350,13 @@ class BrozzlerWorker:
                )
                metrics.brozzler_ydl_urls_checked.inc(1)
                outlinks.update(ydl_outlinks)
-            except brozzler.ReachedLimit as e:
+            except brozzler.ReachedLimit:
                raise
            except brozzler.ShutdownRequested:
                raise
            except brozzler.ProxyError:
                raise
-            except brozzler.VideoExtractorError as e:
+            except brozzler.VideoExtractorError:
                self.logger.exception("error extracting video info")
            except Exception as e:
                if (
@@ -391,9 +393,9 @@ class BrozzlerWorker:
                timeout=self.HEADER_REQUEST_TIMEOUT,
            ) as r:
                return r.headers
-        except requests.exceptions.Timeout as e:
+        except requests.exceptions.Timeout:
            url_logger.warning("Timed out trying to get headers", exc_info=True)
-        except requests.exceptions.RequestException as e:
+        except requests.exceptions.RequestException:
            url_logger.warning("Failed to get headers", exc_info=True)
        return {}
||||
@ -469,7 +471,7 @@ class BrozzlerWorker:
|
||||
if "content-range" in response_headers:
|
||||
video["content-range"] = response_headers["content-range"]
|
||||
self.logger.debug("embedded video", video=video)
|
||||
if not "videos" in page:
|
||||
if "videos" not in page:
|
||||
page.videos = []
|
||||
page.videos.append(video)
|
||||
|
||||
@@ -598,13 +600,13 @@ class BrozzlerWorker:
                site_logger.info("no pages left for site")
        except brozzler.ReachedLimit as e:
            self._frontier.reached_limit(site, e)
-        except brozzler.ReachedTimeLimit as e:
+        except brozzler.ReachedTimeLimit:
            self._frontier.finished(site, "FINISHED_TIME_LIMIT")
        except brozzler.CrawlStopped:
            self._frontier.finished(site, "FINISHED_STOP_REQUESTED")
        # except brozzler.browser.BrowsingAborted:
        #     self.logger.info("{} shut down".format(browser))
-        except brozzler.ProxyError as e:
+        except brozzler.ProxyError:
            if self._warcprox_auto:
                self.logger.exception(
                    "proxy error, will try to choose a "
@@ -676,7 +678,7 @@ class BrozzlerWorker:
        try:
            self.status_info = self._service_registry.heartbeat(status_info)
            self.logger.debug("status in service registry", status=self.status_info)
-        except r.ReqlError as e:
+        except r.ReqlError:
            self.logger.exception(
                "failed to send heartbeat and update service registry",
                info=status_info,
@@ -748,11 +750,11 @@ class BrozzlerWorker:
                    time.sleep(0.5)

                self.logger.warn("shutdown requested")
-        except r.ReqlError as e:
+        except r.ReqlError:
            self.logger.exception("caught rethinkdb exception, will try to proceed")
        except brozzler.ShutdownRequested:
            self.logger.info("shutdown requested")
-        except:
+        except:  # noqa: E722
            self.logger.critical(
                "thread exiting due to unexpected exception", exc_info=True
            )
@@ -760,7 +762,7 @@ class BrozzlerWorker:
        if self._service_registry and hasattr(self, "status_info"):
            try:
                self._service_registry.unregister(self.status_info["id"])
-            except:
+            except:  # noqa: E722
                self.logger.exception("failed to unregister from service registry")

        self.logger.info(
@@ -101,7 +101,7 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
            if result_type in ("url", "url_transparent"):
                if "extraction_depth" in extra_info:
                    self.logger.info(
-                        f"Following redirect",
+                        "Following redirect",
                        redirect_url=ie_result["url"],
                        extraction_depth=extra_info["extraction_depth"],
                    )
@@ -136,7 +136,7 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
            # use it later to extract the watch pages as outlinks.
            try:
                ie_result["entries_no_dl"] = list(ie_result["entries"])
-            except Exception as e:
+            except Exception:
                extract_context.warning(
                    "failed to unroll entries ie_result['entries']?",
                    exc_info=True,
@@ -166,7 +166,7 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
                import magic

                mimetype = magic.from_file(info_dict["filepath"], mime=True)
-            except ImportError as e:
+            except ImportError:
                mimetype = "video/%s" % info_dict["ext"]
                self.logger.warning(
                    "guessing mimetype due to error",
@@ -236,7 +236,7 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
                )
                site.last_claimed = doublethink.utcnow()
                site.save()
-            except:
+            except:  # noqa: E722
                worker.logger.debug(
                    "problem heartbeating site.last_claimed site",
                    id=site.id,
@@ -316,7 +316,7 @@ def _remember_videos(page, pushed_videos=None):
    """
    Saves info about videos captured by yt-dlp in `page.videos`.
    """
-    if not "videos" in page:
+    if "videos" not in page:
        page.videos = []
    for pushed_video in pushed_videos or []:
        video = {
@@ -351,7 +351,7 @@ def _try_youtube_dl(worker, ydl, site, page):
            )
            metrics.brozzler_ydl_extract_successes.labels(ydl.is_youtube_host).inc(1)
            break
-        except brozzler.ShutdownRequested as e:
+        except brozzler.ShutdownRequested:
            raise
        except Exception as e:
            if (
@@ -110,7 +110,7 @@ def test_httpd(httpd):
    of the same url return the same payload, proving it can be used to test
    deduplication.
    """
-    payload1 = content2 = None
+    payload1 = None
    url = "http://localhost:%s/site1/file1.txt" % httpd.server_port
    with urllib.request.urlopen(url) as response:
        assert response.status == 200
@@ -175,7 +175,6 @@ def test_420(httpd):

def test_js_dialogs(httpd):
    chrome_exe = brozzler.suggest_default_chrome_exe()
    url = "http://localhost:%s/site4/alert.html" % httpd.server_port
    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        # before commit d2ed6b97a24 these would hang and eventually raise
        # brozzler.browser.BrowsingTimeout, which would cause this test to fail
@@ -40,11 +40,11 @@ def cli_commands():
    commands = set(console_scripts().keys())
    commands.remove("brozzler-wayback")
    try:
-        import gunicorn
+        import gunicorn  # noqa: F401
    except ImportError:
        commands.remove("brozzler-dashboard")
    try:
-        import pywb
+        import pywb  # noqa: F401
    except ImportError:
        commands.remove("brozzler-easy")
    return commands
|
@ -23,7 +23,6 @@ import http.server
|
||||
import os
|
||||
import socket
|
||||
import subprocess
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
import urllib.request
|
||||
@@ -47,7 +46,7 @@ def _local_address():
    try:
        s.connect(("10.255.255.255", 1))  # ip doesn't need to be reachable
        return s.getsockname()[0]
-    except:
+    except:  # noqa: E722
        return "127.0.0.1"
    finally:
        s.close()
@@ -148,7 +147,7 @@ def test_httpd(httpd):
    of the same url return the same payload, proving it can be used to test
    deduplication.
    """
-    payload1 = content2 = None
+    payload1 = None
    url = make_url(httpd, "/site1/file1.txt")
    with urllib.request.urlopen(url) as response:
        assert response.status == 200
@@ -351,8 +350,8 @@ def test_warcprox_auto(httpd):


def test_proxy_conflict():
-    with pytest.raises(AssertionError) as excinfo:
-        worker = brozzler.worker.BrozzlerWorker(
+    with pytest.raises(AssertionError):
+        brozzler.worker.BrozzlerWorker(
            None, None, warcprox_auto=True, proxy="localhost:12345"
        )
@@ -523,7 +522,6 @@ def test_login(httpd):

    # take a look at the captures table
    time.sleep(2)  # in case warcprox hasn't finished processing urls
-    robots_url = make_url(httpd, "/robots.txt")
    captures = list(
        rr.table("captures").filter({"test_id": test_id}).order_by("timestamp").run()
    )
|
||||
|
||||
|
||||
def test_stop_crawl(httpd):
|
||||
test_id = "test_stop_crawl_job-%s" % datetime.datetime.utcnow().isoformat()
|
||||
rr = doublethink.Rethinker("localhost", db="brozzler")
|
||||
frontier = brozzler.RethinkDbFrontier(rr)
|
||||
|
||||
@@ -804,7 +801,6 @@ def test_warcprox_outage_resiliency(httpd):
    """
    rr = doublethink.Rethinker("localhost", db="brozzler")
    frontier = brozzler.RethinkDbFrontier(rr)
    svcreg = doublethink.ServiceRegistry(rr)

    # run two instances of warcprox
    opts = warcprox.Options()
|
||||
# the system, if any
|
||||
try:
|
||||
stop_service("warcprox")
|
||||
except Exception as e:
|
||||
except Exception:
|
||||
logger.warning("problem stopping warcprox service: %s", exc_info=True)
|
||||
|
||||
# queue the site for brozzling
|
||||
@@ -917,7 +913,6 @@ def test_warcprox_outage_resiliency(httpd):


def test_time_limit(httpd):
    test_id = "test_time_limit-%s" % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker("localhost", db="brozzler")
    frontier = brozzler.RethinkDbFrontier(rr)
@@ -928,7 +923,6 @@ def test_time_limit(httpd):

    sites = list(frontier.job_sites(job.id))
    assert len(sites) == 1
    site = sites[0]

    # time limit should be enforced pretty soon
    start = time.time()
|
||||
time.sleep(2) # in case warcprox hasn't finished processing urls
|
||||
# take a look at the captures table
|
||||
captures = list(rr.table("captures").filter({"test_id": test_id}).run())
|
||||
l = [c for c in captures if c["url"] == stitched_url]
|
||||
l = [c for c in captures if c["url"] == stitched_url] # noqa: E741
|
||||
assert len(l) == 1
|
||||
c = l[0]
|
||||
assert c["filename"].startswith("test_ydl_stitching")
|
||||
|
@@ -559,7 +559,7 @@ def test_parent_url_scoping():
    assert parent_page.outlinks["accepted"] == outlinks

    # parent page redirect_url matches accept parent_url_regex
-    parent_page_c = brozzler.Page(
+    brozzler.Page(
        rr,
        {
            "site_id": site.id,
@@ -606,7 +606,7 @@ def test_parent_url_scoping():
    assert parent_page.outlinks["accepted"] == []

    # parent page redirect_url matches block parent_url_regex
-    parent_page_c = brozzler.Page(
+    brozzler.Page(
        rr,
        {
            "site_id": site.id,
@@ -659,10 +659,10 @@ def test_completed_page():
        ]
    }
    assert page.brozzle_count == 1
-    assert page.claimed == False
+    assert page.claimed is False
    page.refresh()
    assert page.brozzle_count == 1
-    assert page.claimed == False
+    assert page.claimed is False

    # redirect that doesn't change scope surt because destination is covered by
    # the original surt
@@ -686,10 +686,10 @@ def test_completed_page():
    site.refresh()
    assert site.scope == {"accepts": [{"ssurt": "com,example,//http:/a/"}]}
    assert page.brozzle_count == 1
-    assert page.claimed == False
+    assert page.claimed is False
    page.refresh()
    assert page.brozzle_count == 1
-    assert page.claimed == False
+    assert page.claimed is False

    # redirect that doesn't change scope surt because page is not the seed page
    site = brozzler.Site(rr, {"seed": "http://example.com/a/"})
# redirect that doesn't change scope surt because page is not the seed page
|
||||
site = brozzler.Site(rr, {"seed": "http://example.com/a/"})
|
||||
@ -712,10 +712,10 @@ def test_completed_page():
|
||||
site.refresh()
|
||||
assert site.scope == {"accepts": [{"ssurt": "com,example,//http:/a/"}]}
|
||||
assert page.brozzle_count == 1
|
||||
assert page.claimed == False
|
||||
assert page.claimed is False
|
||||
page.refresh()
|
||||
assert page.brozzle_count == 1
|
||||
assert page.claimed == False
|
||||
assert page.claimed is False
|
||||
|
||||
|
||||
def test_seed_page():
|
||||
@@ -931,7 +931,7 @@ def test_max_claimed_sites():
    claimed_sites = frontier.claim_sites(3)
    assert len(claimed_sites) == 2
    with pytest.raises(brozzler.NothingToClaim):
-        claimed_site = frontier.claim_sites(3)
+        frontier.claim_sites(3)

    # clean slate for the next one
    rr.table("jobs").delete().run()
@@ -1074,7 +1074,7 @@ def test_max_hops_off():
    site.refresh()  # get it back from the db

    # renamed this param
-    assert not "max_hops_off_surt" in site.scope
+    assert "max_hops_off_surt" not in site.scope
    assert site.scope["max_hops_off"] == 1

    seed_page = frontier.seed_page(site.id)
@@ -1109,7 +1109,7 @@ def test_max_hops_off():
    assert len(pages) == 4
    assert pages[0].url == "http://example.com/"
    assert pages[0].hops_off == 0
-    assert not "hops_off_surt" in pages[0]
+    assert "hops_off_surt" not in pages[0]
    assert set(pages[0].outlinks["accepted"]) == {
        "https://example.com/toot",
        "http://foo.org/",
@@ -21,7 +21,6 @@ import datetime
import http.server
import os
import socket
import sys
import tempfile
import threading
import time
@@ -29,7 +28,6 @@ import uuid
from unittest import mock

import pytest
import requests
import yaml

import brozzler
@@ -291,7 +289,7 @@ def test_proxy_down():
    )

    # youtube-dl fetch
-    with tempfile.TemporaryDirectory(prefix="brzl-ydl-") as tempdir:
+    with tempfile.TemporaryDirectory(prefix="brzl-ydl-"):
        with pytest.raises(brozzler.ProxyError):
            brozzler.ydl.do_youtube_dl(worker, site, page)
@@ -315,7 +313,7 @@ def test_start_stop_backwards_compat():
    assert len(site.starts_and_stops) == 1
    assert site.starts_and_stops[0]["start"]
    assert site.starts_and_stops[0]["stop"] is None
-    assert not "start_time" in site
+    assert "start_time" not in site

    site = brozzler.Site(
        None,
@@ -324,13 +322,13 @@ def test_start_stop_backwards_compat():
    assert len(site.starts_and_stops) == 1
    assert site.starts_and_stops[0]["start"] == datetime.datetime(2017, 1, 1)
    assert site.starts_and_stops[0]["stop"] is None
-    assert not "start_time" in site
+    assert "start_time" not in site

    job = brozzler.Job(None, {"seeds": [{"url": "https://example.com/"}]})
    assert job.starts_and_stops[0]["start"]
    assert job.starts_and_stops[0]["stop"] is None
-    assert not "started" in job
-    assert not "finished" in job
+    assert "started" not in job
+    assert "finished" not in job

    job = brozzler.Job(
        None,
@@ -342,8 +340,8 @@ def test_start_stop_backwards_compat():
    )
    assert job.starts_and_stops[0]["start"] == datetime.datetime(2017, 1, 1)
    assert job.starts_and_stops[0]["stop"] == datetime.datetime(2017, 1, 2)
-    assert not "started" in job
-    assert not "finished" in job
+    assert "started" not in job
+    assert "finished" not in job


class Exception1(Exception):
@@ -452,9 +450,9 @@ def test_thread_raise_second_with_block():
            with brozzler.thread_accept_exceptions():
                time.sleep(2)
            return  # test fails
-        except Exception1 as e:
+        except Exception1:
            pass
-        except:
+        except:  # noqa: E722
            return  # fail test

        try:
|
@ -32,7 +32,7 @@ import sys
|
||||
|
||||
try:
|
||||
from shlex import quote
|
||||
except:
|
||||
except: # noqa: E722
|
||||
from pipes import quote
|
||||
|
||||
|
||||
|