ruff linting fixes (#343)

* ruff linting fixes

* move imports back down to where they're re-exported
Gretchen Leigh Miller 2025-03-07 16:03:35 -08:00 committed by GitHub
parent 6f011cc6c8
commit f64db214d4
18 changed files with 155 additions and 190 deletions
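
For context on the second commit-message bullet: rather than hoisting every import in `brozzler/__init__.py` to the top of the file, the submodule imports that exist to be re-exported stay at the bottom of the module (the bullet above implies they cannot simply move up), and ruff's E402 "module level import not at top of file" warning is silenced inline. Assembled from the hunks below, the resulting layout looks roughly like this (illustrative excerpt, not the complete file):

    # brozzler/__init__.py (abridged)
    import datetime
    import logging
    import threading

    from importlib.metadata import version as _version

    import structlog
    import urlcanon

    __version__ = _version("brozzler")

    # ... helpers such as behaviors(), thread_raise(), ThreadExceptionGate ...

    EPOCH_UTC = datetime.datetime.fromtimestamp(0.0, tz=datetime.timezone.utc)

    # Re-exports stay below the definitions they rely on, with the E402
    # warning suppressed per line rather than moving them up.
    from brozzler.browser import Browser, BrowserPool, BrowsingException  # noqa: E402
    from brozzler.robots import is_permitted_by_robots  # noqa: E402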

View file

@@ -17,10 +17,13 @@ See the License for the specific language governing permissions and
limitations under the License.
"""
+import datetime
import logging
+import threading
from importlib.metadata import version as _version
import structlog
+import urlcanon
__version__ = _version("brozzler")
@@ -91,7 +94,7 @@ def _logging_handler_handle(self, record):
finally:
try:
self.release()
-except:
+except:  # noqa: E722
pass
return rv
@@ -108,7 +111,6 @@ def behaviors(behaviors_dir=None):
`js-templates/`. Defaults to brozzler dir.
"""
import os
-import string
import yaml
@@ -125,7 +127,6 @@ def behavior_script(url, template_parameters=None, behaviors_dir=None):
"""
Returns the javascript behavior string populated with template_parameters.
"""
-import json
import re
logger = structlog.get_logger(logger_name=__name__)
@@ -194,8 +195,6 @@ class ThreadExceptionGate:
return "<ThreadExceptionGate(%s)>" % self.thread
-import threading
_thread_exception_gates = {}
_thread_exception_gates_lock = threading.Lock()
@@ -225,7 +224,7 @@ def thread_exception_gate(thread=None):
thread = threading.current_thread()
with _thread_exception_gates_lock:
-if not thread in _thread_exception_gates:
+if thread not in _thread_exception_gates:
_thread_exception_gates[thread] = ThreadExceptionGate(thread)
return _thread_exception_gates[thread]
@@ -252,7 +251,6 @@ def thread_raise(thread, exctype):
"""
import ctypes
import inspect
-import threading
import structlog
@@ -322,9 +320,6 @@ def jinja2_environment(behaviors_dir=None):
return _jinja2_env
-import urlcanon
def _remove_query(url):
url.question_mark = b""
url.query = b""
@@ -403,13 +398,10 @@ def suggest_default_chrome_exe():
return "chromium-browser"
-import datetime
EPOCH_UTC = datetime.datetime.fromtimestamp(0.0, tz=datetime.timezone.utc)
-from brozzler.browser import Browser, BrowserPool, BrowsingException
-from brozzler.robots import is_permitted_by_robots
+from brozzler.browser import Browser, BrowserPool, BrowsingException  # noqa: E402
+from brozzler.robots import is_permitted_by_robots  # noqa: E402
__all__ = [
"is_permitted_by_robots",
@@ -422,22 +414,25 @@ __all__ = [
"suggest_default_chrome_exe",
]
+# TODO try using importlib.util.find_spec to test for dependency presence
+# rather than try/except on import.
+# See https://docs.astral.sh/ruff/rules/unused-import/#example
try:
-import doublethink
+import doublethink  # noqa: F401
# All of these imports use doublethink for real and are unsafe
# to do if doublethink is unavailable.
-from brozzler.frontier import RethinkDbFrontier
+from brozzler.frontier import RethinkDbFrontier  # noqa: F401
from brozzler.model import (
-InvalidJobConf,
-Job,
-Page,
-Site,
-new_job,
-new_job_file,
-new_site,
+InvalidJobConf,  # noqa: F401
+Job,  # noqa: F401
+Page,  # noqa: F401
+Site,  # noqa: F401
+new_job,  # noqa: F401
+new_job_file,  # noqa: F401
+new_site,  # noqa: F401
)
-from brozzler.worker import BrozzlerWorker
+from brozzler.worker import BrozzlerWorker  # noqa: F401
__all__.extend(
[
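
The TODO added in the hunk above points at `importlib.util.find_spec` as a way to probe for an optional dependency without importing it (and so without needing the F401 suppressions). A minimal sketch of that approach, not part of this commit and using the same module names only for illustration:

    import importlib.util

    __all__ = []

    # find_spec returns None when the named top-level module is not installed,
    # so the optional extra can be detected without a throwaway import.
    if importlib.util.find_spec("doublethink") is not None:
        from brozzler.frontier import RethinkDbFrontier  # noqa: E402

        __all__.append("RethinkDbFrontier")

Whether that reads better than the try/except plus `# noqa: F401` pattern used here is exactly what the TODO leaves open.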

View file

@@ -18,7 +18,6 @@ limitations under the License.
import base64
import datetime
-import itertools
import json
import logging
import socket
@@ -213,7 +212,7 @@ class WebsockReceiverThread(threading.Thread):
def _on_message(self, websock, message):
try:
self._handle_message(websock, message)
-except:
+except:  # noqa: E722
self.logger.exception(
"uncaught exception in _handle_message",
message=message,
@@ -430,7 +429,7 @@ class Browser:
self.logger.info("shutting down websocket connection")
try:
self.websock.close()
-except BaseException as e:
+except BaseException:
self.logger.exception(
"exception closing websocket", websocket=self.websock
)
@@ -458,7 +457,7 @@ class Browser:
)
self.websock_url = None
-except:
+except:  # noqa: E722
self.logger.exception("problem stopping")
def is_running(self):
@@ -628,7 +627,7 @@ class Browser:
jpeg_bytes = self.screenshot(full_page)
on_screenshot(jpeg_bytes)
return
-except BrowsingTimeout as e:
+except BrowsingTimeout:
self.logger.exception("attempt %s/3", i + 1)
def visit_hashtags(self, page_url, hashtags, outlinks):
@@ -807,12 +806,12 @@ class Browser:
if (
msg
and "result" in msg
-and not ("exceptionDetails" in msg["result"])
+and "exceptionDetails" not in msg["result"]
and not (
"wasThrown" in msg["result"] and msg["result"]["wasThrown"]
)
and "result" in msg["result"]
-and type(msg["result"]["result"]["value"]) == bool
+and isinstance(msg["result"]["result"]["value"], bool)
and msg["result"]["result"]["value"]
):
self.logger.info("behavior decided it has finished")

View file

@@ -265,7 +265,7 @@ class Chrome:
return url
except brozzler.ShutdownRequested:
raise
-except Exception as e:
+except Exception:
if time.time() - self._last_warning > 30:
url_logger.warning(
"problem accessing url (will keep trying until timeout)",
@@ -325,7 +325,7 @@ class Chrome:
self.logger.debug(
"chrome pid %s STDERR %s", self.chrome_process.pid, buf
)
-except:
+except:  # noqa: E722
self.logger.exception("unexpected exception")
def stop(self):
@@ -378,7 +378,7 @@ class Chrome:
self.chrome_process.stderr.close()
try:
self._home_tmpdir.cleanup()
-except:
+except:  # noqa: E722
self.logger.exception(
"exception deleting self._home_tmpdir", tmpdir=self._home_tmpdir
)

View file

@@ -23,12 +23,10 @@ import datetime
import json
import logging
import os
-import re
import signal
import string
import sys
import threading
-import time
import traceback
import warnings
@@ -397,9 +395,9 @@ def brozzle_page(argv=None):
enable_youtube_dl=not worker._skip_youtube_dl,
)
logger.info("outlinks", outlinks=sorted(outlinks))
-except brozzler.ReachedLimit as e:
+except brozzler.ReachedLimit:
logger.exception("reached limit")
-except brozzler.PageInterstitialShown as e:
+except brozzler.PageInterstitialShown:
logger.exception("page interstitial shown")
finally:
browser.stop()
@@ -661,7 +659,7 @@ def brozzler_worker(argv=None):
logger.info(
"dumping state (caught signal)\n%s", signal=signum, state=state_strs
)
-except BaseException as e:
+except BaseException:
logger.exception("exception dumping state")
finally:
signal.signal(signal.SIGQUIT, dump_state)
@@ -672,11 +670,11 @@ def brozzler_worker(argv=None):
try:
# make set from seed IDs in SKIP_AV_SEEDS_FILE
with open(SKIP_AV_SEEDS_FILE) as skips:
-skip_av_seeds = {int(l) for l in skips.readlines()}
+skip_av_seeds = {int(line) for line in skips.readlines()}
logger.info(
"running with skip_av_seeds file", skip_av_seeds=SKIP_AV_SEEDS_FILE
)
-except Exception as e:
+except Exception:
skip_av_seeds = set()
logger.info("running with empty skip_av_seeds")
return skip_av_seeds
@@ -686,13 +684,13 @@ def brozzler_worker(argv=None):
try:
# make list from file
with open(YTDLP_PROXY_ENDPOINTS_FILE) as endpoints:
-ytdlp_proxy_endpoints = [l for l in endpoints.readlines()]
+ytdlp_proxy_endpoints = [line for line in endpoints.readlines()]
if ytdlp_proxy_endpoints:
logger.info(
"running with ytdlp proxy endpoints file",
ytdlp_proxy_endpoints=YTDLP_PROXY_ENDPOINTS_FILE,
)
-except Exception as e:
+except Exception:
ytdlp_proxy_endpoints = []
logger.info("running with empty proxy endpoints file")
return ytdlp_proxy_endpoints
@@ -1032,7 +1030,7 @@ def brozzler_purge(argv=None):
configure_logging(args)
rr = rethinker(args)
-frontier = brozzler.RethinkDbFrontier(rr)
+brozzler.RethinkDbFrontier(rr)
if args.job:
try:
job_id = int(args.job)

View file

@@ -17,9 +17,14 @@ See the License for the specific language governing permissions and
limitations under the License.
"""
+import base64
+import os
import sys
+import doublethink
+import rethinkdb as rdb
import structlog
+import yaml
logger = structlog.get_logger(logger_name=__name__)
@@ -33,14 +38,6 @@ except ImportError as e:
e,
)
sys.exit(1)
-import base64
-import importlib
-import json
-import os
-import doublethink
-import rethinkdb as rdb
-import yaml
r = rdb.RethinkDB()
@@ -285,6 +282,8 @@ def root(path):
try:
+import logging
import gunicorn.app.base
import gunicorn.glogging
from gunicorn.six import iteritems

View file

@@ -18,10 +18,22 @@ See the License for the specific language governing permissions and
limitations under the License.
"""
+import argparse
+import os
+import signal
+import socket
+import socketserver
import sys
+import threading
+import time
+import traceback
+import doublethink
import structlog
+import brozzler
+import brozzler.cli
logger = structlog.get_logger(logger_name=__name__)
try:
@@ -42,19 +54,6 @@ except ImportError as e:
exc_info=True,
)
sys.exit(1)
-import argparse
-import os
-import signal
-import socket
-import socketserver
-import threading
-import time
-import traceback
-import doublethink
-import brozzler
-import brozzler.cli
def _build_arg_parser(argv=None):

View file

@@ -16,10 +16,6 @@ See the License for the specific language governing permissions and
limitations under the License.
"""
-import datetime
-import random
-import time
import doublethink
import rethinkdb as rdb
import structlog
@@ -47,11 +43,11 @@ class RethinkDbFrontier:
db_logger = self.logger.bind(dbname=self.rr.dbname)
dbs = self.rr.db_list().run()
-if not self.rr.dbname in dbs:
+if self.rr.dbname not in dbs:
db_logger.info("creating rethinkdb database")
self.rr.db_create(self.rr.dbname).run()
tables = self.rr.table_list().run()
-if not "sites" in tables:
+if "sites" not in tables:
db_logger.info("creating rethinkdb table 'sites' in database")
self.rr.table_create(
"sites", shards=self.shards, replicas=self.replicas
@@ -60,7 +56,7 @@ class RethinkDbFrontier:
"sites_last_disclaimed", [r.row["status"], r.row["last_disclaimed"]]
).run()
self.rr.table("sites").index_create("job_id").run()
-if not "pages" in tables:
+if "pages" not in tables:
db_logger.info("creating rethinkdb table 'pages' in database")
self.rr.table_create(
"pages", shards=self.shards, replicas=self.replicas
@@ -80,7 +76,7 @@ class RethinkDbFrontier:
"least_hops",
[r.row["site_id"], r.row["brozzle_count"], r.row["hops_from_seed"]],
).run()
-if not "jobs" in tables:
+if "jobs" not in tables:
db_logger.info("creating rethinkdb table 'jobs' in database")
self.rr.table_create(
"jobs", shards=self.shards, replicas=self.replicas
@@ -352,7 +348,6 @@ class RethinkDbFrontier:
site.save()
def _build_fresh_page(self, site, parent_page, url, hops_off=0):
-url_for_scoping = urlcanon.semantic(url)
url_for_crawling = urlcanon.whatwg(url)
hashtag = (url_for_crawling.hash_sign + url_for_crawling.fragment).decode(
"utf-8"
@@ -461,8 +456,8 @@ class RethinkDbFrontier:
# "rethinkdb.errors.ReqlDriverError: Query size (167883036) greater than maximum (134217727) in:"
# there can be many pages and each one can be very large (many videos,
# in and out of scope links, etc)
-l = list(pages.values())
-for batch in (l[i : i + 50] for i in range(0, len(l), 50)):
+pages_list = list(pages.values())
+for batch in (pages_list[i : i + 50] for i in range(0, len(pages_list), 50)):
try:
self.logger.debug("inserting/replacing batch of %s pages", len(batch))
reql = self.rr.table("pages").insert(batch, conflict="replace")
@@ -471,8 +466,8 @@ class RethinkDbFrontier:
'conflict="replace")',
batch,
)
-result = reql.run()
-except Exception as e:
+reql.run()
+except Exception:
self.logger.exception(
"problem inserting/replacing batch of %s pages",
len(batch),

View file

@@ -19,12 +19,9 @@ limitations under the License.
import base64
import copy
-import datetime
import hashlib
import json
import os
-import re
-import time
import urllib
import uuid
import zlib
@@ -61,7 +58,7 @@ class InvalidJobConf(Exception):
# debugged, I found it here. Maybe there's a better way to see it.
value = validator._errors[0].info[0][0].info[0][0].value
self.errors["bad value"] = value
-except:
+except:  # noqa: E722
value = None
@@ -122,10 +119,10 @@ def new_job(frontier, job_conf):
# rethinkdb.errors.ReqlDriverError: Query size (167883036) greater than maximum (134217727) in:
for batch in (pages[i : i + 500] for i in range(0, len(pages), 500)):
logger.info("inserting batch of %s pages", len(batch))
-result = frontier.rr.table("pages").insert(batch).run()
+frontier.rr.table("pages").insert(batch).run()
for batch in (sites[i : i + 100] for i in range(0, len(sites), 100)):
logger.info("inserting batch of %s sites", len(batch))
-result = frontier.rr.table("sites").insert(batch).run()
+frontier.rr.table("sites").insert(batch).run()
logger.info("job fully started", job_id=job.id)
return job
@@ -200,9 +197,9 @@ class Job(doublethink.Document, ElapsedMixIn):
table = "jobs"
def populate_defaults(self):
-if not "status" in self:
+if "status" not in self:
self.status = "ACTIVE"
-if not "starts_and_stops" in self:
+if "starts_and_stops" not in self:
if self.get("started"):  # backward compatibility
self.starts_and_stops = [
{"start": self.get("started"), "stop": self.get("finished")}
@@ -229,28 +226,28 @@ class Site(doublethink.Document, ElapsedMixIn):
table = "sites"
def populate_defaults(self):
-if not "status" in self:
+if "status" not in self:
self.status = "ACTIVE"
-if not "claimed" in self:
+if "claimed" not in self:
self.claimed = False
-if not "last_disclaimed" in self:
+if "last_disclaimed" not in self:
self.last_disclaimed = brozzler.EPOCH_UTC
-if not "last_claimed" in self:
+if "last_claimed" not in self:
self.last_claimed = brozzler.EPOCH_UTC
-if not "scope" in self:
+if "scope" not in self:
self.scope = {}
-if not "skip_ytdlp" in self:
+if "skip_ytdlp" not in self:
self.skip_ytdlp = None
# backward compatibility
if "surt" in self.scope:
-if not "accepts" in self.scope:
+if "accepts" not in self.scope:
self.scope["accepts"] = []
self.scope["accepts"].append({"surt": self.scope["surt"]})
del self.scope["surt"]
# backward compatibility
-if "max_hops_off_surt" in self.scope and not "max_hops_off" in self.scope:
+if "max_hops_off_surt" in self.scope and "max_hops_off" not in self.scope:
self.scope["max_hops_off"] = self.scope["max_hops_off_surt"]
if "max_hops_off_surt" in self.scope:
del self.scope["max_hops_off_surt"]
@@ -260,7 +257,7 @@ class Site(doublethink.Document, ElapsedMixIn):
brozzler.site_surt_canon(self.seed).ssurt().decode("ascii")
)
-if not "starts_and_stops" in self:
+if "starts_and_stops" not in self:
if self.get("start_time"):  # backward compatibility
self.starts_and_stops = [
{"start": self.get("start_time"), "stop": None}
@@ -275,7 +272,7 @@ class Site(doublethink.Document, ElapsedMixIn):
return 'Site({"id":"%s","seed":"%s",...})' % (self.id, self.seed)
def _accept_ssurt_if_not_redundant(self, ssurt):
-if not "accepts" in self.scope:
+if "accepts" not in self.scope:
self.scope["accepts"] = []
simple_rule_ssurts = (
rule["ssurt"]
@@ -334,7 +331,7 @@ class Site(doublethink.Document, ElapsedMixIn):
if not isinstance(url, urlcanon.ParsedUrl):
url = urlcanon.semantic(url)
-if not url.scheme in (b"http", b"https"):
+if url.scheme not in (b"http", b"https"):
# XXX doesn't belong here maybe (where? worker ignores unknown
# schemes?)
return False
@@ -390,31 +387,31 @@ class Page(doublethink.Document):
return hashlib.sha1(digest_this.encode("utf-8")).hexdigest()
def populate_defaults(self):
-if not "retry_after" in self:
+if "retry_after" not in self:
self.retry_after = None
-if not "failed_attempts" in self:
+if "failed_attempts" not in self:
self.failed_attempts = 0
-if not "hops_from_seed" in self:
+if "hops_from_seed" not in self:
self.hops_from_seed = 0
-if not "hop_path" in self:
+if "hop_path" not in self:
self.hop_path = None
-if not "via_page_url" in self:
+if "via_page_url" not in self:
self.via_page_url = None
-if not "brozzle_count" in self:
+if "brozzle_count" not in self:
self.brozzle_count = 0
-if not "claimed" in self:
+if "claimed" not in self:
self.claimed = False
-if "hops_off_surt" in self and not "hops_off" in self:
+if "hops_off_surt" in self and "hops_off" not in self:
self.hops_off = self.hops_off_surt
if "hops_off_surt" in self:
del self["hops_off_surt"]
-if not "hops_off" in self:
+if "hops_off" not in self:
self.hops_off = 0
-if not "needs_robots_check" in self:
+if "needs_robots_check" not in self:
self.needs_robots_check = False
-if not "priority" in self:
+if "priority" not in self:
self.priority = self._calc_priority()
-if not "id" in self:
+if "id" not in self:
self.id = self.compute_id(self.site_id, self.url)
def __str__(self):

View file

@@ -18,9 +18,16 @@ See the License for the specific language governing permissions and
limitations under the License.
"""
+import argparse
+import json
import sys
+import doublethink
+import rethinkdb as rdb
import structlog
+import urlcanon
+import brozzler
logger = structlog.get_logger(logger_name=__name__)
@@ -40,14 +47,7 @@ except ImportError as e:
e,
)
sys.exit(1)
-import argparse
-import json
-import doublethink
-import rethinkdb as rdb
-import urlcanon
-import brozzler
r = rdb.RethinkDB()
@@ -137,7 +137,7 @@ class TheGoodUrlCanonicalizer(object):
key = urlcanon.semantic(url).surt().decode("ascii")
# logging.debug('%s -> %s', url, key)
return key
-except Exception as e:
+except Exception:
return url
def replace_default_canonicalizer():
@@ -221,18 +221,9 @@ def support_in_progress_warcs():
class SomeWbUrl(pywb.rewrite.wburl.WbUrl):
def __init__(self, orig_url):
-import re
import six
from pywb.rewrite.wburl import WbUrl
-from pywb.utils.loaders import to_native_str
-from six.moves.urllib.parse import (
-quote,
-quote_plus,
-unquote_plus,
-urlsplit,
-urlunsplit,
-)
+from six.moves.urllib.parse import quote
pywb.rewrite.wburl.BaseWbUrl.__init__(self)
@@ -320,7 +311,6 @@ def _fuzzy_query_call(self, query):
urlkey = to_native_str(query.key, "utf-8")
url = query.url
filter_ = query.filters
-output = query.output
for rule in self.rules.iter_matching(urlkey):
m = rule.regex.search(urlkey)

View file

@@ -71,7 +71,7 @@ _robots_caches = {} # {site_id:reppy.cache.RobotsCache}
def _robots_cache(site, proxy=None):
-if not site.id in _robots_caches:
+if site.id not in _robots_caches:
req_sesh = _SessionRaiseOn420()
req_sesh.verify = False  # ignore cert errors
if proxy:

View file

@@ -21,9 +21,7 @@ limitations under the License.
import datetime
import io
import json
-import random
import socket
-import tempfile
import threading
import time
import urllib.request
@@ -99,9 +97,13 @@ class BrozzlerWorker:
self._skip_visit_hashtags = skip_visit_hashtags
self._skip_youtube_dl = skip_youtube_dl
+# TODO try using importlib.util.find_spec to test for dependency
+# presence rather than try/except on import.
+# See https://docs.astral.sh/ruff/rules/unused-import/#example
# We definitely shouldn't ytdlp if the optional extra is missing
try:
-import yt_dlp
+import yt_dlp  # noqa: F401
except ImportError:
self.logger.info(
"optional yt-dlp extra not installed; setting skip_youtube_dl to True"
@@ -200,7 +202,7 @@ class BrozzlerWorker:
response = requests.get("http://%s/status" % self._proxy)
status = json.loads(response.text)
self._proxy_is_warcprox = status["role"] == "warcprox"
-except Exception as e:
+except Exception:
self._proxy_is_warcprox = False
self.logger.info(
"%s %s warcprox",
@@ -348,13 +350,13 @@ class BrozzlerWorker:
)
metrics.brozzler_ydl_urls_checked.inc(1)
outlinks.update(ydl_outlinks)
-except brozzler.ReachedLimit as e:
+except brozzler.ReachedLimit:
raise
except brozzler.ShutdownRequested:
raise
except brozzler.ProxyError:
raise
-except brozzler.VideoExtractorError as e:
+except brozzler.VideoExtractorError:
self.logger.exception("error extracting video info")
except Exception as e:
if (
@@ -391,9 +393,9 @@ class BrozzlerWorker:
timeout=self.HEADER_REQUEST_TIMEOUT,
) as r:
return r.headers
-except requests.exceptions.Timeout as e:
+except requests.exceptions.Timeout:
url_logger.warning("Timed out trying to get headers", exc_info=True)
-except requests.exceptions.RequestException as e:
+except requests.exceptions.RequestException:
url_logger.warning("Failed to get headers", exc_info=True)
return {}
@@ -469,7 +471,7 @@ class BrozzlerWorker:
if "content-range" in response_headers:
video["content-range"] = response_headers["content-range"]
self.logger.debug("embedded video", video=video)
-if not "videos" in page:
+if "videos" not in page:
page.videos = []
page.videos.append(video)
@@ -598,13 +600,13 @@ class BrozzlerWorker:
site_logger.info("no pages left for site")
except brozzler.ReachedLimit as e:
self._frontier.reached_limit(site, e)
-except brozzler.ReachedTimeLimit as e:
+except brozzler.ReachedTimeLimit:
self._frontier.finished(site, "FINISHED_TIME_LIMIT")
except brozzler.CrawlStopped:
self._frontier.finished(site, "FINISHED_STOP_REQUESTED")
# except brozzler.browser.BrowsingAborted:
# self.logger.info("{} shut down".format(browser))
-except brozzler.ProxyError as e:
+except brozzler.ProxyError:
if self._warcprox_auto:
self.logger.exception(
"proxy error, will try to choose a "
@@ -676,7 +678,7 @@ class BrozzlerWorker:
try:
self.status_info = self._service_registry.heartbeat(status_info)
self.logger.debug("status in service registry", status=self.status_info)
-except r.ReqlError as e:
+except r.ReqlError:
self.logger.exception(
"failed to send heartbeat and update service registry",
info=status_info,
@@ -748,11 +750,11 @@ class BrozzlerWorker:
time.sleep(0.5)
self.logger.warn("shutdown requested")
-except r.ReqlError as e:
+except r.ReqlError:
self.logger.exception("caught rethinkdb exception, will try to proceed")
except brozzler.ShutdownRequested:
self.logger.info("shutdown requested")
-except:
+except:  # noqa: E722
self.logger.critical(
"thread exiting due to unexpected exception", exc_info=True
)
@@ -760,7 +762,7 @@ class BrozzlerWorker:
if self._service_registry and hasattr(self, "status_info"):
try:
self._service_registry.unregister(self.status_info["id"])
-except:
+except:  # noqa: E722
self.logger.exception("failed to unregister from service registry")
self.logger.info(

View file

@@ -101,7 +101,7 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
if result_type in ("url", "url_transparent"):
if "extraction_depth" in extra_info:
self.logger.info(
-f"Following redirect",
+"Following redirect",
redirect_url=ie_result["url"],
extraction_depth=extra_info["extraction_depth"],
)
@@ -136,7 +136,7 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
# use it later to extract the watch pages as outlinks.
try:
ie_result["entries_no_dl"] = list(ie_result["entries"])
-except Exception as e:
+except Exception:
extract_context.warning(
"failed to unroll entries ie_result['entries']?",
exc_info=True,
@@ -166,7 +166,7 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
import magic
mimetype = magic.from_file(info_dict["filepath"], mime=True)
-except ImportError as e:
+except ImportError:
mimetype = "video/%s" % info_dict["ext"]
self.logger.warning(
"guessing mimetype due to error",
@@ -236,7 +236,7 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
)
site.last_claimed = doublethink.utcnow()
site.save()
-except:
+except:  # noqa: E722
worker.logger.debug(
"problem heartbeating site.last_claimed site",
id=site.id,
@@ -316,7 +316,7 @@ def _remember_videos(page, pushed_videos=None):
"""
Saves info about videos captured by yt-dlp in `page.videos`.
"""
-if not "videos" in page:
+if "videos" not in page:
page.videos = []
for pushed_video in pushed_videos or []:
video = {
@@ -351,7 +351,7 @@ def _try_youtube_dl(worker, ydl, site, page):
)
metrics.brozzler_ydl_extract_successes.labels(ydl.is_youtube_host).inc(1)
break
-except brozzler.ShutdownRequested as e:
+except brozzler.ShutdownRequested:
raise
except Exception as e:
if (

View file

@@ -110,7 +110,7 @@ def test_httpd(httpd):
of the same url return the same payload, proving it can be used to test
deduplication.
"""
-payload1 = content2 = None
+payload1 = None
url = "http://localhost:%s/site1/file1.txt" % httpd.server_port
with urllib.request.urlopen(url) as response:
assert response.status == 200
@@ -175,7 +175,6 @@ def test_420(httpd):
def test_js_dialogs(httpd):
chrome_exe = brozzler.suggest_default_chrome_exe()
-url = "http://localhost:%s/site4/alert.html" % httpd.server_port
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
# before commit d2ed6b97a24 these would hang and eventually raise
# brozzler.browser.BrowsingTimeout, which would cause this test to fail

View file

@@ -40,11 +40,11 @@ def cli_commands():
commands = set(console_scripts().keys())
commands.remove("brozzler-wayback")
try:
-import gunicorn
+import gunicorn  # noqa: F401
except ImportError:
commands.remove("brozzler-dashboard")
try:
-import pywb
+import pywb  # noqa: F401
except ImportError:
commands.remove("brozzler-easy")
return commands

View file

@@ -23,7 +23,6 @@ import http.server
import os
import socket
import subprocess
-import sys
import threading
import time
import urllib.request
@@ -47,7 +46,7 @@ def _local_address():
try:
s.connect(("10.255.255.255", 1))  # ip doesn't need to be reachable
return s.getsockname()[0]
-except:
+except:  # noqa: E722
return "127.0.0.1"
finally:
s.close()
@@ -148,7 +147,7 @@ def test_httpd(httpd):
of the same url return the same payload, proving it can be used to test
deduplication.
"""
-payload1 = content2 = None
+payload1 = None
url = make_url(httpd, "/site1/file1.txt")
with urllib.request.urlopen(url) as response:
assert response.status == 200
@@ -351,8 +350,8 @@ def test_warcprox_auto(httpd):
def test_proxy_conflict():
-with pytest.raises(AssertionError) as excinfo:
-worker = brozzler.worker.BrozzlerWorker(
+with pytest.raises(AssertionError):
+brozzler.worker.BrozzlerWorker(
None, None, warcprox_auto=True, proxy="localhost:12345"
)
@@ -523,7 +522,6 @@ def test_login(httpd):
# take a look at the captures table
time.sleep(2)  # in case warcprox hasn't finished processing urls
-robots_url = make_url(httpd, "/robots.txt")
captures = list(
rr.table("captures").filter({"test_id": test_id}).order_by("timestamp").run()
)
@@ -730,7 +728,6 @@ def test_redirect_hashtags(httpd):
def test_stop_crawl(httpd):
-test_id = "test_stop_crawl_job-%s" % datetime.datetime.utcnow().isoformat()
rr = doublethink.Rethinker("localhost", db="brozzler")
frontier = brozzler.RethinkDbFrontier(rr)
@@ -804,7 +801,6 @@ def test_warcprox_outage_resiliency(httpd):
"""
rr = doublethink.Rethinker("localhost", db="brozzler")
frontier = brozzler.RethinkDbFrontier(rr)
-svcreg = doublethink.ServiceRegistry(rr)
# run two instances of warcprox
opts = warcprox.Options()
@@ -836,7 +832,7 @@ def test_warcprox_outage_resiliency(httpd):
# the system, if any
try:
stop_service("warcprox")
-except Exception as e:
+except Exception:
logger.warning("problem stopping warcprox service: %s", exc_info=True)
# queue the site for brozzling
@@ -917,7 +913,6 @@ def test_warcprox_outage_resiliency(httpd):
def test_time_limit(httpd):
-test_id = "test_time_limit-%s" % datetime.datetime.utcnow().isoformat()
rr = doublethink.Rethinker("localhost", db="brozzler")
frontier = brozzler.RethinkDbFrontier(rr)
@@ -928,7 +923,6 @@ def test_time_limit(httpd):
sites = list(frontier.job_sites(job.id))
assert len(sites) == 1
-site = sites[0]
# time limit should be enforced pretty soon
start = time.time()
@@ -986,7 +980,7 @@ def test_ydl_stitching(httpd):
time.sleep(2)  # in case warcprox hasn't finished processing urls
# take a look at the captures table
captures = list(rr.table("captures").filter({"test_id": test_id}).run())
-l = [c for c in captures if c["url"] == stitched_url]
+l = [c for c in captures if c["url"] == stitched_url]  # noqa: E741
assert len(l) == 1
c = l[0]
assert c["filename"].startswith("test_ydl_stitching")

View file

@@ -559,7 +559,7 @@ def test_parent_url_scoping():
assert parent_page.outlinks["accepted"] == outlinks
# parent page redirect_url matches accept parent_url_regex
-parent_page_c = brozzler.Page(
+brozzler.Page(
rr,
{
"site_id": site.id,
@@ -606,7 +606,7 @@ def test_parent_url_scoping():
assert parent_page.outlinks["accepted"] == []
# parent page redirect_url matches block parent_url_regex
-parent_page_c = brozzler.Page(
+brozzler.Page(
rr,
{
"site_id": site.id,
@@ -659,10 +659,10 @@ def test_completed_page():
]
}
assert page.brozzle_count == 1
-assert page.claimed == False
+assert page.claimed is False
page.refresh()
assert page.brozzle_count == 1
-assert page.claimed == False
+assert page.claimed is False
# redirect that doesn't change scope surt because destination is covered by
# the original surt
@@ -686,10 +686,10 @@ def test_completed_page():
site.refresh()
assert site.scope == {"accepts": [{"ssurt": "com,example,//http:/a/"}]}
assert page.brozzle_count == 1
-assert page.claimed == False
+assert page.claimed is False
page.refresh()
assert page.brozzle_count == 1
-assert page.claimed == False
+assert page.claimed is False
# redirect that doesn't change scope surt because page is not the seed page
site = brozzler.Site(rr, {"seed": "http://example.com/a/"})
@@ -712,10 +712,10 @@ def test_completed_page():
site.refresh()
assert site.scope == {"accepts": [{"ssurt": "com,example,//http:/a/"}]}
assert page.brozzle_count == 1
-assert page.claimed == False
+assert page.claimed is False
page.refresh()
assert page.brozzle_count == 1
-assert page.claimed == False
+assert page.claimed is False
def test_seed_page():
@@ -931,7 +931,7 @@ def test_max_claimed_sites():
claimed_sites = frontier.claim_sites(3)
assert len(claimed_sites) == 2
with pytest.raises(brozzler.NothingToClaim):
-claimed_site = frontier.claim_sites(3)
+frontier.claim_sites(3)
# clean slate for the next one
rr.table("jobs").delete().run()
@@ -1074,7 +1074,7 @@ def test_max_hops_off():
site.refresh()  # get it back from the db
# renamed this param
-assert not "max_hops_off_surt" in site.scope
+assert "max_hops_off_surt" not in site.scope
assert site.scope["max_hops_off"] == 1
seed_page = frontier.seed_page(site.id)
@@ -1109,7 +1109,7 @@ def test_max_hops_off():
assert len(pages) == 4
assert pages[0].url == "http://example.com/"
assert pages[0].hops_off == 0
-assert not "hops_off_surt" in pages[0]
+assert "hops_off_surt" not in pages[0]
assert set(pages[0].outlinks["accepted"]) == {
"https://example.com/toot",
"http://foo.org/",

View file

@@ -21,7 +21,6 @@ import datetime
import http.server
import os
import socket
-import sys
import tempfile
import threading
import time
@@ -29,7 +28,6 @@ import uuid
from unittest import mock
import pytest
-import requests
import yaml
import brozzler
@@ -291,7 +289,7 @@ def test_proxy_down():
)
# youtube-dl fetch
-with tempfile.TemporaryDirectory(prefix="brzl-ydl-") as tempdir:
+with tempfile.TemporaryDirectory(prefix="brzl-ydl-"):
with pytest.raises(brozzler.ProxyError):
brozzler.ydl.do_youtube_dl(worker, site, page)
@@ -315,7 +313,7 @@ def test_start_stop_backwards_compat():
assert len(site.starts_and_stops) == 1
assert site.starts_and_stops[0]["start"]
assert site.starts_and_stops[0]["stop"] is None
-assert not "start_time" in site
+assert "start_time" not in site
site = brozzler.Site(
None,
@@ -324,13 +322,13 @@ def test_start_stop_backwards_compat():
assert len(site.starts_and_stops) == 1
assert site.starts_and_stops[0]["start"] == datetime.datetime(2017, 1, 1)
assert site.starts_and_stops[0]["stop"] is None
-assert not "start_time" in site
+assert "start_time" not in site
job = brozzler.Job(None, {"seeds": [{"url": "https://example.com/"}]})
assert job.starts_and_stops[0]["start"]
assert job.starts_and_stops[0]["stop"] is None
-assert not "started" in job
-assert not "finished" in job
+assert "started" not in job
+assert "finished" not in job
job = brozzler.Job(
None,
@@ -342,8 +340,8 @@ def test_start_stop_backwards_compat():
)
assert job.starts_and_stops[0]["start"] == datetime.datetime(2017, 1, 1)
assert job.starts_and_stops[0]["stop"] == datetime.datetime(2017, 1, 2)
-assert not "started" in job
-assert not "finished" in job
+assert "started" not in job
+assert "finished" not in job
class Exception1(Exception):
@@ -452,9 +450,9 @@ def test_thread_raise_second_with_block():
with brozzler.thread_accept_exceptions():
time.sleep(2)
return  # test fails
-except Exception1 as e:
+except Exception1:
pass
-except:
+except:  # noqa: E722
return  # fail test
try:

View file

@@ -32,7 +32,7 @@ import sys
try:
from shlex import quote
-except:
+except:  # noqa: E722
from pipes import quote