Mirror of https://github.com/internetarchive/brozzler.git (synced 2025-07-23 06:50:37 -04:00)
ruff linting fixes (#343)

* ruff linting fixes
* move imports back down to where they're re-exported

parent 6f011cc6c8
commit f64db214d4
18 changed files with 155 additions and 190 deletions
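Most of the changes below are a handful of recurring mechanical fixes: membership tests rewritten as "x not in y", unused "as e" bindings dropped from exception handlers that only log, type() comparisons replaced with isinstance(), and intentional bare excepts, re-export imports, and an ambiguous "l" name waived with noqa comments (E722, F401, E402, E741). A small runnable sketch of these patterns, using hypothetical names rather than lines taken from any one file:

# Runnable sketch of the recurring fixes in this commit (hypothetical names).
import logging

logger = logging.getLogger(__name__)
tables = {"sites": True}

# membership tests rewritten as "not in"
if "jobs" not in tables:          # was: if not "jobs" in tables:
    tables["jobs"] = True

# isinstance() instead of comparing types directly
value = True
if isinstance(value, bool):       # was: type(value) == bool
    logger.info("boolean result")

# unused "as e" bindings dropped when the handler only logs
try:
    raise RuntimeError("boom")
except Exception:                 # was: except Exception as e:
    logger.exception("problem, continuing anyway")

# deliberately-broad bare excepts kept, but waived explicitly
try:
    pass
except:  # noqa: E722
    pass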
@@ -17,10 +17,13 @@ See the License for the specific language governing permissions and
limitations under the License.
"""

+import datetime
import logging
+import threading
from importlib.metadata import version as _version

import structlog
+import urlcanon

__version__ = _version("brozzler")

@@ -91,7 +94,7 @@ def _logging_handler_handle(self, record):
finally:
try:
self.release()
-except:
+except: # noqa: E722
pass
return rv

@@ -108,7 +111,6 @@ def behaviors(behaviors_dir=None):
`js-templates/`. Defaults to brozzler dir.
"""
import os
-import string

import yaml

@@ -125,7 +127,6 @@ def behavior_script(url, template_parameters=None, behaviors_dir=None):
"""
Returns the javascript behavior string populated with template_parameters.
"""
-import json
import re

logger = structlog.get_logger(logger_name=__name__)
@@ -194,8 +195,6 @@ class ThreadExceptionGate:
return "<ThreadExceptionGate(%s)>" % self.thread


-import threading

_thread_exception_gates = {}
_thread_exception_gates_lock = threading.Lock()

@@ -225,7 +224,7 @@ def thread_exception_gate(thread=None):
thread = threading.current_thread()

with _thread_exception_gates_lock:
-if not thread in _thread_exception_gates:
+if thread not in _thread_exception_gates:
_thread_exception_gates[thread] = ThreadExceptionGate(thread)

return _thread_exception_gates[thread]
@@ -252,7 +251,6 @@ def thread_raise(thread, exctype):
"""
import ctypes
import inspect
-import threading

import structlog

@@ -322,9 +320,6 @@ def jinja2_environment(behaviors_dir=None):
return _jinja2_env


-import urlcanon


def _remove_query(url):
url.question_mark = b""
url.query = b""
@@ -403,13 +398,10 @@ def suggest_default_chrome_exe():
return "chromium-browser"


-import datetime

EPOCH_UTC = datetime.datetime.fromtimestamp(0.0, tz=datetime.timezone.utc)

-from brozzler.browser import Browser, BrowserPool, BrowsingException
-from brozzler.robots import is_permitted_by_robots
+from brozzler.browser import Browser, BrowserPool, BrowsingException # noqa: E402
+from brozzler.robots import is_permitted_by_robots # noqa: E402

__all__ = [
"is_permitted_by_robots",
@@ -422,22 +414,25 @@ __all__ = [
"suggest_default_chrome_exe",
]

+# TODO try using importlib.util.find_spec to test for dependency presence
+# rather than try/except on import.
+# See https://docs.astral.sh/ruff/rules/unused-import/#example
try:
-import doublethink
+import doublethink # noqa: F401

# All of these imports use doublethink for real and are unsafe
# to do if doublethink is unavailable.
-from brozzler.frontier import RethinkDbFrontier
+from brozzler.frontier import RethinkDbFrontier # noqa: F401
from brozzler.model import (
-InvalidJobConf,
-Job,
-Page,
-Site,
-new_job,
-new_job_file,
-new_site,
+InvalidJobConf, # noqa: F401
+Job, # noqa: F401
+Page, # noqa: F401
+Site, # noqa: F401
+new_job, # noqa: F401
+new_job_file, # noqa: F401
+new_site, # noqa: F401
)
-from brozzler.worker import BrozzlerWorker
+from brozzler.worker import BrozzlerWorker # noqa: F401

__all__.extend(
[
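The TODO added at the bottom of this file suggests probing for optional dependencies with importlib.util.find_spec instead of a try/except import. A rough, hypothetical sketch of that approach (not part of this commit):

# Hypothetical sketch of the TODO's suggestion: probe for the optional
# doublethink dependency with find_spec instead of a try/except import.
import importlib.util

if importlib.util.find_spec("doublethink") is not None:
    # these imports use doublethink for real, so only do them when it's there;
    # the noqa is still needed because the names are re-exported, not used here
    from brozzler.frontier import RethinkDbFrontier  # noqa: F401
    from brozzler.worker import BrozzlerWorker  # noqa: F401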
@@ -18,7 +18,6 @@ limitations under the License.

import base64
import datetime
-import itertools
import json
import logging
import socket
@@ -213,7 +212,7 @@ class WebsockReceiverThread(threading.Thread):
def _on_message(self, websock, message):
try:
self._handle_message(websock, message)
-except:
+except: # noqa: E722
self.logger.exception(
"uncaught exception in _handle_message",
message=message,
@@ -430,7 +429,7 @@ class Browser:
self.logger.info("shutting down websocket connection")
try:
self.websock.close()
-except BaseException as e:
+except BaseException:
self.logger.exception(
"exception closing websocket", websocket=self.websock
)
@@ -458,7 +457,7 @@ class Browser:
)

self.websock_url = None
-except:
+except: # noqa: E722
self.logger.exception("problem stopping")

def is_running(self):
@@ -628,7 +627,7 @@ class Browser:
jpeg_bytes = self.screenshot(full_page)
on_screenshot(jpeg_bytes)
return
-except BrowsingTimeout as e:
+except BrowsingTimeout:
self.logger.exception("attempt %s/3", i + 1)

def visit_hashtags(self, page_url, hashtags, outlinks):
@@ -807,12 +806,12 @@ class Browser:
if (
msg
and "result" in msg
-and not ("exceptionDetails" in msg["result"])
+and "exceptionDetails" not in msg["result"]
and not (
"wasThrown" in msg["result"] and msg["result"]["wasThrown"]
)
and "result" in msg["result"]
-and type(msg["result"]["result"]["value"]) == bool
+and isinstance(msg["result"]["result"]["value"], bool)
and msg["result"]["result"]["value"]
):
self.logger.info("behavior decided it has finished")
@@ -265,7 +265,7 @@ class Chrome:
return url
except brozzler.ShutdownRequested:
raise
-except Exception as e:
+except Exception:
if time.time() - self._last_warning > 30:
url_logger.warning(
"problem accessing url (will keep trying until timeout)",
@@ -325,7 +325,7 @@ class Chrome:
self.logger.debug(
"chrome pid %s STDERR %s", self.chrome_process.pid, buf
)
-except:
+except: # noqa: E722
self.logger.exception("unexpected exception")

def stop(self):
@@ -378,7 +378,7 @@ class Chrome:
self.chrome_process.stderr.close()
try:
self._home_tmpdir.cleanup()
-except:
+except: # noqa: E722
self.logger.exception(
"exception deleting self._home_tmpdir", tmpdir=self._home_tmpdir
)
@@ -23,12 +23,10 @@ import datetime
import json
import logging
import os
-import re
import signal
import string
import sys
import threading
-import time
import traceback
import warnings

@@ -397,9 +395,9 @@ def brozzle_page(argv=None):
enable_youtube_dl=not worker._skip_youtube_dl,
)
logger.info("outlinks", outlinks=sorted(outlinks))
-except brozzler.ReachedLimit as e:
+except brozzler.ReachedLimit:
logger.exception("reached limit")
-except brozzler.PageInterstitialShown as e:
+except brozzler.PageInterstitialShown:
logger.exception("page interstitial shown")
finally:
browser.stop()
@@ -661,7 +659,7 @@ def brozzler_worker(argv=None):
logger.info(
"dumping state (caught signal)\n%s", signal=signum, state=state_strs
)
-except BaseException as e:
+except BaseException:
logger.exception("exception dumping state")
finally:
signal.signal(signal.SIGQUIT, dump_state)
@@ -672,11 +670,11 @@ def brozzler_worker(argv=None):
try:
# make set from seed IDs in SKIP_AV_SEEDS_FILE
with open(SKIP_AV_SEEDS_FILE) as skips:
-skip_av_seeds = {int(l) for l in skips.readlines()}
+skip_av_seeds = {int(line) for line in skips.readlines()}
logger.info(
"running with skip_av_seeds file", skip_av_seeds=SKIP_AV_SEEDS_FILE
)
-except Exception as e:
+except Exception:
skip_av_seeds = set()
logger.info("running with empty skip_av_seeds")
return skip_av_seeds
@@ -686,13 +684,13 @@ def brozzler_worker(argv=None):
try:
# make list from file
with open(YTDLP_PROXY_ENDPOINTS_FILE) as endpoints:
-ytdlp_proxy_endpoints = [l for l in endpoints.readlines()]
+ytdlp_proxy_endpoints = [line for line in endpoints.readlines()]
if ytdlp_proxy_endpoints:
logger.info(
"running with ytdlp proxy endpoints file",
ytdlp_proxy_endpoints=YTDLP_PROXY_ENDPOINTS_FILE,
)
-except Exception as e:
+except Exception:
ytdlp_proxy_endpoints = []
logger.info("running with empty proxy endpoints file")
return ytdlp_proxy_endpoints
@@ -1032,7 +1030,7 @@ def brozzler_purge(argv=None):
configure_logging(args)

rr = rethinker(args)
-frontier = brozzler.RethinkDbFrontier(rr)
+brozzler.RethinkDbFrontier(rr)
if args.job:
try:
job_id = int(args.job)
@@ -17,9 +17,14 @@ See the License for the specific language governing permissions and
limitations under the License.
"""

+import base64
+import os
import sys

+import doublethink
+import rethinkdb as rdb
import structlog
+import yaml

logger = structlog.get_logger(logger_name=__name__)

@@ -33,14 +38,6 @@ except ImportError as e:
e,
)
sys.exit(1)
-import base64
-import importlib
-import json
-import os

-import doublethink
-import rethinkdb as rdb
-import yaml

r = rdb.RethinkDB()

@@ -285,6 +282,8 @@ def root(path):


try:
+import logging

import gunicorn.app.base
import gunicorn.glogging
from gunicorn.six import iteritems
@@ -18,10 +18,22 @@ See the License for the specific language governing permissions and
limitations under the License.
"""

+import argparse
+import os
+import signal
+import socket
+import socketserver
import sys
+import threading
+import time
+import traceback

+import doublethink
import structlog

+import brozzler
+import brozzler.cli

logger = structlog.get_logger(logger_name=__name__)

try:
@@ -42,19 +54,6 @@ except ImportError as e:
exc_info=True,
)
sys.exit(1)
-import argparse
-import os
-import signal
-import socket
-import socketserver
-import threading
-import time
-import traceback

-import doublethink

-import brozzler
-import brozzler.cli


def _build_arg_parser(argv=None):
@@ -16,10 +16,6 @@ See the License for the specific language governing permissions and
limitations under the License.
"""

-import datetime
-import random
-import time

import doublethink
import rethinkdb as rdb
import structlog
@@ -47,11 +43,11 @@ class RethinkDbFrontier:
db_logger = self.logger.bind(dbname=self.rr.dbname)

dbs = self.rr.db_list().run()
-if not self.rr.dbname in dbs:
+if self.rr.dbname not in dbs:
db_logger.info("creating rethinkdb database")
self.rr.db_create(self.rr.dbname).run()
tables = self.rr.table_list().run()
-if not "sites" in tables:
+if "sites" not in tables:
db_logger.info("creating rethinkdb table 'sites' in database")
self.rr.table_create(
"sites", shards=self.shards, replicas=self.replicas
@@ -60,7 +56,7 @@ class RethinkDbFrontier:
"sites_last_disclaimed", [r.row["status"], r.row["last_disclaimed"]]
).run()
self.rr.table("sites").index_create("job_id").run()
-if not "pages" in tables:
+if "pages" not in tables:
db_logger.info("creating rethinkdb table 'pages' in database")
self.rr.table_create(
"pages", shards=self.shards, replicas=self.replicas
@@ -80,7 +76,7 @@ class RethinkDbFrontier:
"least_hops",
[r.row["site_id"], r.row["brozzle_count"], r.row["hops_from_seed"]],
).run()
-if not "jobs" in tables:
+if "jobs" not in tables:
db_logger.info("creating rethinkdb table 'jobs' in database")
self.rr.table_create(
"jobs", shards=self.shards, replicas=self.replicas
@@ -352,7 +348,6 @@ class RethinkDbFrontier:
site.save()

def _build_fresh_page(self, site, parent_page, url, hops_off=0):
-url_for_scoping = urlcanon.semantic(url)
url_for_crawling = urlcanon.whatwg(url)
hashtag = (url_for_crawling.hash_sign + url_for_crawling.fragment).decode(
"utf-8"
@@ -461,8 +456,8 @@ class RethinkDbFrontier:
# "rethinkdb.errors.ReqlDriverError: Query size (167883036) greater than maximum (134217727) in:"
# there can be many pages and each one can be very large (many videos,
# in and out of scope links, etc)
-l = list(pages.values())
-for batch in (l[i : i + 50] for i in range(0, len(l), 50)):
+pages_list = list(pages.values())
+for batch in (pages_list[i : i + 50] for i in range(0, len(pages_list), 50)):
try:
self.logger.debug("inserting/replacing batch of %s pages", len(batch))
reql = self.rr.table("pages").insert(batch, conflict="replace")
@@ -471,8 +466,8 @@ class RethinkDbFrontier:
'conflict="replace")',
batch,
)
-result = reql.run()
-except Exception as e:
+reql.run()
+except Exception:
self.logger.exception(
"problem inserting/replacing batch of %s pages",
len(batch),
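The comments in the last two hunks above explain the batching: RethinkDB refuses queries larger than 134217727 bytes (about 128 MiB), so pages are inserted in slices of 50. A self-contained sketch of the same chunking idiom, with stand-in data rather than real page documents:

# Stand-alone sketch of the batching idiom used above (generic names, not
# brozzler code): slice a big list into batches small enough to insert.
def chunks(items, size):
    """Yield successive slices of items, each no longer than size."""
    return (items[i : i + size] for i in range(0, len(items), size))

pages_list = [{"id": i} for i in range(173)]  # stand-in for page documents
for batch in chunks(pages_list, 50):
    # real code would pass each batch to an insert with conflict="replace"
    assert len(batch) <= 50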
@@ -19,12 +19,9 @@ limitations under the License.

import base64
import copy
-import datetime
import hashlib
import json
import os
-import re
-import time
import urllib
import uuid
import zlib
@@ -61,7 +58,7 @@ class InvalidJobConf(Exception):
# debugged, I found it here. Maybe there's a better way to see it.
value = validator._errors[0].info[0][0].info[0][0].value
self.errors["bad value"] = value
-except:
+except: # noqa: E722
value = None


@@ -122,10 +119,10 @@ def new_job(frontier, job_conf):
# rethinkdb.errors.ReqlDriverError: Query size (167883036) greater than maximum (134217727) in:
for batch in (pages[i : i + 500] for i in range(0, len(pages), 500)):
logger.info("inserting batch of %s pages", len(batch))
-result = frontier.rr.table("pages").insert(batch).run()
+frontier.rr.table("pages").insert(batch).run()
for batch in (sites[i : i + 100] for i in range(0, len(sites), 100)):
logger.info("inserting batch of %s sites", len(batch))
-result = frontier.rr.table("sites").insert(batch).run()
+frontier.rr.table("sites").insert(batch).run()
logger.info("job fully started", job_id=job.id)

return job
@@ -200,9 +197,9 @@ class Job(doublethink.Document, ElapsedMixIn):
table = "jobs"

def populate_defaults(self):
-if not "status" in self:
+if "status" not in self:
self.status = "ACTIVE"
-if not "starts_and_stops" in self:
+if "starts_and_stops" not in self:
if self.get("started"): # backward compatibility
self.starts_and_stops = [
{"start": self.get("started"), "stop": self.get("finished")}
@@ -229,28 +226,28 @@ class Site(doublethink.Document, ElapsedMixIn):
table = "sites"

def populate_defaults(self):
-if not "status" in self:
+if "status" not in self:
self.status = "ACTIVE"
-if not "claimed" in self:
+if "claimed" not in self:
self.claimed = False
-if not "last_disclaimed" in self:
+if "last_disclaimed" not in self:
self.last_disclaimed = brozzler.EPOCH_UTC
-if not "last_claimed" in self:
+if "last_claimed" not in self:
self.last_claimed = brozzler.EPOCH_UTC
-if not "scope" in self:
+if "scope" not in self:
self.scope = {}
-if not "skip_ytdlp" in self:
+if "skip_ytdlp" not in self:
self.skip_ytdlp = None

# backward compatibility
if "surt" in self.scope:
-if not "accepts" in self.scope:
+if "accepts" not in self.scope:
self.scope["accepts"] = []
self.scope["accepts"].append({"surt": self.scope["surt"]})
del self.scope["surt"]

# backward compatibility
-if "max_hops_off_surt" in self.scope and not "max_hops_off" in self.scope:
+if "max_hops_off_surt" in self.scope and "max_hops_off" not in self.scope:
self.scope["max_hops_off"] = self.scope["max_hops_off_surt"]
if "max_hops_off_surt" in self.scope:
del self.scope["max_hops_off_surt"]
@@ -260,7 +257,7 @@ class Site(doublethink.Document, ElapsedMixIn):
brozzler.site_surt_canon(self.seed).ssurt().decode("ascii")
)

-if not "starts_and_stops" in self:
+if "starts_and_stops" not in self:
if self.get("start_time"): # backward compatibility
self.starts_and_stops = [
{"start": self.get("start_time"), "stop": None}
@@ -275,7 +272,7 @@ class Site(doublethink.Document, ElapsedMixIn):
return 'Site({"id":"%s","seed":"%s",...})' % (self.id, self.seed)

def _accept_ssurt_if_not_redundant(self, ssurt):
-if not "accepts" in self.scope:
+if "accepts" not in self.scope:
self.scope["accepts"] = []
simple_rule_ssurts = (
rule["ssurt"]
@@ -334,7 +331,7 @@ class Site(doublethink.Document, ElapsedMixIn):
if not isinstance(url, urlcanon.ParsedUrl):
url = urlcanon.semantic(url)

-if not url.scheme in (b"http", b"https"):
+if url.scheme not in (b"http", b"https"):
# XXX doesn't belong here maybe (where? worker ignores unknown
# schemes?)
return False
@@ -390,31 +387,31 @@ class Page(doublethink.Document):
return hashlib.sha1(digest_this.encode("utf-8")).hexdigest()

def populate_defaults(self):
-if not "retry_after" in self:
+if "retry_after" not in self:
self.retry_after = None
-if not "failed_attempts" in self:
+if "failed_attempts" not in self:
self.failed_attempts = 0
-if not "hops_from_seed" in self:
+if "hops_from_seed" not in self:
self.hops_from_seed = 0
-if not "hop_path" in self:
+if "hop_path" not in self:
self.hop_path = None
-if not "via_page_url" in self:
+if "via_page_url" not in self:
self.via_page_url = None
-if not "brozzle_count" in self:
+if "brozzle_count" not in self:
self.brozzle_count = 0
-if not "claimed" in self:
+if "claimed" not in self:
self.claimed = False
-if "hops_off_surt" in self and not "hops_off" in self:
+if "hops_off_surt" in self and "hops_off" not in self:
self.hops_off = self.hops_off_surt
if "hops_off_surt" in self:
del self["hops_off_surt"]
-if not "hops_off" in self:
+if "hops_off" not in self:
self.hops_off = 0
-if not "needs_robots_check" in self:
+if "needs_robots_check" not in self:
self.needs_robots_check = False
-if not "priority" in self:
+if "priority" not in self:
self.priority = self._calc_priority()
-if not "id" in self:
+if "id" not in self:
self.id = self.compute_id(self.site_id, self.url)

def __str__(self):
@@ -18,9 +18,16 @@ See the License for the specific language governing permissions and
limitations under the License.
"""

+import argparse
+import json
import sys

+import doublethink
+import rethinkdb as rdb
import structlog
+import urlcanon

+import brozzler

logger = structlog.get_logger(logger_name=__name__)

@@ -40,14 +47,7 @@ except ImportError as e:
e,
)
sys.exit(1)
-import argparse
-import json

-import doublethink
-import rethinkdb as rdb
-import urlcanon

-import brozzler

r = rdb.RethinkDB()

@@ -137,7 +137,7 @@ class TheGoodUrlCanonicalizer(object):
key = urlcanon.semantic(url).surt().decode("ascii")
# logging.debug('%s -> %s', url, key)
return key
-except Exception as e:
+except Exception:
return url

def replace_default_canonicalizer():
@@ -221,18 +221,9 @@ def support_in_progress_warcs():

class SomeWbUrl(pywb.rewrite.wburl.WbUrl):
def __init__(self, orig_url):
-import re

import six
from pywb.rewrite.wburl import WbUrl
-from pywb.utils.loaders import to_native_str
-from six.moves.urllib.parse import (
-quote,
-quote_plus,
-unquote_plus,
-urlsplit,
-urlunsplit,
-)
+from six.moves.urllib.parse import quote

pywb.rewrite.wburl.BaseWbUrl.__init__(self)
@@ -320,7 +311,6 @@ def _fuzzy_query_call(self, query):
urlkey = to_native_str(query.key, "utf-8")
url = query.url
filter_ = query.filters
-output = query.output

for rule in self.rules.iter_matching(urlkey):
m = rule.regex.search(urlkey)
@@ -71,7 +71,7 @@ _robots_caches = {} # {site_id:reppy.cache.RobotsCache}


def _robots_cache(site, proxy=None):
-if not site.id in _robots_caches:
+if site.id not in _robots_caches:
req_sesh = _SessionRaiseOn420()
req_sesh.verify = False # ignore cert errors
if proxy:
@@ -21,9 +21,7 @@ limitations under the License.
import datetime
import io
import json
-import random
import socket
-import tempfile
import threading
import time
import urllib.request
@@ -99,9 +97,13 @@ class BrozzlerWorker:
self._skip_visit_hashtags = skip_visit_hashtags
self._skip_youtube_dl = skip_youtube_dl

+# TODO try using importlib.util.find_spec to test for dependency
+# presence rather than try/except on import.
+# See https://docs.astral.sh/ruff/rules/unused-import/#example

# We definitely shouldn't ytdlp if the optional extra is missing
try:
-import yt_dlp
+import yt_dlp # noqa: F401
except ImportError:
self.logger.info(
"optional yt-dlp extra not installed; setting skip_youtube_dl to True"
@@ -200,7 +202,7 @@ class BrozzlerWorker:
response = requests.get("http://%s/status" % self._proxy)
status = json.loads(response.text)
self._proxy_is_warcprox = status["role"] == "warcprox"
-except Exception as e:
+except Exception:
self._proxy_is_warcprox = False
self.logger.info(
"%s %s warcprox",
@@ -348,13 +350,13 @@ class BrozzlerWorker:
)
metrics.brozzler_ydl_urls_checked.inc(1)
outlinks.update(ydl_outlinks)
-except brozzler.ReachedLimit as e:
+except brozzler.ReachedLimit:
raise
except brozzler.ShutdownRequested:
raise
except brozzler.ProxyError:
raise
-except brozzler.VideoExtractorError as e:
+except brozzler.VideoExtractorError:
self.logger.exception("error extracting video info")
except Exception as e:
if (
@@ -391,9 +393,9 @@ class BrozzlerWorker:
timeout=self.HEADER_REQUEST_TIMEOUT,
) as r:
return r.headers
-except requests.exceptions.Timeout as e:
+except requests.exceptions.Timeout:
url_logger.warning("Timed out trying to get headers", exc_info=True)
-except requests.exceptions.RequestException as e:
+except requests.exceptions.RequestException:
url_logger.warning("Failed to get headers", exc_info=True)
return {}

@@ -469,7 +471,7 @@ class BrozzlerWorker:
if "content-range" in response_headers:
video["content-range"] = response_headers["content-range"]
self.logger.debug("embedded video", video=video)
-if not "videos" in page:
+if "videos" not in page:
page.videos = []
page.videos.append(video)

@@ -598,13 +600,13 @@ class BrozzlerWorker:
site_logger.info("no pages left for site")
except brozzler.ReachedLimit as e:
self._frontier.reached_limit(site, e)
-except brozzler.ReachedTimeLimit as e:
+except brozzler.ReachedTimeLimit:
self._frontier.finished(site, "FINISHED_TIME_LIMIT")
except brozzler.CrawlStopped:
self._frontier.finished(site, "FINISHED_STOP_REQUESTED")
# except brozzler.browser.BrowsingAborted:
# self.logger.info("{} shut down".format(browser))
-except brozzler.ProxyError as e:
+except brozzler.ProxyError:
if self._warcprox_auto:
self.logger.exception(
"proxy error, will try to choose a "
@@ -676,7 +678,7 @@ class BrozzlerWorker:
try:
self.status_info = self._service_registry.heartbeat(status_info)
self.logger.debug("status in service registry", status=self.status_info)
-except r.ReqlError as e:
+except r.ReqlError:
self.logger.exception(
"failed to send heartbeat and update service registry",
info=status_info,
@@ -748,11 +750,11 @@ class BrozzlerWorker:
time.sleep(0.5)

self.logger.warn("shutdown requested")
-except r.ReqlError as e:
+except r.ReqlError:
self.logger.exception("caught rethinkdb exception, will try to proceed")
except brozzler.ShutdownRequested:
self.logger.info("shutdown requested")
-except:
+except: # noqa: E722
self.logger.critical(
"thread exiting due to unexpected exception", exc_info=True
)
@@ -760,7 +762,7 @@ class BrozzlerWorker:
if self._service_registry and hasattr(self, "status_info"):
try:
self._service_registry.unregister(self.status_info["id"])
-except:
+except: # noqa: E722
self.logger.exception("failed to unregister from service registry")

self.logger.info(
@@ -101,7 +101,7 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
if result_type in ("url", "url_transparent"):
if "extraction_depth" in extra_info:
self.logger.info(
-f"Following redirect",
+"Following redirect",
redirect_url=ie_result["url"],
extraction_depth=extra_info["extraction_depth"],
)
@@ -136,7 +136,7 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
# use it later to extract the watch pages as outlinks.
try:
ie_result["entries_no_dl"] = list(ie_result["entries"])
-except Exception as e:
+except Exception:
extract_context.warning(
"failed to unroll entries ie_result['entries']?",
exc_info=True,
@@ -166,7 +166,7 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
import magic

mimetype = magic.from_file(info_dict["filepath"], mime=True)
-except ImportError as e:
+except ImportError:
mimetype = "video/%s" % info_dict["ext"]
self.logger.warning(
"guessing mimetype due to error",
@@ -236,7 +236,7 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
)
site.last_claimed = doublethink.utcnow()
site.save()
-except:
+except: # noqa: E722
worker.logger.debug(
"problem heartbeating site.last_claimed site",
id=site.id,
@@ -316,7 +316,7 @@ def _remember_videos(page, pushed_videos=None):
"""
Saves info about videos captured by yt-dlp in `page.videos`.
"""
-if not "videos" in page:
+if "videos" not in page:
page.videos = []
for pushed_video in pushed_videos or []:
video = {
@@ -351,7 +351,7 @@ def _try_youtube_dl(worker, ydl, site, page):
)
metrics.brozzler_ydl_extract_successes.labels(ydl.is_youtube_host).inc(1)
break
-except brozzler.ShutdownRequested as e:
+except brozzler.ShutdownRequested:
raise
except Exception as e:
if (
@@ -110,7 +110,7 @@ def test_httpd(httpd):
of the same url return the same payload, proving it can be used to test
deduplication.
"""
-payload1 = content2 = None
+payload1 = None
url = "http://localhost:%s/site1/file1.txt" % httpd.server_port
with urllib.request.urlopen(url) as response:
assert response.status == 200
@@ -175,7 +175,6 @@ def test_420(httpd):

def test_js_dialogs(httpd):
chrome_exe = brozzler.suggest_default_chrome_exe()
-url = "http://localhost:%s/site4/alert.html" % httpd.server_port
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
# before commit d2ed6b97a24 these would hang and eventually raise
# brozzler.browser.BrowsingTimeout, which would cause this test to fail
@@ -40,11 +40,11 @@ def cli_commands():
commands = set(console_scripts().keys())
commands.remove("brozzler-wayback")
try:
-import gunicorn
+import gunicorn # noqa: F401
except ImportError:
commands.remove("brozzler-dashboard")
try:
-import pywb
+import pywb # noqa: F401
except ImportError:
commands.remove("brozzler-easy")
return commands
@@ -23,7 +23,6 @@ import http.server
import os
import socket
import subprocess
-import sys
import threading
import time
import urllib.request
@@ -47,7 +46,7 @@ def _local_address():
try:
s.connect(("10.255.255.255", 1)) # ip doesn't need to be reachable
return s.getsockname()[0]
-except:
+except: # noqa: E722
return "127.0.0.1"
finally:
s.close()
@@ -148,7 +147,7 @@ def test_httpd(httpd):
of the same url return the same payload, proving it can be used to test
deduplication.
"""
-payload1 = content2 = None
+payload1 = None
url = make_url(httpd, "/site1/file1.txt")
with urllib.request.urlopen(url) as response:
assert response.status == 200
@@ -351,8 +350,8 @@ def test_warcprox_auto(httpd):


def test_proxy_conflict():
-with pytest.raises(AssertionError) as excinfo:
-worker = brozzler.worker.BrozzlerWorker(
+with pytest.raises(AssertionError):
+brozzler.worker.BrozzlerWorker(
None, None, warcprox_auto=True, proxy="localhost:12345"
)

@@ -523,7 +522,6 @@ def test_login(httpd):

# take a look at the captures table
time.sleep(2) # in case warcprox hasn't finished processing urls
-robots_url = make_url(httpd, "/robots.txt")
captures = list(
rr.table("captures").filter({"test_id": test_id}).order_by("timestamp").run()
)
@@ -730,7 +728,6 @@ def test_redirect_hashtags(httpd):


def test_stop_crawl(httpd):
-test_id = "test_stop_crawl_job-%s" % datetime.datetime.utcnow().isoformat()
rr = doublethink.Rethinker("localhost", db="brozzler")
frontier = brozzler.RethinkDbFrontier(rr)

@@ -804,7 +801,6 @@ def test_warcprox_outage_resiliency(httpd):
"""
rr = doublethink.Rethinker("localhost", db="brozzler")
frontier = brozzler.RethinkDbFrontier(rr)
-svcreg = doublethink.ServiceRegistry(rr)

# run two instances of warcprox
opts = warcprox.Options()
@@ -836,7 +832,7 @@ def test_warcprox_outage_resiliency(httpd):
# the system, if any
try:
stop_service("warcprox")
-except Exception as e:
+except Exception:
logger.warning("problem stopping warcprox service: %s", exc_info=True)

# queue the site for brozzling
@@ -917,7 +913,6 @@ def test_warcprox_outage_resiliency(httpd):


def test_time_limit(httpd):
-test_id = "test_time_limit-%s" % datetime.datetime.utcnow().isoformat()
rr = doublethink.Rethinker("localhost", db="brozzler")
frontier = brozzler.RethinkDbFrontier(rr)

@@ -928,7 +923,6 @@ def test_time_limit(httpd):

sites = list(frontier.job_sites(job.id))
assert len(sites) == 1
-site = sites[0]

# time limit should be enforced pretty soon
start = time.time()
@@ -986,7 +980,7 @@ def test_ydl_stitching(httpd):
time.sleep(2) # in case warcprox hasn't finished processing urls
# take a look at the captures table
captures = list(rr.table("captures").filter({"test_id": test_id}).run())
-l = [c for c in captures if c["url"] == stitched_url]
+l = [c for c in captures if c["url"] == stitched_url] # noqa: E741
assert len(l) == 1
c = l[0]
assert c["filename"].startswith("test_ydl_stitching")
@@ -559,7 +559,7 @@ def test_parent_url_scoping():
assert parent_page.outlinks["accepted"] == outlinks

# parent page redirect_url matches accept parent_url_regex
-parent_page_c = brozzler.Page(
+brozzler.Page(
rr,
{
"site_id": site.id,
@@ -606,7 +606,7 @@ def test_parent_url_scoping():
assert parent_page.outlinks["accepted"] == []

# parent page redirect_url matches block parent_url_regex
-parent_page_c = brozzler.Page(
+brozzler.Page(
rr,
{
"site_id": site.id,
@@ -659,10 +659,10 @@ def test_completed_page():
]
}
assert page.brozzle_count == 1
-assert page.claimed == False
+assert page.claimed is False
page.refresh()
assert page.brozzle_count == 1
-assert page.claimed == False
+assert page.claimed is False

# redirect that doesn't change scope surt because destination is covered by
# the original surt
@@ -686,10 +686,10 @@ def test_completed_page():
site.refresh()
assert site.scope == {"accepts": [{"ssurt": "com,example,//http:/a/"}]}
assert page.brozzle_count == 1
-assert page.claimed == False
+assert page.claimed is False
page.refresh()
assert page.brozzle_count == 1
-assert page.claimed == False
+assert page.claimed is False

# redirect that doesn't change scope surt because page is not the seed page
site = brozzler.Site(rr, {"seed": "http://example.com/a/"})
@@ -712,10 +712,10 @@ def test_completed_page():
site.refresh()
assert site.scope == {"accepts": [{"ssurt": "com,example,//http:/a/"}]}
assert page.brozzle_count == 1
-assert page.claimed == False
+assert page.claimed is False
page.refresh()
assert page.brozzle_count == 1
-assert page.claimed == False
+assert page.claimed is False


def test_seed_page():
@@ -931,7 +931,7 @@ def test_max_claimed_sites():
claimed_sites = frontier.claim_sites(3)
assert len(claimed_sites) == 2
with pytest.raises(brozzler.NothingToClaim):
-claimed_site = frontier.claim_sites(3)
+frontier.claim_sites(3)

# clean slate for the next one
rr.table("jobs").delete().run()
@@ -1074,7 +1074,7 @@ def test_max_hops_off():
site.refresh() # get it back from the db

# renamed this param
-assert not "max_hops_off_surt" in site.scope
+assert "max_hops_off_surt" not in site.scope
assert site.scope["max_hops_off"] == 1

seed_page = frontier.seed_page(site.id)
@@ -1109,7 +1109,7 @@ def test_max_hops_off():
assert len(pages) == 4
assert pages[0].url == "http://example.com/"
assert pages[0].hops_off == 0
-assert not "hops_off_surt" in pages[0]
+assert "hops_off_surt" not in pages[0]
assert set(pages[0].outlinks["accepted"]) == {
"https://example.com/toot",
"http://foo.org/",
@@ -21,7 +21,6 @@ import datetime
import http.server
import os
import socket
-import sys
import tempfile
import threading
import time
@@ -29,7 +28,6 @@ import uuid
from unittest import mock

import pytest
-import requests
import yaml

import brozzler
@@ -291,7 +289,7 @@ def test_proxy_down():
)

# youtube-dl fetch
-with tempfile.TemporaryDirectory(prefix="brzl-ydl-") as tempdir:
+with tempfile.TemporaryDirectory(prefix="brzl-ydl-"):
with pytest.raises(brozzler.ProxyError):
brozzler.ydl.do_youtube_dl(worker, site, page)

@@ -315,7 +313,7 @@ def test_start_stop_backwards_compat():
assert len(site.starts_and_stops) == 1
assert site.starts_and_stops[0]["start"]
assert site.starts_and_stops[0]["stop"] is None
-assert not "start_time" in site
+assert "start_time" not in site

site = brozzler.Site(
None,
@@ -324,13 +322,13 @@ def test_start_stop_backwards_compat():
assert len(site.starts_and_stops) == 1
assert site.starts_and_stops[0]["start"] == datetime.datetime(2017, 1, 1)
assert site.starts_and_stops[0]["stop"] is None
-assert not "start_time" in site
+assert "start_time" not in site

job = brozzler.Job(None, {"seeds": [{"url": "https://example.com/"}]})
assert job.starts_and_stops[0]["start"]
assert job.starts_and_stops[0]["stop"] is None
-assert not "started" in job
-assert not "finished" in job
+assert "started" not in job
+assert "finished" not in job

job = brozzler.Job(
None,
@@ -342,8 +340,8 @@ def test_start_stop_backwards_compat():
)
assert job.starts_and_stops[0]["start"] == datetime.datetime(2017, 1, 1)
assert job.starts_and_stops[0]["stop"] == datetime.datetime(2017, 1, 2)
-assert not "started" in job
-assert not "finished" in job
+assert "started" not in job
+assert "finished" not in job


class Exception1(Exception):
@@ -452,9 +450,9 @@ def test_thread_raise_second_with_block():
with brozzler.thread_accept_exceptions():
time.sleep(2)
return # test fails
-except Exception1 as e:
+except Exception1:
pass
-except:
+except: # noqa: E722
return # fail test

try:
@@ -32,7 +32,7 @@ import sys

try:
from shlex import quote
-except:
+except: # noqa: E722
from pipes import quote
