push federation retry limiter down to matrixfederationclient

rather than having to instrument everywhere we make a federation call,
make the MatrixFederationHttpClient manage the retry limiter.
This commit is contained in:
Richard van der Hoff 2017-03-23 00:12:21 +00:00
parent ad8a26e361
commit 4bd597d9fc
8 changed files with 283 additions and 290 deletions

View File

@ -15,7 +15,6 @@
from synapse.crypto.keyclient import fetch_server_key from synapse.crypto.keyclient import fetch_server_key
from synapse.api.errors import SynapseError, Codes from synapse.api.errors import SynapseError, Codes
from synapse.util.retryutils import get_retry_limiter
from synapse.util import unwrapFirstError from synapse.util import unwrapFirstError
from synapse.util.async import ObservableDeferred from synapse.util.async import ObservableDeferred
from synapse.util.logcontext import ( from synapse.util.logcontext import (
@ -363,30 +362,24 @@ class Keyring(object):
def get_keys_from_server(self, server_name_and_key_ids): def get_keys_from_server(self, server_name_and_key_ids):
@defer.inlineCallbacks @defer.inlineCallbacks
def get_key(server_name, key_ids): def get_key(server_name, key_ids):
limiter = yield get_retry_limiter( keys = None
server_name, try:
self.clock, keys = yield self.get_server_verify_key_v2_direct(
self.store, server_name, key_ids
) )
with limiter: except Exception as e:
keys = None logger.info(
try: "Unable to get key %r for %r directly: %s %s",
keys = yield self.get_server_verify_key_v2_direct( key_ids, server_name,
server_name, key_ids type(e).__name__, str(e.message),
) )
except Exception as e:
logger.info(
"Unable to get key %r for %r directly: %s %s",
key_ids, server_name,
type(e).__name__, str(e.message),
)
if not keys: if not keys:
keys = yield self.get_server_verify_key_v1_direct( keys = yield self.get_server_verify_key_v1_direct(
server_name, key_ids server_name, key_ids
) )
keys = {server_name: keys} keys = {server_name: keys}
defer.returnValue(keys) defer.returnValue(keys)

View File

@ -29,7 +29,7 @@ from synapse.util.logcontext import preserve_fn, preserve_context_over_deferred
from synapse.events import FrozenEvent, builder from synapse.events import FrozenEvent, builder
import synapse.metrics import synapse.metrics
from synapse.util.retryutils import get_retry_limiter, NotRetryingDestination from synapse.util.retryutils import NotRetryingDestination
import copy import copy
import itertools import itertools
@ -234,31 +234,24 @@ class FederationClient(FederationBase):
continue continue
try: try:
limiter = yield get_retry_limiter( transaction_data = yield self.transport_layer.get_event(
destination, destination, event_id, timeout=timeout,
self._clock,
self.store,
) )
with limiter: logger.debug("transaction_data %r", transaction_data)
transaction_data = yield self.transport_layer.get_event(
destination, event_id, timeout=timeout,
)
logger.debug("transaction_data %r", transaction_data) pdu_list = [
self.event_from_pdu_json(p, outlier=outlier)
for p in transaction_data["pdus"]
]
pdu_list = [ if pdu_list and pdu_list[0]:
self.event_from_pdu_json(p, outlier=outlier) pdu = pdu_list[0]
for p in transaction_data["pdus"]
]
if pdu_list and pdu_list[0]: # Check signatures are correct.
pdu = pdu_list[0] signed_pdu = yield self._check_sigs_and_hashes([pdu])[0]
# Check signatures are correct. break
signed_pdu = yield self._check_sigs_and_hashes([pdu])[0]
break
pdu_attempts[destination] = now pdu_attempts[destination] = now

View File

@ -12,7 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import datetime
from twisted.internet import defer from twisted.internet import defer
@ -22,9 +22,7 @@ from .units import Transaction, Edu
from synapse.api.errors import HttpResponseException from synapse.api.errors import HttpResponseException
from synapse.util.async import run_on_reactor from synapse.util.async import run_on_reactor
from synapse.util.logcontext import preserve_context_over_fn from synapse.util.logcontext import preserve_context_over_fn
from synapse.util.retryutils import ( from synapse.util.retryutils import NotRetryingDestination
get_retry_limiter, NotRetryingDestination,
)
from synapse.util.metrics import measure_func from synapse.util.metrics import measure_func
from synapse.types import get_domain_from_id from synapse.types import get_domain_from_id
from synapse.handlers.presence import format_user_presence_state from synapse.handlers.presence import format_user_presence_state
@ -312,13 +310,6 @@ class TransactionQueue(object):
yield run_on_reactor() yield run_on_reactor()
while True: while True:
limiter = yield get_retry_limiter(
destination,
self.clock,
self.store,
backoff_on_404=True, # If we get a 404 the other side has gone
)
device_message_edus, device_stream_id, dev_list_id = ( device_message_edus, device_stream_id, dev_list_id = (
yield self._get_new_device_messages(destination) yield self._get_new_device_messages(destination)
) )
@ -374,7 +365,6 @@ class TransactionQueue(object):
success = yield self._send_new_transaction( success = yield self._send_new_transaction(
destination, pending_pdus, pending_edus, pending_failures, destination, pending_pdus, pending_edus, pending_failures,
limiter=limiter,
) )
if success: if success:
# Remove the acknowledged device messages from the database # Remove the acknowledged device messages from the database
@ -392,12 +382,24 @@ class TransactionQueue(object):
self.last_device_list_stream_id_by_dest[destination] = dev_list_id self.last_device_list_stream_id_by_dest[destination] = dev_list_id
else: else:
break break
except NotRetryingDestination: except NotRetryingDestination as e:
logger.debug( logger.debug(
"TX [%s] not ready for retry yet - " "TX [%s] not ready for retry yet (next retry at %s) - "
"dropping transaction for now", "dropping transaction for now",
destination, destination,
datetime.datetime.fromtimestamp(
(e.retry_last_ts + e.retry_interval) / 1000.0
),
) )
except Exception as e:
logger.warn(
"TX [%s] Failed to send transaction: %s",
destination,
e,
)
for p in pending_pdus:
logger.info("Failed to send event %s to %s", p.event_id,
destination)
finally: finally:
# We want to be *very* sure we delete this after we stop processing # We want to be *very* sure we delete this after we stop processing
self.pending_transactions.pop(destination, None) self.pending_transactions.pop(destination, None)
@ -437,7 +439,7 @@ class TransactionQueue(object):
@measure_func("_send_new_transaction") @measure_func("_send_new_transaction")
@defer.inlineCallbacks @defer.inlineCallbacks
def _send_new_transaction(self, destination, pending_pdus, pending_edus, def _send_new_transaction(self, destination, pending_pdus, pending_edus,
pending_failures, limiter): pending_failures):
# Sort based on the order field # Sort based on the order field
pending_pdus.sort(key=lambda t: t[1]) pending_pdus.sort(key=lambda t: t[1])
@ -447,132 +449,104 @@ class TransactionQueue(object):
success = True success = True
logger.debug("TX [%s] _attempt_new_transaction", destination)
txn_id = str(self._next_txn_id)
logger.debug(
"TX [%s] {%s} Attempting new transaction"
" (pdus: %d, edus: %d, failures: %d)",
destination, txn_id,
len(pdus),
len(edus),
len(failures)
)
logger.debug("TX [%s] Persisting transaction...", destination)
transaction = Transaction.create_new(
origin_server_ts=int(self.clock.time_msec()),
transaction_id=txn_id,
origin=self.server_name,
destination=destination,
pdus=pdus,
edus=edus,
pdu_failures=failures,
)
self._next_txn_id += 1
yield self.transaction_actions.prepare_to_send(transaction)
logger.debug("TX [%s] Persisted transaction", destination)
logger.info(
"TX [%s] {%s} Sending transaction [%s],"
" (PDUs: %d, EDUs: %d, failures: %d)",
destination, txn_id,
transaction.transaction_id,
len(pdus),
len(edus),
len(failures),
)
# Actually send the transaction
# FIXME (erikj): This is a bit of a hack to make the Pdu age
# keys work
def json_data_cb():
data = transaction.get_dict()
now = int(self.clock.time_msec())
if "pdus" in data:
for p in data["pdus"]:
if "age_ts" in p:
unsigned = p.setdefault("unsigned", {})
unsigned["age"] = now - int(p["age_ts"])
del p["age_ts"]
return data
try: try:
logger.debug("TX [%s] _attempt_new_transaction", destination) response = yield self.transport_layer.send_transaction(
transaction, json_data_cb
txn_id = str(self._next_txn_id)
logger.debug(
"TX [%s] {%s} Attempting new transaction"
" (pdus: %d, edus: %d, failures: %d)",
destination, txn_id,
len(pdus),
len(edus),
len(failures)
) )
code = 200
logger.debug("TX [%s] Persisting transaction...", destination) if response:
for e_id, r in response.get("pdus", {}).items():
transaction = Transaction.create_new( if "error" in r:
origin_server_ts=int(self.clock.time_msec()), logger.warn(
transaction_id=txn_id, "Transaction returned error for %s: %s",
origin=self.server_name, e_id, r,
destination=destination,
pdus=pdus,
edus=edus,
pdu_failures=failures,
)
self._next_txn_id += 1
yield self.transaction_actions.prepare_to_send(transaction)
logger.debug("TX [%s] Persisted transaction", destination)
logger.info(
"TX [%s] {%s} Sending transaction [%s],"
" (PDUs: %d, EDUs: %d, failures: %d)",
destination, txn_id,
transaction.transaction_id,
len(pdus),
len(edus),
len(failures),
)
with limiter:
# Actually send the transaction
# FIXME (erikj): This is a bit of a hack to make the Pdu age
# keys work
def json_data_cb():
data = transaction.get_dict()
now = int(self.clock.time_msec())
if "pdus" in data:
for p in data["pdus"]:
if "age_ts" in p:
unsigned = p.setdefault("unsigned", {})
unsigned["age"] = now - int(p["age_ts"])
del p["age_ts"]
return data
try:
response = yield self.transport_layer.send_transaction(
transaction, json_data_cb
)
code = 200
if response:
for e_id, r in response.get("pdus", {}).items():
if "error" in r:
logger.warn(
"Transaction returned error for %s: %s",
e_id, r,
)
except HttpResponseException as e:
code = e.code
response = e.response
if e.code in (401, 404, 429) or 500 <= e.code:
logger.info(
"TX [%s] {%s} got %d response",
destination, txn_id, code
) )
raise e except HttpResponseException as e:
code = e.code
response = e.response
if e.code in (401, 404, 429) or 500 <= e.code:
logger.info( logger.info(
"TX [%s] {%s} got %d response", "TX [%s] {%s} got %d response",
destination, txn_id, code destination, txn_id, code
) )
raise e
logger.debug("TX [%s] Sent transaction", destination) logger.info(
logger.debug("TX [%s] Marking as delivered...", destination) "TX [%s] {%s} got %d response",
destination, txn_id, code
)
yield self.transaction_actions.delivered( logger.debug("TX [%s] Sent transaction", destination)
transaction, code, response logger.debug("TX [%s] Marking as delivered...", destination)
)
logger.debug("TX [%s] Marked as delivered", destination) yield self.transaction_actions.delivered(
transaction, code, response
)
if code != 200: logger.debug("TX [%s] Marked as delivered", destination)
for p in pdus:
logger.info(
"Failed to send event %s to %s", p.event_id, destination
)
success = False
except RuntimeError as e:
# We capture this here as there as nothing actually listens
# for this finishing functions deferred.
logger.warn(
"TX [%s] Problem in _attempt_transaction: %s",
destination,
e,
)
success = False
if code != 200:
for p in pdus: for p in pdus:
logger.info("Failed to send event %s to %s", p.event_id, destination) logger.info(
except Exception as e: "Failed to send event %s to %s", p.event_id, destination
# We capture this here as there as nothing actually listens )
# for this finishing functions deferred.
logger.warn(
"TX [%s] Problem in _attempt_transaction: %s",
destination,
e,
)
success = False success = False
for p in pdus:
logger.info("Failed to send event %s to %s", p.event_id, destination)
defer.returnValue(success) defer.returnValue(success)

View File

@ -163,6 +163,7 @@ class TransportLayerClient(object):
data=json_data, data=json_data,
json_data_callback=json_data_callback, json_data_callback=json_data_callback,
long_retries=True, long_retries=True,
backoff_on_404=True, # If we get a 404 the other side has gone
) )
logger.debug( logger.debug(

View File

@ -22,7 +22,7 @@ from twisted.internet import defer
from synapse.api.errors import SynapseError, CodeMessageException from synapse.api.errors import SynapseError, CodeMessageException
from synapse.types import get_domain_from_id from synapse.types import get_domain_from_id
from synapse.util.logcontext import preserve_fn, preserve_context_over_deferred from synapse.util.logcontext import preserve_fn, preserve_context_over_deferred
from synapse.util.retryutils import get_retry_limiter, NotRetryingDestination from synapse.util.retryutils import NotRetryingDestination
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -121,15 +121,11 @@ class E2eKeysHandler(object):
def do_remote_query(destination): def do_remote_query(destination):
destination_query = remote_queries_not_in_cache[destination] destination_query = remote_queries_not_in_cache[destination]
try: try:
limiter = yield get_retry_limiter( remote_result = yield self.federation.query_client_keys(
destination, self.clock, self.store destination,
{"device_keys": destination_query},
timeout=timeout
) )
with limiter:
remote_result = yield self.federation.query_client_keys(
destination,
{"device_keys": destination_query},
timeout=timeout
)
for user_id, keys in remote_result["device_keys"].items(): for user_id, keys in remote_result["device_keys"].items():
if user_id in destination_query: if user_id in destination_query:
@ -239,18 +235,14 @@ class E2eKeysHandler(object):
def claim_client_keys(destination): def claim_client_keys(destination):
device_keys = remote_queries[destination] device_keys = remote_queries[destination]
try: try:
limiter = yield get_retry_limiter( remote_result = yield self.federation.claim_client_keys(
destination, self.clock, self.store destination,
{"one_time_keys": device_keys},
timeout=timeout
) )
with limiter: for user_id, keys in remote_result["one_time_keys"].items():
remote_result = yield self.federation.claim_client_keys( if user_id in device_keys:
destination, json_result[user_id] = keys
{"one_time_keys": device_keys},
timeout=timeout
)
for user_id, keys in remote_result["one_time_keys"].items():
if user_id in device_keys:
json_result[user_id] = keys
except CodeMessageException as e: except CodeMessageException as e:
failures[destination] = { failures[destination] = {
"status": e.code, "message": e.message "status": e.code, "message": e.message

View File

@ -12,8 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import synapse.util.retryutils
from twisted.internet import defer, reactor, protocol from twisted.internet import defer, reactor, protocol
from twisted.internet.error import DNSLookupError from twisted.internet.error import DNSLookupError
from twisted.web.client import readBody, HTTPConnectionPool, Agent from twisted.web.client import readBody, HTTPConnectionPool, Agent
@ -94,6 +93,7 @@ class MatrixFederationHttpClient(object):
reactor, MatrixFederationEndpointFactory(hs), pool=pool reactor, MatrixFederationEndpointFactory(hs), pool=pool
) )
self.clock = hs.get_clock() self.clock = hs.get_clock()
self._store = hs.get_datastore()
self.version_string = hs.version_string self.version_string = hs.version_string
self._next_id = 1 self._next_id = 1
@ -106,133 +106,143 @@ class MatrixFederationHttpClient(object):
def _request(self, destination, method, path, def _request(self, destination, method, path,
body_callback, headers_dict={}, param_bytes=b"", body_callback, headers_dict={}, param_bytes=b"",
query_bytes=b"", retry_on_dns_fail=True, query_bytes=b"", retry_on_dns_fail=True,
timeout=None, long_retries=False): timeout=None, long_retries=False, backoff_on_404=False):
""" Creates and sends a request to the given server """ Creates and sends a request to the given server
Args: Args:
destination (str): The remote server to send the HTTP request to. destination (str): The remote server to send the HTTP request to.
method (str): HTTP method method (str): HTTP method
path (str): The HTTP path path (str): The HTTP path
backoff_on_404 (bool): Back off if we get a 404
Returns: Returns:
Deferred: resolves with the http response object on success. Deferred: resolves with the http response object on success.
Fails with ``HTTPRequestException``: if we get an HTTP response Fails with ``HTTPRequestException``: if we get an HTTP response
code >= 300. code >= 300.
Fails with ``NotRetryingDestination`` if we are not yet ready
to retry this server.
""" """
limiter = yield synapse.util.retryutils.get_retry_limiter(
destination,
self.clock,
self._store,
backoff_on_404=backoff_on_404,
)
destination = destination.encode("ascii") destination = destination.encode("ascii")
path_bytes = path.encode("ascii") path_bytes = path.encode("ascii")
with limiter:
headers_dict[b"User-Agent"] = [self.version_string]
headers_dict[b"Host"] = [destination]
headers_dict[b"User-Agent"] = [self.version_string] url_bytes = self._create_url(
headers_dict[b"Host"] = [destination] destination, path_bytes, param_bytes, query_bytes
)
url_bytes = self._create_url( txn_id = "%s-O-%s" % (method, self._next_id)
destination, path_bytes, param_bytes, query_bytes self._next_id = (self._next_id + 1) % (sys.maxint - 1)
)
txn_id = "%s-O-%s" % (method, self._next_id) outbound_logger.info(
self._next_id = (self._next_id + 1) % (sys.maxint - 1) "{%s} [%s] Sending request: %s %s",
txn_id, destination, method, url_bytes
)
outbound_logger.info( # XXX: Would be much nicer to retry only at the transaction-layer
"{%s} [%s] Sending request: %s %s", # (once we have reliable transactions in place)
txn_id, destination, method, url_bytes if long_retries:
) retries_left = MAX_LONG_RETRIES
else:
retries_left = MAX_SHORT_RETRIES
# XXX: Would be much nicer to retry only at the transaction-layer http_url_bytes = urlparse.urlunparse(
# (once we have reliable transactions in place) ("", "", path_bytes, param_bytes, query_bytes, "")
if long_retries: )
retries_left = MAX_LONG_RETRIES
else:
retries_left = MAX_SHORT_RETRIES
http_url_bytes = urlparse.urlunparse( log_result = None
("", "", path_bytes, param_bytes, query_bytes, "") try:
) while True:
producer = None
if body_callback:
producer = body_callback(method, http_url_bytes, headers_dict)
log_result = None try:
try: def send_request():
while True: request_deferred = preserve_context_over_fn(
producer = None self.agent.request,
if body_callback: method,
producer = body_callback(method, http_url_bytes, headers_dict) url_bytes,
Headers(headers_dict),
producer
)
try: return self.clock.time_bound_deferred(
def send_request(): request_deferred,
request_deferred = preserve_context_over_fn( time_out=timeout / 1000. if timeout else 60,
self.agent.request, )
response = yield preserve_context_over_fn(send_request)
log_result = "%d %s" % (response.code, response.phrase,)
break
except Exception as e:
if not retry_on_dns_fail and isinstance(e, DNSLookupError):
logger.warn(
"DNS Lookup failed to %s with %s",
destination,
e
)
log_result = "DNS Lookup failed to %s with %s" % (
destination, e
)
raise
logger.warn(
"{%s} Sending request failed to %s: %s %s: %s - %s",
txn_id,
destination,
method, method,
url_bytes, url_bytes,
Headers(headers_dict), type(e).__name__,
producer _flatten_response_never_received(e),
) )
return self.clock.time_bound_deferred( log_result = "%s - %s" % (
request_deferred, type(e).__name__, _flatten_response_never_received(e),
time_out=timeout / 1000. if timeout else 60,
) )
response = yield preserve_context_over_fn(send_request) if retries_left and not timeout:
if long_retries:
delay = 4 ** (MAX_LONG_RETRIES + 1 - retries_left)
delay = min(delay, 60)
delay *= random.uniform(0.8, 1.4)
else:
delay = 0.5 * 2 ** (MAX_SHORT_RETRIES - retries_left)
delay = min(delay, 2)
delay *= random.uniform(0.8, 1.4)
log_result = "%d %s" % (response.code, response.phrase,) yield sleep(delay)
break retries_left -= 1
except Exception as e:
if not retry_on_dns_fail and isinstance(e, DNSLookupError):
logger.warn(
"DNS Lookup failed to %s with %s",
destination,
e
)
log_result = "DNS Lookup failed to %s with %s" % (
destination, e
)
raise
logger.warn(
"{%s} Sending request failed to %s: %s %s: %s - %s",
txn_id,
destination,
method,
url_bytes,
type(e).__name__,
_flatten_response_never_received(e),
)
log_result = "%s - %s" % (
type(e).__name__, _flatten_response_never_received(e),
)
if retries_left and not timeout:
if long_retries:
delay = 4 ** (MAX_LONG_RETRIES + 1 - retries_left)
delay = min(delay, 60)
delay *= random.uniform(0.8, 1.4)
else: else:
delay = 0.5 * 2 ** (MAX_SHORT_RETRIES - retries_left) raise
delay = min(delay, 2) finally:
delay *= random.uniform(0.8, 1.4) outbound_logger.info(
"{%s} [%s] Result: %s",
txn_id,
destination,
log_result,
)
yield sleep(delay) if 200 <= response.code < 300:
retries_left -= 1 pass
else: else:
raise # :'(
finally: # Update transactions table?
outbound_logger.info( body = yield preserve_context_over_fn(readBody, response)
"{%s} [%s] Result: %s", raise HttpResponseException(
txn_id, response.code, response.phrase, body
destination, )
log_result,
)
if 200 <= response.code < 300: defer.returnValue(response)
pass
else:
# :'(
# Update transactions table?
body = yield preserve_context_over_fn(readBody, response)
raise HttpResponseException(
response.code, response.phrase, body
)
defer.returnValue(response)
def sign_request(self, destination, method, url_bytes, headers_dict, def sign_request(self, destination, method, url_bytes, headers_dict,
content=None): content=None):
@ -261,7 +271,7 @@ class MatrixFederationHttpClient(object):
@defer.inlineCallbacks @defer.inlineCallbacks
def put_json(self, destination, path, data={}, json_data_callback=None, def put_json(self, destination, path, data={}, json_data_callback=None,
long_retries=False, timeout=None): long_retries=False, timeout=None, backoff_on_404=False):
""" Sends the specifed json data using PUT """ Sends the specifed json data using PUT
Args: Args:
@ -276,11 +286,17 @@ class MatrixFederationHttpClient(object):
retry for a short or long time. retry for a short or long time.
timeout(int): How long to try (in ms) the destination for before timeout(int): How long to try (in ms) the destination for before
giving up. None indicates no timeout. giving up. None indicates no timeout.
backoff_on_404 (bool): True if we should count a 404 response as
a failure of the server (and should therefore back off future
requests)
Returns: Returns:
Deferred: Succeeds when we get a 2xx HTTP response. The result Deferred: Succeeds when we get a 2xx HTTP response. The result
will be the decoded JSON body. On a 4xx or 5xx error response a will be the decoded JSON body. On a 4xx or 5xx error response a
CodeMessageException is raised. CodeMessageException is raised.
Fails with ``NotRetryingDestination`` if we are not yet ready
to retry this server.
""" """
if not json_data_callback: if not json_data_callback:
@ -303,6 +319,7 @@ class MatrixFederationHttpClient(object):
headers_dict={"Content-Type": ["application/json"]}, headers_dict={"Content-Type": ["application/json"]},
long_retries=long_retries, long_retries=long_retries,
timeout=timeout, timeout=timeout,
backoff_on_404=backoff_on_404,
) )
if 200 <= response.code < 300: if 200 <= response.code < 300:
@ -332,6 +349,9 @@ class MatrixFederationHttpClient(object):
Deferred: Succeeds when we get a 2xx HTTP response. The result Deferred: Succeeds when we get a 2xx HTTP response. The result
will be the decoded JSON body. On a 4xx or 5xx error response a will be the decoded JSON body. On a 4xx or 5xx error response a
CodeMessageException is raised. CodeMessageException is raised.
Fails with ``NotRetryingDestination`` if we are not yet ready
to retry this server.
""" """
def body_callback(method, url_bytes, headers_dict): def body_callback(method, url_bytes, headers_dict):
@ -377,6 +397,9 @@ class MatrixFederationHttpClient(object):
The result of the deferred is a tuple of `(code, response)`, The result of the deferred is a tuple of `(code, response)`,
where `response` is a dict representing the decoded JSON body. where `response` is a dict representing the decoded JSON body.
Fails with ``NotRetryingDestination`` if we are not yet ready
to retry this server.
""" """
logger.debug("get_json args: %s", args) logger.debug("get_json args: %s", args)
@ -426,6 +449,9 @@ class MatrixFederationHttpClient(object):
Fails with ``HTTPRequestException`` if we get an HTTP response code Fails with ``HTTPRequestException`` if we get an HTTP response code
>= 300 >= 300
Fails with ``NotRetryingDestination`` if we are not yet ready
to retry this server.
""" """
encoded_args = {} encoded_args = {}

View File

@ -124,7 +124,13 @@ class RetryDestinationLimiter(object):
def __exit__(self, exc_type, exc_val, exc_tb): def __exit__(self, exc_type, exc_val, exc_tb):
valid_err_code = False valid_err_code = False
if exc_type is not None and issubclass(exc_type, CodeMessageException): if exc_type is None:
valid_err_code = True
elif not issubclass(exc_type, Exception):
# avoid treating exceptions which don't derive from Exception as
# failures; this is mostly so as not to catch defer._DefGen.
valid_err_code = True
elif issubclass(exc_type, CodeMessageException):
# Some error codes are perfectly fine for some APIs, whereas other # Some error codes are perfectly fine for some APIs, whereas other
# APIs may expect to never received e.g. a 404. It's important to # APIs may expect to never received e.g. a 404. It's important to
# handle 404 as some remote servers will return a 404 when the HS # handle 404 as some remote servers will return a 404 when the HS
@ -142,11 +148,13 @@ class RetryDestinationLimiter(object):
else: else:
valid_err_code = False valid_err_code = False
if exc_type is None or valid_err_code: if valid_err_code:
# We connected successfully. # We connected successfully.
if not self.retry_interval: if not self.retry_interval:
return return
logger.debug("Connection to %s was successful; clearing backoff",
self.destination)
retry_last_ts = 0 retry_last_ts = 0
self.retry_interval = 0 self.retry_interval = 0
else: else:
@ -160,6 +168,10 @@ class RetryDestinationLimiter(object):
else: else:
self.retry_interval = self.min_retry_interval self.retry_interval = self.min_retry_interval
logger.debug(
"Connection to %s was unsuccessful (%s(%s)); backoff now %i",
self.destination, exc_type, exc_val, self.retry_interval
)
retry_last_ts = int(self.clock.time_msec()) retry_last_ts = int(self.clock.time_msec())
@defer.inlineCallbacks @defer.inlineCallbacks

View File

@ -192,6 +192,7 @@ class TypingNotificationsTestCase(unittest.TestCase):
), ),
json_data_callback=ANY, json_data_callback=ANY,
long_retries=True, long_retries=True,
backoff_on_404=True,
), ),
defer.succeed((200, "OK")) defer.succeed((200, "OK"))
) )
@ -263,6 +264,7 @@ class TypingNotificationsTestCase(unittest.TestCase):
), ),
json_data_callback=ANY, json_data_callback=ANY,
long_retries=True, long_retries=True,
backoff_on_404=True,
), ),
defer.succeed((200, "OK")) defer.succeed((200, "OK"))
) )