Send activities in parallel (fixes #4529)

This commit is contained in:
Felix Ableitner 2024-03-21 16:04:27 +01:00
parent 44e9622b71
commit 748b96e513
3 changed files with 151 additions and 93 deletions

View File

@ -16,7 +16,7 @@ use once_cell::sync::Lazy;
use std::{
cmp::Ordering,
collections::BinaryHeap,
sync::{Arc, Mutex},
sync::{Arc, RwLock},
thread::available_parallelism,
time::Duration,
};
@ -38,26 +38,26 @@ pub async fn shared_inbox(
request.method().clone(),
request.uri().clone(),
);
ACTIVITY_QUEUE.lock().unwrap().push(InboxActivity {
ACTIVITY_QUEUE.write().unwrap().push(InboxActivity {
request_parts,
bytes,
published,
});
Ok(HttpResponse::Ok().finish())
}
None => {
// no timestamp included, process immediately
receive_activity::<SharedInboxActivities, UserOrCommunity, LemmyContext>(
request, bytes, &data,
)
.await?;
.await
}
};
Ok(HttpResponse::Ok().finish())
}
}
/// Queue of incoming activities, ordered by oldest published first
static ACTIVITY_QUEUE: Lazy<Arc<Mutex<BinaryHeap<InboxActivity>>>> =
Lazy::new(|| Arc::new(Mutex::new(BinaryHeap::new())));
static ACTIVITY_QUEUE: Lazy<Arc<RwLock<BinaryHeap<InboxActivity>>>> =
Lazy::new(|| Arc::new(RwLock::new(BinaryHeap::new())));
/// Minimum age of an activity before it gets processed. This ensures that an activity which was
/// delayed still gets processed in correct order.
@ -100,11 +100,11 @@ pub fn handle_received_activities(
}
fn peek_queue_timestamp() -> Option<DateTime<Utc>> {
ACTIVITY_QUEUE.lock().unwrap().peek().map(|i| i.published)
ACTIVITY_QUEUE.read().unwrap().peek().map(|i| i.published)
}
fn pop_queue<'a>() -> Option<InboxActivity> {
ACTIVITY_QUEUE.lock().unwrap().pop()
ACTIVITY_QUEUE.write().unwrap().pop()
}
#[derive(Clone, Debug)]
@ -156,7 +156,7 @@ mod tests {
bytes: Default::default(),
published: Local::now().into(),
};
let mut lock = ACTIVITY_QUEUE.lock().unwrap();
let mut lock = ACTIVITY_QUEUE.write().unwrap();
// insert in wrong order
lock.push(activity3.clone());

View File

@ -51,7 +51,7 @@ impl ActivitySendTargets {
}
}
#[derive(PartialEq, Eq, Debug)]
#[derive(PartialEq, Eq, Debug, Clone)]
#[cfg_attr(feature = "full", derive(Queryable, Selectable, Identifiable))]
#[cfg_attr(feature = "full", diesel(check_for_backend(diesel::pg::Pg)))]
#[cfg_attr(feature = "full", diesel(table_name = sent_activity))]

View File

@ -30,9 +30,10 @@ use reqwest::Url;
use std::{
collections::{HashMap, HashSet},
ops::{Add, Deref},
sync::{Arc, RwLock},
time::Duration,
};
use tokio::{sync::mpsc::UnboundedSender, time::sleep};
use tokio::{spawn, sync::mpsc::UnboundedSender, time::sleep};
use tokio_util::sync::CancellationToken;
/// Check whether to save state to db every n sends if there's no failures (during failures state is saved after every attempt)
@ -57,8 +58,11 @@ static FOLLOW_ADDITIONS_RECHECK_DELAY: Lazy<chrono::TimeDelta> = Lazy::new(|| {
/// This is expected to happen pretty rarely and updating it in a timely manner is not too important.
static FOLLOW_REMOVALS_RECHECK_DELAY: Lazy<chrono::TimeDelta> =
Lazy::new(|| chrono::TimeDelta::try_hours(1).expect("TimeDelta out of bounds"));
const MAX_INFLIGHT_REQUESTS: u8 = 5;
pub(crate) struct InstanceWorker {
instance: Instance,
instance: Arc<RwLock<Instance>>,
// load site lazily because if an instance is first seen due to being on allowlist,
// the corresponding row in `site` may not exist yet since that is only added once
// `fetch_instance_actor_for_object` is called.
@ -68,14 +72,26 @@ pub(crate) struct InstanceWorker {
followed_communities: HashMap<CommunityId, HashSet<Url>>,
stop: CancellationToken,
context: Data<LemmyContext>,
stats_sender: UnboundedSender<(String, FederationQueueState)>,
last_full_communities_fetch: DateTime<Utc>,
last_incremental_communities_fetch: DateTime<Utc>,
stats: Arc<RwLock<InstanceStats>>,
}
#[derive(Clone)]
struct InstanceStats {
stats_sender: UnboundedSender<(String, FederationQueueState)>,
state: FederationQueueState,
last_state_insert: DateTime<Utc>,
inflight_requests: u8,
}
impl InstanceWorker {
fn stats(&self) -> InstanceStats {
self.stats.read().unwrap().clone()
}
fn instance(&self) -> Instance {
self.instance.read().unwrap().clone()
}
pub(crate) async fn init_and_loop(
instance: Instance,
context: Data<LemmyContext>,
@ -85,17 +101,20 @@ impl InstanceWorker {
) -> Result<(), anyhow::Error> {
let state = FederationQueueState::load(pool, instance.id).await?;
let mut worker = InstanceWorker {
instance,
instance: Arc::new(RwLock::new(instance)),
site_loaded: false,
site: None,
followed_communities: HashMap::new(),
stop,
context,
stats_sender,
last_full_communities_fetch: Utc.timestamp_nanos(0),
last_incremental_communities_fetch: Utc.timestamp_nanos(0),
state,
last_state_insert: Utc.timestamp_nanos(0),
stats: Arc::new(RwLock::new(InstanceStats {
stats_sender,
state,
last_state_insert: Utc.timestamp_nanos(0),
inflight_requests: 0,
})),
};
worker.loop_until_stopped(pool).await
}
@ -114,25 +133,26 @@ impl InstanceWorker {
if self.stop.is_cancelled() {
break;
}
if (Utc::now() - self.last_state_insert) > save_state_every {
self.save_and_send_state(pool).await?;
if (Utc::now() - self.stats().last_state_insert) > save_state_every {
save_and_send_state(self.stats.clone(), &self.instance(), pool).await?;
}
self.update_communities(pool).await?;
}
// final update of state in db
self.save_and_send_state(pool).await?;
save_and_send_state(self.stats.clone(), &self.instance(), pool).await?;
Ok(())
}
async fn initial_fail_sleep(&mut self) -> Result<()> {
// before starting queue, sleep remaining duration if last request failed
if self.state.fail_count > 0 {
let last_retry = self
let stats = self.stats();
if stats.state.fail_count > 0 {
let last_retry = stats
.state
.last_retry
.context("impossible: if fail count set last retry also set")?;
let elapsed = (Utc::now() - last_retry).to_std()?;
let required = federate_retry_sleep_duration(self.state.fail_count);
let required = federate_retry_sleep_duration(stats.state.fail_count);
if elapsed >= required {
return Ok(());
}
@ -147,14 +167,16 @@ impl InstanceWorker {
/// send out a batch of CHECK_SAVE_STATE_EVERY_IT activities
async fn loop_batch(&mut self, pool: &mut DbPool<'_>) -> Result<()> {
let latest_id = get_latest_activity_id(pool).await?;
let mut id = if let Some(id) = self.state.last_successful_id {
let mut id = if let Some(id) = self.stats().state.last_successful_id {
id
} else {
// this is the initial creation (instance first seen) of the federation queue for this instance
// skip all past activities:
self.state.last_successful_id = Some(latest_id);
{
self.stats.write().unwrap().state.last_successful_id = Some(latest_id);
}
// save here to ensure it's not read as 0 again later if no activities have happened
self.save_and_send_state(pool).await?;
save_and_send_state(self.stats.clone(), &self.instance(), pool).await?;
latest_id
};
if id >= latest_id {
@ -170,30 +192,20 @@ impl InstanceWorker {
&& processed_activities < CHECK_SAVE_STATE_EVERY_IT
&& !self.stop.is_cancelled()
{
while self.stats().inflight_requests >= MAX_INFLIGHT_REQUESTS {
sleep(Duration::from_millis(100)).await;
}
id = ActivityId(id.0 + 1);
processed_activities += 1;
let Some(ele) = get_activity_cached(pool, id)
.await
.context("failed reading activity from db")?
else {
tracing::debug!("{}: {:?} does not exist", self.instance.domain, id);
self.state.last_successful_id = Some(id);
tracing::debug!("{}: {:?} does not exist", self.instance().domain, id);
self.stats.write().unwrap().state.last_successful_id = Some(id);
continue;
};
if let Err(e) = self.send_retry_loop(pool, &ele.0, &ele.1).await {
tracing::warn!(
"sending {} errored internally, skipping activity: {:?}",
ele.0.ap_id,
e
);
}
if self.stop.is_cancelled() {
return Ok(());
}
// send success!
self.state.last_successful_id = Some(id);
self.state.last_successful_published_time = Some(ele.0.published);
self.state.fail_count = 0;
self.send_retry_loop(pool, ele.0.clone(), &ele.1).await?
}
Ok(())
}
@ -203,17 +215,34 @@ impl InstanceWorker {
async fn send_retry_loop(
&mut self,
pool: &mut DbPool<'_>,
activity: &SentActivity,
activity: SentActivity,
object: &SharedInboxActivities,
) -> Result<()> {
let stats = self.stats();
let retry_delay: Duration = federate_retry_sleep_duration(stats.state.fail_count);
tracing::info!(
"{}: retrying {:?} attempt {} with delay {retry_delay:.2?}",
self.instance().domain,
activity.id,
stats.state.fail_count
);
tokio::select! {
() = sleep(retry_delay) => {},
() = self.stop.cancelled() => {
// save state to db and exit
return Ok(());
}
};
let inbox_urls = self
.get_inbox_urls(pool, activity)
.get_inbox_urls(pool, &activity)
.await
.context("failed figuring out inbox urls")?;
if inbox_urls.is_empty() {
tracing::debug!("{}: {:?} no inboxes", self.instance.domain, activity.id);
self.state.last_successful_id = Some(activity.id);
self.state.last_successful_published_time = Some(activity.published);
tracing::debug!("{}: {:?} no inboxes", self.instance().domain, activity.id);
let mut stats = self.stats.write().unwrap();
stats.state.last_successful_id = Some(activity.id);
stats.state.last_successful_published_time = Some(activity.published);
return Ok(());
}
let Some(actor_apub_id) = &activity.actor_apub_id else {
@ -225,43 +254,63 @@ impl InstanceWorker {
let object = WithContext::new(object.clone(), FEDERATION_CONTEXT.deref().clone());
let inbox_urls = inbox_urls.into_iter().collect();
let requests =
SendActivityTask::prepare(&object, actor.as_ref(), inbox_urls, &self.context).await?;
for task in requests {
let context = self.context.reset_request_count();
let stats = self.stats.clone();
let instance = self.instance();
let write_instance = self.instance.clone();
spawn(async move {
{
stats.write().unwrap().inflight_requests += 1;
}
let requests = SendActivityTask::prepare(&object, actor.as_ref(), inbox_urls, &context)
.await
.unwrap();
// usually only one due to shared inbox
tracing::debug!("sending out {}", task);
while let Err(e) = task.sign_and_send(&self.context).await {
self.state.fail_count += 1;
self.state.last_retry = Some(Utc::now());
let retry_delay: Duration = federate_retry_sleep_duration(self.state.fail_count);
tracing::info!(
"{}: retrying {:?} attempt {} with delay {retry_delay:.2?}. ({e})",
self.instance.domain,
activity.id,
self.state.fail_count
);
self.save_and_send_state(pool).await?;
tokio::select! {
() = sleep(retry_delay) => {},
() = self.stop.cancelled() => {
// save state to db and exit
return Ok(());
for task in requests {
tracing::debug!("sending out {}", task);
let res = task.sign_and_send(&context).await;
match res {
Ok(_) => {
// send success!
{
let mut stats_ = stats.write().unwrap();
stats_.inflight_requests -= 1;
stats_.state.last_successful_id = Some(activity.id);
stats_.state.last_successful_published_time = Some(activity.published);
stats_.state.fail_count = 0;
}
// mark instance as alive if it hasn't been updated in a while.
let updated = instance.updated.unwrap_or(instance.published);
if updated.add(Days::new(1)) < Utc::now() {
{
write_instance.write().unwrap().updated = Some(Utc::now());
}
let form = InstanceForm::builder()
.domain(instance.domain.clone())
.updated(Some(naive_now()))
.build();
Instance::update(&mut context.pool(), instance.id, form)
.await
.unwrap();
}
}
Err(e) => {
tracing::info!("{} send failed: {e}", instance.domain);
{
let mut stats_ = stats.write().unwrap();
stats_.inflight_requests -= 1;
stats_.state.fail_count += 1;
stats_.state.last_retry = Some(Utc::now());
}
save_and_send_state(stats.clone(), &instance, &mut context.pool())
.await
.unwrap();
}
}
}
// Activity send successful, mark instance as alive if it hasn't been updated in a while.
let updated = self.instance.updated.unwrap_or(self.instance.published);
if updated.add(Days::new(1)) < Utc::now() {
self.instance.updated = Some(Utc::now());
let form = InstanceForm::builder()
.domain(self.instance.domain.clone())
.updated(Some(naive_now()))
.build();
Instance::update(pool, self.instance.id, form).await?;
}
}
});
Ok(())
}
@ -278,7 +327,7 @@ impl InstanceWorker {
if activity.send_all_instances {
if !self.site_loaded {
self.site = Site::read_from_instance_id(pool, self.instance.id).await?;
self.site = Site::read_from_instance_id(pool, self.instance().id).await?;
self.site_loaded = true;
}
if let Some(site) = &self.site {
@ -296,7 +345,7 @@ impl InstanceWorker {
.send_inboxes
.iter()
.filter_map(std::option::Option::as_ref)
.filter(|&u| (u.domain() == Some(&self.instance.domain)))
.filter(|&u| (u.domain() == Some(&self.instance().domain)))
.map(|u| u.inner().clone()),
);
Ok(inbox_urls)
@ -306,7 +355,7 @@ impl InstanceWorker {
if (Utc::now() - self.last_full_communities_fetch) > *FOLLOW_REMOVALS_RECHECK_DELAY {
// process removals every hour
(self.followed_communities, self.last_full_communities_fetch) = self
.get_communities(pool, self.instance.id, Utc.timestamp_nanos(0))
.get_communities(pool, self.instance().id, Utc.timestamp_nanos(0))
.await?;
self.last_incremental_communities_fetch = self.last_full_communities_fetch;
}
@ -315,7 +364,7 @@ impl InstanceWorker {
let (news, time) = self
.get_communities(
pool,
self.instance.id,
self.instance().id,
self.last_incremental_communities_fetch,
)
.await?;
@ -345,12 +394,21 @@ impl InstanceWorker {
new_last_fetch,
))
}
async fn save_and_send_state(&mut self, pool: &mut DbPool<'_>) -> Result<()> {
self.last_state_insert = Utc::now();
FederationQueueState::upsert(pool, &self.state).await?;
self
.stats_sender
.send((self.instance.domain.clone(), self.state.clone()))?;
Ok(())
}
}
async fn save_and_send_state(
stats: Arc<RwLock<InstanceStats>>,
instance: &Instance,
pool: &mut DbPool<'_>,
) -> Result<()> {
let stats = {
let mut lock = stats.write().unwrap();
lock.last_state_insert = Utc::now();
lock.clone()
};
FederationQueueState::upsert(pool, &stats.state).await?;
stats
.stats_sender
.send((instance.domain.clone(), stats.state))?;
Ok(())
}