Adding hot_rank columns in place of function sorting. (#2952)

* Adding hot_rank columns in place of function sorting.

- Creates hot_rank columns for post, comment, and community.
- Fixes #2932

* Updating all hot ranks on startup.

* Fixing post.url migration.

* Removing update_instance_software from startup.

* Adding post_rank query
This commit is contained in:
Dessalines 2023-06-08 16:15:15 -04:00 committed by Dessalines
parent 107df08f5f
commit 920a93983d
15 changed files with 194 additions and 75 deletions

View File

@ -22,6 +22,7 @@ pub struct CommentAggregates {
pub downvotes: i64,
pub published: chrono::NaiveDateTime,
pub child_count: i32,
pub hot_rank: i32,
}
#[derive(PartialEq, Eq, Debug, Serialize, Deserialize, Clone)]
@ -42,6 +43,7 @@ pub struct CommunityAggregates {
pub users_active_week: i64,
pub users_active_month: i64,
pub users_active_half_year: i64,
pub hot_rank: i32,
}
#[derive(PartialEq, Eq, Debug, Serialize, Deserialize, Clone, Default)]
@ -73,6 +75,8 @@ pub struct PostAggregates {
pub newest_comment_time: chrono::NaiveDateTime,
pub featured_community: bool,
pub featured_local: bool,
pub hot_rank: i32,
pub hot_rank_active: i32,
}
#[derive(PartialEq, Eq, Debug, Serialize, Deserialize, Clone)]

View File

@ -39,7 +39,8 @@ table! {
upvotes -> Int8,
downvotes -> Int8,
published -> Timestamp,
child_count -> Int4,
child_count -> Int4,
hot_rank -> Int4,
}
}
@ -118,6 +119,7 @@ table! {
users_active_week -> Int8,
users_active_month -> Int8,
users_active_half_year -> Int8,
hot_rank -> Int4,
}
}
@ -409,6 +411,8 @@ table! {
newest_comment_time -> Timestamp,
featured_community -> Bool,
featured_local -> Bool,
hot_rank -> Int4,
hot_rank_active -> Int4,
}
}

View File

@ -504,6 +504,7 @@ mod tests {
downvotes: 0,
published: agg.published,
child_count: 0,
hot_rank: 1728,
},
my_vote: None,
resolver: None,

View File

@ -37,7 +37,7 @@ use lemmy_db_schema::{
post::Post,
},
traits::{ToSafe, ViewToVec},
utils::{functions::hot_rank, fuzzy_search, get_conn, limit_and_offset_unlimited, DbPool},
utils::{fuzzy_search, get_conn, limit_and_offset_unlimited, DbPool},
CommentSortType,
ListingType,
};
@ -355,9 +355,7 @@ impl<'a> CommentQuery<'a> {
};
query = match self.sort.unwrap_or(CommentSortType::Hot) {
CommentSortType::Hot => query
.then_order_by(hot_rank(comment_aggregates::score, comment_aggregates::published).desc())
.then_order_by(comment_aggregates::published.desc()),
CommentSortType::Hot => query.then_order_by(comment_aggregates::hot_rank.desc()),
CommentSortType::New => query.then_order_by(comment::published.desc()),
CommentSortType::Old => query.then_order_by(comment::published.asc()),
CommentSortType::Top => query.order_by(comment_aggregates::score.desc()),
@ -910,6 +908,7 @@ mod tests {
downvotes: 0,
published: agg.published,
child_count: 5,
hot_rank: 1728,
},
}
}

View File

@ -476,6 +476,8 @@ mod tests {
newest_comment_time: inserted_post.published,
featured_community: false,
featured_local: false,
hot_rank: 1728,
hot_rank_active: 1728,
},
resolver: None,
};

View File

@ -40,7 +40,7 @@ use lemmy_db_schema::{
post::{Post, PostRead, PostSaved},
},
traits::{ToSafe, ViewToVec},
utils::{functions::hot_rank, fuzzy_search, get_conn, limit_and_offset, DbPool},
utils::{fuzzy_search, get_conn, limit_and_offset, DbPool},
ListingType,
SortType,
};
@ -378,18 +378,8 @@ impl<'a> PostQuery<'a> {
}
query = match self.sort.unwrap_or(SortType::Hot) {
SortType::Active => query
.then_order_by(
hot_rank(
post_aggregates::score,
post_aggregates::newest_comment_time_necro,
)
.desc(),
)
.then_order_by(post_aggregates::newest_comment_time_necro.desc()),
SortType::Hot => query
.then_order_by(hot_rank(post_aggregates::score, post_aggregates::published).desc())
.then_order_by(post_aggregates::published.desc()),
SortType::Active => query.then_order_by(post_aggregates::hot_rank_active.desc()),
SortType::Hot => query.then_order_by(post_aggregates::hot_rank.desc()),
SortType::New => query.then_order_by(post_aggregates::published.desc()),
SortType::Old => query.then_order_by(post_aggregates::published.asc()),
SortType::NewComments => query.then_order_by(post_aggregates::newest_comment_time.desc()),
@ -929,6 +919,8 @@ mod tests {
newest_comment_time: inserted_post.published,
featured_community: false,
featured_local: false,
hot_rank: 1728,
hot_rank_active: 1728,
},
subscribed: SubscribedType::NotSubscribed,
read: false,

View File

@ -34,7 +34,7 @@ use lemmy_db_schema::{
post::Post,
},
traits::{ToSafe, ViewToVec},
utils::{functions::hot_rank, get_conn, limit_and_offset, DbPool},
utils::{get_conn, limit_and_offset, DbPool},
CommentSortType,
};
use typed_builder::TypedBuilder;
@ -276,12 +276,10 @@ impl<'a> CommentReplyQuery<'a> {
query = query.filter(person::bot_account.eq(false));
};
query = match self.sort.unwrap_or(CommentSortType::Hot) {
CommentSortType::Hot => query
.then_order_by(hot_rank(comment_aggregates::score, comment_aggregates::published).desc())
.then_order_by(comment_aggregates::published.desc()),
CommentSortType::New => query.then_order_by(comment::published.desc()),
CommentSortType::Old => query.then_order_by(comment::published.asc()),
query = match self.sort.unwrap_or(CommentSortType::New) {
CommentSortType::Hot => query.then_order_by(comment_aggregates::hot_rank.desc()),
CommentSortType::New => query.then_order_by(comment_reply::published.desc()),
CommentSortType::Old => query.then_order_by(comment_reply::published.asc()),
CommentSortType::Top => query.order_by(comment_aggregates::score.desc()),
};

View File

@ -164,7 +164,7 @@ impl<'a> CommunityQuery<'a> {
SortType::TopAll => query = query.order_by(community_aggregates::subscribers.desc()),
SortType::TopMonth => query = query.order_by(community_aggregates::users_active_month.desc()),
SortType::Hot => {
query = query.order_by(community_aggregates::users_active_month.desc());
query = query.order_by(community_aggregates::hot_rank.desc());
// Don't show hidden communities in Hot (trending)
query = query.filter(
community::hidden

View File

@ -34,7 +34,7 @@ use lemmy_db_schema::{
post::Post,
},
traits::{ToSafe, ViewToVec},
utils::{functions::hot_rank, get_conn, limit_and_offset, DbPool},
utils::{get_conn, limit_and_offset, DbPool},
CommentSortType,
};
use typed_builder::TypedBuilder;
@ -276,9 +276,7 @@ impl<'a> PersonMentionQuery<'a> {
};
query = match self.sort.unwrap_or(CommentSortType::Hot) {
CommentSortType::Hot => query
.then_order_by(hot_rank(comment_aggregates::score, comment_aggregates::published).desc())
.then_order_by(comment_aggregates::published.desc()),
CommentSortType::Hot => query.then_order_by(comment_aggregates::hot_rank.desc()),
CommentSortType::New => query.then_order_by(comment::published.desc()),
CommentSortType::Old => query.then_order_by(comment::published.asc()),
CommentSortType::Top => query.order_by(comment_aggregates::score.desc()),

View File

@ -1 +1,5 @@
-- Change back the column type
alter table post alter column url type text;
-- Drop the index
drop index idx_post_url;

View File

@ -1 +1,9 @@
-- Make a hard limit of 512 for the post.url column
-- Truncate existing long rows.
update post set url = left(url, 512) where length(url) > 512;
-- Enforce the limit
alter table post alter column url type varchar (512);
-- Add the index
create index idx_post_url on post(url);

View File

@ -0,0 +1,35 @@
-- Remove the new columns
alter table post_aggregates drop column hot_rank;
alter table post_aggregates drop column hot_rank_active;
alter table comment_aggregates drop column hot_rank;
alter table community_aggregates drop column hot_rank;
-- Drop some new indexes
drop index idx_post_aggregates_score;
drop index idx_post_aggregates_published;
drop index idx_post_aggregates_newest_comment_time;
drop index idx_post_aggregates_newest_comment_time_necro;
drop index idx_post_aggregates_featured_community;
drop index idx_post_aggregates_featured_local;
-- Recreate the old indexes
CREATE INDEX idx_post_aggregates_featured_local_newest_comment_time ON public.post_aggregates USING btree (featured_local DESC, newest_comment_time DESC);
CREATE INDEX idx_post_aggregates_featured_community_newest_comment_time ON public.post_aggregates USING btree (featured_community DESC, newest_comment_time DESC);
CREATE INDEX idx_post_aggregates_featured_local_comments ON public.post_aggregates USING btree (featured_local DESC, comments DESC);
CREATE INDEX idx_post_aggregates_featured_community_comments ON public.post_aggregates USING btree (featured_community DESC, comments DESC);
CREATE INDEX idx_post_aggregates_featured_local_hot ON public.post_aggregates USING btree (featured_local DESC, hot_rank((score)::numeric, published) DESC, published DESC);
CREATE INDEX idx_post_aggregates_featured_community_hot ON public.post_aggregates USING btree (featured_community DESC, hot_rank((score)::numeric, published) DESC, published DESC);
CREATE INDEX idx_post_aggregates_featured_local_score ON public.post_aggregates USING btree (featured_local DESC, score DESC);
CREATE INDEX idx_post_aggregates_featured_community_score ON public.post_aggregates USING btree (featured_community DESC, score DESC);
CREATE INDEX idx_post_aggregates_featured_local_published ON public.post_aggregates USING btree (featured_local DESC, published DESC);
CREATE INDEX idx_post_aggregates_featured_community_published ON public.post_aggregates USING btree (featured_community DESC, published DESC);
CREATE INDEX idx_post_aggregates_featured_local_active ON public.post_aggregates USING btree (featured_local DESC, hot_rank((score)::numeric, newest_comment_time_necro) DESC, newest_comment_time_necro DESC);
CREATE INDEX idx_post_aggregates_featured_community_active ON public.post_aggregates USING btree (featured_community DESC, hot_rank((score)::numeric, newest_comment_time_necro) DESC, newest_comment_time_necro DESC);
CREATE INDEX idx_comment_aggregates_hot ON public.comment_aggregates USING btree (hot_rank((score)::numeric, published) DESC, published DESC);
CREATE INDEX idx_community_aggregates_hot ON public.community_aggregates USING btree (hot_rank((subscribers)::numeric, published) DESC, published DESC);

View File

@ -0,0 +1,51 @@
-- This converts the old hot_rank functions, to columns
-- Remove the old compound indexes
DROP INDEX idx_post_aggregates_featured_local_newest_comment_time;
DROP INDEX idx_post_aggregates_featured_community_newest_comment_time;
DROP INDEX idx_post_aggregates_featured_local_comments;
DROP INDEX idx_post_aggregates_featured_community_comments;
DROP INDEX idx_post_aggregates_featured_local_hot;
DROP INDEX idx_post_aggregates_featured_community_hot;
DROP INDEX idx_post_aggregates_featured_local_score;
DROP INDEX idx_post_aggregates_featured_community_score;
DROP INDEX idx_post_aggregates_featured_local_published;
DROP INDEX idx_post_aggregates_featured_community_published;
DROP INDEX idx_post_aggregates_featured_local_active;
DROP INDEX idx_post_aggregates_featured_community_active;
DROP INDEX idx_comment_aggregates_hot;
DROP INDEX idx_community_aggregates_hot;
-- Add the new hot rank columns for post and comment aggregates
-- Note: 1728 is the result of the hot_rank function, with a score of 1, posted now
-- hot_rank = 10000*log10(1 + 3)/Power(2, 1.8)
alter table post_aggregates add column hot_rank integer not null default 1728;
alter table post_aggregates add column hot_rank_active integer not null default 1728;
alter table comment_aggregates add column hot_rank integer not null default 1728;
alter table community_aggregates add column hot_rank integer not null default 1728;
-- Populate them initially
-- Note: After initial population, these are updated in a periodic scheduled job,
-- with only the last week being updated.
update post_aggregates set hot_rank_active = hot_rank(score::numeric, newest_comment_time_necro);
update post_aggregates set hot_rank = hot_rank(score::numeric, published);
update comment_aggregates set hot_rank = hot_rank(score::numeric, published);
update community_aggregates set hot_rank = hot_rank(subscribers::numeric, published);
-- Create single column indexes
create index idx_post_aggregates_score on post_aggregates (score desc);
create index idx_post_aggregates_published on post_aggregates (published desc);
create index idx_post_aggregates_newest_comment_time on post_aggregates (newest_comment_time desc);
create index idx_post_aggregates_newest_comment_time_necro on post_aggregates (newest_comment_time_necro desc);
create index idx_post_aggregates_featured_community on post_aggregates (featured_community desc);
create index idx_post_aggregates_featured_local on post_aggregates (featured_local desc);
create index idx_post_aggregates_hot on post_aggregates (hot_rank desc);
create index idx_post_aggregates_active on post_aggregates (hot_rank_active desc);
create index idx_comment_aggregates_hot on comment_aggregates (hot_rank desc);
create index idx_community_aggregates_hot on community_aggregates (hot_rank desc);

View File

@ -0,0 +1,3 @@
#!/bin/bash
sudo docker exec -i docker-postgres-1 psql -Ulemmy -c "EXPLAIN (ANALYZE, COSTS, VERBOSE, BUFFERS, FORMAT JSON) SELECT post.id, post.name, post.url, post.body, post.creator_id, post.community_id, post.removed, post.locked, post.published, post.updated, post.deleted, post.nsfw, post.embed_title, post.embed_description, post.embed_video_url, post.thumbnail_url, post.ap_id, post.local, post.language_id, post.featured_community, post.featured_local, person.id, person.name, person.display_name, person.avatar, person.banned, person.published, person.updated, person.actor_id, person.bio, person.local, person.banner, person.deleted, person.inbox_url, person.shared_inbox_url, person.matrix_user_id, person.admin, person.bot_account, person.ban_expires, person.instance_id, community.id, community.name, community.title, community.description, community.removed, community.published, community.updated, community.deleted, community.nsfw, community.actor_id, community.local, community.icon, community.banner, community.hidden, community.posting_restricted_to_mods, community.instance_id, community_person_ban.id, community_person_ban.community_id, community_person_ban.person_id, community_person_ban.published, community_person_ban.expires, post_aggregates.id, post_aggregates.post_id, post_aggregates.comments, post_aggregates.score, post_aggregates.upvotes, post_aggregates.downvotes, post_aggregates.published, post_aggregates.newest_comment_time_necro, post_aggregates.newest_comment_time, post_aggregates.featured_community, post_aggregates.featured_local, community_follower.id, community_follower.community_id, community_follower.person_id, community_follower.published, community_follower.pending, post_saved.id, post_saved.post_id, post_saved.person_id, post_saved.published, post_read.id, post_read.post_id, post_read.person_id, post_read.published, person_block.id, person_block.person_id, person_block.target_id, person_block.published, post_like.score, coalesce((post_aggregates.comments - person_post_aggregates.read_comments), post_aggregates.comments) FROM ((((((((((((post INNER JOIN person ON (post.creator_id = person.id)) INNER JOIN community ON (post.community_id = community.id)) LEFT OUTER JOIN community_person_ban ON (((post.community_id = community_person_ban.community_id) AND (community_person_ban.person_id = post.creator_id)) AND ((community_person_ban.expires IS NULL) OR (community_person_ban.expires > CURRENT_TIMESTAMP)))) INNER JOIN post_aggregates ON (post_aggregates.post_id = post.id)) LEFT OUTER JOIN community_follower ON ((post.community_id = community_follower.community_id) AND (community_follower.person_id = '33517'))) LEFT OUTER JOIN post_saved ON ((post.id = post_saved.post_id) AND (post_saved.person_id = '33517'))) LEFT OUTER JOIN post_read ON ((post.id = post_read.post_id) AND (post_read.person_id = '33517'))) LEFT OUTER JOIN person_block ON ((post.creator_id = person_block.target_id) AND (person_block.person_id = '33517'))) LEFT OUTER JOIN community_block ON ((community.id = community_block.community_id) AND (community_block.person_id = '33517'))) LEFT OUTER JOIN post_like ON ((post.id = post_like.post_id) AND (post_like.person_id = '33517'))) LEFT OUTER JOIN person_post_aggregates ON ((post.id = person_post_aggregates.post_id) AND (person_post_aggregates.person_id = '33517'))) LEFT OUTER JOIN local_user_language ON ((post.language_id = local_user_language.language_id) AND (local_user_language.local_user_id = '11402'))) WHERE ((((((((((community_follower.person_id IS NOT NULL) AND (post.nsfw = 'f')) AND (community.nsfw = 'f')) AND (local_user_language.language_id IS NOT NULL)) AND (community_block.person_id IS NULL)) AND (person_block.person_id IS NULL)) AND (post.removed = 'f')) AND (post.deleted = 'f')) AND (community.removed = 'f')) AND (community.deleted = 'f')) ORDER BY post_aggregates.featured_local DESC , post_aggregates.hot_rank DESC LIMIT '40' OFFSET '0';" > query_results.json

View File

@ -1,7 +1,11 @@
use clokwerk::{Scheduler, TimeUnits};
use diesel::{dsl::now, Connection, ExpressionMethods, QueryDsl};
// Import week days and WeekDay
use diesel::{sql_query, PgConnection, RunQueryDsl};
use diesel::{Connection, ExpressionMethods, QueryDsl};
use lemmy_db_schema::{
schema::{comment_aggregates, community_aggregates, post_aggregates},
utils::functions::hot_rank,
};
use lemmy_utils::error::LemmyError;
use std::{thread, time::Duration};
use tracing::info;
@ -11,26 +15,26 @@ pub fn setup(db_url: String) -> Result<(), LemmyError> {
// Setup the connections
let mut scheduler = Scheduler::new();
let mut conn = PgConnection::establish(&db_url).expect("could not establish connection");
let mut conn_1 = PgConnection::establish(&db_url).expect("could not establish connection");
let mut conn_2 = PgConnection::establish(&db_url).expect("could not establish connection");
let mut conn_3 = PgConnection::establish(&db_url).expect("could not establish connection");
let mut conn_4 = PgConnection::establish(&db_url).expect("could not establish connection");
active_counts(&mut conn);
update_banned_when_expired(&mut conn);
active_counts(&mut conn_1);
update_banned_when_expired(&mut conn_1);
update_hot_ranks(&mut conn_1, false);
clear_old_activities(&mut conn_1);
// On startup, reindex the tables non-concurrently
// TODO remove this for now, since it slows down startup a lot on lemmy.ml
reindex_aggregates_tables(&mut conn, true);
scheduler.every(1.hour()).run(move || {
let conn = &mut PgConnection::establish(&db_url)
.unwrap_or_else(|_| panic!("Error connecting to {db_url}"));
active_counts(conn);
update_banned_when_expired(conn);
reindex_aggregates_tables(conn, true);
drop_ccnew_indexes(conn);
active_counts(&mut conn_2);
update_banned_when_expired(&mut conn_2);
});
clear_old_activities(&mut conn);
scheduler.every(1.weeks()).run(move || {
clear_old_activities(&mut conn);
// Clear old activities every week
scheduler.every(TimeUnits::weeks(1)).run(move || {
clear_old_activities(&mut conn_3);
});
scheduler.every(TimeUnits::minutes(5)).run(move || {
update_hot_ranks(&mut conn_4, true);
});
// Manually run the scheduler in an event loop
@ -40,30 +44,56 @@ pub fn setup(db_url: String) -> Result<(), LemmyError> {
}
}
/// Reindex the aggregates tables every one hour
/// This is necessary because hot_rank is actually a mutable function:
/// https://dba.stackexchange.com/questions/284052/how-to-create-an-index-based-on-a-time-based-function-in-postgres?noredirect=1#comment555727_284052
fn reindex_aggregates_tables(conn: &mut PgConnection, concurrently: bool) {
for table_name in &[
"post_aggregates",
"comment_aggregates",
"community_aggregates",
] {
reindex_table(conn, table_name, concurrently);
}
}
/// Update the hot_rank columns for the aggregates tables
fn update_hot_ranks(conn: &mut PgConnection, last_week_only: bool) {
let mut post_update = diesel::update(post_aggregates::table).into_boxed();
let mut comment_update = diesel::update(comment_aggregates::table).into_boxed();
let mut community_update = diesel::update(community_aggregates::table).into_boxed();
fn reindex_table(conn: &mut PgConnection, table_name: &str, concurrently: bool) {
let concurrently_str = if concurrently { "concurrently" } else { "" };
info!("Reindexing table {} {} ...", concurrently_str, table_name);
let query = format!("reindex table {concurrently_str} {table_name}");
sql_query(query).execute(conn).expect("reindex table");
// Only update for the last week of content
if last_week_only {
info!("Updating hot ranks for last week...");
let last_week = now - diesel::dsl::IntervalDsl::weeks(1);
post_update = post_update.filter(post_aggregates::published.gt(last_week));
comment_update = comment_update.filter(comment_aggregates::published.gt(last_week));
community_update = community_update.filter(community_aggregates::published.gt(last_week));
} else {
info!("Updating hot ranks for all history...");
}
post_update
.set((
post_aggregates::hot_rank.eq(hot_rank(post_aggregates::score, post_aggregates::published)),
post_aggregates::hot_rank_active.eq(hot_rank(
post_aggregates::score,
post_aggregates::newest_comment_time_necro,
)),
))
.execute(conn)
.expect("update post_aggregate hot_ranks");
comment_update
.set(comment_aggregates::hot_rank.eq(hot_rank(
comment_aggregates::score,
comment_aggregates::published,
)))
.execute(conn)
.expect("update comment_aggregate hot_ranks");
community_update
.set(community_aggregates::hot_rank.eq(hot_rank(
community_aggregates::subscribers,
community_aggregates::published,
)))
.execute(conn)
.expect("update community_aggregate hot_ranks");
info!("Done.");
}
/// Clear old activities (this table gets very large)
fn clear_old_activities(conn: &mut PgConnection) {
use diesel::dsl::{now, IntervalDsl};
use diesel::dsl::IntervalDsl;
use lemmy_db_schema::schema::activity::dsl::{activity, published};
info!("Clearing old activities...");
diesel::delete(activity.filter(published.lt(now - 6.months())))
@ -110,13 +140,3 @@ fn update_banned_when_expired(conn: &mut PgConnection) {
.execute(conn)
.expect("update banned when expires");
}
/// Drops the phantom CCNEW indexes created by postgres
/// https://github.com/LemmyNet/lemmy/issues/2431
fn drop_ccnew_indexes(conn: &mut PgConnection) {
info!("Dropping phantom ccnew indexes...");
let drop_stmt = "select drop_ccnew_indexes()";
sql_query(drop_stmt)
.execute(conn)
.expect("drop ccnew indexes");
}