mirror of
https://github.com/LemmyNet/lemmy.git
synced 2024-10-01 01:36:12 -04:00
Adding a scaled sort, to boost smaller communities. (#3907)
* Adding a scaled sort, to boost smaller communities.
  - Previously referred to as *best*.
  - Fixes #3622
* Fixing scheduled task update.
* Converting hot_rank integers to floats.
* Altering hot_rank psql function to default to zero after a week.
* Setting scaled_rank to zero, where hot_rank is zero.
* Adding image_upload table.
This commit is contained in:
parent
4121fc4d56
commit
9785b20843
@ -150,7 +150,7 @@ impl ActivityHandler for CreateOrUpdatePage {
|
||||
PostLike::like(&mut context.pool(), &like_form).await?;
|
||||
|
||||
// Calculate initial hot_rank for post
|
||||
PostAggregates::update_hot_rank(&mut context.pool(), post.id).await?;
|
||||
PostAggregates::update_ranks(&mut context.pool(), post.id).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
@ -1,10 +1,14 @@
|
||||
use crate::{
|
||||
aggregates::structs::PostAggregates,
|
||||
newtypes::PostId,
|
||||
schema::post_aggregates,
|
||||
utils::{functions::hot_rank, get_conn, DbPool},
|
||||
schema::{community_aggregates, post, post_aggregates},
|
||||
utils::{
|
||||
functions::{hot_rank, scaled_rank},
|
||||
get_conn,
|
||||
DbPool,
|
||||
},
|
||||
};
|
||||
use diesel::{result::Error, ExpressionMethods, QueryDsl};
|
||||
use diesel::{result::Error, ExpressionMethods, JoinOnDsl, QueryDsl};
|
||||
use diesel_async::RunQueryDsl;
|
||||
|
||||
impl PostAggregates {
|
||||
@ -16,9 +20,19 @@ impl PostAggregates {
|
||||
.await
|
||||
}
|
||||
|
||||
pub async fn update_hot_rank(pool: &mut DbPool<'_>, post_id: PostId) -> Result<Self, Error> {
|
||||
pub async fn update_ranks(pool: &mut DbPool<'_>, post_id: PostId) -> Result<Self, Error> {
|
||||
let conn = &mut get_conn(pool).await?;
|
||||
|
||||
// Diesel can't update based on a join, which is necessary for the scaled_rank
|
||||
// https://github.com/diesel-rs/diesel/issues/1478
|
||||
// Just select the users_active_month manually for now, since it's a single post anyway
|
||||
let users_active_month = community_aggregates::table
|
||||
.select(community_aggregates::users_active_month)
|
||||
.inner_join(post::table.on(community_aggregates::community_id.eq(post::community_id)))
|
||||
.filter(post::id.eq(post_id))
|
||||
.first::<i64>(conn)
|
||||
.await?;
|
||||
|
||||
diesel::update(post_aggregates::table)
|
||||
.filter(post_aggregates::post_id.eq(post_id))
|
||||
.set((
|
||||
@ -27,6 +41,11 @@ impl PostAggregates {
|
||||
post_aggregates::score,
|
||||
post_aggregates::newest_comment_time_necro,
|
||||
)),
|
||||
post_aggregates::scaled_rank.eq(scaled_rank(
|
||||
post_aggregates::score,
|
||||
post_aggregates::published,
|
||||
users_active_month,
|
||||
)),
|
||||
))
|
||||
.get_result::<Self>(conn)
|
||||
.await
|
||||
|
@ -27,11 +27,11 @@ pub struct CommentAggregates {
|
||||
pub published: DateTime<Utc>,
|
||||
/// The total number of children in this comment branch.
|
||||
pub child_count: i32,
|
||||
pub hot_rank: i32,
|
||||
pub hot_rank: f64,
|
||||
pub controversy_rank: f64,
|
||||
}
|
||||
|
||||
#[derive(PartialEq, Eq, Debug, Serialize, Deserialize, Clone)]
|
||||
#[derive(PartialEq, Debug, Serialize, Deserialize, Clone)]
|
||||
#[cfg_attr(feature = "full", derive(Queryable, Associations, Identifiable, TS))]
|
||||
#[cfg_attr(feature = "full", diesel(table_name = community_aggregates))]
|
||||
#[cfg_attr(
|
||||
@ -55,7 +55,7 @@ pub struct CommunityAggregates {
|
||||
pub users_active_month: i64,
|
||||
/// The number of users with any activity in the last half year.
|
||||
pub users_active_half_year: i64,
|
||||
pub hot_rank: i32,
|
||||
pub hot_rank: f64,
|
||||
}
|
||||
|
||||
#[derive(PartialEq, Eq, Debug, Serialize, Deserialize, Clone, Default)]
|
||||
@ -95,11 +95,13 @@ pub struct PostAggregates {
|
||||
pub featured_community: bool,
|
||||
/// If the post is featured on the site / to local.
|
||||
pub featured_local: bool,
|
||||
pub hot_rank: i32,
|
||||
pub hot_rank_active: i32,
|
||||
pub hot_rank: f64,
|
||||
pub hot_rank_active: f64,
|
||||
pub community_id: CommunityId,
|
||||
pub creator_id: PersonId,
|
||||
pub controversy_rank: f64,
|
||||
/// A rank that amplifies smaller communities
|
||||
pub scaled_rank: f64,
|
||||
}
|
||||
|
||||
#[derive(PartialEq, Eq, Debug, Serialize, Deserialize, Clone)]
|
||||
|
@ -54,6 +54,7 @@ use ts_rs::TS;
|
||||
)]
|
||||
#[cfg_attr(feature = "full", DbValueStyle = "verbatim")]
|
||||
#[cfg_attr(feature = "full", ts(export))]
|
||||
// TODO add the controversial and scaled rankings to the doc below
|
||||
/// The post sort types. See here for descriptions: https://join-lemmy.org/docs/en/users/03-votes-and-ranking.html
|
||||
pub enum SortType {
|
||||
#[default]
|
||||
@ -75,6 +76,7 @@ pub enum SortType {
|
||||
TopSixMonths,
|
||||
TopNineMonths,
|
||||
Controversial,
|
||||
Scaled,
|
||||
}
|
||||
|
||||
#[derive(EnumString, Display, Debug, Serialize, Deserialize, Clone, Copy)]
|
||||
|
@ -100,7 +100,7 @@ diesel::table! {
|
||||
downvotes -> Int8,
|
||||
published -> Timestamptz,
|
||||
child_count -> Int4,
|
||||
hot_rank -> Int4,
|
||||
hot_rank -> Float8,
|
||||
controversy_rank -> Float8,
|
||||
}
|
||||
}
|
||||
@ -198,7 +198,7 @@ diesel::table! {
|
||||
users_active_week -> Int8,
|
||||
users_active_month -> Int8,
|
||||
users_active_half_year -> Int8,
|
||||
hot_rank -> Int4,
|
||||
hot_rank -> Float8,
|
||||
}
|
||||
}
|
||||
|
||||
@ -299,6 +299,16 @@ diesel::table! {
|
||||
}
|
||||
}
|
||||
|
||||
diesel::table! {
|
||||
image_upload (id) {
|
||||
id -> Int4,
|
||||
local_user_id -> Int4,
|
||||
pictrs_alias -> Text,
|
||||
pictrs_delete_token -> Text,
|
||||
published -> Timestamptz,
|
||||
}
|
||||
}
|
||||
|
||||
diesel::table! {
|
||||
instance (id) {
|
||||
id -> Int4,
|
||||
@ -683,11 +693,12 @@ diesel::table! {
|
||||
newest_comment_time -> Timestamptz,
|
||||
featured_community -> Bool,
|
||||
featured_local -> Bool,
|
||||
hot_rank -> Int4,
|
||||
hot_rank_active -> Int4,
|
||||
hot_rank -> Float8,
|
||||
hot_rank_active -> Float8,
|
||||
community_id -> Int4,
|
||||
creator_id -> Int4,
|
||||
controversy_rank -> Float8,
|
||||
scaled_rank -> Float8,
|
||||
}
|
||||
}
|
||||
|
||||
@ -893,6 +904,7 @@ diesel::joinable!(custom_emoji_keyword -> custom_emoji (custom_emoji_id));
|
||||
diesel::joinable!(email_verification -> local_user (local_user_id));
|
||||
diesel::joinable!(federation_allowlist -> instance (instance_id));
|
||||
diesel::joinable!(federation_blocklist -> instance (instance_id));
|
||||
diesel::joinable!(image_upload -> local_user (local_user_id));
|
||||
diesel::joinable!(local_site -> site (site_id));
|
||||
diesel::joinable!(local_site_rate_limit -> local_site (local_site_id));
|
||||
diesel::joinable!(local_user -> person (person_id));
|
||||
@ -967,6 +979,7 @@ diesel::allow_tables_to_appear_in_same_query!(
|
||||
email_verification,
|
||||
federation_allowlist,
|
||||
federation_blocklist,
|
||||
image_upload,
|
||||
instance,
|
||||
language,
|
||||
local_site,
|
||||
|
@ -347,7 +347,7 @@ pub fn naive_now() -> DateTime<Utc> {
|
||||
|
||||
pub fn post_to_comment_sort_type(sort: SortType) -> CommentSortType {
|
||||
match sort {
|
||||
SortType::Active | SortType::Hot => CommentSortType::Hot,
|
||||
SortType::Active | SortType::Hot | SortType::Scaled => CommentSortType::Hot,
|
||||
SortType::New | SortType::NewComments | SortType::MostComments => CommentSortType::New,
|
||||
SortType::Old => CommentSortType::Old,
|
||||
SortType::Controversial => CommentSortType::Controversial,
|
||||
@ -384,7 +384,11 @@ pub mod functions {
|
||||
use diesel::sql_types::{BigInt, Text, Timestamptz};
|
||||
|
||||
sql_function! {
|
||||
fn hot_rank(score: BigInt, time: Timestamptz) -> Integer;
|
||||
fn hot_rank(score: BigInt, time: Timestamptz) -> Double;
|
||||
}
|
||||
|
||||
sql_function! {
|
||||
fn scaled_rank(score: BigInt, time: Timestamptz, users_active_month: BigInt) -> Double;
|
||||
}
|
||||
|
||||
sql_function! {
|
||||
|
@ -432,7 +432,7 @@ mod tests {
|
||||
downvotes: 0,
|
||||
published: agg.published,
|
||||
child_count: 0,
|
||||
hot_rank: 1728,
|
||||
hot_rank: 0.1728,
|
||||
controversy_rank: 0.0,
|
||||
},
|
||||
my_vote: None,
|
||||
|
@ -886,7 +886,7 @@ mod tests {
|
||||
downvotes: 0,
|
||||
published: agg.published,
|
||||
child_count: 5,
|
||||
hot_rank: 1728,
|
||||
hot_rank: 0.1728,
|
||||
controversy_rank: 0.0,
|
||||
},
|
||||
}
|
||||
|
@ -380,6 +380,9 @@ fn queries<'a>() -> Queries<
|
||||
SortType::Hot => query
|
||||
.then_order_by(post_aggregates::hot_rank.desc())
|
||||
.then_order_by(post_aggregates::published.desc()),
|
||||
SortType::Scaled => query
|
||||
.then_order_by(post_aggregates::scaled_rank.desc())
|
||||
.then_order_by(post_aggregates::published.desc()),
|
||||
SortType::Controversial => query.then_order_by(post_aggregates::controversy_rank.desc()),
|
||||
SortType::New => query.then_order_by(post_aggregates::published.desc()),
|
||||
SortType::Old => query.then_order_by(post_aggregates::published.asc()),
|
||||
@ -1154,9 +1157,10 @@ mod tests {
|
||||
newest_comment_time: inserted_post.published,
|
||||
featured_community: false,
|
||||
featured_local: false,
|
||||
hot_rank: 1728,
|
||||
hot_rank_active: 1728,
|
||||
hot_rank: 0.1728,
|
||||
hot_rank_active: 0.1728,
|
||||
controversy_rank: 0.0,
|
||||
scaled_rank: 0.3621,
|
||||
community_id: inserted_post.community_id,
|
||||
creator_id: inserted_post.creator_id,
|
||||
},
|
||||
|
@ -105,7 +105,7 @@ fn queries<'a>() -> Queries<
|
||||
}
|
||||
|
||||
match options.sort.unwrap_or(Hot) {
|
||||
Hot | Active => query = query.order_by(community_aggregates::hot_rank.desc()),
|
||||
Hot | Active | Scaled => query = query.order_by(community_aggregates::hot_rank.desc()),
|
||||
NewComments | TopDay | TopTwelveHour | TopSixHour | TopHour => {
|
||||
query = query.order_by(community_aggregates::users_active_day.desc())
|
||||
}
|
||||
|
87
migrations/2023-08-23-182533_scaled_rank/down.sql
Normal file
87
migrations/2023-08-23-182533_scaled_rank/down.sql
Normal file
@ -0,0 +1,87 @@
|
||||
DROP FUNCTION scaled_rank;
|
||||
|
||||
ALTER TABLE community_aggregates
|
||||
ALTER COLUMN hot_rank TYPE integer,
|
||||
ALTER COLUMN hot_rank SET DEFAULT 1728;
|
||||
|
||||
ALTER TABLE comment_aggregates
|
||||
ALTER COLUMN hot_rank TYPE integer,
|
||||
ALTER COLUMN hot_rank SET DEFAULT 1728;
|
||||
|
||||
ALTER TABLE post_aggregates
|
||||
ALTER COLUMN hot_rank TYPE integer,
|
||||
ALTER COLUMN hot_rank SET DEFAULT 1728,
|
||||
ALTER COLUMN hot_rank_active TYPE integer,
|
||||
ALTER COLUMN hot_rank_active SET DEFAULT 1728;
|
||||
|
||||
-- Change back to integer version
|
||||
DROP FUNCTION hot_rank (numeric, published timestamp with time zone);
|
||||
|
||||
CREATE OR REPLACE FUNCTION hot_rank (score numeric, published timestamp with time zone)
|
||||
RETURNS integer
|
||||
AS $$
|
||||
DECLARE
|
||||
hours_diff numeric := EXTRACT(EPOCH FROM (now() - published)) / 3600;
|
||||
BEGIN
|
||||
IF (hours_diff > 0) THEN
|
||||
RETURN floor(10000 * log(greatest (1, score + 3)) / power((hours_diff + 2), 1.8))::integer;
|
||||
ELSE
|
||||
-- if the post is from the future, set hot score to 0. otherwise you can game the post to
|
||||
-- always be on top even with only 1 vote by setting it to the future
|
||||
RETURN 0;
|
||||
END IF;
|
||||
END;
|
||||
$$
|
||||
LANGUAGE plpgsql
|
||||
IMMUTABLE PARALLEL SAFE;
|
||||
|
||||
ALTER TABLE post_aggregates
|
||||
DROP COLUMN scaled_rank;
|
||||
|
||||
-- The following code is necessary because postgres can't remove
|
||||
-- a single enum value.
|
||||
ALTER TABLE local_user
|
||||
ALTER default_sort_type DROP DEFAULT;
|
||||
|
||||
UPDATE
|
||||
local_user
|
||||
SET
|
||||
default_sort_type = 'Hot'
|
||||
WHERE
|
||||
default_sort_type = 'Scaled';
|
||||
|
||||
-- rename the old enum
|
||||
ALTER TYPE sort_type_enum RENAME TO sort_type_enum__;
|
||||
|
||||
-- create the new enum
|
||||
CREATE TYPE sort_type_enum AS ENUM (
|
||||
'Active',
|
||||
'Hot',
|
||||
'New',
|
||||
'Old',
|
||||
'TopDay',
|
||||
'TopWeek',
|
||||
'TopMonth',
|
||||
'TopYear',
|
||||
'TopAll',
|
||||
'MostComments',
|
||||
'NewComments',
|
||||
'TopHour',
|
||||
'TopSixHour',
|
||||
'TopTwelveHour',
|
||||
'TopThreeMonths',
|
||||
'TopSixMonths',
|
||||
'TopNineMonths'
|
||||
);
|
||||
|
||||
-- alter all your enum columns
|
||||
ALTER TABLE local_user
|
||||
ALTER COLUMN default_sort_type TYPE sort_type_enum
|
||||
USING default_sort_type::text::sort_type_enum;
|
||||
|
||||
ALTER TABLE local_user
|
||||
ALTER default_sort_type SET DEFAULT 'Active';
|
||||
|
||||
-- drop the old enum
|
||||
DROP TYPE sort_type_enum__;
|
||||
|
74
migrations/2023-08-23-182533_scaled_rank/up.sql
Normal file
74
migrations/2023-08-23-182533_scaled_rank/up.sql
Normal file
@ -0,0 +1,74 @@
|
||||
-- Change hot ranks and functions from an int to a float
|
||||
ALTER TABLE community_aggregates
|
||||
ALTER COLUMN hot_rank TYPE float,
|
||||
ALTER COLUMN hot_rank SET DEFAULT 0.1728;
|
||||
|
||||
ALTER TABLE comment_aggregates
|
||||
ALTER COLUMN hot_rank TYPE float,
|
||||
ALTER COLUMN hot_rank SET DEFAULT 0.1728;
|
||||
|
||||
ALTER TABLE post_aggregates
|
||||
ALTER COLUMN hot_rank TYPE float,
|
||||
ALTER COLUMN hot_rank SET DEFAULT 0.1728,
|
||||
ALTER COLUMN hot_rank_active TYPE float,
|
||||
ALTER COLUMN hot_rank_active SET DEFAULT 0.1728;
|
||||
|
||||
DROP FUNCTION hot_rank (numeric, published timestamp with time zone);
|
||||
|
||||
CREATE OR REPLACE FUNCTION hot_rank (score numeric, published timestamp with time zone)
|
||||
RETURNS float
|
||||
AS $$
|
||||
DECLARE
|
||||
hours_diff numeric := EXTRACT(EPOCH FROM (now() - published)) / 3600;
|
||||
BEGIN
|
||||
-- 24 * 7 = 168, so after a week, it will default to 0.
|
||||
IF (hours_diff > 0 AND hours_diff < 168) THEN
|
||||
RETURN log(greatest (1, score + 3)) / power((hours_diff + 2), 1.8);
|
||||
ELSE
|
||||
-- if the post is a week old or older, or is from the future, set hot score to 0. otherwise you can game the post to
|
||||
-- always be on top even with only 1 vote by setting it to the future
|
||||
RETURN 0.0;
|
||||
END IF;
|
||||
END;
|
||||
$$
|
||||
LANGUAGE plpgsql
|
||||
IMMUTABLE PARALLEL SAFE;
|
||||
|
||||
-- The new scaled rank function
|
||||
CREATE OR REPLACE FUNCTION scaled_rank (score numeric, published timestamp with time zone, users_active_month numeric)
|
||||
RETURNS float
|
||||
AS $$
|
||||
BEGIN
|
||||
-- Add 2 to users_active_month to avoid divide by zero errors (with only 1 added, a count of 0 would give log(1) = 0)
|
||||
-- Default for score = 1, active users = 1, and now, is (0.1728 / log(2 + 1)) = 0.3621
|
||||
-- There may need to be a scale factor multiplied to users_active_month, to make
|
||||
-- the log curve less pronounced. This can be tuned in the future.
|
||||
RETURN (hot_rank (score, published) / log(2 + users_active_month));
|
||||
END;
|
||||
$$
|
||||
LANGUAGE plpgsql
|
||||
IMMUTABLE PARALLEL SAFE;
|
||||
|
||||
ALTER TABLE post_aggregates
|
||||
ADD COLUMN scaled_rank float NOT NULL DEFAULT 0.3621;
|
||||
|
||||
UPDATE
|
||||
post_aggregates
|
||||
SET
|
||||
scaled_rank = 0
|
||||
WHERE
|
||||
hot_rank = 0
|
||||
OR hot_rank_active = 0;
|
||||
|
||||
CREATE INDEX idx_post_aggregates_featured_community_scaled ON post_aggregates (featured_community DESC, scaled_rank DESC, published DESC);
|
||||
|
||||
CREATE INDEX idx_post_aggregates_featured_local_scaled ON post_aggregates (featured_local DESC, scaled_rank DESC, published DESC);
|
||||
|
||||
-- We forgot to add the controversial sort type
|
||||
ALTER TYPE sort_type_enum
|
||||
ADD VALUE 'Controversial';
|
||||
|
||||
-- Add the Scaled enum
|
||||
ALTER TYPE sort_type_enum
|
||||
ADD VALUE 'Scaled';
|
||||
|
@ -154,22 +154,16 @@ fn startup_jobs(db_url: &str) {
|
||||
fn update_hot_ranks(conn: &mut PgConnection) {
|
||||
info!("Updating hot ranks for all history...");
|
||||
|
||||
process_hot_ranks_in_batches(
|
||||
conn,
|
||||
"post_aggregates",
|
||||
"a.hot_rank != 0 OR a.hot_rank_active != 0",
|
||||
"SET hot_rank = hot_rank(a.score, a.published),
|
||||
hot_rank_active = hot_rank(a.score, a.newest_comment_time_necro)",
|
||||
);
|
||||
process_post_aggregates_ranks_in_batches(conn);
|
||||
|
||||
process_hot_ranks_in_batches(
|
||||
process_ranks_in_batches(
|
||||
conn,
|
||||
"comment_aggregates",
|
||||
"a.hot_rank != 0",
|
||||
"SET hot_rank = hot_rank(a.score, a.published)",
|
||||
);
|
||||
|
||||
process_hot_ranks_in_batches(
|
||||
process_ranks_in_batches(
|
||||
conn,
|
||||
"community_aggregates",
|
||||
"a.hot_rank != 0",
|
||||
@ -189,7 +183,7 @@ struct HotRanksUpdateResult {
|
||||
/// In `where_clause` and `set_clause`, "a" will refer to the current aggregates table.
|
||||
/// Locked rows are skipped in order to prevent deadlocks (they will likely get updated on the next
|
||||
/// run)
|
||||
fn process_hot_ranks_in_batches(
|
||||
fn process_ranks_in_batches(
|
||||
conn: &mut PgConnection,
|
||||
table_name: &str,
|
||||
where_clause: &str,
|
||||
@ -241,6 +235,55 @@ fn process_hot_ranks_in_batches(
|
||||
);
|
||||
}
|
||||
|
||||
/// Post aggregates is a special case, since it needs to join to the community_aggregates
|
||||
/// table, to get the active monthly user counts.
|
||||
fn process_post_aggregates_ranks_in_batches(conn: &mut PgConnection) {
|
||||
let process_start_time: DateTime<Utc> = Utc
|
||||
.timestamp_opt(0, 0)
|
||||
.single()
|
||||
.expect("0 timestamp creation");
|
||||
|
||||
let update_batch_size = 1000; // Bigger batches than this tend to cause seq scans
|
||||
let mut processed_rows_count = 0;
|
||||
let mut previous_batch_result = Some(process_start_time);
|
||||
while let Some(previous_batch_last_published) = previous_batch_result {
|
||||
let result = sql_query(
|
||||
r#"WITH batch AS (SELECT pa.id
|
||||
FROM post_aggregates pa
|
||||
WHERE pa.published > $1
|
||||
AND (pa.hot_rank != 0 OR pa.hot_rank_active != 0)
|
||||
ORDER BY pa.published
|
||||
LIMIT $2
|
||||
FOR UPDATE SKIP LOCKED)
|
||||
UPDATE post_aggregates pa
|
||||
SET hot_rank = hot_rank(pa.score, pa.published),
|
||||
hot_rank_active = hot_rank(pa.score, pa.newest_comment_time_necro),
|
||||
scaled_rank = scaled_rank(pa.score, pa.published, ca.users_active_month)
|
||||
FROM batch, community_aggregates ca
|
||||
WHERE pa.id = batch.id and pa.community_id = ca.community_id RETURNING pa.published;
|
||||
"#,
|
||||
)
|
||||
.bind::<Timestamptz, _>(previous_batch_last_published)
|
||||
.bind::<Integer, _>(update_batch_size)
|
||||
.get_results::<HotRanksUpdateResult>(conn);
|
||||
|
||||
match result {
|
||||
Ok(updated_rows) => {
|
||||
processed_rows_count += updated_rows.len();
|
||||
previous_batch_result = updated_rows.last().map(|row| row.published);
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Failed to update {} hot_ranks: {}", "post_aggregates", e);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
info!(
|
||||
"Finished process_hot_ranks_in_batches execution for {} (processed {} rows)",
|
||||
"post_aggregates", processed_rows_count
|
||||
);
|
||||
}
|
||||
|
||||
fn delete_expired_captcha_answers(conn: &mut PgConnection) {
|
||||
diesel::delete(
|
||||
captcha_answer::table.filter(captcha_answer::published.lt(now() - IntervalDsl::minutes(10))),
|
||||
|
Loading…
Reference in New Issue
Block a user