mirror of
https://github.com/LemmyNet/lemmy.git
synced 2024-10-01 01:36:12 -04:00
Sitemap (#3808)
* generate sitemap.xml file * set up endpoint for sitemap * Update sitemap generation - remove sitemap generation from scheduled tasks - add posts query for sitemap - create sitemap module in API crate * remove priority and change freq from sitemap * add configuration option for number of posts for sitemap * fix default config * rate limit sitemap endpoint * update sitemap query * update sitemap generation - remove config value for query limit - adjust sitemap generation to query changes - tidy up error handling * refactor sitemap generation loop * remove `limit` argument * refactor `generate_urlset` and add unit test * change query to only fetch local posts of past 24h * fix outdated comment and log * cargo fmt
This commit is contained in:
parent
ab828b81e4
commit
28324ad2c8
28
Cargo.lock
generated
28
Cargo.lock
generated
@ -1603,6 +1603,15 @@ version = "1.8.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "90e5c1c8368803113bf0c9584fc495a58b86dc8a29edbf8fe877d21d9507e797"
|
||||
|
||||
[[package]]
|
||||
name = "elementtree"
|
||||
version = "1.2.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3efd4742acf458718a6456e0adf0b4d734d6b783e452bbf1ac36bf31f4085cb3"
|
||||
dependencies = [
|
||||
"string_cache",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "email-encoding"
|
||||
version = "0.2.0"
|
||||
@ -2581,6 +2590,7 @@ dependencies = [
|
||||
"bcrypt",
|
||||
"captcha",
|
||||
"chrono",
|
||||
"elementtree",
|
||||
"lemmy_api_common",
|
||||
"lemmy_db_schema",
|
||||
"lemmy_db_views",
|
||||
@ -2589,8 +2599,10 @@ dependencies = [
|
||||
"lemmy_utils",
|
||||
"serde",
|
||||
"serial_test",
|
||||
"sitemap-rs",
|
||||
"tokio",
|
||||
"tracing",
|
||||
"url",
|
||||
"uuid",
|
||||
"wav",
|
||||
]
|
||||
@ -4745,6 +4757,16 @@ version = "0.3.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7bd3e3206899af3f8b12af284fafc038cc1dc2b41d1b89dd17297221c5d225de"
|
||||
|
||||
[[package]]
|
||||
name = "sitemap-rs"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "95b58125f0ab4317b5ba3cdc1f60696e47958760e356874c759334fa56ae1596"
|
||||
dependencies = [
|
||||
"chrono",
|
||||
"xml-builder",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "skeptic"
|
||||
version = "0.13.7"
|
||||
@ -6132,6 +6154,12 @@ dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "xml-builder"
|
||||
version = "0.5.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "efc4f1a86af7800dfc4056c7833648ea4515ae21502060b5c98114d828f5333b"
|
||||
|
||||
[[package]]
|
||||
name = "xml5ever"
|
||||
version = "0.17.0"
|
||||
|
@ -31,8 +31,11 @@ captcha = { workspace = true }
|
||||
anyhow = { workspace = true }
|
||||
tracing = { workspace = true }
|
||||
chrono = { workspace = true }
|
||||
url = { workspace = true }
|
||||
wav = "1.0.0"
|
||||
sitemap-rs = "0.2.0"
|
||||
|
||||
[dev-dependencies]
|
||||
serial_test = { workspace = true }
|
||||
tokio = { workspace = true }
|
||||
elementtree = "1.2.3"
|
||||
|
@ -18,6 +18,7 @@ pub mod post_report;
|
||||
pub mod private_message;
|
||||
pub mod private_message_report;
|
||||
pub mod site;
|
||||
pub mod sitemap;
|
||||
|
||||
#[async_trait::async_trait(?Send)]
|
||||
pub trait Perform {
|
||||
|
142
crates/api/src/sitemap.rs
Normal file
142
crates/api/src/sitemap.rs
Normal file
@ -0,0 +1,142 @@
|
||||
use actix_web::{
|
||||
http::header::{self, CacheDirective},
|
||||
web::Data,
|
||||
HttpResponse,
|
||||
};
|
||||
use chrono::{DateTime, FixedOffset};
|
||||
use lemmy_api_common::context::LemmyContext;
|
||||
use lemmy_db_schema::{newtypes::DbUrl, source::post::Post};
|
||||
use lemmy_utils::error::LemmyResult;
|
||||
use sitemap_rs::{url::Url, url_set::UrlSet};
|
||||
use tracing::info;
|
||||
|
||||
async fn generate_urlset(posts: Vec<(DbUrl, chrono::NaiveDateTime)>) -> LemmyResult<UrlSet> {
|
||||
let urls = posts
|
||||
.into_iter()
|
||||
.map_while(|post| {
|
||||
Url::builder(post.0.to_string())
|
||||
.last_modified(DateTime::from_utc(
|
||||
post.1,
|
||||
FixedOffset::east_opt(0).expect("Error setting timezone offset"), // TODO what is the proper timezone offset here?
|
||||
))
|
||||
.build()
|
||||
.ok()
|
||||
})
|
||||
.collect();
|
||||
|
||||
Ok(UrlSet::new(urls)?)
|
||||
}
|
||||
|
||||
pub async fn get_sitemap(context: Data<LemmyContext>) -> LemmyResult<HttpResponse> {
|
||||
info!("Generating sitemap with posts from last {} hours...", 24);
|
||||
let posts = Post::list_for_sitemap(&mut context.pool()).await?;
|
||||
info!("Loaded latest {} posts", posts.len());
|
||||
|
||||
let mut buf = Vec::<u8>::new();
|
||||
generate_urlset(posts).await?.write(&mut buf)?;
|
||||
|
||||
Ok(
|
||||
HttpResponse::Ok()
|
||||
.content_type("application/xml")
|
||||
.insert_header(header::CacheControl(vec![CacheDirective::MaxAge(86_400)])) // 24 h
|
||||
.body(buf),
|
||||
)
|
||||
}
|
||||
|
||||
#[cfg(test)]
pub(crate) mod tests {
  #![allow(clippy::unwrap_used)]

  use crate::sitemap::generate_urlset;
  use chrono::{NaiveDate, NaiveDateTime};
  use elementtree::Element;
  use lemmy_db_schema::newtypes::DbUrl;
  use url::Url;

  /// Builds one `(url, timestamp)` input row for `generate_urlset`.
  fn post_row(
    url: &str,
    (year, month, day): (i32, u32, u32),
    (hour, min, sec): (u32, u32, u32),
  ) -> (DbUrl, NaiveDateTime) {
    let ap_id: DbUrl = Url::parse(url).unwrap().into();
    let timestamp = NaiveDate::from_ymd_opt(year, month, day)
      .unwrap()
      .and_hms_opt(hour, min, sec)
      .unwrap();
    (ap_id, timestamp)
  }

  /// Returns the text of the first child of `parent` whose tag name is `tag`.
  fn child_text<'a>(parent: &'a Element, tag: &str) -> &'a str {
    parent
      .children()
      .find(|element| element.tag().name() == tag)
      .unwrap()
      .text()
  }

  #[tokio::test]
  async fn test_generate_urlset() {
    let posts = vec![
      post_row("https://example.com", (2022, 12, 1), (9, 10, 11)),
      post_row("https://lemmy.ml", (2023, 1, 1), (1, 2, 3)),
    ];

    // Serialize the generated URL set and parse it back as an XML tree.
    let url_set = generate_urlset(posts).await.unwrap();
    let mut xml = Vec::<u8>::new();
    url_set.write(&mut xml).unwrap();
    let root = Element::from_reader(xml.as_slice()).unwrap();

    assert_eq!(root.tag().name(), "urlset");
    assert_eq!(root.child_count(), 2);

    // Every entry is a <url> holding exactly <loc> followed by <lastmod>.
    for url in root.children() {
      assert_eq!(url.tag().name(), "url");
      assert_eq!(url.child_count(), 2);

      let mut fields = url.children();
      assert!(fields
        .next()
        .is_some_and(|element| element.tag().name() == "loc"));
      assert!(fields
        .next()
        .is_some_and(|element| element.tag().name() == "lastmod"));
    }

    // Entries keep the input order and carry the expected values.
    let expected = [
      ("https://example.com/", "2022-12-01T09:10:11+00:00"),
      ("https://lemmy.ml/", "2023-01-01T01:02:03+00:00"),
    ];
    let mut entries = root.children();
    for (loc, lastmod) in expected {
      let entry = entries.next().unwrap();
      assert_eq!(child_text(entry, "loc"), loc);
      assert_eq!(child_text(entry, "lastmod"), lastmod);
    }
  }
}
|
@ -1,3 +1,4 @@
|
||||
use super::instance::coalesce;
|
||||
use crate::{
|
||||
newtypes::{CommunityId, DbUrl, PersonId, PostId},
|
||||
schema::post::dsl::{
|
||||
@ -7,6 +8,7 @@ use crate::{
|
||||
creator_id,
|
||||
deleted,
|
||||
featured_community,
|
||||
local,
|
||||
name,
|
||||
post,
|
||||
published,
|
||||
@ -30,6 +32,7 @@ use crate::{
|
||||
utils::{get_conn, naive_now, DbPool, DELETED_REPLACEMENT_TEXT, FETCH_LIMIT_MAX},
|
||||
};
|
||||
use ::url::Url;
|
||||
use chrono::{Duration, Utc};
|
||||
use diesel::{dsl::insert_into, result::Error, ExpressionMethods, QueryDsl, TextExpressionMethods};
|
||||
use diesel_async::RunQueryDsl;
|
||||
|
||||
@ -96,6 +99,21 @@ impl Post {
|
||||
.await
|
||||
}
|
||||
|
||||
pub async fn list_for_sitemap(
|
||||
pool: &mut DbPool<'_>,
|
||||
) -> Result<Vec<(DbUrl, chrono::NaiveDateTime)>, Error> {
|
||||
let conn = &mut get_conn(pool).await?;
|
||||
post
|
||||
.select((ap_id, coalesce(updated, published)))
|
||||
.filter(local)
|
||||
.filter(deleted.eq(false))
|
||||
.filter(removed.eq(false))
|
||||
.filter(published.ge(Utc::now().naive_utc() - Duration::days(1)))
|
||||
.order(published.desc())
|
||||
.load::<(DbUrl, chrono::NaiveDateTime)>(conn)
|
||||
.await
|
||||
}
|
||||
|
||||
pub async fn permadelete_for_creator(
|
||||
pool: &mut DbPool<'_>,
|
||||
for_creator_id: PersonId,
|
||||
|
@ -16,6 +16,7 @@ use lemmy_api::{
|
||||
local_user::{ban_person::ban_from_site, notifications::mark_reply_read::mark_reply_as_read},
|
||||
post::{feature::feature_post, like::like_post, lock::lock_post},
|
||||
post_report::create::create_post_report,
|
||||
sitemap::get_sitemap,
|
||||
Perform,
|
||||
};
|
||||
use lemmy_api_common::{
|
||||
@ -340,6 +341,11 @@ pub fn config(cfg: &mut web::ServiceConfig, rate_limit: &RateLimitCell) {
|
||||
.route("/delete", web::post().to(delete_custom_emoji)),
|
||||
),
|
||||
);
|
||||
cfg.service(
|
||||
web::scope("/sitemap.xml")
|
||||
.wrap(rate_limit.message())
|
||||
.route("", web::get().to(get_sitemap)),
|
||||
);
|
||||
}
|
||||
|
||||
async fn perform<'a, Data>(
|
||||
|
Loading…
Reference in New Issue
Block a user