From 66e06b39529d397693fd414d6e2bec8519e3415e Mon Sep 17 00:00:00 2001 From: Dessalines Date: Tue, 23 Apr 2024 23:15:20 -0400 Subject: [PATCH] Removing scheme from block urls. Fixes #4656 (#4659) * Removing scheme from block urls. Fixes #4656 * Fix comment. * Fixing domain checking. * Removing pointless URL building in url blocklist regex. * Remove trailing / --- crates/api_common/src/utils.rs | 21 +-------- crates/utils/src/utils/validation.rs | 65 ++++++++++++++++++++-------- 2 files changed, 48 insertions(+), 38 deletions(-) diff --git a/crates/api_common/src/utils.rs b/crates/api_common/src/utils.rs index 3cd7902e1..4ab587928 100644 --- a/crates/api_common/src/utils.rs +++ b/crates/api_common/src/utils.rs @@ -536,25 +536,8 @@ pub async fn get_url_blocklist(context: &LemmyContext) -> LemmyResult .try_get_with::<_, LemmyError>((), async { let urls = LocalSiteUrlBlocklist::get_all(&mut context.pool()).await?; - let regexes = urls.iter().map(|url| { - let url = &url.url; - let parsed = Url::parse(url).expect("Coundln't parse URL."); - if url.ends_with('/') { - format!( - "({}://)?{}{}?", - parsed.scheme(), - escape(parsed.domain().expect("No domain.")), - escape(parsed.path()) - ) - } else { - format!( - "({}://)?{}{}", - parsed.scheme(), - escape(parsed.domain().expect("No domain.")), - escape(parsed.path()) - ) - } - }); + // The urls are already validated on saving, so just escape them. + let regexes = urls.iter().map(|url| escape(&url.url)); let set = RegexSet::new(regexes)?; Ok(set) diff --git a/crates/utils/src/utils/validation.rs b/crates/utils/src/utils/validation.rs index 93d581327..a6539f1b1 100644 --- a/crates/utils/src/utils/validation.rs +++ b/crates/utils/src/utils/validation.rs @@ -309,21 +309,44 @@ pub fn is_url_blocked(url: &Option, blocklist: &RegexSet) -> LemmyResult<() Ok(()) } +/// Check that urls are valid, and also remove the scheme, and uniques pub fn check_urls_are_valid(urls: &Vec) -> LemmyResult> { let mut parsed_urls = vec![]; for url in urls { - let url = Url::parse(url).or_else(|e| { - if e == ParseError::RelativeUrlWithoutBase { - Url::parse(&format!("https://{url}")) - } else { - Err(e) - } - })?; - - parsed_urls.push(url.to_string()); + parsed_urls.push(build_url_str_without_scheme(url)?); } - Ok(parsed_urls) + let unique_urls = parsed_urls.into_iter().unique().collect(); + Ok(unique_urls) +} + +pub fn build_url_str_without_scheme(url_str: &str) -> LemmyResult { + // Parse and check for errors + let mut url = Url::parse(url_str).or_else(|e| { + if e == ParseError::RelativeUrlWithoutBase { + Url::parse(&format!("http://{url_str}")) + } else { + Err(e) + } + })?; + + // Set the scheme to http, then remove the http:// part + url + .set_scheme("http") + .map_err(|_| LemmyErrorType::InvalidUrl)?; + + let mut out = url + .to_string() + .get(7..) + .ok_or(LemmyErrorType::InvalidUrl)? + .to_string(); + + // Remove trailing / if necessary + if out.ends_with('/') { + out.pop(); + } + + Ok(out) } #[cfg(test)] @@ -600,17 +623,21 @@ mod tests { #[test] fn test_url_parsed() { + // Make sure the scheme is removed, and uniques also assert_eq!( - vec![String::from("https://example.com/")], - check_urls_are_valid(&vec![String::from("example.com")]).unwrap() + &check_urls_are_valid(&vec![ + "example.com".to_string(), + "http://example.com".to_string(), + "https://example.com".to_string(), + "https://example.com/test?q=test2&q2=test3#test4".to_string(), + ]) + .unwrap(), + &vec![ + "example.com".to_string(), + "example.com/test?q=test2&q2=test3#test4".to_string() + ], ); - assert!(check_urls_are_valid(&vec![ - String::from("example.com"), - String::from("https://example.blog") - ]) - .is_ok()); - - assert!(check_urls_are_valid(&vec![String::from("https://example .com"),]).is_err()); + assert!(check_urls_are_valid(&vec!["https://example .com".to_string()]).is_err()); } }