fix: Run extract_opengraph_data only on first 64kB of data and if Content-Type html (#4957)

* fix: Run extract_opengraph_data only on first 64kB of data and if data is not binary.

* use mime type for determination

* chore: simplify collect function
This commit is contained in:
phiresky 2024-08-07 16:35:08 +02:00 committed by GitHub
parent 88fbcea246
commit 606545ccaf
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -8,6 +8,7 @@ use crate::{
use activitypub_federation::config::Data;
use chrono::{DateTime, Utc};
use encoding_rs::{Encoding, UTF_8};
use futures::StreamExt;
use lemmy_db_schema::{
newtypes::DbUrl,
source::{
@ -23,7 +24,12 @@ use lemmy_utils::{
VERSION,
};
use mime::Mime;
use reqwest::{header::CONTENT_TYPE, Client, ClientBuilder};
use reqwest::{
header::{CONTENT_TYPE, RANGE},
Client,
ClientBuilder,
Response,
};
use reqwest_middleware::ClientWithMiddleware;
use serde::{Deserialize, Serialize};
use tracing::info;
@ -44,7 +50,17 @@ pub fn client_builder(settings: &Settings) -> ClientBuilder {
#[tracing::instrument(skip_all)]
pub async fn fetch_link_metadata(url: &Url, context: &LemmyContext) -> LemmyResult<LinkMetadata> {
info!("Fetching site metadata for url: {}", url);
let response = context.client().get(url.as_str()).send().await?;
// We only fetch the first 64kB of data in order to not waste bandwidth especially for large
// binary files
let bytes_to_fetch = 64 * 1024;
let response = context
.client()
.get(url.as_str())
// we only need the first chunk of data. Note that we do not check for Accept-Range so the
// server may ignore this and still respond with the full response
.header(RANGE, format!("bytes=0-{}", bytes_to_fetch - 1)) /* -1 because inclusive */
.send()
.await?;
let content_type: Option<Mime> = response
.headers()
@ -52,19 +68,57 @@ pub async fn fetch_link_metadata(url: &Url, context: &LemmyContext) -> LemmyResu
.and_then(|h| h.to_str().ok())
.and_then(|h| h.parse().ok());
// Can't use .text() here, because it only checks the content header, not the actual bytes
// https://github.com/LemmyNet/lemmy/issues/1964
let html_bytes = response.bytes().await.map_err(LemmyError::from)?.to_vec();
let opengraph_data = {
// if the content type is not text/html, we don't need to parse it
let is_html = content_type
.as_ref()
.map(|c| {
(c.type_() == mime::TEXT && c.subtype() == mime::HTML)
||
// application/xhtml+xml is a subset of HTML
(c.type_() == mime::APPLICATION && c.subtype() == "xhtml")
})
.unwrap_or(false);
if !is_html {
Default::default()
} else {
// Can't use .text() here, because it only checks the content header, not the actual bytes
// https://github.com/LemmyNet/lemmy/issues/1964
// So we want to do deep inspection of the actually returned bytes but need to be careful not
// spend too much time parsing binary data as HTML
let opengraph_data = extract_opengraph_data(&html_bytes, url)
.map_err(|e| info!("{e}"))
.unwrap_or_default();
// only take first bytes regardless of how many bytes the server returns
let html_bytes = collect_bytes_until_limit(response, bytes_to_fetch).await?;
extract_opengraph_data(&html_bytes, url)
.map_err(|e| info!("{e}"))
.unwrap_or_default()
}
};
Ok(LinkMetadata {
opengraph_data,
content_type: content_type.map(|c| c.to_string()),
})
}
async fn collect_bytes_until_limit(
response: Response,
requested_bytes: usize,
) -> Result<Vec<u8>, LemmyError> {
let mut stream = response.bytes_stream();
let mut bytes = Vec::with_capacity(requested_bytes);
while let Some(chunk) = stream.next().await {
let chunk = chunk.map_err(LemmyError::from)?;
// we may go over the requested size here but the important part is we don't keep aggregating
// more chunks than needed
bytes.extend_from_slice(&chunk);
if bytes.len() >= requested_bytes {
bytes.truncate(requested_bytes);
break;
}
}
Ok(bytes)
}
/// Generates and saves a post thumbnail and metadata.
///
/// Takes a callback to generate a send activity task, so that post can be federated with metadata.