Unescape HTML entities in oEmbed titles. (#14781)

It doesn't seem valid that HTML entities should appear in
the title field of oEmbed responses, but a popular WordPress
plug-in seems to do it.

There should be no harm in unescaping these.
This commit is contained in:
Jeyachandran Rathnam 2023-01-09 09:22:02 -05:00 committed by GitHub
parent 7e582a25f8
commit babeeb4e7a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 20 additions and 6 deletions

1
changelog.d/14781.misc Normal file
View File

@ -0,0 +1 @@
Unescape HTML entities in URL preview titles making use of oEmbed responses.

View File

@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import html
import logging import logging
import urllib.parse import urllib.parse
from typing import TYPE_CHECKING, List, Optional from typing import TYPE_CHECKING, List, Optional
@ -161,7 +162,9 @@ class OEmbedProvider:
title = oembed.get("title") title = oembed.get("title")
if title and isinstance(title, str): if title and isinstance(title, str):
open_graph_response["og:title"] = title # A common WordPress plug-in seems to incorrectly escape entities
# in the oEmbed response.
open_graph_response["og:title"] = html.unescape(title)
author_name = oembed.get("author_name") author_name = oembed.get("author_name")
if not isinstance(author_name, str): if not isinstance(author_name, str):
@ -180,9 +183,9 @@ class OEmbedProvider:
# Process each type separately. # Process each type separately.
oembed_type = oembed.get("type") oembed_type = oembed.get("type")
if oembed_type == "rich": if oembed_type == "rich":
html = oembed.get("html") html_str = oembed.get("html")
if isinstance(html, str): if isinstance(html_str, str):
calc_description_and_urls(open_graph_response, html) calc_description_and_urls(open_graph_response, html_str)
elif oembed_type == "photo": elif oembed_type == "photo":
# If this is a photo, use the full image, not the thumbnail. # If this is a photo, use the full image, not the thumbnail.
@ -192,8 +195,8 @@ class OEmbedProvider:
elif oembed_type == "video": elif oembed_type == "video":
open_graph_response["og:type"] = "video.other" open_graph_response["og:type"] = "video.other"
html = oembed.get("html") html_str = oembed.get("html")
if html and isinstance(html, str): if html_str and isinstance(html_str, str):
calc_description_and_urls(open_graph_response, oembed["html"]) calc_description_and_urls(open_graph_response, oembed["html"])
for size in ("width", "height"): for size in ("width", "height"):
val = oembed.get(size) val = oembed.get(size)

View File

@ -150,3 +150,13 @@ class OEmbedTests(HomeserverTestCase):
result = self.parse_response({"type": "link"}) result = self.parse_response({"type": "link"})
self.assertIn("og:type", result.open_graph_result) self.assertIn("og:type", result.open_graph_result)
self.assertEqual(result.open_graph_result["og:type"], "website") self.assertEqual(result.open_graph_result["og:type"], "website")
def test_title_html_entities(self) -> None:
"""Test HTML entities in title"""
result = self.parse_response(
{"title": "Why JSON isn&#8217;t a Good Configuration Language"}
)
self.assertEqual(
result.open_graph_result["og:title"],
"Why JSON isn’t a Good Configuration Language",
)