mirror of
https://git.anonymousland.org/anonymousland/synapse-product.git
synced 2024-12-11 14:34:19 -05:00
prevent choking on invalid utf-8, and handle image thumbnailing smarter
This commit is contained in:
parent
bb9a2ca87c
commit
72550c3803
@ -72,7 +72,15 @@ class PreviewUrlResource(BaseMediaResource):
|
|||||||
# define our OG response for this media
|
# define our OG response for this media
|
||||||
elif self._is_html(media_info['media_type']):
|
elif self._is_html(media_info['media_type']):
|
||||||
# TODO: somehow stop a big HTML tree from exploding synapse's RAM
|
# TODO: somehow stop a big HTML tree from exploding synapse's RAM
|
||||||
tree = html.parse(media_info['filename'])
|
|
||||||
|
# XXX: can't work out how to make lxml ignore UTF8 decoding errors
|
||||||
|
# so slurp as a string at this point.
|
||||||
|
file = open(media_info['filename'])
|
||||||
|
body = file.read()
|
||||||
|
file.close()
|
||||||
|
# FIXME: we shouldn't be forcing utf-8 if the page isn't actually utf-8...
|
||||||
|
tree = html.fromstring(body.decode('utf-8','ignore'))
|
||||||
|
# tree = html.parse(media_info['filename'])
|
||||||
|
|
||||||
# suck it up into lxml and define our OG response.
|
# suck it up into lxml and define our OG response.
|
||||||
# if we see any URLs in the OG response, then spider them
|
# if we see any URLs in the OG response, then spider them
|
||||||
@ -108,14 +116,19 @@ class PreviewUrlResource(BaseMediaResource):
|
|||||||
title = tree.xpath("(//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1]")
|
title = tree.xpath("(//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1]")
|
||||||
og['og:title'] = title[0].text if title else None
|
og['og:title'] = title[0].text if title else None
|
||||||
|
|
||||||
images = tree.xpath("//img")
|
images = [ i for i in tree.xpath("//img") if 'src' in i.attrib ]
|
||||||
big_images = [ i for i in images if (
|
big_images = [ i for i in images if (
|
||||||
'width' in i and 'height' in i and
|
'width' in i.attrib and 'height' in i.attrib and
|
||||||
i.attrib['width'] > 64 and i.attrib['height'] > 64
|
i.attrib['width'] > 64 and i.attrib['height'] > 64
|
||||||
)] or images
|
)]
|
||||||
og['og:image'] = images[0].attrib['src'] if images else None
|
big_images = big_images.sort(key=lambda i: (-1 * int(i.attrib['width']) * int(i.attrib['height'])))
|
||||||
|
images = big_images if big_images else images
|
||||||
|
|
||||||
|
if images:
|
||||||
|
og['og:image'] = images[0].attrib['src']
|
||||||
|
|
||||||
text_nodes = tree.xpath("//h1/text() | //h2/text() | //h3/text() | //p/text() | //div/text() | //span/text() | //a/text()")
|
text_nodes = tree.xpath("//h1/text() | //h2/text() | //h3/text() | //p/text() | //div/text() | //span/text() | //a/text()")
|
||||||
|
# text_nodes = tree.xpath("//h1/text() | //h2/text() | //h3/text() | //p/text() | //div/text()")
|
||||||
text = ''
|
text = ''
|
||||||
for text_node in text_nodes:
|
for text_node in text_nodes:
|
||||||
if len(text) < 1024:
|
if len(text) < 1024:
|
||||||
|
Loading…
Reference in New Issue
Block a user