mirror of
https://git.anonymousland.org/anonymousland/synapse-product.git
synced 2024-10-01 08:25:44 -04:00
synthesise basic OG metadata from pages lacking it
This commit is contained in:
parent
0d3d7de6fc
commit
bb9a2ca87c
@ -23,6 +23,7 @@ from synapse.http.client import SpiderHttpClient
|
|||||||
from synapse.http.server import request_handler, respond_with_json, respond_with_json_bytes
|
from synapse.http.server import request_handler, respond_with_json, respond_with_json_bytes
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
import ujson as json
|
import ujson as json
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
@ -70,6 +71,7 @@ class PreviewUrlResource(BaseMediaResource):
|
|||||||
|
|
||||||
# define our OG response for this media
|
# define our OG response for this media
|
||||||
elif self._is_html(media_info['media_type']):
|
elif self._is_html(media_info['media_type']):
|
||||||
|
# TODO: somehow stop a big HTML tree from exploding synapse's RAM
|
||||||
tree = html.parse(media_info['filename'])
|
tree = html.parse(media_info['filename'])
|
||||||
|
|
||||||
# suck it up into lxml and define our OG response.
|
# suck it up into lxml and define our OG response.
|
||||||
@ -83,16 +85,57 @@ class PreviewUrlResource(BaseMediaResource):
|
|||||||
# "og:description" : "Synapse 0.12 is out! Lots of polishing, performance & bugfixes: /sync API, /r0 prefix, fulltext search, 3PID invites https://t.co/5alhXLLEGP"
|
# "og:description" : "Synapse 0.12 is out! Lots of polishing, performance & bugfixes: /sync API, /r0 prefix, fulltext search, 3PID invites https://t.co/5alhXLLEGP"
|
||||||
# "og:site_name" : "Twitter"
|
# "og:site_name" : "Twitter"
|
||||||
|
|
||||||
|
# or:
|
||||||
|
|
||||||
|
# "og:type" : "video",
|
||||||
|
# "og:url" : "https://www.youtube.com/watch?v=LXDBoHyjmtw",
|
||||||
|
# "og:site_name" : "YouTube",
|
||||||
|
# "og:video:type" : "application/x-shockwave-flash",
|
||||||
|
# "og:description" : " ",
|
||||||
|
# "og:title" : "RemoteJam - Matrix team hack for Disrupt Europe Hackathon",
|
||||||
|
# "og:image" : "https://i.ytimg.com/vi/LXDBoHyjmtw/maxresdefault.jpg",
|
||||||
|
# "og:video:url" : "http://www.youtube.com/v/LXDBoHyjmtw?version=3&autohide=1",
|
||||||
|
# "og:video:width" : "1280"
|
||||||
|
# "og:video:height" : "720",
|
||||||
|
# "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3&autohide=1",
|
||||||
|
|
||||||
og = {}
|
og = {}
|
||||||
for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"):
|
for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"):
|
||||||
og[tag.attrib['property']] = tag.attrib['content']
|
og[tag.attrib['property']] = tag.attrib['content']
|
||||||
|
|
||||||
|
if not og:
|
||||||
|
# do some basic spidering of the HTML
|
||||||
|
title = tree.xpath("(//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1]")
|
||||||
|
og['og:title'] = title[0].text if title else None
|
||||||
|
|
||||||
|
images = tree.xpath("//img")
|
||||||
|
big_images = [ i for i in images if (
|
||||||
|
'width' in i and 'height' in i and
|
||||||
|
i.attrib['width'] > 64 and i.attrib['height'] > 64
|
||||||
|
)] or images
|
||||||
|
og['og:image'] = images[0].attrib['src'] if images else None
|
||||||
|
|
||||||
|
text_nodes = tree.xpath("//h1/text() | //h2/text() | //h3/text() | //p/text() | //div/text() | //span/text() | //a/text()")
|
||||||
|
text = ''
|
||||||
|
for text_node in text_nodes:
|
||||||
|
if len(text) < 1024:
|
||||||
|
text += text_node + ' '
|
||||||
|
else:
|
||||||
|
break
|
||||||
|
text = re.sub(r'[\t ]+', ' ', text)
|
||||||
|
text = re.sub(r'[\t \r\n]*[\r\n]+', '\n', text)
|
||||||
|
text = text.strip()[:1024]
|
||||||
|
og['og:description'] = text if text else None
|
||||||
|
|
||||||
|
# TODO: turn any OG media URLs into mxc URLs to capture and thumbnail them too
|
||||||
# TODO: store our OG details in a cache (and expire them when stale)
|
# TODO: store our OG details in a cache (and expire them when stale)
|
||||||
# TODO: delete the content to stop diskfilling, as we only ever cared about its OG
|
# TODO: delete the content to stop diskfilling, as we only ever cared about its OG
|
||||||
else:
|
else:
|
||||||
logger.warn("Failed to find any OG data in %s", url)
|
logger.warn("Failed to find any OG data in %s", url)
|
||||||
og = {}
|
og = {}
|
||||||
|
|
||||||
|
logger.warn(og)
|
||||||
|
|
||||||
respond_with_json_bytes(request, 200, json.dumps(og), send_cors=True)
|
respond_with_json_bytes(request, 200, json.dumps(og), send_cors=True)
|
||||||
except:
|
except:
|
||||||
# XXX: if we don't explicitly respond here, the request never returns.
|
# XXX: if we don't explicitly respond here, the request never returns.
|
||||||
@ -111,6 +154,10 @@ class PreviewUrlResource(BaseMediaResource):
|
|||||||
|
|
||||||
@defer.inlineCallbacks
|
@defer.inlineCallbacks
|
||||||
def _download_url(self, url, user):
|
def _download_url(self, url, user):
|
||||||
|
# TODO: we should probably honour robots.txt... except in practice
|
||||||
|
# we're most likely being explicitly triggered by a human rather than a
|
||||||
|
# bot, so are we really a robot?
|
||||||
|
|
||||||
# XXX: horrible duplication with base_resource's _download_remote_file()
|
# XXX: horrible duplication with base_resource's _download_remote_file()
|
||||||
file_id = random_string(24)
|
file_id = random_string(24)
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user