maubot-rss/rss/bot.py

464 lines
18 KiB
Python
Raw Normal View History

2018-11-27 02:33:41 +02:00
# rss - A maubot plugin to subscribe to RSS/Atom feeds.
2022-03-26 14:32:18 +02:00
# Copyright (C) 2022 Tulir Asokan
2018-11-27 02:33:41 +02:00
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
2022-03-26 14:32:18 +02:00
from __future__ import annotations
from typing import Any, Iterable
2018-11-27 02:33:41 +02:00
from datetime import datetime
from string import Template
2022-03-26 14:32:18 +02:00
from time import mktime, time
2018-11-27 02:33:41 +02:00
import asyncio
2022-03-26 14:32:18 +02:00
import hashlib
2018-11-27 02:33:41 +02:00
import aiohttp
2022-03-26 14:32:18 +02:00
import attr
2018-11-27 02:33:41 +02:00
import feedparser
2022-03-26 14:32:18 +02:00
from maubot import MessageEvent, Plugin
2019-11-21 23:50:56 +02:00
from maubot.handlers import command, event
2022-03-26 14:32:18 +02:00
from mautrix.types import (
EventID,
EventType,
MessageType,
PowerLevelStateEventContent,
RoomID,
StateEvent,
)
from mautrix.util.async_db import UpgradeTable
from mautrix.util.config import BaseProxyConfig, ConfigUpdateHelper
2018-11-27 02:33:41 +02:00
2022-03-26 14:32:18 +02:00
from .db import DBManager, Entry, Feed, Subscription
from .migrations import upgrade_table
2018-11-27 02:33:41 +02:00
# Custom state event type whose required power level (looked up in the room's
# power level state by RSSBot.can_manage) decides who may manage subscriptions.
rss_change_level = EventType.find("xyz.maubot.rss", t_class=EventType.Class.STATE)
2018-11-27 02:33:41 +02:00
class Config(BaseProxyConfig):
    """Configuration proxy for the RSS plugin."""

    def do_update(self, helper: ConfigUpdateHelper) -> None:
        """Copy every configuration key this plugin uses through ``helper``."""
        copied_keys = (
            "update_interval",
            "max_backoff",
            "spam_sleep",
            "command_prefix",
            "notification_template",
            "admins",
        )
        for key in copied_keys:
            helper.copy(key)
2020-07-01 17:20:14 +03:00
class BoolArgument(command.Argument):
    """Command argument that accepts a boolean written as a short word."""

    def __init__(self, name: str, label: str = None, *, required: bool = False) -> None:
        super().__init__(name, label, required=required, pass_raw=False)

    def match(self, val: str, **kwargs) -> tuple[str, Any]:
        """Parse the first space-separated word of ``val`` as a boolean.

        Returns the unconsumed remainder of ``val`` together with the parsed
        value. Raises ``ValueError`` when the word is not a recognised
        true/false spelling (matching is case-insensitive).
        """
        word = val.split(" ")[0].lower()
        if word in ("t", "true", "y", "yes", "1"):
            parsed = True
        elif word in ("f", "false", "n", "no", "0"):
            parsed = False
        else:
            raise ValueError("invalid boolean")
        remainder = val[len(word) :]
        return remainder, parsed
2020-07-01 17:20:14 +03:00
2018-11-27 02:33:41 +02:00
class RSSBot(Plugin):
2022-03-26 14:32:18 +02:00
dbm: DBManager
2018-11-27 02:33:41 +02:00
poll_task: asyncio.Future
http: aiohttp.ClientSession
2022-03-26 14:32:18 +02:00
power_level_cache: dict[RoomID, tuple[int, PowerLevelStateEventContent]]
2018-11-27 02:33:41 +02:00
@classmethod
def get_config_class(cls) -> type[BaseProxyConfig]:
    """Return ``Config`` as the configuration class of this plugin."""
    config_class = Config
    return config_class
2022-03-26 14:32:18 +02:00
@classmethod
def get_db_upgrade_table(cls) -> UpgradeTable:
    """Return the module-level ``upgrade_table`` imported from ``.migrations``."""
    table = upgrade_table
    return table
2018-11-27 02:33:41 +02:00
async def start(self) -> None:
    """Load the config, set up plugin state and schedule the polling task."""
    await super().start()
    self.config.load_and_update()
    self.dbm = DBManager(self.database)
    self.http = self.client.api.session
    self.power_level_cache = {}
    # The task handle is kept so that stop() can cancel the poller.
    poller = self.poll_feeds()
    self.poll_task = asyncio.ensure_future(poller, loop=self.loop)
async def stop(self) -> None:
    """Run the parent shutdown, then cancel the background polling task."""
    await super().stop()
    poller = self.poll_task
    poller.cancel()
async def poll_feeds(self) -> None:
    """Run the polling loop and log, rather than propagate, how it ended.

    Cancellation is logged at debug level; any other exception escaping the
    loop is logged with its traceback.
    """
    try:
        await self._poll_feeds()
    except asyncio.CancelledError:
        self.log.debug("Polling stopped")
        return
    except Exception:
        self.log.exception("Fatal error while polling feeds")
        return
2018-11-27 02:33:41 +02:00
async def _send(self, feed: Feed, entry: Entry, sub: Subscription) -> EventID:
    """Render the subscription's template for ``entry`` and send it to the room.

    The template receives the feed's url/title/subtitle/link plus every
    attribute of the entry. Send failures are logged as warnings and
    swallowed; in that case the coroutine falls through and returns ``None``.
    """
    variables = {
        "feed_url": feed.url,
        "feed_title": feed.title,
        "feed_subtitle": feed.subtitle,
        "feed_link": feed.link,
    }
    # Entry attributes are merged last, so they win on a key clash.
    variables.update(attr.asdict(entry))
    text = sub.notification_template.safe_substitute(variables)
    if sub.send_notice:
        msgtype = MessageType.NOTICE
    else:
        msgtype = MessageType.TEXT
    try:
        return await self.client.send_markdown(
            sub.room_id, text, msgtype=msgtype, allow_html=True
        )
    except Exception as e:
        self.log.warning(f"Failed to send {entry.id} of {feed.id} to {sub.room_id}: {e}")
2022-03-26 14:32:18 +02:00
async def _broadcast(
    self, feed: Feed, entry: Entry, subscriptions: list[Subscription]
) -> None:
    """Send one entry to every subscription of a feed.

    With a non-negative ``spam_sleep`` config value the notifications are
    sent one at a time, sleeping that many seconds after each send. With a
    negative value they are all sent concurrently.
    """
    self.log.debug(f"Broadcasting {entry.id} of {feed.id}")
    spam_sleep = self.config["spam_sleep"]
    if spam_sleep >= 0:
        for sub in subscriptions:
            # Coroutines are created lazily so none is left un-awaited if the
            # loop is interrupted (e.g. by cancellation) part-way through.
            await self._send(feed, entry, sub)
            # No ``loop=`` argument: it was removed from asyncio.sleep() in
            # Python 3.10 and passing it raises TypeError there.
            await asyncio.sleep(spam_sleep)
    else:
        await asyncio.gather(*(self._send(feed, entry, sub) for sub in subscriptions))
async def _poll_once(self) -> None:
    """Fetch every feed that is due and broadcast the entries not stored yet.

    Feeds whose fetch yields no entries get a backoff that grows linearly
    with the failure count, capped at ``max_backoff`` minutes. A fetch that
    yields entries resets any existing backoff.
    """
    feeds = await self.dbm.get_feeds()
    if not feeds:
        return
    now = int(time())
    # Only feeds whose retry time has passed are fetched in this round.
    tasks = [self.try_parse_feed(feed=due) for due in feeds if due.next_retry < now]
    feed: Feed
    entries: Iterable[Entry]
    self.log.info(f"Polling {len(tasks)} feeds")
    for finished in asyncio.as_completed(tasks):
        feed, entries = await finished
        self.log.trace(
            f"Fetching {feed.id} (backoff: {feed.error_count} / {feed.next_retry}) "
            f"success: {bool(entries)}"
        )
        if not entries:
            error_count = feed.error_count + 1
            delay = min(
                self.config["update_interval"] * 60 * error_count,
                self.config["max_backoff"] * 60,
            )
            next_retry = int(time() + delay)
            self.log.debug(f"Setting backoff of {feed.id} to {error_count} / {next_retry}")
            await self.dbm.set_backoff(feed, error_count, next_retry)
            continue
        if feed.error_count > 0:
            self.log.debug(f"Resetting backoff of {feed.id}")
            await self.dbm.set_backoff(feed, error_count=0, next_retry=0)
        try:
            unseen = {item.id: item for item in entries}
        except Exception:
            self.log.exception(f"Weird error in items of {feed.url}")
            continue
        # Drop everything that is already stored for this feed.
        for known in await self.dbm.get_entries(feed.id):
            unseen.pop(known.id, None)
        self.log.trace(f"Feed {feed.id} had {len(unseen)} new entries")
        to_send: list[Entry] = sorted(unseen.values(), key=lambda item: (item.date, item.id))
        # Entries are stored before broadcasting.
        await self.dbm.add_entries(to_send)
        for item in to_send:
            await self._broadcast(feed, item, feed.subscriptions)
    self.log.info(f"Finished polling {len(tasks)} feeds")
2018-11-27 02:33:41 +02:00
async def _poll_feeds(self) -> None:
    """Poll all feeds forever, sleeping ``update_interval`` minutes between rounds.

    Ordinary exceptions from a polling round are logged and the loop carries
    on. ``asyncio.CancelledError`` is re-raised so that cancelling the task
    in the middle of a round actually ends the loop; the caller
    (``poll_feeds``) logs the stop.
    """
    self.log.debug("Polling started")
    while True:
        try:
            await self._poll_once()
        except asyncio.CancelledError:
            # Swallowing this here would leave the loop running after the
            # task was cancelled.
            raise
        except Exception:
            self.log.exception("Error while polling feeds")
        # No ``loop=`` argument: it was removed from asyncio.sleep() in
        # Python 3.10 and passing it raises TypeError there.
        await asyncio.sleep(self.config["update_interval"] * 60)
2022-03-26 14:32:18 +02:00
async def try_parse_feed(self, feed: Feed | None = None) -> tuple[Feed, list[Entry]]:
    """Fetch and parse ``feed``, turning any failure into an empty entry list.

    Errors raised by ``parse_feed`` are logged as warnings; the feed is then
    returned unchanged together with ``[]``.
    """
    try:
        self.log.trace(
            f"Trying to fetch {feed.id} / {feed.url} "
            f"(backoff: {feed.error_count} / {feed.next_retry})"
        )
        outcome = await self.parse_feed(feed=feed)
    except Exception as e:
        self.log.warning(f"Failed to parse feed {feed.id} / {feed.url}: {e}")
        outcome = feed, []
    return outcome
2022-03-26 14:32:18 +02:00
async def parse_feed(
    self, *, feed: Feed | None = None, url: str | None = None
) -> tuple[Feed, list[Entry]]:
    """Download a feed and parse it as a JSON feed or as RSS/Atom.

    Exactly one of ``feed`` and ``url`` must be given. When only ``url`` is
    given, a placeholder ``Feed`` with ID -1 is created for it.

    Raises ``ValueError`` if neither or both of ``feed`` and ``url`` are set.
    Errors from the HTTP request or the parsers propagate to the caller.
    """
    if feed is None:
        if url is None:
            raise ValueError("Either feed or url must be set")
        feed = Feed(id=-1, url=url, title="", subtitle="", link="")
    elif url is not None:
        raise ValueError("Only one of feed or url must be set")
    resp = await self.http.get(feed.url)
    # A missing Content-Type header falls through to the RSS parser instead
    # of raising KeyError; the media type is compared case-insensitively.
    ct = resp.headers.get("Content-Type", "").split(";")[0].strip().lower()
    if ct in ("application/json", "application/feed+json"):
        return await self._parse_json(feed, resp)
    return await self._parse_rss(feed, resp)
@classmethod
async def _parse_json(
    cls, feed: Feed, resp: aiohttp.ClientResponse
) -> tuple[Feed, list[Entry]]:
    """Parse a JSON Feed (version 1 or 1.1) response into feed info and entries.

    Updates ``feed`` in place with the title, subtitle and home page link.
    Raises ``ValueError`` for an unsupported version or a non-list ``items``.
    """
    supported_versions = (
        "https://jsonfeed.org/version/1",
        "https://jsonfeed.org/version/1.1",
    )
    content = await resp.json()
    if content["version"] not in supported_versions:
        raise ValueError("Unsupported JSON feed version")
    items = content["items"]
    if not isinstance(items, list):
        raise ValueError("Feed is not a valid JSON feed (items is not a list)")
    feed.title = content["title"]
    feed.subtitle = content.get("subtitle", "")
    feed.link = content.get("home_page_url", "")
    parsed_entries = [cls._parse_json_entry(feed.id, item) for item in items]
    return feed, parsed_entries
2020-08-03 03:03:19 +03:00
@classmethod
def _parse_json_entry(cls, feed_id: int, entry: dict[str, Any]) -> Entry:
    """Convert one JSON Feed item into an ``Entry``.

    The date falls back to the current time when ``date_published`` is
    missing, malformed or not a string. The summary is the first non-empty
    of ``summary``, ``content_html`` and ``content_text``; the link falls
    back to the item ID.

    Raises ``KeyError`` if the item has no ``id``.
    """
    try:
        date = datetime.fromisoformat(entry["date_published"])
    except (ValueError, KeyError, TypeError):
        # TypeError covers a present-but-non-string value such as JSON null,
        # which previously aborted parsing of the whole feed.
        # NOTE(review): fromisoformat may return a timezone-aware datetime
        # while this fallback is naive; mixing both in one feed would break
        # sorting entries by date -- confirm whether normalising is wanted.
        date = datetime.now()
    # `or ""` also replaces an explicit null title.
    title = entry.get("title") or ""
    summary = (
        entry.get("summary") or entry.get("content_html") or entry.get("content_text") or ""
    ).strip()
    entry_id = str(entry["id"])
    link = entry.get("url") or entry_id
    return Entry(
        feed_id=feed_id, id=entry_id, date=date, title=title, summary=summary, link=link
    )
@classmethod
async def _parse_rss(
    cls, feed: Feed, resp: aiohttp.ClientResponse
) -> tuple[Feed, list[Entry]]:
    """Parse an RSS/Atom response with feedparser into feed info and entries.

    Updates ``feed`` in place with title (defaulting to the URL), subtitle
    and link. Re-raises feedparser's ``bozo_exception`` unless it is a
    ``ThingsNobodyCaresAboutButMe`` instance.
    """
    try:
        body = await resp.text()
    except UnicodeDecodeError:
        try:
            body = await resp.text(encoding="utf-8", errors="ignore")
        except UnicodeDecodeError:
            # Last resort: the repr of the raw bytes without the b'' wrapper.
            body = str(await resp.read())[2:-1]
    # Content-Encoding is forced to "identity" after the response headers are
    # merged in, overriding whatever the server sent.
    headers = {"Content-Location": feed.url, **resp.headers, "Content-Encoding": "identity"}
    parsed = feedparser.parse(body, response_headers=headers)
    if parsed.bozo and not isinstance(
        parsed.bozo_exception, feedparser.ThingsNobodyCaresAboutButMe
    ):
        raise parsed.bozo_exception
    info = parsed.get("feed", {})
    feed.title = info.get("title", feed.url)
    feed.subtitle = info.get("description", "")
    feed.link = info.get("link", "")
    parsed_entries = [cls._parse_rss_entry(feed.id, item) for item in parsed.entries]
    return feed, parsed_entries
@classmethod
def _parse_rss_entry(cls, feed_id: int, entry: Any) -> Entry:
    """Convert one feedparser entry into an ``Entry``.

    When the entry has no (truthy) ``id``, the SHA-1 hex digest of its
    title, description and link joined by spaces is used instead.
    """
    title = getattr(entry, "title", "")
    description = getattr(entry, "description", "")
    link = getattr(entry, "link", "")
    entry_id = getattr(entry, "id", None)
    if not entry_id:
        fingerprint = " ".join([title, description, link])
        entry_id = hashlib.sha1(fingerprint.encode("utf-8")).hexdigest()
    return Entry(
        feed_id=feed_id,
        id=entry_id,
        date=cls._parse_rss_date(entry),
        title=title,
        summary=description.strip(),
        link=link,
    )
@staticmethod
def _parse_rss_date(entry: Any) -> datetime:
    """Return the entry's publication date, or the current time as a fallback.

    ``published_parsed`` is tried first, then ``date_parsed``. A value that
    is missing, of the wrong type or out of the platform's supported range
    is skipped instead of aborting the whole feed.
    """
    for key in ("published_parsed", "date_parsed"):
        try:
            return datetime.fromtimestamp(mktime(entry[key]))
        except (KeyError, TypeError, ValueError, OverflowError, OSError):
            # OverflowError/OSError: mktime()/fromtimestamp() reject times
            # outside the range the platform supports (bogus feed dates).
            continue
    return datetime.now()
2018-11-27 02:33:41 +02:00
async def get_power_levels(self, room_id: RoomID) -> PowerLevelStateEventContent:
    """Return the room's power levels, cached for five minutes per room.

    A cached value is used only while its expiry time is still in the
    future; otherwise the state event is fetched again and re-cached.
    """
    cached = self.power_level_cache.get(room_id)
    if cached is not None:
        expiry, levels = cached
        # The original compared `expiry < now`, which returned the cached
        # value only AFTER it had expired and refetched while it was fresh.
        if expiry > int(time()):
            return levels
    levels = await self.client.get_state_event(room_id, EventType.ROOM_POWER_LEVELS)
    self.power_level_cache[room_id] = (int(time()) + 5 * 60, levels)
    return levels
async def can_manage(self, evt: MessageEvent) -> bool:
    """Check whether the sender of ``evt`` may manage this room's subscriptions.

    Configured admins always may. Everyone else needs a power level of at
    least the room's level for the ``rss_change_level`` state event
    (50 when that level is not an int). On refusal a reply explaining the
    missing permission is sent and ``False`` is returned.
    """
    if evt.sender in self.config["admins"]:
        return True
    levels = await self.get_power_levels(evt.room_id)
    user_level = levels.get_user_level(evt.sender)
    required_level = levels.get_event_level(rss_change_level)
    if not isinstance(required_level, int):
        required_level = 50
    if user_level >= required_level:
        return True
    await evt.reply("You don't have the permission to manage the subscriptions of this room.")
    return False
2022-03-26 14:32:18 +02:00
@command.new(
    name=lambda self: self.config["command_prefix"],
    help="Manage this RSS bot",
    require_subcommand=True,
)
async def rss(self) -> None:
    """Root command: does nothing itself and only hosts the subcommands."""
    return None
2022-03-26 14:32:18 +02:00
@rss.subcommand("subscribe", aliases=("s", "sub"), help="Subscribe this room to a feed.")
@command.argument("url", "feed URL", pass_raw=True)
async def subscribe(self, evt: MessageEvent, url: str) -> None:
    """Subscribe the room of ``evt`` to the feed at ``url``.

    An unknown feed is fetched once, stored, and its current entries are
    saved. A known feed with recorded errors gets its next retry time reset
    to 0. Replies with the outcome, including the case where the room is
    already subscribed.
    """
    allowed = await self.can_manage(evt)
    if not allowed:
        return
    feed = await self.dbm.get_feed_by_url(url)
    if not feed:
        try:
            info, entries = await self.parse_feed(url=url)
        except Exception as e:
            await evt.reply(f"Failed to load feed: {e}")
            return
        feed = await self.dbm.create_feed(info)
        await self.dbm.add_entries(entries, override_feed_id=feed.id)
    elif feed.error_count > 0:
        await self.dbm.set_backoff(feed, error_count=feed.error_count, next_retry=0)
    feed_info = f"feed ID {feed.id}: [{feed.title}]({feed.url})"
    sub, _ = await self.dbm.get_subscription(feed.id, evt.room_id)
    if sub is None:
        await self.dbm.subscribe(
            feed.id, evt.room_id, evt.sender, self.config["notification_template"]
        )
        await evt.reply(f"Subscribed to {feed_info}")
        return
    if sub.user_id == evt.sender:
        subscriber = "You"
    else:
        subscriber = f"[{sub.user_id}](https://matrix.to/#/{sub.user_id})"
    await evt.reply(f"{subscriber} had already subscribed this room to {feed_info}")
2022-03-26 14:32:18 +02:00
@rss.subcommand(
    "unsubscribe", aliases=("u", "unsub"), help="Unsubscribe this room from a feed."
)
@command.argument("feed_id", "feed ID", parser=int)
async def unsubscribe(self, evt: MessageEvent, feed_id: int) -> None:
    """Remove this room's subscription to ``feed_id`` and confirm with a reply."""
    allowed = await self.can_manage(evt)
    if not allowed:
        return
    sub, feed = await self.dbm.get_subscription(feed_id, evt.room_id)
    if not sub:
        await evt.reply("This room is not subscribed to that feed")
        return
    await self.dbm.unsubscribe(feed.id, evt.room_id)
    confirmation = f"Unsubscribed from feed ID {feed.id}: [{feed.title}]({feed.url})"
    await evt.reply(confirmation)
2022-03-26 14:32:18 +02:00
@rss.subcommand(
    "template",
    aliases=("t", "tpl"),
    help="Change the notification template for a subscription in this room",
)
@command.argument("feed_id", "feed ID", parser=int)
@command.argument("template", "new template", pass_raw=True)
async def command_template(self, evt: MessageEvent, feed_id: int, template: str) -> None:
    """Store a new notification template and post a sample rendered with it."""
    allowed = await self.can_manage(evt)
    if not allowed:
        return
    sub, feed = await self.dbm.get_subscription(feed_id, evt.room_id)
    if not sub:
        await evt.reply("This room is not subscribed to that feed")
        return
    await self.dbm.update_template(feed.id, evt.room_id, template)
    # Made-up entry used only to preview the new template.
    sample_entry = Entry(
        feed_id=feed.id,
        id="SAMPLE",
        date=datetime.now(),
        title="Sample entry",
        summary="This is a sample entry to demonstrate your new template",
        link="http://example.com",
    )
    # Copy of the subscription carrying the new template, for the preview.
    preview_sub = Subscription(
        feed_id=feed.id,
        room_id=sub.room_id,
        user_id=sub.user_id,
        notification_template=Template(template),
        send_notice=sub.send_notice,
    )
    await evt.reply(f"Template for feed ID {feed.id} updated. Sample notification:")
    await self._send(feed, sample_entry, preview_sub)
2022-03-26 14:32:18 +02:00
@rss.subcommand(
    "notice", aliases=("n",), help="Set whether or not the bot should send updates as m.notice"
)
@command.argument("feed_id", "feed ID", parser=int)
@BoolArgument("setting", "true/false")
async def command_notice(self, evt: MessageEvent, feed_id: int, setting: bool) -> None:
    """Store whether updates for ``feed_id`` in this room are sent as notices."""
    allowed = await self.can_manage(evt)
    if not allowed:
        return
    sub, feed = await self.dbm.get_subscription(feed_id, evt.room_id)
    if not sub:
        await evt.reply("This room is not subscribed to that feed")
        return
    await self.dbm.set_send_notice(feed.id, evt.room_id, setting)
    if setting:
        send_type = "m.notice"
    else:
        send_type = "m.text"
    await evt.reply(f"Updates for feed ID {feed.id} will now be sent as `{send_type}`")
@staticmethod
def _format_subscription(feed: Feed, subscriber: str) -> str:
    """Format one subscription as a markdown list item.

    A warning line is appended when the feed's error count is above one.
    """
    parts = [
        f"* {feed.id} - [{feed.title}]({feed.url}) "
        f"(subscribed by [{subscriber}](https://matrix.to/#/{subscriber}))"
    ]
    if feed.error_count > 1:
        parts.append(
            f" \n ⚠️ The last {feed.error_count} attempts to fetch the feed have failed!"
        )
    return "".join(parts)
2022-03-26 14:32:18 +02:00
@rss.subcommand(
    "subscriptions",
    aliases=("ls", "list", "subs"),
    help="List the subscriptions in the current room.",
)
async def command_subscriptions(self, evt: MessageEvent) -> None:
    """Reply with a markdown list of the feeds this room is subscribed to."""
    subscriptions = await self.dbm.get_feeds_by_room(evt.room_id)
    if len(subscriptions) == 0:
        await evt.reply("There are no RSS subscriptions in this room")
        return
    lines = [
        self._format_subscription(feed, subscriber) for feed, subscriber in subscriptions
    ]
    listing = "\n".join(lines)
    await evt.reply("**Subscriptions in this room:**\n\n" + listing)
2019-11-21 23:50:56 +02:00
@event.on(EventType.ROOM_TOMBSTONE)
async def tombstone(self, evt: StateEvent) -> None:
    """Pass a tombstoned room's replacement room ID to the database manager.

    Tombstones that name no replacement room are ignored.
    """
    replacement = evt.content.replacement_room
    if replacement:
        await self.dbm.update_room_id(evt.room_id, replacement)