From 03bb1280050e6fcb09405c8848cb58ec7807f8d9 Mon Sep 17 00:00:00 2001 From: Andrew Kvalheim Date: Thu, 14 Jul 2022 09:41:33 -0700 Subject: [PATCH 01/11] Key entries by link if missing ID MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Resolves the problem of incorrectly duplicated entries in feeds that update content but don’t explicitly provide entry IDs. Example feed: - https://www.to-rss.xyz/wikipedia/current_events/ Example entry: <item> <title>Current events: 2022-07-13</title> <link>https://en.wikipedia.org/wiki/Portal:Current_events/2022_July_13</link> <description>[VARIABLE CONTENT]</description> <pubDate>Wed, 13 Jul 2022 00:00:00 -0000</pubDate> </item> This behavior is suggested by the common practice of using an entry’s link as its ID value, and is consistent with typical feed aggregators such as Feedbin and Inoreader. --- rss/bot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rss/bot.py b/rss/bot.py index 0352f4c..42ca897 100644 --- a/rss/bot.py +++ b/rss/bot.py @@ -279,12 +279,12 @@ class RSSBot(Plugin): feed_id=feed_id, id=( getattr(entry, "id", None) + or getattr(entry, "link", None) or hashlib.sha1( " ".join( [ getattr(entry, "title", ""), getattr(entry, "description", ""), - getattr(entry, "link", ""), ] ).encode("utf-8") ).hexdigest() From 1a52d18f5993e7c5e7c78729a396dbe686b0a560 Mon Sep 17 00:00:00 2001 From: Tulir Asokan Date: Tue, 21 Feb 2023 12:43:32 +0200 Subject: [PATCH 02/11] Show current template if ran without arguments --- rss/bot.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/rss/bot.py b/rss/bot.py index 0352f4c..6c6e2fc 100644 --- a/rss/bot.py +++ b/rss/bot.py @@ -21,6 +21,7 @@ from string import Template from time import mktime, time import asyncio import hashlib +import html import aiohttp import attr @@ -392,7 +393,7 @@ class RSSBot(Plugin): help="Change the notification template for a subscription in this room", ) @command.argument("feed_id", "feed ID", parser=int) - @command.argument("template", "new template", 
pass_raw=True) + @command.argument("template", "new template", pass_raw=True, required=False) async def command_template(self, evt: MessageEvent, feed_id: int, template: str) -> None: if not await self.can_manage(evt): return @@ -400,6 +401,13 @@ class RSSBot(Plugin): if not sub: await evt.reply("This room is not subscribed to that feed") return + if not template: + await evt.reply( + '<p>Current template in this room:</p><pre><code language="markdown">'
+                f"{html.escape(sub.notification_template.template)}"
+                "</code></pre>", allow_html=True, markdown=False, + ) + return await self.dbm.update_template(feed.id, evt.room_id, template) sub = Subscription( feed_id=feed.id, From ef4915e43442f500f232d7f5067d6f9dbc73b013 Mon Sep 17 00:00:00 2001 From: Tulir Asokan Date: Tue, 21 Feb 2023 12:47:13 +0200 Subject: [PATCH 03/11] Add usage to readme --- README.md | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/README.md b/README.md index ee06f0c..aab772a 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,30 @@ # rss A [maubot](https://github.com/maubot/maubot) that posts RSS feed updates to Matrix. + +## Usage +Basic commands: + +* `!rss subscribe <url>` - Subscribe the current room to a feed. +* `!rss unsubscribe <feed ID>` - Unsubscribe the current room from a feed. +* `!rss subscriptions` - List subscriptions (and feed IDs) in the current room. +* `!rss notice <feed ID> [true/false]` - Set whether the bot should send new + posts as `m.notice` (if false, they're sent as `m.text`). +* `!rss template <feed ID> [new template]` - Change the post template for a + feed in the current room. If the new template is omitted, the bot replies + with the current template. + +### Templates +The default template is `New post in $feed_title: [$title]($link)`. + +Templates are interpreted as markdown with some simple variable substitution. +The following variables are available: + +* `$feed_url` - The URL that was used to subscribe to the feed. +* `$feed_link` - The home page of the feed. +* `$feed_title` - The title of the feed. +* `$feed_subtitle` - The subtitle of the feed. +* `$id` - The unique ID of the entry. +* `$date` - The date of the entry. +* `$title` - The title of the entry. +* `$summary` - The summary/description of the entry. +* `$link` - The link of the entry. 
From eeb71a008f39a2bcbe8b8bdba36ead433fd8eba0 Mon Sep 17 00:00:00 2001 From: Tulir Asokan Date: Tue, 21 Feb 2023 13:22:19 +0200 Subject: [PATCH 04/11] Fix formatting --- rss/bot.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/rss/bot.py b/rss/bot.py index 6c6e2fc..4806f1f 100644 --- a/rss/bot.py +++ b/rss/bot.py @@ -405,7 +405,9 @@ class RSSBot(Plugin): await evt.reply( '<p>Current template in this room:</p><pre><code language="markdown">'
                 f"{html.escape(sub.notification_template.template)}"
-                "</code></pre>", allow_html=True, markdown=False, + "</code></pre>", + allow_html=True, + markdown=False, ) return await self.dbm.update_template(feed.id, evt.room_id, template) From a8f134012515202e4be7fab0b0609f08b8b068f4 Mon Sep 17 00:00:00 2001 From: Tulir Asokan Date: Thu, 30 Jan 2025 14:07:57 +0200 Subject: [PATCH 05/11] Update feedparser input --- rss/bot.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/rss/bot.py b/rss/bot.py index 945903b..423118c 100644 --- a/rss/bot.py +++ b/rss/bot.py @@ -22,6 +22,7 @@ from time import mktime, time import asyncio import hashlib import html +import io import aiohttp import attr @@ -264,7 +265,7 @@ class RSSBot(Plugin): except UnicodeDecodeError: content = str(await resp.read())[2:-1] headers = {"Content-Location": feed.url, **resp.headers, "Content-Encoding": "identity"} - parsed_data = feedparser.parse(content, response_headers=headers) + parsed_data = feedparser.parse(io.StringIO(content), response_headers=headers) if parsed_data.bozo: if not isinstance(parsed_data.bozo_exception, feedparser.ThingsNobodyCaresAboutButMe): raise parsed_data.bozo_exception From f62b0335dd0f12c95d873fe2729cca43d22f4b84 Mon Sep 17 00:00:00 2001 From: Tulir Asokan Date: Thu, 30 Jan 2025 14:08:04 +0200 Subject: [PATCH 06/11] Update linters --- .github/workflows/python-lint.yml | 2 +- .pre-commit-config.yaml | 6 +++--- pyproject.toml | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/python-lint.yml b/.github/workflows/python-lint.yml index fc28bdb..18be560 100644 --- a/.github/workflows/python-lint.yml +++ b/.github/workflows/python-lint.yml @@ -9,7 +9,7 @@ jobs: - uses: actions/checkout@v3 - uses: actions/setup-python@v3 with: - python-version: "3.10" + python-version: "3.13" - uses: isort/isort-action@master with: sortPaths: "./rss" diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7f1b3e5..caefdcb 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - 
repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.1.0 + rev: v5.0.0 hooks: - id: trailing-whitespace exclude_types: [markdown] @@ -8,13 +8,13 @@ repos: - id: check-yaml - id: check-added-large-files - repo: https://github.com/psf/black - rev: 22.3.0 + rev: 25.1.0 hooks: - id: black language_version: python3 files: ^rss/.*\.pyi?$ - repo: https://github.com/PyCQA/isort - rev: 5.10.1 + rev: 6.0.0 hooks: - id: isort files: ^rss/.*\.pyi?$ diff --git a/pyproject.toml b/pyproject.toml index 3e608c9..f143797 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,4 +8,4 @@ line_length = 99 [tool.black] line-length = 99 -target-version = ["py38"] +target-version = ["py310"] From 68e5a84096cfe26e0b95f870726912f94d540c8e Mon Sep 17 00:00:00 2001 From: Tulir Asokan Date: Thu, 30 Jan 2025 14:10:05 +0200 Subject: [PATCH 07/11] Bump version to v0.4.0 --- maubot.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/maubot.yaml b/maubot.yaml index 4cefc3d..c1ec735 100644 --- a/maubot.yaml +++ b/maubot.yaml @@ -1,6 +1,6 @@ maubot: 0.3.0 id: xyz.maubot.rss -version: 0.3.2 +version: 0.4.0 license: AGPL-3.0-or-later modules: - rss From 72d08096b7af74b1565d7958a1aac3f8e246e59d Mon Sep 17 00:00:00 2001 From: Tulir Asokan Date: Thu, 30 Jan 2025 15:47:09 +0200 Subject: [PATCH 08/11] Pass raw data to feedparser. 
Fixes #59 --- rss/bot.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/rss/bot.py b/rss/bot.py index 423118c..74c1681 100644 --- a/rss/bot.py +++ b/rss/bot.py @@ -257,15 +257,9 @@ class RSSBot(Plugin): async def _parse_rss( cls, feed: Feed, resp: aiohttp.ClientResponse ) -> tuple[Feed, list[Entry]]: - try: - content = await resp.text() - except UnicodeDecodeError: - try: - content = await resp.text(encoding="utf-8", errors="ignore") - except UnicodeDecodeError: - content = str(await resp.read())[2:-1] + content = await resp.read() headers = {"Content-Location": feed.url, **resp.headers, "Content-Encoding": "identity"} - parsed_data = feedparser.parse(io.StringIO(content), response_headers=headers) + parsed_data = feedparser.parse(io.BytesIO(content), response_headers=headers) if parsed_data.bozo: if not isinstance(parsed_data.bozo_exception, feedparser.ThingsNobodyCaresAboutButMe): raise parsed_data.bozo_exception From 81ec8ed86494fae5f1dd49628c316158067ffeab Mon Sep 17 00:00:00 2001 From: Tulir Asokan Date: Thu, 30 Jan 2025 15:53:50 +0200 Subject: [PATCH 09/11] Bump version to 0.4.1 --- maubot.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/maubot.yaml b/maubot.yaml index c1ec735..b8c0836 100644 --- a/maubot.yaml +++ b/maubot.yaml @@ -1,6 +1,6 @@ maubot: 0.3.0 id: xyz.maubot.rss -version: 0.4.0 +version: 0.4.1 license: AGPL-3.0-or-later modules: - rss From 93984bef86f90bafc14b14c3ea6b3d16cfda1d2b Mon Sep 17 00:00:00 2001 From: Tulir Asokan Date: Wed, 7 May 2025 15:19:05 +0300 Subject: [PATCH 10/11] Remove sqlite returning special case --- rss/db.py | 27 +++------------------------ 1 file changed, 3 insertions(+), 24 deletions(-) diff --git a/rss/db.py b/rss/db.py index e6faa88..69c930c 100644 --- a/rss/db.py +++ b/rss/db.py @@ -25,12 +25,6 @@ import attr from mautrix.types import RoomID, UserID from mautrix.util.async_db import Database, Scheme -# TODO make this import unconditional after updating 
mautrix-python -try: - from mautrix.util.async_db import SQLiteCursor -except ImportError: - SQLiteCursor = None - @dataclass class Subscription: @@ -188,24 +182,9 @@ class DBManager: "INSERT INTO feed (url, title, subtitle, link, next_retry) " "VALUES ($1, $2, $3, $4, $5) RETURNING (id)" ) - # SQLite only gained RETURNING support in v3.35 (2021-03-12) - # TODO remove this special case in a couple of years - if self.db.scheme == Scheme.SQLITE: - cur = await self.db.execute( - q.replace(" RETURNING (id)", ""), - info.url, - info.title, - info.subtitle, - info.link, - info.next_retry, - ) - if SQLiteCursor is not None: - assert isinstance(cur, SQLiteCursor) - info.id = cur.lastrowid - else: - info.id = await self.db.fetchval( - q, info.url, info.title, info.subtitle, info.link, info.next_retry - ) + info.id = await self.db.fetchval( + q, info.url, info.title, info.subtitle, info.link, info.next_retry + ) return info async def set_backoff(self, info: Feed, error_count: int, next_retry: int) -> None: From 7df6c62f6b20b16fbf01a37818d60c84868987ae Mon Sep 17 00:00:00 2001 From: Tulir Asokan Date: Wed, 27 Aug 2025 14:44:12 +0300 Subject: [PATCH 11/11] Add parallelism limit for polling feeds --- base-config.yaml | 2 ++ rss/bot.py | 24 +++++++++++++++++++++--- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/base-config.yaml b/base-config.yaml index 52c92d4..6df33fe 100644 --- a/base-config.yaml +++ b/base-config.yaml @@ -2,6 +2,8 @@ update_interval: 60 # Maximum backoff in minutes when failing to fetch feeds (defaults to 5 days) max_backoff: 7200 +# How many feeds to poll in parallel? Set to 0 to disable limit. +poll_parallelism_limit: 10 # The time to sleep between send requests when broadcasting a new feed entry. # Set to 0 to disable sleep or -1 to run all requests asynchronously at once. 
spam_sleep: 2 diff --git a/rss/bot.py b/rss/bot.py index 74c1681..91c745f 100644 --- a/rss/bot.py +++ b/rss/bot.py @@ -18,7 +18,7 @@ from __future__ import annotations from typing import Any, Iterable from datetime import datetime from string import Template -from time import mktime, time +from time import mktime, monotonic, time import asyncio import hashlib import html @@ -55,6 +55,7 @@ class Config(BaseProxyConfig): helper.copy("command_prefix") helper.copy("notification_template") helper.copy("admins") + helper.copy("poll_parallelism_limit") class BoolArgument(command.Argument): @@ -75,6 +76,7 @@ class BoolArgument(command.Argument): class RSSBot(Plugin): dbm: DBManager poll_task: asyncio.Future + poll_sema: asyncio.Semaphore | None http: aiohttp.ClientSession power_level_cache: dict[RoomID, tuple[int, PowerLevelStateEventContent]] @@ -88,12 +90,19 @@ class RSSBot(Plugin): async def start(self) -> None: await super().start() - self.config.load_and_update() + self.on_external_config_update() self.dbm = DBManager(self.database) self.http = self.client.api.session self.power_level_cache = {} self.poll_task = asyncio.create_task(self.poll_feeds()) + def on_external_config_update(self) -> None: + self.config.load_and_update() + poll_parallelism_limit = self.config["poll_parallelism_limit"] + self.poll_sema = ( + asyncio.Semaphore(poll_parallelism_limit) if poll_parallelism_limit > 0 else None + ) + async def stop(self) -> None: await super().stop() self.poll_task.cancel() @@ -142,6 +151,7 @@ class RSSBot(Plugin): if not subs: return now = int(time()) + start = monotonic() tasks = [self.try_parse_feed(feed=feed) for feed in subs if feed.next_retry < now] feed: Feed entries: Iterable[Entry] @@ -176,7 +186,8 @@ class RSSBot(Plugin): await self.dbm.add_entries(new_entry_list) for entry in new_entry_list: await self._broadcast(feed, entry, feed.subscriptions) - self.log.info(f"Finished polling {len(tasks)} feeds") + duration = monotonic() - start + self.log.info(f"Finished 
polling {len(tasks)} feeds in {duration:.2f} seconds") async def _poll_feeds(self) -> None: self.log.debug("Polling started") @@ -190,6 +201,13 @@ class RSSBot(Plugin): await asyncio.sleep(self.config["update_interval"] * 60) async def try_parse_feed(self, feed: Feed | None = None) -> tuple[Feed, list[Entry]]: + if self.poll_sema is not None: + async with self.poll_sema: + return await self._try_parse_feed(feed) + else: + return await self._try_parse_feed(feed) + + async def _try_parse_feed(self, feed: Feed | None = None) -> tuple[Feed, list[Entry]]: try: self.log.trace( f"Trying to fetch {feed.id} / {feed.url} "