forked-synapse/synapse/config/oembed.py

# Copyright 2021 The Matrix.org Foundation C.I.C.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import re
from typing import Any, Dict, Iterable, List, Optional, Pattern
from urllib import parse as urlparse

import attr
import pkg_resources

from synapse.types import JsonDict, StrSequence

from ._base import Config, ConfigError
from ._util import validate_config


@attr.s(auto_attribs=True, frozen=True, slots=True)
class OEmbedEndpointConfig:
    """One oEmbed API endpoint together with the URL patterns it serves."""

    # URL of the oEmbed API endpoint to request.
    api_endpoint: str
    # Compiled regular expressions for the URLs this endpoint applies to.
    url_patterns: List[Pattern]
    # Formats declared for the endpoint, or None when none were declared.
    formats: Optional[List[str]]


class OembedConfig(Config):
    """oEmbed Configuration

    Reads the `oembed` section of the configuration and turns the provider
    definitions (the packaged `providers.json` plus any additional JSON files)
    into a list of `OEmbedEndpointConfig` objects.
    """

    section = "oembed"

    def read_config(self, config: JsonDict, **kwargs: Any) -> None:
        """Populate `self.oembed_patterns` from the `oembed` config section.

        Args:
            config: The parsed configuration dictionary.
            **kwargs: Accepted for interface compatibility; not used here.

        Raises:
            ConfigError: If a provider file cannot be loaded or a provider
                uses a non-HTTP(S) endpoint or pattern.
        """
        # A missing (or null) `oembed` section is treated as an empty one.
        oembed_config: Dict[str, Any] = config.get("oembed") or {}

        # A list of patterns which will be used.
        self.oembed_patterns: List[OEmbedEndpointConfig] = list(
            self._parse_and_validate_providers(oembed_config)
        )

    def _parse_and_validate_providers(
        self, oembed_config: dict
    ) -> Iterable[OEmbedEndpointConfig]:
        """Extract and parse the oEmbed providers from the configured JSON files.

        The packaged `providers.json` is read first (unless
        `disable_default_providers` is set to a truthy value), followed by
        each file listed under `additional_providers`, in order.

        Args:
            oembed_config: The contents of the `oembed` config section.

        Returns:
            A generator which yields the OEmbedEndpointConfig objects.

        Raises:
            ConfigError: If an additional providers file cannot be read or is
                not valid JSON, or if a provider entry is rejected.
        """
        # Whether to use the packaged providers.json file.
        if not oembed_config.get("disable_default_providers"):
            with pkg_resources.resource_stream("synapse", "res/providers.json") as s:
                providers = json.load(s)

            yield from self._parse_and_validate_provider(
                providers, config_path=("oembed",)
            )

        # The JSON files which include additional provider information.
        for i, file in enumerate(oembed_config.get("additional_providers") or []):
            config_path = ("oembed", "additional_providers", f"<item {i}>")

            # Report unreadable or malformed files as configuration errors
            # (with the offending config path) instead of a bare traceback.
            # JSON text is UTF-8, so do not rely on the locale's encoding.
            try:
                with open(file, encoding="utf-8") as f:
                    providers = json.load(f)
            except (OSError, ValueError) as e:
                # ValueError covers both json.JSONDecodeError and
                # UnicodeDecodeError.
                raise ConfigError(
                    f"Unable to load oEmbed providers from {file}: {e}",
                    config_path,
                ) from e

            yield from self._parse_and_validate_provider(
                providers, config_path=config_path
            )

    def _parse_and_validate_provider(
        self, providers: List[JsonDict], config_path: StrSequence
    ) -> Iterable[OEmbedEndpointConfig]:
        """Validate a list of providers and yield a config for each endpoint.

        Args:
            providers: The decoded contents of a providers JSON file.
            config_path: Where in the configuration these providers came
                from; attached to any error raised.

        Returns:
            A generator yielding one OEmbedEndpointConfig per endpoint of
            each provider.

        Raises:
            ConfigError: If an endpoint URL or one of its patterns does not
                use the http or https scheme.
        """
        # Ensure it is the proper form.
        # NOTE(review): presumably raises ConfigError on a schema mismatch --
        # confirm against validate_config.
        validate_config(
            _OEMBED_PROVIDER_SCHEMA,
            providers,
            config_path=config_path,
        )

        # Parse it and yield each result.
        for provider in providers:
            # Each provider might have multiple API endpoints, each which
            # might have multiple patterns to match.
            for endpoint in provider["endpoints"]:
                api_endpoint = endpoint["url"]

                # The API endpoint must be an HTTP(S) URL.
                results = urlparse.urlparse(api_endpoint)
                if results.scheme not in {"http", "https"}:
                    raise ConfigError(
                        f"Unsupported oEmbed scheme ({results.scheme}) for endpoint {api_endpoint}",
                        config_path,
                    )

                patterns = [
                    self._glob_to_pattern(glob, config_path)
                    for glob in endpoint["schemes"]
                ]
                # "formats" is optional in the schema; None means undeclared.
                yield OEmbedEndpointConfig(
                    api_endpoint, patterns, endpoint.get("formats")
                )

    def _glob_to_pattern(self, glob: str, config_path: StrSequence) -> Pattern:
        """
        Convert the glob into a sane regular expression to match against. The
        rules followed will be slightly different for the domain portion vs.
        the rest.

        1. The scheme must be one of HTTP / HTTPS (and have no globs).
        2. The domain can have globs, but we limit it to characters that can
           reasonably be a domain part.
           TODO: This does not attempt to handle Unicode domain names.
           TODO: The domain should not allow wildcard TLDs.
        3. Other parts allow a glob to be any one, or more, characters.

        Args:
            glob: The URL glob, where `*` is the wildcard.
            config_path: Where in the configuration the glob came from;
                attached to any error raised.

        Returns:
            The compiled regular expression. No anchors are added to it.

        Raises:
            ConfigError: If the glob's scheme is not http or https.
        """
        results = urlparse.urlparse(glob)

        # The scheme must be HTTP(S) (and cannot contain wildcards).
        if results.scheme not in {"http", "https"}:
            raise ConfigError(
                f"Unsupported oEmbed scheme ({results.scheme}) for pattern: {glob}",
                config_path,
            )

        # Escape every component so that only the glob's `*` acts as a
        # wildcard, then swap the escaped `*` for the appropriate regex: a
        # restricted character class in the domain, anything elsewhere.
        pattern = urlparse.urlunparse(
            [
                results.scheme,
                re.escape(results.netloc).replace("\\*", "[a-zA-Z0-9_-]+"),
            ]
            + [re.escape(part).replace("\\*", ".+") for part in results[2:]]
        )
        return re.compile(pattern)


_OEMBED_PROVIDER_SCHEMA = {
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "provider_name": {"type": "string"},
            "provider_url": {"type": "string"},
            "endpoints": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "schemes": {
                            "type": "array",
                            "items": {"type": "string"},
                        },
                        "url": {"type": "string"},
                        "formats": {"type": "array", "items": {"type": "string"}},
                        "discovery": {"type": "boolean"},
                    },
                    "required": ["schemes", "url"],
                },
            },
        },
        "required": ["provider_name", "provider_url", "endpoints"],
    },
}
Allow configuration of the oEmbed URLs. (#10714) This adds configuration options (under an `oembed` section) to configure which URLs are matched to use oEmbed for URL previews. 2021-08-31 18:37:07 -04:00			`# Copyright 2021 The Matrix.org Foundation C.I.C.`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
			`import json`
			`import re`
Request JSON for oEmbed requests (and ignore XML only providers). (#10759) This adds the format to the request arguments / URL to ensure that JSON data is returned (which is all that Synapse supports). This also adds additional error checking / filtering to the configuration file to ignore XML-only providers. 2021-09-08 07:17:52 -04:00			`from typing import Any, Dict, Iterable, List, Optional, Pattern`
Allow configuration of the oEmbed URLs. (#10714) This adds configuration options (under an `oembed` section) to configure which URLs are matched to use oEmbed for URL previews. 2021-08-31 18:37:07 -04:00			`from urllib import parse as urlparse`

			`import attr`
			`import pkg_resources`

Re-type config paths in `ConfigError`s to be `StrSequence`s (#15615) Part of #14809. Signed-off-by: Sean Quah <seanq@matrix.org> 2023-05-18 06:11:30 -04:00			`from synapse.types import JsonDict, StrSequence`
Allow configuration of the oEmbed URLs. (#10714) This adds configuration options (under an `oembed` section) to configure which URLs are matched to use oEmbed for URL previews. 2021-08-31 18:37:07 -04:00
			`from ._base import Config, ConfigError`
			`from ._util import validate_config`


			`@attr.s(slots=True, frozen=True, auto_attribs=True)`
			`class OEmbedEndpointConfig:`
			`# The API endpoint to fetch.`
			`api_endpoint: str`
			`# The patterns to match.`
			`url_patterns: List[Pattern]`
Request JSON for oEmbed requests (and ignore XML only providers). (#10759) This adds the format to the request arguments / URL to ensure that JSON data is returned (which is all that Synapse supports). This also adds additional error checking / filtering to the configuration file to ignore XML-only providers. 2021-09-08 07:17:52 -04:00			`# The supported formats.`
			`formats: Optional[List[str]]`
Allow configuration of the oEmbed URLs. (#10714) This adds configuration options (under an `oembed` section) to configure which URLs are matched to use oEmbed for URL previews. 2021-08-31 18:37:07 -04:00

			`class OembedConfig(Config):`
			`"""oEmbed Configuration"""`

			`section = "oembed"`

Add missing type hints to config classes. (#12402) 2022-04-11 12:07:23 -04:00			`def read_config(self, config: JsonDict, **kwargs: Any) -> None:`
Allow configuration of the oEmbed URLs. (#10714) This adds configuration options (under an `oembed` section) to configure which URLs are matched to use oEmbed for URL previews. 2021-08-31 18:37:07 -04:00			`oembed_config: Dict[str, Any] = config.get("oembed") or {}`

			`# A list of patterns which will be used.`
			`self.oembed_patterns: List[OEmbedEndpointConfig] = list(`
			`self._parse_and_validate_providers(oembed_config)`
			`)`

			`def _parse_and_validate_providers(`
			`self, oembed_config: dict`
			`) -> Iterable[OEmbedEndpointConfig]:`
			`"""Extract and parse the oEmbed providers from the given JSON file.`

			`Returns a generator which yields the OidcProviderConfig objects`
			`"""`
			`# Whether to use the packaged providers.json file.`
			`if not oembed_config.get("disable_default_providers") or False:`
Properly close providers.json file stream. (#12794) 2022-05-19 10:23:59 -04:00			`with pkg_resources.resource_stream("synapse", "res/providers.json") as s:`
			`providers = json.load(s)`

Allow configuration of the oEmbed URLs. (#10714) This adds configuration options (under an `oembed` section) to configure which URLs are matched to use oEmbed for URL previews. 2021-08-31 18:37:07 -04:00			`yield from self._parse_and_validate_provider(`
			`providers, config_path=("oembed",)`
			`)`

			`# The JSON files which includes additional provider information.`
			`for i, file in enumerate(oembed_config.get("additional_providers") or []):`
			`# TODO Error checking.`
			`with open(file) as f:`
			`providers = json.load(f)`

			`yield from self._parse_and_validate_provider(`
			`providers,`
			`config_path=(`
			`"oembed",`
			`"additional_providers",`
			`f"<item {i}>",`
			`),`
			`)`

			`def _parse_and_validate_provider(`
Re-type config paths in `ConfigError`s to be `StrSequence`s (#15615) Part of #14809. Signed-off-by: Sean Quah <seanq@matrix.org> 2023-05-18 06:11:30 -04:00			`self, providers: List[JsonDict], config_path: StrSequence`
Allow configuration of the oEmbed URLs. (#10714) This adds configuration options (under an `oembed` section) to configure which URLs are matched to use oEmbed for URL previews. 2021-08-31 18:37:07 -04:00			`) -> Iterable[OEmbedEndpointConfig]:`
			`# Ensure it is the proper form.`
			`validate_config(`
			`_OEMBED_PROVIDER_SCHEMA,`
			`providers,`
			`config_path=config_path,`
			`)`

			`# Parse it and yield each result.`
			`for provider in providers:`
			`# Each provider might have multiple API endpoints, each which`
			`# might have multiple patterns to match.`
			`for endpoint in provider["endpoints"]:`
			`api_endpoint = endpoint["url"]`
Request JSON for oEmbed requests (and ignore XML only providers). (#10759) This adds the format to the request arguments / URL to ensure that JSON data is returned (which is all that Synapse supports). This also adds additional error checking / filtering to the configuration file to ignore XML-only providers. 2021-09-08 07:17:52 -04:00
			`# The API endpoint must be an HTTP(S) URL.`
			`results = urlparse.urlparse(api_endpoint)`
			`if results.scheme not in {"http", "https"}:`
			`raise ConfigError(`
			`f"Unsupported oEmbed scheme ({results.scheme}) for endpoint {api_endpoint}",`
			`config_path,`
			`)`

Allow configuration of the oEmbed URLs. (#10714) This adds configuration options (under an `oembed` section) to configure which URLs are matched to use oEmbed for URL previews. 2021-08-31 18:37:07 -04:00			`patterns = [`
			`self._glob_to_pattern(glob, config_path)`
			`for glob in endpoint["schemes"]`
			`]`
Request JSON for oEmbed requests (and ignore XML only providers). (#10759) This adds the format to the request arguments / URL to ensure that JSON data is returned (which is all that Synapse supports). This also adds additional error checking / filtering to the configuration file to ignore XML-only providers. 2021-09-08 07:17:52 -04:00			`yield OEmbedEndpointConfig(`
			`api_endpoint, patterns, endpoint.get("formats")`
			`)`
Allow configuration of the oEmbed URLs. (#10714) This adds configuration options (under an `oembed` section) to configure which URLs are matched to use oEmbed for URL previews. 2021-08-31 18:37:07 -04:00
Re-type config paths in `ConfigError`s to be `StrSequence`s (#15615) Part of #14809. Signed-off-by: Sean Quah <seanq@matrix.org> 2023-05-18 06:11:30 -04:00			`def _glob_to_pattern(self, glob: str, config_path: StrSequence) -> Pattern:`
Allow configuration of the oEmbed URLs. (#10714) This adds configuration options (under an `oembed` section) to configure which URLs are matched to use oEmbed for URL previews. 2021-08-31 18:37:07 -04:00			`"""`
			`Convert the glob into a sane regular expression to match against. The`
			`rules followed will be slightly different for the domain portion vs.`
			`the rest.`

			`1. The scheme must be one of HTTP / HTTPS (and have no globs).`
			`2. The domain can have globs, but we limit it to characters that can`
			`reasonably be a domain part.`
			`TODO: This does not attempt to handle Unicode domain names.`
			`TODO: The domain should not allow wildcard TLDs.`
			`3. Other parts allow a glob to be any one, or more, characters.`
			`"""`
			`results = urlparse.urlparse(glob)`

Request JSON for oEmbed requests (and ignore XML only providers). (#10759) This adds the format to the request arguments / URL to ensure that JSON data is returned (which is all that Synapse supports). This also adds additional error checking / filtering to the configuration file to ignore XML-only providers. 2021-09-08 07:17:52 -04:00			`# The scheme must be HTTP(S) (and cannot contain wildcards).`
Allow configuration of the oEmbed URLs. (#10714) This adds configuration options (under an `oembed` section) to configure which URLs are matched to use oEmbed for URL previews. 2021-08-31 18:37:07 -04:00			`if results.scheme not in {"http", "https"}:`
Request JSON for oEmbed requests (and ignore XML only providers). (#10759) This adds the format to the request arguments / URL to ensure that JSON data is returned (which is all that Synapse supports). This also adds additional error checking / filtering to the configuration file to ignore XML-only providers. 2021-09-08 07:17:52 -04:00			`raise ConfigError(`
			`f"Unsupported oEmbed scheme ({results.scheme}) for pattern: {glob}",`
			`config_path,`
			`)`
Allow configuration of the oEmbed URLs. (#10714) This adds configuration options (under an `oembed` section) to configure which URLs are matched to use oEmbed for URL previews. 2021-08-31 18:37:07 -04:00
			`pattern = urlparse.urlunparse(`
			`[`
			`results.scheme,`
			`re.escape(results.netloc).replace("\\*", "[a-zA-Z0-9_-]+"),`
			`]`
			`+ [re.escape(part).replace("\\*", ".+") for part in results[2:]]`
			`)`
			`return re.compile(pattern)`


			`_OEMBED_PROVIDER_SCHEMA = {`
			`"type": "array",`
			`"items": {`
			`"type": "object",`
			`"properties": {`
			`"provider_name": {"type": "string"},`
			`"provider_url": {"type": "string"},`
			`"endpoints": {`
			`"type": "array",`
			`"items": {`
			`"type": "object",`
			`"properties": {`
			`"schemes": {`
			`"type": "array",`
			`"items": {"type": "string"},`
			`},`
			`"url": {"type": "string"},`
			`"formats": {"type": "array", "items": {"type": "string"}},`
			`"discovery": {"type": "boolean"},`
			`},`
			`"required": ["schemes", "url"],`
			`},`
			`},`
			`},`
			`"required": ["provider_name", "provider_url", "endpoints"],`
			`},`
			`}`