python and typst script

2025-12-13 15:39:15 -05:00 · 2023-07-09 20:53:03 +00:00 · 2023-07-09 20:53:03 +00:00 · be05046783
commit be05046783
parent da5f497ec1
19 changed files with 2223 additions and 0 deletions
--- a/layout/python/slugify/__init__.py
+++ b/layout/python/slugify/__init__.py
@@ -0,0 +1,7 @@
+from .special import *
+from .slugify import *
+
+
+# Package metadata, exposed as module-level attributes of the package.
+__author__ = 'Val Neekman @ Neekware Inc. [@vneekman]'
+__description__ = 'A Python slugify application that also handles Unicode'
+__version__ = '4.0.1'
--- a/layout/python/slugify/__main__.py
+++ b/layout/python/slugify/__main__.py
@@ -0,0 +1,93 @@
+from __future__ import print_function, absolute_import
+import argparse
+import sys
+
+from .slugify import slugify, DEFAULT_SEPARATOR
+
+
+def parse_args(argv):
+    """
+    Parse the command line arguments of the slugify CLI.
+    :param argv (list): full argument vector; the first item (program name) is skipped
+    :return (argparse.Namespace): parsed options; input_string is always a str and
+        replacements, when given, is a list of [ORIGINAL, REPLACED] pairs
+    Invalid combinations (positional text together with --stdin, or a replacement rule
+    without "->") are reported through parser.error.
+    """
+    parser = argparse.ArgumentParser(description="Slugify string")
+
+    input_group = parser.add_argument_group(description="Input")
+    input_group.add_argument("input_string", nargs='*',
+                             help='Text to slugify')
+    input_group.add_argument("--stdin", action='store_true',
+                             help="Take the text from STDIN")
+
+    # The --no-* switches store False into the positively named destination.
+    parser.add_argument("--no-entities", action='store_false', dest='entities', default=True,
+                        help="Do not convert HTML entities to unicode")
+    parser.add_argument("--no-decimal", action='store_false', dest='decimal', default=True,
+                        help="Do not convert HTML decimal to unicode")
+    parser.add_argument("--no-hexadecimal", action='store_false', dest='hexadecimal', default=True,
+                        help="Do not convert HTML hexadecimal to unicode")
+    parser.add_argument("--max-length", type=int, default=0,
+                        help="Output string length, 0 for no limit")
+    parser.add_argument("--word-boundary", action='store_true', default=False,
+                        help="Truncate to complete word even if length ends up shorter than --max-length")
+    parser.add_argument("--save-order", action='store_true', default=False,
+                        help="When set and --max-length > 0 return whole words in the initial order")
+    parser.add_argument("--separator", type=str, default=DEFAULT_SEPARATOR,
+                        help="Separator between words. By default " + DEFAULT_SEPARATOR)
+    parser.add_argument("--stopwords", nargs='+',
+                        help="Words to discount")
+    parser.add_argument("--regex-pattern",
+                        help="Python regex pattern for allowed characters")
+    parser.add_argument("--no-lowercase", action='store_false', dest='lowercase', default=True,
+                        help="Activate case sensitivity")
+    # "%%" is an escaped percent sign: argparse %-formats help strings.
+    parser.add_argument("--replacements", nargs='+',
+                        help="""Additional replacement rules e.g. "|->or", "%%->percent".""")
+
+    args = parser.parse_args(argv[1:])
+
+    if args.input_string and args.stdin:
+        parser.error("Input strings and --stdin cannot work together")
+
+    if args.replacements:
+        def split_check(repl):
+            # Split one "ORIGINAL->REPLACED" rule on the first "->" only, so the
+            # replacement text itself may contain "->".
+            SEP = '->'
+            if SEP not in repl:
+                parser.error("Replacements must be of the form: ORIGINAL{SEP}REPLACED".format(SEP=SEP))
+            return repl.split(SEP, 1)
+        args.replacements = [split_check(repl) for repl in args.replacements]
+
+    # Positional words are joined with single spaces; otherwise read STDIN on request.
+    if args.input_string:
+        args.input_string = " ".join(args.input_string)
+    elif args.stdin:
+        args.input_string = sys.stdin.read()
+
+    # Normalise "no input" (empty list or empty read) to an empty string.
+    if not args.input_string:
+        args.input_string = ''
+
+    return args
+
+
+def slugify_params(args):
+    """
+    Translate parsed command line options into keyword arguments for slugify().
+    :param args (argparse.Namespace): options as produced by parse_args()
+    :return (dict): keyword arguments accepted by slugify()
+    """
+    return dict(
+        text=args.input_string,
+        entities=args.entities,
+        decimal=args.decimal,
+        hexadecimal=args.hexadecimal,
+        max_length=args.max_length,
+        word_boundary=args.word_boundary,
+        save_order=args.save_order,
+        separator=args.separator,
+        stopwords=args.stopwords,
+        # Forward --regex-pattern, which was previously parsed but dropped here.
+        # getattr keeps hand-built namespaces without the attribute working.
+        regex_pattern=getattr(args, 'regex_pattern', None),
+        lowercase=args.lowercase,
+        replacements=args.replacements
+    )
+
+
+def main(argv=None): # pragma: no cover
+    """
+    Command line entry point: slugify the requested text and print the result.
+    :param argv (list): argument vector, sys.argv is used when omitted
+    """
+    arguments = parse_args(sys.argv if argv is None else argv)
+    keyword_args = slugify_params(arguments)
+    try:
+        print(slugify(**keyword_args))
+    except KeyboardInterrupt:
+        # Interrupted by the user: leave with exit status -1.
+        sys.exit(-1)
+
+
+if __name__ == '__main__': # pragma: no cover
+    main()
--- a/layout/python/slugify/slugify.py
+++ b/layout/python/slugify/slugify.py
@@ -0,0 +1,180 @@
+import re
+import unicodedata
+import types
+import sys
+
+try:
+    from htmlentitydefs import name2codepoint
+    _unicode = unicode
+    _unicode_type = types.UnicodeType
+except ImportError:
+    from html.entities import name2codepoint
+    _unicode = str
+    _unicode_type = str
+    unichr = chr
+
+try:
+    import text_unidecode as unidecode
+except ImportError:
+    import unidecode
+
+__all__ = ['slugify', 'smart_truncate']
+
+
+# Named character reference such as "&amp;"; the names come from name2codepoint.
+CHAR_ENTITY_PATTERN = re.compile(r'&(%s);' % '|'.join(name2codepoint))
+# Decimal character reference such as "&#381;".
+DECIMAL_PATTERN = re.compile(r'&#(\d+);')
+# Hexadecimal character reference such as "&#x17D;".
+HEX_PATTERN = re.compile(r'&#x([\da-fA-F]+);')
+# One or more consecutive apostrophes.
+QUOTE_PATTERN = re.compile(r'[\']+')
+# Despite the names, these two match runs of characters that are NOT allowed
+# (anything other than "-", digits and ASCII letters).
+ALLOWED_CHARS_PATTERN = re.compile(r'[^-a-z0-9]+')
+ALLOWED_CHARS_PATTERN_WITH_UPPERCASE = re.compile(r'[^-a-zA-Z0-9]+')
+# Two or more consecutive dashes.
+DUPLICATE_DASH_PATTERN = re.compile(r'-{2,}')
+# A comma sitting directly between two digits, e.g. the one in "1,000".
+NUMBERS_PATTERN = re.compile(r'(?<=\d),(?=\d)')
+# Separator used internally while the slug is built and as the default output separator.
+DEFAULT_SEPARATOR = '-'
+
+
+def smart_truncate(string, max_length=0, word_boundary=False, separator=' ', save_order=False):
+    """
+    Shorten a string to at most max_length characters.
+    :param string (str): text to shorten
+    :param max_length (int): maximum output length, a falsy value disables truncation
+    :param word_boundary (bool): when True only whole words are kept
+    :param save_order (bool): when True stop at the first word that does not fit,
+        otherwise later words that still fit are appended
+    :param separator (str): separator between words
+    :return (str): the shortened text without leading or trailing separators
+    """
+
+    trimmed = string.strip(separator)
+
+    # Nothing to cut when no limit is set or the text is already short enough.
+    if not max_length or len(trimmed) < max_length:
+        return trimmed
+
+    # Plain character cut.
+    if not word_boundary:
+        return trimmed[:max_length].strip(separator)
+
+    # A single word cannot be split on a boundary, so it is cut as is.
+    if separator not in trimmed:
+        return trimmed[:max_length]
+
+    result = ''
+    for token in trimmed.split(separator):
+        if not token:
+            continue
+        candidate_len = len(result) + len(token)
+        if candidate_len == max_length:
+            # Exact fit: take the word and stop.
+            result += '{}'.format(token)
+            break
+        if candidate_len < max_length:
+            result += '{}{}'.format(token, separator)
+        elif save_order:
+            break
+    if not result: # pragma: no cover
+        result = trimmed[:max_length]
+    return result.strip(separator)
+
+
+def slugify(text, entities=True, decimal=True, hexadecimal=True, max_length=0, word_boundary=False,
+            separator=DEFAULT_SEPARATOR, save_order=False, stopwords=(), regex_pattern=None, lowercase=True,
+            replacements=()):
+    """
+    Make a slug from the given text.
+    :param text (str): initial text
+    :param entities (bool): converts html entities to unicode
+    :param decimal (bool): converts html decimal to unicode
+    :param hexadecimal (bool): converts html hexadecimal to unicode
+    :param max_length (int): output string length
+    :param word_boundary (bool): truncates to complete word even if length ends up shorter than max_length
+    :param save_order (bool): if parameter is True and max_length > 0 return whole words in the initial order
+    :param separator (str): separator between words
+    :param stopwords (iterable): words to discount
+    :param regex_pattern (str): regex pattern for allowed characters
+    :param lowercase (bool): activate case sensitivity by setting it to False
+    :param replacements (iterable): list of replacement rules e.g. [['|', 'or'], ['%', 'percent']]
+    :return (str):
+    """
+
+    # user-specific replacements
+    if replacements:
+        for old, new in replacements:
+            text = text.replace(old, new)
+
+    # ensure text is unicode
+    if not isinstance(text, _unicode_type):
+        text = _unicode(text, 'utf-8', 'ignore')
+
+    # replace quotes with dashes - pre-process
+    text = QUOTE_PATTERN.sub(DEFAULT_SEPARATOR, text)
+
+    # decode unicode
+    text = unidecode.unidecode(text)
+
+    # ensure text is still in unicode
+    if not isinstance(text, _unicode_type):
+        text = _unicode(text, 'utf-8', 'ignore')
+
+    # character entity reference
+    if entities:
+        text = CHAR_ENTITY_PATTERN.sub(lambda m: unichr(name2codepoint[m.group(1)]), text)
+
+    # decimal character reference
+    if decimal:
+        try:
+            text = DECIMAL_PATTERN.sub(lambda m: unichr(int(m.group(1))), text)
+        except (ValueError, OverflowError):
+            # A reference outside the valid code point range: keep the text
+            # unchanged (best effort) instead of failing the whole call.
+            pass
+
+    # hexadecimal character reference
+    if hexadecimal:
+        try:
+            text = HEX_PATTERN.sub(lambda m: unichr(int(m.group(1), 16)), text)
+        except (ValueError, OverflowError):
+            # Same best-effort policy as for decimal references.
+            pass
+
+    # translate
+    text = unicodedata.normalize('NFKD', text)
+    if sys.version_info < (3,):
+        text = text.encode('ascii', 'ignore')
+
+    # make the text lowercase (optional)
+    if lowercase:
+        text = text.lower()
+
+    # remove generated quotes -- post-process
+    text = QUOTE_PATTERN.sub('', text)
+
+    # cleanup numbers
+    text = NUMBERS_PATTERN.sub('', text)
+
+    # replace all other unwanted characters
+    if lowercase:
+        pattern = regex_pattern or ALLOWED_CHARS_PATTERN
+    else:
+        pattern = regex_pattern or ALLOWED_CHARS_PATTERN_WITH_UPPERCASE
+    text = re.sub(pattern, DEFAULT_SEPARATOR, text)
+
+    # remove redundant
+    text = DUPLICATE_DASH_PATTERN.sub(DEFAULT_SEPARATOR, text).strip(DEFAULT_SEPARATOR)
+
+    # remove stopwords
+    if stopwords:
+        if lowercase:
+            stopwords_lower = [s.lower() for s in stopwords]
+            words = [w for w in text.split(DEFAULT_SEPARATOR) if w not in stopwords_lower]
+        else:
+            words = [w for w in text.split(DEFAULT_SEPARATOR) if w not in stopwords]
+        text = DEFAULT_SEPARATOR.join(words)
+
+    # finalize user-specific replacements
+    if replacements:
+        for old, new in replacements:
+            text = text.replace(old, new)
+
+    # smart truncate if requested
+    if max_length > 0:
+        text = smart_truncate(text, max_length, word_boundary, DEFAULT_SEPARATOR, save_order)
+
+    # the slug is built with the default separator and converted only at the very end
+    if separator != DEFAULT_SEPARATOR:
+        text = text.replace(DEFAULT_SEPARATOR, separator)
+
+    return text
--- a/layout/python/slugify/special.py
+++ b/layout/python/slugify/special.py
@@ -0,0 +1,47 @@
+# -*- coding: utf-8 -*-
+
+
+def add_uppercase_char(char_list):
+    """
+    Given a replacement char list, this adds uppercase chars to the list.
+    :param char_list (list): (char, translation) tuples; extended in place
+    :return (list): the same list object, with the uppercase variants inserted at the front
+    """
+
+    # Walk a snapshot: the list itself grows at the front while we go through it.
+    for char, xlate in list(char_list):
+        upper_dict = char.upper(), xlate.capitalize()
+        # Skip entries that are already uppercase or whose uppercase rule is already present.
+        if upper_dict not in char_list and char != upper_dict[0]:
+            char_list.insert(0, upper_dict)
+    # Return only after every entry was handled (and also for an empty list).
+    return char_list
+
+
+# Language specific pre translations
+# Source awesome-slugify
+
+# Each table entry is a (source character, replacement text) tuple. Every table
+# is passed through add_uppercase_char(), which modifies the list in place and
+# returns it.
+_CYRILLIC = [      # package defaults:
+    (u'ё', u'e'),    # io / yo
+    (u'я', u'ya'),   # ia
+    (u'х', u'h'),    # kh
+    (u'у', u'y'),    # u
+    (u'щ', u'sch'),  # shch
+    (u'ю', u'u'),    # iu / yu
+]
+CYRILLIC = add_uppercase_char(_CYRILLIC)
+
+_GERMAN = [        # package defaults:
+    (u'ä', u'ae'),   # a
+    (u'ö', u'oe'),   # o
+    (u'ü', u'ue'),   # u
+]
+GERMAN = add_uppercase_char(_GERMAN)
+
+_GREEK = [         # package defaults:
+    (u'χ', u'ch'),   # kh
+    (u'Ξ', u'X'),    # Ks
+    (u'ϒ', u'Y'),    # U
+    (u'υ', u'y'),    # u
+    (u'ύ', u'y'),
+    (u'ϋ', u'y'),
+    (u'ΰ', u'y'),
+]
+GREEK = add_uppercase_char(_GREEK)
+
+# Pre translations
+# All language tables concatenated, in the order Cyrillic, German, Greek.
+PRE_TRANSLATIONS = CYRILLIC + GERMAN + GREEK