python and typst script

This commit is contained in:
anarsec 2023-07-09 20:53:03 +00:00
parent da5f497ec1
commit be05046783
No known key found for this signature in database
19 changed files with 2223 additions and 0 deletions

View file

@ -0,0 +1,7 @@
from .special import *
from .slugify import *
# Package metadata for the bundled slugify package.
__author__ = 'Val Neekman @ Neekware Inc. [@vneekman]'
__description__ = 'A Python slugify application that also handles Unicode'
# Version string of this copy of the package.
__version__ = '4.0.1'

View file

@ -0,0 +1,93 @@
from __future__ import print_function, absolute_import
import argparse
import sys
from .slugify import slugify, DEFAULT_SEPARATOR
def parse_args(argv):
    """
    Parse the command line of the slugify CLI.

    :param argv (list): full argument vector; the first element (program name) is skipped
    :return (argparse.Namespace): parsed options, where ``input_string`` is always a single
        string (possibly empty) and ``replacements`` is either None or a list of
        ``[original, replaced]`` pairs

    Exits through ``parser.error`` when positional text is combined with ``--stdin`` or
    when a replacement rule does not contain ``->``.
    """
    parser = argparse.ArgumentParser(description="Slugify string")

    input_group = parser.add_argument_group(description="Input")
    input_group.add_argument("input_string", nargs='*',
                             help='Text to slugify')
    input_group.add_argument("--stdin", action='store_true',
                             help="Take the text from STDIN")

    parser.add_argument("--no-entities", action='store_false', dest='entities', default=True,
                        help="Do not convert HTML entities to unicode")
    parser.add_argument("--no-decimal", action='store_false', dest='decimal', default=True,
                        help="Do not convert HTML decimal to unicode")
    parser.add_argument("--no-hexadecimal", action='store_false', dest='hexadecimal', default=True,
                        help="Do not convert HTML hexadecimal to unicode")
    parser.add_argument("--max-length", type=int, default=0,
                        help="Output string length, 0 for no limit")
    # The help texts below refer to the option by its real spelling, --max-length.
    parser.add_argument("--word-boundary", action='store_true', default=False,
                        help="Truncate to complete word even if length ends up shorter than --max-length")
    parser.add_argument("--save-order", action='store_true', default=False,
                        help="When set and --max-length > 0 return whole words in the initial order")
    parser.add_argument("--separator", type=str, default=DEFAULT_SEPARATOR,
                        help="Separator between words. By default " + DEFAULT_SEPARATOR)
    parser.add_argument("--stopwords", nargs='+',
                        help="Words to discount")
    parser.add_argument("--regex-pattern",
                        help="Python regex pattern for allowed characters")
    parser.add_argument("--no-lowercase", action='store_false', dest='lowercase', default=True,
                        help="Activate case sensitivity")
    # '%%' is an escaped percent sign: argparse applies %-formatting to help strings.
    parser.add_argument("--replacements", nargs='+',
                        help="""Additional replacement rules e.g. "|->or", "%%->percent".""")

    args = parser.parse_args(argv[1:])

    if args.input_string and args.stdin:
        parser.error("Input strings and --stdin cannot work together")

    if args.replacements:
        def split_check(repl):
            # Split "ORIGINAL->REPLACED" on the first '->' only, so the
            # replacement text itself may contain further '->' sequences.
            SEP = '->'
            if SEP not in repl:
                parser.error("Replacements must be of the form: ORIGINAL{SEP}REPLACED".format(SEP=SEP))
            return repl.split(SEP, 1)
        args.replacements = [split_check(repl) for repl in args.replacements]

    # Positional words are joined with single spaces; otherwise read STDIN if requested.
    if args.input_string:
        args.input_string = " ".join(args.input_string)
    elif args.stdin:
        args.input_string = sys.stdin.read()

    # Normalise "no input" (empty list or empty STDIN) to an empty string.
    if not args.input_string:
        args.input_string = ''

    return args
def slugify_params(args):
    """
    Translate parsed command-line options into keyword arguments for ``slugify``.

    :param args (argparse.Namespace): options as produced by ``parse_args``
    :return (dict): keyword arguments ready to be passed as ``slugify(**params)``
    """
    return dict(
        text=args.input_string,
        entities=args.entities,
        decimal=args.decimal,
        hexadecimal=args.hexadecimal,
        max_length=args.max_length,
        word_boundary=args.word_boundary,
        save_order=args.save_order,
        separator=args.separator,
        stopwords=args.stopwords,
        # --regex-pattern used to be parsed but never forwarded, so it had no
        # effect. getattr keeps namespaces built without that attribute working.
        regex_pattern=getattr(args, 'regex_pattern', None),
        lowercase=args.lowercase,
        replacements=args.replacements
    )
def main(argv=None):  # pragma: no cover
    """
    Command-line entry point: slugify the requested text and print the result.

    :param argv (list): full argument vector including the program name;
        ``sys.argv`` is used when omitted
    """
    arguments = sys.argv if argv is None else argv
    options = slugify_params(parse_args(arguments))
    try:
        print(slugify(**options))
    except KeyboardInterrupt:
        # Interrupted by the user: leave with a failure status.
        sys.exit(-1)


if __name__ == '__main__':  # pragma: no cover
    main()

View file

@ -0,0 +1,180 @@
import re
import unicodedata
import types
import sys
try:
from htmlentitydefs import name2codepoint
_unicode = unicode
_unicode_type = types.UnicodeType
except ImportError:
from html.entities import name2codepoint
_unicode = str
_unicode_type = str
unichr = chr
try:
import text_unidecode as unidecode
except ImportError:
import unidecode
# Names exported by ``from .slugify import *``.
__all__ = ['slugify', 'smart_truncate']

# Named HTML entity such as ``&amp;``; the alternation is built from the keys
# of ``name2codepoint`` and group 1 captures the entity name.
CHAR_ENTITY_PATTERN = re.compile(r'&(%s);' % '|'.join(name2codepoint))
# Decimal character reference such as ``&#381;``; group 1 captures the digits.
DECIMAL_PATTERN = re.compile(r'&#(\d+);')
# Hexadecimal character reference such as ``&#x17D;``; group 1 captures the hex digits.
HEX_PATTERN = re.compile(r'&#x([\da-fA-F]+);')
# One or more consecutive apostrophes.
QUOTE_PATTERN = re.compile(r'[\']+')
# Runs of characters that are NOT a hyphen, a lowercase ASCII letter or a digit.
ALLOWED_CHARS_PATTERN = re.compile(r'[^-a-z0-9]+')
# Same as above, but uppercase ASCII letters are allowed as well.
ALLOWED_CHARS_PATTERN_WITH_UPPERCASE = re.compile(r'[^-a-zA-Z0-9]+')
# Two or more consecutive hyphens.
DUPLICATE_DASH_PATTERN = re.compile(r'-{2,}')
# A comma sitting directly between two digits (e.g. the comma in ``1,000``).
NUMBERS_PATTERN = re.compile(r'(?<=\d),(?=\d)')
# Separator used internally while the slug is being built, and the default output separator.
DEFAULT_SEPARATOR = '-'
def smart_truncate(string, max_length=0, word_boundary=False, separator=' ', save_order=False):
    """
    Shorten a string to at most ``max_length`` characters.

    :param string (str): string for modification
    :param max_length (int): output string length; a falsy value disables truncation
    :param word_boundary (bool): if True, only whole words (pieces between separators) are kept
    :param separator (str): separator between words; also stripped from both ends of the result
    :param save_order (bool): if True, stop at the first word that does not fit instead of
        skipping it and trying later, shorter words
    :return (str): the truncated string with no leading or trailing separator
    """
    trimmed = string.strip(separator)

    # Nothing to do: no limit requested, or the text is already strictly shorter.
    if not max_length or len(trimmed) < max_length:
        return trimmed

    # Plain cut at the limit, ignoring word boundaries.
    if not word_boundary:
        return trimmed[:max_length].strip(separator)

    # A single word cannot be split on a boundary, so it is cut hard.
    if separator not in trimmed:
        return trimmed[:max_length]

    result = ''
    for chunk in trimmed.split(separator):
        if not chunk:
            # Empty piece produced by consecutive separators.
            continue
        candidate_len = len(result) + len(chunk)
        if candidate_len == max_length:
            # Exact fit: take the word without a trailing separator and stop.
            result += '{}'.format(chunk)
            break
        if candidate_len < max_length:
            result += '{}{}'.format(chunk, separator)
        elif save_order:
            # Word does not fit and order must be preserved: stop here.
            break

    if not result:  # pragma: no cover
        result = trimmed[:max_length]
    return result.strip(separator)
def slugify(text, entities=True, decimal=True, hexadecimal=True, max_length=0, word_boundary=False,
            separator=DEFAULT_SEPARATOR, save_order=False, stopwords=(), regex_pattern=None, lowercase=True,
            replacements=()):
    """
    Make a slug from the given text.

    :param text (str): initial text
    :param entities (bool): converts html entities to unicode
    :param decimal (bool): converts html decimal to unicode
    :param hexadecimal (bool): converts html hexadecimal to unicode
    :param max_length (int): output string length
    :param word_boundary (bool): truncates to complete word even if length ends up shorter than max_length
    :param save_order (bool): if parameter is True and max_length > 0 return whole words in the initial order
    :param separator (str): separator between words
    :param stopwords (iterable): words to discount
    :param regex_pattern (str): regex pattern for allowed characters
    :param lowercase (bool): activate case sensitivity by setting it to False
    :param replacements (iterable): list of replacement rules e.g. [['|', 'or'], ['%', 'percent']]
    :return (str):
    """

    # user-specific replacements
    if replacements:
        for old, new in replacements:
            text = text.replace(old, new)

    # ensure text is unicode
    if not isinstance(text, _unicode_type):
        text = _unicode(text, 'utf-8', 'ignore')

    # replace quotes with dashes - pre-process
    text = QUOTE_PATTERN.sub(DEFAULT_SEPARATOR, text)

    # decode unicode
    text = unidecode.unidecode(text)

    # ensure text is still in unicode
    if not isinstance(text, _unicode_type):
        text = _unicode(text, 'utf-8', 'ignore')

    # character entity reference
    if entities:
        text = CHAR_ENTITY_PATTERN.sub(lambda m: unichr(name2codepoint[m.group(1)]), text)

    # decimal character reference
    if decimal:
        try:
            text = DECIMAL_PATTERN.sub(lambda m: unichr(int(m.group(1))), text)
        except (ValueError, OverflowError):
            # A reference outside the valid code point range cannot be decoded.
            # Best effort: keep the text as it was, the reference is cleaned
            # up later like any other disallowed characters.
            pass

    # hexadecimal character reference
    if hexadecimal:
        try:
            text = HEX_PATTERN.sub(lambda m: unichr(int(m.group(1), 16)), text)
        except (ValueError, OverflowError):
            # Same best-effort handling as for decimal references.
            pass

    # translate
    text = unicodedata.normalize('NFKD', text)
    if sys.version_info < (3,):
        text = text.encode('ascii', 'ignore')

    # make the text lowercase (optional)
    if lowercase:
        text = text.lower()

    # remove generated quotes -- post-process
    text = QUOTE_PATTERN.sub('', text)

    # cleanup numbers: drop commas that sit between digits ("1,000" -> "1000")
    text = NUMBERS_PATTERN.sub('', text)

    # replace all other unwanted characters
    if lowercase:
        pattern = regex_pattern or ALLOWED_CHARS_PATTERN
    else:
        pattern = regex_pattern or ALLOWED_CHARS_PATTERN_WITH_UPPERCASE
    text = re.sub(pattern, DEFAULT_SEPARATOR, text)

    # remove redundant
    text = DUPLICATE_DASH_PATTERN.sub(DEFAULT_SEPARATOR, text).strip(DEFAULT_SEPARATOR)

    # remove stopwords
    if stopwords:
        # Build the lookup set once: constant-time membership tests, and a
        # one-shot iterable of stopwords is consumed exactly one time.
        if lowercase:
            excluded = set(s.lower() for s in stopwords)
        else:
            excluded = set(stopwords)
        words = [w for w in text.split(DEFAULT_SEPARATOR) if w not in excluded]
        text = DEFAULT_SEPARATOR.join(words)

    # finalize user-specific replacements
    if replacements:
        for old, new in replacements:
            text = text.replace(old, new)

    # smart truncate if requested
    if max_length > 0:
        text = smart_truncate(text, max_length, word_boundary, DEFAULT_SEPARATOR, save_order)

    # the slug is built with DEFAULT_SEPARATOR; swap in the requested one last
    if separator != DEFAULT_SEPARATOR:
        text = text.replace(DEFAULT_SEPARATOR, separator)

    return text

View file

@ -0,0 +1,47 @@
# -*- coding: utf-8 -*-
def add_uppercase_char(char_list):
    """
    Given a replacement char list, this adds uppercase chars to the list.

    For every ``(char, translation)`` pair whose ``char`` changes under
    ``upper()``, the pair ``(char.upper(), translation.capitalize())`` is
    inserted at the front of the list unless it is already present.

    :param char_list (list): list of ``(char, translation)`` pairs; modified in place
    :return (list): the same list object that was passed in
    """
    # Iterate over a snapshot: the list itself grows inside the loop, and
    # walking a list while inserting into it only works by accident.
    for char, xlate in list(char_list):
        upper_pair = char.upper(), xlate.capitalize()
        if char != upper_pair[0] and upper_pair not in char_list:
            char_list.insert(0, upper_pair)
    return char_list
# Language specific pre translations
# Source awesome-slugify
#
# Each table is a list of (character, replacement) pairs. The trailing comment
# on an entry shows an alternative transliteration of that character.
# add_uppercase_char() extends a table in place and returns the same list, so
# each public name (e.g. CYRILLIC) is the same object as its underscore-prefixed
# source list (e.g. _CYRILLIC).
# NOTE(review): presumably these tables are meant to be passed as the
# ``replacements`` argument of slugify() -- confirm against callers.

_CYRILLIC = [      # package defaults:
    (u'ё', u'e'),    # io / yo
    (u'я', u'ya'),   # ia
    (u'х', u'h'),    # kh
    (u'у', u'y'),    # u
    (u'щ', u'sch'),  # shch
    (u'ю', u'u'),    # iu / yu
]
CYRILLIC = add_uppercase_char(_CYRILLIC)

_GERMAN = [        # package defaults:
    (u'ä', u'ae'),   # a
    (u'ö', u'oe'),   # o
    (u'ü', u'ue'),   # u
]
GERMAN = add_uppercase_char(_GERMAN)

_GREEK = [         # package defaults:
    (u'χ', u'ch'),   # kh
    (u'Ξ', u'X'),    # Ks
    (u'ϒ', u'Y'),    # U
    (u'υ', u'y'),    # u
    (u'ύ', u'y'),
    (u'ϋ', u'y'),
    (u'ΰ', u'y'),
]
GREEK = add_uppercase_char(_GREEK)

# Pre translations
# New list holding the Cyrillic, German and Greek pairs, in that order.
PRE_TRANSLATIONS = CYRILLIC + GERMAN + GREEK