python and typst script

2025-07-23 06:40:37 -04:00 · 2023-07-09 20:53:03 +00:00 · 2023-07-09 20:53:03 +00:00 · be05046783
commit be05046783
parent da5f497ec1
19 changed files with 2223 additions and 0 deletions
--- a/layout/python/anarsec_article_to_pdf.py
+++ b/layout/python/anarsec_article_to_pdf.py
@ -0,0 +1,221 @@
+import argparse
+import contextlib
+import os
+import pathlib
+import re
+import shutil
+import slugify
+import subprocess
+import tempfile
+
+import pdfimposer
+import PyPDF2
+import toml
+
+class Converter:
+    """Converts an Anarsec article to PDF booklets.
+
+    For the post stored in ``<anarsec_root>/content/posts/<post_id>``, one
+    imposed booklet is produced per paper size ("a4" and "letter") and written
+    into the post directory as ``<post_id>-<paper_size>.pdf``.
+    """
+
+    def __init__(self, pandoc_binary: pathlib.Path, typst_binary: pathlib.Path, anarsec_root: pathlib.Path, post_id: str, *, force: bool = False, verbose: bool = False) -> None:
+        """Initialize the converter.
+
+        Args:
+            pandoc_binary: Path to the Pandoc binary.
+            typst_binary: Path to the typst binary.
+            anarsec_root: Root of the Anarsec repository.
+            post_id: Name of the post folder inside ``content/posts``.
+            force: Replace the output files if they already exist.
+            verbose: Print a message when each output file is created.
+
+        Raises:
+            RuntimeError: If one of the binaries isn't a file, or if the
+                Anarsec root or the post directory isn't a directory.
+        """
+
+        # Set attributes
+        self.pandoc_binary = pandoc_binary
+        self.typst_binary = typst_binary
+        self.anarsec_root = anarsec_root
+        self.post_id = post_id
+        self.force = force
+        self.verbose = verbose
+
+        # Set post directory
+        self.post_directory = self.anarsec_root / "content" / "posts" / self.post_id
+
+        # Check validity of some attributes
+        # (is_file() and is_dir() already return False for missing paths)
+        if not self.pandoc_binary.is_file():
+            raise RuntimeError(f"Pandoc binary '{self.pandoc_binary}' doesn't exist or isn't a file.")
+        if not self.typst_binary.is_file():
+            raise RuntimeError(f"Typst binary '{self.typst_binary}' doesn't exist or isn't a file.")
+        if not self.anarsec_root.is_dir():
+            raise RuntimeError(f"Anarsec root '{self.anarsec_root}' doesn't exist or isn't a directory.")
+        if not self.post_directory.is_dir():
+            raise RuntimeError(f"Post directory '{self.post_directory}' doesn't exist or isn't a directory.")
+
+    @staticmethod
+    def _escape_typst_string(text: str) -> str:
+        """Escape backslashes and double quotes so text fits in a Typst string literal."""
+        return text.replace("\\", "\\\\").replace('"', '\\"')
+
+    def convert(self) -> None:
+        """Convert the post to one PDF booklet per paper size.
+
+        The article, the recommendations page and the glossary entries the
+        article links to (directly or through other entries) are merged into
+        one Markdown document, converted to typst with Pandoc, compiled to
+        PDF, padded to a multiple of four pages and imposed as a booklet.
+
+        Raises:
+            RuntimeError: If a required input file is missing, if an output
+                file already exists and ``force`` is not set, if the article
+                or the recommendations page can't be parsed, or if a linked
+                glossary entry doesn't exist.
+            subprocess.CalledProcessError: If an external command fails.
+        """
+
+        # Set glossary file
+        glossary_file = self.anarsec_root / "content" / "glossary" / "_index.md"
+        if not glossary_file.is_file():
+            raise RuntimeError(f"Glossary file '{glossary_file}' doesn't exist or isn't a file.")
+
+        # Set recommendations file
+        recommendations_file = self.anarsec_root / "content" / "recommendations" / "_index.md"
+        if not recommendations_file.is_file():
+            raise RuntimeError(f"Recommendations file '{recommendations_file}' doesn't exist or isn't a file.")
+
+        # Set input path
+        input_path = self.post_directory / "index.md"
+        if not input_path.is_file():
+            raise RuntimeError(f"Post Markdown file '{input_path}' doesn't exist or isn't a file.")
+
+        # Load the glossary: slug -> (title, body) for every "### title" section
+        glossary = {}
+        for title, body in re.findall(r'### (.*?)\n+(.*?)\n*(?=###|\Z)', glossary_file.read_text(encoding="utf-8"), re.DOTALL | re.MULTILINE):
+            glossary[slugify.slugify(title)] = (title, body)
+
+        # For each paper size
+        for paper_size in ["a4", "letter"]:
+            # Set the output path
+            output_path = self.post_directory / f"{self.post_id}-{paper_size}.pdf"
+            if not self.force and output_path.exists():
+                raise RuntimeError(f"Output file '{output_path}' already exists.")
+
+            # Work in a temporary directory
+            with tempfile.TemporaryDirectory() as temporary_directory:
+                working_directory = pathlib.Path(temporary_directory)
+
+                # Copy the required resources to the working directory
+                shutil.copy(pathlib.Path(__file__).parent.parent / "anarsec_article.typ", working_directory)
+                for filename in input_path.parent.iterdir():
+                    suffix = filename.suffix.lower()
+                    if suffix == ".webp":
+                        # WebP images are converted to "<name>.webp.png"; the Markdown links are rewritten to match below
+                        subprocess.check_call(["convert", filename, working_directory / f"{filename.name}.png"])
+                    elif suffix in [".png", ".jpg", ".jpeg", ".bmp", ".svg", ".gif"]:
+                        shutil.copy(filename, working_directory)
+
+                # Separate the input file into a TOML front matter and Markdown content
+                # (non-greedy, so the front matter stops at the first closing "+++")
+                match = re.fullmatch(r'\+{3}\n(.*?)\+{3}(.*)', input_path.read_text(encoding="utf-8"), re.DOTALL | re.MULTILINE)
+                if match is None:
+                    raise RuntimeError(f"Couldn't separate input file '{input_path}' into a TOML front matter and Markdown content. Is it a valid Anarsec article?")
+                toml_front_matter = toml.loads(match.group(1))
+                markdown_content = match.group(2)
+
+                # Grab the description, i.e. everything before the "<!-- more -->" marker
+                description_match = re.search(r'^(.*?)\<\!\-\- more \-\-\>', markdown_content, re.DOTALL | re.MULTILINE)
+                if description_match is None:
+                    raise RuntimeError(f"Couldn't find the '<!-- more -->' marker in input file '{input_path}'. Is it a valid Anarsec article?")
+                description = description_match.group(1).strip("\n ")
+
+                # Parse the description (Pandoc turns the Markdown into plain text)
+                description_md_path = working_directory / "description.md"
+                description_txt_path = working_directory / "description.txt"
+                description_md_path.write_text(description, encoding="utf-8")
+                subprocess.check_call([str(self.pandoc_binary), "-f", "markdown", "-t", "plain", "--columns", "999999", "-o", description_txt_path, description_md_path])
+                description = description_txt_path.read_text(encoding="utf-8")
+
+                # Copy the front image
+                front_image = working_directory / ("front_image" + pathlib.Path(toml_front_matter['extra']['blogimage']).suffix)
+                shutil.copy(self.anarsec_root / "static" / toml_front_matter['extra']['blogimage'].removeprefix("/"), front_image)
+
+                # Copy the back image
+                back_image = working_directory / "back_image.png"
+                shutil.copy(self.anarsec_root / "static" / "images" / "gay.png", back_image)
+
+                # Add recommendations to the Markdown content
+                recommendations_match = re.search(r'\+{3}.*?\+{3}(.*)', recommendations_file.read_text(encoding="utf-8"), re.MULTILINE | re.DOTALL)
+                if recommendations_match is None:
+                    raise RuntimeError(f"Couldn't find a TOML front matter in recommendations file '{recommendations_file}'.")
+                recommendations = recommendations_match.group(1)
+                markdown_content += f"\n\n# Recommendations\n\n{recommendations}\n\n"
+
+                # Replace all .webp images to .png images in the Markdown content
+                markdown_content = re.sub(r'\((.*?\.webp)\)', lambda match: f'({match.group(1)}.png)', markdown_content)
+
+                # List glossary entries that appear in the Markdown content
+                glossary_entries = set()
+                for anchor in re.findall(r'\[.*?\]\(/glossary\/?#(.*?)\)', markdown_content):
+                    glossary_entries.add(slugify.slugify(anchor))
+
+                # Add to glossary entries the glossary entries that appear in glossary entries, recursively
+                pending_entries = list(glossary_entries)
+                while pending_entries:
+                    entry = pending_entries.pop()
+                    if entry not in glossary:
+                        raise RuntimeError(f"Glossary entry '{entry}' is linked but doesn't exist in glossary file '{glossary_file}'.")
+                    for anchor in re.findall(r'\[.*?\]\((?:/glossary|)\/?#(.*?)\)', glossary[entry][1]):
+                        new_entry = slugify.slugify(anchor)
+                        if new_entry not in glossary_entries:
+                            glossary_entries.add(new_entry)
+                            pending_entries.append(new_entry)
+
+                # Add glossary entries to the Markdown content, in glossary file order
+                if glossary_entries:
+                    markdown_content += "\n\n# Glossary\n\n"
+                    markdown_content += "".join(
+                        f"## {entry_title}\n\n{entry_body}\n\n"
+                        for entry, (entry_title, entry_body) in glossary.items()
+                        if entry in glossary_entries
+                    )
+
+                # Write the Markdown content to a file
+                input_markdown_path = working_directory / f"{self.post_id}-markdown.md"
+                input_markdown_path.write_text(markdown_content, encoding="utf-8")
+
+                # Convert the Markdown content to typst
+                typst_path = working_directory / f"{self.post_id}.typ"
+                subprocess.check_call([str(self.pandoc_binary), "-f", "markdown", "-t", "typst", "--columns", "999999", "-o", typst_path, input_markdown_path])
+
+                # Build the full typst file
+                # (each booklet page is half a sheet: A5 for A4 paper, 5.5in x 8.5in for Letter paper)
+                full_typst_path = working_directory / f"{self.post_id}-full.typ"
+                page_setup = '"a5"' if paper_size == "a4" else 'width: 5.5in, height: 8.5in'
+                title = toml_front_matter["title"]
+                last_edited_date = toml_front_matter["extra"]["dateedit"]
+                typst_description = self._escape_typst_string(description)
+                typst_body = typst_path.read_text(encoding="utf-8")
+                full_typst = f"""
+#import "anarsec_article.typ": anarsec_article, blockquote
+#set page({page_setup})
+#show: content => anarsec_article(
+  title: [
+    {title}
+  ],
+  frontimage: "{front_image.name}",
+  backimage: "{back_image.name}",
+  lastediteddate: "{last_edited_date}",
+  description: "{typst_description}",
+  content
+)
+{typst_body}
+"""
+                full_typst_path.write_text(full_typst, encoding="utf-8")
+
+                # Convert the full typst file to PDF
+                pdf_path = working_directory / f"{self.post_id}.pdf"
+                subprocess.check_call(
+                    [str(self.typst_binary), "--root", str(working_directory), "compile", full_typst_path, pdf_path],
+                    stderr = subprocess.STDOUT
+                )
+
+                # Insert blank pages before the back cover if needed
+                # (the booklet needs a page count that is a multiple of 4)
+                pdf_with_blank_pages_path = working_directory / f"{self.post_id}-with-blank-pages.pdf"
+                with pdf_path.open("rb") as pdf_file:
+                    # The reader must stay open until the writer has written the pages
+                    pdf_reader = PyPDF2.PdfFileReader(pdf_file)
+                    page_count = len(pdf_reader.pages)
+                    needs_blank_pages = page_count % 4 != 0
+                    if needs_blank_pages:
+                        pdf_writer = PyPDF2.PdfFileWriter()
+                        for page in pdf_reader.pages[:-1]:
+                            pdf_writer.addPage(page)
+                        for _ in range(4 - page_count % 4):
+                            pdf_writer.addBlankPage()
+                        pdf_writer.addPage(pdf_reader.pages[-1])
+                        with pdf_with_blank_pages_path.open("wb") as pdf_with_blank_pages_file:
+                            pdf_writer.write(pdf_with_blank_pages_file)
+                if needs_blank_pages:
+                    # Both files are closed at this point, so the copy sees the complete padded PDF
+                    shutil.copy(pdf_with_blank_pages_path, pdf_path)
+
+                # Bookletize (stdout is redirected to the null device while pdfimposer runs)
+                with open(os.devnull, "w") as devnull, contextlib.redirect_stdout(devnull):
+                    pdfimposer.bookletize_on_file(
+                        pdf_path,
+                        output_path,
+                        layout = "2x1",
+                        format = "A4" if paper_size == "a4" else "Letter"
+                    )
+
+            # Print a message
+            if self.verbose:
+                print(f"PDF file '{output_path}' created successfully!")
+
+if __name__ == "__main__":
+    # Describe the command-line interface
+    cli = argparse.ArgumentParser(description="Converts an Anarsec article to PDF booklets.")
+    cli.add_argument(
+        "--pandoc-binary",
+        type=pathlib.Path,
+        required=True,
+        help="Path to the Pandoc binary. Minimum required version is 3.1.5.",
+    )
+    cli.add_argument(
+        "--typst-binary",
+        type=pathlib.Path,
+        required=True,
+        help="Path to the typst binary. Minimum required version is 0.6.0.",
+    )
+    cli.add_argument(
+        "--anarsec-root",
+        type=pathlib.Path,
+        required=True,
+        help="Root of the Anarsec repository.",
+    )
+    cli.add_argument(
+        "--post-id",
+        type=str,
+        required=True,
+        help="ID of the Anarsec post to convert, i.e. the name of the post folder in '/content/posts'.",
+    )
+    # Both switches are plain flags that default to False
+    cli.add_argument(
+        "-f",
+        "--force",
+        action="store_true",
+        help="Replace the output files if they already exist.",
+    )
+    cli.add_argument(
+        "-v",
+        "--verbose",
+        action="store_true",
+        help="Print messages when the output files are created.",
+    )
+    options = cli.parse_args()
+
+    # Build the converter from the parsed options and run the conversion
+    Converter(
+        options.pandoc_binary,
+        options.typst_binary,
+        options.anarsec_root,
+        options.post_id,
+        force=options.force,
+        verbose=options.verbose,
+    ).convert()