# anarsec.guide/layout/python/anarsec_article_to_pdf.py
# Last modified: 2024-11-22 13:04:27 -05:00
# 276 lines, 15 KiB, Python

import argparse
import contextlib
import os
import pathlib
import re
import shutil
import slugify
import subprocess
import tempfile
import pdfimposer
import PyPDF2
import toml
class Converter:
    """Converts an Anarsec article to PDF booklets.

    Given a post id of the form "<post>.<lang>" (e.g. "nophones.en"), reads
    the post's Markdown source from the Anarsec repository, appends the
    recommendations and the glossary entries the post links to, renders the
    result to typst via Pandoc, compiles it with typst, pads the PDF to a
    multiple of four pages, and imposes it as a booklet — once for A4 and
    once for Letter paper.
    """

    def __init__(self, pandoc_binary: pathlib.Path, typst_binary: pathlib.Path, anarsec_root: pathlib.Path, post_id: str, *, force: bool = False, verbose: bool = False):
        """Initialize the converter.

        Arguments:
            pandoc_binary: path to the Pandoc executable.
            typst_binary: path to the typst executable.
            anarsec_root: root directory of the Anarsec repository.
            post_id: post folder name and language separated by a period,
                e.g. "nophones.en" or "nophones.fr".
            force: overwrite existing output files instead of raising.
            verbose: print a message for each PDF created.

        Raises:
            RuntimeError: if the post id is malformed, a binary path doesn't
                exist or isn't a file, or the repository root or post
                directory doesn't exist or isn't a directory.
        """
        # Set attributes; the post id carries the language after a period.
        self.pandoc_binary = pandoc_binary
        self.typst_binary = typst_binary
        self.anarsec_root = anarsec_root
        post_id_parts = post_id.split('.', 1)
        if len(post_id_parts) != 2:
            # Fail with a clear message instead of an IndexError when the
            # language suffix is missing.
            raise RuntimeError(f"Post id '{post_id}' must be of the form '<post>.<lang>', e.g. 'nophones.en'.")
        self.post_id, self.post_lang = post_id_parts
        self.force = force
        self.verbose = verbose

        # Set post directory
        self.post_directory = self.anarsec_root / "content" / "posts" / self.post_id

        # Check validity of some attributes
        if not self.pandoc_binary.exists() or not self.pandoc_binary.is_file():
            raise RuntimeError(f"Pandoc binary '{self.pandoc_binary}' doesn't exist or isn't a file.")
        if not self.typst_binary.exists() or not self.typst_binary.is_file():
            raise RuntimeError(f"Typst binary '{self.typst_binary}' doesn't exist or isn't a file.")
        if not self.anarsec_root.exists() or not self.anarsec_root.is_dir():
            raise RuntimeError(f"Anarsec root '{self.anarsec_root}' doesn't exist or isn't a directory.")
        if not self.post_directory.exists() or not self.post_directory.is_dir():
            raise RuntimeError(f"Post directory '{self.post_directory}' doesn't exist or isn't a directory.")

    def _localized_index(self, directory: str, label: str) -> pathlib.Path:
        """Return the post-language "_index" file under content/<directory>.

        English content lives in "_index.md"; other languages in
        "_index.<lang>.md". `label` is only used in the error message.

        Raises:
            RuntimeError: if the file doesn't exist or isn't a file.
        """
        name = "_index.md" if self.post_lang == 'en' else f"_index.{self.post_lang}.md"
        path = self.anarsec_root / "content" / directory / name
        if not path.exists() or not path.is_file():
            raise RuntimeError(f"{label} file '{path}' doesn't exist or isn't a file.")
        return path

    def convert(self):
        """Convert the input file to the output file. This method should only be run once.

        Produces one imposed booklet PDF per paper size (A4 and Letter)
        under the repository's "static" tree.

        Raises:
            RuntimeError: if a required source file is missing, the post
                can't be parsed, or an output file already exists and
                `force` wasn't given.
        """
        # Locate the language-specific glossary, recommendations and series files.
        glossary_file = self._localized_index("glossary", "Glossary")
        recommendations_file = self._localized_index("recommendations", "Recommendations")
        series_file = self._localized_index("series", "Series")

        # Set input path
        if self.post_lang == 'en':
            input_path = self.post_directory / "index.md"
        else:
            input_path = self.post_directory / f"index.{self.post_lang}.md"
        if not input_path.exists() or not input_path.is_file():
            raise RuntimeError(f"Post Markdown file '{input_path}' doesn't exist or isn't a file.")

        # Load the glossary: map each slugified "### Title" heading to
        # (title, definition text).
        glossary = dict()
        for match in re.findall(r'### (.*?)\n+(.*?)\n*(?=###|\Z)', glossary_file.read_text(), re.DOTALL | re.MULTILINE):
            glossary[slugify.slugify(match[0])] = (match[0], match[1])

        # Load the series markdown (everything after the +++ TOML front matter).
        series_markdown = re.search(r'\+{3}.*?\+{3}(.*)', series_file.read_text(), re.MULTILINE | re.DOTALL).group(1)

        # For each paper size
        for paper_size in ["a4", "letter"]:
            # Set the output path
            if self.post_lang == 'en':
                output_path = self.anarsec_root / "static" / "posts" / self.post_id / f"{self.post_id}-{paper_size}-{self.post_lang}.pdf"
            else:
                output_path = self.anarsec_root / "static" / self.post_lang / "posts" / self.post_id / f"{self.post_id}-{paper_size}-{self.post_lang}.pdf"
            if not self.force and output_path.exists():
                raise RuntimeError(f"Output file '{output_path}' already exists.")

            # Work in a temporary directory
            with tempfile.TemporaryDirectory() as workingDirectory:
                # Copy the required resources to the working directory.
                # .webp images are converted to .png via ImageMagick's
                # `convert`, since the typst toolchain can't embed webp.
                shutil.copy(pathlib.Path(__file__).parent.parent / "anarsec_article.typ", workingDirectory)
                for filename in input_path.parent.iterdir():
                    if filename.suffix.lower() == ".webp":
                        subprocess.check_call(["convert", filename, pathlib.Path(workingDirectory) / f"{filename.name}.png"])
                    elif filename.suffix.lower() in [".png", ".jpg", ".jpeg", ".bmp", ".svg", ".gif"]:
                        shutil.copy(filename, workingDirectory)

                # Separate the input file into a TOML front matter and Markdown content
                match = re.fullmatch(r'\+{3}\n(.*)\+{3}(.*)', input_path.read_text(), re.DOTALL | re.MULTILINE)
                if match is None:
                    # Bug fix: this previously interpolated the non-existent
                    # attribute `self.input_path`, which raised AttributeError
                    # instead of the intended RuntimeError.
                    raise RuntimeError(f"Couldn't separate input file '{input_path}' into a TOML front matter and Markdown content. Is it a valid Anarsec article?")
                toml_front_matter = toml.loads(match.group(1))
                markdown_content = match.group(2)

                # Grab the description (everything before the <!-- more --> marker).
                description = re.search(r'^(.*?)\<\!\-\- more \-\-\>', markdown_content, re.DOTALL | re.MULTILINE).group(1).strip("\n ")

                # Parse the description: render it to plain text through Pandoc.
                description_md_path = pathlib.Path(workingDirectory) / "description.md"
                description_txt_path = pathlib.Path(workingDirectory) / "description.txt"
                description_md_path.write_text(description)
                subprocess.check_call([str(self.pandoc_binary), "-f", "markdown", "-t", "plain", "--columns", "999999", "-o", description_txt_path, description_md_path])
                description = description_txt_path.read_text()

                # Copy the front image ("blogimagepng" takes precedence when present).
                blogimageid = "blogimagepng" if "blogimagepng" in toml_front_matter["extra"] else "blogimage"
                front_image = pathlib.Path(workingDirectory) / ("front_image" + pathlib.Path(toml_front_matter['extra'][blogimageid]).suffix)
                shutil.copy(self.anarsec_root / "static" / toml_front_matter['extra'][blogimageid].removeprefix("/"), front_image)

                # Copy the back image
                back_image = pathlib.Path(workingDirectory) / "back_image.png"
                shutil.copy(self.anarsec_root / "static" / "images" / "gay.png", back_image)

                # Copy the header fonts so typst can find them via TYPST_FONT_PATHS.
                header_font = pathlib.Path(workingDirectory) / "Jost-Medium.ttf"
                shutil.copy(self.anarsec_root / "static" / "fonts" / "Jost-Medium.ttf", header_font)
                header_font_italic = pathlib.Path(workingDirectory) / "Jost-MediumItalic.ttf"
                shutil.copy(self.anarsec_root / "static" / "fonts" / "Jost-MediumItalic.ttf", header_font_italic)

                # Add recommendations to the Markdown content
                recommendations = re.search(r'\+{3}.*?\+{3}(.*)', recommendations_file.read_text(), re.MULTILINE | re.DOTALL).group(1)
                if self.post_lang == 'en':
                    markdown_content += f"\n\n# Appendix: Recommendations\n\n{recommendations}\n\n"
                if self.post_lang == 'fr':
                    markdown_content += f"\n\n# Annexe: Recommendations\n\n{recommendations}\n\n"

                # Make all images paths relative in the Markdown content
                for extension in ["jpg", "png", "webp", "jpeg", "gif"]:
                    # Fix: the `.` before the extension is now escaped as
                    # `\\.` — `\.` in a plain f-string is an invalid escape
                    # sequence (the resulting regex is identical).
                    markdown_content = re.sub(f'\\(\\/posts/{input_path.parent.name}/(.*?\\.{extension})\\)', lambda match: f'({match.group(1)})', markdown_content)

                # Replace all .webp images to .png images in the Markdown
                # content (matches the webp -> png conversion done above).
                markdown_content = re.sub(r'\((.*?\.webp)\)', lambda match: f'({match.group(1)}.png)', markdown_content)

                # List glossary entries that appear in the Markdown content
                glossary_entries = set()
                for match in re.findall(r'\[.*?\]\(/glossary\/?#(.*?)\)', markdown_content):
                    glossary_entries.add(slugify.slugify(match))

                # Add glossary entries to the Markdown content
                if glossary_entries:
                    if self.post_lang == 'en':
                        markdown_content += "\n\n# Appendix: Glossary\n\n"
                    if self.post_lang == 'fr':
                        markdown_content += "\n\n# Annexe: Glossaire\n\n"
                    for entry, entry_content in glossary.items():
                        if entry in glossary_entries:
                            markdown_content += f"## {entry_content[0]}\n\n{entry_content[1]}\n\n"

                # Write the Markdown content to a file
                input_markdown_path = pathlib.Path(workingDirectory) / f"{self.post_id}-markdown.md"
                input_markdown_path.write_text(markdown_content)

                # Convert the Markdown content to typst
                typst_path = pathlib.Path(workingDirectory) / f"{self.post_id}.typ"
                subprocess.check_call([str(self.pandoc_binary), "-f", "markdown", "-t", "typst", "--columns", "999999", "-o", typst_path, input_markdown_path])

                # Write the series markdown to a file
                series_markdown_path = pathlib.Path(workingDirectory) / "series-markdown.md"
                series_markdown_path.write_text(series_markdown)

                # Convert the series markdown to typst
                series_typst_path = pathlib.Path(workingDirectory) / "series.typ"
                subprocess.check_call([str(self.pandoc_binary), "-f", "markdown", "-t", "typst", "--columns", "999999", "-o", series_typst_path, series_markdown_path])

                # Multilingual categories: translate the category for French posts.
                category = toml_front_matter["taxonomies"]["categories"][0]
                if self.post_lang == 'fr':
                    if category == 'Defensive':
                        category = 'Défensif'
                    if category == 'Offensive':
                        category = 'Offensif'

                # Build the full typst file: page setup, the article wrapper
                # from anarsec_article.typ, then the Pandoc-generated body.
                full_typst_path = pathlib.Path(workingDirectory) / f"{self.post_id}-full.typ"
                full_typst = f"""
#import "anarsec_article.typ": anarsec_article, blockquote
#set page({'"a5"' if paper_size == "a4" else 'width: 5.5in, height: 8.5in'})
#show: content => anarsec_article(
  title: [
    {toml_front_matter["title"]}
  ],
  frontimage: "{front_image.name}",
  backimage: "{back_image.name}",
  lastediteddate: "{toml_front_matter["extra"]["dateedit"]}",
  description: "{description}",
  subtitle: "{toml_front_matter.get("description")}",
  category: "{category}",
  backcoverinsidecontent: [{series_typst_path.read_text()}],
  lang: "{self.post_lang}",
  content
)
{typst_path.read_text()}
"""
                full_typst_path.write_text(full_typst)

                # Convert the full typst file to PDF
                pdf_path = pathlib.Path(workingDirectory) / f"{self.post_id}.pdf"
                os.environ["TYPST_FONT_PATHS"] = str(workingDirectory)
                subprocess.check_call(
                    [str(self.typst_binary), "compile", full_typst_path, pdf_path, "--root", workingDirectory],
                    stderr = subprocess.STDOUT
                )

                # Insert blank pages before the back cover and back cover
                # inside if needed, so the page count is a multiple of four
                # (required for booklet imposition).
                pdf_with_blank_pages_path = None
                with pdf_path.open("rb") as pdf_file:
                    pdf_reader = PyPDF2.PdfFileReader(pdf_file)
                    page_count = len(pdf_reader.pages)
                    if page_count % 4 != 0:
                        pdf_writer = PyPDF2.PdfFileWriter()
                        for page in pdf_reader.pages[:-2]:
                            pdf_writer.addPage(page)
                        for _ in range(4 - page_count % 4):
                            pdf_writer.addBlankPage()
                        pdf_writer.addPage(pdf_reader.pages[-2])
                        pdf_writer.addPage(pdf_reader.pages[-1])
                        pdf_with_blank_pages_path = pathlib.Path(workingDirectory) / f"{self.post_id}-with-blank-pages.pdf"
                        with pdf_with_blank_pages_path.open("wb") as padded_file:
                            pdf_writer.write(padded_file)
                # Replace the original PDF with the padded one (done after the
                # reader's handle is closed).
                if pdf_with_blank_pages_path is not None:
                    shutil.copy(pdf_with_blank_pages_path, pdf_path)

                # Bookletize. pdfimposer prints progress to stdout; silence it.
                with open(os.devnull, "w") as devnull:
                    with contextlib.redirect_stdout(devnull):
                        pdfimposer.bookletize_on_file(
                            pdf_path,
                            output_path,
                            layout = "2x1",
                            format = "A4" if paper_size == "a4" else "Letter"
                        )

            # Print a message
            if self.verbose:
                print(f"PDF file '{output_path}' created successfully!")
if __name__ == "__main__":
    # Command-line entry point: parse the options, build a converter, run it.
    argument_parser = argparse.ArgumentParser(description = "Converts an Anarsec article to PDF booklets.")
    argument_parser.add_argument("--pandoc-binary", type = pathlib.Path, required = True, help = "Path to the Pandoc binary. Minimum required version is 3.1.5.")
    argument_parser.add_argument("--typst-binary", type = pathlib.Path, required = True, help = "Path to the typst binary. Minimum required version is 0.6.0.")
    argument_parser.add_argument("--anarsec-root", type = pathlib.Path, required = True, help = "Root of the Anarsec repository.")
    argument_parser.add_argument("--post-id", type = str, required = True, help = "ID of the Anarsec post to convert with language added after a period, i.e. 'nophones.en' and 'nophones.fr', where 'nophones' is the name of the post folder in '/content/posts'.")
    argument_parser.add_argument("-f", "--force", dest = "force", default = False, action = "store_true", help = "Replace the output files if they already exist.")
    argument_parser.add_argument("-v", "--verbose", dest = "verbose", default = False, action = "store_true", help = "Print messages when the output files are created.")
    cli_args = argument_parser.parse_args()

    # Build the converter from the parsed options and run the conversion.
    Converter(
        cli_args.pandoc_binary,
        cli_args.typst_binary,
        cli_args.anarsec_root,
        cli_args.post_id,
        force = cli_args.force,
        verbose = cli_args.verbose
    ).convert()