# anarsec.guide/layout/python/anarsec_article_to_pdf.py
# Last modified: 2024-11-22 13:04:27 -05:00
# 276 lines, 15 KiB, Python

import argparse
import contextlib
import os
import pathlib
import re
import shutil
import slugify
import subprocess
import tempfile
import pdfimposer
import PyPDF2
import toml
class Converter:
    """Converts an Anarsec article to PDF booklets.

    Given a post id of the form "<post>.<lang>" (e.g. "nophones.en"), reads
    the post's Markdown source from the Anarsec repository, appends the
    recommendations and the glossary entries the post links to, renders the
    result to typst via Pandoc, compiles it with typst, pads the PDF to a
    multiple of four pages, and imposes it as a booklet — once for A4 and
    once for Letter paper.
    """

    def __init__(self, pandoc_binary: pathlib.Path, typst_binary: pathlib.Path, anarsec_root: pathlib.Path, post_id: str, *, force: bool = False, verbose: bool = False):
        """Initialize the converter.

        Arguments:
            pandoc_binary: path to the Pandoc executable.
            typst_binary: path to the typst executable.
            anarsec_root: root directory of the Anarsec repository.
            post_id: post folder name and language separated by a period,
                e.g. "nophones.en" or "nophones.fr".
            force: overwrite existing output files instead of raising.
            verbose: print a message for each PDF created.

        Raises:
            RuntimeError: if the post id is malformed, a binary path doesn't
                exist or isn't a file, or the repository root or post
                directory doesn't exist or isn't a directory.
        """
        # Set attributes; the post id carries the language after a period.
        self.pandoc_binary = pandoc_binary
        self.typst_binary = typst_binary
        self.anarsec_root = anarsec_root
        post_id_parts = post_id.split('.', 1)
        if len(post_id_parts) != 2:
            # Fail with a clear message instead of an IndexError when the
            # language suffix is missing.
            raise RuntimeError(f"Post id '{post_id}' must be of the form '<post>.<lang>', e.g. 'nophones.en'.")
        self.post_id, self.post_lang = post_id_parts
        self.force = force
        self.verbose = verbose

        # Set post directory
        self.post_directory = self.anarsec_root / "content" / "posts" / self.post_id

        # Check validity of some attributes
        if not self.pandoc_binary.exists() or not self.pandoc_binary.is_file():
            raise RuntimeError(f"Pandoc binary '{self.pandoc_binary}' doesn't exist or isn't a file.")
        if not self.typst_binary.exists() or not self.typst_binary.is_file():
            raise RuntimeError(f"Typst binary '{self.typst_binary}' doesn't exist or isn't a file.")
        if not self.anarsec_root.exists() or not self.anarsec_root.is_dir():
            raise RuntimeError(f"Anarsec root '{self.anarsec_root}' doesn't exist or isn't a directory.")
        if not self.post_directory.exists() or not self.post_directory.is_dir():
            raise RuntimeError(f"Post directory '{self.post_directory}' doesn't exist or isn't a directory.")

    def _localized_index(self, directory: str, label: str) -> pathlib.Path:
        """Return the post-language "_index" file under content/<directory>.

        English content lives in "_index.md"; other languages in
        "_index.<lang>.md". `label` is only used in the error message.

        Raises:
            RuntimeError: if the file doesn't exist or isn't a file.
        """
        name = "_index.md" if self.post_lang == 'en' else f"_index.{self.post_lang}.md"
        path = self.anarsec_root / "content" / directory / name
        if not path.exists() or not path.is_file():
            raise RuntimeError(f"{label} file '{path}' doesn't exist or isn't a file.")
        return path

    def convert(self):
        """Convert the input file to the output file. This method should only be run once.

        Produces one imposed booklet PDF per paper size (A4 and Letter)
        under the repository's "static" tree.

        Raises:
            RuntimeError: if a required source file is missing, the post
                can't be parsed, or an output file already exists and
                `force` wasn't given.
        """
        # Locate the language-specific glossary, recommendations and series files.
        glossary_file = self._localized_index("glossary", "Glossary")
        recommendations_file = self._localized_index("recommendations", "Recommendations")
        series_file = self._localized_index("series", "Series")

        # Set input path
        if self.post_lang == 'en':
            input_path = self.post_directory / "index.md"
        else:
            input_path = self.post_directory / f"index.{self.post_lang}.md"
        if not input_path.exists() or not input_path.is_file():
            raise RuntimeError(f"Post Markdown file '{input_path}' doesn't exist or isn't a file.")

        # Load the glossary: map each slugified "### Title" heading to
        # (title, definition text).
        glossary = dict()
        for match in re.findall(r'### (.*?)\n+(.*?)\n*(?=###|\Z)', glossary_file.read_text(), re.DOTALL | re.MULTILINE):
            glossary[slugify.slugify(match[0])] = (match[0], match[1])

        # Load the series markdown (everything after the +++ TOML front matter).
        series_markdown = re.search(r'\+{3}.*?\+{3}(.*)', series_file.read_text(), re.MULTILINE | re.DOTALL).group(1)

        # For each paper size
        for paper_size in ["a4", "letter"]:
            # Set the output path
            if self.post_lang == 'en':
                output_path = self.anarsec_root / "static" / "posts" / self.post_id / f"{self.post_id}-{paper_size}-{self.post_lang}.pdf"
            else:
                output_path = self.anarsec_root / "static" / self.post_lang / "posts" / self.post_id / f"{self.post_id}-{paper_size}-{self.post_lang}.pdf"
            if not self.force and output_path.exists():
                raise RuntimeError(f"Output file '{output_path}' already exists.")

            # Work in a temporary directory
            with tempfile.TemporaryDirectory() as workingDirectory:
                # Copy the required resources to the working directory.
                # .webp images are converted to .png via ImageMagick's
                # `convert`, since the typst toolchain can't embed webp.
                shutil.copy(pathlib.Path(__file__).parent.parent / "anarsec_article.typ", workingDirectory)
                for filename in input_path.parent.iterdir():
                    if filename.suffix.lower() == ".webp":
                        subprocess.check_call(["convert", filename, pathlib.Path(workingDirectory) / f"{filename.name}.png"])
                    elif filename.suffix.lower() in [".png", ".jpg", ".jpeg", ".bmp", ".svg", ".gif"]:
                        shutil.copy(filename, workingDirectory)

                # Separate the input file into a TOML front matter and Markdown content
                match = re.fullmatch(r'\+{3}\n(.*)\+{3}(.*)', input_path.read_text(), re.DOTALL | re.MULTILINE)
                if match is None:
                    # Bug fix: this previously interpolated the non-existent
                    # attribute `self.input_path`, which raised AttributeError
                    # instead of the intended RuntimeError.
                    raise RuntimeError(f"Couldn't separate input file '{input_path}' into a TOML front matter and Markdown content. Is it a valid Anarsec article?")
                toml_front_matter = toml.loads(match.group(1))
                markdown_content = match.group(2)

                # Grab the description (everything before the <!-- more --> marker).
                description = re.search(r'^(.*?)\<\!\-\- more \-\-\>', markdown_content, re.DOTALL | re.MULTILINE).group(1).strip("\n ")

                # Parse the description: render it to plain text through Pandoc.
                description_md_path = pathlib.Path(workingDirectory) / "description.md"
                description_txt_path = pathlib.Path(workingDirectory) / "description.txt"
                description_md_path.write_text(description)
                subprocess.check_call([str(self.pandoc_binary), "-f", "markdown", "-t", "plain", "--columns", "999999", "-o", description_txt_path, description_md_path])
                description = description_txt_path.read_text()

                # Copy the front image ("blogimagepng" takes precedence when present).
                blogimageid = "blogimagepng" if "blogimagepng" in toml_front_matter["extra"] else "blogimage"
                front_image = pathlib.Path(workingDirectory) / ("front_image" + pathlib.Path(toml_front_matter['extra'][blogimageid]).suffix)
                shutil.copy(self.anarsec_root / "static" / toml_front_matter['extra'][blogimageid].removeprefix("/"), front_image)

                # Copy the back image
                back_image = pathlib.Path(workingDirectory) / "back_image.png"
                shutil.copy(self.anarsec_root / "static" / "images" / "gay.png", back_image)

                # Copy the header fonts so typst can find them via TYPST_FONT_PATHS.
                header_font = pathlib.Path(workingDirectory) / "Jost-Medium.ttf"
                shutil.copy(self.anarsec_root / "static" / "fonts" / "Jost-Medium.ttf", header_font)
                header_font_italic = pathlib.Path(workingDirectory) / "Jost-MediumItalic.ttf"
                shutil.copy(self.anarsec_root / "static" / "fonts" / "Jost-MediumItalic.ttf", header_font_italic)

                # Add recommendations to the Markdown content
                recommendations = re.search(r'\+{3}.*?\+{3}(.*)', recommendations_file.read_text(), re.MULTILINE | re.DOTALL).group(1)
                if self.post_lang == 'en':
                    markdown_content += f"\n\n# Appendix: Recommendations\n\n{recommendations}\n\n"
                if self.post_lang == 'fr':
                    markdown_content += f"\n\n# Annexe: Recommendations\n\n{recommendations}\n\n"

                # Make all images paths relative in the Markdown content
                for extension in ["jpg", "png", "webp", "jpeg", "gif"]:
                    # Fix: the `.` before the extension is now escaped as
                    # `\\.` — `\.` in a plain f-string is an invalid escape
                    # sequence (the resulting regex is identical).
                    markdown_content = re.sub(f'\\(\\/posts/{input_path.parent.name}/(.*?\\.{extension})\\)', lambda match: f'({match.group(1)})', markdown_content)

                # Replace all .webp images to .png images in the Markdown
                # content (matches the webp -> png conversion done above).
                markdown_content = re.sub(r'\((.*?\.webp)\)', lambda match: f'({match.group(1)}.png)', markdown_content)

                # List glossary entries that appear in the Markdown content
                glossary_entries = set()
                for match in re.findall(r'\[.*?\]\(/glossary\/?#(.*?)\)', markdown_content):
                    glossary_entries.add(slugify.slugify(match))

                # Add glossary entries to the Markdown content
                if glossary_entries:
                    if self.post_lang == 'en':
                        markdown_content += "\n\n# Appendix: Glossary\n\n"
                    if self.post_lang == 'fr':
                        markdown_content += "\n\n# Annexe: Glossaire\n\n"
                    for entry, entry_content in glossary.items():
                        if entry in glossary_entries:
                            markdown_content += f"## {entry_content[0]}\n\n{entry_content[1]}\n\n"

                # Write the Markdown content to a file
                input_markdown_path = pathlib.Path(workingDirectory) / f"{self.post_id}-markdown.md"
                input_markdown_path.write_text(markdown_content)

                # Convert the Markdown content to typst
                typst_path = pathlib.Path(workingDirectory) / f"{self.post_id}.typ"
                subprocess.check_call([str(self.pandoc_binary), "-f", "markdown", "-t", "typst", "--columns", "999999", "-o", typst_path, input_markdown_path])

                # Write the series markdown to a file
                series_markdown_path = pathlib.Path(workingDirectory) / "series-markdown.md"
                series_markdown_path.write_text(series_markdown)

                # Convert the series markdown to typst
                series_typst_path = pathlib.Path(workingDirectory) / "series.typ"
                subprocess.check_call([str(self.pandoc_binary), "-f", "markdown", "-t", "typst", "--columns", "999999", "-o", series_typst_path, series_markdown_path])

                # Multilingual categories: translate the category for French posts.
                category = toml_front_matter["taxonomies"]["categories"][0]
                if self.post_lang == 'fr':
                    if category == 'Defensive':
                        category = 'Défensif'
                    if category == 'Offensive':
                        category = 'Offensif'

                # Build the full typst file: page setup, the article wrapper
                # from anarsec_article.typ, then the Pandoc-generated body.
                full_typst_path = pathlib.Path(workingDirectory) / f"{self.post_id}-full.typ"
                full_typst = f"""
#import "anarsec_article.typ": anarsec_article, blockquote
#set page({'"a5"' if paper_size == "a4" else 'width: 5.5in, height: 8.5in'})
#show: content => anarsec_article(
  title: [
    {toml_front_matter["title"]}
  ],
  frontimage: "{front_image.name}",
  backimage: "{back_image.name}",
  lastediteddate: "{toml_front_matter["extra"]["dateedit"]}",
  description: "{description}",
  subtitle: "{toml_front_matter.get("description")}",
  category: "{category}",
  backcoverinsidecontent: [{series_typst_path.read_text()}],
  lang: "{self.post_lang}",
  content
)
{typst_path.read_text()}
"""
                full_typst_path.write_text(full_typst)

                # Convert the full typst file to PDF
                pdf_path = pathlib.Path(workingDirectory) / f"{self.post_id}.pdf"
                os.environ["TYPST_FONT_PATHS"] = str(workingDirectory)
                subprocess.check_call(
                    [str(self.typst_binary), "compile", full_typst_path, pdf_path, "--root", workingDirectory],
                    stderr = subprocess.STDOUT
                )

                # Insert blank pages before the back cover and back cover
                # inside if needed, so the page count is a multiple of four
                # (required for booklet imposition).
                pdf_with_blank_pages_path = None
                with pdf_path.open("rb") as pdf_file:
                    pdf_reader = PyPDF2.PdfFileReader(pdf_file)
                    page_count = len(pdf_reader.pages)
                    if page_count % 4 != 0:
                        pdf_writer = PyPDF2.PdfFileWriter()
                        for page in pdf_reader.pages[:-2]:
                            pdf_writer.addPage(page)
                        for _ in range(4 - page_count % 4):
                            pdf_writer.addBlankPage()
                        pdf_writer.addPage(pdf_reader.pages[-2])
                        pdf_writer.addPage(pdf_reader.pages[-1])
                        pdf_with_blank_pages_path = pathlib.Path(workingDirectory) / f"{self.post_id}-with-blank-pages.pdf"
                        with pdf_with_blank_pages_path.open("wb") as padded_file:
                            pdf_writer.write(padded_file)
                # Replace the original PDF with the padded one (done after the
                # reader's handle is closed).
                if pdf_with_blank_pages_path is not None:
                    shutil.copy(pdf_with_blank_pages_path, pdf_path)

                # Bookletize. pdfimposer prints progress to stdout; silence it.
                with open(os.devnull, "w") as devnull:
                    with contextlib.redirect_stdout(devnull):
                        pdfimposer.bookletize_on_file(
                            pdf_path,
                            output_path,
                            layout = "2x1",
                            format = "A4" if paper_size == "a4" else "Letter"
                        )

            # Print a message
            if self.verbose:
                print(f"PDF file '{output_path}' created successfully!")
if __name__ == "__main__":
    # Command-line entry point: parse the options, build a converter, run it.
    argument_parser = argparse.ArgumentParser(description = "Converts an Anarsec article to PDF booklets.")
    argument_parser.add_argument("--pandoc-binary", type = pathlib.Path, required = True, help = "Path to the Pandoc binary. Minimum required version is 3.1.5.")
    argument_parser.add_argument("--typst-binary", type = pathlib.Path, required = True, help = "Path to the typst binary. Minimum required version is 0.6.0.")
    argument_parser.add_argument("--anarsec-root", type = pathlib.Path, required = True, help = "Root of the Anarsec repository.")
    argument_parser.add_argument("--post-id", type = str, required = True, help = "ID of the Anarsec post to convert with language added after a period, i.e. 'nophones.en' and 'nophones.fr', where 'nophones' is the name of the post folder in '/content/posts'.")
    argument_parser.add_argument("-f", "--force", dest = "force", default = False, action = "store_true", help = "Replace the output files if they already exist.")
    argument_parser.add_argument("-v", "--verbose", dest = "verbose", default = False, action = "store_true", help = "Print messages when the output files are created.")
    cli_args = argument_parser.parse_args()

    # Build the converter from the parsed options and run the conversion.
    Converter(
        cli_args.pandoc_binary,
        cli_args.typst_binary,
        cli_args.anarsec_root,
        cli_args.post_id,
        force = cli_args.force,
        verbose = cli_args.verbose
    ).convert()