Mirror of https://github.com/c0mmando/discourse-to-markdown-archiver.git (synced 2025-04-14 20:53:10 -04:00)
add archive.py
parent 0900f6cf22 · commit c931799f4e
archive.py · 334 lines · Normal file
@@ -0,0 +1,334 @@
#!/usr/bin/env python3
"""
Archive Discourse posts and render topics to Markdown from multiple sites.

This script downloads posts from one or more Discourse servers via their APIs,
archives new posts as JSON files (skipping those already saved or archived),
renders the topics touched by each batch of posts to Markdown concurrently,
and updates a per-site metadata file after each batch is indexed.

Usage:
    ./archive.py --urls https://forum.hackliberty.org,https://forum.qubes-os.org --target-dir ./archive
"""

import argparse
import concurrent.futures
import datetime
import functools
import json
import logging
import os
import sys
import time
import urllib.request
from dataclasses import dataclass
from pathlib import Path
from urllib.parse import urlparse

# Set up logging. If the 'rich' module is available, it will be used.
loglevel = 'DEBUG' if os.environ.get('DEBUG') else 'INFO'
try:
    from rich.logging import RichHandler
    logging.basicConfig(level=loglevel, datefmt="[%X]", handlers=[RichHandler()])
except ImportError:
    logging.basicConfig(level=loglevel)
log = logging.getLogger('archive_and_render')

# Argument parser (cached for re-use)
parser = argparse.ArgumentParser(
    description='Archive topics from one or more Discourse installations and render to markdown')
parser.add_argument(
    '--urls',
    help='Comma-separated URLs of Discourse servers (for example: "https://forum.hackliberty.org,https://forum.qubes-os.org")',
    default=os.environ.get('DISCOURSE_URLS', 'https://forum.hackliberty.org'))
parser.add_argument(
    '--debug', action='store_true', default=os.environ.get('DEBUG', False))
parser.add_argument(
    '-t', '--target-dir', help='Target base directory for the archives',
    default=Path(os.environ.get('TARGET_DIR', './archive')))

@functools.cache
def args():
    return parser.parse_args()

def parse_sites(urls_string: str) -> list:
    """Return a list of cleaned-up site URLs."""
    return [url.strip().rstrip('/') for url in urls_string.split(',') if url.strip()]

def http_get(site_url: str, path: str) -> str:
    """Simple HTTP GET with exponential backoff."""
    full_url = f"{site_url}{path}"
    log.debug("HTTP GET %s", full_url)
    backoff = 3
    while True:
        try:
            with urllib.request.urlopen(full_url) as response:
                return response.read().decode()
        except Exception as e:
            log.debug("Error fetching %s: %s -- Retrying in %d seconds", full_url, e, backoff)
            time.sleep(backoff)
            backoff *= 2
            if backoff >= 256:
                log.exception("Rate limit or unrecoverable error for %s", full_url)
                sys.exit(1)

def http_get_json(site_url: str, path: str) -> dict:
    """Fetch URL contents from a specific site and decode JSON."""
    try:
        return json.loads(http_get(site_url, path))
    except json.JSONDecodeError:
        log.warning("Unable to decode JSON response from %r", path)
        raise

# ----- Data Models -----

@dataclass(frozen=True)
class PostTopic:
    id: int
    slug: str
    title: str

@dataclass(frozen=True)
class Post:
    id: int
    slug: str
    raw: dict

    def get_created_at(self) -> datetime.datetime:
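        # Discourse returns ISO-8601 timestamps with a trailing "Z"; swap it for an
        # explicit offset so fromisoformat() also works on Python versions before 3.11.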
        return datetime.datetime.fromisoformat(self.raw['created_at'].replace("Z", "+00:00"))

    def save(self, dir: Path):
        """Save the raw JSON post to disk if not already archived."""
        idstr = str(self.id).zfill(10)
        filename = f"{idstr}-{self.raw.get('username', 'anonymous')}-{self.raw.get('topic_slug', 'unknown')}.json"
        folder_name = self.get_created_at().strftime('%Y-%m-%B')
        full_path = dir / folder_name / filename

        if full_path.exists():
            log.debug("Post %s already saved, skipping", self.id)
            return

        full_path.parent.mkdir(parents=True, exist_ok=True)
        log.info("Saving post %s to %s", self.id, full_path)
        full_path.write_text(json.dumps(self.raw, indent=2), encoding='utf-8')

    def get_topic(self) -> PostTopic:
        return PostTopic(
            id=self.raw.get('topic_id', self.id),
            slug=self.raw.get('topic_slug', self.slug),
            title=self.raw.get('topic_title', self.raw.get('title', 'No Title')),
        )

    @classmethod
    def from_json(cls, j: dict) -> 'Post':
        return cls(
            id=j['id'],
            slug=j.get('topic_slug', 'unknown'),
            raw=j,
        )

@dataclass(frozen=True)
class Topic:
    id: int
    slug: str
    raw: dict
    markdown: str

    def get_created_at(self) -> datetime.datetime:
        return datetime.datetime.fromisoformat(self.raw['created_at'].replace("Z", "+00:00"))

    def save_rendered(self, dir: Path):
        """
        Save the rendered Markdown topic to disk.
        Filename built from creation date, slug, and id.
        """
        date_str = str(self.get_created_at().date())
        filename = f"{date_str}-{self.slug}-id{self.id}.md"
        folder_name = self.get_created_at().strftime('%Y-%m-%B')
        full_path = dir / folder_name / filename
        full_path.parent.mkdir(parents=True, exist_ok=True)
        log.info("Saving rendered topic %s to %s", self.id, full_path)
        rendered_markdown = f"# {self.raw.get('title', 'No Title')}\n\n{self.markdown}"
        full_path.write_text(rendered_markdown, encoding='utf-8')

    @classmethod
    def from_json(cls, t: dict, markdown: str) -> 'Topic':
        slug = t.get('slug') or t.get('topic_slug') or "unknown"
        return cls(
            id=t.get('id', 0),
            slug=slug,
            raw=t,
            markdown=markdown,
        )

# ----- Helper Functions -----

def update_metadata(metadata_file: Path, metadata: dict):
    """Writes the metadata as a JSON file to disk."""
    log.debug("Updating metadata: %s", metadata)
    metadata_file.write_text(json.dumps(metadata, indent=2), encoding='utf-8')

def render_topic(site_url: str, topic: PostTopic, topics_dir: Path):
    """
    Render a single topic to Markdown.
    Fetches the topic JSON and its raw Markdown (including additional pages if available).
    """
    try:
        log.info("Fetching topic %s JSON from %s", topic.id, site_url)
        topic_data = http_get_json(site_url, f"/t/{topic.id}.json")
    except Exception as e:
        log.warning("Failed to fetch topic JSON for topic %s: %s", topic.id, e)
        return

    log.info("Fetching raw markdown for topic %s from %s", topic.id, site_url)
    body = http_get(site_url, f"/raw/{topic.id}")
    if not body:
        log.warning("Could not retrieve markdown body for topic %s", topic.id)
        return

    # Assemble additional pages if available.
    page_num = 2
    while True:
        more_body = http_get(site_url, f"/raw/{topic.id}?page={page_num}")
        if not more_body:
            break
        body += f"\n{more_body}"
        page_num += 1

    try:
        topic_obj = Topic.from_json(topic_data, body)
    except Exception as e:
        log.error("Failed to create Topic object for topic %s: %s", topic.id, e)
        return

    topic_obj.save_rendered(topics_dir)
    log.info("Saved rendered topic %s (%s)", topic_obj.id, topic_obj.slug)

def render_topics_concurrently(site_url: str, topics: dict, topics_dir: Path, max_workers: int = 8):
    """
    Render multiple topics concurrently.
    topics: a dictionary of topic_id -> PostTopic.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(render_topic, site_url, topic, topics_dir) for topic in topics.values()]
        for future in concurrent.futures.as_completed(futures):
            try:
                future.result()
            except Exception as exc:
                log.error("A topic generated an exception: %s", exc)

def process_site(site_url: str, base_target_dir: Path):
    """
    Archive posts and render topics for a single site.
    Each site gets its own subdirectory (named for its hostname) inside the base target directory,
    and its own metadata file.
    """
    parsed = urlparse(site_url)
    site_name = parsed.hostname or site_url.replace("https://", "").replace("http://", "").split('/')[0]
    log.info("Processing site: %s", site_url)
    site_target_dir = base_target_dir / site_name
    posts_dir = site_target_dir / 'posts'
    topics_dir = site_target_dir / 'rendered-topics'
    posts_dir.mkdir(parents=True, exist_ok=True)
    topics_dir.mkdir(parents=True, exist_ok=True)
    metadata_file = site_target_dir / '.metadata.json'
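    # .metadata.json tracks 'last_sync_date' (ISO timestamp of the newest archived
    # post) and 'archived_post_ids' (sorted list of post ids already saved).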

    # Load stored metadata if it exists.
    metadata = {}
    last_sync_date = None
    archived_post_ids = set()
    if metadata_file.exists():
        try:
            metadata = json.loads(metadata_file.read_text())
            if "last_sync_date" in metadata:
                last_sync_date = datetime.datetime.fromisoformat(metadata.get('last_sync_date'))
            if "archived_post_ids" in metadata:
                archived_post_ids = set(int(x) for x in metadata.get('archived_post_ids', []))
        except Exception as e:
            log.error("Failed to read/parse metadata file for %s: %s", site_url, e)

    if last_sync_date:
        # Step back one day to catch updates.
        last_sync_date -= datetime.timedelta(days=1)
        log.info("Resyncing posts from %s for %s", last_sync_date.isoformat(), site_url)

    posts_json = http_get_json(site_url, '/posts.json')
    posts = posts_json.get('latest_posts', [])
    last_id = None
    max_created_at = last_sync_date
    should_stop = False
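    # Page backwards through /posts.json: the endpoint returns the newest posts
    # first, and '?before=<id>' requests the batch of posts older than that id.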
    while posts:
        log.info("Processing %d posts for %s", len(posts), site_url)
        topics_to_render = {}  # unique topics in this batch
        for json_post in posts:
            try:
                post = Post.from_json(json_post)
            except Exception as e:
                log.warning("Failed to deserialize post %s: %s", json_post, e)
                continue

            if post.id in archived_post_ids:
                log.debug("Post %s already archived, skipping", post.id)
                continue

            post_created = post.get_created_at()
            if last_sync_date is not None and post_created < last_sync_date:
                log.info("Post %s is older than last_sync_date; stopping batch for %s.", post.id, site_url)
                should_stop = True
                break

            post.save(posts_dir)
            archived_post_ids.add(post.id)
            last_id = post.id

            topic = post.get_topic()
            topics_to_render[topic.id] = topic

            if max_created_at is None or post_created > max_created_at:
                max_created_at = post_created

        metadata['last_sync_date'] = max_created_at.isoformat() if max_created_at else None
        metadata['archived_post_ids'] = sorted(archived_post_ids)
        update_metadata(metadata_file, metadata)

        # Render topics concurrently for the current batch.
        if topics_to_render:
            log.info("Rendering %d topics concurrently for %s.", len(topics_to_render), site_url)
            render_topics_concurrently(site_url, topics_to_render, topics_dir, max_workers=8)

        if should_stop:
            log.info("Stopping pagination loop based on sync date for %s.", site_url)
            break

        if last_id is None or last_id <= 1:
            log.info("No valid last_id found for %s. Ending pagination loop.", site_url)
            break

        time.sleep(5)
        posts = http_get_json(site_url, f'/posts.json?before={last_id - 1}').get('latest_posts', [])
        # Fallback if posts come back empty (step back gradually).
        while not posts and last_id and last_id >= 0:
            last_id -= 49
            posts = http_get_json(site_url, f'/posts.json?before={last_id}').get('latest_posts', [])
            time.sleep(1)

def main() -> None:
    # Parse command-line parameters.
    parameters = args()
    base_target_dir = parameters.target_dir
    if not isinstance(base_target_dir, Path):
        base_target_dir = Path(base_target_dir)
    base_target_dir.mkdir(parents=True, exist_ok=True)

    sites = parse_sites(parameters.urls)
    if not sites:
        log.error("No valid sites provided. Exiting.")
        sys.exit(1)

    # Process each site.
    for site_url in sites:
        process_site(site_url, base_target_dir)

if __name__ == "__main__":
    main()