import os
import json
import argparse
import configparser
from datetime import datetime

from lunr import lunr

from write_html import generate_html
from watchful import return_redd_objects


def rebuild_threads(threads: list[dict], comments: list[dict]) -> list[dict]:
    """Attach each comment to its parent thread and return the threads."""
    print('Rebuilding threads...')
    threads_dict = {}
    for t in threads:
        t['comments'] = []
        t['subreddit'] = t['subreddit'].lower()
        threads_dict[t['id']] = t
    for c in comments:
        if 'permalink' not in c:
            continue
        # A comment permalink looks like /r/<subreddit>/comments/<thread_id>/...,
        # so splitting on '/' puts the parent thread id at index 4.
        parent_thread_id = c['permalink'].split('/')[4]
        if parent_thread_id not in threads_dict:
            continue
        threads_dict[parent_thread_id]['comments'].append(c)
    return list(threads_dict.values())


def rebuild_subreddits(threads: list[dict]) -> dict:
    """Group threads by (lowercased) subreddit name."""
    print("Rebuilding subreddits...")
    subreddits = {}
    for t in threads:
        subreddits.setdefault(t['subreddit'], []).append(t)
    return subreddits


def get_thread_meta(thread: dict) -> dict:
    """Return the per-thread metadata stored alongside the search index."""
    return {
        'id': thread['id'],
        'path': thread['permalink'].lower().replace(f'r/{thread["subreddit"]}', '').strip('/') + '.html',
        'title': thread['title'],
        'score': thread['score'],
        'replies': str(len(thread['comments'])),
        'body_short': thread['selftext'][:200],
        'date': datetime.utcfromtimestamp(int(thread['created_utc'])).strftime('%Y-%m-%d'),
        'author': thread['author'],
        'subreddit': thread['subreddit']
    }


def get_comment_meta(comment: dict) -> dict:
    """Return per-comment metadata in the same shape as get_thread_meta."""
    return {
        'id': comment['id'],
        'path': comment['path'],
        'title': comment['title'],
        'score': comment['score'],
        'body_short': comment['selftext'][:200],
        'date': datetime.utcfromtimestamp(int(comment['created_utc'])).strftime('%Y-%m-%d'),
        'author': comment['author']
    }


def get_lunr_index(subreddits: dict):
    """Build chunked lunr indexes plus a metadata lookup keyed by thread id."""
    print('Generating search index...')
    to_index = []
    chunk_size = 1000
    idxs = []
    metadata = {}
    for s in subreddits:
        for t in subreddits[s]:
            metadata[t['id']] = get_thread_meta(t)
            # Each document carries a per-document boost equal to its score.
            to_index.append((t, {'boost': t['score']}))
    # Split the documents into chunks of at most chunk_size so that each
    # serialized index stays a manageable size.
    chunks = [to_index[i * chunk_size:(i + 1) * chunk_size]
              for i in range((len(to_index) + chunk_size - 1) // chunk_size)]
    for n, chunk in enumerate(chunks, start=1):
        print(f'\rParsing index chunk: {n}/{len(chunks)}', end='')
        idxs.append(lunr(
            ref='id',
            fields=[
                dict(field_name='title', boost=15),
                dict(field_name='selftext', boost=10),
                'score',
                'author'
            ],
            documents=chunk,
        ))
    print('')
    return idxs, metadata


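# main() reads a standard INI file via configparser. A minimal sketch of the
# expected layout, assuming one section per export: the section names are
# arbitrary, only the 'posts' and 'comments' keys are read, and the paths
# shown here are hypothetical.
#
#   [askhistorians]
#   posts = data/askhistorians_posts
#   comments = data/askhistorians_comments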
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('config', type=str, help='Path to configuration file.')
    args = parser.parse_args()
    if not args.config:
        print("No config file found")
        exit()

    config = configparser.ConfigParser()
    config.read(args.config)

    # Load the raw post and comment dumps listed in each config section.
    raw_posts = []
    raw_comments = []
    for s in config.sections():
        posts_path = config[s]['posts']
        comments_path = config[s]['comments']
        print(f"Loading from {posts_path}")
        raw_posts += return_redd_objects(posts_path)
        print(f"Loading from {comments_path}")
        raw_comments += return_redd_objects(comments_path)

    threads = rebuild_threads(raw_posts, raw_comments)
    subreddits = rebuild_subreddits(threads)
    idxs, metadata = get_lunr_index(subreddits)

    # Serialize each index chunk, then write the list of chunk paths and the
    # metadata lookup used by the client-side search.
    os.makedirs('r/static/js/search/', exist_ok=True)
    idx_path_list = []
    for n, idx in enumerate(idxs, start=1):
        idx_name = f'static/js/search/idx-00{n}.json'
        idx_path_list.append(idx_name)
        print(f'\rWriting: {idx_name}', end='')
        with open(f'r/{idx_name}', 'w') as f:
            json.dump(idx.serialize(), f)
    with open('r/static/js/search/search-idx-list.json', 'w') as f:
        json.dump(idx_path_list, f)
    with open('r/static/js/search/metadata.json', 'w') as f:
        json.dump(metadata, f)
    print('')

    generate_html(subreddits)


if __name__ == "__main__":
    main()
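# Example invocation, assuming the script is saved as build_archive.py and the
# config sketched above is saved as archive.ini (both names are hypothetical):
#
#   python build_archive.py archive.ini
#
# Search data is written under ./r/static/js/search/: one idx-00N.json per
# index chunk, the search-idx-list.json manifest, and metadata.json;
# generate_html(subreddits) then renders the HTML pages.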