def get_lunr_posts_index(subreddits: dict[str, list[dict]], chunk_size: int = 1000) -> tuple[list, dict]:
    """Build chunked lunr search indexes and per-post metadata for all threads.

    Each thread dict is given a ``path`` key (its lowercased permalink with
    the ``r/<subreddit>`` part removed, surrounding slashes stripped and
    ``.html`` appended). A compact metadata record is stored for it under the
    thread's ``id``. The threads are then split into groups of at most
    ``chunk_size`` and one lunr index is built per group.

    Args:
        subreddits: Mapping of subreddit name to its list of thread dicts.
            Each thread is read for ``permalink``, ``title``, ``score``,
            ``comments``, ``selftext``, ``created_utc`` and ``id``. The
            thread dicts are mutated in place (``path`` is added).
        chunk_size: Maximum number of threads per lunr index.

    Returns:
        A ``(idxs, metadata)`` tuple: the list of lunr indexes (one per
        chunk, in order) and a dict mapping thread id to its metadata record.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError('chunk_size must be a positive integer')

    print('Generating search index')
    to_index = []
    metadata = {}
    for name, threads in subreddits.items():
        for t in threads:
            # The permalink is lowercased before the subreddit prefix is
            # removed, so the prefix only matches when ``name`` is lowercase
            # (main() stores its subreddits under ``s.lower()``).
            t['path'] = t['permalink'].lower().replace(f'r/{name}', '').strip('/') + '.html'
            metadata[t['id']] = {
                'path': t['path'],
                'title': t['title'],
                'score': t['score'],
                'replies': str(len(t['comments'])),
                'body_short': t['selftext'][:200],
                'date': t['created_utc'],
            }
            to_index.append(t)

    chunks = [
        to_index[start:start + chunk_size]
        for start in range(0, len(to_index), chunk_size)
    ]

    idxs = []
    # enumerate() gives the chunk number directly; looking it up with
    # chunks.index(chunk) compares whole chunks and returns the first equal
    # one, which is slow and misreports progress for duplicate chunks.
    for number, chunk in enumerate(chunks, start=1):
        idxs.append(lunr(
            ref='id',
            fields=[
                'id',
                dict(field_name='title', boost=15),
                dict(field_name='selftext', boost=10),
                'score',
                'author',
            ],
            documents=chunk,
        ))
        # '\r' rewrites the same terminal line to act as a progress counter.
        print(f'\rCreating index chunk: {number}/{len(chunks)}', end='')
    # Terminate the progress line.
    print('')
    return idxs, metadata
permalinks: ", len(missing_perm)) print("Comment chains found: ", len(comments)) - print("Threads rebuilt: ", len(complete_threads)) + print("Threads rebuilt: ", len(complete_reddit_threads)) + idxs, metadata = get_lunr_posts_index(subreddits) + idx_path_list = [] + for idx in idxs: + idx_name = f'static/js/search/idx-00{idxs.index(idx) + 1}.json' + + idx_path_list.append(idx_name) + print(f'\rWriting: {idx_name}',end='') + with open(f'r/{idx_name}', 'w') as f: + json.dump(idx.serialize(),f) + with open('r/static/js/search/search-idx-list.json','w') as f: + json.dump(idx_path_list, f) + with open('r/static/js/search/metadata.json', 'w') as f: + json.dump(metadata, f) + print('') generate_html([s.lower() for s in config.sections()], subreddits) if __name__ == "__main__":