mirror of
https://github.com/sys-nyx/red-arch.git
synced 2025-05-08 09:45:27 -04:00
refactored redarch.py a bit
This commit is contained in:
parent
34ec2d920b
commit
38b3cfbbb4
1 changed files with 84 additions and 61 deletions
145
redarch.py
145
redarch.py
|
@ -9,42 +9,96 @@ from urllib.parse import urlparse
|
||||||
from write_html import generate_html
|
from write_html import generate_html
|
||||||
from watchful import return_redd_objects
|
from watchful import return_redd_objects
|
||||||
|
|
||||||
def get_lunr_posts_index(subreddits: list[dict]):
|
def rebuild_threads(threads: list[dict], comments:list[dict]) -> list[dict]:
|
||||||
print('Generating search index')
|
print('Rebuilding threads...')
|
||||||
|
threads_dict = {}
|
||||||
|
|
||||||
|
for t in threads:
|
||||||
|
t['comments'] = []
|
||||||
|
threads_dict[t['id']] = t
|
||||||
|
t['subreddit'] = t['subreddit'].lower()
|
||||||
|
|
||||||
|
for c in comments:
|
||||||
|
if 'permalink' not in c.keys():
|
||||||
|
continue
|
||||||
|
|
||||||
|
parent_thread_id = c['permalink'].split('/')[4]
|
||||||
|
|
||||||
|
if parent_thread_id not in threads_dict.keys():
|
||||||
|
continue
|
||||||
|
|
||||||
|
threads_dict[parent_thread_id]['comments'].append(c)
|
||||||
|
|
||||||
|
return [threads_dict[t] for t in threads_dict.keys()]
|
||||||
|
|
||||||
|
|
||||||
|
def rebuild_subreddits(threads: list[dict]) -> dict:
|
||||||
|
print("Rebuilding subreddits...")
|
||||||
|
subreddits = {}
|
||||||
|
|
||||||
|
for t in threads:
|
||||||
|
if t['subreddit'] not in subreddits.keys():
|
||||||
|
subreddits[t['subreddit']] = []
|
||||||
|
subreddits[t['subreddit']].append(t)
|
||||||
|
|
||||||
|
return subreddits
|
||||||
|
|
||||||
|
|
||||||
|
def get_thread_meta(thread: dict) -> dict:
|
||||||
|
return {
|
||||||
|
'id': thread['id'],
|
||||||
|
'path': thread['permalink'].lower().replace(f'r/{thread["subreddit"]}', '').strip('/') + '.html',
|
||||||
|
'title': thread['title'],
|
||||||
|
'score': thread['score'],
|
||||||
|
'replies': str(len(thread['comments'])),
|
||||||
|
'body_short': thread['selftext'][:200],
|
||||||
|
'date': thread['created_utc'],
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def get_comment_meta(comment: dict) -> dict:
|
||||||
|
return {
|
||||||
|
'id': comment['id'],
|
||||||
|
'path': comment['path'],
|
||||||
|
'title': comment['title'],
|
||||||
|
'score': comment['score'],
|
||||||
|
'body_short': comment['selftext'][:200],
|
||||||
|
'date': comment['created_utc'],
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def get_lunr_index(subreddits: list[dict]):
|
||||||
|
print('Generating search index...')
|
||||||
to_index = []
|
to_index = []
|
||||||
chunk_size = 1000
|
chunk_size = 1000
|
||||||
idxs = []
|
idxs = []
|
||||||
metadata = {}
|
metadata = {}
|
||||||
for s in subreddits.keys():
|
for s in subreddits.keys():
|
||||||
for t in subreddits[s]:
|
for t in subreddits[s]:
|
||||||
meta = {}
|
meta = get_thread_meta(t)
|
||||||
t['path'] = t['permalink'].lower().replace(f'r/{s}', '').strip('/') + '.html'
|
|
||||||
meta['path'] = t['path']
|
|
||||||
meta['title'] = t['title']
|
|
||||||
meta['score'] = t['score']
|
|
||||||
meta['replies'] = str(len(t['comments']))
|
|
||||||
meta['body_short'] = t['selftext'][:200]
|
|
||||||
meta['date'] = t['created_utc']
|
|
||||||
metadata[t['id']] = meta
|
metadata[t['id']] = meta
|
||||||
|
i = (t, {'boost': t['score']})
|
||||||
to_index.append(t)
|
to_index.append(i)
|
||||||
|
|
||||||
chunks = [to_index[i * chunk_size:(i + 1) * chunk_size] for i in range((len(to_index) + chunk_size - 1) // chunk_size )]
|
chunks = [to_index[i * chunk_size:(i + 1) * chunk_size] for i in range((len(to_index) + chunk_size - 1) // chunk_size )]
|
||||||
for chunk in chunks:
|
for chunk in chunks:
|
||||||
|
print(f'\rParsing index chunk: {chunks.index(chunk) + 1}/{len(chunks)}', end='')
|
||||||
|
|
||||||
idxs.append(lunr(
|
idxs.append(lunr(
|
||||||
ref='id',
|
ref='id',
|
||||||
fields=[
|
fields=[
|
||||||
'id',
|
|
||||||
dict(field_name='title', boost=15),
|
dict(field_name='title', boost=15),
|
||||||
dict(field_name='selftext', boost=10),
|
dict(field_name='selftext', boost=10),
|
||||||
'score',
|
'score',
|
||||||
'author'
|
'author'
|
||||||
],
|
],
|
||||||
documents=chunk,
|
documents=chunk,
|
||||||
))
|
))
|
||||||
print(f'\rCreating index chunk: {chunks.index(chunk) + 1}/{len(chunks)}', end='')
|
|
||||||
print('')
|
print('')
|
||||||
return idxs, metadata
|
return idxs, metadata
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
parser.add_argument('config', type=str, help='Path to configuration file.')
|
parser.add_argument('config', type=str, help='Path to configuration file.')
|
||||||
|
@ -56,58 +110,24 @@ def main():
|
||||||
config = configparser.ConfigParser()
|
config = configparser.ConfigParser()
|
||||||
config.read(args.config)
|
config.read(args.config)
|
||||||
|
|
||||||
subreddits = {}
|
raw_posts = []
|
||||||
|
raw_comments = []
|
||||||
|
|
||||||
for s in config.sections():
|
for s in config.sections():
|
||||||
|
|
||||||
posts_path = config[s]['posts']
|
posts_path = config[s]['posts']
|
||||||
comments_path = config[s]['comments']
|
comments_path = config[s]['comments']
|
||||||
|
|
||||||
links = []
|
|
||||||
|
|
||||||
print(f"loading from {posts_path}")
|
print(f"Loading from {posts_path}")
|
||||||
raw_posts = return_redd_objects(posts_path)
|
raw_posts += return_redd_objects(posts_path)
|
||||||
print('done')
|
|
||||||
print(f"loading from {comments_path}")
|
print(f"Loading from {comments_path}")
|
||||||
raw_comments = return_redd_objects(comments_path)
|
raw_comments += return_redd_objects(comments_path)
|
||||||
print('done')
|
|
||||||
|
|
||||||
missing_perm = []
|
threads = rebuild_threads(raw_posts, raw_comments)
|
||||||
comments = {}
|
subreddits = rebuild_subreddits(threads)
|
||||||
for c in raw_comments:
|
|
||||||
if 'permalink' not in c.keys():
|
|
||||||
missing_perm.append(c)
|
|
||||||
continue
|
|
||||||
|
|
||||||
parent_url = '/'.join(urlparse(c['permalink']).path.split('/')[:6])
|
|
||||||
if parent_url.endswith('/'):
|
|
||||||
parent_url = parent_url[:-1]
|
|
||||||
if parent_url not in comments.keys():
|
|
||||||
comments[parent_url] = []
|
|
||||||
comments[parent_url].append(c)
|
|
||||||
|
|
||||||
|
idxs, metadata = get_lunr_index(subreddits)
|
||||||
complete_reddit_threads = []
|
|
||||||
|
|
||||||
for p in raw_posts:
|
|
||||||
p['comments'] = []
|
|
||||||
postp = urlparse(p['permalink']).path
|
|
||||||
if postp.endswith('/'):
|
|
||||||
postp = postp[:-1]
|
|
||||||
if postp in comments.keys():
|
|
||||||
p['comments'] = comments[postp]
|
|
||||||
|
|
||||||
complete_reddit_threads.append(p)
|
|
||||||
|
|
||||||
subreddits[s.lower()] = complete_reddit_threads
|
|
||||||
|
|
||||||
print("Total threads: ",len(raw_posts))
|
|
||||||
print("Total comments: ", len(raw_comments))
|
|
||||||
print("Comments missing permalinks: ", len(missing_perm))
|
|
||||||
print("Comment chains found: ", len(comments))
|
|
||||||
print("Threads rebuilt: ", len(complete_reddit_threads))
|
|
||||||
|
|
||||||
idxs, metadata = get_lunr_posts_index(subreddits)
|
|
||||||
|
|
||||||
os.makedirs('r/static/js/search/', exist_ok=True)
|
os.makedirs('r/static/js/search/', exist_ok=True)
|
||||||
|
|
||||||
|
@ -119,12 +139,15 @@ def main():
|
||||||
print(f'\rWriting: {idx_name}',end='')
|
print(f'\rWriting: {idx_name}',end='')
|
||||||
with open(f'r/{idx_name}', 'w') as f:
|
with open(f'r/{idx_name}', 'w') as f:
|
||||||
json.dump(idx.serialize(),f)
|
json.dump(idx.serialize(),f)
|
||||||
|
|
||||||
with open('r/static/js/search/search-idx-list.json','w') as f:
|
with open('r/static/js/search/search-idx-list.json','w') as f:
|
||||||
json.dump(idx_path_list, f)
|
json.dump(idx_path_list, f)
|
||||||
|
|
||||||
with open('r/static/js/search/metadata.json', 'w') as f:
|
with open('r/static/js/search/metadata.json', 'w') as f:
|
||||||
json.dump(metadata, f)
|
json.dump(metadata, f)
|
||||||
print('')
|
print('')
|
||||||
generate_html([s.lower() for s in config.sections()], subreddits)
|
|
||||||
|
generate_html(subreddits)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue