From d15cd075ba4c0d0c31973e0a8a82a3179aea3375 Mon Sep 17 00:00:00 2001
From: sys-nyx
Date: Thu, 26 Dec 2024 16:15:22 -0800
Subject: [PATCH] modified generate_html func in write_html.py to accept a
 dictionary containing subreddit content

---
 write_html.py | 103 ++++++++++++++++++++------------------------------
 1 file changed, 40 insertions(+), 63 deletions(-)

diff --git a/write_html.py b/write_html.py
index 05a6f58..e478375 100755
--- a/write_html.py
+++ b/write_html.py
@@ -95,53 +95,55 @@ with open('templates/partial_url.html', 'r', encoding='utf-8') as file:
 
 process = psutil.Process(os.getpid())
 
-def generate_html(min_score=0, min_comments=0, hide_deleted_comments=False):
+def generate_html(subs: list[str], sub_dict, min_score=0, min_comments=0, hide_deleted_comments=False):
     delta = timedelta(days=1)
-    subs = get_subs()
     user_index = {}
     processed_subs = []
     stat_links = 0
     stat_filtered_links = 0
     for sub in subs:
+        print("Building current sub: ", sub)
+
+        threads = sub_dict[sub]
+        print("Total threads to convert: ", len(threads))
+        built = 0
         # write link pages
         # print('generate_html() processing %s %s kb' % (sub, int(int(process.memory_info().rss) / 1024)))
         stat_sub_links = 0
         stat_sub_filtered_links = 0
         stat_sub_comments = 0
         d = start_date
-        while d <= end_date:
-            raw_links = load_links(d, sub, True)
-            stat_links += len(raw_links)
-            stat_sub_links += len(raw_links)
-            for l in raw_links:
-                if validate_link(l, min_score, min_comments):
-                    write_link_page(subs, l, sub, hide_deleted_comments)
-                    stat_filtered_links += 1
-                    stat_sub_filtered_links += 1
-                    if 'comments' in l:
-                        stat_sub_comments += len(l['comments'])
-            d += delta
+        stat_links += len(threads)
+        stat_sub_links += len(threads)
+        for t in threads:
+            if validate_link(t, min_score, min_comments):
+                write_link_page(subs, t, sub, hide_deleted_comments)
+                built += 1
+                stat_filtered_links += 1
+                stat_sub_filtered_links += 1
+                if 'comments' in t:
+                    stat_sub_comments += len(t['comments'])
+            if built % 100 == 0:
+                print(f"{built}/ {len(threads)}")
         if stat_sub_filtered_links > 0:
             processed_subs.append({'name': sub, 'num_links': stat_sub_filtered_links})
         print('%s: %s links filtered to %s' % (sub, stat_sub_links, stat_sub_filtered_links))
 
         # write subreddit pages
         valid_sub_links = []
-        d = start_date
-        while d <= end_date:
-            raw_links = load_links(d, sub)
-            for l in raw_links:
-                if validate_link(l, min_score, min_comments):
-                    valid_sub_links.append(l)
-
-                    # collect links for user pages
-                    # TODO: this is the least performant bit. load and generate user pages user by user instead.
-                    l['subreddit'] = sub
-                    if l['author'] not in user_index.keys():
-                        user_index[l['author']] = []
-                    user_index[l['author']].append(l)
-            d += delta
+
+        for t in threads:
+            if validate_link(t, min_score, min_comments):
+                valid_sub_links.append(t)
+
+                # collect links for user pages
+                # TODO: this is the least performant bit. load and generate user pages user by user instead.
+                t['subreddit'] = sub
+                if t['author'] not in user_index.keys():
+                    user_index[t['author']] = []
+                user_index[t['author']].append(t)
 
         write_subreddit_pages(sub, subs, valid_sub_links, stat_sub_filtered_links, stat_sub_comments)
         write_subreddit_search_page(sub, subs, valid_sub_links, stat_sub_filtered_links, stat_sub_comments)
@@ -202,7 +204,7 @@ def write_subreddit_pages(subreddit, subs, link_index, stat_sub_filtered_links,
             }
             link_html = template_link_url
             for key, value in index_link_data_map.items():
-                link_html = link_html.replace(key, value)
+                link_html = link_html.replace(key, str(value))
             links_html += link_html + '\n'
 
         index_page_data_map = {
@@ -272,7 +274,7 @@ def write_link_page(subreddits, link, subreddit='', hide_deleted_comments=False)
         css_classes = 'ml-' + (str(c['depth']) if int(c['depth']) <= max_comment_depth else str(max_comment_depth))
         if c['author'] == link['author'] and c['author'] not in removed_content_identifiers:
             css_classes += ' op'
-        if c['stickied'].lower() == 'true' or c['stickied'] is True:
+        if c['stickied'] is True:
             css_classes += ' stickied'
 
         # author link
@@ -287,12 +289,12 @@ def write_link_page(subreddits, link, subreddit='', hide_deleted_comments=False)
             '###SCORE###': str(c['score']) if len(str(c['score'])) > 0 else missing_comment_score_label,
             '###BODY###': snudown.markdown(c['body'].replace('&gt;','>')),
             '###CSS_CLASSES###': css_classes,
-            '###CLASS_SCORE###': 'badge-danger' if len(c['score']) > 0 and int(c['score']) < 1 else 'badge-secondary',
+            '###CLASS_SCORE###': 'badge-danger' if c['score'] > 0 and int(c['score']) < 1 else 'badge-secondary',
             '###HTML_AUTHOR_URL###': author_link_html,
         }
         comment_html = template_comment
         for key, value in comment_data_map.items():
-            comment_html = comment_html.replace(key, value)
+            comment_html = comment_html.replace(key, str(value))
         comments_html += comment_html + '\n'
 
     # render subreddits list
@@ -310,8 +312,8 @@ def write_link_page(subreddits, link, subreddit='', hide_deleted_comments=False)
     url = static_include_path + 'user/' + link['author'] + '.html'
     author_link_html = template_user_url.replace('###URL_AUTHOR###', url).replace('###AUTHOR###', link['author'])
 
-    html_title = template_url.replace('#HREF#', link['url']).replace('#INNER_HTML#', link['title'])
-    if link['is_self'] is True or link['is_self'].lower() == 'true':
+    html_title = template_url.replace('#INNER_HTML#', link['title'])
+    if link['is_self'] is True:
         html_title = link['title']
 
     # render link page
@@ -321,7 +323,7 @@ def write_link_page(subreddits, link, subreddit='', hide_deleted_comments=False)
         '###TITLE###': link['title'],
         '###ID###': link['id'],
         '###DATE###': created.strftime('%Y-%m-%d'),
-        '###ARCHIVE_DATE###': datetime.utcfromtimestamp(int(link['retrieved_on'])).strftime('%Y-%m-%d') if link['retrieved_on'] != '' else 'n/a',
+        # '###ARCHIVE_DATE###': datetime.utcfromtimestamp(int(link['retrieved_on'])).strftime('%Y-%m-%d') if link['retrieved_on'] != '' else 'n/a',
         '###SCORE###': str(link['score']),
        '###NUM_COMMENTS###': str(link['num_comments']),
         '###URL_PROJECT###': url_project,
@@ -379,7 +381,7 @@ def write_subreddit_search_page(subreddit, subs, link_index, stat_sub_filtered_l
         }
         link_html = template_search_link
         for key, value in index_link_data_map.items():
-            link_html = link_html.replace(key, value)
+            link_html = link_html.replace(key, str(value))
         links_html += link_html + '\n'
 
     index_page_data_map = {
@@ -458,7 +460,7 @@ def write_user_page(subs, user_index):
             }
             link_html = template_user_page_link
             for key, value in link_data_map.items():
-                link_html = link_html.replace(key, value)
+                link_html = link_html.replace(key, str(value))
             links_html += link_html + '\n'
 
     page_data_map = {
@@ -536,7 +538,7 @@ def sort_comments(comments, hide_deleted_comments=False):
         id_map[c['id']] = c
         parent_map[c['id']] = c['parent_id']
         # add stickied comments
-        if c['stickied'].lower() == 'true':
+        if c['stickied'] == True:
             sorted_comments.append(c)
         # store top level comments
         elif c['parent_id'] == c['link_id']:
@@ -561,7 +563,7 @@ def sort_comments(comments, hide_deleted_comments=False):
 
     # add orphaned comments
     for c in comments:
-        if c['parent_id'] != link_id and c['parent_id'].replace('t1_', '') not in id_map.keys():
+        if c['parent_id'] != link_id and str(c['parent_id']).replace('t1_', '') not in id_map.keys():
             if hide_deleted_comments and c['body'] in removed_content_identifiers:
                 continue
             sorted_linear_comments.append(c)
@@ -607,31 +609,6 @@ def validate_link(link, min_score=0, min_comments=0):
 
     return True
 
-def load_links(date, subreddit, with_comments=False):
-    links = []
-    if not date or not subreddit:
-        return links
-
-    date_path = date.strftime("%Y/%m/%d")
-    daily_path = 'data/' + subreddit + '/' + date_path
-    daily_links_path = daily_path + '/' + source_data_links
-    if os.path.isfile(daily_links_path):
-        links = []
-        with open(daily_links_path, 'r', encoding='utf-8') as links_file:
-            reader = csv.DictReader(links_file)
-            for link_row in reader:
-                if with_comments and 'id' in link_row.keys():
-                    comments = []
-                    comments_file_path = daily_path + '/' + link_row['id'] + '.csv'
-                    if os.path.isfile(comments_file_path):
-                        with open(comments_file_path, 'r', encoding='utf-8') as comments_file:
-                            reader = csv.DictReader(comments_file)
-                            for comment_row in reader:
-                                comments.append(comment_row)
-                    link_row['comments'] = comments
-                links.append(link_row)
-    return links
-
 def get_subs():
     subs = []
     if not os.path.isdir('data'):
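
Usage note (not part of the patch itself): with this change, generate_html() no longer reads links day by day through load_links(); the caller is now expected to pass the subreddit list and a dict mapping each subreddit name to its full list of thread dicts. A minimal sketch of what a call site might look like, e.g. near the bottom of write_html.py; load_all_threads() below is a hypothetical placeholder, not something this patch adds:

    # Hypothetical call site -- a sketch only. How threads get into memory is now
    # entirely up to the caller, since generate_html() no longer calls load_links().
    def load_all_threads(sub: str) -> list[dict]:
        """Placeholder: return every thread dict for one subreddit.

        Each dict should carry the usual fields ('id', 'author', 'score',
        'num_comments', ...) and optionally a 'comments' list of comment dicts.
        """
        raise NotImplementedError  # caller-specific: CSV files, a database, etc.

    subs = get_subs()
    # generate_html() now expects {subreddit_name: [thread_dict, ...]}
    sub_dict = {sub: load_all_threads(sub) for sub in subs}
    generate_html(subs, sub_dict, min_score=0, min_comments=0, hide_deleted_comments=False)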