fix duplicate comments bug, resolves #10

libertysoft3 2019-09-12 00:47:01 -07:00
parent 83fb77b41c
commit f2729e0231
2 changed files with 9 additions and 6 deletions

README.md

@@ -1,6 +1,6 @@
 ## reddit html archiver
-pulls reddit data from the [pushshift](https://github.com/pushshift/api) api and renders offline compatible html pages
+pulls reddit data from the [pushshift](https://github.com/pushshift/api) api and renders offline compatible html pages. uses the reddit markdown renderer.
 ### install
@@ -28,19 +28,21 @@ before running `fetch_links.py` or `write_html.py` to resolve encoding errors su
 data is fetched by subreddit and date range and is stored as csv files in `data`.
     ./fetch_links.py politics 2017-1-1 2017-2-1
-    # or add some link/post request filters
+    # or add some link/post filtering to download less data
     ./fetch_links.py --self_only --score "> 2000" politics 2015-1-1 2016-1-1
+    # show available filters
     ./fetch_links.py -h
-you may need decrease your date range or adjust `pushshift_rate_limit_per_minute` in `fetch_links.py` if you are getting connection errors.
+decrease your date range or adjust `pushshift_rate_limit_per_minute` in `fetch_links.py` if you are getting connection errors.
 ### write web pages
 write html files for all subreddits to `r`.
     ./write_html.py
-    # or add some output filtering
+    # or add some output filtering for less fluff or a smaller archive size
     ./write_html.py --min-score 100 --min-comments 100 --hide-deleted-comments
+    # show available filters
     ./write_html.py -h
 your html archive has been written to `r`. once you are satisfied with your archive feel free to copy/move the contents of `r` to elsewhere and to delete the git repos you have created. everything in `r` is fully self contained.

write_html.py

@@ -417,7 +417,8 @@ def write_user_page(subs, user_index):
         author_url = l['author'] + '.html'
         author_link_html = author_link_html.replace('###URL_AUTHOR###', author_url).replace('###AUTHOR###', l['author'])
-        link_comments_url = '../' + l['permalink'].lower().strip('/').strip('r/')
+        link_comments_url = l['permalink'].lower().replace('/r/', '').strip('/')
+        link_comments_url = '../' + link_comments_url
         idpath = '/'.join(list(l['id']))
         link_comments_url = link_comments_url.replace(l['id'], idpath)
         link_comments_url += '.html'
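
Note on this change: Python's `str.strip` takes a *set of characters* to trim from both ends, not a substring, so the old `.strip('r/')` could also eat leading letters of a subreddit name that happen to be `r`. A minimal sketch of the difference, using a made-up permalink:

    # str.strip('r/') trims any run of 'r' and '/' characters from both ends
    permalink = '/r/rust/comments/abc12/some_post/'
    old = '../' + permalink.lower().strip('/').strip('r/')
    print(old)  # '../ust/comments/abc12/some_post' -- the 'r' of 'rust' is lost
    # str.replace('/r/', '') removes only the literal substring, as in the fix
    new = '../' + permalink.lower().replace('/r/', '').strip('/')
    print(new)  # '../rust/comments/abc12/some_post'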
@@ -542,7 +543,7 @@ def sort_comments(comments, hide_deleted_comments=False):
     # add orphaned comments
     for c in comments:
-        if c['parent_id'] != link_id and c['parent_id'].strip('t1_') not in id_map.keys():
+        if c['parent_id'] != link_id and c['parent_id'].replace('t1_', '') not in id_map.keys():
             if hide_deleted_comments and c['body'] in removed_content_identifiers:
                 continue
             sorted_linear_comments.append(c)
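
This `replace` is the duplicate-comments fix itself: `parent_id` values carry a `t1_` type prefix, and `strip('t1_')` also trims trailing `t`, `1`, and `_` characters from the ID, so a parent that was present in `id_map` could miss the lookup and its child would be appended again as an orphan. A minimal illustration with a made-up base36 ID (real reddit IDs never contain `_`, so `replace('t1_', '')` only removes the prefix):

    id_map = {'e5t1': 'parent comment'}   # parent is actually in the map
    parent_id = 't1_e5t1'
    print(parent_id.strip('t1_'))         # 'e5'   -> lookup misses, comment duplicated
    print(parent_id.replace('t1_', ''))   # 'e5t1' -> lookup hits, no duplicate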