mirror of https://github.com/sys-nyx/red-arch.git, synced 2025-05-06 08:45:31 -04:00
fix duplicate comments bug, resolves #10

commit f2729e0231 (parent 83fb77b41c)
2 changed files with 9 additions and 6 deletions
README.md (10 changed lines)

@@ -1,6 +1,6 @@
 ## reddit html archiver

-pulls reddit data from the [pushshift](https://github.com/pushshift/api) api and renders offline compatible html pages
+pulls reddit data from the [pushshift](https://github.com/pushshift/api) api and renders offline compatible html pages. uses the reddit markdown renderer.

 ### install

@@ -28,19 +28,21 @@ before running `fetch_links.py` or `write_html.py` to resolve encoding errors su
 data is fetched by subreddit and date range and is stored as csv files in `data`.

 ./fetch_links.py politics 2017-1-1 2017-2-1
-# or add some link/post request filters
+# or add some link/post filtering to download less data
 ./fetch_links.py --self_only --score "> 2000" politics 2015-1-1 2016-1-1
+# show available filters
 ./fetch_links.py -h

-you may need decrease your date range or adjust `pushshift_rate_limit_per_minute` in `fetch_links.py` if you are getting connection errors.
+decrease your date range or adjust `pushshift_rate_limit_per_minute` in `fetch_links.py` if you are getting connection errors.

 ### write web pages

 write html files for all subreddits to `r`.

 ./write_html.py
-# or add some output filtering
+# or add some output filtering for less fluff or a smaller archive size
 ./write_html.py --min-score 100 --min-comments 100 --hide-deleted-comments
+# show available filters
 ./write_html.py -h

 your html archive has been written to `r`. once you are satisfied with your archive feel free to copy/move the contents of `r` to elsewhere and to delete the git repos you have created. everything in `r` is fully self contained.
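The `pushshift_rate_limit_per_minute` setting mentioned in the hunk above is a module-level value in `fetch_links.py`. As a rough illustration of how such a per-minute limit is usually consumed, here is a minimal sketch; only the constant's name comes from the README, while the `requests` dependency, the function, and the sleep-based throttling are assumptions rather than the project's actual fetch loop:

    import time
    import requests  # assumed dependency; fetch_links.py may use a different HTTP client

    # only this name comes from the README; the value here is arbitrary
    pushshift_rate_limit_per_minute = 60

    def rate_limited_get(url, params):
        # wait long enough that at most the configured number of requests
        # is issued per minute, then fetch one page of results
        time.sleep(60.0 / pushshift_rate_limit_per_minute)
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()
        return response.json()

Lowering the constant trades download speed for fewer connection errors, which is the same trade-off the README line describes.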
write_html.py

@@ -417,7 +417,8 @@ def write_user_page(subs, user_index):
         author_url = l['author'] + '.html'
         author_link_html = author_link_html.replace('###URL_AUTHOR###', author_url).replace('###AUTHOR###', l['author'])

-        link_comments_url = '../' + l['permalink'].lower().strip('/').strip('r/')
+        link_comments_url = l['permalink'].lower().replace('/r/', '').strip('/')
+        link_comments_url = '../' + link_comments_url
         idpath = '/'.join(list(l['id']))
         link_comments_url = link_comments_url.replace(l['id'], idpath)
         link_comments_url += '.html'
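Worth noting for the hunk above: `str.strip` removes a set of characters from both ends, not a literal prefix, so the old expression could also eat a trailing `r` or `/` that belongs to the permalink slug and point the comments link at a file that does not exist. A small sketch of the difference; the sample permalink and variable names are made up for illustration, only the two expressions mirror the diff:

    # hypothetical permalink; only the two expressions below come from the hunk above
    permalink = '/r/politics/comments/5abcde/some_title_ending_in_r/'

    old_url = '../' + permalink.lower().strip('/').strip('r/')         # old behaviour
    new_url = '../' + permalink.lower().replace('/r/', '').strip('/')  # new behaviour

    print(old_url)  # ../politics/comments/5abcde/some_title_ending_in_   (trailing 'r' eaten)
    print(new_url)  # ../politics/comments/5abcde/some_title_ending_in_r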
@@ -542,7 +543,7 @@ def sort_comments(comments, hide_deleted_comments=False):

     # add orphaned comments
     for c in comments:
-        if c['parent_id'] != link_id and c['parent_id'].strip('t1_') not in id_map.keys():
+        if c['parent_id'] != link_id and c['parent_id'].replace('t1_', '') not in id_map.keys():
             if hide_deleted_comments and c['body'] in removed_content_identifiers:
                 continue
             sorted_linear_comments.append(c)
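This hunk is the duplicate-comments fix named in the commit message: `strip('t1_')` deletes any run of the characters `t`, `1` and `_` from both ends of the parent id, so a parent id whose base36 part happens to begin or end with those characters no longer matches its key in `id_map`, and a comment that was already placed under its parent gets appended a second time as an orphan. `replace('t1_', '')` drops the literal `t1_` marker instead. A sketch of the failure mode with a made-up id and map:

    # made-up data; only the two expressions mirror the hunk above
    parent_id = 't1_1tabc1t'    # parent id whose base36 part starts and ends with 't'/'1'
    id_map = {'1tabc1t': 'comment already placed under its parent'}

    old_key = parent_id.strip('t1_')         # strips t, 1, _ from both ends -> 'abc'
    new_key = parent_id.replace('t1_', '')   # drops the literal marker      -> '1tabc1t'

    print(old_key in id_map)  # False -> comment looks orphaned and is appended again
    print(new_key in id_map)  # True  -> recognised, no duplicate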