mirror of https://github.com/sys-nyx/red-arch.git
synced 2025-05-06 08:45:31 -04:00
fix duplicate comments bug, resolves #10
parent 83fb77b41c
commit f2729e0231
2 changed files with 9 additions and 6 deletions

README.md · 10 changes

@@ -1,6 +1,6 @@
 ## reddit html archiver

-pulls reddit data from the [pushshift](https://github.com/pushshift/api) api and renders offline compatible html pages
+pulls reddit data from the [pushshift](https://github.com/pushshift/api) api and renders offline compatible html pages. uses the reddit markdown renderer.

 ### install
@@ -28,19 +28,21 @@ before running `fetch_links.py` or `write_html.py` to resolve encoding errors su
 data is fetched by subreddit and date range and is stored as csv files in `data`.

     ./fetch_links.py politics 2017-1-1 2017-2-1
-    # or add some link/post request filters
+    # or add some link/post filtering to download less data
     ./fetch_links.py --self_only --score "> 2000" politics 2015-1-1 2016-1-1
     # show available filters
     ./fetch_links.py -h

-you may need decrease your date range or adjust `pushshift_rate_limit_per_minute` in `fetch_links.py` if you are getting connection errors.
+decrease your date range or adjust `pushshift_rate_limit_per_minute` in `fetch_links.py` if you are getting connection errors.

 ### write web pages

 write html files for all subreddits to `r`.

     ./write_html.py
-    # or add some output filtering
+    # or add some output filtering for less fluff or a smaller archive size
     ./write_html.py --min-score 100 --min-comments 100 --hide-deleted-comments
     # show available filters
     ./write_html.py -h
+
+your html archive has been written to `r`. once you are satisfied with your archive feel free to copy/move the contents of `r` to elsewhere and to delete the git repos you have created. everything in `r` is fully self contained.
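
On the connection-errors advice in the README hunk above: `pushshift_rate_limit_per_minute` caps how often the pushshift api is hit, so lowering it simply spaces requests further apart. A minimal sketch of what the knob implies, assuming `fetch_links.py` uses it to pace its request loop; everything here except the variable name is hypothetical:

    import time

    pushshift_rate_limit_per_minute = 60   # name from the README; value hypothetical

    for page in range(3):                  # stand-in for the real paginated fetch loop
        # ... request one page of links from the pushshift api here ...
        time.sleep(60.0 / pushshift_rate_limit_per_minute)  # ~1 request per second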

write_html.py · 5 changes

@@ -417,7 +417,8 @@ def write_user_page(subs, user_index):
         author_url = l['author'] + '.html'
         author_link_html = author_link_html.replace('###URL_AUTHOR###', author_url).replace('###AUTHOR###', l['author'])

-        link_comments_url = '../' + l['permalink'].lower().strip('/').strip('r/')
+        link_comments_url = l['permalink'].lower().replace('/r/', '').strip('/')
+        link_comments_url = '../' + link_comments_url
         idpath = '/'.join(list(l['id']))
         link_comments_url = link_comments_url.replace(l['id'], idpath)
         link_comments_url += '.html'
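
A note on the permalink change above: `str.strip` takes a *set of characters*, not a substring, so the old `.strip('r/')` also eats leading letters of the subreddit name whenever it begins with `r`. A minimal sketch of the difference; the permalink value is hypothetical, not taken from the repo:

    # str.strip('r/') removes any run of the characters 'r' and '/' from both
    # ends -- it does not remove the substring 'r/'.
    permalink = '/r/rust/comments/abc123/reddit_thread/'

    old = '../' + permalink.lower().strip('/').strip('r/')
    print(old)  # ../ust/comments/abc123/reddit_thread  (the 'r' of 'rust' is lost)

    new = permalink.lower().replace('/r/', '').strip('/')
    new = '../' + new
    print(new)  # ../rust/comments/abc123/reddit_thread

The same character-set pitfall is what the next hunk fixes.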
@@ -542,7 +543,7 @@ def sort_comments(comments, hide_deleted_comments=False):

     # add orphaned comments
     for c in comments:
-        if c['parent_id'] != link_id and c['parent_id'].strip('t1_') not in id_map.keys():
+        if c['parent_id'] != link_id and c['parent_id'].replace('t1_', '') not in id_map.keys():
             if hide_deleted_comments and c['body'] in removed_content_identifiers:
                 continue
             sorted_linear_comments.append(c)
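
This hunk is the duplicate-comments bug itself: `.strip('t1_')` strips the characters `t`, `1`, and `_` from both ends of the id, so any comment id that starts or ends with one of those characters is mangled, the `id_map` lookup misses, and a comment that was already threaded under its parent is appended a second time as an orphan. A minimal sketch with a hypothetical id:

    id_map = {'d9x1t': 'already threaded'}   # hypothetical comment id

    parent_id = 't1_d9x1t'                   # reddit comment ids carry a 't1_' prefix

    print(parent_id.strip('t1_'))        # 'd9x'   -> not in id_map, comment is
                                         #            re-appended as an orphan (duplicate)
    print(parent_id.replace('t1_', ''))  # 'd9x1t' -> found in id_map, no duplicate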