mirror of https://github.com/sys-nyx/red-arch.git
synced 2025-05-06 08:45:31 -04:00
fix duplicate comments bug, resolves #10
parent 83fb77b41c
commit f2729e0231
2 changed files with 9 additions and 6 deletions

README.md · 10 changes

@@ -1,6 +1,6 @@
 ## reddit html archiver

-pulls reddit data from the [pushshift](https://github.com/pushshift/api) api and renders offline compatible html pages
+pulls reddit data from the [pushshift](https://github.com/pushshift/api) api and renders offline compatible html pages. uses the reddit markdown renderer.

 ### install
@@ -28,19 +28,21 @@ before running `fetch_links.py` or `write_html.py` to resolve encoding errors su
 data is fetched by subreddit and date range and is stored as csv files in `data`.

     ./fetch_links.py politics 2017-1-1 2017-2-1
-    # or add some link/post request filters
+    # or add some link/post filtering to download less data
     ./fetch_links.py --self_only --score "> 2000" politics 2015-1-1 2016-1-1
     # show available filters
     ./fetch_links.py -h

-you may need decrease your date range or adjust `pushshift_rate_limit_per_minute` in `fetch_links.py` if you are getting connection errors.
+decrease your date range or adjust `pushshift_rate_limit_per_minute` in `fetch_links.py` if you are getting connection errors.

 ### write web pages

 write html files for all subreddits to `r`.

     ./write_html.py
-    # or add some output filtering
+    # or add some output filtering for less fluff or a smaller archive size
     ./write_html.py --min-score 100 --min-comments 100 --hide-deleted-comments
     # show available filters
     ./write_html.py -h
+
+your html archive has been written to `r`. once you are satisfied with your archive feel free to copy/move the contents of `r` to elsewhere and to delete the git repos you have created. everything in `r` is fully self contained.
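
On the connection-errors advice in the README hunk above: `pushshift_rate_limit_per_minute` caps how often the pushshift api is hit, so lowering it simply spaces requests further apart. A minimal sketch of what the knob implies, assuming `fetch_links.py` uses it to pace its request loop; everything here except the variable name is hypothetical:

    import time

    pushshift_rate_limit_per_minute = 60   # name from the README; value hypothetical

    for page in range(3):                  # stand-in for the real paginated fetch loop
        # ... request one page of links from the pushshift api here ...
        time.sleep(60.0 / pushshift_rate_limit_per_minute)  # ~1 request per second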

write_html.py · 5 changes

@@ -417,7 +417,8 @@ def write_user_page(subs, user_index):
         author_url = l['author'] + '.html'
         author_link_html = author_link_html.replace('###URL_AUTHOR###', author_url).replace('###AUTHOR###', l['author'])

-        link_comments_url = '../' + l['permalink'].lower().strip('/').strip('r/')
+        link_comments_url = l['permalink'].lower().replace('/r/', '').strip('/')
+        link_comments_url = '../' + link_comments_url
         idpath = '/'.join(list(l['id']))
         link_comments_url = link_comments_url.replace(l['id'], idpath)
         link_comments_url += '.html'
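
A note on the permalink change above: `str.strip` takes a *set of characters*, not a substring, so the old `.strip('r/')` also eats leading letters of the subreddit name whenever it begins with `r`. A minimal sketch of the difference; the permalink value is hypothetical, not taken from the repo:

    # str.strip('r/') removes any run of the characters 'r' and '/' from both
    # ends -- it does not remove the substring 'r/'.
    permalink = '/r/rust/comments/abc123/reddit_thread/'

    old = '../' + permalink.lower().strip('/').strip('r/')
    print(old)  # ../ust/comments/abc123/reddit_thread  (the 'r' of 'rust' is lost)

    new = permalink.lower().replace('/r/', '').strip('/')
    new = '../' + new
    print(new)  # ../rust/comments/abc123/reddit_thread

The same character-set pitfall is what the next hunk fixes.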
@@ -542,7 +543,7 @@ def sort_comments(comments, hide_deleted_comments=False):

     # add orphaned comments
     for c in comments:
-        if c['parent_id'] != link_id and c['parent_id'].strip('t1_') not in id_map.keys():
+        if c['parent_id'] != link_id and c['parent_id'].replace('t1_', '') not in id_map.keys():
             if hide_deleted_comments and c['body'] in removed_content_identifiers:
                 continue
             sorted_linear_comments.append(c)
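
This hunk is the duplicate-comments bug itself: `.strip('t1_')` strips the characters `t`, `1`, and `_` from both ends of the id, so any comment id that starts or ends with one of those characters is mangled, the `id_map` lookup misses, and a comment that was already threaded under its parent is appended a second time as an orphan. A minimal sketch with a hypothetical id:

    id_map = {'d9x1t': 'already threaded'}   # hypothetical comment id

    parent_id = 't1_d9x1t'                   # reddit comment ids carry a 't1_' prefix

    print(parent_id.strip('t1_'))        # 'd9x'   -> not in id_map, comment is
                                         #            re-appended as an orphan (duplicate)
    print(parent_id.replace('t1_', ''))  # 'd9x1t' -> found in id_map, no duplicate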