diff --git a/README.md b/README.md index a33a98f..458a7d7 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ ## reddit html archiver -pulls reddit data from the [pushshift](https://github.com/pushshift/api) api and renders offline compatible html pages +pulls reddit data from the [pushshift](https://github.com/pushshift/api) api and renders offline compatible html pages. uses the reddit markdown renderer. ### install @@ -28,19 +28,21 @@ before running `fetch_links.py` or `write_html.py` to resolve encoding errors su data is fetched by subreddit and date range and is stored as csv files in `data`. ./fetch_links.py politics 2017-1-1 2017-2-1 - # or add some link/post request filters + # or add some link/post filtering to download less data ./fetch_links.py --self_only --score "> 2000" politics 2015-1-1 2016-1-1 + # show available filters ./fetch_links.py -h -you may need decrease your date range or adjust `pushshift_rate_limit_per_minute` in `fetch_links.py` if you are getting connection errors. +decrease your date range or adjust `pushshift_rate_limit_per_minute` in `fetch_links.py` if you are getting connection errors. ### write web pages write html files for all subreddits to `r`. ./write_html.py - # or add some output filtering + # or add some output filtering for less fluff or a smaller archive size ./write_html.py --min-score 100 --min-comments 100 --hide-deleted-comments + # show available filters ./write_html.py -h your html archive has been written to `r`. once you are satisfied with your archive feel free to copy/move the contents of `r` to elsewhere and to delete the git repos you have created. everything in `r` is fully self contained. diff --git a/write_html.py b/write_html.py index fdfbbb2..0eb1c99 100755 --- a/write_html.py +++ b/write_html.py @@ -417,7 +417,8 @@ def write_user_page(subs, user_index): author_url = l['author'] + '.html' author_link_html = author_link_html.replace('###URL_AUTHOR###', author_url).replace('###AUTHOR###', l['author']) - link_comments_url = '../' + l['permalink'].lower().strip('/').strip('r/') + link_comments_url = l['permalink'].lower().replace('/r/', '').strip('/') + link_comments_url = '../' + link_comments_url idpath = '/'.join(list(l['id'])) link_comments_url = link_comments_url.replace(l['id'], idpath) link_comments_url += '.html' @@ -542,7 +543,7 @@ def sort_comments(comments, hide_deleted_comments=False): # add orphaned comments for c in comments: - if c['parent_id'] != link_id and c['parent_id'].strip('t1_') not in id_map.keys(): + if c['parent_id'] != link_id and c['parent_id'].replace('t1_', '') not in id_map.keys(): if hide_deleted_comments and c['body'] in removed_content_identifiers: continue sorted_linear_comments.append(c)