mirror of https://github.com/sys-nyx/red-arch.git
synced 2025-05-06 00:35:25 -04:00

commit d9d000055b: initial

32 changed files with 1530 additions and 0 deletions
3 .gitignore vendored Normal file
@@ -0,0 +1,3 @@
/data
/r
!/r/static
72 README.md Normal file
@@ -0,0 +1,72 @@
## reddit html archiver

pulls reddit data from the [pushshift](https://github.com/pushshift/api) api and renders offline-compatible html pages

### install

requires python 3

    sudo apt-get install python3-pip
    pip install psaw
    git clone https://github.com/chid/snudown
    cd snudown
    sudo python setup.py install
    cd ..
    git clone [this repo]
    cd reddit-html-archiver
    chmod u+x *.py

### fetch reddit data from pushshift

data is fetched by subreddit and date range.

    ./fetch_links.py politics 2017-1-1 2017-2-1
    # or add some link/post request parameters
    ./fetch_links.py --self_only --score "> 2000" politics 2015-1-1 2016-1-1
    ./fetch_links.py -h

you may need to decrease your date range or adjust `pushshift_rate_limit_per_minute` in `fetch_links.py` if you are getting connection errors.
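
for reference, fetched data is written as csv files under `data/`, one directory per subreddit and per day. the layout below is inferred from `fetch_links.py`; the date and link id shown are made up:

    data/politics/2017/01/15/links.csv    # one row per submission
    data/politics/2017/01/15/7r9xkz.csv   # comments for the submission with id 7r9xkz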

### write web pages

write html files for all subreddits.

    ./write_html.py
    # or add some output filtering
    ./write_html.py --min-score 100 --min-comments 100 --hide-deleted-comments
    ./write_html.py -h

if you add more data later, delete everything in `r` aside from `r/static` and re-run the script to refresh your archive's pages (see the example below).
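
one way to do the refresh on a unix-like system (illustrative commands, adjust to your layout):

    find r -mindepth 1 -maxdepth 1 ! -name static -exec rm -rf {} +
    ./write_html.py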

### hosting the archived pages

copy the contents of the `r` directory to a web root or an appropriately served git repo, or serve it directly (see below).
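
for a quick local preview, any static file server works; a minimal sketch assuming python 3 is installed:

    cd r
    python3 -m http.server 8000

the generated pages use only relative links, so opening `r/index.html` directly from disk also works.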

### potential improvements

* fetch_links
  * num_comments filtering
  * thumbnails or thumbnail urls
  * media posts
  * update scores from the reddit api with [praw](https://github.com/praw-dev/praw)
* real templating
* filter output per sub, individual min score and comments filters
* js markdown url previews
* js powered search page, show no links by default
* user pages
  * add pagination, posts sorted by score, comments, date, sub
* too many files in one directory

### see also

* [pushshift](https://github.com/pushshift/api) [subreddit](https://www.reddit.com/r/pushshift/)
* [psaw](https://github.com/dmarx/psaw)
* [snudown](https://github.com/reddit/snudown)
* [redditsearch.io](https://redditsearch.io/)
* [reddit post archiver](https://github.com/sJohnsonStoever/redditPostArchiver)

### screenshots

![](screenshots/sub.jpg)
![](screenshots/post.jpg)
217 fetch_links.py Executable file
@@ -0,0 +1,217 @@
|||
#! /usr/bin/env python
|
||||
import time
|
||||
from time import mktime
|
||||
from datetime import datetime, timedelta
|
||||
import argparse
|
||||
from pprint import pprint
|
||||
import json
|
||||
import csv
|
||||
import os
|
||||
from psaw import PushshiftAPI
|
||||
|
||||
pushshift_rate_limit_per_minute = 20
|
||||
max_comments_per_query = 150
|
||||
write_every = 10
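# pushshift_rate_limit_per_minute throttles PushshiftAPI requests,
# max_comments_per_query caps comment ids per search_comments call, and
# write_every flushes collected links to csv after that many submissions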
|
||||
|
||||
link_fields = ['author', 'created_utc', 'domain', 'id', 'is_self',
|
||||
'num_comments', 'over_18', 'permalink', 'retrieved_on', 'score',
|
||||
'selftext', 'stickied', 'subreddit_id', 'title', 'url']
|
||||
comment_fields = ['author', 'body', 'created_utc', 'id', 'link_id',
|
||||
'parent_id', 'score', 'stickied', 'subreddit_id']
|
||||
|
||||
def fetch_links(subreddit=None, date_start=None, date_stop=None, limit=None, score=None, self_only=False):
|
||||
if subreddit is None or date_start is None or date_stop is None:
|
||||
print('ERROR: missing required arguments')
|
||||
exit()
|
||||
|
||||
api = PushshiftAPI(rate_limit_per_minute=pushshift_rate_limit_per_minute, detect_local_tz=False)
|
||||
|
||||
# get links
|
||||
links = []
|
||||
print('fetching submissions %s to %s...' % (time.strftime('%Y-%m-%d', date_start), time.strftime('%Y-%m-%d', date_stop)))
|
||||
params = {
|
||||
'after': int(mktime(date_start)) - 86400, # make date inclusive, adjust for UTC
|
||||
'before': int(mktime(date_stop)) + 86400,
|
||||
'subreddit': subreddit,
|
||||
'filter': link_fields,
|
||||
'sort': 'asc',
|
||||
'sort_type': 'created_utc',
|
||||
}
|
||||
if limit:
|
||||
params['limit'] = int(limit)
|
||||
if score:
|
||||
params['score'] = score
|
||||
if self_only:
|
||||
params['is_self'] = True
|
||||
link_results = list(api.search_submissions(**params))
|
||||
print('processing %s links' % len(link_results))
|
||||
for s in link_results:
|
||||
# print('%s %s' % (datetime.utcfromtimestamp(int(s.d_['created_utc'])), s.d_['title']))
|
||||
# pprint(s)
|
||||
|
||||
# get comment ids
|
||||
comments = []
|
||||
if s.d_['num_comments'] > 0 and not comment_data_exists(subreddit, s.d_['created_utc'], s.d_['id']):
|
||||
comment_ids = list(api._get_submission_comment_ids(s.d_['id']))
|
||||
# print('%s comment_ids: %s' % (data['id'], comment_ids))
|
||||
|
||||
# get comments
|
||||
if (len(comment_ids) > 0):
|
||||
mychunks = []
|
||||
if len(comment_ids) > max_comments_per_query:
|
||||
mychunks = chunks(comment_ids, max_comments_per_query)
|
||||
else:
|
||||
mychunks = [comment_ids]
|
||||
for chunk in mychunks:
|
||||
comment_params = {
|
||||
'filter': comment_fields,
|
||||
'ids': ','.join(chunk),
|
||||
'limit': max_comments_per_query,
|
||||
}
|
||||
comments_results = list(api.search_comments(**comment_params))
|
||||
print('%s fetch link %s comments %s/%s' % (datetime.utcfromtimestamp(int(s.d_['created_utc'])), s.d_['id'], len(comments_results), len(comment_ids)))
|
||||
for c in comments_results:
|
||||
comments.append(c.d_)
|
||||
|
||||
s.d_['comments'] = comments
|
||||
links.append(s.d_)
|
||||
|
||||
# write results
|
||||
if len(links) >= write_every:
|
||||
success = write_links(subreddit, links)
|
||||
if success:
|
||||
links = []
|
||||
|
||||
# write remaining results
|
||||
if len(links):
|
||||
write_links(subreddit, links)
|
||||
|
||||
# csvs are not guaranteed to be sorted by date but you can resume broken runs
|
||||
# and change sort criteria later to add more posts without getting duplicates.
|
||||
# delete csvs and re-run to update existing posts
|
||||
def write_links(subreddit, links):
|
||||
if links and len(links) > 0:
|
||||
writing_day = None
|
||||
file = None
|
||||
writer = None
|
||||
existing_link_ids = []
|
||||
wrote_links = 0
|
||||
wrote_comments = 0
|
||||
|
||||
for r in links:
|
||||
# print('%s link %s' % (r['id'], r['title']))
|
||||
|
||||
# grab link comments
|
||||
existing_comment_ids = []
|
||||
comments = r['comments']
|
||||
# print('%s comments %s' % (r['id'], comments))
|
||||
|
||||
created_ts = int(r['created_utc'])
|
||||
created = datetime.utcfromtimestamp(created_ts).strftime('%Y-%m-%d')
|
||||
created_path = datetime.utcfromtimestamp(created_ts).strftime('%Y/%m/%d')
|
||||
if created != writing_day:
|
||||
if file:
|
||||
file.close()
|
||||
writing_day = created
|
||||
path = 'data/' + subreddit + '/' + created_path
|
||||
os.makedirs(path, exist_ok=True)
|
||||
|
||||
# create and parse existing links
|
||||
filename = 'links.csv'
|
||||
filepath = path + '/' + filename
|
||||
if not os.path.isfile(filepath):
|
||||
file = open(filepath, 'a')
|
||||
writer = csv.DictWriter(file, fieldnames=link_fields)
|
||||
writer.writeheader()
|
||||
# print('created %s' % filepath)
|
||||
else:
|
||||
with open(filepath, 'r') as file:
|
||||
reader = csv.DictReader(file)
|
||||
for row in reader:
|
||||
existing_link_ids.append(row['id'])
|
||||
|
||||
file = open(filepath, 'a')
|
||||
writer = csv.DictWriter(file, fieldnames=link_fields)
|
||||
|
||||
# create and parse existing comments
|
||||
# writing empty comments csvs enables resuming and comment_data_exists()
|
||||
filename = r['id'] + '.csv'
|
||||
filepath = path + '/' + filename
|
||||
if not os.path.isfile(filepath):
|
||||
comments_file = open(filepath, 'a')
|
||||
comments_writer = csv.DictWriter(comments_file, fieldnames=comment_fields)
|
||||
comments_writer.writeheader()
|
||||
# print('created %s' % filepath)
|
||||
else:
|
||||
with open(filepath, 'r') as comments_file:
|
||||
reader = csv.DictReader(comments_file)
|
||||
for row in reader:
|
||||
existing_comment_ids.append(row['id'])
|
||||
|
||||
comments_file = open(filepath, 'a')
|
||||
comments_writer = csv.DictWriter(comments_file, fieldnames=comment_fields)
|
||||
|
||||
# write link row
|
||||
if r['id'] not in existing_link_ids:
|
||||
for field in list(r):
|
||||
if field not in link_fields:
|
||||
del r[field]
|
||||
|
||||
writer.writerow(r)
|
||||
wrote_links += 1
|
||||
|
||||
# write comments
|
||||
for c in comments:
|
||||
if c['id'] not in existing_comment_ids:
|
||||
for field in list(c):
|
||||
if field not in comment_fields:
|
||||
del c[field]
|
||||
comments_writer.writerow(c)
|
||||
wrote_comments += 1
|
||||
comments_file.close()
|
||||
|
||||
|
||||
print('got %s links, wrote %s and %s comments' % (len(links), wrote_links, wrote_comments))
|
||||
return True
|
||||
|
||||
def link_data_exists(subreddit, date):
|
||||
created_path = time.strftime('%Y/%m/%d', date)
|
||||
path = 'data/' + subreddit + '/' + created_path + '/links.csv'
|
||||
if not os.path.isfile(path):
|
||||
return False
|
||||
return True
|
||||
|
||||
def comment_data_exists(subreddit, link_created_utc, link_id):
|
||||
created_ts = int(link_created_utc)
|
||||
created_path = datetime.utcfromtimestamp(created_ts).strftime('%Y/%m/%d')
|
||||
path = 'data/' + subreddit + '/' + created_path + '/' + link_id + '.csv'
|
||||
if os.path.isfile(path):
|
||||
return True
|
||||
return False
|
||||
|
||||
def chunks(l, n):
|
||||
"""Yield successive n-sized chunks from l."""
|
||||
for i in range(0, len(l), n):
|
||||
yield l[i:i + n]
|
||||
|
||||
def mkdate(datestr):
|
||||
try:
|
||||
return time.strptime(datestr, '%Y-%m-%d')
|
||||
except ValueError:
|
||||
raise argparse.ArgumentTypeError(datestr + ' is not a proper date string')
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser=argparse.ArgumentParser()
|
||||
parser.add_argument('subreddit', help='subreddit to archive')
|
||||
parser.add_argument('date_start', type=mkdate, help='start archiving at date, e.g. 2005-1-1')
|
||||
parser.add_argument('date_stop', type=mkdate, help='stop archiving at this date, inclusive; cannot be the same as date_start')
|
||||
parser.add_argument('--limit', default=None, help='pushshift api limit param, default None')
|
||||
parser.add_argument('--score', default=None, help='pushshift api score param, e.g. "> 10", default None')
|
||||
parser.add_argument('--self_only', action="store_true", help='only fetch selftext submissions, default False')
|
||||
args=parser.parse_args()
|
||||
|
||||
self_only = False
|
||||
if args.self_only:
|
||||
self_only = True
|
||||
|
||||
fetch_links(args.subreddit, args.date_start, args.date_stop, args.limit, args.score, self_only)
|
103 r/static/css/archive.css Normal file
@@ -0,0 +1,103 @@
|
|||
footer {
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
/* markdown */
|
||||
.md {
|
||||
word-wrap: break-word;
|
||||
overflow-wrap: break-word;
|
||||
}
|
||||
.md p, .md ol, .md ul, .md blockquote {
|
||||
margin: 3px 0;
|
||||
}
|
||||
.md blockquote {
|
||||
border-left: 2px solid rgba(255, 255, 255, 0.4); /* text-muted */
|
||||
padding-left: 0.5rem;
|
||||
}
|
||||
.md blockquote, .md del {
|
||||
color: rgba(255, 255, 255, 0.4); /* text-muted */
|
||||
}
|
||||
.md code, .md pre {
|
||||
border: 1px solid #4E5D6C; /* alert-secondary */
|
||||
background: #4E5D6C; /* disabled form input color */
|
||||
}
|
||||
.md h1 {
|
||||
font-size: 1.5rem;
|
||||
}
|
||||
.md h2 {
|
||||
font-size: 1.4rem;
|
||||
}
|
||||
.md h3 {
|
||||
font-size: 1.3rem;
|
||||
}
|
||||
.md h4 {
|
||||
font-size: 1.2rem;
|
||||
}
|
||||
.md h5 {
|
||||
font-size: 1.1rem;
|
||||
}
|
||||
.md h6 {
|
||||
font-size: 1rem;
|
||||
font-weight: bold;
|
||||
}
|
||||
|
||||
/* subreddit links */
|
||||
.links .title {
|
||||
line-height: 1.25;
|
||||
}
|
||||
.links .title a, .submission .title a {
|
||||
color: inherit;
|
||||
}
|
||||
.search .title {
|
||||
color: inherit;
|
||||
display: block;
|
||||
}
|
||||
|
||||
/* link/post page */
|
||||
.op .author, .submission .author {
|
||||
color: #5bc0de; /* match bootstrap link color / badge-primary */
|
||||
}
|
||||
.submission .card-body {
|
||||
padding: 1rem;
|
||||
}
|
||||
|
||||
/* comments */
|
||||
.comment {
|
||||
margin-bottom: 5px; /*1rem;*/
|
||||
}
|
||||
.comment .byline, .comment .byline a {
|
||||
line-height: 1;
|
||||
}
|
||||
.collapsed .md {
|
||||
display: none;
|
||||
}
|
||||
.hidden {
|
||||
display: none;
|
||||
}
|
||||
.to-top {
|
||||
display: block;
|
||||
}
|
||||
.comments .ml-1 {
|
||||
margin-left: 1rem !important;
|
||||
}
|
||||
.comments .ml-2 {
|
||||
margin-left: 2rem !important;
|
||||
}
|
||||
.comments .ml-3 {
|
||||
margin-left: 3rem !important;
|
||||
}
|
||||
.comments .ml-4 {
|
||||
margin-left: 4rem !important;
|
||||
}
|
||||
.comments .ml-5 {
|
||||
margin-left: 5rem !important;
|
||||
}
|
||||
.comments .ml-6 {
|
||||
margin-left: 6rem !important;
|
||||
}
|
||||
.comments .ml-7 {
|
||||
margin-left: 7rem !important;
|
||||
}
|
||||
.comments .ml-8 {
|
||||
margin-left: 8rem !important;
|
||||
}
|
13 r/static/css/bootstrap-superhero.min.css vendored Normal file
File diff suppressed because one or more lines are too long
48 r/static/css/lato.css Normal file
@@ -0,0 +1,48 @@
|
|||
/* latin-ext */
|
||||
@font-face {
|
||||
font-family: 'Lato';
|
||||
font-style: normal;
|
||||
font-weight: 300;
|
||||
src: local('Lato Light'), local('Lato-Light'), url(../fonts/S6u9w4BMUTPHh7USSwaPGR_p.woff2) format('woff2');
|
||||
unicode-range: U+0100-024F, U+0259, U+1E00-1EFF, U+2020, U+20A0-20AB, U+20AD-20CF, U+2113, U+2C60-2C7F, U+A720-A7FF;
|
||||
}
|
||||
/* latin */
|
||||
@font-face {
|
||||
font-family: 'Lato';
|
||||
font-style: normal;
|
||||
font-weight: 300;
|
||||
src: local('Lato Light'), local('Lato-Light'), url(../fonts/S6u9w4BMUTPHh7USSwiPGQ.woff2) format('woff2');
|
||||
unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02BB-02BC, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2122, U+2191, U+2193, U+2212, U+2215, U+FEFF, U+FFFD;
|
||||
}
|
||||
/* latin-ext */
|
||||
@font-face {
|
||||
font-family: 'Lato';
|
||||
font-style: normal;
|
||||
font-weight: 400;
|
||||
src: local('Lato Regular'), local('Lato-Regular'), url(../fonts/S6uyw4BMUTPHjxAwXjeu.woff2) format('woff2');
|
||||
unicode-range: U+0100-024F, U+0259, U+1E00-1EFF, U+2020, U+20A0-20AB, U+20AD-20CF, U+2113, U+2C60-2C7F, U+A720-A7FF;
|
||||
}
|
||||
/* latin */
|
||||
@font-face {
|
||||
font-family: 'Lato';
|
||||
font-style: normal;
|
||||
font-weight: 400;
|
||||
src: local('Lato Regular'), local('Lato-Regular'), url(../fonts/S6uyw4BMUTPHjx4wXg.woff2) format('woff2');
|
||||
unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02BB-02BC, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2122, U+2191, U+2193, U+2212, U+2215, U+FEFF, U+FFFD;
|
||||
}
|
||||
/* latin-ext */
|
||||
@font-face {
|
||||
font-family: 'Lato';
|
||||
font-style: normal;
|
||||
font-weight: 700;
|
||||
src: local('Lato Bold'), local('Lato-Bold'), url(../fonts/S6u9w4BMUTPHh6UVSwaPGR_p.woff2) format('woff2');
|
||||
unicode-range: U+0100-024F, U+0259, U+1E00-1EFF, U+2020, U+20A0-20AB, U+20AD-20CF, U+2113, U+2C60-2C7F, U+A720-A7FF;
|
||||
}
|
||||
/* latin */
|
||||
@font-face {
|
||||
font-family: 'Lato';
|
||||
font-style: normal;
|
||||
font-weight: 700;
|
||||
src: local('Lato Bold'), local('Lato-Bold'), url(../fonts/S6u9w4BMUTPHh6UVSwiPGQ.woff2) format('woff2');
|
||||
unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02BB-02BC, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2122, U+2191, U+2193, U+2212, U+2215, U+FEFF, U+FFFD;
|
||||
}
|
BIN r/static/fonts/S6u9w4BMUTPHh6UVSwaPGR_p.woff2 Normal file
Binary file not shown.
BIN r/static/fonts/S6u9w4BMUTPHh6UVSwiPGQ.woff2 Normal file
Binary file not shown.
BIN r/static/fonts/S6u9w4BMUTPHh7USSwiPGQ.woff2 Normal file
Binary file not shown.
BIN r/static/fonts/S6uyw4BMUTPHjx4wXg.woff2 Normal file
Binary file not shown.
BIN r/static/fonts/S6uyw4BMUTPHjxAwXjeu.woff2 Normal file
Binary file not shown.
37 r/static/js/archive-comments.js Normal file
@@ -0,0 +1,37 @@
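// clicking a comment's score toggles its .collapsed class and then walks the
// following .comment elements, hiding or showing them until a comment at the
// same or shallower depth is reached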
|
|||
$(document).ready(function() {
|
||||
$('a.score').click(function(){
|
||||
var $comment = $(this).closest('.comment');
|
||||
var depth = parseInt($comment.data('depth'));
|
||||
if ($comment.hasClass('collapsed')) {
|
||||
$comment.removeClass('collapsed');
|
||||
$check_comment = $comment.next('.comment');
|
||||
depth_sibling = false;
|
||||
while (!depth_sibling) {
|
||||
if ($check_comment.length == 0) {
|
||||
depth_sibling = true;
|
||||
}
|
||||
if ($check_comment.data('depth') == '' || parseInt($check_comment.data('depth')) <= depth) {
|
||||
depth_sibling = true;
|
||||
} else {
|
||||
$check_comment.removeClass('hidden');
|
||||
}
|
||||
$check_comment = $check_comment.next('.comment');
|
||||
}
|
||||
} else {
|
||||
$comment.addClass('collapsed');
|
||||
$check_comment = $comment.next('.comment');
|
||||
depth_sibling = false;
|
||||
while (!depth_sibling) {
|
||||
if ($check_comment.length == 0) {
|
||||
depth_sibling = true;
|
||||
}
|
||||
if ($check_comment.data('depth') == '' || parseInt($check_comment.data('depth')) <= depth) {
|
||||
depth_sibling = true;
|
||||
} else {
|
||||
$check_comment.addClass('hidden');
|
||||
}
|
||||
$check_comment = $check_comment.next('.comment');
|
||||
}
|
||||
}
|
||||
});
|
||||
});
|
7 r/static/js/bootstrap.min.js vendored Normal file
File diff suppressed because one or more lines are too long
2 r/static/js/jquery-3.3.1.slim.min.js vendored Normal file
File diff suppressed because one or more lines are too long
BIN screenshots/post.jpg Normal file
Binary file not shown.
Size: 107 KiB
BIN screenshots/sub.jpg Normal file
Binary file not shown.
Size: 163 KiB
42 templates/index.html Normal file
@@ -0,0 +1,42 @@
|
|||
<!doctype html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
|
||||
<link rel="stylesheet" href="###INCLUDE_PATH###static/css/lato.css">
|
||||
<link rel="stylesheet" href="###INCLUDE_PATH###static/css/bootstrap-superhero.min.css">
|
||||
<link rel="stylesheet" href="###INCLUDE_PATH###static/css/archive.css">
|
||||
<title>###TITLE###</title>
|
||||
</head>
|
||||
<body>
|
||||
<header>
|
||||
<nav class="navbar navbar-expand-sm navbar-dark bg-primary">
|
||||
<span class="navbar-brand">###TITLE###</span>
|
||||
<button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbarNav" aria-controls="navbarNav" aria-expanded="false" aria-label="Toggle navigation">
|
||||
<span class="navbar-toggler-icon"></span>
|
||||
</button>
|
||||
<div class="collapse navbar-collapse" id="navbarNav">
|
||||
<ul class="navbar-nav">
|
||||
<li class="nav-item dropdown">
|
||||
<a class="nav-link dropdown-toggle" href="#" id="navbarDropdown" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">subreddits</a>
|
||||
<div class="dropdown-menu" aria-labelledby="navbarDropdown">
|
||||
<a class="dropdown-item" href="###URL_SUBS###">All</a>
|
||||
###HTML_SUBS_MENU###
|
||||
</div>
|
||||
</li>
|
||||
</ul>
|
||||
</div>
|
||||
</nav>
|
||||
</header>
|
||||
<main role="main" class="container-fluid">
|
||||
<div class="links mt-3">
|
||||
###HTML_LINKS###
|
||||
</div>
|
||||
</main>
|
||||
<footer class="container-fluid">
|
||||
<p class="small mb-0">archive has ###ARCH_NUM_POSTS### posts. <a href="###URL_PROJECT###">source code</a>.</p>
|
||||
</footer>
|
||||
<script src="###INCLUDE_PATH###static/js/jquery-3.3.1.slim.min.js"></script>
|
||||
<script src="###INCLUDE_PATH###static/js/bootstrap.min.js"></script>
|
||||
</body>
|
||||
</html>
|
62 templates/link.html Normal file
@@ -0,0 +1,62 @@
|
|||
<!doctype html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
|
||||
<link rel="stylesheet" href="###INCLUDE_PATH###static/css/lato.css">
|
||||
<link rel="stylesheet" href="###INCLUDE_PATH###static/css/bootstrap-superhero.min.css">
|
||||
<link rel="stylesheet" href="###INCLUDE_PATH###static/css/archive.css">
|
||||
<title>r/###SUB###: ###TITLE###</title>
|
||||
</head>
|
||||
<body>
|
||||
<header>
|
||||
<nav class="navbar navbar-expand-sm navbar-dark bg-primary">
|
||||
<a class="navbar-brand" href="###URL_SUB###">r/###SUB###</a>
|
||||
<button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbarNav" aria-controls="navbarNav" aria-expanded="false" aria-label="Toggle navigation">
|
||||
<span class="navbar-toggler-icon"></span>
|
||||
</button>
|
||||
<div class="collapse navbar-collapse" id="navbarNav">
|
||||
<ul class="navbar-nav">
|
||||
<li class="nav-item">
|
||||
<a class="nav-link" href="###URL_SUB###">score</a>
|
||||
</li>
|
||||
<li class="nav-item">
|
||||
<a class="nav-link" href="###URL_SUB_CMNT###">comments</a>
|
||||
</li>
|
||||
<li class="nav-item">
|
||||
<a class="nav-link" href="###URL_SUB_DATE###">date</a>
|
||||
</li>
|
||||
<li class="nav-item ###URL_SEARCH_CSS###">
|
||||
<a class="nav-link" href="###URL_SEARCH###">search</a>
|
||||
</li>
|
||||
<li class="nav-item dropdown">
|
||||
<a class="nav-link dropdown-toggle" href="#" id="navbarDropdown" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">subreddits</a>
|
||||
<div class="dropdown-menu" aria-labelledby="navbarDropdown">
|
||||
<a class="dropdown-item" href="###URL_SUBS###">All</a>
|
||||
###HTML_SUBS_MENU###
|
||||
</div>
|
||||
</li>
|
||||
</ul>
|
||||
</div>
|
||||
</nav>
|
||||
</header>
|
||||
<main role="main" class="container-fluid">
|
||||
<div class="submission pt-3" data-id="###ID###">
|
||||
<h3 class="title">###HTML_TITLE###</h3>
|
||||
<p><span class="badge badge-primary">###SCORE###</span> ###DATE### by ###HTML_AUTHOR_URL###</p>
|
||||
###HTML_SELFTEXT###
|
||||
</div>
|
||||
<div class="comments">
|
||||
<h5>###NUM_COMMENTS### comments</h5>
|
||||
###HTML_COMMENTS###
|
||||
</div>
|
||||
</main>
|
||||
<footer class="container-fluid">
|
||||
<a class="to-top mt-1 mb-1 btn btn-lg btn-primary" href="#top">top of page</a>
|
||||
<p class="small mb-0">data archived ###ARCHIVE_DATE###. <a href="###URL_PROJECT###">source code</a>.</p>
|
||||
</footer>
|
||||
<script src="###INCLUDE_PATH###static/js/jquery-3.3.1.slim.min.js"></script>
|
||||
<script src="###INCLUDE_PATH###static/js/bootstrap.min.js"></script>
|
||||
<script src="###INCLUDE_PATH###static/js/archive-comments.js"></script>
|
||||
</body>
|
||||
</html>
|
4 templates/partial_comment.html Normal file
@@ -0,0 +1,4 @@
|
|||
<div class="comment mb-3 ###CSS_CLASSES###" data-depth="###DEPTH###" data-id="###ID###">
|
||||
<p class="byline text-muted mb-0"><a href="javascript:;" class="score"><span class="badge ###CLASS_SCORE###">###SCORE###</span></a> ###HTML_AUTHOR_URL### ###DATE###</p>
|
||||
<div class="md">###BODY###</div>
|
||||
</div>
|
1 templates/partial_index_subreddit.html Normal file
@@ -0,0 +1 @@
|
|||
<h5><a class="subreddit" href="#URL_SUB#">#SUB#</a> <span class="badge badge-secondary">#NUM_LINKS#</span></h5>
|
4 templates/partial_link.html Normal file
@@ -0,0 +1,4 @@
|
|||
<div class="link mt-3">
|
||||
<h5 class="title mb-0"><a href="###URL###">###TITLE###</a></h5>
|
||||
<a href="###URL_COMMENTS###"><span class="badge badge-secondary">###SCORE###</span></a> <small class="text-muted"><a href="###URL_COMMENTS###">###NUM_COMMENTS### comments</a> ###DATE### ###HTML_AUTHOR_URL### ###LINK_DOMAIN###</small>
|
||||
</div>
|
1 templates/partial_link_selftext.html Normal file
@@ -0,0 +1 @@
|
|||
<div class="card bg-dark mb-3"><div class="card-body md">###SELFTEXT###</div></div>
|
1 templates/partial_menu_item.html Normal file
@@ -0,0 +1 @@
|
|||
<a class="dropdown-item" href="###URL_SUB###">###SUB###</a>
|
1 templates/partial_search_link.html Normal file
@@ -0,0 +1 @@
|
|||
<a class="title mb-1" href="###URL###">###TITLE###</a>
|
1 templates/partial_subreddit_pager_link.html Normal file
@@ -0,0 +1 @@
|
|||
<li class="page-item #CSS_CLASS#"><a class="page-link" href="#URL#">#TEXT#</a></li>
|
1 templates/partial_url.html Normal file
@@ -0,0 +1 @@
|
|||
<a href="#HREF#">#INNER_HTML#</a>
|
1 templates/partial_user.html Normal file
@@ -0,0 +1 @@
|
|||
<a class="author" href="###URL_AUTHOR###">###AUTHOR###</a>
|
4 templates/partial_user_link.html Normal file
@@ -0,0 +1,4 @@
|
|||
<div class="link mt-3">
|
||||
<h5 class="title mb-0"><a href="###URL###">###TITLE###</a></h5>
|
||||
<a href="###URL_COMMENTS###"><span class="badge badge-secondary">###SCORE###</span></a> <small class="text-muted"><a href="###URL_COMMENTS###">###NUM_COMMENTS### comments</a> ###DATE### ###HTML_AUTHOR_URL### in <a href="###SUB_URL###">r/###SUB###</a></small>
|
||||
</div>
|
54 templates/search.html Normal file
@@ -0,0 +1,54 @@
|
|||
<!doctype html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
|
||||
<link rel="stylesheet" href="###INCLUDE_PATH###static/css/lato.css">
|
||||
<link rel="stylesheet" href="###INCLUDE_PATH###static/css/bootstrap-superhero.min.css">
|
||||
<link rel="stylesheet" href="###INCLUDE_PATH###static/css/archive.css">
|
||||
<title>r/###SUB### ###TITLE###</title>
|
||||
</head>
|
||||
<body>
|
||||
<header>
|
||||
<nav class="navbar navbar-expand-sm navbar-dark bg-primary">
|
||||
<a class="navbar-brand" href="###URL_IDX_SCORE###">r/###SUB###</a>
|
||||
<button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbarNav" aria-controls="navbarNav" aria-expanded="false" aria-label="Toggle navigation">
|
||||
<span class="navbar-toggler-icon"></span>
|
||||
</button>
|
||||
<div class="collapse navbar-collapse" id="navbarNav">
|
||||
<ul class="navbar-nav">
|
||||
<li class="nav-item ###URL_IDX_SCORE_CSS###">
|
||||
<a class="nav-link" href="###URL_IDX_SCORE###">score</a>
|
||||
</li>
|
||||
<li class="nav-item ###URL_IDX_CMNT_CSS###">
|
||||
<a class="nav-link" href="###URL_IDX_CMNT###">comments</a>
|
||||
</li>
|
||||
<li class="nav-item ###URL_IDX_DATE_CSS###">
|
||||
<a class="nav-link" href="###URL_IDX_DATE###">date</a>
|
||||
</li>
|
||||
<li class="nav-item ###URL_SEARCH_CSS###">
|
||||
<a class="nav-link" href="###URL_SEARCH###">search</a>
|
||||
</li>
|
||||
<li class="nav-item dropdown">
|
||||
<a class="nav-link dropdown-toggle" href="#" id="navbarDropdown" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">subreddits</a>
|
||||
<div class="dropdown-menu" aria-labelledby="navbarDropdown">
|
||||
<a class="dropdown-item" href="###URL_SUBS###">All</a>
|
||||
###HTML_SUBS_MENU###
|
||||
</div>
|
||||
</li>
|
||||
</ul>
|
||||
</div>
|
||||
</nav>
|
||||
</header>
|
||||
<main role="main" class="container-fluid">
|
||||
<div class="links search mt-3">
|
||||
###HTML_LINKS###
|
||||
</div>
|
||||
</main>
|
||||
<footer class="container-fluid">
|
||||
<p class="small mb-0">r/###SUB### archive has ###ARCH_NUM_POSTS### posts and ###ARCH_NUM_COMMENTS### comments. <a href="###URL_PROJECT###">source code</a>.</p>
|
||||
</footer>
|
||||
<script src="###INCLUDE_PATH###static/js/jquery-3.3.1.slim.min.js"></script>
|
||||
<script src="###INCLUDE_PATH###static/js/bootstrap.min.js"></script>
|
||||
</body>
|
||||
</html>
|
60 templates/subreddit.html Normal file
@@ -0,0 +1,60 @@
|
|||
<!doctype html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
|
||||
<link rel="stylesheet" href="###INCLUDE_PATH###static/css/lato.css">
|
||||
<link rel="stylesheet" href="###INCLUDE_PATH###static/css/bootstrap-superhero.min.css">
|
||||
<link rel="stylesheet" href="###INCLUDE_PATH###static/css/archive.css">
|
||||
<title>r/###SUB### ###TITLE###</title>
|
||||
</head>
|
||||
<body>
|
||||
<header>
|
||||
<nav class="navbar navbar-expand-sm navbar-dark bg-primary">
|
||||
<a class="navbar-brand" href="###URL_IDX_SCORE###">r/###SUB###</a>
|
||||
<button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbarNav" aria-controls="navbarNav" aria-expanded="false" aria-label="Toggle navigation">
|
||||
<span class="navbar-toggler-icon"></span>
|
||||
</button>
|
||||
<div class="collapse navbar-collapse" id="navbarNav">
|
||||
<ul class="navbar-nav">
|
||||
<li class="nav-item ###URL_IDX_SCORE_CSS###">
|
||||
<a class="nav-link" href="###URL_IDX_SCORE###">score</a>
|
||||
</li>
|
||||
<li class="nav-item ###URL_IDX_CMNT_CSS###">
|
||||
<a class="nav-link" href="###URL_IDX_CMNT###">comments</a>
|
||||
</li>
|
||||
<li class="nav-item ###URL_IDX_DATE_CSS###">
|
||||
<a class="nav-link" href="###URL_IDX_DATE###">date</a>
|
||||
</li>
|
||||
<li class="nav-item ###URL_SEARCH_CSS###">
|
||||
<a class="nav-link" href="###URL_SEARCH###">search</a>
|
||||
</li>
|
||||
<li class="nav-item dropdown">
|
||||
<a class="nav-link dropdown-toggle" href="#" id="navbarDropdown" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">subreddits</a>
|
||||
<div class="dropdown-menu" aria-labelledby="navbarDropdown">
|
||||
<a class="dropdown-item" href="###URL_SUBS###">All</a>
|
||||
###HTML_SUBS_MENU###
|
||||
</div>
|
||||
</li>
|
||||
</ul>
|
||||
</div>
|
||||
</nav>
|
||||
</header>
|
||||
<main role="main" class="container-fluid">
|
||||
<ul class="pagination pagination-sm mt-3">
|
||||
###HTML_PAGER###
|
||||
</ul>
|
||||
<div class="links">
|
||||
###HTML_LINKS###
|
||||
</div>
|
||||
<ul class="pagination pagination-sm mt-3">
|
||||
###HTML_PAGER###
|
||||
</ul>
|
||||
</main>
|
||||
<footer class="container-fluid">
|
||||
<p class="small mb-0">r/###SUB### archive has ###ARCH_NUM_POSTS### posts and ###ARCH_NUM_COMMENTS### comments. <a href="###URL_PROJECT###">source code</a>.</p>
|
||||
</footer>
|
||||
<script src="###INCLUDE_PATH###static/js/jquery-3.3.1.slim.min.js"></script>
|
||||
<script src="###INCLUDE_PATH###static/js/bootstrap.min.js"></script>
|
||||
</body>
|
||||
</html>
|
42 templates/user.html Normal file
@@ -0,0 +1,42 @@
|
|||
<!doctype html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
|
||||
<link rel="stylesheet" href="###INCLUDE_PATH###static/css/lato.css">
|
||||
<link rel="stylesheet" href="###INCLUDE_PATH###static/css/bootstrap-superhero.min.css">
|
||||
<link rel="stylesheet" href="###INCLUDE_PATH###static/css/archive.css">
|
||||
<title>###TITLE###</title>
|
||||
</head>
|
||||
<body>
|
||||
<header>
|
||||
<nav class="navbar navbar-expand-sm navbar-dark bg-primary">
|
||||
<a class="navbar-brand" href="###URL_USER###">###TITLE###</a>
|
||||
<button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbarNav" aria-controls="navbarNav" aria-expanded="false" aria-label="Toggle navigation">
|
||||
<span class="navbar-toggler-icon"></span>
|
||||
</button>
|
||||
<div class="collapse navbar-collapse" id="navbarNav">
|
||||
<ul class="navbar-nav">
|
||||
<li class="nav-item dropdown">
|
||||
<a class="nav-link dropdown-toggle" href="#" id="navbarDropdown" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">subreddits</a>
|
||||
<div class="dropdown-menu" aria-labelledby="navbarDropdown">
|
||||
<a class="dropdown-item" href="###URL_SUBS###">All</a>
|
||||
###HTML_SUBS_MENU###
|
||||
</div>
|
||||
</li>
|
||||
</ul>
|
||||
</div>
|
||||
</nav>
|
||||
</header>
|
||||
<main role="main" class="container-fluid">
|
||||
<div class="links">
|
||||
###HTML_LINKS###
|
||||
</div>
|
||||
</main>
|
||||
<footer class="container-fluid">
|
||||
<p class="small mb-0">archive has ###ARCH_NUM_POSTS### user posts. <a href="###URL_PROJECT###">source code</a>.</p>
|
||||
</footer>
|
||||
<script src="###INCLUDE_PATH###static/js/jquery-3.3.1.slim.min.js"></script>
|
||||
<script src="###INCLUDE_PATH###static/js/bootstrap.min.js"></script>
|
||||
</body>
|
||||
</html>
|
749 write_html.py Executable file
@@ -0,0 +1,749 @@
|
|||
#! /usr/bin/env python
|
||||
from datetime import datetime, date, timedelta
|
||||
import argparse
|
||||
import csv
|
||||
import os
|
||||
import re
|
||||
import snudown
|
||||
|
||||
url_project = 'https://github.com/libertysoft3/reddit-html-archiver'
|
||||
links_per_page = 30
|
||||
pager_skip = 10
|
||||
pager_skip_long = 100
|
||||
start_date = date(2005, 1, 1)
|
||||
end_date = datetime.today().date() + timedelta(days=1)
|
||||
source_data_links = 'links.csv'
|
||||
max_comment_depth = 8 # mostly for mobile, which might be silly
|
||||
removed_content_identifiers = ['[deleted]','deleted','[removed]','removed']
|
||||
default_sort = 'score'
|
||||
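# each sort key maps to a default value (used when the csv field is empty)
# and a url slug used for the generated index-<slug>/ directories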
sort_indexes = {
|
||||
'score': {
|
||||
'default': 1,
|
||||
'slug': 'score'
|
||||
},
|
||||
'num_comments': {
|
||||
'default': 0,
|
||||
'slug': 'comments',
|
||||
},
|
||||
'created_utc': {
|
||||
'default': 1000198000,
|
||||
'slug': 'date',
|
||||
}
|
||||
}
|
||||
|
||||
template_index = ''
|
||||
with open('templates/index.html', 'r') as file:
|
||||
template_index = file.read()
|
||||
|
||||
template_subreddit = ''
|
||||
with open('templates/subreddit.html', 'r') as file:
|
||||
template_subreddit = file.read()
|
||||
|
||||
template_link = ''
|
||||
with open('templates/link.html', 'r') as file:
|
||||
template_link = file.read()
|
||||
|
||||
template_comment = ''
|
||||
with open('templates/partial_comment.html', 'r') as file:
|
||||
template_comment = file.read()
|
||||
|
||||
template_search = ''
|
||||
with open('templates/search.html', 'r') as file:
|
||||
template_search = file.read()
|
||||
|
||||
template_user = ''
|
||||
with open('templates/user.html', 'r') as file:
|
||||
template_user = file.read()
|
||||
|
||||
template_sub_link = ''
|
||||
with open('templates/partial_menu_item.html', 'r') as file:
|
||||
template_sub_link = file.read()
|
||||
|
||||
template_user_url = ''
|
||||
with open('templates/partial_user.html', 'r') as file:
|
||||
template_user_url = file.read()
|
||||
|
||||
template_link_url = ''
|
||||
with open('templates/partial_link.html', 'r') as file:
|
||||
template_link_url = file.read()
|
||||
|
||||
template_search_link = ''
|
||||
with open('templates/partial_search_link.html', 'r') as file:
|
||||
template_search_link = file.read()
|
||||
|
||||
template_index_sub = ''
|
||||
with open('templates/partial_index_subreddit.html', 'r') as file:
|
||||
template_index_sub = file.read()
|
||||
|
||||
template_index_pager_link = ''
|
||||
with open('templates/partial_subreddit_pager_link.html', 'r') as file:
|
||||
template_index_pager_link = file.read()
|
||||
|
||||
template_selftext = ''
|
||||
with open('templates/partial_link_selftext.html', 'r') as file:
|
||||
template_selftext = file.read()
|
||||
|
||||
template_user_page_link = ''
|
||||
with open('templates/partial_user_link.html', 'r') as file:
|
||||
template_user_page_link = file.read()
|
||||
|
||||
template_url = ''
|
||||
with open('templates/partial_url.html', 'r') as file:
|
||||
template_url = file.read()
|
||||
|
||||
def generate_html(min_score=0, min_comments=0, hide_deleted_comments=False):
|
||||
delta = timedelta(days=1)
|
||||
subs = get_subs()
|
||||
stat_links = 0
|
||||
stat_filtered_links = 0
|
||||
user_index = {}
|
||||
processed_subs = []
|
||||
|
||||
for sub in subs:
|
||||
d = start_date
|
||||
sub_links = []
|
||||
stat_sub_links = 0
|
||||
stat_sub_filtered_links = 0
|
||||
stat_sub_comments = 0
|
||||
while d <= end_date:
|
||||
raw_links = load_links(d, sub)
|
||||
# print ('processing %s %s %s links' % (sub, d.strftime("%Y-%m-%d"), len(sub_links)))
|
||||
stat_links += len(raw_links)
|
||||
stat_sub_links += len(raw_links)
|
||||
for l in raw_links:
|
||||
if validate_link(l, min_score, min_comments):
|
||||
stat_filtered_links += 1
|
||||
stat_sub_filtered_links += 1
|
||||
stat_sub_comments += len(l['comments'])  # count this link's comments
|
||||
sub_links.append(l)
|
||||
if l['author'] not in user_index.keys():
|
||||
user_index[l['author']] = []
|
||||
l['subreddit'] = sub
|
||||
user_index[l['author']].append(l)
|
||||
# TODO: return comments written
|
||||
write_link_page(subs, l, sub, hide_deleted_comments)
|
||||
d += delta
|
||||
if stat_sub_filtered_links > 0:
|
||||
processed_subs.append({'name': sub, 'num_links': stat_sub_filtered_links})
|
||||
write_subreddit_pages(sub, subs, sub_links, stat_sub_filtered_links, stat_sub_comments)
|
||||
write_subreddit_search_page(sub, subs, sub_links, stat_sub_filtered_links, stat_sub_comments)
|
||||
print('%s: %s links filtered to %s' % (sub, stat_sub_links, stat_sub_filtered_links))
|
||||
write_index(processed_subs)
|
||||
write_user_page(processed_subs, user_index)
|
||||
print('all done. %s links filtered to %s' % (stat_links, stat_filtered_links))
|
||||
|
||||
def write_subreddit_pages(subreddit, subs, link_index, stat_sub_filtered_links, stat_sub_comments):
|
||||
if len(link_index) == 0:
|
||||
return True
|
||||
|
||||
for sort in sort_indexes.keys():
|
||||
links = sorted(link_index, key=lambda k: (int(k[sort]) if k[sort] != '' else sort_indexes[sort]['default']), reverse=True)
|
||||
pages = list(chunks(links, links_per_page))
|
||||
page_num = 0
|
||||
|
||||
sort_based_prefix = '../'
|
||||
if sort == default_sort:
|
||||
sort_based_prefix = ''
|
||||
|
||||
# render subreddits list
|
||||
subs_menu_html = ''
|
||||
for sub in subs:
|
||||
sub_url = sort_based_prefix + '../' + sub + '/index.html'
|
||||
subs_menu_html += template_sub_link.replace('###URL_SUB###', sub_url).replace('###SUB###', sub)
|
||||
|
||||
for page in pages:
|
||||
page_num += 1
|
||||
# print('%s page' % (page))
|
||||
|
||||
links_html = ''
|
||||
for l in page:
|
||||
author_link_html = template_user_url
|
||||
author_url = sort_based_prefix + '../user/' + l['author'] + '.html'
|
||||
author_link_html = author_link_html.replace('###URL_AUTHOR###', author_url).replace('###AUTHOR###', l['author'])
|
||||
|
||||
link_url = l['url']
|
||||
link_comments_url = sort_based_prefix + l['permalink'].strip('/')
|
||||
link_comments_url = link_comments_url.replace('r/' + subreddit + '/', '')
|
||||
idpath = '/'.join(list(l['id']))
|
||||
link_comments_url = link_comments_url.replace(l['id'], idpath)
|
||||
link_comments_url += '.html'
|
||||
if l['is_self'] is True or l['is_self'] == 'True':
|
||||
link_url = link_comments_url
|
||||
|
||||
index_link_data_map = {
|
||||
'###TITLE###': l['title'],
|
||||
'###URL###': link_url,
|
||||
'###URL_COMMENTS###': link_comments_url,
|
||||
'###SCORE###': l['score'],
|
||||
'###NUM_COMMENTS###': l['num_comments'] if int(l['num_comments']) > 0 else 0,
|
||||
'###DATE###': datetime.utcfromtimestamp(int(l['created_utc'])).strftime('%Y-%m-%d'),
|
||||
'###LINK_DOMAIN###': '(self.' + l['subreddit'] + ')' if l['is_self'] is True or l['is_self'] == 'True' else '',
|
||||
'###HTML_AUTHOR_URL###': author_link_html,
|
||||
}
|
||||
link_html = template_link_url
|
||||
for key, value in index_link_data_map.items():
|
||||
link_html = link_html.replace(key, value)
|
||||
links_html += link_html + '\n'
|
||||
|
||||
index_page_data_map = {
|
||||
'###INCLUDE_PATH###': sort_based_prefix + '../',
|
||||
'###TITLE###': 'by ' + sort_indexes[sort]['slug'] + ' page ' + str(page_num) + ' of ' + str(len(pages)),
|
||||
'###SUB###': subreddit,
|
||||
'###ARCH_NUM_POSTS###': str(stat_sub_filtered_links),
|
||||
'###ARCH_NUM_COMMENTS###': str(stat_sub_comments),
|
||||
'###URL_SUBS###': sort_based_prefix + '../index.html',
|
||||
'###URL_PROJECT###': url_project,
|
||||
'###URL_IDX_SCORE###': sort_based_prefix + 'index.html',
|
||||
'###URL_IDX_CMNT###': sort_based_prefix + 'index-' + sort_indexes['num_comments']['slug'] + '/index.html',
|
||||
'###URL_IDX_DATE###': sort_based_prefix + 'index-' + sort_indexes['created_utc']['slug'] + '/index.html',
|
||||
'###URL_SEARCH###': sort_based_prefix + 'search.html',
|
||||
'###URL_IDX_SCORE_CSS###': 'active' if sort == 'score' else '',
|
||||
'###URL_IDX_CMNT_CSS###': 'active' if sort == 'num_comments' else '',
|
||||
'###URL_IDX_DATE_CSS###': 'active' if sort == 'created_utc' else '',
|
||||
'###URL_SEARCH_CSS###': '',
|
||||
'###HTML_LINKS###': links_html,
|
||||
'###HTML_SUBS_MENU###': subs_menu_html,
|
||||
'###HTML_PAGER###': get_pager_html(page_num, len(pages)),
|
||||
}
|
||||
page_html = template_subreddit
|
||||
for key, value in index_page_data_map.items():
|
||||
page_html = page_html.replace(key, value)
|
||||
|
||||
|
||||
# write file
|
||||
suffix = '-' + str(page_num) + '.html'
|
||||
if page_num == 1:
|
||||
suffix = '.html'
|
||||
filename = 'index' + suffix
|
||||
if sort == default_sort:
|
||||
filepath = 'r/' + subreddit + '/' + filename
|
||||
else:
|
||||
filepath = 'r/' + subreddit + '/index-' + sort_indexes[sort]['slug'] + '/' + filename
|
||||
if not os.path.isfile(filepath):
|
||||
os.makedirs(os.path.dirname(filepath), exist_ok=True)
|
||||
with open(filepath, 'w') as file:
|
||||
file.write(page_html)
|
||||
# print('wrote %s %s, %s links' % (sort, filepath, len(page)))
|
||||
|
||||
return True
|
||||
|
||||
def write_link_page(subreddits, link, subreddit='', hide_deleted_comments=False):
|
||||
# reddit: https://www.reddit.com/r/conspiracy/comments/8742iv/happening_now_classmate_former_friend_of/
|
||||
# archive: r/conspiracy/comments/8/7/4/2/i/v/happening_now_classmate_former_friend_of.html
|
||||
idpath = '/'.join(list(link['id']))
|
||||
filepath = link['permalink'].strip('/') + '.html'
|
||||
filepath = filepath.replace(link['id'], idpath)
|
||||
if os.path.isfile(filepath):
|
||||
return True
|
||||
|
||||
created = datetime.utcfromtimestamp(int(link['created_utc']))
|
||||
sorted_comments = []
|
||||
if len(link['comments']) > 0:
|
||||
sorted_comments = sort_comments(link['comments'], hide_deleted_comments)
|
||||
|
||||
# traverse up to root dir, depends on id length
|
||||
static_include_path = ''
|
||||
for i in range(len(link['id']) + 2):
|
||||
static_include_path += '../'
|
||||
|
||||
# render comments
|
||||
comments_html = ''
|
||||
for c in sorted_comments:
|
||||
css_classes = 'ml-' + (str(c['depth']) if int(c['depth']) <= max_comment_depth else str(max_comment_depth))
|
||||
if c['author'] == link['author'] and c['author'] not in removed_content_identifiers:
|
||||
css_classes += ' op'
|
||||
if c['stickied'].lower() == 'true' or c['stickied'] is True:
|
||||
css_classes += ' stickied'
|
||||
|
||||
# author link
|
||||
url = static_include_path + 'user/' + c['author'] + '.html'
|
||||
author_link_html = template_user_url.replace('###URL_AUTHOR###', url).replace('###AUTHOR###', c['author'])
|
||||
|
||||
comment_data_map = {
|
||||
'###ID###': c['id'],
|
||||
'###PARENT_ID###': c['parent_id'],
|
||||
'###DEPTH###': str(c['depth']),
|
||||
'###DATE###': created.strftime('%Y-%m-%d'),
|
||||
'###SCORE###': c['score'],
|
||||
'###BODY###': snudown.markdown(c['body'].replace('&gt;', '>')),
|
||||
'###CSS_CLASSES###': css_classes,
|
||||
'###CLASS_SCORE###': 'badge-danger' if len(c['score']) > 0 and int(c['score']) < 1 else 'badge-secondary',
|
||||
'###HTML_AUTHOR_URL###': author_link_html,
|
||||
}
|
||||
comment_html = template_comment
|
||||
for key, value in comment_data_map.items():
|
||||
comment_html = comment_html.replace(key, value)
|
||||
comments_html += comment_html + '\n'
|
||||
|
||||
# render subreddits list
|
||||
subs_menu_html = ''
|
||||
for sub in subreddits:
|
||||
sub_url = static_include_path + sub + '/index.html'
|
||||
subs_menu_html += template_sub_link.replace('###URL_SUB###', sub_url).replace('###SUB###', sub)
|
||||
|
||||
# render selftext
|
||||
selftext_html = ''
|
||||
if len(link['selftext']) > 0:
|
||||
selftext_html = template_selftext.replace('###SELFTEXT###', snudown.markdown(link['selftext'].replace('&gt;', '>')))
|
||||
|
||||
# author link
|
||||
url = static_include_path + 'user/' + link['author'] + '.html'
|
||||
author_link_html = template_user_url.replace('###URL_AUTHOR###', url).replace('###AUTHOR###', link['author'])
|
||||
|
||||
html_title = template_url.replace('#HREF#', link['url']).replace('#INNER_HTML#', link['title'])
|
||||
if link['is_self'] is True or link['is_self'].lower() == 'true':
|
||||
html_title = link['title']
|
||||
|
||||
# render link page
|
||||
link_data_map = {
|
||||
'###INCLUDE_PATH###': static_include_path,
|
||||
'###SUB###': subreddit,
|
||||
'###TITLE###': link['title'],
|
||||
'###ID###': link['id'],
|
||||
'###DATE###': created.strftime('%Y-%m-%d'),
|
||||
'###ARCHIVE_DATE###': datetime.utcfromtimestamp(int(link['retrieved_on'])).strftime('%Y-%m-%d') if link['retrieved_on'] != '' else 'n/a',
|
||||
'###SCORE###': link['score'],
|
||||
'###NUM_COMMENTS###': link['num_comments'],
|
||||
'###URL_PROJECT###': url_project,
|
||||
'###URL_SUBS###': static_include_path + 'index.html',
|
||||
'###URL_SUB###': static_include_path + subreddit + '/index.html',
|
||||
'###URL_SUB_CMNT###': static_include_path + subreddit + '/index-' + sort_indexes['num_comments']['slug'] + '/index.html',
|
||||
'###URL_SUB_DATE###': static_include_path + subreddit + '/index-' + sort_indexes['created_utc']['slug'] + '/index.html',
|
||||
'###URL_SEARCH###': static_include_path + subreddit + '/search.html',
|
||||
'###HTML_SUBS_MENU###': subs_menu_html,
|
||||
'###HTML_SELFTEXT###': selftext_html,
|
||||
'###HTML_COMMENTS###': comments_html,
|
||||
'###HTML_AUTHOR_URL###': author_link_html,
|
||||
'###HTML_TITLE###': html_title,
|
||||
}
|
||||
html = template_link
|
||||
for key, value in link_data_map.items():
|
||||
html = html.replace(key, value)
|
||||
|
||||
# write html
|
||||
# reddit: https://www.reddit.com/r/conspiracy/comments/8742iv/happening_now_classmate_former_friend_of/
|
||||
# archive: r/conspiracy/comments/8/7/4/2/i/v/happening_now_classmate_former_friend_of.html
|
||||
idpath = '/'.join(list(link['id']))
|
||||
filepath = link['permalink'].strip('/') + '.html'
|
||||
filepath = filepath.replace(link['id'], idpath)
|
||||
if not os.path.isfile(filepath):
|
||||
os.makedirs(os.path.dirname(filepath), exist_ok=True)
|
||||
with open(filepath, 'w') as file:
|
||||
file.write(html)
|
||||
# print('wrote %s %s' % (created.strftime('%Y-%m-%d'), filepath))
|
||||
|
||||
return True
|
||||
|
||||
def write_subreddit_search_page(subreddit, subs, link_index, stat_sub_filtered_links, stat_sub_comments):
|
||||
if len(link_index) == 0:
|
||||
return True
|
||||
|
||||
# name sort?
|
||||
links = sorted(link_index, key=lambda k: re.sub(r'\W+', '', k['title']).lower())
|
||||
|
||||
# render subreddits list
|
||||
subs_menu_html = ''
|
||||
for sub in subs:
|
||||
sub_url = '../' + sub + '/index.html'
|
||||
subs_menu_html += template_sub_link.replace('###URL_SUB###', sub_url).replace('###SUB###', sub)
|
||||
|
||||
links_html = ''
|
||||
for l in links:
|
||||
link_comments_url = l['permalink'].strip('/').replace('r/' + subreddit + '/', '')
|
||||
idpath = '/'.join(list(l['id']))
|
||||
link_comments_url = link_comments_url.replace(l['id'], idpath)
|
||||
link_comments_url += '.html'
|
||||
index_link_data_map = {
|
||||
'###TITLE###': l['title'],
|
||||
'###URL###': link_comments_url,
|
||||
}
|
||||
link_html = template_search_link
|
||||
for key, value in index_link_data_map.items():
|
||||
link_html = link_html.replace(key, value)
|
||||
links_html += link_html + '\n'
|
||||
|
||||
index_page_data_map = {
|
||||
'###INCLUDE_PATH###': '../',
|
||||
'###TITLE###': 'search',
|
||||
'###SUB###': subreddit,
|
||||
'###ARCH_NUM_POSTS###': str(stat_sub_filtered_links),
|
||||
'###ARCH_NUM_COMMENTS###': str(stat_sub_comments),
|
||||
'###URL_SUBS###': '../index.html',
|
||||
'###URL_PROJECT###': url_project,
|
||||
'###URL_IDX_SCORE###': 'index.html',
|
||||
'###URL_IDX_CMNT###': 'index-' + sort_indexes['num_comments']['slug'] + '/index.html',
|
||||
'###URL_IDX_DATE###': 'index-' + sort_indexes['created_utc']['slug'] + '/index.html',
|
||||
'###URL_SEARCH###': 'search.html',
|
||||
'###URL_IDX_SCORE_CSS###': '',
|
||||
'###URL_IDX_CMNT_CSS###': '',
|
||||
'###URL_IDX_DATE_CSS###': '',
|
||||
'###URL_SEARCH_CSS###': 'active',
|
||||
'###HTML_LINKS###': links_html,
|
||||
'###HTML_SUBS_MENU###': subs_menu_html,
|
||||
}
|
||||
page_html = template_search
|
||||
for key, value in index_page_data_map.items():
|
||||
page_html = page_html.replace(key, value)
|
||||
|
||||
# write file
|
||||
filename = 'search.html'
|
||||
filepath = 'r/' + subreddit + '/' + filename
|
||||
if not os.path.isfile(filepath):
|
||||
os.makedirs(os.path.dirname(filepath), exist_ok=True)
|
||||
with open(filepath, 'w') as file:
|
||||
file.write(page_html)
|
||||
# print('wrote %s, %s links' % (filepath, len(links)))
|
||||
return True
|
||||
|
||||
def write_user_page(subs, user_index):
|
||||
if len(user_index.keys()) == 0:
|
||||
return False
|
||||
|
||||
# subreddits list
|
||||
subs_menu_html = ''
|
||||
for sub in subs:
|
||||
sub_url = '../' + sub['name'] + '/index.html'
|
||||
subs_menu_html += template_sub_link.replace('###URL_SUB###', sub_url).replace('###SUB###', sub['name'])
|
||||
|
||||
for user in user_index.keys():
|
||||
links = user_index[user]
|
||||
links.sort(key=lambda k: (int(k['score']) if k['score'] != '' else sort_indexes['score']['default']), reverse=True)
|
||||
|
||||
links_html = ''
|
||||
for l in links:
|
||||
|
||||
author_link_html = template_user_url
|
||||
author_url = l['author'] + '.html'
|
||||
author_link_html = author_link_html.replace('###URL_AUTHOR###', author_url).replace('###AUTHOR###', l['author'])
|
||||
|
||||
link_comments_url = '../' + l['permalink'].strip('/').strip('r/')
|
||||
idpath = '/'.join(list(l['id']))
|
||||
link_comments_url = link_comments_url.replace(l['id'], idpath)
|
||||
link_comments_url += '.html'
|
||||
link_url = l['url']
|
||||
if l['is_self'] is True or l['is_self'] == 'True':
|
||||
link_url = link_comments_url
|
||||
|
||||
link_data_map = {
|
||||
'###TITLE###': l['title'],
|
||||
'###URL###': link_url,
|
||||
'###URL_COMMENTS###': link_comments_url,
|
||||
'###SCORE###': l['score'],
|
||||
'###NUM_COMMENTS###': l['num_comments'] if int(l['num_comments']) > 0 else 0,
|
||||
'###DATE###': datetime.utcfromtimestamp(int(l['created_utc'])).strftime('%Y-%m-%d'),
|
||||
'###SUB###': l['subreddit'],
|
||||
'###SUB_URL###': '../' + l['subreddit'] + '/index.html',
|
||||
'###HTML_AUTHOR_URL###': author_link_html,
|
||||
}
|
||||
link_html = template_user_page_link
|
||||
for key, value in link_data_map.items():
|
||||
link_html = link_html.replace(key, value)
|
||||
links_html += link_html + '\n'
|
||||
|
||||
page_data_map = {
|
||||
'###INCLUDE_PATH###': '../',
|
||||
'###TITLE###': 'user/' + user,
|
||||
'###ARCH_NUM_POSTS###': str(len(links)),
|
||||
'###URL_USER###': user + '.html',
|
||||
'###URL_SUBS###': '../index.html',
|
||||
'###URL_PROJECT###': url_project,
|
||||
'###HTML_LINKS###': links_html,
|
||||
'###HTML_SUBS_MENU###': subs_menu_html,
|
||||
}
|
||||
page_html = template_user
|
||||
for key, value in page_data_map.items():
|
||||
page_html = page_html.replace(key, value)
|
||||
|
||||
filepath = 'r/user/' + user + '.html'
|
||||
if not os.path.isfile(filepath):
|
||||
os.makedirs(os.path.dirname(filepath), exist_ok=True)
|
||||
with open(filepath, 'w') as file:
|
||||
file.write(page_html)
|
||||
# print('wrote %s' % (filepath))
|
||||
|
||||
return True
|
||||
|
||||
def write_index(subs):
|
||||
if len(subs) == 0:
|
||||
return False
|
||||
subs.sort(key=lambda k: k['name'].casefold())
|
||||
|
||||
stat_num_links = 0
|
||||
links_html = ''
|
||||
subs_menu_html = ''
|
||||
for sub in subs:
|
||||
sub_url = sub['name'] + '/index.html'
|
||||
links_html += template_index_sub.replace('#URL_SUB#', sub_url).replace('#SUB#', sub['name']).replace('#NUM_LINKS#', str(sub['num_links']))
|
||||
subs_menu_html += template_sub_link.replace('###URL_SUB###', sub_url).replace('###SUB###', sub['name'])
|
||||
stat_num_links += sub['num_links']
|
||||
|
||||
index_page_data_map = {
|
||||
'###INCLUDE_PATH###': '',
|
||||
'###TITLE###': 'subreddits',
|
||||
'###URL_SUBS###': 'index.html',
|
||||
'###URL_PROJECT###': url_project,
|
||||
'###ARCH_NUM_POSTS###': str(stat_num_links),
|
||||
'###HTML_LINKS###': links_html,
|
||||
'###HTML_SUBS_MENU###': subs_menu_html,
|
||||
}
|
||||
page_html = template_index
|
||||
for key, value in index_page_data_map.items():
|
||||
page_html = page_html.replace(key, value)
|
||||
|
||||
filepath = 'r/index.html'
|
||||
if not os.path.isfile(filepath):
|
||||
os.makedirs(os.path.dirname(filepath), exist_ok=True)
|
||||
with open(filepath, 'w') as file:
|
||||
file.write(page_html)
|
||||
# print('wrote %s' % (filepath))
|
||||
|
||||
return True
|
||||
|
||||
# a 'top' comments sort with orphaned comments (incomplete data) rendered last
|
||||
# only remove deleted comments if no children
|
||||
#
|
||||
def sort_comments(comments, hide_deleted_comments=False):
    sorted_comments = []
    if len(comments) == 0:
        return sorted_comments
    parent_map = {}
    id_map = {}
    top_level_comments = []
    link_id = comments[0]['link_id']
    depth = 0

    for c in comments:
        c['depth'] = depth
        id_map[c['id']] = c
        parent_map[c['id']] = c['parent_id']
        # add stickied comments first
        if c['stickied'].lower() == 'true':
            sorted_comments.append(c)
        # store top level comments
        elif c['parent_id'] == c['link_id']:
            top_level_comments.append(c)

    # sort non stickied top level comments by score
    if len(top_level_comments) > 0:
        top_level_comments = sorted(top_level_comments, key=lambda k: (int(k['score']) if k['score'] != '' else 1), reverse=True)
        sorted_comments += top_level_comments

    # add each top level comment's child comments
    sorted_linear_comments = []
    for c in sorted_comments:
        if hide_deleted_comments and c['body'] in removed_content_identifiers and 't1_' + c['id'] not in parent_map.values():
            pass
        else:
            sorted_linear_comments.append(c)
            child_comments = get_comment_tree_list([], depth + 1, c, id_map, parent_map, hide_deleted_comments)
            if len(child_comments) > 0:
                sorted_linear_comments += child_comments

    # add orphaned comments whose parent is neither the link nor a fetched comment.
    # parent_id looks like 't1_abc123'; split off the type prefix to get the bare id
    # (str.strip('t1_') would strip any of the characters 't', '1', '_' instead).
    for c in comments:
        if c['parent_id'] != link_id and c['parent_id'].split('_', 1)[-1] not in id_map.keys():
            if hide_deleted_comments and c['body'] in removed_content_identifiers:
                continue
            sorted_linear_comments.append(c)

    # print('sort_comments() in %s out %s show deleted: %s' % (len(comments), len(sorted_comments), hide_deleted_comments))
    return sorted_linear_comments

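# Depth-first helper for sort_comments(): appends parent_comment's children,
# sorted by score, to tree, tagging each with its depth and recursing so the
# whole thread comes out as one flat, render-ready list.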
def get_comment_tree_list(tree, depth, parent_comment, id_map, parent_map, hide_deleted_comments):
    parent_id = 't1_' + parent_comment['id']
    child_comments = []
    for key, value in parent_map.items():
        if value == parent_id:
            if hide_deleted_comments and id_map[key]['body'] in removed_content_identifiers and 't1_' + key not in parent_map.values():
                pass
            else:
                child_comments.append(id_map[key])

    # sort children by score
    # TODO: sort by score and # of child comments
    if len(child_comments) > 0:
        child_comments = sorted(child_comments, key=lambda k: (int(k['score']) if k['score'] != '' else 1), reverse=True)
        for child_comment in child_comments:
            child_comment['depth'] = depth
            tree.append(child_comment)
            tree = get_comment_tree_list(tree, depth + 1, child_comment, id_map, parent_map, hide_deleted_comments)
    return tree

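# Illustrative filter semantics (not part of the original file): with both
# --min-score 100 and --min-comments 50 set, a post with score 300 and 5
# comments is kept (OR); with only --min-score 100 set, score must be >= 100.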
def validate_link(link, min_score=0, min_comments=0):
    if not link:
        return False
    elif 'id' not in link.keys():
        return False
    # apply multiple conditions as an OR: keep high score / low comment and high comment / low score posts
    if min_score > 0 and min_comments > 0:
        if int(link['score']) < min_score and int(link['num_comments']) < min_comments:
            return False
    else:
        if min_score > 0 and int(link['score']) < min_score:
            return False
        if min_comments > 0 and int(link['num_comments']) < min_comments:
            return False

    return True

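# Reads one day of archived data for a subreddit. Assumed on-disk layout
# (as written by fetch_links.py):
#   data/<subreddit>/YYYY/MM/DD/<source_data_links>   that day's links CSV
#   data/<subreddit>/YYYY/MM/DD/<link id>.csv         one comments CSV per link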
def load_links(date, subreddit):
    links = []
    if not date or not subreddit:
        return links

    date_path = date.strftime("%Y/%m/%d")
    daily_path = 'data/' + subreddit + '/' + date_path
    daily_links_path = daily_path + '/' + source_data_links
    if os.path.isfile(daily_links_path):
        links = []
        with open(daily_links_path, 'r') as links_file:
            links_reader = csv.DictReader(links_file)
            for link_row in links_reader:
                comments = []
                comments_file_path = daily_path + '/' + link_row['id'] + '.csv'
                if os.path.isfile(comments_file_path):
                    with open(comments_file_path, 'r') as comments_file:
                        comments_reader = csv.DictReader(comments_file)
                        for comment_row in comments_reader:
                            comments.append(comment_row)
                link_row['comments'] = comments
                links.append(link_row)
    return links

def get_subs():
    subs = []
    if not os.path.isdir('data'):
        print('ERROR: no data, run fetch_links.py first')
        return subs
    return [d.name for d in os.scandir('data') if d.is_dir()]

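# Builds pager markup for paginated listings: previous / skip-back links, a
# window of page numbers around the current page, then skip-forward / next
# links. Page 1 is index.html, later pages are index-<n>.html; the
# 'd-none d-sm-block' classes assume a Bootstrap-style stylesheet.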
def get_pager_html(page_num=1, pages=1):
    html_pager = ''

    # previous
    css = ''
    if page_num == 1:
        css = 'disabled'
    url = 'index'
    if page_num - 1 > 1:
        url += '-' + str(page_num - 1)
    url += '.html'
    html_pager += template_index_pager_link.replace('#URL#', url).replace('#TEXT#', '‹').replace('#CSS_CLASS#', css)

    # skip back
    css = ''
    prev_skip = page_num - pager_skip
    if prev_skip < 1:
        prev_skip = 1
    if page_num == 1:
        css = 'disabled'
    url = 'index'
    if prev_skip > 1:
        url += '-' + str(prev_skip)
    url += '.html'
    html_pager += template_index_pager_link.replace('#URL#', url).replace('#TEXT#', '‹‹').replace('#CSS_CLASS#', css)

    # skip back far
    css = ''
    prev_skip = page_num - pager_skip_long
    if prev_skip < 1:
        prev_skip = 1
    if page_num == 1:
        css = 'disabled'
    url = 'index'
    if prev_skip > 1:
        url += '-' + str(prev_skip)
    url += '.html'
    html_pager += template_index_pager_link.replace('#URL#', url).replace('#TEXT#', '‹‹‹').replace('#CSS_CLASS#', css)

    # n-1
    start = -2
    if page_num + 1 > pages:
        start -= 1
    if page_num + 2 > pages:
        start -= 1
    for prev_page_num in range(start, 0):
        if page_num + prev_page_num > 0:
            css = ''
            url = 'index'
            if page_num + prev_page_num > 1:
                url += '-' + str(page_num + prev_page_num)
            url += '.html'
            if prev_page_num < -1:
                css = 'd-none d-sm-block'
            html_pager += template_index_pager_link.replace('#URL#', url).replace('#TEXT#', str(page_num + prev_page_num)).replace('#CSS_CLASS#', css)

    # n
    url = 'index'
    if page_num > 1:
        url += '-' + str(page_num)
    url += '.html'
    html_pager += template_index_pager_link.replace('#URL#', url).replace('#TEXT#', str(page_num)).replace('#CSS_CLASS#', 'active')

    # n + 1
    css = ''
    end = 3
    if page_num - 1 < 1:
        end += 1
    if page_num - 2 < 1:
        end += 1
    for next_page_num in range(1, end):
        if page_num + next_page_num <= pages:
            if next_page_num > 1:
                css = 'd-none d-sm-block'
            html_pager += template_index_pager_link.replace('#URL#', 'index' + '-' + str(page_num + next_page_num) + '.html').replace('#TEXT#', str(page_num + next_page_num)).replace('#CSS_CLASS#', css)

    # skip forward far
    next_skip = page_num + pager_skip_long
    css = ''
    if page_num == pages:
        css = 'disabled'
    if next_skip > pages:
        next_skip = pages
    url = 'index'
    if next_skip > 1:
        url += '-' + str(next_skip)
    url += '.html'
    html_pager += template_index_pager_link.replace('#URL#', url).replace('#TEXT#', '›››').replace('#CSS_CLASS#', css)

    # skip forward
    next_skip = page_num + pager_skip
    css = ''
    if page_num == pages:
        css = 'disabled'
    if next_skip > pages:
        next_skip = pages
    url = 'index'
    if next_skip > 1:
        url += '-' + str(next_skip)
    url += '.html'
    html_pager += template_index_pager_link.replace('#URL#', url).replace('#TEXT#', '››').replace('#CSS_CLASS#', css)

    # next
    css = ''
    next_num = page_num + 1
    if page_num == pages:
        css = 'disabled'
        next_num = pages
    html_pager += template_index_pager_link.replace('#URL#', 'index' + '-' + str(next_num) + '.html').replace('#TEXT#', '›').replace('#CSS_CLASS#', css)

    return html_pager

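# Example: list(chunks([1, 2, 3, 4, 5], 2)) -> [[1, 2], [3, 4], [5]]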
def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i + n]

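# Illustrative invocation (flag values are examples, not defaults):
#   python write_html.py --min-score 50 --hide-deleted-comments
# renders only posts scoring at least 50 and drops deleted/removed comments
# where possible.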
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--min-score', default=0, help='only render posts with at least this score, default 0')
    parser.add_argument('--min-comments', default=0, help='only render posts with at least this many comments, default 0')
    parser.add_argument('--hide-deleted-comments', action='store_true', help='exclude deleted and removed comments where possible')
    args = parser.parse_args()

    hide_deleted_comments = args.hide_deleted_comments
    min_score = int(args.min_score)
    min_comments = int(args.min_comments)

    generate_html(min_score, min_comments, hide_deleted_comments)