Add csv script

This commit is contained in:
Watchful1 2022-02-14 16:04:27 -08:00
parent c08f5f212f
commit 461028b401
6 changed files with 169 additions and 49 deletions

View file

@ -5,6 +5,7 @@ import discord_logging
import pymongo
import time
import sys
from datetime import datetime
log = discord_logging.init_logging()
@ -14,26 +15,33 @@ if __name__ == "__main__":
# NOTE(review): this span looks like a flattened diff of the script's __main__
# body: indentation is lost and the earlier single-subreddit export appears to
# be interleaved with the newer per-subreddit loop. Confirm against the
# repository before treating these lines as one runnable sequence.

# Connect to MongoDB on port 27017 with a 5 second server-selection timeout.
# mongo_address is assigned outside this view.
client = pymongo.MongoClient(f"mongodb://{mongo_address}:27017", serverSelectionTimeoutMS=5000)
# Logs the 'host' field returned by the serverStatus admin command.
log.info(f"Database connected at {mongo_address} on {client.admin.command('serverStatus')['host']}")
count = 0
start_time = time.time()
# Query all comments of one hard-coded subreddit, sorted by created_utc
# ascending, with Mongo's _id field excluded from the returned documents.
cursor = client.reddit_database.comments.find(
filter={"subreddit": "RelationshipsOver35"},
projection={'_id': False},
sort=[('created_utc', pymongo.ASCENDING)]
)
log.info(f"Got cursor in {int(time.time() - start_time)} seconds")
# Subreddits to export; the loop further down builds one output path per entry.
subreddits = [
"PersonalFinanceCanada"
]
# Bounds for the created_utc filter used in the loop below: start is inclusive
# ($gte), end is exclusive ($lt). Both datetimes are naive, so .timestamp()
# interprets them in the machine's local timezone, whereas the progress log
# renders created_utc with utcfromtimestamp.
# NOTE(review): confirm whether local-time bounds are intended.
start_date = datetime(2020, 1, 1)
end_date = datetime(2021, 1, 1)
# utils.OutputZst is defined outside this view; only write() and close() are
# called on it here.
output_writer = utils.OutputZst(r"\\MYCLOUDPR4100\Public\reddit_final\RelationshipsOver35_comments.zst")
# Restart the timer so the progress logs measure time since writing began.
start_time = time.time()
for comment in cursor:
count += 1
# One compact JSON document per line (no spaces after ',' or ':').
output_writer.write(json.dumps(comment, separators=(',', ':')))
output_writer.write("\n")
if count % 100000 == 0:
# NOTE(review): `{count,}` formats the one-element tuple (count,), e.g.
# "(100000,)"; the log lines further down use `{count:,}` to get a
# thousands separator instead.
log.info(f"{count,} in {int(time.time() - start_time)} seconds")
# Per-subreddit export: query, then stream the results to a file named after
# the subreddit.
for subreddit in subreddits:
count = 0
start_time = time.time()
# Same query shape as above, but for the current subreddit and limited to
# start_date <= created_utc < end_date (compared as integer epoch seconds).
cursor = client.reddit_database.comments.find(
filter={"subreddit": subreddit, "created_utc": {"$gte": int(start_date.timestamp()), "$lt": int(end_date.timestamp())}},
projection={'_id': False},
sort=[('created_utc', pymongo.ASCENDING)]
)
log.info(f"Got cursor in {int(time.time() - start_time)} seconds")
# NOTE(review): the next two lines presumably belong to the earlier
# single-subreddit version (close + final count); the tuple-formatting
# `{count,}` appears here as well.
output_writer.close()
log.info(f"{count,} in {int(time.time() - start_time)} seconds")
# The raw string keeps the UNC backslashes literal; {0} is replaced with the
# subreddit name.
output_writer = utils.OutputZst(r"\\MYCLOUDPR4100\Public\reddit_final\{0}_comments.zst".format(subreddit))
start_time = time.time()
for comment in cursor:
count += 1
output_writer.write(json.dumps(comment, separators=(',', ':')))
output_writer.write("\n")
if count % 10000 == 0:
# Progress line: running count, the UTC time of the comment just written,
# elapsed seconds, and the subreddit name.
log.info(f"{count:,} through {datetime.utcfromtimestamp(int(comment['created_utc'])).strftime('%Y-%m-%d %H:%M:%S')} in {int(time.time() - start_time)} seconds r/{subreddit}")
output_writer.close()
log.info(f"{count:,} in {int(time.time() - start_time)} seconds r/{subreddit}")
# db.comments.createIndex({subreddit:1}) // remove