mirror of
https://github.com/Watchful1/PushshiftDumps.git
synced 2025-07-04 11:26:41 -04:00
55 lines
1.9 KiB
Python
55 lines
1.9 KiB
Python
import json
|
|
|
|
import utils
|
|
import discord_logging
|
|
import pymongo
|
|
import time
|
|
import sys
|
|
from datetime import datetime
|
|
|
|
log = discord_logging.init_logging()


if __name__ == "__main__":
    # Export every comment of the listed subreddits within a UTC date range
    # from MongoDB into one zstandard-compressed, newline-delimited JSON file
    # per subreddit.
    #
    # Usage: <script> <mongo_address>

    # Only `datetime` is imported at file level; `timezone` is needed here to
    # build UTC-aware bounds and to format timestamps without the deprecated
    # `datetime.utcfromtimestamp`.
    from datetime import timezone

    if len(sys.argv) < 2:
        # Fail with a readable message instead of a bare IndexError.
        sys.exit(f"Usage: {sys.argv[0]} <mongo_address>")

    mongo_address = sys.argv[1]  # 192.168.1.131
    client = pymongo.MongoClient(f"mongodb://{mongo_address}:27017", serverSelectionTimeoutMS=5000)
    try:
        log.info(f"Database connected at {mongo_address} on {client.admin.command('serverStatus')['host']}")

        subreddits = [
            "PersonalFinanceCanada",
        ]
        # The filter compares against `created_utc`, so the bounds are pinned
        # to UTC. A naive datetime would be interpreted in the local timezone
        # of whatever machine runs the export.
        start_date = datetime(2020, 1, 1, tzinfo=timezone.utc)
        end_date = datetime(2021, 1, 1, tzinfo=timezone.utc)

        for subreddit in subreddits:
            count = 0
            start_time = time.time()
            cursor = client.reddit_database.comments.find(
                filter={"subreddit": subreddit, "created_utc": {"$gte": int(start_date.timestamp()), "$lt": int(end_date.timestamp())}},
                projection={'_id': False},
                sort=[('created_utc', pymongo.ASCENDING)]
            )
            log.info(f"Got cursor in {int(time.time() - start_time)} seconds")

            output_writer = utils.OutputZst(r"\\MYCLOUDPR4100\Public\reddit_final\{0}_comments.zst".format(subreddit))
            start_time = time.time()
            try:
                for comment in cursor:
                    count += 1
                    # One compact JSON object per line.
                    output_writer.write(json.dumps(comment, separators=(',', ':')))
                    output_writer.write("\n")
                    if count % 10000 == 0:
                        created = datetime.fromtimestamp(int(comment['created_utc']), tz=timezone.utc)
                        log.info(f"{count:,} through {created.strftime('%Y-%m-%d %H:%M:%S')} in {int(time.time() - start_time)} seconds r/{subreddit}")
            finally:
                # Close the writer even when iteration or serialisation fails,
                # so the output handle is never leaked.
                output_writer.close()

            log.info(f"{count:,} in {int(time.time() - start_time)} seconds r/{subreddit}")
    finally:
        # Release the connection pool regardless of how the export ended.
        client.close()
|
|
|
|
|
|
# db.comments.createIndex({subreddit:1}) // remove
|
|
# db.comments.createIndex({subreddit:1, created_utc:1})
|
|
# db.comments.createIndex({author:1, created_utc:1})
|
|
# db.comments.createIndex({id:1})
|
|
# db.submissions.createIndex({subreddit:1, created_utc:1})
|
|
# db.submissions.createIndex({author:1, created_utc:1})
|
|
# db.submissions.createIndex({id:1})
|
|
# db.submissions.createIndex({created_utc:1})
|
|
# db.comments.createIndex({created_utc:1})
|