mirror of
https://github.com/Watchful1/PushshiftDumps.git
synced 2025-07-23 14:50:35 -04:00
Reorganize
This commit is contained in:
parent
3700b21b81
commit
4f1d70d34a
19 changed files with 0 additions and 105 deletions
57
personal/mongo/group_subs.py
Normal file
57
personal/mongo/group_subs.py
Normal file
|
@ -0,0 +1,57 @@
|
|||
import json
|
||||
from datetime import datetime
|
||||
import utils
|
||||
import discord_logging
|
||||
import pymongo
|
||||
import time
|
||||
import sys
|
||||
|
||||
log = discord_logging.init_logging()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
mongo_address = sys.argv[1] # 192.168.1.131
|
||||
client = pymongo.MongoClient(f"mongodb://{mongo_address}:27017", serverSelectionTimeoutMS=5000)
|
||||
log.info(f"Database connected at {mongo_address} on {client.admin.command('serverStatus')['host']}")
|
||||
|
||||
count = 0
|
||||
start_time = time.time()
|
||||
start_date = int(datetime(2021, 6, 1).timestamp())
|
||||
cursor = client.reddit_database.submissions.aggregate(
|
||||
[
|
||||
{"$match": {"created_utc": {"$gt": start_date}}},
|
||||
{"$project": {"subreddit": 1, "over_18": {"$cond": ["$over_18", 1, 0]}}},
|
||||
{"$group": {"_id": "$subreddit", "countTotal": {"$count": {}}, "countNsfw": {"$sum": "$over_18"}}},
|
||||
{"$match": {"countTotal": {"$gt": 100}}},
|
||||
],
|
||||
allowDiskUse=True
|
||||
)
|
||||
log.info(f"Got cursor in {int(time.time() - start_time)} seconds")
|
||||
|
||||
start_time = time.time()
|
||||
subreddits = []
|
||||
for subreddit in cursor:
|
||||
subreddit['percent'] = int((subreddit['countNsfw']/subreddit['countTotal'])*100)
|
||||
if subreddit['percent'] >= 10:
|
||||
subreddits.append(subreddit)
|
||||
count += 1
|
||||
if count % 100000 == 0:
|
||||
log.info(f"{count:,} in {int(time.time() - start_time)} seconds")
|
||||
|
||||
log.info(f"{count:,} in {int(time.time() - start_time)} seconds")
|
||||
|
||||
file_out = open(r"\\MYCLOUDPR4100\Public\reddit_final\subreddits.txt", 'w')
|
||||
for subreddit in sorted(subreddits, key=lambda item: (item['percent'], item['countTotal']), reverse=True):
|
||||
file_out.write(f"{subreddit['_id']: <22}{subreddit['countTotal']: <8}{subreddit['countNsfw']: <8}{subreddit['percent']}%\n")
|
||||
file_out.close()
|
||||
|
||||
|
||||
# db.comments.createIndex({subreddit:1}) // remove
|
||||
# db.comments.createIndex({subreddit:1, created_utc:1})
|
||||
# db.comments.createIndex({author:1, created_utc:1})
|
||||
# db.comments.createIndex({id:1})
|
||||
# db.submissions.createIndex({subreddit:1, created_utc:1})
|
||||
# db.submissions.createIndex({author:1, created_utc:1})
|
||||
# db.submissions.createIndex({id:1})
|
||||
# db.submissions.createIndex({created_utc:1})
|
||||
# db.comments.createIndex({created_utc:1})
|
Loading…
Add table
Add a link
Reference in a new issue