Add csv script

This commit is contained in:
Watchful1 2022-02-14 16:04:27 -08:00
parent c08f5f212f
commit 461028b401
6 changed files with 169 additions and 49 deletions

View file

@@ -1,33 +1,24 @@
 import utils
 import discord_logging
 import os
 from datetime import datetime
+from collections import defaultdict

 log = discord_logging.init_logging()

 if __name__ == "__main__":
-	subreddits = {}
-	object_type = "submissions"
-	folder = f"\\\\MYCLOUDPR4100\\Public\\reddit_final\\{object_type}"
-	if not os.path.exists(folder):
-		os.makedirs(folder)
-	input_file = f"\\\\MYCLOUDPR4100\\Public\\reddit_final\\relationships_{object_type}.zst"
+	subreddits = defaultdict(int)
+	input_file = r"\\MYCLOUDPR4100\Public\reddit\comments\RC_2021-06.zst"
 	input_file_size = os.stat(input_file).st_size
 	total_lines = 0
 	for comment, line, file_bytes_processed in utils.read_obj_zst_meta(input_file):
-		if comment['subreddit'] not in subreddits:
-			subreddits[comment['subreddit']] = {'writer': utils.OutputZst(os.path.join(folder, comment['subreddit'] + f"_{object_type}.zst")), 'lines': 0}
-		subreddit = subreddits[comment['subreddit']]
-		subreddit['writer'].write(line)
-		subreddit['writer'].write("\n")
-		subreddit['lines'] += 1
+		subreddits[comment['subreddit']] += 1
 		total_lines += 1
 		if total_lines % 100000 == 0:
 			log.info(f"{total_lines:,} lines, {(file_bytes_processed / input_file_size) * 100:.0f}%")

 	log.info(f"{total_lines:,} lines, 100%")
-	for name, subreddit in subreddits.items():
-		log.info(f"r/{name}: {subreddit['lines']:,} lines")
-		subreddit['writer'].close()
+	for subreddit, count in sorted(subreddits.items(), key=lambda item: item[1] * -1):
+		if count > 1000:
+			log.info(f"r/{subreddit}: {count:,}")

View file

@@ -5,6 +5,7 @@ import discord_logging
 import pymongo
 import time
 import sys
+from datetime import datetime

 log = discord_logging.init_logging()

@@ -14,26 +15,33 @@ if __name__ == "__main__":
 	client = pymongo.MongoClient(f"mongodb://{mongo_address}:27017", serverSelectionTimeoutMS=5000)
 	log.info(f"Database connected at {mongo_address} on {client.admin.command('serverStatus')['host']}")

-	count = 0
-	start_time = time.time()
-	cursor = client.reddit_database.comments.find(
-		filter={"subreddit": "RelationshipsOver35"},
-		projection={'_id': False},
-		sort=[('created_utc', pymongo.ASCENDING)]
-	)
-	log.info(f"Got cursor in {int(time.time() - start_time)} seconds")
+	subreddits = [
+		"PersonalFinanceCanada"
+	]
+	start_date = datetime(2020, 1, 1)
+	end_date = datetime(2021, 1, 1)

-	output_writer = utils.OutputZst(r"\\MYCLOUDPR4100\Public\reddit_final\RelationshipsOver35_comments.zst")
-	start_time = time.time()
-	for comment in cursor:
-		count += 1
-		output_writer.write(json.dumps(comment, separators=(',', ':')))
-		output_writer.write("\n")
-		if count % 100000 == 0:
-			log.info(f"{count,} in {int(time.time() - start_time)} seconds")
+	for subreddit in subreddits:
+		count = 0
+		start_time = time.time()
+		cursor = client.reddit_database.comments.find(
+			filter={"subreddit": subreddit, "created_utc": {"$gte": int(start_date.timestamp()), "$lt": int(end_date.timestamp())}},
+			projection={'_id': False},
+			sort=[('created_utc', pymongo.ASCENDING)]
+		)
+		log.info(f"Got cursor in {int(time.time() - start_time)} seconds")

-	output_writer.close()
-	log.info(f"{count,} in {int(time.time() - start_time)} seconds")
+		output_writer = utils.OutputZst(r"\\MYCLOUDPR4100\Public\reddit_final\{0}_comments.zst".format(subreddit))
+		start_time = time.time()
+		for comment in cursor:
+			count += 1
+			output_writer.write(json.dumps(comment, separators=(',', ':')))
+			output_writer.write("\n")
+			if count % 10000 == 0:
+				log.info(f"{count:,} through {datetime.utcfromtimestamp(int(comment['created_utc'])).strftime('%Y-%m-%d %H:%M:%S')} in {int(time.time() - start_time)} seconds r/{subreddit}")
+		output_writer.close()
+		log.info(f"{count:,} in {int(time.time() - start_time)} seconds r/{subreddit}")
+
+	# db.comments.createIndex({subreddit:1}) // remove
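
The trailing shell note above (db.comments.createIndex) is a reminder to build, and later drop, an index for these exports. Since the new query filters on subreddit plus a created_utc range and sorts on created_utc, a compound index covers it; a one-off sketch in pymongo, assuming the same database and collection names (the commit itself only records the mongo-shell one-liner):

import pymongo

client = pymongo.MongoClient("mongodb://localhost:27017")  # address assumed; the script reads its own mongo_address
client.reddit_database.comments.create_index(
	[("subreddit", pymongo.ASCENDING), ("created_utc", pymongo.ASCENDING)]
)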

personal/extract_file.py Normal file (30 lines added)
View file

@@ -0,0 +1,30 @@
+import utils
+import discord_logging
+import os
+import sys
+from datetime import datetime
+
+log = discord_logging.init_logging()
+
+if __name__ == "__main__":
+	input_file_path = r"\\MYCLOUDPR4100\Public\reddit_final\curiousdrive_submissions.zst"
+	output_file_path = r"\\MYCLOUDPR4100\Public\reddit_final\curiousdrive_submissions.txt"
+	file_size = os.stat(input_file_path).st_size
+
+	file_lines = 0
+	file_bytes_processed = 0
+	created = None
+	inserts = []
+	output_file = open(output_file_path, 'w')
+	for obj, line, file_bytes_processed in utils.read_obj_zst_meta(input_file_path):
+		created = datetime.utcfromtimestamp(int(obj['created_utc']))
+		file_lines += 1
+		output_file.write(line)
+		output_file.write("\n")
+		if file_lines % 100000 == 0:
+			log.info(f"{created.strftime('%Y-%m-%d %H:%M:%S')} : {file_lines:,} : {(file_bytes_processed / file_size) * 100:.0f}%")
+
+	log.info(f"{created.strftime('%Y-%m-%d %H:%M:%S')} : {file_lines:,} : 100%")
+	output_file.close()
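
Since the script copies each raw JSON line into an uncompressed .txt, the output is newline-delimited JSON and can be consumed with a few lines of Python; a usage sketch, with the field names assumed from the usual submission schema:

import json

with open(r"\\MYCLOUDPR4100\Public\reddit_final\curiousdrive_submissions.txt", 'r') as file_handle:
	for line in file_handle:
		submission = json.loads(line)
		print(submission['id'], submission.get('title'))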

View file

@@ -8,11 +8,11 @@ log = discord_logging.init_logging()

 if __name__ == "__main__":
 	subreddits = {}
-	object_type = "submissions"
-	folder = f"\\\\MYCLOUDPR4100\\Public\\reddit_final\\{object_type}"
+	object_type = "comments"
+	folder = f"\\\\MYCLOUDPR4100\\Public\\reddit_final\\ratmanreturns265_{object_type}"
 	if not os.path.exists(folder):
 		os.makedirs(folder)
-	input_file = f"\\\\MYCLOUDPR4100\\Public\\reddit_final\\relationships_{object_type}.zst"
+	input_file = f"\\\\MYCLOUDPR4100\\Public\\reddit_final\\ratmanreturns265_{object_type}.zst"
 	input_file_size = os.stat(input_file).st_size
 	total_lines = 0
 	for comment, line, file_bytes_processed in utils.read_obj_zst_meta(input_file):

View file

@@ -8,18 +8,30 @@
 if __name__ == "__main__":
-	file_path = r"\\MYCLOUDPR4100\Public\reddit\submissions\RS_2011-01.zst"
-	file_size = os.stat(file_path).st_size
+	input_path = r"\\MYCLOUDPR4100\Public\reddit\requests\jeanyp"

-	file_lines = 0
-	file_bytes_processed = 0
-	created = None
-	inserts = []
-	for obj, line, file_bytes_processed in utils.read_obj_zst_meta(file_path):
-		created = datetime.utcfromtimestamp(int(obj['created_utc']))
-		file_lines += 1
-		if file_lines % 100000 == 0:
-			log.info(f"{created.strftime('%Y-%m-%d %H:%M:%S')} : {file_lines:,} : {(file_bytes_processed / file_size) * 100:.0f}%")
+	input_file_paths = []
+	if os.path.isdir(input_path):
+		for subdir, dirs, files in os.walk(input_path):
+			files.sort()
+			for file_name in files:
+				if file_name.endswith(".zst"):
+					input_file_paths.append(os.path.join(subdir, file_name))
+	else:
+		input_file_paths.append(input_path)

-	log.info(f"{created.strftime('%Y-%m-%d %H:%M:%S')} : {file_lines:,} : 100%")
+	files_processed = 0
+	for file_path in input_file_paths:
+		file_name = os.path.basename(file_path)
+		file_size = os.stat(file_path).st_size
+		file_lines = 0
+		file_bytes_processed = 0
+		created = None
+		inserts = []
+		for obj, line, file_bytes_processed in utils.read_obj_zst_meta(file_path):
+			created = datetime.utcfromtimestamp(int(obj['created_utc']))
+			file_lines += 1
+			if file_lines % 100000 == 0:
+				log.info(f"{files_processed}/{len(input_file_paths)}: {file_name} : {created.strftime('%Y-%m-%d %H:%M:%S')} : {file_lines:,} : {(file_bytes_processed / file_size) * 100:.0f}%")
+		log.info(f"{files_processed}/{len(input_file_paths)}: {file_name} : {created.strftime('%Y-%m-%d %H:%M:%S')} : {file_lines:,} : 100%")