PushshiftDumps/personal/subreddits_per_month.py
2023-03-01 21:06:34 -08:00

24 lines
865 B
Python

import os
from collections import defaultdict
def count_subreddits(input_path, lines=0):
    """Tally how many times each distinct line occurs in one input file.

    Every line is stripped of surrounding whitespace and used as a key
    (a subreddit name, judging by the variable names in this script).

    Args:
        input_path: Path of the text file to read, one entry per line.
        lines: Running line total carried over from previously processed
            files; it only drives the progress messages.

    Returns:
        A ``(counts, lines)`` tuple: the mapping of stripped line to its
        number of occurrences in this file, and the updated running total.
    """
    subreddits = defaultdict(int)
    with open(input_path, 'r') as input_handle:
        for line in input_handle:
            lines += 1
            subreddits[line.strip()] += 1
            # Progress heartbeat every million lines across all files.
            if lines % 1000000 == 0:
                print(f"{lines} : {input_path}")
    return subreddits, lines


def write_counts(output_path, subreddits):
    """Write one ``<name> <count>`` line per entry, highest count first.

    ``sorted`` is stable, so entries with equal counts keep the order in
    which they were first seen in the input.

    Args:
        output_path: Path of the file to create or overwrite.
        subreddits: Mapping of name to occurrence count.
    """
    ranked = sorted(subreddits.items(), key=lambda item: item[1], reverse=True)
    with open(output_path, 'w') as output_handle:
        for subreddit, count in ranked:
            output_handle.write(f"{subreddit} {count}\n")


def main(input_folder, output_folder):
    """Summarise every file under ``input_folder`` into ``output_folder``.

    Walks ``input_folder`` recursively. Each file found is counted on its
    own and written to ``<output_folder>/<file_name>.txt``. Only the file
    name is used for the output path, so identically named files in
    different subdirectories overwrite one another.

    Args:
        input_folder: Root directory containing the input files.
        output_folder: Directory receiving the summaries; created if it
            does not exist yet.
    """
    # Without this, the first open(..., 'w') fails when the folder is missing.
    os.makedirs(output_folder, exist_ok=True)

    lines = 0
    for subdir, _dirs, files in os.walk(input_folder):
        for file_name in files:
            input_path = os.path.join(subdir, file_name)
            output_path = os.path.join(output_folder, f"{file_name}.txt")
            print(f"{lines} : {input_path}")
            subreddits, lines = count_subreddits(input_path, lines)
            write_counts(output_path, subreddits)


if __name__ == "__main__":
    main(
        r"\\MYCLOUDPR4100\Public\pushshift_counts",
        r"\\MYCLOUDPR4100\Public\pushshift_counts_summed",
    )