# Mirror of https://github.com/Watchful1/PushshiftDumps.git
# Synced 2025-07-05 03:46:54 -04:00
# 24 lines, 865 B, Python
import os
|
|
from collections import defaultdict
|
|
|
|
|
|
def _write_counts(subreddits, output_path):
    """Write the ``subreddits`` mapping to ``output_path``, highest count first.

    Each output line is the key, a single space, and its count. Keys with
    equal counts keep the order in which they were first seen (the sort is
    stable).
    """
    ranked = sorted(subreddits.items(), key=lambda item: item[1], reverse=True)
    with open(output_path, 'w') as output_handle:
        output_handle.writelines(f"{subreddit} {count}\n" for subreddit, count in ranked)


def main(
    input_folder=r"\\MYCLOUDPR4100\Public\pushshift_counts",
    output_folder=r"\\MYCLOUDPR4100\Public\pushshift_counts_summed",
):
    """Sum identical lines in every file under ``input_folder``.

    Walks ``input_folder`` recursively. For each file found, counts how many
    times each whitespace-stripped line occurs and writes the totals, sorted
    by descending count, to ``<output_folder>/<file_name>.txt``.

    Args:
        input_folder: Root directory to walk. Presumably each file holds one
            subreddit name per line; verify against the producer of these
            files.
        output_folder: Directory that receives one summary file per input
            file. It is created if it does not exist.

    Returns:
        The total number of lines read across all input files.
    """
    # Create the destination up front so a missing folder does not abort the
    # run only after the first (potentially very large) input has been read.
    os.makedirs(output_folder, exist_ok=True)

    # Running total across ALL files; it drives the progress output below.
    lines = 0
    for subdir, _dirs, files in os.walk(input_folder):
        for file_name in files:
            subreddits = defaultdict(int)
            input_path = os.path.join(subdir, file_name)
            # NOTE: only the bare file name is used, so two inputs with the
            # same name in different subdirectories write to the same output.
            output_path = os.path.join(output_folder, f"{file_name}.txt")
            print(f"{lines} : {input_path}")
            with open(input_path, 'r') as input_handle:
                for line in input_handle:
                    lines += 1
                    subreddits[line.strip()] += 1
                    if lines % 1000000 == 0:
                        print(f"{lines} : {input_path}")

            _write_counts(subreddits, output_path)
    return lines


if __name__ == "__main__":
    main()