mirror of
https://github.com/Watchful1/PushshiftDumps.git
synced 2025-08-02 03:16:12 -04:00
Bit of other work
This commit is contained in:
parent
2bae2a38d2
commit
c7aa694631
7 changed files with 483 additions and 0 deletions
24
personal/subreddits_per_month.py
Normal file
24
personal/subreddits_per_month.py
Normal file
|
@ -0,0 +1,24 @@
|
|||
import os
|
||||
from collections import defaultdict
|
||||
|
||||
|
||||
def _tally_file(input_path, lines_seen=0, report_every=1000000):
    """Count how often each whitespace-stripped line occurs in one file.

    Args:
        input_path: Path of the text file to read, one entry per line.
        lines_seen: Running line total carried over from files already
            processed; progress is reported against this global total.
        report_every: Print a progress line each time the running total
            reaches a multiple of this value.

    Returns:
        A ``(counts, lines_seen)`` tuple: the per-entry occurrence counts
        for this file and the updated running line total.
    """
    subreddits = defaultdict(int)
    # Explicit encoding so the result does not depend on the platform default.
    with open(input_path, 'r', encoding='utf-8') as input_handle:
        for line in input_handle:
            lines_seen += 1
            subreddits[line.strip()] += 1
            if lines_seen % report_every == 0:
                print(f"{lines_seen} : {input_path}")
    return subreddits, lines_seen


def _write_counts(output_path, subreddits):
    """Write one ``<entry> <count>`` line per entry, highest count first.

    Entries with equal counts keep the order in which they were first seen,
    because ``sorted`` is stable even with ``reverse=True``.
    """
    ranked = sorted(subreddits.items(), key=lambda item: item[1], reverse=True)
    with open(output_path, 'w', encoding='utf-8') as output_handle:
        output_handle.writelines(
            f"{subreddit} {count}\n" for subreddit, count in ranked
        )


if __name__ == "__main__":
    input_folder = r"\\MYCLOUDPR4100\Public\pushshift_counts"
    output_folder = r"\\MYCLOUDPR4100\Public\pushshift_counts_summed"
    # Create the destination up front; otherwise the first write fails only
    # after an entire input file has already been read.
    os.makedirs(output_folder, exist_ok=True)
    lines = 0
    for subdir, _dirs, files in os.walk(input_folder):
        for file_name in files:
            input_path = os.path.join(subdir, file_name)
            # NOTE(review): only the bare file name is used here, so files
            # with the same name in different subfolders would overwrite
            # each other's output - confirm the input tree has unique names.
            output_path = os.path.join(output_folder, f"{file_name}.txt")
            print(f"{lines} : {input_path}")
            subreddits, lines = _tally_file(input_path, lines)
            _write_counts(output_path, subreddits)
|
Loading…
Add table
Add a link
Reference in a new issue