mirror of
https://github.com/Watchful1/PushshiftDumps.git
synced 2025-07-24 23:25:24 -04:00
Reorganize
This commit is contained in:
parent
3700b21b81
commit
4f1d70d34a
19 changed files with 0 additions and 105 deletions
37
personal/diagnostic/sum_subreddit_counts.py
Normal file
37
personal/diagnostic/sum_subreddit_counts.py
Normal file
|
@@ -0,0 +1,37 @@
|
|||
import os
|
||||
import logging.handlers
|
||||
from collections import defaultdict
|
||||
|
||||
|
||||
# Script-wide logger. It is registered under the name "bot", gets a console
# handler, and is opened up to DEBUG so every message the script emits is shown.
log = logging.getLogger("bot")
log.addHandler(logging.StreamHandler())  # no stream given: writes to sys.stderr
log.setLevel(logging.DEBUG)
|
||||
|
||||
# Script entry point: add up the per-subreddit counts from every file found
# under input_folder and write the grand totals to output_file, largest first.
if __name__ == '__main__':
    input_folder = r"\\MYCLOUDPR4100\Public\pushshift_counts_summed"
    output_file = r"\\MYCLOUDPR4100\Public\subreddit_counts_total.txt"
    subreddits = defaultdict(int)

    # Every file at any depth below input_folder is read as lines of
    # "<subreddit>\t<count>"; counts for the same subreddit are added together.
    for subdir, _dirs, files in os.walk(input_folder):
        for filename in files:
            log.info(f"Processing file: {filename}")
            input_path = os.path.join(subdir, filename)
            # Explicit encoding so parsing does not depend on the locale default.
            with open(input_path, 'r', encoding='utf-8') as input_handle:
                for line_number, line in enumerate(input_handle, start=1):
                    record = line.strip()
                    if not record:
                        # Blank lines (e.g. a trailing empty line) carry no
                        # data; skip them instead of failing on the unpack.
                        continue
                    try:
                        subreddit, count_string = record.split("\t")
                        count = int(count_string)
                    except ValueError as err:
                        # Still a hard failure, but say where the bad data is.
                        raise ValueError(
                            f"Malformed line {line_number} in {input_path}: {record!r}"
                        ) from err
                    subreddits[subreddit] += count

    log.info(f"Total subreddits: {len(subreddits):,}")

    count_written = 0
    with open(output_file, 'w', encoding='utf-8') as output_handle:
        # Highest total first; sorted() is stable, so subreddits with equal
        # totals keep the order in which they were first seen.
        for subreddit, count in sorted(subreddits.items(), key=lambda item: item[1], reverse=True):
            # NOTE(review): input is split on a tab but this writes a single
            # space between name and count — confirm that is the intended format.
            output_handle.write(f"{subreddit} {count}\n")
            count_written += 1
            # Progress message once per million rows written.
            if count_written % 1000000 == 0:
                log.info(f"Written: {count_written:,}/{len(subreddits):,}")

    log.info(f"Written: {count_written:,}/{len(subreddits):,}")
|
Loading…
Add table
Add a link
Reference in a new issue