mirror of
https://github.com/Watchful1/PushshiftDumps.git
synced 2025-07-03 19:06:39 -04:00
Short script to sort the subreddit counts
This commit is contained in:
parent
2358bf555b
commit
52d65e3c8d
2 changed files with 22 additions and 2 deletions
|
@ -127,7 +127,6 @@ def read_lines_zst(file_name):
|
|||
# information back to the parent via a queue
|
||||
def process_file(file, queue, field):
|
||||
output_file = None
|
||||
log.debug(f"Starting file: {file.input_path} : {file.file_size:,}")
|
||||
try:
|
||||
for line, file_bytes_processed in read_lines_zst(file.input_path):
|
||||
try:
|
||||
|
@ -149,7 +148,6 @@ def process_file(file, queue, field):
|
|||
|
||||
file.complete = True
|
||||
file.bytes_processed = file.file_size
|
||||
log.debug(f"Finished file: {file.input_path} : {file.file_size:,}")
|
||||
except Exception as err:
|
||||
file.error_message = str(err)
|
||||
queue.put(file)
|
||||
|
|
22
personal/sort_subreddit_counts.py
Normal file
22
personal/sort_subreddit_counts.py
Normal file
|
@ -0,0 +1,22 @@
|
|||
|
||||
|
||||
def load_counts(lines, min_count=10000):
    """Parse ``name<TAB>count`` lines into a dict of names above a threshold.

    Args:
        lines: Iterable of text lines, each holding a name and an integer
            count separated by a single tab.
        min_count: Only names whose count is strictly greater than this
            value are kept.

    Returns:
        Dict mapping name to count, in first-seen order.

    Raises:
        ValueError: If a non-blank line does not contain exactly two
            tab-separated fields, or the count is not an integer.
    """
    counts = {}
    for line in lines:
        stripped = line.strip()
        # Blank lines (commonly a trailing newline at end of file) carry no
        # data; skip them instead of failing on the two-field unpack below.
        if not stripped:
            continue
        name, count_string = stripped.split("\t")
        count = int(count_string)
        if count > min_count:
            counts[name] = count
    return counts


def top_names(counts, limit=20000):
    """Return up to ``limit`` names ordered by descending count.

    Args:
        counts: Dict mapping name to integer count.
        limit: Maximum number of names to return.

    Returns:
        List of names, highest count first. Names with equal counts keep
        their relative order from ``counts`` because ``sorted`` is stable.
    """
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    return [name for name, _ in ranked[:limit]]


if __name__ == '__main__':
    input_file = r"\\MYCLOUDPR4100\Public\field_counts.txt"
    output_file = r"\\MYCLOUDPR4100\Public\field_counts_sorted.txt"

    with open(input_file, 'r') as input_handle:
        subreddits = load_counts(input_handle)

    # Report how many names passed the count threshold.
    print(f"{len(subreddits)}")

    with open(output_file, 'w') as output_handle:
        # One name per line, largest count first, capped by top_names' limit.
        output_handle.writelines(f"{name}\n" for name in top_names(subreddits))
|
Loading…
Add table
Add a link
Reference in a new issue