"""Rank the entries of a tab-separated count file by count.

Each non-blank input line is expected to look like ``name<TAB>count``.
Names whose count is strictly greater than a threshold are kept, ordered
from highest to lowest count, and the leading names are written one per
line to the output file.
"""

DEFAULT_INPUT_FILE = r"\\MYCLOUDPR4100\Public\field_counts.txt"
DEFAULT_OUTPUT_FILE = r"\\MYCLOUDPR4100\Public\field_counts_sorted.txt"


def _load_counts(lines, count_threshold):
    """Parse ``name<TAB>count`` lines into a dict of the counts above a threshold.

    Args:
        lines: Iterable of text lines (for example an open file handle).
        count_threshold: Only counts strictly greater than this are kept.

    Returns:
        A dict mapping name to count. If a name occurs more than once,
        the last occurrence wins.

    Raises:
        ValueError: If a non-blank line does not have exactly two
            tab-separated fields, or its count is not an integer. The
            message carries the 1-based line number.
    """
    counts = {}
    for line_number, raw_line in enumerate(lines, start=1):
        line = raw_line.strip()
        if not line:
            # Blank lines (typically a trailing newline at the end of the
            # file) carry no data; skipping them avoids an unpack error.
            continue
        fields = line.split("\t")
        if len(fields) != 2:
            raise ValueError(
                f"line {line_number}: expected 'name<TAB>count', got {raw_line!r}"
            )
        name, count_string = fields
        try:
            count = int(count_string)
        except ValueError as err:
            raise ValueError(
                f"line {line_number}: invalid count {count_string!r}"
            ) from err
        if count > count_threshold:
            counts[name] = count
    return counts


def main(
    input_file=DEFAULT_INPUT_FILE,
    output_file=DEFAULT_OUTPUT_FILE,
    count_threshold=10000,
    max_written=20000,
):
    """Read counts, print how many passed the threshold, write the top names.

    Args:
        input_file: Path of the tab-separated ``name<TAB>count`` file.
        output_file: Path that receives one name per line, highest count
            first.
        count_threshold: Names with a count less than or equal to this
            value are dropped.
        max_written: Maximum number of names written to ``output_file``.

    Raises:
        ValueError: If the input file contains a malformed line.
        OSError: If either file cannot be opened.
    """
    # An explicit encoding keeps the result independent of the platform's
    # default locale encoding.
    with open(input_file, 'r', encoding='utf-8') as input_handle:
        subreddits = _load_counts(input_handle, count_threshold)

    print(len(subreddits))

    # sorted() is stable, so names with equal counts keep their input order.
    ranked = sorted(subreddits.items(), key=lambda item: item[1], reverse=True)

    with open(output_file, 'w', encoding='utf-8') as output_handle:
        output_handle.writelines(
            f"{subreddit}\n" for subreddit, _ in ranked[:max_written]
        )


if __name__ == '__main__':
    main()