diff --git a/personal/extract_file.py b/personal/compression/extract_file.py similarity index 100% rename from personal/extract_file.py rename to personal/compression/extract_file.py diff --git a/personal/recompress_folder.py b/personal/compression/recompress_folder.py similarity index 100% rename from personal/recompress_folder.py rename to personal/compression/recompress_folder.py diff --git a/personal/count_by_subreddit.py b/personal/count_by_subreddit.py deleted file mode 100644 index 064fce2..0000000 --- a/personal/count_by_subreddit.py +++ /dev/null @@ -1,24 +0,0 @@ -import utils -import discord_logging -import os -from collections import defaultdict - -log = discord_logging.init_logging() - - -if __name__ == "__main__": - subreddits = defaultdict(int) - input_file = r"\\MYCLOUDPR4100\Public\pushshift_working\RC_2022-12.zst" - input_file_size = os.stat(input_file).st_size - total_lines = 0 - for comment, line, file_bytes_processed in utils.read_obj_zst_meta(input_file): - subreddits[comment['subreddit']] += 1 - total_lines += 1 - if total_lines % 100000 == 0: - log.info(f"{total_lines:,} lines, {(file_bytes_processed / input_file_size) * 100:.0f}%") - - log.info(f"{total_lines:,} lines, 100%") - - for subreddit, count in sorted(subreddits.items(), key=lambda item: item[1] * -1): - if count >= 1: - log.info(f"r/{subreddit}: {count:,}") diff --git a/personal/comments_per_day.py b/personal/diagnostic/comments_per_day.py similarity index 100% rename from personal/comments_per_day.py rename to personal/diagnostic/comments_per_day.py diff --git a/personal/compare_lines.py b/personal/diagnostic/compare_lines.py similarity index 100% rename from personal/compare_lines.py rename to personal/diagnostic/compare_lines.py diff --git a/personal/count_fields.py b/personal/diagnostic/count_fields.py similarity index 100% rename from personal/count_fields.py rename to personal/diagnostic/count_fields.py diff --git a/personal/count_subreddits_multiprocess.py b/personal/diagnostic/count_subreddits_multiprocess.py similarity index 100% rename from personal/count_subreddits_multiprocess.py rename to personal/diagnostic/count_subreddits_multiprocess.py diff --git a/personal/sum_subreddit_counts.py b/personal/diagnostic/sum_subreddit_counts.py similarity index 100% rename from personal/sum_subreddit_counts.py rename to personal/diagnostic/sum_subreddit_counts.py diff --git a/personal/test_file.py b/personal/diagnostic/test_file.py similarity index 100% rename from personal/test_file.py rename to personal/diagnostic/test_file.py diff --git a/personal/test_files_multiprocess.py b/personal/diagnostic/test_files_multiprocess.py similarity index 100% rename from personal/test_files_multiprocess.py rename to personal/diagnostic/test_files_multiprocess.py diff --git a/personal/download_pictures.py b/personal/download_pictures.py deleted file mode 100644 index a15fc00..0000000 --- a/personal/download_pictures.py +++ /dev/null @@ -1,20 +0,0 @@ -import utils -import discord_logging -from datetime import datetime -from collections import defaultdict -from urllib.parse import urlparse - -log = discord_logging.init_logging() - -if __name__ == "__main__": - domains = defaultdict(list) - lines = 0 - for submission in utils.read_obj_zst(r"\\MYCLOUDPR4100\Public\guessmybf_submissions.zst"): - if submission['is_self']: - continue - - domain = urlparse(submission['url']).netloc - domains[domain].append(submission['url']) - lines += 1 - - log.info(f"{lines}") diff --git a/personal/export_mongo.py b/personal/mongo/export_mongo.py similarity index 100% rename from personal/export_mongo.py rename to personal/mongo/export_mongo.py diff --git a/personal/group_subs.py b/personal/mongo/group_subs.py similarity index 100% rename from personal/group_subs.py rename to personal/mongo/group_subs.py diff --git a/personal/insert_mongo.py b/personal/mongo/insert_mongo.py similarity index 100% rename from personal/insert_mongo.py rename to personal/mongo/insert_mongo.py diff --git a/personal/objects_per_month.py b/personal/objects_per_month.py deleted file mode 100644 index c458af2..0000000 --- a/personal/objects_per_month.py +++ /dev/null @@ -1,15 +0,0 @@ -import os -from collections import defaultdict - - -if __name__ == "__main__": - input_folder = r"\\MYCLOUDPR4100\Public\pushshift_counts_summed" - for subdir, dirs, files in os.walk(input_folder): - for file_name in files: - items = 0 - input_path = os.path.join(subdir, file_name) - with open(input_path, 'r') as input_handle: - for line in input_handle: - subreddit, count = line.strip().split("\t") - items += int(count) - print(f"{file_name} {items}") diff --git a/personal/sort_subreddit_counts.py b/personal/sort_subreddit_counts.py deleted file mode 100644 index 50a235b..0000000 --- a/personal/sort_subreddit_counts.py +++ /dev/null @@ -1,22 +0,0 @@ - - -if __name__ == '__main__': - input_file = r"\\MYCLOUDPR4100\Public\field_counts.txt" - output_file = r"\\MYCLOUDPR4100\Public\field_counts_sorted.txt" - subreddits = {} - with open(input_file, 'r') as input_handle: - for line in input_handle: - subreddit, count_string = line.strip().split("\t") - count = int(count_string) - if count > 10000: - subreddits[subreddit] = count - - print(f"{len(subreddits)}") - - with open(output_file, 'w') as output_handle: - count_written = 0 - for subreddit, count in sorted(subreddits.items(), key=lambda item: item[1], reverse=True): - output_handle.write(f"{subreddit}\n") - count_written += 1 - if count_written >= 20000: - break diff --git a/personal/subreddits_per_month.py b/personal/subreddits_per_month.py deleted file mode 100644 index 774e8fa..0000000 --- a/personal/subreddits_per_month.py +++ /dev/null @@ -1,24 +0,0 @@ -import os -from collections import defaultdict - - -if __name__ == "__main__": - input_folder = r"\\MYCLOUDPR4100\Public\pushshift_counts" - output_folder = r"\\MYCLOUDPR4100\Public\pushshift_counts_summed" - lines = 0 - for subdir, dirs, files in os.walk(input_folder): - for file_name in files: - subreddits = defaultdict(int) - input_path = os.path.join(subdir, file_name) - output_path = os.path.join(output_folder, f"{file_name}.txt") - print(f"{lines} : {input_path}") - with open(input_path, 'r') as input_handle: - for line in input_handle: - lines += 1 - subreddits[line.strip()] += 1 - if lines % 1000000 == 0: - print(f"{lines} : {input_path}") - - with open(output_path, 'w') as output_handle: - for subreddit, count in sorted(subreddits.items(), key=lambda item: item[1], reverse=True): - output_handle.write(f"{subreddit} {count}\n") diff --git a/personal/copy_listed_files.py b/personal/transform/copy_listed_files.py similarity index 100% rename from personal/copy_listed_files.py rename to personal/transform/copy_listed_files.py diff --git a/personal/split_by_subreddit.py b/personal/transform/split_by_subreddit.py similarity index 100% rename from personal/split_by_subreddit.py rename to personal/transform/split_by_subreddit.py