From ec977a76b22d4a39ea66cde8f55a65d55e7ef06c Mon Sep 17 00:00:00 2001 From: Watchful1 Date: Fri, 22 Mar 2024 19:25:53 -0700 Subject: [PATCH] Didn't mean to commit that --- scripts/filter_file.py | 6 +++--- scripts/find_overlapping_users.py | 14 +++++++++++--- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/scripts/filter_file.py b/scripts/filter_file.py index 25bbcde..999118b 100644 --- a/scripts/filter_file.py +++ b/scripts/filter_file.py @@ -7,9 +7,9 @@ from datetime import datetime import logging.handlers # put the path to the input file, or a folder of files to process all of -input_file = r"\\MYCLOUDPR4100\Public\askreddit_comments_23.zst" +input_file = r"\\MYCLOUDPR4100\Public\askreddit_comments.zst" # put the name or path to the output file. The file extension from below will be added automatically. If the input file is a folder, the output will be treated as a folder as well -output_file = r"\\MYCLOUDPR4100\Public\askreddit_comments_hero" +output_file = r"\\MYCLOUDPR4100\Public\output" # the format to output in, pick from the following options # zst: same as the input, a zstandard compressed ndjson file. Can be read by the other scripts in the repo # txt: an ndjson file, which is a text file with a separate json object on each line. Can be opened by any text editor @@ -79,7 +79,7 @@ field = "body" values = [''] # if you have a long list of values, you can put them in a file and put the filename here. If set this overrides the value list above # if this list is very large, it could greatly slow down the process -values_file = r"\\MYCLOUDPR4100\Public\askreddit_submissions_ids.txt" +values_file = None exact_match = False diff --git a/scripts/find_overlapping_users.py b/scripts/find_overlapping_users.py index 8ee9812..3e9ca2f 100644 --- a/scripts/find_overlapping_users.py +++ b/scripts/find_overlapping_users.py @@ -7,10 +7,13 @@ import zstandard import json input_files = [ - r"\\MYCLOUDPR4100\Public\reddit\subreddits23\baseballcards_comments.zst", - r"\\MYCLOUDPR4100\Public\reddit\subreddits23\classicwow_comments.zst", + r"\\MYCLOUDPR4100\Public\reddit\subreddits23\trading212_comments.zst", + r"\\MYCLOUDPR4100\Public\reddit\subreddits23\Fire_comments.zst", + r"\\MYCLOUDPR4100\Public\reddit\subreddits23\IAmTheMainCharacter_comments.zst", + r"\\MYCLOUDPR4100\Public\reddit\subreddits23\BrightonHoveAlbion_comments.zst", ] -ignored_users = ['[deleted]', 'automoderator'] +ignored_users = {'[deleted]', 'automoderator'} +ignored_users_file = "ignored.txt" min_comments_per_sub = 1 file_name = "users.txt" require_first_subreddit = False # if true, print users that occur in the first subreddit and any one of the following ones. Otherwise just find the most overlap between all subs @@ -64,6 +67,11 @@ def read_lines_zst(file_name): if __name__ == "__main__": + if os.path.exists(ignored_users_file): + with open(ignored_users_file) as fh: + for user in fh.readlines(): + ignored_users.add(user.strip().lower()) + commenterSubreddits = defaultdict(int) is_first = True total_lines = 0