From ec977a76b22d4a39ea66cde8f55a65d55e7ef06c Mon Sep 17 00:00:00 2001
From: Watchful1 <watchful@watchful.gr>
Date: Fri, 22 Mar 2024 19:25:53 -0700
Subject: [PATCH] Reset accidentally committed paths; add ignored_users_file

---
 scripts/filter_file.py            |  6 +++---
 scripts/find_overlapping_users.py | 14 +++++++++++---
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/scripts/filter_file.py b/scripts/filter_file.py
index 25bbcde..999118b 100644
--- a/scripts/filter_file.py
+++ b/scripts/filter_file.py
@@ -7,9 +7,9 @@ from datetime import datetime
 import logging.handlers
 
 # put the path to the input file, or a folder of files to process all of
-input_file = r"\\MYCLOUDPR4100\Public\askreddit_comments_23.zst"
+input_file = r"\\MYCLOUDPR4100\Public\askreddit_comments.zst"
 # put the name or path to the output file. The file extension from below will be added automatically. If the input file is a folder, the output will be treated as a folder as well
-output_file = r"\\MYCLOUDPR4100\Public\askreddit_comments_hero"
+output_file = r"\\MYCLOUDPR4100\Public\output"
 # the format to output in, pick from the following options
 #   zst: same as the input, a zstandard compressed ndjson file. Can be read by the other scripts in the repo
 #   txt: an ndjson file, which is a text file with a separate json object on each line. Can be opened by any text editor
@@ -79,7 +79,7 @@ field = "body"
 values = ['']
 # if you have a long list of values, you can put them in a file and put the filename here. If set this overrides the value list above
 # if this list is very large, it could greatly slow down the process
-values_file = r"\\MYCLOUDPR4100\Public\askreddit_submissions_ids.txt"
+values_file = None
 exact_match = False
 
 
diff --git a/scripts/find_overlapping_users.py b/scripts/find_overlapping_users.py
index 8ee9812..3e9ca2f 100644
--- a/scripts/find_overlapping_users.py
+++ b/scripts/find_overlapping_users.py
@@ -7,10 +7,13 @@ import zstandard
 import json
 
 input_files = [
-	r"\\MYCLOUDPR4100\Public\reddit\subreddits23\baseballcards_comments.zst",
-	r"\\MYCLOUDPR4100\Public\reddit\subreddits23\classicwow_comments.zst",
+	r"\\MYCLOUDPR4100\Public\reddit\subreddits23\trading212_comments.zst",
+	r"\\MYCLOUDPR4100\Public\reddit\subreddits23\Fire_comments.zst",
+	r"\\MYCLOUDPR4100\Public\reddit\subreddits23\IAmTheMainCharacter_comments.zst",
+	r"\\MYCLOUDPR4100\Public\reddit\subreddits23\BrightonHoveAlbion_comments.zst",
 ]
-ignored_users = ['[deleted]', 'automoderator']
+ignored_users = {'[deleted]', 'automoderator'}
+ignored_users_file = "ignored.txt"
 min_comments_per_sub = 1
 file_name = "users.txt"
 require_first_subreddit = False  # if true, print users that occur in the first subreddit and any one of the following ones. Otherwise just find the most overlap between all subs
@@ -64,6 +67,11 @@ def read_lines_zst(file_name):
 
 
 if __name__ == "__main__":
+	if os.path.exists(ignored_users_file):
+		with open(ignored_users_file) as fh:
+			for user in fh.readlines():
+				ignored_users.add(user.strip().lower())
+
 	commenterSubreddits = defaultdict(int)
 	is_first = True
 	total_lines = 0