Clean up overlapping users script

2025-08-15 17:40:15 -04:00 · 2024-04-24 18:11:05 -07:00 · 2024-04-24 18:11:05 -07:00 · b54a2483dc
commit b54a2483dc
parent d67260b9ac
1 changed files with 71 additions and 55 deletions
--- a/scripts/find_overlapping_users.py
+++ b/scripts/find_overlapping_users.py
@ -1,3 +1,4 @@
+import sys
 from collections import defaultdict
 from datetime import datetime, timedelta
 import time
@ -6,13 +7,19 @@ import logging.handlers
 import zstandard
 import json

-input_files = [
-	r"\\MYCLOUDPR4100\Public\reddit\subreddits23\trading212_comments.zst",
-	r"\\MYCLOUDPR4100\Public\reddit\subreddits23\Fire_comments.zst",
-	r"\\MYCLOUDPR4100\Public\reddit\subreddits23\IAmTheMainCharacter_comments.zst",
-	r"\\MYCLOUDPR4100\Public\reddit\subreddits23\BrightonHoveAlbion_comments.zst",
+# IMPORTANT SETUP INSTRUCTIONS
+# change the folder line to the folder where the files are stored
+# change the subreddits to the list of subreddits, one per line. The case must exactly match, ie, for r/AskReddit, put "AskReddit"
+# the files in the folder must match the format from the torrent, subreddit_type.zst, like AskReddit_comments.zst
+# the script will look for both comments and submissions files for each subreddit
+folder = r"\\MYCLOUDPR4100\Public\reddit\subreddits23"
+subreddits = [
+	"DPH",
+	"dxm",
 ]
 ignored_users = {'[deleted]', 'automoderator'}
+# this is a list of users to ignore when doing the comparison. Most popular bots post in many subreddits and aren't the person you're looking for
+# here's a good start, but add bots to your list as you encounter them https://github.com/Watchful1/PushshiftDumps/blob/master/scripts/ignored.txt
 ignored_users_file = "ignored.txt"
 min_comments_per_sub = 1
 file_name = "users.txt"
@ -66,35 +73,61 @@ def read_lines_zst(file_name):
 		reader.close()


+def get_commenters_from_file(subreddit_file, subreddit_commenters, total_lines):
+	file_lines = 0
+	created = None
+	file_size = os.stat(subreddit_file).st_size
+	for line, file_bytes_processed in read_lines_zst(subreddit_file):
+		total_lines += 1
+		file_lines += 1
+		if total_lines % 100000 == 0:
+			log.info(f"{total_lines:,}: {subreddit_file}: {created.strftime('%Y-%m-%d %H:%M:%S')} : {file_lines:,} : {(file_bytes_processed / file_size) * 100:.0f}%")
+
+		try:
+			obj = json.loads(line)
+			created = datetime.utcfromtimestamp(int(obj['created_utc']))
+
+			if obj['author'].lower() not in ignored_users:
+				subreddit_commenters[obj['author']] += 1
+		except (KeyError, json.JSONDecodeError) as err:
+			pass
+	log.info(f"{total_lines:,}: {subreddit_file}: {created.strftime('%Y-%m-%d %H:%M:%S')} : {file_lines:,} : 100%")
+	return total_lines
+
+
 if __name__ == "__main__":
+	log.info(f"Subreddit's folder: {folder}")
+	if len(subreddits) <= 10:
+		log.info(f"Finding overlapping users in {', '.join(subreddits)}")
+	else:
+		log.info(f"Finding overlapping users in {len(subreddits)} subreddits")
+	if require_first_subreddit:
+		log.info(f"Finding users from the first subreddit that are in any of the other subreddits")
+	log.info(f"Minimum comments per subreddit set to {min_comments_per_sub}")
+	log.info(f"Outputting to {file_name}")
+
 	if os.path.exists(ignored_users_file):
 		with open(ignored_users_file) as fh:
 			for user in fh.readlines():
 				ignored_users.add(user.strip().lower())
+		log.info(f"Loaded {len(ignored_users)} ignored users from {ignored_users_file}")

 	commenterSubreddits = defaultdict(int)
 	is_first = True
 	total_lines = 0
-	for subreddit_file in input_files:
-		file_lines = 0
-		created = None
-		file_size = os.stat(subreddit_file).st_size
+	for subreddit in subreddits:
+		subreddit_exists = False
 		commenters = defaultdict(int)
-		for line, file_bytes_processed in read_lines_zst(subreddit_file):
-			total_lines += 1
-			file_lines += 1
-			if total_lines % 100000 == 0:
-				log.info(f"{total_lines:,}: {subreddit_file}: {created.strftime('%Y-%m-%d %H:%M:%S')} : {file_lines:,} : {(file_bytes_processed / file_size) * 100:.0f}%")
-
-			try:
-				obj = json.loads(line)
-				created = datetime.utcfromtimestamp(int(obj['created_utc']))
-
-				if obj['author'].lower() not in ignored_users:
-					commenters[obj['author']] += 1
-			except (KeyError, json.JSONDecodeError) as err:
-				pass
-		log.info(f"{total_lines:,}: {subreddit_file}: {created.strftime('%Y-%m-%d %H:%M:%S')} : {file_lines:,} : 100%")
+		for file_type in ["submissions", "comments"]:
+			subreddit_file = os.path.join(folder, f"{subreddit}_{file_type}.zst")
+			if not os.path.exists(subreddit_file):
+				log.info(f"{file_type} for {subreddit} does not exist, skipping")
+				continue
+			subreddit_exists = True
+			total_lines = get_commenters_from_file(subreddit_file, commenters, total_lines)
+		if not subreddit_exists:
+			log.error(f"Subreddit {subreddit} has no files, aborting")
+			sys.exit(0)

 		for commenter in commenters:
 			if require_first_subreddit and not is_first and commenter not in commenterSubreddits:
@ -106,48 +139,31 @@ if __name__ == "__main__":
 	if require_first_subreddit:
 		count_found = 0
 		with open(file_name, 'w') as txt:
-			txt.write(f"Commenters in r/{input_files[0]} and at least one of r/{(', '.join(input_files))}\n")
+			txt.write(f"Commenters in r/{subreddits[0]} and at least one of {(', '.join(subreddits))}\n")
 			for commenter, countSubreddits in commenterSubreddits.items():
 				if countSubreddits >= 2:
 					count_found += 1
 					txt.write(f"{commenter}\n")
-		log.info(f"{count_found} commenters in r/{input_files[0]} and at least one of r/{(', '.join(input_files))}")
+		log.info(f"{count_found} commenters in r/{subreddits[0]} and at least one of {(', '.join(subreddits))}")

 	else:
 		sharedCommenters = defaultdict(list)
 		for commenter, countSubreddits in commenterSubreddits.items():
-			if countSubreddits >= len(input_files) - 2:
+			if countSubreddits >= 2:
 				sharedCommenters[countSubreddits].append(commenter)

-		commentersAll = len(sharedCommenters[len(input_files)])
-		commentersMinusOne = len(sharedCommenters[len(input_files) - 1])
-		commentersMinusTwo = len(sharedCommenters[len(input_files) - 2])
-
-		log.info(f"{commentersAll} commenters in all subreddits, {commentersMinusOne} in all but one, {commentersMinusTwo} in all but 2. Writing output to {file_name}")
-
 		with open(file_name, 'w') as txt:
-			if commentersAll == 0:
-				txt.write(f"No commenters in all subreddits\n")
-			else:
-				txt.write(f"{commentersAll} commenters in all subreddits\n")
-				for user in sorted(sharedCommenters[len(input_files)], key=str.lower):
-					txt.write(f"{user}\n")
-			txt.write("\n")
-
-			if commentersAll < 10 and len(input_files) > 2:
-				if commentersMinusOne == 0:
-					txt.write(f"No commenters in all but one subreddits\n")
+			log.info(f"Writing output to {file_name}")
+			for i in range(len(subreddits)):
+				commenters = len(sharedCommenters[len(subreddits) - i])
+				inner_str = f"but {i} " if i != 0 else ""
+				log.info(f"{commenters} commenters in all {inner_str}subreddits")
+				if commenters == 0:
+					txt.write(f"No commenters in all {inner_str}subreddits\n")
 				else:
-					txt.write(f"{commentersMinusOne} commenters in all but one subreddits\n")
-					for user in sorted(sharedCommenters[len(input_files) - 1], key=str.lower):
+					txt.write(f"{commenters} commenters in all {inner_str}subreddits\n")
+					for user in sorted(sharedCommenters[len(subreddits) - i], key=str.lower):
 						txt.write(f"{user}\n")
 				txt.write("\n")
-
-				if commentersMinusOne < 10:
-					if commentersMinusTwo == 0:
-						txt.write(f"No commenters in all but two subreddits\n")
-					else:
-						txt.write(f"{commentersMinusTwo} commenters in all but two subreddits\n")
-						for user in sorted(sharedCommenters[len(input_files) - 2], key=str.lower):
-							txt.write(f"{user}\n")
-					txt.write("\n")
+				if commenters > 3:
+					break