Clean up overlapping users script

2025-12-19 10:12:53 -05:00 · 2024-04-24 18:11:05 -07:00 · 2024-04-24 18:11:05 -07:00 · b54a2483dc
commit b54a2483dc
parent d67260b9ac
1 changed files with 71 additions and 55 deletions
--- a/scripts/find_overlapping_users.py
+++ b/scripts/find_overlapping_users.py
@ -1,3 +1,4 @@
 import sys
 from collections import defaultdict
 from datetime import datetime, timedelta
 import time
@ -6,13 +7,19 @@ import logging.handlers
 import zstandard
 import json
-input_files = [
+# IMPORTANT SETUP INSTRUCTIONS
-	r"\\MYCLOUDPR4100\Public\reddit\subreddits23\trading212_comments.zst",
+# change the folder line to the folder where the files are stored
-	r"\\MYCLOUDPR4100\Public\reddit\subreddits23\Fire_comments.zst",
+# change the subreddits to the list of subreddits, one per line. The case must exactly match, ie, for r/AskReddit, put "AskReddit"
-	r"\\MYCLOUDPR4100\Public\reddit\subreddits23\IAmTheMainCharacter_comments.zst",
+# the files in the folder must match the format from the torrent, subreddit_type.zst, like AskReddit_comments.zst
-	r"\\MYCLOUDPR4100\Public\reddit\subreddits23\BrightonHoveAlbion_comments.zst",
+# the script will look for both comments and submissions files for each subreddit
 folder = r"\\MYCLOUDPR4100\Public\reddit\subreddits23"
 subreddits = [
 	"DPH",
 	"dxm",
 ]
 ignored_users = {'[deleted]', 'automoderator'}
 # this is a list of users to ignore when doing the comparison. Most popular bots post in many subreddits and aren't the person you're looking for
 # here's a good start, but add bots to your list as you encounter them https://github.com/Watchful1/PushshiftDumps/blob/master/scripts/ignored.txt
 ignored_users_file = "ignored.txt"
 min_comments_per_sub = 1
 file_name = "users.txt"
@ -66,20 +73,10 @@ def read_lines_zst(file_name):
 		reader.close()
-if __name__ == "__main__":
+def get_commenters_from_file(subreddit_file, subreddit_commenters, total_lines):
 	if os.path.exists(ignored_users_file):
 		with open(ignored_users_file) as fh:
 			for user in fh.readlines():
 				ignored_users.add(user.strip().lower())
 	commenterSubreddits = defaultdict(int)
 	is_first = True
 	total_lines = 0
 	for subreddit_file in input_files:
 	file_lines = 0
 	created = None
 	file_size = os.stat(subreddit_file).st_size
 		commenters = defaultdict(int)
 	for line, file_bytes_processed in read_lines_zst(subreddit_file):
 		total_lines += 1
 		file_lines += 1
@ -91,10 +88,46 @@ if __name__ == "__main__":
 			created = datetime.utcfromtimestamp(int(obj['created_utc']))
 			if obj['author'].lower() not in ignored_users:
-					commenters[obj['author']] += 1
+				subreddit_commenters[obj['author']] += 1
 		except (KeyError, json.JSONDecodeError) as err:
 			pass
 	log.info(f"{total_lines:,}: {subreddit_file}: {created.strftime('%Y-%m-%d %H:%M:%S')} : {file_lines:,} : 100%")
 	return total_lines
 if __name__ == "__main__":
 	log.info(f"Subreddit's folder: {folder}")
 	if len(subreddits) <= 10:
 		log.info(f"Finding overlapping users in {', '.join(subreddits)}")
 	else:
 		log.info(f"Finding overlapping users in {len(subreddits)} subreddits")
 	if require_first_subreddit:
 		log.info(f"Finding users from the first subreddit that are in any of the other subreddits")
 	log.info(f"Minimum comments per subreddit set to {min_comments_per_sub}")
 	log.info(f"Outputting to {file_name}")
 	if os.path.exists(ignored_users_file):
 		with open(ignored_users_file) as fh:
 			for user in fh.readlines():
 				ignored_users.add(user.strip().lower())
 		log.info(f"Loaded {len(ignored_users)} ignored users from {ignored_users_file}")
 	commenterSubreddits = defaultdict(int)
 	is_first = True
 	total_lines = 0
 	for subreddit in subreddits:
 		subreddit_exists = False
 		commenters = defaultdict(int)
 		for file_type in ["submissions", "comments"]:
 			subreddit_file = os.path.join(folder, f"{subreddit}_{file_type}.zst")
 			if not os.path.exists(subreddit_file):
 				log.info(f"{file_type} for {subreddit} does not exist, skipping")
 				continue
 			subreddit_exists = True
 			total_lines = get_commenters_from_file(subreddit_file, commenters, total_lines)
 		if not subreddit_exists:
 			log.error(f"Subreddit {subreddit} has no files, aborting")
 			sys.exit(0)
 		for commenter in commenters:
 			if require_first_subreddit and not is_first and commenter not in commenterSubreddits:
@ -106,48 +139,31 @@ if __name__ == "__main__":
 	if require_first_subreddit:
 		count_found = 0
 		with open(file_name, 'w') as txt:
-			txt.write(f"Commenters in r/{input_files[0]} and at least one of r/{(', '.join(input_files))}\n")
+			txt.write(f"Commenters in r/{subreddits[0]} and at least one of {(', '.join(subreddits))}\n")
 			for commenter, countSubreddits in commenterSubreddits.items():
 				if countSubreddits >= 2:
 					count_found += 1
 					txt.write(f"{commenter}\n")
-		log.info(f"{count_found} commenters in r/{input_files[0]} and at least one of r/{(', '.join(input_files))}")
+		log.info(f"{count_found} commenters in r/{subreddits[0]} and at least one of {(', '.join(subreddits))}")
 	else:
 		sharedCommenters = defaultdict(list)
 		for commenter, countSubreddits in commenterSubreddits.items():
-			if countSubreddits >= len(input_files) - 2:
+			if countSubreddits >= 2:
 				sharedCommenters[countSubreddits].append(commenter)
 		commentersAll = len(sharedCommenters[len(input_files)])
 		commentersMinusOne = len(sharedCommenters[len(input_files) - 1])
 		commentersMinusTwo = len(sharedCommenters[len(input_files) - 2])
 		log.info(f"{commentersAll} commenters in all subreddits, {commentersMinusOne} in all but one, {commentersMinusTwo} in all but 2. Writing output to {file_name}")
 		with open(file_name, 'w') as txt:
-			if commentersAll == 0:
+			log.info(f"Writing output to {file_name}")
-				txt.write(f"No commenters in all subreddits\n")
+			for i in range(len(subreddits)):
 				commenters = len(sharedCommenters[len(subreddits) - i])
 				inner_str = f"but {i} " if i != 0 else ""
 				log.info(f"{commenters} commenters in all {inner_str}subreddits")
 				if commenters == 0:
 					txt.write(f"No commenters in all {inner_str}subreddits\n")
 				else:
-				txt.write(f"{commentersAll} commenters in all subreddits\n")
+					txt.write(f"{commenters} commenters in all {inner_str}subreddits\n")
-				for user in sorted(sharedCommenters[len(input_files)], key=str.lower):
+					for user in sorted(sharedCommenters[len(subreddits) - i], key=str.lower):
 					txt.write(f"{user}\n")
 			txt.write("\n")
 			if commentersAll < 10 and len(input_files) > 2:
 				if commentersMinusOne == 0:
 					txt.write(f"No commenters in all but one subreddits\n")
 				else:
 					txt.write(f"{commentersMinusOne} commenters in all but one subreddits\n")
 					for user in sorted(sharedCommenters[len(input_files) - 1], key=str.lower):
 						txt.write(f"{user}\n")
 				txt.write("\n")
 				if commentersMinusOne < 10:
 					if commentersMinusTwo == 0:
 						txt.write(f"No commenters in all but two subreddits\n")
 					else:
 						txt.write(f"{commentersMinusTwo} commenters in all but two subreddits\n")
 						for user in sorted(sharedCommenters[len(input_files) - 2], key=str.lower):
 						txt.write(f"{user}\n")
 				txt.write("\n")
 				if commenters > 3:
 					break