mirror of https://github.com/Watchful1/PushshiftDumps.git, synced 2025-07-04 11:26:41 -04:00
Add overlapping users finder
This commit is contained in:
parent 897332b1d7 · commit 4a50ca6605

2 changed files with 146 additions and 2 deletions
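In short: the new script scans one comment dump per subreddit from the Pushshift .zst archives, counts comments per author in each, and writes the users that appear in more than one subreddit to users.txt. The only other change drops a leftover example comment from an existing script.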
@@ -1,5 +1,3 @@
-# this is an example
 import zstandard
 import os
 import json
-
scripts/find_overlapping_users.py (new file, 146 lines)

@@ -0,0 +1,146 @@
from collections import defaultdict
from datetime import datetime, timedelta
import time
import os
import logging.handlers
import zstandard
import json

input_files = [
	r"\\MYCLOUDPR4100\Public\reddit\subreddits\redditdev_comments.zst",
	r"\\MYCLOUDPR4100\Public\reddit\subreddits\announcements_comments.zst",
	r"\\MYCLOUDPR4100\Public\reddit\subreddits\modnews_comments.zst",
]
ignored_users = ['[deleted]', 'automoderator']
min_comments_per_sub = 1
file_name = "users.txt"
# if True, find users that commented in the first subreddit and at least one of the
# following ones; otherwise find the users shared across the most subreddits
require_first_subreddit = False

# sets up logging to the console as well as a file
log = logging.getLogger("bot")
log.setLevel(logging.INFO)
log_formatter = logging.Formatter('%(asctime)s - %(levelname)s: %(message)s')
log_str_handler = logging.StreamHandler()
log_str_handler.setFormatter(log_formatter)
log.addHandler(log_str_handler)
if not os.path.exists("logs"):
	os.makedirs("logs")
log_file_handler = logging.handlers.RotatingFileHandler(os.path.join("logs", "bot.log"), maxBytes=1024*1024*16, backupCount=5)
log_file_handler.setFormatter(log_formatter)
log.addHandler(log_file_handler)

def read_and_decode(reader, chunk_size, max_window_size, previous_chunk=None, bytes_read=0):
	chunk = reader.read(chunk_size)
	bytes_read += chunk_size
	if previous_chunk is not None:
		chunk = previous_chunk + chunk
	try:
		return chunk.decode()
	except UnicodeDecodeError:
		if bytes_read > max_window_size:
			raise UnicodeError(f"Unable to decode frame after reading {bytes_read:,} bytes")
		log.info(f"Decoding error with {bytes_read:,} bytes, reading another chunk")
		return read_and_decode(reader, chunk_size, max_window_size, chunk, bytes_read)

def read_lines_zst(file_name):
	with open(file_name, 'rb') as file_handle:
		buffer = ''
		reader = zstandard.ZstdDecompressor(max_window_size=2**31).stream_reader(file_handle)
		while True:
			chunk = read_and_decode(reader, 2**27, (2**29) * 2)
			if not chunk:
				break
			lines = (buffer + chunk).split("\n")

			for line in lines[:-1]:
				yield line.strip(), file_handle.tell()

			buffer = lines[-1]

		reader.close()

if __name__ == "__main__":
	commenterSubreddits = defaultdict(int)
	is_first = True
	total_lines = 0
	for subreddit_file in input_files:
		file_lines = 0
		created = None
		file_size = os.stat(subreddit_file).st_size
		commenters = defaultdict(int)
		for line, file_bytes_processed in read_lines_zst(subreddit_file):
			total_lines += 1
			file_lines += 1
			if total_lines % 100000 == 0:
				log.info(f"{total_lines:,}: {subreddit_file}: {created.strftime('%Y-%m-%d %H:%M:%S')} : {file_lines:,} : {(file_bytes_processed / file_size) * 100:.0f}%")

			try:
				obj = json.loads(line)
				created = datetime.utcfromtimestamp(int(obj['created_utc']))

				if obj['author'].lower() not in ignored_users:
					commenters[obj['author']] += 1
			except (KeyError, json.JSONDecodeError):
				pass
		log.info(f"{total_lines:,}: {subreddit_file}: {created.strftime('%Y-%m-%d %H:%M:%S')} : {file_lines:,} : 100%")

		for commenter in commenters:
			if require_first_subreddit and not is_first and commenter not in commenterSubreddits:
				continue
			if commenters[commenter] >= min_comments_per_sub:
				commenterSubreddits[commenter] += 1
		is_first = False

|
||||
if require_first_subreddit:
|
||||
count_found = 0
|
||||
with open(file_name, 'w') as txt:
|
||||
txt.write(f"Commenters in r/{input_files[0]} and at least one of r/{(', '.join(input_files))}\n")
|
||||
for commenter, countSubreddits in commenterSubreddits.items():
|
||||
if countSubreddits >= 2:
|
||||
count_found += 1
|
||||
txt.write(f"{commenter}\n")
|
||||
log.info(f"{count_found} commenters in r/{input_files[0]} and at least one of r/{(', '.join(input_files))}")
|
||||
|
||||
else:
|
||||
sharedCommenters = defaultdict(list)
|
||||
for commenter, countSubreddits in commenterSubreddits.items():
|
||||
if countSubreddits >= len(input_files) - 2:
|
||||
sharedCommenters[countSubreddits].append(commenter)
|
||||
|
||||
commentersAll = len(sharedCommenters[len(input_files)])
|
||||
commentersMinusOne = len(sharedCommenters[len(input_files) - 1])
|
||||
commentersMinusTwo = len(sharedCommenters[len(input_files) - 2])
|
||||
|
||||
log.info(f"{commentersAll} commenters in all subreddits, {commentersMinusOne} in all but one, {commentersMinusTwo} in all but 2. Writing output to {file_name}")
|
||||
|
||||
with open(file_name, 'w') as txt:
|
||||
if commentersAll == 0:
|
||||
txt.write(f"No commenters in all subreddits\n")
|
||||
else:
|
||||
txt.write(f"{commentersAll} commenters in all subreddits\n")
|
||||
for user in sorted(sharedCommenters[len(input_files)], key=str.lower):
|
||||
txt.write(f"{user}\n")
|
||||
txt.write("\n")
|
||||
|
||||
if commentersAll < 10 and len(input_files) > 2:
|
||||
if commentersMinusOne == 0:
|
||||
txt.write(f"No commenters in all but one subreddits\n")
|
||||
else:
|
||||
txt.write(f"{commentersMinusOne} commenters in all but one subreddits\n")
|
||||
for user in sorted(sharedCommenters[len(input_files) - 1], key=str.lower):
|
||||
txt.write(f"{user}\n")
|
||||
txt.write("\n")
|
||||
|
||||
if commentersMinusOne < 10:
|
||||
if commentersMinusTwo == 0:
|
||||
txt.write(f"No commenters in all but two subreddits\n")
|
||||
else:
|
||||
txt.write(f"{commentersMinusTwo} commenters in all but two subreddits\n")
|
||||
for user in sorted(sharedCommenters[len(input_files) - 2], key=str.lower):
|
||||
txt.write(f"{user}\n")
|
||||
txt.write("\n")
|
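The two helpers in the new file stream a zstandard-compressed NDJSON dump without loading it into memory: read_and_decode retries with an accumulated buffer when a multi-byte character or frame is split across chunk boundaries, and read_lines_zst yields one line at a time along with the compressed-file offset for progress reporting. A minimal sketch of driving them directly; "comments.zst" is a placeholder path, not part of this commit, and the helpers above are assumed to be defined or imported:

# Sketch only: iterate any Pushshift-style .zst dump of JSON lines.
import json
import os

path = "comments.zst"  # hypothetical local dump
size = os.stat(path).st_size
for line, bytes_read in read_lines_zst(path):
	try:
		obj = json.loads(line)
	except json.JSONDecodeError:
		continue
	# each line is one comment object; progress is the compressed offset read so far
	print(obj.get("author"), f"{bytes_read / size * 100:.0f}%")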
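The overlap logic itself is two counting passes: comments per author within each file, then the number of files in which an author met min_comments_per_sub. A self-contained toy version of the default path (require_first_subreddit = False), with made-up author lists standing in for the dumps:

from collections import defaultdict

# Made-up data standing in for three subreddit comment dumps
dumps = {
	"redditdev": ["alice", "bob", "alice"],
	"announcements": ["alice", "carol"],
	"modnews": ["alice", "bob"],
}
min_comments_per_sub = 1

# author -> number of subreddits where they cleared the minimum
commenter_subreddits = defaultdict(int)
for sub, authors in dumps.items():
	commenters = defaultdict(int)
	for author in authors:
		commenters[author] += 1
	for author, count in commenters.items():
		if count >= min_comments_per_sub:
			commenter_subreddits[author] += 1

in_all = [a for a, n in commenter_subreddits.items() if n == len(dumps)]
print(in_all)  # ['alice']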