More updates to overlapping users

Watchful1 2025-01-07 20:36:11 -08:00
parent 902f142228
commit bb25896815


@@ -13,19 +13,17 @@ import json
 # the files in the folder must match the format from the torrent, subreddit_type.zst, like AskReddit_comments.zst
 # the script will look for both comments and submissions files for each subreddit
 folder = r"\\MYCLOUDPR4100\Public\reddit\subreddits23"
-subreddits = [
-	"phillies",
-	"EASportsFC",
-	"eagles",
-	"politics",
-	"BucksCountyPA",
-]
+subreddits_string = """
+	TheSimpsons
+	Askmen
+	Seinfeld
+"""
 ignored_users = {'[deleted]', 'automoderator'}
 # this is a list of users to ignore when doing the comparison. Most popular bots post in many subreddits and aren't the person you're looking for
 # here's a good start, but add bots to your list as you encounter them https://github.com/Watchful1/PushshiftDumps/blob/master/scripts/ignored.txt
 ignored_users_file = "ignored.txt"
 min_comments_per_sub = 1
-file_name = "users.txt"
+output_file_name = "users.txt"
 require_first_subreddit = False # if true, print users that occur in the first subreddit and any one of the following ones. Otherwise just find the most overlap between all subs
@@ -76,7 +74,7 @@ def read_lines_zst(file_name):
 	reader.close()


-def get_commenters_from_file(subreddit_file, subreddit_commenters, total_lines):
+def get_commenters_from_file(subreddit, subreddit_file, subreddit_commenters, total_lines, files_status):
 	file_lines = 0
 	created = None
 	file_size = os.stat(subreddit_file).st_size
@@ -84,7 +82,7 @@ def get_commenters_from_file(subreddit_file, subreddit_commenters, total_lines):
 		total_lines += 1
 		file_lines += 1
 		if total_lines % 100000 == 0:
-			log.info(f"{total_lines:,}: {subreddit_file}: {created.strftime('%Y-%m-%d %H:%M:%S')} : {file_lines:,} : {(file_bytes_processed / file_size) * 100:.0f}%")
+			log.info(f"{files_status}: {total_lines:,}: r/{subreddit}: {created.strftime('%Y-%m-%d %H:%M:%S')} : {file_lines:,} : {(file_bytes_processed / file_size) * 100:.0f}%")

 		try:
 			obj = json.loads(line)
@ -103,6 +101,13 @@ if __name__ == "__main__":
if not os.path.exists(folder): if not os.path.exists(folder):
log.error(f"Subreddit's folder either doesn't exist or the script doesn't have access to it: {folder}") log.error(f"Subreddit's folder either doesn't exist or the script doesn't have access to it: {folder}")
sys.exit() sys.exit()
subreddits = []
for line in subreddits_string.split("\n"):
subreddit = line.strip()
if subreddit == "":
continue
subreddits.append(subreddit)
if len(subreddits) <= 10: if len(subreddits) <= 10:
log.info(f"Finding overlapping users in {', '.join(subreddits)}") log.info(f"Finding overlapping users in {', '.join(subreddits)}")
else: else:
@ -110,7 +115,7 @@ if __name__ == "__main__":
if require_first_subreddit: if require_first_subreddit:
log.info(f"Finding users from the first subreddit that are in any of the other subreddits") log.info(f"Finding users from the first subreddit that are in any of the other subreddits")
log.info(f"Minimum comments per subreddit set to {min_comments_per_sub}") log.info(f"Minimum comments per subreddit set to {min_comments_per_sub}")
log.info(f"Outputting to {file_name}") log.info(f"Outputting to {output_file_name}")
if os.path.exists(ignored_users_file): if os.path.exists(ignored_users_file):
with open(ignored_users_file) as fh: with open(ignored_users_file) as fh:
@ -118,24 +123,55 @@ if __name__ == "__main__":
ignored_users.add(user.strip().lower()) ignored_users.add(user.strip().lower())
log.info(f"Loaded {len(ignored_users)} ignored users from {ignored_users_file}") log.info(f"Loaded {len(ignored_users)} ignored users from {ignored_users_file}")
log.info(f"Checking that subreddit files are present")
folder_files = {}
for file in os.listdir(folder):
folder_files[file.lower()] = file
subreddit_stats = []
for subreddit in subreddits:
subreddit_stat = {"files": 0, "bytes": 0, "name": subreddit}
for file_type in ["submissions", "comments"]:
file_ending = f"_{file_type}.zst"
file_name = folder_files.get(f"{subreddit.lower()}{file_ending}")
if file_name is None:
continue
subreddit_file = os.path.join(folder, file_name)
subreddit_stat["name"] = file_name[0:-len(file_ending)]
subreddit_stat[file_type] = subreddit_file
subreddit_stat["files"] += 1
subreddit_stat["bytes"] += os.stat(subreddit_file).st_size
subreddit_stats.append(subreddit_stat)
subreddit_stats.sort(key=lambda x: x["bytes"], reverse=True)
abort = False
for subreddit_stat in subreddit_stats:
if subreddit_stat["files"] == 0:
log.info(f"No files for {subreddit_stat['name']} exist")
abort = True
else:
log.info(f"r/{subreddit_stat['name']} files total {(subreddit_stat['bytes'] / (2**30)):.2f} gb")
if abort:
log.error(f"The script can see {len(folder_files)} files in the folder, but not the ones requested: {folder}")
sys.exit(0)
commenterSubreddits = defaultdict(int) commenterSubreddits = defaultdict(int)
is_first = True is_first = True
total_lines = 0 total_lines = 0
for subreddit in subreddits: files_processed = 1
subreddit_exists = False for subreddit_stat in subreddit_stats:
commenters = defaultdict(int) commenters = defaultdict(int)
for file_type in ["submissions", "comments"]: for file_type in ["submissions", "comments"]:
subreddit_file = os.path.join(folder, f"{subreddit}_{file_type}.zst") total_lines = get_commenters_from_file(
if not os.path.exists(subreddit_file): f"{subreddit_stat['name']}_{file_type}",
log.info(f"{file_type} for {subreddit} does not exist, skipping: {subreddit_file}") subreddit_stat[file_type],
continue commenters,
subreddit_exists = True total_lines,
total_lines = get_commenters_from_file(subreddit_file, commenters, total_lines) f"{files_processed}|{len(subreddit_stats)}")
if not subreddit_exists:
log.error(f"Subreddit {subreddit} has no files, aborting")
file_count = len(list(os.listdir(folder)))
log.error(f"The script can see {file_count} files in the folder, but not the ones requested: {folder}")
sys.exit(0)
for commenter in commenters: for commenter in commenters:
if require_first_subreddit and not is_first and commenter not in commenterSubreddits: if require_first_subreddit and not is_first and commenter not in commenterSubreddits:
@ -143,10 +179,11 @@ if __name__ == "__main__":
if commenters[commenter] >= min_comments_per_sub: if commenters[commenter] >= min_comments_per_sub:
commenterSubreddits[commenter] += 1 commenterSubreddits[commenter] += 1
is_first = False is_first = False
files_processed += 1
if require_first_subreddit: if require_first_subreddit:
count_found = 0 count_found = 0
with open(file_name, 'w') as txt: with open(output_file_name, 'w') as txt:
txt.write(f"Commenters in r/{subreddits[0]} and at least one of {(', '.join(subreddits))}\n") txt.write(f"Commenters in r/{subreddits[0]} and at least one of {(', '.join(subreddits))}\n")
for commenter, countSubreddits in commenterSubreddits.items(): for commenter, countSubreddits in commenterSubreddits.items():
if countSubreddits >= 2: if countSubreddits >= 2:
@ -160,8 +197,8 @@ if __name__ == "__main__":
if countSubreddits >= 2: if countSubreddits >= 2:
sharedCommenters[countSubreddits].append(commenter) sharedCommenters[countSubreddits].append(commenter)
with open(file_name, 'w') as txt: with open(output_file_name, 'w') as txt:
log.info(f"Writing output to {file_name}") log.info(f"Writing output to {output_file_name}")
for i in range(len(subreddits)): for i in range(len(subreddits)):
commenters = len(sharedCommenters[len(subreddits) - i]) commenters = len(sharedCommenters[len(subreddits) - i])
inner_str = f"but {i} " if i != 0 else "" inner_str = f"but {i} " if i != 0 else ""