mirror of
https://github.com/Watchful1/PushshiftDumps.git
synced 2025-07-25 07:35:24 -04:00
Clean up overlapping users script
This commit is contained in:
parent
d67260b9ac
commit
b54a2483dc
1 changed file with 71 additions and 55 deletions
|
@ -1,3 +1,4 @@
|
||||||
|
import sys
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
import time
|
import time
|
||||||
|
@ -6,13 +7,19 @@ import logging.handlers
|
||||||
import zstandard
|
import zstandard
|
||||||
import json
|
import json
|
||||||
|
|
||||||
input_files = [
|
# IMPORTANT SETUP INSTRUCTIONS
|
||||||
r"\\MYCLOUDPR4100\Public\reddit\subreddits23\trading212_comments.zst",
|
# change the folder line to the folder where the files are stored
|
||||||
r"\\MYCLOUDPR4100\Public\reddit\subreddits23\Fire_comments.zst",
|
# change the subreddits list to the subreddits you want to compare, one per line. The case must match exactly, e.g. for r/AskReddit, put "AskReddit"
|
||||||
r"\\MYCLOUDPR4100\Public\reddit\subreddits23\IAmTheMainCharacter_comments.zst",
|
# the files in the folder must match the format from the torrent, subreddit_type.zst, like AskReddit_comments.zst
|
||||||
r"\\MYCLOUDPR4100\Public\reddit\subreddits23\BrightonHoveAlbion_comments.zst",
|
# the script will look for both comments and submissions files for each subreddit
|
||||||
|
folder = r"\\MYCLOUDPR4100\Public\reddit\subreddits23"
|
||||||
|
subreddits = [
|
||||||
|
"DPH",
|
||||||
|
"dxm",
|
||||||
]
|
]
|
||||||
ignored_users = {'[deleted]', 'automoderator'}
|
ignored_users = {'[deleted]', 'automoderator'}
|
||||||
|
# this is a list of users to ignore when doing the comparison. Most popular bots post in many subreddits and aren't the person you're looking for
|
||||||
|
# here's a good start, but add bots to your list as you encounter them https://github.com/Watchful1/PushshiftDumps/blob/master/scripts/ignored.txt
|
||||||
ignored_users_file = "ignored.txt"
|
ignored_users_file = "ignored.txt"
|
||||||
min_comments_per_sub = 1
|
min_comments_per_sub = 1
|
||||||
file_name = "users.txt"
|
file_name = "users.txt"
|
||||||
|
@ -66,20 +73,10 @@ def read_lines_zst(file_name):
|
||||||
reader.close()
|
reader.close()
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
def get_commenters_from_file(subreddit_file, subreddit_commenters, total_lines):
|
||||||
if os.path.exists(ignored_users_file):
|
|
||||||
with open(ignored_users_file) as fh:
|
|
||||||
for user in fh.readlines():
|
|
||||||
ignored_users.add(user.strip().lower())
|
|
||||||
|
|
||||||
commenterSubreddits = defaultdict(int)
|
|
||||||
is_first = True
|
|
||||||
total_lines = 0
|
|
||||||
for subreddit_file in input_files:
|
|
||||||
file_lines = 0
|
file_lines = 0
|
||||||
created = None
|
created = None
|
||||||
file_size = os.stat(subreddit_file).st_size
|
file_size = os.stat(subreddit_file).st_size
|
||||||
commenters = defaultdict(int)
|
|
||||||
for line, file_bytes_processed in read_lines_zst(subreddit_file):
|
for line, file_bytes_processed in read_lines_zst(subreddit_file):
|
||||||
total_lines += 1
|
total_lines += 1
|
||||||
file_lines += 1
|
file_lines += 1
|
||||||
|
@ -91,10 +88,46 @@ if __name__ == "__main__":
|
||||||
created = datetime.utcfromtimestamp(int(obj['created_utc']))
|
created = datetime.utcfromtimestamp(int(obj['created_utc']))
|
||||||
|
|
||||||
if obj['author'].lower() not in ignored_users:
|
if obj['author'].lower() not in ignored_users:
|
||||||
commenters[obj['author']] += 1
|
subreddit_commenters[obj['author']] += 1
|
||||||
except (KeyError, json.JSONDecodeError) as err:
|
except (KeyError, json.JSONDecodeError) as err:
|
||||||
pass
|
pass
|
||||||
log.info(f"{total_lines:,}: {subreddit_file}: {created.strftime('%Y-%m-%d %H:%M:%S')} : {file_lines:,} : 100%")
|
log.info(f"{total_lines:,}: {subreddit_file}: {created.strftime('%Y-%m-%d %H:%M:%S')} : {file_lines:,} : 100%")
|
||||||
|
return total_lines
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
log.info(f"Subreddit's folder: {folder}")
|
||||||
|
if len(subreddits) <= 10:
|
||||||
|
log.info(f"Finding overlapping users in {', '.join(subreddits)}")
|
||||||
|
else:
|
||||||
|
log.info(f"Finding overlapping users in {len(subreddits)} subreddits")
|
||||||
|
if require_first_subreddit:
|
||||||
|
log.info(f"Finding users from the first subreddit that are in any of the other subreddits")
|
||||||
|
log.info(f"Minimum comments per subreddit set to {min_comments_per_sub}")
|
||||||
|
log.info(f"Outputting to {file_name}")
|
||||||
|
|
||||||
|
if os.path.exists(ignored_users_file):
|
||||||
|
with open(ignored_users_file) as fh:
|
||||||
|
for user in fh.readlines():
|
||||||
|
ignored_users.add(user.strip().lower())
|
||||||
|
log.info(f"Loaded {len(ignored_users)} ignored users from {ignored_users_file}")
|
||||||
|
|
||||||
|
commenterSubreddits = defaultdict(int)
|
||||||
|
is_first = True
|
||||||
|
total_lines = 0
|
||||||
|
for subreddit in subreddits:
|
||||||
|
subreddit_exists = False
|
||||||
|
commenters = defaultdict(int)
|
||||||
|
for file_type in ["submissions", "comments"]:
|
||||||
|
subreddit_file = os.path.join(folder, f"{subreddit}_{file_type}.zst")
|
||||||
|
if not os.path.exists(subreddit_file):
|
||||||
|
log.info(f"{file_type} for {subreddit} does not exist, skipping")
|
||||||
|
continue
|
||||||
|
subreddit_exists = True
|
||||||
|
total_lines = get_commenters_from_file(subreddit_file, commenters, total_lines)
|
||||||
|
if not subreddit_exists:
|
||||||
|
log.error(f"Subreddit {subreddit} has no files, aborting")
|
||||||
|
sys.exit(0)
|
||||||
|
|
||||||
for commenter in commenters:
|
for commenter in commenters:
|
||||||
if require_first_subreddit and not is_first and commenter not in commenterSubreddits:
|
if require_first_subreddit and not is_first and commenter not in commenterSubreddits:
|
||||||
|
@ -106,48 +139,31 @@ if __name__ == "__main__":
|
||||||
if require_first_subreddit:
|
if require_first_subreddit:
|
||||||
count_found = 0
|
count_found = 0
|
||||||
with open(file_name, 'w') as txt:
|
with open(file_name, 'w') as txt:
|
||||||
txt.write(f"Commenters in r/{input_files[0]} and at least one of r/{(', '.join(input_files))}\n")
|
txt.write(f"Commenters in r/{subreddits[0]} and at least one of {(', '.join(subreddits))}\n")
|
||||||
for commenter, countSubreddits in commenterSubreddits.items():
|
for commenter, countSubreddits in commenterSubreddits.items():
|
||||||
if countSubreddits >= 2:
|
if countSubreddits >= 2:
|
||||||
count_found += 1
|
count_found += 1
|
||||||
txt.write(f"{commenter}\n")
|
txt.write(f"{commenter}\n")
|
||||||
log.info(f"{count_found} commenters in r/{input_files[0]} and at least one of r/{(', '.join(input_files))}")
|
log.info(f"{count_found} commenters in r/{subreddits[0]} and at least one of {(', '.join(subreddits))}")
|
||||||
|
|
||||||
else:
|
else:
|
||||||
sharedCommenters = defaultdict(list)
|
sharedCommenters = defaultdict(list)
|
||||||
for commenter, countSubreddits in commenterSubreddits.items():
|
for commenter, countSubreddits in commenterSubreddits.items():
|
||||||
if countSubreddits >= len(input_files) - 2:
|
if countSubreddits >= 2:
|
||||||
sharedCommenters[countSubreddits].append(commenter)
|
sharedCommenters[countSubreddits].append(commenter)
|
||||||
|
|
||||||
commentersAll = len(sharedCommenters[len(input_files)])
|
|
||||||
commentersMinusOne = len(sharedCommenters[len(input_files) - 1])
|
|
||||||
commentersMinusTwo = len(sharedCommenters[len(input_files) - 2])
|
|
||||||
|
|
||||||
log.info(f"{commentersAll} commenters in all subreddits, {commentersMinusOne} in all but one, {commentersMinusTwo} in all but 2. Writing output to {file_name}")
|
|
||||||
|
|
||||||
with open(file_name, 'w') as txt:
|
with open(file_name, 'w') as txt:
|
||||||
if commentersAll == 0:
|
log.info(f"Writing output to {file_name}")
|
||||||
txt.write(f"No commenters in all subreddits\n")
|
for i in range(len(subreddits)):
|
||||||
|
commenters = len(sharedCommenters[len(subreddits) - i])
|
||||||
|
inner_str = f"but {i} " if i != 0 else ""
|
||||||
|
log.info(f"{commenters} commenters in all {inner_str}subreddits")
|
||||||
|
if commenters == 0:
|
||||||
|
txt.write(f"No commenters in all {inner_str}subreddits\n")
|
||||||
else:
|
else:
|
||||||
txt.write(f"{commentersAll} commenters in all subreddits\n")
|
txt.write(f"{commenters} commenters in all {inner_str}subreddits\n")
|
||||||
for user in sorted(sharedCommenters[len(input_files)], key=str.lower):
|
for user in sorted(sharedCommenters[len(subreddits) - i], key=str.lower):
|
||||||
txt.write(f"{user}\n")
|
|
||||||
txt.write("\n")
|
|
||||||
|
|
||||||
if commentersAll < 10 and len(input_files) > 2:
|
|
||||||
if commentersMinusOne == 0:
|
|
||||||
txt.write(f"No commenters in all but one subreddits\n")
|
|
||||||
else:
|
|
||||||
txt.write(f"{commentersMinusOne} commenters in all but one subreddits\n")
|
|
||||||
for user in sorted(sharedCommenters[len(input_files) - 1], key=str.lower):
|
|
||||||
txt.write(f"{user}\n")
|
|
||||||
txt.write("\n")
|
|
||||||
|
|
||||||
if commentersMinusOne < 10:
|
|
||||||
if commentersMinusTwo == 0:
|
|
||||||
txt.write(f"No commenters in all but two subreddits\n")
|
|
||||||
else:
|
|
||||||
txt.write(f"{commentersMinusTwo} commenters in all but two subreddits\n")
|
|
||||||
for user in sorted(sharedCommenters[len(input_files) - 2], key=str.lower):
|
|
||||||
txt.write(f"{user}\n")
|
txt.write(f"{user}\n")
|
||||||
txt.write("\n")
|
txt.write("\n")
|
||||||
|
if commenters > 3:
|
||||||
|
break
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue