import sys
from collections import defaultdict
from datetime import datetime, timedelta
import time
import os
import logging.handlers
import zstandard
import json
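
# This is a standalone script: edit the settings below, then run it directly with no
# arguments, e.g. python3 find_overlapping_users.py (the file name may differ locally)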

# IMPORTANT SETUP INSTRUCTIONS
# change the folder line to the folder where the files are stored
# change the subreddits to the list of subreddits, one per line. The case must exactly match, i.e., for r/AskReddit, put "AskReddit"
# the files in the folder must match the format from the torrent, subreddit_type.zst, like AskReddit_comments.zst
# the script will look for both comments and submissions files for each subreddit
folder = r"\\MYCLOUDPR4100\Public\reddit\subreddits23"
subreddits = [
	"phillies",
	"EASportsFC",
	"eagles",
	"politics",
	"BucksCountyPA",
]
ignored_users = {'[deleted]', 'automoderator'}
# this is a list of users to ignore when doing the comparison. Most popular bots post in many subreddits and aren't the person you're looking for
# here's a good start, but add bots to your list as you encounter them https://github.com/Watchful1/PushshiftDumps/blob/master/scripts/ignored.txt
ignored_users_file = "ignored.txt"
min_comments_per_sub = 1
file_name = "users.txt"
require_first_subreddit = False  # if true, print users that occur in the first subreddit and any one of the following ones. Otherwise just find the most overlap between all subs
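# for example, with the settings above the script will look for files like
# \\MYCLOUDPR4100\Public\reddit\subreddits23\phillies_comments.zst and
# \\MYCLOUDPR4100\Public\reddit\subreddits23\phillies_submissions.zst, one pair per subreddit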


# sets up logging to the console as well as a file
log = logging.getLogger("bot")
log.setLevel(logging.INFO)
log_formatter = logging.Formatter('%(asctime)s - %(levelname)s: %(message)s')
log_str_handler = logging.StreamHandler()
log_str_handler.setFormatter(log_formatter)
log.addHandler(log_str_handler)
if not os.path.exists("logs"):
	os.makedirs("logs")
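# also log to a rotating file, 16 MiB per file with 5 old files kept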
log_file_handler = logging.handlers.RotatingFileHandler(os.path.join("logs", "bot.log"), maxBytes=1024*1024*16, backupCount=5)
log_file_handler.setFormatter(log_formatter)
log.addHandler(log_file_handler)


def read_and_decode(reader, chunk_size, max_window_size, previous_chunk=None, bytes_read=0):
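	# read a chunk from the zstd stream and decode it as UTF-8. A multi-byte character
	# can be split across chunk boundaries, so on a decode error the partial chunk is
	# kept and retried with the next chunk appended, up to max_window_size bytes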
	chunk = reader.read(chunk_size)
	bytes_read += chunk_size
	if previous_chunk is not None:
		chunk = previous_chunk + chunk
	try:
		return chunk.decode()
	except UnicodeDecodeError:
		if bytes_read > max_window_size:
			raise UnicodeError(f"Unable to decode frame after reading {bytes_read:,} bytes")
		log.info(f"Decoding error with {bytes_read:,} bytes, reading another chunk")
		return read_and_decode(reader, chunk_size, max_window_size, chunk, bytes_read)


def read_lines_zst(file_name):
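	# stream a zstd-compressed ndjson file, yielding each line along with the number
	# of compressed bytes read so far (used by the caller for progress logging)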
	with open(file_name, 'rb') as file_handle:
		buffer = ''
		reader = zstandard.ZstdDecompressor(max_window_size=2**31).stream_reader(file_handle)
		while True:
			chunk = read_and_decode(reader, 2**27, (2**29) * 2)

			if not chunk:
				break
			lines = (buffer + chunk).split("\n")

			for line in lines[:-1]:
				yield line.strip(), file_handle.tell()

			buffer = lines[-1]

		reader.close()


def get_commenters_from_file(subreddit_file, subreddit_commenters, total_lines):
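	# tally how many times each author appears in one subreddit dump file into
	# subreddit_commenters, returning the updated total line count for progress logging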
	file_lines = 0
	created = None
	file_size = os.stat(subreddit_file).st_size
	for line, file_bytes_processed in read_lines_zst(subreddit_file):
		total_lines += 1
		file_lines += 1
		if total_lines % 100000 == 0:
			log.info(f"{total_lines:,}: {subreddit_file}: {created.strftime('%Y-%m-%d %H:%M:%S') if created else 'unknown'} : {file_lines:,} : {(file_bytes_processed / file_size) * 100:.0f}%")

		try:
			obj = json.loads(line)
			created = datetime.utcfromtimestamp(int(obj['created_utc']))

			if obj['author'].lower() not in ignored_users:
				subreddit_commenters[obj['author']] += 1
		except (KeyError, json.JSONDecodeError):
			pass
	log.info(f"{total_lines:,}: {subreddit_file}: {created.strftime('%Y-%m-%d %H:%M:%S') if created else 'unknown'} : {file_lines:,} : 100%")
	return total_lines


if __name__ == "__main__":
	log.info(f"Subreddits folder: {folder}")
	if not os.path.exists(folder):
		log.error(f"Subreddits folder either doesn't exist or the script doesn't have access to it: {folder}")
		sys.exit(1)
	if len(subreddits) <= 10:
		log.info(f"Finding overlapping users in {', '.join(subreddits)}")
	else:
		log.info(f"Finding overlapping users in {len(subreddits)} subreddits")
	if require_first_subreddit:
		log.info("Finding users from the first subreddit that are in any of the other subreddits")
	log.info(f"Minimum comments per subreddit set to {min_comments_per_sub}")
	log.info(f"Outputting to {file_name}")

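	# merge in the optional ignored users file, one username per line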
	if os.path.exists(ignored_users_file):
		with open(ignored_users_file) as fh:
			for user in fh.readlines():
				ignored_users.add(user.strip().lower())
		log.info(f"Loaded {len(ignored_users)} ignored users from {ignored_users_file}")

	commenterSubreddits = defaultdict(int)
	is_first = True
	total_lines = 0
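	# count each user's activity per subreddit, then track how many of the target
	# subreddits each user appears in with at least min_comments_per_sub comments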
	for subreddit in subreddits:
		subreddit_exists = False
		commenters = defaultdict(int)
		for file_type in ["submissions", "comments"]:
			subreddit_file = os.path.join(folder, f"{subreddit}_{file_type}.zst")
			if not os.path.exists(subreddit_file):
				log.info(f"{file_type} for {subreddit} does not exist, skipping: {subreddit_file}")
				continue
			subreddit_exists = True
			total_lines = get_commenters_from_file(subreddit_file, commenters, total_lines)
		if not subreddit_exists:
			log.error(f"Subreddit {subreddit} has no files, aborting")
			file_count = len(os.listdir(folder))
			log.error(f"The script can see {file_count} files in the folder, but not the ones requested: {folder}")
			sys.exit(1)

		for commenter in commenters:
			if require_first_subreddit and not is_first and commenter not in commenterSubreddits:
				continue
			if commenters[commenter] >= min_comments_per_sub:
				commenterSubreddits[commenter] += 1
		is_first = False

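	# write the results: either users from the first subreddit that appear in any of the
	# others, or users bucketed by how many of the subreddits they appear in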
	if require_first_subreddit:
		count_found = 0
		with open(file_name, 'w') as txt:
			txt.write(f"Commenters in r/{subreddits[0]} and at least one of {(', '.join(subreddits))}\n")
			for commenter, countSubreddits in commenterSubreddits.items():
				if countSubreddits >= 2:
					count_found += 1
					txt.write(f"{commenter}\n")
		log.info(f"{count_found} commenters in r/{subreddits[0]} and at least one of {(', '.join(subreddits))}")

	else:
		sharedCommenters = defaultdict(list)
		for commenter, countSubreddits in commenterSubreddits.items():
			if countSubreddits >= 2:
				sharedCommenters[countSubreddits].append(commenter)

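		# write the largest-overlap buckets first, stopping once a bucket has more than 3 users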
		with open(file_name, 'w') as txt:
			log.info(f"Writing output to {file_name}")
			for i in range(len(subreddits)):
				commenters = len(sharedCommenters[len(subreddits) - i])
				inner_str = f"but {i} " if i != 0 else ""
				log.info(f"{commenters} commenters in all {inner_str}subreddits")
				if commenters == 0:
					txt.write(f"No commenters in all {inner_str}subreddits\n")
				else:
					txt.write(f"{commenters} commenters in all {inner_str}subreddits\n")
					for user in sorted(sharedCommenters[len(subreddits) - i], key=str.lower):
						txt.write(f"{user}\n")
					txt.write("\n")
				if commenters > 3:
					break
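
# Example of what users.txt might look like in the default mode (illustrative values, not real output):
#   3 commenters in all subreddits
#   ExampleUserA
#   ExampleUserB
#   ExampleUserC
#
#   12 commenters in all but 1 subreddits
#   ...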