mirror of https://github.com/Watchful1/PushshiftDumps.git, synced 2025-07-04 11:26:41 -04:00
Add overlapping users finder
This commit is contained in:
parent 897332b1d7 · commit 4a50ca6605

2 changed files with 146 additions and 2 deletions
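In short: the new script scans one comment dump per subreddit from the Pushshift .zst archives, counts comments per author in each, and writes the users that appear in more than one subreddit to users.txt. The only other change drops a leftover example comment from an existing script.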
@@ -1,5 +1,3 @@
-# this is an example
 import zstandard
 import os
 import json
-
scripts/find_overlapping_users.py (new file, 146 lines)

@@ -0,0 +1,146 @@
from collections import defaultdict
from datetime import datetime, timedelta
import time
import os
import logging.handlers
import zstandard
import json

input_files = [
	r"\\MYCLOUDPR4100\Public\reddit\subreddits\redditdev_comments.zst",
	r"\\MYCLOUDPR4100\Public\reddit\subreddits\announcements_comments.zst",
	r"\\MYCLOUDPR4100\Public\reddit\subreddits\modnews_comments.zst",
]
ignored_users = ['[deleted]', 'automoderator']
min_comments_per_sub = 1
file_name = "users.txt"
# if True, find users that commented in the first subreddit and at least one of the
# following ones; otherwise find the users shared across the most subreddits
require_first_subreddit = False

# sets up logging to the console as well as a file
log = logging.getLogger("bot")
log.setLevel(logging.INFO)
log_formatter = logging.Formatter('%(asctime)s - %(levelname)s: %(message)s')
log_str_handler = logging.StreamHandler()
log_str_handler.setFormatter(log_formatter)
log.addHandler(log_str_handler)
if not os.path.exists("logs"):
	os.makedirs("logs")
log_file_handler = logging.handlers.RotatingFileHandler(os.path.join("logs", "bot.log"), maxBytes=1024*1024*16, backupCount=5)
log_file_handler.setFormatter(log_formatter)
log.addHandler(log_file_handler)

def read_and_decode(reader, chunk_size, max_window_size, previous_chunk=None, bytes_read=0):
	chunk = reader.read(chunk_size)
	bytes_read += chunk_size
	if previous_chunk is not None:
		chunk = previous_chunk + chunk
	try:
		return chunk.decode()
	except UnicodeDecodeError:
		if bytes_read > max_window_size:
			raise UnicodeError(f"Unable to decode frame after reading {bytes_read:,} bytes")
		log.info(f"Decoding error with {bytes_read:,} bytes, reading another chunk")
		return read_and_decode(reader, chunk_size, max_window_size, chunk, bytes_read)

def read_lines_zst(file_name):
	with open(file_name, 'rb') as file_handle:
		buffer = ''
		reader = zstandard.ZstdDecompressor(max_window_size=2**31).stream_reader(file_handle)
		while True:
			chunk = read_and_decode(reader, 2**27, (2**29) * 2)
			if not chunk:
				break
			lines = (buffer + chunk).split("\n")

			for line in lines[:-1]:
				yield line.strip(), file_handle.tell()

			buffer = lines[-1]

		reader.close()

if __name__ == "__main__":
	commenterSubreddits = defaultdict(int)
	is_first = True
	total_lines = 0
	for subreddit_file in input_files:
		file_lines = 0
		created = None
		file_size = os.stat(subreddit_file).st_size
		commenters = defaultdict(int)
		for line, file_bytes_processed in read_lines_zst(subreddit_file):
			total_lines += 1
			file_lines += 1
			if total_lines % 100000 == 0:
				log.info(f"{total_lines:,}: {subreddit_file}: {created.strftime('%Y-%m-%d %H:%M:%S')} : {file_lines:,} : {(file_bytes_processed / file_size) * 100:.0f}%")

			try:
				obj = json.loads(line)
				created = datetime.utcfromtimestamp(int(obj['created_utc']))

				if obj['author'].lower() not in ignored_users:
					commenters[obj['author']] += 1
			except (KeyError, json.JSONDecodeError):
				pass
		log.info(f"{total_lines:,}: {subreddit_file}: {created.strftime('%Y-%m-%d %H:%M:%S')} : {file_lines:,} : 100%")

		for commenter in commenters:
			if require_first_subreddit and not is_first and commenter not in commenterSubreddits:
				continue
			if commenters[commenter] >= min_comments_per_sub:
				commenterSubreddits[commenter] += 1
		is_first = False

|
||||
if require_first_subreddit:
|
||||
count_found = 0
|
||||
with open(file_name, 'w') as txt:
|
||||
txt.write(f"Commenters in r/{input_files[0]} and at least one of r/{(', '.join(input_files))}\n")
|
||||
for commenter, countSubreddits in commenterSubreddits.items():
|
||||
if countSubreddits >= 2:
|
||||
count_found += 1
|
||||
txt.write(f"{commenter}\n")
|
||||
log.info(f"{count_found} commenters in r/{input_files[0]} and at least one of r/{(', '.join(input_files))}")
|
||||
|
||||
else:
|
||||
sharedCommenters = defaultdict(list)
|
||||
for commenter, countSubreddits in commenterSubreddits.items():
|
||||
if countSubreddits >= len(input_files) - 2:
|
||||
sharedCommenters[countSubreddits].append(commenter)
|
||||
|
||||
commentersAll = len(sharedCommenters[len(input_files)])
|
||||
commentersMinusOne = len(sharedCommenters[len(input_files) - 1])
|
||||
commentersMinusTwo = len(sharedCommenters[len(input_files) - 2])
|
||||
|
||||
log.info(f"{commentersAll} commenters in all subreddits, {commentersMinusOne} in all but one, {commentersMinusTwo} in all but 2. Writing output to {file_name}")
|
||||
|
||||
with open(file_name, 'w') as txt:
|
||||
if commentersAll == 0:
|
||||
txt.write(f"No commenters in all subreddits\n")
|
||||
else:
|
||||
txt.write(f"{commentersAll} commenters in all subreddits\n")
|
||||
for user in sorted(sharedCommenters[len(input_files)], key=str.lower):
|
||||
txt.write(f"{user}\n")
|
||||
txt.write("\n")
|
||||
|
||||
if commentersAll < 10 and len(input_files) > 2:
|
||||
if commentersMinusOne == 0:
|
||||
txt.write(f"No commenters in all but one subreddits\n")
|
||||
else:
|
||||
txt.write(f"{commentersMinusOne} commenters in all but one subreddits\n")
|
||||
for user in sorted(sharedCommenters[len(input_files) - 1], key=str.lower):
|
||||
txt.write(f"{user}\n")
|
||||
txt.write("\n")
|
||||
|
||||
if commentersMinusOne < 10:
|
||||
if commentersMinusTwo == 0:
|
||||
txt.write(f"No commenters in all but two subreddits\n")
|
||||
else:
|
||||
txt.write(f"{commentersMinusTwo} commenters in all but two subreddits\n")
|
||||
for user in sorted(sharedCommenters[len(input_files) - 2], key=str.lower):
|
||||
txt.write(f"{user}\n")
|
||||
txt.write("\n")
|
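The two helpers in the new file stream a zstandard-compressed NDJSON dump without loading it into memory: read_and_decode retries with an accumulated buffer when a multi-byte character or frame is split across chunk boundaries, and read_lines_zst yields one line at a time along with the compressed-file offset for progress reporting. A minimal sketch of driving them directly; "comments.zst" is a placeholder path, not part of this commit, and the helpers above are assumed to be defined or imported:

# Sketch only: iterate any Pushshift-style .zst dump of JSON lines.
import json
import os

path = "comments.zst"  # hypothetical local dump
size = os.stat(path).st_size
for line, bytes_read in read_lines_zst(path):
	try:
		obj = json.loads(line)
	except json.JSONDecodeError:
		continue
	# each line is one comment object; progress is the compressed offset read so far
	print(obj.get("author"), f"{bytes_read / size * 100:.0f}%")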
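The overlap logic itself is two counting passes: comments per author within each file, then the number of files in which an author met min_comments_per_sub. A self-contained toy version of the default path (require_first_subreddit = False), with made-up author lists standing in for the dumps:

from collections import defaultdict

# Made-up data standing in for three subreddit comment dumps
dumps = {
	"redditdev": ["alice", "bob", "alice"],
	"announcements": ["alice", "carol"],
	"modnews": ["alice", "bob"],
}
min_comments_per_sub = 1

# author -> number of subreddits where they cleared the minimum
commenter_subreddits = defaultdict(int)
for sub, authors in dumps.items():
	commenters = defaultdict(int)
	for author in authors:
		commenters[author] += 1
	for author, count in commenters.items():
		if count >= min_comments_per_sub:
			commenter_subreddits[author] += 1

in_all = [a for a, n in commenter_subreddits.items() if n == len(dumps)]
print(in_all)  # ['alice']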