Reorganize

This commit is contained in:
Watchful1 2023-08-26 16:52:58 -07:00
parent 3700b21b81
commit 4f1d70d34a
19 changed files with 0 additions and 105 deletions

View file

@ -1,24 +0,0 @@
import utils
import discord_logging
import os
from collections import defaultdict
log = discord_logging.init_logging()


if __name__ == "__main__":
    # Tally how many objects in the dump belong to each subreddit, logging
    # progress along the way, then log every subreddit with its total,
    # largest first.
    input_file = r"\\MYCLOUDPR4100\Public\pushshift_working\RC_2022-12.zst"
    input_file_size = os.stat(input_file).st_size

    subreddits = defaultdict(int)
    total_lines = 0
    for comment, line, file_bytes_processed in utils.read_obj_zst_meta(input_file):
        subreddits[comment['subreddit']] += 1
        total_lines += 1
        if total_lines % 100000 != 0:
            continue
        # Every 100,000 objects: report progress as the share of the
        # file's bytes reported as processed by the reader.
        log.info(f"{total_lines:,} lines, {(file_bytes_processed / input_file_size) * 100:.0f}%")

    log.info(f"{total_lines:,} lines, 100%")

    # Descending by count; the sort is stable, so subreddits with equal
    # counts stay in first-seen order.
    ranked = sorted(subreddits.items(), key=lambda item: item[1], reverse=True)
    for subreddit, count in ranked:
        if count < 1:
            # Minimum-count threshold for the report (1 keeps everything).
            continue
        log.info(f"r/{subreddit}: {count:,}")

View file

@ -1,20 +0,0 @@
import utils
import discord_logging
from datetime import datetime
from collections import defaultdict
from urllib.parse import urlparse
log = discord_logging.init_logging()


if __name__ == "__main__":
    # Walk every submission in the dump, skip self posts, and bucket the
    # remaining URLs by their network location (host). Only the number of
    # link submissions is logged; `domains` is built but not read again in
    # this script.
    input_path = r"\\MYCLOUDPR4100\Public\guessmybf_submissions.zst"
    domains = defaultdict(list)
    lines = 0
    for submission in utils.read_obj_zst(input_path):
        if not submission['is_self']:
            domain = urlparse(submission['url']).netloc
            domains[domain].append(submission['url'])
            lines += 1

    log.info(f"{lines}")

View file

@ -1,15 +0,0 @@
import os
from collections import defaultdict
def sum_counts(lines):
    """Return the total of the counts in ``name<TAB>count`` lines.

    Args:
        lines: Iterable of text lines (for example an open file handle),
            each holding a name and an integer count separated by one tab.

    Returns:
        The sum of all counts as an int; 0 when there are no data lines.

    Raises:
        ValueError: If a non-blank line does not consist of exactly two
            tab-separated fields, or its count is not an integer.
    """
    total = 0
    for line in lines:
        stripped = line.strip()
        if not stripped:
            # A blank line carries no data; skipping it avoids an unpacking
            # error that would otherwise abort the whole walk.
            continue
        _name, count = stripped.split("\t")
        total += int(count)
    return total


if __name__ == "__main__":
    # Print, for every file under the input folder, the sum of its counts.
    input_folder = r"\\MYCLOUDPR4100\Public\pushshift_counts_summed"
    for subdir, dirs, files in os.walk(input_folder):
        for file_name in files:
            input_path = os.path.join(subdir, file_name)
            with open(input_path, 'r') as input_handle:
                items = sum_counts(input_handle)
            print(f"{file_name} {items}")

View file

@ -1,22 +0,0 @@
def load_counts(lines, min_count=10000):
    """Parse ``name<TAB>count`` lines, keeping entries above a threshold.

    Args:
        lines: Iterable of text lines (for example an open file handle),
            each holding a name and an integer count separated by one tab.
        min_count: Only entries whose count is strictly greater than this
            value are kept.

    Returns:
        A dict mapping name to count. If a name appears more than once, the
        last qualifying occurrence wins.

    Raises:
        ValueError: If a non-blank line does not consist of exactly two
            tab-separated fields, or its count is not an integer.
    """
    counts = {}
    for line in lines:
        stripped = line.strip()
        if not stripped:
            # Blank lines carry no data; skip them instead of failing on
            # the two-field unpacking below.
            continue
        name, count_string = stripped.split("\t")
        count = int(count_string)
        if count > min_count:
            counts[name] = count
    return counts


def top_names(counts, limit=20000):
    """Return up to ``limit`` names ordered by descending count.

    Args:
        counts: Mapping of name to count.
        limit: Maximum number of names to return; expected to be
            non-negative.

    Returns:
        A list of names, highest count first. Names with equal counts keep
        the mapping's iteration order because the sort is stable.
    """
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    return [name for name, _count in ranked[:limit]]


if __name__ == '__main__':
    # Read the counts file, report how many entries pass the threshold, and
    # write the highest-count names, one per line.
    input_file = r"\\MYCLOUDPR4100\Public\field_counts.txt"
    output_file = r"\\MYCLOUDPR4100\Public\field_counts_sorted.txt"
    with open(input_file, 'r') as input_handle:
        subreddits = load_counts(input_handle)
    print(f"{len(subreddits)}")
    with open(output_file, 'w') as output_handle:
        output_handle.writelines(f"{subreddit}\n" for subreddit in top_names(subreddits))

View file

@ -1,24 +0,0 @@
import os
from collections import defaultdict
def format_counts(counts):
    """Yield one ``name<TAB>count`` line per entry, highest count first.

    Args:
        counts: Mapping of name to integer count.

    Yields:
        Newline-terminated strings. The separator is a tab so each line can
        be parsed back with ``line.strip().split("\t")``. Entries with equal
        counts keep the mapping's iteration order because the sort is stable.
    """
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    for name, count in ranked:
        yield f"{name}\t{count}\n"


if __name__ == "__main__":
    # For every file under the input folder, count how often each distinct
    # (stripped) line occurs and write the totals to "<file_name>.txt" in
    # the output folder.
    input_folder = r"\\MYCLOUDPR4100\Public\pushshift_counts"
    output_folder = r"\\MYCLOUDPR4100\Public\pushshift_counts_summed"
    # Make sure the destination exists so the first write cannot fail on a
    # missing directory.
    os.makedirs(output_folder, exist_ok=True)
    # Running total of lines read across all files; used only for progress.
    lines = 0
    for subdir, dirs, files in os.walk(input_folder):
        for file_name in files:
            subreddits = defaultdict(int)
            input_path = os.path.join(subdir, file_name)
            output_path = os.path.join(output_folder, f"{file_name}.txt")
            print(f"{lines} : {input_path}")
            with open(input_path, 'r') as input_handle:
                for line in input_handle:
                    lines += 1
                    subreddits[line.strip()] += 1
                    if lines % 1000000 == 0:
                        print(f"{lines} : {input_path}")
            with open(output_path, 'w') as output_handle:
                output_handle.writelines(format_counts(subreddits))