mirror of
https://github.com/Watchful1/PushshiftDumps.git
synced 2025-07-02 02:16:41 -04:00
Reorganize
This commit is contained in:
parent
3700b21b81
commit
4f1d70d34a
19 changed files with 0 additions and 105 deletions
|
@ -1,24 +0,0 @@
|
|||
import utils
|
||||
import discord_logging
|
||||
import os
|
||||
from collections import defaultdict
|
||||
|
||||
# Module-level logger obtained from discord_logging.init_logging().
log = discord_logging.init_logging()


if __name__ == "__main__":
    # Tally how many objects in the dump carry each 'subreddit' value, then
    # log every subreddit with its total, largest first.
    dump_path = r"\\MYCLOUDPR4100\Public\pushshift_working\RC_2022-12.zst"
    dump_size = os.stat(dump_path).st_size
    per_subreddit = defaultdict(int)

    # `seen` stays 0 when the reader yields nothing, so the summary line
    # below is still well-defined.
    seen = 0
    for seen, (obj, _raw, bytes_read) in enumerate(utils.read_obj_zst_meta(dump_path), start=1):
        per_subreddit[obj['subreddit']] += 1
        if seen % 100000:
            continue
        # Progress is reported as a share of the on-disk file size.
        # NOTE(review): assumes the third yielded value is the number of file
        # bytes consumed so far - confirm against utils.read_obj_zst_meta.
        percent = (bytes_read / dump_size) * 100
        log.info(f"{seen:,} lines, {percent:.0f}%")

    log.info(f"{seen:,} lines, 100%")

    # Negating the count sorts descending while keeping first-seen order
    # among subreddits with equal totals.
    ranked = sorted(per_subreddit.items(), key=lambda pair: -pair[1])
    for name, hits in ranked:
        if hits < 1:
            continue
        log.info(f"r/{name}: {hits:,}")
|
|
@ -1,20 +0,0 @@
|
|||
import utils
|
||||
import discord_logging
|
||||
from datetime import datetime
|
||||
from collections import defaultdict
|
||||
from urllib.parse import urlparse
|
||||
|
||||
# Module-level logger obtained from discord_logging.init_logging().
log = discord_logging.init_logging()

if __name__ == "__main__":
    # Walk every submission in the dump, skip those flagged 'is_self', and
    # group the remaining URLs by their network location (domain).
    source_path = r"\\MYCLOUDPR4100\Public\guessmybf_submissions.zst"
    urls_by_domain = defaultdict(list)
    link_count = 0
    for post in utils.read_obj_zst(source_path):
        if not post['is_self']:
            link = post['url']
            urls_by_domain[urlparse(link).netloc].append(link)
            link_count += 1

    # Only the number of non-self submissions is reported; the per-domain
    # grouping is built but not emitted anywhere in this script.
    log.info(f"{link_count}")
|
|
@ -1,15 +0,0 @@
|
|||
import os
|
||||
from collections import defaultdict
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # For every file under the summed-counts folder, add up the count column
    # and print one "<file name> <total>" line per file.
    root = r"\\MYCLOUDPR4100\Public\pushshift_counts_summed"
    for folder, _subfolders, names in os.walk(root):
        for name in names:
            total = 0
            with open(os.path.join(folder, name), 'r') as handle:
                for row in handle:
                    # Each row must be exactly "<name>\t<count>"; anything
                    # else fails the two-way unpack with ValueError.
                    _subreddit, amount = row.strip().split("\t")
                    total += int(amount)
            print(f"{name} {total}")
|
|
@ -1,22 +0,0 @@
|
|||
|
||||
|
||||
if __name__ == '__main__':
    # Read tab-separated "<name>\t<count>" rows, keep the names whose count
    # exceeds 10000, and write the 20000 largest (names only) to a new file.
    source_path = r"\\MYCLOUDPR4100\Public\field_counts.txt"
    target_path = r"\\MYCLOUDPR4100\Public\field_counts_sorted.txt"

    kept = {}
    with open(source_path, 'r') as reader:
        for row in reader:
            key, raw_count = row.strip().split("\t")
            value = int(raw_count)
            if value <= 10000:
                continue
            kept[key] = value

    # Number of names that passed the threshold.
    print(f"{len(kept)}")

    with open(target_path, 'w') as writer:
        ordered = sorted(kept.items(), key=lambda pair: pair[1], reverse=True)
        # The slice caps the output at 20000 names, highest counts first.
        for key, _value in ordered[:20000]:
            writer.write(f"{key}\n")
|
|
@ -1,24 +0,0 @@
|
|||
import os
|
||||
from collections import defaultdict
|
||||
|
||||
|
||||
def sum_subreddit_counts(input_folder, output_folder):
    """Aggregate identical lines in every file under *input_folder*.

    For each file found by walking *input_folder*, count how often each
    stripped line occurs and write ``<line>\\t<count>`` rows, most frequent
    first, to ``<output_folder>/<file name>.txt``. Progress is printed when a
    file is opened and every 1,000,000 lines read (cumulative over all files).

    The separator is an explicit tab so the output can be parsed by readers
    that split each row on ``"\\t"``.

    Args:
        input_folder: Directory tree whose files hold one value per line.
        output_folder: Directory receiving one summary file per input file;
            created if it does not exist.

    Returns:
        The total number of lines read across all input files.
    """
    os.makedirs(output_folder, exist_ok=True)
    lines = 0
    for subdir, _dirs, files in os.walk(input_folder):
        for file_name in files:
            subreddits = defaultdict(int)
            input_path = os.path.join(subdir, file_name)
            # The output name is derived from the file name alone, so files
            # with the same name in different subfolders share one output
            # path and the later one overwrites the earlier.
            output_path = os.path.join(output_folder, f"{file_name}.txt")
            print(f"{lines} : {input_path}")
            with open(input_path, 'r') as input_handle:
                for line in input_handle:
                    lines += 1
                    subreddits[line.strip()] += 1
                    if lines % 1000000 == 0:
                        print(f"{lines} : {input_path}")

            with open(output_path, 'w') as output_handle:
                # The sort is stable, so ties keep first-seen order.
                for subreddit, count in sorted(subreddits.items(), key=lambda item: item[1], reverse=True):
                    output_handle.write(f"{subreddit}\t{count}\n")
    return lines


if __name__ == "__main__":
    sum_subreddit_counts(
        r"\\MYCLOUDPR4100\Public\pushshift_counts",
        r"\\MYCLOUDPR4100\Public\pushshift_counts_summed",
    )
|
Loading…
Add table
Add a link
Reference in a new issue