Fix path name

This commit is contained in:
Watchful1 2025-07-23 19:31:31 -07:00
parent 143a40fc23
commit 5b95fc7e7d
2 changed files with 23 additions and 11 deletions

View file

@@ -326,8 +326,8 @@ if __name__ == '__main__':
input_lines += 1 input_lines += 1
monthly_counts[line.strip()] += 1 monthly_counts[line.strip()] += 1
file.monthly_count_file = os.path.join(args.monthly_count_folder, os.path.basename(file.output_path)) file.count_file_path = os.path.join(args.monthly_count_folder, os.path.basename(file.output_path))
with open(file.monthly_count_file, 'w') as output_handle: with open(file.count_file_path, 'w') as output_handle:
for field, count in sorted(monthly_counts.items(), key=lambda item: item[1], reverse=True): for field, count in sorted(monthly_counts.items(), key=lambda item: item[1], reverse=True):
output_handle.write(f"{field} {count}\n") output_handle.write(f"{field} {count}\n")
@@ -338,13 +338,13 @@ if __name__ == '__main__':
if stage == "agg": if stage == "agg":
field_counts = defaultdict(int) field_counts = defaultdict(int)
for file in input_files: for file in input_files:
with open(file.monthly_count_file, 'r') as input_handle: with open(file.count_file_path, 'r') as input_handle:
for line in input_handle: for line in input_handle:
try: try:
field, count = line.strip().split("\t") field, count = line.strip().split("\t")
field_counts[field] = count field_counts[field] = count
except Exception as err: except Exception as err:
log.info(f"Line failed in file {file.monthly_count_file}: {line}") log.info(f"Line failed in file {file.count_file_path}: {line}")
raise raise
sorted_counts = sorted(field_counts.items(), key=lambda item: item[1], reverse=True) sorted_counts = sorted(field_counts.items(), key=lambda item: item[1], reverse=True)

View file

@@ -5,17 +5,18 @@ import sys
import csv import csv
from datetime import datetime from datetime import datetime
import logging.handlers import logging.handlers
import traceback
# put the path to the input file, or a folder of files to process all of # put the path to the input file, or a folder of files to process all of
input_file = r"\\MYCLOUDPR4100\Public\reddit\subreddits23\wallstreetbets_submissions.zst" input_file = r"\\MYCLOUDPR4100\Public\wallstreetbets_comments.zst"
# put the name or path to the output file. The file extension from below will be added automatically. If the input file is a folder, the output will be treated as a folder as well # put the name or path to the output file. The file extension from below will be added automatically. If the input file is a folder, the output will be treated as a folder as well
output_file = r"\\MYCLOUDPR4100\Public\output" output_file = r"\\MYCLOUDPR4100\Public\wallstreetbets_comments"
# the format to output in, pick from the following options # the format to output in, pick from the following options
# zst: same as the input, a zstandard compressed ndjson file. Can be read by the other scripts in the repo # zst: same as the input, a zstandard compressed ndjson file. Can be read by the other scripts in the repo
# txt: an ndjson file, which is a text file with a separate json object on each line. Can be opened by any text editor # txt: an ndjson file, which is a text file with a separate json object on each line. Can be opened by any text editor
# csv: a comma separated value file. Can be opened by a text editor or excel # csv: a comma separated value file. Can be opened by a text editor or excel
# WARNING READ THIS: if you use txt or csv output on a large input file without filtering out most of the rows, the resulting file will be extremely large. Usually about 7 times as large as the compressed input file # WARNING READ THIS: if you use txt or csv output on a large input file without filtering out most of the rows, the resulting file will be extremely large. Usually about 7 times as large as the compressed input file
output_format = "csv" output_format = "zst"
# override the above format and output only this field into a text file, one per line. Useful if you want to make a list of authors or ids. See the examples below # override the above format and output only this field into a text file, one per line. Useful if you want to make a list of authors or ids. See the examples below
# any field that's in the dump is supported, but useful ones are # any field that's in the dump is supported, but useful ones are
# author: the username of the author # author: the username of the author
@@ -76,12 +77,15 @@ to_date = datetime.strptime("2030-12-31", "%Y-%m-%d")
# if you want only top level comments instead of all comments, you can set field to "parent_id" instead of "link_id" # if you want only top level comments instead of all comments, you can set field to "parent_id" instead of "link_id"
# change this to field = None if you don't want to filter by anything # change this to field = None if you don't want to filter by anything
field = "body" field = "selftext"
values = [''] values = ['']
# if you have a long list of values, you can put them in a file and put the filename here. If set this overrides the value list above # if you have a long list of values, you can put them in a file and put the filename here. If set this overrides the value list above
# if this list is very large, it could greatly slow down the process # if this list is very large, it could greatly slow down the process
values_file = None values_file = None
# if true, only match the full value, if false match the value anywhere in the text. Partial matches are much slower
exact_match = False exact_match = False
# if true, returns rows that do not match the condition
inverse = False
# sets up logging to the console as well as a file # sets up logging to the console as well as a file
@@ -209,7 +213,10 @@ def process_file(input_file, output_file, output_format, field, values, from_dat
continue continue
if field is not None: if field is not None:
field_value = obj[field].lower() field_value = obj[field]
if field_value is None:
continue
field_value = field_value.lower()
matched = False matched = False
for value in values: for value in values:
if exact_match: if exact_match:
@@ -220,9 +227,14 @@ def process_file(input_file, output_file, output_format, field, values, from_dat
if value in field_value: if value in field_value:
matched = True matched = True
break break
if inverse:
if matched:
continue
else:
if not matched: if not matched:
continue continue
matched_lines += 1 matched_lines += 1
if output_format == "zst": if output_format == "zst":
write_line_zst(handle, line) write_line_zst(handle, line)