Fix path name

This commit is contained in:
Watchful1 2025-07-23 19:31:31 -07:00
parent 143a40fc23
commit 5b95fc7e7d
2 changed files with 23 additions and 11 deletions

View file

@@ -326,8 +326,8 @@ if __name__ == '__main__':
input_lines += 1
monthly_counts[line.strip()] += 1
file.monthly_count_file = os.path.join(args.monthly_count_folder, os.path.basename(file.output_path))
with open(file.monthly_count_file, 'w') as output_handle:
file.count_file_path = os.path.join(args.monthly_count_folder, os.path.basename(file.output_path))
with open(file.count_file_path, 'w') as output_handle:
for field, count in sorted(monthly_counts.items(), key=lambda item: item[1], reverse=True):
output_handle.write(f"{field} {count}\n")
@@ -338,13 +338,13 @@ if __name__ == '__main__':
if stage == "agg":
field_counts = defaultdict(int)
for file in input_files:
with open(file.monthly_count_file, 'r') as input_handle:
with open(file.count_file_path, 'r') as input_handle:
for line in input_handle:
try:
field, count = line.strip().split("\t")
field_counts[field] = count
except Exception as err:
log.info(f"Line failed in file {file.monthly_count_file}: {line}")
log.info(f"Line failed in file {file.count_file_path}: {line}")
raise
sorted_counts = sorted(field_counts.items(), key=lambda item: item[1], reverse=True)

View file

@@ -5,17 +5,18 @@ import sys
import csv
from datetime import datetime
import logging.handlers
import traceback
# put the path to the input file, or a folder of files to process all of
input_file = r"\\MYCLOUDPR4100\Public\reddit\subreddits23\wallstreetbets_submissions.zst"
input_file = r"\\MYCLOUDPR4100\Public\wallstreetbets_comments.zst"
# put the name or path to the output file. The file extension from below will be added automatically. If the input file is a folder, the output will be treated as a folder as well
output_file = r"\\MYCLOUDPR4100\Public\output"
output_file = r"\\MYCLOUDPR4100\Public\wallstreetbets_comments"
# the format to output in, pick from the following options
# zst: same as the input, a zstandard compressed ndjson file. Can be read by the other scripts in the repo
# txt: an ndjson file, which is a text file with a separate json object on each line. Can be opened by any text editor
# csv: a comma separated value file. Can be opened by a text editor or excel
# WARNING READ THIS: if you use txt or csv output on a large input file without filtering out most of the rows, the resulting file will be extremely large. Usually about 7 times as large as the compressed input file
output_format = "csv"
output_format = "zst"
# override the above format and output only this field into a text file, one per line. Useful if you want to make a list of authors or ids. See the examples below
# any field that's in the dump is supported, but useful ones are
# author: the username of the author
@@ -76,12 +76,15 @@ to_date = datetime.strptime("2030-12-31", "%Y-%m-%d")
# if you want only top level comments instead of all comments, you can set field to "parent_id" instead of "link_id"
# change this to field = None if you don't want to filter by anything
field = "body"
field = "selftext"
values = ['']
# if you have a long list of values, you can put them in a file and put the filename here. If set this overrides the value list above
# if this list is very large, it could greatly slow down the process
values_file = None
# if true, only match the full value, if false match the value anywhere in the text. Partial matches are much slower
exact_match = False
# if true, returns rows that do not match the condition
inverse = False
# sets up logging to the console as well as a file
@@ -209,7 +213,10 @@ def process_file(input_file, output_file, output_format, field, values, from_dat
continue
if field is not None:
field_value = obj[field].lower()
field_value = obj[field]
if field_value is None:
continue
field_value = field_value.lower()
matched = False
for value in values:
if exact_match:
@@ -220,9 +227,14 @@ def process_file(input_file, output_file, output_format, field, values, from_dat
if value in field_value:
matched = True
break
if inverse:
if matched:
continue
else:
if not matched:
continue
matched_lines += 1
if output_format == "zst":
write_line_zst(handle, line)