From 5b95fc7e7d6c53ec3da538c37fc37d8fc7b3fc06 Mon Sep 17 00:00:00 2001 From: Watchful1 Date: Wed, 23 Jul 2025 19:31:31 -0700 Subject: [PATCH] Fix path name --- .../count_subreddits_multiprocess.py | 8 +++--- scripts/filter_file.py | 26 ++++++++++++++----- 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/personal/diagnostic/count_subreddits_multiprocess.py b/personal/diagnostic/count_subreddits_multiprocess.py index 9c2ffd8..b24e5ef 100644 --- a/personal/diagnostic/count_subreddits_multiprocess.py +++ b/personal/diagnostic/count_subreddits_multiprocess.py @@ -326,8 +326,8 @@ if __name__ == '__main__': input_lines += 1 monthly_counts[line.strip()] += 1 - file.monthly_count_file = os.path.join(args.monthly_count_folder, os.path.basename(file.output_path)) - with open(file.monthly_count_file, 'w') as output_handle: + file.count_file_path = os.path.join(args.monthly_count_folder, os.path.basename(file.output_path)) + with open(file.count_file_path, 'w') as output_handle: for field, count in sorted(monthly_counts.items(), key=lambda item: item[1], reverse=True): output_handle.write(f"{field} {count}\n") @@ -338,13 +338,13 @@ if __name__ == '__main__': if stage == "agg": field_counts = defaultdict(int) for file in input_files: - with open(file.monthly_count_file, 'r') as input_handle: + with open(file.count_file_path, 'r') as input_handle: for line in input_handle: try: field, count = line.strip().split("\t") field_counts[field] = count except Exception as err: - log.info(f"Line failed in file {file.monthly_count_file}: {line}") + log.info(f"Line failed in file {file.count_file_path}: {line}") raise sorted_counts = sorted(field_counts.items(), key=lambda item: item[1], reverse=True) diff --git a/scripts/filter_file.py b/scripts/filter_file.py index a26763a..cde0e7b 100644 --- a/scripts/filter_file.py +++ b/scripts/filter_file.py @@ -5,17 +5,18 @@ import sys import csv from datetime import datetime import logging.handlers +import traceback # put the path to the input file, or a folder of files to process all of -input_file = r"\\MYCLOUDPR4100\Public\reddit\subreddits23\wallstreetbets_submissions.zst" +input_file = r"\\MYCLOUDPR4100\Public\wallstreetbets_comments.zst" # put the name or path to the output file. The file extension from below will be added automatically. If the input file is a folder, the output will be treated as a folder as well -output_file = r"\\MYCLOUDPR4100\Public\output" +output_file = r"\\MYCLOUDPR4100\Public\wallstreetbets_comments" # the format to output in, pick from the following options # zst: same as the input, a zstandard compressed ndjson file. Can be read by the other scripts in the repo # txt: an ndjson file, which is a text file with a separate json object on each line. Can be opened by any text editor # csv: a comma separated value file. Can be opened by a text editor or excel # WARNING READ THIS: if you use txt or csv output on a large input file without filtering out most of the rows, the resulting file will be extremely large. Usually about 7 times as large as the compressed input file -output_format = "csv" +output_format = "zst" # override the above format and output only this field into a text file, one per line. Useful if you want to make a list of authors or ids. See the examples below # any field that's in the dump is supported, but useful ones are # author: the username of the author @@ -76,12 +77,15 @@ to_date = datetime.strptime("2030-12-31", "%Y-%m-%d") # if you want only top level comments instead of all comments, you can set field to "parent_id" instead of "link_id" # change this to field = None if you don't want to filter by anything -field = "body" +field = "selftext" values = [''] # if you have a long list of values, you can put them in a file and put the filename here. If set this overrides the value list above # if this list is very large, it could greatly slow down the process values_file = None +# if true, only match the full value, if false match the value anywhere in the text. Partial matches are much slower exact_match = False +# if true, returns rows that do not match the condition +inverse = False # sets up logging to the console as well as a file @@ -209,7 +213,10 @@ def process_file(input_file, output_file, output_format, field, values, from_dat continue if field is not None: - field_value = obj[field].lower() + field_value = obj[field] + if field_value is None: + continue + field_value = field_value.lower() matched = False for value in values: if exact_match: @@ -220,8 +227,13 @@ def process_file(input_file, output_file, output_format, field, values, from_dat if value in field_value: matched = True break - if not matched: - continue + if inverse: + if matched: + continue + else: + if not matched: + continue + matched_lines += 1 if output_format == "zst":