Fix path name

This commit is contained in:
Watchful1 2025-07-23 19:31:31 -07:00
parent 143a40fc23
commit 5b95fc7e7d
2 changed files with 23 additions and 11 deletions

View file

@@ -326,8 +326,8 @@ if __name__ == '__main__':
input_lines += 1
monthly_counts[line.strip()] += 1
file.monthly_count_file = os.path.join(args.monthly_count_folder, os.path.basename(file.output_path))
with open(file.monthly_count_file, 'w') as output_handle:
file.count_file_path = os.path.join(args.monthly_count_folder, os.path.basename(file.output_path))
with open(file.count_file_path, 'w') as output_handle:
for field, count in sorted(monthly_counts.items(), key=lambda item: item[1], reverse=True):
output_handle.write(f"{field} {count}\n")
@@ -338,13 +338,13 @@ if __name__ == '__main__':
if stage == "agg":
field_counts = defaultdict(int)
for file in input_files:
with open(file.monthly_count_file, 'r') as input_handle:
with open(file.count_file_path, 'r') as input_handle:
for line in input_handle:
try:
field, count = line.strip().split("\t")
field_counts[field] = count
except Exception as err:
log.info(f"Line failed in file {file.monthly_count_file}: {line}")
log.info(f"Line failed in file {file.count_file_path}: {line}")
raise
sorted_counts = sorted(field_counts.items(), key=lambda item: item[1], reverse=True)

View file

@@ -5,17 +5,18 @@ import sys
import csv
from datetime import datetime
import logging.handlers
import traceback
# put the path to the input file, or a folder of files to process all of
input_file = r"\\MYCLOUDPR4100\Public\reddit\subreddits23\wallstreetbets_submissions.zst"
input_file = r"\\MYCLOUDPR4100\Public\wallstreetbets_comments.zst"
# put the name or path to the output file. The file extension from below will be added automatically. If the input file is a folder, the output will be treated as a folder as well
output_file = r"\\MYCLOUDPR4100\Public\output"
output_file = r"\\MYCLOUDPR4100\Public\wallstreetbets_comments"
# the format to output in, pick from the following options
# zst: same as the input, a zstandard compressed ndjson file. Can be read by the other scripts in the repo
# txt: an ndjson file, which is a text file with a separate json object on each line. Can be opened by any text editor
# csv: a comma separated value file. Can be opened by a text editor or excel
# WARNING READ THIS: if you use txt or csv output on a large input file without filtering out most of the rows, the resulting file will be extremely large. Usually about 7 times as large as the compressed input file
output_format = "csv"
output_format = "zst"
# override the above format and output only this field into a text file, one per line. Useful if you want to make a list of authors or ids. See the examples below
# any field that's in the dump is supported, but useful ones are
# author: the username of the author
@@ -76,12 +76,15 @@ to_date = datetime.strptime("2030-12-31", "%Y-%m-%d")
# if you want only top level comments instead of all comments, you can set field to "parent_id" instead of "link_id"
# change this to field = None if you don't want to filter by anything
field = "body"
field = "selftext"
values = ['']
# if you have a long list of values, you can put them in a file and put the filename here. If set this overrides the value list above
# if this list is very large, it could greatly slow down the process
values_file = None
# if true, only match the full value, if false match the value anywhere in the text. Partial matches are much slower
exact_match = False
# if true, returns rows that do not match the condition
inverse = False
# sets up logging to the console as well as a file
@@ -209,7 +213,10 @@ def process_file(input_file, output_file, output_format, field, values, from_dat
continue
if field is not None:
field_value = obj[field].lower()
field_value = obj[field]
if field_value is None:
continue
field_value = field_value.lower()
matched = False
for value in values:
if exact_match:
@@ -220,9 +227,14 @@ def process_file(input_file, output_file, output_format, field, values, from_dat
if value in field_value:
matched = True
break
if inverse:
if matched:
continue
else:
if not matched:
continue
matched_lines += 1
if output_format == "zst":
write_line_zst(handle, line)