Add a single_output flag

Watchful1 2025-04-07 21:43:31 -07:00
parent 439ab0108e
commit 4e0d382bee
3 changed files with 27 additions and 7 deletions


@@ -301,7 +301,14 @@ if __name__ == '__main__':
parser.add_argument("--value_list", help="A file of newline separated values to use. Overrides the value param if it is set", default=None)
parser.add_argument("--processes", help="Number of processes to use", default=10, type=int)
parser.add_argument("--file_filter", help="Regex filenames have to match to be processed", default="^RC_|^RS_")
parser.add_argument("--split_intermediate", help="Split the intermediate files by the first letter of the matched field, use if the filter will result in a large number of separate files", action="store_true")
parser.add_argument(
"--split_intermediate",
help="Split the intermediate files by the first letter of the matched field, use if the filter will result in a large number of separate files",
action="store_true")
parser.add_argument(
"--single_output",
help="Output a single combined file instead of splitting by the search term",
action="store_true")
parser.add_argument(
"--error_rate", help=
"Percentage as an integer from 0 to 100 of the lines where the field can be missing. For the subreddit field especially, "
@@ -329,8 +336,8 @@ if __name__ == '__main__':
else:
log.info(f"Writing output to working folder")
if (args.partial or args.regex) and args.split_intermediate:
log.info("The partial and regex flags are not compatible with the split_intermediate flag")
if (args.partial or args.regex or args.single_output) and args.split_intermediate:
log.info("The partial, regex and single_output flags are not compatible with the split_intermediate flag")
sys.exit(1)
values = set()
@@ -368,6 +375,9 @@ if __name__ == '__main__':
else:
log.info(f"Checking if any of {val_string} exactly match field {args.field}")
if args.partial or args.regex or args.single_output:
log.info(f"Outputing to a single combined file")
multiprocessing.set_start_method('spawn')
queue = multiprocessing.Manager().Queue()
status_json = os.path.join(args.working, "status.json")
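
As context for the spawn start method and Manager queue set up above, a minimal sketch of that pattern; the worker function and its payload are illustrative assumptions, not the script's actual worker:

    import multiprocessing

    def worker(file_name, queue):
        # illustrative worker: report completion back to the parent via the shared queue
        queue.put((file_name, "done"))

    if __name__ == "__main__":
        # spawn re-imports this module in each child, so the worker must be defined at top level
        multiprocessing.set_start_method("spawn")
        queue = multiprocessing.Manager().Queue()
        with multiprocessing.Pool(2) as pool:
            pool.starmap(worker, [("RC_2024-01.zst", queue), ("RS_2024-01.zst", queue)])
        while not queue.empty():
            print(queue.get())
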
@@ -559,7 +569,7 @@ if __name__ == '__main__':
for line, file_bytes_processed in input_handle.yield_lines():
output_lines += 1
obj = json.loads(line)
if args.partial or args.regex:
if args.partial or args.regex or args.single_output:
observed_case = "output"
else:
observed_case = obj[args.field]
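
The effect of the change above: with single_output (like partial and regex) every line is keyed under the literal "output" instead of the matched field value, so everything lands in one combined file. A minimal sketch under that assumption, with a plain dict standing in for the script's per-key output files:

    import json

    def route_line(line, field, single_output, buckets):
        # buckets maps an output key to the lines destined for that output file
        obj = json.loads(line)
        key = "output" if single_output else obj[field]
        buckets.setdefault(key, []).append(line)

    buckets = {}
    for line in ['{"subreddit": "askcarsales"}', '{"subreddit": "rolex"}']:
        route_line(line, "subreddit", single_output=True, buckets=buckets)
    print(list(buckets))  # ['output'] - one combined bucket instead of one per subreddit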


@@ -15,8 +15,14 @@ import json
# the script will look for both comments and submissions files for each subreddit
folder = r"\\MYCLOUDPR4100\Public\reddit\subreddits24"
subreddits_string = """
Truckers
SIBO
askcarsales
Denton
relationship_advice
Dallas
askdfw
AskMen
rolex
lego
"""
ignored_users = {'[deleted]', 'automoderator'}
# this is a list of users to ignore when doing the comparison. Most popular bots post in many subreddits and aren't the person you're looking for
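
Per the comment above that the script looks for both comments and submissions files for each subreddit, a minimal sketch of how the file paths might be derived from this configuration; the _comments/_submissions naming is an assumption, not shown in this diff:

    import os

    folder = r"\\MYCLOUDPR4100\Public\reddit\subreddits24"
    subreddits_string = """
    Truckers
    rolex
    """

    for subreddit in (s.strip() for s in subreddits_string.splitlines() if s.strip()):
        # assumed naming convention for the per-subreddit dump files
        comments_file = os.path.join(folder, f"{subreddit}_comments.zst")
        submissions_file = os.path.join(folder, f"{subreddit}_submissions.zst")
        print(comments_file, submissions_file)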


@@ -169,4 +169,8 @@ Link_Demobilizer
LazyLinkerBot
Darnit_Bot
checks_out_bot
HippoBot9000
HippoBot9000
could-of-bot
mentionhelper
RossGellerBot
the_timezone_bot