diff --git a/scripts/combine_folder_multiprocess.py b/scripts/combine_folder_multiprocess.py index c5695c7..26d57b2 100644 --- a/scripts/combine_folder_multiprocess.py +++ b/scripts/combine_folder_multiprocess.py @@ -301,7 +301,14 @@ if __name__ == '__main__': parser.add_argument("--value_list", help="A file of newline separated values to use. Overrides the value param if it is set", default=None) parser.add_argument("--processes", help="Number of processes to use", default=10, type=int) parser.add_argument("--file_filter", help="Regex filenames have to match to be processed", default="^RC_|^RS_") - parser.add_argument("--split_intermediate", help="Split the intermediate files by the first letter of the matched field, use if the filter will result in a large number of separate files", action="store_true") + parser.add_argument( + "--split_intermediate", + help="Split the intermediate files by the first letter of the matched field, use if the filter will result in a large number of separate files", + action="store_true") + parser.add_argument( + "--single_output", + help="Output a single combined file instead of splitting by the search term", + action="store_true") parser.add_argument( "--error_rate", help= "Percentage as an integer from 0 to 100 of the lines where the field can be missing. 
For the subreddit field especially, " @@ -329,8 +336,8 @@ if __name__ == '__main__': else: log.info(f"Writing output to working folder") - if (args.partial or args.regex) and args.split_intermediate: - log.info("The partial and regex flags are not compatible with the split_intermediate flag") + if (args.partial or args.regex or args.single_output) and args.split_intermediate: + log.error("The partial, regex and single_output flags are not compatible with the split_intermediate flag") sys.exit(1) values = set() @@ -368,6 +375,9 @@ if __name__ == '__main__': else: log.info(f"Checking if any of {val_string} exactly match field {args.field}") + if args.partial or args.regex or args.single_output: + log.info("Outputting to a single combined file") + multiprocessing.set_start_method('spawn') queue = multiprocessing.Manager().Queue() status_json = os.path.join(args.working, "status.json") @@ -559,7 +569,7 @@ if __name__ == '__main__': for line, file_bytes_processed in input_handle.yield_lines(): output_lines += 1 obj = json.loads(line) - if args.partial or args.regex: + if args.partial or args.regex or args.single_output: observed_case = "output" else: observed_case = obj[args.field] diff --git a/scripts/find_overlapping_users.py b/scripts/find_overlapping_users.py index 14f6b48..6240740 100644 --- a/scripts/find_overlapping_users.py +++ b/scripts/find_overlapping_users.py @@ -15,8 +15,14 @@ import json # the script will look for both comments and submissions files for each subreddit folder = r"\\MYCLOUDPR4100\Public\reddit\subreddits24" subreddits_string = """ - Truckers - SIBO + askcarsales + Denton + relationship_advice + Dallas + askdfw + AskMen + rolex + lego """ ignored_users = {'[deleted]', 'automoderator'} # this is a list of users to ignore when doing the comparison. 
Most popular bots post in many subreddits and aren't the person you're looking for diff --git a/scripts/ignored.txt b/scripts/ignored.txt index c58971a..ce1cbfc 100644 --- a/scripts/ignored.txt +++ b/scripts/ignored.txt @@ -169,4 +169,8 @@ Link_Demobilizer LazyLinkerBot Darnit_Bot checks_out_bot -HippoBot9000 \ No newline at end of file +HippoBot9000 +could-of-bot +mentionhelper +RossGellerBot +the_timezone_bot \ No newline at end of file