mirror of
https://github.com/Watchful1/PushshiftDumps.git
synced 2025-07-25 15:45:19 -04:00
Add a single_output flag
This commit is contained in:
parent
439ab0108e
commit
4e0d382bee
3 changed files with 27 additions and 7 deletions
|
@ -301,7 +301,14 @@ if __name__ == '__main__':
|
||||||
parser.add_argument("--value_list", help="A file of newline separated values to use. Overrides the value param if it is set", default=None)
|
parser.add_argument("--value_list", help="A file of newline separated values to use. Overrides the value param if it is set", default=None)
|
||||||
parser.add_argument("--processes", help="Number of processes to use", default=10, type=int)
|
parser.add_argument("--processes", help="Number of processes to use", default=10, type=int)
|
||||||
parser.add_argument("--file_filter", help="Regex filenames have to match to be processed", default="^RC_|^RS_")
|
parser.add_argument("--file_filter", help="Regex filenames have to match to be processed", default="^RC_|^RS_")
|
||||||
parser.add_argument("--split_intermediate", help="Split the intermediate files by the first letter of the matched field, use if the filter will result in a large number of separate files", action="store_true")
|
parser.add_argument(
|
||||||
|
"--split_intermediate",
|
||||||
|
help="Split the intermediate files by the first letter of the matched field, use if the filter will result in a large number of separate files",
|
||||||
|
action="store_true")
|
||||||
|
parser.add_argument(
|
||||||
|
"--single_output",
|
||||||
|
help="Output a single combined file instead of splitting by the search term",
|
||||||
|
action="store_true")
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--error_rate", help=
|
"--error_rate", help=
|
||||||
"Percentage as an integer from 0 to 100 of the lines where the field can be missing. For the subreddit field especially, "
|
"Percentage as an integer from 0 to 100 of the lines where the field can be missing. For the subreddit field especially, "
|
||||||
|
@ -329,8 +336,8 @@ if __name__ == '__main__':
|
||||||
else:
|
else:
|
||||||
log.info(f"Writing output to working folder")
|
log.info(f"Writing output to working folder")
|
||||||
|
|
||||||
if (args.partial or args.regex) and args.split_intermediate:
|
if (args.partial or args.regex or args.single_output) and args.split_intermediate:
|
||||||
log.info("The partial and regex flags are not compatible with the split_intermediate flag")
|
log.info("The partial, regex and single_output flags are not compatible with the split_intermediate flag")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
values = set()
|
values = set()
|
||||||
|
@ -368,6 +375,9 @@ if __name__ == '__main__':
|
||||||
else:
|
else:
|
||||||
log.info(f"Checking if any of {val_string} exactly match field {args.field}")
|
log.info(f"Checking if any of {val_string} exactly match field {args.field}")
|
||||||
|
|
||||||
|
if args.partial or args.regex or args.single_output:
|
||||||
|
log.info(f"Outputing to a single combined file")
|
||||||
|
|
||||||
multiprocessing.set_start_method('spawn')
|
multiprocessing.set_start_method('spawn')
|
||||||
queue = multiprocessing.Manager().Queue()
|
queue = multiprocessing.Manager().Queue()
|
||||||
status_json = os.path.join(args.working, "status.json")
|
status_json = os.path.join(args.working, "status.json")
|
||||||
|
@ -559,7 +569,7 @@ if __name__ == '__main__':
|
||||||
for line, file_bytes_processed in input_handle.yield_lines():
|
for line, file_bytes_processed in input_handle.yield_lines():
|
||||||
output_lines += 1
|
output_lines += 1
|
||||||
obj = json.loads(line)
|
obj = json.loads(line)
|
||||||
if args.partial or args.regex:
|
if args.partial or args.regex or args.single_output:
|
||||||
observed_case = "output"
|
observed_case = "output"
|
||||||
else:
|
else:
|
||||||
observed_case = obj[args.field]
|
observed_case = obj[args.field]
|
||||||
|
|
|
@ -15,8 +15,14 @@ import json
|
||||||
# the script will look for both comments and submissions files for each subreddit
|
# the script will look for both comments and submissions files for each subreddit
|
||||||
folder = r"\\MYCLOUDPR4100\Public\reddit\subreddits24"
|
folder = r"\\MYCLOUDPR4100\Public\reddit\subreddits24"
|
||||||
subreddits_string = """
|
subreddits_string = """
|
||||||
Truckers
|
askcarsales
|
||||||
SIBO
|
Denton
|
||||||
|
relationship_advice
|
||||||
|
Dallas
|
||||||
|
askdfw
|
||||||
|
AskMen
|
||||||
|
rolex
|
||||||
|
lego
|
||||||
"""
|
"""
|
||||||
ignored_users = {'[deleted]', 'automoderator'}
|
ignored_users = {'[deleted]', 'automoderator'}
|
||||||
# this is a list of users to ignore when doing the comparison. Most popular bots post in many subreddits and aren't the person you're looking for
|
# this is a list of users to ignore when doing the comparison. Most popular bots post in many subreddits and aren't the person you're looking for
|
||||||
|
|
|
@ -170,3 +170,7 @@ LazyLinkerBot
|
||||||
Darnit_Bot
|
Darnit_Bot
|
||||||
checks_out_bot
|
checks_out_bot
|
||||||
HippoBot9000
|
HippoBot9000
|
||||||
|
could-of-bot
|
||||||
|
mentionhelper
|
||||||
|
RossGellerBot
|
||||||
|
the_timezone_bot
|
Loading…
Add table
Add a link
Reference in a new issue