mirror of
https://github.com/Watchful1/PushshiftDumps.git
synced 2025-07-24 15:15:24 -04:00
Add more logging to filter file
This commit is contained in:
parent
4a50ca6605
commit
4110374fe8
1 changed file with 18 additions and 4 deletions
|
@ -7,7 +7,7 @@ from datetime import datetime
|
||||||
import logging.handlers
|
import logging.handlers
|
||||||
|
|
||||||
# put the path to the input file
|
# put the path to the input file
|
||||||
input_file = r"\\MYCLOUDPR4100\Public\reddit\subreddits\redditdev_comments.zst"
|
input_file = r"\\MYCLOUDPR4100\Public\reddit\submissions\RS_2023-02.zst"
|
||||||
# put the name or path to the output file. The file extension from below will be added automatically
|
# put the name or path to the output file. The file extension from below will be added automatically
|
||||||
output_file = r"\\MYCLOUDPR4100\Public\output"
|
output_file = r"\\MYCLOUDPR4100\Public\output"
|
||||||
# the format to output in, pick from the following options
|
# the format to output in, pick from the following options
|
||||||
|
@ -75,8 +75,8 @@ to_date = datetime.strptime("2025-01-01", "%Y-%m-%d")
|
||||||
# run the script one last time and now you have a file called "filtered_comments.csv" that only has comments from your submissions above
|
# run the script one last time and now you have a file called "filtered_comments.csv" that only has comments from your submissions above
|
||||||
# if you want only top level comments instead of all comments, you can set field to "parent_id" instead of "link_id"
|
# if you want only top level comments instead of all comments, you can set field to "parent_id" instead of "link_id"
|
||||||
|
|
||||||
field = "author"
|
field = "subreddit"
|
||||||
values = ["watchful1","spez"]
|
values = ['vim','google']
|
||||||
# if you have a long list of values, you can put them in a file and put the filename here. If set this overrides the value list above
|
# if you have a long list of values, you can put them in a file and put the filename here. If set this overrides the value list above
|
||||||
# if this list is very large, it could greatly slow down the process
|
# if this list is very large, it could greatly slow down the process
|
||||||
values_file = None
|
values_file = None
|
||||||
|
@ -189,16 +189,28 @@ if __name__ == "__main__":
|
||||||
else:
|
else:
|
||||||
log.error(f"Unsupported output format {output_format}")
|
log.error(f"Unsupported output format {output_format}")
|
||||||
sys.exit()
|
sys.exit()
|
||||||
|
log.info(f"Input file is: {input_file}")
|
||||||
|
log.info(f"Output file is: {output_path}")
|
||||||
|
|
||||||
if values_file is not None:
|
if values_file is not None:
|
||||||
values = []
|
values = []
|
||||||
with open(values_file, 'r') as values_handle:
|
with open(values_file, 'r') as values_handle:
|
||||||
for value in values_handle:
|
for value in values_handle:
|
||||||
values.append(value.strip().lower())
|
values.append(value.strip().lower())
|
||||||
log.info(f"Loaded {len(values)} from values file")
|
log.info(f"Loaded {len(values)} from values file {values_file}")
|
||||||
else:
|
else:
|
||||||
values = [value.lower() for value in values] # convert to lowercase
|
values = [value.lower() for value in values] # convert to lowercase
|
||||||
|
|
||||||
|
log.info(f"Filtering field: {field}")
|
||||||
|
if len(values) <= 20:
|
||||||
|
log.info(f"On values: {','.join(values)}")
|
||||||
|
else:
|
||||||
|
log.info(f"On values:")
|
||||||
|
for value in values:
|
||||||
|
log.info(value)
|
||||||
|
log.info(f"Exact match {('on' if exact_match else 'off')}. Single field {single_field}. Is submission {is_submission}")
|
||||||
|
log.info(f"From date {from_date.strftime('%Y-%m-%d')} to date {to_date.strftime('%Y-%m-%d')}")
|
||||||
|
|
||||||
file_size = os.stat(input_file).st_size
|
file_size = os.stat(input_file).st_size
|
||||||
file_bytes_processed = 0
|
file_bytes_processed = 0
|
||||||
created = None
|
created = None
|
||||||
|
@ -243,6 +255,8 @@ if __name__ == "__main__":
|
||||||
write_line_single(handle, obj, single_field)
|
write_line_single(handle, obj, single_field)
|
||||||
else:
|
else:
|
||||||
write_line_json(handle, obj)
|
write_line_json(handle, obj)
|
||||||
|
else:
|
||||||
|
log.info(f"Something went wrong, invalid output format {output_format}")
|
||||||
except (KeyError, json.JSONDecodeError) as err:
|
except (KeyError, json.JSONDecodeError) as err:
|
||||||
bad_lines += 1
|
bad_lines += 1
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue