diff --git a/personal/combine/merge.py b/personal/combine/merge.py index 0aa2016..f4a5b42 100644 --- a/personal/combine/merge.py +++ b/personal/combine/merge.py @@ -97,7 +97,7 @@ field_actions = { "retrieved_utc": FieldAction.SPECIAL, "rte_mode": FieldAction.OVERWRITE_NOT_NONE, "saved": FieldAction.SPECIAL_NO_OVERWRITE, - "score": FieldAction.OVERWRITE_NOT_NONE, + "score": FieldAction.SPECIAL, "score_hidden": FieldAction.OVERWRITE, "send_replies": FieldAction.OVERWRITE, "spam": FieldAction.DELETE, @@ -250,7 +250,7 @@ field_actions = { "retrieved_utc": FieldAction.SPECIAL, "rte_mode": FieldAction.OVERWRITE_NOT_NONE, "saved": FieldAction.SPECIAL_NO_OVERWRITE, - "score": FieldAction.OVERWRITE_NOT_NONE, + "score": FieldAction.SPECIAL, "secure_media": FieldAction.OVERWRITE_NOT_NONE, "secure_media_embed": FieldAction.OVERWRITE_NOT_NONE, "selftext": FieldAction.SPECIAL, @@ -345,6 +345,10 @@ def merge_fields(existing_obj, new_obj, obj_type): if 'previous_body' in existing_obj: existing_obj['previous_body'] = original_value existing_obj['body'] = new_value + elif key == "score": + if not is_empty(new_value): + if is_empty(original_value) or abs(new_value) > abs(original_value): + existing_obj['score'] = new_value elif key == "selftext": if not is_empty(new_value): if 'previous_selftext' not in existing_obj: @@ -393,7 +397,7 @@ def parse_fields(new_obj, obj_type): unmatched_field = True keys_to_delete.append(key) elif action == FieldAction.SPECIAL: - if key in ["retrieved_on", "body", "selftext", "updated_on"]: + if key in ["retrieved_on", "body", "selftext", "updated_on", "score"]: pass elif key == "removal_reason" and new_value in ["legal", None]: pass diff --git a/personal/diagnostic/comments_per_day.py b/personal/diagnostic/comments_per_day.py index e07eee5..854f839 100644 --- a/personal/diagnostic/comments_per_day.py +++ b/personal/diagnostic/comments_per_day.py @@ -8,7 +8,7 @@ log = discord_logging.init_logging() if __name__ == "__main__": day = None day_comments = 0 - 
for comment in utils.read_obj_zst(r"C:\Users\greg\Desktop\Drive\pushshift\haley0530\chatbots_submissions.zst"): + for comment in utils.read_obj_zst(r"\\MYCLOUDPR4100\Public\reddit\subreddits23\antiwork_comments.zst"): created_day = datetime.utcfromtimestamp(int(comment['created_utc'])).strftime("%y-%m-%d") if day is None: day = created_day diff --git a/personal/diagnostic/comments_per_day_with_score.py b/personal/diagnostic/comments_per_day_with_score.py new file mode 100644 index 0000000..600e6bf --- /dev/null +++ b/personal/diagnostic/comments_per_day_with_score.py @@ -0,0 +1,23 @@ +import utils +import discord_logging +from datetime import datetime + +log = discord_logging.init_logging() + + +if __name__ == "__main__": + day = None + day_comments, day_comments_with_score = 0, 0 + for comment in utils.read_obj_zst(r"\\MYCLOUDPR4100\Public\reddit\subreddits23\antiwork_comments.zst"): + created_day = datetime.utcfromtimestamp(int(comment['created_utc'])).strftime("%y-%m-%d") + if day is None: + day = created_day + if day != created_day: + log.info(f"{day} {day_comments} {day_comments_with_score} {(day_comments_with_score / day_comments) * 100:.2f}%") + day_comments, day_comments_with_score = 0, 0 + day = created_day + day_comments += 1 + if comment['score'] != 1: + day_comments_with_score += 1 + + log.info(f"{day} {day_comments} {day_comments_with_score} {(day_comments_with_score / day_comments) * 100:.2f}%") diff --git a/scripts/filter_file.py b/scripts/filter_file.py index f03d215..25bbcde 100644 --- a/scripts/filter_file.py +++ b/scripts/filter_file.py @@ -7,9 +7,9 @@ from datetime import datetime import logging.handlers # put the path to the input file, or a folder of files to process all of -input_file = r"\\MYCLOUDPR4100\Public\reddit\subreddits\CryptoCurrency_submissions.zst" +input_file = r"\\MYCLOUDPR4100\Public\askreddit_comments_23.zst" # put the name or path to the output file. The file extension from below will be added automatically. 
If the input file is a folder, the output will be treated as a folder as well -output_file = r"\\MYCLOUDPR4100\Public\output" +output_file = r"\\MYCLOUDPR4100\Public\askreddit_comments_hero" # the format to output in, pick from the following options # zst: same as the input, a zstandard compressed ndjson file. Can be read by the other scripts in the repo # txt: an ndjson file, which is a text file with a separate json object on each line. Can be opened by any text editor @@ -75,11 +75,11 @@ to_date = datetime.strptime("2025-12-31", "%Y-%m-%d") # run the script one last time and now you have a file called "filtered_comments.csv" that only has comments from your submissions above # if you want only top level comments instead of all comments, you can set field to "parent_id" instead of "link_id" -field = "title" +field = "body" values = [''] # if you have a long list of values, you can put them in a file and put the filename here. If set this overrides the value list above # if this list is very large, it could greatly slow down the process -values_file = None +values_file = r"\\MYCLOUDPR4100\Public\askreddit_submissions_ids.txt" exact_match = False