mirror of
https://github.com/Watchful1/PushshiftDumps.git
synced 2025-07-25 15:45:19 -04:00
Update merge to not overwrite scores
This commit is contained in:
parent
fe8fef722f
commit
ef186b7bd7
4 changed files with 35 additions and 8 deletions
|
@ -97,7 +97,7 @@ field_actions = {
|
||||||
"retrieved_utc": FieldAction.SPECIAL,
|
"retrieved_utc": FieldAction.SPECIAL,
|
||||||
"rte_mode": FieldAction.OVERWRITE_NOT_NONE,
|
"rte_mode": FieldAction.OVERWRITE_NOT_NONE,
|
||||||
"saved": FieldAction.SPECIAL_NO_OVERWRITE,
|
"saved": FieldAction.SPECIAL_NO_OVERWRITE,
|
||||||
"score": FieldAction.OVERWRITE_NOT_NONE,
|
"score": FieldAction.SPECIAL,
|
||||||
"score_hidden": FieldAction.OVERWRITE,
|
"score_hidden": FieldAction.OVERWRITE,
|
||||||
"send_replies": FieldAction.OVERWRITE,
|
"send_replies": FieldAction.OVERWRITE,
|
||||||
"spam": FieldAction.DELETE,
|
"spam": FieldAction.DELETE,
|
||||||
|
@ -250,7 +250,7 @@ field_actions = {
|
||||||
"retrieved_utc": FieldAction.SPECIAL,
|
"retrieved_utc": FieldAction.SPECIAL,
|
||||||
"rte_mode": FieldAction.OVERWRITE_NOT_NONE,
|
"rte_mode": FieldAction.OVERWRITE_NOT_NONE,
|
||||||
"saved": FieldAction.SPECIAL_NO_OVERWRITE,
|
"saved": FieldAction.SPECIAL_NO_OVERWRITE,
|
||||||
"score": FieldAction.OVERWRITE_NOT_NONE,
|
"score": FieldAction.SPECIAL,
|
||||||
"secure_media": FieldAction.OVERWRITE_NOT_NONE,
|
"secure_media": FieldAction.OVERWRITE_NOT_NONE,
|
||||||
"secure_media_embed": FieldAction.OVERWRITE_NOT_NONE,
|
"secure_media_embed": FieldAction.OVERWRITE_NOT_NONE,
|
||||||
"selftext": FieldAction.SPECIAL,
|
"selftext": FieldAction.SPECIAL,
|
||||||
|
@ -345,6 +345,10 @@ def merge_fields(existing_obj, new_obj, obj_type):
|
||||||
if 'previous_body' in existing_obj:
|
if 'previous_body' in existing_obj:
|
||||||
existing_obj['previous_body'] = original_value
|
existing_obj['previous_body'] = original_value
|
||||||
existing_obj['body'] = new_value
|
existing_obj['body'] = new_value
|
||||||
|
elif key == "score":
|
||||||
|
if not is_empty(new_value):
|
||||||
|
if is_empty(original_value) or abs(new_value) > abs(original_value):
|
||||||
|
existing_obj['score'] = new_value
|
||||||
elif key == "selftext":
|
elif key == "selftext":
|
||||||
if not is_empty(new_value):
|
if not is_empty(new_value):
|
||||||
if 'previous_selftext' not in existing_obj:
|
if 'previous_selftext' not in existing_obj:
|
||||||
|
@ -393,7 +397,7 @@ def parse_fields(new_obj, obj_type):
|
||||||
unmatched_field = True
|
unmatched_field = True
|
||||||
keys_to_delete.append(key)
|
keys_to_delete.append(key)
|
||||||
elif action == FieldAction.SPECIAL:
|
elif action == FieldAction.SPECIAL:
|
||||||
if key in ["retrieved_on", "body", "selftext", "updated_on"]:
|
if key in ["retrieved_on", "body", "selftext", "updated_on", "score"]:
|
||||||
pass
|
pass
|
||||||
elif key == "removal_reason" and new_value in ["legal", None]:
|
elif key == "removal_reason" and new_value in ["legal", None]:
|
||||||
pass
|
pass
|
||||||
|
|
|
@ -8,7 +8,7 @@ log = discord_logging.init_logging()
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
day = None
|
day = None
|
||||||
day_comments = 0
|
day_comments = 0
|
||||||
for comment in utils.read_obj_zst(r"C:\Users\greg\Desktop\Drive\pushshift\haley0530\chatbots_submissions.zst"):
|
for comment in utils.read_obj_zst(r"\\MYCLOUDPR4100\Public\reddit\subreddits23\antiwork_comments.zst"):
|
||||||
created_day = datetime.utcfromtimestamp(int(comment['created_utc'])).strftime("%y-%m-%d")
|
created_day = datetime.utcfromtimestamp(int(comment['created_utc'])).strftime("%y-%m-%d")
|
||||||
if day is None:
|
if day is None:
|
||||||
day = created_day
|
day = created_day
|
||||||
|
|
23
personal/diagnostic/comments_per_day_with_score.py
Normal file
23
personal/diagnostic/comments_per_day_with_score.py
Normal file
|
@ -0,0 +1,23 @@
|
||||||
|
import utils
|
||||||
|
import discord_logging
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
log = discord_logging.init_logging()
|
||||||
|
|
||||||
|
|
||||||
|
def format_day_summary(day, day_comments, day_comments_with_score):
    """Build the per-day log line.

    The line contains the day label, the number of comments seen for that day,
    the number of those whose score is not 1, and that share as a whole-number
    percentage (truncated toward zero).

    Args:
        day: Label of the day being summarized (formatted "%y-%m-%d" by the caller).
        day_comments: Total comments counted for the day.
        day_comments_with_score: Comments for the day whose score is not 1.

    Returns:
        The formatted summary string, e.g. "23-01-01 4 1 25%".
    """
    if day_comments:
        # Integer arithmetic keeps the percentage exact and avoids applying a
        # float precision spec to an int, which raises ValueError.
        percent = day_comments_with_score * 100 // day_comments
    else:
        # No comments counted: report 0% instead of dividing by zero.
        percent = 0
    return f"{day} {day_comments} {day_comments_with_score} {percent}%"


if __name__ == "__main__":
    # Walk the comment dump in order and log one summary line each time the
    # creation day changes.
    day = None
    day_comments, day_comments_with_score = 0, 0
    for comment in utils.read_obj_zst(r"\\MYCLOUDPR4100\Public\reddit\subreddits23\antiwork_comments.zst"):
        created_day = datetime.utcfromtimestamp(int(comment['created_utc'])).strftime("%y-%m-%d")
        if day is None:
            day = created_day
        if day != created_day:
            # Day rolled over: report the finished day and reset the counters.
            log.info(format_day_summary(day, day_comments, day_comments_with_score))
            day_comments, day_comments_with_score = 0, 0
            day = created_day
        day_comments += 1
        # A score other than 1 is counted as "has a score".
        if comment['score'] != 1:
            day_comments_with_score += 1

    # Report the last day; skip it when the input produced no comments at all.
    if day is not None:
        log.info(format_day_summary(day, day_comments, day_comments_with_score))
|
|
@ -7,9 +7,9 @@ from datetime import datetime
|
||||||
import logging.handlers
|
import logging.handlers
|
||||||
|
|
||||||
# put the path to the input file, or a folder of files to process all of
|
# put the path to the input file, or a folder of files to process all of
|
||||||
input_file = r"\\MYCLOUDPR4100\Public\reddit\subreddits\CryptoCurrency_submissions.zst"
|
input_file = r"\\MYCLOUDPR4100\Public\askreddit_comments_23.zst"
|
||||||
# put the name or path to the output file. The file extension from below will be added automatically. If the input file is a folder, the output will be treated as a folder as well
|
# put the name or path to the output file. The file extension from below will be added automatically. If the input file is a folder, the output will be treated as a folder as well
|
||||||
output_file = r"\\MYCLOUDPR4100\Public\output"
|
output_file = r"\\MYCLOUDPR4100\Public\askreddit_comments_hero"
|
||||||
# the format to output in, pick from the following options
|
# the format to output in, pick from the following options
|
||||||
# zst: same as the input, a zstandard compressed ndjson file. Can be read by the other scripts in the repo
|
# zst: same as the input, a zstandard compressed ndjson file. Can be read by the other scripts in the repo
|
||||||
# txt: an ndjson file, which is a text file with a separate json object on each line. Can be opened by any text editor
|
# txt: an ndjson file, which is a text file with a separate json object on each line. Can be opened by any text editor
|
||||||
|
@ -75,11 +75,11 @@ to_date = datetime.strptime("2025-12-31", "%Y-%m-%d")
|
||||||
# run the script one last time and now you have a file called "filtered_comments.csv" that only has comments from your submissions above
|
# run the script one last time and now you have a file called "filtered_comments.csv" that only has comments from your submissions above
|
||||||
# if you want only top level comments instead of all comments, you can set field to "parent_id" instead of "link_id"
|
# if you want only top level comments instead of all comments, you can set field to "parent_id" instead of "link_id"
|
||||||
|
|
||||||
field = "title"
|
field = "body"
|
||||||
values = ['']
|
values = ['']
|
||||||
# if you have a long list of values, you can put them in a file and put the filename here. If set this overrides the value list above
|
# if you have a long list of values, you can put them in a file and put the filename here. If set this overrides the value list above
|
||||||
# if this list is very large, it could greatly slow down the process
|
# if this list is very large, it could greatly slow down the process
|
||||||
values_file = None
|
values_file = r"\\MYCLOUDPR4100\Public\askreddit_submissions_ids.txt"
|
||||||
exact_match = False
|
exact_match = False
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue