mirror of
https://github.com/Watchful1/PushshiftDumps.git
synced 2025-07-03 19:06:39 -04:00
549 lines
20 KiB
Python
549 lines
20 KiB
Python
import re
|
|
import sys
|
|
from enum import Enum
|
|
import discord_logging
|
|
import zstandard
|
|
import json
|
|
from datetime import datetime
|
|
import requests
|
|
import time
|
|
|
|
import counters
|
|
|
|
log = discord_logging.get_logger(init=True)
|
|
|
|
|
|
def parse_ingest_string(ingest_string):
|
|
ingest_ids = []
|
|
for char in ingest_string:
|
|
ingest_ids.append(char)
|
|
return ingest_ids
|
|
|
|
|
|
def read_obj_zst(file_name):
|
|
with open(file_name, 'rb') as file_handle:
|
|
buffer = ''
|
|
reader = zstandard.ZstdDecompressor(max_window_size=2**31).stream_reader(file_handle)
|
|
while True:
|
|
chunk = read_and_decode(reader, 2**27, (2**29) * 2)
|
|
|
|
if not chunk:
|
|
break
|
|
lines = (buffer + chunk).split("\n")
|
|
|
|
for line in lines[:-1]:
|
|
if line == "":
|
|
continue
|
|
yield json.loads(line.strip())
|
|
|
|
buffer = lines[-1]
|
|
|
|
reader.close()
|
|
|
|
|
|
def read_and_decode(reader, chunk_size, max_window_size, previous_chunk=None, bytes_read=0):
|
|
chunk = reader.read(chunk_size)
|
|
bytes_read += chunk_size
|
|
if previous_chunk is not None:
|
|
chunk = previous_chunk + chunk
|
|
try:
|
|
return chunk.decode()
|
|
except UnicodeDecodeError:
|
|
if bytes_read > max_window_size:
|
|
raise UnicodeError(f"Unable to decode frame after reading {bytes_read:,} bytes")
|
|
return read_and_decode(reader, chunk_size, max_window_size, chunk, bytes_read)
|
|
|
|
|
|
def base36encode(integer: int) -> str:
|
|
chars = '0123456789abcdefghijklmnopqrstuvwxyz'
|
|
sign = '-' if integer < 0 else ''
|
|
integer = abs(integer)
|
|
result = ''
|
|
while integer > 0:
|
|
integer, remainder = divmod(integer, 36)
|
|
result = chars[remainder] + result
|
|
return sign + result
|
|
|
|
|
|
def base36decode(base36: str) -> int:
|
|
return int(base36, 36)
|
|
|
|
|
|
def next_string_id(string_id):
|
|
return base36encode(base36decode(string_id) + 1)
|
|
|
|
|
|
def get_next_hundred_ids(start_id):
|
|
start_num = base36decode(start_id)
|
|
ids = []
|
|
id_num = -1
|
|
for id_num in range(start_num, start_num + 100):
|
|
ids.append(base36encode(id_num))
|
|
return ids, base36encode(id_num)
|
|
|
|
|
|
class FieldAction(Enum):
|
|
OVERWRITE = 1
|
|
OVERWRITE_NOT_NONE = 2
|
|
OVERWRITE_IF_NONE = 3
|
|
DONT_OVERWRITE = 4
|
|
DELETE = 5
|
|
SPECIAL = 6
|
|
SPECIAL_NO_OVERWRITE = 7
|
|
ALLOW = 8
|
|
ALLOW_EMPTY = 9
|
|
|
|
|
|
class ObjectType(Enum):
|
|
COMMENT = 1
|
|
SUBMISSION = 2
|
|
|
|
|
|
field_actions = {
|
|
ObjectType.COMMENT: {
|
|
"all_awardings": FieldAction.OVERWRITE_NOT_NONE,
|
|
"approved": FieldAction.DELETE,
|
|
"approved_at_utc": FieldAction.SPECIAL_NO_OVERWRITE,
|
|
"approved_by": FieldAction.SPECIAL_NO_OVERWRITE,
|
|
"archived": FieldAction.OVERWRITE,
|
|
"associated_award": FieldAction.ALLOW_EMPTY,
|
|
"author": FieldAction.OVERWRITE_IF_NONE,
|
|
"author_cakeday": FieldAction.DONT_OVERWRITE,
|
|
"author_flair_background_color": FieldAction.OVERWRITE_IF_NONE,
|
|
"author_flair_css_class": FieldAction.OVERWRITE_IF_NONE,
|
|
"author_flair_richtext": FieldAction.OVERWRITE_IF_NONE,
|
|
"author_flair_template_id": FieldAction.OVERWRITE_IF_NONE,
|
|
"author_flair_text": FieldAction.OVERWRITE_IF_NONE,
|
|
"author_flair_text_color": FieldAction.OVERWRITE_IF_NONE,
|
|
"author_flair_type": FieldAction.OVERWRITE_IF_NONE,
|
|
"author_fullname": FieldAction.OVERWRITE_IF_NONE,
|
|
"author_is_blocked": FieldAction.SPECIAL_NO_OVERWRITE,
|
|
"author_patreon_flair": FieldAction.OVERWRITE,
|
|
"author_premium": FieldAction.OVERWRITE,
|
|
"awarders": FieldAction.OVERWRITE_IF_NONE,
|
|
"ban_note": FieldAction.DELETE,
|
|
"banned_at_utc": FieldAction.SPECIAL_NO_OVERWRITE,
|
|
"banned_by": FieldAction.SPECIAL_NO_OVERWRITE,
|
|
"body": FieldAction.SPECIAL,
|
|
"body_html": FieldAction.DELETE,
|
|
"body_sha1": FieldAction.OVERWRITE_NOT_NONE,
|
|
"can_gild": FieldAction.OVERWRITE,
|
|
"can_mod_post": FieldAction.SPECIAL_NO_OVERWRITE,
|
|
"collapsed": FieldAction.OVERWRITE,
|
|
"collapsed_because_crowd_control": FieldAction.ALLOW_EMPTY,
|
|
"collapsed_reason": FieldAction.OVERWRITE,
|
|
"collapsed_reason_code": FieldAction.OVERWRITE,
|
|
"comment_type": FieldAction.ALLOW_EMPTY,
|
|
"controversiality": FieldAction.OVERWRITE,
|
|
"created": FieldAction.OVERWRITE_IF_NONE,
|
|
"created_utc": FieldAction.ALLOW,
|
|
"distinguished": FieldAction.OVERWRITE,
|
|
"downs": FieldAction.OVERWRITE_IF_NONE,
|
|
"editable": FieldAction.OVERWRITE,
|
|
"edited": FieldAction.OVERWRITE_NOT_NONE,
|
|
"gilded": FieldAction.OVERWRITE_NOT_NONE,
|
|
"gildings": FieldAction.OVERWRITE_NOT_NONE,
|
|
"id": FieldAction.ALLOW,
|
|
"ignore_reports": FieldAction.DELETE,
|
|
"is_submitter": FieldAction.DONT_OVERWRITE,
|
|
"likes": FieldAction.ALLOW_EMPTY,
|
|
"link_id": FieldAction.ALLOW,
|
|
"locked": FieldAction.OVERWRITE,
|
|
"media_metadata": FieldAction.OVERWRITE,
|
|
"mod_note": FieldAction.ALLOW_EMPTY,
|
|
"mod_reason_by": FieldAction.ALLOW_EMPTY,
|
|
"mod_reason_title": FieldAction.ALLOW_EMPTY,
|
|
"mod_reports": FieldAction.SPECIAL_NO_OVERWRITE,
|
|
"mod_reports_dismissed": FieldAction.SPECIAL_NO_OVERWRITE,
|
|
"name": FieldAction.OVERWRITE_IF_NONE,
|
|
"nest_level": FieldAction.OVERWRITE_NOT_NONE,
|
|
"no_follow": FieldAction.OVERWRITE,
|
|
"num_reports": FieldAction.SPECIAL_NO_OVERWRITE,
|
|
"parent_id": FieldAction.OVERWRITE_IF_NONE,
|
|
"permalink": FieldAction.DONT_OVERWRITE,
|
|
"removal_reason": FieldAction.SPECIAL,
|
|
"removed": FieldAction.DELETE,
|
|
"replies": FieldAction.OVERWRITE_IF_NONE,
|
|
"report_reasons": FieldAction.SPECIAL_NO_OVERWRITE,
|
|
"retrieved_on": FieldAction.SPECIAL,
|
|
"retrieved_utc": FieldAction.SPECIAL,
|
|
"saved": FieldAction.SPECIAL_NO_OVERWRITE,
|
|
"score": FieldAction.OVERWRITE_NOT_NONE,
|
|
"score_hidden": FieldAction.OVERWRITE,
|
|
"send_replies": FieldAction.OVERWRITE,
|
|
"spam": FieldAction.DELETE,
|
|
"stickied": FieldAction.OVERWRITE,
|
|
"subreddit": FieldAction.OVERWRITE_NOT_NONE,
|
|
"subreddit_id": FieldAction.ALLOW,
|
|
"subreddit_name_prefixed": FieldAction.OVERWRITE_NOT_NONE,
|
|
"subreddit_type": FieldAction.DONT_OVERWRITE,
|
|
"top_awarded_type": FieldAction.ALLOW_EMPTY,
|
|
"total_awards_received": FieldAction.OVERWRITE_NOT_NONE,
|
|
"treatment_tags": FieldAction.OVERWRITE_NOT_NONE,
|
|
"unrepliable_reason": FieldAction.ALLOW_EMPTY,
|
|
"ups": FieldAction.OVERWRITE_NOT_NONE,
|
|
"user_reports": FieldAction.SPECIAL_NO_OVERWRITE,
|
|
"user_reports_dismissed": FieldAction.SPECIAL_NO_OVERWRITE,
|
|
"updated_on": FieldAction.SPECIAL,
|
|
"updated_utc": FieldAction.SPECIAL,
|
|
"utc_datetime_str": FieldAction.DELETE,
|
|
},
|
|
ObjectType.SUBMISSION: {
|
|
"ad_promoted_user_posts": FieldAction.ALLOW_EMPTY,
|
|
"ad_supplementary_text_md": FieldAction.ALLOW,
|
|
"adserver_click_url": FieldAction.ALLOW_EMPTY,
|
|
"adserver_imp_pixel": FieldAction.ALLOW_EMPTY,
|
|
"all_awardings": FieldAction.OVERWRITE_NOT_NONE,
|
|
"allow_live_comments": FieldAction.OVERWRITE,
|
|
"app_store_data": FieldAction.ALLOW_EMPTY,
|
|
"approved": FieldAction.DELETE,
|
|
"approved_at_utc": FieldAction.SPECIAL_NO_OVERWRITE,
|
|
"approved_by": FieldAction.SPECIAL_NO_OVERWRITE,
|
|
"archived": FieldAction.ALLOW_EMPTY,
|
|
"author": FieldAction.OVERWRITE_IF_NONE,
|
|
"author_cakeday": FieldAction.DONT_OVERWRITE,
|
|
"author_flair_background_color": FieldAction.OVERWRITE_NOT_NONE,
|
|
"author_flair_css_class": FieldAction.OVERWRITE_NOT_NONE,
|
|
"author_flair_richtext": FieldAction.OVERWRITE_NOT_NONE,
|
|
"author_flair_template_id": FieldAction.OVERWRITE_NOT_NONE,
|
|
"author_flair_text": FieldAction.OVERWRITE_NOT_NONE,
|
|
"author_flair_text_color": FieldAction.OVERWRITE_NOT_NONE,
|
|
"author_flair_type": FieldAction.OVERWRITE_NOT_NONE,
|
|
"author_fullname": FieldAction.OVERWRITE_NOT_NONE,
|
|
"author_id": FieldAction.OVERWRITE_NOT_NONE,
|
|
"author_is_blocked": FieldAction.SPECIAL_NO_OVERWRITE,
|
|
"author_patreon_flair": FieldAction.OVERWRITE,
|
|
"author_premium": FieldAction.OVERWRITE,
|
|
"awarders": FieldAction.ALLOW_EMPTY,
|
|
"ban_note": FieldAction.DELETE,
|
|
"banned_at_utc": FieldAction.SPECIAL_NO_OVERWRITE,
|
|
"banned_by": FieldAction.SPECIAL_NO_OVERWRITE,
|
|
"call_to_action": FieldAction.OVERWRITE,
|
|
"campaign_id": FieldAction.ALLOW_EMPTY,
|
|
"can_gild": FieldAction.OVERWRITE,
|
|
"can_mod_post": FieldAction.SPECIAL_NO_OVERWRITE,
|
|
"category": FieldAction.OVERWRITE_NOT_NONE,
|
|
"clicked": FieldAction.SPECIAL_NO_OVERWRITE,
|
|
"collections": FieldAction.OVERWRITE_NOT_NONE,
|
|
"content_categories": FieldAction.ALLOW,
|
|
"contest_mode": FieldAction.OVERWRITE,
|
|
"created": FieldAction.OVERWRITE_IF_NONE,
|
|
"created_utc": FieldAction.ALLOW,
|
|
"crosspost_parent": FieldAction.ALLOW,
|
|
"crosspost_parent_list": FieldAction.OVERWRITE_NOT_NONE,
|
|
"discussion_type": FieldAction.ALLOW,
|
|
"distinguished": FieldAction.OVERWRITE,
|
|
"domain": FieldAction.OVERWRITE_NOT_NONE,
|
|
"domain_override": FieldAction.OVERWRITE_NOT_NONE,
|
|
"downs": FieldAction.SPECIAL_NO_OVERWRITE,
|
|
"edited": FieldAction.OVERWRITE,
|
|
"embed_type": FieldAction.ALLOW_EMPTY,
|
|
"embed_url": FieldAction.ALLOW_EMPTY,
|
|
"event_end": FieldAction.OVERWRITE_NOT_NONE,
|
|
"event_is_live": FieldAction.OVERWRITE_NOT_NONE,
|
|
"event_start": FieldAction.OVERWRITE_NOT_NONE,
|
|
"events": FieldAction.ALLOW_EMPTY,
|
|
"eventsOnRender": FieldAction.ALLOW_EMPTY,
|
|
"gallery_data": FieldAction.OVERWRITE_NOT_NONE,
|
|
"gilded": FieldAction.OVERWRITE_NOT_NONE,
|
|
"gildings": FieldAction.OVERWRITE_NOT_NONE,
|
|
"hidden": FieldAction.ALLOW_EMPTY,
|
|
"hide_score": FieldAction.OVERWRITE,
|
|
"href_url": FieldAction.DONT_OVERWRITE,
|
|
"id": FieldAction.ALLOW,
|
|
"ignore_reports": FieldAction.DELETE,
|
|
"impression_id": FieldAction.ALLOW_EMPTY,
|
|
"impression_id_str": FieldAction.ALLOW_EMPTY,
|
|
"is_blank": FieldAction.ALLOW_EMPTY,
|
|
"is_created_from_ads_ui": FieldAction.ALLOW,
|
|
"is_crosspostable": FieldAction.OVERWRITE,
|
|
"is_gallery": FieldAction.ALLOW,
|
|
"is_meta": FieldAction.OVERWRITE,
|
|
"is_original_content": FieldAction.OVERWRITE,
|
|
"is_reddit_media_domain": FieldAction.OVERWRITE,
|
|
"is_robot_indexable": FieldAction.OVERWRITE,
|
|
"is_self": FieldAction.DONT_OVERWRITE,
|
|
"is_survey_ad": FieldAction.ALLOW_EMPTY,
|
|
"is_video": FieldAction.ALLOW,
|
|
"likes": FieldAction.ALLOW_EMPTY,
|
|
"link_flair_background_color": FieldAction.OVERWRITE_NOT_NONE,
|
|
"link_flair_css_class": FieldAction.OVERWRITE_NOT_NONE,
|
|
"link_flair_richtext": FieldAction.OVERWRITE_NOT_NONE,
|
|
"link_flair_template_id": FieldAction.OVERWRITE_NOT_NONE,
|
|
"link_flair_text": FieldAction.OVERWRITE_NOT_NONE,
|
|
"link_flair_text_color": FieldAction.OVERWRITE_NOT_NONE,
|
|
"link_flair_type": FieldAction.OVERWRITE_NOT_NONE,
|
|
"locked": FieldAction.OVERWRITE,
|
|
"media": FieldAction.OVERWRITE_NOT_NONE,
|
|
"media_embed": FieldAction.OVERWRITE_NOT_NONE,
|
|
"media_metadata": FieldAction.OVERWRITE_NOT_NONE,
|
|
"media_only": FieldAction.OVERWRITE,
|
|
"mobile_ad_url": FieldAction.ALLOW,
|
|
"mod_note": FieldAction.ALLOW_EMPTY,
|
|
"mod_reason_by": FieldAction.ALLOW_EMPTY,
|
|
"mod_reason_title": FieldAction.ALLOW_EMPTY,
|
|
"mod_reports": FieldAction.SPECIAL_NO_OVERWRITE,
|
|
"name": FieldAction.OVERWRITE_IF_NONE,
|
|
"no_follow": FieldAction.OVERWRITE,
|
|
"num_comments": FieldAction.OVERWRITE_NOT_NONE,
|
|
"num_crossposts": FieldAction.OVERWRITE,
|
|
"num_reports": FieldAction.SPECIAL_NO_OVERWRITE,
|
|
"original_link": FieldAction.ALLOW_EMPTY,
|
|
"outbound_link": FieldAction.ALLOW_EMPTY,
|
|
"over_18": FieldAction.OVERWRITE,
|
|
"parent_whitelist_status": FieldAction.OVERWRITE,
|
|
"permalink": FieldAction.DONT_OVERWRITE,
|
|
"pinned": FieldAction.ALLOW_EMPTY,
|
|
"poll_data": FieldAction.OVERWRITE_NOT_NONE,
|
|
"post_hint": FieldAction.OVERWRITE,
|
|
"preview": FieldAction.OVERWRITE_NOT_NONE,
|
|
"priority_id": FieldAction.ALLOW_EMPTY,
|
|
"product_ids": FieldAction.ALLOW_EMPTY,
|
|
"promo_layout": FieldAction.OVERWRITE,
|
|
"promoted": FieldAction.ALLOW_EMPTY,
|
|
"promoted_by": FieldAction.ALLOW_EMPTY,
|
|
"promoted_display_name": FieldAction.ALLOW_EMPTY,
|
|
"promoted_url": FieldAction.ALLOW_EMPTY,
|
|
"pwls": FieldAction.OVERWRITE,
|
|
"quarantine": FieldAction.DONT_OVERWRITE,
|
|
"removal_reason": FieldAction.SPECIAL,
|
|
"removed": FieldAction.DELETE,
|
|
"removed_by": FieldAction.SPECIAL_NO_OVERWRITE,
|
|
"removed_by_category": FieldAction.OVERWRITE,
|
|
"report_reasons": FieldAction.SPECIAL_NO_OVERWRITE,
|
|
"retrieved_on": FieldAction.SPECIAL,
|
|
"retrieved_utc": FieldAction.SPECIAL,
|
|
"saved": FieldAction.SPECIAL_NO_OVERWRITE,
|
|
"score": FieldAction.OVERWRITE_NOT_NONE,
|
|
"secure_media": FieldAction.OVERWRITE_NOT_NONE,
|
|
"secure_media_embed": FieldAction.OVERWRITE_NOT_NONE,
|
|
"selftext": FieldAction.SPECIAL,
|
|
"selftext_html": FieldAction.DELETE,
|
|
"send_replies": FieldAction.OVERWRITE,
|
|
"show_media": FieldAction.ALLOW,
|
|
"sk_ad_network_data": FieldAction.ALLOW_EMPTY,
|
|
"spam": FieldAction.DELETE,
|
|
"spoiler": FieldAction.OVERWRITE,
|
|
"stickied": FieldAction.OVERWRITE,
|
|
"subcaption": FieldAction.OVERWRITE,
|
|
"subreddit": FieldAction.ALLOW,
|
|
"subreddit_id": FieldAction.ALLOW,
|
|
"subreddit_name_prefixed": FieldAction.ALLOW,
|
|
"subreddit_subscribers": FieldAction.OVERWRITE_IF_NONE,
|
|
"subreddit_type": FieldAction.DONT_OVERWRITE,
|
|
"suggested_sort": FieldAction.OVERWRITE,
|
|
"third_party_trackers": FieldAction.ALLOW_EMPTY,
|
|
"third_party_tracking": FieldAction.ALLOW_EMPTY,
|
|
"third_party_tracking_2": FieldAction.ALLOW_EMPTY,
|
|
"thumbnail": FieldAction.OVERWRITE_NOT_NONE,
|
|
"thumbnail_height": FieldAction.OVERWRITE_NOT_NONE,
|
|
"thumbnail_width": FieldAction.OVERWRITE_NOT_NONE,
|
|
"title": FieldAction.DONT_OVERWRITE,
|
|
"top_awarded_type": FieldAction.OVERWRITE,
|
|
"total_awards_received": FieldAction.OVERWRITE_NOT_NONE,
|
|
"treatment_tags": FieldAction.OVERWRITE_NOT_NONE,
|
|
"updated_on": FieldAction.SPECIAL,
|
|
"updated_utc": FieldAction.SPECIAL,
|
|
"ups": FieldAction.OVERWRITE_NOT_NONE,
|
|
"upvote_ratio": FieldAction.OVERWRITE,
|
|
"url": FieldAction.OVERWRITE_NOT_NONE,
|
|
"url_overridden_by_dest": FieldAction.OVERWRITE_NOT_NONE,
|
|
"user_reports": FieldAction.SPECIAL_NO_OVERWRITE,
|
|
"user_reports_dismissed": FieldAction.SPECIAL_NO_OVERWRITE,
|
|
"utc_datetime_str": FieldAction.DELETE,
|
|
"view_count": FieldAction.ALLOW_EMPTY,
|
|
"visited": FieldAction.SPECIAL_NO_OVERWRITE,
|
|
"whitelist_status": FieldAction.OVERWRITE,
|
|
"wls": FieldAction.OVERWRITE,
|
|
},
|
|
}
|
|
|
|
|
|
def is_empty(value):
|
|
return value is None \
|
|
or value == "" \
|
|
or value == "[deleted]" \
|
|
or value == "[removed]" \
|
|
or value == [] \
|
|
or value == {} \
|
|
or value is False \
|
|
or value == 0
|
|
|
|
|
|
def replace(match):
|
|
if match.group(0) == "amp;": return ""
|
|
if match.group(0) == "<": return "<"
|
|
if match.group(0) == ">": return ">"
|
|
log.warning(f"Unknown group: {match}")
|
|
sys.exit(2)
|
|
|
|
|
|
unencode_regex = re.compile(r"amp;|<|>")
|
|
|
|
|
|
def merge_fields(existing_obj, new_obj, obj_type):
|
|
unmatched_field = False
|
|
type_actions = field_actions[obj_type]
|
|
for key, new_value in new_obj.items():
|
|
action = type_actions.get(key)
|
|
|
|
original_value = existing_obj.get(key)
|
|
if new_value != original_value:
|
|
if isinstance(new_value, str) and unencode_regex.search(new_value):
|
|
new_value_no_encode = unencode_regex.sub(replace, new_value)
|
|
if new_value_no_encode == original_value:
|
|
continue
|
|
if action == FieldAction.OVERWRITE:
|
|
existing_obj[key] = new_value
|
|
elif action == FieldAction.OVERWRITE_NOT_NONE:
|
|
if not is_empty(new_value):
|
|
existing_obj[key] = new_value
|
|
elif action == FieldAction.OVERWRITE_IF_NONE:
|
|
if is_empty(original_value):
|
|
existing_obj[key] = new_value
|
|
elif action == FieldAction.SPECIAL:
|
|
if key == "body":
|
|
if not is_empty(new_value):
|
|
if 'previous_body' in existing_obj:
|
|
existing_obj['previous_body'] = original_value
|
|
existing_obj['body'] = new_value
|
|
elif key == "selftext":
|
|
if not is_empty(new_value):
|
|
if 'previous_selftext' not in existing_obj:
|
|
existing_obj['previous_selftext'] = original_value
|
|
existing_obj['selftext'] = new_value
|
|
elif key == "removal_reason" and new_value in ["legal", None]:
|
|
existing_obj[key] = new_value
|
|
elif key in ["retrieved_on", "retrieved_utc"]:
|
|
prev_retrieved_on = existing_obj["retrieved_on"]
|
|
if new_value < prev_retrieved_on:
|
|
existing_obj["retrieved_on"] = new_value
|
|
existing_obj["updated_on"] = prev_retrieved_on
|
|
if new_value > prev_retrieved_on:
|
|
existing_obj["updated_on"] = new_value
|
|
elif key in ["updated_on", "updated_utc"]:
|
|
if new_value > existing_obj["updated_on"]:
|
|
existing_obj["updated_on"] = new_value
|
|
else:
|
|
log.info(f"{new_obj['id']} unmatched special: {key}: {original_value} != {new_value}")
|
|
unmatched_field = True
|
|
elif action == FieldAction.DELETE or action == FieldAction.DONT_OVERWRITE or action == FieldAction.SPECIAL_NO_OVERWRITE:
|
|
pass
|
|
else:
|
|
log.info(f"{new_obj['id']} unmatched no action: {key}|{action}: {original_value} != {new_value}")
|
|
unmatched_field = True
|
|
elif action is None:
|
|
log.info(f"{new_obj['id']} matched no action: {key}: {new_value}")
|
|
unmatched_field = True
|
|
|
|
return unmatched_field
|
|
|
|
|
|
def parse_fields(new_obj, obj_type):
|
|
keys_to_delete = []
|
|
keys_to_add = []
|
|
unmatched_field = False
|
|
type_actions = field_actions[obj_type]
|
|
for key, new_value in new_obj.items():
|
|
action = type_actions.get(key)
|
|
if action is not None:
|
|
if action == FieldAction.DELETE:
|
|
keys_to_delete.append(key)
|
|
elif action == FieldAction.ALLOW_EMPTY:
|
|
if not is_empty(new_value):
|
|
log.info(f"{new_obj['id']} not empty: {key}: {new_value}")
|
|
unmatched_field = True
|
|
keys_to_delete.append(key)
|
|
elif action == FieldAction.SPECIAL:
|
|
if key in ["retrieved_on", "body", "selftext", "updated_on"]:
|
|
pass
|
|
elif key == "removal_reason" and new_value in ["legal", None]:
|
|
pass
|
|
elif key == "retrieved_utc":
|
|
keys_to_add.append(("retrieved_on", new_value))
|
|
keys_to_delete.append(key)
|
|
elif key == "updated_utc":
|
|
keys_to_add.append(("updated_on", new_value))
|
|
keys_to_delete.append(key)
|
|
else:
|
|
log.info(f"{new_obj['id']} special no match: {key}: {new_value}")
|
|
unmatched_field = True
|
|
keys_to_delete.append(key)
|
|
elif action == FieldAction.SPECIAL_NO_OVERWRITE:
|
|
if key in ["can_mod_post", "saved", "clicked", "visited", "author_is_blocked"]:
|
|
new_obj[key] = False
|
|
elif key in ["banned_at_utc", "banned_by", "approved_at_utc", "approved_by", "user_reports_dismissed", "mod_reports_dismissed", "removed_by"]:
|
|
new_obj[key] = None
|
|
elif key in ["num_reports", "downs"]:
|
|
new_obj[key] = 0
|
|
elif key in ["report_reasons", "user_reports", "mod_reports"]:
|
|
new_obj[key] = []
|
|
else:
|
|
log.info(f"{new_obj['id']} special no overwrite no match: {key}: {new_value}")
|
|
unmatched_field = True
|
|
keys_to_delete.append(key)
|
|
else:
|
|
log.info(f"{new_obj['id']} no action: {key}: {new_value}")
|
|
unmatched_field = True
|
|
|
|
for key in keys_to_delete:
|
|
del new_obj[key]
|
|
|
|
for key, value in keys_to_add:
|
|
new_obj[key] = value
|
|
|
|
if 'retrieved_on' not in new_obj:
|
|
new_obj['retrieved_on'] = int(datetime.utcnow().timestamp())
|
|
|
|
return unmatched_field
|
|
|
|
|
|
def merge_lowest_highest_id(str_id, lowest_id, highest_id):
|
|
int_id = base36decode(str_id)
|
|
if lowest_id is None or int_id < lowest_id:
|
|
lowest_id = int_id
|
|
if highest_id is None or int_id > highest_id:
|
|
highest_id = int_id
|
|
return lowest_id, highest_id
|
|
|
|
|
|
async def record_rate_limits(reddit, client):
|
|
reddit_user = await reddit.user.me()
|
|
remaining = int(reddit._core._rate_limiter.remaining)
|
|
used = int(reddit._core._rate_limiter.used)
|
|
counters.rate_requests_remaining.labels(username=reddit_user.name, client=client).set(remaining)
|
|
counters.rate_requests_used.labels(username=reddit_user.name, client=client).set(used)
|
|
|
|
reset_timestamp = reddit._core._rate_limiter.reset_timestamp
|
|
seconds_to_reset = (datetime.utcfromtimestamp(reset_timestamp) - datetime.utcnow()).total_seconds()
|
|
counters.rate_seconds_remaining.labels(username=reddit_user.name, client=client).set(int(seconds_to_reset))
|
|
window_size = int(reddit._core._rate_limiter.window_size) if reddit._core._rate_limiter.window_size is not None else reddit._core._rate_limiter.window_size
|
|
time_to_next_request = max((datetime.utcnow() - datetime.utcfromtimestamp(reddit._core._rate_limiter.next_request_timestamp)).total_seconds(), 0)
|
|
#log.info(f"Rate: u/{reddit_user.name}: {window_size} : {remaining} : {used} : {seconds_to_reset:.2f} : {time_to_next_request:.3f} ")
|
|
|
|
return
|
|
|
|
|
|
def chunk_list(items, chunk_size):
|
|
for i in range(0, len(items), chunk_size):
|
|
yield items[i:i + chunk_size]
|
|
|
|
|
|
def query_pushshift(ids, bearer, object_type):
|
|
object_name = "comment" if object_type == ObjectType.COMMENT else "submission"
|
|
url = f"https://api.pushshift.io/reddit/{object_name}/search?limit=1000&ids={','.join(ids)}"
|
|
log.debug(f"pushshift query: {url}")
|
|
response = None
|
|
for i in range(4):
|
|
response = requests.get(url, headers={
|
|
'User-Agent': "In script by /u/Watchful1",
|
|
'Authorization': f"Bearer {bearer}"})
|
|
if response.status_code == 200:
|
|
break
|
|
if response.status_code == 403:
|
|
log.warning(f"Pushshift unauthorized, aborting")
|
|
sys.exit(2)
|
|
time.sleep(2)
|
|
if response.status_code != 200:
|
|
log.warning(f"4 requests failed with status code {response.status_code}")
|
|
return response.json()['data']
|