This commit is contained in:
Watchful1 2023-08-26 20:18:38 -07:00
parent 4f56c141fd
commit cf4962fd4c
3 changed files with 10 additions and 12 deletions

View file

@@ -11,6 +11,8 @@ import json
import praw import praw
from praw import endpoints from praw import endpoints
sys.path.append('personal')
log = discord_logging.init_logging(debug=False) log = discord_logging.init_logging(debug=False)
import utils import utils
@@ -31,7 +33,7 @@ def query_pushshift(ids, bearer, object_type):
for i in range(4): for i in range(4):
response = requests.get(url, headers={ response = requests.get(url, headers={
'User-Agent': "In script by /u/Watchful1", 'User-Agent': "In script by /u/Watchful1",
'Authorization': f"Bearer {bearer}"}) 'Authorization': f"Bearer {bearer}"}, timeout=15)
if response.status_code == 200: if response.status_code == 200:
break break
if response.status_code == 403: if response.status_code == 403:
@@ -166,16 +168,11 @@ if __name__ == "__main__":
log.error(f"Invalid type: {args.type}") log.error(f"Invalid type: {args.type}")
sys.exit(2) sys.exit(2)
config = discord_logging.get_config()
user_name = "Watchful12" user_name = "Watchful12"
reddit = praw.Reddit( reddit = praw.Reddit(user_name)
username=user_name,
password=discord_logging.get_config_var(config, user_name, "password"),
client_id=discord_logging.get_config_var(config, user_name, f"client_id_1"),
client_secret=discord_logging.get_config_var(config, user_name, f"client_secret_1"),
user_agent=f"Remindme ingest script")
pushshift_token = "eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJ1c2VyX2lkIjoiV2F0Y2hmdWwxIiwiZXhwaXJlcyI6MTY5MzA5OTE4OC4wMjU3MDU4fQ.HJJd73nwHArOz2lErpubUuTVd_gdJ44SfpKDjb91tIY" config = discord_logging.get_config()
pushshift_token = discord_logging.get_config_var(config, user_name, "pushshift_token")
while start_date <= end_date: while start_date <= end_date:
build_day(start_date, input_folders, args.output, object_type, reddit, pushshift_token) build_day(start_date, input_folders, args.output, object_type, reddit, pushshift_token)

View file

@@ -11,6 +11,7 @@ from collections import defaultdict
log = discord_logging.get_logger() log = discord_logging.get_logger()
import utils import utils
import merge
NEWLINE_ENCODED = "\n".encode('utf-8') NEWLINE_ENCODED = "\n".encode('utf-8')
@@ -258,12 +259,12 @@ class ObjectDict:
created_minute = created_utc.replace(second=0, microsecond=0) created_minute = created_utc.replace(second=0, microsecond=0)
if obj['id'] in self.by_id: if obj['id'] in self.by_id:
existing_obj = self.by_id[obj['id']] existing_obj = self.by_id[obj['id']]
unmatched_field = utils.merge_fields(existing_obj, obj, self.obj_type) unmatched_field = merge.merge_fields(existing_obj, obj, self.obj_type)
self.counts[created_minute][ingest_type][False] += 1 self.counts[created_minute][ingest_type][False] += 1
return unmatched_field return unmatched_field
if created_utc < self.min_datetime or created_utc > self.max_datetime: if created_utc < self.min_datetime or created_utc > self.max_datetime:
return False return False
unmatched_field = utils.parse_fields(obj, self.obj_type) unmatched_field = merge.parse_fields(obj, self.obj_type)
self.by_id[obj['id']] = obj self.by_id[obj['id']] = obj
self.by_minute[created_minute].add(obj) self.by_minute[created_minute].add(obj)
self.counts[created_minute][ingest_type][True] += 1 self.counts[created_minute][ingest_type][True] += 1

View file

@@ -189,7 +189,7 @@ field_actions = {
"is_robot_indexable": FieldAction.OVERWRITE, "is_robot_indexable": FieldAction.OVERWRITE,
"is_self": FieldAction.DONT_OVERWRITE, "is_self": FieldAction.DONT_OVERWRITE,
"is_survey_ad": FieldAction.ALLOW_EMPTY, "is_survey_ad": FieldAction.ALLOW_EMPTY,
"is_video": FieldAction.ALLOW, "is_video": FieldAction.OVERWRITE,
"likes": FieldAction.ALLOW_EMPTY, "likes": FieldAction.ALLOW_EMPTY,
"link_flair_background_color": FieldAction.OVERWRITE_NOT_NONE, "link_flair_background_color": FieldAction.OVERWRITE_NOT_NONE,
"link_flair_css_class": FieldAction.OVERWRITE_NOT_NONE, "link_flair_css_class": FieldAction.OVERWRITE_NOT_NONE,