diff --git a/personal/combine/build_day.py b/personal/combine/build_day.py index c17d4f4..39b5605 100644 --- a/personal/combine/build_day.py +++ b/personal/combine/build_day.py @@ -11,6 +11,8 @@ import json import praw from praw import endpoints +sys.path.append('personal') + log = discord_logging.init_logging(debug=False) import utils @@ -31,7 +33,7 @@ def query_pushshift(ids, bearer, object_type): for i in range(4): response = requests.get(url, headers={ 'User-Agent': "In script by /u/Watchful1", - 'Authorization': f"Bearer {bearer}"}) + 'Authorization': f"Bearer {bearer}"}, timeout=15) if response.status_code == 200: break if response.status_code == 403: @@ -166,16 +168,11 @@ if __name__ == "__main__": log.error(f"Invalid type: {args.type}") sys.exit(2) - config = discord_logging.get_config() user_name = "Watchful12" - reddit = praw.Reddit( - username=user_name, - password=discord_logging.get_config_var(config, user_name, "password"), - client_id=discord_logging.get_config_var(config, user_name, f"client_id_1"), - client_secret=discord_logging.get_config_var(config, user_name, f"client_secret_1"), - user_agent=f"Remindme ingest script") + reddit = praw.Reddit(user_name) - pushshift_token = "eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJ1c2VyX2lkIjoiV2F0Y2hmdWwxIiwiZXhwaXJlcyI6MTY5MzA5OTE4OC4wMjU3MDU4fQ.HJJd73nwHArOz2lErpubUuTVd_gdJ44SfpKDjb91tIY" + config = discord_logging.get_config() + pushshift_token = discord_logging.get_config_var(config, user_name, "pushshift_token") while start_date <= end_date: build_day(start_date, input_folders, args.output, object_type, reddit, pushshift_token) diff --git a/personal/combine/classes.py b/personal/combine/classes.py index 1451a34..90a2dc6 100644 --- a/personal/combine/classes.py +++ b/personal/combine/classes.py @@ -11,6 +11,7 @@ from collections import defaultdict log = discord_logging.get_logger() import utils +import merge NEWLINE_ENCODED = "\n".encode('utf-8') @@ -258,12 +259,12 @@ class ObjectDict: created_minute = created_utc.replace(second=0, microsecond=0) if obj['id'] in self.by_id: existing_obj = self.by_id[obj['id']] - unmatched_field = utils.merge_fields(existing_obj, obj, self.obj_type) + unmatched_field = merge.merge_fields(existing_obj, obj, self.obj_type) self.counts[created_minute][ingest_type][False] += 1 return unmatched_field if created_utc < self.min_datetime or created_utc > self.max_datetime: return False - unmatched_field = utils.parse_fields(obj, self.obj_type) + unmatched_field = merge.parse_fields(obj, self.obj_type) self.by_id[obj['id']] = obj self.by_minute[created_minute].add(obj) self.counts[created_minute][ingest_type][True] += 1 diff --git a/personal/combine/merge.py b/personal/combine/merge.py index 11f72b9..4f86824 100644 --- a/personal/combine/merge.py +++ b/personal/combine/merge.py @@ -189,7 +189,7 @@ field_actions = { "is_robot_indexable": FieldAction.OVERWRITE, "is_self": FieldAction.DONT_OVERWRITE, "is_survey_ad": FieldAction.ALLOW_EMPTY, - "is_video": FieldAction.ALLOW, + "is_video": FieldAction.OVERWRITE, "likes": FieldAction.ALLOW_EMPTY, "link_flair_background_color": FieldAction.OVERWRITE_NOT_NONE, "link_flair_css_class": FieldAction.OVERWRITE_NOT_NONE,