mirror of
https://github.com/Watchful1/PushshiftDumps.git
synced 2025-07-25 15:45:19 -04:00
Clean up
This commit is contained in:
parent
4f56c141fd
commit
cf4962fd4c
3 changed files with 10 additions and 12 deletions
|
@ -11,6 +11,8 @@ import json
|
||||||
import praw
|
import praw
|
||||||
from praw import endpoints
|
from praw import endpoints
|
||||||
|
|
||||||
|
sys.path.append('personal')
|
||||||
|
|
||||||
log = discord_logging.init_logging(debug=False)
|
log = discord_logging.init_logging(debug=False)
|
||||||
|
|
||||||
import utils
|
import utils
|
||||||
|
@ -31,7 +33,7 @@ def query_pushshift(ids, bearer, object_type):
|
||||||
for i in range(4):
|
for i in range(4):
|
||||||
response = requests.get(url, headers={
|
response = requests.get(url, headers={
|
||||||
'User-Agent': "In script by /u/Watchful1",
|
'User-Agent': "In script by /u/Watchful1",
|
||||||
'Authorization': f"Bearer {bearer}"})
|
'Authorization': f"Bearer {bearer}"}, timeout=15)
|
||||||
if response.status_code == 200:
|
if response.status_code == 200:
|
||||||
break
|
break
|
||||||
if response.status_code == 403:
|
if response.status_code == 403:
|
||||||
|
@ -166,16 +168,11 @@ if __name__ == "__main__":
|
||||||
log.error(f"Invalid type: {args.type}")
|
log.error(f"Invalid type: {args.type}")
|
||||||
sys.exit(2)
|
sys.exit(2)
|
||||||
|
|
||||||
config = discord_logging.get_config()
|
|
||||||
user_name = "Watchful12"
|
user_name = "Watchful12"
|
||||||
reddit = praw.Reddit(
|
reddit = praw.Reddit(user_name)
|
||||||
username=user_name,
|
|
||||||
password=discord_logging.get_config_var(config, user_name, "password"),
|
|
||||||
client_id=discord_logging.get_config_var(config, user_name, f"client_id_1"),
|
|
||||||
client_secret=discord_logging.get_config_var(config, user_name, f"client_secret_1"),
|
|
||||||
user_agent=f"Remindme ingest script")
|
|
||||||
|
|
||||||
pushshift_token = "eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJ1c2VyX2lkIjoiV2F0Y2hmdWwxIiwiZXhwaXJlcyI6MTY5MzA5OTE4OC4wMjU3MDU4fQ.HJJd73nwHArOz2lErpubUuTVd_gdJ44SfpKDjb91tIY"
|
config = discord_logging.get_config()
|
||||||
|
pushshift_token = discord_logging.get_config_var(config, user_name, "pushshift_token")
|
||||||
|
|
||||||
while start_date <= end_date:
|
while start_date <= end_date:
|
||||||
build_day(start_date, input_folders, args.output, object_type, reddit, pushshift_token)
|
build_day(start_date, input_folders, args.output, object_type, reddit, pushshift_token)
|
||||||
|
|
|
@ -11,6 +11,7 @@ from collections import defaultdict
|
||||||
log = discord_logging.get_logger()
|
log = discord_logging.get_logger()
|
||||||
|
|
||||||
import utils
|
import utils
|
||||||
|
import merge
|
||||||
|
|
||||||
NEWLINE_ENCODED = "\n".encode('utf-8')
|
NEWLINE_ENCODED = "\n".encode('utf-8')
|
||||||
|
|
||||||
|
@ -258,12 +259,12 @@ class ObjectDict:
|
||||||
created_minute = created_utc.replace(second=0, microsecond=0)
|
created_minute = created_utc.replace(second=0, microsecond=0)
|
||||||
if obj['id'] in self.by_id:
|
if obj['id'] in self.by_id:
|
||||||
existing_obj = self.by_id[obj['id']]
|
existing_obj = self.by_id[obj['id']]
|
||||||
unmatched_field = utils.merge_fields(existing_obj, obj, self.obj_type)
|
unmatched_field = merge.merge_fields(existing_obj, obj, self.obj_type)
|
||||||
self.counts[created_minute][ingest_type][False] += 1
|
self.counts[created_minute][ingest_type][False] += 1
|
||||||
return unmatched_field
|
return unmatched_field
|
||||||
if created_utc < self.min_datetime or created_utc > self.max_datetime:
|
if created_utc < self.min_datetime or created_utc > self.max_datetime:
|
||||||
return False
|
return False
|
||||||
unmatched_field = utils.parse_fields(obj, self.obj_type)
|
unmatched_field = merge.parse_fields(obj, self.obj_type)
|
||||||
self.by_id[obj['id']] = obj
|
self.by_id[obj['id']] = obj
|
||||||
self.by_minute[created_minute].add(obj)
|
self.by_minute[created_minute].add(obj)
|
||||||
self.counts[created_minute][ingest_type][True] += 1
|
self.counts[created_minute][ingest_type][True] += 1
|
||||||
|
|
|
@ -189,7 +189,7 @@ field_actions = {
|
||||||
"is_robot_indexable": FieldAction.OVERWRITE,
|
"is_robot_indexable": FieldAction.OVERWRITE,
|
||||||
"is_self": FieldAction.DONT_OVERWRITE,
|
"is_self": FieldAction.DONT_OVERWRITE,
|
||||||
"is_survey_ad": FieldAction.ALLOW_EMPTY,
|
"is_survey_ad": FieldAction.ALLOW_EMPTY,
|
||||||
"is_video": FieldAction.ALLOW,
|
"is_video": FieldAction.OVERWRITE,
|
||||||
"likes": FieldAction.ALLOW_EMPTY,
|
"likes": FieldAction.ALLOW_EMPTY,
|
||||||
"link_flair_background_color": FieldAction.OVERWRITE_NOT_NONE,
|
"link_flair_background_color": FieldAction.OVERWRITE_NOT_NONE,
|
||||||
"link_flair_css_class": FieldAction.OVERWRITE_NOT_NONE,
|
"link_flair_css_class": FieldAction.OVERWRITE_NOT_NONE,
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue