mirror of
https://github.com/Watchful1/PushshiftDumps.git
synced 2025-07-26 08:05:27 -04:00
Save pushshift token, write to discord
This commit is contained in:
parent
e119573d1c
commit
34c0f93fe1
2 changed files with 35 additions and 8 deletions
|
@ -11,6 +11,7 @@ import json
|
||||||
import praw
|
import praw
|
||||||
from praw import endpoints
|
from praw import endpoints
|
||||||
import prawcore
|
import prawcore
|
||||||
|
import logging.handlers
|
||||||
|
|
||||||
sys.path.append('personal')
|
sys.path.append('personal')
|
||||||
|
|
||||||
|
@ -26,12 +27,25 @@ NEWLINE_ENCODED = "\n".encode('utf-8')
|
||||||
reg = re.compile(r"\d\d-\d\d-\d\d_\d\d-\d\d")
|
reg = re.compile(r"\d\d-\d\d-\d\d_\d\d-\d\d")
|
||||||
|
|
||||||
|
|
||||||
|
def save_pushshift_token(token):
|
||||||
|
with open("pushshift.txt", 'w') as file:
|
||||||
|
file.write(token)
|
||||||
|
|
||||||
|
|
||||||
|
def load_pushshift_token():
|
||||||
|
with open("pushshift.txt", 'r') as file:
|
||||||
|
token = file.read().strip()
|
||||||
|
return token
|
||||||
|
|
||||||
|
|
||||||
def re_auth_pushshift(old_token):
|
def re_auth_pushshift(old_token):
|
||||||
response = requests.post(f"https://auth.pushshift.io/refresh?access_token={old_token}")
|
response = requests.post(f"https://auth.pushshift.io/refresh?access_token={old_token}")
|
||||||
result = response.json()
|
result = response.json()
|
||||||
log.warning(f"Reauth responce: {str(result)}")
|
log.warning(f"Reauth responce: {str(result)}")
|
||||||
new_token = result['access_token']
|
new_token = result['access_token']
|
||||||
log.warning(f"New pushshift token: {new_token}")
|
log.warning(f"New pushshift token: {new_token}")
|
||||||
|
save_pushshift_token(new_token)
|
||||||
|
discord_logging.flush_discord()
|
||||||
return new_token
|
return new_token
|
||||||
|
|
||||||
|
|
||||||
|
@ -48,6 +62,8 @@ def query_pushshift(ids, bearer, object_type):
|
||||||
except (requests.exceptions.ConnectionError, requests.exceptions.ReadTimeout):
|
except (requests.exceptions.ConnectionError, requests.exceptions.ReadTimeout):
|
||||||
time.sleep(2)
|
time.sleep(2)
|
||||||
continue
|
continue
|
||||||
|
if response is None:
|
||||||
|
continue
|
||||||
if response.status_code == 200:
|
if response.status_code == 200:
|
||||||
break
|
break
|
||||||
if response.status_code == 403:
|
if response.status_code == 403:
|
||||||
|
@ -76,7 +92,9 @@ def end_of_day(input_minute):
|
||||||
return input_minute.replace(hour=0, minute=0, second=0) + timedelta(days=1)
|
return input_minute.replace(hour=0, minute=0, second=0) + timedelta(days=1)
|
||||||
|
|
||||||
|
|
||||||
def build_day(day_to_process, input_folders, output_folder, object_type, reddit, pushshift_token):
|
def build_day(day_to_process, input_folders, output_folder, object_type, reddit):
|
||||||
|
pushshift_token = load_pushshift_token()
|
||||||
|
|
||||||
file_type = "comments" if object_type == ObjectType.COMMENT else "submissions"
|
file_type = "comments" if object_type == ObjectType.COMMENT else "submissions"
|
||||||
|
|
||||||
file_minutes = {}
|
file_minutes = {}
|
||||||
|
@ -152,6 +170,7 @@ def build_day(day_to_process, input_folders, output_folder, object_type, reddit,
|
||||||
|
|
||||||
objects.rebuild_minute_dict()
|
objects.rebuild_minute_dict()
|
||||||
|
|
||||||
|
discord_logging.flush_discord()
|
||||||
if unmatched_field:
|
if unmatched_field:
|
||||||
log.info(f"Unmatched field, aborting")
|
log.info(f"Unmatched field, aborting")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
@ -168,7 +187,7 @@ if __name__ == "__main__":
|
||||||
parser.add_argument("--end_date", help="The end of the date range to process, format YY-MM-DD. If not provided, the script processes to the end of the day")
|
parser.add_argument("--end_date", help="The end of the date range to process, format YY-MM-DD. If not provided, the script processes to the end of the day")
|
||||||
parser.add_argument('--input', help='Input folder', required=True)
|
parser.add_argument('--input', help='Input folder', required=True)
|
||||||
parser.add_argument('--output', help='Output folder', required=True)
|
parser.add_argument('--output', help='Output folder', required=True)
|
||||||
parser.add_argument('--pushshift', help='The pushshift token', required=True)
|
parser.add_argument('--pushshift', help='The pushshift token')
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
input_folders = [
|
input_folders = [
|
||||||
|
@ -201,8 +220,15 @@ if __name__ == "__main__":
|
||||||
user_name = "Watchful12"
|
user_name = "Watchful12"
|
||||||
reddit = praw.Reddit(user_name)
|
reddit = praw.Reddit(user_name)
|
||||||
|
|
||||||
while start_date <= end_date:
|
discord_logging.init_discord_logging(
|
||||||
build_day(start_date, input_folders, args.output, object_type, reddit, args.pushshift)
|
section_name=None,
|
||||||
start_date = end_of_day(start_date)
|
log_level=logging.WARNING,
|
||||||
|
logging_webhook=reddit.config.custom["logging_webhook"]
|
||||||
|
)
|
||||||
|
|
||||||
#log.info(f"{len(file_minutes)} : {count_ingest_minutes} : {count_rescan_minutes} : {day_highest_id - day_lowest_id:,} - {count_objects:,} = {(day_highest_id - day_lowest_id) - count_objects:,}: {utils.base36encode(day_lowest_id)}-{utils.base36encode(day_highest_id)}")
|
if args.pushshift is not None:
|
||||||
|
save_pushshift_token(args.pushshift)
|
||||||
|
|
||||||
|
while start_date <= end_date:
|
||||||
|
build_day(start_date, input_folders, args.output, object_type, reddit)
|
||||||
|
start_date = end_of_day(start_date)
|
||||||
|
|
|
@ -12,10 +12,11 @@ NEWLINE_ENCODED = "\n".encode('utf-8')
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
input_file = r"\\MYCLOUDPR4100\Public\RS_2023-04.zst"
|
input_file = r"\\MYCLOUDPR4100\Public\RS_2023-05.zst"
|
||||||
output_folder = r"\\MYCLOUDPR4100\Public\ingest\download2"
|
output_folder = r"\\MYCLOUDPR4100\Public\ingest\download"
|
||||||
file_type = "comments" if "RC" in input_file else "submissions"
|
file_type = "comments" if "RC" in input_file else "submissions"
|
||||||
|
|
||||||
|
log.info(f"Input: {input_file} - Output: {output_folder}")
|
||||||
previous_minute, output_handle, created_utc = None, None, None
|
previous_minute, output_handle, created_utc = None, None, None
|
||||||
count_objects, count_minute = 0, 0
|
count_objects, count_minute = 0, 0
|
||||||
for obj in utils.read_obj_zst(input_file):
|
for obj in utils.read_obj_zst(input_file):
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue