mirror of
https://github.com/Watchful1/PushshiftDumps.git
synced 2025-07-29 09:28:41 -04:00
Restructure this to catch errors
This commit is contained in:
parent
ac936cd564
commit
01170ebbdc
1 changed files with 18 additions and 4 deletions
|
@ -10,6 +10,7 @@ from datetime import datetime, timedelta
|
||||||
import json
|
import json
|
||||||
import praw
|
import praw
|
||||||
from praw import endpoints
|
from praw import endpoints
|
||||||
|
import prawcore
|
||||||
|
|
||||||
sys.path.append('personal')
|
sys.path.append('personal')
|
||||||
|
|
||||||
|
@ -49,6 +50,21 @@ def query_pushshift(ids, bearer, object_type):
|
||||||
return response.json()['data']
|
return response.json()['data']
|
||||||
|
|
||||||
|
|
||||||
|
def query_reddit(ids, reddit, object_type):
    """Fetch the current Reddit API objects for a batch of ids.

    Builds a comma separated list of fullnames (``t1_`` prefix for comments,
    ``t3_`` for everything else) and requests them from the ``info``
    endpoint, retrying when the server reports an error.

    Args:
        ids: Iterable of bare base36 id strings, without the type prefix.
        reddit: Client object exposing ``request(method=, path=, params=)``.
        object_type: ``ObjectType`` member selecting the fullname prefix.

    Returns:
        The ``['data']['children']`` list from the response, or an empty
        list when there is nothing to look up or every attempt failed.
    """
    # Materialize once so the batch can be both tested for emptiness and joined.
    ids = list(ids)
    if not ids:
        # A bare prefix is not a valid fullname; don't spend a request on it.
        return []

    id_prefix = 't1_' if object_type == ObjectType.COMMENT else 't3_'
    id_string = ','.join(f"{id_prefix}{object_id}" for object_id in ids)

    max_attempts = 4
    response = None
    last_error = None
    for attempt in range(1, max_attempts + 1):
        try:
            response = reddit.request(method="GET", path=endpoints.API_PATH["info"], params={"id": id_string})
            break
        except prawcore.exceptions.ServerError as err:
            last_error = err
            # No point waiting after the final attempt, we are giving up anyway.
            if attempt < max_attempts:
                time.sleep(2)

    # The result is indexed as a mapping below, so it has no status code to
    # inspect; a missing response is the only failure signal left after the loop.
    if response is None:
        log.warning(f"{max_attempts} requests failed, skipping {len(ids)} ids: {last_error}")
        return []

    return response['data']['children']
|
||||||
|
|
||||||
|
|
||||||
def end_of_day(input_minute):
    """Return midnight at the start of the day after ``input_minute``.

    Args:
        input_minute: A ``datetime`` anywhere inside the day of interest.

    Returns:
        A ``datetime`` one day later with the time of day reset to 00:00:00.
    """
    # Clear microseconds too, otherwise an input with sub-second precision
    # yields a boundary that is slightly past midnight.
    return input_minute.replace(hour=0, minute=0, second=0, microsecond=0) + timedelta(days=1)
|
||||||
|
|
||||||
|
@ -103,11 +119,9 @@ def build_day(day_to_process, input_folders, output_folder, object_type, reddit,
|
||||||
if objects.add_object(pushshift_object, IngestType.PUSHSHIFT):
|
if objects.add_object(pushshift_object, IngestType.PUSHSHIFT):
|
||||||
unmatched_field = True
|
unmatched_field = True
|
||||||
|
|
||||||
id_prefix = 't1_' if file_type == 'comments' else 't3_'
|
|
||||||
for chunk in utils.chunk_list(missing_ids, 100):
|
for chunk in utils.chunk_list(missing_ids, 100):
|
||||||
id_string = f"{id_prefix}{(f',{id_prefix}'.join(chunk))}"
|
reddit_objects = query_reddit(chunk, reddit, object_type)
|
||||||
reddit_objects = reddit.request(method="GET", path=endpoints.API_PATH["info"], params={"id": id_string})
|
for reddit_object in reddit_objects:
|
||||||
for reddit_object in reddit_objects['data']['children']:
|
|
||||||
if objects.add_object(reddit_object['data'], IngestType.BACKFILL):
|
if objects.add_object(reddit_object['data'], IngestType.BACKFILL):
|
||||||
unmatched_field = True
|
unmatched_field = True
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue