mirror of
https://github.com/Watchful1/PushshiftDumps.git
synced 2025-07-25 15:45:19 -04:00
Change this to use command line arguments
This commit is contained in:
parent
f35762e203
commit
bb7a696959
1 changed file with 13 additions and 6 deletions
|
@ -3,6 +3,7 @@ import os
|
||||||
import zstandard
|
import zstandard
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
import json
|
import json
|
||||||
|
import argparse
|
||||||
|
|
||||||
log = discord_logging.init_logging()
|
log = discord_logging.init_logging()
|
||||||
|
|
||||||
|
@ -12,14 +13,20 @@ NEWLINE_ENCODED = "\n".encode('utf-8')
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
input_file = r"\\MYCLOUDPR4100\Public\reddit\blocks\RS_2023-10.zst_blocks"
|
parser = argparse.ArgumentParser(description="Take a zst_blocks file and split it by minute chunks")
|
||||||
output_folder = r"\\MYCLOUDPR4100\Public\ingest\download"
|
parser.add_argument('--input', help='Input file', required=True)
|
||||||
file_type = "comments" if "RC" in input_file else "submissions"
|
parser.add_argument('--output', help='Output folder', required=True)
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
log.info(f"Input: {input_file} - Output: {output_folder}")
|
# input_file = r"\\MYCLOUDPR4100\Public\reddit\blocks\RS_2023-10.zst_blocks"
|
||||||
|
# output_folder = r"\\MYCLOUDPR4100\Public\ingest\download"
|
||||||
|
file_type = "comments" if "RC" in args.input else "submissions"
|
||||||
|
|
||||||
|
log.info(f"Input file: {args.input}")
|
||||||
|
log.info(f"Output folder: {args.output}")
|
||||||
previous_minute, output_handle, created_utc = None, None, None
|
previous_minute, output_handle, created_utc = None, None, None
|
||||||
count_objects, count_minute = 0, 0
|
count_objects, count_minute = 0, 0
|
||||||
for obj in utils.read_obj_zst_blocks(input_file):
|
for obj in utils.read_obj_zst_blocks(args.input):
|
||||||
created_utc = datetime.utcfromtimestamp(obj["created_utc"])
|
created_utc = datetime.utcfromtimestamp(obj["created_utc"])
|
||||||
current_minute = created_utc.replace(second=0)
|
current_minute = created_utc.replace(second=0)
|
||||||
|
|
||||||
|
@ -30,7 +37,7 @@ if __name__ == "__main__":
|
||||||
if output_handle is not None:
|
if output_handle is not None:
|
||||||
output_handle.close()
|
output_handle.close()
|
||||||
|
|
||||||
output_path = os.path.join(output_folder, file_type, created_utc.strftime('%y-%m-%d'))
|
output_path = os.path.join(args.output, file_type, created_utc.strftime('%y-%m-%d'))
|
||||||
if not os.path.exists(output_path):
|
if not os.path.exists(output_path):
|
||||||
os.makedirs(output_path)
|
os.makedirs(output_path)
|
||||||
output_path = os.path.join(output_path, f"{('RC' if file_type == 'comments' else 'RS')}_{created_utc.strftime('%y-%m-%d_%H-%M')}.zst")
|
output_path = os.path.join(output_path, f"{('RC' if file_type == 'comments' else 'RS')}_{created_utc.strftime('%y-%m-%d_%H-%M')}.zst")
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue