PushshiftDumps/personal/diagnostic/get_zst_details.py

70 lines
2.2 KiB
Python

import zstandard
import os
import json
import sys
import time
import argparse
import re
from collections import defaultdict
import logging.handlers
import multiprocessing
import utils
# sets up logging to the console as well as a file
log = logging.getLogger("bot")
log.setLevel(logging.INFO)
log_formatter = logging.Formatter('%(asctime)s - %(levelname)s: %(message)s')
log_str_handler = logging.StreamHandler()
log_str_handler.setFormatter(log_formatter)
log.addHandler(log_str_handler)
if not os.path.exists("logs"):
os.makedirs("logs")
log_file_handler = logging.handlers.RotatingFileHandler(
os.path.join("logs", "bot.log"), maxBytes=1024*1024*16, backupCount=5)
log_file_handler.setFormatter(log_formatter)
log.addHandler(log_file_handler)
def read_and_decode(reader, chunk_size, max_window_size, previous_chunk=None, bytes_read=0):
chunk = reader.read(chunk_size)
bytes_read += len(chunk)
if previous_chunk is not None:
chunk = previous_chunk + chunk
try:
return chunk.decode(), bytes_read
except UnicodeDecodeError:
if bytes_read > max_window_size:
raise UnicodeError(f"Unable to decode frame after reading {bytes_read:,} bytes")
return read_and_decode(reader, chunk_size, max_window_size, chunk, bytes_read)
def count_lines_bytes(file_name):
count_lines = 0
uncompressed_bytes = 0
with open(file_name, 'rb') as file_handle:
buffer = ''
reader = zstandard.ZstdDecompressor(max_window_size=2**31).stream_reader(file_handle)
while True:
chunk, chunk_bytes = read_and_decode(reader, 2**27, (2**29) * 2)
uncompressed_bytes += chunk_bytes
if not chunk:
break
lines = (buffer + chunk).split("\n")
count_lines += len(lines) - 1
buffer = lines[-1]
reader.close()
return count_lines, uncompressed_bytes
if __name__ == '__main__':
input_path = r"\\MYCLOUDPR4100\Public\reddit\comments\RC_2008-03.zst"
compressed_size = os.stat(input_path).st_size
count_lines, uncompressed_bytes = count_lines_bytes(input_path)
log.info(f"Compressed size: {compressed_size:,} : {(compressed_size / (2**30)):.2f} gb")
log.info(f"Uncompressed size: {uncompressed_bytes:,} : {(uncompressed_bytes / (2**30)):.2f} gb")
log.info(f"Ratio: {(uncompressed_bytes / compressed_size):.2f}")
log.info(f"Lines: {count_lines:,}")