# Mirror of https://github.com/Watchful1/PushshiftDumps.git
# (synced 2025-07-04 19:36:44 -04:00; 70 lines, 2.2 KiB, Python)
import zstandard
|
|
import os
|
|
import json
|
|
import sys
|
|
import time
|
|
import argparse
|
|
import re
|
|
from collections import defaultdict
|
|
import logging.handlers
|
|
import multiprocessing
|
|
import utils
|
|
|
|
|
|
# sets up logging to the console as well as a size-rotated file at logs/bot.log
log = logging.getLogger("bot")
log.setLevel(logging.INFO)
log_formatter = logging.Formatter('%(asctime)s - %(levelname)s: %(message)s')

# console handler
log_str_handler = logging.StreamHandler()
log_str_handler.setFormatter(log_formatter)
log.addHandler(log_str_handler)

# exist_ok=True avoids the check-then-create race of testing os.path.exists first
os.makedirs("logs", exist_ok=True)

# file handler: rolls over at 16 MiB and keeps 5 backup files
log_file_handler = logging.handlers.RotatingFileHandler(
    os.path.join("logs", "bot.log"), maxBytes=1024*1024*16, backupCount=5)
log_file_handler.setFormatter(log_formatter)
log.addHandler(log_file_handler)
|
|
|
|
|
|
def read_and_decode(reader, chunk_size, max_window_size, previous_chunk=None, bytes_read=0):
    """Read from ``reader`` until the accumulated bytes decode cleanly.

    A fixed-size read can end in the middle of a multi-byte character. When
    decoding fails, another ``chunk_size`` bytes are read and appended, and
    the decode is retried on the combined buffer.

    Args:
        reader: object with a ``read(size)`` method returning bytes.
        chunk_size: number of bytes requested per read.
        max_window_size: once more than this many bytes have been read
            without a successful decode, give up.
        previous_chunk: undecoded bytes to prepend to the first read, or None.
        bytes_read: running byte count to continue from (bytes already read
            as part of ``previous_chunk``).

    Returns:
        Tuple ``(text, bytes_read)``: the decoded string and the total number
        of bytes read (including the starting ``bytes_read``). ``text`` is
        empty when the reader is exhausted.

    Raises:
        UnicodeError: if the data still cannot be decoded after reading more
            than ``max_window_size`` bytes, or if the reader runs out of data
            while the buffered bytes are still undecodable.
    """
    pending = b"" if previous_chunk is None else previous_chunk
    while True:
        chunk = reader.read(chunk_size)
        bytes_read += len(chunk)
        pending = pending + chunk
        try:
            return pending.decode(), bytes_read
        except UnicodeDecodeError as err:
            if bytes_read > max_window_size:
                raise UnicodeError(f"Unable to decode frame after reading {bytes_read:,} bytes") from err
            if not chunk:
                # The reader is exhausted, so retrying can never succeed;
                # without this check the retry would repeat forever.
                raise UnicodeError(
                    f"Unable to decode frame: stream ended after {bytes_read:,} bytes "
                    f"with undecodable data") from err
|
|
|
|
|
|
def count_lines_bytes(file_name):
    """Count newline-terminated lines and decompressed bytes in a zstd file.

    The file is streamed through a zstandard decompressor and decoded in
    chunks via ``read_and_decode``, so the whole file is never held in memory.

    Args:
        file_name: path to the zstandard-compressed file.

    Returns:
        Tuple ``(count_lines, uncompressed_bytes)``. ``count_lines`` is the
        number of newline characters seen, so trailing text without a final
        newline is not counted as a line. ``uncompressed_bytes`` is the sum
        of the byte counts reported by ``read_and_decode``.
    """
    count_lines = 0
    uncompressed_bytes = 0
    with open(file_name, 'rb') as file_handle:
        reader = zstandard.ZstdDecompressor(max_window_size=2**31).stream_reader(file_handle)
        try:
            while True:
                # read 128 MiB at a time; give up decoding after more than 1 GiB
                chunk, chunk_bytes = read_and_decode(reader, 2**27, (2**29) * 2)
                uncompressed_bytes += chunk_bytes
                if not chunk:
                    break
                # Counting newlines directly matches splitting on "\n" and
                # carrying the partial last line, without building the list.
                count_lines += chunk.count("\n")
        finally:
            # close the decompression reader even if decoding raised
            reader.close()
    return count_lines, uncompressed_bytes
|
|
|
|
|
|
if __name__ == '__main__':
    # The input file can be given on the command line; when omitted, the
    # previously hard-coded path is used so existing invocations still work.
    parser = argparse.ArgumentParser(
        description="Report compressed size, uncompressed size, ratio and line count of a zst file")
    parser.add_argument(
        "input", nargs="?",
        default=r"\\MYCLOUDPR4100\Public\reddit\comments\RC_2008-03.zst",
        help="path to the zstandard-compressed file to inspect")
    args = parser.parse_args()

    input_path = args.input
    compressed_size = os.stat(input_path).st_size
    count_lines, uncompressed_bytes = count_lines_bytes(input_path)
    log.info(f"Compressed size: {compressed_size:,} : {(compressed_size / (2**30)):.2f} gb")
    log.info(f"Uncompressed size: {uncompressed_bytes:,} : {(uncompressed_bytes / (2**30)):.2f} gb")
    if compressed_size:
        log.info(f"Ratio: {(uncompressed_bytes / compressed_size):.2f}")
    else:
        # a zero-byte input would otherwise raise ZeroDivisionError
        log.info("Ratio: n/a (compressed file is empty)")
    log.info(f"Lines: {count_lines:,}")
|