From e103298be31442ce22c642f0411d571961f48a7f Mon Sep 17 00:00:00 2001 From: Watchful1 Date: Thu, 16 Mar 2023 18:26:40 -0700 Subject: [PATCH] Update iterate folder with the new decode method --- scripts/iterate_folder.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/scripts/iterate_folder.py b/scripts/iterate_folder.py index 99981c2..27a7fbd 100644 --- a/scripts/iterate_folder.py +++ b/scripts/iterate_folder.py @@ -15,20 +15,36 @@ log.setLevel(logging.DEBUG) log.addHandler(logging.StreamHandler()) +def read_and_decode(reader, chunk_size, max_window_size, previous_chunk=None, bytes_read=0): + chunk = reader.read(chunk_size) + bytes_read += chunk_size + if previous_chunk is not None: + chunk = previous_chunk + chunk + try: + return chunk.decode() + except UnicodeDecodeError: + if bytes_read > max_window_size: + raise UnicodeError(f"Unable to decode frame after reading {bytes_read:,} bytes") + log.info(f"Decoding error with {bytes_read:,} bytes, reading another chunk") + return read_and_decode(reader, chunk_size, max_window_size, chunk, bytes_read) + + def read_lines_zst(file_name): with open(file_name, 'rb') as file_handle: buffer = '' - reader = zstandard.ZstdDecompressor(max_window_size=2**28).stream_reader(file_handle) + reader = zstandard.ZstdDecompressor(max_window_size=2**31).stream_reader(file_handle) while True: - chunk = reader.read(2**24).decode() # read a 16mb chunk at a time. There are some really big comments + chunk = read_and_decode(reader, 2**27, (2**29) * 2) + if not chunk: break lines = (buffer + chunk).split("\n") for line in lines[:-1]: - yield line, file_handle.tell() + yield line.strip(), file_handle.tell() buffer = lines[-1] + reader.close()