From 31ad7179dca0e1dcb3a5240f775f3b00bb3f95d2 Mon Sep 17 00:00:00 2001 From: Watchful1 Date: Wed, 8 Mar 2023 17:48:48 -0800 Subject: [PATCH] Update the parse here, switch to counting instead of adding scores --- scripts/count_words_single_file.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/scripts/count_words_single_file.py b/scripts/count_words_single_file.py index f7478a3..5efd832 100644 --- a/scripts/count_words_single_file.py +++ b/scripts/count_words_single_file.py @@ -13,13 +13,27 @@ log.setLevel(logging.DEBUG) log.addHandler(logging.StreamHandler()) -# this function handles decompressing the zst files +def read_and_decode(reader, chunk_size, max_window_size, previous_chunk=None, bytes_read=0): + chunk = reader.read(chunk_size) + bytes_read += chunk_size + if previous_chunk is not None: + chunk = previous_chunk + chunk + try: + return chunk.decode() + except UnicodeDecodeError: + if bytes_read > max_window_size: + raise UnicodeError(f"Unable to decode frame after reading {bytes_read:,} bytes") + log.info(f"Decoding error with {bytes_read:,} bytes, reading another chunk") + return read_and_decode(reader, chunk_size, max_window_size, chunk, bytes_read) + + def read_lines_zst(file_name): with open(file_name, 'rb') as file_handle: buffer = '' reader = zstandard.ZstdDecompressor(max_window_size=2**31).stream_reader(file_handle) while True: - chunk = reader.read(2**27).decode() + chunk = read_and_decode(reader, 2**27, (2**29) * 2) + if not chunk: break lines = (buffer + chunk).split("\n") @@ -28,6 +42,7 @@ def read_lines_zst(file_name): yield line, file_handle.tell() buffer = lines[-1] + reader.close() @@ -86,8 +101,7 @@ if __name__ == "__main__": for phrase in phrases: # check if it's the text if phrase in body_lower: - # and then add the object's score to the dict - word_counts[phrase] += obj['score'] + word_counts[phrase] += 1 # just in case there's corruption somewhere in the file except (KeyError, json.JSONDecodeError) as err: