Fixed the tokenization process of a raw dataset and improved its efficiency (#3035)

Fernando Tarin Morales 2023-07-13 00:05:37 +09:00 committed by GitHub
parent 3f19e94c93
commit 987d0fe023

@@ -388,12 +388,7 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch
                 yield f"Error: overlap_len ({overlap_len}) cannot be greater than or equal to cutoff_len ({cutoff_len})"
                 return
-            tokens = list(split_chunks(tokens, step))
-            for i in range(1, len(tokens)):
-                tokens[i] = tokens[i - 1][-overlap_len:] + tokens[i]
-
-            out_tokens.extend(tokens)
-            del tokens
+            out_tokens.extend(split_chunks(tokens, cutoff_len, step))

         del raw_text  # Note: could be a gig for a large dataset, so delete redundant data as we go to be safe on RAM
         text_chunks = [shared.tokenizer.decode(x) for x in out_tokens]
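
For context (a reviewer's sketch, not code from the PR): the removed block first built non-overlapping `step`-sized chunks and then copied the last `overlap_len` tokens of each chunk onto the next one, rebuilding every chunk after the first by list concatenation. Reconstructed standalone, assuming `step = cutoff_len - overlap_len` as the error check above implies:

```python
# Hypothetical reconstruction of the removed logic, for illustration only;
# `tokens`, `cutoff_len`, and `overlap_len` mirror the names in the diff.
def chunk_with_stitching(tokens, cutoff_len, overlap_len):
    step = cutoff_len - overlap_len
    chunks = [tokens[i:i + step] for i in range(0, len(tokens), step)]
    for i in range(1, len(chunks)):
        # Prepend the tail of the previous (already-extended) chunk.
        chunks[i] = chunks[i - 1][-overlap_len:] + chunks[i]
    return chunks

print(chunk_with_stitching(list(range(8)), cutoff_len=4, overlap_len=2))
# [[0, 1], [0, 1, 2, 3], [2, 3, 4, 5], [4, 5, 6, 7]]
```

The replacement line achieves the same overlapping coverage in a single pass, with no intermediate list and no per-chunk copies, by letting `split_chunks` emit `cutoff_len`-sized windows that advance by `step` tokens (see the sketch after the second hunk).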
@@ -663,9 +658,9 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch
     yield f"Done! LoRA saved to `{lora_file_path}`"

-def split_chunks(arr, step):
+def split_chunks(arr, size, step):
     for i in range(0, len(arr), step):
-        yield arr[i:i + step]
+        yield arr[i:i + size]

 def cut_chunk_for_newline(chunk: str, max_length: int):
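
To illustrate the updated generator (a minimal sketch; the sample values are made up): the new `size` parameter decouples window length from stride, so passing `step < size` yields overlapping chunks directly:

```python
def split_chunks(arr, size, step):
    # Windows of `size` items, advancing by `step`; step < size => overlap.
    for i in range(0, len(arr), step):
        yield arr[i:i + size]

print(list(split_chunks(list(range(8)), size=4, step=2)))
# [[0, 1, 2, 3], [2, 3, 4, 5], [4, 5, 6, 7], [6, 7]]
```

With the old one-argument signature, chunk length and stride were forced to be equal, which is why the caller previously had to stitch the overlap back on afterwards.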