From b1e361882d4b30df26724ba427b0e4663b67147d Mon Sep 17 00:00:00 2001 From: Zach Nussbaum Date: Wed, 12 Apr 2023 03:51:29 +0000 Subject: [PATCH] fix: multi-turn data breaks --- data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/data.py b/data.py index 7d61154d..dc404af1 100644 --- a/data.py +++ b/data.py @@ -15,8 +15,8 @@ def tokenize_inputs(config, tokenizer, examples): out = {"labels": [], "input_ids": []} for prompt, response in zip(examples["prompt"], examples["response"]): if different_eos: - if response.count("") > 0: - response = response.replace("", tokenizer.eos_token) + if response.count(" \n") > 0: + response = response.replace(" \n", f"{tokenizer.eos_token} \n") prompt_len = len(tokenizer(prompt + "\n", return_tensors="pt")["input_ids"][0])