Merge pull request #57 from martinnj/fix/tokenize-dialogue-regex-backref

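Fix a regex backreference issue in `tokenize_dialogue`: the replacement template now uses `\g<1>` instead of `\1`, so a character name (`name2`) that begins with a digit is no longer read as part of the group number.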
This commit is contained in:
oobabooga 2023-02-05 14:09:19 -03:00 committed by GitHub
commit a2519ede90
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -532,7 +532,7 @@ if args.chat or args.cai_chat:
dialogue = re.sub('<START>', '', dialogue)
dialogue = re.sub('<start>', '', dialogue)
dialogue = re.sub('(\n|^)[Aa]non:', '\\1You:', dialogue)
dialogue = re.sub('(\n|^)\[CHARACTER\]:', f'\\1{name2}:', dialogue)
dialogue = re.sub('(\n|^)\[CHARACTER\]:', f'\\g<1>{name2}:', dialogue)
idx = [m.start() for m in re.finditer(f"(^|\n)({name1}|{name2}):", dialogue)]
if len(idx) == 0:
return _history