Move towards HF LLaMA implementation

oobabooga 2023-03-05 01:20:31 -03:00
parent bd8aac8fa4
commit c33715ad5b
6 changed files with 4 additions and 245 deletions

View File

@@ -1,96 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# This software may be used and distributed according to the terms of the GNU General Public License version 3.
-import json
-import os
-import sys
-import time
-from pathlib import Path
-from typing import Tuple
-import fire
-import torch
-from fairscale.nn.model_parallel.initialize import initialize_model_parallel
-from llama import LLaMA, ModelArgs, Tokenizer, Transformer
-os.environ['RANK'] = '0'
-os.environ['WORLD_SIZE'] = '1'
-os.environ['MP'] = '1'
-os.environ['MASTER_ADDR'] = '127.0.0.1'
-os.environ['MASTER_PORT'] = '2223'
-def setup_model_parallel() -> Tuple[int, int]:
-    local_rank = int(os.environ.get("LOCAL_RANK", -1))
-    world_size = int(os.environ.get("WORLD_SIZE", -1))
-    torch.distributed.init_process_group("gloo")
-    initialize_model_parallel(world_size)
-    torch.cuda.set_device(local_rank)
-    # seed must be the same in all processes
-    torch.manual_seed(1)
-    return local_rank, world_size
-def load(
-    ckpt_dir: str,
-    tokenizer_path: str,
-    local_rank: int,
-    world_size: int,
-    max_seq_len: int,
-    max_batch_size: int,
-) -> LLaMA:
-    start_time = time.time()
-    checkpoints = sorted(Path(ckpt_dir).glob("*.pth"))
-    assert world_size == len(
-        checkpoints
-    ), f"Loading a checkpoint for MP={len(checkpoints)} but world size is {world_size}"
-    ckpt_path = checkpoints[local_rank]
-    print("Loading")
-    checkpoint = torch.load(ckpt_path, map_location="cpu")
-    with open(Path(ckpt_dir) / "params.json", "r") as f:
-        params = json.loads(f.read())
-    model_args: ModelArgs = ModelArgs(
-        max_seq_len=max_seq_len, max_batch_size=max_batch_size, **params
-    )
-    tokenizer = Tokenizer(model_path=tokenizer_path)
-    model_args.vocab_size = tokenizer.n_words
-    torch.set_default_tensor_type(torch.cuda.HalfTensor)
-    model = Transformer(model_args)
-    torch.set_default_tensor_type(torch.FloatTensor)
-    model.load_state_dict(checkpoint, strict=False)
-    generator = LLaMA(model, tokenizer)
-    print(f"Loaded in {time.time() - start_time:.2f} seconds")
-    return generator
-class LLaMAModel:
-    def __init__(self):
-        pass
-    @classmethod
-    def from_pretrained(self, path, max_seq_len=2048, max_batch_size=1):
-        tokenizer_path = path / "tokenizer.model"
-        path = os.path.abspath(path)
-        tokenizer_path = os.path.abspath(tokenizer_path)
-        local_rank, world_size = setup_model_parallel()
-        if local_rank > 0:
-            sys.stdout = open(os.devnull, "w")
-        generator = load(
-            path, tokenizer_path, local_rank, world_size, max_seq_len, max_batch_size
-        )
-        result = self()
-        result.pipeline = generator
-        return result
-    def generate(self, prompt, token_count=512, temperature=0.8, top_p=0.95):
-        results = self.pipeline.generate(
-            [prompt], max_gen_len=token_count, temperature=temperature, top_p=top_p
-        )
-        return results[0]
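With this fairscale-based loader deleted, a LLaMA checkpoint is expected to go through the same Hugging Face code path as every other model in the webui, provided it has first been converted to the HF format. A minimal sketch of that path, assuming a converted checkpoint under a hypothetical models/llama-7b directory and assuming the pinned transformers fork exposes LLaMA through the Auto classes:

from pathlib import Path

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# The raw Meta *.pth shards handled by the deleted module above will not load here;
# the checkpoint must already be converted to the Hugging Face format.
model_path = Path("models/llama-7b")  # hypothetical local path
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_path)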

View File

@@ -1,125 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# This software may be used and distributed according to the terms of the GNU General Public License version 3.
-from typing import Tuple
-import os
-import sys
-import torch
-import fire
-import time
-import json
-from pathlib import Path
-from fairscale.nn.model_parallel.initialize import initialize_model_parallel
-from repositories.llama_int8.llama import ModelArgs, Transformer, Tokenizer, LLaMA
-def setup_model_parallel() -> Tuple[int, int]:
-    local_rank = int(os.environ.get("LOCAL_RANK", -1))
-    world_size = int(os.environ.get("WORLD_SIZE", -1))
-    torch.distributed.init_process_group("nccl")
-    initialize_model_parallel(world_size)
-    torch.cuda.set_device(local_rank)
-    # seed must be the same in all processes
-    torch.manual_seed(1)
-    return local_rank, world_size
-def load(
-    ckpt_dir: str,
-    tokenizer_path: str,
-    max_seq_len: int,
-    max_batch_size: int,
-) -> LLaMA:
-    start_time = time.time()
-    checkpoints = sorted(Path(ckpt_dir).glob("*.pth"))
-    with open(Path(ckpt_dir) / "params.json", "r") as f:
-        params = json.loads(f.read())
-    model_args: ModelArgs = ModelArgs(
-        max_seq_len=max_seq_len, max_batch_size=max_batch_size, **params
-    )
-    tokenizer = Tokenizer(model_path=tokenizer_path)
-    model_args.vocab_size = tokenizer.n_words
-    # torch.set_default_tensor_type(torch.cuda.HalfTensor)
-    torch.set_default_tensor_type(torch.HalfTensor)
-    print("Creating transformer")
-    model = Transformer(model_args)
-    print("Transformer created")
-    key_to_dim = {
-        "w1": 0,
-        "w2": -1,
-        "w3": 0,
-        "wo": -1,
-        "wq": 0,
-        "wk": 0,
-        "wv": 0,
-        "output": 0,
-        "tok_embeddings": -1,
-        "ffn_norm": None,
-        "attention_norm": None,
-        "norm": None,
-        "rope": None,
-    }
-    # ?
-    torch.set_default_tensor_type(torch.FloatTensor)
-    # load the state dict incrementally, to avoid memory problems
-    for i, ckpt in enumerate(checkpoints):
-        print(f"Loading checkpoint {i}")
-        checkpoint = torch.load(ckpt, map_location="cpu")
-        for parameter_name, parameter in model.named_parameters():
-            short_name = parameter_name.split(".")[-2]
-            if key_to_dim[short_name] is None and i == 0:
-                parameter.data = checkpoint[parameter_name]
-            elif key_to_dim[short_name] == 0:
-                size = checkpoint[parameter_name].size(0)
-                parameter.data[size * i : size * (i + 1), :] = checkpoint[
-                    parameter_name
-                ]
-            elif key_to_dim[short_name] == -1:
-                size = checkpoint[parameter_name].size(-1)
-                parameter.data[:, size * i : size * (i + 1)] = checkpoint[
-                    parameter_name
-                ]
-        del checkpoint
-    # model.load_state_dict(checkpoint, strict=False)
-    model.quantize()
-    generator = LLaMA(model, tokenizer)
-    print(f"Loaded in {time.time() - start_time:.2f} seconds")
-    return generator
-class LLaMAModel_8bit:
-    def __init__(self):
-        pass
-    @classmethod
-    def from_pretrained(self, path, max_seq_len=2048, max_batch_size=1):
-        tokenizer_path = path / "tokenizer.model"
-        path = os.path.abspath(path)
-        tokenizer_path = os.path.abspath(tokenizer_path)
-        generator = load(path, tokenizer_path, max_seq_len, max_batch_size)
-        result = self()
-        result.pipeline = generator
-        return result
-    def generate(self, prompt, token_count=512, temperature=0.8, top_p=0.95):
-        results = self.pipeline.generate(
-            [prompt], max_gen_len=token_count, temperature=temperature, top_p=top_p
-        )
-        return results[0]
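The key_to_dim table above is what lets the loop below it reassemble the model-parallel shards: weights split along dim 0 are copied back row-block by row-block, weights split along dim -1 column-block by column-block, and entries mapped to None (the norms) are taken from the first shard only. A standalone sketch of that merge with toy tensors (shapes and names are illustrative, not part of the deleted code):

import torch

# Two shards of a weight that was split along dim 0 across two ranks.
shards = [torch.full((2, 4), float(i)) for i in range(2)]

merged = torch.empty(4, 4)
for i, shard in enumerate(shards):
    size = shard.size(0)
    # Same row-block indexing as the key_to_dim[short_name] == 0 branch above.
    merged[size * i : size * (i + 1), :] = shard

assert merged[0, 0].item() == 0 and merged[2, 0].item() == 1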

View File

@@ -39,10 +39,9 @@ def load_model(model_name):
     t0 = time.time()
     shared.is_RWKV = model_name.lower().startswith('rwkv-')
-    shared.is_LLaMA = model_name.lower().startswith('llama-')
     # Default settings
-    if not (shared.args.cpu or shared.args.load_in_8bit or shared.args.auto_devices or shared.args.disk or shared.args.gpu_memory is not None or shared.args.cpu_memory is not None or shared.args.deepspeed or shared.args.flexgen or shared.is_RWKV or shared.is_LLaMA):
+    if not (shared.args.cpu or shared.args.load_in_8bit or shared.args.auto_devices or shared.args.disk or shared.args.gpu_memory is not None or shared.args.cpu_memory is not None or shared.args.deepspeed or shared.args.flexgen or shared.is_RWKV):
         if any(size in shared.model_name.lower() for size in ('13b', '20b', '30b')):
             model = AutoModelForCausalLM.from_pretrained(Path(f"models/{shared.model_name}"), device_map='auto', load_in_8bit=True)
         else:
@@ -86,23 +85,6 @@
         return model, None
-    # LLaMA model (not on HuggingFace)
-    elif shared.is_LLaMA:
-        if shared.args.load_in_8bit:
-            import modules.LLaMA_8bit
-            from modules.LLaMA_8bit import LLaMAModel_8bit
-            model = LLaMAModel_8bit.from_pretrained(Path(f'models/{model_name}'))
-            return model, None
-        else:
-            import modules.LLaMA
-            from modules.LLaMA import LLaMAModel
-            model = LLaMAModel.from_pretrained(Path(f'models/{model_name}'))
-            return model, None
     # Custom
     else:
         command = "AutoModelForCausalLM.from_pretrained"
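With the elif shared.is_LLaMA branch removed, 8-bit LLaMA loading is expected to ride on the bitsandbytes-backed load_in_8bit=True path that load_model already uses above, rather than on the deleted repositories.llama_int8 code. A sketch under that assumption (the checkpoint path is hypothetical and must already be in HF format):

from pathlib import Path

from transformers import AutoModelForCausalLM

# Mirrors the load_in_8bit=True call near the top of load_model();
# 8-bit loading relies on the bitsandbytes package.
model = AutoModelForCausalLM.from_pretrained(
    Path("models/llama-13b"),  # hypothetical converted checkpoint
    device_map="auto",
    load_in_8bit=True,
)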

View File

@@ -6,7 +6,6 @@ model_name = ""
 soft_prompt_tensor = None
 soft_prompt = False
 is_RWKV = False
-is_LLaMA = False
 # Chat variables
 history = {'internal': [], 'visible': []}
@@ -43,7 +42,6 @@ settings = {
         'default': 'NovelAI-Sphinx Moth',
         'pygmalion-*': 'Pygmalion',
         'RWKV-*': 'Naive',
-        'llama-*': 'Naive',
         '(rosey|chip|joi)_.*_instruct.*': 'Instruct Joi (Contrastive Search)'
     },
     'prompts': {

View File

@@ -24,7 +24,7 @@ def encode(prompt, tokens_to_generate=0, add_special_tokens=True):
     # These models do not have explicit tokenizers for now, so
     # we return an estimate for the number of tokens
-    if shared.is_RWKV or shared.is_LLaMA:
+    if shared.is_RWKV:
         return np.zeros((1, len(prompt)//4))
     input_ids = shared.tokenizer.encode(str(prompt), return_tensors='pt', truncation=True, max_length=get_max_prompt_length(tokens_to_generate), add_special_tokens=add_special_tokens)
@@ -90,7 +90,7 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
     # These models are not part of Hugging Face, so we handle them
     # separately and terminate the function call earlier
-    if shared.is_RWKV or shared.is_LLaMA:
+    if shared.is_RWKV:
         if shared.args.no_stream:
             reply = shared.model.generate(question, token_count=max_new_tokens, temperature=temperature, top_p=top_p)
             t1 = time.time()
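The surviving RWKV branch keeps the rough one-token-per-four-characters estimate that encode() returns for models without a Hugging Face tokenizer; only the shape of the returned array matters to the callers. A quick illustration (the prompt string and shape are just examples):

import numpy as np

prompt = "Hello, how are you today?"            # 25 characters
fake_input_ids = np.zeros((1, len(prompt)//4))  # shape (1, 6): roughly four characters per token
print(fake_input_ids.shape)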

View File

@@ -5,4 +5,4 @@ gradio==3.18.0
 numpy
 rwkv==0.0.6
 safetensors==0.2.8
-git+https://github.com/huggingface/transformers
+git+https://github.com/oobabooga/transformers@llama_push
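The last line swaps the upstream transformers pin for oobabooga's llama_push branch, which carried the in-progress Hugging Face LLaMA implementation before it was merged upstream. Installing that dependency by hand uses the usual git-URL form (the command itself is not part of the diff):

pip install git+https://github.com/oobabooga/transformers@llama_push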