added training code

Saifeddine ALOUI 2023-06-06 22:20:29 +02:00
parent b4694fee2c
commit 584a1f6f03
14 changed files with 602 additions and 43 deletions

26
app.py
View File

@@ -876,6 +876,32 @@ class LoLLMsWebUI(LoLLMsAPPI):
                print(f"Problem with model : {model}")
        return jsonify(models)

    # NOTE: relies on `request`, `jsonify`, `yaml` and `subprocess` being
    # imported at the top of app.py
    def train(self):
        form_data = request.form

        # Create and populate the config file (form values arrive as strings,
        # so cast the numeric fields before dumping)
        config = {
            'model_name': form_data['model_name'],
            'tokenizer_name': form_data['tokenizer_name'],
            'dataset_path': form_data['dataset_path'],
            'max_length': int(form_data['max_length']),
            'batch_size': int(form_data['batch_size']),
            'lr': float(form_data['lr']),
            'num_epochs': int(form_data['num_epochs']),
            'output_dir': form_data['output_dir'],
        }

        with open('train/configs/train/local_cfg.yaml', 'w') as f:
            yaml.dump(config, f)

        # Trigger the train.py script, equivalent to:
        # accelerate launch --dynamo_backend=inductor --num_processes=8 --num_machines=1 --machine_rank=0 --deepspeed_multinode_launcher standard --mixed_precision=bf16 --use_deepspeed --deepspeed_config_file=configs/deepspeed/ds_config_gptj.json train.py --config configs/train/finetune_gptj.yaml
        # Note: a flag and its value must be separate argv entries, so
        # "--deepspeed_multinode_launcher standard" is split in two.
        subprocess.check_call([
            "accelerate", "launch",
            "--dynamo_backend=inductor",
            "--num_processes=8",
            "--num_machines=1",
            "--machine_rank=0",
            "--deepspeed_multinode_launcher", "standard",
            "--mixed_precision=bf16",
            "--use_deepspeed",
            "--deepspeed_config_file=train/configs/deepspeed/ds_config_gptj.json",
            "train/train.py",
            "--config", "train/configs/train/local_cfg.yaml",
        ])
        return jsonify({'message': 'Training started'})

    def get_config(self):
        return jsonify(self.config.to_dict())
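One design note: `subprocess.check_call` blocks until the accelerate process exits, so the `'Training started'` response is only sent once the whole training run has finished. A minimal non-blocking sketch, not part of this commit (`launch_training_async` is a hypothetical helper):

import subprocess

def launch_training_async(config_path="train/configs/train/local_cfg.yaml"):
    # Popen returns immediately; keep the handle to poll() or terminate() later
    return subprocess.Popen([
        "accelerate", "launch",
        "--dynamo_backend=inductor",
        "--num_processes=8",
        "--num_machines=1",
        "--machine_rank=0",
        "--deepspeed_multinode_launcher", "standard",
        "--mixed_precision=bf16",
        "--use_deepspeed",
        "--deepspeed_config_file=train/configs/deepspeed/ds_config_gptj.json",
        "train/train.py",
        "--config", config_path,
    ])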

2
train/.gitignore vendored Normal file
View File

@@ -0,0 +1,2 @@
output
!output/.keep

train/configs/deepspeed/ds_config_gptj.json Normal file
View File

@@ -0,0 +1,48 @@
{
    "train_batch_size": "auto",
    "gradient_accumulation_steps": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "fp16": {
        "enabled": "auto",
        "min_loss_scale": 1,
        "loss_scale_window": 1000,
        "hysteresis": 2,
        "initial_scale_power": 32
    },
    "bf16": {
        "enabled": "auto"
    },
    "gradient_clipping": 1,
    "zero_optimization": {
        "stage": 2,
        "offload_param": {
            "device": "none"
        },
        "offload_optimizer": {
            "device": "none"
        },
        "allgather_partitions": true,
        "allgather_bucket_size": 5e8,
        "contiguous_gradients": true
    },
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": [
                0.9,
                0.999
            ],
            "eps": 1e-08
        }
    },
    "scheduler": {
        "type": "WarmupLR",
        "params": {
            "warmup_min_lr": 0,
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto",
            "warmup_type": "linear"
        }
    }
}
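The "auto" entries are placeholders that Accelerate/DeepSpeed fill in at launch time rather than literal values. The invariant DeepSpeed enforces is that the global batch size equals micro-batch times gradient-accumulation times world size; a small illustration with assumed values:

# Hypothetical values: batch_size 4 from a train YAML below, no accumulation,
# and the 8 processes requested by the accelerate launch command in app.py.
micro_batch_per_gpu = 4
grad_accum_steps = 1
world_size = 8

# DeepSpeed requires: train_batch_size == micro * accum * world_size
train_batch_size = micro_batch_per_gpu * grad_accum_steps * world_size
print(train_batch_size)  # 32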

1
train/configs/train/.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
local_cfg.yaml

View File

@@ -0,0 +1,29 @@
# model/tokenizer
model_name: # add model here
tokenizer_name: # add model here
gradient_checkpointing: true
save_name: # CHANGE
# dataset
streaming: false
num_proc: 64
dataset_path: # update
max_length: 1024
batch_size: 32
# train dynamics
lr: 5.0e-5
eval_every: 800
eval_steps: 100
save_every: 800
output_dir: # CHANGE
checkpoint: null
lora: false
warmup_steps: 100
num_epochs: 2
# logging
wandb: true
wandb_entity: # update
wandb_project_name: # update
seed: 42
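train.py loads these YAML files through `read_config` from a `read.py` module that is not part of this commit. A minimal sketch of what it presumably does (an assumption, not the shipped implementation):

import yaml

def read_config(path):
    # parse the training YAML (e.g. train/configs/train/local_cfg.yaml)
    # into the plain dict that train.py indexes by key
    with open(path, "r") as f:
        return yaml.safe_load(f)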

View File

@@ -0,0 +1,31 @@
# model/tokenizer
model_name: # update
tokenizer_name: # update
gradient_checkpointing: false
save_name: # CHANGE
# dataset
streaming: false
num_proc: 64
dataset_path: # CHANGE
max_length: 1024
batch_size: 4
# train dynamics
lr: 5.0e-5
min_lr: 0
weight_decay: 0.0
eval_every: 2000
eval_steps: 100
save_every: 2000
output_dir: # CHANGE
checkpoint: null
lora: true
warmup_steps: 100
num_epochs: 2
# logging
wandb: true
wandb_entity: # update
wandb_project_name: # update
seed: 42
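With `lora: true`, train.py wraps the model in a rank-8 LoraConfig. For a rough sense of scale, here is the trainable-parameter count that implies, assuming LLaMA-7B dimensions and peft's default LLaMA target modules (q_proj and v_proj); all numbers are illustrative assumptions:

hidden = 4096   # LLaMA-7B hidden size
layers = 32     # LLaMA-7B decoder layers
r = 8           # rank from train.py's LoraConfig

per_matrix = r * (hidden + hidden)   # A: r x d_in, plus B: d_out x r
trainable = per_matrix * 2 * layers  # q_proj and v_proj in every layer
print(f"{trainable:,}")              # 4,194,304 (~0.06% of ~7B weights)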

View File

@@ -0,0 +1,31 @@
# model/tokenizer
model_name: jondurbin/airoboros-7b-gpt4 # update
tokenizer_name: jondurbin/airoboros-7b-gpt4 # update
gradient_checkpointing: false
save_name: parisneo-7b_gpt42_lora # CHANGE
# dataset
streaming: false
num_proc: 64
dataset_path: # CHANGE
max_length: 1024
batch_size: 4
# train dynamics
lr: 5.0e-5
min_lr: 0
weight_decay: 0.0
eval_every: 2000
eval_steps: 100
save_every: 2000
output_dir: output # CHANGE
checkpoint: null
lora: true
warmup_steps: 100
num_epochs: 2
# logging
wandb: false # update if you want to use weights and biases
wandb_entity: # update
wandb_project_name: # update
seed: 42
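Since this config trains LoRA adapters rather than full weights, inference typically reloads the base model and applies (or merges) the adapter that train.py saves. A sketch, assuming the adapter weights ended up under `output/final`:

from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

base = AutoModelForCausalLM.from_pretrained("jondurbin/airoboros-7b-gpt4")
model = PeftModel.from_pretrained(base, "output/final")
model = model.merge_and_unload()  # fold the LoRA deltas into the base weights

tokenizer = AutoTokenizer.from_pretrained("jondurbin/airoboros-7b-gpt4")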

15
train/requirements.txt Normal file
View File

@@ -0,0 +1,15 @@
accelerate
datasets
torchmetrics
evaluate
transformers>=4.28.0
wandb
pip
peft
nodelist-inflator
deepspeed
sentencepiece
jsonlines
nomic
scikit-learn
matplotlib

233
train/train.py Normal file
View File

@@ -0,0 +1,233 @@
import os
from transformers import AutoModelForCausalLM, AutoTokenizer, get_scheduler, LlamaForCausalLM
import torch
from torch.optim import AdamW
from argparse import ArgumentParser
from read import read_config
from accelerate import Accelerator
from accelerate.utils import DummyScheduler, DummyOptim, set_seed
from peft import get_peft_model, LoraConfig, TaskType
from data import load_data
from torchmetrics import MeanMetric
from tqdm import tqdm
import wandb

torch.backends.cuda.matmul.allow_tf32 = True


def format_metrics(metrics, split, prefix=""):
    log = f"[{split}]" + prefix
    log += " ".join([f"{key}: {value:.4f}" for key, value in metrics.items()])
    return log

def evaluate(model, val_dataloader):
    # `accelerator` resolves to the module-level instance created in the
    # __main__ block below, so this helper only works when run as a script
    model.eval()
    val_loss = MeanMetric(nan_strategy="error").to(model.device)

    with torch.no_grad():
        for batch in tqdm(val_dataloader):
            loss = model(**batch).loss
            loss_values = accelerator.gather_for_metrics({"loss": loss.detach()})
            val_loss.update(loss_values["loss"])

    return val_loss

def train(accelerator, config):
    set_seed(config['seed'])

    accelerator.print(config)
    accelerator.print(f"Using {accelerator.num_processes} GPUs")

    tokenizer = AutoTokenizer.from_pretrained(config['tokenizer_name'], model_max_length=config['max_length'])
    # if no pad token, set it to eos
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    with accelerator.main_process_first():
        train_dataloader, val_dataloader = load_data(config, tokenizer)

    checkpoint = config["gradient_checkpointing"]
    model = AutoModelForCausalLM.from_pretrained(config["model_name"],
                                                 use_cache=False if checkpoint else True,
                                                 trust_remote_code=True)
    if checkpoint:
        model.gradient_checkpointing_enable()

    if config["lora"]:
        peft_config = LoraConfig(
            # should R be configurable?
            task_type=TaskType.CAUSAL_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1
        )
        model = get_peft_model(model, peft_config)
        model.print_trainable_parameters()

    optimizer_cls = (
        AdamW
        if accelerator.state.deepspeed_plugin is None
        or "optimizer" not in accelerator.state.deepspeed_plugin.deepspeed_config
        else DummyOptim
    )

    # karpathy doesn't decay embedding, maybe we should exclude
    # https://github.com/karpathy/minGPT/commit/bbbdac74fa9b2e55574d70056163ffbae42310c1#diff-2075fa9c224b395be5bda85544dd36572b59c76c54562819eadadbf268602834R157s
    # weight_decay/min_lr are missing from some of the shipped configs,
    # so default them rather than raising a KeyError
    optimizer = optimizer_cls(model.parameters(), lr=config["lr"], weight_decay=config.get("weight_decay", 0.0))

    if accelerator.state.deepspeed_plugin is not None:
        gradient_accumulation_steps = accelerator.state.deepspeed_plugin.deepspeed_config[
            "gradient_accumulation_steps"
        ]
    else:
        # no DeepSpeed plugin to read this from; fall back to no accumulation
        gradient_accumulation_steps = 1

    # decay to min_lr instead of 0
    lr_ratio = config.get("min_lr", 0) / config["lr"]
    accelerator.print(f"Len of train_dataloader: {len(train_dataloader)}")
    total_num_steps = (len(train_dataloader) / gradient_accumulation_steps) * config["num_epochs"]
    # instead of decaying to zero, decay to ratio of min_lr / lr
    total_num_steps += int(total_num_steps * lr_ratio) + config["warmup_steps"]
    accelerator.print(f"Total training steps: {total_num_steps}")
    # Creates Dummy Scheduler if `scheduler` was specified in the config file else creates `args.lr_scheduler_type` Scheduler
    if (
        accelerator.state.deepspeed_plugin is None
        or "scheduler" not in accelerator.state.deepspeed_plugin.deepspeed_config
    ):
        scheduler = get_scheduler(
            name="cosine",
            optimizer=optimizer,
            num_warmup_steps=config["warmup_steps"] * accelerator.num_processes,
            num_training_steps=total_num_steps,
        )
    else:
        scheduler = DummyScheduler(
            # schedule over the full run, not just the warmup phase
            optimizer, total_num_steps=total_num_steps, warmup_num_steps=config["warmup_steps"]
        )

    model, optimizer, train_dataloader, val_dataloader, scheduler = accelerator.prepare(
        model, optimizer, train_dataloader, val_dataloader, scheduler
    )

    # setup for saving training states in case preemption
    accelerator.register_for_checkpointing(scheduler)

    if config["checkpoint"]:
        accelerator.load_state(config["checkpoint"])
        accelerator.print(f"Resumed from checkpoint: {config['checkpoint']}")
        # derive the step number from the checkpoint directory name (step_<n>)
        path = os.path.basename(config["checkpoint"])
        training_difference = os.path.splitext(path)[0]
        resume_step = int(training_difference.replace("step_", ""))
        # skip_first_batches returns the shortened dataloader; keep it
        train_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step)
        accelerator.print(f"Resuming from step {resume_step}")

    # log gradients
    if accelerator.is_main_process and config["wandb"]:
        # log_grads_every is not defined in the shipped configs; default it
        wandb.watch(model, log_freq=config.get("log_grads_every", 100), log="all")
    for epoch in range(config["num_epochs"]):
        train_loss = MeanMetric(nan_strategy="error").to(model.device)
        for step, batch in enumerate(tqdm(train_dataloader)):
            model.train()
            outputs = model(**batch)
            loss = outputs.loss

            # gather loss before backprop in case of gradient accumulation
            loss_values = accelerator.gather_for_metrics({"loss": loss.detach().float()})
            train_loss.update(loss_values["loss"])

            loss = loss / gradient_accumulation_steps
            accelerator.backward(loss)

            # get gradient norm of all params
            # log LR in case something weird happens
            if step > 0 and step % (config["eval_every"] // 10) == 0:
                if config["wandb"]:
                    curr_step = step + epoch * len(train_dataloader)
                    accelerator.log({"lr": scheduler.get_last_lr()[0]}, step=curr_step)

            if (step + 1) % gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()

            if step > 0 and step % config["save_every"] == 0:
                curr_step = step + epoch * len(train_dataloader)
                accelerator.save_state(f"{config['output_dir']}/step_{curr_step}")

            if step > 0 and (step % config["eval_every"] == 0 or step == len(train_dataloader) - 1):
                val_loss = evaluate(model, val_dataloader)

                log_train = {
                    "train_loss": train_loss.compute()
                }
                log_val = {
                    "val_loss": val_loss.compute()
                }

                if config["wandb"]:
                    curr_step = step + epoch * len(train_dataloader)
                    accelerator.log({**log_train, **log_val}, step=curr_step)

                accelerator.print(f"Current LR: {scheduler.get_last_lr()[0]}")
                accelerator.print(format_metrics(log_train, "train", f" step {step} "))
                accelerator.print(format_metrics(log_val, "val", f" step {step} "))

                train_loss.reset()

        accelerator.print(f"Epoch {epoch} finished")
        accelerator.print("Pushing to HF hub")
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        try:
            if accelerator.is_main_process:
                unwrapped_model.push_to_hub(config["save_name"] + f"-epoch_{epoch}", private=True)
        except Exception as e:
            accelerator.print(e)
            accelerator.print("Failed to push to hub")

        unwrapped_model.save_pretrained(
            f"{config['output_dir']}/epoch_{epoch}",
            is_main_process=accelerator.is_main_process,
            save_function=accelerator.save,
            state_dict=accelerator.get_state_dict(model),
        )

    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(
        f"{config['output_dir']}/final",
        is_main_process=accelerator.is_main_process,
        save_function=accelerator.save,
        state_dict=accelerator.get_state_dict(model),
    )

    accelerator.end_training()

if __name__ == "__main__":
    # parse arguments by reading in a config
    parser = ArgumentParser()
    parser.add_argument("--config", type=str, default="config.yaml")

    args = parser.parse_args()
    config = read_config(args.config)

    if config["wandb"]:
        accelerator = Accelerator(log_with="wandb")
        accelerator.init_trackers(
            project_name=config["wandb_project_name"],
            config=config,
            init_kwargs={"wandb": {"entity": config["wandb_entity"]}},
        )
    else:
        accelerator = Accelerator()

    train(accelerator, config=config)
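train.py also imports `load_data` from a `data.py` that is not included in this diff. Whatever it does internally, it must return `(train_dataloader, val_dataloader)` whose batches a causal LM accepts via `model(**batch)`. A rough sketch under that contract, assuming a parquet dataset with a `text` column (the column name and split ratio are assumptions):

from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import DefaultDataCollator

def load_data(config, tokenizer):
    # dataset_path points at the parquet file selected in the web UI
    ds = load_dataset("parquet", data_files=config["dataset_path"])["train"]
    ds = ds.train_test_split(test_size=0.05, seed=config["seed"])

    def tokenize(batch):
        out = tokenizer(batch["text"], truncation=True, padding="max_length",
                        max_length=config["max_length"])
        out["labels"] = out["input_ids"].copy()  # causal LM: predict the input
        return out

    ds = ds.map(tokenize, batched=True, remove_columns=ds["train"].column_names)
    collator = DefaultDataCollator()
    train_dl = DataLoader(ds["train"], batch_size=config["batch_size"],
                          shuffle=True, collate_fn=collator)
    val_dl = DataLoader(ds["test"], batch_size=config["batch_size"],
                        collate_fn=collator)
    return train_dl, val_dl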

web/dist/assets/index-f5f472ed.js vendored Normal file

File diff suppressed because one or more lines are too long

1
web/dist/assets/index-54621153.css vendored Normal file

File diff suppressed because one or more lines are too long

4
web/dist/index.html vendored
View File

@@ -6,8 +6,8 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>GPT4All - WEBUI</title>
<script type="module" crossorigin src="/assets/index-0344eb9b.js"></script>
<link rel="stylesheet" href="/assets/index-488cca87.css">
<script type="module" crossorigin src="/assets/index-f5f472ed.js"></script>
<link rel="stylesheet" href="/assets/index-54621153.css">
</head>
<body>
<div id="app"></div>

View File

@@ -1,16 +1,159 @@
<template>
<div class="container overflow-y-scroll flex flex-col no-scrollbar shadow-lg p-10 pt-0">
<form @submit.prevent="submitForm" class="max-w-md mx-auto">
<!-- Model/Tokenizer -->
<div class="mb-4">
<label for="model_name" class="text-sm">Model Name:</label>
<input
type="text"
id="model_name"
v-model="model_name"
required
class="w-full mt-1 px-2 py-1 border border-gray-300 rounded"
>
</div>
<div class="mb-4">
<label for="tokenizer_name" class="text-sm">Tokenizer Name:</label>
<input
type="text"
id="tokenizer_name"
v-model="tokenizer_name"
required
class="w-full mt-1 px-2 py-1 border border-gray-300 rounded"
>
</div>
<!-- Dataset -->
<div class="mb-4">
<label for="dataset_path" class="text-sm">Dataset:</label>
<input
type="file"
id="dataset_path"
ref="dataset_path"
accept=".parquet"
v-on:change="selectDatasetPath"
class="w-full mt-1 px-2 py-1 border border-gray-300 rounded"
>
<p class="mt-2 text-xs">Selected File: {{ selectedDatasetPath }}</p>
</div>
<div class="mb-4">
<label for="max_length" class="text-sm">Max Length:</label>
<input
type="number"
id="max_length"
v-model.number="max_length"
required
class="w-full mt-1 px-2 py-1 border border-gray-300 rounded"
>
</div>
<div class="mb-4">
<label for="batch_size" class="text-sm">Batch Size:</label>
<input
type="number"
id="batch_size"
v-model.number="batch_size"
required
class="w-full mt-1 px-2 py-1 border border-gray-300 rounded"
>
</div>
<!-- Train Dynamics -->
<div class="mb-4">
<label for="lr" class="text-sm">Learning Rate:</label>
<input
type="number"
id="lr"
v-model.number="lr"
required
class="w-full mt-1 px-2 py-1 border border-gray-300 rounded"
>
</div>
<div class="mb-4">
<label for="num_epochs" class="text-sm">Number of Epochs:</label>
<input
type="number"
id="num_epochs"
v-model.number="num_epochs"
required
class="w-full mt-1 px-2 py-1 border border-gray-300 rounded"
>
</div>
<!-- Output -->
<div class="mb-4">
<label for="output_dir" class="text-sm">Output Directory:</label>
<input
type="text"
id="output_dir"
v-model="selectedFolder"
class="w-full mt-1 px-2 py-1 border border-gray-300 rounded"
placeholder="Enter or select the output folder"
>
<input
type="file"
id="folder_selector"
ref="folder_selector"
style="display: none"
webkitdirectory
v-on:change="selectOutputDirectory"
>
<button type="button" @click="openFolderSelector" class="bg-blue-500 text-white px-4 py-2 rounded">Select Folder</button>
</div>
<button type="submit" class="bg-blue-500 text-white px-4 py-2 rounded">Train LLM</button>
</form>
</div>
</template>
<script>
export default {
  data() {
    return {
      model_name: 'jondurbin/airoboros-7b-gpt4',
      tokenizer_name: 'jondurbin/airoboros-7b-gpt4',
      dataset_path: '',
      max_length: 1024,
      batch_size: 4,
      lr: 5.0e-5,
      num_epochs: 2,
      selectedFolder: '',
      selectedDatasetPath: '',
    };
  },
  methods: {
    submitForm() {
      const formData = {
        model_name: this.model_name,
        tokenizer_name: this.tokenizer_name,
        dataset_path: this.selectedDatasetPath,
        max_length: this.max_length,
        batch_size: this.batch_size,
        lr: this.lr,
        num_epochs: this.num_epochs,
        output_dir: this.selectedFolder,
      };
      // Send the form data to the backend
      // ...
    },
    openFolderSelector() {
      this.$refs.folder_selector.click();
    },
    selectOutputDirectory(event) {
      const folderPath = event.target.files[0]?.path;
      if (folderPath) {
        this.selectedFolder = folderPath;
      }
    },
    selectDatasetPath(event) {
      const files = event.target.files;
      if (files.length > 0) {
        // webkitRelativePath is empty for a plain file input; fall back to the
        // file name (only Electron-style environments expose a full .path)
        this.selectedDatasetPath = files[0].path || files[0].name;
      }
    },
  },
};
</script>
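The `submitForm` placeholder above would POST these fields to the backend's `train()` handler. For reference, the equivalent request from Python; the route and port are assumptions, since the URL rule binding `train()` is not shown in this diff:

import requests

resp = requests.post("http://localhost:9600/train", data={
    "model_name": "jondurbin/airoboros-7b-gpt4",
    "tokenizer_name": "jondurbin/airoboros-7b-gpt4",
    "dataset_path": "data/train.parquet",  # hypothetical dataset file
    "max_length": 1024,
    "batch_size": 4,
    "lr": 5.0e-5,
    "num_epochs": 2,
    "output_dir": "output",
})
print(resp.json())  # {'message': 'Training started'}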