diff --git a/legacy.py b/legacy.py
index 9a33a769..dc1d9335 100644
--- a/legacy.py
+++ b/legacy.py
@@ -1,9 +1,10 @@
-import json
-import torch
 from aspen import LlamaModel, Tokenizer, DataSet
 from aspen import LlamaModelArgs, MultiLoraBatchData
 from aspen import load_llama_7b_weight, load_random_lora_7b_weight
 from aspen import save_lora_model
+
+import json
+import torch
 import torch.optim
 
 with open('config/lora.json', 'r', encoding='utf8') as fp:
@@ -47,19 +48,15 @@ def init_lora_model(llama_model: LlamaModel):
 
     torch.cuda.empty_cache()
 
-    # optim begin
-    optimizer = torch.optim.SGD(
-        llama_model.get_train_paramas(config), lr=1e-3)
-    # optim end
+    optimizer = torch.optim.AdamW(llama_model.get_train_paramas(config))
 
 
-    step = 0
-    # torch.autograd.set_detect_anomaly(True)
+    step_cnt = 0
     while not data_set.check_done():
         optimizer.zero_grad()
         loss_fn = torch.nn.CrossEntropyLoss()
         input: MultiLoraBatchData = data_set.get_batch_data()
-        step += 1
+        step_cnt += 1
 
         output = llama_model.forward(input)
         labels = torch.tensor(input.batch_tokens_,
@@ -84,11 +81,11 @@ def init_lora_model(llama_model: LlamaModel):
         total_loss.backward()
         optimizer.step()
 
-        if step % 200 == 0:
+        if step_cnt % config["save_step"] == 0:
             for lora_config in config["lora"]:
                 save_lora_model(
-                    llama_model, lora_config["output"] + f".chk{step}", lora_config["name"])
+                    llama_model, lora_config["output"] + f".bin{step_cnt}", lora_config["name"])
 
     for lora_config in config["lora"]:
         save_lora_model(
-            llama_model, lora_config["output"], lora_config["name"])
+            llama_model, lora_config["output"], lora_config["name"])
\ No newline at end of file
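
Note: the checkpoint condition now reads config["save_step"] instead of the hardcoded 200, so config/lora.json must define that key. Below is a minimal sketch (written as a Python dict for annotation) of only the fields this training loop touches; the values and the adapter name are illustrative assumptions, not taken from the repository:

    # Hypothetical shape of the relevant part of config/lora.json after this change;
    # only the keys referenced by the loop above are shown.
    config = {
        "save_step": 200,            # checkpoint every N optimizer steps (assumed value)
        "lora": [
            {
                "name": "lora_0",    # adapter name passed to save_lora_model (illustrative)
                "output": "lora_0",  # output prefix; f".bin{step_cnt}" is appended at checkpoints
            },
        ],
    }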