找回密码
 立即注册
首页 业界区 科技 大模型的监督微调

大模型的监督微调

歇凛尾 2025-7-22 00:15:41
1.Chat Template
  1. '1. 指令微调在预训练(LoRA微调)之后'
  2. # 预训练使大模型成为“领域专家”
  3. # 指令微调令大模型学会表达
  4. 每一个大模型的指令微调都不一样;
  5. 所以一定要根据官方发布的格式做指令微调
复制代码
  1. '2. Chat_Template的源代码'
  2. # Chat_Template.py
  3. from transformers import AutoModelForCausalLM, AutoTokenizer
  4. import torch
  5. model_path = r'D:\work\models\Meta-Llama-3.1-8B-Instruct'
  6. tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
  7. model = AutoModelForCausalLM.from_pretrained(model_path).to("cuda")
  8. optimizer = torch.optim.AdamW(model.parameters())
  9. dialog = [{"role": "system", "content": "You are a helpful assistant."},
  10.           {"role": "user", "content": "天空为什么是蓝色的?"},
  11.           {"role": "assistant", "content": "这是由于光的散射引起的。"}]
  12. input = tokenizer.apply_chat_template(dialog, return_tensors="pt")
  13. input = {k: v.to("cuda") for k, v in input.items()}
  14. #设置labels和inputs一致
  15. input["labels"] = input["input_ids"].clone()
  16. output = model(**input)
  17. #获取模型的loss
  18. loss = output.loss
  19. loss.backward()
  20. optimizer.step()
  21. optimizer.zero_grad()
  22. #保存模型
  23. model.save_pretrained("output_dir")
复制代码
2.Completions only
  1. '1. 只对回答部分做计算loss'
  2. 由于Chat Template会对指令的所有内容计算loss
  3. 为了优化该部分,使用Completions only
复制代码
  1. '2. Completions only的源代码'
  2. # Completions_only.py
  3. import functools
  4. import json
  5. from peft import LoraConfig, TaskType, get_peft_model
  6. from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
  7. from torch.utils.data import DataLoader, Dataset
  8. import torch
  9. class SFTDataset(Dataset):
  10.     def __init__(self, file_path, tokenizer):
  11.         super().__init__()
  12.         self.file_path = file_path
  13.         self.examples = self._load_data(self.file_path)
  14.         self.tokenizer = tokenizer
  15.     @staticmethod
  16.     def _load_data(file_path):
  17.         items = []
  18.         with open(file_path, "r", encoding="utf8")as f:
  19.             for line in f:
  20.                 item = json.loads(line)
  21.                 items.append(item)
  22.         return items
  23.     def __getitem__(self, index):
  24.         example = self.examples[index]
  25.         dialog = [{"role": "system", "content": "You are a helpful assistant."},
  26.                   {"role": "user", "content": example["query"]},
  27.                   {"role": "assistant", "content": example["answer"]}]
  28.         chat = tokenizer.apply_chat_template(dialog, tokenize=False)
  29.         return chat
  30.     def __len__(self):
  31.         return len(self.examples)
model_path = r'D:\work\models\Meta-Llama-3.1-8B-Instruct'
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
# Pad on the right so the answer tokens stay contiguous at the end of each row.
tokenizer.padding_side = "right"
# Reuse EOS for padding (this checkpoint presumably ships no pad token — TODO confirm).
tokenizer.pad_token = tokenizer.eos_token
# 4-bit NF4 quantization with double quantization; matmuls computed in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
model = AutoModelForCausalLM.from_pretrained(model_path, quantization_config=bnb_config)
# LoRA adapters (rank 8) on every attention and MLP projection matrix.
peft_config = LoraConfig(
    r=8,
    target_modules=["q_proj",
                    "v_proj",
                    "k_proj",
                    "o_proj",
                    "gate_proj",
                    "down_proj",
                    "up_proj"
                    ],
    task_type=TaskType.CAUSAL_LM,
    lora_alpha=16,
    lora_dropout=0.05
)
model = get_peft_model(model, peft_config)
# Report how many parameters the adapters leave trainable.
model.print_trainable_parameters()
model.to("cuda")
optimizer = torch.optim.AdamW(model.parameters())
  61. def sft_collate(batch, tokenizer, end_str, max_length):
  62.     end_str = "<|start_header_id|>assistant<|end_header_id""|>\n\n"
  63.     inputs = tokenizer(batch, max_length=max_length, padding=True, truncation=True)
  64.     input_ids = inputs["input_ids"]
  65.     input_len = len(input_ids[0])
  66.     end_ids = tokenizer(end_str)["input_ids"]
  67.     end_id_len = len(end_ids)
  68.     loss_mask = []
  69.     for input_id in input_ids:
  70.         for i in range(len(input_id) - end_id_len, -1, -1):
  71.             if input_id[i:i + end_id_len] == end_ids:
  72.                 mask = [1] * (input_len - 1)
  73.                 mask[:i + end_id_len - 1] = [0] * (i + end_id_len - 1)
  74.                 loss_mask.append(mask)
  75.                 break
  76.             if i == 0:  # 所有回答部分都被截断
  77.                 loss_mask.append([0] * (input_len - 1))
  78.     inputs = {k: torch.tensor(v) for k, v in inputs.items()}
  79.     loss_mask = torch.tensor(loss_mask)
  80.     return inputs, loss_mask
  81. collate_fn = functools.partial(sft_collate,
  82.                                tokenizer=tokenizer,
  83.                                end_str="<|start_header_id|>assistant<|end_header_id""|>\n\n",
  84.                                max_length=50)
  85. sft_dataset = SFTDataset("./data/sft_data.json", tokenizer)
  86. data_loader = DataLoader(sft_dataset, batch_size=2, collate_fn=collate_fn, shuffle=True)
  87. epoch = 10
  88. for i in range(epoch):
  89.     for inputs, loss_mask in data_loader:
  90.         inputs = {k: v.to("cuda") for k, v in inputs.items()}
  91.         loss_mask = loss_mask.to("cuda")
  92.         logits = model(**inputs).logits[:, :-1, :]
  93.         labels = inputs["input_ids"][:, 1:]
  94.         logits = logits.reshape(-1, logits.size(-1))
  95.         labels = labels.reshape(-1)
  96.         loss_mask = loss_mask.reshape(-1)
  97.         loss = torch.nn.functional.cross_entropy(logits, labels, reduction="none")
  98.         loss = loss * loss_mask
  99.         loss = torch.mean(loss)
  100.         loss.backward()
  101.         optimizer.step()
  102.         optimizer.zero_grad()
  103.     print(loss.item())
复制代码
3.NEFTune
  1. '1. Noisy Embeddings Finetuning'
  2. 通过embedding,将离散对象映射到连续向量空间中的点
  3. 对embedding做噪声处理,可以提高模型的表现
复制代码
  1. '2. NEFTune的源代码'
  2. import functools
  3. import json
  4. from peft import LoraConfig, TaskType, get_peft_model
  5. from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
  6. from torch.utils.data import DataLoader, Dataset
  7. import torch
  8. class SFTDataset(Dataset):
  9.     def __init__(self, file_path, tokenizer):
  10.         super().__init__()
  11.         self.file_path = file_path
  12.         self.examples = self._load_data(self.file_path)
  13.         self.tokenizer = tokenizer
  14.     @staticmethod
  15.     def _load_data(file_path):
  16.         items = []
  17.         with open(file_path, "r", encoding="utf8")as f:
  18.             for line in f:
  19.                 item = json.loads(line)
  20.                 items.append(item)
  21.         return items
  22.     def __getitem__(self, index):
  23.         example = self.examples[index]
  24.         dialog = [{"role": "system", "content": "You are a helpful assistant."},
  25.                   {"role": "user", "content": example["query"]},
  26.                   {"role": "assistant", "content": example["answer"]}]
  27.         chat = tokenizer.apply_chat_template(dialog, tokenize=False)
  28.         return chat
  29.     def __len__(self):
  30.         return len(self.examples)
model_path = r'D:\work\models\Meta-Llama-3.1-8B-Instruct'
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
# Right padding keeps the answer tokens contiguous at the end of each row.
tokenizer.padding_side = "right"
# Reuse EOS for padding (presumably no dedicated pad token — TODO confirm).
tokenizer.pad_token = tokenizer.eos_token
# QLoRA-style loading: 4-bit NF4 weights, double quantization, bfloat16 compute.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
model = AutoModelForCausalLM.from_pretrained(model_path, quantization_config=bnb_config)
# Rank-8 LoRA adapters on all attention and MLP projection matrices.
peft_config = LoraConfig(
    r=8,
    target_modules=["q_proj",
                    "v_proj",
                    "k_proj",
                    "o_proj",
                    "gate_proj",
                    "down_proj",
                    "up_proj"
                    ],
    task_type=TaskType.CAUSAL_LM,
    lora_alpha=16,
    lora_dropout=0.05
)
model = get_peft_model(model, peft_config)
# Print the trainable-parameter count contributed by the adapters.
model.print_trainable_parameters()
model.to("cuda")
optimizer = torch.optim.AdamW(model.parameters())
  60. def sft_collate(batch, tokenizer, end_str, max_length):
  61.     inputs = tokenizer(batch, max_length=max_length, padding=True, truncation=True)
  62.     input_ids = inputs["input_ids"]
  63.     input_len = len(input_ids[0])
  64.     end_ids = tokenizer(end_str)["input_ids"]
  65.     end_id_len = len(end_ids)
  66.     loss_mask = []
  67.     for input_id in input_ids:
  68.         for i in range(len(input_id) - end_id_len, -1, -1):
  69.             if input_id[i:i + end_id_len] == end_ids:
  70.                 mask = [1] * (input_len - 1)
  71.                 mask[:i + end_id_len - 1] = [0] * (i + end_id_len - 1)
  72.                 loss_mask.append(mask)
  73.                 break
  74.             if i == 0:  # 所有回答部分都被截断
  75.                 loss_mask.append([0] * (input_len - 1))
  76.     inputs = {k: torch.tensor(v) for k, v in inputs.items()}
  77.     loss_mask = torch.tensor(loss_mask)
  78.     return inputs, loss_mask
  79. collate_fn = functools.partial(sft_collate,
  80.                                tokenizer=tokenizer,
  81.                                end_str="<|start_header_id|>assistant<|end_header_id""|>\n\n",
  82.                                max_length=500)
  83. sft_dataset = SFTDataset("./data/sft_data.json", tokenizer)
  84. data_loader = DataLoader(sft_dataset, batch_size=2, collate_fn=collate_fn, shuffle=True)
  85. epoch = 10
  86. neftune_noise_alpha = 10
  87. for i in range(epoch):
  88.     for inputs, loss_mask in data_loader:
  89.         input_ids = inputs.pop("input_ids")
  90.         input_embeddings = model.base_model.model.model.embed_tokens(input_ids)
  91.         dims = torch.tensor(input_embeddings.size(1) * input_embeddings.size(2))
  92.         mag_norm = neftune_noise_alpha / torch.sqrt(dims)
  93.         input_embeddings = input_embeddings + torch.zeros_like(input_embeddings).uniform_(-mag_norm, mag_norm)
  94.         inputs["inputs_embeds"] = input_embeddings
  95.         inputs = {k: v.to("cuda") for k, v in inputs.items()}
  96.         loss_mask = loss_mask.to("cuda")
  97.         logits = model(**inputs).logits[:, :-1, :]
  98.         labels = input_ids[:, 1:].to("cuda")
  99.         logits = logits.reshape(-1, logits.size(-1))
  100.         labels = labels.reshape(-1)
  101.         loss_mask = loss_mask.reshape(-1)
  102.         loss = torch.nn.functional.cross_entropy(logits, labels, reduction="none")
  103.         loss = loss * loss_mask
  104.         loss = torch.mean(loss)
  105.         loss.backward()
  106.         optimizer.step()
  107.         optimizer.zero_grad()
  108.     print(loss.item())
复制代码
4.SFT_Trainer
# Supervised fine-tuning of a large model with the TRL Trainer stack.
import json
import torch
from datasets import Dataset
from peft import LoraConfig, get_peft_model, TaskType
from transformers import AutoTokenizer, BitsAndBytesConfig, AutoModelForCausalLM
from trl import SFTTrainer, SFTConfig, DataCollatorForCompletionOnlyLM
# Load JSON-lines records and map them onto TRL's prompt/completion schema.
items = []
with open("./data/sft_data.json", "r", encoding="utf8")as f:
    for line in f:
        item = json.loads(line)
        items.append({"prompt": item["query"], "completion": item["answer"]})
dataset = Dataset.from_list(items)
model_path = r'D:\work\models\Meta-Llama-3.1-8B-Instruct'
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
# Right padding keeps completions contiguous; EOS doubles as the pad token.
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token
# 4-bit NF4 quantization; compute in float16 here (the earlier scripts use bfloat16).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)
model = AutoModelForCausalLM.from_pretrained(model_path, quantization_config=bnb_config,torch_dtype=torch.float16)
# Rank-8 LoRA adapters on all attention and MLP projection matrices.
peft_config = LoraConfig(
    r=8,
    target_modules=["q_proj",
                    "v_proj",
                    "k_proj",
                    "o_proj",
                    "gate_proj",
                    "down_proj",
                    "up_proj"
                    ],
    task_type=TaskType.CAUSAL_LM,
    lora_alpha=16,
    lora_dropout=0.05
)
model = get_peft_model(model, peft_config)
# neftune_noise_alpha=10 enables NEFTune embedding noise inside the trainer.
sft_config = SFTConfig(output_dir="/tmp",
                       neftune_noise_alpha=10,
                       per_device_train_batch_size=1,
                       max_seq_length=100,
                       num_train_epochs=10,
                       logging_steps=10,
                       logging_strategy="steps")
# Adjacent string literals concatenate to "<|start_header_id|>assistant<|end_header_id|>\n\n";
# the collator masks the loss on everything before this template (completion-only loss).
response_template = "<|start_header_id|>assistant<|end_header_id""|>\n\n"
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    args=sft_config,
    data_collator=collator
)
trainer.train()
复制代码
学习视频:【大模型微调看这个视频就够了 SFT NEFTune】 https://www.bilibili.com/video/BV1gmWDeLEMZ/?share_source=copy_web&vd_source=050ab764db52d186ab224170392c4055

来源:豆瓜网用户自行投稿发布,如果侵权,请联系站长删除

相关推荐

您需要登录后才可以回帖 登录 | 立即注册