1. Chat Template
- '1. Instruction fine-tuning comes after pre-training (here done as LoRA fine-tuning)'
- # Pre-training turns the LLM into a "domain expert"
- # Instruction fine-tuning teaches the LLM how to express itself
- Every LLM uses a different format for instruction fine-tuning,
- so you must fine-tune with the chat format officially released for that model (see the sketch below)
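As a reference, the snippet below renders a dialog with the model's own chat template so the official format can be inspected; this is a minimal sketch reusing the model path from the scripts that follow.

```python
from transformers import AutoTokenizer

model_path = r'D:\work\models\Meta-Llama-3.1-8B-Instruct'
tokenizer = AutoTokenizer.from_pretrained(model_path)

dialog = [{"role": "user", "content": "天空为什么是蓝色的?"}]
# tokenize=False returns the rendered template string instead of token ids,
# which is the easiest way to see a model's official instruction format
print(tokenizer.apply_chat_template(dialog, tokenize=False))
# For Llama 3.1 the output is built from special tokens such as
# <|begin_of_text|>, <|start_header_id|>user<|end_header_id|> and <|eot_id|>
```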
- '2. Source code for Chat_Template'
```python
# Chat_Template.py
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model_path = r'D:\work\models\Meta-Llama-3.1-8B-Instruct'
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
model = AutoModelForCausalLM.from_pretrained(model_path).to("cuda")
optimizer = torch.optim.AdamW(model.parameters())

dialog = [{"role": "system", "content": "You are a helpful assistant."},
          {"role": "user", "content": "天空为什么是蓝色的?"},
          {"role": "assistant", "content": "这是由于光的散射引起的。"}]
# return_dict=True is needed to get a dict with input_ids and attention_mask;
# by default apply_chat_template returns only the input_ids tensor
inputs = tokenizer.apply_chat_template(dialog, return_tensors="pt", return_dict=True)
inputs = {k: v.to("cuda") for k, v in inputs.items()}
# Set labels identical to input_ids; the model shifts them internally,
# so loss is computed over every token of the dialog
inputs["labels"] = inputs["input_ids"].clone()
output = model(**inputs)
# When labels are passed, the model returns the cross-entropy loss itself
loss = output.loss
loss.backward()
optimizer.step()
optimizer.zero_grad()
# Save the fine-tuned model
model.save_pretrained("output_dir")
```
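To see what `output.loss` actually is, the shifted cross-entropy can be recomputed by hand; this sketch mirrors the manual loss used in the Completions-only script below (equivalent up to padding / ignore-index handling).

```python
import torch.nn.functional as F

# Position t predicts token t + 1: drop the last logit and shift labels left by one
logits = output.logits[:, :-1, :]        # (batch, seq_len - 1, vocab)
labels = inputs["labels"][:, 1:]         # (batch, seq_len - 1)
manual_loss = F.cross_entropy(logits.reshape(-1, logits.size(-1)),
                              labels.reshape(-1))
```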
2. Completions only
- '1. Compute the loss only on the answer part'
- The plain Chat Template approach computes loss over every token of the instruction as well
- Completions only fixes this by restricting the loss to the assistant's answer (toy illustration below)
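A toy illustration of the idea (token ids and the split point are invented): positions that predict template/prompt tokens get mask 0, positions that predict answer tokens get mask 1, and the per-token losses are multiplied by this mask.

```python
# Hypothetical 8-token sequence: the first 5 tokens are prompt/template,
# the last 3 are the assistant's answer
input_ids = [101, 2009, 17, 334, 9, 881, 42, 7]
answer_start = 5
# Labels are input_ids shifted left by one, so the mask has length len - 1
loss_mask = [0] * (answer_start - 1) + [1] * (len(input_ids) - answer_start)
print(loss_mask)  # [0, 0, 0, 0, 1, 1, 1] -> gradient flows only through the answer
```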
- '2. Source code for Completions only'
```python
# Completions_only.py
import functools
import json

import torch
from peft import LoraConfig, TaskType, get_peft_model
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig


class SFTDataset(Dataset):
    def __init__(self, file_path, tokenizer):
        super().__init__()
        self.file_path = file_path
        self.examples = self._load_data(self.file_path)
        self.tokenizer = tokenizer

    @staticmethod
    def _load_data(file_path):
        items = []
        with open(file_path, "r", encoding="utf8") as f:
            for line in f:
                items.append(json.loads(line))
        return items

    def __getitem__(self, index):
        example = self.examples[index]
        dialog = [{"role": "system", "content": "You are a helpful assistant."},
                  {"role": "user", "content": example["query"]},
                  {"role": "assistant", "content": example["answer"]}]
        # Render the dialog as a template string; tokenization happens in the collator
        return self.tokenizer.apply_chat_template(dialog, tokenize=False)

    def __len__(self):
        return len(self.examples)


model_path = r'D:\work\models\Meta-Llama-3.1-8B-Instruct'
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

# QLoRA: load the base model in 4-bit NF4 with double quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
# 4-bit models cannot be moved with .to("cuda") after loading,
# so place them on the GPU at load time instead
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             quantization_config=bnb_config,
                                             device_map="auto")
peft_config = LoraConfig(
    r=8,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj",
                    "gate_proj", "down_proj", "up_proj"],
    task_type=TaskType.CAUSAL_LM,
    lora_alpha=16,
    lora_dropout=0.05
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
optimizer = torch.optim.AdamW(model.parameters())


def sft_collate(batch, tokenizer, end_str, max_length):
    inputs = tokenizer(batch, max_length=max_length, padding=True, truncation=True)
    input_ids = inputs["input_ids"]
    input_len = len(input_ids[0])
    # add_special_tokens=False, otherwise a BOS token is prepended to end_str
    # and the marker could never be matched inside a sequence
    end_ids = tokenizer(end_str, add_special_tokens=False)["input_ids"]
    end_id_len = len(end_ids)
    loss_mask = []
    for input_id in input_ids:
        # Search backwards for the end of the assistant header; every position
        # after it predicts an answer token and gets mask 1 (labels are shifted by one)
        for i in range(len(input_id) - end_id_len, -1, -1):
            if input_id[i:i + end_id_len] == end_ids:
                mask = [1] * (input_len - 1)
                mask[:i + end_id_len - 1] = [0] * (i + end_id_len - 1)
                loss_mask.append(mask)
                break
        else:  # the whole answer part was truncated away
            loss_mask.append([0] * (input_len - 1))
    inputs = {k: torch.tensor(v) for k, v in inputs.items()}
    loss_mask = torch.tensor(loss_mask)
    return inputs, loss_mask


collate_fn = functools.partial(sft_collate,
                               tokenizer=tokenizer,
                               end_str="<|start_header_id|>assistant<|end_header_id|>\n\n",
                               max_length=50)
sft_dataset = SFTDataset("./data/sft_data.json", tokenizer)
data_loader = DataLoader(sft_dataset, batch_size=2, collate_fn=collate_fn, shuffle=True)
epoch = 10
for i in range(epoch):
    for inputs, loss_mask in data_loader:
        inputs = {k: v.to("cuda") for k, v in inputs.items()}
        loss_mask = loss_mask.to("cuda")
        # Shift: position t predicts token t + 1
        logits = model(**inputs).logits[:, :-1, :]
        labels = inputs["input_ids"][:, 1:]
        logits = logits.reshape(-1, logits.size(-1))
        labels = labels.reshape(-1)
        loss_mask = loss_mask.reshape(-1)
        loss = torch.nn.functional.cross_entropy(logits, labels, reduction="none")
        # Average only over the unmasked (answer) positions
        loss = (loss * loss_mask).sum() / loss_mask.sum().clamp(min=1)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        print(loss.item())
```
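The training file is expected as JSON Lines with `query` and `answer` fields, one example per line; the contents below are invented for illustration.

```json
{"query": "天空为什么是蓝色的?", "answer": "这是由于光的散射引起的。"}
{"query": "什么是LoRA?", "answer": "LoRA是一种参数高效微调方法。"}
```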
3. NEFTune
- '1. Noisy Embeddings Finetuning'
- The embedding layer maps discrete tokens to points in a continuous vector space
- Adding noise to those embeddings during fine-tuning can improve model performance (noise sketch below)
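NEFTune scales the noise by sequence length L and embedding dimension d: every component is drawn from Uniform(-α/√(Ld), +α/√(Ld)). A minimal sketch of just the noise step (the helper name is mine):

```python
import torch

def add_neftune_noise(embeds: torch.Tensor, alpha: float = 10.0) -> torch.Tensor:
    """embeds: (batch, seq_len, hidden) token embeddings."""
    seq_len, hidden = embeds.size(1), embeds.size(2)
    # Noise magnitude alpha / sqrt(L * d), as in the NEFTune paper
    mag = alpha / (seq_len * hidden) ** 0.5
    return embeds + torch.zeros_like(embeds).uniform_(-mag, mag)
```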
- '2. Source code for NEFTune'
```python
# NEFTune.py
import functools
import json

import torch
from peft import LoraConfig, TaskType, get_peft_model
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig


class SFTDataset(Dataset):
    def __init__(self, file_path, tokenizer):
        super().__init__()
        self.file_path = file_path
        self.examples = self._load_data(self.file_path)
        self.tokenizer = tokenizer

    @staticmethod
    def _load_data(file_path):
        items = []
        with open(file_path, "r", encoding="utf8") as f:
            for line in f:
                items.append(json.loads(line))
        return items

    def __getitem__(self, index):
        example = self.examples[index]
        dialog = [{"role": "system", "content": "You are a helpful assistant."},
                  {"role": "user", "content": example["query"]},
                  {"role": "assistant", "content": example["answer"]}]
        return self.tokenizer.apply_chat_template(dialog, tokenize=False)

    def __len__(self):
        return len(self.examples)


model_path = r'D:\work\models\Meta-Llama-3.1-8B-Instruct'
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
# As above, 4-bit models must be placed on the GPU at load time
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             quantization_config=bnb_config,
                                             device_map="auto")
peft_config = LoraConfig(
    r=8,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj",
                    "gate_proj", "down_proj", "up_proj"],
    task_type=TaskType.CAUSAL_LM,
    lora_alpha=16,
    lora_dropout=0.05
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
optimizer = torch.optim.AdamW(model.parameters())


def sft_collate(batch, tokenizer, end_str, max_length):
    # Same completions-only masking as in Completions_only.py
    inputs = tokenizer(batch, max_length=max_length, padding=True, truncation=True)
    input_ids = inputs["input_ids"]
    input_len = len(input_ids[0])
    end_ids = tokenizer(end_str, add_special_tokens=False)["input_ids"]
    end_id_len = len(end_ids)
    loss_mask = []
    for input_id in input_ids:
        for i in range(len(input_id) - end_id_len, -1, -1):
            if input_id[i:i + end_id_len] == end_ids:
                mask = [1] * (input_len - 1)
                mask[:i + end_id_len - 1] = [0] * (i + end_id_len - 1)
                loss_mask.append(mask)
                break
        else:  # the whole answer part was truncated away
            loss_mask.append([0] * (input_len - 1))
    inputs = {k: torch.tensor(v) for k, v in inputs.items()}
    loss_mask = torch.tensor(loss_mask)
    return inputs, loss_mask


collate_fn = functools.partial(sft_collate,
                               tokenizer=tokenizer,
                               end_str="<|start_header_id|>assistant<|end_header_id|>\n\n",
                               max_length=500)
sft_dataset = SFTDataset("./data/sft_data.json", tokenizer)
data_loader = DataLoader(sft_dataset, batch_size=2, collate_fn=collate_fn, shuffle=True)
epoch = 10
neftune_noise_alpha = 10
for i in range(epoch):
    for inputs, loss_mask in data_loader:
        loss_mask = loss_mask.to("cuda")
        # Embed the tokens manually so noise can be added before the forward pass;
        # the ids must sit on the same device as the embedding weights
        input_ids = inputs.pop("input_ids").to("cuda")
        input_embeddings = model.get_input_embeddings()(input_ids)
        # NEFTune: uniform noise with magnitude alpha / sqrt(seq_len * hidden_dim)
        dims = torch.tensor(input_embeddings.size(1) * input_embeddings.size(2))
        mag_norm = neftune_noise_alpha / torch.sqrt(dims)
        input_embeddings = input_embeddings + torch.zeros_like(input_embeddings).uniform_(-mag_norm, mag_norm)
        inputs["inputs_embeds"] = input_embeddings
        inputs = {k: v.to("cuda") for k, v in inputs.items()}
        logits = model(**inputs).logits[:, :-1, :]
        labels = input_ids[:, 1:]
        logits = logits.reshape(-1, logits.size(-1))
        labels = labels.reshape(-1)
        loss_mask = loss_mask.reshape(-1)
        loss = torch.nn.functional.cross_entropy(logits, labels, reduction="none")
        loss = (loss * loss_mask).sum() / loss_mask.sum().clamp(min=1)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        print(loss.item())
```
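For reference, trl implements the same trick as a forward hook on the embedding module, so the noise is only applied while the model is in training mode; roughly (paraphrasing trl's `neftune_post_forward_hook`):

```python
import torch

def neftune_post_forward_hook(module, input, output):
    # `output` is the (batch, seq_len, hidden) embedding tensor
    if module.training:
        dims = torch.tensor(output.size(1) * output.size(2))
        mag_norm = module.neftune_noise_alpha / torch.sqrt(dims)
        output = output + torch.zeros_like(output).uniform_(-mag_norm, mag_norm)
    return output
```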
4. SFT_Trainer
```python
# SFT_Trainer.py: supervised fine-tuning with the trl SFTTrainer
import json

import torch
from datasets import Dataset
from peft import LoraConfig, TaskType, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from trl import DataCollatorForCompletionOnlyLM, SFTConfig, SFTTrainer

# Convert the JSONL data to the prompt/completion format expected by SFTTrainer
items = []
with open("./data/sft_data.json", "r", encoding="utf8") as f:
    for line in f:
        item = json.loads(line)
        items.append({"prompt": item["query"], "completion": item["answer"]})
dataset = Dataset.from_list(items)

model_path = r'D:\work\models\Meta-Llama-3.1-8B-Instruct'
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             quantization_config=bnb_config,
                                             torch_dtype=torch.float16)
peft_config = LoraConfig(
    r=8,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj",
                    "gate_proj", "down_proj", "up_proj"],
    task_type=TaskType.CAUSAL_LM,
    lora_alpha=16,
    lora_dropout=0.05
)
model = get_peft_model(model, peft_config)

# NEFTune and completion-only loss are both built in: neftune_noise_alpha turns on
# embedding noise, and DataCollatorForCompletionOnlyLM masks the prompt tokens
sft_config = SFTConfig(output_dir="/tmp",
                       neftune_noise_alpha=10,
                       per_device_train_batch_size=1,
                       max_seq_length=100,
                       num_train_epochs=10,
                       logging_steps=10,
                       logging_strategy="steps")
response_template = "<|start_header_id|>assistant<|end_header_id|>\n\n"
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    args=sft_config,
    data_collator=collator
)
trainer.train()
```
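After training, `trainer.save_model` stores just the LoRA adapter rather than the full 8B weights. Continuing from the script above, a minimal inference-side sketch (the adapter path is a placeholder of mine):

```python
from peft import PeftModel
from transformers import AutoModelForCausalLM

trainer.save_model("./sft_adapter")  # writes adapter weights + adapter config

# Reload the base model and attach the trained adapter for inference
base = AutoModelForCausalLM.from_pretrained(model_path,
                                            torch_dtype=torch.float16,
                                            device_map="auto")
model = PeftModel.from_pretrained(base, "./sft_adapter")
model.eval()
```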
Tutorial video: 【大模型微调看这个视频就够了 SFT NEFTune】 https://www.bilibili.com/video/BV1gmWDeLEMZ/?share_source=copy_web&vd_source=050ab764db52d186ab224170392c4055