I. Definition
- Definition
- Example 1
II. Implementation
- Definition
QLoRA: quantization + LoRA.
URL: https://huggingface.co/docs/peft/main/en/developer_guides/quantization
- Example 1
1. 4-bit quantization + LoRA
import torch
from transformers import BitsAndBytesConfig

config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", quantization_config=config)
from peft import prepare_model_for_kbit_training
model = prepare_model_for_kbit_training(model)

from peft import LoraConfig
config = LoraConfig(
    r=16,
    lora_alpha=8,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
from peft import get_peft_model
model = get_peft_model(model, config)
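After get_peft_model, only the LoRA adapter weights are trainable; the 4-bit base weights stay frozen. The following is a minimal training sketch that is not part of the linked guide: the dataset (train_ds), output directory, and hyperparameters are illustrative assumptions.

from transformers import AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
tokenizer.pad_token = tokenizer.eos_token  # Mistral has no pad token by default

model.print_trainable_parameters()  # sanity check: only a small fraction of params should be trainable

trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir="qlora-out",            # hypothetical output directory
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        learning_rate=2e-4,
        num_train_epochs=1,
        bf16=True,
    ),
    train_dataset=train_ds,                # hypothetical pre-tokenized causal-LM dataset
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
trainer.train()
model.save_pretrained("qlora-out/adapter")  # stores only the LoRA adapter weights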
- LoftQ algorithm
LoftQ is an algorithm for initializing the LoRA weights so that the quantization error is minimized.
URL: see the supported models at https://github.com/huggingface/peft/tree/main/examples/loftq_finetuning
First quantize the model and save it; then load it for training.
def quantize_and_save():
    args = arg_parse()

    # Download weights and configure LoRA
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, token=args.token, trust_remote_code=True)
    if any(name in args.model_name_or_path.lower() for name in ["llama", "mistral", "falcon"]):
        model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, token=args.token, trust_remote_code=True)
        task_type = TaskType.CAUSAL_LM
        target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "up_proj", "down_proj", "gate_proj"]
    elif any(name in args.model_name_or_path.lower() for name in ["bart", "t5"]):
        model = AutoModelForSeq2SeqLM.from_pretrained(args.model_name_or_path, token=args.token)
        task_type = TaskType.SEQ_2_SEQ_LM
        target_modules = ["q_proj", "k_proj", "v_proj", "fc1", "fc2", "out_proj"]
    elif any(name in args.model_name_or_path.lower() for name in ["deberta", "roberta", "bert"]):
        model = AutoModelForSequenceClassification.from_pretrained(args.model_name_or_path, token=args.token)
        task_type = TaskType.SEQ_CLS
        target_modules = ["query_proj", "key_proj", "value_proj", "dense"]  # embeddings not supported by peft
    else:
        raise NotImplementedError("Other models not supported yet.")

    # Config of LoftQ
    loftq_config = LoftQConfig(loftq_bits=args.bits, loftq_iter=args.iter)

    lora_config = LoraConfig(
        task_type=task_type,
        inference_mode=True,
        r=args.rank,
        lora_alpha=16 if task_type is TaskType.CAUSAL_LM else args.rank,
        lora_dropout=0.1,
        target_modules=target_modules,
        init_lora_weights="loftq",
        loftq_config=loftq_config,
    )

    # Obtain LoftQ model
    lora_model = get_peft_model(model, lora_config)
    base_model = lora_model.get_base_model()

    # Save LoftQ model
    model_name = args.model_name_or_path.split("/")[-1] + f"-{args.bits}bit" + f"-{args.rank}rank"
    base_model_dir = os.path.join(args.save_dir, model_name)
    lora_model_dir = os.path.join(args.save_dir, model_name, "loftq_init")

    # save lora adapters first
    lora_model.base_model.peft_config["default"].base_model_name_or_path = base_model_dir  # This can be a local path or Hub model id
    lora_model.base_model.peft_config["default"].init_lora_weights = True  # Don't apply LoftQ when loading again
    lora_model.save_pretrained(lora_model_dir)
    print_model(lora_model, "lora_model")

    # remove lora adapters and save the backbone
    unwrap_model(base_model)
    base_model.save_pretrained(base_model_dir)
    tokenizer.save_pretrained(base_model_dir)
    print_model(base_model, "base_model")

    return base_model_dir, lora_model_dir

SAVE_DIR="model_zoo/loftq/"
python quantize_save_load.py \
--model_name_or_path meta-llama/Llama-2-7b-hf \ # high-precision model id in HF
--token HF_TOKEN \ # your HF token if the model is private, e.g., llama-2
--bits 4 \
--iter 5 \
--rank 16 \
--save_dir $SAVE_DIR
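With these arguments, the directory names constructed in quantize_and_save above put the 4-bit backbone in model_zoo/loftq/Llama-2-7b-hf-4bit-16rank and the LoftQ-initialized adapters in its loftq_init subfolder, which is the layout the loading snippet below expects.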
Loading:
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

MODEL_DIR = "model_zoo/loftq/Llama-2-7b-hf-4bit-16rank"

base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_DIR,
    torch_dtype=torch.bfloat16,
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=False,
        bnb_4bit_quant_type="nf4",
    ),
)
peft_model = PeftModel.from_pretrained(
    base_model,
    MODEL_DIR,
    subfolder="loftq_init",
    is_trainable=True,
)
# Do training with peft_model ...
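As a rough illustration of the "Do training" step above, a hand-rolled loop could look like the following; the DataLoader (train_dataloader) and optimizer settings are hypothetical, not from the LoftQ example.

from torch.optim import AdamW

optimizer = AdamW(
    (p for p in peft_model.parameters() if p.requires_grad),  # only the LoRA params are trainable
    lr=1e-4,
)
peft_model.train()
for batch in train_dataloader:  # hypothetical DataLoader yielding input_ids / attention_mask / labels
    batch = {k: v.to(base_model.device) for k, v in batch.items()}
    loss = peft_model(**batch).loss
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()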
- AQLM quantization + LoRA
from transformers import AutoTokenizer, AutoModelForCausalLM

quantized_model = AutoModelForCausalLM.from_pretrained(
    "ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf",
    torch_dtype="auto",
    device_map="auto",
    low_cpu_mem_usage=True,
)
from peft import LoraConfig, TaskType
from peft import get_peft_model

peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["layers.28.self_attn.v_proj"],
)
quantized_model = get_peft_model(quantized_model, peft_config)
quantized_model.print_trainable_parameters()
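Note that the AQLM base weights stay frozen at 2 bits and only the LoRA adapter on top is trained; target_modules here is restricted to a single v_proj layer, presumably as a minimal demonstration. Loading an AQLM checkpoint also requires the separate aqlm package to be installed.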
- EETQ quantization: 8-bit quantization
import torch
from transformers import EetqConfig
config = EetqConfig("int8")
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", quantization_config=config)
from peft import LoraConfig, get_peft_model
config = LoraConfig(
    r=16,
    lora_alpha=8,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)
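EETQ is a weight-only int8 quantization backend, hence the "int8" setting above; apart from swapping the quantization config, the LoRA setup is the same as in the 4-bit bitsandbytes example.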
- HQQ quantization + LoRA (HQQ = Half-Quadratic Quantization)
from transformers import HqqConfig, AutoModelForCausalLM
quant_config = HqqConfig(nbits=4, group_size=64)
# save_dir_or_hfhub is a placeholder for a local path or Hub model id
quantized_model = AutoModelForCausalLM.from_pretrained(save_dir_or_hfhub, device_map="cuda", quantization_config=quant_config)

from peft import LoraConfig, get_peft_model
peft_config = LoraConfig(...)
quantized_model = get_peft_model(quantized_model, peft_config)
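Regardless of the quantization backend, the trained LoRA weights can be stored and re-attached separately from the quantized base model. A minimal sketch, reusing quant_config and save_dir_or_hfhub from above; the adapter directory name is hypothetical:

# Save only the LoRA adapter weights (small), not the quantized base model
quantized_model.save_pretrained("hqq-lora-adapter")   # hypothetical path

# Later: rebuild the quantized base model exactly as above, then re-attach the adapter
from peft import PeftModel
base = AutoModelForCausalLM.from_pretrained(save_dir_or_hfhub, device_map="cuda", quantization_config=quant_config)
model = PeftModel.from_pretrained(base, "hqq-lora-adapter")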