一、任务目标
本文使用的是来自Kaggle的一个情感识别数据集,这个数据集的总数据量是5934条,标签为anger、fear、joy三种情感的其中一种,很明显是一个多分类任务。这里,我们将使用微调技巧进行深度学习建模,同时我们会比较不微调和微调之间的模型表现差距,以便于我们理解微调的优势。
二、数据集处理
首先,我们加载数据集看看:
import pandas as pd

# Load the emotion dataset; the 'Comment' and 'Emotion' columns are used below.
df = pd.read_csv('./data/Emotion_classify_Data.csv')
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
df.info()
print(df.head())
可以看到,数据集没有出现空值,但标签是文本类型,为了便于后续建模,我们将它转换为索引:
# Show the distinct label strings that occur in the 'Emotion' column.
distinct_labels = list(set(df['Emotion']))
print(distinct_labels)

# Give every label printed above an integer index.
label_idx = {'anger': 0, 'joy': 1, 'fear': 2}
# Replace each label string by its index; an unknown label raises KeyError.
df['Emotion'] = df['Emotion'].apply(label_idx.__getitem__)
print(df.head())
为了便于后面对比不微调和微调的模型表现差别,我们提前划分训练集和测试集:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing. Stratifying on the label passes the
# class labels to the splitter, and the fixed seed makes the split repeatable.
comments = df['Comment'].tolist()
labels = df['Emotion'].tolist()
X_train, X_test, y_train, y_test = train_test_split(
    comments,
    labels,
    stratify=labels,
    test_size=0.3,
    random_state=2024,
)
三、模型框架构建
1、定义数据集类
首先我们自定义一个Dataset类,便于进行数据的转换:
from torch.utils.data import Dataset


class SentimentDataset(Dataset):
    """Dataset that tokenizes one comment per item for a BERT classifier.

    Args:
        text: Sequence of raw comments.
        target: Sequence of class indices aligned with ``text``.
        tokenizer: Optional tokenizer callable. When omitted, a
            ``BertTokenizer`` is loaded from ``config['model_path']``.
        max_len: Optional fixed sequence length. When omitted,
            ``config['max_len']`` is used.
    """

    def __init__(self, text, target, tokenizer=None, max_len=None):
        self.text = text
        self.target = target
        if tokenizer is None:
            tokenizer = transformers.BertTokenizer.from_pretrained(
                config['model_path'], do_lower_case=True
            )
        self.tokenizer = tokenizer
        self.max_len = config['max_len'] if max_len is None else max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.text)

    def __getitem__(self, index):
        """Return the tensors for sample ``index``.

        Returns:
            dict with keys ``ids``, ``masks``, ``token_type_ids`` (long
            tensors) and ``target`` (float tensor holding the class index).
        """
        # Collapse any run of whitespace into a single space.
        text = " ".join(str(self.text[index]).split())
        target = self.target[index]
        # Calling the tokenizer with padding='max_length' replaces the
        # deprecated encode_plus(..., pad_to_max_length=True) form.
        encoded = self.tokenizer(
            text,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
        )
        return {
            "ids": torch.tensor(encoded['input_ids'], dtype=torch.long),
            "masks": torch.tensor(encoded['attention_mask'], dtype=torch.long),
            "token_type_ids": torch.tensor(encoded['token_type_ids'], dtype=torch.long),
            # Kept as float for backward compatibility; train_fn/eval_fn cast
            # the target to long before computing the loss.
            "target": torch.tensor(target, dtype=torch.float),
        }
2、定义模型类
现在,我们定义一个模型类,这里上游模型是Bert,下游模型为全连接层,即MLP。
class Bertmodel(nn.Module):
    """BERT encoder followed by a three-layer MLP head producing 3 logits.

    The encoder is loaded from ``config['model_path']``. The head's layers
    are named ``fc1``/``fc2``/``fc3`` so that the ``'fc'`` substring used by
    the freezing logic keeps matching them.
    """

    def __init__(self):
        super().__init__()
        self.bert = transformers.BertModel.from_pretrained(config['model_path'])
        # Read the encoder width from its config instead of hard-coding 768.
        hidden_size = self.bert.config.hidden_size
        self.dropout = nn.Dropout(0.3)
        # Without a non-linearity the stacked Linear layers collapse into a
        # single linear map; ReLU makes the head an actual MLP. It adds no
        # parameters, so named_parameters() is unchanged.
        self.activation = nn.ReLU()
        self.fc1 = nn.Linear(hidden_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 3)

    def forward(self, ids, mask, token_type):
        """Return the class logits for a batch.

        Args:
            ids: Token ids passed to the encoder.
            mask: Attention mask passed to the encoder.
            token_type: Token type ids passed to the encoder.
        """
        # With return_dict=False the encoder returns a tuple; only its second
        # element is used as the sentence representation.
        _, pooled = self.bert(
            ids,
            attention_mask=mask,
            token_type_ids=token_type,
            return_dict=False,
        )
        x = self.dropout(pooled)
        x = self.activation(self.fc1(x))
        x = self.dropout(x)
        x = self.activation(self.fc2(x))
        return self.fc3(x)
3、定义训练和测试函数
def train_fn(data_loader, model, optimizer, device, scheduler):
    """Train ``model`` for one pass over ``data_loader``.

    Uses the module-level ``loss_fn`` to compute the loss.

    Args:
        data_loader: Iterable of batches with keys ``ids``, ``masks``,
            ``token_type_ids`` and ``target``.
        model: Model called as ``model(ids, masks, token_type)``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Accepted for interface compatibility; it is not stepped
            (the call was already disabled in the original code).

    Returns:
        float: Mean loss over all batches, or NaN when the loader is empty.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0
    for batch in data_loader:
        ids = batch['ids'].to(device, dtype=torch.long)
        masks = batch['masks'].to(device, dtype=torch.long)
        token_type = batch['token_type_ids'].to(device, dtype=torch.long)
        # The loss expects class indices, so the target is cast to long.
        target = batch['target'].to(device, dtype=torch.long)

        optimizer.zero_grad()
        preds = model(ids, masks, token_type)
        loss = loss_fn(preds, target)
        loss.backward()
        optimizer.step()
        # scheduler.step()

        total_loss += loss.item()
        num_batches += 1
    # Report the epoch average rather than only the last batch's loss.
    return total_loss / num_batches if num_batches else float('nan')


def eval_fn(data_loader, model, device):
    """Predict a class for every sample in ``data_loader``.

    Args:
        data_loader: Iterable of batches with keys ``ids``, ``masks``,
            ``token_type_ids`` and ``target``.
        model: Model called as ``model(ids, masks, token_type)``.
        device: Device the batch tensors are moved to.

    Returns:
        tuple[list, list]: ``(predicted class indices, target class indices)``.
    """
    fin_targets = []
    fin_outputs = []
    model.eval()
    with torch.no_grad():
        for batch in data_loader:
            ids = batch['ids'].to(device, dtype=torch.long)
            masks = batch['masks'].to(device, dtype=torch.long)
            token_type = batch['token_type_ids'].to(device, dtype=torch.long)
            target = batch['target'].to(device, dtype=torch.long)

            preds = model(ids, masks, token_type)
            fin_targets.extend(target.cpu().tolist())
            # The predicted class is the index of the largest logit.
            fin_outputs.extend(torch.argmax(preds, dim=1).cpu().tolist())
    return fin_outputs, fin_targets
4、定义损失函数、优化器和训练流程
criterion = nn.CrossEntropyLoss()


def loss_fn(output, target):
    """Return the cross-entropy loss of logits ``output`` against ``target``."""
    return criterion(output, target)


def train(X_train, y_train):
    """Train the module-level ``model`` and print the loss of every epoch.

    Relies on the module-level ``model``, ``device`` and ``config`` as well
    as on ``SentimentDataset`` and ``train_fn``.

    Args:
        X_train: Training comments.
        y_train: Class indices aligned with ``X_train``.
    """
    train_dataset = SentimentDataset(X_train, y_train)
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config['train_batch'],
        num_workers=1,
    )
    # Only parameters left unfrozen by the caller are optimized.
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    # torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
    # weight_decay are set explicitly because the torch defaults differ;
    # NOTE(review): 1e-6 / 0.0 are meant to mirror the old transformers
    # defaults - confirm against the installed version.
    optimizer = torch.optim.AdamW(
        trainable_params, lr=5e-5, eps=1e-6, weight_decay=0.0
    )
    # Created and handed to train_fn, which currently does not step it.
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.1)
    for epoch in range(config['epochs']):
        loss = train_fn(train_loader, model, optimizer, device, scheduler)
        print(f"Epoch_{epoch}, Train_Loss-->>{loss}")
5、训练模型
在这里,我们初始化模型并确定哪些模型参数需要参与训练。对于不微调预训练模型Bert的版本,我们仅解冻fc层(即MLP)的参数;而微调版本则在训练fc层的同时,额外解冻Bert最后一层(layer.11)的output子层参数:
# Use the GPU when one is available and fall back to the CPU otherwise, so
# the script does not fail on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Bertmodel()
model.to(device)

# Fine-tuning variant: also unfreeze the output sub-layer of encoder layer 11.
# unfreeze_layers = ['layer.11.output', 'fc']
# Frozen-encoder variant: only the MLP head is trained.
unfreeze_layers = ['fc']

# A parameter is trainable only if its name contains one of the substrings.
for name, param in model.named_parameters():
    param.requires_grad = any(ele in name for ele in unfreeze_layers)

# List the parameters that will actually be updated.
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.size())

train(X_train, y_train)
6、测试模型表现
评估模型表现:
from sklearn.metrics import precision_score, recall_score, f1_score

test_dataset = SentimentDataset(X_test, y_test)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=16)
total_preds = []

# torch.no_grad() only disables gradient tracking; eval() is what switches
# dropout off, so that test predictions are deterministic.
model.eval()
with torch.no_grad():
    for data in test_loader:
        ids = data["ids"].to(device, dtype=torch.long)
        mask = data["masks"].to(device, dtype=torch.long)
        token_type = data['token_type_ids'].to(device, dtype=torch.long)
        output = model(ids, mask, token_type)
        # The predicted class is the index of the largest logit.
        total_preds.extend(torch.argmax(output, dim=1).cpu().tolist())

# Macro averaging gives every class the same weight.
print('Precision', precision_score(y_test, total_preds, average='macro'))
print('Recall', recall_score(y_test, total_preds, average='macro'))
print('F1', f1_score(y_test, total_preds, average='macro'))
下面这是不微调Bert模型,仅训练MLP的效果:
下面是微调Bert,并训练MLP的效果,可以看到模型的效果大幅提升,即便我们都是只训练了10个epoch。
四、完整代码
import logging
import warnings

# Configured before the third-party imports, as in the original listing.
logging.basicConfig(level='ERROR')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import torch
import torch.nn as nn
import transformers
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from transformers import get_linear_schedule_with_warmup

try:
    from transformers import AdamW
except ImportError:
    # transformers deprecated its own AdamW in favour of the PyTorch one;
    # fall back to it when the name can no longer be imported.
    from torch.optim import AdamW
warnings.filterwarnings("ignore")

# Hyper-parameters and model identifier shared by the code below.
config = {
    "max_len": 512,
    "train_batch": 16,
    "valid_batch": 16,
    "epochs": 10,
    "model_path": "bert-base-uncased",
}


class SentimentDataset(Dataset):
    """Dataset that tokenizes one comment per item for a BERT classifier.

    Args:
        text: Sequence of raw comments.
        target: Sequence of class indices aligned with ``text``.
        tokenizer: Optional tokenizer callable. When omitted, a
            ``BertTokenizer`` is loaded from ``config['model_path']``.
        max_len: Optional fixed sequence length. When omitted,
            ``config['max_len']`` is used.
    """

    def __init__(self, text, target, tokenizer=None, max_len=None):
        self.text = text
        self.target = target
        if tokenizer is None:
            tokenizer = transformers.BertTokenizer.from_pretrained(
                config['model_path'], do_lower_case=True
            )
        self.tokenizer = tokenizer
        self.max_len = config['max_len'] if max_len is None else max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.text)

    def __getitem__(self, index):
        """Return the tensors for sample ``index``.

        Returns:
            dict with keys ``ids``, ``masks``, ``token_type_ids`` (long
            tensors) and ``target`` (float tensor holding the class index).
        """
        # Collapse any run of whitespace into a single space.
        text = " ".join(str(self.text[index]).split())
        target = self.target[index]
        # Calling the tokenizer with padding='max_length' replaces the
        # deprecated encode_plus(..., pad_to_max_length=True) form.
        encoded = self.tokenizer(
            text,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
        )
        return {
            "ids": torch.tensor(encoded['input_ids'], dtype=torch.long),
            "masks": torch.tensor(encoded['attention_mask'], dtype=torch.long),
            "token_type_ids": torch.tensor(encoded['token_type_ids'], dtype=torch.long),
            # Kept as float for backward compatibility; train_fn/eval_fn cast
            # the target to long before it is used.
            "target": torch.tensor(target, dtype=torch.float),
        }


class Bertmodel(nn.Module):
    """BERT encoder followed by a three-layer MLP head producing 3 logits.

    The head's layers are named ``fc1``/``fc2``/``fc3`` so that a ``'fc'``
    substring match on parameter names selects exactly the head.
    """

    def __init__(self):
        super().__init__()
        self.bert = transformers.BertModel.from_pretrained(config['model_path'])
        # Read the encoder width from its config instead of hard-coding 768.
        hidden_size = self.bert.config.hidden_size
        self.dropout = nn.Dropout(0.3)
        # Without a non-linearity the stacked Linear layers collapse into a
        # single linear map; ReLU makes the head an actual MLP. It adds no
        # parameters, so named_parameters() is unchanged.
        self.activation = nn.ReLU()
        self.fc1 = nn.Linear(hidden_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 3)

    def forward(self, ids, mask, token_type):
        """Return the class logits for a batch of encoder inputs."""
        # With return_dict=False the encoder returns a tuple; only its second
        # element is used as the sentence representation.
        _, pooled = self.bert(
            ids,
            attention_mask=mask,
            token_type_ids=token_type,
            return_dict=False,
        )
        x = self.dropout(pooled)
        x = self.activation(self.fc1(x))
        x = self.dropout(x)
        x = self.activation(self.fc2(x))
        return self.fc3(x)


def train_fn(data_loader, model, optimizer, device, scheduler):
    """Train ``model`` for one pass over ``data_loader``.

    Uses the module-level ``loss_fn`` to compute the loss.

    Args:
        data_loader: Iterable of batches with keys ``ids``, ``masks``,
            ``token_type_ids`` and ``target``.
        model: Model called as ``model(ids, masks, token_type)``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Accepted for interface compatibility; it is not stepped
            (the call was already disabled in the original code).

    Returns:
        float: Mean loss over all batches, or NaN when the loader is empty.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0
    for batch in data_loader:
        ids = batch['ids'].to(device, dtype=torch.long)
        masks = batch['masks'].to(device, dtype=torch.long)
        token_type = batch['token_type_ids'].to(device, dtype=torch.long)
        # The loss expects class indices, so the target is cast to long.
        target = batch['target'].to(device, dtype=torch.long)

        optimizer.zero_grad()
        preds = model(ids, masks, token_type)
        loss = loss_fn(preds, target)
        loss.backward()
        optimizer.step()
        # scheduler.step()

        total_loss += loss.item()
        num_batches += 1
    # Report the epoch average rather than only the last batch's loss.
    return total_loss / num_batches if num_batches else float('nan')


def eval_fn(data_loader, model, device):
    """Predict a class for every sample in ``data_loader``.

    Returns:
        tuple[list, list]: ``(predicted class indices, target class indices)``.
    """
    fin_targets = []
    fin_outputs = []
    model.eval()
    with torch.no_grad():
        for batch in data_loader:
            ids = batch['ids'].to(device, dtype=torch.long)
            masks = batch['masks'].to(device, dtype=torch.long)
            token_type = batch['token_type_ids'].to(device, dtype=torch.long)
            target = batch['target'].to(device, dtype=torch.long)

            preds = model(ids, masks, token_type)
            fin_targets.extend(target.cpu().tolist())
            # The predicted class is the index of the largest logit.
            fin_outputs.extend(torch.argmax(preds, dim=1).cpu().tolist())
    return fin_outputs, fin_targets


criterion = nn.CrossEntropyLoss()
def loss_fn(output, target):
    """Return the loss of logits ``output`` against ``target`` using the module-level ``criterion``."""
    return criterion(output, target)


def train(X_train, y_train):
    """Train the module-level ``model`` and print the loss of every epoch.

    Relies on the module-level ``model``, ``device`` and ``config`` as well
    as on ``SentimentDataset`` and ``train_fn``.

    Args:
        X_train: Training comments.
        y_train: Class indices aligned with ``X_train``.
    """
    train_dataset = SentimentDataset(X_train, y_train)
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config['train_batch'],
        num_workers=1,
    )
    # Only parameters left unfrozen by the caller are optimized.
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    # torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
    # weight_decay are set explicitly because the torch defaults differ;
    # NOTE(review): 1e-6 / 0.0 are meant to mirror the old transformers
    # defaults - confirm against the installed version.
    optimizer = torch.optim.AdamW(
        trainable_params, lr=5e-5, eps=1e-6, weight_decay=0.0
    )
    # Created and handed to train_fn, which currently does not step it.
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.1)
    for epoch in range(config['epochs']):
        loss = train_fn(train_loader, model, optimizer, device, scheduler)
        print(f"Epoch_{epoch}, Train_Loss-->>{loss}")


# Use the GPU when one is available and fall back to the CPU otherwise.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Load the data, index the labels and build the stratified 70/30 split.
# These steps were missing from the complete listing, which used
# X_train/y_train/X_test/y_test without ever defining them.
df = pd.read_csv('./data/Emotion_classify_Data.csv')
label_idx = {'anger': 0, 'joy': 1, 'fear': 2}
df['Emotion'] = df['Emotion'].apply(lambda x: label_idx[x])
labels = df['Emotion'].tolist()
X_train, X_test, y_train, y_test = train_test_split(
    df['Comment'].tolist(),
    labels,
    stratify=labels,
    test_size=0.3,
    random_state=2024,
)

model = Bertmodel()
model.to(device)

# Fine-tuning variant: also unfreeze the output sub-layer of encoder layer 11.
# unfreeze_layers = ['layer.11.output', 'fc']
# Frozen-encoder variant: only the MLP head is trained.
unfreeze_layers = ['fc']

# A parameter is trainable only if its name contains one of the substrings.
for name, param in model.named_parameters():
    param.requires_grad = any(ele in name for ele in unfreeze_layers)

# List the parameters that will actually be updated.
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.size())

train(X_train, y_train)

test_dataset = SentimentDataset(X_test, y_test)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=16)
total_preds = []

# torch.no_grad() only disables gradient tracking; eval() is what switches
# dropout off, so that test predictions are deterministic.
model.eval()
with torch.no_grad():
    for data in test_loader:
        ids = data["ids"].to(device, dtype=torch.long)
        mask = data["masks"].to(device, dtype=torch.long)
        token_type = data['token_type_ids'].to(device, dtype=torch.long)
        output = model(ids, mask, token_type)
        # The predicted class is the index of the largest logit.
        total_preds.extend(torch.argmax(output, dim=1).cpu().tolist())

# Macro averaging gives every class the same weight.
print('Precision', precision_score(y_test, total_preds, average='macro'))
print('Recall', recall_score(y_test, total_preds, average='macro'))
print('F1', f1_score(y_test, total_preds, average='macro'))
五、总结
可以看到,如果仅仅是应用预训练模型的embeddings,任务建模的效果有限,这是由于预训练的过程中学习到的大都是通用知识。经过微调之后,预训练模型更有效地理解了任务的需求并能够生成更为有效的文本表示,从而大大提升建模效果,这就是微调的魅力。但是微调并不是在任何情况下都能够取得更好效果的,这受到解冻层数、参数量的选取,以及学习率等超参数的设置等多方因素的影响,需要我们根据经验调整到最优状态,否则微调的效果可能还不如不微调。本文并未对建模过程中的代码设计展开详细的介绍,这将在下一篇博文中重点讲解,敬请期待!