Transformer Code


1. Embedding

import torch
from torch import nn
import torch.nn.functional as F
import math
from torch import Tensor

# Convert input vocabulary indices into embedding vectors of dimension d_model
class TokenEmbedding(nn.Embedding):
    def __init__(self, vocab_size, d_model):
        super(TokenEmbedding, self).__init__(vocab_size, d_model, padding_idx=1)

# Sinusoidal positional encoding: computes position information for the input sequence
class PositionalEmbedding(nn.Module):
    def __init__(self, d_model, max_len, device):
        super(PositionalEmbedding, self).__init__()
        # Initialize an all-zero table; it is fixed, so no gradients are needed
        self.encoding = torch.zeros(max_len, d_model, device=device)
        self.encoding.requires_grad = False
        pos = torch.arange(0, max_len, device=device)
        pos = pos.float().unsqueeze(1)
        _2i = torch.arange(0, d_model, step=2, device=device).float()
        # Even dimensions use sin, odd dimensions use cos
        self.encoding[:, 0::2] = torch.sin(pos * torch.exp(_2i * -(math.log(10000.0) / d_model)))
        self.encoding[:, 1::2] = torch.cos(pos * torch.exp(_2i * -(math.log(10000.0) / d_model)))

    # Forward pass: return the first seq_len rows of the encoding table
    def forward(self, x):
        batch_size, seq_len = x.size()
        return self.encoding[:seq_len, :]

# Combine token embedding and positional embedding, then apply dropout
class TransformerEmbedding(nn.Module):
    def __init__(self, vocab_size, d_model, max_len, drop_prob, device):
        super(TransformerEmbedding, self).__init__()
        self.tok_emb = TokenEmbedding(vocab_size, d_model)
        self.pos_emb = PositionalEmbedding(d_model, max_len, device)
        self.drop = nn.Dropout(p=drop_prob)

    def forward(self, x):
        tok_emb = self.tok_emb(x)
        pos_emb = self.pos_emb(x)
        return self.drop(tok_emb + pos_emb)
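
As a quick sanity check of the embedding module, the sketch below feeds a batch of random token indices through TransformerEmbedding; the vocabulary size, sequence length, and other hyperparameters are made up purely for this demo.

# Hypothetical hyperparameters, chosen only for this demo
vocab_size, d_model, max_len, drop_prob = 1000, 512, 256, 0.1
emb = TransformerEmbedding(vocab_size, d_model, max_len, drop_prob, device='cpu')
tokens = torch.randint(0, vocab_size, (2, 10))   # batch of 2 sequences, 10 tokens each
print(emb(tokens).shape)                         # expected: torch.Size([2, 10, 512])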

2. Multi-Head Attention

import torch
from torch import nn
import torch.nn.functional as F
import math
from torch import Tensor

class MutiHeadAttention(nn.Module):
    def __init__(self, d_model, n_head):
        super(MutiHeadAttention, self).__init__()
        self.n_head = n_head
        self.d_model = d_model
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.w_combine = nn.Linear(d_model, d_model)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, q, k, v, mask=None):
        # q has shape [batch, q_len, dimension]; k and v have shape [batch, k_len, dimension]
        # (the lengths differ in cross-attention, so q and k are measured separately)
        batch, q_len, dimension = q.shape
        k_len = k.shape[1]
        # Dimension of each head
        n_d = self.d_model // self.n_head
        q, k, v = self.w_q(q), self.w_k(k), self.w_v(v)
        # Reshape into [batch, n_head, len, n_d]
        q = q.view(batch, q_len, self.n_head, n_d).permute(0, 2, 1, 3)
        k = k.view(batch, k_len, self.n_head, n_d).permute(0, 2, 1, 3)
        v = v.view(batch, k_len, self.n_head, n_d).permute(0, 2, 1, 3)
        # Scaled dot-product: q times k transposed
        score = q @ k.transpose(2, 3) / math.sqrt(n_d)
        if mask is not None:
            score = score.masked_fill(mask == 0, -10000)
        score = self.softmax(score) @ v
        # Merge the heads back into [batch, q_len, dimension]
        score = score.permute(0, 2, 1, 3).contiguous().view(batch, q_len, dimension)
        # Final output projection
        output = self.w_combine(score)
        return output

# Quick sanity check: batch 128, sequence length 32, model dimension 512
d_model = 512
n_head = 8
x = torch.rand(128, 32, 512)
attention = MutiHeadAttention(d_model, n_head)
print(attention(x, x, x))
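
To see the mask in action, the sketch below builds a lower-triangular (causal) mask and passes it to the module. The mask shape here is an assumption on my part: anything broadcastable to [batch, n_head, q_len, k_len] with 0 marking blocked positions works with the masked_fill call above.

# Causal mask: position i may only attend to positions <= i
seq_len = x.size(1)
causal_mask = torch.tril(torch.ones(seq_len, seq_len)).view(1, 1, seq_len, seq_len)
out = attention(x, x, x, mask=causal_mask)
print(out.shape)   # torch.Size([128, 32, 512])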

3. Layer Normalization

import torch
from torch import nn
import torch.nn.functional as F
import math
from torch import Tensor

class LayerNorm(nn.Module):
    def __init__(self, d_model, eps=1e-12):
        super(LayerNorm, self).__init__()
        # Learnable scale and shift parameters
        self.gamma = nn.Parameter(torch.ones(d_model))
        self.beta = nn.Parameter(torch.zeros(d_model))
        self.eps = eps

    # Forward pass: normalize over the last (feature) dimension
    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        var = x.var(-1, unbiased=False, keepdim=True)
        out = (x - mean) / torch.sqrt(var + self.eps)
        out = self.gamma * out + self.beta
        return out
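
Since this formula matches what torch.nn.LayerNorm computes, a small check (shapes and tolerance below are arbitrary) can confirm that the hand-written version agrees with the built-in one when both use the same eps:

# Compare the hand-written LayerNorm with PyTorch's built-in nn.LayerNorm
x = torch.rand(2, 5, 512)
custom = LayerNorm(512, eps=1e-5)
builtin = nn.LayerNorm(512, eps=1e-5)
print(torch.allclose(custom(x), builtin(x), atol=1e-5))   # expected: True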

4. Encoder

import torch.nn as nn
import torch.nn.functional as F
import torch
from Transformer.Embedding import TransformerEmbedding
from Transformer.Attention import MutiHeadAttention

# Position-wise feed-forward network: two linear layers with ReLU and dropout in between
class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model, hidden, dropout=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.fc1 = nn.Linear(d_model, hidden)
        self.fc2 = nn.Linear(hidden, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return x

class EncoderLayer(nn.Module):
    def __init__(self, d_model, n_head, ffn_hidden, dropout=0.1):
        super(EncoderLayer, self).__init__()
        # Hand-written multi-head attention from section 2, so the 4-D pad mask
        # built in the Transformer class can be applied directly
        self.attention = MutiHeadAttention(d_model, n_head)
        self.norm1 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.ffn = PositionwiseFeedForward(d_model, ffn_hidden, dropout=dropout)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        _x = x
        x = self.attention(x, x, x, mask)
        x = self.dropout1(x)
        # Residual connection + layer norm
        x = self.norm1(x + _x)
        _x = x
        # Feed-forward network
        x = self.ffn(x)
        x = self.dropout2(x)
        # Residual connection + layer norm
        x = self.norm2(x + _x)
        return x

class Encoder(nn.Module):
    # device can be set to 'cuda' if torch.cuda.is_available() else 'cpu'
    def __init__(self, enc_voc_size, max_len, d_model, ffn_hidden, n_head, n_layer, dropout=0.1, device='cpu'):
        super(Encoder, self).__init__()
        self.embedding = TransformerEmbedding(enc_voc_size, d_model, max_len, dropout, device)
        self.layers = nn.ModuleList([EncoderLayer(d_model, n_head, ffn_hidden, dropout) for _ in range(n_layer)])

    def forward(self, x, s_mask):
        # x has shape [batch_size, seq_len] of token indices
        x = self.embedding(x)
        for layer in self.layers:
            x = layer(x, s_mask)
        return x
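
A minimal usage sketch for the encoder stack; the hyperparameters are invented for illustration, and the mask is left as None to keep the demo short.

# Hypothetical hyperparameters, purely for illustration
enc_voc_size, max_len, d_model, ffn_hidden, n_head, n_layer = 1000, 256, 512, 2048, 8, 6
encoder = Encoder(enc_voc_size, max_len, d_model, ffn_hidden, n_head, n_layer)
src = torch.randint(0, enc_voc_size, (2, 20))   # batch of 2 sequences, 20 tokens each
print(encoder(src, None).shape)                 # expected: torch.Size([2, 20, 512])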

5. Decoder

import torch
from torch import nn
import torch.nn.functional as F
import math
from torch import Tensor
from Transformer.layernorm import LayerNorm
from Transformer.Attention import MutiHeadAttention
from Transformer.Encoder import PositionwiseFeedForward
from Transformer.Embedding import TransformerEmbedding

class DecoderLayer(nn.Module):
    def __init__(self, d_model, ffn_hidden, n_head, drop_prob):
        super(DecoderLayer, self).__init__()
        # Masked self-attention over the decoder input
        self.attention1 = MutiHeadAttention(d_model, n_head)
        self.norm1 = LayerNorm(d_model)
        self.dropout1 = nn.Dropout(drop_prob)
        # Cross-attention over the encoder output
        self.cross_attention = MutiHeadAttention(d_model, n_head)
        self.norm2 = LayerNorm(d_model)
        self.dropout2 = nn.Dropout(drop_prob)
        self.ffn = PositionwiseFeedForward(d_model, ffn_hidden, drop_prob)
        self.norm3 = LayerNorm(d_model)
        self.dropout3 = nn.Dropout(drop_prob)

    def forward(self, dec, enc, t_mask, s_mask):
        # Masked self-attention with residual connection
        _x = dec
        x = self.attention1(dec, dec, dec, t_mask)
        x = self.dropout1(x)
        x = self.norm1(x + _x)
        # Cross-attention with residual connection
        _x = x
        x = self.cross_attention(x, enc, enc, s_mask)
        x = self.dropout2(x)
        x = self.norm2(x + _x)
        # Feed-forward network with residual connection
        _x = x
        x = self.ffn(x)
        x = self.dropout3(x)
        x = self.norm3(x + _x)
        return x

class Decoder(nn.Module):
    def __init__(self, dec_voc_size, max_len, d_model, ffn_hidden, n_head, n_layer, drop_prob, device):
        super(Decoder, self).__init__()
        self.embedding = TransformerEmbedding(dec_voc_size, d_model, max_len, drop_prob, device)
        self.layers = nn.ModuleList([DecoderLayer(d_model, ffn_hidden, n_head, drop_prob) for _ in range(n_layer)])
        self.fc = nn.Linear(d_model, dec_voc_size)

    def forward(self, dec, enc, t_mask, s_mask):
        dec = self.embedding(dec)
        for layer in self.layers:
            dec = layer(dec, enc, t_mask, s_mask)
        # Project back to vocabulary size
        dec = self.fc(dec)
        return dec
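
A minimal usage sketch for the decoder; all hyperparameters are invented, a random tensor stands in for the encoder output, and both masks are left as None.

# Hypothetical hyperparameters, purely for illustration
dec_voc_size, max_len, d_model, ffn_hidden, n_head, n_layer, drop_prob = 1200, 256, 512, 2048, 8, 6, 0.1
decoder = Decoder(dec_voc_size, max_len, d_model, ffn_hidden, n_head, n_layer, drop_prob, 'cpu')
trg = torch.randint(0, dec_voc_size, (2, 15))   # target token indices
enc_out = torch.rand(2, 20, d_model)            # stand-in for the encoder output
print(decoder(trg, enc_out, None, None).shape)  # expected: torch.Size([2, 15, 1200])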

6. Transformer

import torch
from torch import nn
import torch.nn.functional as F
import math
from torch import Tensor
from Transformer.Decoder import Decoder
from Transformer.Encoder import Encoder

class Transformer(nn.Module):
    def __init__(self, src_pad_idx, trg_pad_idx, enc_voc_size, dec_voc_size, d_model, max_len, n_heads, ffn_hidden, n_layers, drop_prob, device):
        super(Transformer, self).__init__()
        self.encoder = Encoder(enc_voc_size, max_len, d_model, ffn_hidden, n_heads, n_layers, drop_prob, device)
        self.decoder = Decoder(dec_voc_size, max_len, d_model, ffn_hidden, n_heads, n_layers, drop_prob, device)
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.device = device

    def make_pad_mask(self, q, k, pad_idx_q, pad_idx_k):
        # Build a [batch, 1, len_q, len_k] mask that is False wherever q or k is padding
        len_q, len_k = q.size(1), k.size(1)
        q = q.ne(pad_idx_q).unsqueeze(1).unsqueeze(3)
        q = q.repeat(1, 1, 1, len_k)
        k = k.ne(pad_idx_k).unsqueeze(1).unsqueeze(2)
        k = k.repeat(1, 1, len_q, 1)
        mask = q & k
        return mask

    def make_causal_mask(self, q, k):
        # Lower-triangular mask that blocks attention to future positions
        len_q, len_k = q.size(1), k.size(1)
        mask = torch.tril(torch.ones(len_q, len_k)).bool().to(self.device)
        return mask

    def forward(self, src, trg):
        src_mask = self.make_pad_mask(src, src, self.src_pad_idx, self.src_pad_idx)
        trg_mask = self.make_pad_mask(trg, trg, self.trg_pad_idx, self.trg_pad_idx) & self.make_causal_mask(trg, trg)
        # Cross-attention mask: queries come from the target, keys from the source
        src_trg_mask = self.make_pad_mask(trg, src, self.trg_pad_idx, self.src_pad_idx)
        enc = self.encoder(src, src_mask)
        out = self.decoder(trg, enc, trg_mask, src_trg_mask)
        return out
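
Putting it all together, a minimal end-to-end sketch; the pad index, vocabulary sizes, and the remaining hyperparameters are made up for illustration, and real inputs would come from a tokenizer.

# End-to-end sanity check with made-up hyperparameters; pad index 1 matches TokenEmbedding's padding_idx
model = Transformer(src_pad_idx=1, trg_pad_idx=1, enc_voc_size=1000, dec_voc_size=1200,
                    d_model=512, max_len=256, n_heads=8, ffn_hidden=2048,
                    n_layers=3, drop_prob=0.1, device='cpu')
src = torch.randint(2, 1000, (2, 20))   # source token indices (starting at 2 to avoid the pad index)
trg = torch.randint(2, 1200, (2, 15))   # target token indices
print(model(src, trg).shape)            # expected: torch.Size([2, 15, 1200])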


