Transformer Explained (with Code)

Introduction

The Transformer model was proposed by Ashish Vaswani et al. of the Google team in the June 2017 paper "Attention Is All You Need", and it has since become the model of choice in NLP. The Transformer abandons the sequential structure of RNNs in favor of a Self-Attention mechanism, which allows training to be parallelized and lets the model exploit global information across the entire input; Seq2seq models built on the Transformer have brought significant improvements across NLP tasks. This article explains how the Transformer works and the operational details of its components, and ends with a complete, runnable code example.

The Attention Mechanism

The core mechanism in the Transformer is Self-Attention, which is inspired by the human visual attention mechanism. When perceiving a scene, people tend to focus on its most salient objects; to make good use of limited visual processing resources, they select a specific region of the visual field and concentrate on it. The purpose of an attention mechanism is likewise to assign attention weights over the input, i.e., to decide which parts of the input to focus on, and to allocate the limited processing resources to the important parts.
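Concretely, Self-Attention maps each token to a query, a key, and a value vector and computes scaled dot-product attention, as defined in the paper:

Attention(Q, K, V) = softmax(QK^T / sqrt(d_k)) V

Before the full implementation below, here is a minimal sketch of this computation in PyTorch (single head, no mask; the shapes are toy values chosen only for illustration):

import torch
import torch.nn.functional as F

# Toy shapes: batch 1, sequence length 4, key dimension d_k = 8
q = torch.randn(1, 4, 8)
k = torch.randn(1, 4, 8)
v = torch.randn(1, 4, 8)

scores = q @ k.transpose(-2, -1) / (8 ** 0.5)  # (1, 4, 4) similarity matrix
weights = F.softmax(scores, dim=-1)            # each row sums to 1: attention weights
out = weights @ v                              # (1, 4, 8) weighted sum of the values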

Code Example

import torch
import torch.nn as nn


class SelfAttention(nn.Module):
    def __init__(self, embed_size, heads):
        super(SelfAttention, self).__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads

        assert (
            self.head_dim * heads == embed_size
        ), "Embed size needs to be divisible by heads"

        self.values = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.keys = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.queries = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.fc_out = nn.Linear(heads * self.head_dim, embed_size)

    def forward(self, values, keys, query, mask):
        N = query.shape[0]
        value_len, key_len, query_len = values.shape[1], keys.shape[1], query.shape[1]

        # Split the embedding into self.heads pieces
        values = values.reshape(N, value_len, self.heads, self.head_dim)
        keys = keys.reshape(N, key_len, self.heads, self.head_dim)
        queries = query.reshape(N, query_len, self.heads, self.head_dim)

        values = self.values(values)
        keys = self.keys(keys)
        queries = self.queries(queries)

        # queries shape: (N, query_len, heads, head_dim)
        # keys shape:    (N, key_len, heads, head_dim)
        # energy shape:  (N, heads, query_len, key_len)
        energy = torch.einsum("nqhd,nkhd->nhqk", [queries, keys])

        # Mask out padded (or future) positions before the softmax
        if mask is not None:
            energy = energy.masked_fill(mask == 0, float("-1e20"))

        # NOTE: the paper scales by sqrt(d_k) (= head_dim); this code scales
        # by sqrt(embed_size), which it keeps as-is
        attention = torch.softmax(energy / (self.embed_size ** (1 / 2)), dim=3)

        # attention shape: (N, heads, query_len, key_len)
        # values shape:    (N, value_len, heads, head_dim)
        # out after einsum: (N, query_len, heads, head_dim), then concat the heads
        out = torch.einsum("nhql,nlhd->nqhd", [attention, values]).reshape(
            N, query_len, self.heads * self.head_dim
        )

        out = self.fc_out(out)
        return out
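As a quick shape check (a hypothetical usage sketch, not part of the original post), the module can be exercised on random input, passing the same tensor as queries, keys, and values:

# embed_size=256 is split across 8 heads of dimension 32 each
attn = SelfAttention(embed_size=256, heads=8)
x = torch.randn(2, 10, 256)        # (batch, seq_len, embed_size)
out = attn(x, x, x, mask=None)     # self-attention: Q, K, V all come from x
print(out.shape)                   # torch.Size([2, 10, 256])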

class TransformerBlock(nn.Module):
    def __init__(self, embed_size, heads, dropout, forward_expansion):
        super(TransformerBlock, self).__init__()
        self.attention = SelfAttention(embed_size, heads)
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)

        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, forward_expansion * embed_size),
            nn.ReLU(),
            nn.Linear(forward_expansion * embed_size, embed_size),
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, value, key, query, mask):
        attention = self.attention(value, key, query, mask)

        # Residual connection around the attention sub-layer, then LayerNorm
        x = self.dropout(self.norm1(attention + query))
        forward = self.feed_forward(x)
        # Residual connection around the feed-forward sub-layer, then LayerNorm
        out = self.dropout(self.norm2(forward + x))
        return out

class Encoder(nn.Module):
    def __init__(
        self,
        src_vocab_size,
        embed_size,
        num_layers,
        heads,
        device,
        forward_expansion,
        dropout,
        max_length,
    ):
        super(Encoder, self).__init__()
        self.embed_size = embed_size
        self.device = device
        self.word_embedding = nn.Embedding(src_vocab_size, embed_size)
        # Learned positional embeddings, one vector per position up to max_length
        self.position_embedding = nn.Embedding(max_length, embed_size)

        self.layers = nn.ModuleList(
            [
                TransformerBlock(
                    embed_size,
                    heads,
                    dropout=dropout,
                    forward_expansion=forward_expansion,
                )
                for _ in range(num_layers)
            ]
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        N, seq_length = x.shape
        positions = torch.arange(0, seq_length).expand(N, seq_length).to(self.device)
        out = self.dropout(self.word_embedding(x) + self.position_embedding(positions))

        # In the encoder, query, key and value are all the same input
        for layer in self.layers:
            out = layer(out, out, out, mask)

        return out
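Note that this Encoder learns its positional embeddings via nn.Embedding, whereas the original paper uses fixed sinusoidal encodings. For comparison, here is a sketch of the sinusoidal variant (not part of the original code; shown only for reference):

import math

def sinusoidal_positions(max_length, embed_size):
    # Fixed sinusoidal positional encodings from "Attention Is All You Need"
    # (reference only; the Encoder above uses learned embeddings instead)
    pos = torch.arange(max_length, dtype=torch.float).unsqueeze(1)
    div = torch.exp(
        torch.arange(0, embed_size, 2, dtype=torch.float)
        * (-math.log(10000.0) / embed_size)
    )
    pe = torch.zeros(max_length, embed_size)
    pe[:, 0::2] = torch.sin(pos * div)  # even dimensions
    pe[:, 1::2] = torch.cos(pos * div)  # odd dimensions
    return pe  # (max_length, embed_size)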

class DecoderBlock(nn.Module):
    def __init__(self, embed_size, heads, forward_expansion, dropout, device):
        super(DecoderBlock, self).__init__()
        self.attention = SelfAttention(embed_size, heads)
        self.norm = nn.LayerNorm(embed_size)
        self.transformer_block = TransformerBlock(
            embed_size, heads, dropout, forward_expansion
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, value, key, src_mask, trg_mask):
        # Masked self-attention over the target sequence
        attention = self.attention(x, x, x, trg_mask)
        query = self.dropout(self.norm(attention + x))
        # Cross-attention: queries from the decoder, keys/values from the encoder
        out = self.transformer_block(value, key, query, src_mask)
        return out


class Decoder(nn.Module):
    def __init__(
        self,
        trg_vocab_size,
        embed_size,
        num_layers,
        heads,
        forward_expansion,
        dropout,
        device,
        max_length,
    ):
        super(Decoder, self).__init__()
        self.device = device
        self.word_embedding = nn.Embedding(trg_vocab_size, embed_size)
        self.position_embedding = nn.Embedding(max_length, embed_size)
        self.layers = nn.ModuleList(
            [
                DecoderBlock(embed_size, heads, forward_expansion, dropout, device)
                for _ in range(num_layers)
            ]
        )
        self.fc_out = nn.Linear(embed_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_out, src_mask, trg_mask):
        N, seq_length = x.shape
        positions = torch.arange(0, seq_length).expand(N, seq_length).to(self.device)
        x = self.dropout(self.word_embedding(x) + self.position_embedding(positions))

        for layer in self.layers:
            x = layer(x, enc_out, enc_out, src_mask, trg_mask)

        out = self.fc_out(x)
        return out

class Transformer(nn.Module):
    def __init__(
        self,
        src_vocab_size,
        trg_vocab_size,
        src_pad_idx,
        trg_pad_idx,
        embed_size=256,
        num_layers=6,
        forward_expansion=4,
        heads=8,
        dropout=0,
        device="cuda",
        max_length=100,
    ):
        super(Transformer, self).__init__()
        self.encoder = Encoder(
            src_vocab_size,
            embed_size,
            num_layers,
            heads,
            device,
            forward_expansion,
            dropout,
            max_length,
        )
        self.decoder = Decoder(
            trg_vocab_size,
            embed_size,
            num_layers,
            heads,
            forward_expansion,
            dropout,
            device,
            max_length,
        )

        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.device = device

    def make_src_mask(self, src):
        # Hide padding tokens from attention: (N, 1, 1, src_len)
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
        return src_mask.to(self.device)

    def make_trg_mask(self, trg):
        # Lower-triangular causal mask, shape (N, 1, trg_len, trg_len),
        # so that position i can only attend to positions <= i
        N, trg_len = trg.shape
        trg_mask = torch.tril(torch.ones((trg_len, trg_len))).expand(
            N, 1, trg_len, trg_len
        )
        return trg_mask.to(self.device)

    def forward(self, src, trg):
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
        enc_src = self.encoder(src, src_mask)
        out = self.decoder(trg, enc_src, src_mask, trg_mask)
        return out
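To make the target mask concrete, here is what the lower-triangular matrix inside make_trg_mask looks like for a length-4 sequence (a toy illustration, not part of the original post):

print(torch.tril(torch.ones((4, 4))))
# tensor([[1., 0., 0., 0.],
#         [1., 1., 0., 0.],
#         [1., 1., 1., 0.],
#         [1., 1., 1., 1.]])
# Row i has 1s only up to column i, so position i cannot attend to future tokens.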

if __name__ == '__main__':
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)
    # Toy batches of token ids; 0 is the padding index
    x = torch.tensor([[1, 5, 6, 4, 3, 9, 5, 2, 0], [1, 8, 7, 3, 4, 5, 6, 7, 2]]).to(device)
    trg = torch.tensor([[1, 7, 4, 3, 5, 9, 2, 0], [1, 5, 6, 2, 4, 7, 6, 2]]).to(device)

    src_pad_idx = 0
    trg_pad_idx = 0
    src_vocab_size = 10
    trg_vocab_size = 10
    model = Transformer(src_vocab_size, trg_vocab_size, src_pad_idx, trg_pad_idx,
                        device=device).to(device)
    # Teacher forcing: feed the target shifted right (drop its last token)
    out = model(x, trg[:, :-1])
    print(out.shape)  # torch.Size([2, 7, 10]): (batch, trg_len - 1, trg_vocab_size)
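Once trained, such a model is used autoregressively at inference time: start from a begin-of-sequence token and feed the growing output back in. A hypothetical greedy-decoding sketch, continuing the script above (not part of the original post; it assumes token 1 serves as the BOS symbol):

model.eval()
src = torch.tensor([[1, 5, 6, 4, 3, 9, 5, 2, 0]]).to(device)
trg_tokens = torch.tensor([[1]]).to(device)  # assumed BOS token
with torch.no_grad():
    for _ in range(5):
        out = model(src, trg_tokens)                             # (1, cur_len, trg_vocab_size)
        next_token = out[:, -1, :].argmax(dim=-1, keepdim=True)  # most likely next token
        trg_tokens = torch.cat([trg_tokens, next_token], dim=1)
print(trg_tokens)  # the (untrained) model's greedy continuation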

Copyright notice: this article was originally written by the CSDN blogger 鬼道2022 and is distributed under the CC 4.0 BY-SA license; please include the original source link and this notice when reposting.

Original link: https://blog.csdn.net/qq_38406029/article/details/122050257
