|
|
import torch |
|
|
import torch.nn as nn |
|
|
|
|
|
class LayerNorm(nn.Module):
    """Layer Normalization over the last (embedding) dimension.

    Normalizes each token's embedding vector to zero mean and unit
    variance, then applies a learnable per-dimension affine transform
    (scale and shift).

    Args:
        emb_dim: Size of the embedding dimension being normalized.
        eps: Small constant added to the variance for numerical
            stability (default 1e-6, matching the original hard-coded
            value).
    """

    def __init__(self, emb_dim, eps=1e-6):
        super().__init__()
        self.eps = eps
        # Learnable affine parameters, initialized to the identity
        # transform (scale=1, shift=0).
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        """Normalize `x` along its last dimension.

        The normalization is applied along the last dimension of the
        input tensor x, which represents the embedding dimension
        (dim=-1). Normalizing over the embedding dimension ensures that
        each token is treated independently, preventing one token from
        influencing another.

        For Transformer models, input data typically has the shape
        [batch_size, seq_len, emb_dim].

        Args:
            x: Tensor whose last dimension has size `emb_dim`.

        Returns:
            Tensor of the same shape as `x`, normalized and affinely
            transformed.
        """
        mean = x.mean(dim=-1, keepdim=True)
        # unbiased=False: population (biased) variance, the standard
        # choice for layer normalization.
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift