import torch
import torch.nn as nn

class LayerNorm(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-6 # small value to avoid division by zero
        self.scale = nn.Parameter(torch.ones(emb_dim)) # trainable scale parameter
        self.shift = nn.Parameter(torch.zeros(emb_dim)) # trainable shift parameter

    def forward(self, x):
        '''
        In this implementation of Layer Normalization, the normalization is applied along 
        the last dimension of the input tensor 𝑋, which represents the embedding dimension (dim=-1). 
        Normalizing over the embedding dimension ensures that each word is treated independently,
        preventing one word from influencing another.

        For Transformer models, input data typically has the following shape: 
                        [batch_size, seq_len, emb_dim]
        '''

        mean = x.mean(dim=-1, keepdim=True)  
        var = x.var(dim=-1, keepdim=True, unbiased=False) # unbiased=False means that the variance is calculated with the Bessel correction
        norm_x = (x - mean) / torch.sqrt(var + self.eps)

        return self.scale * norm_x + self.shift