Large Language Models#

Attention Is All You Need#

import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
  • The paper revolutionized NLP, leading to significant advances in machine translation, text summarization, and many other tasks.

  • The architecture enables parallel processing of tokens, significantly improving training and inference speed by fully utilizing GPU resources.

  • Long sequences can be handled without the loss of information that recurrent models suffer from.

  • The model is built from three key blocks:

    • Multi-head attention

    • Positional embeddings

    • Feed-forward network

Scaled Dot Product Attention#

\[ \text{Attention}(Q, K, V) = \text{softmax}(\frac{Q K^T}{\sqrt{d_k}}) V \]
# query: batch_size x n_heads x query_len x d_k
# key:   batch_size x n_heads x key_len x d_k
# value: batch_size x n_heads x key_len x d_v
class ScaledDotProductAttention(nn.Module):

    def __init__(self, norm,dropout=0.1):
        super(ScaledDotProductAttention, self).__init__()
        self.norm = norm
        self.dropout = nn.Dropout(dropout)

    def forward(self, query, key, value, mask=None):
        scores = torch.matmul(query, key.transpose(-2,-1))
        scores = scores / self.norm
        if mask is not None:
            scores = scores.masked_fill(mask==0, float('-inf'))
        attention_probs = F.softmax(scores,dim=-1)
        output = torch.matmul(self.dropout(attention_probs), value)
        return output, attention_probs
sdp = ScaledDotProductAttention(10)
output, attns = sdp.forward(torch.randn((64,8,256,512)),torch.randn((64,8,256,512)),torch.randn((64,8,256,512)))
output.shape
torch.Size([64, 8, 256, 512])
attns.shape
torch.Size([64, 8, 256, 256])
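
The optional `mask` argument blocks attention to forbidden positions. As a quick illustration (the shapes here are arbitrary), a lower-triangular causal mask restricts each query to attend only to itself and earlier keys:

# causal (lower-triangular) mask: position i may only attend to positions <= i
causal_mask = torch.tril(torch.ones(256, 256))
out, probs = sdp(torch.randn(64, 8, 256, 64),
                 torch.randn(64, 8, 256, 64),
                 torch.randn(64, 8, 256, 64),
                 mask=causal_mask)
probs[0, 0, 0, 1:].sum()   # 0.0: the first query can only attend to the first key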

Multihead Attention#

\[\begin{split}\text{MultiHead}(Q, K, V) = \text{Concat}(\text{head}_1, \dots, \text{head}_h) W^O \\ \text{where}\ \text{head}_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V) \\ W_i^Q \in \mathbb{R}^{\mathrm{d_{model}\times d_k}}, W_i^K \in \mathbb{R}^{\mathrm{d_{model}\times d_k}}, W_i^V \in \mathbb{R}^{\mathrm{d_{model}\times d_v}}, W_i^O \in \mathbb{R}^{\mathrm{hd_v\times d_{model}}}\end{split}\]
class MultiHeadAttention(nn.Module):
    
    def __init__(self, d_model, n_heads,dropout=0.1):
        super(MultiHeadAttention, self).__init__()
        assert d_model % n_heads == 0, "`d_model` should be a multiple of `n_heads`"
        self.dropout = nn.Dropout(dropout)
        self.d_model = d_model
        self.n_heads = n_heads
        self.d_k = int(d_model / n_heads)

        self.W_q = nn.Linear(d_model, d_model, bias=False)
        self.W_k = nn.Linear(d_model, d_model, bias=False)
        self.W_v = nn.Linear(d_model, d_model, bias=False)
        self.W_o = nn.Linear(d_model, d_model)

        self.attention = ScaledDotProductAttention(np.sqrt(self.d_k))

    def forward(self, q, k, v, mask=None):
        """
        q: batch_size x query_len x d_model
        k: batch_size x key_len x d_model
        v: batch_size x key_len x d_model
        mask: batch_size x 1 x source_seq_len
              or batch_size x tgt_seq_len x tgt_seq_len
        """
        Q = q.view(q.size(0), -1, self.n_heads, self.d_k).transpose(1,2) # batch_size x n_heads x query_len x d_k
        K = k.view(k.size(0), -1, self.n_heads, self.d_k).transpose(1,2)
        V = v.view(v.size(0), -1, self.n_heads, self.d_k).transpose(1,2)

        # compute scaled dot-product attention over all heads in parallel
        x, attn = self.attention(Q, K, V, mask)

        # regroup 
        x = x.transpose(1,2).contiguous().view(x.size(0), -1, self.n_heads * self.d_k)
        x = self.W_o(x)

        return x, attn
mha = MultiHeadAttention(d_model=512,n_heads=8)
output, attns = mha.forward(torch.randn((64,256,512)),torch.randn((64,256,512)),torch.randn((64,256,512)))
output.size()
torch.Size([64, 256, 512])
attns.size()
torch.Size([64, 8, 256, 256])

Feed-forward Network#

\[ \text{FFN}(x) = \max(0, xW_1 + b_1)W_2 + b_2 \]
class PositionwiseFeedForward(nn.Module):

    def __init__(self, d_model, d_ff, dropout_rate=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.d_model = d_model
        self.d_ff = d_ff
        self.dropout_rate = dropout_rate

        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        # x: batch_size x seq_len x d_model
        x = self.dropout(F.relu(self.w_1(x)))
        x = self.w_2(x)
        return x
Positional Embeddings#

\[\begin{split} \begin{aligned} \text{PE}_{(pos, 2i)} &= \sin\left(\frac{pos}{10000^{2i/d_{model}}}\right) \\ \text{PE}_{(pos, 2i + 1)} &= \cos\left(\frac{pos}{10000^{2i/d_{model}}}\right) \end{aligned} \end{split}\]

import math

class PositionalEncoding(nn.Module):

    def __init__(self, d_model, dropout_rate=0.1,maxlen=10000):
        super(PositionalEncoding, self).__init__()

        self.d_model = d_model
        self.dropout = nn.Dropout(dropout_rate)
        self.maxlen = maxlen

        pe = torch.zeros(maxlen, d_model)
        position = torch.arange(0, maxlen).unsqueeze(1)

        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )  # (d_model,)

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        pe = pe.unsqueeze(0)  # 1 x maxlen x d_model
        # register as a buffer: saved with the model but not updated by the optimizer
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x: batch_size x seq_len x d_model
        x = x + self.pe[:, :x.size(1), :]
        x = self.dropout(x)
        return x
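
A quick shape check of the module (the batch size and sequence length below are arbitrary):

pos_enc = PositionalEncoding(d_model=512)
pos_enc(torch.randn(2, 16, 512)).shape   # torch.Size([2, 16, 512])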
class EncoderLayer(nn.Module):

    def __init__(self, d_model, n_heads, d_ff, dropout_rate=0.1):
        super(EncoderLayer, self).__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        self.d_ff = d_ff
        self.dropout_rate = dropout_rate

        self.attention_layer = MultiHeadAttention(d_model, n_heads, dropout_rate)
        self.attention_layer_norm = nn.LayerNorm(d_model, eps=1e-6)

        self.ff_layer = PositionwiseFeedForward(d_model, d_ff, dropout_rate)
        self.ff_layer_norm = nn.LayerNorm(d_model, eps=1e-6)

        self.dropout = nn.Dropout(dropout_rate)


    def forward(self, x, mask):

        x1, _ = self.attention_layer(x, x, x, mask)
        x = self.attention_layer_norm(x + self.dropout(x1))
        x1 = self.ff_layer(x)
        x = self.ff_layer_norm(x + self.dropout(x1))

        return x 
class Encoder(nn.Module):

    def __init__(self, vocab_size, d_model, n_layers, n_heads, d_ff, dropout_rate=0.1,maxlen=10000):
        super(Encoder, self).__init__()
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.d_ff = d_ff
        self.dropout_rate = dropout_rate
        self.maxlen = maxlen


        self.tok_embedding = nn.Embedding(vocab_size, d_model)
        self.pos_embedding = PositionalEncoding(d_model, dropout_rate=dropout_rate,maxlen=maxlen)
        self.layers = nn.ModuleList([
            EncoderLayer(d_model, n_heads, d_ff, dropout_rate)
            for _ in range(n_layers)
        ])
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)

    def forward(self, x, mask):
        # x : batch_size x seq_len
        x = self.tok_embedding(x)
        x = self.pos_embedding(x)
        for layer in self.layers:
            x = layer(x, mask)
        x = self.layer_norm(x)
        return x
        # batch_size x seq_len x d_model
class DecoderLayer(nn.Module):

    def __init__(self, d_model, n_heads, d_ff, dropout_rate=0.1):
        super(DecoderLayer, self).__init__()

        self.d_model = d_model
        self.n_heads = n_heads
        self.d_ff = d_ff
        self.dropout_rate = dropout_rate

        self.attn_layer = MultiHeadAttention(d_model, n_heads, dropout_rate)
        self.attn_layer_norm = nn.LayerNorm(d_model, eps=1e-6)

        self.ff_layer = PositionwiseFeedForward(d_model, d_ff, dropout_rate)
        self.ff_layer_norm = nn.LayerNorm(d_model, eps=1e-6)

        self.dropout = nn.Dropout(dropout_rate)

        self.encoder_attn_layer = MultiHeadAttention(d_model, n_heads, dropout_rate)
        self.encoder_attn_layer_norm = nn.LayerNorm(d_model, eps=1e-6)

    def forward(self, x, encoder_op, src_mask, tgt_mask):

        x1, _ = self.attn_layer(x, x, x, tgt_mask)
        x = self.attn_layer_norm(x + self.dropout(x1))
        x1, attn = self.encoder_attn_layer(x, encoder_op, encoder_op, src_mask)
        x = self.encoder_attn_layer_norm(x + self.dropout(x1))

        x1 = self.ff_layer(x)
        x = self.ff_layer_norm(x + self.dropout(x1))

        return x, attn   
class Decoder(nn.Module):

    def __init__(self, vocab_size, d_model, n_layers, n_heads, d_ff, dropout_rate=0.1,maxlen=10000):
        super(Decoder, self).__init__()
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.d_ff = d_ff
        self.dropout_rate = dropout_rate
        self.maxlen = maxlen


        self.tok_embedding = nn.Embedding(vocab_size, d_model)
        self.pos_embedding = PositionalEncoding(d_model, dropout_rate=dropout_rate,maxlen=maxlen)
        self.layers = nn.ModuleList([
            DecoderLayer(d_model, n_heads, d_ff, dropout_rate)
            for _ in range(n_layers)
        ])
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        # x : batch_size x seq_len
        x = self.tok_embedding(x)
        x = self.pos_embedding(x)
        for layer in self.layers:
            x, attn = layer(x, enc_output, src_mask, tgt_mask)
        x = self.layer_norm(x)
        return x, attn
        # batch_size x seq_len x d_model
class Transformer(nn.Module):

    def __init__(self, encoder, decoder, linear_mapper):
        super(Transformer, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.linear_mapper = linear_mapper

    def forward(self, src, tgt, src_mask=None, tgt_mask=None):
        enc_output = self.encoder(src, src_mask)
        dec_output, attn = self.decoder(tgt, enc_output, src_mask, tgt_mask)
        output = self.linear_mapper(dec_output)
        return output, attn
class LinearMapper(nn.Module):
    
    def __init__(self, d_model, vocab_size):
        super(LinearMapper, self).__init__()
        self.affine_map = nn.Linear(d_model, vocab_size)

    def forward(self,x):
        x = self.affine_map(x) 
        output = F.log_softmax(x, dim=-1)
        return output
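
A quick smoke test wiring the custom modules above together (the vocabulary size, layer count, and mask shapes below are illustrative):

src = torch.randint(0, 1000, (2, 10))      # batch_size x src_len
tgt = torch.randint(0, 1000, (2, 12))      # batch_size x tgt_len
src_mask = torch.ones(2, 1, 1, 10)         # attend to every source position
tgt_mask = torch.tril(torch.ones(12, 12))  # causal mask for decoder self-attention

model = Transformer(
    Encoder(vocab_size=1000, d_model=512, n_layers=2, n_heads=8, d_ff=2048),
    Decoder(vocab_size=1000, d_model=512, n_layers=2, n_heads=8, d_ff=2048),
    LinearMapper(d_model=512, vocab_size=1000),
)
log_probs, attn = model(src, tgt, src_mask, tgt_mask)
log_probs.shape   # torch.Size([2, 12, 1000])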
transformer = nn.Transformer(d_model=512, nhead=8, 
                             num_encoder_layers=6, num_decoder_layers=6, 
                             dim_feedforward=2048, dropout=0.1)
/opt/hostedtoolcache/Python/3.11.10/x64/lib/python3.11/site-packages/torch/nn/modules/transformer.py:307: UserWarning: enable_nested_tensor is True, but self.use_nested_tensor is False because encoder_layer.self_attn.batch_first was not True(use batch_first for better inference performance)
  warnings.warn(f"enable_nested_tensor is True, but self.use_nested_tensor is False because {why_not_sparsity_fast_path}")
transformer
Transformer(
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-5): 6 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=512, bias=True)
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
    (norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  )
  (decoder): TransformerDecoder(
    (layers): ModuleList(
      (0-5): 6 x TransformerDecoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (multihead_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=512, bias=True)
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm3): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
        (dropout3): Dropout(p=0.1, inplace=False)
      )
    )
    (norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  )
)
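
As a quick sanity check of the built-in module (note that nn.Transformer defaults to the sequence-first layout and expects already-embedded inputs):

src = torch.randn(10, 32, 512)   # src_len x batch_size x d_model
tgt = torch.randn(20, 32, 512)   # tgt_len x batch_size x d_model
tgt_mask = transformer.generate_square_subsequent_mask(20)   # causal mask for the decoder
out = transformer(src, tgt, tgt_mask=tgt_mask)
out.shape   # torch.Size([20, 32, 512])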

Fine-tuning a Decoder-Only Transformer#

from tokenizers import normalizers
from datasets import load_dataset
ds = load_dataset("SuryaKrishna02/aya-telugu-poems")
def generate_data(x):
    for each_s in x:
        yield each_s['inputs']
g = generate_data(ds['train'])
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers, processors

tokenizer = Tokenizer(models.BPE())

tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

trainer = trainers.BpeTrainer(vocab_size=1000, special_tokens=["<s>", "<pad>", "</s>", "<unk>"])

tokenizer.train_from_iterator(g, trainer)
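
To sanity-check the trained tokenizer, we can encode one poem from the dataset and inspect the resulting subword tokens (purely illustrative):

sample = next(generate_data(ds['train']))   # first poem in the corpus
encoding = tokenizer.encode(sample)
encoding.tokens[:10], len(encoding.ids)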

class PositionalEncoding(nn.Module):
    def __init__(self, embed_size, max_len):
        super(PositionalEncoding, self).__init__()
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embed_size, 2) * (-math.log(10000.0) / embed_size))
        pe = torch.zeros(1, max_len, embed_size)
        pe[0, :, 0::2] = torch.sin(position * div_term)
        pe[0, :, 1::2] = torch.cos(position * div_term)
        # register as a buffer so it moves with the model but is not trained
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x: batch_size x seq_len x d_model
        x = x + self.pe[:, :x.size(1), :]
        return x
import math
class DecoderOnlyTransformer(nn.Module):
    
    def __init__(self, vocab_size, d_model, n_layers, n_heads, d_ff, 
                 dropout_rate=0.1,max_len=32):
        
        super(DecoderOnlyTransformer, self).__init__()
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.d_ff = d_ff
        self.dropout_rate = dropout_rate
        self.max_len = max_len

        self.position_embedding = PositionalEncoding(d_model, max_len)
        self.tok_embedding = nn.Embedding(vocab_size, d_model)
        decoder_layer = nn.TransformerDecoderLayer(d_model=d_model,
                                                   nhead=n_heads,
                                                   dim_feedforward=d_ff,
                                                   dropout=dropout_rate,
                                                   batch_first=True)
        self.transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers=n_layers)
        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        x = self.tok_embedding(x)
        x = self.position_embedding(x)
        # causal mask: each position may only attend to itself and earlier positions
        seq_len = x.size(1)
        causal_mask = torch.triu(torch.full((seq_len, seq_len), float('-inf')), diagonal=1)
        x = self.transformer_decoder(x, memory=x, tgt_mask=causal_mask, memory_mask=causal_mask)
        logits = self.fc_out(x)
        return logits
vocab_size = 10000
embed_size = 512  
num_heads = 8  
num_layers = 6 
hidden_dim = 2048  
max_len = 32 
g = generate_data(ds['train'])
def tokenize_data(corpus, tokenizer, max_len=32):
    tokenized_corpus = [tokenizer.encode(sentence).ids for sentence in corpus]
    tokenized_corpus = [x[0:max_len] for x in tokenized_corpus]
    return tokenized_corpus
tokenized_corpus = tokenize_data(g, tokenizer)
len(tokenized_corpus)
5115
batch_size = 10
sequence_length = 32
input_sequences = torch.zeros((batch_size, sequence_length), dtype=torch.long)
input_sequences.size()
torch.Size([10, 32])
for i in range(batch_size):
    input_sequences[i, :len(tokenized_corpus[i])] = torch.tensor(tokenized_corpus[i])
input_sequences.size()
torch.Size([10, 32])
transformer = DecoderOnlyTransformer( vocab_size=30000, d_model=512, n_layers=6, n_heads=8, 
                       d_ff=512, dropout_rate=0.1,max_len=32)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(transformer.parameters(), lr=0.001)
num_epochs = 10
for epoch in range(num_epochs):
    transformer.train()
    optimizer.zero_grad()
    
    # Forward pass
    logits = transformer(input_sequences)
    
    # Shift input by 1 token to compute next token prediction loss
    target_sequences = input_sequences[:, 1:]
    logits = logits[:, :-1, :].reshape(-1, 30000)  # Reshape for loss computation
    target_sequences = target_sequences.reshape(-1)
    
    # Compute loss
    loss = criterion(logits, target_sequences)
    
    # Backward pass and optimization
    loss.backward()
    optimizer.step()
    
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")
Epoch 1/10, Loss: 10.540102005004883
Epoch 2/10, Loss: 8.720990180969238
Epoch 3/10, Loss: 7.641439437866211
Epoch 4/10, Loss: 6.5575408935546875
Epoch 5/10, Loss: 5.865268230438232
Epoch 6/10, Loss: 5.230606555938721
Epoch 7/10, Loss: 4.630501747131348
Epoch 8/10, Loss: 4.067417144775391
Epoch 9/10, Loss: 3.589400053024292
Epoch 10/10, Loss: 3.262800455093384
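
With the model trained for a few epochs, a minimal greedy-decoding loop can be used to sample continuations. This is only a sketch: it reuses the `max_len` defined above and restricts predictions to ids the tokenizer actually knows.

def generate(model, tokenizer, prompt, max_new_tokens=20):
    model.eval()
    ids = tokenizer.encode(prompt).ids
    with torch.no_grad():
        for _ in range(max_new_tokens):
            x = torch.tensor(ids[-max_len:]).unsqueeze(0)   # 1 x seq_len
            logits = model(x)                               # 1 x seq_len x vocab_size
            # greedy: pick the most likely next token among ids the tokenizer knows
            next_id = logits[0, -1, :tokenizer.get_vocab_size()].argmax().item()
            ids.append(next_id)
    return tokenizer.decode(ids)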

LoRA Fine-tuning#

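LoRA freezes the pretrained weights and learns a low-rank update ΔW = BA for selected layers, so only a small fraction of parameters are trained. A minimal sketch of the idea (not the peft implementation; the layer sizes below are illustrative):

class LoRALinear(nn.Module):
    """Wrap a frozen linear layer with a trainable low-rank update (alpha/r) * B @ A."""
    def __init__(self, base_linear, r=8, alpha=32):
        super().__init__()
        self.base = base_linear
        for p in self.base.parameters():
            p.requires_grad_(False)          # freeze the pretrained weights
        self.scale = alpha / r
        self.A = nn.Parameter(torch.randn(r, base_linear.in_features) * 0.01)
        self.B = nn.Parameter(torch.zeros(base_linear.out_features, r))  # zero init: no change at start

    def forward(self, x):
        return self.base(x) + self.scale * (x @ self.A.T @ self.B.T)

lora_layer = LoRALinear(nn.Linear(512, 512))
lora_layer(torch.randn(4, 512)).shape   # torch.Size([4, 512])
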
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load pre-trained GPT-2 model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
/home/sandeep/anaconda3/lib/python3.12/site-packages/transformers/tokenization_utils_base.py:1601: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884
  warnings.warn(
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=8,                        # rank of the low-rank update matrices
    lora_alpha=32,              # scaling factor applied to the update
    target_modules=["c_attn"],  # GPT-2's fused query/key/value projection
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)
/home/sandeep/anaconda3/lib/python3.12/site-packages/peft/tuners/lora/layer.py:1091: UserWarning: fan_in_fan_out is set to False but the target module is `Conv1D`. Setting fan_in_fan_out to True.
  warnings.warn(
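peft's `print_trainable_parameters()` can be used to confirm that only the LoRA adapter weights, a small fraction of GPT-2's parameters, remain trainable:

model.print_trainable_parameters()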
from datasets import load_dataset

# Load a dataset (e.g., WikiText)
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], return_tensors="pt", padding=True, truncation=True,max_length=512)
tokenizer.pad_token = tokenizer.eos_token
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
def tokenize_and_prepare_labels(examples):
    # Tokenize the input text
    tokenized_inputs = tokenizer(examples['text'], return_tensors="pt", padding=True, truncation=True)
    
    # GPT-2 uses the input as the label, so we clone the input_ids
    tokenized_inputs["labels"] = tokenized_inputs["input_ids"].clone()

    return tokenized_inputs

# Apply the tokenization and labeling function
tokenized_dataset = dataset.map(tokenize_and_prepare_labels, batched=True, remove_columns='text')
len(tokenized_dataset['train'][1]['input_ids'])
473
tokenized_dataset['train']
Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 36718
})
from transformers import Trainer, TrainingArguments

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",  
    num_train_epochs=3,  
    logging_dir="./logs",  
    logging_steps=10,  
    save_steps=500, 
    learning_rate=5e-5,  
    evaluation_strategy="steps", 
    save_total_limit=2,
    per_device_train_batch_size =1,
    per_device_eval_batch_size =1
)

# Initialize the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
)

# Fine-tune the model
# trainer.train()
/home/sandeep/anaconda3/lib/python3.12/site-packages/transformers/training_args.py:1525: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead
  warnings.warn(

BERT Models#

  • Language models trained in an unsupervised manner to learn good contextualized representations.

  • Contains only a stack of encoder layers (multi-head attention, feed-forward blocks, token embeddings, position embeddings).

  • Training is done using two objectives:

    • Masked Language Modelling: randomly masks some tokens in the input sequence and trains the model to predict them (see the masking sketch after this list).

    • Next Sentence Prediction: predicts whether a pair of sentences appear consecutively in the training corpus.

  • Special Tokens:

    • The [CLS] token is prepended to every sequence; its final representation is assumed to summarize the whole sequence and is used in classification tasks.

    • The [SEP] token separates sentences from each other so that the model can learn sentence boundaries.

    • The [PAD] token is used for padding; [UNK] stands for unknown tokens.

  • BERT with a fine-tuned classification head achieves good performance and is a strong baseline for text classification tasks. Before jumping to generative AI, it is worth validating performance with a strong pretrained BERT model: inference is cheaper and the model can still be fine-tuned to your use case.

  • BERT uses trainable position embeddings, unlike the original Transformer, which uses fixed sine/cosine embeddings.

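A rough sketch of the masked-language-modelling data preparation (the mask token id, ignore index, and 15% masking rate are illustrative assumptions):

def mask_tokens(input_ids, mask_token_id=103, mask_prob=0.15):
    labels = input_ids.clone()
    mask = torch.rand(input_ids.shape) < mask_prob   # pick ~15% of positions
    labels[~mask] = -100                             # ignore unmasked positions in the loss
    masked_ids = input_ids.clone()
    masked_ids[mask] = mask_token_id                 # replace the chosen tokens with [MASK]
    return masked_ids, labels

ids = torch.randint(1000, 30000, (2, 16))
masked_ids, labels = mask_tokens(ids)
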
import math
class BERT(nn.Module):
    
    def __init__(self, vocab_size, d_model, n_layers, n_heads, d_ff, 
                 dropout_rate=0.1,max_len=32):
        super(BERT, self).__init__()
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.d_ff = d_ff
        self.dropout_rate = dropout_rate       
        self.max_len = max_len

        self.token_embedding = nn.Embedding(vocab_size, d_model)
        self.pos_embedding = nn.Embedding(max_len, d_model)

        self.decoder_layer = nn.TransformerDecoderLayer(d_model, n_heads,
                                                 d_ff, dropout_rate)

        self.decoder = nn.TransformerDecoder(self.decoder_layer,num_layers=n_layers)

        self.dropout = nn.Dropout(dropout_rate)
        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        token_embed = self.token_embedding(x)
        positions = torch.arange(0, x.size(1), dtype=torch.long, device=x.device)
        positions = positions.unsqueeze(0).expand_as(x)
        pos_embed = self.pos_embedding(positions)
        segment_embed = token_embed + pos_embed
        x = self.decoder(segment_embed,memory=segment_embed)
        x = self.fc_out(x)
        return x
bert_model = BERT(vocab_size=30000, d_model=512, n_layers=6, n_heads=8, 
                       d_ff=512, dropout_rate=0.1,max_len=32)
def print_weight_sizes(model):
    total_params = sum(p.numel() for p in model.parameters())
    print(f'Total parameters: {total_params}\n')

    for name, param in model.named_parameters():
        param_size = param.numel()
        percentage = 100 * param_size / total_params
        print(f'Layer: {name} | Size: {param.size()} | Number of parameters: {param_size} | Percentage: {percentage:.2f}%')
print_weight_sizes(bert_model)
Total parameters: 49173808

Layer: token_embedding.weight | Size: torch.Size([30000, 512]) | Number of parameters: 15360000 | Percentage: 31.24%
Layer: pos_embedding.weight | Size: torch.Size([32, 512]) | Number of parameters: 16384 | Percentage: 0.03%
Layer: decoder_layer.self_attn.in_proj_weight | Size: torch.Size([1536, 512]) | Number of parameters: 786432 | Percentage: 1.60%
Layer: decoder_layer.self_attn.in_proj_bias | Size: torch.Size([1536]) | Number of parameters: 1536 | Percentage: 0.00%
Layer: decoder_layer.self_attn.out_proj.weight | Size: torch.Size([512, 512]) | Number of parameters: 262144 | Percentage: 0.53%
Layer: decoder_layer.self_attn.out_proj.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder_layer.multihead_attn.in_proj_weight | Size: torch.Size([1536, 512]) | Number of parameters: 786432 | Percentage: 1.60%
Layer: decoder_layer.multihead_attn.in_proj_bias | Size: torch.Size([1536]) | Number of parameters: 1536 | Percentage: 0.00%
Layer: decoder_layer.multihead_attn.out_proj.weight | Size: torch.Size([512, 512]) | Number of parameters: 262144 | Percentage: 0.53%
Layer: decoder_layer.multihead_attn.out_proj.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder_layer.linear1.weight | Size: torch.Size([512, 512]) | Number of parameters: 262144 | Percentage: 0.53%
Layer: decoder_layer.linear1.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder_layer.linear2.weight | Size: torch.Size([512, 512]) | Number of parameters: 262144 | Percentage: 0.53%
Layer: decoder_layer.linear2.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder_layer.norm1.weight | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder_layer.norm1.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder_layer.norm2.weight | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder_layer.norm2.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder_layer.norm3.weight | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder_layer.norm3.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.0.self_attn.in_proj_weight | Size: torch.Size([1536, 512]) | Number of parameters: 786432 | Percentage: 1.60%
Layer: decoder.layers.0.self_attn.in_proj_bias | Size: torch.Size([1536]) | Number of parameters: 1536 | Percentage: 0.00%
Layer: decoder.layers.0.self_attn.out_proj.weight | Size: torch.Size([512, 512]) | Number of parameters: 262144 | Percentage: 0.53%
Layer: decoder.layers.0.self_attn.out_proj.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.0.multihead_attn.in_proj_weight | Size: torch.Size([1536, 512]) | Number of parameters: 786432 | Percentage: 1.60%
Layer: decoder.layers.0.multihead_attn.in_proj_bias | Size: torch.Size([1536]) | Number of parameters: 1536 | Percentage: 0.00%
Layer: decoder.layers.0.multihead_attn.out_proj.weight | Size: torch.Size([512, 512]) | Number of parameters: 262144 | Percentage: 0.53%
Layer: decoder.layers.0.multihead_attn.out_proj.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.0.linear1.weight | Size: torch.Size([512, 512]) | Number of parameters: 262144 | Percentage: 0.53%
Layer: decoder.layers.0.linear1.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.0.linear2.weight | Size: torch.Size([512, 512]) | Number of parameters: 262144 | Percentage: 0.53%
Layer: decoder.layers.0.linear2.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.0.norm1.weight | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.0.norm1.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.0.norm2.weight | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.0.norm2.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.0.norm3.weight | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.0.norm3.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.1.self_attn.in_proj_weight | Size: torch.Size([1536, 512]) | Number of parameters: 786432 | Percentage: 1.60%
Layer: decoder.layers.1.self_attn.in_proj_bias | Size: torch.Size([1536]) | Number of parameters: 1536 | Percentage: 0.00%
Layer: decoder.layers.1.self_attn.out_proj.weight | Size: torch.Size([512, 512]) | Number of parameters: 262144 | Percentage: 0.53%
Layer: decoder.layers.1.self_attn.out_proj.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.1.multihead_attn.in_proj_weight | Size: torch.Size([1536, 512]) | Number of parameters: 786432 | Percentage: 1.60%
Layer: decoder.layers.1.multihead_attn.in_proj_bias | Size: torch.Size([1536]) | Number of parameters: 1536 | Percentage: 0.00%
Layer: decoder.layers.1.multihead_attn.out_proj.weight | Size: torch.Size([512, 512]) | Number of parameters: 262144 | Percentage: 0.53%
Layer: decoder.layers.1.multihead_attn.out_proj.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.1.linear1.weight | Size: torch.Size([512, 512]) | Number of parameters: 262144 | Percentage: 0.53%
Layer: decoder.layers.1.linear1.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.1.linear2.weight | Size: torch.Size([512, 512]) | Number of parameters: 262144 | Percentage: 0.53%
Layer: decoder.layers.1.linear2.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.1.norm1.weight | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.1.norm1.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.1.norm2.weight | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.1.norm2.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.1.norm3.weight | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.1.norm3.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.2.self_attn.in_proj_weight | Size: torch.Size([1536, 512]) | Number of parameters: 786432 | Percentage: 1.60%
Layer: decoder.layers.2.self_attn.in_proj_bias | Size: torch.Size([1536]) | Number of parameters: 1536 | Percentage: 0.00%
Layer: decoder.layers.2.self_attn.out_proj.weight | Size: torch.Size([512, 512]) | Number of parameters: 262144 | Percentage: 0.53%
Layer: decoder.layers.2.self_attn.out_proj.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.2.multihead_attn.in_proj_weight | Size: torch.Size([1536, 512]) | Number of parameters: 786432 | Percentage: 1.60%
Layer: decoder.layers.2.multihead_attn.in_proj_bias | Size: torch.Size([1536]) | Number of parameters: 1536 | Percentage: 0.00%
Layer: decoder.layers.2.multihead_attn.out_proj.weight | Size: torch.Size([512, 512]) | Number of parameters: 262144 | Percentage: 0.53%
Layer: decoder.layers.2.multihead_attn.out_proj.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.2.linear1.weight | Size: torch.Size([512, 512]) | Number of parameters: 262144 | Percentage: 0.53%
Layer: decoder.layers.2.linear1.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.2.linear2.weight | Size: torch.Size([512, 512]) | Number of parameters: 262144 | Percentage: 0.53%
Layer: decoder.layers.2.linear2.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.2.norm1.weight | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.2.norm1.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.2.norm2.weight | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.2.norm2.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.2.norm3.weight | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.2.norm3.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.3.self_attn.in_proj_weight | Size: torch.Size([1536, 512]) | Number of parameters: 786432 | Percentage: 1.60%
Layer: decoder.layers.3.self_attn.in_proj_bias | Size: torch.Size([1536]) | Number of parameters: 1536 | Percentage: 0.00%
Layer: decoder.layers.3.self_attn.out_proj.weight | Size: torch.Size([512, 512]) | Number of parameters: 262144 | Percentage: 0.53%
Layer: decoder.layers.3.self_attn.out_proj.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.3.multihead_attn.in_proj_weight | Size: torch.Size([1536, 512]) | Number of parameters: 786432 | Percentage: 1.60%
Layer: decoder.layers.3.multihead_attn.in_proj_bias | Size: torch.Size([1536]) | Number of parameters: 1536 | Percentage: 0.00%
Layer: decoder.layers.3.multihead_attn.out_proj.weight | Size: torch.Size([512, 512]) | Number of parameters: 262144 | Percentage: 0.53%
Layer: decoder.layers.3.multihead_attn.out_proj.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.3.linear1.weight | Size: torch.Size([512, 512]) | Number of parameters: 262144 | Percentage: 0.53%
Layer: decoder.layers.3.linear1.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.3.linear2.weight | Size: torch.Size([512, 512]) | Number of parameters: 262144 | Percentage: 0.53%
Layer: decoder.layers.3.linear2.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.3.norm1.weight | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.3.norm1.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.3.norm2.weight | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.3.norm2.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.3.norm3.weight | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.3.norm3.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.4.self_attn.in_proj_weight | Size: torch.Size([1536, 512]) | Number of parameters: 786432 | Percentage: 1.60%
Layer: decoder.layers.4.self_attn.in_proj_bias | Size: torch.Size([1536]) | Number of parameters: 1536 | Percentage: 0.00%
Layer: decoder.layers.4.self_attn.out_proj.weight | Size: torch.Size([512, 512]) | Number of parameters: 262144 | Percentage: 0.53%
Layer: decoder.layers.4.self_attn.out_proj.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.4.multihead_attn.in_proj_weight | Size: torch.Size([1536, 512]) | Number of parameters: 786432 | Percentage: 1.60%
Layer: decoder.layers.4.multihead_attn.in_proj_bias | Size: torch.Size([1536]) | Number of parameters: 1536 | Percentage: 0.00%
Layer: decoder.layers.4.multihead_attn.out_proj.weight | Size: torch.Size([512, 512]) | Number of parameters: 262144 | Percentage: 0.53%
Layer: decoder.layers.4.multihead_attn.out_proj.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.4.linear1.weight | Size: torch.Size([512, 512]) | Number of parameters: 262144 | Percentage: 0.53%
Layer: decoder.layers.4.linear1.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.4.linear2.weight | Size: torch.Size([512, 512]) | Number of parameters: 262144 | Percentage: 0.53%
Layer: decoder.layers.4.linear2.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.4.norm1.weight | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.4.norm1.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.4.norm2.weight | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.4.norm2.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.4.norm3.weight | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.4.norm3.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.5.self_attn.in_proj_weight | Size: torch.Size([1536, 512]) | Number of parameters: 786432 | Percentage: 1.60%
Layer: decoder.layers.5.self_attn.in_proj_bias | Size: torch.Size([1536]) | Number of parameters: 1536 | Percentage: 0.00%
Layer: decoder.layers.5.self_attn.out_proj.weight | Size: torch.Size([512, 512]) | Number of parameters: 262144 | Percentage: 0.53%
Layer: decoder.layers.5.self_attn.out_proj.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.5.multihead_attn.in_proj_weight | Size: torch.Size([1536, 512]) | Number of parameters: 786432 | Percentage: 1.60%
Layer: decoder.layers.5.multihead_attn.in_proj_bias | Size: torch.Size([1536]) | Number of parameters: 1536 | Percentage: 0.00%
Layer: decoder.layers.5.multihead_attn.out_proj.weight | Size: torch.Size([512, 512]) | Number of parameters: 262144 | Percentage: 0.53%
Layer: decoder.layers.5.multihead_attn.out_proj.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.5.linear1.weight | Size: torch.Size([512, 512]) | Number of parameters: 262144 | Percentage: 0.53%
Layer: decoder.layers.5.linear1.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.5.linear2.weight | Size: torch.Size([512, 512]) | Number of parameters: 262144 | Percentage: 0.53%
Layer: decoder.layers.5.linear2.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.5.norm1.weight | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.5.norm1.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.5.norm2.weight | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.5.norm2.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.5.norm3.weight | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.5.norm3.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: fc_out.weight | Size: torch.Size([30000, 512]) | Number of parameters: 15360000 | Percentage: 31.24%
Layer: fc_out.bias | Size: torch.Size([30000]) | Number of parameters: 30000 | Percentage: 0.06%
inp = torch.randint(0, 10000, (16,32))
bert_model.forward(inp).size()
torch.Size([16, 32, 30000])

Llama Models#

  • Llama models replace absolute positional embeddings with RoPE (rotary position embeddings): RoPE injects position information directly into the attention mechanism by rotating the query and key vectors, so the model captures relative rather than absolute positions and handles long-range dependencies more efficiently (a simplified sketch follows this list).

  • Llama uses pre-normalization (RMSNorm applied to the input of each sub-layer), unlike the post-norm arrangement of the original Transformer.

  • Custom tokenizer with support for tokens in other languages.

  • Large context length: the recent Llama 3.1 release extends the context window to 128k tokens.
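
A compact sketch of the rotary idea, assuming an even head dimension (this is a simplified version, not the exact Llama implementation). It is applied to the queries and keys before the dot product, so the attention score depends on the relative position between tokens.

def rope(x):
    # x: batch_size x n_heads x seq_len x head_dim
    _, _, seq_len, dim = x.shape
    inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))   # (dim/2,)
    angles = torch.arange(seq_len).float()[:, None] * inv_freq[None, :]   # seq_len x dim/2
    sin, cos = angles.sin(), angles.cos()
    x1, x2 = x[..., 0::2], x[..., 1::2]
    # rotate every (even, odd) pair of dimensions by a position-dependent angle
    rotated = torch.stack([x1 * cos - x2 * sin, x1 * sin + x2 * cos], dim=-1)
    return rotated.flatten(-2)

q = torch.randn(2, 8, 16, 64)
rope(q).shape   # torch.Size([2, 8, 16, 64])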