Large Language Models#
Attention Is All You Need#
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
The paper revolutionized NLP, leading to significant advancements in machine translation, text summarization, and other tasks.
The architecture enables parallel processing of tokens, significantly improving training and inference speed by fully utilizing GPU resources.
Long sequences can be handled without loss of information.
The model contains three important blocks:
Multi-head Attention
Positional Embeddings
Feed-forward Network
Scaled Dot Product Attention#
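Concretely, each attention head computes Attention(Q, K, V) = softmax(Q Kᵀ / √d_k) V, where d_k is the per-head key dimension; dividing by √d_k keeps the dot products from growing with the dimension and saturating the softmax. The module below implements this, with dropout applied to the attention weights.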
# query: batch_size x n_heads x query_len x d_k
# key:   batch_size x n_heads x key_len x d_k
# value: batch_size x n_heads x key_len x d_v
class ScaledDotProductAttention(nn.Module):
def __init__(self, norm,dropout=0.1):
super(ScaledDotProductAttention, self).__init__()
self.norm = norm
self.dropout = nn.Dropout(dropout)
def forward(self, query, key, value, mask=None):
scores = torch.matmul(query, key.transpose(-2,-1))
scores = scores / self.norm
if mask is not None:
scores = scores.masked_fill(mask==0, float('-inf'))
attention_probs = F.softmax(scores,dim=-1)
output = torch.matmul(self.dropout(attention_probs), value)
return output, attention_probs
sdp = ScaledDotProductAttention(10)
output, attns = sdp(torch.randn((64,8,256,512)), torch.randn((64,8,256,512)), torch.randn((64,8,256,512)))
output.shape
torch.Size([64, 8, 256, 512])
attns.shape
torch.Size([64, 8, 256, 256])
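The optional mask argument sets masked positions to -inf before the softmax so they receive zero attention weight. For example, a causal (lower-triangular) mask can be passed to the module defined above; a small sketch:
q = k = v = torch.randn(2, 8, 16, 64)                               # batch x heads x seq_len x d_k
causal = torch.tril(torch.ones(16, 16)).unsqueeze(0).unsqueeze(0)   # 1 x 1 x seq x seq; zeros above the diagonal
out, attn = sdp(q, k, v, mask=causal)
attn[0, 0, 0, 1:].sum()   # position 0 can only attend to itself, so this sums to ~0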
Multihead Attention#
class MultiHeadAttention(nn.Module):
def __init__(self, d_model, n_heads,dropout=0.1):
super(MultiHeadAttention, self).__init__()
assert d_model % n_heads == 0, "`d_model` should be a multiple of `n_heads`"
self.dropout = nn.Dropout(dropout)
self.d_model = d_model
self.n_heads = n_heads
self.d_k = int(d_model / n_heads)
self.W_q = nn.Linear(d_model, d_model, bias=False)
self.W_k = nn.Linear(d_model, d_model, bias=False)
self.W_v = nn.Linear(d_model, d_model, bias=False)
self.W_o = nn.Linear(d_model, d_model)
self.attention = ScaledDotProductAttention(np.sqrt(self.d_k))
    def forward(self, q, k, v, mask=None):
        """
        q: batch_size x query_len x d_model
        k: batch_size x key_len x d_model
        v: batch_size x key_len x d_model
        mask: batch_size x 1 x 1 x key_len (padding mask)
              or batch_size x 1 x tgt_seq_len x tgt_seq_len (causal mask)
        """
        # project the inputs, then split d_model into n_heads heads of size d_k
        Q = self.W_q(q).view(q.size(0), -1, self.n_heads, self.d_k).transpose(1,2) # batch_size x n_heads x query_len x d_k
        K = self.W_k(k).view(k.size(0), -1, self.n_heads, self.d_k).transpose(1,2)
        V = self.W_v(v).view(v.size(0), -1, self.n_heads, self.d_k).transpose(1,2)
        # calc attention
        x, attn = self.attention(Q, K, V, mask)
# regroup
x = x.transpose(1,2).contiguous().view(x.size(0), -1, self.n_heads * self.d_k)
x = self.W_o(x)
return x, attn
mha = MultiHeadAttention(d_model=512,n_heads=8)
output, attns = mha(torch.randn((64,256,512)), torch.randn((64,256,512)), torch.randn((64,256,512)))
output.size()
torch.Size([64, 256, 512])
attns.size()
torch.Size([64, 8, 256, 256])
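The query and the key/value sequences do not need to have the same length, which is what the decoder's cross-attention relies on; a quick check with the module above:
dec_states = torch.randn(4, 10, 512)   # queries come from the decoder side
enc_states = torch.randn(4, 20, 512)   # keys/values come from the encoder side
out, attn = mha(dec_states, enc_states, enc_states)
out.shape    # torch.Size([4, 10, 512])
attn.shape   # torch.Size([4, 8, 10, 20])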
Feed-Forward Network#
class PositionwiseFeedForward(nn.Module):
def __init__(self, d_model, d_ff, dropout_rate=0.1):
super(PositionwiseFeedForward, self).__init__()
self.d_model = d_model
self.d_ff = d_ff
self.dropout_rate = dropout_rate
self.w1 = nn.Linear(d_model, d_ff)
self.w2 = nn.Linear(d_ff, d_model)
self.dropout = nn.Dropout(dropout_rate)
def forward(self,x):
        x = self.dropout(F.relu(self.w1(x)))
        x = self.w2(x)
return x
# batch_size, seq_len, d_model
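The feed-forward block is applied independently at every position and preserves the input shape; a quick check:
ffn = PositionwiseFeedForward(d_model=512, d_ff=2048)
ffn(torch.randn(64, 256, 512)).shape   # torch.Size([64, 256, 512])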
Positional Embeddings#
import math
class PositionalEncoding(nn.Module):
def __init__(self, d_model, dropout_rate=0.1,maxlen=10000):
super(PositionalEncoding, self).__init__()
self.d_model = d_model
self.dropout = nn.Dropout(dropout_rate)
self.maxlen = maxlen
pe = torch.zeros(maxlen, d_model)
position = torch.arange(0, maxlen).unsqueeze(1)
div_term = torch.exp(
torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
) # (d_model,)
pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)  # 1 x maxlen x d_model, so it broadcasts over the batch dimension
        # make these static (not trained, but saved and moved with the module)
        self.register_buffer('pe', pe)
    def forward(self, x):
        # x: batch_size x seq_len x d_model
        x = x + self.pe[:, :x.size(1), :]
        x = self.dropout(x)
return x
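The encoding simply adds a (1, seq_len, d_model) slice of the precomputed table to the input, so the shape is unchanged; a quick check:
pos_enc = PositionalEncoding(d_model=512)
pos_enc(torch.randn(64, 256, 512)).shape   # torch.Size([64, 256, 512])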
class EncoderLayer(nn.Module):
def __init__(self, d_model, n_heads, d_ff, dropout_rate=0.1):
super(EncoderLayer, self).__init__()
self.d_model = d_model
self.n_heads = n_heads
self.d_ff = d_ff
self.dropout_rate = dropout_rate
self.attention_layer = MultiHeadAttention(d_model, n_heads, dropout_rate)
self.attention_layer_norm = nn.LayerNorm(d_model, eps=1e-6)
        self.ff_layer = PositionwiseFeedForward(d_model, d_ff, dropout_rate)
self.ff_layer_norm = nn.LayerNorm(d_model, eps=1e-6)
self.dropout = nn.Dropout(dropout_rate)
    def forward(self, x, mask):
        x1, _ = self.attention_layer(x, x, x, mask)
        x = self.attention_layer_norm(x + self.dropout(x1))
        x1 = self.ff_layer(x)
        x = self.ff_layer_norm(x + self.dropout(x1))
return x
class Encoder(nn.Module):
def __init__(self, vocab_size, d_model, n_layers, n_heads, d_ff, dropout_rate=0.1,maxlen=10000):
super(Encoder, self).__init__()
self.vocab_size = vocab_size
self.d_model = d_model
self.n_layers = n_layers
self.n_heads = n_heads
self.d_ff = d_ff
self.dropout_rate = dropout_rate
self.maxlen = maxlen
        self.tok_embedding = nn.Embedding(vocab_size, d_model)
self.pos_embedding = PositionalEncoding(d_model, dropout_rate=dropout_rate,maxlen=maxlen)
self.layers = nn.ModuleList([
EncoderLayer(d_model, n_heads, d_ff, dropout_rate)
for _ in range(n_layers)
])
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
def forward(self, x, mask):
# x : batch_size x seq_len
x = self.tok_embedding(x)
x = self.pos_embedding(x)
for layer in self.layers:
x = layer(x, mask)
x = self.layer_norm(x)
return x
# batch_size x seq_len x d_model
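A small sanity check of the encoder stack, assuming the classes above: random token ids plus an all-ones padding mask of shape batch_size x 1 x 1 x seq_len (1 means attend, 0 means masked out):
encoder = Encoder(vocab_size=1000, d_model=512, n_layers=2, n_heads=8, d_ff=2048)
tokens = torch.randint(0, 1000, (4, 20))   # batch of token ids
pad_mask = torch.ones(4, 1, 1, 20)         # no padding in this toy batch
encoder(tokens, pad_mask).shape            # torch.Size([4, 20, 512])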
class DecoderLayer(nn.Module):
def __init__(self, d_model, n_heads, d_ff, dropout_rate=0.1):
super(DecoderLayer, self).__init__()
self.d_model = d_model
self.n_heads = n_heads
self.d_ff = d_ff
self.dropout_rate = dropout_rate
self.attn_layer = MultiHeadAttention(d_model, n_heads, dropout_rate)
self.attn_layer_norm = nn.LayerNorm(d_model, eps=1e-6)
self.ff_layer = PositionwiseFeedForward(d_model, d_ff, dropout_rate)
self.ff_layer_norm = nn.LayerNorm(d_model, eps=1e-6)
self.dropout = nn.Dropout(dropout_rate)
self.encoder_attn_layer = MultiHeadAttention(d_model, n_heads, dropout_rate)
self.encoder_attn_layer_norm = nn.LayerNorm(d_model, eps=1e-6)
    def forward(self, x, encoder_op, src_mask, tgt_mask):
        # masked self-attention over the target sequence
        x1, _ = self.attn_layer(x, x, x, tgt_mask)
        x = self.attn_layer_norm(x + self.dropout(x1))
        # cross-attention over the encoder output
        x1, attn = self.encoder_attn_layer(x, encoder_op, encoder_op, src_mask)
        x = self.encoder_attn_layer_norm(x + self.dropout(x1))
        x1 = self.ff_layer(x)
        x = self.ff_layer_norm(x + self.dropout(x1))
return x, attn
class Decoder(nn.Module):
def __init__(self, vocab_size, d_model, n_layers, n_heads, d_ff, dropout_rate=0.1,maxlen=10000):
        super(Decoder, self).__init__()
self.vocab_size = vocab_size
self.d_model = d_model
self.n_layers = n_layers
self.n_heads = n_heads
self.d_ff = d_ff
self.dropout_rate = dropout_rate
self.maxlen = maxlen
        self.tok_embedding = nn.Embedding(vocab_size, d_model)
self.pos_embedding = PositionalEncoding(d_model, dropout_rate=dropout_rate,maxlen=maxlen)
self.layers = nn.ModuleList([
DecoderLayer(d_model, n_heads, d_ff, dropout_rate)
for _ in range(n_layers)
])
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
def forward(self, x, enc_output, src_mask, tgt_mask):
# x : batch_size x seq_len
x = self.tok_embedding(x)
x = self.pos_embedding(x)
        for layer in self.layers:
            x, attn = layer(x, enc_output, src_mask, tgt_mask)
        x = self.layer_norm(x)
        return x, attn
# batch_size x seq_len x d_model
class Transformer(nn.Module):
    def __init__(self, encoder, decoder, linear_mapper):
        super(Transformer, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.linear_mapper = linear_mapper
    def forward(self, src, tgt, src_mask=None, tgt_mask=None):
        enc_output = self.encoder(src, src_mask)
        dec_output, attn = self.decoder(tgt, enc_output, src_mask, tgt_mask)
        output = self.linear_mapper(dec_output)
        return output, attn
class LinearMapper(nn.Module):
def __init__(self, d_model, vocab_size):
super(LinearMapper, self).__init__()
self.affine_map = nn.Linear(d_model, vocab_size)
def forward(self,x):
x = self.affine_map(x)
output = F.log_softmax(x, dim=-1)
return output
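Wiring the pieces together; a minimal sketch assuming the classes above, with an all-ones source mask (no padding) and a lower-triangular target mask to enforce causality:
enc = Encoder(vocab_size=1000, d_model=512, n_layers=2, n_heads=8, d_ff=2048)
dec = Decoder(vocab_size=1000, d_model=512, n_layers=2, n_heads=8, d_ff=2048)
model = Transformer(enc, dec, LinearMapper(d_model=512, vocab_size=1000))
src = torch.randint(0, 1000, (4, 20))
tgt = torch.randint(0, 1000, (4, 16))
src_mask = torch.ones(4, 1, 1, 20)                                   # padding mask (all ones here)
tgt_mask = torch.tril(torch.ones(16, 16)).unsqueeze(0).unsqueeze(0)  # causal mask
out, attn = model(src, tgt, src_mask, tgt_mask)
out.shape   # torch.Size([4, 16, 1000]), log-probabilities over the vocabulary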
transformer = nn.Transformer(d_model=512, nhead=8,
num_encoder_layers=6, num_decoder_layers=6,
dim_feedforward=2048, dropout=0.1)
transformer
Transformer(
(encoder): TransformerEncoder(
(layers): ModuleList(
(0-5): 6 x TransformerEncoderLayer(
(self_attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(linear1): Linear(in_features=512, out_features=2048, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(linear2): Linear(in_features=2048, out_features=512, bias=True)
(norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(dropout1): Dropout(p=0.1, inplace=False)
(dropout2): Dropout(p=0.1, inplace=False)
)
)
(norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
(decoder): TransformerDecoder(
(layers): ModuleList(
(0-5): 6 x TransformerDecoderLayer(
(self_attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(multihead_attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(linear1): Linear(in_features=512, out_features=2048, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(linear2): Linear(in_features=2048, out_features=512, bias=True)
(norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(norm3): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(dropout1): Dropout(p=0.1, inplace=False)
(dropout2): Dropout(p=0.1, inplace=False)
(dropout3): Dropout(p=0.1, inplace=False)
)
)
(norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
)
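The built-in module is only instantiated here; with the default batch_first=False it expects inputs shaped seq_len x batch x d_model, so a minimal forward pass looks like this:
src = torch.rand(10, 32, 512)   # source: seq_len x batch x d_model
tgt = torch.rand(20, 32, 512)   # target: seq_len x batch x d_model
transformer(src, tgt).shape     # torch.Size([20, 32, 512])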
Fine-tuning a Decoder-Only Transformer#
from tokenizers import normalizers
from datasets import load_dataset
ds = load_dataset("SuryaKrishna02/aya-telugu-poems")
def generate_data(x):
for each_s in x:
yield each_s['inputs']
g = generate_data(ds['train'])
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers, processors
tokenizer = Tokenizer(models.BPE())
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
trainer = trainers.BpeTrainer(vocab_size=1000, special_tokens=["<s>", "<pad>", "</s>", "<unk>"])
tokenizer.train_from_iterator(g, trainer)
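A quick look at what the trained tokenizer produces (the exact pieces depend on the learned merges):
print(tokenizer.get_vocab_size())       # close to the requested 1000
sample = ds['train'][0]['inputs']       # one sentence from the training corpus
encoded = tokenizer.encode(sample)
print(encoded.tokens[:10], encoded.ids[:10])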
class PositionalEncoding(nn.Module):
    def __init__(self, embed_size, max_len):
        super(PositionalEncoding, self).__init__()
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embed_size, 2) * (-math.log(10000.0) / embed_size))
        pe = torch.zeros(1, max_len, embed_size)
        pe[0, :, 0::2] = torch.sin(position * div_term)
        pe[0, :, 1::2] = torch.cos(position * div_term)
        # buffer: not trained, but saved and moved with the module
        self.register_buffer('pe', pe)
    def forward(self, x):
        # x: batch_size x seq_len x embed_size
        x = x + self.pe[:, :x.size(1), :]
        return x
import math
class DecoderOnlyTransformer(nn.Module):
def __init__(self, vocab_size, d_model, n_layers, n_heads, d_ff,
dropout_rate=0.1,max_len=32):
super(DecoderOnlyTransformer, self).__init__()
self.vocab_size = vocab_size
self.d_model = d_model
self.n_layers = n_layers
self.n_heads = n_heads
self.d_ff = d_ff
self.dropout_rate = dropout_rate
self.max_len = max_len
self.position_embedding = PositionalEncoding(d_model, max_len)
self.tok_embedding = nn.Embedding(vocab_size, d_model)
        decoder_layer = nn.TransformerDecoderLayer(d_model=d_model,
                                                   nhead=n_heads,
                                                   dim_feedforward=d_ff,
                                                   dropout=dropout_rate,
                                                   batch_first=True)
self.transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers=n_layers)
self.fc_out = nn.Linear(d_model, vocab_size)
    def forward(self, x):
        x = self.tok_embedding(x)
        x = self.position_embedding(x)
        # causal mask so each position only attends to earlier positions
        seq_len = x.size(1)
        causal_mask = torch.triu(torch.full((seq_len, seq_len), float('-inf'), device=x.device), diagonal=1)
        # decoder-only: the sequence attends to itself, so the "memory" is the input itself
        x = self.transformer_decoder(x, memory=x, tgt_mask=causal_mask, memory_mask=causal_mask)
        logits = self.fc_out(x)
        return logits
vocab_size = 10000
embed_size = 512
num_heads = 8
num_layers = 6
hidden_dim = 2048
max_len = 32
g = generate_data(ds['train'])
def tokenize_data(corpus, tokenizer, max_len=32):
tokenized_corpus = [tokenizer.encode(sentence).ids for sentence in corpus]
tokenized_corpus = [x[0:max_len] for x in tokenized_corpus]
return tokenized_corpus
tokenized_corpus = tokenize_data(g, tokenizer)
len(tokenized_corpus)
5115
batch_size = 10
sequence_length = 32
input_sequences = torch.zeros((batch_size, sequence_length), dtype=torch.long)
input_sequences.size()
torch.Size([10, 32])
for i in range(batch_size):
input_sequences[i, :len(tokenized_corpus[i])] = torch.tensor(tokenized_corpus[i])
input_sequences.size()
torch.Size([10, 32])
transformer = DecoderOnlyTransformer( vocab_size=30000, d_model=512, n_layers=6, n_heads=8,
d_ff=512, dropout_rate=0.1,max_len=32)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(transformer.parameters(), lr=0.001)
num_epochs = 10
for epoch in range(num_epochs):
transformer.train()
optimizer.zero_grad()
# Forward pass
logits = transformer(input_sequences)
# Shift input by 1 token to compute next token prediction loss
target_sequences = input_sequences[:, 1:]
logits = logits[:, :-1, :].reshape(-1, 30000) # Reshape for loss computation
target_sequences = target_sequences.reshape(-1)
# Compute loss
loss = criterion(logits, target_sequences)
# Backward pass and optimization
loss.backward()
optimizer.step()
print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")
Epoch 1/10, Loss: 10.540102005004883
Epoch 2/10, Loss: 8.720990180969238
Epoch 3/10, Loss: 7.641439437866211
Epoch 4/10, Loss: 6.5575408935546875
Epoch 5/10, Loss: 5.865268230438232
Epoch 6/10, Loss: 5.230606555938721
Epoch 7/10, Loss: 4.630501747131348
Epoch 8/10, Loss: 4.067417144775391
Epoch 9/10, Loss: 3.589400053024292
Epoch 10/10, Loss: 3.262800455093384
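The trained model can now generate text with a simple greedy loop. The generate helper below is a hypothetical sketch; sequences must stay within max_len=32 because the positional table is fixed, and ids above the tokenizer's 1000-token vocabulary (the output head was sized 30000) cannot be decoded:
def generate(model, prompt_ids, max_new_tokens=10):
    # greedy decoding: repeatedly append the most likely next token
    model.eval()
    ids = torch.tensor(prompt_ids, dtype=torch.long).unsqueeze(0)
    with torch.no_grad():
        for _ in range(max_new_tokens):
            logits = model(ids)
            next_id = logits[:, -1, :].argmax(dim=-1, keepdim=True)
            ids = torch.cat([ids, next_id], dim=1)
    return ids.squeeze(0).tolist()
generated = generate(transformer, tokenized_corpus[0][:5])
print(generated)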
LoRA Fine-tuning#
from transformers import GPT2Tokenizer, GPT2LMHeadModel
# Load pre-trained GPT-2 model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
from peft import LoraConfig, get_peft_model
lora_config = LoraConfig(
r=8,
lora_alpha=32,
target_modules=["c_attn"],
lora_dropout=0.1,
    bias="none",
task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)
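It is worth verifying how few parameters LoRA actually trains; PEFT provides a helper that prints trainable versus total parameter counts:
model.print_trainable_parameters()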
from datasets import load_dataset
# Load a dataset (e.g., WikiText)
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")
# Tokenize the dataset
def tokenize_function(examples):
return tokenizer(examples["text"], return_tensors="pt", padding=True, truncation=True,max_length=512)
tokenizer.pad_token = tokenizer.eos_token
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
def tokenize_and_prepare_labels(examples):
# Tokenize the input text
tokenized_inputs = tokenizer(examples['text'], return_tensors="pt", padding=True, truncation=True)
# GPT-2 uses the input as the label, so we clone the input_ids
tokenized_inputs["labels"] = tokenized_inputs["input_ids"].clone()
return tokenized_inputs
# Apply the tokenization and labeling function
tokenized_dataset = dataset.map(tokenize_and_prepare_labels, batched=True, remove_columns='text')
len(tokenized_dataset['train'][1]['input_ids'])
473
tokenized_dataset['train']
Dataset({
features: ['input_ids', 'attention_mask', 'labels'],
num_rows: 36718
})
from transformers import Trainer, TrainingArguments
# Define training arguments
training_args = TrainingArguments(
output_dir="./results",
num_train_epochs=3,
logging_dir="./logs",
logging_steps=10,
save_steps=500,
learning_rate=5e-5,
evaluation_strategy="steps",
save_total_limit=2,
per_device_train_batch_size =1,
per_device_eval_batch_size =1
)
# Initialize the trainer
trainer = Trainer(
model=model,
args=training_args,
train_dataset=tokenized_dataset["train"],
eval_dataset=tokenized_dataset["validation"],
)
# Fine-tune the model
# trainer.train()
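After training, only the small adapter weights need to be saved; a sketch with an example path, kept commented out like the training call above:
# model.save_pretrained("./gpt2-lora-adapter")   # example path; stores only the LoRA adapter weights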
BERT Models#
Language models trained in an unsupervised manner to learn good contextualized representations.
BERT contains only stacks of encoder layers (multi-head attention, feed-forward blocks, token embeddings, position embeddings).
Training is done using two objectives:
Masked Language Modelling: randomly masks some tokens in the input sequence and predicts them.
Next Sentence Prediction: predicts whether a pair of sentences appear consecutively in the training corpus.
Special Tokens:
[CLS] is prepended to every sequence; its final hidden state is treated as a summary of the whole sequence and is useful for classification tasks.
[SEP] separates sentences from each other so that the model can distinguish between them.
[PAD] is used for padding, and [UNK] stands in for unknown tokens.
BERT plus a fine-tuned classification head achieves strong performance and is a solid baseline for text classification. Before jumping to generative models, it is worth validating performance with a strong pretrained BERT model: inference is cheaper and the model can still be fine-tuned to your use case.
BERT uses trainable position embeddings, unlike the original Transformer, which uses fixed sin/cos embeddings.
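As a quick illustration of the masked-language-modelling objective, a pretrained BERT can fill in a [MASK] token via the transformers pipeline API (model download required):
from transformers import pipeline
unmasker = pipeline("fill-mask", model="bert-base-uncased")
unmasker("Paris is the [MASK] of France.")   # the top predictions should include "capital"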
import math
class BERT(nn.Module):
def __init__(self, vocab_size, d_model, n_layers, n_heads, d_ff,
dropout_rate=0.1,max_len=32):
super(BERT, self).__init__()
self.vocab_size = vocab_size
self.d_model = d_model
self.n_layers = n_layers
self.n_heads = n_heads
self.d_ff = d_ff
self.dropout_rate = dropout_rate
self.max_len = max_len
self.token_embedding = nn.Embedding(vocab_size, d_model)
self.pos_embedding = nn.Embedding(max_len, d_model)
        # note: BERT is encoder-only; nn.TransformerDecoderLayer is used here as a
        # stand-in and adds an extra cross-attention block per layer
        self.decoder_layer = nn.TransformerDecoderLayer(d_model, n_heads,
                                                        d_ff, dropout_rate,
                                                        batch_first=True)
        self.decoder = nn.TransformerDecoder(self.decoder_layer, num_layers=n_layers)
self.dropout = nn.Dropout(dropout_rate)
self.fc_out = nn.Linear(d_model, vocab_size)
def forward(self, x):
token_embed = self.token_embedding(x)
        positions = torch.arange(0, self.max_len, dtype=torch.long, device=x.device)
positions = positions.unsqueeze(0).expand_as(x)
pos_embed = self.pos_embedding(positions)
        embeddings = token_embed + pos_embed  # real BERT also adds segment embeddings here
        x = self.decoder(embeddings, memory=embeddings)
x = self.fc_out(x)
return x
bert_model = BERT(vocab_size=30000, d_model=512, n_layers=6, n_heads=8,
d_ff=512, dropout_rate=0.1,max_len=32)
def print_weight_sizes(model):
total_params = sum(p.numel() for p in model.parameters())
print(f'Total parameters: {total_params}\n')
for name, param in model.named_parameters():
param_size = param.numel()
percentage = 100 * param_size / total_params
print(f'Layer: {name} | Size: {param.size()} | Number of parameters: {param_size} | Percentage: {percentage:.2f}%')
print_weight_sizes(bert_model)
Total parameters: 49173808
Layer: token_embedding.weight | Size: torch.Size([30000, 512]) | Number of parameters: 15360000 | Percentage: 31.24%
Layer: pos_embedding.weight | Size: torch.Size([32, 512]) | Number of parameters: 16384 | Percentage: 0.03%
Layer: decoder_layer.self_attn.in_proj_weight | Size: torch.Size([1536, 512]) | Number of parameters: 786432 | Percentage: 1.60%
Layer: decoder_layer.self_attn.in_proj_bias | Size: torch.Size([1536]) | Number of parameters: 1536 | Percentage: 0.00%
Layer: decoder_layer.self_attn.out_proj.weight | Size: torch.Size([512, 512]) | Number of parameters: 262144 | Percentage: 0.53%
Layer: decoder_layer.self_attn.out_proj.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder_layer.multihead_attn.in_proj_weight | Size: torch.Size([1536, 512]) | Number of parameters: 786432 | Percentage: 1.60%
Layer: decoder_layer.multihead_attn.in_proj_bias | Size: torch.Size([1536]) | Number of parameters: 1536 | Percentage: 0.00%
Layer: decoder_layer.multihead_attn.out_proj.weight | Size: torch.Size([512, 512]) | Number of parameters: 262144 | Percentage: 0.53%
Layer: decoder_layer.multihead_attn.out_proj.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder_layer.linear1.weight | Size: torch.Size([512, 512]) | Number of parameters: 262144 | Percentage: 0.53%
Layer: decoder_layer.linear1.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder_layer.linear2.weight | Size: torch.Size([512, 512]) | Number of parameters: 262144 | Percentage: 0.53%
Layer: decoder_layer.linear2.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder_layer.norm1.weight | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder_layer.norm1.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder_layer.norm2.weight | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder_layer.norm2.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder_layer.norm3.weight | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder_layer.norm3.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.0.self_attn.in_proj_weight | Size: torch.Size([1536, 512]) | Number of parameters: 786432 | Percentage: 1.60%
Layer: decoder.layers.0.self_attn.in_proj_bias | Size: torch.Size([1536]) | Number of parameters: 1536 | Percentage: 0.00%
Layer: decoder.layers.0.self_attn.out_proj.weight | Size: torch.Size([512, 512]) | Number of parameters: 262144 | Percentage: 0.53%
Layer: decoder.layers.0.self_attn.out_proj.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.0.multihead_attn.in_proj_weight | Size: torch.Size([1536, 512]) | Number of parameters: 786432 | Percentage: 1.60%
Layer: decoder.layers.0.multihead_attn.in_proj_bias | Size: torch.Size([1536]) | Number of parameters: 1536 | Percentage: 0.00%
Layer: decoder.layers.0.multihead_attn.out_proj.weight | Size: torch.Size([512, 512]) | Number of parameters: 262144 | Percentage: 0.53%
Layer: decoder.layers.0.multihead_attn.out_proj.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.0.linear1.weight | Size: torch.Size([512, 512]) | Number of parameters: 262144 | Percentage: 0.53%
Layer: decoder.layers.0.linear1.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.0.linear2.weight | Size: torch.Size([512, 512]) | Number of parameters: 262144 | Percentage: 0.53%
Layer: decoder.layers.0.linear2.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.0.norm1.weight | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.0.norm1.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.0.norm2.weight | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.0.norm2.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.0.norm3.weight | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.0.norm3.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.1.self_attn.in_proj_weight | Size: torch.Size([1536, 512]) | Number of parameters: 786432 | Percentage: 1.60%
Layer: decoder.layers.1.self_attn.in_proj_bias | Size: torch.Size([1536]) | Number of parameters: 1536 | Percentage: 0.00%
Layer: decoder.layers.1.self_attn.out_proj.weight | Size: torch.Size([512, 512]) | Number of parameters: 262144 | Percentage: 0.53%
Layer: decoder.layers.1.self_attn.out_proj.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.1.multihead_attn.in_proj_weight | Size: torch.Size([1536, 512]) | Number of parameters: 786432 | Percentage: 1.60%
Layer: decoder.layers.1.multihead_attn.in_proj_bias | Size: torch.Size([1536]) | Number of parameters: 1536 | Percentage: 0.00%
Layer: decoder.layers.1.multihead_attn.out_proj.weight | Size: torch.Size([512, 512]) | Number of parameters: 262144 | Percentage: 0.53%
Layer: decoder.layers.1.multihead_attn.out_proj.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.1.linear1.weight | Size: torch.Size([512, 512]) | Number of parameters: 262144 | Percentage: 0.53%
Layer: decoder.layers.1.linear1.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.1.linear2.weight | Size: torch.Size([512, 512]) | Number of parameters: 262144 | Percentage: 0.53%
Layer: decoder.layers.1.linear2.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.1.norm1.weight | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.1.norm1.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.1.norm2.weight | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.1.norm2.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.1.norm3.weight | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.1.norm3.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.2.self_attn.in_proj_weight | Size: torch.Size([1536, 512]) | Number of parameters: 786432 | Percentage: 1.60%
Layer: decoder.layers.2.self_attn.in_proj_bias | Size: torch.Size([1536]) | Number of parameters: 1536 | Percentage: 0.00%
Layer: decoder.layers.2.self_attn.out_proj.weight | Size: torch.Size([512, 512]) | Number of parameters: 262144 | Percentage: 0.53%
Layer: decoder.layers.2.self_attn.out_proj.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.2.multihead_attn.in_proj_weight | Size: torch.Size([1536, 512]) | Number of parameters: 786432 | Percentage: 1.60%
Layer: decoder.layers.2.multihead_attn.in_proj_bias | Size: torch.Size([1536]) | Number of parameters: 1536 | Percentage: 0.00%
Layer: decoder.layers.2.multihead_attn.out_proj.weight | Size: torch.Size([512, 512]) | Number of parameters: 262144 | Percentage: 0.53%
Layer: decoder.layers.2.multihead_attn.out_proj.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.2.linear1.weight | Size: torch.Size([512, 512]) | Number of parameters: 262144 | Percentage: 0.53%
Layer: decoder.layers.2.linear1.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.2.linear2.weight | Size: torch.Size([512, 512]) | Number of parameters: 262144 | Percentage: 0.53%
Layer: decoder.layers.2.linear2.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.2.norm1.weight | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.2.norm1.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.2.norm2.weight | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.2.norm2.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.2.norm3.weight | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.2.norm3.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.3.self_attn.in_proj_weight | Size: torch.Size([1536, 512]) | Number of parameters: 786432 | Percentage: 1.60%
Layer: decoder.layers.3.self_attn.in_proj_bias | Size: torch.Size([1536]) | Number of parameters: 1536 | Percentage: 0.00%
Layer: decoder.layers.3.self_attn.out_proj.weight | Size: torch.Size([512, 512]) | Number of parameters: 262144 | Percentage: 0.53%
Layer: decoder.layers.3.self_attn.out_proj.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.3.multihead_attn.in_proj_weight | Size: torch.Size([1536, 512]) | Number of parameters: 786432 | Percentage: 1.60%
Layer: decoder.layers.3.multihead_attn.in_proj_bias | Size: torch.Size([1536]) | Number of parameters: 1536 | Percentage: 0.00%
Layer: decoder.layers.3.multihead_attn.out_proj.weight | Size: torch.Size([512, 512]) | Number of parameters: 262144 | Percentage: 0.53%
Layer: decoder.layers.3.multihead_attn.out_proj.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.3.linear1.weight | Size: torch.Size([512, 512]) | Number of parameters: 262144 | Percentage: 0.53%
Layer: decoder.layers.3.linear1.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.3.linear2.weight | Size: torch.Size([512, 512]) | Number of parameters: 262144 | Percentage: 0.53%
Layer: decoder.layers.3.linear2.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.3.norm1.weight | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.3.norm1.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.3.norm2.weight | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.3.norm2.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.3.norm3.weight | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.3.norm3.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.4.self_attn.in_proj_weight | Size: torch.Size([1536, 512]) | Number of parameters: 786432 | Percentage: 1.60%
Layer: decoder.layers.4.self_attn.in_proj_bias | Size: torch.Size([1536]) | Number of parameters: 1536 | Percentage: 0.00%
Layer: decoder.layers.4.self_attn.out_proj.weight | Size: torch.Size([512, 512]) | Number of parameters: 262144 | Percentage: 0.53%
Layer: decoder.layers.4.self_attn.out_proj.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.4.multihead_attn.in_proj_weight | Size: torch.Size([1536, 512]) | Number of parameters: 786432 | Percentage: 1.60%
Layer: decoder.layers.4.multihead_attn.in_proj_bias | Size: torch.Size([1536]) | Number of parameters: 1536 | Percentage: 0.00%
Layer: decoder.layers.4.multihead_attn.out_proj.weight | Size: torch.Size([512, 512]) | Number of parameters: 262144 | Percentage: 0.53%
Layer: decoder.layers.4.multihead_attn.out_proj.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.4.linear1.weight | Size: torch.Size([512, 512]) | Number of parameters: 262144 | Percentage: 0.53%
Layer: decoder.layers.4.linear1.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.4.linear2.weight | Size: torch.Size([512, 512]) | Number of parameters: 262144 | Percentage: 0.53%
Layer: decoder.layers.4.linear2.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.4.norm1.weight | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.4.norm1.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.4.norm2.weight | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.4.norm2.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.4.norm3.weight | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.4.norm3.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.5.self_attn.in_proj_weight | Size: torch.Size([1536, 512]) | Number of parameters: 786432 | Percentage: 1.60%
Layer: decoder.layers.5.self_attn.in_proj_bias | Size: torch.Size([1536]) | Number of parameters: 1536 | Percentage: 0.00%
Layer: decoder.layers.5.self_attn.out_proj.weight | Size: torch.Size([512, 512]) | Number of parameters: 262144 | Percentage: 0.53%
Layer: decoder.layers.5.self_attn.out_proj.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.5.multihead_attn.in_proj_weight | Size: torch.Size([1536, 512]) | Number of parameters: 786432 | Percentage: 1.60%
Layer: decoder.layers.5.multihead_attn.in_proj_bias | Size: torch.Size([1536]) | Number of parameters: 1536 | Percentage: 0.00%
Layer: decoder.layers.5.multihead_attn.out_proj.weight | Size: torch.Size([512, 512]) | Number of parameters: 262144 | Percentage: 0.53%
Layer: decoder.layers.5.multihead_attn.out_proj.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.5.linear1.weight | Size: torch.Size([512, 512]) | Number of parameters: 262144 | Percentage: 0.53%
Layer: decoder.layers.5.linear1.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.5.linear2.weight | Size: torch.Size([512, 512]) | Number of parameters: 262144 | Percentage: 0.53%
Layer: decoder.layers.5.linear2.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.5.norm1.weight | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.5.norm1.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.5.norm2.weight | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.5.norm2.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.5.norm3.weight | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: decoder.layers.5.norm3.bias | Size: torch.Size([512]) | Number of parameters: 512 | Percentage: 0.00%
Layer: fc_out.weight | Size: torch.Size([30000, 512]) | Number of parameters: 15360000 | Percentage: 31.24%
Layer: fc_out.bias | Size: torch.Size([30000]) | Number of parameters: 30000 | Percentage: 0.06%
inp = torch.randint(0, 10000, (16,32))
bert_model(inp).size()
torch.Size([16, 32, 30000])
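A minimal masked-language-modelling training step for the toy model above; the mask token id of 0 is an arbitrary placeholder for this sketch, and roughly 15% of positions are masked as in BERT:
mask_token_id = 0                                 # placeholder [MASK] id for this toy vocabulary
mask = torch.rand(inp.shape) < 0.15               # choose ~15% of positions
masked_inp = inp.clone()
masked_inp[mask] = mask_token_id
labels = inp.clone()
labels[~mask] = -100                              # only masked positions contribute to the loss
logits = bert_model(masked_inp)
loss = F.cross_entropy(logits.reshape(-1, 30000), labels.reshape(-1), ignore_index=-100)
loss.backward()
loss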
Llama Models#
Llama models replace absolute positional embeddings with RoPE (Rotary Position Embeddings): RoPE injects position information directly into the attention mechanism by rotating the query and key vectors, so attention scores depend on relative rather than absolute positions. This makes the model better at handling long-range dependencies (see the sketch after this list).
Llama uses pre-normalization with RMSNorm, normalizing the input of each sub-layer rather than its output as in the original post-norm Transformer.
It ships a custom BPE tokenizer with better support for tokens in other languages.
Context length has grown with recent Llama 3 releases, up to 128K tokens.
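A minimal sketch of RoPE applied to query/key tensors (not Llama's actual implementation, just the rotation idea): each pair of feature dimensions is rotated by an angle proportional to the token position, so dot products between rotated queries and keys depend only on their relative offset.
def rope(x, base=10000.0):
    # x: batch x n_heads x seq_len x head_dim (head_dim must be even)
    b, h, s, d = x.shape
    pos = torch.arange(s, dtype=torch.float32)
    inv_freq = 1.0 / (base ** (torch.arange(0, d, 2).float() / d))
    angles = torch.outer(pos, inv_freq)        # seq_len x d/2
    cos, sin = angles.cos(), angles.sin()
    x1, x2 = x[..., 0::2], x[..., 1::2]
    out = torch.empty_like(x)
    out[..., 0::2] = x1 * cos - x2 * sin       # rotate each (x1, x2) pair by the position-dependent angle
    out[..., 1::2] = x1 * sin + x2 * cos
    return out
q = rope(torch.randn(1, 8, 16, 64))
k = rope(torch.randn(1, 8, 16, 64))
# attention scores computed from the rotated q and k now encode relative positions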