Build Your First LLM from Scratch · Part 3 · Section 12 of 13
Complete Input Pipeline
Let's combine everything into a single class:
Production note: In real systems, the tokenizer lives outside the model. Tokenization happens on the CPU (often in parallel) before tensors ever reach the GPU; we bundle it into the module here for simplicity. A sketch of that split follows the usage example below.
import torch
import torch.nn as nn

class InputEmbedding(nn.Module):
    def __init__(self, vocabulary: dict, embed_dim: int = 64, max_seq_len: int = 32):
        super().__init__()
        self.tokenizer = Tokenizer(vocabulary)
        self.embedding = nn.Embedding(len(vocabulary), embed_dim)
        self.pos_embedding = nn.Embedding(max_seq_len, embed_dim)

    def forward(self, text: str) -> torch.Tensor:
        # Step 1: Tokenize
        token_ids = self.tokenizer.encode(text)
        token_ids = torch.tensor(token_ids)

        # Step 2: Get token embeddings
        embeddings = self.embedding(token_ids)

        # Step 3: Add position embeddings
        positions = torch.arange(len(token_ids))
        pos_embeddings = self.pos_embedding(positions)
        embeddings = embeddings + pos_embeddings

        # Step 4: Add batch dimension [Seq, Dim] -> [Batch, Seq, Dim]
        return embeddings.unsqueeze(0)
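The class relies on the Tokenizer built earlier in this series. If you are reading this section on its own, a minimal stand-in (hypothetical: word-level splitting plus [START]/[END] markers, defined before InputEmbedding) could look like this:

class Tokenizer:
    def __init__(self, vocabulary: dict):
        self.vocabulary = vocabulary  # maps token string -> integer id

    def encode(self, text: str) -> list:
        # Wrap the words in [START]/[END] markers and look up their ids
        tokens = ["[START]"] + text.split() + ["[END]"]
        return [self.vocabulary[token] for token in tokens]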
Usage:

input_layer = InputEmbedding(vocabulary)
output = input_layer("two plus three")
print(output.shape) # torch.Size([1, 5, 64])
# Batch=1, Seq=5 tokens (with [START] and [END]), Dim=64
# Ready for the transformer!
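As the production note above points out, real pipelines keep tokenization on the CPU and feed the model tensors of token IDs. Here is a minimal sketch of that split; the class name and the single-example batch are illustrative, not part of the original code:

class InputEmbeddingIds(nn.Module):
    """Variant that accepts pre-tokenized ids, the way a deployed model would."""
    def __init__(self, vocab_size: int, embed_dim: int = 64, max_seq_len: int = 32):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.pos_embedding = nn.Embedding(max_seq_len, embed_dim)

    def forward(self, token_ids: torch.Tensor) -> torch.Tensor:
        # token_ids: [Batch, Seq] integer tensor built outside the model
        positions = torch.arange(token_ids.shape[1], device=token_ids.device)
        return self.embedding(token_ids) + self.pos_embedding(positions)

# Tokenize on the CPU first, then hand the model a batch of ids
tokenizer = Tokenizer(vocabulary)
token_ids = torch.tensor([tokenizer.encode("two plus three")])   # [1, 5]
embeddings = InputEmbeddingIds(len(vocabulary))(token_ids)       # [1, 5, 64]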
At Scale

# Same pattern, different numbers
input_layer = InputEmbedding(
    vocabulary=gpt4_vocabulary,  # 100k tokens
    embed_dim=12288,
    max_seq_len=8192
)
output = input_layer("Hello, how are you today?")
print(output.shape)  # torch.Size([1, 7, 12288])

Key insight: The pipeline is identical. Only the scale changes.
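To make "only the scale changes" concrete, here is the rough arithmetic for the two embedding tables alone, using the illustrative numbers above (not official GPT-4 figures):

vocab_size, embed_dim, max_seq_len = 100_000, 12288, 8192

token_table = vocab_size * embed_dim        # 1,228,800,000 weights (~1.2B)
position_table = max_seq_len * embed_dim    #   100,663,296 weights (~0.1B)
print(f"{token_table + position_table:,}")  # 1,329,463,296 parameters before a single attention layer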