Training Loop
Training is simple: show examples, compute loss, update weights. Repeat thousands of times.
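Under the hood this is next-token prediction: the input and target are the same sequence offset by one position, so at every step the model is asked "given everything so far, what comes next?" A minimal sketch of that shift (token ids borrowed from the test file below; the exact mapping depends on the tokenizer built earlier):

```python
import torch

# One padded training sequence, e.g. BOS ... digits/operators ... EOS, then pads.
x = torch.tensor([[1, 6, 32, 7, 35, 9, 2, 0, 0]])

inputs = x[:, :-1]   # model input:        [1, 6, 32, 7, 35, 9, 2, 0]
targets = x[:, 1:]   # prediction targets: [6, 32, 7, 35, 9, 2, 0, 0]
```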
```python
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset


def prepare_dataloader(data, tokenizer, max_seq_len, batch_size):
    """Convert data to a PyTorch DataLoader."""
    sequences = [
        pad_sequence(  # fixed-length padding helper defined in an earlier section
            tokenizer.encode(item["full"]),
            max_seq_len,
            tokenizer.pad_token_id,
        )
        for item in data
    ]
    tensor_data = torch.tensor(sequences)
    dataset = TensorDataset(tensor_data)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)


def train(model, train_loader, tokenizer, config, num_epochs=100, lr=0.001):
    """Train the model with next-token prediction."""
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=0.01)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs)
    criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0

        for batch in train_loader:
            x = batch[0]
            inputs = x[:, :-1]   # All tokens except the last
            targets = x[:, 1:]   # All tokens except the first (shifted by 1)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(
                outputs.reshape(-1, config["vocab_size"]),
                targets.reshape(-1),
            )
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # Cap gradient norm for stability
            optimizer.step()
            total_loss += loss.item()

        scheduler.step()

        if (epoch + 1) % 10 == 0:
            avg_loss = total_loss / len(train_loader)
            print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}")


# Create model and train
model = CalculatorLLM(
    vocab_size=config["vocab_size"],
    embed_dim=config["embed_dim"],
    num_heads=config["num_heads"],
    num_layers=config["num_layers"],
    ff_dim=config["ff_dim"],
    max_seq_len=config["max_seq_len"],
    dropout=0.0,  # No dropout for this small model
)
print(f"Parameters: {model.count_parameters():,}")

train_loader = prepare_dataloader(train_data, tokenizer, config["max_seq_len"], batch_size=64)
train(model, train_loader, tokenizer, config, num_epochs=100)
```
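One detail worth calling out: `ignore_index=tokenizer.pad_token_id` excludes pad positions from the loss, so the model is never graded on what it predicts after the answer ends. A small sketch of that behavior (assuming pad id 0, matching the tests below):

```python
import torch

criterion = torch.nn.CrossEntropyLoss(ignore_index=0)
logits = torch.randn(4, 36)            # 4 positions, vocab of 36
targets = torch.tensor([6, 32, 0, 0])  # last two positions are padding

# Only the two non-pad targets are averaged into the loss:
masked = criterion(logits, targets)
manual = torch.nn.functional.cross_entropy(logits[:2], targets[:2])
assert torch.isclose(masked, manual)
```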
Tests

```python
# tests/test_train.py
import torch


def test_loss_computes(model, tokenizer):
    x = torch.tensor([[1, 6, 32, 7, 35, 9, 2, 0, 0]])
    output = model(x)
    criterion = torch.nn.CrossEntropyLoss(ignore_index=0)
    logits = output[:, :-1, :].view(-1, 36)
    targets = x[:, 1:].view(-1)
    loss = criterion(logits, targets)
    assert loss.item() > 0
    assert not torch.isnan(loss)


def test_loss_decreases_with_training(model):
    x = torch.tensor([[1, 6, 32, 7, 35, 9, 2]])
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    criterion = torch.nn.CrossEntropyLoss(ignore_index=0)

    initial_loss = None
    for _ in range(10):
        optimizer.zero_grad()
        output = model(x)
        loss = criterion(output[:, :-1].view(-1, 36), x[:, 1:].view(-1))
        if initial_loss is None:
            initial_loss = loss.item()
        loss.backward()
        optimizer.step()

    assert loss.item() < initial_loss  # Model is learning
```

Run the tests: `pytest tests/test_train.py -v`
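Both tests take `model` and `tokenizer` as pytest fixtures, which aren't shown here. A minimal `tests/conftest.py` sketch, assuming the `CalculatorLLM` class and `config` dict from earlier sections live in an importable module (the module path and tokenizer class name are illustrative, not from the source):

```python
# tests/conftest.py -- hypothetical fixture wiring; adjust names to your project
import pytest
import torch

from calculator_llm import CalculatorLLM, CalculatorTokenizer, config  # assumed layout


@pytest.fixture
def tokenizer():
    return CalculatorTokenizer()


@pytest.fixture
def model():
    torch.manual_seed(0)  # deterministic weights so both tests are reproducible
    return CalculatorLLM(
        vocab_size=config["vocab_size"],  # the tests assume 36
        embed_dim=config["embed_dim"],
        num_heads=config["num_heads"],
        num_layers=config["num_layers"],
        ff_dim=config["ff_dim"],
        max_seq_len=config["max_seq_len"],
        dropout=0.0,
    )
```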