# Training Loop

Training is simple: show examples, compute loss, update weights. Repeat thousands of times.

python
1from torch.utils.data import DataLoader, TensorDataset
2
def prepare_dataloader(data, tokenizer, max_seq_len, batch_size):
    """Build a shuffling DataLoader over padded token-id sequences.

    Each item's "full" string is tokenized, padded to max_seq_len with the
    tokenizer's pad id, and the results are stacked into one tensor.
    """
    padded = []
    for item in data:
        token_ids = tokenizer.encode(item["full"])
        padded.append(pad_sequence(token_ids, max_seq_len, tokenizer.pad_token_id))
    dataset = TensorDataset(torch.tensor(padded))
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)
16
17
18def train(model, train_loader, tokenizer, config, num_epochs=100, lr=0.001):
19 """Train the model with next-token prediction."""
20 optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=0.01)
21 scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs)
22 criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
23
24 for epoch in range(num_epochs):
25 model.train()
26 total_loss = 0
27
28 for batch in train_loader:
29 x = batch[0]
30 inputs = x[:, :-1] # All tokens except last
31 targets = x[:, 1:] # All tokens except first (shifted by 1)
32
33 optimizer.zero_grad()
34 outputs = model(inputs)
35 loss = criterion(
36 outputs.reshape(-1, config["vocab_size"]),
37 targets.reshape(-1),
38 )
39 loss.backward()
40 torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
41 optimizer.step()
42 total_loss += loss.item()
43
44 scheduler.step()
45
46 if (epoch + 1) % 10 == 0:
47 avg_loss = total_loss / len(train_loader)
48 print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}")
49
# Create model and train
arch_keys = ("vocab_size", "embed_dim", "num_heads", "num_layers", "ff_dim", "max_seq_len")
model = CalculatorLLM(
    **{key: config[key] for key in arch_keys},
    dropout=0.0,  # No dropout for this small model
)
print(f"Parameters: {model.count_parameters():,}")

train_loader = prepare_dataloader(train_data, tokenizer, config["max_seq_len"], batch_size=64)
train(model, train_loader, tokenizer, config, num_epochs=100)

## Tests

python
1# tests/test_train.py
2def test_loss_computes(model, tokenizer):
3 x = torch.tensor([[1, 6, 32, 7, 35, 9, 2, 0, 0]])
4 output = model(x)
5 criterion = torch.nn.CrossEntropyLoss(ignore_index=0)
6 logits = output[:, :-1, :].view(-1, 36)
7 targets = x[:, 1:].view(-1)
8 loss = criterion(logits, targets)
9 assert loss.item() > 0
10 assert not torch.isnan(loss)
11
12def test_loss_decreases_with_training(model):
13 x = torch.tensor([[1, 6, 32, 7, 35, 9, 2]])
14 optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
15 criterion = torch.nn.CrossEntropyLoss(ignore_index=0)
16
17 initial_loss = None
18 for _ in range(10):
19 optimizer.zero_grad()
20 output = model(x)
21 loss = criterion(output[:, :-1].view(-1, 36), x[:, 1:].view(-1))
22 if initial_loss is None:
23 initial_loss = loss.item()
24 loss.backward()
25 optimizer.step()
26
27 assert loss.item() < initial_loss # Model is learning

Run the tests with `pytest tests/test_train.py -v`.

Helpful?