Training Dataset

We need training examples: arithmetic problems spelled out in English words (operands and results from 0 to 99), each paired with its answer. Let's generate them programmatically.

python
# Lookup tables are built once at import time instead of on every call.
_WORDS_0_19 = (
    "zero", "one", "two", "three", "four", "five", "six", "seven",
    "eight", "nine", "ten", "eleven", "twelve", "thirteen", "fourteen",
    "fifteen", "sixteen", "seventeen", "eighteen", "nineteen",
)

# Indexed by the tens digit; slots 0 and 1 are unused because values
# below 20 are served by _WORDS_0_19.
_TENS_WORDS = (
    "", "", "twenty", "thirty", "forty", "fifty",
    "sixty", "seventy", "eighty", "ninety",
)


def num_to_words(n: int) -> str:
    """Convert an integer to English words.

    Values 0-99 are spelled out; compound numbers are joined with a space,
    not a hyphen (42 -> "forty two"). Negative values are prefixed with
    "negative " (-7 -> "negative seven"). Magnitudes of 100 or more are
    outside the spelled-out range and fall back to their decimal digits
    (100 -> "100").

    Args:
        n: The integer to convert.

    Returns:
        The English wording of ``n``, or its digits when ``abs(n) >= 100``.
    """
    if n < 0:
        return "negative " + num_to_words(-n)

    if n < 20:
        return _WORDS_0_19[n]

    if n < 100:
        tens, ones = divmod(n, 10)
        if ones == 0:
            return _TENS_WORDS[tens]
        return f"{_TENS_WORDS[tens]} {_WORDS_0_19[ones]}"

    # Out of the supported word range: keep the digits rather than guess.
    return str(n)


def _make_example(a: int, operator: str, b: int, result: int) -> dict:
    """Build one example record for the problem ``a <operator> b``."""
    problem = f"{num_to_words(a)} {operator} {num_to_words(b)}"
    answer = num_to_words(result)
    return {
        "input": problem,
        "output": answer,
        "full": f"{problem} equals {answer}",
    }


def generate_all_combinations() -> list[dict]:
    """Generate every math problem whose operands and result lie in 0-99.

    Examples are emitted in a fixed order: all additions, then all
    subtractions, then all multiplications (5,050 + 5,050 + 672 = 10,772
    records in total).

    Returns:
        A list of dicts, each with the keys:
        "input"  - the problem in words, e.g. "two plus three"
        "output" - the answer in words, e.g. "five"
        "full"   - the full statement, e.g. "two plus three equals five"
    """
    data = []

    # Addition: a + b where result <= 99
    data.extend(
        _make_example(a, "plus", b, a + b)
        for a in range(100)
        for b in range(100 - a)
    )

    # Subtraction: a - b where result >= 0
    data.extend(
        _make_example(a, "minus", b, a - b)
        for a in range(100)
        for b in range(a + 1)
    )

    # Multiplication: a * b where result <= 99
    data.extend(
        _make_example(a, "times", b, a * b)
        for a in range(100)
        for b in range(100)
        if a * b <= 99
    )

    return data

# Generate dataset
all_data = generate_all_combinations()
print(f"Total examples: {len(all_data):,}")
# Total examples: 10,772
# (5,050 additions + 5,050 subtractions + 672 multiplications)

Tests

python
# tests/test_data.py
# NOTE(review): num_to_words and generate_all_combinations are not imported
# in this file; import them from the module that defines them (its name is
# not shown here) before running the tests.


def test_num_to_words_single_digits():
    """Single-digit numbers map directly to their English names."""
    assert num_to_words(0) == "zero"
    assert num_to_words(5) == "five"
    assert num_to_words(9) == "nine"

def test_num_to_words_compound():
    """Two-word numbers are joined by a single space, with no hyphen."""
    assert num_to_words(42) == "forty two"
    assert num_to_words(99) == "ninety nine"

def test_generate_has_all_operations():
    """The dataset contains at least one example for each operator word."""
    data = generate_all_combinations()
    assert any("plus" in d["input"] for d in data)
    assert any("minus" in d["input"] for d in data)
    assert any("times" in d["input"] for d in data)

def test_addition_example():
    """'two plus three' appears exactly once and its answer is 'five'."""
    data = generate_all_combinations()
    two_plus_three = [d for d in data if d["input"] == "two plus three"]
    # Check the match count first so a missing or duplicated example fails
    # with a clear assertion instead of an IndexError or a silent pass.
    assert len(two_plus_three) == 1
    assert two_plus_three[0]["output"] == "five"

Run tests: pytest tests/test_data.py -v

Helpful?