G2P Character-to-Phoneme Alignment Implementation Plan¶
For agentic workers: REQUIRED: Use superpowers:subagent-driven-development (if subagents available) or superpowers:executing-plans to implement this plan. Steps use checkbox (`- [ ]`) syntax for tracking.
Goal: Produce a JSON file mapping character spans to IPA phonemes for all ~134K CMU dict entries (including alternate pronunciations), enabling subword token phoneme attribution in the Inventio governor.
Architecture: Pure Python script that parses the raw CMU dict, bootstraps a grapheme-to-phoneme table from the data itself, then uses constrained dynamic programming to find optimal character-to-phoneme alignments. Two-phase approach: greedy bootstrap builds the table, DP alignment uses it.
Tech Stack: Python 3.10+, pytest, no external dependencies beyond stdlib + json
Spec: docs/superpowers/specs/2026-03-12-g2p-alignment-design.md
File Structure¶
workers/scripts/
├── g2p_alignment.py # Main script: CMU parser, bootstrap, DP aligner, JSON export
└── config.py # Existing (no changes)
tests/
└── test_g2p_alignment.py # All tests for the alignment pipeline
data/
├── cmu/cmudict-0.7b # Existing input (no changes)
├── mappings/arpa_to_ipa.json # Existing input (no changes)
└── g2p_alignment.json # Output (gitignored)
The script is a single file (g2p_alignment.py) with four clear internal sections:
1. CMU parser — reads raw dict, handles variants/special chars, converts ARPAbet to IPA
2. Bootstrap — greedy first pass to build grapheme-to-phoneme table
3. DP aligner — constrained DP that uses the bootstrap table to find optimal alignments
4. Export — writes JSON output, reports coverage stats
Chunk 1: CMU Parser + Tests¶
Task 1: Create test file with CMU parser tests¶
Files:
- Create: tests/test_g2p_alignment.py
- [ ] Step 1: Write CMU parser tests
"""Tests for G2P character-to-phoneme alignment pipeline."""
import json
import pytest
from pathlib import Path
# We'll import from the script once it exists
# For now, define expected behaviors
ARPA_TO_IPA_PATH = Path(__file__).parent.parent / "data" / "mappings" / "arpa_to_ipa.json"
CMU_DICT_PATH = Path(__file__).parent.parent / "data" / "cmu" / "cmudict-0.7b"
class TestParseCmuLine:
    """Test parsing individual CMU dict lines."""

    def test_simple_word(self):
        from workers.scripts.g2p_alignment import parse_cmu_line

        result = parse_cmu_line("CAT K AE1 T")
        assert result == ("cat", 0, ["K", "AE1", "T"])

    def test_alternate_pronunciation(self):
        from workers.scripts.g2p_alignment import parse_cmu_line

        # A "(1)" suffix marks the first alternate pronunciation.
        result = parse_cmu_line("LIVE(1) L IH1 V")
        assert result == ("live", 1, ["L", "IH1", "V"])

    def test_second_alternate(self):
        from workers.scripts.g2p_alignment import parse_cmu_line

        result = parse_cmu_line("TOMATO(2) T AH0 M EY1 T OW0")
        assert result == ("tomato", 2, ["T", "AH0", "M", "EY1", "T", "OW0"])

    def test_apostrophe_word(self):
        from workers.scripts.g2p_alignment import parse_cmu_line

        result = parse_cmu_line("O'BRIEN OW0 B R AY1 IH0 N")
        assert result == ("o'brien", 0, ["OW0", "B", "R", "AY1", "IH0", "N"])

    def test_contraction(self):
        from workers.scripts.g2p_alignment import parse_cmu_line

        # A leading apostrophe is a real word character, not a meta-entry marker.
        result = parse_cmu_line("'BOUT B AW1 T")
        assert result == ("'bout", 0, ["B", "AW1", "T"])

    def test_hyphenated_word(self):
        from workers.scripts.g2p_alignment import parse_cmu_line

        result = parse_cmu_line("ABLE-BODIED EY1 B AH0 L B AA1 D IY0 D")
        assert result == ("able-bodied", 0, ["EY1", "B", "AH0", "L", "B", "AA1", "D", "IY0", "D"])

    def test_comment_line_returns_none(self):
        from workers.scripts.g2p_alignment import parse_cmu_line

        # cmudict marks comment lines with ";;;".
        assert parse_cmu_line(";;; This is a comment") is None

    def test_empty_line_returns_none(self):
        from workers.scripts.g2p_alignment import parse_cmu_line

        assert parse_cmu_line("") is None

    def test_punctuation_prefix_returns_none(self):
        from workers.scripts.g2p_alignment import parse_cmu_line

        # Punctuation-named meta-entries must be skipped entirely.
        assert parse_cmu_line('!EXCLAMATION-POINT EH2 K S K L AH0 M EY1 SH AH0 N P OY2 N T') is None
        assert parse_cmu_line('"CLOSE-QUOTE K L OW1 Z K W OW1 T') is None
        assert parse_cmu_line("#HASH-MARK HH AE1 M AA2 R K") is None
class TestArpaToIpa:
    """Test ARPAbet to IPA conversion."""

    def test_simple_consonants(self):
        from workers.scripts.g2p_alignment import arpa_to_ipa

        mapping = arpa_to_ipa(ARPA_TO_IPA_PATH)
        assert mapping["K"] == "k"
        assert mapping["T"] == "t"
        assert mapping["B"] == "b"

    def test_stressed_vowels(self):
        from workers.scripts.g2p_alignment import arpa_to_ipa

        # Stress digit 1 maps to a leading primary-stress mark; 0 is unmarked.
        mapping = arpa_to_ipa(ARPA_TO_IPA_PATH)
        assert mapping["AE1"] == "ˈæ"
        assert mapping["AH0"] == "ə"
        assert mapping["AH1"] == "ˈʌ"

    def test_digraph_consonants(self):
        from workers.scripts.g2p_alignment import arpa_to_ipa

        mapping = arpa_to_ipa(ARPA_TO_IPA_PATH)
        assert mapping["TH"] == "θ"
        assert mapping["DH"] == "ð"
        assert mapping["SH"] == "ʃ"
        assert mapping["CH"] == "tʃ"
class TestConvertArpaSequence:
    """Test converting a full ARPAbet sequence to IPA."""

    def test_cat(self):
        from workers.scripts.g2p_alignment import arpa_to_ipa, convert_arpa_sequence

        mapping = arpa_to_ipa(ARPA_TO_IPA_PATH)
        result = convert_arpa_sequence(["K", "AE1", "T"], mapping)
        assert result == ["k", "ˈæ", "t"]

    def test_bartholomew(self):
        from workers.scripts.g2p_alignment import arpa_to_ipa, convert_arpa_sequence

        mapping = arpa_to_ipa(ARPA_TO_IPA_PATH)
        result = convert_arpa_sequence(
            ["B", "AA0", "R", "TH", "AA1", "L", "AH0", "M", "Y", "UW2"], mapping
        )
        assert result == ["b", "ɑ", "ɹ", "θ", "ˈɑ", "l", "ə", "m", "j", "ˌu"]
class TestLoadCmuDict:
    """Test loading the full CMU dict file."""

    def test_loads_entries(self):
        from workers.scripts.g2p_alignment import load_cmu_dict

        entries = load_cmu_dict(CMU_DICT_PATH)
        # Should have 100K+ entries
        assert len(entries) > 100000

    def test_has_alternates(self):
        from workers.scripts.g2p_alignment import load_cmu_dict

        entries = load_cmu_dict(CMU_DICT_PATH)
        # Find entries for "live" - should have variant 0 and variant 1
        live_entries = [e for e in entries if e[0] == "live"]
        assert len(live_entries) >= 2
        variants = {e[1] for e in live_entries}
        assert 0 in variants
        assert 1 in variants

    def test_skips_punctuation_prefixed(self):
        from workers.scripts.g2p_alignment import load_cmu_dict

        entries = load_cmu_dict(CMU_DICT_PATH)
        words = {e[0] for e in entries}
        assert "!exclamation-point" not in words
        assert '"close-quote' not in words

    def test_includes_apostrophe_words(self):
        from workers.scripts.g2p_alignment import load_cmu_dict

        entries = load_cmu_dict(CMU_DICT_PATH)
        words = {e[0] for e in entries}
        assert "o'brien" in words

    def test_lowercases_all_words(self):
        from workers.scripts.g2p_alignment import load_cmu_dict

        entries = load_cmu_dict(CMU_DICT_PATH)
        # Spot-check a prefix of the dict; scanning all entries is unnecessary.
        for word, _, _ in entries[:1000]:
            assert word == word.lower(), f"Word not lowercased: {word}"
- [ ] Step 2: Run tests to verify they fail
Run: cd /Users/jneumann/Repos/PhonoLex && python -m pytest tests/test_g2p_alignment.py -v
Expected: FAIL — ModuleNotFoundError: No module named 'workers.scripts.g2p_alignment'
- [ ] Step 3: Commit test file
git add tests/test_g2p_alignment.py
git commit -m "test: add CMU parser tests for G2P alignment"
Task 2: Implement CMU parser¶
Files:
- Create: workers/scripts/g2p_alignment.py
- [ ] Step 1: Implement CMU parser functions
"""
G2P Character-to-Phoneme Alignment
Produces character-span-to-phoneme alignments for all CMU dict entries.
Used by diffusion-governors' LookupBuilder for subword token phoneme attribution.
Usage:
python workers/scripts/g2p_alignment.py
Inputs:
data/cmu/cmudict-0.7b
data/mappings/arpa_to_ipa.json
Output:
data/g2p_alignment.json
"""
from __future__ import annotations
import json
import re
import sys
import time
from pathlib import Path
from typing import Optional
# Paths relative to repo root — this script lives at workers/scripts/, two
# directory levels below the root.
REPO_ROOT = Path(__file__).parent.parent.parent
CMU_DICT_PATH = REPO_ROOT / "data" / "cmu" / "cmudict-0.7b"
ARPA_TO_IPA_PATH = REPO_ROOT / "data" / "mappings" / "arpa_to_ipa.json"
OUTPUT_PATH = REPO_ROOT / "data" / "g2p_alignment.json"
# Regex for alternate pronunciation suffix: WORD(N) — group 1 is the word,
# group 2 the variant number.
VARIANT_RE = re.compile(r"^(.+)\((\d+)\)$")

# Characters that mark meta-entries to skip. The apostrophe and hyphen are
# deliberately absent: they begin/occur in real words ('BOUT, ABLE-BODIED).
SKIP_PREFIXES = set('!"#$%&*+,./0123456789:;<=>?@[\\]^_`{|}~')


def arpa_to_ipa(path: Path) -> dict[str, str]:
    """Load the ARPAbet-to-IPA mapping from a JSON file.

    Returns:
        Dict mapping ARPAbet symbols (e.g. "AE1") to IPA strings (e.g. "ˈæ").
    """
    # The IPA values are non-ASCII: pin UTF-8 explicitly instead of relying
    # on the platform default encoding (which may not be UTF-8).
    with open(path, encoding="utf-8") as f:
        return json.load(f)


def convert_arpa_sequence(arpa_phones: list[str], mapping: dict[str, str]) -> list[str]:
    """Convert a list of ARPAbet symbols to IPA using the mapping.

    Raises:
        KeyError: if any symbol is missing from the mapping.
    """
    return [mapping[phone] for phone in arpa_phones]


def parse_cmu_line(line: str) -> Optional[tuple[str, int, list[str]]]:
    """Parse a single CMU dict line.

    Returns:
        (word, variant_index, arpa_phonemes), or None for blank lines,
        ";;;" comments, malformed lines, and punctuation/digit-prefixed
        meta-entries.
    """
    line = line.strip()
    if not line or line.startswith(";;;"):
        return None
    # Split word from phoneme field on the first space; the trailing
    # .strip().split() below tolerates both single- and double-space layouts.
    parts = line.split(" ", 1)
    if len(parts) != 2:
        return None
    key, phonemes_str = parts
    key = key.strip()
    # Skip punctuation-prefixed meta-entries (e.g. !EXCLAMATION-POINT)
    if key and key[0] in SKIP_PREFIXES:
        return None
    # Parse variant suffix, e.g. "LIVE(1)" -> key "LIVE", variant 1
    variant = 0
    m = VARIANT_RE.match(key)
    if m:
        key = m.group(1)
        variant = int(m.group(2))
    word = key.lower()
    arpa_phonemes = phonemes_str.strip().split()
    return (word, variant, arpa_phonemes)
def load_cmu_dict(path: Path) -> list[tuple[str, int, list[str]]]:
    """Load every entry from the CMU dict file.

    Returns:
        List of (word, variant_index, arpa_phonemes) tuples, in file order.
    """
    # The cmudict-0.7b distribution is Latin-1 encoded, not UTF-8.
    with open(path, encoding="latin-1") as f:
        return [
            parsed
            for raw_line in f
            if (parsed := parse_cmu_line(raw_line)) is not None
        ]
if __name__ == "__main__":
    # Smoke-run: parse the dict and report basic counts.
    print("G2P Alignment — CMU parser loaded")
    parsed_entries = load_cmu_dict(CMU_DICT_PATH)
    print(f" {len(parsed_entries):,} entries parsed")
    alternate_total = sum(1 for _, variant, _ in parsed_entries if variant > 0)
    print(f" {alternate_total:,} alternate pronunciations")
- [ ] Step 2: Run tests to verify they pass
Run: cd /Users/jneumann/Repos/PhonoLex && python -m pytest tests/test_g2p_alignment.py -v
Expected: All TestParseCmuLine, TestArpaToIpa, TestConvertArpaSequence, and TestLoadCmuDict tests PASS
- [ ] Step 3: Run the script standalone to verify counts
Run: cd /Users/jneumann/Repos/PhonoLex && python workers/scripts/g2p_alignment.py
Expected: Prints entry count (on the order of 125–134K — the raw dict has ~134K lines, fewer after skipping meta-entries; verify against the actual file) and alternate count (~8,600)
- [ ] Step 4: Commit
git add workers/scripts/g2p_alignment.py
git commit -m "feat: add CMU dict parser for G2P alignment"
Chunk 2: Grapheme Bootstrap + Tests¶
Task 3: Write bootstrap tests¶
Files:
- Modify: tests/test_g2p_alignment.py
- [ ] Step 1: Add bootstrap tests
Append to tests/test_g2p_alignment.py:
class TestBootstrapGraphemeTable:
    """Test Phase 1: bootstrapping grapheme-to-phoneme table from CMU data."""

    def test_seed_graphemes_defined(self):
        from workers.scripts.g2p_alignment import SEED_GRAPHEMES

        # Must include the key multi-character graphemes
        assert "th" in SEED_GRAPHEMES
        assert "sh" in SEED_GRAPHEMES
        assert "ch" in SEED_GRAPHEMES
        assert "ck" in SEED_GRAPHEMES
        assert "ph" in SEED_GRAPHEMES
        assert "ng" in SEED_GRAPHEMES
        assert "ew" in SEED_GRAPHEMES
        assert "ough" in SEED_GRAPHEMES

    def test_greedy_segment_simple(self):
        from workers.scripts.g2p_alignment import greedy_segment

        # "cat" has no multi-char graphemes -> single chars
        result = greedy_segment("cat")
        assert result == ["c", "a", "t"]

    def test_greedy_segment_th(self):
        from workers.scripts.g2p_alignment import greedy_segment

        result = greedy_segment("think")
        assert result == ["th", "i", "n", "k"]

    def test_greedy_segment_ough(self):
        from workers.scripts.g2p_alignment import greedy_segment

        result = greedy_segment("through")
        assert result == ["th", "r", "ough"]

    def test_greedy_segment_tch(self):
        from workers.scripts.g2p_alignment import greedy_segment

        # Longest-match-first: "tch" must win over "ch".
        result = greedy_segment("match")
        assert result == ["m", "a", "tch"]

    def test_greedy_segment_apostrophe(self):
        from workers.scripts.g2p_alignment import greedy_segment

        result = greedy_segment("o'brien")
        assert result == ["o", "'", "b", "r", "ie", "n"]

    def test_greedy_segment_hyphen(self):
        from workers.scripts.g2p_alignment import greedy_segment

        # Hyphen is a single char, not a seed grapheme
        result = greedy_segment("able-bodied")
        assert "a" in result
        assert "-" in result

    def test_bootstrap_builds_table(self):
        from workers.scripts.g2p_alignment import bootstrap_grapheme_table, arpa_to_ipa

        mapping = arpa_to_ipa(ARPA_TO_IPA_PATH)
        # Use a small subset of entries for testing.
        # NOTE: "phone" would NOT work as the "ph" fixture: greedy_segment
        # yields 4 units (ph-o-n-e) against only 3 phonemes, so the bootstrap
        # discards it and table["ph"] would raise KeyError. "graph" segments
        # g-r-a-ph (4 units) against G R AE1 F (4 phonemes), a clean 1:1 fit.
        entries = [
            ("cat", 0, ["K", "AE1", "T"]),
            ("think", 0, ["TH", "IH1", "NG", "K"]),
            ("ship", 0, ["SH", "IH1", "P"]),
            ("graph", 0, ["G", "R", "AE1", "F"]),
        ]
        table = bootstrap_grapheme_table(entries, mapping)
        # Single-char mappings learned
        assert "k" in table["c"]
        assert "t" in table["t"]
        # Multi-char mappings learned
        assert "θ" in table["th"]
        assert "ʃ" in table["sh"]
        # "ph" -> "f" learned from "graph"
        assert "f" in table["ph"]

    def test_bootstrap_validates_against_phoneme_count(self):
        from workers.scripts.g2p_alignment import bootstrap_grapheme_table, arpa_to_ipa

        mapping = arpa_to_ipa(ARPA_TO_IPA_PATH)
        # "box" has 3 chars but 4 phonemes (b, ɑ, k, s) - greedy can't align 1:1
        # Should be discarded from bootstrap (handled by DP later)
        entries = [
            ("box", 0, ["B", "AA1", "K", "S"]),
        ]
        table = bootstrap_grapheme_table(entries, mapping)
        # "x" should NOT learn "k" or "s" from this entry since it was discarded
        # (the greedy segmentation produces ["b","o","x"] = 3 units for 4 phonemes)
        assert "x" not in table or "k" not in table.get("x", set())
- [ ] Step 2: Run tests to verify they fail
Run: cd /Users/jneumann/Repos/PhonoLex && python -m pytest tests/test_g2p_alignment.py::TestBootstrapGraphemeTable -v
Expected: FAIL — ImportError: cannot import name 'SEED_GRAPHEMES'
- [ ] Step 3: Commit test additions
git add tests/test_g2p_alignment.py
git commit -m "test: add bootstrap grapheme table tests"
Task 4: Implement bootstrap¶
Files:
- Modify: workers/scripts/g2p_alignment.py
- [ ] Step 1: Add seed graphemes and greedy segmenter
Add after the load_cmu_dict function in g2p_alignment.py:
# ---------------------------------------------------------------------------
# Phase 1: Bootstrap grapheme-phoneme table
# ---------------------------------------------------------------------------

# Seed multi-character graphemes, sorted longest-first so greedy matching
# always prefers the longest unit (e.g. "tch" over "ch").
SEED_GRAPHEMES = sorted(
    [
        "th", "sh", "ch", "ck", "ph", "gh", "ng", "wh",
        "wr", "kn", "gn", "mb", "mn", "ps", "pn", "rh",
        "dg", "tch", "ght", "qu",
        "ee", "ea", "oo", "ou", "ow", "oi", "oy", "aw",
        "au", "ew", "ei", "ey", "ai", "ay", "ie", "ue",
        "oa", "oe", "eigh", "ough", "augh",
    ],
    key=len,
    reverse=True,
)

# Max grapheme length (used as the span bound in the DP phase)
MAX_GRAPHEME_LEN = max(map(len, SEED_GRAPHEMES))


def greedy_segment(word: str) -> list[str]:
    """
    Segment *word* into grapheme units, longest seed grapheme first.

    A character that starts no seed grapheme becomes its own single-char unit.
    """
    units: list[str] = []
    pos = 0
    while pos < len(word):
        # First (i.e. longest, given the sort order) seed matching here,
        # or fall back to the single character.
        unit = next(
            (g for g in SEED_GRAPHEMES if word.startswith(g, pos)),
            word[pos],
        )
        units.append(unit)
        pos += len(unit)
    return units
def bootstrap_grapheme_table(
    entries: list[tuple[str, int, list[str]]],
    ipa_mapping: dict[str, str],
) -> dict[str, set[str]]:
    """
    Build a grapheme-to-phoneme table by greedy segmentation of all entries.

    Each word is segmented with greedy_segment and its ARPAbet converted to
    IPA; when the two sequences have equal length, the pairs are recorded
    1:1. Entries with unequal lengths — and entries containing an unknown
    ARPAbet symbol — contribute nothing (the DP phase handles them later).

    Returns dict mapping grapheme string -> set of observed IPA phonemes.
    """
    table: dict[str, set[str]] = {}
    for word, _variant, arpa_phones in entries:
        graphemes = greedy_segment(word)
        if len(graphemes) != len(arpa_phones):
            continue  # no 1:1 pairing possible, skip
        try:
            ipa_phones = convert_arpa_sequence(arpa_phones, ipa_mapping)
        except KeyError:
            continue  # unknown ARPAbet symbol, skip
        for grapheme, phoneme in zip(graphemes, ipa_phones):
            table.setdefault(grapheme, set()).add(phoneme)
    return table
- [ ] Step 2: Run tests to verify they pass
Run: cd /Users/jneumann/Repos/PhonoLex && python -m pytest tests/test_g2p_alignment.py::TestBootstrapGraphemeTable -v
Expected: All PASS
- [ ] Step 3: Quick sanity check — run bootstrap on full data
Run: cd /Users/jneumann/Repos/PhonoLex && python -c "
from workers.scripts.g2p_alignment import *
entries = load_cmu_dict(CMU_DICT_PATH)
mapping = arpa_to_ipa(ARPA_TO_IPA_PATH)
table = bootstrap_grapheme_table(entries, mapping)
print(f'Graphemes in table: {len(table)}')
print(f'Sample: th -> {table.get(\"th\", set())}')
print(f'Sample: a -> {sorted(table.get(\"a\", set()))[:5]}')
aligned = sum(1 for w,_,p in entries if len(greedy_segment(w)) == len(p))
print(f'Bootstrap aligned: {aligned:,} / {len(entries):,} ({aligned/len(entries)*100:.1f}%)')
"
Expected: A significant percentage of entries align 1:1 in the bootstrap pass (likely 60-80%)
- [ ] Step 4: Commit
git add workers/scripts/g2p_alignment.py
git commit -m "feat: add grapheme bootstrap for G2P alignment table"
Chunk 3: DP Aligner + Tests¶
Task 5: Write DP aligner tests¶
Files:
- Modify: tests/test_g2p_alignment.py
- [ ] Step 1: Add DP alignment tests
Append to tests/test_g2p_alignment.py:
class TestDPAlign:
    """Test Phase 2: DP-based character-to-phoneme alignment."""

    @pytest.fixture
    def table(self):
        """Build a bootstrap table from full CMU data for DP tests."""
        from workers.scripts.g2p_alignment import (
            load_cmu_dict,
            arpa_to_ipa,
            bootstrap_grapheme_table,
        )

        entries = load_cmu_dict(CMU_DICT_PATH)
        mapping = arpa_to_ipa(ARPA_TO_IPA_PATH)
        return bootstrap_grapheme_table(entries, mapping)

    def test_simple_word_cat(self, table):
        from workers.scripts.g2p_alignment import dp_align

        result = dp_align("cat", ["k", "ˈæ", "t"], table)
        assert result is not None
        assert len(result) == 3
        assert result[0] == {"chars": [0, 1], "grapheme": "c", "phonemes": ["k"]}
        assert result[1] == {"chars": [1, 2], "grapheme": "a", "phonemes": ["ˈæ"]}
        assert result[2] == {"chars": [2, 3], "grapheme": "t", "phonemes": ["t"]}

    def test_multi_char_grapheme_think(self, table):
        from workers.scripts.g2p_alignment import dp_align

        # THINK TH IH1 NG K -> θ ˈɪ ŋ k
        result = dp_align("think", ["θ", "ˈɪ", "ŋ", "k"], table)
        assert result is not None
        # "th" should be a single grapheme mapping to θ
        th_entry = [r for r in result if r["grapheme"] == "th"]
        assert len(th_entry) == 1
        assert th_entry[0]["phonemes"] == ["θ"]
        assert th_entry[0]["chars"] == [0, 2]

    def test_silent_e_make(self, table):
        from workers.scripts.g2p_alignment import dp_align

        # MAKE M EY1 K -> m ˈeɪ k
        result = dp_align("make", ["m", "ˈeɪ", "k"], table)
        assert result is not None
        # Should have 4 entries (m, a, k, e) with silent e
        assert len(result) == 4
        # Last entry should be silent "e"
        assert result[-1]["grapheme"] == "e"
        assert result[-1]["phonemes"] == []

    def test_silent_k_knife(self, table):
        from workers.scripts.g2p_alignment import dp_align

        # KNIFE N AY1 F -> n ˈaɪ f
        result = dp_align("knife", ["n", "ˈaɪ", "f"], table)
        assert result is not None
        # DP may produce "kn" -> n (cost 0 if learned from bootstrap via "knee", "know")
        # or "k" -> silent (cost 2) + "n" -> n (cost 0)
        # Either way, the first phoneme "n" is attributed to chars starting at 0
        first = result[0]
        assert first["chars"][0] == 0
        assert "n" in first["phonemes"]

    def test_multi_phoneme_grapheme_box(self, table):
        from workers.scripts.g2p_alignment import dp_align

        # BOX B AA1 K S -> b ˈɑ k s
        result = dp_align("box", ["b", "ˈɑ", "k", "s"], table)
        assert result is not None
        # "x" should map to two phonemes [k, s]
        x_entry = [r for r in result if r["grapheme"] == "x"]
        assert len(x_entry) == 1
        assert x_entry[0]["phonemes"] == ["k", "s"]

    def test_bartholomew(self, table):
        from workers.scripts.g2p_alignment import dp_align

        ipa = ["b", "ɑ", "ɹ", "θ", "ˈɑ", "l", "ə", "m", "j", "ˌu"]
        result = dp_align("bartholomew", ipa, table)
        assert result is not None
        # "th" at chars [3,5] should map to θ
        th_entry = [r for r in result if r["grapheme"] == "th"]
        assert len(th_entry) == 1
        assert th_entry[0]["phonemes"] == ["θ"]
        assert th_entry[0]["chars"] == [3, 5]
        # "ew" at end should map to [j, ˌu]
        ew_entry = [r for r in result if r["grapheme"] == "ew"]
        assert len(ew_entry) == 1
        assert ew_entry[0]["phonemes"] == ["j", "ˌu"]

    def test_apostrophe_obrien(self, table):
        from workers.scripts.g2p_alignment import dp_align

        # O'BRIEN OW0 B R AY1 IH0 N -> oʊ b ɹ ˈaɪ ɪ n
        ipa = ["oʊ", "b", "ɹ", "ˈaɪ", "ɪ", "n"]
        result = dp_align("o'brien", ipa, table)
        assert result is not None
        # Apostrophe should be silent
        apos_entries = [r for r in result if r["grapheme"] == "'"]
        assert len(apos_entries) == 1
        assert apos_entries[0]["phonemes"] == []

    def test_hyphen_silent(self, table):
        from workers.scripts.g2p_alignment import dp_align

        # Simplified: "a-b" with phonemes [eɪ, b] — hyphen should be silent
        ipa = ["ˈeɪ", "b"]
        result = dp_align("a-b", ipa, table)
        assert result is not None
        hyphen_entries = [r for r in result if r["grapheme"] == "-"]
        assert len(hyphen_entries) == 1
        assert hyphen_entries[0]["phonemes"] == []

    def test_doubled_consonant(self, table):
        from workers.scripts.g2p_alignment import dp_align

        # BELL B EH1 L -> b ˈɛ l
        result = dp_align("bell", ["b", "ˈɛ", "l"], table)
        assert result is not None
        # DP may produce "ll" -> l (single novel grapheme, cost 1)
        # or "l" -> l + "l" -> silent (cost 0 + 2 = 2)
        # Either way, all phonemes are accounted for
        all_phonemes = [p for r in result for p in r["phonemes"]]
        assert all_phonemes == ["b", "ˈɛ", "l"]

    def test_church_ch_digraph(self, table):
        from workers.scripts.g2p_alignment import dp_align

        # CHURCH CH ER1 CH -> tʃ ˈɝ tʃ
        result = dp_align("church", ["tʃ", "ˈɝ", "tʃ"], table)
        assert result is not None
        ch_entries = [r for r in result if r["grapheme"] == "ch"]
        assert len(ch_entries) == 2
        assert all(r["phonemes"] == ["tʃ"] for r in ch_entries)

    def test_returns_none_for_impossible_alignment(self, table):
        from workers.scripts.g2p_alignment import dp_align

        # More phonemes than 2x characters — DP cannot consume them all
        # (each char consumes at most 2 phonemes, so 3 chars can handle at most 6)
        result = dp_align("cat", ["ʒ", "ʊ", "ŋ", "k", "s", "t", "b"], table)
        assert result is None
- [ ] Step 2: Run tests to verify they fail
Run: cd /Users/jneumann/Repos/PhonoLex && python -m pytest tests/test_g2p_alignment.py::TestDPAlign -v
Expected: FAIL — ImportError: cannot import name 'dp_align'
- [ ] Step 3: Commit test additions
git add tests/test_g2p_alignment.py
git commit -m "test: add DP alignment tests for G2P"
Task 6: Implement DP aligner¶
Files:
- Modify: workers/scripts/g2p_alignment.py
- [ ] Step 1: Add DP alignment function
Add after the bootstrap_grapheme_table function:
# ---------------------------------------------------------------------------
# Phase 2: DP alignment
# ---------------------------------------------------------------------------
def dp_align(
    word: str,
    ipa: list[str],
    table: dict[str, set[str]],
) -> Optional[list[dict]]:
    """
    Find optimal character-to-phoneme alignment using dynamic programming.

    State (i, j) means the first i characters are aligned to the first j
    phonemes. Each alignment unit maps one grapheme span to 0-2 phonemes:

      * silent span             -> cost 2
      * 1 phoneme, known pair   -> cost 0 (pair observed in the bootstrap)
      * 1 phoneme, novel pair   -> cost 1
      * 2 phonemes              -> cost 1 (the 1:1 bootstrap can never learn
                                   2-phoneme mappings — words containing them
                                   have segment != phoneme count and are
                                   discarded in Phase 1 — so always novel)

    Multi-character grapheme spans are only tried when the span is a known
    grapheme in the bootstrap table; single characters are always allowed.
    Without this constraint a silent letter (cost 2) always loses to gluing
    it onto a neighbour as a novel multi-char grapheme (cost 1), so silent-e,
    apostrophes, and hyphens could never be marked silent.

    Ties in cost are broken toward fewer alignment units.

    Returns list of alignment entries
    [{"chars": [start, end], "grapheme": str, "phonemes": list[str]}]
    or None if no valid alignment exists (i.e. len(ipa) > 2 * len(word)).
    """
    n = len(word)
    m = len(ipa)
    # Longest span worth trying: the longest grapheme the table knows about.
    # Single characters remain candidates even when absent from the table.
    max_span = max((len(g) for g in table), default=1)
    # DP table: best (cost, num_units) to reach state (i, j)
    INF = (float("inf"), float("inf"))
    dp = [[INF] * (m + 1) for _ in range(n + 1)]
    dp[0][0] = (0, 0)
    # Backtrack: store (prev_i, prev_j, grapheme, phonemes_consumed) per state
    back: list[list[Optional[tuple[int, int, str, list[str]]]]] = [
        [None] * (m + 1) for _ in range(n + 1)
    ]
    for i in range(n):
        for j in range(m + 1):
            if dp[i][j] == INF:
                continue
            cur_cost, cur_units = dp[i][j]
            # Try all grapheme spans starting at position i
            for g in range(1, min(max_span, n - i) + 1):
                grapheme = word[i : i + g]
                # Multi-char spans must be known graphemes (see docstring).
                if g > 1 and grapheme not in table:
                    continue
                # Transition 1: silent (0 phonemes consumed)
                cand = (cur_cost + 2, cur_units + 1)
                if cand < dp[i + g][j]:
                    dp[i + g][j] = cand
                    back[i + g][j] = (i, j, grapheme, [])
                # Transition 2: 1 phoneme consumed
                if j < m:
                    known = table.get(grapheme, set())
                    cand = (cur_cost + (0 if ipa[j] in known else 1), cur_units + 1)
                    if cand < dp[i + g][j + 1]:
                        dp[i + g][j + 1] = cand
                        back[i + g][j + 1] = (i, j, grapheme, [ipa[j]])
                # Transition 3: 2 phonemes consumed (always novel, cost 1)
                if j + 1 < m:
                    cand = (cur_cost + 1, cur_units + 1)
                    if cand < dp[i + g][j + 2]:
                        dp[i + g][j + 2] = cand
                        back[i + g][j + 2] = (i, j, grapheme, [ipa[j], ipa[j + 1]])
    # Check if the goal state is reachable
    if dp[n][m] == INF:
        return None
    # Backtrack from (n, m) to (0, 0) to recover the alignment
    alignment = []
    ci, cj = n, m
    while ci > 0 or cj > 0:
        if back[ci][cj] is None:
            return None  # Should not happen if dp[n][m] != INF
        prev_i, prev_j, grapheme, phonemes = back[ci][cj]
        alignment.append(
            {"chars": [prev_i, ci], "grapheme": grapheme, "phonemes": phonemes}
        )
        ci, cj = prev_i, prev_j
    alignment.reverse()
    return alignment
- [ ] Step 2: Run tests to verify they pass
Run: cd /Users/jneumann/Repos/PhonoLex && python -m pytest tests/test_g2p_alignment.py::TestDPAlign -v
Expected: All PASS
- [ ] Step 3: Run all tests together
Run: cd /Users/jneumann/Repos/PhonoLex && python -m pytest tests/test_g2p_alignment.py -v
Expected: All tests PASS
- [ ] Step 4: Commit
git add workers/scripts/g2p_alignment.py
git commit -m "feat: add DP aligner for G2P character-to-phoneme alignment"
Chunk 4: Fallback, Export, and Full Pipeline + Tests¶
Task 7: Write fallback and export tests¶
Files:
- Modify: tests/test_g2p_alignment.py
- [ ] Step 1: Add fallback and export tests
Append to tests/test_g2p_alignment.py:
class TestFallbackAlignment:
    """Test the left-greedy proportional fallback for words that can't be DP-aligned."""

    def test_more_chars_than_phonemes(self):
        from workers.scripts.g2p_alignment import fallback_align

        # 5 chars, 3 phonemes -> first 3 chars get 1 phoneme each, rest get []
        result = fallback_align("abcde", ["p1", "p2", "p3"])
        assert len(result) == 5
        assert result[0]["phonemes"] == ["p1"]
        assert result[1]["phonemes"] == ["p2"]
        assert result[2]["phonemes"] == ["p3"]
        assert result[3]["phonemes"] == []
        assert result[4]["phonemes"] == []

    def test_more_phonemes_than_chars(self):
        from workers.scripts.g2p_alignment import fallback_align

        # 2 chars, 4 phonemes -> last char gets remaining phonemes
        result = fallback_align("ab", ["p1", "p2", "p3", "p4"])
        assert len(result) == 2
        assert result[0]["phonemes"] == ["p1"]
        assert result[1]["phonemes"] == ["p2", "p3", "p4"]

    def test_equal_chars_and_phonemes(self):
        from workers.scripts.g2p_alignment import fallback_align

        result = fallback_align("abc", ["p1", "p2", "p3"])
        assert len(result) == 3
        assert all(len(r["phonemes"]) == 1 for r in result)

    def test_char_spans_are_correct(self):
        from workers.scripts.g2p_alignment import fallback_align

        # Spans are half-open [start, end) over character indices.
        result = fallback_align("abc", ["p1", "p2", "p3"])
        assert result[0]["chars"] == [0, 1]
        assert result[1]["chars"] == [1, 2]
        assert result[2]["chars"] == [2, 3]
class TestAlignWord:
    """Test the top-level align_word function that combines DP + fallback."""

    @pytest.fixture
    def table(self):
        from workers.scripts.g2p_alignment import (
            load_cmu_dict,
            arpa_to_ipa,
            bootstrap_grapheme_table,
        )

        entries = load_cmu_dict(CMU_DICT_PATH)
        mapping = arpa_to_ipa(ARPA_TO_IPA_PATH)
        return bootstrap_grapheme_table(entries, mapping)

    def test_high_confidence_word(self, table):
        from workers.scripts.g2p_alignment import align_word

        alignment, confidence = align_word("cat", ["k", "ˈæ", "t"], table)
        assert confidence == "high"
        assert len(alignment) == 3

    def test_low_confidence_fallback(self, table):
        from workers.scripts.g2p_alignment import align_word

        # More phonemes than 2x characters — forces DP failure and fallback
        alignment, confidence = align_word("cat", ["ʒ", "ʊ", "ŋ", "k", "s", "t", "b"], table)
        assert confidence == "low"
        assert len(alignment) == 3  # One entry per character
class TestFullPipeline:
    """Integration test: run the full pipeline on a curated word set."""

    @pytest.fixture(scope="class")
    def pipeline_result(self):
        # scope="class": the pipeline is expensive, so run it once per class.
        from workers.scripts.g2p_alignment import run_pipeline

        return run_pipeline(CMU_DICT_PATH, ARPA_TO_IPA_PATH)

    def test_returns_dict(self, pipeline_result):
        result, stats = pipeline_result
        assert isinstance(result, dict)

    def test_has_many_words(self, pipeline_result):
        result, stats = pipeline_result
        assert len(result) > 100000

    def test_live_has_two_variants(self, pipeline_result):
        result, stats = pipeline_result
        assert "live" in result
        assert len(result["live"]) >= 2

    def test_cat_alignment_correct(self, pipeline_result):
        result, stats = pipeline_result
        assert "cat" in result
        cat = result["cat"][0]
        assert cat["variant"] == 0
        assert cat["confidence"] == "high"
        assert len(cat["alignment"]) == 3
        assert cat["alignment"][0]["grapheme"] == "c"
        assert cat["alignment"][0]["phonemes"] == ["k"]

    def test_high_confidence_rate(self, pipeline_result):
        """At least 90% of entries should have high confidence."""
        result, stats = pipeline_result
        assert stats["high_confidence_pct"] >= 90.0

    def test_stats_has_required_fields(self, pipeline_result):
        result, stats = pipeline_result
        assert "total_entries" in stats
        assert "unique_words" in stats
        assert "high_confidence" in stats
        assert "low_confidence" in stats
        assert "high_confidence_pct" in stats
- [ ] Step 2: Run tests to verify they fail
Run: cd /Users/jneumann/Repos/PhonoLex && python -m pytest tests/test_g2p_alignment.py::TestFallbackAlignment -v
Expected: FAIL — ImportError: cannot import name 'fallback_align'
- [ ] Step 3: Commit test additions
git add tests/test_g2p_alignment.py
git commit -m "test: add fallback, align_word, and pipeline integration tests"
Task 8: Implement fallback, align_word, run_pipeline, and main¶
Files:
- Modify: workers/scripts/g2p_alignment.py
- [ ] Step 1: Add fallback alignment
Add after the dp_align function:
def fallback_align(word: str, ipa: list[str]) -> list[dict]:
    """
    Left-greedy fallback for words the DP aligner cannot handle.

    Phonemes are handed out left-to-right, one per character; the last
    character absorbs any surplus phonemes, and characters beyond the
    phoneme count are marked silent (empty phoneme list).
    """
    char_count = len(word)
    phone_count = len(ipa)
    units = []
    for idx, ch in enumerate(word):
        if idx == char_count - 1 and idx < phone_count:
            # Last character: absorb every remaining phoneme.
            phones = list(ipa[idx:])
        elif idx < phone_count:
            # Normal case: one phoneme for this character.
            phones = [ipa[idx]]
        else:
            # Ran out of phonemes: remaining characters are silent.
            phones = []
        units.append({"chars": [idx, idx + 1], "grapheme": ch, "phonemes": phones})
    return units
- [ ] Step 2: Add align_word wrapper
def align_word(
    word: str,
    ipa: list[str],
    table: dict[str, set[str]],
) -> tuple[list[dict], str]:
    """
    Align a word's characters to its IPA phonemes.

    DP alignment is attempted first; when it fails, the left-greedy
    proportional fallback is used instead.

    Returns (alignment, confidence): "high" for a DP result, "low" for
    the fallback.
    """
    dp_result = dp_align(word, ipa, table)
    if dp_result is None:
        return fallback_align(word, ipa), "low"
    return dp_result, "high"
- [ ] Step 3: Add run_pipeline and update main
def run_pipeline(
    cmu_path: Path,
    ipa_mapping_path: Path,
) -> tuple[dict, dict]:
    """
    Run the full G2P alignment pipeline.

    Args:
        cmu_path: Path to the raw CMU pronouncing dictionary file.
        ipa_mapping_path: Path to the ARPAbet-to-IPA JSON mapping file.

    Returns:
        (alignment_data, stats) where:
        - alignment_data: dict[word -> list of variant alignments]
        - stats: dict with coverage statistics
    """
    t0 = time.time()
    # Load data
    print(f"Loading CMU dict from {cmu_path} ...")
    entries = load_cmu_dict(cmu_path)
    print(f" {len(entries):,} entries")
    # Plain string: the original f-prefix had no placeholders.
    print("Loading ARPAbet-to-IPA mapping ...")
    ipa_mapping = arpa_to_ipa(ipa_mapping_path)
    # Phase 1: greedy bootstrap builds the grapheme-to-phoneme table.
    print("Phase 1: Bootstrapping grapheme table ...")
    table = bootstrap_grapheme_table(entries, ipa_mapping)
    print(f" {len(table)} graphemes in table")
    # Phase 2: constrained DP alignment per entry, greedy fallback on failure.
    print("Phase 2: DP alignment ...")
    result: dict[str, list[dict]] = {}
    high_count = 0
    low_count = 0
    error_count = 0
    for word, variant, arpa_phones in entries:
        try:
            ipa_phones = convert_arpa_sequence(arpa_phones, ipa_mapping)
        except KeyError:
            # Unknown ARPAbet symbol: skip the entry rather than abort the run.
            error_count += 1
            continue
        alignment, confidence = align_word(word, ipa_phones, table)
        entry = {
            "variant": variant,
            "arpa": arpa_phones,
            "ipa": ipa_phones,
            "alignment": alignment,
            "confidence": confidence,
        }
        # setdefault replaces the manual membership-check-then-init pattern.
        result.setdefault(word, []).append(entry)
        if confidence == "high":
            high_count += 1
        else:
            low_count += 1
    total = high_count + low_count
    elapsed = time.time() - t0
    stats = {
        "total_entries": total,
        "unique_words": len(result),
        "high_confidence": high_count,
        "low_confidence": low_count,
        # Guard against division by zero on an empty dictionary.
        "high_confidence_pct": round(high_count / total * 100, 2) if total else 0,
        "errors_skipped": error_count,
        "elapsed_seconds": round(elapsed, 1),
    }
    print(f" {high_count:,} high confidence ({stats['high_confidence_pct']}%)")
    print(f" {low_count:,} low confidence")
    if error_count:
        print(f" {error_count:,} skipped (ARPAbet conversion errors)")
    print(f" Completed in {elapsed:.1f}s")
    return result, stats
if __name__ == "__main__":
    # Entry point: run the full pipeline, export the alignment JSON, and
    # print sample alignments for manual inspection.
    result, stats = run_pipeline(CMU_DICT_PATH, ARPA_TO_IPA_PATH)
    print(f"\nWriting {OUTPUT_PATH} ...")
    with open(OUTPUT_PATH, "w") as f:
        # Compact separators minimize file size for the ~134K-entry output;
        # ensure_ascii=False writes IPA characters directly, not \u escapes.
        json.dump(result, f, ensure_ascii=False, indent=None, separators=(",", ":"))
    file_size_mb = OUTPUT_PATH.stat().st_size / (1024 * 1024)
    print(f" {file_size_mb:.1f} MB written")
    # Print some sample alignments
    print("\n--- Sample Alignments ---")
    # "live" is included because it has multiple pronunciation variants.
    for sample_word in ["cat", "think", "make", "box", "church", "bartholomew", "live"]:
        if sample_word in result:
            # Show every variant so multi-pronunciation words are visible.
            for entry in result[sample_word]:
                print(f"\n{sample_word} (variant {entry['variant']}, {entry['confidence']}):")
                for a in entry["alignment"]:
                    phonemes_str = ", ".join(a["phonemes"]) if a["phonemes"] else "(silent)"
                    print(f" {a['grapheme']:>6} [{a['chars'][0]}:{a['chars'][1]}] -> {phonemes_str}")
- [ ] Step 4: Run tests to verify they pass
Run: cd /Users/jneumann/Repos/PhonoLex && python -m pytest tests/test_g2p_alignment.py -v
Expected: All tests PASS (the TestFullPipeline tests will take a few seconds since they run the full pipeline)
- [ ] Step 5: Run full export
Run: cd /Users/jneumann/Repos/PhonoLex && python workers/scripts/g2p_alignment.py
Expected: Produces data/g2p_alignment.json, prints coverage stats showing >90% high confidence
- [ ] Step 6: Add output file to .gitignore
Add data/g2p_alignment.json to .gitignore (it is not currently gitignored).
- [ ] Step 7: Commit
git add workers/scripts/g2p_alignment.py tests/test_g2p_alignment.py
# If .gitignore was modified:
git add .gitignore
git commit -m "feat: complete G2P alignment pipeline with fallback and JSON export"
Chunk 5: Validation and Coverage Report¶
Task 9: Run full pipeline and validate against curated examples¶
Files: none (this task creates no new files)
- [ ] Step 1: Run the full export and review sample output
Run: cd /Users/jneumann/Repos/PhonoLex && python workers/scripts/g2p_alignment.py
Manually inspect the printed sample alignments for correctness: - "cat": c->k, a->ˈæ, t->t - "think": th->θ, i->ˈɪ, n->ŋ, k->k - "make": m->m, a->ˈeɪ, k->k, e->(silent) - "box": b->b, o->ˈɑ, x->[k,s] - "church": ch->tʃ, ur->ˈɝ (the r is absorbed into the r-colored vowel), ch->tʃ - "bartholomew": see spec example - "live" variant 0 and 1: different vowels
- [ ] Step 2: Spot-check problematic words
Run: cd /Users/jneumann/Repos/PhonoLex && python -c "
import json
with open('data/g2p_alignment.json') as f:
data = json.load(f)
for word in ['colonel', 'knife', 'write', 'phone', 'enough', 'choir', 'o\\'brien', 'able-bodied']:
if word in data:
e = data[word][0]
print(f'{word} ({e[\"confidence\"]}):')
for a in e['alignment']:
ph = ', '.join(a['phonemes']) if a['phonemes'] else '(silent)'
print(f' {a[\"grapheme\"]:>6} -> {ph}')
print()
else:
print(f'{word}: NOT FOUND\n')
"
Review output for correctness. Flag any systematic issues.
- [ ] Step 3: Check low-confidence words
Run: cd /Users/jneumann/Repos/PhonoLex && python -c "
import json
with open('data/g2p_alignment.json') as f:
data = json.load(f)
low = []
for word, variants in data.items():
for v in variants:
if v['confidence'] == 'low':
low.append((word, v['variant']))
print(f'Low confidence: {len(low)} entries')
print(f'First 20:')
for w, v in low[:20]:
print(f' {w} (variant {v})')
"
Review to understand what kinds of words fall through to fallback.
- [ ] Step 4: Run full test suite one final time
Run: cd /Users/jneumann/Repos/PhonoLex && python -m pytest tests/test_g2p_alignment.py -v
Expected: All PASS
- [ ] Step 5: Final commit if any fixes were needed
git add -A
git commit -m "fix: address alignment issues found during validation"
(Skip this step if no fixes were needed.)