Skip to content

Commit e8cbf0b

Browse files
apolmigapolmig
authored andcommitted
feat: add language selection module with CLI/Gradio support
1 parent ce0ce23 commit e8cbf0b

2 files changed

Lines changed: 193 additions & 0 deletions

File tree

src/kugelaudio_open/languages.py

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
"""
2+
Language configuration for KugelAudio Open.
3+
4+
Structured metadata for the 23 supported European languages with quality
5+
tiers based on YODAS2 training data coverage.
6+
7+
Ref: https://github.com/Kugelaudio/kugelaudio-open/issues/10
8+
"""
9+
10+
from dataclasses import dataclass
11+
from typing import Dict, List, Optional
12+
13+
@dataclass(frozen=True)
class Language:
    """Immutable record describing one supported language.

    Instances are frozen: assigning to a field after construction raises
    ``dataclasses.FrozenInstanceError`` (a subclass of ``AttributeError``).
    """

    # Field order matters: the language table constructs entries positionally.
    code: str  # ISO 639-1 code
    name: str  # English name
    native_name: str  # Endonym (the language's own name for itself)
    flag: str  # Flag emoji displayed next to the name
    tier: str  # Quality tier: "high" | "medium" | "limited"
20+
21+
# Quality tiers reflect YODAS2 dataset representation (~200k hours).
# Each key is taken from the entry's own ``code`` field, so the mapping key
# and the record can never disagree. Insertion order matches the listing below.
LANGUAGES: Dict[str, Language] = {
    entry.code: entry
    for entry in (
        # high
        Language("en", "English", "English", "🇺🇸", "high"),
        Language("de", "German", "Deutsch", "🇩🇪", "high"),
        Language("fr", "French", "Français", "🇫🇷", "high"),
        Language("es", "Spanish", "Español", "🇪🇸", "high"),
        # medium
        Language("it", "Italian", "Italiano", "🇮🇹", "medium"),
        Language("pt", "Portuguese", "Português", "🇵🇹", "medium"),
        Language("nl", "Dutch", "Nederlands", "🇳🇱", "medium"),
        Language("pl", "Polish", "Polski", "🇵🇱", "medium"),
        Language("ru", "Russian", "Русский", "🇷🇺", "medium"),
        Language("uk", "Ukrainian", "Українська", "🇺🇦", "medium"),
        Language("cs", "Czech", "Čeština", "🇨🇿", "medium"),
        # limited
        Language("ro", "Romanian", "Română", "🇷🇴", "limited"),
        Language("hu", "Hungarian", "Magyar", "🇭🇺", "limited"),
        Language("sv", "Swedish", "Svenska", "🇸🇪", "limited"),
        Language("da", "Danish", "Dansk", "🇩🇰", "limited"),
        Language("fi", "Finnish", "Suomi", "🇫🇮", "limited"),
        Language("no", "Norwegian", "Norsk", "🇳🇴", "limited"),
        Language("el", "Greek", "Ελληνικά", "🇬🇷", "limited"),
        Language("bg", "Bulgarian", "Български", "🇧🇬", "limited"),
        Language("sk", "Slovak", "Slovenčina", "🇸🇰", "limited"),
        Language("hr", "Croatian", "Hrvatski", "🇭🇷", "limited"),
        Language("sr", "Serbian", "Српски", "🇷🇸", "limited"),
        Language("tr", "Turkish", "Türkçe", "🇹🇷", "limited"),
    )
}
47+
48+
# Display order: the high tier comes first in a hand-picked order, followed by
# the medium and then the limited tier, each sorted alphabetically by code.
DISPLAY_ORDER: List[str] = (
    ["en", "de", "fr", "es"]  # high
    + ["cs", "it", "nl", "pl", "pt", "ru", "uk"]  # medium
    + ["bg", "da", "el", "fi", "hr", "hu", "no", "ro", "sk", "sr", "sv", "tr"]  # limited
)

# Code used when no usable language can be determined.
DEFAULT_LANG = "en"
56+
57+
58+
def get(code: str) -> Optional[Language]:
    """Return the ``Language`` registered for *code*, or ``None`` if unknown.

    The lookup lower-cases *code* and ignores surrounding whitespace, so
    ``"DE"`` and ``" de "`` both resolve to the German entry.
    """
    normalized = code.lower().strip()
    if normalized in LANGUAGES:
        return LANGUAGES[normalized]
    return None
61+
62+
63+
def codes() -> List[str]:
    """Return every supported language code, in display order.

    A new list is built on each call, so callers may mutate the result
    without affecting ``DISPLAY_ORDER``.
    """
    return [*DISPLAY_ORDER]
66+
67+
68+
def validate(code: str) -> str:
    """Return *code* lower-cased and stripped, provided it is supported.

    Raises:
        ValueError: If the normalized code is not a key of ``LANGUAGES``. The
            message echoes the caller's original input and lists all
            supported codes in display order.
    """
    normalized = code.lower().strip()
    if normalized in LANGUAGES:
        return normalized
    supported = ", ".join(codes())
    raise ValueError(f"Unsupported language: '{code}'. Supported: {supported}")
77+
78+
79+
def quality_warning(code: str) -> Optional[str]:
    """Return a user-facing warning when *code* is a limited-tier language.

    Returns ``None`` for unknown codes and for languages in any other tier.
    """
    lang = get(code)
    if lang is None or lang.tier != "limited":
        return None
    return (
        f"⚠️ {lang.name} has limited training data. "
        "Best quality: en, de, fr, es."
    )
88+
89+
90+
# ── Gradio helpers ────────────────────────────────────────────────────────────
91+
92+
def gradio_choices() -> List[str]:
    """Build the dropdown labels, one per language, in display order.

    Each label has the form ``'🇩🇪 German (de)'``; limited-tier languages get
    a trailing warning marker.
    """
    entries = ((code, LANGUAGES[code]) for code in DISPLAY_ORDER)
    return [
        f"{lang.flag} {lang.name} ({code})" + (" ⚠️" if lang.tier == "limited" else "")
        for code, lang in entries
    ]
100+
101+
102+
def parse_gradio_choice(choice: str) -> str:
    """Extract the language code from a Gradio dropdown value.

    Dropdown labels look like ``'🇩🇪 German (de)'``, optionally followed by a
    warning marker; the code is the text inside the last ``(...)`` group.

    Args:
        choice: The label selected in the dropdown.

    Returns:
        The text inside the last parenthesized group with surrounding
        whitespace removed, or ``DEFAULT_LANG`` when *choice* is not a string
        or contains no non-empty parenthesized group.
    """
    if not isinstance(choice, str):
        return DEFAULT_LANG
    # Splitting a string on a delimiter it does not contain never raises, so
    # a label without parentheses has to be detected explicitly; relying on
    # an exception would hand the whole input back as if it were a code.
    _, opened, tail = choice.rpartition("(")
    inner, closed, _ = tail.partition(")")
    extracted = inner.strip()
    if not (opened and closed and extracted):
        return DEFAULT_LANG
    return extracted

tests/test_languages.py

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
"""Tests for kugelaudio_open.languages — run with: pytest tests/test_languages.py -v"""
2+
3+
import pytest
4+
from kugelaudio_open.languages import (
5+
LANGUAGES, DISPLAY_ORDER, DEFAULT_LANG,
6+
get, codes, validate, quality_warning,
7+
gradio_choices, parse_gradio_choice,
8+
)
9+
10+
11+
class TestLanguageData:
    """Sanity checks on the static language table."""

    def test_count(self):
        """The table holds exactly 23 languages."""
        assert len(LANGUAGES) == 23

    def test_display_order_consistent(self):
        """DISPLAY_ORDER and LANGUAGES cover the same set of codes."""
        assert set(DISPLAY_ORDER) == set(LANGUAGES)

    def test_high_tier(self):
        """The four best-covered languages are tagged as high tier."""
        for high_code in ("en", "de", "fr", "es"):
            assert LANGUAGES[high_code].tier == "high"

    def test_all_fields_populated(self):
        """Every entry is keyed by its own code and has no empty field."""
        allowed_tiers = ("high", "medium", "limited")
        for key, entry in LANGUAGES.items():
            assert entry.code == key
            assert entry.name and entry.native_name and entry.flag
            assert entry.tier in allowed_tiers

    def test_frozen(self):
        """Entries are immutable."""
        english = LANGUAGES["en"]
        with pytest.raises(AttributeError):
            english.name = "Nope"
31+
32+
33+
class TestLookup:
    """Behaviour of ``get``."""

    def test_valid(self):
        """A known code resolves to its entry."""
        german = get("de")
        assert german.name == "German"

    def test_case_insensitive(self):
        """Upper-case input is accepted."""
        assert get("DE") is not None

    def test_whitespace(self):
        """Surrounding whitespace is ignored."""
        assert get(" fr ") is not None

    def test_invalid(self):
        """An unknown code yields None."""
        assert get("xx") is None
45+
46+
47+
class TestValidate:
    """Behaviour of ``validate``."""

    def test_ok(self):
        """Supported codes come back normalized to lower case."""
        for raw, expected in (("en", "en"), ("DE", "de")):
            assert validate(raw) == expected

    def test_bad(self):
        """Unsupported codes raise ValueError with an explanatory message."""
        with pytest.raises(ValueError, match="Unsupported"):
            validate("xx")
55+
56+
57+
class TestQualityWarning:
    """Behaviour of ``quality_warning``."""

    def test_high_none(self):
        """High-tier languages produce no warning."""
        assert quality_warning("en") is None

    def test_medium_none(self):
        """Medium-tier languages produce no warning."""
        assert quality_warning("it") is None

    def test_limited_warns(self):
        """Limited-tier languages produce a warning carrying the marker."""
        warning = quality_warning("bg")
        assert warning and "⚠️" in warning
67+
68+
69+
class TestGradio:
    """Behaviour of the Gradio dropdown helpers."""

    def test_choices_count(self):
        """There is one dropdown label per language."""
        assert len(gradio_choices()) == len(LANGUAGES)

    def test_limited_has_warning(self):
        """Limited-tier labels carry the warning marker."""
        bulgarian = next(label for label in gradio_choices() if "(bg)" in label)
        assert "⚠️" in bulgarian

    def test_high_no_warning(self):
        """High-tier labels carry no warning marker."""
        english = next(label for label in gradio_choices() if "(en)" in label)
        assert "⚠️" not in english

    def test_parse_roundtrip(self):
        """Parsing each generated label recovers the code it was built from."""
        for label, expected_code in zip(gradio_choices(), codes()):
            assert parse_gradio_choice(label) == expected_code

    def test_parse_fallback(self):
        """Input without a parenthesized code falls back to the default."""
        assert parse_gradio_choice("garbage") == DEFAULT_LANG

0 commit comments

Comments
 (0)