|
| 1 | +""" |
| 2 | +Language configuration for KugelAudio Open. |
| 3 | +
|
| 4 | +Structured metadata for the 24 supported European languages with quality |
| 5 | +tiers based on YODAS2 training data coverage. |
| 6 | +
|
| 7 | +Ref: https://github.com/Kugelaudio/kugelaudio-open/issues/10 |
| 8 | +""" |
| 9 | + |
| 10 | +from dataclasses import dataclass |
| 11 | +from typing import Dict, List, Optional |
| 12 | + |
@dataclass(frozen=True)
class Language:
    """Immutable record describing one supported language.

    Instances are built positionally in the order
    ``code, name, native_name, flag, tier``.
    """

    # ISO 639-1 code, e.g. "de".
    code: str
    # Language name in English.
    name: str
    # Endonym: the language's name written in the language itself.
    native_name: str
    # Flag string displayed in front of the name in UI labels.
    flag: str
    # Quality tier: "high" | "medium" | "limited".
    tier: str
| 20 | + |
# Quality tiers reflect YODAS2 dataset representation (~200k hours).
# Each key is taken from its entry's ``code`` field, so the mapping key and
# the record cannot drift apart. Insertion order follows the tuple below.
# NOTE(review): the module docstring mentions 24 supported languages, but this
# table lists 23 entries — confirm whether one is missing.
LANGUAGES: Dict[str, Language] = {
    entry.code: entry
    for entry in (
        Language("en", "English", "English", "🇺🇸", "high"),
        Language("de", "German", "Deutsch", "🇩🇪", "high"),
        Language("fr", "French", "Français", "🇫🇷", "high"),
        Language("es", "Spanish", "Español", "🇪🇸", "high"),
        Language("it", "Italian", "Italiano", "🇮🇹", "medium"),
        Language("pt", "Portuguese", "Português", "🇵🇹", "medium"),
        Language("nl", "Dutch", "Nederlands", "🇳🇱", "medium"),
        Language("pl", "Polish", "Polski", "🇵🇱", "medium"),
        Language("ru", "Russian", "Русский", "🇷🇺", "medium"),
        Language("uk", "Ukrainian", "Українська", "🇺🇦", "medium"),
        Language("cs", "Czech", "Čeština", "🇨🇿", "medium"),
        Language("ro", "Romanian", "Română", "🇷🇴", "limited"),
        Language("hu", "Hungarian", "Magyar", "🇭🇺", "limited"),
        Language("sv", "Swedish", "Svenska", "🇸🇪", "limited"),
        Language("da", "Danish", "Dansk", "🇩🇰", "limited"),
        Language("fi", "Finnish", "Suomi", "🇫🇮", "limited"),
        Language("no", "Norwegian", "Norsk", "🇳🇴", "limited"),
        Language("el", "Greek", "Ελληνικά", "🇬🇷", "limited"),
        Language("bg", "Bulgarian", "Български", "🇧🇬", "limited"),
        Language("sk", "Slovak", "Slovenčina", "🇸🇰", "limited"),
        Language("hr", "Croatian", "Hrvatski", "🇭🇷", "limited"),
        Language("sr", "Serbian", "Српски", "🇷🇸", "limited"),
        Language("tr", "Turkish", "Türkçe", "🇹🇷", "limited"),
    )
}
| 47 | + |
# Display order, grouped by quality tier: the high tier comes first in a
# hand-picked order, followed by the medium and then the limited tier, each
# sorted alphabetically by code.
DISPLAY_ORDER: List[str] = (
    ["en", "de", "fr", "es"]
    + ["cs", "it", "nl", "pl", "pt", "ru", "uk"]
    + ["bg", "da", "el", "fi", "hr", "hu", "no", "ro", "sk", "sr", "sv", "tr"]
)

# Language code used as the fallback.
DEFAULT_LANG = "en"
| 56 | + |
| 57 | + |
def get(code: str) -> Optional[Language]:
    """Return the ``Language`` for an ISO 639-1 code, or ``None`` if unknown.

    The lookup ignores letter case and surrounding whitespace.
    """
    normalized = code.lower().strip()
    return LANGUAGES.get(normalized)
| 61 | + |
| 62 | + |
def codes() -> List[str]:
    """Return a new list of the supported language codes in display order."""
    # Unpack into a fresh list so callers cannot mutate DISPLAY_ORDER.
    return [*DISPLAY_ORDER]
| 66 | + |
| 67 | + |
def validate(code: str) -> str:
    """Normalize a language code and confirm that it is supported.

    Returns the lower-cased, whitespace-stripped code.

    Raises:
        ValueError: if the normalized code is not a key of ``LANGUAGES``; the
            message quotes the code as given and lists the supported codes.
    """
    normalized = code.lower().strip()
    if normalized in LANGUAGES:
        return normalized

    supported = ", ".join(codes())
    raise ValueError(f"Unsupported language: '{code}'. Supported: {supported}")
| 77 | + |
| 78 | + |
def quality_warning(code: str) -> Optional[str]:
    """Build the low-data warning for a language, if one applies.

    Returns the warning text when the code resolves to a "limited"-tier
    language; returns ``None`` for unknown codes and for every other tier.
    """
    language = get(code)
    if language is None or language.tier != "limited":
        return None
    return (
        f"⚠️ {language.name} has limited training data. "
        "Best quality: en, de, fr, es."
    )
| 88 | + |
| 89 | + |
| 90 | +# ── Gradio helpers ──────────────────────────────────────────────────────────── |
| 91 | + |
def gradio_choices() -> List[str]:
    """Build the Gradio dropdown labels, one per code in display order.

    Each label looks like '🇩🇪 German (de)'; limited-tier languages get a
    trailing ' ⚠️' marker.
    """
    def label(code: str) -> str:
        language = LANGUAGES[code]
        marker = " ⚠️" if language.tier == "limited" else ""
        return f"{language.flag} {language.name} ({code}){marker}"

    return [label(code) for code in DISPLAY_ORDER]
| 100 | + |
| 101 | + |
def parse_gradio_choice(choice: str) -> str:
    """Extract the language code from a Gradio dropdown value.

    The code is read from the last parenthesised group of the label, e.g.
    '🇩🇪 German (de)' -> 'de'; anything after the closing parenthesis (such
    as a trailing warning marker) is ignored. The result is stripped and
    lower-cased, matching the normalization done by ``get`` and ``validate``.

    Returns ``DEFAULT_LANG`` when ``choice`` is not a string (for example
    ``None``), contains no opening parenthesis, or has an empty group.
    """
    if not isinstance(choice, str):
        return DEFAULT_LANG

    # rpartition reports whether "(" was found at all. str.split always
    # returns at least one element, so indexing its result never raises and a
    # label without parentheses would be passed through unchanged instead of
    # falling back to the default.
    _, paren, tail = choice.rpartition("(")
    if not paren:
        return DEFAULT_LANG

    # A missing ")" is tolerated: everything after the last "(" is used.
    code = tail.partition(")")[0].strip().lower()
    return code or DEFAULT_LANG
0 commit comments