🚀 Try this Example

View the complete source code on GitHub.
from typing import Literal

import fal
from fal.exceptions import FieldException
from fal.toolkit import File
from fastapi import Response
from pydantic import BaseModel, Field


class AmEnglishRequest(BaseModel):
    prompt: str = Field(
        default="",
        examples=[
            "The future belongs to those who believe in the beauty of their dreams. So, dream big, work hard, and make it happen!"
        ],
        ui={"important": True},
    )
    text: str = Field(
        default="",
        examples=[
            "The future belongs to those who believe in the beauty of their dreams. So, dream big, work hard, and make it happen!"
        ],
    )
    # Use Literal for voice to restrict to specific enum values
    voice: Literal[
        "af_heart",
        "af_alloy",
        "af_aoede",
        "af_bella",
        "af_jessica",
        "af_kore",
        "af_nicole",
        "af_nova",
        "af_river",
        "af_sarah",
        "af_sky",
        "am_adam",
        "am_echo",
        "am_eric",
        "am_fenrir",
        "am_liam",
        "am_michael",
        "am_onyx",
        "am_puck",
        "am_santa",
    ] = Field(
        examples=["af_heart"],
        default="af_heart",
        description="Voice ID for the desired voice.",
    )
    speed: float = Field(
        default=1.0,
        ge=0.1,
        le=5.0,
        description="Speed of the generated audio. Default is 1.0.",
    )


class BrEnglishRequest(BaseModel):
    prompt: str = Field(
        examples=[
            "Ladies and gentlemen, welcome aboard. Please ensure your seatbelt is fastened and your tray table is stowed as we prepare for takeoff."
        ]
    )
    voice: Literal[
        "bf_alice",
        "bf_emma",
        "bf_isabella",
        "bf_lily",
        "bm_daniel",
        "bm_fable",
        "bm_george",
        "bm_lewis",
    ] = Field(
        examples=["bf_alice"],
        description="Voice ID for the desired voice.",
    )
    speed: float = Field(
        default=1.0,
        ge=0.1,
        le=5.0,
        description="Speed of the generated audio. Default is 1.0.",
    )


class JapaneseRequest(BaseModel):
    prompt: str = Field(examples=["夢を追いかけることを恐れないでください。努力すれば、必ず道は開けます!"])
    voice: Literal[
        "jf_alpha",
        "jf_gongitsune",
        "jf_nezumi",
        "jf_tebukuro",
        "jm_kumo",
    ] = Field(
        examples=["jf_alpha"],
        description="Voice ID for the desired voice.",
    )
    speed: float = Field(
        default=1.0,
        ge=0.1,
        le=5.0,
        description="Speed of the generated audio. Default is 1.0.",
    )


class AmEngOutput(BaseModel):
    audio: File = Field(
        description="The generated music",
        examples=[
            File._from_url(
                "https://fal.media/files/elephant/dXVMqWsBDG9yan3kaOT0Z_tmp0vvkha3s.wav"
            )
        ],
    )


class BrEngOutput(BaseModel):
    audio: File = Field(
        description="The generated music",
        examples=[
            File._from_url(
                "https://fal.media/files/kangaroo/4wpA60Kum6UjOVBKJoNyL_tmpxfrkn95k.wav"
            )
        ],
    )


class JapaneseOutput(BaseModel):
    audio: File = Field(
        description="The generated music",
        examples=[
            File._from_url(
                "https://fal.media/files/lion/piLhqKO8LJxrWaNg2dVUv_tmpp6eff6zl.wav"
            )
        ],
    )


class Kokoro(
    fal.App,
    min_concurrency=0,  # type: ignore
    max_concurrency=1,  # type: ignore
    keep_alive=3000,  # type: ignore
    name="kokoro",  # type: ignore
):
    requirements = [
        "kokoro==0.8.4",
        "soundfile==0.13.1",
        "misaki[en]==0.8.4",
        "misaki[ja]==0.8.4",
        "misaki[zh]==0.8.4",
        "numpy==1.26.4",
        "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl",
    ]
    machine_type = "L"  # Use a CPU machine type since Kokoro is only 82M parameters and runs efficiently on CPU

    async def setup(self):
        from kokoro import KPipeline

        self.pipelines = {}
        self.pipelines["American English"] = KPipeline(lang_code="a")
        self.pipelines["British English"] = KPipeline(lang_code="b")
        self.pipelines["Japanese"] = KPipeline(lang_code="j")

    async def _generate(
        self,
        request: AmEnglishRequest,
        response: Response,
        language: str = "American English",
    ):
        prompt = request.prompt or request.text
        if len(prompt) >= 20000:
            # Use FieldException to render the error nicely in the UI
            raise FieldException(
                field="prompt",
                message="Prompt must be less than 20000 characters.",
            )

        import tempfile

        import numpy as np
        import soundfile as sf

        pipeline = self.pipelines[language]
        generator = pipeline(
            prompt,
            voice=request.voice,
            speed=request.speed,
            split_pattern=r"\n+",
        )
        for i, (gs, ps, audio) in enumerate(generator):
            if i == 0:
                final_audio = audio.detach().cpu().numpy()
            else:
                audio = audio.detach().cpu().numpy()
                final_audio = np.concatenate((final_audio, audio), axis=0)

        # Bill a minimum of 1 unit, scaling by 1 unit per 1,000 characters of input
        response.headers["x-fal-billable-units"] = str(max(1, len(prompt) // 1000))

        # Save the audio to a temporary file, then upload it to the CDN via the File object
        with tempfile.NamedTemporaryFile(suffix=".wav") as f:
            sf.write(f.name, final_audio, 24000)
            return AmEngOutput(
                audio=File.from_path(f.name, content_type="audio/wav", repository="cdn")
            )

    @fal.endpoint("/")
    async def generate(
        self, request: AmEnglishRequest, response: Response
    ) -> AmEngOutput:
        return await self._generate(request, response, language="American English")

    @fal.endpoint("/american-english")
    async def generate_am_english(
        self, request: AmEnglishRequest, response: Response
    ) -> AmEngOutput:
        return await self._generate(request, response, language="American English")

    @fal.endpoint("/british-english")
    async def generate_br_english(
        self, request: BrEnglishRequest, response: Response
    ) -> BrEngOutput:
        return await self._generate(request, response, language="British English")

    @fal.endpoint("/japanese")
    async def generate_japanese(
        self, request: JapaneseRequest, response: Response
    ) -> JapaneseOutput:
        return await self._generate(request, response, language="Japanese")


...  # Define the rest of the languages as endpoints similarly
Or clone this repository:
git clone https://github.com/fal-ai-community/fal-demos.git
cd fal-demos
pip install -e .
# Use the app name (defined in pyproject.toml)
fal run kokoro
# Or use the full file path:
# fal run fal_demos/tts/kokoro.py::Kokoro
Before you run, make sure you have:
  • Authenticated with fal: fal auth login
  • Activated your virtual environment (recommended): python -m venv venv && source venv/bin/activate (macOS/Linux) or venv\Scripts\activate (Windows)

Key Features

  • Multi-Language Support: American English, British English, Japanese with native voices
  • CPU-Efficient Deployment: Lightweight 82M parameter model runs efficiently on CPU
  • Multiple Endpoints: Language-specific endpoints with shared generation logic
  • Voice Variety: Multiple voice options for each supported language
  • Audio Streaming: Generator-based audio processing for memory efficiency
  • Character-Based Billing: Usage-based pricing tied to text length
  • Advanced Validation: Custom error handling with user-friendly messages
  • Audio File Management: Temporary file handling and CDN integration

When to Use CPU Deployment

CPU deployment is ideal when:
  • Models are lightweight (< 100M parameters)
  • Inference is fast enough on CPU
  • Cost optimization is important
  • GPU resources are not required
  • Multiple concurrent requests can share CPU resources efficiently
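Before committing to a machine type, it is worth timing the model locally to confirm it clears the "fast enough on CPU" bar. A minimal sketch, assuming kokoro==0.8.4 is installed locally; it reuses the same KPipeline call pattern as the app code below:

import time

from kokoro import KPipeline

# lang_code "a" is American English, matching the setup() method shown later
pipeline = KPipeline(lang_code="a")

sample = "The future belongs to those who believe in the beauty of their dreams."

start = time.perf_counter()
chunks = 0
for gs, ps, audio in pipeline(sample, voice="af_heart", speed=1.0):
    chunks += 1  # each iteration yields one synthesized audio chunk
elapsed = time.perf_counter() - start
print(f"Synthesized {chunks} chunk(s) in {elapsed:.2f}s on CPU")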

Project Setup

from typing import Literal

import fal
from fal.exceptions import FieldException
from fal.toolkit import File
from fastapi import Response
from pydantic import BaseModel, Field

Language-Specific Input Models

Define input models for each supported language with appropriate voice options:
class AmEnglishRequest(BaseModel):
    prompt: str = Field(
        default="",
        examples=[
            "The future belongs to those who believe in the beauty of their dreams. So, dream big, work hard, and make it happen!"
        ],
        ui={"important": True},
    )
    text: str = Field(
        default="",
        examples=[
            "The future belongs to those who believe in the beauty of their dreams. So, dream big, work hard, and make it happen!"
        ],
    )
    voice: Literal[
        "af_heart",    # American Female voices
        "af_alloy",
        "af_aoede",
        "af_bella",
        "af_jessica",
        "af_kore",
        "af_nicole",
        "af_nova",
        "af_river",
        "af_sarah",
        "af_sky",
        "am_adam",     # American Male voices
        "am_echo",
        "am_eric",
        "am_fenrir",
        "am_liam",
        "am_michael",
        "am_onyx",
        "am_puck",
        "am_santa",
    ] = Field(
        examples=["af_heart"],
        default="af_heart",
        description="Voice ID for the desired voice.",
    )
    speed: float = Field(
        default=1.0,
        ge=0.1,
        le=5.0,
        description="Speed of the generated audio. Default is 1.0.",
    )

class BrEnglishRequest(BaseModel):
    prompt: str = Field(
        examples=[
            "Ladies and gentlemen, welcome aboard. Please ensure your seatbelt is fastened and your tray table is stowed as we prepare for takeoff."
        ]
    )
    voice: Literal[
        "bf_alice",    # British Female voices
        "bf_emma",
        "bf_isabella",
        "bf_lily",
        "bm_daniel",   # British Male voices
        "bm_fable",
        "bm_george",
        "bm_lewis",
    ] = Field(
        examples=["bf_alice"],
        description="Voice ID for the desired voice.",
    )
    speed: float = Field(
        default=1.0,
        ge=0.1,
        le=5.0,
        description="Speed of the generated audio. Default is 1.0.",
    )

class JapaneseRequest(BaseModel):
    prompt: str = Field(
        examples=["夢を追いかけることを恐れないでください。努力すれば、必ず道は開けます!"]
    )
    voice: Literal[
        "jf_alpha",    # Japanese Female voices
        "jf_gongitsune",
        "jf_nezumi",
        "jf_tebukuro",
        "jm_kumo",     # Japanese Male voices
    ] = Field(
        examples=["jf_alpha"],
        description="Voice ID for the desired voice.",
    )
    speed: float = Field(
        default=1.0,
        ge=0.1,
        le=5.0,
        description="Speed of the generated audio. Default is 1.0.",
    )
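Because speed carries ge/le constraints and voice is a Literal, pydantic rejects invalid inputs before the handler ever runs. A quick illustration (plain pydantic, nothing fal-specific):

from pydantic import ValidationError

req = AmEnglishRequest(prompt="Hello there!", speed=1.5)
print(req.voice, req.speed)  # af_heart 1.5 (defaults and bounds both respected)

try:
    AmEnglishRequest(prompt="Hello there!", speed=9.0)  # above le=5.0
except ValidationError as exc:
    print(exc)  # pydantic names the offending field and constraint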

Language-Specific Output Models

class AmEngOutput(BaseModel):
    audio: File = Field(
        description="The generated audio",
        examples=[
            File._from_url(
                "https://fal.media/files/elephant/dXVMqWsBDG9yan3kaOT0Z_tmp0vvkha3s.wav"
            )
        ],
    )

class BrEngOutput(BaseModel):
    audio: File = Field(
        description="The generated audio",
        examples=[
            File._from_url(
                "https://fal.media/files/kangaroo/4wpA60Kum6UjOVBKJoNyL_tmpxfrkn95k.wav"
            )
        ],
    )

class JapaneseOutput(BaseModel):
    audio: File = Field(
        description="The generated audio",
        examples=[
            File._from_url(
                "https://fal.media/files/lion/piLhqKO8LJxrWaNg2dVUv_tmpp6eff6zl.wav"
            )
        ],
    )

Application Configuration for CPU Deployment

class Kokoro(
    fal.App,
    min_concurrency=0,
    max_concurrency=1,
    keep_alive=3000,  # Longer keep-alive for TTS services
    name="kokoro",
):
    requirements = [
        "kokoro==0.8.4",
        "soundfile==0.13.1",
        "misaki[en]==0.8.4",  # English language support
        "misaki[ja]==0.8.4",  # Japanese language support
        "misaki[zh]==0.8.4",  # Chinese language support
        "numpy==1.26.4",
        # Spacy model for English NLP
        "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl",
    ]
    machine_type = "L"  # CPU machine - efficient for lightweight models

    async def setup(self):
        from kokoro import KPipeline

        # Initialize pipelines for each supported language
        self.pipelines = {}
        self.pipelines["American English"] = KPipeline(lang_code="a")
        self.pipelines["British English"] = KPipeline(lang_code="b")
        self.pipelines["Japanese"] = KPipeline(lang_code="j")

Shared Generation Logic

Create a reusable generation method that handles all languages:
async def _generate(
    self,
    request: AmEnglishRequest,
    response: Response,
    language: str = "American English",
):
    # Handle both 'prompt' and 'text' fields for backwards compatibility
    prompt = request.prompt or request.text

    # Custom validation with user-friendly error messages
    if len(prompt) >= 20000:
        raise FieldException(
            field="prompt",
            message="Prompt must be less than 20000 characters.",
        )

    import tempfile
    import numpy as np
    import soundfile as sf

    # Get the appropriate pipeline for the language
    pipeline = self.pipelines[language]

    # Generate audio using streaming approach
    generator = pipeline(
        prompt,
        voice=request.voice,
        speed=request.speed,
        split_pattern=r"\n+",  # Split on line breaks for better pacing
    )

    # Process audio chunks and concatenate
    for i, (gs, ps, audio) in enumerate(generator):
        if i == 0:
            final_audio = audio.detach().cpu().numpy()
        else:
            audio = audio.detach().cpu().numpy()
            final_audio = np.concatenate((final_audio, audio), axis=0)

    # Character-based billing calculation
    response.headers["x-fal-billable-units"] = str(max(1, len(prompt) // 1000))

    # Save audio to temporary file and upload to CDN
    with tempfile.NamedTemporaryFile(suffix=".wav") as f:
        sf.write(f.name, final_audio, 24000)  # 24kHz sample rate
        return AmEngOutput(
            audio=File.from_path(
                f.name,
                content_type="audio/wav",
                repository="cdn"  # Upload to CDN for fast access
            )
        )

Multiple Endpoint Definitions

Define language-specific endpoints using the shared generation logic:
@fal.endpoint("/")
async def generate(
    self, request: AmEnglishRequest, response: Response
) -> AmEngOutput:
    return await self._generate(request, response, language="American English")

@fal.endpoint("/american-english")
async def generate_am_english(
    self, request: AmEnglishRequest, response: Response
) -> AmEngOutput:
    return await self._generate(request, response, language="American English")

@fal.endpoint("/british-english")
async def generate_br_english(
    self, request: BrEnglishRequest, response: Response
) -> BrEngOutput:
    return await self._generate(request, response, language="British English")

@fal.endpoint("/japanese")
async def generate_japanese(
    self, request: JapaneseRequest, response: Response
) -> JapaneseOutput:
    return await self._generate(request, response, language="Japanese")

Key Concepts and Best Practices

CPU-Efficient Deployment

Why CPU for TTS:
  • Kokoro is only 82M parameters - runs efficiently on CPU
  • Lower cost compared to GPU instances
  • Sufficient performance for real-time TTS
  • Better resource utilization for multiple concurrent requests

Audio Streaming and Memory Management

Generator-based processing:
# Stream audio generation to handle long texts efficiently
generator = pipeline(prompt, voice=request.voice, speed=request.speed)

# Process chunks incrementally
for i, (gs, ps, audio) in enumerate(generator):
    audio = audio.detach().cpu().numpy()  # convert each chunk to a NumPy array
    if i == 0:
        final_audio = audio
    else:
        final_audio = np.concatenate((final_audio, audio), axis=0)

Character-Based Billing

# Scale billing with text length (per 1000 characters)
response.headers["x-fal-billable-units"] = str(max(1, len(prompt) // 1000))

Audio File Handling

# Use temporary files for audio processing
with tempfile.NamedTemporaryFile(suffix=".wav") as f:
    sf.write(f.name, final_audio, 24000)  # Save with proper sample rate
    return Output(
        audio=File.from_path(
            f.name,
            content_type="audio/wav",
            repository="cdn"  # Auto-upload to CDN
        )
    )

Multi-Language Architecture

Pipeline initialization:
self.pipelines = {
    "American English": KPipeline(lang_code="a"),
    "British English": KPipeline(lang_code="b"),
    "Japanese": KPipeline(lang_code="j"),
}
Language-specific voice options:
# American English voices
voice: Literal[
    "af_heart", "af_alloy", "af_aoede",  # Female
    "am_adam", "am_echo", "am_eric",     # Male
]

# British English voices
voice: Literal[
    "bf_alice", "bf_emma", "bf_lily",    # Female
    "bm_daniel", "bm_george", "bm_lewis" # Male
]

Advanced Features

Custom Validation

if len(prompt) >= 20000:
    raise FieldException(
        field="prompt",
        message="Prompt must be less than 20000 characters.",
    )

Backwards Compatibility

# Support both 'prompt' and 'text' field names
prompt = request.prompt or request.text
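Python's or short-circuits, so request.text is only evaluated when prompt is empty. That detail is what lets the shared _generate accept BrEnglishRequest and JapaneseRequest as well, even though only AmEnglishRequest defines a text field:

# AmEnglishRequest: an empty prompt falls back to the legacy text field
# BrEnglishRequest / JapaneseRequest: prompt is required; as long as it is
# non-empty, request.text is never accessed
prompt = request.prompt or request.text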

Flexible Text Processing

generator = pipeline(
    prompt,
    voice=request.voice,
    speed=request.speed,
    split_pattern=r"\n+",  # Split on paragraphs for natural pacing
)
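To see what the split pattern does to a multi-paragraph prompt, apply the same regex directly:

import re

prompt = "First paragraph of narration.\n\nSecond paragraph, after a pause."
print(re.split(r"\n+", prompt))
# ['First paragraph of narration.', 'Second paragraph, after a pause.']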

Deployment and Usage

Running the Service

# Development
fal run fal_demos/tts/kokoro.py::Kokoro

# Production deployment
fal deploy kokoro

Making Requests

American English:
import fal_client

result = await fal_client.submit_async(
    "your-username/kokoro/american-english",
    arguments={
        "prompt": "Hello, this is a test of American English text-to-speech!",
        "voice": "af_heart",
        "speed": 1.2
    }
)
British English:
result = await fal_client.submit_async(
    "your-username/kokoro/british-english",
    arguments={
        "prompt": "Cheerio! This is British English text-to-speech.",
        "voice": "bf_alice",
        "speed": 1.0
    }
)
Japanese:
result = await fal_client.submit_async(
    "your-username/kokoro/japanese",
    arguments={
        "prompt": "こんにちは、これは日本語の音声合成です。",
        "voice": "jf_alpha",
        "speed": 0.9
    }
)
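Note that submit_async returns a request handle rather than the finished result. Fetching the output and the audio URL looks roughly like this; treat it as a sketch of the fal_client queue API and check the client docs for the exact calls:

handle = await fal_client.submit_async(
    "your-username/kokoro/japanese",
    arguments={"prompt": "こんにちは。", "voice": "jf_alpha"},
)
result = await handle.get()    # wait for the queued job to complete
print(result["audio"]["url"])  # CDN URL of the generated WAV file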

Use Cases

  • Content Creation: Generate voiceovers for videos and podcasts
  • Accessibility: Convert text content to audio for visually impaired users
  • E-Learning: Create educational content with natural-sounding narration
  • Customer Service: Generate dynamic audio responses for chatbots
  • Multilingual Applications: Support global audiences with native-sounding voices
  • Book Reading: Convert written content to audiobooks

Performance Optimizations

Memory Efficiency

# Stream processing prevents memory buildup for long texts
for i, (gs, ps, audio) in enumerate(generator):
    ...  # handle each chunk incrementally instead of materializing all audio at once

Cost Optimization

machine_type = "L"  # CPU is sufficient and cost-effective
keep_alive=3000     # Longer keep-alive reduces cold starts

Key Takeaways

  • CPU deployment is ideal for lightweight models like Kokoro (82M parameters)
  • Multi-language support requires separate pipelines and voice models
  • Character-based billing aligns costs with resource usage
  • Audio streaming handles long texts efficiently without memory issues
  • Temporary file handling with CDN upload provides fast, reliable audio delivery
  • Multiple endpoints with shared logic offer flexibility while maintaining DRY principles
  • Custom validation provides better user experience with clear error messages
This pattern is perfect for building production-ready TTS services that need to support multiple languages and voices while maintaining cost efficiency and high performance through CPU-optimized deployment.