Basic Usage


import fal

class MyApp(fal.App):
    def setup(self):
        # Initialize models and resources once per runner
        pass
    
    @fal.endpoint("/")
    def predict(self, input_data):
        # Process requests
        return {"result": "..."}

Configuration Options

You can configure your app using class variables, or pass advanced options through the host_kwargs dictionary.
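
For example (a minimal sketch, assuming host_kwargs accepts the same option names documented below):
class MyApp(fal.App):
    # Hypothetical grouping of advanced options; the keys are assumed
    # to mirror the class-variable names described in this section.
    host_kwargs = {
        "request_timeout": 300,
        "keep_alive": 60,
    }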

Environment Configuration

requirements (list[str])
List of pip packages to install in the environment.
class MyApp(fal.App):
    requirements = ["numpy==1.24.0", "pandas", "torch>=2.0.0"]
local_python_modules (list[str])
List of local Python module names to include in the deployment.
class MyApp(fal.App):
    local_python_modules = ["my_utils", "custom_models"]
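With this setting, local files such as my_utils.py that live alongside your app file are bundled into the deployment so they can be imported at runtime.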

Machine Configuration

machine_type (str | list[str])
Hardware type(s) to use. Can be a single type or a list of types in order of preference.
CPU Machines:
  • "XS" - 0.50 CPU cores, 512MB RAM
  • "S" - 1 CPU core, 1GB RAM (default)
  • "M" - 2 CPU cores, 2GB RAM
  • "L" - 4 CPU cores, 15GB RAM
GPU Machines:
  • "GPU-A100" - 12 CPU cores, 60GB RAM, 1 GPU core (40GB VRAM)
  • "GPU-H100" - 12 CPU cores, 112GB RAM, 1 GPU core (80GB VRAM)
  • "GPU-H200" - 12 CPU cores, 112GB RAM, 1 GPU core (141GB VRAM)
  • "GPU-B200" - 24 CPU cores, 112GB RAM, 1 GPU core (192GB VRAM)
class MyApp(fal.App):
    # Single machine type
    machine_type = "GPU-H100"
    
    # Or with multiple options (fal will pick whichever is available)
    machine_type = ["GPU-H100", "GPU-H200"]
num_gpus (int)
Number of GPUs required for the application.
class MyApp(fal.App):
    machine_type = "GPU-H100"
    num_gpus = 2  # Request 2 H100 GPUs
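With the specs listed above, this configuration provides 2 × 80GB = 160GB of total VRAM.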

Timeout Configuration

request_timeout (int)
Maximum time in seconds for a single request to complete.
class MyApp(fal.App):
    request_timeout = 300  # 5 minutes
startup_timeout (int)
Maximum time in seconds for the environment to start up.
class MyApp(fal.App):
    startup_timeout = 600  # 10 minutes for large model loading

Authentication

app_auth (str)
Authentication mode for the application. Options:
  • "private": Only accessible with your API key
  • "public": Accessible without authentication
  • "shared": Accessible with any valid fal API key
  • None: Inherit from deployment command
class MyApp(fal.App):
    app_auth = "shared"  # Allow access with any valid fal key

App Metadata

app_name (str)
Custom name for the application. Auto-generated from class name if not specified.
class MyApp(fal.App):
    app_name = "image-generator-v2"

Scaling Configuration

Control how your application scales to handle traffic. These options help balance performance and cost.
keep_alive (int)
Time in seconds to keep idle runners alive. Default: 10 seconds.
class MyApp(fal.App):
    keep_alive = 300  # Keep runners alive for 5 minutes after last request
min_concurrency (int)
Minimum number of runners to keep running at all times. Default: 0.
class MyApp(fal.App):
    min_concurrency = 2  # Always keep 2 runners ready
max_concurrency (int)
Maximum number of runners that can be created. Default: 10.
class MyApp(fal.App):
    max_concurrency = 50  # Allow up to 50 runners during peak traffic
concurrency_buffer (int)
Number of extra runners to provision beyond current demand. Default: 0.
class MyApp(fal.App):
    concurrency_buffer = 2  # Keep 2 extra runners ready for traffic spikes
max_multiplexing (int)
Maximum number of requests a single runner can handle concurrently. Default: 1.
class MyApp(fal.App):
    max_multiplexing = 5  # Each runner can handle 5 concurrent requests
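Together, max_concurrency and max_multiplexing bound total in-flight requests: for example, max_concurrency = 50 with max_multiplexing = 5 allows up to 50 × 5 = 250 concurrent requests.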
Note: See the Scaling Guide for detailed explanations and examples of these options.

Complete Example

Here’s a comprehensive example showing all common configuration options:
import fal
from typing import Dict, Any

class ImageGenerationApp(fal.App):
    # Environment setup
    requirements = [
        "torch==2.1.0",
        "transformers==4.35.0",
        "diffusers==0.24.0",
        "accelerate",
        "pillow",
    ]
    local_python_modules = ["custom_pipeline"]
    
    # Machine configuration
    machine_type = ["GPU-H100", "GPU-H200"]  # will pick whichever is available
    num_gpus = 1
    
    # Timeouts
    request_timeout = 600      # 10 minutes per request
    startup_timeout = 900      # 15 minutes for model loading
    
    # Authentication
    app_auth = "shared"        # Accessible with any valid fal key
    app_name = "stable-diffusion-xl"
    
    # Scaling configuration
    keep_alive = 300           # 5 minutes
    min_concurrency = 1        # Keep 1 runner always ready
    max_concurrency = 10       # Scale up to 10 runners max
    concurrency_buffer = 1     # Keep 1 extra runner for spikes
    max_multiplexing = 1       # 1 request per runner (GPU-bound workload)
    
    def setup(self):
        """Initialize models once per runner."""
        import torch
        from diffusers import DiffusionPipeline
        
        self.pipe = DiffusionPipeline.from_pretrained(
            "stabilityai/stable-diffusion-xl-base-1.0",
            torch_dtype=torch.float16,
            variant="fp16",
            use_safetensors=True,
        )
        self.pipe.to("cuda")
    
    @fal.endpoint("/generate")
    def generate(self, 
                 prompt: str,
                 negative_prompt: str = "",
                 steps: int = 30,
                 width: int = 1024,
                 height: int = 1024) -> Dict[str, Any]:
        """Generate an image from a text prompt."""
        
        image = self.pipe(
            prompt=prompt,
            negative_prompt=negative_prompt,
            num_inference_steps=steps,
            width=width,
            height=height,
        ).images[0]
        
        # Convert to base64 for API response
        import io
        import base64
        
        buffer = io.BytesIO()
        image.save(buffer, format="PNG")
        image_base64 = base64.b64encode(buffer.getvalue()).decode()
        
        return {
            "image": image_base64,
            "content_type": "image/png",
            "width": width,
            "height": height,
        }
    
    @fal.endpoint("/health")
    def health_check(self) -> Dict[str, str]:
        """Simple health check endpoint."""
        return {"status": "healthy", "model": "sdxl"}

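Once deployed, the app can be invoked from Python with the fal client package (a sketch; the application id <your-username>/stable-diffusion-xl and the /generate route suffix are assumptions based on the app_name above):
import fal_client

# Hypothetical application id; substitute your own username/app name.
result = fal_client.subscribe(
    "<your-username>/stable-diffusion-xl/generate",
    arguments={"prompt": "a watercolor painting of a fox"},
)
print(result["content_type"])
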
See Also
