diff --git a/llms.txt b/llms.txt new file mode 100644 index 0000000000..2ab5e6b406 --- /dev/null +++ b/llms.txt @@ -0,0 +1,6953 @@ +## All Parameters & Configuration Details for Proxy Mode + +- Understanding `config.yaml` Structure +- `environment_variables` Section: Defining Variables within YAML +- `model_list` Section: Defining Models and Deployments + - Model List Parameter Summary + - `model_name` + - `litellm_params` + - `model` + - `api_key` + - `api_base` + - Provider-Specific Settings (within `litellm_params`) + - `temperature` + - `max_tokens` + - `rpm` + - `tpm` + - `model_info` + - `id` + - `mode` + - `input_cost_per_token` + - `output_cost_per_token` + - `max_tokens` + - `base_model` + - `metadata` + - Additional Fields (within `model_info`) +- `litellm_settings` Section: Configuring the LiteLLM Library + - litellm\_settings Parameter Summary + - Logging & Callback Settings + - `success_callback` + - `failure_callback` + - `callbacks` + - `service_callbacks` + - `turn_off_message_logging` + - `redact_user_api_key_info` + - `langfuse_default_tags` + - `modify_params` + - `enable_preview_features` + - `set_verbose` + - `json_logs` + - Networking & Timeout Settings + - `request_timeout` + - `force_ipv4` + - `forward_traceparent_to_llm_provider` + - Fallback & Reliability Settings + - `default_fallbacks` + - `content_policy_fallbacks` + - `context_window_fallbacks` + - `disable_end_user_cost_tracking` + - `disable_end_user_cost_tracking_prometheus_only` + - `key_generation_settings` *(Enterprise Feature)* + - `disable_add_transform_inline_image_block` + - `disable_hf_tokenizer_download` + - Caching Settings + - `cache` + - `cache_params` + - `type` + - Redis-specific `host` + - Redis-specific `port` + - Redis-specific `password` + - Redis-specific `namespace` + - Redis-specific `redis_startup_nodes` + - Redis-specific `service_name` + - Redis-specific `sentinel_nodes` + - S3-specific `s3_bucket_name` + - S3-specific `s3_region_name` + - S3-specific `s3_aws_access_key_id` + - S3-specific `s3_aws_secret_access_key` + - S3-specific `s3_endpoint_url` + - Qdrant Specific (Semantic Cache) `qdrant_semantic_cache_embedding_model` + - Qdrant Specific (Semantic Cache) `qdrant_collection_name` + - Qdrant Specific (Semantic Cache) `qdrant_quantization_config` + - Qdrant Specific (Semantic Cache) `similarity_threshold` + - Cache Params applies to all - `supported_call_types` + - Cache Params applies to all - `mode` + - Cache Params applies to all - `ttl` + - Callback Settings (`callback_settings`) + - `callback_settings.otel.message_logging` +- `general_settings` Section: Proxy General Settings + - general\_settings Parameter Summary + - Default Model Selection + - `completion_model` + - `embedding_model` + - `image_generation_model` + - `moderation_model` + - `infer_model_from_keys` + - Master Key & Authentication + - `master_key` + - `enable_jwt_auth` *(Enterprise Feature)* + - `litellm_jwtauth` *(Enterprise Feature)* + - `allowed_routes` + - `admin_only_routes` *(Enterprise Feature)* + - `allowed_ips` + - `enforce_user_param` + - `enable_oauth2_proxy_auth` *(Enterprise Feature)* + - `use_x_forwarded_for` + - `custom_auth` *(Enterprise Feature or Advanced Usage)* + - `allow_user_auth` *(Deprecated)* + - `custom_sso` *(Enterprise Feature or Advanced Usage)* + - Database & Persistence + - `database_url` + - `database_connection_pool_limit` + - `database_connection_timeout` + - `allow_requests_on_db_unavailable` + - `disable_spend_logs` + - `disable_adding_master_key_hash_to_db` + - 
`store_model_in_db` *(Enterprise Feature)* + - `store_prompts_in_spend_logs` + - `disable_prisma_schema_update` *(Advanced Usage)* + - `proxy_batch_write_at` + - `proxy_budget_rescheduler_min_time` + - `proxy_budget_rescheduler_max_time` + - Key Management & Encryption + - `key_management_system` *(Enterprise Feature)* + - `key_management_settings` *(Enterprise Feature)* + - `use_azure_key_vault` *(Enterprise Feature)* + - `use_google_kms` *(Enterprise Feature)* + - `default_team_disabled` *(Enterprise Feature)* + - `custom_key_generate` *(Advanced Usage)* + - Encryption Salt (Environment Variable) + - Rate Limiting & Quotas + - `max_parallel_requests` + - `global_max_parallel_requests` + - `max_request_size_mb` + - `max_response_size_mb` + - `proxy_budget_rescheduler_min_time` + - `proxy_budget_rescheduler_max_time` + - Monitoring, Alerting & Health Checks + - `background_health_checks` + - `health_check_interval` + - `health_check_details` + - `alerting` + - `alerting_threshold` + - `alerting_args` + - `alert_types` + - `alert_to_webhook_url` + - `spend_report_frequency` *(Enterprise Feature)* + - `forward_openai_org_id` + - `forward_client_headers_to_llm_api` + - `use_client_credentials_pass_through_routes` *(Enterprise Feature)* + - `allow_client_side_credentials` *(Use with Caution)* + - Other Miscellaneous Settings + - `service_account_settings` *(Enterprise Feature)* + - `provider_budget_config` *(Advanced Feature)* + - `model_group_alias` + - `retry_after` *(Advanced Feature)* + - `num_retries` +- `router_settings` Section: Routing & Load-Balancing Settings + - router\_settings Parameter Summary + - Routing Strategy & Model Selection + - `routing_strategy` + - `model_group_alias` + - `default_litellm_params` + - `default_max_parallel_requests` + - `default_priority` + - `polling_interval` + - `caching_groups` + - `assistants_config` *(Advanced/Enterprise Feature)* + - Multi-Instance Coordination & Scaling + - `redis_host` + - `redis_port` + - `redis_password` + - `redis_url` + - `client_ttl` + - `cache_responses` + - `routing_strategy_args` + - Pre-Call Checks & Validation + - `enable_pre_call_checks` + - `optional_pre_call_checks` + - Failover & Retry Policies + - `allowed_fails` + - `cooldown_time` + - `disable_cooldowns` + - `retry_policy` + - `allowed_fails_policy` + - `fallbacks` + - `content_policy_fallbacks` + - `default_fallbacks` + - `max_fallbacks` + - `num_retries` + - `model_group_retry_policy` *(SDK-only/advanced)* + - Timeouts & Debugging + - `timeout` + - `stream_timeout` + - `debug_level` + - `set_verbose` *(Deprecated)* + +--- + +## **Understanding `config.yaml` Structure** + +The `config.yaml` file serves as the **central configuration hub** for the LiteLLM Proxy Server. It's a YAML-formatted file that consolidates all the settings needed to define how your proxy will operate, including: which LLM models you'll use, how to connect to them, how to route requests, how to manage authentication, and how to customize various aspects of the proxy's behavior. A well-structured and understood `config.yaml` is *essential* for a functional and secure LiteLLM Proxy deployment. + +The `config.yaml` file is organized into several top-level sections, each responsible for a specific area of configuration. These sections are: + +* **`environment_variables`** (Optional): This section allows you to define environment variables *directly within* the `config.yaml` file. 
This can be convenient for managing credentials and endpoints within the configuration itself, especially in deployment environments like Kubernetes, where managing external environment variables might be more complex. + +* **`model_list`** (Required): This is a **crucial section** that defines all the LLMs and model deployments that your LiteLLM Proxy will manage and serve. It's where you specify *which* models you want to use, their backend configurations (provider, API keys, endpoints, etc.), and any associated metadata like cost information or context window limits. + +* **`litellm_settings`**: This section contains general settings that configure the behavior of the core **LiteLLM Python library** *within* the proxy. These are settings that apply broadly across the proxy, influencing things like logging, timeouts, fallback mechanisms, and caching behavior. + +* **`callback_settings`** (Optional): This section lets you fine-tune the behavior of specific **logging and monitoring callbacks** that you've enabled in `litellm_settings`. For example, you can use it to control message logging for the OpenTelemetry (`otel`) callback. + +* **`general_settings`**: This section encompasses **proxy server-level settings**. These are configurations that apply to the proxy server *as a whole*, including settings for the master key, database connections, authentication methods (like JWT or OAuth2), global rate limits, alerting, and various feature flags. + +* **`router_settings`**: This section focuses on the proxy's **routing and request handling** configurations. It includes settings for choosing the load balancing strategy (e.g., round-robin, least-busy, latency-based), defining retry policies, setting concurrency limits, and configuring how fallbacks to alternative models should work. + +**Key Principles to Keep in Mind:** + +* **YAML Format:** The `config.yaml` file must be a valid YAML file. YAML is a human-readable data serialization format that uses indentation (spaces, not tabs) to define structure. +* **Environment Variable Overrides:** Most configuration values in `config.yaml` can also be set via **environment variables**. If a setting is defined both in `config.yaml` and as an environment variable, the YAML setting typically takes precedence (except where noted in this guide). +* **Precedence:** Settings defined in `router_settings` generally override similar settings in `litellm_settings` for router-specific operations. This allows for fine-grained control over the router's behavior. +* **Restart Required:** Changes to `config.yaml` generally require **restarting the LiteLLM Proxy Server** for the changes to take effect. + +--- + +## `environment_variables` Section: Defining Variables within YAML + +**Type:** Mapping (String to String) + +**Optional:** Yes + +This section provides a way to define environment variables *directly within* your `config.yaml` file. While you can always set environment variables externally in your operating system or deployment environment, the `environment_variables` section offers some convenience, especially in certain contexts: + +* **Centralized Configuration:** It allows you to keep all your configuration settings, including environment variables, in a single, centralized `config.yaml` file. 
+* **Kubernetes Deployments:** This is particularly helpful in Kubernetes deployments, where you might prefer to define environment variables within your configuration files (e.g., ConfigMaps or Secrets) rather than setting them separately in the Kubernetes environment. +* **Simplified Management:** For some deployments, managing environment variables within the `config.yaml` can simplify setup and configuration. + +Each entry under `environment_variables` is a key-value pair: + +* **Key:** The name of the environment variable (e.g., `REDIS_HOST`, `OPENAI_API_KEY`). +* **Value:** The string value to be assigned to that environment variable. + +**Example:** + +```yaml +environment_variables: + REDIS_HOST: "cache.example.com" + REDIS_PORT: "6379" + OPENAI_API_KEY: "sk-..." +``` + +In this example, the LiteLLM Proxy will operate as if the environment variables `REDIS_HOST`, `REDIS_PORT`, and `OPENAI_API_KEY` were set externally in the system environment. + +**Referencing Defined Environment Variables:** + +Within *other* parts of your `config.yaml` file, you can **reference** these defined environment variables using the syntax: `"os.environ/VAR_NAME"`. This tells LiteLLM to fetch the actual value from the environment (either defined in the `environment_variables` section or externally). + +**Example:** + +```yaml +model_list: + - model_name: gpt-3.5-turbo + litellm_params: + model: openai/gpt-3.5-turbo + api_key: "os.environ/OPENAI_API_KEY" # Referencing the OpenAI API key defined in environment_variables +``` + +Here, `"os.environ/OPENAI_API_KEY"` will be replaced with the value of the `OPENAI_API_KEY` environment variable (either `sk-...` from the example above, or a value set externally if one exists). + +**Note:** If an environment variable is defined *both* in the `environment_variables` section *and* externally in your system environment, the YAML setting (within `config.yaml`) typically takes precedence. + +--- + +## `model_list` Section: Defining Models and Deployments + +**Type:** List of Mappings + +**Required:** Yes + +The `model_list` section is **mandatory** and forms the core of your LiteLLM Proxy configuration. It's where you define all the LLMs and model deployments that your proxy will manage and make accessible to your applications. Each entry in the `model_list` represents a single model or model alias that the proxy will serve. + +**Structure of a `model_list` Entry:** + +Each entry in the `model_list` is a **mapping (dictionary)** that contains the following key components: + +* **`model_name`** (String, Required): This is the user-friendly **alias** or name that client applications will use to refer to this specific model deployment when making API requests to the proxy. This is the name your application code will use. +* **`litellm_params`** (Object, Required): This is a **nested mapping (object)** that contains the core parameters that the LiteLLM library will use to actually interact with the backend LLM API. This is where you specify the *actual* model identifier (provider and model name), API keys, base URLs, and other provider-specific settings. +* **`model_info`** (Object, Optional): This is an optional nested mapping that allows you to provide **metadata and descriptive attributes** about the model, such as cost per token, context window size, and a description. While optional, it's highly recommended to provide this information for monitoring, cost tracking, and management purposes. 
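+
+For orientation, here is a minimal skeleton of a single `model_list` entry (all values are placeholders, shown only to illustrate the three components described above):
+
+```yaml
+model_list:
+  - model_name: my-model-alias                # user-facing alias used by clients
+    litellm_params:                           # required: how LiteLLM calls the backend
+      model: openai/gpt-3.5-turbo             # provider/model identifier
+      api_key: "os.environ/OPENAI_API_KEY"    # key loaded from an environment variable
+    model_info:                               # optional but recommended metadata
+      mode: "completion"
+      max_tokens: 4096
+```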
+ +**Example `model_list` Section:** + +```yaml +model_list: + - model_name: gpt-3.5-turbo-azure # User-facing alias: "gpt-3.5-turbo-azure" + litellm_params: + model: azure/my-deployment-name # Actual Azure OpenAI deployment name + api_base: "https://my-azure-endpoint.openai.azure.com/" # Azure endpoint + api_key: "os.environ/AZURE_API_KEY" # API Key (loaded from environment variable) + api_version: "2023-07-01-preview" # Azure API version + temperature: 0.7 # Default temperature + model_info: + max_tokens: 4096 # Context window size + input_cost_per_token: 0.0000015 # Input cost per 1k tokens + output_cost_per_token: 0.0000020 # Output cost per 1k tokens + mode: "completion" # Model type + metadata: "Azure OpenAI GPT-3.5 Turbo deployment in EU region" # Description + + - model_name: llama2-chat-local # User-facing alias: "llama2-chat-local" + litellm_params: + model: openai/llama-2-13b-chat-hf # Using OpenAI-compatible interface + api_base: "http://localhost:8000/v1" # Local server URL + api_key: "none" # No API key needed for local model + model_info: + id: "llama2-chat-13b-local" # Unique ID (optional but recommended) + max_tokens: 4096 + mode: "completion" + input_cost_per_token: 0 # No cost for local model (example) + output_cost_per_token: 0 +``` + +This example defines *two* model deployments: + +1. `gpt-3.5-turbo-azure`: An alias for an Azure OpenAI deployment of GPT-3.5 Turbo. The `litellm_params` specify the necessary details to connect to the Azure OpenAI service, including the deployment name, API base URL, API key (loaded from an environment variable), and API version. The `model_info` section provides metadata like the context window size (`max_tokens`) and estimated costs per token. +2. `llama2-chat-local`: An alias for a locally served LLaMA 2 model (presumably using an OpenAI-compatible interface like vLLM). The `litellm_params` use the `openai/` prefix to indicate an OpenAI-compatible API, and the `api_base` points to a local server URL. No API key is needed (`api_key: "none"`). + +**Key Concepts within `model_list`:** + +* **Model Aliases:** The `model_name` acts as an alias. Your client applications use this alias when making API requests, without needing to know the specific backend provider or deployment details. This abstraction allows you to change the backend model without modifying your application code, as long as the `model_name` remains consistent. +* **Provider Routing:** The `litellm_params.model` value (e.g., `"azure/..."`, `"openai/..."`, `"anthropic/..."`) tells LiteLLM *which* provider's API to use for this model. The proxy uses this to route requests to the correct backend. +* **Environment Variables:** It's a best practice to use environment variables (referenced via `"os.environ/VAR_NAME"`) to store sensitive information like API keys, rather than hardcoding them directly in the `config.yaml` file. +* **`model_info` for Transparency:** The `model_info` section is highly recommended for transparency and manageability. It provides valuable metadata that can be used for monitoring, cost tracking, and displaying model information in the proxy's Admin UI (if enabled). + +--- + +### Model List Parameter Summary + +| Parameter | Type | Required | Description | +| :------------------ | :---------------- | :------- | :----------------------------------------------------------------------------- | +| `model_name` | String | Yes | User-facing alias for the model. | +| `litellm_params` | Object | Yes | LiteLLM parameters for backend API calls. 
| +| `model_info` | Object | No | Metadata about the model (cost, context window, etc.). | + +### `model_name` + +**YAML Key:** `model_name` + +**Type:** String + +**Environment Variable:** N/A + +**Default Value:** N/A (Required Parameter) + +**Description:** The `model_name` parameter defines the **user-facing alias** or name of a model deployment within your LiteLLM Proxy configuration. This is the name that client applications will use when making API requests to the proxy. It acts as a logical identifier, allowing you to abstract away the complexities of backend model names and provider-specific details. Each `model_name` within the `model_list` must be unique. + +**Example YAML:** + +```yaml +model_list: + - model_name: gpt-3.5-turbo # User-facing alias + litellm_params: + model: azure/gpt-turbo-small-eu # Backend model details + # ... other litellm_params ... +``` + +**Example Environment Variable:** N/A (YAML-only parameter) + +--- + +### `litellm_params` + +**YAML Key:** `litellm_params` + +**Type:** Object (Mapping) + +**Environment Variable:** N/A + +**Default Value:** N/A (Required Parameter) + +**Description:** The `litellm_params` section is a **required mapping** within each `model_list` entry. It contains the core configuration parameters that the LiteLLM library uses to interact with the actual backend LLM API. These parameters dictate *how* LiteLLM will call the underlying model provider. At a minimum, it **must** include the `model` parameter, but can include a wide range of settings to customize API calls. + +**Example YAML:** + +```yaml +model_list: + - model_name: gpt-3.5-turbo + litellm_params: # Defining litellm_params for this model + model: azure/gpt-turbo-small-eu + api_base: "https://my-azure-endpoint.openai.azure.com/" + api_key: "os.environ/AZURE_API_KEY_EU" + # ... other litellm_params ... +``` + +**Example Environment Variable:** N/A (YAML-only section header) + +--- + +#### `model` (`litellm_params`) + +**YAML Key:** `model` + +**Type:** String + +**Environment Variable:** N/A + +**Default Value:** N/A (Required Parameter within `litellm_params`) + +**Description:** The `model` parameter, nested within `litellm_params`, is **required** and specifies the **actual backend model identifier**. This string tells LiteLLM *which* LLM provider to use and *which specific model* from that provider to target. The format of this string is provider-specific, and typically follows the pattern `"provider_name/model_identifier"`. For example: + +* `"openai/gpt-3.5-turbo"`: Specifies OpenAI's GPT-3.5 Turbo model. +* `"azure/gpt-turbo-small-eu"`: Specifies an Azure OpenAI deployment named "gpt-turbo-small-eu". +* `"anthropic/claude-instant-1"`: Specifies Anthropic's Claude Instant-1 model. + +**Example YAML:** + +```yaml +model_list: + - model_name: gpt-3.5-turbo + litellm_params: + model: azure/gpt-turbo-small-eu # Specifying the backend model (Azure OpenAI) + # ... other litellm_params ... +``` + +**Example Environment Variable:** N/A (YAML-only parameter) + +--- + +#### `api_key` (`litellm_params`) + +**YAML Key:** `api_key` + +**Type:** String + +**Environment Variable:** Corresponding environment variable depends on the provider (e.g., `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `AZURE_API_KEY`, etc.) + +**Default Value:** N/A (Required for most providers) + +**Description:** The `api_key` parameter, within `litellm_params`, is used to provide the **API key or authentication token** required to access the backend LLM provider's API. 
Whether this parameter is strictly *required* depends on the specific provider and model. Many commercial providers (like OpenAI, Anthropic, Azure) require an API key for authentication. It's highly recommended to **not hardcode** API keys directly in your `config.yaml` file. Instead, use environment variable references like `"os.environ/API_KEY_NAME"` to fetch the key from environment variables for better security and easier management. + +**Example YAML:** + +```yaml +model_list: + - model_name: gpt-3.5-turbo + litellm_params: + model: openai/gpt-3.5-turbo + api_key: "os.environ/OPENAI_API_KEY" # Referencing OpenAI API key from environment variable + # ... other litellm_params ... +``` + +**Example Environment Variable:** + +```bash +export OPENAI_API_KEY="sk-..." # Setting OpenAI API key in environment +``` + +--- + +#### `api_base` (`litellm_params`) + +**YAML Key:** `api_base` + +**Type:** String + +**Environment Variable:** Corresponding environment variable depends on the provider (e.g., `AZURE_API_BASE`, custom endpoint environment variables). + +**Default Value:** Provider-specific default base URL. + +**Description:** The `api_base` parameter, under `litellm_params`, allows you to specify a **custom base URL** for the LLM provider's API endpoint. This is often necessary when you are not using the default public endpoint of a provider. Common use cases include: + +* **Azure OpenAI:** Azure OpenAI deployments require a custom `api_base` URL that points to your specific Azure endpoint. +* **Open-Source Model Gateways:** When using open-source models served through gateways like vLLM or custom inference servers, you'll need to set `api_base` to the URL of your gateway. +* **Non-Default Regions or Endpoints:** Some providers might offer APIs in different regions or with custom endpoints, requiring you to override the default `api_base`. + +If `api_base` is not provided, LiteLLM will use a **provider-specific default base URL** where applicable. + +**Example YAML:** + +```yaml +model_list: + - model_name: azure-gpt-3.5-turbo + litellm_params: + model: azure/gpt-turbo-small-eu + api_base: "https://my-azure-endpoint.openai.azure.com/" # Custom api_base for Azure OpenAI + api_key: "os.environ/AZURE_API_KEY_EU" + # ... other litellm_params ... +``` + +**Example Environment Variable:** + +```bash +export AZURE_API_BASE="https://my-azure-endpoint.openai.azure.com/" # Setting Azure API base URL in environment +``` + +--- + +#### Provider-Specific Settings (within `litellm_params`) + +**YAML Key:** Provider-specific parameter names (e.g., `api_version`, `aws_region_name`, `project_id`, etc.) + +**Type:** Depends on the parameter (String, Integer, Boolean, etc.) + +**Environment Variable:** Corresponding environment variable depends on the provider and parameter. + +**Default Value:** Provider-specific defaults. + +**Description:** Beyond the common parameters like `api_key` and `api_base`, the `litellm_params` section can include **provider-specific settings**. These parameters are unique to each LLM provider and are used to configure provider-specific behaviors or features. Examples include: + +* **`api_version` (Azure OpenAI):** Specifies the API version to use for Azure OpenAI deployments. +* **`aws_region_name` (AWS Bedrock):** Sets the AWS region for Bedrock models. +* **`project_id` (Google Vertex AI):** Specifies the Google Cloud Project ID for Vertex AI models. +* **`deployment_id` (Azure OpenAI):** Used to target a specific deployment within Azure OpenAI. 
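+
+For instance, a Bedrock deployment sets its region directly inside `litellm_params` (the model identifier and region below are illustrative):
+
+```yaml
+model_list:
+  - model_name: bedrock-claude
+    litellm_params:
+      model: bedrock/anthropic.claude-v2   # AWS Bedrock model identifier
+      aws_region_name: "us-east-1"         # provider-specific parameter for Bedrock
+```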
+ +Refer to the "Providers" section of this documentation and the official documentation of each LLM provider to identify available provider-specific parameters and their usage. + +**Example YAML:** + +```yaml +model_list: + - model_name: azure-gpt-4 + litellm_params: + model: azure/gpt-4-deployment + api_base: "https://my-azure-endpoint.openai.azure.com/" + api_key: "os.environ/AZURE_API_KEY_EU" + api_version: "2023-07-01-preview" # Provider-specific parameter: Azure API Version + # ... other litellm_params ... +``` + +**Example Environment Variable:** + +```bash +export AZURE_API_VERSION="2023-07-01-preview" # Setting Azure API version in environment +``` + +--- + +#### `temperature` (`litellm_params`) + +**YAML Key:** `temperature` + +**Type:** Number (Float or Integer) + +**Environment Variable:** N/A + +**Default Value:** Provider-specific default (often 1.0 for OpenAI, but can vary). + +**Description:** The `temperature` parameter, within `litellm_params`, controls the **randomness or creativity** of the LLM's output. It's a common parameter across many LLM providers, though the *exact* range and interpretation might slightly vary. + +* **Lower values (e.g., 0.1, 0.0):** Result in more deterministic, focused, and predictable output. The model will tend to choose the most likely next tokens, making it suitable for tasks requiring factual accuracy or code generation. +* **Higher values (e.g., 0.7, 1.0):** Introduce more randomness and creativity into the output. The model will consider a wider range of possible next tokens, leading to more surprising, diverse, and potentially less predictable text. Useful for creative writing, brainstorming, or tasks where originality is valued over strict accuracy. + +A `temperature` of `0` makes the output almost entirely deterministic (always choosing the most likely token), while values approaching `1` (or sometimes higher, depending on provider) maximize randomness. + +**Example YAML:** + +```yaml +model_list: + - model_name: creative-writer + litellm_params: + model: openai/gpt-3.5-turbo + api_key: "os.environ/OPENAI_API_KEY" + temperature: 0.8 # Setting higher temperature for creative output + # ... other litellm_params ... + - model_name: code-generator + litellm_params: + model: openai/gpt-3.5-turbo + api_key: "os.environ/OPENAI_API_KEY" + temperature: 0.0 # Setting lower temperature for deterministic code generation + # ... other litellm_params ... +``` + +**Example Environment Variable:** N/A (YAML-only parameter) + +--- + +#### `max_tokens` (`litellm_params`) + +**YAML Key:** `max_tokens` + +**Type:** Integer + +**Environment Variable:** N/A + +**Default Value:** Provider-specific default (often varies significantly by model). + +**Description:** The `max_tokens` parameter, in `litellm_params`, sets a **limit on the maximum number of tokens** the LLM should generate in its response. This is a crucial parameter for controlling output length, latency, and cost. + +* **Token Limit:** `max_tokens` specifies the *upper bound* on the generated text length. The LLM may stop generating before reaching this limit if it naturally completes its response (e.g., reaches a stop sequence or end-of-text token). +* **Cost Control:** Limiting `max_tokens` is a direct way to control the cost of LLM API calls, as you are billed per token. +* **Latency Control:** Shorter `max_tokens` values generally result in lower latency, as the model has less text to generate. 
+* **Context Window Consideration:** Be mindful of the model's maximum context window when setting `max_tokens`. The sum of input tokens (prompt) and `max_tokens` should not exceed the model's context limit. + +**Example YAML:** + +```yaml +model_list: + - model_name: concise-summary + litellm_params: + model: openai/gpt-3.5-turbo + api_key: "os.environ/OPENAI_API_KEY" + max_tokens: 256 # Limiting response length for concise summaries + # ... other litellm_params ... + - model_name: detailed-answer + litellm_params: + model: openai/gpt-4 + api_key: "os.environ/OPENAI_API_KEY" + max_tokens: 1024 # Allowing longer responses for detailed answers + # ... other litellm_params ... +``` + +**Example Environment Variable:** N/A (YAML-only parameter) + +--- + +#### `rpm` (`litellm_params`) + +**YAML Key:** `rpm` + +**Type:** Integer + +**Environment Variable:** N/A + +**Default Value:** No default (no model-level RPM limit unless explicitly set). + +**Description:** The `rpm` parameter, within `litellm_params`, sets a **requests-per-minute (RPM) rate limit** specifically for *this particular model deployment*. This is a deployment-level rate limit, distinct from global or per-key rate limits configured elsewhere. + +* **Model-Specific Rate Limiting:** The `rpm` limit applies only to the model deployment in which it is defined. It does not affect other models or deployments in your `model_list`. +* **Concurrency Control:** `rpm` helps manage concurrency at the model level, preventing overload of specific backend deployments, especially slower or resource-intensive models. +* **Resource Management:** Use `rpm` to control the rate of requests to different models based on their capacity, cost, or priority. + +**Example YAML:** + +```yaml +model_list: + - model_name: gpt-3.5-turbo-low-rpm + litellm_params: + model: azure/gpt-turbo-small-eu + api_key: "os.environ/AZURE_API_KEY_EU" + rpm: 6 # Setting RPM limit to 6 requests per minute for this deployment + # ... other litellm_params ... + - model_name: gpt-3.5-turbo-high-rpm + litellm_params: + model: openai/gpt-3.5-turbo + api_key: "os.environ/OPENAI_API_KEY" + rpm: 100 # Higher RPM limit for a more robust deployment + # ... other litellm_params ... +``` + +**Example Environment Variable:** N/A (YAML-only parameter) + +--- + +#### `tpm` (`litellm_params`) + +**YAML Key:** `tpm` + +**Type:** Integer + +**Environment Variable:** N/A + +**Default Value:** No default (no model-level TPM limit unless explicitly set). + +**Description:** The `tpm` parameter, under `litellm_params`, defines a **tokens-per-minute (TPM) rate limit** specifically for *this model deployment*. Similar to `rpm`, this is a deployment-level limit, separate from global or per-key TPM limits. + +* **Token-Based Rate Limiting:** `tpm` limits the total number of tokens that can be processed by this model deployment within a minute. This includes both input (prompt) tokens and output (completion) tokens. +* **Granular Control:** `tpm` provides finer-grained control over usage compared to `rpm`, as it accounts for the length and complexity of requests, not just the number of requests. +* **Cost Optimization:** Use `tpm` to manage costs more precisely, particularly for models with variable token pricing or when you want to limit total token consumption for a deployment. 
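+
+`rpm` and `tpm` can also be combined on the same deployment when you want to cap both request volume and token throughput; a sketch with illustrative limits:
+
+```yaml
+model_list:
+  - model_name: gpt-4-rate-limited
+    litellm_params:
+      model: openai/gpt-4
+      api_key: "os.environ/OPENAI_API_KEY"
+      rpm: 60       # at most 60 requests per minute to this deployment
+      tpm: 100000   # and at most 100,000 tokens (input + output) per minute
+```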
+ +**Example YAML:** + +```yaml +model_list: + - model_name: gpt-4-economy + litellm_params: + model: openai/gpt-4 + api_key: "os.environ/OPENAI_API_KEY" + tpm: 50000 # Setting TPM limit to 50,000 tokens per minute for cost-conscious deployment + # ... other litellm_params ... + - model_name: gpt-4-performance + litellm_params: + model: openai/gpt-4 + api_key: "os.environ/OPENAI_API_KEY" + tpm: 200000 # Higher TPM limit for a performance-focused deployment + # ... other litellm_params ... +``` + +**Example Environment Variable:** N/A (YAML-only parameter) + +--- + +### `model_info` + +**YAML Key:** `model_info` + +**Type:** Object (Mapping) + +**Environment Variable:** N/A + +**Default Value:** No default (optional section). + +**Description:** The `model_info` section is an **optional mapping** within each `model_list` entry. It allows you to provide **metadata and descriptive attributes** about the model deployment. While not strictly required for the proxy to function, `model_info` is highly recommended for: + +* **Monitoring and Observability:** Provides data points for dashboards and monitoring systems. +* **Cost Tracking:** Enables accurate cost calculations and spend analysis. +* **Model Management UI (Admin UI):** Populates information in the proxy's Admin UI (if enabled) for model browsing and documentation. +* **API Endpoint `/model/info`:** Data in `model_info` is surfaced via the proxy's `/model/info` endpoint, allowing clients to programmatically retrieve model metadata. + +`model_info` can include fields like cost per token, context window size, model mode (completion, embedding, etc.), and custom metadata. + +**Example YAML:** + +```yaml +model_list: + - model_name: gpt-3.5-turbo + litellm_params: + model: azure/gpt-turbo-small-eu + api_key: "os.environ/AZURE_API_KEY_EU" + model_info: # Defining model_info for metadata + max_tokens: 4096 + mode: "completion" + metadata: "Azure OpenAI GPT-3.5 Turbo (EU deployment)" + # ... other model_info fields ... +``` + +**Example Environment Variable:** N/A (YAML-only section header) + +--- + +#### `id` (`model_info`) + +**YAML Key:** `id` + +**Type:** String + +**Environment Variable:** N/A + +**Default Value:** If not provided, LiteLLM may auto-generate a unique ID, but explicit definition is recommended for consistency. + +**Description:** The `id` parameter, within `model_info`, allows you to specify a **unique identifier** for the model deployment. While technically optional, providing a custom `id` is **strongly recommended** for: + +* **Consistent Referencing:** Ensures a stable, human-readable identifier for the model, especially when managing multiple deployments or versions. +* **Monitoring and Logging:** Makes it easier to track and analyze logs and metrics associated with a specific model instance. +* **Admin UI Clarity:** Provides a clear identifier in the proxy's Admin UI and `/model/info` endpoint. + +If you don't provide an `id`, LiteLLM may automatically generate one, but relying on auto-generated IDs can make management and tracking less predictable. + +**Example YAML:** + +```yaml +model_list: + - model_name: gpt-3.5-turbo + litellm_params: + model: azure/gpt-turbo-small-eu + api_key: "os.environ/AZURE_API_KEY_EU" + model_info: + id: "azure-gpt-3.5-turbo-eu-01" # Custom ID for this deployment + max_tokens: 4096 + # ... other model_info fields ... 
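+  # Hypothetical second deployment of the same model_name; a distinct id keeps
+  # logs, metrics, and the Admin UI unambiguous when several backends share an alias.
+  - model_name: gpt-3.5-turbo
+    litellm_params:
+      model: azure/gpt-turbo-small-us          # illustrative second Azure deployment
+      api_key: "os.environ/AZURE_API_KEY_US"   # illustrative environment variable
+    model_info:
+      id: "azure-gpt-3.5-turbo-us-01"          # different custom ID for this instance
+      max_tokens: 4096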
+``` + +**Example Environment Variable:** N/A (YAML-only parameter) + +--- + +#### `mode` (`model_info`) + +**YAML Key:** `mode` + +**Type:** String + +**Environment Variable:** N/A + +**Default Value:** If not provided, LiteLLM may attempt to infer the mode based on the model name, but explicit definition is recommended. + +**Description:** The `mode` parameter, within `model_info`, specifies the **general function or type** of the model deployment. This informs the proxy about how to handle the model, particularly for tasks like health checks and API routing. Common values include: + +* `"completion"`: For chat completion and text completion models (most common). +* `"embedding"`: For embedding models. +* `"image_generation"`: For image generation models. +* `"audio_transcription"`: For audio transcription models. + +Setting `mode` correctly is important for features like health checks, which may perform mode-specific validations (e.g., health checks for image models expect `mode: "image_generation"`). While the proxy might *infer* the mode in some cases, it's best practice to **explicitly define** the `mode` for clarity and robustness. + +**Example YAML:** + +```yaml +model_list: + - model_name: gpt-3.5-turbo + litellm_params: + model: azure/gpt-turbo-small-eu + api_key: "os.environ/AZURE_API_KEY_EU" + model_info: + mode: "completion" # Explicitly defining mode as "completion" + max_tokens: 4096 + # ... other model_info fields ... +``` + +**Example Environment Variable:** N/A (YAML-only parameter) + +--- + +#### `input_cost_per_token` (`model_info`) + +**YAML Key:** `input_cost_per_token` + +**Type:** Number (Float or Integer) + +**Environment Variable:** N/A + +**Default Value:** If not provided, LiteLLM may use a default cost from its internal model cost map, if available for the model. + +**Description:** The `input_cost_per_token` parameter, within `model_info`, specifies the **cost in USD per input token** for this model deployment. This is used for internal cost tracking and reporting within LiteLLM Proxy. + +* **Cost Tracking Accuracy:** Providing accurate cost data is crucial for precise spend tracking and budget enforcement. +* **Custom Pricing:** Use this parameter to define custom pricing if the default cost map is inaccurate or if you have negotiated specific pricing with your provider. +* **Cost Units:** Costs are interpreted in USD and, because they are per token, are typically very small decimal numbers (e.g., `0.0000015` per token, which works out to $0.0015 per 1,000 tokens). + +If `input_cost_per_token` is not provided, LiteLLM may attempt to use a default cost from its internal model cost map. However, for accurate cost tracking, it's recommended to **explicitly define both `input_cost_per_token` and `output_cost_per_token`**. + +**Example YAML:** + +```yaml +model_list: + - model_name: gpt-3.5-turbo + litellm_params: + model: azure/gpt-turbo-small-eu + api_key: "os.environ/AZURE_API_KEY_EU" + model_info: + mode: "completion" + input_cost_per_token: 0.0000015 # Specifying input cost per token + output_cost_per_token: 0.0000020 + # ... other model_info fields ... +``` + +**Example Environment Variable:** N/A (YAML-only parameter) + +--- + +#### `output_cost_per_token` (`model_info`) + +**YAML Key:** `output_cost_per_token` + +**Type:** Number (Float or Integer) + +**Environment Variable:** N/A + +**Default Value:** If not provided, LiteLLM may use a default cost from its internal model cost map, if available for the model.
+ +**Description:** The `output_cost_per_token` parameter, within `model_info`, defines the **cost in USD per output token** generated by this model deployment. This is used in conjunction with `input_cost_per_token` for comprehensive cost tracking. + +* **Complete Cost Picture:** You should always provide *both* `input_cost_per_token` and `output_cost_per_token` to get a full picture of the cost implications of using a model. +* **Pricing Variations:** Different models and providers may have varying costs for input and output tokens. Ensure you configure these values accurately based on your provider's pricing. + +If `output_cost_per_token` is omitted, LiteLLM might attempt to use a default cost from its internal model cost map. However, for reliable cost tracking, it's recommended to **explicitly set both input and output cost parameters**. + +**Example YAML:** + +```yaml +model_list: + - model_name: gpt-3.5-turbo + litellm_params: + model: azure/gpt-turbo-small-eu + api_key: "os.environ/AZURE_API_KEY_EU" + model_info: + mode: "completion" + input_cost_per_token: 0.0000015 + output_cost_per_token: 0.0000020 # Specifying output cost per token + # ... other model_info fields ... +``` + +**Example Environment Variable:** N/A (YAML-only parameter) + +--- + +#### `max_tokens` (`model_info`) + +**YAML Key:** `max_tokens` (within `model_info`) + +**Type:** Integer + +**Environment Variable:** N/A + +**Default Value:** If not provided, LiteLLM may use a default max tokens value from its internal model database, if available for the model. + +**Description:** The `max_tokens` parameter, when placed inside the `model_info` section (note: this is *different* from `litellm_params.max_tokens`), specifies the **maximum context length or token limit** of the model itself. This value represents the total number of tokens (input + output) that the model can handle in a single request. + +* **Context Window Enforcement:** LiteLLM Proxy uses this `max_tokens` value to enforce request size limits. It can prevent requests that exceed the model's context window from being sent to the backend, avoiding errors. +* **Informational and Reporting:** This value is also used for informational purposes, such as reporting the model's context window via the `/model/info` endpoint and in logs. + +While LiteLLM might use a default `max_tokens` from its internal database if you omit it, it's best practice to **explicitly define `model_info.max_tokens`** for each model, ensuring accurate context window awareness and limit enforcement. + +**Example YAML:** + +```yaml +model_list: + - model_name: gpt-3.5-turbo + litellm_params: + model: azure/gpt-turbo-small-eu + api_key: "os.environ/AZURE_API_KEY_EU" + model_info: + mode: "completion" + max_tokens: 4096 # Specifying max context tokens for GPT-3.5 Turbo + input_cost_per_token: 0.0000015 + output_cost_per_token: 0.0000020 + # ... other model_info fields ... +``` + +**Example Environment Variable:** N/A (YAML-only parameter) + +--- + +#### `base_model` (`model_info`) + +**YAML Key:** `base_model` + +**Type:** String + +**Environment Variable:** N/A + +**Default Value:** No default value. + +**Description:** The `base_model` parameter, within `model_info`, is an **optional** field that allows you to specify the **underlying base model name or version** that a given `model_name` alias corresponds to.
+ +* **Versioning and Lineage Tracking:** Useful when you have custom models or model aliases that are based on specific versions of foundation models (e.g., a fine-tuned model based on a particular GPT-4 snapshot). +* **Clarity and Documentation:** Helps clarify the origin and basis of a model alias, especially when managing multiple versions or custom deployments. +* **Cost Mapping (Internal):** In some cases, LiteLLM might use `base_model` to help determine the correct cost mapping for models that don't explicitly report their costs. + +In many cases, you can omit `base_model`, especially for standard models where the `model_name` is already self-explanatory. However, it's good practice to use it when you want to clearly document the underlying model lineage or need it for accurate cost tracking of custom models. + +**Example YAML:** + +```yaml +model_list: + - model_name: my-custom-gpt-4-model + litellm_params: + model: azure/my-custom-gpt-4-deployment + api_key: "os.environ/AZURE_API_KEY_EU" + model_info: + mode: "completion" + base_model: "gpt-4-1106-preview" # Indicating this custom model is based on GPT-4 preview version + max_tokens: 8192 + # ... other model_info fields ... +``` + +**Example Environment Variable:** N/A (YAML-only parameter) + +--- + +#### `metadata` (`model_info`) + +**YAML Key:** `metadata` + +**Type:** String, or any other data type serializable to YAML (e.g., Number, Boolean, List, Mapping). + +**Environment Variable:** N/A + +**Default Value:** No default value. + +**Description:** The `metadata` parameter, within `model_info`, is a highly flexible and **optional** field that allows you to attach **arbitrary descriptive information** to a model deployment. This metadata is free-form and can be used for various purposes: + +* **Documentation and Descriptions:** Provide human-readable descriptions of the model, its purpose, version, or any relevant notes for documentation or UI display. +* **UI Labels and Categorization:** Use metadata to categorize models, add tags, or provide labels for easier filtering and organization in the Admin UI or model selection interfaces. +* **Custom Application Logic:** Your application code can retrieve and use this metadata via the `/model/info` endpoint to implement custom logic based on model attributes. + +`metadata` can be a simple string, or it can be a more complex data structure (like a dictionary or list) if you need to store structured information. + +**Example YAML:** + +```yaml +model_list: + - model_name: gpt-3.5-turbo + litellm_params: + model: azure/gpt-turbo-small-eu + api_key: "os.environ/AZURE_API_KEY_EU" + model_info: + mode: "completion" + metadata: "OpenAI GPT-3.5 Turbo model (2023-03-01 snapshot)" # Simple string metadata + max_tokens: 4096 + # ... other model_info fields ... + - model_name: llama2-chat-v2 + litellm_params: + model: openai/llama-2-13b-chat-hf + api_base: "http://localhost:8000/v1" + api_key: "none" + model_info: + mode: "completion" + metadata: + version: 2 # Using structured metadata (dictionary) + organization: "HuggingFace" + description: "Local LLaMA-2 13B Chat model, version 2" + max_tokens: 4096 + # ... other model_info fields ... +``` + +**Example Environment Variable:** N/A (YAML-only parameter) + +--- + +#### Additional Fields (within `model_info`) + +**YAML Key:** Any custom string (e.g., `version`, `organization`, `notes`, etc.) + +**Type:** Any data type serializable to YAML. + +**Environment Variable:** N/A + +**Default Value:** No default value. 
+ +**Description:** The `model_info` section is **extensible**. You can add **any custom fields** you need beyond the predefined parameters (like `id`, `mode`, `max_tokens`, etc.). These custom fields will be stored and returned by the `/model/info` API, but generally do not affect the proxy's core operation. + +* **Application-Specific Metadata:** Use custom fields to store information that is relevant to your specific application, organization, or workflow. +* **Record-Keeping and Tracking:** Add fields for version numbers, organization names, notes, tags, or any other data you want to associate with each model deployment for record-keeping purposes. +* **UI Customization (Advanced):** In highly customized setups, you could potentially use these fields to drive dynamic behavior in custom UIs or dashboards that interact with the proxy's `/model/info` endpoint. + +**Example YAML:** + +```yaml +model_list: + - model_name: gpt-3.5-turbo + litellm_params: + model: azure/gpt-turbo-small-eu + api_key: "os.environ/AZURE_API_KEY_EU" + model_info: + mode: "completion" + input_cost_per_token: 0.0000015 + output_cost_per_token: 0.0000020 + version: "2023-03-01" # Custom field: Model version + organization: "OpenAI" # Custom field: Provider organization + notes: "Original snapshot of GPT-3.5 Turbo" # Custom field: Notes/description + max_tokens: 4096 + # ... other model_info fields ... +``` + +**Example Environment Variable:** N/A (YAML-only parameters) + +--- + +## `litellm_settings` Section: Configuring the LiteLLM Library + +This section allows you to configure the behavior of the underlying LiteLLM Python library within the proxy. These settings globally influence logging, networking, caching, and fallback behaviors. Many of these settings correspond to functions within the `litellm` module or internal flags. While most values can also be set via environment variables (prefixed with `LITELLM_`), defining them in `config.yaml` under `litellm_settings` is often more convenient. If a setting is defined in both `litellm_settings` and `router_settings`, the `router_settings` value will take precedence specifically for router operations. + + +### litellm\_settings Parameter Summary + +| Parameter | Type | Default Value | Description | +| :------------------------------------------ | :------------------ | :------------ | :----------------------------------------------------------------------------- | +| **Logging & Callback Settings** | | | | +| `success_callback` | Array of Strings | `[]` | Callbacks for successful requests. | +| `failure_callback` | Array of Strings | `[]` | Callbacks for failed requests. | +| `callbacks` | Array of Strings | `[]` | Callbacks for both success and failure. | +| `service_callbacks` | Array of Strings | `[]` | Callbacks for service health monitoring. | +| `turn_off_message_logging` | Boolean | `false` | Disable logging of message content. | +| `redact_user_api_key_info` | Boolean | `false` | Redact user API key info from logs. | +| `langfuse_default_tags` | Array of Strings | `[]` | Default tags for Langfuse integration. | +| `modify_params` | Boolean | `false` | Enable request parameter modification. | +| `enable_preview_features` | Boolean | `false` | Enable experimental/preview features. | +| `set_verbose` | Boolean | `false` | Enable verbose debug logging. | +| `json_logs` | Boolean | `false` | Output logs in JSON format. | +| **Networking & Timeout Settings** | | | | +| `request_timeout` | Integer | High Default | Timeout for LLM API calls (seconds). 
| +| `force_ipv4` | Boolean | `false` | Force IPv4 for LLM requests. | +| `forward_traceparent_to_llm_provider` | Boolean | `false` | Forward `traceparent` header to LLM provider. | +| **Fallback & Reliability Settings** | | | | +| `default_fallbacks` | Array of Strings | `[]` | Global fallback models for general errors. | +| `content_policy_fallbacks` | Array of Objects | `[]` | Fallbacks for ContentPolicyViolationError. | +| `context_window_fallbacks` | Array of Objects | `[]` | Fallbacks for ContextWindowExceededError. | +| `disable_end_user_cost_tracking` | Boolean | `false` | Disable tracking of end-user costs. | +| `disable_end_user_cost_tracking_prometheus_only` | Boolean | `false` | Disable end-user cost tracking in Prometheus only. | +| `key_generation_settings` | Object | `None` | Controls API key generation *(Enterprise Feature)*. | +| `disable_add_transform_inline_image_block` | Boolean | `false` | Disable auto-add `#transform=inline` for Fireworks AI. | +| `disable_hf_tokenizer_download` | Boolean | `false` | Disable Hugging Face tokenizer download. | +| **Caching Settings** | | | | +| `cache` | Boolean | `false` | Master switch to enable/disable caching. | +| `cache_params` | Object | `None` | Configuration for cache backend and behavior. | + +### Logging & Callback Settings + +This subsection within `litellm_settings` allows you to configure how the proxy handles logging and integrates with various monitoring and observability platforms. You can define callbacks for successful and failed requests, enable detailed debugging, and control what information is logged. + +#### `success_callback` + +**YAML Key:** `success_callback` + +**Type:** Array of Strings + +**Environment Variable:** `LITELLM_CALLBACKS` (comma-separated string) + +**Default Value:** `[]` (Empty list, no success callbacks enabled by default). + +**Description:** The `success_callback` parameter, within `litellm_settings`, is a **list of strings** that specifies which **callback integrations** should be executed *after* a request to the LLM API is **successfully completed**. Each string in the list should be the **name of a logging or monitoring callback** that you want to invoke on successful requests. Common examples include: `"langfuse"`, `"helicone"`, `"wandb"`, `"sentry"`, `"mlflow"`, etc. These callbacks are used for logging, monitoring, and observability purposes. + +**Example YAML:** + +```yaml +litellm_settings: + success_callback: ["langfuse", "helicone"] # Enabling Langfuse and Helicone callbacks for successful requests + # ... other litellm_settings ... +``` + +**Example Environment Variable:** + +```bash +export LITELLM_CALLBACKS="langfuse,helicone" # Setting success callbacks via environment variable +``` + +--- + +#### `failure_callback` + +**YAML Key:** `failure_callback` + +**Type:** Array of Strings + +**Environment Variable:** `LITELLM_CALLBACKS` (comma-separated string, same as `success_callback`) + +**Default Value:** `[]` (Empty list, no failure callbacks enabled by default). + +**Description:** The `failure_callback` parameter, in `litellm_settings`, is an **array of strings** that defines which **callback integrations** should be executed when a request to the LLM API **fails** (results in an error). Similar to `success_callback`, each string should be the name of a logging/monitoring callback. Failure callbacks are essential for capturing and reporting errors, enabling you to monitor the health and reliability of your LLM application. 
You might use callbacks like `"sentry"`, `"slack"`, or custom error logging functions in `failure_callback`. + +**Example YAML:** + +```yaml +litellm_settings: + failure_callback: ["sentry"] # Enabling Sentry callback for failed requests + # ... other litellm_settings ... +``` + +**Example Environment Variable:** + +```bash +export LITELLM_CALLBACKS="sentry" # Setting failure callback via environment variable (using LITELLM_CALLBACKS, same as success_callback) +``` + +--- + +#### `callbacks` (`litellm_settings`) + +**YAML Key:** `callbacks` + +**Type:** Array of Strings + +**Environment Variable:** `LITELLM_CALLBACKS` (comma-separated string, same as `success_callback` and `failure_callback`) + +**Default Value:** `[]` (Empty list, no general callbacks enabled by default). + +**Description:** The `callbacks` parameter, within `litellm_settings`, is an **array of strings** specifying callback integrations that should be executed **regardless of whether the LLM API call succeeds or fails**. These are general-purpose callbacks that should run for *every* request outcome. Common use cases for `callbacks` include: + +* **Comprehensive Logging:** Callbacks like `"otel"` (OpenTelemetry) for tracing or `"lunary"` for general logging that you want to capture for all requests, both successful and failed. +* **Auditing:** Callbacks for audit logging, where you need a record of every request attempt, regardless of success. +* **Metrics Collection:** Callbacks that increment counters or update metrics dashboards for every request, irrespective of the outcome. + +If a callback is listed in *both* `callbacks` and either `success_callback` or `failure_callback`, it will only be executed **once per request**, avoiding redundant callback invocations. + +**Example YAML:** + +```yaml +litellm_settings: + callbacks: ["otel"] # Enabling OpenTelemetry callback for all requests (success and failure) + # ... other litellm_settings ... +``` + +**Example Environment Variable:** + +```bash +export LITELLM_CALLBACKS="otel" # Setting general callback via environment variable (using LITELLM_CALLBACKS, same as success_callback and failure_callback) +``` + +--- + +#### `service_callbacks` + +**YAML Key:** `service_callbacks` + +**Type:** Array of Strings + +**Environment Variable:** N/A + +**Default Value:** `[]` (Empty list, no service callbacks enabled by default). + +**Description:** The `service_callbacks` parameter, within `litellm_settings`, is a **list of strings** that defines **service health monitoring callbacks**. These callbacks are *not* triggered by individual LLM API requests. Instead, they are designed to capture **internal service-level events** and failures within the LiteLLM Proxy itself, such as: + +* **Database Connection Issues:** Errors connecting to or interacting with the proxy's database (Postgres). +* **Cache Backend Failures:** Problems with Redis, S3, or other caching backends. +* **Internal Service Errors:** Exceptions or failures within the proxy's own components or services. + +`service_callbacks` are typically used for operational monitoring of the proxy's health and infrastructure, rather than for logging individual LLM request details. Examples include `"datadog"` and `"prometheus"` for reporting internal metrics and errors to monitoring platforms. + +**Example YAML:** + +```yaml +litellm_settings: + service_callbacks: ["datadog", "prometheus"] # Enabling Datadog and Prometheus service health callbacks + # ... other litellm_settings ... 
+``` + +**Example Environment Variable:** N/A (YAML-only parameter) + +--- + +#### `turn_off_message_logging` + +**YAML Key:** `turn_off_message_logging` + +**Type:** Boolean + +**Environment Variable:** N/A + +**Default Value:** `false` (Message logging is enabled by default). + +**Description:** The `turn_off_message_logging` parameter, in `litellm_settings`, is a **boolean flag** that controls whether the **content of prompts (messages) and responses** are logged to your configured callbacks. + +* **Privacy and Security:** Setting `turn_off_message_logging: true` prevents the actual text of conversations from being logged to external logging/monitoring systems. This is crucial for privacy-sensitive applications or when dealing with confidential data in prompts and responses. +* **Reduced Log Volume:** Disabling message logging can significantly reduce the volume of logs generated by the proxy, which can be beneficial for cost optimization and easier log management, especially in high-throughput environments. +* **Metadata Logging Still Active:** Even when message logging is turned off, the proxy will still log *metadata* about requests, such as token counts, costs, timestamps, model names, and error details. Only the actual text content of prompts and responses is suppressed. + +**Example YAML:** + +```yaml +litellm_settings: + turn_off_message_logging: true # Disabling message content logging for privacy + success_callback: ["langfuse"] # Still logging metadata to Langfuse, but no message text + # ... other litellm_settings ... +``` + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Note:** In code, you can achieve the same effect by setting `litellm.turn_off_message_logging=True` globally in your application or proxy initialization code. + +--- + +#### `redact_user_api_key_info` + +**YAML Key:** `redact_user_api_key_info` + +**Type:** Boolean + +**Environment Variable:** N/A + +**Default Value:** `false` (User API key information is logged by default). + +**Description:** The `redact_user_api_key_info` parameter, within `litellm_settings`, is a **boolean flag** that controls whether **information related to the end-user's API key (virtual key)** is redacted (removed or anonymized) from logs sent to your configured callbacks. + +* **Privacy of User Credentials:** When enabled (`redact_user_api_key_info: true`), the proxy will attempt to remove or anonymize user-specific credential information from logs before they are sent to external logging systems (like Langfuse, OpenTelemetry, etc.). This includes: + * The hashed virtual API key itself. + * User IDs, team IDs, or any other identifiers associated with the virtual key. +* **Log Anonymization:** This feature helps prevent accidental exposure of user-specific credential information in external logs, enhancing privacy and security, especially in multi-tenant environments where user API keys are used. +* **Supported Loggers:** Redaction is supported for specific loggers, including Langfuse, OpenTelemetry, Logfire, and ArizeAI (as of the documentation). Check the official documentation for the most up-to-date list of supported loggers. + +**Example YAML:** + +```yaml +litellm_settings: + redact_user_api_key_info: true # Redacting user API key info from logs for privacy + success_callback: ["langfuse"] # Langfuse logging enabled, but user key info will be redacted + # ... other litellm_settings ... 
+``` + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Note:** There is no direct environment variable equivalent for this setting; it must be configured in `config.yaml`. + +--- + +#### `langfuse_default_tags` + +**YAML Key:** `langfuse_default_tags` + +**Type:** Array of Strings + +**Environment Variable:** N/A + +**Default Value:** `[]` (Empty list, no default Langfuse tags are added by default). + +**Description:** The `langfuse_default_tags` parameter, within `litellm_settings`, is an **array of strings** that allows you to specify a list of **default tags** to be automatically added to every log event sent to **Langfuse**, when using the Langfuse logging integration. + +* **Enhanced Langfuse Filtering and Analysis:** Tags are key-value pairs that can be attached to log events in Langfuse. By defining default tags, you can automatically enrich your Langfuse logs with useful metadata, making it easier to filter, group, and analyze requests within the Langfuse platform. +* **Predefined Metadata:** Common tags you might want to include by default could be: + * `"proxy_base_url"`: The base URL of your LiteLLM Proxy instance. + * `"user_api_key_alias"`: The alias or identifier of the user API key used for the request. + * `"semantic-similarity"`: Any semantic similarity scores or metrics calculated during request processing. + * `"cache_hit"`: Indicates if the response was served from the cache. + * `"user_api_key_user_id"`: The User ID associated with the API key. + +**Example YAML:** + +```yaml +litellm_settings: + success_callback: ["langfuse"] # Langfuse logging enabled + langfuse_default_tags: ["proxy_base_url", "user_api_key_alias", "cache_hit"] # Adding default tags to Langfuse logs + # ... other litellm_settings ... +``` + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Note:** These tags will then be automatically applied to every log event sent to Langfuse by the proxy, aiding in filtering and analysis within the Langfuse UI. + +--- + +#### `modify_params` + +**YAML Key:** `modify_params` + +**Type:** Boolean + +**Environment Variable:** N/A + +**Default Value:** `false` (Parameter modification is disabled by default). + +**Description:** The `modify_params` parameter, within `litellm_settings`, is an **advanced boolean flag** that enables a powerful, but potentially risky, feature: allowing **programmatic modification of request parameters** *just before* the proxy sends the request to the backend LLM provider. + +* **Custom Request Transformations:** When `modify_params: true`, the proxy allows custom code (typically via a plugin or callback) to intercept and alter the `litellm_params` dictionary or the request body itself *on the fly*. This opens up possibilities for highly customized request handling. +* **Advanced Use Cases:** `modify_params` is primarily intended for advanced and enterprise scenarios, such as: + * **Custom Plugin Logic:** Implementing complex plugins that need to dynamically adjust API requests based on real-time conditions, user context, or external data. + * **Provider-Specific Parameter Injection:** Programmatically injecting provider-specific parameters that are not directly exposed through standard LiteLLM parameters, but needed for certain backend models. + * **Request Augmentation:** Adding extra information or context to the request payload based on custom logic. 
+ +**Important Security and Stability Considerations:** + +* **Security Risks:** Enabling `modify_params: true` introduces significant security risks if not used carefully. Malicious or poorly written custom code could potentially: + * Expose sensitive data by logging or transmitting modified requests. + * Bypass security checks or rate limits. + * Cause unexpected behavior or errors in the proxy or backend LLM. +* **Stability Risks:** Incorrectly modifying parameters can lead to malformed API requests, causing errors or unpredictable LLM behavior. +* **Advanced Feature – Use with Caution:** This parameter is marked as an advanced feature and should be used **only if you fully understand the risks** and have a clear, well-justified use case. **Thoroughly test and audit** any custom code that modifies request parameters. + +**Example YAML:** + +```yaml +litellm_settings: + modify_params: true # Enabling request parameter modification (advanced feature - use with caution!) + # ... other litellm_settings ... +``` + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Note:** The actual logic for *how* parameters are modified is *not* configured via `config.yaml`. `modify_params: true` simply *enables* the capability. The custom modification logic itself must be implemented in separate Python code, usually within a plugin or callback, that is designed to work with this feature. + +--- + +#### `enable_preview_features` + +**YAML Key:** `enable_preview_features` + +**Type:** Boolean + +**Environment Variable:** N/A + +**Default Value:** `false` (Preview features are disabled by default). + +**Description:** The `enable_preview_features` parameter, within `litellm_settings`, is a **boolean flag** that controls whether **experimental or preview features** within the LiteLLM library are enabled. + +* **Access to New or Experimental Functionality:** LiteLLM may introduce new features or integrations that are initially marked as "preview" or "experimental". These features are typically under active development and may have limited stability, documentation, or support. +* **Early Access and Testing:** Enabling `enable_preview_features: true` allows you to access and test these preview features, giving you early access to new capabilities and the opportunity to provide feedback. +* **Potential Instability or Changes:** Preview features are *not* considered stable and are subject to change, removal, or movement to the Enterprise edition in future releases. **Do not rely on preview features in production environments** unless you are prepared for potential instability and changes. +* **Documentation Check:** Always consult the official LiteLLM documentation to understand *which* features are currently behind the `enable_preview_features` flag and what their limitations or caveats are. + +**Example YAML:** + +```yaml +litellm_settings: + enable_preview_features: true # Enabling preview features (use with caution!) + # ... other litellm_settings ... +``` + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Example Use Case:** At the time of writing, enabling Azure's "O1" series models with streaming requires setting `enable_preview_features: true`. + +--- + +#### `set_verbose` + +**YAML Key:** `set_verbose` + +**Type:** Boolean + +**Environment Variable:** `LITELLM_VERBOSE=1` (or any non-zero value) + +**Default Value:** `false` (Verbose logging is disabled by default). 
+ +**Description:** The `set_verbose` parameter, within `litellm_settings`, is a **boolean flag** that enables **verbose debug logging** from the core LiteLLM Python library itself. When enabled, LiteLLM will output detailed debug messages to the proxy's console (stdout), providing very granular information about its internal operations, API calls, and processing steps. + +* **Detailed Debugging Information:** Verbose logging is invaluable for troubleshooting complex issues, understanding the internal workings of LiteLLM, or debugging custom integrations. It can reveal: + * Detailed request and response payloads sent to and received from LLM providers. + * Internal function calls and execution flow within the LiteLLM library. + * Variable values and internal state during request processing. +* **High Log Volume:** Verbose logging generates a **very large volume of log output**. **Do not enable `set_verbose: true` in production environments** unless absolutely necessary for debugging a specific issue, as it can impact performance, consume disk space, and potentially expose sensitive information in logs. +* **Development and Testing Tool:** `set_verbose: true` is primarily intended as a **development and debugging tool** for local testing or controlled staging environments. + +**Example YAML:** + +```yaml +litellm_settings: + set_verbose: true # Enabling verbose debug logging (for development/debugging only!) + # ... other litellm_settings ... +``` + +**Example Environment Variable:** + +```bash +export LITELLM_VERBOSE=1 # Enabling verbose logging via environment variable +``` + +**Note:** Setting the environment variable `LITELLM_VERBOSE=1` (or any non-zero value) has the same effect as setting `set_verbose: true` in `config.yaml`. You can also set `litellm.set_verbose=True` programmatically in your code. + +--- + +#### `json_logs` + +**YAML Key:** `json_logs` + +**Type:** Boolean + +**Environment Variable:** `LITELLM_JSON_LOGS=true` (or any value interpretable as boolean true, e.g., `"1"`, `"yes"`) + +**Default Value:** `false` (Logs are output in plain text format by default). + +**Description:** The `json_logs` parameter, within `litellm_settings`, is a **boolean flag** that controls the **format of the proxy's logs**. + +* **JSON Structured Logs:** When `json_logs: true`, the proxy will output all logs (including request logs, error logs, and other proxy events) in **JSON (JavaScript Object Notation) format** instead of the default plain text format. +* **Structured Logging Benefits:** JSON logs are highly beneficial for: + * **Log Aggregation and Analysis:** JSON format is easily parsed and ingested by log management systems (like Elasticsearch, Splunk, Datadog Logs, etc.), enabling efficient centralized logging, searching, and analysis. + * **Programmatic Processing:** JSON logs are structured data, making it much easier to programmatically parse, filter, and process log information for monitoring dashboards, automated alerts, or custom analytics tools. + * **Integration with Observability Pipelines:** JSON logs are well-suited for integration into modern observability pipelines and systems that expect structured log data. + +**Example YAML:** + +```yaml +litellm_settings: + json_logs: true # Enabling JSON formatted logs for structured logging + # ... other litellm_settings ... 
+``` + +**Example Environment Variable:** + +```bash +export LITELLM_JSON_LOGS=true # Enabling JSON logs via environment variable +``` + +**Note:** Enabling `json_logs: true` will cause the proxy to output all logs in JSON format to stdout (standard output). If you are shipping logs to a JSON log aggregator, this is highly recommended. Be aware that enabling JSON logs also causes the raw POST request payload to be logged as JSON; this is useful for debugging, but consider the security implications of logging request bodies in that much detail. + +--- + +### Networking & Timeout Settings + +This subsection of `litellm_settings` controls network-related configurations, primarily focusing on timeouts and connection behaviors for requests made to the backend LLM providers. These settings are crucial for ensuring responsiveness and handling network issues. + +#### `request_timeout` + +**YAML Key:** `request_timeout` + +**Type:** Integer (representing seconds) + +**Environment Variable:** N/A + +**Default Value:** `6000` seconds (100 minutes), per the LiteLLM documentation. This very permissive default is deliberate (it avoids cutting off long-running generations out of the box), but it is rarely appropriate for production; set a lower value explicitly, as recommended below. + +**Description:** The `request_timeout` parameter, within `litellm_settings`, sets the **timeout duration in seconds** for all LLM API calls made by the LiteLLM library within the proxy. This timeout applies to the *entire* duration of a single API call to the backend LLM provider, including network latency, server-side processing, and response streaming. + +* **Preventing Hanging Requests:** `request_timeout` is crucial for preventing indefinite hanging or stalled requests. If a call to the upstream model takes longer than the specified timeout, the proxy will **abort the request** and raise a `Timeout` error. +* **Resource Management:** Timeouts help free up resources on the proxy server and prevent it from getting stuck waiting for unresponsive or slow LLM providers. +* **User Experience:** Setting a reasonable timeout ensures that clients receive timely responses or error indications, rather than waiting indefinitely for a response that may never come. +* **Recommended Value:** Set `request_timeout` explicitly rather than relying on the default, balancing responsiveness with enough headroom for legitimate LLM processing. A value of **30-60 seconds** is a good starting point for many chat applications; increase it for long completions, large context windows, or slower providers. + +**Example YAML:** + +```yaml +litellm_settings: + request_timeout: 30 # Setting request timeout to 30 seconds + # ... other litellm_settings ... +``` + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Note:** There is also a `router_settings.timeout` parameter, which sets a timeout for the *overall request handling* within the proxy router, including retries and fallbacks. The `litellm_settings.request_timeout` is specifically for the underlying LLM API call itself. If both are set, `router_settings.timeout` typically takes precedence for overall request handling, while `litellm_settings.request_timeout` governs the individual LLM call timeout.
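To make the interaction between the proxy-side `request_timeout` and a client's own timeout concrete, here is a minimal sketch. It assumes a proxy running at `http://localhost:4000`, a virtual key `sk-1234`, a `gpt-3.5-turbo` alias in `model_list`, and the 30-second `request_timeout` from the example above; those values are illustrative, not part of any real deployment.

```python
# Hedged sketch: handling a proxy-side timeout from the client.
# The proxy URL, virtual key, and model alias are assumptions for illustration.
import requests

resp = requests.post(
    "http://localhost:4000/v1/chat/completions",
    headers={"Authorization": "Bearer sk-1234"},
    json={
        "model": "gpt-3.5-turbo",
        "messages": [{"role": "user", "content": "Summarize this document in one sentence."}],
    },
    timeout=45,  # client-side timeout, kept slightly above the proxy's request_timeout of 30s
)

if resp.ok:
    print(resp.json()["choices"][0]["message"]["content"])
else:
    # If the upstream call exceeded request_timeout, the proxy aborts it and returns an
    # error response here instead of leaving the client hanging indefinitely.
    print("Proxy returned an error:", resp.status_code, resp.text)
```

Keeping the client-side timeout a little above the proxy's `request_timeout` ensures the proxy, not the client, is the component that decides when an upstream call has taken too long.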
+ +--- + +#### `force_ipv4` + +**YAML Key:** `force_ipv4` + +**Type:** Boolean + +**Environment Variable:** N/A + +**Default Value:** `false` (IPv6 is preferred if available, otherwise IPv4 is used). + +**Description:** The `force_ipv4` parameter, in `litellm_settings`, is a **boolean flag** that, when set to `true`, **forces all outgoing LLM API requests from the proxy to use IPv4** (Internet Protocol version 4) exclusively. + +* **IPv6 Compatibility Issues Workaround:** In certain network environments or with specific LLM providers, IPv6 connectivity might cause issues, such as HTTPX connection errors (particularly observed with Anthropic API). Setting `force_ipv4: true` can act as a workaround by forcing the proxy to use IPv4, potentially resolving these connectivity problems. +* **IPv4-Only Environments:** In environments where IPv6 is not fully supported or reliably configured, forcing IPv4 ensures consistent and predictable network communication for LLM API calls. +* **Default IPv6 Preference:** By default (`force_ipv4: false`), the proxy will attempt to use IPv6 if it is available and properly configured in the system environment. If IPv6 is not available or fails, it will fall back to IPv4 automatically. Setting `force_ipv4: true` disables this IPv6 preference and forces IPv4-only communication. + +**Example YAML:** + +```yaml +litellm_settings: + force_ipv4: true # Forcing IPv4 for all LLM API requests (workaround for IPv6 issues) + # ... other litellm_settings ... +``` + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Note:** Only enable `force_ipv4: true` if you are experiencing specific IPv6 connectivity issues with your LLM provider APIs. In most modern network environments, leaving it at the default `false` is recommended, allowing the proxy to leverage IPv6 if available. + +--- + +#### `forward_traceparent_to_llm_provider` + +**YAML Key:** `forward_traceparent_to_llm_provider` + +**Type:** Boolean + +**Environment Variable:** N/A + +**Default Value:** `false` (Traceparent header is not forwarded by default). + +**Description:** The `forward_traceparent_to_llm_provider` parameter, within `litellm_settings`, is a **boolean flag** that controls whether the proxy will **forward any incoming `traceparent` header** it receives from the client application to the **upstream LLM API call**. + +* **Distributed Tracing Propagation:** The `traceparent` header is part of the W3C Trace Context standard for distributed tracing. When a client application initiates a trace (e.g., using OpenTelemetry or a similar tracing system) and includes a `traceparent` header in its request to the proxy, enabling `forward_traceparent_to_llm_provider: true` will propagate this trace context to the backend LLM provider. +* **End-to-End Traceability:** This allows for end-to-end traceability of requests across your entire system, from the client application, through the LiteLLM Proxy, and into the LLM provider's infrastructure (if the provider also supports and honors trace context propagation). This is invaluable for debugging and performance monitoring of distributed LLM applications. +* **Self-Hosted or Internal LLMs:** `forward_traceparent_to_llm_provider: true` is **generally safe and useful** when you are proxying to **self-hosted LLMs** or **internal model services** that you control or trust and that are designed to handle trace context propagation. 
+* **Caution with External Services:** **Exercise caution** when enabling this for **external, third-party LLM services** (like OpenAI, Anthropic, Google Vertex AI, AWS Bedrock). External services may not recognize or correctly handle the `traceparent` header. In some cases, forwarding unknown headers to external services might even cause errors or unexpected behavior. **The documentation explicitly warns against using this with external services like AWS Bedrock or Google Vertex AI**, as they may reject requests with unknown headers. +* **Default Off for Safety:** The default value is `false` for safety, to avoid unintended issues with external services. + +**Example YAML:** + +```yaml +litellm_settings: + forward_traceparent_to_llm_provider: true # Enabling traceparent header forwarding for internal LLM backend + # ... other litellm_settings ... +``` + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Important:** Only enable `forward_traceparent_to_llm_provider: true` if you are certain that your LLM backend (typically a self-hosted or internal service) is designed to accept and handle `traceparent` headers. For external, third-party LLM providers, leave this setting at its default `false` unless you have explicit guidance from the provider to enable trace context propagation. + +--- + +### Fallback & Reliability Settings + +This subsection within `litellm_settings` provides options for configuring fallback models and other reliability features. These settings help your proxy handle errors and ensure service continuity when primary LLMs are unavailable or exceed context limits. + +#### `default_fallbacks` + +**YAML Key:** `default_fallbacks` + +**Type:** Array of Strings (List of model names) + +**Environment Variable:** N/A + +**Default Value:** `[]` (Empty list, no default fallbacks configured). + +**Description:** The `default_fallbacks` parameter, within `litellm_settings`, is an **array of model names** that defines a **global fallback list**. This list specifies which alternative models the proxy should attempt to use as a **last resort** if a request to the originally intended model **fails for any reason** (except for `ContentPolicyViolationError` and `ContextWindowExceededError`, which have their own dedicated fallback settings). + +* **Global Safety Net:** `default_fallbacks` acts as a **catch-all fallback** mechanism. If a request to a model fails and no model-specific fallback is defined (via `router_settings.fallbacks` or request-level `fallbacks` parameter) and the error is not a content policy or context window error, the proxy will try the models listed in `default_fallbacks` in the order they are specified. +* **Reliability and Redundancy:** Use `default_fallbacks` to enhance the reliability of your LLM application by ensuring that there are always backup options if primary models become unavailable or encounter issues. +* **Ordered Fallback Attempts:** The proxy will attempt to use the fallback models in the list **sequentially**. If the first fallback model also fails, it will try the next one, and so on, until either a fallback succeeds or the list is exhausted. +* **Model names from `model_list`:** The model names listed in `default_fallbacks` must be valid `model_name` aliases that are defined in your `model_list` section of `config.yaml`. + +**Example YAML:** + +```yaml +litellm_settings: + default_fallbacks: ["claude-instant-1", "gpt-3.5-turbo"] # Global fallback list: try Claude Instant-1, then GPT-3.5 Turbo + # ... other litellm_settings ...
+``` + +In this example, if a request fails for any model and no specific fallback is set for that model, the proxy will first attempt to use the model aliased as `"claude-instant-1"`. If that also fails, it will then try `"gpt-3.5-turbo"`. If both fallbacks fail, the request will ultimately result in an error. + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Note:** `default_fallbacks` is a *global* fallback mechanism. For more granular control, you can define model-specific fallbacks using `router_settings.fallbacks` (see Router Settings section). + +--- + +#### `content_policy_fallbacks` + +**YAML Key:** `content_policy_fallbacks` + +**Type:** Array of Objects (List of mappings) + +**Environment Variable:** N/A + +**Default Value:** `[]` (Empty list, no content policy fallbacks configured). + +**Description:** The `content_policy_fallbacks` parameter, within `litellm_settings`, is an **array of objects** used to specify **fallback models specifically for `ContentPolicyViolationError`** errors. This error typically indicates that the requested content (prompt or completion) violated the content policy of the LLM provider (e.g., OpenAI, Anthropic, etc.). + +* **Content Moderation Handling:** `content_policy_fallbacks` allows you to define a specific fallback strategy to handle content policy rejections gracefully. Instead of simply returning an error to the client, the proxy can automatically retry the request with a different, potentially less restrictive, model. +* **Model-Specific Fallbacks for Content Policy:** You can define fallbacks on a per-model basis. Each object in the `content_policy_fallbacks` list is a **mapping** that associates a *source model* (or model group) with a *list of fallback models* to try if a `ContentPolicyViolationError` occurs for the source model. +* **Example Mapping:** `content_policy_fallbacks: [{"gpt-3.5-turbo-small": ["claude-opus"]}]` means: "If model `gpt-3.5-turbo-small` returns a `ContentPolicyViolationError`, retry the request with `claude-opus`." +* **List of Fallbacks:** For each source model, you can specify a list of fallback models to try in order. The proxy will attempt the fallbacks sequentially until one succeeds or the list is exhausted. +* **Model names from `model_list`:** The model names used as source models and fallback models must be valid `model_name` aliases defined in your `model_list` section. + +**Example YAML:** + +```yaml +litellm_settings: + content_policy_fallbacks: # Defining fallbacks specifically for ContentPolicyViolationError + - gpt-3.5-turbo-small: ["claude-instant-1", "gpt-3.5-turbo-large"] # If gpt-3.5-turbo-small fails content policy, try Claude Instant-1, then GPT-3.5-turbo-large + - gpt-4: ["claude-2"] # If gpt-4 fails content policy, try Claude-2 + # ... other litellm_settings ... +``` + +In this example, if a request to `gpt-3.5-turbo-small` results in a `ContentPolicyViolationError`, the proxy will first retry the request with `claude-instant-1`. If that also fails (or also results in a content policy error), it will then try `gpt-3.5-turbo-large`. A separate fallback mapping is defined for `gpt-4`, which uses `claude-2` as its content policy fallback. + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Note:** `content_policy_fallbacks` is specifically for handling `ContentPolicyViolationError` errors. For other types of errors, use the general `default_fallbacks` or `router_settings.fallbacks` mechanisms.
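The ordering behavior described in these fallback sections is easy to misread, so here is a short, purely illustrative Python sketch of the logic. This is not the proxy's actual implementation; the exception class, model names, and `call_model` helper are hypothetical stand-ins.

```python
# Illustrative sketch of ordered fallbacks on a content-policy rejection.
# Everything here is a stand-in: the proxy implements this logic internally.
class ContentPolicyViolationError(Exception):
    """Stand-in for the provider rejecting a prompt on content-policy grounds."""

CONTENT_POLICY_FALLBACKS = {
    "gpt-3.5-turbo-small": ["claude-instant-1", "gpt-3.5-turbo-large"],
    "gpt-4": ["claude-2"],
}

def call_model(model: str, prompt: str) -> str:
    """Placeholder for the actual upstream LLM call made by the proxy."""
    raise ContentPolicyViolationError(f"{model} rejected the prompt")

def complete_with_fallbacks(model: str, prompt: str) -> str:
    candidates = [model] + CONTENT_POLICY_FALLBACKS.get(model, [])
    last_error = None
    for candidate in candidates:  # source model first, then each fallback in listed order
        try:
            return call_model(candidate, prompt)
        except ContentPolicyViolationError as err:
            last_error = err      # rejected -- move on to the next candidate
    raise last_error              # every candidate was rejected; surface the final error
```

The same sequential pattern applies to `context_window_fallbacks` and `default_fallbacks`; only the triggering error type differs.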
+ +--- + +#### `context_window_fallbacks` + +**YAML Key:** `context_window_fallbacks` + +**Type:** Array of Objects (List of mappings) + +**Environment Variable:** N/A + +**Default Value:** `[]` (Empty list, no context window fallbacks configured). + +**Description:** The `context_window_fallbacks` parameter, within `litellm_settings`, is an **array of objects** used to specify **fallback models specifically for `ContextWindowExceededError`** errors. This error occurs when the input prompt (or combined prompt and response length) is too long and exceeds the maximum context window of the model. + +* **Context Length Handling:** `context_window_fallbacks` provides a mechanism to automatically handle requests that are too long for a given model's context window. Instead of failing, the proxy can intelligently retry the request with a model that has a larger context window. +* **Model-Specific Fallbacks for Context Window Errors:** Similar to `content_policy_fallbacks`, you define fallbacks on a per-model basis. Each object in the `context_window_fallbacks` list is a **mapping** that associates a *source model* (or model group) with a *list of fallback models* with larger context windows. +* **Example Mapping:** `context_window_fallbacks: [{"gpt-3.5-turbo-small": ["gpt-3.5-turbo-large", "claude-opus"]}]` means: "If model `gpt-3.5-turbo-small` returns a `ContextWindowExceededError`, first retry with `gpt-3.5-turbo-large`, and if that also fails (or also gets a context window error), then try `claude-opus`." +* **Ordered Fallback Attempts:** The proxy will try the fallback models in the list sequentially until a model with a sufficiently large context window is found or the list is exhausted. +* **Model names from `model_list`:** The model names used as source models and fallback models must be valid `model_name` aliases defined in your `model_list`. + +**Example YAML:** + +```yaml +litellm_settings: + context_window_fallbacks: # Defining fallbacks specifically for ContextWindowExceededError + - gpt-3.5-turbo-small: ["gpt-3.5-turbo-large", "claude-opus"] # If gpt-3.5-turbo-small gets context window error, try gpt-3.5-turbo-large, then Claude Opus + - gpt-3.5-turbo: ["gpt-4"] # If gpt-3.5-turbo (4K context) gets context window error, try gpt-4 (8K+ context) + # ... other litellm_settings ... +``` + +In this example, if `gpt-3.5-turbo-small` (perhaps a model with a 4K context window) gets a request that's too long, the proxy will first try `gpt-3.5-turbo-large` (e.g., a 16K context model). If that still fails due to context length, it will then attempt to use `claude-opus` (potentially a model with a very large context). A separate fallback is defined for `gpt-3.5-turbo` (assuming it has a smaller context than `gpt-4`). + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Note:** `context_window_fallbacks` is specifically for `ContextWindowExceededError` errors. For other error types, use the general `default_fallbacks` or `router_settings.fallbacks` mechanisms. + +--- + +#### `disable_end_user_cost_tracking` + +**YAML Key:** `disable_end_user_cost_tracking` + +**Type:** Boolean + +**Environment Variable:** N/A + +**Default Value:** `false` (End-user cost tracking is enabled by default). + +**Description:** The `disable_end_user_cost_tracking` parameter, within `litellm_settings`, is a **boolean flag** that controls whether the proxy should **track costs at the *end-user* level**.
By default, LiteLLM Proxy tracks and records the cost of each request, attributing it to the specific end-user (if a `user` identifier is provided in the request or associated with the API key). This per-user cost tracking is used for: + +* **Usage Dashboards:** Generating dashboards that show LLM spend broken down by user, team, or project. +* **Cost Control and Budgeting:** Enforcing budgets and quotas at the user or team level. +* **Detailed Usage Analysis:** Providing granular usage data for cost optimization and resource allocation. + +Setting `disable_end_user_cost_tracking: true` will **turn off this per-user cost tracking**. The proxy will still track *aggregate* usage and costs, but it will not record or report costs at the individual end-user level. + +* **Privacy Considerations:** You might disable per-user cost tracking for privacy reasons, if you do not want to associate costs with specific users. +* **Simplified Cost Reporting:** If you only need aggregate cost metrics and do not require per-user spend breakdowns, disabling this feature can simplify your data and reduce the amount of data stored in the proxy's database. + +**Example YAML:** + +```yaml +litellm_settings: + disable_end_user_cost_tracking: true # Disabling per-user cost tracking for privacy + # ... other litellm_settings ... +``` + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Note:** When end-user cost tracking is disabled, the proxy will still track and report *aggregate* costs (total spend across all requests), but the per-user breakdown will not be available. + +--- + +#### `disable_end_user_cost_tracking_prometheus_only` + +**YAML Key:** `disable_end_user_cost_tracking_prometheus_only` + +**Type:** Boolean + +**Environment Variable:** N/A + +**Default Value:** `false` (End-user cost tracking is enabled in Prometheus metrics by default). + +**Description:** The `disable_end_user_cost_tracking_prometheus_only` parameter, within `litellm_settings`, is a **boolean flag** that provides more granular control over end-user cost tracking. When set to `true`, it **disables per-user cost tracking *specifically in Prometheus metrics***, but **continues to record per-user cost data in the proxy's database spend logs**. + +* **Selective Metric Exposure:** This setting allows you to control *where* per-user cost information is exposed. You might use this if: + * You want to retain detailed per-user spend logs in your internal database for audit or detailed analysis. + * But you do *not* want to expose per-user cost metrics via Prometheus, perhaps for privacy or security reasons when Prometheus metrics are exposed externally or to a wider audience. +* **Database Logs Still Retain Detail:** Even when Prometheus tracking is disabled via this flag, the proxy will still write per-user spend information to the `spend_logs` database table, ensuring that detailed spend data is still captured internally. + +**Example YAML:** + +```yaml +litellm_settings: + disable_end_user_cost_tracking_prometheus_only: true # Disabling per-user cost tracking in Prometheus metrics only + # ... other litellm_settings ... +``` + +**Example Environment Variable:** N/A (YAML-only parameter) + +**In summary:** `disable_end_user_cost_tracking` turns off per-user cost tracking *everywhere* (database and Prometheus). `disable_end_user_cost_tracking_prometheus_only` turns it off *only in Prometheus metrics*, while keeping detailed per-user logs in the database. 
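If you keep end-user cost tracking enabled, the proxy can only attribute spend to a user when the request identifies one. Below is a minimal sketch of passing the standard OpenAI `user` field so spend is attributed to `"customer-42"`; the proxy URL, virtual key, and model alias are illustrative assumptions.

```python
# Hedged sketch: tagging a request with an end-user identifier for per-user cost tracking.
# The proxy URL, virtual key, and model alias below are assumptions for illustration.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:4000", api_key="sk-1234")

response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Write a haiku about cost tracking."}],
    user="customer-42",  # end-user identifier used to attribute this request's spend
)
print(response.choices[0].message.content)
```

With `disable_end_user_cost_tracking: true`, the same request still succeeds, but only aggregate spend is recorded rather than a per-user breakdown.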
+ +--- + +#### `key_generation_settings` + +**YAML Key:** `key_generation_settings` + +**Type:** Object (Mapping) + +**Environment Variable:** N/A + +**Default Value:** If not provided, default key generation settings apply (no special restrictions beyond normal authentication). + +**Description:** The `key_generation_settings` parameter, within `litellm_settings`, is an **advanced mapping** (object) that controls **who is allowed to generate API keys (virtual keys)** via the proxy's admin endpoints (e.g., the `/key/generate` API or the Admin UI). This is primarily an **enterprise-level configuration**, relevant when you expose a UI or API for users or teams to create their own API keys for accessing the proxy. + +* **Granular Key Generation Control:** `key_generation_settings` allows you to define fine-grained rules and restrictions on API key generation, enabling you to implement policies around who can create keys and what requirements must be met during key creation. +* **Sub-Keys for Key Types:** The `key_generation_settings` object can contain two sub-keys: + * `team_key_generation`: Settings specifically for generating **team API keys** (keys associated with a team or organization). + * `personal_key_generation`: Settings for generating **personal API keys** (keys not tied to a team). +* **Enterprise Feature:** This is primarily an **enterprise feature** used in multi-tenant scenarios or when you need to enforce strict governance over API key issuance. + +**Configuration Options within `key_generation_settings` Sub-Keys:** + +For both `team_key_generation` and `personal_key_generation`, you can define the following restrictions: + +* **`allowed_team_member_roles` / `allowed_user_roles`:** (Array of Strings): Lists the user roles that are permitted to generate keys of this type. Roles are typically defined within your user authentication system (e.g., "admin", "proxy_admin", "developer", etc.). Only users with roles listed here will be authorized to create keys. If not specified, there's no role-based restriction beyond general authentication. +* **`required_params`:** (Array of Strings): Lists parameters that are **required** to be provided when generating a key of this type. For example, you might require that team admins always provide `"tags"` when creating team API keys for cost tracking purposes. If a required parameter is missing during key generation, the request will be rejected. + +**Example YAML:** + +```yaml +litellm_settings: + key_generation_settings: # Enterprise Feature: Controlling key generation + team_key_generation: # Settings for team API keys + allowed_team_member_roles: ["admin"] # Only team members with "admin" role can create team keys + required_params: ["tags"] # Team keys must be created with "tags" parameter + personal_key_generation: # Settings for personal API keys + allowed_user_roles: ["proxy_admin", "key_manager"] # Only users with "proxy_admin" or "key_manager" roles can create personal keys + # No required_params specified for personal keys (optional) + # ... other litellm_settings ... +``` + +In this example: + +* Only team members with the `"admin"` role can create team API keys, and they *must* provide a `"tags"` parameter when creating a team key. +* Only users with the `"proxy_admin"` or `"key_manager"` roles can create personal API keys. There are no additional parameter requirements for personal keys. 
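To see how these restrictions surface in practice, here is a hedged sketch of a team admin generating a team key against the proxy's `/key/generate` endpoint under the example policy above. The URL, master key, team ID, and exact request fields are illustrative assumptions; consult your version's key-management documentation for the precise schema (for example, whether `tags` is a top-level field or nested under `metadata`).

```python
# Hedged sketch: generating a team key under the key_generation_settings shown above.
# The proxy URL, master key, team_id, and field names are assumptions for illustration.
import requests

resp = requests.post(
    "http://localhost:4000/key/generate",
    headers={"Authorization": "Bearer sk-master"},  # caller must hold an allowed role (e.g., team admin)
    json={
        "team_id": "team-alpha",        # hypothetical team identifier
        "models": ["gpt-3.5-turbo"],
        "tags": ["project:checkout"],   # intended to satisfy required_params: ["tags"]
    },
)
resp.raise_for_status()  # a missing required param or a disallowed role should surface as an error here
print(resp.json()["key"])  # the newly issued virtual key
```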
+ +**Example Environment Variable:** N/A (YAML-only parameter) + +**Note:** `key_generation_settings` is an **enterprise feature** primarily relevant for multi-tenant deployments or organizations with strict API key governance policies. If unspecified, there are no special restrictions on key generation beyond the normal authentication requirements of the proxy. + +--- + +#### `disable_add_transform_inline_image_block` + +**YAML Key:** `disable_add_transform_inline_image_block` + +**Type:** Boolean + +**Environment Variable:** N/A + +**Default Value:** `false` (Automatic `#transform=inline` addition is enabled by default). + +**Description:** The `disable_add_transform_inline_image_block` parameter, within `litellm_settings`, is a **boolean flag** specifically related to integration with **Fireworks AI models**. When set to `true`, it **disables the automatic addition of `#transform=inline` to the `image_url` parameter** for non-vision Fireworks AI models. + +* **Fireworks AI Specific Behavior:** Fireworks AI's API might require image URLs to include `#transform=inline` as a query parameter for certain models, especially for image generation calls when the model itself is *not* a vision model. +* **Automatic Parameter Appending (Default):** By default (`disable_add_transform_inline_image_block: false`), LiteLLM Proxy will automatically append `#transform=inline` to the `image_url` when making image generation requests to Fireworks AI models that are *not* vision models. This is to ensure compatibility with Fireworks AI's API requirements. +* **Disabling Auto-Addition (Optional):** Setting `disable_add_transform_inline_image_block: true` will turn off this automatic parameter appending. You might want to disable it if: + * You are using a Fireworks AI model where this parameter is not needed or causes issues. + * You want to have explicit control over the `image_url` and do not want the proxy to modify it automatically. +* **Fireworks AI Context:** This setting is **only relevant when you are using Fireworks AI models**. If you are not using Fireworks AI, you can safely ignore this parameter. + +**Example YAML:** + +```yaml +litellm_settings: + disable_add_transform_inline_image_block: true # Disabling auto-addition of #transform=inline for Fireworks AI + # ... other litellm_settings ... +``` + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Note:** If you are not using Fireworks AI models, you can typically leave this parameter at its default `false` value. Only adjust it to `true` if you specifically need to disable the automatic `#transform=inline` appending for Fireworks AI models, and understand the implications for your Fireworks AI integration. + +--- + +#### `disable_hf_tokenizer_download` + +**YAML Key:** `disable_hf_tokenizer_download` + +**Type:** Boolean + +**Environment Variable:** N/A + +**Default Value:** `false` (Hugging Face tokenizer download is enabled by default). + +**Description:** The `disable_hf_tokenizer_download` parameter, within `litellm_settings`, is a **boolean flag** that controls whether LiteLLM should attempt to **download Hugging Face tokenizer files**. + +* **Hugging Face Tokenizer Usage (Default):** By default (`disable_hf_tokenizer_download: false`), LiteLLM will try to use the **appropriate tokenizer** for Hugging Face models. This often involves downloading tokenizer files from the Hugging Face Hub when a Hugging Face model is first used via the proxy. 
Using the correct tokenizer is essential for accurate token counting and optimal performance with Hugging Face models. +* **Forcing OpenAI Tokenizer (Alternative):** Setting `disable_hf_tokenizer_download: true` instructs LiteLLM to **avoid downloading Hugging Face tokenizers**. Instead, it will **default to using the OpenAI tokenizer** for *all* models, including Hugging Face models. +* **Use Cases for Disabling HF Tokenizer Download:** You might want to disable Hugging Face tokenizer download in scenarios such as: + * **Offline Environments:** In environments with limited or no internet access, preventing tokenizer downloads can be necessary. + * **Restricted Environments:** In highly secure or restricted environments where downloading external files is prohibited or undesirable. + * **Startup Speed Optimization:** Downloading tokenizer files can sometimes add to the startup time of the proxy, especially on slower networks. Disabling downloads can speed up proxy startup, at the potential cost of less accurate tokenization for Hugging Face models. + * **Forced OpenAI Tokenizer:** If you specifically want to ensure that the OpenAI tokenizer is used for all models, even Hugging Face models, for consistency or compatibility reasons. + +**Example YAML:** + +```yaml +litellm_settings: + disable_hf_tokenizer_download: true # Disabling Hugging Face tokenizer downloads, forcing OpenAI tokenizer usage + # ... other litellm_settings ... +``` + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Important Considerations:** + +* **Tokenizer Accuracy:** Using the OpenAI tokenizer for Hugging Face models may lead to **less accurate token counting** and potentially suboptimal performance for some models, as Hugging Face models are often trained and optimized with their specific tokenizers. +* **Tokenization Mismatches:** Forcing the OpenAI tokenizer might lead to **tokenization mismatches** and unexpected behavior with certain Hugging Face models that heavily rely on their specific tokenization schemes. +* **Default Behavior Recommendation:** Unless you have a specific reason to disable Hugging Face tokenizer downloads (like offline environments or forced OpenAI tokenizer usage), it's generally **recommended to leave `disable_hf_tokenizer_download: false`** (the default) to ensure proper tokenizer usage for Hugging Face models. + +--- + +### Caching Settings + +The `cache` and `cache_params` settings, located under `litellm_settings`, are used to configure the proxy's caching behavior. You'll define the caching backend, set time-to-live (TTL) values, and customize which types of requests are cached. + +#### `cache` (`litellm_settings`) + +**YAML Key:** `cache` + +**Type:** Boolean + +**Environment Variable:** N/A + +**Default Value:** Often `false` by default, but can depend on the specific distribution or setup of LiteLLM Proxy. Check your default `config.yaml` or distribution-specific documentation. + +**Description:** The `cache` parameter, within `litellm_settings`, is the **master switch** for **enabling or disabling caching** of LLM responses within the LiteLLM Proxy Server. + +* **Global Caching Control:** Setting `cache: true` turns on caching **globally** for the proxy. This means that, by default, the proxy will attempt to cache eligible requests and retrieve cached responses whenever possible, using the cache backend and settings configured in `cache_params`. +* **Disabling Caching:** Setting `cache: false` completely **disables caching** throughout the proxy. 
No responses will be cached, and the proxy will always forward requests to the backend LLM provider, regardless of any `cache_params` configuration. +* **`cache_params` Requirement:** If you set `cache: true`, you **must also configure the `cache_params`** section in `litellm_settings`. `cache_params` is where you specify the type of cache backend (Redis, memory, S3, etc.) and its connection details. If `cache: true` but `cache_params` is missing or incorrectly configured, the proxy may fail to start or caching might not function correctly. +* **Default Off (Often):** Caching is often **disabled by default** in many distributions of LiteLLM Proxy, meaning `cache` is typically set to `false` initially. You must explicitly enable caching by setting `cache: true` and configuring `cache_params` if you want to use caching. + +**Example YAML:** + +```yaml +litellm_settings: + cache: true # Master switch: Enabling caching globally + cache_params: # Required: Cache backend configuration + type: "redis" + host: "cache.example.com" + port: 6379 + password: "your_redis_password" + # ... other cache_params ... + # ... other litellm_settings ... +``` + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Note:** `cache: true` is just the master switch. The actual behavior of caching (what gets cached, how long it's cached, etc.) is further controlled by the settings within the `cache_params` section. + +--- + +#### `cache_params` (`litellm_settings`) + +**YAML Key:** `cache_params` + +**Type:** Object (Mapping) + +**Environment Variable:** N/A + +**Default Value:** `None` (Required if `cache: true`) + +**Description:** The `cache_params` parameter, within `litellm_settings`, is a **required mapping** (object) when you have enabled caching by setting `cache: true`. `cache_params` is where you **configure the details of your cache backend**, specifying: + +* **`type`:** The type of cache backend to use (Redis, memory, S3, Qdrant, etc.). +* **Backend-Specific Settings:** Connection details and configurations that are specific to the chosen `type` of cache backend (e.g., Redis host, port, password; S3 bucket name, region, credentials). +* **Common Cache Settings:** General caching behavior settings like `supported_call_types`, `mode`, and `ttl`. + +If `cache: true` is set, but `cache_params` is missing, incomplete, or incorrectly configured, the proxy may fail to start, or caching functionality may not work as expected. + +**Example YAML (Redis Cache Configuration):** + +```yaml +litellm_settings: + cache: true # Caching is enabled + cache_params: # Cache backend configuration + type: "redis" # Using Redis as cache backend + host: "cache.example.com" # Redis server hostname + port: 6379 # Redis server port + password: "your_redis_password" # Redis password (if required) + namespace: "litellm.proxy.cache" # Namespace for Redis keys + supported_call_types: ["acompletion", "aembedding"] # Caching only chat completions and embeddings + mode: "default_off" # Caching is opt-in per request + ttl: 600 # Cache TTL of 600 seconds (10 minutes) + # ... other litellm_settings ... +``` + +**Example Environment Variable:** N/A (YAML-only section header) + +**Note:** The specific parameters within `cache_params` depend heavily on the `type` of cache backend you choose. Refer to the documentation for each cache `type` (Redis, S3, Qdrant, etc.) for details on the required and optional settings. 
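One quick way to confirm that the cache backend configured in `cache_params` is actually being used is to send the same request twice and compare latency. A minimal sketch follows, assuming a proxy at `http://localhost:4000`, a virtual key `sk-1234`, caching enabled, and a cache `mode` that caches eligible requests by default (rather than the opt-in `default_off` shown in the example above).

```python
# Hedged sketch: a crude cache smoke test -- identical requests should return much faster
# the second time when response caching is active. URL, key, and model are assumptions.
import time
from openai import OpenAI

client = OpenAI(base_url="http://localhost:4000", api_key="sk-1234")
payload = dict(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "In one sentence, what is a cache TTL?"}],
    temperature=0,  # identical parameters so both requests map to the same cache entry
)

for label in ("cold", "warm"):
    start = time.time()
    client.chat.completions.create(**payload)
    print(f"{label} request took {time.time() - start:.2f}s")
```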
+ +--- + +##### `type` (`cache_params`) + +**YAML Key:** `type` + +**Type:** String + +**Environment Variable:** N/A + +**Default Value:** N/A (Required Parameter within `cache_params` when `cache: true`) + +**Description:** The `type` parameter, nested within `cache_params`, is **required** and specifies the **type of cache backend** that the LiteLLM Proxy should use for caching responses. This parameter determines *where* and *how* the proxy will store and retrieve cached data. Valid values for `type` include: + +* `"redis"`: Configures the proxy to use a **Redis database** as the cache backend. Redis is a popular in-memory data store, well-suited for caching due to its speed and persistence. +* `"memory"`: Uses **in-memory caching**. Cached responses are stored in the proxy server's RAM. This is the simplest option to configure, and very fast, but the cache is **not persistent** (data is lost if the proxy restarts) and has limited capacity. +* `"s3"`: Uses **AWS S3 (Simple Storage Service)** as the cache backend. S3 is a scalable object storage service, suitable for large caches and persistent storage, especially in AWS environments. +* `"qdrant"`: Enables **Qdrant semantic caching**. This is an advanced caching strategy that uses a Qdrant vector database to store and retrieve cached responses based on the *semantic similarity* of prompts, rather than exact string matching. +* `"disk"`: Uses a **local disk-based cache**. Cached responses are stored as files on the proxy server's local file system. Provides persistence between restarts but may be slower than in-memory or Redis. +* `"hosted"`: Uses a **hosted cache** managed by the LiteLLM team (at `api.litellm.ai`). This is a convenient option for fast caching without needing to manage your own infrastructure. + +**Example YAML (Redis Cache Type):** + +```yaml +litellm_settings: + cache: true + cache_params: + type: "redis" # Setting cache type to Redis + host: "cache.example.com" + port: 6379 + # ... other Redis-specific settings ... +``` + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Note:** The choice of `type` depends on your application's requirements, scale, persistence needs, and infrastructure. `"redis"` is often a good balance of performance and persistence for many production setups. `"memory"` is suitable for testing or simple, non-persistent caching. `"s3"` and `"qdrant"` are for more advanced, large-scale, or semantic caching scenarios. `"disk"` is a persistent alternative when Redis or S3 are not feasible. `"hosted"` provides convenience for quick setup. + +--- + +##### Redis-Specific Settings (within `cache_params`) + +*(Introductory text explaining Redis-specific settings)* + +###### `host` (`cache_params` -> Redis) + +**YAML Key:** `host` + +**Type:** String + +**Environment Variable:** N/A + +**Default Value:** N/A (Required when `cache_params.type: "redis"`) + +**Description:** The `host` parameter, within `cache_params` when using `type: "redis"`, specifies the **hostname or IP address of your Redis server**. This is the network address where the LiteLLM Proxy will connect to access the Redis database for caching. + +* **Redis Server Address:** Provide the hostname (e.g., `"redis.example.com"`) or IP address (e.g., `"192.168.1.100"`) of your Redis instance. +* **Local or Remote Redis:** The Redis server can be running locally on the same machine as the proxy, or it can be a remote Redis instance accessible over the network. 
+* **Required for Redis Cache:** This parameter is **required** when you configure `cache_params.type: "redis"`. If `host` is missing, the proxy will not be able to connect to Redis and caching will not function correctly. + +**Example YAML (Redis `host`):** + +```yaml +litellm_settings: + cache: true + cache_params: + type: "redis" + host: "cache.example.com" # Specifying Redis server hostname + port: 6379 + # ... other Redis settings ... +``` + +**Example Environment Variable:** N/A (YAML-only parameter) + +--- + +###### `port` (`cache_params` -> Redis) + +**YAML Key:** `port` + +**Type:** Integer + +**Environment Variable:** N/A + +**Default Value:** `6379` (Standard default Redis port) + +**Description:** The `port` parameter, within `cache_params` and when `type: "redis"`, specifies the **port number** on which your Redis server is listening for connections. + +* **Redis Port Number:** Provide the port number of your Redis instance. The standard default Redis port is `6379`, and if your Redis server is using the default port, you can often omit this parameter as it will default to `6379`. +* **Required for Non-Default Ports:** If your Redis server is running on a **non-default port**, you **must** specify the correct `port` value. +* **Integer Value:** `port` must be an integer representing a valid port number. + +**Example YAML (Redis `port`):** + +```yaml +litellm_settings: + cache: true + cache_params: + type: "redis" + host: "cache.example.com" + port: 6380 # Specifying a non-default Redis port (e.g., 6380) + # ... other Redis settings ... +``` + +**Example Environment Variable:** N/A (YAML-only parameter) + +--- + +###### `password` (`cache_params` -> Redis) + +**YAML Key:** `password` + +**Type:** String + +**Environment Variable:** N/A + +**Default Value:** No default value (no password assumed by default). + +**Description:** The `password` parameter, under `cache_params` when using `type: "redis"`, is used to provide the **password for your Redis server**, if your Redis instance is configured with password-based authentication. + +* **Redis Authentication:** If your Redis server requires a password for client connections, you **must** provide the correct password using this parameter. +* **Security Best Practice:** It is highly recommended to **secure your Redis instance with a password** in production environments to prevent unauthorized access to your cache data. +* **Omit for Passwordless Redis:** If your Redis server does *not* require a password (e.g., in a development or trusted network environment), you can omit this parameter. Do *not* provide a password if your Redis server is not configured to require one, as this could cause connection errors. + +**Example YAML (Redis `password`):** + +```yaml +litellm_settings: + cache: true + cache_params: + type: "redis" + host: "cache.example.com" + port: 6379 + password: "your_redis_password" # Providing Redis password for authentication + # ... other Redis settings ... +``` + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Security Note:** For enhanced security, consider storing the Redis password in an environment variable or a secure secret management system and referencing it in your `config.yaml` using `"os.environ/REDIS_PASSWORD"` instead of hardcoding the password directly in the YAML file. 
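Before pointing the proxy at Redis, it can save time to verify the same connection details independently. Here is a small sketch using the `redis` Python package; the hostname, port, and `REDIS_PASSWORD` environment variable mirror the examples above and are assumptions, not real endpoints.

```python
# Optional pre-flight check: confirm the Redis details used for the proxy cache actually connect.
# Host, port, and the REDIS_PASSWORD environment variable are illustrative assumptions.
import os
import redis

client = redis.Redis(
    host="cache.example.com",
    port=6379,
    password=os.environ.get("REDIS_PASSWORD"),  # keeps the password out of config files
    socket_connect_timeout=3,
)
client.ping()  # raises an authentication/connection error if host, port, or password is wrong
print("Redis connection OK")
```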
+ +--- + +###### `namespace` (`cache_params` -> Redis) + +**YAML Key:** `namespace` + +**Type:** String + +**Environment Variable:** N/A + +**Default Value:** `"litellm.caching.caching"` (Default namespace used by LiteLLM Caching). + +**Description:** The `namespace` parameter, within `cache_params` and when `type: "redis"`, specifies a **prefix or namespace** that will be added to **all keys** used by LiteLLM Proxy when storing data in Redis. + +* **Key Collision Prevention:** Using a namespace is crucial when your Redis instance is **shared with other applications or services**. It prevents key collisions by ensuring that LiteLLM Proxy's cache keys are prefixed with a unique namespace, isolating them from keys used by other applications in the same Redis database. +* **Organization and Management:** Namespaces can also improve the organization and manageability of your Redis cache, making it easier to identify and manage keys related to the LiteLLM Proxy. +* **Default Namespace:** If you don't provide a `namespace`, LiteLLM will use a default namespace, typically `"litellm.caching.caching"`. While the default namespace might be sufficient for many setups, it's still **best practice to explicitly define a custom namespace**, especially in shared Redis environments. +* **Custom Namespace String:** You can set `namespace` to any string value you choose. Choose a string that is descriptive and unique to your LiteLLM Proxy deployment. + +**Example YAML (Redis `namespace`):** + +```yaml +litellm_settings: + cache: true + cache_params: + type: "redis" + host: "cache.example.com" + port: 6379 + password: "your_redis_password" + namespace: "my-litellm-proxy-cache" # Setting a custom namespace for Redis keys + # ... other Redis settings ... +``` + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Recommendation:** Always set a custom `namespace` when using Redis caching in production or shared Redis environments. Choose a namespace that is specific to your LiteLLM Proxy deployment to avoid potential key conflicts. + +--- + +###### `redis_startup_nodes` (`cache_params` -> Redis) + +**YAML Key:** `redis_startup_nodes` + +**Type:** List of Objects (List of mappings) + +**Environment Variable:** N/A + +**Default Value:** N/A (Optional, only used for Redis Cluster) + +**Description:** The `redis_startup_nodes` parameter, within `cache_params` when `type: "redis"`, is used specifically when connecting to a **Redis Cluster**. Redis Cluster is a distributed Redis setup that provides high availability and scalability. `redis_startup_nodes` is a **list of objects**, where each object defines a **Redis Cluster node** that the proxy can use to bootstrap the connection to the cluster. + +* **Redis Cluster Bootstrapping:** When connecting to a Redis Cluster, the proxy needs to initially connect to one or more *startup nodes* in the cluster to discover the cluster topology and routing information. `redis_startup_nodes` provides the addresses of these nodes. +* **List of Nodes:** The value of `redis_startup_nodes` should be a list, where each item in the list is a **mapping (object)** containing: + * `host`: The hostname or IP address of a Redis Cluster node. + * `port`: The port number of the Redis Cluster node. +* **Multiple Startup Nodes:** You can provide multiple startup nodes in the list for redundancy and fault tolerance during cluster discovery. The proxy will attempt to connect to the first available node in the list. 
+* **Cluster-Specific Setting:** `redis_startup_nodes` is **only used for Redis Cluster setups**. Do not use it when connecting to a standalone Redis server or Redis Sentinel. + +**Example YAML (Redis Cluster `redis_startup_nodes`):** + +```yaml +litellm_settings: + cache: true + cache_params: + type: "redis" + redis_startup_nodes: # Defining Redis Cluster startup nodes + - host: "redis-node-1.example.com" # Address of first startup node + port: 7001 + - host: "redis-node-2.example.com" # Address of second startup node (redundancy) + port: 7002 + # ... other Redis settings ... +``` + +In this example, the proxy is configured to connect to a Redis Cluster. It is provided with two startup nodes: `redis-node-1.example.com:7001` and `redis-node-2.example.com:7002`. The proxy will use these to discover the cluster and establish connections to all nodes in the cluster. + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Note:** When using Redis Cluster, you typically do *not* need to specify `host` and `port` directly under `cache_params`. Instead, you should use `redis_startup_nodes` to define the cluster nodes. Also, you generally do not need to use `service_name` or `sentinel_nodes` when connecting to a Redis Cluster. + +--- + +###### `service_name` (`cache_params` -> Redis) + +**YAML Key:** `service_name` + +**Type:** String + +**Environment Variable:** N/A + +**Default Value:** N/A (Required when using Redis Sentinel) + +**Description:** The `service_name` parameter, within `cache_params` and when `type: "redis"`, is used specifically when connecting to a **Redis Sentinel** setup. Redis Sentinel provides high availability for Redis by monitoring master and replica instances and performing automatic failover in case of master failures. `service_name` specifies the **name of the Redis Sentinel service** that the proxy should connect to. + +* **Redis Sentinel Service Name:** Provide the service name of your Redis Sentinel setup. This is the name configured for your master Redis instance within the Sentinel configuration. +* **Sentinel-Specific Setting:** `service_name` is **only used for Redis Sentinel setups**. Do not use it when connecting to a standalone Redis server or Redis Cluster. +* **High Availability:** Redis Sentinel provides high availability by automatically promoting a replica to master if the current master fails. The proxy, when configured with Sentinel, will automatically connect to the current master instance as managed by Sentinel. + +**Example YAML (Redis Sentinel `service_name`):** + +```yaml +litellm_settings: + cache: true + cache_params: + type: "redis" + service_name: "mymaster" # Specifying Redis Sentinel service name + sentinel_nodes: # Defining Sentinel node addresses + - ["redis-sentinel1.example.com", 26379] + - ["redis-sentinel2.example.com", 26379] + # ... other Redis settings ... +``` + +In this example, the proxy is configured to connect to a Redis Sentinel setup named `"mymaster"`. + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Note:** When using Redis Sentinel, you typically do *not* need to specify `host` and `port` directly under `cache_params`. Instead, you should use `service_name` and `sentinel_nodes` to define the Sentinel setup. Also, you generally do not need to use `redis_startup_nodes` when connecting via Redis Sentinel. 
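If the proxy cannot reach your Sentinel setup, it helps to confirm that Sentinel itself can resolve the master outside the proxy. The short sketch below uses the `redis` Python package and reuses the `mymaster` service name and Sentinel addresses from the example above, all of which are illustrative assumptions.

```python
# Hedged sketch: verify that Redis Sentinel resolves a healthy master for "mymaster".
# The service name and Sentinel addresses mirror the example configuration above.
from redis.sentinel import Sentinel

sentinel = Sentinel(
    [("redis-sentinel1.example.com", 26379), ("redis-sentinel2.example.com", 26379)],
    socket_timeout=1.0,
)
print("Current master:", sentinel.discover_master("mymaster"))  # (host, port) of the master

master = sentinel.master_for("mymaster", socket_timeout=1.0)
master.ping()  # fails if no healthy master can be reached through Sentinel
print("Sentinel-managed Redis connection OK")
```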
+ +--- + +###### `sentinel_nodes` (`cache_params` -> Redis) + +**YAML Key:** `sentinel_nodes` + +**Type:** List of Lists (List of [host, port] pairs) + +**Environment Variable:** N/A + +**Default Value:** N/A (Required when using Redis Sentinel) + +**Description:** The `sentinel_nodes` parameter, within `cache_params` and when `type: "redis"`, is also used specifically for **Redis Sentinel** setups. It is a **list of lists**, where each inner list is a **pair of `[host, port]`** representing the address of a **Redis Sentinel node**. + +* **Sentinel Node Addresses:** Provide a list of addresses for your Redis Sentinel instances. You should list multiple Sentinel nodes for redundancy. The proxy will use these addresses to connect to the Sentinel cluster and discover the current master Redis instance. +* **Host-Port Pairs:** Each entry in the `sentinel_nodes` list should be a list containing two elements: + * The hostname or IP address of a Sentinel node (string). + * The port number of the Sentinel node (integer). +* **Sentinel-Specific Setting:** `sentinel_nodes` is **only used for Redis Sentinel setups**. Do not use it when connecting to a standalone Redis server or Redis Cluster. + +**Example YAML (Redis Sentinel `sentinel_nodes`):** + +```yaml +litellm_settings: + cache: true + cache_params: + type: "redis" + service_name: "mymaster" + sentinel_nodes: # Defining Redis Sentinel node addresses + - ["redis-sentinel1.example.com", 26379] # Address of first Sentinel node + - ["redis-sentinel2.example.com", 26379] # Address of second Sentinel node (redundancy) + # ... other Redis settings ... +``` + +In this example, the proxy is configured with two Sentinel node addresses: `redis-sentinel1.example.com:26379` and `redis-sentinel2.example.com:26379`. The proxy will use these to connect to the Sentinel cluster and discover the master Redis instance. + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Note:** When using Redis Sentinel, you typically do *not* need to specify `host` and `port` directly under `cache_params`. Instead, use `service_name` and `sentinel_nodes`. Also, you generally do not need to use `redis_startup_nodes` when connecting via Redis Sentinel. + +--- + +##### S3-Specific Settings (within `cache_params`) + +*(Introductory text explaining S3-specific settings)* + +###### `s3_bucket_name` (`cache_params` -> S3) + +**YAML Key:** `s3_bucket_name` + +**Type:** String + +**Environment Variable:** N/A + +**Default Value:** N/A (Required when `cache_params.type: "s3"`) + +**Description:** The `s3_bucket_name` parameter, within `cache_params` when using `type: "s3"`, specifies the **name of the AWS S3 bucket** that the LiteLLM Proxy should use for caching responses. + +* **S3 Bucket Identifier:** Provide the name of your S3 bucket as a string. Ensure that the bucket exists and that the AWS credentials you provide (via `s3_aws_access_key_id` and `s3_aws_secret_access_key`, or IAM roles) have **read and write permissions** to this bucket. +* **Required for S3 Cache:** This parameter is **required** when you configure `cache_params.type: "s3"`. If `s3_bucket_name` is missing, the proxy will not be able to use S3 for caching. + +**Example YAML (S3 `s3_bucket_name`):** + +```yaml +litellm_settings: + cache: true + cache_params: + type: "s3" + s3_bucket_name: "my-litellm-proxy-cache-bucket" # Specifying S3 bucket name + s3_region_name: "us-west-2" + s3_aws_access_key_id: "os.environ/AWS_ACCESS_KEY_ID" + s3_aws_secret_access_key: "os.environ/AWS_SECRET_ACCESS_KEY" + # ... 
other S3 settings ... +``` + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Security Note:** Ensure that your S3 bucket is properly secured with appropriate access policies to protect your cached data. + +--- + +###### `s3_region_name` (`cache_params` -> S3) + +**YAML Key:** `s3_region_name` + +**Type:** String + +**Environment Variable:** N/A + +**Default Value:** N/A (Required when `cache_params.type: "s3"`) + +**Description:** The `s3_region_name` parameter, within `cache_params` when `type: "s3"`, specifies the **AWS region** in which your S3 bucket is located. AWS regions are geographic locations where AWS services are hosted (e.g., `"us-east-1"`, `"eu-west-2"`, `"ap-southeast-2"`). + +* **Bucket Region:** Provide the AWS region name as a string (e.g., `"us-west-2"`). The region name must **match the region** where your `s3_bucket_name` bucket is actually created. +* **Required for S3 Cache:** This parameter is **required** when you configure `cache_params.type: "s3"`. If `s3_region_name` is missing, the proxy will not be able to access your S3 bucket. + +**Example YAML (S3 `s3_region_name`):** + +```yaml +litellm_settings: + cache: true + cache_params: + type: "s3" + s3_bucket_name: "my-litellm-proxy-cache-bucket" + s3_region_name: "us-west-2" # Specifying AWS region of the S3 bucket + s3_aws_access_key_id: "os.environ/AWS_ACCESS_KEY_ID" + s3_aws_secret_access_key: "os.environ/AWS_SECRET_ACCESS_KEY" + # ... other S3 settings ... +``` + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Note:** Specifying the correct `s3_region_name` is essential for the proxy to connect to your S3 bucket in the correct AWS geographic location. Incorrect region names will lead to connection failures. + +--- + +###### `s3_aws_access_key_id` (`cache_params` -> S3) + +**YAML Key:** `s3_aws_access_key_id` + +**Type:** String or `os.environ/...` reference + +**Environment Variable:** `AWS_ACCESS_KEY_ID` (if not using `os.environ` reference in YAML). + +**Default Value:** N/A (Required when `cache_params.type: "s3"`) + +**Description:** The `s3_aws_access_key_id` parameter, within `cache_params` when `type: "s3"`, is used to provide your **AWS Access Key ID** for authenticating with AWS S3. This is one part of your AWS credentials required to access your S3 bucket for caching. + +* **AWS Credential:** Provide your AWS Access Key ID as a string. This is a sensitive credential, so it is **highly recommended to use an environment variable reference** (like `"os.environ/AWS_ACCESS_KEY_ID"`) rather than hardcoding the access key directly in your `config.yaml` file. +* **Required for S3 Cache (Unless Using IAM Roles):** Unless your proxy is running in an AWS environment with an IAM role that grants it access to the S3 bucket, you **must** provide AWS credentials (Access Key ID and Secret Access Key) to authenticate with S3. +* **Security Best Practice:** Storing AWS credentials in environment variables or using IAM roles is significantly more secure than embedding them directly in configuration files. + +**Example YAML (S3 `s3_aws_access_key_id` with environment variable reference):** + +```yaml +litellm_settings: + cache: true + cache_params: + type: "s3" + s3_bucket_name: "my-litellm-proxy-cache-bucket" + s3_region_name: "us-west-2" + s3_aws_access_key_id: "os.environ/AWS_ACCESS_KEY_ID" # Referencing AWS Access Key ID from environment variable + s3_aws_secret_access_key: "os.environ/AWS_SECRET_ACCESS_KEY" + # ... other S3 settings ... 
+``` + +**Example Environment Variable:** + +```bash +export AWS_ACCESS_KEY_ID="AKIA..." # Setting AWS Access Key ID in environment +``` + +**Security Note:** Never hardcode your AWS Access Key ID directly in your `config.yaml` file. Always use environment variables or IAM roles for managing AWS credentials securely. + +--- + +###### `s3_aws_secret_access_key` (`cache_params` -> S3) + +**YAML Key:** `s3_aws_secret_access_key` + +**Type:** String or `os.environ/...` reference + +**Environment Variable:** `AWS_SECRET_ACCESS_KEY` (if not using `os.environ` reference in YAML). + +**Default Value:** N/A (Required when `cache_params.type: "s3"`) + +**Description:** The `s3_aws_secret_access_key` parameter, within `cache_params` when `type: "s3"`, is used to provide your **AWS Secret Access Key** for authenticating with AWS S3. This is the second part of your AWS credentials required to access your S3 bucket. + +* **AWS Credential:** Provide your AWS Secret Access Key as a string. This is a **highly sensitive credential**. **Always use an environment variable reference** (like `"os.environ/AWS_SECRET_ACCESS_KEY"`) to fetch the secret key from environment variables. **Never hardcode** the secret key directly in your `config.yaml` file. +* **Required for S3 Cache (Unless Using IAM Roles):** Similar to `s3_aws_access_key_id`, you **must** provide the Secret Access Key (or use IAM roles) for the proxy to authenticate with S3, unless your proxy is running in an AWS environment with an IAM role that provides S3 access. +* **Security Imperative:** Protecting your AWS Secret Access Key is of paramount importance. Treat it as a highly sensitive secret and follow AWS best practices for credential management. + +**Example YAML (S3 `s3_aws_secret_access_key` with environment variable reference):** + +```yaml +litellm_settings: + cache: true + cache_params: + type: "s3" + s3_bucket_name: "my-litellm-proxy-cache-bucket" + s3_region_name: "us-west-2" + s3_aws_access_key_id: "os.environ/AWS_ACCESS_KEY_ID" + s3_aws_secret_access_key: "os.environ/AWS_SECRET_ACCESS_KEY" # Referencing AWS Secret Access Key from environment variable + # ... other S3 settings ... +``` + +**Example Environment Variable:** + +```bash +export AWS_SECRET_ACCESS_KEY="abcd1234..." # Setting AWS Secret Access Key in environment +``` + +**Security Warning:** **Absolutely never hardcode your AWS Secret Access Key directly in your `config.yaml` file or any version control system.** Always use environment variables or IAM roles for secure credential management. Compromising your AWS Secret Access Key can have serious security implications for your AWS account. + +--- + +###### `s3_endpoint_url` (`cache_params` -> S3) + +**YAML Key:** `s3_endpoint_url` + +**Type:** String + +**Environment Variable:** N/A + +**Default Value:** If not provided, defaults to the standard AWS S3 endpoint URL for the specified `s3_region_name`. + +**Description:** The `s3_endpoint_url` parameter, within `cache_params` when `type: "s3"`, is an **optional** parameter that allows you to specify a **custom S3-compatible endpoint URL**. + +* **Non-AWS S3 Services:** `s3_endpoint_url` is primarily used when you are **not using Amazon's own AWS S3 service**, but rather an **S3-compatible object storage service** from another provider. Examples include: + * **Backblaze B2 Storage:** Backblaze B2 is an S3-compatible cloud storage service. + * **Cloudflare R2 Storage:** Cloudflare R2 is another S3-compatible object storage service. 
+ * **MinIO:** MinIO is an open-source object storage server that is S3-compatible and can be self-hosted. +* **Custom Endpoints:** By providing a custom `s3_endpoint_url`, you can instruct the LiteLLM Proxy to connect to these S3-compatible services instead of the default AWS S3 endpoint. +* **AWS S3 Default Endpoint:** If you are using **AWS S3** and your bucket is in a standard AWS region, you can **omit `s3_endpoint_url`**. In this case, the proxy will automatically use the standard AWS S3 endpoint URL for the specified `s3_region_name`. + +**Example YAML (S3 `s3_endpoint_url` for Backblaze B2):** + +```yaml +litellm_settings: + cache: true + cache_params: + type: "s3" + s3_bucket_name: "my-b2-bucket-name" + s3_region_name: "us-west-004" # Backblaze B2 region code + s3_aws_access_key_id: "os.environ/B2_APPLICATION_KEY_ID" # B2 application key ID + s3_aws_secret_access_key: "os.environ/B2_APPLICATION_KEY" # B2 application key + s3_endpoint_url: "https://s3.us-west-004.backblazeb2.com" # Custom endpoint URL for Backblaze B2 + # ... other S3 settings ... +``` + +In this example, the proxy is configured to use Backblaze B2 storage as the cache backend, specifying the custom `s3_endpoint_url` for Backblaze B2. + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Note:** When using AWS S3 itself, you typically do *not* need to provide `s3_endpoint_url`, and can rely on the default AWS S3 endpoint for your specified `s3_region_name`. Only use `s3_endpoint_url` when connecting to non-AWS S3-compatible services. + +--- + +##### Qdrant (Semantic Caching) Settings (within `cache_params`) + +*(Introductory text explaining Qdrant-specific settings for semantic caching)* + +###### `qdrant_semantic_cache_embedding_model` (`cache_params` -> Qdrant) + +**YAML Key:** `qdrant_semantic_cache_embedding_model` + +**Type:** String (Must be a `model_name` defined in your `model_list`) + +**Environment Variable:** N/A + +**Default Value:** N/A (Required when using Qdrant semantic cache) + +**Description:** The `qdrant_semantic_cache_embedding_model` parameter, within `cache_params` when using `type: "qdrant"` or enabling semantic cache, **specifies the `model_name` of an embedding model** that the proxy should use to generate embeddings for cache keys in Qdrant. + +* **Embedding Model for Semantic Search:** Semantic caching relies on vector embeddings to measure the semantic similarity between prompts. You need to designate an embedding model that the proxy will use to generate these embeddings. +* **`model_name` Reference:** The value of `qdrant_semantic_cache_embedding_model` must be a valid `model_name` that is already defined in your `model_list` section of `config.yaml`. This `model_name` should correspond to an embedding model (e.g., OpenAI's `text-embedding-ada-002`, or a local embedding model you have configured). +* **Embedding Model Configuration:** You must ensure that the `model_name` you specify here is indeed configured as an embedding model in your `model_list`, with appropriate `litellm_params` for calling the embedding API (e.g., API key, API base, etc.). +* **Required for Semantic Cache:** `qdrant_semantic_cache_embedding_model` is **required** when you are using Qdrant semantic caching (either `type: "qdrant"` or by enabling semantic cache features). If you are using Qdrant semantic cache, but this parameter is missing, the proxy will not be able to generate embeddings for cache keys and semantic caching will not function. 
+ +**Example YAML (Qdrant `qdrant_semantic_cache_embedding_model`):** + +```yaml +litellm_settings: + cache: true + cache_params: + type: "qdrant" # Using Qdrant semantic cache + qdrant_semantic_cache_embedding_model: "openai-embedding" # Using "openai-embedding" model for generating embeddings + qdrant_collection_name: "litellm_cache_collection" + # ... other Qdrant settings ... +model_list: + - model_name: openai-embedding # Defining the embedding model used for semantic cache + litellm_params: + model: openai/text-embedding-ada-002 + api_key: "os.environ/OPENAI_API_KEY" + model_info: + mode: "embedding" # Important: Marking this model as "embedding" mode + # ... other model_info ... + - model_name: gpt-3.5-turbo # Example completion model + litellm_params: + model: azure/gpt-turbo-small-eu + api_key: "os.environ/AZURE_API_KEY_EU" + # ... other litellm_params ... + model_info: + mode: "completion" # Marking this model as "completion" mode + # ... other model_info ... +``` + +In this example, `qdrant_semantic_cache_embedding_model: "openai-embedding"` specifies that the model named `"openai-embedding"` (which is also defined in `model_list` as an embedding model using OpenAI's `text-embedding-ada-002`) should be used to generate embeddings for the Qdrant semantic cache. + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Note:** It's crucial that the `model_name` you specify for `qdrant_semantic_cache_embedding_model` actually corresponds to a properly configured embedding model in your `model_list` and that its `model_info.mode` is set to `"embedding"`. + +--- + +###### `qdrant_collection_name` (`cache_params` -> Qdrant) + +**YAML Key:** `qdrant_collection_name` + +**Type:** String + +**Environment Variable:** N/A + +**Default Value:** N/A (Required when using Qdrant semantic cache) + +**Description:** The `qdrant_collection_name` parameter, within `cache_params` when `type: "qdrant"` or enabling semantic cache, specifies the **name of the Qdrant collection** that the LiteLLM Proxy should use to store vector embeddings for semantic caching. + +* **Qdrant Collection Identification:** Provide the name of your Qdrant collection as a string. Ensure that the collection exists in your Qdrant database. If the collection does not exist, LiteLLM may attempt to create it automatically (depending on Qdrant configuration and permissions), but it's generally recommended to create the collection beforehand and configure its settings appropriately (e.g., vector dimensions, distance metric). +* **Required for Qdrant Semantic Cache:** `qdrant_collection_name` is **required** when you are using Qdrant semantic caching. If this parameter is missing, the proxy will not know which Qdrant collection to use for storing and searching embeddings. + +**Example YAML (Qdrant `qdrant_collection_name`):** + +```yaml +litellm_settings: + cache: true + cache_params: + type: "qdrant" # Using Qdrant semantic cache + qdrant_semantic_cache_embedding_model: "openai-embedding" + qdrant_collection_name: "litellm_cache_collection" # Specifying Qdrant collection name + qdrant_api_base: "http://localhost:6334" + # ... other Qdrant settings ... +``` + +In this example, `"litellm_cache_collection"` is specified as the name of the Qdrant collection to be used for semantic caching. + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Best Practice:** It's recommended to use a **dedicated Qdrant collection** specifically for the LiteLLM Proxy's semantic cache. 
Choose a descriptive and unique collection name to avoid conflicts with other data in your Qdrant database. + +--- + +###### `qdrant_quantization_config` (`cache_params` -> Qdrant) + +**YAML Key:** `qdrant_quantization_config` + +**Type:** String + +**Environment Variable:** N/A + +**Default Value:** No quantization configured by default (vectors are stored as full-precision floats). + +**Description:** The `qdrant_quantization_config` parameter, within `cache_params` when `type: "qdrant"` or enabling semantic cache, allows you to specify a **quantization method** for vector embeddings stored in Qdrant. Quantization is a technique to reduce the memory footprint and potentially improve search speed of vector embeddings in a vector database like Qdrant. + +* **Vector Quantization Options:** Currently, the documentation mentions support for `"binary"` quantization. Other quantization methods might be supported or added in future versions of LiteLLM. + * `"binary"`: Binary quantization reduces the size of vectors by representing them using binary codes (0s and 1s) instead of full-precision floating-point numbers. This can significantly reduce storage space and improve search performance, especially for very large datasets. +* **Performance Trade-offs:** Quantization typically involves a trade-off between **storage/performance gains** and **potential loss of precision** in the vector embeddings. Binary quantization is a more aggressive form of quantization and may result in some loss of accuracy in semantic similarity search compared to using full-precision vectors. +* **Optional Optimization:** `qdrant_quantization_config` is **optional**. If you omit this parameter, vectors will be stored in Qdrant using their original full-precision floating-point representation (no quantization). Quantization is typically used for **optimizing storage and performance** when dealing with very large caches or when memory/performance is a critical concern. + +**Example YAML (Qdrant `qdrant_quantization_config`):** + +```yaml +litellm_settings: + cache: true + cache_params: + type: "qdrant" # Using Qdrant semantic cache + qdrant_semantic_cache_embedding_model: "openai-embedding" + qdrant_collection_name: "litellm_cache_collection" + qdrant_quantization_config: "binary" # Enabling binary quantization for Qdrant vectors + qdrant_api_base: "http://localhost:6334" + # ... other Qdrant settings ... +``` + +In this example, `qdrant_quantization_config: "binary"` enables binary quantization for vectors stored in the Qdrant cache. + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Recommendation:** Consider using quantization (e.g., `"binary"`) if you are planning to build a very large semantic cache with Qdrant, where storage space and search performance are critical. If accuracy is paramount and you have sufficient resources, you can omit `qdrant_quantization_config` to use full-precision vectors. Test and evaluate the impact of quantization on your specific use case to determine the optimal setting. + +--- + +###### `similarity_threshold` (`cache_params` -> Qdrant) + +**YAML Key:** `similarity_threshold` + +**Type:** Number (Float) + +**Environment Variable:** N/A + +**Default Value:** N/A (Required when using Qdrant semantic cache) + +**Description:** The `similarity_threshold` parameter, within `cache_params` when `type: "qdrant"` or enabling semantic cache, is a **floating-point number** that defines the **cosine similarity threshold** for determining a cache hit in Qdrant semantic caching. 
+ +* **Semantic Similarity Matching:** Semantic caching, unlike exact-match caching, retrieves cached responses based on the *semantic similarity* of the incoming prompt to prompts that are already in the cache. Cosine similarity is a common metric used to measure semantic similarity between vector embeddings. +* **Threshold Value:** `similarity_threshold` sets the **minimum cosine similarity score** required for a cached response to be considered a match for an incoming request. The cosine similarity score ranges from 0 to 1, where: + * **1.0:** Perfect cosine similarity (vectors are identical in direction). + * **0.0:** No cosine similarity (vectors are orthogonal). + * **Values closer to 1.0:** Indicate higher semantic similarity. + * **Values closer to 0.0:** Indicate lower semantic similarity. +* **Tuning for Cache Hit Rate vs. Relevance:** The `similarity_threshold` value directly impacts the **cache hit rate** and the **relevance of cached responses**: + * **Higher threshold (closer to 1.0):** Makes semantic matching **more strict**. Only prompts that are *very* semantically similar to cached prompts will result in a cache hit. This leads to a **lower cache hit rate** but potentially **higher relevance** of cached responses (as they are very close in meaning to the current request). + * **Lower threshold (closer to 0.0):** Makes semantic matching **more lenient**. Prompts that are *somewhat* semantically similar can trigger a cache hit. This results in a **higher cache hit rate** but potentially **lower relevance** of cached responses (as they might be less semantically aligned with the current request). +* **Experimentation and Tuning:** The optimal `similarity_threshold` value depends on your specific application and use case. You will likely need to **experiment and tune** this value to find a balance between cache hit rate and the desired level of semantic relevance for cached responses. A typical starting point might be around `0.8` or `0.85`. + +**Example YAML (Qdrant `similarity_threshold`):** + +```yaml +litellm_settings: + cache: true + cache_params: + type: "qdrant" # Using Qdrant semantic cache + qdrant_semantic_cache_embedding_model: "openai-embedding" + qdrant_collection_name: "litellm_cache_collection" + similarity_threshold: 0.8 # Setting cosine similarity threshold to 0.8 + qdrant_api_base: "http://localhost:6334" + # ... other Qdrant settings ... +``` + +In this example, `similarity_threshold: 0.8` means that a cached response will only be considered a match if the cosine similarity between the incoming prompt's embedding and a cached prompt's embedding is 0.8 or higher. + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Recommendation:** Start with a `similarity_threshold` around `0.8` and then adjust it based on your testing and evaluation of cache hit rates and the semantic relevance of retrieved cached responses for your application. Lower the threshold to increase cache hits (potentially at the cost of relevance), and raise it to decrease cache hits but increase relevance. + +--- + +##### Common Cache Settings (within `cache_params`) + +*(Introductory text explaining common cache settings that apply regardless of the backend type)* + +###### `supported_call_types` (`cache_params` -> Common) + +**YAML Key:** `supported_call_types` + +**Type:** Array of Strings + +**Environment Variable:** N/A + +**Default Value:** If not set, a default list might be used (check documentation for default behavior in your LiteLLM Proxy version). 
Commonly defaults to `["acompletion", "atext_completion", "aembedding", "atranscription"]`. + +**Description:** The `supported_call_types` parameter, within `cache_params`, is an **array of strings** that allows you to specify **which types of API calls should be cached** by the LiteLLM Proxy. This provides granular control over what gets cached and what does not. + +* **Selective Caching:** By default, LiteLLM Proxy is capable of caching several types of API calls: + * `"acompletion"`: Asynchronous chat completion calls (corresponding to `/chat/completions` endpoint). + * `"atext_completion"`: Asynchronous text completion calls (`/completions` endpoint). + * `"aembedding"`: Asynchronous embedding calls (`/embeddings` endpoint). + * `"atranscription"`: Asynchronous audio transcription calls (`/audio/transcriptions` endpoint). +* **Customize Caching Scope:** Using `supported_call_types`, you can customize *which* of these call types are actually cached. For example: + * `supported_call_types: ["acompletion", "aembedding"]`: Only cache chat completions and embeddings, but not text completions or transcriptions. + * `supported_call_types: ["acompletion"]`: Only cache chat completions, and disable caching for all other call types. + * `supported_call_types: []`: Disable caching for *all* call types, even if `cache: true` is set. This is effectively a way to disable caching selectively without turning off the master `cache` switch. +* **Performance Optimization:** Use `supported_call_types` to optimize caching for your specific application's needs. You might choose to cache only the most frequently used or expensive call types, while excluding less critical or less cacheable calls. + +**Example YAML:** + +```yaml +litellm_settings: + cache: true + cache_params: + type: "redis" + # ... Redis settings ... + supported_call_types: ["acompletion", "aembedding"] # Only caching chat completions and embeddings + # ... other cache settings ... +``` + +In this example, only asynchronous chat completion (`acompletion`) and embedding (`aembedding`) calls will be cached. Text completions (`atext_completion`) and transcriptions (`atranscription`) will not be cached, even though caching is generally enabled (`cache: true`). + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Note:** If `supported_call_types` is not explicitly set in your `config.yaml`, LiteLLM Proxy may use a **default list of call types** that are enabled for caching (check the documentation for your specific version for the default list). However, it's best practice to **explicitly configure `supported_call_types`** to clearly define the scope of your caching strategy. + +--- + +###### `mode` (`cache_params` -> Common) + +**YAML Key:** `mode` + +**Type:** String + +**Environment Variable:** N/A + +**Default Value:** `"default_off"` (Caching is opt-in per request by default). + +**Description:** The `mode` parameter, within `cache_params`, controls the **default caching behavior** of the LiteLLM Proxy. It determines whether caching is enabled by default for all eligible requests, or if it needs to be explicitly enabled on a per-request basis. + +* `"default_off"`: **Caching is opt-in per request**. This is the **default mode**. In `"default_off"` mode, caching is **disabled by default** for all requests. 
To enable caching for a *specific* request, the client application must explicitly request caching, typically by including a `caching: true` parameter in the API call (e.g., in the request headers or body, depending on the client library). +* `"default_on"`: Caching is **opt-out per request** (Note: This mode might be less common in stable versions and might be considered more of a conceptual option; verify if `"default_on"` is actually supported in your version's documentation). In a hypothetical `"default_on"` mode, caching would be **enabled by default** for all eligible requests. To *disable* caching for a specific request, the client application would need to explicitly opt-out, likely using a parameter like `caching: false` in the API call. + +**Example YAML (Caching Mode: Default Off):** + +```yaml +litellm_settings: + cache: true + cache_params: + type: "redis" + # ... Redis settings ... + mode: "default_off" # Caching is opt-in per request (default mode) + # ... other cache settings ... +``` + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Recommendation:** `"default_off"` mode is generally recommended for production environments and provides the most control. It ensures that caching is only applied when explicitly requested by the client application, giving you granular control over caching behavior and preventing unintended caching of sensitive or dynamic data. The `"default_on"` mode, if supported, might be useful for testing or development where you want caching to be enabled broadly, but is generally less suitable for production due to the lack of explicit control. Always check the documentation for your specific LiteLLM Proxy version to confirm the actually supported caching modes and their default behavior. + +--- + +###### `ttl` (`cache_params` -> Common) + +**YAML Key:** `ttl` + +**Type:** Integer (representing seconds) + +**Environment Variable:** N/A + +**Default Value:** No default TTL is specified in the provided documentation snippets. You may need to check the full documentation for the default TTL value in your specific LiteLLM Proxy version. If no default is documented, it's likely that cache entries might persist indefinitely unless explicitly evicted or the cache backend has its own eviction policies. It's **highly recommended to explicitly set a `ttl` value**. + +**Description:** The `ttl` parameter, within `cache_params`, sets the **time-to-live (TTL) in seconds** for cache entries. TTL determines how long a cached response remains valid and will be served from the cache before it is considered **expired**. + +* **Cache Expiration:** `ttl` is a fundamental parameter for cache management. It defines the maximum age of a cached response. After the specified TTL duration has elapsed since the response was cached, the cached entry is considered **stale** or expired. +* **Cache Invalidation:** When a cached response is expired due to TTL, the proxy will **not use it** for subsequent requests, even if the request matches the cache key. Instead, the proxy will forward the request to the backend LLM provider, get a fresh response, and potentially cache the new response (replacing the expired entry). +* **Balancing Freshness and Performance:** The `ttl` value represents a trade-off between: + * **Cache Hit Rate and Performance:** Longer TTL values (e.g., hours or days) increase the likelihood of cache hits and improve performance, as cached responses remain valid for longer. 
However, they might serve stale or outdated information if the underlying LLM or data changes frequently.
+  * **Data Freshness and Accuracy:** Shorter TTL values (e.g., minutes or seconds) ensure that responses are more up-to-date and accurate, reflecting recent changes in the LLM or data, but they reduce the cache hit rate and may increase API costs and latency.
+* **Appropriate TTL Value:** The optimal `ttl` value depends heavily on your application's requirements for data freshness, the frequency of changes in the underlying LLM or data, and your performance/cost optimization goals. For relatively static data or tasks where slightly stale data is acceptable, you can use longer TTL values. For applications requiring highly dynamic or real-time data, use shorter TTL values or disable caching altogether.
+
+**Example YAML (Cache TTL of 10 minutes):**
+
+```yaml
+litellm_settings:
+  cache: true
+  cache_params:
+    type: "redis"
+    # ... Redis settings ...
+    ttl: 600 # Setting cache TTL to 600 seconds (10 minutes)
+    # ... other cache settings ...
+```
+
+In this example, cache entries will expire and be considered invalid after 600 seconds (10 minutes) from when they were initially cached.
+
+**Example Environment Variable:** N/A (YAML-only parameter)
+
+**Recommendation:** **Always explicitly set a `ttl` value** that is appropriate for your application's needs. If data freshness is critical, use shorter TTLs. If performance and cost reduction are prioritized and some staleness is acceptable, use longer TTLs. Monitor your cache hit rates and application behavior to fine-tune the `ttl` value for optimal performance and data accuracy. If no caching is desired for certain data, disable caching for those call types using `supported_call_types` or disable caching altogether with `cache: false`.
+
+---
+
+### Callback Settings (`callback_settings`)
+
+This section is used to fine-tune the individual callbacks configured in `litellm_settings`. It lets you customize the behavior of each callback you have enabled.
+
+#### `callback_settings.otel.message_logging`
+
+**YAML Key:** `callback_settings.otel.message_logging`
+
+**Type:** Boolean
+
+**Environment Variable:** N/A
+
+**Default Value:** `true` (Message logging in OpenTelemetry is enabled by default).
+
+**Description:** The `callback_settings.otel.message_logging` parameter, within the top-level `callback_settings` section and specifically under the `otel` key, is a **boolean flag** that controls whether the **OpenTelemetry (`otel`) callback should log message content and responses**. This setting provides fine-grained control over what data is included in OpenTelemetry traces specifically for the OTEL callback.
+
+* **OTEL-Specific Message Logging Control:** This parameter applies **only to the OpenTelemetry logging integration**. It does not affect other logging callbacks or the general `turn_off_message_logging` setting.
+* **Message Content in Traces (Default):** By default (`message_logging: true` or if this parameter is omitted), the OpenTelemetry callback will log the **full content of prompts (messages) and responses** within the OpenTelemetry traces. This provides detailed context within traces, which is helpful for debugging and understanding the flow of conversations or requests.
+* **Disabling Message Content in OTEL Traces:** Setting `message_logging: false` will instruct the OTEL callback **not to log message content or responses** in the traces it generates. Only high-level metadata (like request parameters, timestamps, etc.) 
will be included in the OTEL spans. +* **Privacy or Trace Volume Reduction:** You might disable message logging in OTEL traces for: + * **Privacy Reasons:** If you are concerned about sensitive data appearing in your OpenTelemetry traces, especially if traces are exported to external systems or accessed by a wider audience. + * **Reduced Trace Volume:** Omitting message content can significantly reduce the size and volume of OpenTelemetry traces, which can be beneficial for performance in high-throughput systems or when dealing with large trace datasets. + +**Example YAML:** + +```yaml +litellm_settings: + callbacks: ["otel"] # Enabling OpenTelemetry callback +callback_settings: # Fine-tuning callback behavior + otel: # Settings specific to the "otel" callback + message_logging: false # Disabling message content logging in OTEL traces +``` + +In this example, OpenTelemetry tracing is enabled for all requests (via `callbacks: ["otel"]`), but the `message_logging: false` setting under `callback_settings.otel` ensures that the actual text content of prompts and responses will *not* be included in the generated OTEL traces. Only metadata will be traced. + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Note:** `callback_settings.otel.message_logging: false` provides a way to control message logging specifically for OpenTelemetry, without affecting other logging callbacks or the global `turn_off_message_logging` setting, which controls message logging across *all* callbacks. + +--- + +## `general_settings` Section: Proxy General Settings + +The `general_settings` section contains a wide range of proxy-wide configurations, affecting overall proxy behavior, security, authentication, and database interactions. These are general settings that are not specific to a particular model or routing strategy. + +### general\_settings Parameter Summary + +| Parameter | Type | Default Value | Description | +| :------------------------------------------------- | :-------- | :------------------------------------------ | :------------------------------------------------------------------------------------------------------------- | +| **Default Model Selection** | | | | +| `completion_model` | String | `None` | Default model for completion/chat requests. | +| `embedding_model` | String | `None` | Default model for embedding requests. | +| `image_generation_model` | String | `None` | Default model for image generation requests. | +| `moderation_model` | String | `None` | Default model for moderation requests. | +| `infer_model_from_keys` | Boolean | `false` | Infer model from API keys. | +| **Master Key & Authentication** | | | | +| `master_key` | String | `None` | Master API key for proxy admin. | +| `enable_jwt_auth` | Boolean | `false` | Enable JWT authentication for admin *(Enterprise Feature)*. | +| `litellm_jwtauth` | Object | `None` | JWT authentication settings *(Enterprise Feature)*. | +| `allowed_routes` | Array | All Standard Routes | Allowed API routes for non-admin users. | +| `admin_only_routes` | Array | `[]` | API routes accessible only to admins *(Enterprise Feature)*. | +| `allowed_ips` | Array | All IPs Allowed | IP allowlist for proxy access. | +| `enforce_user_param` | Boolean | `false` | Enforce `user` parameter in API requests. | +| `enable_oauth2_proxy_auth` | Boolean | `false` | Enable OAuth2.0 proxy authentication *(Enterprise Feature)*. | +| `use_x_forwarded_for` | Boolean | `false` | Use `X-Forwarded-For` header for client IP. 
| +| `custom_auth` | String | `None` | Path to custom authentication module *(Enterprise Feature or Advanced Usage)*. | +| `allow_user_auth` | Boolean | `false` | *Deprecated* User authentication toggle. | +| `custom_sso` | String | `None` | Path to custom SSO logic *(Enterprise Feature or Advanced Usage)*. | +| **Database & Persistence** | | | | +| `database_url` | String | `None` | Database connection URL. | +| `database_connection_pool_limit` | Integer | `100` | Max DB connection pool size. | +| `database_connection_timeout` | Integer | `60` | DB connection timeout (seconds). | +| `allow_requests_on_db_unavailable` | Boolean | `false` | Allow requests if DB is unavailable *(Use with Caution)*. | +| `disable_spend_logs` | Boolean | `false` | Disable writing transaction logs to the database. | +| `disable_adding_master_key_hash_to_db` | Boolean | `false` | Disable storing master key hash in DB. | +| `store_model_in_db` | Boolean | `false` | Store model info in DB *(Enterprise Feature)*. | +| `store_prompts_in_spend_logs` | Boolean | `false` | Store prompt/response content in spend logs. | +| `disable_prisma_schema_update` | Boolean | `false` | Disable automatic DB schema updates *(Advanced Usage)*. | +| `proxy_batch_write_at` | Integer | `10` | Batch write interval for spend logs (seconds). | +| `proxy_budget_rescheduler_min_time` | Integer | `597` | Min time for budget rescheduler check (seconds). | +| `proxy_budget_rescheduler_max_time` | Integer | `605` | Max time for budget rescheduler check (seconds). | +| **Key Management & Encryption** | | | | +| `key_management_system` | String | `None` | Key Management System to use *(Enterprise Feature)*. | +| `key_management_settings` | List | `None` | KMS configuration settings *(Enterprise Feature)*. | +| `use_azure_key_vault` | Boolean | `false` | Use Azure Key Vault for key management *(Enterprise Feature)*. | +| `use_google_kms` | Boolean | `false` | Use Google Cloud KMS for encryption *(Enterprise Feature)*. | +| `default_team_disabled` | Boolean | `false` | Disable personal API key creation *(Enterprise Feature)*. | +| `custom_key_generate` | String | `None` | Path to custom API key generation function *(Advanced Usage)*. | +| Encryption salt (environment variable: `LITELLM_SALT_KEY`) | String | Randomly Generated (but should be set) | Salt key for encryption (set via environment variable). | +| **Rate Limiting & Quotas** | | | | +| `max_parallel_requests` | Integer | Provider/Model Dependent | Max parallel requests per deployment/model. | +| `global_max_parallel_requests` | Integer | No Global Cap (Resource Limited) | Max parallel requests across the entire proxy. | +| `max_request_size_mb` | Integer | Reasonable Default (e.g., 5-10MB) | Max request payload size (MB). | +| `max_response_size_mb` | Integer | Reasonable Default (e.g., 10MB) | Max response size (MB). | +| `proxy_budget_rescheduler_min_time` | Integer | `597` | Min time for budget rescheduler check (seconds). | +| `proxy_budget_rescheduler_max_time` | Integer | `605` | Max time for budget rescheduler check (seconds). | +| **Monitoring, Alerting & Health Checks** | | | | +| `background_health_checks` | Boolean | `true` | Enable background health checks on models. | +| `health_check_interval` | Integer | `300` | Interval between health checks (seconds). | +| `health_check_details` | Boolean | `true` (or version dependent) | Show detailed info in `/health` endpoint. | +| `alerting` | Array | `[]` | Alerting methods (e.g., `["slack"]`). 
| +| `alerting_threshold` | Integer | `None` | Threshold for triggering alerts. | +| `alerting_args` | Object | `None` | Arguments for alerting integrations. | +| `alert_types` | Array | Default Set for Slack Alerting | Alert types for Slack alerting. | +| `alert_to_webhook_url` | Object | `None` | Custom webhook URLs per alert type. | +| `spend_report_frequency` | String | `None` | Frequency of spend reports *(Enterprise Feature)*. | +| `forward_openai_org_id` | Boolean | `false` | Forward `OpenAI-Organization` header to OpenAI API. | +| `forward_client_headers_to_llm_api` | Boolean | `false` | Forward client headers (X-*) to LLM API. | +| `use_client_credentials_pass_through_routes` | Boolean | `false` | Allow client credentials on pass-through routes *(Enterprise Feature)*. | +| `allow_client_side_credentials` | Boolean | `false` | Allow provider credentials in request body/headers *(Use with Caution)*. | +| **Other Miscellaneous Settings** | | | | +| `service_account_settings` | List | `None` | Settings for service account keys *(Enterprise Feature)*. | +| `provider_budget_config` | Object | `None` | Budget limits per LLM provider *(Advanced Feature)*. | +| `model_group_alias` | Object | `None` | Alias mapping for model groups. | +| `retry_after` | Integer | `0` | Base retry delay (seconds) *(Advanced Feature)*. | +| `num_retries` | Integer | `3` | Number of retry attempts. | + +### Default Model Selection + +#### `completion_model` + +**YAML Key:** `completion_model` + +**Type:** String (Must be a `model_name` defined in your `model_list`) + +**Environment Variable:** N/A + +**Default Value:** `None` (No default completion model is set). If not configured, requests without a `model` parameter will likely result in an error. + +**Description:** The `completion_model` parameter, within `general_settings`, defines the **default `model_name`** (as defined in your `model_list`) that the proxy should use when a client application sends a **completion or chat request** (to endpoints like `/v1/completions` or `/v1/chat/completions`) **without explicitly specifying a `model` parameter** in the request body. + +* **Default Model for Unqualified Requests:** This setting acts as a fallback. In practice, well-behaved API clients should always specify a `model` in their requests. However, `completion_model` ensures that if a request *does* arrive without a `model`, the proxy has a designated default model to use. +* **Model Alias from** `model_list`:** The value of `completion_model` must be a valid `model_name` that you have already defined in your `model_list` section of `config.yaml`. This ensures that the default model is one that the proxy is configured to serve. +* **Ensuring a Default Behavior:** Setting `completion_model` provides a safety net, guaranteeing that even if a client omits the `model` parameter, the proxy can still process the request using a predefined default model. This can be useful in scenarios where you want to enforce a specific model as the standard choice, or for simplifying client-side code in certain use cases. + +**Example YAML:** + +```yaml +general_settings: + completion_model: "gpt-3.5-turbo" # Setting "gpt-3.5-turbo" as the default completion model + # ... other general_settings ... +model_list: + - model_name: gpt-3.5-turbo # Defining "gpt-3.5-turbo" in model_list + litellm_params: + model: azure/gpt-turbo-small-eu + api_key: "os.environ/AZURE_API_KEY_EU" + # ... other litellm_params ... 
+``` + +In this configuration, if a client sends a request to `/v1/chat/completions` *without* including `"model": "..."` in the request body, the proxy will automatically route that request to the model deployment aliased as `"gpt-3.5-turbo"` (which, in this example, is configured to use an Azure OpenAI deployment). + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Note:** While `completion_model` provides a default, it's generally best practice for client applications to always **explicitly specify the `model`** in their API requests to avoid ambiguity and ensure they are targeting the intended model. + +--- + +#### `embedding_model` + +**YAML Key:** `embedding_model` + +**Type:** String (Must be a `model_name` defined in your `model_list`) + +**Environment Variable:** N/A + +**Default Value:** `None` (No default embedding model is set). If not configured, requests to `/v1/embeddings` without a `model` parameter will likely result in an error. + +**Description:** The `embedding_model` parameter, within `general_settings`, defines the **default `model_name`** (from your `model_list`) that the proxy should use for **embedding requests** (to the `/v1/embeddings` endpoint) when the client request **does not specify a `model` parameter**. + +* **Default Embedder:** Similar to `completion_model`, `embedding_model` sets a default for embedding-related API calls. If a client sends a request to `/v1/embeddings` omitting the `model` parameter, the proxy will assume this default model for generating embeddings. +* **Model Alias from** `model_list`:** The value of `embedding_model` must be a valid `model_name` that is already defined in your `model_list` section of `config.yaml`. This ensures that the default embedder is one that the proxy is configured to manage and serve. +* **Directing Embedding Calls:** Setting `embedding_model` is useful if you want to direct all embedding-related API calls to a specific embedding model without requiring clients to always specify the model name in their requests. This can simplify client code and enforce a consistent embedding strategy. + +**Example YAML:** + +```yaml +general_settings: + embedding_model: "text-embedding-ada-002" # Setting "text-embedding-ada-002" as the default embedding model + # ... other general_settings ... +model_list: + - model_name: text-embedding-ada-002 # Defining "text-embedding-ada-002" in model_list + litellm_params: + model: openai/text-embedding-ada-002 + api_key: "os.environ/OPENAI_API_KEY" + # ... other litellm_params ... +``` + +In this configuration, if a client sends a request to `/v1/embeddings` without specifying `"model": "..."` in the request body, the proxy will automatically use the model deployment aliased as `"text-embedding-ada-002"` as the default embedder. + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Note:** While `embedding_model` provides a default embedder, it's generally good practice for API clients to explicitly specify the `model` in their `/v1/embeddings` requests to avoid reliance on defaults and ensure they are using the intended embedding model. + +--- + +#### `image_generation_model` + +**YAML Key:** `image_generation_model` + +**Type:** String (Must be a `model_name` defined in your `model_list`) + +**Environment Variable:** N/A + +**Default Value:** `None` (No default image generation model is set). If not configured, requests to `/images/generations` may not be handled or might result in an error. 
+ +**Description:** The `image_generation_model` parameter, within `general_settings`, defines the **default model name** (from your `model_list`) that the proxy should use for **image generation requests** (to the `/images/generations` endpoint). **Crucially, this setting *overrides* any `model` parameter provided in the client's image generation request payload.** + +* **Forcing a Specific Image Model:** When `image_generation_model` is set, the proxy will **always route image generation requests to this specified model**, regardless of whether the client includes a `model` parameter in their request. This effectively enforces the use of a particular image generation model for all image-related API calls. +* **Model Alias from** `model_list`:** The value of `image_generation_model` must be a valid `model_name` that you have already defined in your `model_list` section of `config.yaml`. +* **Enforcing a Single Image Model Policy:** This setting is useful when you want to strictly control which image generation model is used by your application and ensure that all image requests are routed to a specific, designated model. For example, you might want to force all image generation to use DALL-E 2 and prevent users from requesting other image models via the proxy. + +**Example YAML:** + +```yaml +general_settings: + image_generation_model: "dalle-2" # Setting "dalle-2" as the default and *only* image generation model + # ... other general_settings ... +model_list: + - model_name: dalle-2 # Defining "dalle-2" in model_list + litellm_params: + model: openai/dall-e-2 + api_key: "os.environ/OPENAI_API_KEY" + # ... other litellm_params ... +``` + +In this configuration, *all* requests to the `/images/generations` endpoint, regardless of any `model` parameter they might include, will be routed to the model deployment aliased as `"dalle-2"`. + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Note:** Setting `image_generation_model` effectively **ignores** any `model` parameter provided by the client in image generation requests. This enforces a strict policy of using the configured default model for all image-related API calls. If you want to allow clients to choose image models, do *not* set `image_generation_model`. + +--- + +#### `moderation_model` + +**YAML Key:** `moderation_model` + +**Type:** String (Must be a `model_name` defined in your `model_list`) + +**Environment Variable:** N/A + +**Default Value:** `None` (No default moderation model is set). If not configured, moderation requests might not be handled correctly or may result in an error. + +**Description:** The `moderation_model` parameter, within `general_settings`, defines the **default `model_name`** (from your `model_list`) that the proxy should use for **moderation requests** (to the `/moderations` endpoint). + +* **Default Moderation Model:** When a client application sends a request to the `/moderations` endpoint, which is typically used for content moderation checks, the proxy will use the model specified by `moderation_model` to perform the moderation, *unless* a different moderation model is explicitly configured in the request itself (Note: OpenAI's standard `/moderations` endpoint does *not* typically accept a `model` parameter in the request body; this setting is relevant if LiteLLM's proxy supports multiple moderation models or custom moderation endpoints in the future). 
+* **Model Alias from** `model_list`:** The value of `moderation_model` must be a valid `model_name` that is already defined in your `model_list` section of `config.yaml`. This should be a model that is suitable for content moderation tasks (e.g., OpenAI's moderation models or similar). +* **Ensuring a Moderation Model is Used:** Setting `moderation_model` ensures that the proxy has a designated model to use for moderation checks, even if the client's request does not explicitly specify one. + +**Example YAML:** + +```yaml +general_settings: + moderation_model: "openai-moderation" # Setting "openai-moderation" as the default moderation model + # ... other general_settings ... +model_list: + - model_name: openai-moderation # Defining "openai-moderation" in model_list + litellm_params: + model: openai/text-moderation-stable + api_key: "os.environ/OPENAI_API_KEY" + # ... other litellm_params ... +``` + +In this configuration, when a client sends a request to `/moderations` without a `model` parameter, the proxy will use the model deployment aliased as `"openai-moderation"` for performing the content moderation check. + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Note:** In practice, OpenAI's standard `/moderations` endpoint does not typically allow specifying a model parameter in the request body. The `moderation_model` setting in `general_settings` is more relevant if LiteLLM Proxy were to support multiple moderation models or custom moderation endpoints in the future. Currently, it mainly serves to define a default moderation model within your proxy configuration. + +--- + +#### `infer_model_from_keys` + +**YAML Key:** `infer_model_from_keys` + +**Type:** Boolean + +**Environment Variable:** N/A + +**Default Value:** `false` (Model is not inferred from API keys by default). + +**Description:** The `infer_model_from_keys` parameter, within `general_settings`, is a **boolean flag** that, when set to `true`, instructs the proxy to attempt to **infer the intended model for a request based on the API key (virtual key) used in the request**. + +* **API Key-Based Model Routing:** This feature is relevant in scenarios where you issue different API keys that are **scoped to specific models or model groups**. For example, you might issue one set of API keys that are only authorized to access GPT-4 models, and another set for GPT-3.5 models. +* **Key-Based Inference Logic:** When `infer_model_from_keys: true`, the proxy will examine the API key used in an incoming request. If the API key is configured to be associated with a specific model (or set of models), the proxy will attempt to **automatically route the request to that model**, *even if the client specifies a different `model` in the request body*. +* **Prioritizing Key-Based Routing:** If model inference from the API key is successful, it will **take precedence** over any `model` parameter provided by the client. In essence, the API key becomes the primary factor in determining the model to be used. If model inference from the key is *not* possible (e.g., the key is not associated with a specific model), the proxy will then fall back to using the `model` parameter from the client request (or the `completion_model`/`embedding_model` defaults if no model is specified in the request either). 
+* **Multi-Tenant Scenarios:** `infer_model_from_keys` is particularly useful in **multi-tenant environments** where you want to enforce model access control at the API key level, ensuring that each user or team is restricted to using only the models they are authorized to access. + +**Example YAML:** + +```yaml +general_settings: + infer_model_from_keys: true # Enabling model inference from API keys + # ... other general_settings ... +# Example model_list (showing model_name aliases, not directly related to infer_model_from_keys): +model_list: + - model_name: gpt-4-key-scoped + litellm_params: + model: azure/gpt-4-deployment + api_key: "os.environ/AZURE_GPT4_API_KEY" # This key would be associated with gpt-4 in key management + - model_name: gpt-3.5-turbo-key-scoped + litellm_params: + model: openai/gpt-3.5-turbo + api_key: "os.environ/OPENAI_GPT35_API_KEY" # This key would be associated with gpt-3.5-turbo in key management +``` + +In this configuration, if a request comes in with an API key that is internally marked as being associated with `"gpt-4-key-scoped"`, the proxy will route it to the Azure GPT-4 deployment (as defined by the `gpt-4-key-scoped` model alias), *regardless* of what `model` parameter the client might have included in the request body. If a request comes with a key associated with `"gpt-3.5-turbo-key-scoped"`, it will be routed to the OpenAI GPT-3.5 Turbo deployment, and so on. + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Note:** `infer_model_from_keys: true` is typically used in conjunction with the proxy's **virtual key management system**. You would need to configure your API keys (via the Admin UI or API) to associate them with specific models or model groups for this feature to be effective. If you are not using virtual keys or have not configured key-to-model associations, enabling `infer_model_from_keys: true` will likely have no effect. + +--- + +### Master Key & Authentication + +This subsection within `general_settings` focuses on configuring the master API key, which provides administrative access to the proxy, and enabling/configuring various authentication methods (JWT, OAuth2) for enhanced security. + +#### `master_key` (`general_settings`) + +**YAML Key:** `master_key` + +**Type:** String (Secret, high-entropy string) + +**Environment Variable:** `PROXY_MASTER_KEY` + +**Default Value:** `None` (Proxy will not start in key-managed mode if `master_key` is not set). + +**Description:** The `master_key` parameter, within `general_settings`, defines the **master API key** for your LiteLLM Proxy Server. This is a **critical security credential** that acts as an **administrator key** for the proxy. + +* **Admin Authentication:** The `master_key` is used to authenticate access to **privileged admin endpoints** of the proxy, such as: + * Generating new API keys (virtual keys). + * Listing, updating, or deleting API keys. + * Viewing usage statistics and spend reports. + * Accessing the Admin UI (if enabled). +* **Security Importance:** The `master_key` is a **highly sensitive secret**. **Protect it carefully.** Anyone who possesses the `master_key` gains administrative control over your LiteLLM Proxy instance. +* **High Entropy String:** The `master_key` value should be a **strong, randomly generated string** with high entropy (i.e., difficult to guess). It is often recommended to generate a key that resembles an OpenAI API key (starting with `sk-` followed by a long random string), but any sufficiently random string will work. 
+* **Required for Key-Managed Mode:** Setting `master_key` is **essential** if you want to run the LiteLLM Proxy in **"key-managed" mode**, where the proxy manages API keys (virtual keys), enforces rate limits, tracks usage, and provides admin functionalities. If `master_key` is not set, the proxy may run in a limited or "single-key" mode, without key management features. + +**Example YAML:** + +```yaml +general_settings: + master_key: "sk-my_strong_master_key_1234567890abcdef" # Setting the master API key + # ... other general_settings ... +``` + +**Example Environment Variable:** + +```bash +export PROXY_MASTER_KEY="sk-my_strong_master_key_1234567890abcdef" # Setting master key via environment variable +``` + +**Security Warning:** **Never hardcode your `master_key` directly in your `config.yaml` file or store it in version control.** Always use environment variables ( `PROXY_MASTER_KEY` ) or secure secret management practices to protect your master API key. Compromising the `master_key` grants full administrative access to your LiteLLM Proxy, potentially leading to unauthorized access, data breaches, or abuse of your LLM infrastructure. + +--- + +#### `enable_jwt_auth` (`general_settings`) + +**YAML Key:** `enable_jwt_auth` + +**Type:** Boolean + +**Environment Variable:** N/A + +**Default Value:** `false` (JWT authentication is disabled by default). + +**Description:** The `enable_jwt_auth` parameter, within `general_settings`, is an **enterprise-level boolean flag** that enables **JWT (JSON Web Token) based authentication** for accessing the **proxy's admin UI and admin API endpoints**. + +* **JWT Authentication for Admin Access:** When `enable_jwt_auth: true`, the proxy will require valid JWTs for authentication to privileged admin routes (like `/key/generate`, `/team/*`, `/admin/*`, and the Admin UI). This provides a more robust and standardized authentication mechanism compared to just using the `master_key`. +* **Enterprise Feature:** JWT authentication is primarily an **enterprise feature**, suitable for organizations that need to integrate the proxy with existing SSO (Single Sign-On) systems or centralized authentication infrastructure. +* **JWT Validation Required:** If you enable `enable_jwt_auth: true`, you **must also configure the `litellm_jwtauth` section** in `general_settings`. `litellm_jwtauth` is where you define the settings for validating incoming JWTs (e.g., JWT secret key, algorithms, issuer, audience). If `enable_jwt_auth: true` but `litellm_jwtauth` is not properly configured, admin access via JWT will not function correctly. + +**Example YAML (Enabling JWT Auth):** + +```yaml +general_settings: + enable_jwt_auth: true # Enterprise Feature: Enabling JWT authentication for admin access + litellm_jwtauth: # Required: JWT authentication settings + secret: "my-jwt-verification-secret" + algorithms: ["HS256"] + # ... other jwt auth settings ... + # ... other general_settings ... +``` + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Note:** `enable_jwt_auth: true` is typically used in conjunction with `litellm_jwtauth` to fully configure JWT-based authentication. It is an **enterprise feature** and requires careful configuration of JWT validation settings. 
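+
+To make the setup above concrete, here is a minimal, hedged client-side sketch (not an official LiteLLM client) of minting an HS256 token that matches the example `litellm_jwtauth` settings and presenting it to an admin route. The base URL, the placeholder claims, and the use of a standard `Authorization: Bearer` header are illustrative assumptions; the claims and roles your proxy actually requires depend on your `litellm_jwtauth` configuration.
+
+```python
+# Sketch: call a JWT-protected admin route. Assumes PyJWT and requests are installed,
+# and that the proxy reads the JWT from the "Authorization: Bearer <token>" header.
+import datetime
+
+import jwt       # PyJWT
+import requests
+
+PROXY_BASE = "http://0.0.0.0:4000"          # assumed proxy address
+JWT_SECRET = "my-jwt-verification-secret"   # must match litellm_jwtauth secret (HS256)
+
+# Sign a short-lived token with the same secret/algorithm the proxy verifies against.
+token = jwt.encode(
+    {
+        "sub": "admin-user",  # placeholder claim; real deployments add role/scope claims
+        "exp": datetime.datetime.utcnow() + datetime.timedelta(minutes=5),
+    },
+    JWT_SECRET,
+    algorithm="HS256",
+)
+
+# Present the JWT instead of the master key on an admin endpoint such as /key/generate.
+resp = requests.post(
+    f"{PROXY_BASE}/key/generate",
+    headers={"Authorization": f"Bearer {token}"},
+    json={},  # body fields omitted for brevity
+)
+print(resp.status_code, resp.text)
+```
+
+If the token is rejected, double-check that the algorithm, secret, and any issuer or audience restrictions configured in `litellm_jwtauth` match what the token was signed with.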
+ +--- + +#### `litellm_jwtauth` (`general_settings`) + +**YAML Key:** `litellm_jwtauth` + +**Type:** Object (Mapping) + +**Environment Variable:** N/A + +**Default Value:** `None` (Required if `enable_jwt_auth: true`) + +**Description:** The `litellm_jwtauth` parameter, within `general_settings`, is an **enterprise-level mapping** (object) that **configures the settings for JWT (JSON Web Token) authentication** when `enable_jwt_auth: true` is enabled. This section defines how the proxy will **validate and verify incoming JWTs** used for admin access. + +* **JWT Validation Configuration:** `litellm_jwtauth` must be configured when JWT authentication is enabled. It specifies how the proxy should validate the signature and claims of incoming JWTs to ensure they are valid and authorized. +* **Required Settings:** Key settings within `litellm_jwtauth` typically include: + * `secret`: (String): The **secret key** or **public key** used to verify the signature of JWTs. The type of key (secret or public) depends on the JWT algorithm used (e.g., `HS256` typically uses a secret key, while `RS256` uses a public key). For `HS256`, this should be the same secret used to *sign* the JWTs. For asymmetric algorithms like `RS256`, this would be the *public key* corresponding to the private key used for signing. + * `algorithms`: (Array of Strings): A list of **accepted JWT algorithms** (e.g., `["HS256"]`, `["RS256"]`, `["HS256", "RS256"]`). The proxy will only accept JWTs signed using one of the algorithms listed here. Common algorithms include `HS256` (HMAC-SHA256), `RS256` (RSA-SHA256), etc. + +* **Optional Settings:** Depending on your JWT setup, you might also need to configure: + * `issuer`: (String, Optional): The expected **issuer** ( `iss` claim) of the JWTs. If set, the proxy will verify that the `iss` claim in the JWT matches this value. + * `audience`: (String or Array of Strings, Optional): The expected **audience** (`aud` claim) of the JWTs. If set, the proxy will verify that the `aud` claim in the JWT matches this value (or is included in the list of allowed audiences). + * `leeway`: (Integer, Optional): A **leeway or clock skew** value (in seconds) to account for potential clock differences between the proxy server and the JWT issuer. This can be useful to avoid JWT validation failures due to slight clock synchronization issues. + +**Example YAML (JWT Authentication Configuration):** + +```yaml +general_settings: + enable_jwt_auth: true # Enterprise Feature: JWT authentication is enabled + litellm_jwtauth: # JWT authentication settings (required when enable_jwt_auth: true) + secret: "myJWTsecret" # Secret key for JWT verification (HS256 algorithm) + algorithms: ["HS256"] # Accepting HS256 algorithm + issuer: "https://my-sso-provider.example.com" # Optional: Expected JWT issuer + audience: "litellm-proxy" # Optional: Expected JWT audience + leeway: 60 # Optional: Leeway of 60 seconds for clock skew + # ... other general_settings ... +``` + +In this example, the proxy is configured to validate JWTs using the `HS256` algorithm and the secret key `"myJWTsecret"`. It also verifies the issuer and audience claims and allows for a 60-second clock skew. + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Security Note:** The `secret` value in `litellm_jwtauth` is a **highly sensitive security credential**. Protect it carefully. For `HS256`, this is a shared secret that must be kept confidential. 
For asymmetric algorithms like `RS256`, the `secret` here would typically be the **public key**, while the corresponding **private key** used for signing JWTs must be securely managed and kept private by your JWT issuer. Consider using secure secret management practices to handle your JWT secrets. + +--- + +#### `allowed_routes` + +**YAML Key:** `allowed_routes` + +**Type:** Array of Strings (List of API route paths) + +**Environment Variable:** N/A + +**Default Value:** If not set, all standard proxy API routes are allowed for non-admin users. + +**Description:** The `allowed_routes` parameter, within `general_settings`, is an **array of strings** that defines a **whitelist of proxy API routes that are explicitly allowed to be accessed by non-admin users** (i.e., users authenticating with regular API keys or JWTs *without* admin privileges). + +* **Route-Based Access Control:** `allowed_routes` implements a form of **route-based access control**. It lets you restrict which proxy endpoints regular users can access, while potentially reserving other routes for admin-only access. +* **Security and Policy Enforcement:** Use `allowed_routes` to enforce security policies and control which functionalities are exposed to regular API clients. For example, you might want to allow access only to the `/v1/chat/completions` and `/v1/embeddings` endpoints for standard users, and restrict access to admin or management endpoints. +* **Route Path Strings:** Each string in the `allowed_routes` array should be a **proxy API route path**, starting with `/`. Examples include: + * `"/v1/chat/completions"`: The chat completions endpoint. + * `"/v1/embeddings"`: The embeddings endpoint. + * `"/v1/completions"`: The text completions endpoint. + * `"/model/info"`: The model information endpoint. + + **Important:** Do *not* include the proxy's base URL (e.g., `http://0.0.0.0:4000`) in the route paths. Only specify the path portion (e.g., `"/v1/chat/completions"`). + +* **Whitelist Approach:** `allowed_routes` is a **whitelist**. Only the routes explicitly listed in this array will be accessible to non-admin users. Any other routes will be **blocked** for regular API keys. Admin users (authenticated with the `master_key` or admin JWT) are typically exempt from these route restrictions and can access all proxy endpoints. +* **Default Allow-All (If Not Set):** If `allowed_routes` is **not set** in your `config.yaml`, the proxy will default to **allowing access to all standard proxy API routes** for non-admin users. In this case, route-based access control is effectively disabled, and regular users can access any standard endpoint. + +**Example YAML (Restricting Routes to Chat and Embeddings):** + +```yaml +general_settings: + allowed_routes: # Whitelisting allowed routes for non-admin users + - "/v1/chat/completions" # Allowing chat completions endpoint + - "/v1/embeddings" # Allowing embeddings endpoint + # ... other general_settings ... +``` + +In this configuration, only the `/v1/chat/completions` and `/v1/embeddings` endpoints will be accessible to users authenticating with regular API keys. Any requests to other proxy endpoints (e.g., `/v1/completions`, `/key/generate`, `/admin/*`, etc.) using non-admin keys will be rejected. + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Note:** `allowed_routes` is typically used in conjunction with authentication mechanisms (like virtual keys or JWT auth) to enforce access control policies. It provides a basic level of route-based authorization. 
For more granular authorization logic, consider using custom authentication modules (`custom_auth` setting) or more advanced RBAC (Role-Based Access Control) features if available in Enterprise editions. + +--- + +#### `admin_only_routes` + +**YAML Key:** `admin_only_routes` + +**Type:** Array of Strings (List of API route paths) + +**Environment Variable:** N/A + +**Default Value:** `[]` (No routes are admin-only by default). + +**Description:** The `admin_only_routes` parameter, within `general_settings`, is an **enterprise-level array of strings** that defines a **blacklist of proxy API routes** that are **exclusively accessible to admin users**. This is the inverse of `allowed_routes`. + +* **Route-Based Admin Restriction:** `admin_only_routes` allows you to designate specific proxy endpoints as **admin-only**. If a non-admin user (using a regular API key or a JWT without admin privileges) attempts to access any route listed in `admin_only_routes`, the proxy will **block the request** and return an unauthorized or not-found error. +* **Reserving Routes for Admin Functions:** Use `admin_only_routes` to reserve sensitive or administrative functionalities for authorized admin users only. Common routes to make admin-only might include: + * `"/key/*"`: All API key management endpoints (e.g., `/key/generate`, `/key/list`, `/key/update`, `/key/delete`). + * `"/team/*"`: Team management endpoints (if applicable). + * `"/admin/*"`: General admin and configuration endpoints. + * `"/v1/fine-tunes"`: Fine-tuning related endpoints (if supported by the proxy and you want to restrict access to fine-tuning). +* **Route Path Strings:** Similar to `allowed_routes`, each string in the `admin_only_routes` array should be a **proxy API route path**, starting with `/` (e.g., `"/key/generate"`, `"/admin/settings"`). Do *not* include the base URL. +* **Blacklist Approach:** `admin_only_routes` acts as a **blacklist**. Any route listed here will be accessible *only* to admin users. All other routes (that are not explicitly blocked) will be accessible to non-admin users (subject to any `allowed_routes` whitelisting, if configured). +* **Enterprise Feature:** `admin_only_routes` is primarily an **enterprise feature** used in more controlled or security-conscious deployments where finer-grained access control is needed. + +**Example YAML (Making Key Management Routes Admin-Only):** + +```yaml +general_settings: + admin_only_routes: # Blacklisting routes for admin-only access + - "/key" # All /key/* endpoints (key management) + - "/team" # All /team/* endpoints (team management) + - "/admin" # All /admin/* endpoints (general admin) + # ... other general_settings ... +``` + +In this configuration, any requests to routes starting with `/key`, `/team`, or `/admin` will be blocked for non-admin users. Only users authenticating with the `master_key` or a JWT with admin privileges will be able to access these routes. + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Note:** `admin_only_routes` is typically used in conjunction with authentication mechanisms (like virtual keys, JWT auth, or OAuth2) to enforce route-level access control. It provides a way to reserve sensitive proxy functionalities to authorized administrators. If neither `allowed_routes` nor `admin_only_routes` are set, all standard proxy API routes are generally accessible to non-admin users (subject to any other authentication requirements). 
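+
+The effect of `allowed_routes` and `admin_only_routes` is easiest to see from the client side. The sketch below (key value and error codes are placeholders, not guaranteed by any specific proxy version) sends the same non-admin virtual key to a whitelisted route and to an admin-only route: the first request is served normally, while the second is rejected with an unauthorized or not-found style error.
+
+```python
+# Sketch: observe allowed_routes / admin_only_routes behavior with a regular (non-admin) key.
+# The key value is a placeholder; exact rejection codes depend on your proxy configuration.
+import requests
+
+PROXY_BASE = "http://0.0.0.0:4000"       # assumed proxy address
+REGULAR_KEY = "sk-regular-virtual-key"   # non-admin virtual key (placeholder)
+HEADERS = {"Authorization": f"Bearer {REGULAR_KEY}"}
+
+# Whitelisted route (allowed_routes): expected to be served for a regular key.
+ok = requests.post(
+    f"{PROXY_BASE}/v1/chat/completions",
+    headers=HEADERS,
+    json={"model": "gpt-3.5-turbo", "messages": [{"role": "user", "content": "Hello"}]},
+)
+print("POST /v1/chat/completions ->", ok.status_code)
+
+# Admin-only route (admin_only_routes): expected to be rejected for a regular key.
+blocked = requests.post(f"{PROXY_BASE}/key/generate", headers=HEADERS, json={})
+print("POST /key/generate ->", blocked.status_code)
+```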
+ +--- + +#### `allowed_ips` + +**YAML Key:** `allowed_ips` + +**Type:** Array of Strings (List of IP addresses or CIDR ranges) + +**Environment Variable:** N/A + +**Default Value:** If not set, all IP addresses are allowed by default. + +**Description:** The `allowed_ips` parameter, within `general_settings`, is an **array of strings** that defines an **IP address allowlist** for accessing the LiteLLM Proxy Server. When set, the proxy will **only accept incoming requests that originate from IP addresses or CIDR ranges listed in this allowlist**. + +* **IP-Based Access Control:** `allowed_ips` provides a basic form of **IP-based access control**. It restricts access to the proxy server based on the source IP address of the incoming request. +* **Security Layer:** Use `allowed_ips` as an **additional layer of security**, especially in environments where you want to limit access to the proxy to only trusted networks or clients (e.g., within a private network, from specific office locations, or from known client IP ranges). +* **IP Addresses or CIDR Ranges:** Each string in the `allowed_ips` array can be either: + * An **individual IPv4 address** (e.g., `"192.168.1.100"`, `"198.51.100.5"`). + * A **CIDR (Classless Inter-Domain Routing) range** (e.g., `"10.0.0.0/8"`, `"192.0.2.0/24"`). CIDR notation allows you to specify a range of IP addresses using a base IP address and a subnet mask. +* **Default Allow-All (If Not Set):** If `allowed_ips` is **not set** in your `config.yaml`, the proxy will default to **allowing requests from all IP addresses**. In this case, IP-based access control is effectively disabled, and the proxy will accept requests from any source IP address (subject to other authentication mechanisms). +* **Simple IP Filtering:** `allowed_ips` provides a relatively simple IP allowlisting mechanism. For more sophisticated or dynamic IP-based access control, you might need to use a web application firewall (WAF) or other network-level security tools in front of the proxy. + +**Example YAML (Allowing Requests from a Private Network and a Specific Public IP):** + +```yaml +general_settings: + allowed_ips: # IP allowlist configuration + - "10.0.0.0/8" # Allow requests from the entire 10.0.0.0/8 private network range + - "198.51.100.5" # Allow requests from this specific public IP address + # ... other general_settings ... +``` + +In this example, the proxy will only accept requests originating from IP addresses within the `10.0.0.0/8` private network range, or from the specific public IP address `198.51.100.5`. Requests from any other IP addresses will be rejected (typically with an "Unauthorized" or "Not Found" error). + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Security Note:** `allowed_ips` provides a basic layer of IP-based security. However, IP-based access control alone is generally **not sufficient for robust security**, especially in public-facing applications. It's recommended to combine `allowed_ips` with other security measures, such as strong authentication (virtual keys, JWT, OAuth2), rate limiting, and network-level firewalls, for a more comprehensive security posture. + +--- + +#### `enforce_user_param` + +**YAML Key:** `enforce_user_param` + +**Type:** Boolean + +**Environment Variable:** N/A + +**Default Value:** `false` (The `user` parameter is optional by default). 
+ +**Description:** The `enforce_user_param` parameter, within `general_settings`, is a **boolean flag** that controls whether the proxy **requires every API request to include a `user` parameter** (either in the request body or as a header). + +* **User Identification Requirement:** When `enforce_user_param: true`, the proxy will **reject any API request that does not contain a `user` parameter**. This parameter is typically used to identify the end-user or application user on whose behalf the LLM request is being made. +* **Cost Attribution and Tracking:** Enforcing the `user` parameter is often used for **cost attribution and tracking**. By requiring user identification, you can accurately track LLM usage and spend on a per-user basis, which is essential for: + * Usage dashboards and reports. + * Cost allocation and chargebacks within an organization. + * Per-user budgeting and quota enforcement. +* **Audit Logging:** Including a `user` identifier in each request also enhances audit logging, making it easier to trace requests back to specific users or applications. +* **OpenAI API Compatibility:** The `user` parameter is a standard parameter supported by the OpenAI API (and many other LLM providers). LiteLLM Proxy can pass this `user` parameter to the backend LLM provider when supported by the provider's API. + +**Example YAML:** + +```yaml +general_settings: + enforce_user_param: true # Enforcing the user parameter on all API requests + # ... other general_settings ... +``` + +**Example Environment Variable:** N/A (YAML-only parameter) + +**How to Provide the `user` Parameter in Requests:** + +When `enforce_user_param: true` is set, client applications must include the `user` parameter in their API requests. This can be done in a few ways, depending on the API endpoint and client library: + +* **In the Request Body (for `/chat/completions`, `/completions`, `/embeddings`):** + +```json +{ + "model": "gpt-3.5-turbo", + "messages": [ + {"role": "user", "content": "Hello"} + ], + "user": "user123" // User identifier in the request body +} +``` + +* **As a Header (e.g., `X-User-ID`):** The exact header name might be provider-specific or configurable. Check the documentation for the specific header to use. + +If a request is sent to the proxy *without* a `user` parameter when `enforce_user_param: true` is enabled, the proxy will reject the request and return an error indicating that the `user` parameter is required. + +**Note:** If you are not interested in per-user cost tracking or do not need to identify end-users for each request, you can leave `enforce_user_param: false` (the default). In this case, the `user` parameter becomes optional, and the proxy will still process requests even if it's not provided. + +--- + +#### `enable_oauth2_proxy_auth` (`general_settings`) + +**YAML Key:** `enable_oauth2_proxy_auth` + +**Type:** Boolean + +**Environment Variable:** N/A + +**Default Value:** `false` (OAuth2.0 proxy authentication is disabled by default). + +**Description:** The `enable_oauth2_proxy_auth` parameter, within `general_settings`, is an **enterprise-level boolean flag** that enables **OAuth2.0 authentication** for accessing the LiteLLM Proxy Server. + +* **OAuth2.0 Integration for Proxy Access:** When `enable_oauth2_proxy_auth: true`, the proxy will enforce an **OAuth2.0 authentication flow** for all incoming API requests. This means that clients must present a valid OAuth2.0 access token to authenticate with the proxy. 
+* **Enterprise Authentication Standard:** OAuth2.0 is a widely used industry standard protocol for authorization and delegated access control. Enabling OAuth2.0 proxy authentication is typically relevant in **enterprise environments** where organizations already use OAuth2.0-based identity providers (like Okta, Auth0, Azure AD, etc.) for user authentication. +* **Integration with OAuth2.0 Proxies or Providers:** `enable_oauth2_proxy_auth: true` implies that you have an **OAuth2.0 proxy** (like OAuth2-Proxy, Keycloak, or similar) or an **OAuth2.0 identity provider** configured in front of your LiteLLM Proxy deployment. The LiteLLM Proxy itself does not act as an OAuth2.0 provider; it relies on an external OAuth2.0 system for authentication. + +**Example YAML (Enabling OAuth2.0 Proxy Auth):** + +```yaml +general_settings: + enable_oauth2_proxy_auth: true # Enterprise Feature: Enabling OAuth2.0 proxy authentication + # ... other general_settings ... +``` + +**Example Environment Variable:** N/A (YAML-only parameter) + +**How OAuth2.0 Proxy Authentication Typically Works (Conceptual):** + +1. **Client Request to Proxy:** A client application attempts to send a request to the LiteLLM Proxy Server. +2. **OAuth2.0 Proxy Intercepts:** An OAuth2.0 proxy (deployed in front of the LiteLLM Proxy) intercepts the request. +3. **Authentication Check:** The OAuth2.0 proxy checks if the client has a valid OAuth2.0 access token. + * **No Token:** If no valid token is found, the OAuth2.0 proxy might redirect the client to an authentication flow (e.g., to log in with their credentials at an identity provider). + * **Valid Token:** If a valid token is present, the OAuth2.0 proxy validates the token against the OAuth2.0 provider. +4. **Forward to LiteLLM Proxy:** If the OAuth2.0 token is successfully validated by the OAuth2.0 proxy, it forwards the *authenticated* request to the LiteLLM Proxy Server. Typically, the OAuth2.0 proxy might add headers to the forwarded request indicating the authenticated user or client identity. +5. **LiteLLM Proxy Access Granted:** Since `enable_oauth2_proxy_auth: true` is set in the LiteLLM Proxy, it now trusts that requests reaching it have already been authenticated by the OAuth2.0 proxy. It will then process the request and route it to the backend LLM. + +**Important Notes:** + +* **OAuth2.0 Proxy Required:** `enable_oauth2_proxy_auth: true` **requires that you deploy an OAuth2.0 proxy in front of the LiteLLM Proxy Server**. The LiteLLM Proxy itself does not implement the OAuth2.0 authentication flow; it relies on the external OAuth2.0 proxy for authentication enforcement. +* **Enterprise Feature:** OAuth2.0 proxy authentication is an **enterprise feature** for advanced authentication scenarios. For simpler authentication needs, consider using virtual API keys or JWT authentication (if Enterprise). +* **Configuration Complexity:** Setting up OAuth2.0 proxy authentication typically involves configuring both the OAuth2.0 proxy and the LiteLLM Proxy, and integrating them with your OAuth2.0 identity provider. Refer to the documentation of your OAuth2.0 proxy and identity provider for detailed configuration steps. + +--- + +#### `use_x_forwarded_for` + +**YAML Key:** `use_x_forwarded_for` + +**Type:** Boolean (Documentation misleadingly lists it as `str`, but it's effectively a boolean flag) + +**Environment Variable:** N/A + +**Default Value:** `false` (Direct connection IP is used by default). 
+ +**Description:** The `use_x_forwarded_for` parameter, within `general_settings`, is a **boolean flag** that controls how the proxy determines the **client's IP address**. + +* **X-Forwarded-For Header:** When `use_x_forwarded_for: true`, the proxy will use the **`X-Forwarded-For` HTTP header** to determine the client's IP address. This header is commonly used when a proxy server (like LiteLLM Proxy) is deployed behind another proxy or load balancer. Load balancers and reverse proxies often add the `X-Forwarded-For` header to incoming requests, containing the original client IP address. +* **Proxy Chain Scenario:** `use_x_forwarded_for: true` is relevant in deployments where the LiteLLM Proxy is behind another proxy or load balancer (e.g., in a cloud environment, behind an API gateway, or behind a reverse proxy like Nginx or HAProxy). In such cases, the direct connection IP address that the LiteLLM Proxy sees might be the IP of the intermediary proxy or load balancer, not the original client's IP. The `X-Forwarded-For` header is used to get the *original* client IP address from the proxy chain. +* **Direct Connection IP (Default):** By default (`use_x_forwarded_for: false`), the proxy will use the **direct connection IP address** as seen by the server socket. This is the IP address of the immediate client that connected to the proxy, which might be the intermediary proxy or load balancer if the proxy is behind one. +* **IP Allowlisting or Rate Limiting:** The client IP address, as determined by this setting, is often used for features like: + * IP-based allowlisting (`allowed_ips` setting). + * Rate limiting based on IP address. + * Logging client IP addresses for audit or analytics purposes. + +**Example YAML:** + +```yaml +general_settings: + use_x_forwarded_for: true # Using X-Forwarded-For header to determine client IP (behind a proxy) + allowed_ips: ["10.0.0.0/8"] # IP allowlist based on X-Forwarded-For IP + # ... other general_settings ... +``` + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Caution:** + +* **Trust in Proxies:** When you enable `use_x_forwarded_for: true`, the proxy will **trust the `X-Forwarded-For` header value** to determine the client IP. This is generally safe if you control the entire proxy chain (e.g., you manage both the load balancer and the LiteLLM Proxy). However, in less controlled environments, be aware that malicious clients could potentially *spoof* or manipulate the `X-Forwarded-For` header to bypass IP-based access controls. +* **Deployment Architecture:** Only enable `use_x_forwarded_for: true` if you are certain that your deployment architecture involves a proxy or load balancer that is correctly adding and managing the `X-Forwarded-For` header. Incorrectly using this setting in a non-proxy environment might lead to unexpected behavior or incorrect IP address detection. + +--- + +#### `custom_auth` (`general_settings`) + +**YAML Key:** `custom_auth` + +**Type:** String (Path to a Python module) + +**Environment Variable:** N/A + +**Default Value:** `None` (Default authentication logic is used). + +**Description:** The `custom_auth` parameter, within `general_settings`, is an **advanced setting** that allows you to **override or extend the default authentication logic** of the LiteLLM Proxy Server by providing a path to a **custom authentication module** written in Python. 
+ +* **Custom Authentication Logic:** `custom_auth` enables you to implement highly customized authentication mechanisms that go beyond the built-in options (virtual keys, JWT, OAuth2). This is for scenarios where you need to integrate the proxy with: + * **Existing Authentication Systems:** Integrate with your organization's specific authentication backend, user database, or identity provider that is not directly supported by built-in auth methods. + * **Complex Authorization Rules:** Implement custom authorization rules based on user attributes, request parameters, or external data sources. + * **Non-Standard Authentication Flows:** Implement authentication flows that deviate from standard API key, JWT, or OAuth2.0 patterns. +* **Python Module Path:** The value of `custom_auth` should be a **string** representing the **Python module path** to your custom authentication logic. This is typically in the format `"module.function_name"` or `"module.ClassName.method_name"`, pointing to a Python file and a function or method within that file that implements your custom authentication logic. +* **Advanced Usage and Responsibility:** `custom_auth` is a very powerful, but also **advanced and potentially risky** feature. Implementing custom authentication logic requires strong Python development skills and a deep understanding of security principles. **You are fully responsible** for the security and correctness of your custom authentication module. Incorrectly implemented custom authentication can introduce serious security vulnerabilities. + +**Example YAML (Custom Authentication Module Path):** + +```yaml +general_settings: + custom_auth: "my_custom_auth_module.my_authentication_function" # Path to custom auth function + # ... other general_settings ... +``` + +In this example, the proxy is configured to use a custom authentication function named `my_authentication_function` located in the Python module `my_custom_auth_module.py`. + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Implementing a Custom Authentication Function/Method:** + +Your custom authentication module (e.g., `my_custom_auth_module.py`) must define a function or method that conforms to a specific interface. The exact interface and expected behavior will depend on what aspect of authentication you are customizing (e.g., request authentication, admin authentication). Refer to the LiteLLM Proxy documentation for detailed specifications on how to implement custom authentication modules. Typically, your custom function or method will: + +1. **Receive Request Information:** Receive information about the incoming API request (e.g., headers, body, path, etc.). +2. **Perform Authentication Logic:** Implement your custom authentication checks, such as: + * Validating custom tokens or credentials in request headers or body. + * Checking against an external authentication service or user database. + * Implementing custom authorization rules based on request parameters or user attributes. +3. **Return Authentication Result:** Return a boolean value (or raise an exception) to indicate whether the request is authenticated and authorized. +4. **Handle Errors:** Properly handle authentication failures, returning appropriate error responses or raising exceptions to inform the proxy about authentication issues. + +**Security Warning:** Implementing custom authentication logic requires careful attention to security best practices. 
**Ensure your custom authentication module is thoroughly tested, audited, and secured** to prevent vulnerabilities. Improperly implemented custom authentication can create significant security risks for your proxy and LLM applications. + +--- + +#### `allow_user_auth` (`general_settings`) + +**YAML Key:** `allow_user_auth` + +**Type:** Boolean + +**Environment Variable:** N/A + +**Default Value:** `false` (User authentication is disabled by default). + +**Description:** The `allow_user_auth` parameter, within `general_settings`, is marked as **deprecated**. It represents an **older, less robust approach to user authentication** in the LiteLLM Proxy. + +* **Deprecated Feature:** `allow_user_auth` is **not recommended for new deployments**. It is considered a legacy or deprecated setting and is likely to be removed in future versions. +* **Older Authentication Method:** This flag likely toggles an older authentication mechanism that might have allowed end-users to authenticate directly with provider API keys or some other less secure method. The exact behavior might be version-specific and is not well-documented in the current stable documentation. +* **Use Virtual Keys, JWT, or OAuth2 Instead:** For modern and secure authentication, you should use the recommended methods: + * **Virtual API Keys:** For managing and controlling access to the proxy for different users, teams, or projects. + * **JWT Authentication:** For enterprise-grade authentication and integration with SSO systems. + * **OAuth2.0 Proxy Authentication:** For integrating with OAuth2.0 identity providers. + +**Example YAML (Deprecated - Avoid Using):** + +```yaml +general_settings: + allow_user_auth: true # Deprecated: Avoid using allow_user_auth + # ... other general_settings ... +``` + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Recommendation:** **Do not use `allow_user_auth` in new deployments.** Leave it at its default `false` value. Instead, focus on implementing robust authentication using virtual API keys, JWT authentication (`enable_jwt_auth`), or OAuth2.0 proxy authentication (`enable_oauth2_proxy_auth`), which provide much more secure and manageable authentication solutions. If you encounter `allow_user_auth` in older configurations, it's recommended to migrate to one of the modern authentication methods for improved security and long-term maintainability. + +--- + +#### `custom_sso` (`general_settings`) + +**YAML Key:** `custom_sso` + +**Type:** String (Path to a Python module) + +**Environment Variable:** N/A + +**Default Value:** `None` (Default SSO logic is used). + +**Description:** The `custom_sso` parameter, within `general_settings`, is an **advanced setting** that allows you to provide a path to a **custom SSO (Single Sign-On) module** written in Python. This enables you to **replace the default SSO logic** of the proxy's Admin UI with your own custom SSO integration. + +* **Custom SSO Integration:** `custom_sso` is primarily relevant for **enterprise users** who need to integrate the LiteLLM Proxy Admin UI with their organization's existing SSO infrastructure. This is for scenarios where the default email/password or token-based login for the Admin UI is not sufficient, and you need to leverage your organization's centralized identity management system for admin access. +* **Python Module Path:** The value of `custom_sso` should be a **string** representing the **Python module path** to your custom SSO logic. 
This is typically in the format `"module.function_name"` or `"module.ClassName.method_name"`, pointing to a Python file and a function or method that implements your custom SSO handling.
+* **Advanced UI Customization:** `custom_sso` is an advanced customization option that requires Python development skills and a good understanding of SSO protocols (like SAML, OpenID Connect, etc.). It's used when you need to deeply customize the Admin UI's authentication flow and integrate it with a specific SSO provider.
+
+**Example YAML (Custom SSO Module Path):**
+
+```yaml
+general_settings:
+  custom_sso: "my_custom_sso_module.my_sso_login_handler" # Path to custom SSO login handler
+  # ... other general_settings ...
+```
+
+In this example, the proxy is configured to use a custom SSO login handler function named `my_sso_login_handler` located in the Python module `my_custom_sso_module.py`.
+
+**Example Environment Variable:** N/A (YAML-only parameter)
+
+**Implementing a Custom SSO Login Handler:**
+
+Your custom SSO module (e.g., `my_custom_sso_module.py`) must define a function or method that implements the logic for handling SSO login requests to the Admin UI. The exact interface and expected behavior will depend on the specific SSO protocol and your integration requirements. Refer to the LiteLLM Proxy documentation (especially enterprise-level documentation) for details on how to implement custom SSO handlers. Typically, your custom SSO handler will:
+
+1. **Handle Login Requests:** Intercept login requests to the Admin UI.
+2. **Initiate SSO Flow:** Redirect the user to your SSO provider's authentication endpoint.
+3. **Process SSO Response:** Handle the response from the SSO provider after successful authentication.
+4. **Establish Admin Session:** Create a session or token that authenticates the user as an admin within the LiteLLM Proxy Admin UI.
+
+**Security Warning:** Implementing custom SSO logic requires careful attention to security best practices. **Ensure your custom SSO module is thoroughly tested, audited, and secured** to prevent vulnerabilities. Improperly implemented SSO can create significant security risks for your Admin UI and proxy access. This is an advanced enterprise feature that should only be used by experienced developers with a strong understanding of SSO and security.
+
+---
+
+### Database & Persistence
+
+This subsection within `general_settings` deals with configuring the connection to the Postgres database, which is essential for key-managed mode, usage tracking, and other persistent data storage within the proxy.
+
+#### `database_url` (`general_settings`)
+
+**YAML Key:** `database_url`
+
+**Type:** String (Database connection URL)
+
+**Environment Variable:** `DATABASE_URL`
+
+**Default Value:** `None` (Proxy may use an in-memory SQLite database for very basic functionality if `database_url` is not set, but this is **not recommended for production** and may be limited in features).
+
+**Description:** The `database_url` parameter, within `general_settings`, specifies the **connection URL** for the **Postgres database** that the LiteLLM Proxy Server will use. Setting up a Postgres database and providing its connection URL via `database_url` is **essential for running the proxy in "key-managed" mode** and for enabling core features like:
+
+* **Virtual API Keys:** Storing and managing virtual API keys, including their permissions, rate limits, and budgets.
+* **Usage Logging:** Persistently logging all LLM requests, responses, token usage, costs, and other transaction details in the `spend_logs` database table. +* **Budgets and Rate Limits Enforcement:** Storing and enforcing user and team budgets and rate limits. +* **Admin UI Functionality:** Enabling the full functionality of the Admin UI for key management, user management, and reporting. + +* **Postgres Database Requirement:** LiteLLM Proxy **requires a Postgres database** for key-managed mode. It is designed to work with Postgres as its primary persistent data store. While a default in-memory SQLite database might be used if `database_url` is not provided, this is **not suitable for production** and will have limited features. +* **Connection URL Format:** The `database_url` string should be a valid **Postgres connection URL**, adhering to the standard URL format for Postgres database connections. It typically includes information like: + * `postgresql://`: Indicates the Postgres database dialect. + * `username`: The username for database authentication. + * `password`: The password for database authentication. + * `host`: The hostname or IP address of the Postgres server. + * `port`: The port number of the Postgres server (typically `5432`). + * `database_name`: The name of the Postgres database to connect to. + +**Example YAML (Postgres `database_url`):** + +```yaml +general_settings: + database_url: "postgresql://litellm_user:password@dbhost:5432/litellm_db" # Postgres connection URL + master_key: "sk-my_master_admin_key123" # Master key required for key-managed mode + # ... other general_settings ... +``` + +In this example, `database_url: "postgresql://litellm_user:password@dbhost:5432/litellm_db"` provides the connection details for a Postgres database. + +**Example Environment Variable:** + +```bash +export DATABASE_URL="postgresql://litellm_user:password@dbhost:5432/litellm_db" # Setting database URL via environment variable +``` + +**Security Note:** The `database_url` string often contains database credentials (username and password). **Never hardcode database credentials directly in your `config.yaml` file or store them in version control.** Always use environment variables (`DATABASE_URL`) or secure secret management practices to protect your database credentials. Compromising your database credentials can have serious security implications for your proxy and its data. + +**Recommendation:** **Always set a `database_url`** pointing to a properly configured Postgres database if you intend to use the LiteLLM Proxy in key-managed mode or want to leverage its full feature set, especially in production environments. For testing or very basic usage without key management, you might be able to run without `database_url`, but this is not recommended for production. + +--- + +#### `database_connection_pool_limit` + +**YAML Key:** `database_connection_pool_limit` + +**Type:** Integer + +**Environment Variable:** N/A + +**Default Value:** `100` (Maximum 100 connections in the pool if not set). + +**Description:** The `database_connection_pool_limit` parameter, within `general_settings`, sets the **maximum number of database connections** that the proxy's connection pool will maintain. Connection pooling is a technique to improve database performance and efficiency by reusing existing database connections instead of establishing a new connection for each request. + +* **Connection Pool Size:** `database_connection_pool_limit` controls the **maximum size of the connection pool**. 
This value determines the maximum number of concurrent database connections that the proxy can keep open and ready to use. +* **Performance Tuning:** Tuning the connection pool limit can be important for **performance optimization**, especially under high load. + * **Too Small Pool:** If the pool limit is too small, the proxy might run out of available connections under heavy load, leading to connection timeouts, increased latency, and potential request failures. + * **Too Large Pool:** If the pool limit is too large, it might consume excessive resources on both the proxy server and the database server, potentially impacting performance and scalability. +* **Concurrency and Throughput:** The optimal `database_connection_pool_limit` depends on the expected concurrency of your proxy workload and the capacity of your database server. A larger connection pool can generally handle higher concurrency, but you need to ensure your database server can support the increased number of connections. +* **Default Pool Size:** If `database_connection_pool_limit` is not set in your `config.yaml`, the proxy will typically use a **default pool size of 100**. This default is often suitable for many common workloads. + +**Example YAML (Setting Database Connection Pool Limit):** + +```yaml +general_settings: + database_url: "postgresql://litellm_user:password@dbhost:5432/litellm_db" + database_connection_pool_limit: 50 # Limiting database connection pool to 50 connections + # ... other general_settings ... +``` + +In this example, the maximum size of the database connection pool is limited to 50 connections. + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Recommendation:** For most deployments, the default `database_connection_pool_limit` of 100 is likely sufficient. You might consider **adjusting this value** if you are experiencing: + +* **Database Connection Timeouts:** If you see errors related to database connection timeouts under heavy load, it might indicate that your connection pool is too small, and you might need to increase `database_connection_pool_limit`. +* **Database Performance Issues:** If your database server is becoming overloaded or experiencing performance degradation, it might indicate that your connection pool is too large, and you might need to decrease `database_connection_pool_limit`. + +Monitor your proxy and database server performance under load to determine the optimal `database_connection_pool_limit` for your specific environment and workload. + +--- + +#### `database_connection_timeout` + +**YAML Key:** `database_connection_timeout` + +**Type:** Integer (representing seconds) + +**Environment Variable:** N/A + +**Default Value:** `60` (60 seconds, or 1 minute, if not set). + +**Description:** The `database_connection_timeout` parameter, within `general_settings`, sets the **timeout duration in seconds** for the proxy to **acquire a database connection from the connection pool**. + +* **Connection Acquisition Timeout:** When the proxy needs to interact with the database, it attempts to acquire a connection from the connection pool. If all connections in the pool are currently in use and no connection becomes available within the `database_connection_timeout` duration, the proxy will **timeout** and raise an error (typically a connection timeout error). 
+* **Preventing Indefinite Waiting:** Timeouts are essential to prevent the proxy from getting stuck indefinitely waiting for database connections if the connection pool is exhausted or the database server is unresponsive. +* **Resource Management:** Timeouts help manage resources and ensure that requests do not hang indefinitely, consuming proxy resources while waiting for database connections. +* **Default Timeout:** If `database_connection_timeout` is not set in your `config.yaml`, the proxy will use a **default timeout of 60 seconds (1 minute)**. This default is often a reasonable starting point. + +**Example YAML (Setting Database Connection Timeout):** + +```yaml +general_settings: + database_url: "postgresql://litellm_user:password@dbhost:5432/litellm_db" + database_connection_timeout: 30 # Setting database connection timeout to 30 seconds + # ... other general_settings ... +``` + +In this example, the database connection timeout is set to 30 seconds. If the proxy cannot acquire a connection from the pool within 30 seconds, it will timeout and raise an error. + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Recommendation:** The default `database_connection_timeout` of 60 seconds is often sufficient. You might consider **adjusting this value** if: + +* **You are experiencing Database Connection Timeout Errors Frequently:** If you see frequent errors related to database connection timeouts, especially under load, it might indicate that your connection pool is too small (consider increasing `database_connection_pool_limit`) or that your database server is slow or overloaded. You might try *increasing* `database_connection_timeout` slightly to allow more time for connection acquisition, but also investigate potential database performance issues. +* **You Need Faster Timeout for Faster Error Reporting:** If you want the proxy to fail faster in case of database connection issues, you can *decrease* `database_connection_timeout` to a smaller value. + +Monitor your proxy and database server logs for connection-related errors to determine if you need to adjust the `database_connection_timeout` value. + +--- + +#### `allow_requests_on_db_unavailable` + +**YAML Key:** `allow_requests_on_db_unavailable` + +**Type:** Boolean + +**Environment Variable:** N/A + +**Default Value:** `false` (Proxy will not allow requests if the database is unavailable by default). + +**Description:** The `allow_requests_on_db_unavailable` parameter, within `general_settings`, is a **boolean flag** that, when set to `true`, allows the proxy to **continue processing LLM API requests even if the Postgres database is unreachable or unavailable**. + +* **Database Dependency Relaxation:** Normally, if the proxy cannot connect to the database (e.g., database server is down, network issues), it might refuse to process requests because it relies on the database for critical functions like: + * Virtual API key validation and authentication. + * Rate limit and budget enforcement. + * Usage logging and cost tracking. +* **Private VPC Environments (Use with Caution):** Setting `allow_requests_on_db_unavailable: true` is **strongly discouraged in most production environments**, especially those that are publicly accessible or require robust security and accountability. However, it *might* be considered in **private VPC (Virtual Private Cloud) deployments** where: + * You have a high degree of trust in the network security within your VPC. + * You are running the proxy in a non-critical or testing environment. 
+ * You are willing to accept the risk of reduced security and accountability if the database becomes unavailable. + * You are primarily using static API keys (e.g., via environment variables) and are not relying on virtual keys or database-backed features. +* **Security and Accountability Trade-Offs:** Enabling `allow_requests_on_db_unavailable: true` **significantly reduces security and accountability**. When the database is unavailable: + * **Virtual Key Validation is Skipped:** The proxy cannot verify the validity or status of virtual API keys. Any API key presented to the proxy might be accepted, even if it's invalid or expired. + * **Rate Limits and Budgets are Not Enforced:** The proxy cannot retrieve or enforce rate limits or budgets stored in the database. Rate limiting and quota enforcement will effectively be disabled. + * **Usage Logging May Be Lost:** The proxy might not be able to write request logs to the database, potentially losing usage data during database outages. +* **Use Only in Specific, Controlled Scenarios:** **Only enable `allow_requests_on_db_unavailable: true` if you fully understand the security and accountability implications** and are running the proxy in a **private, controlled environment** where these risks are acceptable. **Never enable this in publicly accessible, production deployments where security and usage tracking are important.** + +**Example YAML (Allowing Requests on DB Unavailable - Use with Extreme Caution):** + +```yaml +general_settings: + allow_requests_on_db_unavailable: true # WARNING: Allowing requests even if database is unavailable (USE WITH EXTREME CAUTION!) + # ... other general_settings ... +``` + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Security Warning:** **Enabling `allow_requests_on_db_unavailable: true` is a significant security and accountability risk.** **Only use this in private, non-production, and highly controlled environments where you fully understand and accept the implications of running without database dependency.** In most typical production deployments, you should **leave this setting at its default `false` value** to ensure that the proxy properly validates API keys, enforces policies, and tracks usage, relying on the database for these critical functions. + +--- + +#### `disable_spend_logs` + +**YAML Key:** `disable_spend_logs` + +**Type:** Boolean + +**Environment Variable:** N/A + +**Default Value:** `false` (Spend logs are enabled by default). + +**Description:** The `disable_spend_logs` parameter, within `general_settings`, is a **boolean flag** that controls whether the proxy should **write detailed transaction logs to the database's `spend_logs` table**. + +* **Transaction Logging (Default):** By default (`disable_spend_logs: false`), the proxy will record **every LLM API request** in the `spend_logs` database table. These logs include detailed information about each transaction, such as: + * Timestamp of the request. + * Model name used. + * API key used (hashed or redacted based on `redact_user_api_key_info` setting). + * Input tokens, output tokens, and total tokens. + * Calculated cost of the request. + * User identifier (if provided). + * Request parameters and response metadata. +* **Disabling Transaction Logging:** Setting `disable_spend_logs: true` will **stop writing these detailed transaction logs to the database**. 
The proxy will still perform other functions (like routing requests, enforcing rate limits, etc.), but it will not persist detailed transaction records in the `spend_logs` table. +* **Reduced Database Load and Storage:** Disabling spend logs can reduce the load on your Postgres database and decrease the storage space used by the `spend_logs` table, which can be beneficial in very high-throughput scenarios or when database resources are limited. +* **Limited Reporting and Audit Trail:** If you disable spend logs, you will **lose the detailed transaction history** that is stored in the `spend_logs` table. This will impact your ability to: + * Generate granular spend reports and usage dashboards. + * Perform detailed usage analysis or cost optimization studies. + * Maintain a comprehensive audit trail of all LLM requests. + +**Example YAML:** + +```yaml +general_settings: + disable_spend_logs: true # Disabling writing detailed spend logs to the database + # ... other general_settings ... +``` + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Recommendation:** **Leave `disable_spend_logs: false` (the default) in most production environments** where you need detailed usage tracking, cost reporting, and audit trails. Only consider enabling `disable_spend_logs: true` in specific scenarios where: + +* You have extremely high request volumes and database write performance is a bottleneck. +* You are not using the LiteLLM Proxy's built-in reporting and usage tracking features and rely on external logging or monitoring systems. +* You have strict data retention policies and do not need to maintain a long-term history of every transaction in the database. + +--- + +#### `disable_adding_master_key_hash_to_db` + +**YAML Key:** `disable_adding_master_key_hash_to_db` + +**Type:** Boolean + +**Environment Variable:** N/A + +**Default Value:** `false` (Master key hash is added to the database by default). + +**Description:** The `disable_adding_master_key_hash_to_db` parameter, within `general_settings`, is a **boolean flag** that controls whether the proxy should store a **hashed version of the `master_key` in the database**. + +* **Master Key Hash for Spend Tracking:** Normally, when you configure a `master_key` for the proxy, the proxy will generate a secure hash of this master key and store it in the database. This hashed master key is used for internal tracking purposes, primarily to link administrative actions (like generating new API keys, viewing usage reports, etc.) to the identity of the master key user. +* **Reduced Security Information (Optional):** Setting `disable_adding_master_key_hash_to_db: true` will **prevent the proxy from storing the master key hash in the database**. This slightly reduces the amount of security-related information stored in the database. +* **Slightly Reduced Audit Insight:** If you disable storing the master key hash, you will lose the ability to directly link admin actions to the master key identity in the database logs. This might make audit trails slightly less informative in terms of identifying *who* performed administrative actions. +* **Minor Security Enhancement:** Disabling master key hash storage can be seen as a **minor security enhancement**, as it prevents even a hashed representation of the master key from being stored in the database, potentially reducing the attack surface in case of a database compromise (though the master key itself should *never* be stored in the database in plaintext or easily reversible form). 
+ +**Example YAML:** + +```yaml +general_settings: + disable_adding_master_key_hash_to_db: true # Disabling storing master key hash in the database for slightly enhanced security + # ... other general_settings ... +``` + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Recommendation:** In most typical deployments, leaving `disable_adding_master_key_hash_to_db: false` (the default) is acceptable. The security risk of storing a hashed master key in the database is generally low, and the benefit of having a slightly richer audit trail might be valuable. Only consider setting this to `true` if you have very strict security requirements or want to minimize the amount of potentially sensitive information stored in your database, even if it's just a hash. + +--- + +#### `store_model_in_db` + +**YAML Key:** `store_model_in_db` + +**Type:** Boolean + +**Environment Variable:** N/A + +**Default Value:** `false` (Model information is not stored in the database by default). + +**Description:** The `store_model_in_db` parameter, within `general_settings`, is an **enterprise-level boolean flag** that controls whether the proxy should **store model and credential information in the database**. + +* **Model Persistence in Database (Optional):** By default (`store_model_in_db: false`), the LiteLLM Proxy primarily relies on the `config.yaml` file as the **source of truth** for model configurations. Model definitions and credential settings are read from the `config.yaml` file when the proxy starts. +* **Database-Backed Model Management (Enterprise):** Setting `store_model_in_db: true` enables an alternative mode where the proxy will **persist model configurations and potentially credentials in the Postgres database**. This is typically used to support: + * **Admin UI Model Management:** Enabling the Admin UI to add, edit, or delete models directly through the UI interface, and have these changes persisted in the database. + * **Dynamic Model Updates:** Allowing for dynamic updates to model configurations without requiring restarts of the proxy server. + * **Multi-Instance Synchronization:** In multi-instance proxy deployments, storing model info in a shared database can help synchronize model configurations across all proxy instances. +* **Enterprise Feature:** `store_model_in_db: true` is primarily an **enterprise feature** used in more advanced deployments that require dynamic model management or UI-based configuration. +* **Configuration Source of Truth Shift:** When `store_model_in_db: true`, the database becomes a more active part of the configuration management process. While `config.yaml` might still be used for initial setup and general settings, model definitions can be managed and updated via the database and Admin UI. + +**Example YAML (Enabling Database-Backed Model Storage):** + +```yaml +general_settings: + store_model_in_db: true # Enterprise Feature: Enabling model storage in database + # ... database_url and other general_settings ... + master_key: "sk-my_master_admin_key123" # Master key required for key-managed mode + # ... other general_settings ... +``` + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Note:** `store_model_in_db: true` is typically used in enterprise environments where dynamic model management via the Admin UI or API is desired. For simpler deployments where model configurations are primarily managed via `config.yaml` and restarts are acceptable for configuration changes, you can leave this setting at its default `false` value. 
Enabling `store_model_in_db: true` implies that you will be managing models through the database and Admin UI (or API) rather than primarily through direct editing of `config.yaml`. + +--- + +#### `store_prompts_in_spend_logs` + +**YAML Key:** `store_prompts_in_spend_logs` + +**Type:** Boolean + +**Environment Variable:** N/A + +**Default Value:** `false` (Prompt and response content are not stored in spend logs by default). + +**Description:** The `store_prompts_in_spend_logs` parameter, within `general_settings`, is a **boolean flag** that controls whether the proxy should **store the actual prompt and response content** (the text of messages) in the `spend_logs` database table. + +* **Default: Metadata-Only Logging:** By default (`store_prompts_in_spend_logs: false`), the `spend_logs` table in the database only stores *metadata* about each request, such as token counts, costs, timestamps, model names, user identifiers, and other operational data. The actual text content of the prompts and responses is **not stored** in the database logs by default. This is often the preferred setting for privacy and to reduce database storage requirements. +* **Full Conversation Logging (Optional):** Setting `store_prompts_in_spend_logs: true` will instruct the proxy to also **store the full text of the prompts and responses** in the `spend_logs` table for every transaction. +* **Use Cases for Storing Prompts:** You might enable `store_prompts_in_spend_logs: true` in scenarios where you need to: + * **Detailed Audit Trails:** Maintain a complete audit trail of all conversations, including the exact prompts and responses exchanged with LLMs. This might be required for compliance, regulatory, or internal policy reasons. + * **In-Depth Analysis of Conversations:** Enable more detailed analysis of user interactions, conversation flows, and the content of prompts and responses. + * **Quality Monitoring and Improvement:** Review and analyze actual conversations to identify areas for prompt improvement, model performance evaluation, or content quality assessment. +* **Privacy and Storage Considerations:** Storing full prompts and responses in the database has significant **privacy and storage implications**. + * **Increased Data Volume:** The `spend_logs` table will grow much larger, potentially requiring more database storage space and impacting database performance, especially with high request volumes. + * **Privacy Risks:** Storing conversation text in the database increases the risk of sensitive or confidential information being stored in the database logs. You must carefully consider data privacy and security implications and ensure appropriate security measures are in place to protect this data. + +**Example YAML:** + +```yaml +general_settings: + store_prompts_in_spend_logs: true # Enabling storing full prompts and responses in database spend logs (increased data volume and privacy considerations!) + # ... other general_settings ... +``` + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Security and Privacy Warning:** **Exercise extreme caution** when enabling `store_prompts_in_spend_logs: true`. Carefully consider the privacy and security implications of storing full conversation text in your database. Ensure you have appropriate data security and privacy policies in place to protect this potentially sensitive data. 
In most production scenarios, especially those handling sensitive user data, it is often **recommended to leave `store_prompts_in_spend_logs: false`** (the default) to avoid storing message content in the database and minimize privacy risks. If you need to log conversation content for audit or analysis, consider using dedicated logging integrations (like Langfuse, Helicone, etc.) that are designed for secure and privacy-aware logging practices. + +--- + +#### `disable_prisma_schema_update` + +**YAML Key:** `disable_prisma_schema_update` + +**Type:** Boolean + +**Environment Variable:** N/A + +**Default Value:** `false` (Automatic database schema migrations are enabled by default). + +**Description:** The `disable_prisma_schema_update` parameter, within `general_settings`, is an **advanced boolean flag** that controls whether the proxy should **automatically run database schema migrations** on startup. + +* **Prisma Schema Migrations (Default):** LiteLLM Proxy uses Prisma, an ORM (Object-Relational Mapper), to manage its database schema (tables, columns, indexes, etc.). By default (`disable_prisma_schema_update: false`), when the proxy starts, it will use Prisma to **automatically check if the database schema is up-to-date**. If the schema is out of date (e.g., due to code changes in a new proxy version that introduce database schema modifications), Prisma will automatically apply the necessary **database schema migrations** to bring the database schema up to the latest version. This ensures that the database schema is always compatible with the proxy code. +* **Manual Schema Migrations (Advanced):** Setting `disable_prisma_schema_update: true` **disables these automatic schema migrations**. In this case, the proxy will **not attempt to update the database schema on startup**. You would then be responsible for managing database schema migrations **manually**, typically using Prisma's command-line tools or other database migration mechanisms. +* **Advanced Users and Controlled Environments:** Disabling automatic schema updates is primarily relevant for **advanced users** and **controlled production environments** where database migrations are managed as part of a separate, more controlled DevOps or database administration process. Reasons to disable automatic schema updates might include: + * **Database Migration Control:** You want to have explicit control over when and how database schema migrations are applied, perhaps as part of a planned deployment pipeline. + * **Read-Only Database User:** If the database user that the proxy uses to connect to the database has **read-only permissions** (for security reasons), the proxy will not be able to apply schema migrations anyway. In such cases, you *must* disable automatic schema updates and manage migrations externally by a database administrator with appropriate permissions. + * **Schema Migration Pipelines:** You have a dedicated DevOps pipeline or database migration system that handles schema changes, and you do not want the proxy to attempt automatic migrations on startup. + +**Example YAML (Disabling Automatic Schema Updates):** + +```yaml +general_settings: + disable_prisma_schema_update: true # Advanced: Disabling automatic database schema migrations + # ... database_url and other general_settings ... 
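+  # Note (workflow suggestion, not prescribed by LiteLLM): with automatic updates disabled, the
+  # schema must be brought up to date manually before each proxy upgrade - for example by running
+  # Prisma's migration tooling (`prisma migrate deploy`) against this same database from your
+  # deployment pipeline or by a DBA with write permissions.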
+``` + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Advanced Usage Warning:** Disabling automatic schema updates (`disable_prisma_schema_update: true`) is an **advanced configuration** and should only be done if you fully understand the implications and have a plan for **manually managing database schema migrations**. If you disable automatic schema updates and the database schema is not compatible with the proxy code, the proxy may not function correctly, or you might encounter database-related errors. In most typical deployments, especially for less experienced users or in development/testing environments, it's **recommended to leave `disable_prisma_schema_update: false`** (the default) and allow the proxy to handle database schema migrations automatically on startup, ensuring database compatibility. If you choose to disable automatic updates, ensure you have a robust manual schema migration process in place and that database administrators are responsible for applying schema changes whenever you update the proxy version. + +--- + +#### `proxy_batch_write_at` + +**YAML Key:** `proxy_batch_write_at` + +**Type:** Integer (representing seconds) + +**Environment Variable:** N/A + +**Default Value:** `10` (10 seconds batch write interval if not set). + +**Description:** The `proxy_batch_write_at` parameter, within `general_settings`, controls the **frequency of batch-writing spend logs to the database**. Instead of writing each transaction log entry to the database immediately after each request, the proxy can **batch multiple log entries together** and insert them into the database in a single operation. This batching can improve database write performance, especially under high request loads. + +* **Batching for Performance:** Database write operations can be relatively slow compared to in-memory operations. Batching multiple writes together reduces the overhead of individual database operations and can improve overall proxy performance, especially in high-throughput scenarios. +* **Time-Based Batching:** `proxy_batch_write_at` specifies the **time interval in seconds** that the proxy will wait before batch-writing accumulated spend logs to the database. +* **Flush Frequency:** For example, if you set `proxy_batch_write_at: 5`, the proxy will attempt to write accumulated spend logs to the database every 5 seconds (or when the batch reaches a certain size, whichever comes first). +* **Trade-off: Real-Time Logging vs. Performance:** + * **Lower `proxy_batch_write_at` (e.g., 1-5 seconds):** Results in **more frequent database writes** and logs that are closer to real-time. This might be desirable if you need very up-to-date spend logs or real-time monitoring. However, it can increase database load and potentially reduce overall proxy performance under very high load. + * **Higher `proxy_batch_write_at` (e.g., 10-30 seconds or more):** Reduces the frequency of database writes, **improving database write performance** and potentially increasing overall proxy throughput. However, spend logs will be written in batches and might be slightly less real-time. +* **Default Batch Interval:** The default `proxy_batch_write_at` value is `10 seconds`. This is generally a good balance between performance and reasonably timely logging for many use cases. 
+ +**Example YAML (Setting Batch Write Interval):** + +```yaml +general_settings: + database_url: "postgresql://litellm_user:password@dbhost:5432/litellm_db" + proxy_batch_write_at: 5 # Setting batch write interval to 5 seconds (more frequent writes) + # ... other general_settings ... +``` + +In this example, the proxy will attempt to batch-write spend logs to the database every 5 seconds. + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Recommendation:** The default `proxy_batch_write_at` of 10 seconds is often a good starting point. You might consider **adjusting this value** if: + +* **Database Write Performance Bottleneck:** If database write operations are becoming a bottleneck under high load, you can try *increasing* `proxy_batch_write_at` to reduce write frequency. +* **Near Real-Time Logging Requirement:** If you need spend logs to be as close to real-time as possible, you can *decrease* `proxy_batch_write_at` to a lower value (e.g., 1-2 seconds), but be mindful of the potential impact on database load under high concurrency. + +Monitor your proxy and database performance to determine the optimal `proxy_batch_write_at` value for your specific workload and performance goals. + +--- + +#### `proxy_budget_rescheduler_min_time` + +**YAML Key:** `proxy_budget_rescheduler_min_time` + +**Type:** Integer (representing seconds) + +**Environment Variable:** N/A + +**Default Value:** `597` (seconds, approximately 9 minutes and 57 seconds). + +**Description:** The `proxy_budget_rescheduler_min_time` parameter, within `general_settings`, is part of a pair of settings (`proxy_budget_rescheduler_min_time` and `proxy_budget_rescheduler_max_time`) that control the **frequency at which the proxy checks for budget resets**. Budget resets are relevant for features like monthly or weekly budgets for API keys, users, or teams. `proxy_budget_rescheduler_min_time` specifies the **minimum time in seconds** before the proxy should schedule a budget reset check. + +* **Budget Rescheduling Task:** The proxy server periodically runs a background task to check for and apply budget resets. This task is responsible for resetting monthly or weekly budgets at the appropriate time (e.g., at the beginning of each month or week). +* **Randomized Reschedule Interval:** To avoid all proxy instances performing budget resets at the exact same time (which could cause a "thundering herd" effect on the database), the proxy uses a **randomized reschedule interval**. It will schedule the budget reset check at a random time between `proxy_budget_rescheduler_min_time` and `proxy_budget_rescheduler_max_time`. +* **Minimum Reschedule Time:** `proxy_budget_rescheduler_min_time` sets the **lower bound** of this random interval. The proxy will not schedule a budget reset check sooner than this many seconds after the previous check. +* **Default Minimum Time:** The default value of `597` seconds (approximately 9 minutes and 57 seconds) is part of a default ~10-minute interval. + +**Example YAML (Adjusting Budget Reschedule Interval):** + +```yaml +general_settings: + database_url: "postgresql://litellm_user:password@dbhost:5432/litellm_db" + proxy_budget_rescheduler_min_time: 300 # Setting minimum budget reschedule time to 300 seconds (5 minutes) + proxy_budget_rescheduler_max_time: 305 # Setting maximum budget reschedule time to 305 seconds (5 minutes and 5 seconds) + # ... other general_settings ... 
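+  # The reset check fires at a random point inside the [min, max] window (here 300-305 seconds),
+  # so multiple proxy instances sharing this database do not all run budget resets at the same moment.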
+``` + +In this example, the budget rescheduler will run approximately every 5 minutes (randomly between 300 and 305 seconds). + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Recommendation:** For most typical deployments, the **default values** for `proxy_budget_rescheduler_min_time` and `proxy_budget_rescheduler_max_time` (around 10 minutes) are generally suitable. You typically do not need to change these unless you have very specific requirements for budget reset frequency. If you want to force more frequent budget checks (e.g., for testing purposes), you can set both `proxy_budget_rescheduler_min_time` and `proxy_budget_rescheduler_max_time` to a lower value (e.g., 1 second each, as shown in some documentation examples for testing). However, for production, it's best to leave them at their default ~10-minute interval to minimize database load from frequent budget checks. + +--- + +#### `proxy_budget_rescheduler_max_time` + +**YAML Key:** `proxy_budget_rescheduler_max_time` + +**Type:** Integer (representing seconds) + +**Environment Variable:** N/A + +**Default Value:** `605` (seconds, approximately 10 minutes and 5 seconds). + +**Description:** The `proxy_budget_rescheduler_max_time` parameter, within `general_settings`, is the second part of the pair of settings controlling budget reset frequency. It specifies the **maximum time in seconds** before the proxy should schedule a budget reset check. + +* **Maximum Reschedule Time:** `proxy_budget_rescheduler_max_time` sets the **upper bound** of the random interval for scheduling budget reset checks. The proxy will pick a random time between `proxy_budget_rescheduler_min_time` and `proxy_budget_rescheduler_max_time` to perform the budget reset check. +* **Randomized Interval Range:** The range between `proxy_budget_rescheduler_min_time` and `proxy_budget_rescheduler_max_time` defines the **randomized interval** within which the proxy will schedule budget resets. This randomization helps to distribute the load of budget checks over time and avoid simultaneous spikes in database activity across multiple proxy instances in a load-balanced setup. +* **Default Maximum Time:** The default value of `605` seconds (approximately 10 minutes and 5 seconds), combined with the default `proxy_budget_rescheduler_min_time` of 597 seconds, creates a default budget reset check interval of roughly 10 minutes (randomly between 597 and 605 seconds). + +**Example YAML (Adjusting Budget Reschedule Interval - Same as `proxy_budget_rescheduler_min_time` example):** + +```yaml +general_settings: + database_url: "postgresql://litellm_user:password@dbhost:5432/litellm_db" + proxy_budget_rescheduler_min_time: 300 # Setting minimum budget reschedule time to 300 seconds (5 minutes) + proxy_budget_rescheduler_max_time: 305 # Setting maximum budget reschedule time to 305 seconds (5 minutes and 5 seconds) + # ... other general_settings ... +``` + +In this example, the budget rescheduler will run approximately every 5 minutes, at a random time within a 5-second window (between 300 and 305 seconds). + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Recommendation:** As with `proxy_budget_rescheduler_min_time`, the **default values** (around 10 minutes) are generally recommended for `proxy_budget_rescheduler_max_time` in most production deployments. Adjust these values only if you have specific needs for budget reset frequency. 
If you want to test budget resets more frequently, you can set both `proxy_budget_rescheduler_min_time` and `proxy_budget_rescheduler_max_time` to a low value (e.g., 1 second each), but remember to revert to the default ~10-minute interval for production use to avoid unnecessary database load. + +--- + +### Key Management & Encryption + +This subsection of `general_settings` allows you to configure external Key Management Systems (KMS) for enhanced security of your API keys and other sensitive data, moving beyond the default local encryption. + +#### `key_management_system` (`general_settings`) + +**YAML Key:** `key_management_system` + +**Type:** String + +**Environment Variable:** N/A + +**Default Value:** If not set, the proxy uses its built-in local encryption with the master key. + +**Description:** The `key_management_system` parameter, within `general_settings`, allows you to specify an **external Key Management System (KMS)** that the proxy should use for managing encryption keys, instead of its default built-in encryption mechanism. + +* **External Key Management for Enhanced Security:** Integrating with a dedicated KMS like AWS KMS, Azure Key Vault, or Google Cloud KMS can significantly enhance the security and manageability of your encryption keys, especially in enterprise environments. KMS systems are designed for secure key storage, rotation, access control, and auditing. +* **Supported KMS Options:** Currently, LiteLLM Proxy supports integration with: + * `"google_kms"`: Google Cloud KMS (Key Management Service) + * `"azure_kms"`: Azure Key Vault + * *(Potentially "aws_kms" in some versions, although the documentation snippet doesn't explicitly list it as a YAML option, but mentions `USE_AWS_KMS` environment variable and `key_management_settings` for AWS. Verify in the full documentation for your version.)* +* **Built-in Encryption (Default):** If `key_management_system` is **not set**, the proxy will use its **built-in local encryption** mechanism. This default encryption uses the `master_key` and an internal salt to encrypt sensitive data (like virtual API keys) stored in the database. While the built-in encryption provides basic security, it might not meet the stringent security requirements of some enterprise environments. +* **Enterprise Feature:** KMS integration is primarily considered an **enterprise-level feature** for organizations with strict security and compliance requirements that mandate the use of dedicated key management systems. + +**Example YAML (Using Google Cloud KMS):** + +```yaml +general_settings: + key_management_system: "google_kms" # Enterprise Feature: Using Google Cloud KMS for key management + key_management_settings: # Required: KMS-specific settings + - project_id: "my-gcp-project" + location_id: "global" + key_ring_id: "litellm-keys" + key_id: "master-key" + # ... database_url and other general_settings ... + master_key: "sk-my_master_admin_key123" # Master key still needed, but managed by KMS +``` + +In this example, `key_management_system: "google_kms"` instructs the proxy to use Google Cloud KMS for key management. The `key_management_settings` section provides the necessary configuration details for accessing the specified Google Cloud KMS key. + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Note:** When you set `key_management_system` to an external KMS, the `master_key` parameter is still required in `general_settings`. 
However, instead of being used for direct encryption, the `master_key` in this case might be used as a *reference* or to bootstrap the KMS integration. The actual encryption and decryption operations will be delegated to the configured KMS. Refer to the documentation for the specific KMS integration (Google KMS, Azure Key Vault) for detailed configuration steps and credential requirements. KMS integration typically involves setting up appropriate permissions and roles for the proxy to access and use the KMS service. + +--- + +#### `key_management_settings` (`general_settings`) + +**YAML Key:** `key_management_settings` + +**Type:** List of Objects (List of mappings) + +**Environment Variable:** N/A + +**Default Value:** N/A (Required if `key_management_system` is set) + +**Description:** The `key_management_settings` parameter, within `general_settings`, is a **required list of objects** (mappings) when you have enabled external key management by setting `key_management_system` to a value like `"google_kms"` or `"azure_kms"`. `key_management_settings` provides the **configuration details that are specific to the chosen Key Management System**. + +* **KMS-Specific Configuration:** The structure and required keys within `key_management_settings` **vary depending on the `key_management_system`** you have selected. You must consult the LiteLLM Proxy documentation for the specific KMS integration (Google KMS, Azure Key Vault) to determine the correct configuration parameters for your chosen KMS. +* **List of Configuration Objects:** `key_management_settings` is a *list*. While typically you might only need one configuration object in the list for a single KMS integration, the list structure allows for potential future extensibility or more complex KMS setups. Currently, you would usually have a list containing a single configuration object. +* **Example: Google KMS Settings:** For `key_management_system: "google_kms"`, the `key_management_settings` object typically includes: + * `project_id`: (String): Your Google Cloud Project ID. + * `location_id`: (String): The location (region) of your Google Cloud KMS key (e.g., `"global"`, `"us-central1"`). + * `key_ring_id`: (String): The ID of the Google Cloud KMS Key Ring. + * `key_id`: (String): The ID of the Google Cloud KMS Key within the Key Ring. + +**Example YAML (Google KMS `key_management_settings`):** + +```yaml +general_settings: + key_management_system: "google_kms" # Enterprise Feature: Using Google Cloud KMS + key_management_settings: # Required: KMS configuration settings + - project_id: "my-gcp-project" # Google Cloud Project ID + location_id: "global" # KMS Key location (region) + key_ring_id: "litellm-keys" # KMS Key Ring ID + key_id: "master-key" # KMS Key ID + # ... database_url and other general_settings ... + master_key: "sk-my_master_admin_key123" # Master key still needed, but managed by KMS +``` + +In this example, `key_management_settings` provides the Google Cloud KMS-specific parameters needed to access the KMS key named `"master-key"` within the Key Ring `"litellm-keys"` in the `"global"` location of the Google Cloud Project `"my-gcp-project"`. + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Note:** The specific keys and values within `key_management_settings` are **highly dependent on the `key_management_system` type**. Always consult the documentation for the specific KMS integration you are using (Google KMS, Azure Key Vault, etc.) to understand the required and optional configuration parameters. 
Incorrectly configured `key_management_settings` will prevent the proxy from using the external KMS for encryption, and may lead to startup errors or security vulnerabilities. + +--- + +#### `use_azure_key_vault` (`general_settings`) + +**YAML Key:** `use_azure_key_vault` + +**Type:** Boolean + +**Environment Variable:** N/A + +**Default Value:** `false` (Azure Key Vault integration is disabled by default). + +**Description:** The `use_azure_key_vault` parameter, within `general_settings`, is an **enterprise-level boolean flag** that provides a **simplified way to enable Azure Key Vault integration** for key and secret management. + +* **Simplified Azure Key Vault Enablement:** Setting `use_azure_key_vault: true` is a shortcut to enable Azure Key Vault integration. It is functionally equivalent to setting `key_management_system: "azure_kms"`, but it may simplify configuration in some Azure-centric deployments. +* **Azure Key Vault for Secret Management:** When enabled, the proxy will attempt to use Azure Key Vault to manage secrets, including: + * Loading API keys (like provider API keys, virtual keys, etc.) from Azure Key Vault instead of relying solely on environment variables or the database. + * Storing newly generated API keys and secrets in Azure Key Vault. + * Potentially using Azure Key Vault to encrypt the proxy's master key. +* **Environment Variable Configuration:** When `use_azure_key_vault: true`, you will typically need to configure **Azure-specific environment variables** to provide the proxy with credentials to access your Azure Key Vault. These environment variables might include: + * `AZURE_KEY_VAULT_URI`: The URI of your Azure Key Vault. + * `AZURE_CLIENT_ID`, `AZURE_CLIENT_SECRET`, `AZURE_TENANT_ID`: Azure AD application credentials for authenticating to Azure Key Vault (or other authentication methods supported by Azure SDK, like Managed Identity). Refer to Azure Key Vault documentation for authentication options. +* **Enterprise Feature:** Azure Key Vault integration is primarily an **enterprise feature** for organizations using Azure and wanting to leverage Azure Key Vault for secure secret management. + +**Example YAML (Enabling Azure Key Vault Integration):** + +```yaml +general_settings: + use_azure_key_vault: true # Enterprise Feature: Enabling Azure Key Vault integration (simplified) + # ... database_url and other general_settings ... + master_key: "sk-my_master_admin_key123" # Master key still needed, but managed in Azure Key Vault +``` + +**Example Environment Variable:** + +```bash +export AZURE_KEY_VAULT_URI="https://my-key-vault.vault.azure.net/" # Setting Azure Key Vault URI +export AZURE_CLIENT_ID="your-azure-client-id" # Setting Azure AD application credentials +export AZURE_CLIENT_SECRET="your-azure-client-secret" +``` + +--- + +#### `use_google_kms` (`general_settings`) + +**YAML Key:** `use_google_kms` + +**Type:** Boolean + +**Environment Variable:** N/A + +**Default Value:** `false` (Google Cloud KMS integration is disabled by default). + +**Description:** The `use_google_kms` parameter, within `general_settings`, is an **enterprise-level boolean flag** that provides a **simplified way to enable Google Cloud KMS (Key Management Service) integration** for encryption key management. + +* **Simplified Google Cloud KMS Enablement:** Setting `use_google_kms: true` is a shortcut for enabling Google Cloud KMS integration.
It is functionally equivalent to setting `key_management_system: "google_kms"`, offering a more direct toggle for GCP-centric deployments. +* **Google Cloud KMS for Key Security:** When enabled, the proxy will attempt to use Google Cloud KMS to encrypt and manage encryption keys, providing enhanced security and compliance by leveraging GCP's dedicated key management service. +* **Google Cloud Environment Requirement:** Using `use_google_kms: true` implies that your LiteLLM Proxy is deployed within a **Google Cloud environment** and has the necessary **Google Cloud credentials** configured to access your Google Cloud KMS project and keys. These credentials are typically provided via environment variables like `GOOGLE_APPLICATION_CREDENTIALS` or by running the proxy in a GCP environment with an associated service account that has KMS permissions. + +**Example YAML (Enabling Google Cloud KMS Integration):** + +```yaml +general_settings: + use_google_kms: true # Enterprise Feature: Enabling Google Cloud KMS integration (simplified) + # ... database_url and other general_settings ... + master_key: "sk-my_master_admin_key123" # Master key still needed, but managed in Google Cloud KMS +``` + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Note:** `use_google_kms: true` is a convenience toggle for enabling Google Cloud KMS integration. For more detailed configuration of Google Cloud KMS, including specifying the KMS key location, key ring, and key ID, you should use the `key_management_settings` parameter in conjunction with `key_management_system: "google_kms"`. `use_google_kms: true` might use default or pre-configured KMS settings in some cases (check full documentation for your version for exact behavior). It is an **enterprise feature**. + +--- + +#### `default_team_disabled` + +**YAML Key:** `default_team_disabled` + +**Type:** Boolean + +**Environment Variable:** N/A + +**Default Value:** `false` (Users can create personal API keys by default). + +**Description:** The `default_team_disabled` parameter, within `general_settings`, is an **enterprise-level boolean flag** that controls whether users are allowed to create **personal API keys** (API keys that are *not* associated with a team). + +* **Team-Centric Key Management (Enterprise):** In LiteLLM Proxy's enterprise features, API keys can be associated with "teams" (representing organizations, departments, or projects) or can be "personal" keys (not tied to a specific team). `default_team_disabled: true` enforces a **team-centric key management policy**. +* **Disabling Personal Keys:** When `default_team_disabled: true`, the proxy will **prevent users from creating personal API keys**. Any attempt to generate a new API key that is not explicitly associated with a team will be rejected by the proxy. +* **Force Team Association:** This setting ensures that **all API keys are always associated with a team**. This is useful in enterprise setups where you want to enforce organizational control and ensure that all LLM usage is attributed to and managed within the context of a team or project. +* **Usage Tracking and Cost Control:** Forcing team-based keys can improve usage tracking and cost control at the team level, as all API key usage will be directly linked to a team, facilitating team-based reporting and budgeting. 
+* **Default Personal Keys Allowed (If Not Set):** By default (`default_team_disabled: false`), users *can* create both team API keys (if they are members of a team and have permissions) and personal API keys (which are not associated with any team). + +**Example YAML (Disabling Personal API Keys):** + +```yaml +general_settings: + default_team_disabled: true # Enterprise Feature: Disabling creation of personal API keys + # ... other general_settings ... +``` + +In this configuration, if a user attempts to generate a new API key without specifying a team association, the proxy will reject the request. Only team API keys can be created. + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Note:** `default_team_disabled: true` is primarily used in **enterprise or multi-tenant deployments** where you want to enforce strict organizational control over API key usage and ensure that all usage is tracked and managed at the team level. In simpler setups or development environments, you can typically leave this setting at its default `false` value to allow users to create both personal and team API keys. + +--- + +#### `custom_key_generate` (`general_settings`) + +**YAML Key:** `custom_key_generate` + +**Type:** String (Path to a Python function or method) + +**Environment Variable:** N/A + +**Default Value:** `None` (Default key generation logic is used). + +**Description:** The `custom_key_generate` parameter, within `general_settings`, is an **advanced setting** that allows you to specify a **custom Python function or method** to be used for **generating API keys (virtual keys)** within the proxy. + +* **Override Default Key Generation:** By default, when you use the proxy's API or Admin UI to generate a new API key, LiteLLM Proxy uses its built-in logic to create a random, secure API key string (typically resembling OpenAI keys, starting with `sk-`). `custom_key_generate` lets you **replace this default key generation process** with your own custom logic. +* **Custom Key Formats or Logic:** You might want to use `custom_key_generate` to: + * Generate API keys in a **different format** than the default (e.g., keys that are more human-readable, or keys that follow a specific internal format). + * Integrate with an **external key issuance system** or key management service. + * Implement **custom key generation algorithms** or logic that are specific to your organization's requirements. +* **Python Function or Method Path:** The value of `custom_key_generate` should be a **string** representing the **Python module path** to your custom key generation function or method. This is typically in the format `"module.function_name"` or `"module.ClassName.method_name"`, pointing to a Python file and the function or method within that file that implements your custom key generation logic. +* **Advanced Usage – Custom Key Generation:** `custom_key_generate` is an **advanced usage scenario** for organizations that require highly customized API key generation processes. For most typical deployments, the default key generation logic is sufficient. + +**Example YAML (Custom Key Generation Function Path):** + +```yaml +general_settings: + custom_key_generate: "my_custom_keygen_module.generate_custom_api_key" # Path to custom key generation function + master_key: "sk-my_master_admin_key123" # Master key still needed for key-managed mode + # ... other general_settings ... 
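+  # Assumptions for this illustration: `my_custom_keygen_module` must be importable by the proxy
+  # process (e.g. placed alongside config.yaml or on PYTHONPATH), and generate_custom_api_key
+  # must return the new key as a string.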
+``` + +In this example, the proxy is configured to use a custom function named `generate_custom_api_key` located in the Python module `my_custom_keygen_module.py` whenever a new API key needs to be generated. + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Implementing a Custom Key Generation Function/Method:** + +Your custom key generation module (e.g., `my_custom_keygen_module.py`) must define a function or method that conforms to a specific interface. Typically, your custom function or method should: + +1. **Accept No Arguments (or specific arguments as documented):** The function or method might be expected to accept no arguments, or it might be passed specific arguments by the proxy depending on the version and documentation. Refer to the LiteLLM Proxy documentation for the exact function signature expected for custom key generation. +2. **Generate a New API Key String:** Implement your custom logic to generate a new API key string. This could involve: + * Generating a random string in a specific format. + * Fetching a key from an external key management system. + * Using a custom key generation algorithm. +3. **Return the API Key String:** The function or method **must return a string** value, which will be used as the new API key. + +**Security Warning:** Implementing custom key generation requires careful attention to security best practices. **Ensure your custom key generation logic produces strong, secure, and unpredictable API keys.** If your custom logic introduces weak or predictable key generation, it could create significant security vulnerabilities for your proxy and LLM applications. **Thoroughly test and audit** your custom key generation module to ensure it meets your security requirements. + +--- + +#### Encryption salt (environment variable) + +**YAML Key:** N/A (Configured via environment variable, not in `config.yaml`) + +**Type:** String (Secret, high-entropy string, 32+ bytes recommended) + +**Environment Variable:** `LITELLM_SALT_KEY` + +**Default Value:** If not set via environment variable, a randomly generated salt might be used internally by LiteLLM, but this is **not recommended for production** as the salt might change across proxy restarts, potentially invalidating encryption. + +**Description:** `LITELLM_SALT_KEY` is an **environment variable** (not a `config.yaml` parameter) that allows you to provide a **custom encryption salt** for the LiteLLM Proxy. + +* **Encryption Salt for Data Security:** LiteLLM Proxy uses encryption to protect sensitive data, such as API keys, stored in the database. The encryption process typically involves a combination of the `master_key` (configured in `config.yaml`) and an **encryption salt**. A salt is a random value that is added to the data before it is encrypted. Using a salt makes the encryption more robust against certain types of attacks, such as rainbow table attacks. +* **Custom Salt for Consistency and Security:** By setting the `LITELLM_SALT_KEY` environment variable, you provide a **custom salt** that the proxy will use for encryption. **It is highly recommended to set a custom, stable `LITELLM_SALT_KEY` in production environments** to ensure: + * **Consistent Encryption:** Using the same salt across proxy restarts ensures that data encrypted by the proxy can be decrypted correctly even after restarts. If you do not set `LITELLM_SALT_KEY`, the proxy might generate a *new* random salt each time it starts. 
If the salt changes, data encrypted with the old salt might not be decryptable, potentially leading to data loss or inconsistencies. + * **Enhanced Security:** While LiteLLM may generate a random salt if you don't provide one, setting your own `LITELLM_SALT_KEY` gives you more control over the encryption process. Choose a **strong, high-entropy salt** (32+ bytes recommended) for optimal security. +* **Environment Variable Configuration Only:** `LITELLM_SALT_KEY` **must be set as an environment variable**. There is no corresponding YAML parameter in `config.yaml` for setting the salt. +* **Secret and Confidential:** The `LITELLM_SALT_KEY` is a **security-sensitive secret**. Protect it as carefully as you protect your `master_key` and API keys. Do not expose it in logs, configuration files (except environment variable settings), or version control. + +**Example Environment Variable Setting (Setting `LITELLM_SALT_KEY`):** + +```bash +export LITELLM_SALT_KEY="YOUR_VERY_LONG_RANDOM_SALT_KEY_32_BYTES_OR_MORE" # Setting a custom encryption salt via environment variable +``` + +**Security Warning:** The `LITELLM_SALT_KEY` is a **critical security credential**. **Protect it carefully.** Choose a strong, high-entropy salt (32+ bytes or more is recommended). **Never hardcode the salt directly in your `config.yaml` file or store it in version control.** Always use secure environment variable management or secret management systems to handle `LITELLM_SALT_KEY`. Compromising the salt, in combination with a compromised `master_key`, could potentially weaken the encryption of your data. + +--- + +### Rate Limiting & Quotas + +This subsection within `general_settings` lets you configure global rate limits and concurrency controls for the entire proxy, preventing overload and managing resource usage. + +#### `max_parallel_requests` + +**YAML Key:** `max_parallel_requests` + +**Type:** Integer + +**Environment Variable:** N/A + +**Default Value:** Provider and model-dependent. If not set in `general_settings` or `router_settings`, the proxy's behavior regarding parallel requests might depend on the specific routing strategy or model configuration. You should test and verify the default behavior in your LiteLLM Proxy version if you do not explicitly set this limit. + +**Description:** The `max_parallel_requests` parameter, within `general_settings`, sets a **limit on the maximum number of concurrent requests** that the proxy will handle **for a specific model deployment or model alias**. This is a per-deployment concurrency limit. + +* **Deployment-Level Concurrency Control:** `max_parallel_requests` is applied **individually to each model deployment** defined in your `model_list`. It controls the maximum number of requests that can be actively processed *at the same time* for a particular model alias. +* **Resource Protection:** Use `max_parallel_requests` to protect backend LLM deployments from overload. By limiting concurrent requests, you can prevent overwhelming slower or less scalable models. +* **Queueing or Rejection of Excess Requests:** When the number of concurrent requests for a model reaches the `max_parallel_requests` limit, subsequent requests will typically be **queued** (if queueing is enabled and configured) or **rejected** (if queueing is disabled or the queue is full). The exact behavior depends on the proxy's configuration and routing strategy. 
+* **Tuning for Model Capacity:** Set `max_parallel_requests` based on the **capacity and performance characteristics** of each backend model deployment. Faster or more scalable models can typically handle higher concurrency, while slower or resource-constrained models might need lower `max_parallel_requests` limits. + +**Example YAML (Setting Per-Model Parallel Request Limits):** + +```yaml +model_list: + - model_name: gpt-3.5-turbo-fast + litellm_params: + model: openai/gpt-3.5-turbo + api_key: "os.environ/OPENAI_API_KEY" + max_parallel_requests: 20 # Allowing up to 20 concurrent requests for this deployment + # ... other litellm_params ... + - model_name: gpt-4-slow + litellm_params: + model: azure/gpt-4-deployment + api_key: "os.environ/AZURE_API_KEY_EU" + max_parallel_requests: 5 # Limiting to only 5 concurrent requests for this (potentially slower) deployment + # ... other litellm_params ... +``` + +In this example, `gpt-3.5-turbo-fast` is configured to handle up to 20 concurrent requests, while `gpt-4-slow` is limited to only 5 concurrent requests, reflecting potential differences in their performance or backend capacity. + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Note:** `max_parallel_requests` is a *per-deployment* limit. If you have multiple deployments or replicas of the same model behind a model alias (e.g., for load balancing), each deployment will have its *own* independent `max_parallel_requests` limit. To set a *global* concurrency limit across the entire proxy, use `global_max_parallel_requests`. Also, be aware that `router_settings` can also influence how parallel requests are handled, especially in conjunction with routing strategies like `"least-busy"`. + +--- + +#### `global_max_parallel_requests` + +**YAML Key:** `global_max_parallel_requests` + +**Type:** Integer + +**Environment Variable:** N/A + +**Default Value:** If not set, there might be no explicit global limit beyond system resources, but it's recommended to set a reasonable limit. + +**Description:** The `global_max_parallel_requests` parameter, within `general_settings`, sets a **global limit on the total number of concurrent requests** that the **entire LiteLLM Proxy Server** will handle *across all models and all deployments*. This is a proxy-level concurrency cap. + +* **Proxy-Wide Concurrency Control:** `global_max_parallel_requests` acts as a **circuit breaker** at the proxy level. It limits the total number of requests that the proxy server will actively process at any given time, regardless of which model or API endpoint is being requested. +* **Infrastructure Protection:** Use `global_max_parallel_requests` to protect your proxy server infrastructure from overload, especially in high-traffic scenarios or during traffic spikes. It prevents the proxy itself from becoming overwhelmed by an excessive number of concurrent requests. +* **Resource Management:** Global concurrency limiting helps manage the overall resource consumption of the proxy server (CPU, memory, network connections, etc.). +* **Queueing or Rejection of Excess Requests:** When the total number of concurrent requests across the proxy reaches the `global_max_parallel_requests` limit, subsequent requests will be **queued** (if queueing is enabled and configured) or **rejected**. The proxy will typically return an error to the client indicating that the server is currently overloaded. 
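+
+To make the queueing/rejection behavior described above concrete, the sketch below shows one way a client might call the proxy and back off when it reports overload. This is an illustration only: the endpoint path, port, virtual key, and the exact status codes (429/503 are assumed here) depend on your deployment and proxy version.
+
+```python
+import time
+
+import requests
+
+PROXY_URL = "http://localhost:4000/chat/completions"    # assumed default proxy address
+HEADERS = {"Authorization": "Bearer sk-my-virtual-key"}  # hypothetical virtual key
+
+
+def chat_with_backoff(payload: dict, max_attempts: int = 5) -> dict:
+    """POST to the proxy, retrying with exponential backoff if it reports overload."""
+    for attempt in range(1, max_attempts + 1):
+        resp = requests.post(PROXY_URL, json=payload, headers=HEADERS, timeout=60)
+        if resp.status_code in (429, 503) and attempt < max_attempts:
+            time.sleep(2 ** attempt)  # wait 2, 4, 8, ... seconds before retrying
+            continue
+        resp.raise_for_status()       # surface any other error to the caller
+        return resp.json()
+
+
+if __name__ == "__main__":
+    answer = chat_with_backoff({
+        "model": "gpt-3.5-turbo",
+        "messages": [{"role": "user", "content": "Hello!"}],
+    })
+    print(answer["choices"][0]["message"]["content"])
+```
+
+Server-side, the corresponding global cap is set as shown in the Example YAML below.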
+ +**Example YAML (Setting Global Parallel Request Limit):** + +```yaml +general_settings: + global_max_parallel_requests: 500 # Setting global concurrency limit to 500 requests + # ... other general_settings ... +``` + +In this example, the LiteLLM Proxy Server will never handle more than 500 requests concurrently *in total*, across all model deployments and all API endpoints. If the 500 concurrent request limit is reached, any further incoming requests will be queued or rejected until existing requests complete. + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Recommendation:** It's generally recommended to **set a `global_max_parallel_requests` value in production environments** to protect your proxy server from overload. The optimal value depends on the resources available to your proxy server (CPU, memory, network bandwidth) and the expected traffic volume. You should perform load testing to determine an appropriate global concurrency limit that balances throughput and stability for your deployment. + +--- + +#### `max_request_size_mb` + +**YAML Key:** `max_request_size_mb` + +**Type:** Integer (representing Megabytes) + +**Environment Variable:** N/A + +**Default Value:** A reasonable default size limit is likely applied, but the exact default is not specified in the documentation snippets. A default around **5-10 MB** is common for web servers. You should test and verify the actual default in your LiteLLM Proxy version, and it is **highly recommended to explicitly set this limit**. + +**Description:** The `max_request_size_mb` parameter, within `general_settings`, sets a limit on the **maximum size of the incoming request payload** (the JSON request body) that the proxy will accept, measured in **Megabytes (MB)**. + +* **Request Payload Size Limit:** `max_request_size_mb` is used to prevent excessively large requests from being processed by the proxy. It sets a limit on the size of the **entire JSON request body**, including the prompt, parameters, and any other data sent in the request. +* **Denial-of-Service (DoS) Protection:** Limiting request size is a basic security measure to help protect against certain types of Denial-of-Service (DoS) attacks. Malicious actors might attempt to send extremely large requests to overwhelm the proxy server or backend LLM providers. Setting `max_request_size_mb` provides a defense against such attacks. +* **Resource Management:** Limiting request size also helps manage resource consumption on the proxy server. Processing very large request payloads can consume significant memory and CPU resources. Setting a reasonable limit prevents the proxy from being overwhelmed by oversized requests. +* **Preventing Abuse:** It can also help prevent accidental or intentional abuse, where users might send extremely large prompts or attempt to upload very large files via the API. + +**Example YAML (Setting Max Request Size Limit):** + +```yaml +general_settings: + max_request_size_mb: 5 # Limiting max request payload size to 5 MB + # ... other general_settings ... +``` + +In this example, the proxy will reject any incoming API request whose JSON body exceeds 5 MB in size. + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Recommendation:** **Always set a `max_request_size_mb` value in production environments** to protect against DoS attacks, manage resource consumption, and prevent abuse. A value around **5-10 MB** is often a reasonable starting point for typical LLM applications. 
You can adjust this value based on the expected size of your prompts and the file upload capabilities you want to support. If you are allowing image or file uploads via the proxy, you might need to increase `max_request_size_mb` accordingly, but always consider the security and resource implications of larger request sizes. If you are *not* allowing file uploads or very large prompts, you can set a lower limit (e.g., 1-2 MB) for stricter size control. + +--- + +#### `max_response_size_mb` + +**YAML Key:** `max_response_size_mb` + +**Type:** Integer (representing Megabytes) + +**Environment Variable:** N/A + +**Default Value:** A reasonable default response size limit is likely applied, but the exact default is not specified in the documentation snippets. A default around **10 MB** is common for web servers. You should test and verify the actual default in your LiteLLM Proxy version, and it is **highly recommended to explicitly set this limit**. + +**Description:** The `max_response_size_mb` parameter, within `general_settings`, sets a limit on the **maximum size of the response payload** (the JSON response body) that the proxy will send back to the client, measured in **Megabytes (MB)**. + +* **Response Payload Size Limit:** `max_response_size_mb` is used to prevent the proxy from sending back excessively large responses. If an LLM generates a response that exceeds this size limit, the proxy will **not forward the full response** to the client. +* **Preventing Overly Large Responses:** This is a safety mechanism to prevent scenarios where an LLM might generate extremely long responses (e.g., due to prompt injection or model misbehavior). Uncontrolled large responses can: + * Overwhelm client applications with excessive data. + * Consume excessive bandwidth and network resources. + * Potentially cause issues with client-side processing or memory limitations. +* **Resource Protection (Proxy and Client):** Limiting response size helps protect both the proxy server and client applications from being overwhelmed by oversized responses. +* **Safety and Abuse Prevention:** It acts as a safeguard against potential abuse or unexpected LLM behavior that might lead to the generation of excessively long outputs. + +**Example YAML (Setting Max Response Size Limit):** + +```yaml +general_settings: + max_response_size_mb: 10 # Limiting max response payload size to 10 MB + # ... other general_settings ... +``` + +In this example, if an LLM generates a response that is larger than 10 MB (in JSON body size), the proxy will not send the full response back to the client. The exact behavior in case of exceeding this limit might depend on the proxy version (e.g., it might truncate the response, return an error, or close the connection). Check the documentation for your specific version to confirm the exact behavior. + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Recommendation:** **Always set a `max_response_size_mb` value in production environments** as a safety measure to prevent overly large responses. A value around **10 MB** is often a reasonable starting point. You can adjust this value based on the expected typical response sizes in your application. If your application is designed to handle very large responses (e.g., for document generation or long-form content creation), you might increase this limit, but always consider the potential resource implications and client-side handling capabilities for large responses. 
If you are primarily dealing with short, conversational responses, you can set a lower limit (e.g., 2-5 MB) for stricter size control. + +--- + +#### `proxy_budget_rescheduler_min_time` / `proxy_budget_rescheduler_max_time` + +*(These parameters were already documented earlier in the Database & Persistence section. No need to repeat the detailed description here. Just a cross-reference to the previous documentation.)* + +**YAML Key:** `proxy_budget_rescheduler_min_time` / `proxy_budget_rescheduler_max_time` + +**Type:** Integer (representing seconds) + +**Environment Variable:** N/A + +**Default Value:** `proxy_budget_rescheduler_min_time`: `597` seconds, `proxy_budget_rescheduler_max_time`: `605` seconds (approximately 10 minutes interval). + +**Description:** *(See detailed description in the [Database & Persistence](#database--persistence) section above for `proxy_budget_rescheduler_min_time` and `proxy_budget_rescheduler_max_time`)*. These parameters control the **frequency of budget reset checks**.
No need to repeat the detailed description here. Just a cross-reference to the previous documentation.)* + +**YAML Key:** `global_max_parallel_requests` + +**Type:** Integer + +**Environment Variable:** N/A + +**Default Value:** No Global Cap (Resource Limited). + +**Description:** *(See detailed description in the [Rate Limiting & Quotas](#rate-limiting--quotas) section above for `global_max_parallel_requests`)*. The maximum number of requests allowed in parallel **across the entire proxy**. + +--- + +#### `max_request_size_mb` + +*(This parameter was already documented earlier in the Rate Limiting & Quotas section. No need to repeat the detailed description here. Just a cross-reference to the previous documentation.)* + +**YAML Key:** `max_request_size_mb` + +**Type:** Integer (representing Megabytes) + +**Environment Variable:** N/A + +**Default Value:** Reasonable Default (e.g., 5-10MB). + +**Description:** *(See detailed description in the [Rate Limiting & Quotas](#rate-limiting--quotas) section above for `max_request_size_mb`)*. The maximum **request payload size** in megabytes that the proxy will accept. + +--- + +#### `max_response_size_mb` + +*(This parameter was already documented earlier in the Rate Limiting & Quotas section. No need to repeat the detailed description here. Just a cross-reference to the previous documentation.)* + +**YAML Key:** `max_response_size_mb` + +**Type:** Integer (representing Megabytes) + +**Environment Variable:** N/A + +**Default Value:** Reasonable Default (e.g., 10MB). + +**Description:** *(See detailed description in the [Rate Limiting & Quotas](#rate-limiting--quotas) section above for `max_response_size_mb`)*. The maximum **response size** in megabytes that the proxy will send back. + +--- + +#### `proxy_budget_rescheduler_min_time` / `proxy_budget_rescheduler_max_time` + +*(These parameters were already documented earlier in the Database & Persistence section. No need to repeat the detailed description here. Just a cross-reference to the previous documentation.)* + +**YAML Key:** `proxy_budget_rescheduler_min_time` / `proxy_budget_rescheduler_max_time` + +**Type:** Integer + +**Environment Variable:** N/A + +**Default Value:** `proxy_budget_rescheduler_min_time`: `597` seconds, `proxy_budget_rescheduler_max_time`: `605` seconds (approximately 10 minutes interval). + +**Description:** *(See detailed description in the [Database & Persistence](#database--persistence) section above for `proxy_budget_rescheduler_min_time` and `proxy_budget_rescheduler_max_time`)* – these control budget reset check frequency. + +--- + +### Monitoring, Alerting & Health Checks + +This subsection of `general_settings` contains configurations for enabling background health checks on your models, setting alerting thresholds, and integrating with alerting services like Slack. + +#### `background_health_checks` + +**YAML Key:** `background_health_checks` + +**Type:** Boolean + +**Environment Variable:** N/A + +**Default Value:** `true` (Background health checks are enabled by default). + +**Description:** The `background_health_checks` parameter, within `general_settings`, is a **boolean flag** that enables or disables **background health checks** for the model endpoints configured in your `model_list`. + +* **Periodic Health Monitoring:** When `background_health_checks: true`, the proxy will periodically perform health checks on each model endpoint. 
These checks are designed to verify that the backend LLM services are: + * **Available and Reachable:** The proxy will attempt to connect to the model endpoint and make a small, low-cost request (e.g., a simple ping or a short completion request). + * **Responding within Expected Latency:** The health check will measure the response time of the model endpoint. If the response time exceeds a configured threshold (not directly configurable via `config.yaml` but may be internally defined or based on default timeouts), the model might be considered unhealthy. +* **Proactive Issue Detection:** Background health checks help proactively detect issues with backend LLM deployments, such as: + * Model service outages or downtime. + * Network connectivity problems to the LLM provider. + * Performance degradation or increased latency in LLM responses. +* **Health Status Reporting:** The results of background health checks are used internally by the proxy for routing decisions (e.g., avoiding unhealthy models in load balancing) and are also exposed via the `/health` endpoint, allowing you to monitor the health status of your models. +* **Resource Consumption:** Background health checks consume a small amount of resources (network traffic, minimal LLM API calls). The frequency of health checks is controlled by the `health_check_interval` parameter. +* **Disabling Health Checks (Optional):** Setting `background_health_checks: false` will **disable background health checks entirely**. You might want to disable them in development environments, testing scenarios, or if you have an external health monitoring system and do not need the proxy's built-in health checks. However, for most production deployments, it is **highly recommended to keep background health checks enabled** to ensure proactive monitoring and early detection of issues with your LLM infrastructure. + +**Example YAML:** + +```yaml +general_settings: + background_health_checks: true # Enabling background health checks for model endpoints + health_check_interval: 300 # Check interval set to 300 seconds (5 minutes) + # ... other general_settings ... +``` + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Recommendation:** **Always keep `background_health_checks: true` (the default) enabled in production environments** to benefit from proactive monitoring and early detection of issues with your LLM deployments. Adjust the `health_check_interval` to control the frequency of checks, balancing monitoring granularity with resource consumption. + +--- + +#### `health_check_interval` + +**YAML Key:** `health_check_interval` + +**Type:** Integer (representing seconds) + +**Environment Variable:** N/A + +**Default Value:** `300` (300 seconds, or 5 minutes, if not set). + +**Description:** The `health_check_interval` parameter, within `general_settings`, specifies the **time interval in seconds** between each **background health check** performed by the proxy for each model endpoint configured in your `model_list`. This parameter controls how frequently the proxy will check the health status of your backend LLM deployments. + +* **Health Check Frequency Control:** `health_check_interval` determines how often the proxy will initiate a health check probe for each model. A lower value means more frequent checks, while a higher value means less frequent checks. +* **Responsiveness vs. Resource Usage Trade-off:** + * **Shorter Interval (e.g., 60 seconds or less):** Results in **more frequent health checks**. 
This provides **faster detection of model outages or performance degradation**, as the proxy will check health status more often. However, it also increases the **resource consumption** of the proxy (more frequent network traffic, more frequent minimal LLM API calls for health checks) and might slightly increase API call volume to the backend LLM providers (though health check calls are typically designed to be low-cost). + * **Longer Interval (e.g., 300 seconds or more):** Reduces the frequency of health checks. This decreases the resource overhead of health checks on the proxy and backend LLMs. However, it also means that the proxy will take **longer to detect model outages or performance issues**, as health checks are performed less frequently. +* **Default Interval:** The default `health_check_interval` is `300 seconds` (5 minutes). This is often a good balance for many use cases, providing reasonably timely health monitoring without excessive overhead. + +**Example YAML (Adjusting Health Check Interval):** + +```yaml +general_settings: + background_health_checks: true # Background health checks enabled + health_check_interval: 600 # Setting health check interval to 600 seconds (10 minutes) - less frequent checks + # ... other general_settings ... +``` + +In this example, the proxy will perform background health checks for each model endpoint every 600 seconds (10 minutes). + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Recommendation:** The default `health_check_interval` of 300 seconds (5 minutes) is often a good starting point. You might consider **adjusting this value** if: + +* **You Need Faster Outage Detection:** If your application requires very rapid detection of model outages or performance issues, you can *decrease* `health_check_interval` to a lower value (e.g., 60 seconds or less). +* **You Want to Reduce Health Check Overhead:** If you have a very large number of models or deployments and want to minimize the resource consumption of health checks, you can *increase* `health_check_interval` to a higher value (e.g., 600 seconds or more), accepting a slightly longer detection time for issues. +* **Model Stability:** If your backend LLM deployments are generally very stable and reliable, you might use a longer `health_check_interval`. If they are more prone to occasional issues or performance fluctuations, a shorter interval might be more beneficial. + +Monitor your proxy's performance and the frequency of health check alerts to determine the optimal `health_check_interval` for your specific environment and needs. + +--- + +#### `health_check_details` + +**YAML Key:** `health_check_details` + +**Type:** Boolean + +**Environment Variable:** N/A + +**Default Value:** `true` in some versions, but behavior might be version-dependent. Check your documentation for the exact default in your LiteLLM Proxy version. It's recommended to explicitly set this value. + +**Description:** The `health_check_details` parameter, within `general_settings`, is a **boolean flag** that controls the **level of detail** included in the response from the proxy's `/health` endpoint. The `/health` endpoint is used to check the overall health and status of the proxy server and its configured model deployments. + +* **Detailed Health Information (Default):** If `health_check_details: true` (or if it's the default behavior in your version), the `/health` endpoint will return a **more detailed JSON response**. 
This detailed response typically includes: + * **Overall Proxy Health Status:** Indicates if the proxy server itself is healthy and functioning correctly. + * **Per-Model Deployment Health Status:** For each model deployment defined in your `model_list`, the response will include its individual health status (e.g., "healthy", "unhealthy", "degraded"). + * **Detailed Model Metrics:** For each model, the response might include various performance metrics, such as: + * Remaining rate limits (RPM/TPM). + * Average response latency. + * Last health check timestamp. + * Other deployment-specific status information. +* **Simplified Health Status (Details Hidden):** Setting `health_check_details: false` will **simplify the `/health` endpoint response**. In this simplified mode, the `/health` endpoint will likely return a **minimalistic response**, typically just indicating a general "pass" or "fail" status for the proxy's overall health. Detailed per-model information, metrics, and rate limit details will be **hidden** from the response. +* **Use Cases for Hiding Details:** You might want to set `health_check_details: false` in scenarios such as: + * **Publicly Exposed Health Endpoint:** If you are exposing the `/health` endpoint publicly (e.g., for load balancers, external monitoring systems, or public status pages), you might want to **hide detailed internal information** like rate limits or per-model metrics for security or privacy reasons. A simplified pass/fail response might be sufficient for external health checks without leaking sensitive operational details. + * **Simplified Monitoring:** For basic monitoring setups where you only need a simple "proxy is up/down" indicator and don't require granular per-model health data. + +**Example YAML (Hiding Health Check Details):** + +```yaml +general_settings: + health_check_details: false # Hiding detailed health info in /health endpoint for simpler response + # ... other general_settings ... +``` + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Recommendation:** + +* **For Internal Monitoring and Debugging:** For internal monitoring dashboards, operational dashboards, or debugging purposes, it's generally recommended to leave `health_check_details: true` (or use the default, if it is enabled by default in your version) to get the **most comprehensive health information** via the `/health` endpoint. The detailed response is invaluable for diagnosing issues, tracking model health, and monitoring performance. +* **For Publicly Exposed Health Checks:** If you are exposing the `/health` endpoint to external systems or public networks, consider setting `health_check_details: false` to **hide sensitive internal details** like rate limits or per-model metrics. A simpler pass/fail indicator might be sufficient for external health checks and avoids potentially leaking operational information. + +--- + +#### `alerting` + +**YAML Key:** `alerting` + +**Type:** Array of Strings + +**Environment Variable:** N/A + +**Default Value:** `[]` (Empty list, no alerting methods enabled by default). + +**Description:** The `alerting` parameter, within `general_settings`, is an **array of strings** that specifies a list of **alerting methods or integrations** that the proxy should use to send notifications about certain events, such as model outages, budget exceedances, or performance thresholds being crossed. + +* **Proactive Notifications:** Alerting is a crucial feature for **proactive monitoring and incident response**. 
It allows the proxy to automatically notify you or your operations team when critical events occur, enabling faster detection and resolution of issues. +* **Supported Alerting Methods:** Common alerting methods supported by LiteLLM Proxy include: + * `"slack"`: Sends alerts to Slack channels. + * `"email"`: Sends alerts via email. + * `"discord"`: Sends alerts to Discord channels. + * `"microsoft_teams"`: Sends alerts to Microsoft Teams channels. + * `"webhook"`: Sends alerts to a generic webhook URL (for integration with custom alerting systems or other notification platforms). + + *Check the documentation for the most up-to-date list of supported alerting methods, as new integrations might be added in newer versions.* + +* **Multiple Alerting Methods:** You can list **multiple alerting methods** in the `alerting` array. The proxy will send alerts via *all* configured methods whenever an alertable event occurs. For example, you might configure `alerting: ["slack", "email"]` to receive alerts both in a Slack channel and via email. +* **Configuration Required for Each Method:** For each alerting method you enable, you will typically need to provide **additional configuration details** in the `alerting_args` section of `general_settings`. For example, for Slack alerting, you need to provide the Slack webhook URL and channel. For email alerting, you need to configure SMTP server settings, sender/receiver addresses, etc. + +**Example YAML (Enabling Slack Alerting):** + +```yaml +general_settings: + alerting: ["slack"] # Enabling Slack alerting method + alerting_args: # Required: Configuration for Slack alerting + slack_webhook_url: "https://hooks.slack.com/services/..." # Your Slack webhook URL + slack_channel: "#llm-alerts" # Slack channel to send alerts to + # ... other general_settings ... +``` + +In this example, Slack alerting is enabled. The `alerting_args` section provides the necessary Slack-specific configuration (webhook URL and channel). + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Note:** Simply listing an alerting method in the `alerting` array is not sufficient to enable alerting. You **must also configure the necessary settings for that method** (like webhook URLs, email server details, etc.) in the `alerting_args` section (or via environment variables, depending on the alerting method). Refer to the documentation for each alerting integration for details on the required configuration parameters. + +--- + +#### `alerting_threshold` + +**YAML Key:** `alerting_threshold` + +**Type:** Integer + +**Environment Variable:** N/A + +**Default Value:** `None` (No alerting threshold is set by default. Alerts might be triggered based on other criteria, or some alerts might always be sent when enabled, depending on the alert type). + +**Description:** The `alerting_threshold` parameter, within `general_settings`, specifies a **numerical threshold value** that is used to **trigger certain types of alerts**. The *exact meaning* of this threshold depends on the specific alert type and how it is implemented by the proxy's alerting system. + +* **Threshold-Based Alerts:** `alerting_threshold` is typically used for alerts that are triggered based on a **numeric value exceeding a certain limit**. Examples might include: + * **Latency Alerts:** Alert when the average response latency for a model exceeds a specified threshold (e.g., `alerting_threshold: 2000` might mean "alert if average latency exceeds 2000 milliseconds"). 
+ * **Error Rate Alerts:** Alert when the error rate for a model exceeds a certain percentage. + * **Queue Length Alerts:** Alert when the request queue length for a model exceeds a threshold. + * **Health Check Failure Threshold:** Alert after a certain number of consecutive health check failures for a model (though this might be controlled by an internal, non-configurable threshold rather than `alerting_threshold`). +* **Threshold Unit and Interpretation:** The **unit and interpretation** of the `alerting_threshold` value are **alert-type dependent**. For latency alerts, it might be milliseconds. For error rate, it might be a percentage. For queue length, it might be the number of queued requests. Consult the documentation for the specific alerting integrations you are using (Slack, Email, etc.) to understand how `alerting_threshold` is used for each alert type. +* **Optional Parameter:** `alerting_threshold` is **optional**. Not all types of alerts rely on a numerical threshold. Some alerts might be triggered based on other criteria (e.g., any model outage, any database connection error, budget exceeded, etc.) without needing a threshold value. In such cases, you might not need to set `alerting_threshold`. + +**Example YAML (Setting Alerting Threshold for Latency - Hypothetical Example):** + +```yaml +general_settings: + alerting: ["slack"] # Slack alerting enabled + alerting_threshold: 2000 # Setting alerting threshold to 2000 milliseconds (2 seconds) - hypothetical example, check actual alert types and units + alerting_args: + slack_webhook_url: "https://hooks.slack.com/services/..." + slack_channel: "#llm-alerts" + # ... other general_settings ... +``` + +In this *hypothetical* example, `alerting_threshold: 2000` *might* mean that the proxy will send a Slack alert if the average response latency for any model exceeds 2000 milliseconds (2 seconds). **However, the actual meaning and units of `alerting_threshold` are alert-type specific, and you must consult the documentation for the specific alerting integrations you are using to confirm how this parameter is interpreted for different alert types.** + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Note:** The `alerting_threshold` parameter is often used in conjunction with the `alerting` and `alerting_args` parameters to configure alerting integrations and their specific behavior. Check the documentation for each alerting integration (Slack, Email, etc.) to understand if and how `alerting_threshold` is used and what its appropriate value and unit should be for different types of alerts. + +--- + +#### `alerting_args` + +**YAML Key:** `alerting_args` + +**Type:** Object (Mapping) + +**Environment Variable:** N/A + +**Default Value:** `None` (Required if `alerting` list is not empty and you are using alerting methods that require arguments). + +**Description:** The `alerting_args` parameter, within `general_settings`, is a **required mapping** (object) if you have enabled any alerting methods using the `alerting` parameter (e.g., `alerting: ["slack"]`, `alerting: ["email"]`, etc.). `alerting_args` is used to provide **additional configuration arguments** that are **specific to each alerting integration**. The exact parameters needed within `alerting_args` depend on the alerting method you are configuring. + +* **Integration-Specific Settings:** `alerting_args` acts as a container for settings that are unique to each alerting platform or service. 
For example: + * **Slack Alerting:** Requires a `slack_webhook_url` (the Slack webhook URL to send alerts to) and optionally a `slack_channel` (the Slack channel name). + * **Email Alerting:** Requires settings for your SMTP server (e.g., `smtp_host`, `smtp_port`, `smtp_user`, `smtp_password`), sender email address (`from_email`), and recipient email addresses (`to_emails`). + * **Webhook Alerting:** Requires a `webhook_url` (the URL of your webhook endpoint). + * Other integrations might have their own specific API keys, endpoints, or authentication details that need to be configured in `alerting_args`. + +**Example YAML (Slack Alerting Configuration in `alerting_args`):** + +```yaml +general_settings: + alerting: ["slack"] # Enabling Slack alerting + alerting_args: # Required: Arguments for Slack alerting + slack_webhook_url: "https://hooks.slack.com/services/T00000000/B00000000/XXXXXXXXXXXXXXXXXXXXXXXX" # Slack webhook URL (replace with your actual URL) + slack_channel: "#llm-alerts" # Slack channel to send alerts to (replace with your channel name) + # ... other general_settings ... +``` + +In this example, `alerting_args` provides the `slack_webhook_url` and `slack_channel` parameters, which are required to configure the Slack alerting integration. You will need to replace the placeholder webhook URL and channel name with your actual Slack webhook and channel details. + +**Example YAML (Email Alerting Configuration in `alerting_args`):** + +```yaml +general_settings: + alerting: ["email"] # Enabling email alerting + alerting_args: # Required: Arguments for email alerting + smtp_host: "smtp.example.com" # SMTP server hostname + smtp_port: 587 # SMTP server port (e.g., 587 for TLS) + smtp_user: "alert-sender@example.com" # Sender email address + smtp_password: "your_smtp_password" # Sender email password + from_email: "alert-sender@example.com" # Email address alerts will be sent from + to_emails: ["admin1@example.com", "admin2@example.com"] # List of recipient email addresses + # ... other general_settings ... +``` + +This example shows the configuration for email alerting, including SMTP server details, sender credentials, and recipient email addresses within `alerting_args`. + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Important:** The specific keys and values required within `alerting_args` are **integration-dependent**. **Always consult the documentation for the specific alerting method you are configuring (Slack, Email, Datadog, etc.)** to determine the exact parameters that are needed in `alerting_args` and how to obtain or configure those values (e.g., how to create a Slack webhook URL, how to configure SMTP settings, etc.). If `alerting` is not empty, but the required `alerting_args` are missing or incomplete, the alerting integration may fail to initialize or function correctly. + +--- + +#### `alert_types` + +**YAML Key:** `alert_types` + +**Type:** Array of Strings + +**Environment Variable:** N/A + +**Default Value:** For Slack alerting, a default set of alert types is likely enabled (check documentation for the default list). If using other alerting methods or for finer-grained control, it's recommended to explicitly set `alert_types`. + +**Description:** The `alert_types` parameter, within `general_settings`, is an **array of strings** that allows you to **filter or select which types of alerts** will be sent via the configured alerting methods (currently, this parameter is documented as being primarily relevant for **Slack alerting**). 
+ +* **Alert Type Filtering:** `alert_types` lets you control the **granularity of alerts**. Instead of receiving notifications for *every* possible alertable event, you can use `alert_types` to specify that you only want to be notified about *specific types* of alerts that are most critical or relevant to you. +* **Slack Alert Type Examples:** For Slack alerting, common alert types might include: + * `"errors"`: Alerts for general errors encountered by the proxy. + * `"timeouts"`: Alerts for request timeouts (LLM API calls exceeding `request_timeout`). + * `"unhealthy_models"`: Alerts when model deployments are detected as unhealthy via background health checks. + * `"budget_exceeded"`: Alerts when API key, user, or team budgets are exceeded. + * `"region_outage"`: (Enterprise) Alerts for region outages affecting LLM providers. + + *The exact list of available `alert_types` may vary slightly depending on the proxy version. Consult the documentation for your specific LiteLLM Proxy version to get the most up-to-date list of supported alert types for Slack and other alerting integrations.* + +* **Reduced Alert Noise:** Using `alert_types` to filter alerts can help **reduce alert noise** and prevent alert fatigue. You can focus on receiving notifications only for the most critical events that require immediate attention, while suppressing less critical or informational alerts. +* **Default Alert Types (May Vary):** If `alert_types` is not set in your `config.yaml`, a **default set of alert types** is likely enabled for Slack alerting (check the documentation for the default list in your version). However, for clarity and control, it's recommended to **explicitly configure `alert_types`** to define exactly which alerts you want to receive. + +**Example YAML (Filtering Slack Alerts to Only Errors and Unhealthy Models):** + +```yaml +general_settings: + alerting: ["slack"] # Slack alerting enabled + alert_types: ["errors", "unhealthy_models"] # Only sending Slack alerts for "errors" and "unhealthy_models" + alerting_args: + slack_webhook_url: "https://hooks.slack.com/services/..." + slack_channel: "#llm-alerts" + # ... other general_settings ... +``` + +In this example, even though Slack alerting is generally enabled, the `alert_types: ["errors", "unhealthy_models"]` setting ensures that the proxy will only send Slack alerts for "errors" and "unhealthy_models" events. Alerts for other event types (like budget warnings, slow API calls, etc., if those are also configured) will be suppressed for Slack. + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Recommendation:** Use `alert_types` to **customize the type of alerts you receive** via Slack or other alerting integrations. Start by reviewing the available alert types for your chosen method (e.g., Slack). Then, select the alert types that are most relevant to your operational needs and monitoring priorities, and list them in the `alert_types` array in your `config.yaml` to filter and reduce alert noise. + +--- + +#### `alert_to_webhook_url` + +**YAML Key:** `alert_to_webhook_url` + +**Type:** Object (Mapping of alert type to webhook URL) + +**Environment Variable:** N/A + +**Default Value:** `None` (All alerts for a given alerting method are sent to the same default destination configured in `alerting_args`). + +**Description:** The `alert_to_webhook_url` parameter, within `general_settings`, is an **advanced mapping** (object) that allows you to define **custom webhook URLs for specific alert types**. 
This provides highly granular control over alert routing, enabling you to send **different types of alerts to different webhook endpoints**. + +* **Granular Alert Routing:** `alert_to_webhook_url` lets you override the default webhook URL (configured in `alerting_args`) and specify a **dedicated webhook URL for each alert type**. This is useful for: + * **Routing Critical Alerts to Different Channels:** Send high-priority alerts (e.g., "model_outage", "budget_exceeded") to a dedicated channel or system for immediate attention, while sending less critical alerts to a different channel or webhook. + * **Integrating with Different Systems Based on Alert Type:** Route different alert types to different monitoring, incident management, or automation systems. For example, send performance alerts to a performance monitoring dashboard, and budget alerts to a cost management system. +* **Mapping of Alert Type to Webhook URL:** `alert_to_webhook_url` is a dictionary (mapping) where: + * **Keys:** Are **alert type names** (strings), such as `"budget_exceeded"`, `"model_unhealthy"`, `"errors"`, etc. (Check the documentation for the available alert type names for your chosen alerting methods). + * **Values:** Are **webhook URLs** (strings). For each alert type key, you provide the specific webhook URL where alerts of that type should be sent. +* **Overrides Default Webhook (if applicable):** If you configure `alert_to_webhook_url` for a specific alert type, the proxy will use the webhook URL defined here for that alert type, **overriding** any generic webhook URL that might be configured in `alerting_args` (if applicable for the alerting method). + +**Example YAML (Routing Budget Alerts to a Dedicated Webhook):** + +```yaml +general_settings: + alerting: ["webhook"] # Generic webhook alerting enabled + alert_to_webhook_url: # Custom webhook URLs per alert type + budget_exceeded: "https://mywebhook.site/budget_alerts" # Budget alerts go to this webhook + model_unhealthy: "https://hooks.slack.com/services/..." # Model unhealthy alerts go to a Slack incoming webhook + alerting_args: + webhook_url: "https://mywebhook.site/default_alerts" # Default webhook URL for other alert types (if any) + # ... other general_settings ... +``` + +In this example, alerts of type `"budget_exceeded"` are sent to `https://mywebhook.site/budget_alerts`, and alerts of type `"model_unhealthy"` are sent to the Slack incoming-webhook URL mapped for that type. Any alert types that are *not* explicitly mapped in `alert_to_webhook_url` fall back to the default `webhook_url` configured in `alerting_args` (here, `https://mywebhook.site/default_alerts`). + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Note:** `alert_to_webhook_url` provides very fine-grained control over alert routing, but it is an **advanced configuration option** and is typically only needed in complex monitoring setups where you need to route different alert types to different destinations. For simpler setups, you can usually rely on the generic webhook URL (if using webhook alerting) or the default channel/destination configured in `alerting_args` for each alerting method. 
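+
+To see how the monitoring and alerting settings in this subsection fit together, here is a minimal consolidated sketch. It simply combines the examples above; the webhook URLs, Slack channel, and alert type names are placeholders, and the exact `alert_types` values and `alerting_args` keys should be confirmed against the documentation for your LiteLLM Proxy version before use.
+
+```yaml
+general_settings:
+  # Background health checks (see `background_health_checks` and `health_check_interval` above)
+  background_health_checks: true
+  health_check_interval: 300                                     # seconds between checks per model endpoint
+  health_check_details: false                                    # keep /health minimal if exposed externally
+
+  # Alerting (see `alerting`, `alert_types`, `alerting_args`, and `alert_to_webhook_url` above)
+  alerting: ["slack"]
+  alert_types: ["errors", "unhealthy_models", "budget_exceeded"]
+  alerting_args:
+    slack_webhook_url: "https://hooks.slack.com/services/..."    # placeholder webhook URL
+    slack_channel: "#llm-alerts"                                  # placeholder channel
+  alert_to_webhook_url:
+    budget_exceeded: "https://mywebhook.site/budget_alerts"       # route budget alerts to a separate endpoint
+```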
+ +--- + +#### `spend_report_frequency` + +**YAML Key:** `spend_report_frequency` + +**Type:** String + +**Environment Variable:** N/A + +**Default Value:** `None` (No automated spend reports are generated by default). + +**Description:** The `spend_report_frequency` parameter, within `general_settings`, is an **enterprise-level setting** that allows you to configure the **frequency of automated spend reports**. If configured, the proxy can periodically generate and send summary reports of LLM usage and spending to a designated email address or webhook. + +* **Automated Spend Reporting:** `spend_report_frequency` enables you to receive regular reports on your LLM spend, helping you track costs, monitor budgets, and analyze usage patterns over time. +* **Report Frequency Options:** You can specify the report frequency using a string value, such as: + * `"1d"`: Daily spend reports (sent every day). + * `"7d"`: Weekly spend reports (sent every 7 days, likely weekly). + * `"30d"`: Monthly spend reports (sent every 30 days, likely monthly). + * `None` (or omitting the parameter): Disables automated spend reports (default). +* **Report Delivery Method:** The exact method of report delivery (email, webhook, etc.) and the configuration for delivery (email addresses, webhook URLs, etc.) would be configured separately, likely in the `alerting_args` or another dedicated section of the `config.yaml` (check the enterprise documentation for details on report delivery configuration). +* **Enterprise Feature:** Automated spend reports are primarily considered an **enterprise feature**, useful for organizations that need regular, automated reports for cost management, financial tracking, or internal reporting purposes. + +**Example YAML (Setting Weekly Spend Reports):** + +```yaml +general_settings: + spend_report_frequency: "7d" # Enterprise Feature: Sending weekly spend reports + alerting: ["email"] # Email alerting must be enabled to send reports via email + alerting_args: # Configuration for email alerting (including report recipients) + smtp_host: "smtp.example.com" + smtp_port: 587 + smtp_user: "alert-sender@example.com" + smtp_password: "your_smtp_password" + from_email: "alert-sender@example.com" + to_emails: ["finance@example.com", "llm-admin@example.com"] # Send spend reports to finance and admin teams + # ... other general_settings ... +``` + +In this example, `spend_report_frequency: "7d"` configures the proxy to generate and send weekly spend reports. The reports are likely delivered via email, as email alerting is enabled (`alerting: ["email"]`) and email configuration is provided in `alerting_args`. The reports will be sent to the email addresses listed in `alerting_args.to_emails`. + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Note:** `spend_report_frequency` is typically used in conjunction with alerting integrations (like email or webhook) to deliver the generated spend reports. Check the enterprise documentation for details on report content, format, delivery configuration, and any additional requirements for enabling spend reporting features. It is an **enterprise feature**. + +--- + +#### `forward_openai_org_id` + +**YAML Key:** `forward_openai_org_id` + +**Type:** Boolean + +**Environment Variable:** N/A + +**Default Value:** `false` (The `OpenAI-Organization` header is not forwarded by default). 
+ +**Description:** The `forward_openai_org_id` parameter, within `general_settings`, is a **boolean flag** that controls whether the proxy should **forward the `OpenAI-Organization` HTTP header** from the client request to the **upstream OpenAI API call**. + +* **OpenAI Organization Header:** The `OpenAI-Organization` header is a custom HTTP header that OpenAI's API allows clients to include in their requests to specify which **OpenAI organization** the request should be billed to or associated with. This is relevant for users who are part of multiple OpenAI organizations and need to control billing or access on an organizational level. +* **Header Propagation to OpenAI:** When `forward_openai_org_id: true`, if an incoming client request to the proxy *includes* an `OpenAI-Organization` header, the proxy will **pass this header through** and include it in the API request that it forwards to the OpenAI backend. +* **Multi-Organization Scenarios:** Enabling `forward_openai_org_id: true` is useful in scenarios where: + * You are using OpenAI's organization feature to manage multiple OpenAI accounts or billing groups. + * Your client applications need to specify which OpenAI organization to use for a given request (e.g., for cost allocation or access control within OpenAI). + * You want to ensure that the `OpenAI-Organization` header provided by the client is respected and propagated to the OpenAI API. +* **Default: Header Not Forwarded:** By default (`forward_openai_org_id: false`), the proxy will **not forward the `OpenAI-Organization` header**. If the client includes this header, it will be **ignored** by the proxy and not passed on to the OpenAI API call. + +**Example YAML:** + +```yaml +general_settings: + forward_openai_org_id: true # Forwarding OpenAI-Organization header to OpenAI API + # ... other general_settings ... +``` + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Note:** `forward_openai_org_id: true` is only relevant when you are proxying requests to **OpenAI API endpoints**. It has no effect when using other LLM providers. Also, this setting only controls the *forwarding* of the header. You still need to ensure that your client applications are actually sending the `OpenAI-Organization` header in their requests if you want to utilize this feature. + +--- + +#### `forward_client_headers_to_llm_api` + +**YAML Key:** `forward_client_headers_to_llm_api` + +**Type:** Boolean + +**Environment Variable:** N/A + +**Default Value:** `false` (Client headers are not forwarded by default). + +**Description:** The `forward_client_headers_to_llm_api` parameter, within `general_settings`, is a **boolean flag** that controls whether the proxy should **forward *any* client-provided HTTP headers** (specifically, headers that typically start with `"X-"` – indicating custom headers) to the **upstream LLM API call**. + +* **Propagating Custom Headers:** When `forward_client_headers_to_llm_api: true`, the proxy will **inspect incoming client requests** for any HTTP headers that have names starting with `"X-"` (e.g., `"X-Request-ID"`, `"X-Correlation-ID"`, `"X-Custom-Header"`). If it finds such headers, it will **copy them and forward them** as part of the API request that it sends to the backend LLM provider. +* **Custom Header Pass-Through:** This feature allows you to **propagate custom metadata or context information** from your client applications to the backend LLM provider via HTTP headers. 
This can be useful for: + * **Correlation IDs or Request Tracing:** Passing through correlation IDs or trace IDs generated by the client application to maintain end-to-end request tracing across systems. + * **Custom Metadata or Context:** Forwarding application-specific metadata or context information to the LLM provider for logging, analytics, or model behavior customization (if the provider's API supports using custom headers for such purposes). + * **Provider-Specific Header Requirements:** In rare cases, some LLM providers or custom endpoints might require or expect specific custom headers for certain functionalities. `forward_client_headers_to_llm_api: true` could be used to pass these headers through from the client. +* **Default: Headers Not Forwarded:** By default (`forward_client_headers_to_llm_api: false`), the proxy will **not forward any client-provided headers** (including "X-" headers) to the upstream LLM API call. It will only forward the standard headers that are necessary for the LLM API to function correctly (e.g., `Authorization`, `Content-Type`). +* **Security and Privacy Considerations:** Be mindful of what information you are forwarding in custom headers. Avoid forwarding sensitive data or credentials in headers unless absolutely necessary and you understand the security implications. + +**Example YAML:** + +```yaml +general_settings: + forward_client_headers_to_llm_api: true # Forwarding client-provided X-* headers to LLM API + # ... other general_settings ... +``` + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Security and Privacy Warning:** **Exercise caution** when enabling `forward_client_headers_to_llm_api: true`. **Carefully consider what information might be present in client-provided headers**, especially custom "X-" headers. Ensure that you are not unintentionally forwarding sensitive data, security tokens, or private information to external LLM providers if you are using third-party services. Only enable this feature if you have a clear and well-justified use case for propagating custom headers and understand the potential security and privacy implications. + +--- + +#### `use_client_credentials_pass_through_routes` (`general_settings`) + +**YAML Key:** `use_client_credentials_pass_through_routes` + +**Type:** Boolean + +**Environment Variable:** N/A + +**Default Value:** `false` (Client-provided credentials are not allowed on pass-through routes by default). + +**Description:** The `use_client_credentials_pass_through_routes` parameter, within `general_settings`, is an **enterprise-level boolean flag** that controls whether the proxy should **allow the use of client-provided credentials on certain "pass-through" routes**. + +* **Pass-Through Routes (e.g.,** `/vertex-ai/*` **,** `/bedrock/*` **):** LiteLLM Proxy supports "pass-through" routes, which are designed to allow clients to directly access certain LLM provider APIs through the proxy, without the proxy fully mediating the API call. Pass-through routes are typically identified by prefixes in the API path, such as `/vertex-ai/*` (for Google Vertex AI) or `/bedrock/*` (for AWS Bedrock). The proxy acts more as a transparent passthrough for these routes, forwarding requests and responses with minimal modification. +* **Client-Provided Credentials on Pass-Through:** Normally, even for pass-through routes, the LiteLLM Proxy would still expect clients to authenticate using a **virtual API key** (managed by the proxy). 
However, when `use_client_credentials_pass_through_routes: true`, the proxy will **relax this requirement for pass-through routes**. On pass-through routes, the proxy will **not enforce its own virtual key authentication**. Instead, it will **allow the client to provide their *own* provider-specific credentials directly in the request**, such as: + * **Provider API keys:** For example, a client might include their actual Google Cloud API key when calling a `/vertex-ai/*` endpoint. + * **OAuth2.0 tokens:** For example, a client might include an OAuth2.0 access token for Google Cloud when calling a `/vertex-ai/*` endpoint. +* **Bypassing Virtual Key Auth on Pass-Through:** Enabling `use_client_credentials_pass_through_routes: true` effectively **bypasses the proxy's virtual key authentication** *specifically for requests to pass-through routes*. The authentication responsibility shifts to the client, who must provide valid provider-specific credentials in their requests. +* **Trusted Internal Users (Use Case):** This feature is intended for **specific, controlled use cases**, such as: + * **Trusted Internal Users:** You might want to allow **trusted internal users** or applications to directly access provider APIs (like Vertex AI or Bedrock) through the proxy, using their own provider credentials. This might be useful for advanced users who need direct access to provider-specific features or when you want to delegate authentication to the provider's native authentication mechanisms. + * **Advanced Integrations:** For complex integrations where you need to leverage provider-specific SDKs or features via the proxy, and you want to handle authentication directly with the provider. + +**Security Warning and Restrictions:** + +* **Security Risk – Reduced Proxy Control:** Enabling `use_client_credentials_pass_through_routes: true` **significantly reduces the security control and management capabilities of the LiteLLM Proxy** for pass-through routes. You are essentially **bypassing the proxy's authentication and authorization mechanisms** for those routes. +* **Limited Proxy Features on Pass-Through:** When using client-provided credentials on pass-through routes, you will likely **lose access to many of the proxy's core features** for those requests, such as: + * Virtual key-based access control and authorization. + * Rate limiting and quota enforcement. + * Per-user cost tracking. + * Detailed proxy-level logging and observability (you might still get basic proxy logs, but not the same level of integration as with virtual key-authenticated requests). +* **Use Only for Trusted Clients and Controlled Routes:** **`use_client_credentials_pass_through_routes: true` should be used with extreme caution and only in very specific, controlled scenarios** where you fully understand the security and accountability implications. **Never enable this for publicly accessible proxy deployments or for untrusted clients**. It's recommended to combine this feature with other access control mechanisms, such as `allowed_routes` or `admin_only_routes`, to further restrict access to pass-through endpoints. +* **Virtual Key Authentication is Generally Recommended:** In most typical use cases, it's **strongly recommended to use virtual key authentication** for *all* requests to the LiteLLM Proxy, including those targeting backend LLM providers via the proxy. Virtual keys provide a much more secure, manageable, and feature-rich way to control access and track usage through the proxy. 
Client-provided credentials should only be considered as an exception for very specific, controlled integration scenarios. + +**Example YAML (Enabling Client Credentials on Pass-Through Routes - Use with Extreme Caution):** + +```yaml +general_settings: + use_client_credentials_pass_through_routes: true # Enterprise Feature: Allowing client-provided credentials on pass-through routes (USE WITH EXTREME CAUTION!) + allowed_routes: ["/v1/chat/completions", "/v1/embeddings", "/vertex-ai/*", "/bedrock/*"] # Example: Restricting allowed routes, but pass-through routes are now client-authenticated + # ... other general_settings ... +``` + +In this example, `use_client_credentials_pass_through_routes: true` is enabled. This means that requests to routes starting with `/vertex-ai/*` and `/bedrock/*` will **not require virtual key authentication**. Clients accessing these routes must provide their own valid provider-specific credentials (e.g., Google Cloud credentials for `/vertex-ai/*` or AWS credentials for `/bedrock/*`) directly in the request. Regular proxy routes like `/v1/chat/completions` and `/v1/embeddings` (listed in `allowed_routes`) might still be protected by virtual key authentication, depending on your overall proxy configuration. + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Security Warning:** **Enabling `use_client_credentials_pass_through_routes: true` significantly reduces the security and management control of your LiteLLM Proxy for pass-through routes.** **Only use this feature if you fully understand the security implications, trust your clients accessing these routes, and have implemented alternative security and access control measures for pass-through endpoints.** For most typical deployments, it is **strongly recommended to leave this setting at its default `false` value** and rely on virtual key authentication for all requests to the proxy to maintain robust security and centralized control. + +--- + +#### `allow_client_side_credentials` (`general_settings`) + +**YAML Key:** `allow_client_side_credentials` + +**Type:** Boolean + +**Environment Variable:** N/A + +**Default Value:** `false` (Client-side credentials are not allowed by default). + +**Description:** The `allow_client_side_credentials` parameter, within `general_settings`, is a **boolean flag** that, when set to `true`, allows clients to **provide their provider credentials directly in the API request body or headers**, instead of requiring the use of proxy-managed virtual API keys. + +* **Client-Supplied Provider Keys:** When `allow_client_side_credentials: true`, the proxy will **accept and use provider API keys or tokens** that are included directly by the client in their API requests. Clients might provide credentials through: + * **Request Body:** Including API key or token parameters within the JSON request body (e.g., as part of the `litellm_params` object). + * **Request Headers:** Setting provider-specific HTTP headers for authentication (e.g., `OpenAI-Api-Key`, `Authorization: Bearer <your_token>`). +* **Bypassing Virtual Keys Entirely:** Enabling `allow_client_side_credentials: true` **completely bypasses the LiteLLM Proxy's virtual API key authentication** mechanism for requests that include provider credentials directly. The proxy will **prioritize client-provided credentials** over virtual keys if both are present. 
+* **Simplified Testing or Limited Scenarios:** `allow_client_side_credentials: true` is **generally discouraged for production environments** due to significant security and management drawbacks. It might be considered in **very limited, controlled scenarios** such as: + * **Testing and Development:** For simplified testing or quick prototyping where you want to bypass the proxy's key management and directly use provider API keys for convenience. + * **Specific, Trusted Clients:** In highly controlled, internal environments where you have a very small number of trusted clients and you want to allow them to use their own provider keys for specific, limited purposes (e.g., for fine-tuning tasks where storing keys in the proxy might be undesirable). + +**Security Warning and Restrictions:** + +* **Major Security Risk – Complete Bypass of Proxy Security:** **Enabling `allow_client_side_credentials: true` poses a very significant security risk.** It essentially **turns off the proxy's authentication and authorization mechanisms** for any client that chooses to provide provider credentials directly. Any user who can reach the proxy endpoint could potentially: + * Bypass virtual key-based access controls, rate limits, and budgets. + * Use *any* valid provider API key (not just keys managed by your proxy). + * Potentially access models or functionalities that they are not authorized to use under normal proxy policies. +* **Loss of Proxy Management and Control:** When clients provide their own credentials: + * **Usage Tracking and Cost Control are Severely Impaired:** The proxy's ability to track usage, attribute costs to users or teams, and enforce budgets becomes unreliable or completely ineffective, as requests are no longer tied to proxy-managed virtual keys. + * **Audit Logging is Less Meaningful:** Audit logs might not accurately reflect user activity or usage patterns, as requests are not properly associated with proxy-managed identities. + * **Security Auditing and Compliance Become More Difficult:** It becomes much harder to audit access, enforce security policies, and maintain compliance when clients can bypass proxy authentication and provide their own credentials. +* **Never Enable in Production (Generally):** **`allow_client_side_credentials: true` should almost never be enabled in production environments.** It fundamentally undermines the security, management, and control benefits of using the LiteLLM Proxy Server. +* **Testing or Very Controlled Scenarios Only:** Only consider enabling this feature **for testing, development, or highly specific, isolated, and controlled scenarios** where you fully understand and accept the very significant security risks and loss of proxy control. + +**Example YAML (Enabling Client-Side Credentials - USE WITH EXTREME CAUTION!!!):** + +```yaml +general_settings: + allow_client_side_credentials: true # WARNING: Enabling client-side credentials - MAJOR SECURITY RISK! USE ONLY FOR TESTING OR CONTROLLED SCENARIOS + # ... other general_settings ... +``` + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Security Warning:** **Enabling `allow_client_side_credentials: true` is a severe security risk and is strongly discouraged for production use.** It is extremely important to understand the security implications and only enable this setting if you have a very specific and well-justified reason, and are fully aware of the risks of bypassing the proxy's authentication and access control mechanisms. 
In almost all typical deployments, you should **leave `allow_client_side_credentials: false`** (the default) to maintain robust security and centralized control through the LiteLLM Proxy's virtual key management system. + +--- + +### Other Miscellaneous Settings + +#### `service_account_settings` (`general_settings`) + +**YAML Key:** `service_account_settings` + +**Type:** List of Objects (List of mappings) + +**Environment Variable:** N/A + +**Default Value:** `None` (No service account settings are configured by default). + +**Description:** The `service_account_settings` parameter, within `general_settings`, is an **enterprise-level setting** that allows you to define specific configurations that should apply **only to service account API keys**. + +* **Service Account Keys (Enterprise Concept):** In LiteLLM Proxy's enterprise features, API keys can be categorized as either "user" keys (associated with individual users) or "service account" keys (intended for automated systems, background processes, or service-to-service communication, rather than human users). +* **Differentiated Configuration for Service Accounts:** `service_account_settings` enables you to apply **different settings or policies** to service account keys compared to regular user keys. This is useful for scenarios where you want to: + * **Apply Different Rate Limits:** Set different RPM/TPM limits for service account keys compared to user keys. Service accounts might be used for high-volume automated tasks that require higher throughput, while user keys might have stricter rate limits. + * **Enforce Different Access Rules:** Restrict access to certain models or API endpoints for service account keys, or grant them access to a different set of resources compared to user keys. + * **Customize Logging or Monitoring:** Apply specific logging or monitoring configurations that are only relevant to service account usage. +* **List of Settings Objects:** `service_account_settings` is a **list of objects**. Each object in the list represents a set of settings that should apply to service account keys that match certain criteria. Currently, the documentation mentions a single supported criterion: `key_type: "service_account"`. This means you can define settings that apply to *all* service account keys. Future versions might allow for more granular criteria (e.g., settings based on service account name or tags). + +**Example YAML (Setting Different Rate Limits for Service Account Keys):** + +```yaml +general_settings: + service_account_settings: # Enterprise Feature: Settings specific to service account keys + - key_type: "service_account" # Apply these settings to service account keys + settings: # Settings to apply + max_parallel_requests: 100 # Service account keys get a higher parallel request limit + rpm: 10000 # Service account keys get a higher RPM limit + tpm: 1000000 # Service account keys get a higher TPM limit + # ... other general_settings ... +``` + +In this example, any API key that is designated as a "service account key" will automatically have the `max_parallel_requests`, `rpm`, and `tpm` limits set to the specified values (100, 10000, and 1000000, respectively), overriding any default or other configured rate limits that might apply to regular user keys. 
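+
+For contrast, the same mechanism can be used to give service account keys *stricter* limits than regular user keys. The sketch below is illustrative only: it reuses the structure and keys shown in the example above (`key_type`, `settings`, `max_parallel_requests`, `rpm`, `tpm`) with lower values, and is not a drop-in configuration.
+
+```yaml
+general_settings:
+  service_account_settings:                 # Enterprise Feature (illustrative sketch)
+    - key_type: "service_account"           # Apply to all service account keys
+      settings:
+        max_parallel_requests: 10           # Lower parallelism for automated callers
+        rpm: 500                            # Tighter requests-per-minute cap
+        tpm: 50000                          # Tighter tokens-per-minute cap
+```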
+ +**Example Environment Variable:** N/A (YAML-only parameter) + +**Note:** `service_account_settings` is an **enterprise feature** primarily used in scenarios where you need to differentiate the configuration and policies applied to service account keys versus regular user API keys. If you are not using service accounts or do not need to apply different settings to them, you can typically omit this section from your `config.yaml`. Check the enterprise documentation for more details on how to define and manage service account keys themselves (creation, designation, etc.). + +--- + +#### `provider_budget_config` (`general_settings`) + +**YAML Key:** `provider_budget_config` + +**Type:** Object (Mapping) + +**Environment Variable:** N/A + +**Default Value:** `None` (No provider-level budgets are configured by default). + +**Description:** The `provider_budget_config` parameter, within `general_settings`, is an **advanced mapping** (object) that allows you to define **budget limits at the LLM *provider* level**. This is a more granular form of budget control compared to user or team-level budgets. + +* **Provider-Level Spending Caps:** `provider_budget_config` enables you to set **spending limits for each LLM provider** that you are using through the proxy (e.g., OpenAI, Anthropic, Azure, etc.). This is useful for: + * **Controlling Spend per Provider:** Managing your overall spending with each LLM provider individually. + * **Allocating Budgets Across Providers:** Distributing your total LLM budget across different providers based on your usage patterns, cost optimization strategies, or provider-specific agreements. + * **Provider-Level Cost Tracking:** Gaining insights into how much you are spending with each provider. +* **Mapping of Provider Name to Budget Settings:** `provider_budget_config` is a dictionary where: + * **Keys:** Are **provider names** (strings), such as `"openai"`, `"azure"`, `"anthropic"`, etc. These names should correspond to the provider names used in your `model_list` and by the LiteLLM library internally. + * **Values:** Are **objects** that define the budget settings for that specific provider. Within each provider's budget settings object, you can typically define: + * `daily_budget`: (Number, Optional): A daily budget limit (in USD or your chosen currency) for this provider. + * `monthly_budget`: (Number, Optional): A monthly budget limit for this provider. + * Other budget-related settings might be added in future versions (check documentation). +* **Advanced Cost Control:** `provider_budget_config` provides a more sophisticated and granular approach to cost management, allowing you to control spending at the provider level, in addition to user and team-level budgets. +* **Advanced Feature:** Provider-level budget configuration is considered an **advanced feature**. For simpler cost control, user and team-level budgets might be sufficient. + +**Example YAML (Setting Daily Budgets for OpenAI and Azure):** + +```yaml +general_settings: + provider_budget_config: # Advanced Feature: Provider-level budget configuration + openai: # Budget settings for OpenAI provider + daily_budget: 100 # Daily budget of $100 for OpenAI models + azure: # Budget settings for Azure provider + daily_budget: 50 # Daily budget of $50 for Azure models + # ... database_url and other general_settings ... +``` + +In this example, a daily budget of $100 is set for all OpenAI models used via the proxy, and a daily budget of $50 is set for all Azure models. 
If the daily spend for either provider reaches its budget, the proxy might take actions such as: + +* Rejecting new requests to models from that provider (until the budget resets, typically at the start of the next day). +* Sending alerts or notifications about budget exceedances (if alerting is configured). + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Note:** `provider_budget_config` is an **advanced feature** for organizations that need fine-grained control over LLM spending and want to manage budgets at the provider level. For simpler cost management, user and team-level budgets might be sufficient. Check the enterprise documentation for details on how budget limits are enforced and what actions the proxy takes when provider budgets are exceeded. + +--- + +#### `model_group_alias` (`general_settings`) + +**YAML Key:** `model_group_alias` + +**Type:** Object (Mapping of string to string) + +**Environment Variable:** N/A + +**Default Value:** `None` (No model group aliases are defined by default). + +**Description:** The `model_group_alias` parameter, within `general_settings`, is a **mapping** (dictionary) that allows you to define **aliases or alternative names for model groups**. This is a way to rename or remap model group names for internal routing and management purposes, without affecting the `model_name` aliases that are exposed to client applications in the `model_list`. + +* **Internal Model Group Renaming or Mapping:** `model_group_alias` provides a mechanism to create an **internal mapping** between model group names. If you have a model group with a certain name (e.g., `"claude-2-group"`) that you want to internally refer to or route requests to using a different name (e.g., `"claude-latest-group"`), you can use `model_group_alias` to define this mapping. +* **Routing and Management Aliasing:** The primary use case for `model_group_alias` is for **internal routing and management within the proxy**. It allows you to: + * **Update Model Group Names Internally:** Rename or update the internal name of a model group without changing the user-facing `model_name` aliases in `model_list`. This can be useful when you are reorganizing your model configurations or updating underlying deployments. + * **Abstract Model Group Names:** Create more abstract or generic internal names for model groups, while keeping the `model_name` aliases in `model_list` user-friendly and consistent. + * **Routing Strategy Logic:** Some routing strategies (e.g., custom routing strategies or advanced load balancing algorithms) might internally refer to model groups using specific names. `model_group_alias` could be used to map user-facing `model_name` aliases to these internal model group names for routing decisions. + +**Example YAML (Defining a Model Group Alias):** + +```yaml +general_settings: + model_group_alias: # Defining model group aliases for internal routing + "claude-2": "claude-2-group-v2" # Mapping "claude-2" alias to "claude-2-group-v2" internal group + # ... other general_settings ... +# Example model_list (showing model_name aliases, not directly related to model_group_alias): +model_list: + - model_name: claude-2 # User-facing model name alias + litellm_params: + model: anthropic/claude-2 + api_key: "os.environ/ANTHROPIC_API_KEY" + # ... other litellm_params ... +``` + +In this example, `model_group_alias: {"claude-2": "claude-2-group-v2"}` maps the `model_name` alias `"claude-2"` to the internal model group name `"claude-2-group-v2"`. 
Internally, within the proxy's routing logic or health check system (if they refer to model groups by name), the proxy will now use `"claude-2-group-v2"` when referring to the model group associated with the user-facing alias `"claude-2"`. However, client applications will still use `"claude-2"` as the `model_name` in their API requests. + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Note:** `model_group_alias` is primarily used for internal routing and management purposes within the proxy. It does not typically affect how client applications interact with the proxy, as they still use the `model_name` aliases defined in `model_list`. It is an advanced configuration option for complex routing scenarios or internal proxy management. + +--- + +#### `retry_after` (`general_settings`) + +**YAML Key:** `retry_after` + +**Type:** Integer (representing seconds) + +**Environment Variable:** N/A + +**Default Value:** `0` (No additional retry delay is added by default). + +**Description:** The `retry_after` parameter, within `general_settings`, is an **advanced setting** that allows you to set a **base retry delay** in seconds that the proxy will apply **before attempting a retry** for certain types of errors. + +* **Base Retry Delay (Advanced):** `retry_after` introduces a **fixed delay** before the proxy attempts a retry in addition to any provider-specific retry-after headers or internal retry logic. This is a more advanced form of retry control and is typically not needed in most common setups. +* **Scenarios for Retry Delay:** You might consider setting a `retry_after` value in very specific scenarios, such as: + * **Coordinating with External Rate Limits:** If you are interacting with an LLM provider that has very strict rate limits, and you want to introduce a consistent delay between retries to avoid overwhelming the provider's API or triggering rate limit bans. + * **Debugging or Testing:** For debugging or testing purposes, you might want to introduce a deliberate delay in retries to better observe the retry behavior or simulate specific network conditions. +* **Override by Provider Header:** It's important to note that if the upstream LLM provider's API response includes a `Retry-After` header (instructing the client to wait for a certain duration before retrying), that **provider-provided `Retry-After` header will *override* the `retry_after` value** configured in `config.yaml`. The proxy will always prioritize the retry delay suggested by the provider's API, if provided. `retry_after` acts as a *base* or *minimum* delay if no provider-specific `Retry-After` header is present. +* **Default No Delay:** By default, `retry_after` is set to `0`, meaning no additional delay is added by the proxy before retries, unless dictated by the provider's `Retry-After` header. + +**Example YAML (Setting a Base Retry Delay):** + +```yaml +general_settings: + retry_after: 2 # Setting a base retry delay of 2 seconds for certain errors + # ... other general_settings ... +``` + +In this example, if a retryable error occurs (and the provider does *not* send a `Retry-After` header), the proxy will wait for at least 2 seconds before attempting to retry the request. + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Note:** `retry_after` is an **advanced setting** that is rarely needed in typical deployments. In most cases, you can rely on the proxy's default retry logic and any `Retry-After` headers provided by the LLM providers. 
Only consider setting `retry_after` if you have a specific need to introduce a base retry delay for advanced retry control or coordination with external rate limits. + +--- + +#### `num_retries` (`general_settings`) + +*(This parameter is also defined under `router_settings`. It is documented here under `general_settings` because the source documentation lists it in the `general_settings` section. However, it's important to note that `num_retries` can also be configured in `router_settings` and might be more commonly used there for router-specific retry policies. See also the `router_settings.num_retries` parameter documentation.)* + +**YAML Key:** `num_retries` (in `general_settings`) + +**Type:** Integer + +**Environment Variable:** N/A + +**Default Value:** `3` (Proxy will attempt 3 retries by default). + +**Description:** The `num_retries` parameter, within `general_settings` (and also available in `router_settings`), sets the **default number of retry attempts** that the proxy will make for a failed LLM API request **before giving up and returning an error to the client**. This is a global or default retry count. + +* **Request Retries for Transient Errors:** Retries are a crucial mechanism for handling transient errors, network glitches, or temporary unavailability of backend LLM services. When an error occurs (e.g., a timeout, connection error, rate limit error, or internal server error), the proxy can automatically retry the request a certain number of times to attempt to recover and successfully complete the call. +* **Default Retry Count:** The default `num_retries` value is `3`. This means that, by default, if a request fails, the proxy will attempt to retry the call **up to 3 times** before ultimately failing the request and returning an error to the client. +* **Global or Default Retry Policy:** The `num_retries` setting in `general_settings` defines a **global or default retry policy**. It applies to all model deployments unless overridden by more specific retry configurations (e.g., via `router_settings.retry_policy`). +* **Retry Logic (Simplified):** The proxy's retry logic typically involves: + 1. Initial API call attempt. + 2. If the call fails with a retryable error, wait for a short delay (potentially influenced by `retry_after` or provider `Retry-After` header). + 3. Retry the request again (up to `num_retries` times). + 4. If all retry attempts fail, return an error to the client. +* **Reliability Enhancement:** Setting a reasonable `num_retries` value (like the default of 3) significantly improves the reliability and resilience of your LLM application by automatically handling transient errors and increasing the chances of successful request completion. + +**Example YAML (Setting Default Number of Retries):** + +```yaml +general_settings: + num_retries: 5 # Setting default retry attempts to 5 for all models + # ... other general_settings ... +``` + +In this example, the proxy will attempt up to 5 retries for any failed LLM API request before giving up. + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Recommendation:** The default `num_retries` value of 3 is generally a good balance between reliability and responsiveness for many use cases. 
You might consider **adjusting this value** if: + +* **You are Experiencing Frequent Transient Errors:** If you observe frequent transient errors in your logs (e.g., occasional network glitches or temporary provider hiccups), you can try *increasing* `num_retries` to give the proxy more chances to recover from these transient issues. +* **You Need Faster Failure Reporting for Certain Errors:** If you want the proxy to fail faster and report errors to the client more quickly, you can *decrease* `num_retries` to a lower value (e.g., 1 or 2), or even set it to `0` to disable retries entirely (not generally recommended for production). +* **Specific Model/Deployment Retry Policies:** For more fine-grained control over retries, you can configure retry policies at the **router level** using `router_settings.retry_policy` (see Router Settings section). `router_settings.retry_policy` allows you to define different retry counts and behaviors for specific error types or model groups, overriding the global `num_retries` setting. + +--- + +## `router_settings` Section: Routing & Load-Balancing Settings + +The `router_settings` section governs the request routing and load-balancing behavior of the LiteLLM Proxy, enabling you to define how requests are distributed across multiple model deployments and handle failures. + +### router\_settings Parameter Summary + +| Parameter | Type | Default Value | Description | +| :--------------------------------- | :------------------ | :----------------------------------- | :------------------------------------------------------------------------------------------------------ | +| **Routing Strategy & Model Selection** | | | | +| `routing_strategy` | String | `"simple-shuffle"` | Algorithm for routing requests. | +| `model_group_alias` | Object | `None` | Alias mapping for model groups. | +| `default_litellm_params` | Object | `None` | Default `litellm_params` for all routed requests. | +| `default_max_parallel_requests` | Integer | `None` | Default parallel request limit per deployment. | +| `default_priority` | Integer | `None` | Default request priority in scheduler queue. | +| `polling_interval` | Float | `0.003` | Polling interval for scheduler queue (seconds). | +| `caching_groups` | List of Tuples | `[]` | Groups of models sharing caches. | +| `assistants_config` | Object | `None` | Configuration for multi-turn assistants *(Advanced/Enterprise Feature)*. | +| **Multi-Instance Coordination & Scaling** | | | | +| `redis_host` | String | `None` | Redis hostname for router coordination. | +| `redis_port` | Integer | `None` | Redis port for router coordination. | +| `redis_password` | String | `None` | Redis password for router coordination. | +| `redis_url` | String | `None` | Full Redis connection URL (alternative to host/port/password). | +| `client_ttl` | Integer | `3600` | TTL for cached HTTP client connections (seconds). | +| `cache_responses` | Boolean | `false` | Enable caching at the router level. | +| `routing_strategy_args` | Object | `None` | Additional arguments for routing strategy. | +| **Pre-Call Checks & Validation** | | | | +| `enable_pre_call_checks` | Boolean | `false` (might be `true` in newer versions) | Enable pre-call checks (e.g., context window check). | +| `optional_pre_call_checks` | Array of Strings | `[]` | List of optional pre-call check plugins. | +| **Failover & Retry Policies** | | | | +| `allowed_fails` | Integer | `1` (might vary) | Allowed failures per model per minute before cooldown. 
| +| `cooldown_time` | Integer | `None` | Cooldown duration after exceeding allowed fails (seconds). | +| `disable_cooldowns` | Boolean | `false` | Disable model cooldown mechanism entirely. | +| `retry_policy` | Object | Default Retry Policy | Retry policy for different error types. | +| `allowed_fails_policy` | Object | Default Allowed Fails Policy | Allowed fails policy per error type. | +| `fallbacks` | Array of Objects | `[]` | Fallback models for general errors. | +| `content_policy_fallbacks` | Array of Objects | `[]` | Fallback models for ContentPolicyViolationError. | +| `default_fallbacks` | Array of Strings | `[]` | Default fallback models for unhandled errors. | +| `max_fallbacks` | Integer | `5` | Max number of fallback models to try. | +| `num_retries` | Integer | `3` | Number of retry attempts for a request. | +| `model_group_retry_policy` | Object | `None` | Retry policy per model group *(SDK-only/advanced)*. | +| **Timeouts & Debugging** | | | | +| `timeout` | Float | `600` (10 minutes) | Default timeout for a request (seconds). | +| `stream_timeout` | Float | Inherits from `timeout` | Default timeout for streaming requests (seconds). | +| `debug_level` | String | `"INFO"` | Logging verbosity level for router logs (`"DEBUG"` or `"INFO"`). | +| `set_verbose` | Boolean | `false` | *Deprecated* - Use `litellm_settings.set_verbose` instead. | + +### Routing Strategy & Model Selection + +Within `router_settings`, this subsection defines the algorithms and parameters used to select the appropriate backend model deployment for each incoming request. This includes strategies like round-robin, least-busy, latency-based, and custom routing. + +#### `routing_strategy` + +**YAML Key:** `routing_strategy` + +**Type:** String + +**Environment Variable:** N/A + +**Default Value:** `"simple-shuffle"` (Simple shuffle routing strategy is used by default). + +**Description:** The `routing_strategy` parameter, within `router_settings`, determines the **algorithm or strategy** that the LiteLLM Proxy Router will use to **route incoming requests** to different backend model deployments or providers, especially when you have multiple deployments configured for the same `model_name` alias in your `model_list`. + +* **Model Selection Algorithm:** `routing_strategy` dictates *how* the proxy chooses which backend model instance to send a request to out of the available options for a given `model_name`. +* **Load Balancing and Optimization:** Different routing strategies are designed to achieve different goals, such as: + * **Load Balancing:** Distributing traffic evenly across deployments to prevent overload and ensure fair resource utilization. + * **Performance Optimization:** Routing requests to the fastest or least busy deployments to minimize latency and improve response times. + * **Cost Optimization:** Routing requests to the least expensive deployments to reduce overall LLM costs. + * **Reliability and Failover:** Strategies that consider model health and availability to route requests away from unhealthy or failing deployments. +* **Supported Routing Strategies:** Valid values for `routing_strategy` include: + * `"simple-shuffle"`: (Default). A basic **round-robin style** strategy. It shuffles the list of available deployments for a given model alias and then iterates through them sequentially, distributing requests in a roughly even manner. If weights are assigned to deployments, the shuffle is weighted accordingly. 
+ * `"least-busy"`: Routes requests to the model deployment that currently has the **fewest active (concurrent) requests**. This strategy aims to balance load by directing traffic to less loaded deployments. + * `"usage-based-routing"`: Routes requests based on **usage metrics**, such as token consumption or request count. This strategy attempts to balance load based on actual resource utilization, potentially favoring deployments with lower current usage. *Note: The documentation also mentions `"usage-based-routing-v2"`, which might be a newer or improved version of usage-based routing, potentially using Redis for asynchronous usage tracking. Check the documentation for your specific LiteLLM Proxy version for details on available usage-based routing options.* + * `"latency-based-routing"`: Routes requests to the model deployment that has exhibited the **lowest recent response latency**. This strategy prioritizes performance by directing traffic to the fastest-responding deployments. + * `"cost-based-routing"`: Routes requests to the **least expensive** model deployment that is capable of handling the request. This strategy prioritizes cost optimization by choosing the most cost-effective option among available models. + * `"custom"`: Allows you to implement and plug in a **completely custom routing strategy** by defining a Python class that inherits from `CustomRoutingStrategyBase` and specifying the path to your custom routing class using the `custom_routing_strategy_path` parameter in `router_settings`. + +**Example YAML (Setting Routing Strategy to Least Busy):** + +```yaml +router_settings: + routing_strategy: "least-busy" # Setting routing strategy to "least-busy" - route to least loaded deployment + # ... other router_settings ... +``` + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Recommendation:** The **`"simple-shuffle"` strategy is a good default** for basic load balancing. For more advanced scenarios, consider: + +* `"least-busy"`: For load balancing based on current concurrency. +* `"latency-based-routing"`: For optimizing response time and user experience. +* `"usage-based-routing"` (or `"usage-based-routing-v2"`): For load balancing based on resource utilization or for cost optimization (if cost is correlated with usage). +* `"cost-based-routing"`: For direct cost optimization, always choosing the least expensive option. + +The optimal `routing_strategy` depends on your specific application requirements, performance goals, cost constraints, and the characteristics of your backend LLM deployments. Test different strategies and monitor performance metrics to determine the best routing approach for your use case. For very specialized routing logic, consider implementing a `"custom"` routing strategy. + +--- + +#### `model_group_alias` (`router_settings`) + +**YAML Key:** `model_group_alias` + +**Type:** Object (Mapping of string to string) + +**Environment Variable:** N/A + +**Default Value:** `None` (No model group aliases are defined by default). + +**Description:** The `model_group_alias` parameter, within `router_settings`, is a **mapping** (dictionary) that allows you to define **aliases or alternative names for model groups** specifically within the proxy router's context. This is similar to the `model_group_alias` under `general_settings`, but scoped to the router's operations. 
+ +* **Router-Specific Model Group Remapping:** `router_settings.model_group_alias` provides a way to create aliases or alternative names for model groups that are used **specifically by the proxy router for routing decisions, health checks, and other router-level operations**. This can be useful for: + * **Routing Strategy Consistency:** Ensuring that routing strategies (like custom routing logic) consistently refer to model groups using specific names, even if the user-facing `model_name` aliases in `model_list` are different or change. + * **Simplified Router Configuration:** Making router configurations more readable or manageable by using shorter or more descriptive aliases for model groups within the `router_settings` section. + * **Internal Router Logic Abstraction:** Abstracting away the actual `model_name` aliases from the router's internal logic, allowing you to change the user-facing aliases in `model_list` without affecting router configurations that rely on model group names. + +**Example YAML (`router_settings.model_group_alias`):** + +```yaml +router_settings: + routing_strategy: "latency-based-routing" # Using latency-based routing strategy + model_group_alias: # Router-specific model group aliases + "claude-2": "claude-latest-group" # Mapping "claude-2" alias to "claude-latest-group" for router + # ... other router_settings ... +model_list: + - model_name: claude-2 # User-facing model name alias + litellm_params: + model: anthropic/claude-2 + api_key: "os.environ/ANTHROPIC_API_KEY" + # ... other litellm_params ... +``` + +In this example, `router_settings.model_group_alias: {"claude-2": "claude-latest-group"}` maps the `model_name` alias `"claude-2"` to the internal router model group name `"claude-latest-group"`. When the proxy router makes routing decisions (especially with latency-based routing in this example) or performs health checks, it will internally use the name `"claude-latest-group"` to refer to the model group associated with the user-facing alias `"claude-2"`. Client applications will still use `"claude-2"` as the `model_name` in their API requests. + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Note:** `router_settings.model_group_alias` is primarily for internal routing and management within the proxy router. It is an advanced configuration option that is typically only needed in complex routing scenarios or when you want to create more abstract or consistent naming conventions for model groups within the router's configuration. In most simpler setups, you can likely omit `router_settings.model_group_alias` and rely on the `model_name` aliases directly for routing decisions. + +--- + +#### `default_litellm_params` (`router_settings`) + +**YAML Key:** `default_litellm_params` + +**Type:** Object (Mapping) + +**Environment Variable:** N/A + +**Default Value:** `None` (No default `litellm_params` are applied by default). + +**Description:** The `default_litellm_params` parameter, within `router_settings`, is an **optional mapping** (object) that allows you to define a set of **default `litellm_params`** that will be **automatically applied to *all* requests** that are routed through the proxy router. + +* **Global Request Parameter Defaults:** `default_litellm_params` provides a way to set global default values for `litellm_params` that are applied to every request, unless explicitly overridden in the individual model configurations (in `model_list`) or in the client's API request. 
+* **Enforcing Global Settings:** Use `default_litellm_params` to enforce certain behaviors or settings across all requests routed through the proxy, such as: + * **Default Temperature:** Set a global default `temperature` value (e.g., `0.0` for more deterministic output) that will be applied to all requests unless the client or model configuration overrides it. + * **Default `max_tokens`:** Set a global default `max_tokens` limit as a safety measure to prevent excessively long responses across all models. + * **Other Global Parameters:** You can potentially set other valid `litellm_params` as defaults here, depending on your needs and the capabilities of the LiteLLM library and backend providers. +* **Overridable by Model or Request:** Default parameters set in `default_litellm_params` can be **overridden** by: + * `litellm_params` settings defined for specific models in your `model_list`. If a model in `model_list` has its own `temperature` setting, for example, that model-specific setting will take precedence over the global default from `default_litellm_params`. + * Parameters explicitly provided in the client's API request. If a client sends a request with a `temperature` parameter, that request-specific `temperature` value will override both the model-level setting (if any) and the global default from `default_litellm_params`. + +**Example YAML (Setting a Global Default Temperature):** + +```yaml +router_settings: + default_litellm_params: # Setting default litellm_params for all routed requests + temperature: 0.0 # Global default temperature set to 0.0 (deterministic output) + # ... other router_settings ... +# Example model_list (showing how model-specific settings can override defaults): +model_list: + - model_name: gpt-3.5-turbo-creative + litellm_params: + model: azure/gpt-turbo-small-eu + api_key: "os.environ/AZURE_API_KEY_EU" + temperature: 0.7 # Model-specific temperature (overrides global default) + # ... other litellm_params ... + - model_name: gpt-3.5-turbo-precise + litellm_params: + model: openai/gpt-3.5-turbo + api_key: "os.environ/OPENAI_API_KEY" + # temperature: (not set here, will use global default of 0.0 from router_settings.default_litellm_params) + # ... other litellm_params ... +``` + +In this example, `router_settings.default_litellm_params: {temperature: 0.0}` sets a global default temperature of `0.0`. However, the `gpt-3.5-turbo-creative` model deployment overrides this global default by specifying its *own* `temperature: 0.7` in its `litellm_params`. The `gpt-3.5-turbo-precise` model, on the other hand, does *not* define its own `temperature`, so it will inherit the global default temperature of `0.0` from `router_settings.default_litellm_params`. + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Note:** `default_litellm_params` is a convenient way to enforce global default settings for requests routed through the proxy. However, remember that these defaults can be overridden at the model level in `model_list` or at the request level by the client application, providing flexibility when needed. + +--- + +#### `default_max_parallel_requests` (`router_settings`) + +**YAML Key:** `default_max_parallel_requests` + +**Type:** Integer + +**Environment Variable:** N/A + +**Default Value:** `None` (No default per-deployment parallel request limit unless explicitly set in `general_settings` or `router_settings`). 
+ +**Description:** The `default_max_parallel_requests` parameter, within `router_settings`, sets a **default `max_parallel_requests` limit** that the proxy router will apply to **all model deployments** defined in your `model_list`, *unless* a specific `max_parallel_requests` limit is already defined for a model in `general_settings` or directly within the `model_list` entry itself. + +* **Router-Level Default Concurrency Limit:** `default_max_parallel_requests` provides a way to set a common concurrency limit across all model deployments managed by the proxy router, in cases where you don't want to configure `max_parallel_requests` individually for each model. +* **Fallback for Missing Model-Specific Limits:** If a model deployment in your `model_list` **does not have its own `max_parallel_requests`** parameter defined (either in `general_settings` or within the `model_list` entry), the router will **fall back to using the `default_max_parallel_requests` value** from `router_settings` to limit concurrent requests for that model. +* **Global Default Override:** If a model *does* have a `max_parallel_requests` value defined in `general_settings` or its `model_list` entry, that **model-specific limit will take precedence** over the `default_max_parallel_requests` from `router_settings`. `default_max_parallel_requests` only applies when a model-specific limit is *not* already defined. + +**Example YAML (Setting a Router-Level Default Parallel Request Limit):** + +```yaml +router_settings: + default_max_parallel_requests: 4 # Setting default parallel request limit to 4 for all deployments without specific limits + # ... other router_settings ... +model_list: + - model_name: gpt-3.5-turbo-fast + litellm_params: + model: azure/gpt-turbo-small-eu + api_key: "os.environ/AZURE_API_KEY_EU" + max_parallel_requests: 20 # Model-specific limit (overrides router default) + # ... other litellm_params ... + - model_name: gpt-4-slow + litellm_params: + model: openai/gpt-4-deployment + api_key: "os.environ/AZURE_API_KEY_EU" + # No max_parallel_requests defined here - will use router's default of 4 + # ... other litellm_params ... +``` + +In this configuration, `default_max_parallel_requests: 4` sets a router-level default limit of 4 concurrent requests. The `gpt-3.5-turbo-fast` deployment overrides this default with its own `max_parallel_requests: 20`. The `gpt-4-slow` deployment, however, does not define a specific limit, so it will inherit the router's default limit of 4 concurrent requests. + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Note:** `default_max_parallel_requests` is a convenient way to set a common concurrency baseline across all your model deployments. It reduces the need to individually configure `max_parallel_requests` for each model if you want to enforce a similar concurrency policy across your proxy. However, you can still override this default on a per-model basis as needed. + +--- + +#### `default_priority` (`router_settings`) + +**YAML Key:** `default_priority` + +**Type:** Integer + +**Environment Variable:** N/A + +**Default Value:** `None` (No default priority is assigned by default). + +**Description:** The `default_priority` parameter, within `router_settings`, sets a **default priority level** for requests that are processed by the proxy router's **scheduler**. 
This parameter is relevant primarily when you are using LiteLLM Proxy's **request scheduling features** (e.g., using `.scheduler_acompletion()` in the SDK or any other queued request handling mechanisms). + +* **Request Prioritization for Scheduler:** When request scheduling is in use, the proxy router might use a priority queue to manage incoming requests, especially under high load or when resources are constrained. The `default_priority` parameter allows you to assign a default priority level to requests that do not explicitly specify their own priority. +* **Priority Value Interpretation:** The interpretation of the `default_priority` value (and priority values in general) is **implementation-dependent**. Typically, a **lower numerical value indicates a *higher* priority**, while a higher value indicates a lower priority. For example, `priority: 0` might represent the highest priority, `priority: 1` a medium priority, `priority: 2` a lower priority, and so on. Check the documentation for the specific scheduling or priority queue implementation used by your LiteLLM Proxy version to confirm how priority values are interpreted. +* **Default Priority Assignment:** If `default_priority` is set, any request that does not explicitly specify a `priority` parameter will be assigned this default priority level when it is added to the scheduler queue. +* **Optional Priority Setting:** `default_priority` is **optional**. If you do not set it, requests that do not specify a priority might be treated as having a **default priority of `None`**, or a specific default priority level as defined by the scheduler implementation (check documentation for default scheduler behavior in your version). + +**Example YAML (Setting Default Request Priority):** + +```yaml +router_settings: + default_priority: 10 # Setting default request priority to 10 (medium priority - assuming lower is higher priority) + # ... other router_settings ... +``` + +In this example, any request that is added to the scheduler queue and does not explicitly specify a `priority` parameter will be assigned a default priority level of `10`. + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Note:** `default_priority` is only relevant when you are actively using the LiteLLM Proxy's **request scheduling features**. If you are not using request scheduling, this parameter will have no effect. If you are using scheduling, you can use `default_priority` to establish a baseline priority level for most requests, and then override or adjust the priority for specific requests as needed using the `priority` parameter in the API call or SDK request options. Consult the documentation for the specific request scheduling mechanisms and priority queue implementation used by your LiteLLM Proxy version for details on priority value interpretation and behavior. + +--- + +#### `polling_interval` (`router_settings`) + +**YAML Key:** `polling_interval` + +**Type:** Float (representing seconds) + +**Environment Variable:** N/A + +**Default Value:** `0.003` (0.003 seconds, or 3 milliseconds, if not set). + +**Description:** The `polling_interval` parameter, within `router_settings`, controls the **frequency at which the proxy router polls the request queue** for new requests when using request scheduling. This parameter is relevant primarily when you are using LiteLLM Proxy's **request scheduling features** (e.g., using `.scheduler_acompletion()` in the SDK or any other queued request handling mechanisms). 
+ +* **Queue Polling Frequency:** When request scheduling is enabled, incoming requests are typically added to a queue. The proxy router periodically checks this queue to see if there are new requests waiting to be processed. `polling_interval` determines **how often** the router performs this queue check, measured in **seconds**. +* **Responsiveness vs. Resource Usage Trade-off:** + * **Lower `polling_interval` (e.g., 0.003 seconds, the default):** Results in **more frequent queue polling**. This makes the proxy **more responsive to new incoming requests**, as it checks the queue very frequently and can start processing new requests almost immediately after they arrive in the queue. However, very frequent polling can also consume **more CPU resources** on the proxy server, as the router is constantly waking up and checking the queue. + * **Higher `polling_interval` (e.g., 0.1 seconds, 0.5 seconds, or higher):** Reduces the frequency of queue polling. This decreases the CPU overhead of queue polling on the proxy server. However, it also means that the proxy might be **slightly less responsive to new incoming requests**, as it will check the queue less often, and there might be a slightly longer delay before a newly added request starts to be processed. +* **Default Polling Interval:** The default `polling_interval` is set to `0.003 seconds` (3 milliseconds). This is a very short interval, indicating **very frequent polling** and a focus on minimizing latency and maximizing responsiveness to new requests. + +**Example YAML (Adjusting Polling Interval):** + +```yaml +router_settings: + polling_interval: 0.01 # Setting polling interval to 0.01 seconds (10 milliseconds) - less frequent polling + # ... other router_settings ... +``` + +In this example, the proxy router will poll the request queue every 0.01 seconds (10 milliseconds), which is still quite frequent, but less frequent than the default 0.003 seconds. + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Recommendation:** The default `polling_interval` of 0.003 seconds is generally optimized for **high responsiveness and low latency** in request scheduling. You typically do not need to change this unless you are experiencing: + +* **High CPU Usage on Proxy Server:** If you observe high CPU utilization on your proxy server, especially when using request scheduling, it might indicate that the very frequent queue polling is contributing to CPU overhead. In such cases, you can try *increasing* `polling_interval` slightly to reduce CPU usage, accepting a potential slight increase in request latency. +* **Specific Performance Tuning Needs:** In very specialized performance tuning scenarios, you might experiment with slightly adjusting `polling_interval` to fine-tune the balance between responsiveness and resource consumption based on your specific workload and infrastructure. However, for most typical deployments, the default value is likely optimal. + +**Caution:** Setting `polling_interval` to a *very high* value (e.g., 1 second or more) can significantly **reduce the responsiveness of request scheduling**, as the proxy will only check the queue infrequently, potentially leading to longer delays before new requests are processed, especially under high load. Avoid setting `polling_interval` to very large values unless you have a specific reason to do so and understand the impact on request latency. 
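+
+**Example YAML (Scheduler-Related Settings Together - Illustrative):** The sketch below simply places the two scheduler-related settings documented above (`default_priority` and `polling_interval`) side by side; the values shown are illustrative, and neither setting has any effect unless request scheduling is actually in use.
+
+```yaml
+router_settings:
+  default_priority: 10      # Assumed convention: lower value = higher priority (see `default_priority` above)
+  polling_interval: 0.003   # Default queue polling interval (3 milliseconds), shown explicitly for clarity
+  # ... other router_settings ...
+```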
+ +--- + +#### `caching_groups` (`router_settings`) + +**YAML Key:** `caching_groups` + +**Type:** List of Tuples (List of string tuples, e.g., `[["model_name_1", "model_name_2"], ["model_name_3", "model_name_4"]]`) + +**Environment Variable:** N/A + +**Default Value:** `[]` (Empty list, no caching groups defined by default). + +**Description:** The `caching_groups` parameter, within `router_settings`, is a **list of tuples** that allows you to define **groups of model names** that should **share the same cache**. This is useful when you have multiple model deployments that are functionally equivalent or interchangeable in terms of their output, and you want them to share a common cache to maximize cache hit rates and reduce redundant API calls. + +* **Shared Cache for Interchangeable Models:** `caching_groups` is designed for scenarios where you have **multiple deployments of the *same* or *semantically equivalent* model**, but they are represented by different `model_name` aliases in your `model_list`. Examples include: + * **Load-Balanced Deployments:** You have multiple instances of the same model (e.g., multiple replicas of `gpt-3.5-turbo` running for load balancing). You might want them to share a common cache so that a cache hit from one instance can benefit all instances. + * **Provider Redundancy:** You are using deployments of the same model from different providers (e.g., an Azure OpenAI deployment and an OpenAI direct deployment of GPT-3.5 Turbo). If the models are functionally interchangeable, you can configure them to share a cache. + * **Model Aliases for Different Versions:** You have model aliases that represent different versions or deployments of the same underlying model. If responses from different versions are considered cacheable for each other, you can group them to share a cache. +* **List of Model Name Tuples:** The `caching_groups` parameter is a **list**. Each item in the list is a **tuple of strings**. Each tuple represents a **group of `model_name` aliases** that should share a cache. All model names listed in a tuple must be valid `model_name` aliases defined in your `model_list`. +* **Cache Key Sharing:** When models are grouped in `caching_groups`, the proxy will treat them as sharing a **common cache space**. If a request comes in for any model in a group, the proxy will check the shared cache for a matching entry. If a cache hit is found (from a previous request to *any* model in the same group), the cached response can be used, regardless of which specific model within the group was originally called. +* **Cache Consistency Assumption:** `caching_groups` assumes that the models within a group are **functionally interchangeable in terms of their output**. Using a shared cache for models that produce significantly different responses or have different behavior might lead to incorrect or unexpected cached results. Only group models that are truly semantically equivalent or interchangeable for caching purposes. + +**Example YAML (Defining a Caching Group for OpenAI and Azure GPT-3.5 Turbo Deployments):** + +```yaml +router_settings: + caching_groups: # Defining caching groups for shared caches + - ["gpt-3.5-turbo-azure", "gpt-3.5-turbo-openai"] # Grouping Azure and OpenAI GPT-3.5 Turbo aliases to share a cache + # ... other router_settings ... +model_list: + - model_name: gpt-3.5-turbo-azure + litellm_params: + model: azure/gpt-turbo-small-eu + api_key: "os.environ/AZURE_API_KEY_EU" + # ... other litellm_params ... 
+ - model_name: gpt-3.5-turbo-openai + litellm_params: + model: openai/gpt-3.5-turbo + api_key: "os.environ/OPENAI_API_KEY" + # ... other litellm_params ... +``` + +In this example, `caching_groups: - ["gpt-3.5-turbo-azure", "gpt-3.5-turbo-openai"]` defines a caching group containing the `model_name` aliases `"gpt-3.5-turbo-azure"` and `"gpt-3.5-turbo-openai"`. This means that if a request comes in for either `"gpt-3.5-turbo-azure"` or `"gpt-3.5-turbo-openai"`, the proxy will check a *shared* cache for both models. If a cache hit is found (from a previous request to *either* model), the cached response can be used to serve the current request, regardless of which of the two model aliases was originally requested. + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Recommendation:** Use `caching_groups` strategically when you have multiple deployments of functionally equivalent models and want to maximize cache hit rates and cost savings by sharing a common cache across these deployments. Only group models that are truly interchangeable in terms of their output for caching purposes to avoid serving irrelevant or inappropriate cached responses. + +--- + +#### `assistants_config` + +**YAML Key:** `assistants_config` + +**Type:** Object (Mapping) + +**Environment Variable:** N/A + +**Default Value:** `None` (Assistants API configuration is not enabled by default). + +**Description:** The `assistants_config` parameter, within `router_settings`, is an **advanced/enterprise-level mapping** (object) that is mentioned in the documentation in the context of "Alerting Config" and "ServiceAccountSettings". However, the provided documentation snippets **do not offer detailed information or examples of how to configure or use `assistants_config`** in `router_settings`. + +* **Likely Related to Assistants API (Advanced/Enterprise):** The name `assistants_config` suggests that this parameter might be related to configuring integration with **"Assistants API" or similar advanced features** that are likely part of LiteLLM Proxy's enterprise edition. These features might involve: + * Multi-turn conversational assistants. + * Stateful sessions or conversation history management. + * Advanced routing or orchestration of requests within conversational contexts. +* **Advanced/Enterprise Feature – Limited Documentation:** Based on the limited documentation snippets, `assistants_config` appears to be an **advanced/enterprise feature** that is not fully documented in the standard documentation. Detailed configuration instructions, available options, and use cases for `assistants_config` are likely found in **enterprise-specific documentation or support materials**. +* **No Publicly Available Details:** Without further documentation, it is **not possible to provide a precise description of how to configure or use `assistants_config`**. The provided snippets only hint at its existence and possible relation to advanced features. + +**Example YAML:** There is no example YAML configuration for `assistants_config` in the provided documentation snippets. You would need to consult the **enterprise documentation** for LiteLLM Proxy to find configuration examples and usage details for `assistants_config`. + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Recommendation:** If you are interested in using `assistants_config`, you should: + +1. 
**Check Enterprise Documentation:** Consult the **LiteLLM Proxy Enterprise documentation** for detailed information on Assistants API integration and `assistants_config`. Look for sections related to "Advanced Features", "Enterprise Features", "Assistants API", or similar topics. +2. **Contact Enterprise Support:** If the documentation is insufficient, contact LiteLLM Enterprise support channels for guidance and configuration instructions for `assistants_config`. +3. **Limited Information in Public Docs:** Be aware that publicly available documentation for `assistants_config` might be limited, as it is likely an enterprise-level feature with restricted documentation access. + +**In Summary:** `assistants_config` is likely a placeholder for configuring advanced, enterprise-level features related to Assistants APIs within the LiteLLM Proxy Router. However, detailed configuration and usage information is **not available in the standard documentation** snippets provided and likely requires consulting enterprise-specific documentation or support resources. It is marked as an **advanced/enterprise feature**. + +--- + +### Multi-Instance Coordination & Scaling + +This subsection, inside `router_settings`, configures settings related to running multiple instances of the LiteLLM Proxy, such as connecting to Redis for shared state and load balancing. + +#### `redis_host` (`router_settings`) + +**YAML Key:** `redis_host` + +**Type:** String + +**Environment Variable:** N/A + +**Default Value:** `None` (Not required unless using multi-instance coordination features). + +**Description:** The `redis_host` parameter, within `router_settings`, is used to configure **multi-instance coordination** for the LiteLLM Proxy Server. It specifies the **hostname or IP address of a Redis server** that will be used for shared state management across multiple proxy instances. + +* **Multi-Instance Proxy Deployments:** `redis_host`, along with `redis_port` and `redis_password`, is relevant when you are running **multiple instances of the LiteLLM Proxy Server** (e.g., for horizontal scaling, high availability, or load balancing). In a multi-instance setup, each proxy instance typically runs as a separate process or container. +* **Shared State Management via Redis:** To enable coordination and shared state across these multiple proxy instances, they can be configured to connect to a **common Redis database**. Redis acts as a central store for shared data that needs to be consistent across all proxy instances. +* **Coordination Features Enabled by Redis:** When Redis is configured in `router_settings` (via `redis_host`, `redis_port`, `redis_password`): + * **Distributed Rate Limiting (RPM/TPM):** Rate limits (RPM/TPM) defined in `model_list` or via API can be enforced **globally across all proxy instances**. Without Redis, rate limits are typically enforced per-instance, meaning each proxy instance would have its own independent rate limit counters. With Redis, the proxy instances can share rate limit usage information, ensuring that the RPM/TPM limits are applied consistently across the entire proxy cluster. + * **Shared Health Check Data:** Health check information (model status, latency metrics, etc.) can be shared across proxy instances, allowing for more coordinated routing and failover decisions in load-balanced setups. +* **Redis Server Address:** `redis_host` specifies the hostname or IP address of your Redis server. This should be the address that is accessible by all proxy instances in your deployment. 
+ +**Example YAML (Redis Host for Multi-Instance Coordination):** + +```yaml +router_settings: + redis_host: "redis-server.example.com" # Hostname or IP of the shared Redis server for coordination + redis_port: 6379 # Redis port + redis_password: "redis_password" # Redis password (if required) + # ... other router_settings ... +``` + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Note:** `redis_host` is used in conjunction with `redis_port` and `redis_password` to fully configure the Redis connection for multi-instance coordination. Redis is **required** for distributed rate limiting and shared state management across multiple proxy instances. If you are running a single proxy instance or do not need distributed coordination, you can omit `redis_host`, `redis_port`, and `redis_password`. + +--- + +#### `redis_port` (`router_settings`) + +**YAML Key:** `redis_port` + +**Type:** Integer + +**Environment Variable:** N/A + +**Default Value:** `None` (Not required unless using multi-instance coordination features and `redis_host` is set). + +**Description:** The `redis_port` parameter, within `router_settings`, is used in conjunction with `redis_host` to configure **multi-instance coordination** for the LiteLLM Proxy Server. It specifies the **port number** of the Redis server that will be used for shared state management. + +* **Redis Port Number:** Provide the port number on which your Redis server is listening for connections. The standard default Redis port is `6379`. +* **Required with** `redis_host` **for Redis Coordination:** `redis_port` is **required** when you are configuring Redis for multi-instance coordination (i.e., when `redis_host` is also set). If `redis_host` is provided but `redis_port` is missing, the proxy will not be able to connect to Redis. +* **Default Redis Port (If Applicable):** If your Redis server is using the **default port `6379`**, you *might* be able to omit `redis_port` in some LiteLLM Proxy versions, and it might default to 6379. However, it is **best practice to always explicitly specify `redis_port`** even if you are using the default port, to ensure clarity and avoid potential issues if the default port changes in the future. + +**Example YAML (Redis Host and Port for Coordination):** + +```yaml +router_settings: + redis_host: "redis-server.example.com" + redis_port: 6379 # Specifying Redis server port + redis_password: "redis_password" + # ... other router_settings ... +``` + +In this example, `redis_port: 6379` specifies that the Redis server is listening on port 6379. + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Note:** `redis_port` is always used in conjunction with `redis_host` to define the full network address of your Redis server for multi-instance proxy coordination. If you are not using multi-instance coordination, you can omit `redis_host` and `redis_port`. + +--- + +#### `redis_password` (`router_settings`) + +**YAML Key:** `redis_password` + +**Type:** String + +**Environment Variable:** N/A + +**Default Value:** `None` (No password is assumed by default). + +**Description:** The `redis_password` parameter, within `router_settings`, is used in conjunction with `redis_host` and `redis_port` to configure **multi-instance coordination**. It specifies the **password for your Redis server**, if your Redis instance is configured with password-based authentication. 
+ +* **Redis Authentication Password:** If your Redis server requires a password for client connections (which is highly recommended for security in production), you **must** provide the correct password using this parameter. +* **Redis Security Best Practice:** Securing your Redis instance with a password is a crucial security measure to prevent unauthorized access to your shared cache data and coordination state. +* **Omit for Passwordless Redis:** If your Redis server does *not* require a password (e.g., in a development or trusted network environment), you can omit this parameter. Do *not* provide a password if your Redis server is not configured to require one, as this could cause connection errors. + +**Example YAML (Redis Host, Port, and Password for Coordination):** + +```yaml +router_settings: + redis_host: "redis-server.example.com" + redis_port: 6379 + redis_password: "redis_password" # Providing Redis password for authentication + # ... other router_settings ... +``` + +In this example, `redis_password: "redis_password"` provides the password for authenticating with the Redis server. + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Security Note:** For enhanced security, consider storing the Redis password in an environment variable or a secure secret management system and referencing it in your `config.yaml` using `"os.environ/REDIS_PASSWORD"` instead of hardcoding the password directly in the YAML file. + +--- + +#### `redis_url` (`router_settings`) + +**YAML Key:** `redis_url` + +**Type:** String (Redis connection URL) + +**Environment Variable:** N/A + +**Default Value:** `None` (Not required unless using multi-instance coordination features). + +**Description:** The `redis_url` parameter, within `router_settings`, provides an **alternative way to configure the Redis connection** for multi-instance coordination. Instead of specifying `redis_host`, `redis_port`, and `redis_password` separately, you can provide a **single Redis connection URL** using the `redis_url` parameter. + +* **Single Connection String:** `redis_url` allows you to specify all Redis connection details (host, port, password, database, etc.) in a single, standardized URL string. This can simplify configuration in some cases. +* **Redis Connection URL Format:** The `redis_url` string should be a valid Redis connection URL, typically following the format: `redis://[:password@]host[:port][/database]`. Examples include: + * `"redis://localhost:6379"` (No password, default port) + * `"redis://:password@redis.example.com:6380/0"` (With password, non-default port, and database index 0) + * `"redis://redis-cluster.example.com:7001"` (For Redis Cluster, though using `redis_startup_nodes` is generally recommended for Redis Cluster). +* **Alternative to Separate Host/Port/Password:** `redis_url` is an **alternative to using `redis_host`, `redis_port`, and `redis_password`**. You should use *either* `redis_url` *or* the separate host/port/password parameters, but **not both**. If you provide `redis_url`, the proxy will ignore `redis_host`, `redis_port`, and `redis_password`. + +**Example YAML (Redis Connection via `redis_url`):** + +```yaml +router_settings: + redis_url: "redis://:your_redis_password@redis.example.com:6380/0" # Redis connection URL + # ... other router_settings ... +``` + +In this example, all Redis connection details (host, port, password, database index) are provided in a single `redis_url` string. 
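+
+**Example YAML (Separate Redis Fields with Password from an Environment Variable - Illustrative):** A sketch of a similar connection expressed with the separate fields, with the password referenced via the `os.environ/` convention suggested in the security note under `redis_password` above (this assumes `REDIS_PASSWORD` is set in the proxy's environment):
+
+```yaml
+router_settings:
+  redis_host: "redis.example.com"
+  redis_port: 6380
+  redis_password: "os.environ/REDIS_PASSWORD"  # Resolved from the REDIS_PASSWORD environment variable
+  # ... other router_settings ...
+```
+
+This keeps the password out of `config.yaml` itself, in line with the security note above.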
+ +**Example Environment Variable:** N/A (YAML-only parameter) + +**Note:** While `redis_url` can simplify configuration, the documentation mentions a **potential performance issue** with using a single URL string compared to providing separate host/port fields. For optimal performance, especially in high-throughput scenarios, it might be **recommended to use the separate `redis_host`, `redis_port`, and `redis_password` parameters** instead of `redis_url`. However, for simpler setups or when convenience is prioritized, `redis_url` can be a valid option. If you encounter performance issues with Redis coordination, consider switching to separate host/port/password configuration. + +--- + +#### `client_ttl` (`router_settings`) + +**YAML Key:** `client_ttl` + +**Type:** Integer (representing seconds) + +**Environment Variable:** N/A + +**Default Value:** `3600` (3600 seconds, or 1 hour, if not set). + +**Description:** The `client_ttl` parameter, within `router_settings`, specifies the **time-to-live (TTL) in seconds for cached HTTP client connections** that the proxy router maintains for backend LLM providers. + +* **HTTP Connection Caching:** To improve performance and reduce connection overhead, the proxy router can **cache HTTP client connections** to backend LLM providers. Instead of establishing a new HTTP connection for every request, the proxy can reuse existing, cached connections for subsequent requests to the same provider. +* **Connection Reusability Duration:** `client_ttl` determines **how long these cached HTTP client connections should be kept alive and reused** before they are considered **expired**. After the `client_ttl` duration has elapsed since a connection was cached, the proxy will discard the cached connection and establish a new connection for the next request. +* **Balancing Connection Reuse and Freshness:** + * **Longer `client_ttl` (e.g., 3600 seconds/1 hour, the default):** Maximizes **connection reuse**. Cached connections are kept alive for a longer duration, potentially improving performance by reducing connection setup overhead, especially for providers that are frequently called. However, very long TTLs might lead to connections becoming stale or less efficient over time if network conditions or server configurations change. + * **Shorter `client_ttl` (e.g., 600 seconds/10 minutes or less):** Results in **more frequent connection refreshes**. Cached connections are discarded and re-established more often. This might be useful in environments with frequently changing network conditions or if you want to ensure connections are relatively fresh. However, it can slightly increase connection setup overhead and potentially reduce performance benefits from connection caching. +* **Default TTL (1 Hour):** The default `client_ttl` is `3600 seconds` (1 hour). This is often a reasonable balance for many use cases, providing good connection reuse while still refreshing connections periodically. + +**Example YAML (Adjusting Client Connection TTL):** + +```yaml +router_settings: + client_ttl: 1800 # Setting client connection TTL to 1800 seconds (30 minutes) - shorter TTL + # ... other router_settings ... +``` + +In this example, cached HTTP client connections will be kept alive and reused for a maximum of 1800 seconds (30 minutes). After 30 minutes, they will be discarded and new connections will be established. 
+ +**Example Environment Variable:** N/A (YAML-only parameter) + +**Recommendation:** The default `client_ttl` of 3600 seconds (1 hour) is generally suitable for most deployments. You typically do not need to change this unless you are experiencing: + +* **Connection Issues or Stale Connections:** If you observe issues related to stale or broken HTTP connections after long periods of proxy uptime, you can try *decreasing* `client_ttl` to force more frequent connection refreshes. +* **Performance Tuning - Connection Overhead:** If you want to aggressively minimize connection setup overhead and maximize connection reuse, you *could* try increasing `client_ttl` to a longer duration (e.g., several hours), but monitor for potential issues with stale connections over very long TTLs. + +For most typical LLM API use cases, the default `client_ttl` is likely optimal, and you generally do not need to adjust it unless you have specific performance or connection-related concerns. + +--- + +#### `cache_responses` (`router_settings`) + +**YAML Key:** `cache_responses` + +**Type:** Boolean + +**Environment Variable:** N/A + +**Default Value:** `false` (Router-level response caching is disabled by default). + +**Description:** The `cache_responses` parameter, within `router_settings`, is a **boolean flag** that controls whether the proxy router should **enable caching of LLM responses *at the router level***. + +* **Router-Level Caching (Optional):** By default (`cache_responses: false`), the proxy router itself does **not** actively cache LLM responses. Caching, if enabled, is typically configured and managed at the **LiteLLM library level** using the `cache` and `cache_params` settings within `litellm_settings`. +* **Enabling Router Cache (Alternative Caching Mechanism):** Setting `cache_responses: true` will instruct the proxy router to **also implement its own caching layer**. This router-level cache would be *in addition to* any caching configured via `litellm_settings.cache`. It's essentially a **second layer of caching**. +* **Redundant or Specialized Caching Layer:** Enabling `cache_responses: true` is generally **not recommended or needed in most typical deployments**. The primary and recommended caching mechanism in LiteLLM Proxy is the `litellm_settings.cache` configuration, which provides flexible and configurable caching options. `router_settings.cache_responses: true` might be used in **very specific or advanced scenarios** where you might want: + * **Redundant Caching:** A secondary caching layer at the router level, perhaps for specific routing strategies or to provide an extra layer of caching beyond the general LiteLLM library caching. + * **Custom Routing-Specific Caching Logic:** In combination with custom routing strategies (`routing_strategy: "custom"`), you might implement custom caching logic within your routing strategy and use `router_settings.cache_responses: true` to signal to the router that it should participate in caching. However, even in custom routing scenarios, it's often simpler and more manageable to rely on the standard `litellm_settings.cache` mechanism for caching and implement any custom caching logic within your callbacks or application code, rather than enabling router-level caching. +* **Default Off Recommendation:** For most typical deployments, it's **recommended to leave `cache_responses: false`** (the default) and manage caching primarily through the `litellm_settings.cache` and `cache_params` configurations. 
Only enable `router_settings.cache_responses: true` if you have a very specific and well-justified reason to use router-level caching, and you fully understand the implications of having two separate caching layers in your proxy setup. + +**Example YAML (Enabling Router-Level Response Caching - Advanced):** + +```yaml +router_settings: + cache_responses: true # Advanced: Enabling response caching at the router level (use with caution!) + # ... other router_settings ... +litellm_settings: + cache: true # General caching must also be enabled via litellm_settings.cache + cache_params: + type: "memory" # Example: Using in-memory cache for both router-level and general caching + # ... other cache_params ... + # ... other litellm_settings ... +``` + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Security and Performance Note:** Enabling `cache_responses: true` might introduce some additional overhead at the router level. Carefully consider the performance implications and whether the benefits of router-level caching justify the added complexity. For most use cases, relying on the standard `litellm_settings.cache` mechanism is sufficient and often more efficient. + +--- + +#### `routing_strategy_args` (`router_settings`) + +**YAML Key:** `routing_strategy_args` + +**Type:** Object (Mapping) + +**Environment Variable:** N/A + +**Default Value:** `None` (No routing strategy arguments are provided by default). + +**Description:** The `routing_strategy_args` parameter, within `router_settings`, is an **optional mapping** (object) that allows you to provide **additional keyword arguments or configuration parameters** that are **specific to the currently selected `routing_strategy`**. + +* **Strategy-Specific Arguments:** Different routing strategies might have their own specific configuration options or parameters that can fine-tune their behavior. `routing_strategy_args` is used to pass these strategy-specific arguments to the chosen routing algorithm. +* **Example: Latency-Based Routing Arguments:** For example, if you are using `routing_strategy: "latency-based-routing"`, you might use `routing_strategy_args` to configure parameters like: + * `ttl`: Time-to-live (TTL) for latency data in the cache (how long latency measurements are considered valid). + * `lowest_latency_buffer`: A buffer or threshold to determine which deployments are considered "low latency" and eligible for routing (e.g., deployments within a certain percentage of the lowest observed latency). +* **Strategy Documentation:** The **specific arguments** that are supported within `routing_strategy_args` and their meanings **depend entirely on the `routing_strategy` you have selected**. You must consult the documentation for the routing strategy you are using (e.g., documentation for `"latency-based-routing"`, `"usage-based-routing"`, or your custom routing strategy if you are using `"custom"`) to understand which arguments are available and how to configure them. +* **Optional Arguments:** `routing_strategy_args` is **optional**. If you do not provide any arguments, the routing strategy will typically use its default behavior and default parameter values. Only use `routing_strategy_args` if you want to customize the behavior of your chosen routing strategy beyond its default settings. 
+ +**Example YAML (`routing_strategy_args` for Latency-Based Routing):** + +```yaml +router_settings: + routing_strategy: "latency-based-routing" # Using latency-based routing strategy + routing_strategy_args: # Strategy-specific arguments for latency-based routing + ttl: 60 # Setting latency cache TTL to 60 seconds + lowest_latency_buffer: 0.2 # Setting lowest latency buffer to 20% + # ... other router_settings ... +``` + +In this example, `routing_strategy_args` is used to configure the `ttl` (time-to-live) and `lowest_latency_buffer` parameters specifically for the `"latency-based-routing"` strategy. + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Important:** The `routing_strategy_args` parameter is an **advanced configuration option** that is only relevant when you want to fine-tune the behavior of a specific routing strategy. For most typical deployments, you can often rely on the default settings of the routing strategies and omit `routing_strategy_args`. Only use `routing_strategy_args` if you have a specific need to customize the routing algorithm's behavior and you understand the meaning and impact of the specific arguments you are configuring for your chosen routing strategy. + +--- + +### Pre-Call Checks & Validation + +This subsection of `router_settings` allows enabling pre-call checks (like context window validation) and optional pre-call check plugins to be executed before routing the request. + +#### `enable_pre_call_checks` + +**YAML Key:** `enable_pre_call_checks` + +**Type:** Boolean + +**Environment Variable:** N/A + +**Default Value:** `false` in older versions, but might be `true` in newer versions based on documentation snippets. Check the default value in your LiteLLM Proxy version and it is **recommended to explicitly set this value** for clarity. + +**Description:** The `enable_pre_call_checks` parameter, within `router_settings`, is a **boolean flag** that controls whether the proxy router should perform **pre-call checks** on each incoming request *before* sending the request to the backend LLM provider. + +* **Proactive Request Validation:** When `enable_pre_call_checks: true`, the proxy router will execute a series of checks on the incoming request *before* incurring the cost and latency of an LLM API call. These pre-call checks are designed to catch common issues or potential errors *early in the request processing pipeline*, before the request reaches the backend LLM. +* **Primary Pre-Call Check: Context Window Validation:** The **primary pre-call check** that is enabled by `enable_pre_call_checks: true` is a **context window validation**. The proxy will attempt to estimate the token count of the input prompt and check if it will exceed the maximum context window of the target model. If the prompt is too long, the proxy can: + * Reject the request immediately and return a `ContextWindowExceededError` to the client. This prevents wasted API calls for requests that are guaranteed to fail due to context length limits. + * Trigger context window fallback mechanisms (if configured via `litellm_settings.context_window_fallbacks`). +* **Early Error Prevention and Fallback:** Enabling pre-call checks is generally recommended as it helps: + * **Prevent avoidable errors:** Catch context window errors (a common type of error) proactively, before they reach the backend LLM. + * **Improve efficiency and reduce costs:** Avoid wasting API calls on requests that are likely to fail. 
+ * **Trigger fallback strategies:** If context window fallbacks are configured, pre-call checks can automatically initiate fallback logic when a context window issue is detected. +* **Optional Feature (Can be Disabled):** While pre-call checks are generally beneficial, they are **optional** and can be disabled by setting `enable_pre_call_checks: false`. You might disable them if: + * You want to minimize any potential overhead from pre-call checks (though the overhead is typically very minimal). + * You are certain that your application code already handles context window limits and other potential issues proactively, and you do not need the proxy to perform these checks. +* **Default Off in Older Versions, Potentially On in Newer:** The default value of `enable_pre_call_checks` might be `false` in older versions of LiteLLM Proxy. However, the documentation snippets suggest that it might be enabled (`true`) in newer versions. **Check the default value in your specific LiteLLM Proxy version and it's recommended to explicitly set this parameter** in your `config.yaml` for clarity and to ensure the desired behavior. + +**Example YAML (Enabling Pre-Call Checks):** + +```yaml +router_settings: + enable_pre_call_checks: true # Enabling pre-call checks for context window validation + # ... other router_settings ... +``` + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Recommendation:** **Generally, it is recommended to enable `enable_pre_call_checks: true` in production environments** to benefit from proactive context window validation and early error prevention. The overhead of pre-call checks is typically minimal, while the benefits of avoiding wasted API calls and improving reliability are often significant. If you have specific performance concerns or do not want the proxy to perform pre-call checks, you can disable them by setting `enable_pre_call_checks: false`, but understand that you might then miss out on the proactive error detection and fallback capabilities that pre-call checks provide. + +--- + +#### `optional_pre_call_checks` + +**YAML Key:** `optional_pre_call_checks` + +**Type:** Array of Strings (List of plugin names) + +**Environment Variable:** N/A + +**Default Value:** `[]` (Empty list, no optional pre-call checks are enabled by default). + +**Description:** The `optional_pre_call_checks` parameter, within `router_settings`, is an **array of strings** that allows you to specify a list of **optional pre-call check plugins** that the proxy router should execute for each incoming request, *in addition to* the default pre-call checks (like context window validation) that are enabled by `enable_pre_call_checks: true`. + +* **Extend Pre-Call Validation with Plugins:** `optional_pre_call_checks` provides a plugin-based mechanism to **extend the pre-call validation logic** of the proxy router. You can enable or disable specific pre-call check plugins by listing their names in this array. +* **Currently Supported Pre-Call Check Plugins:** As of the documentation, the following pre-call check plugins are mentioned as being supported: + * `"router_budget_limiting"`: Enables a pre-call check to verify if the user or team associated with the request is **within their budget limits** before executing the LLM call. If the budget is exceeded, the request can be rejected (or handled according to budget enforcement policies). + * `"prompt_caching"`: Enables a pre-call check to see if there's a **cached response for the exact prompt** in the cache. 
If a cache hit is found, the proxy might serve the cached response directly without calling the backend LLM (this is a less common caching mode; typically caching is checked *after* routing but before making the backend call; this pre-call prompt caching would be an *additional* check). + * *Potentially other plugins might be added in future versions. Check the documentation for the most up-to-date list of available pre-call check plugins.* +* **Selective Plugin Activation:** `optional_pre_call_checks` allows you to **selectively enable only the pre-call check plugins that are relevant to your application's requirements**. For example, you might enable `"router_budget_limiting"` for cost control in a production environment, but disable it in a development environment where budget enforcement is not needed. + +**Example YAML (Enabling Router Budget Limiting and Prompt Caching Pre-Call Checks):** + +```yaml +router_settings: + enable_pre_call_checks: true # Default pre-call checks (context window validation) are enabled + optional_pre_call_checks: # Enabling optional pre-call check plugins + - "router_budget_limiting" # Enable budget limit check before making LLM calls + - "prompt_caching" # Enable prompt-level caching check before making LLM calls + # ... other router_settings ... +``` + +In this example, in addition to the default pre-call checks (enabled by `enable_pre_call_checks: true`), the proxy router will also execute two optional pre-call check plugins: `"router_budget_limiting"` and `"prompt_caching"`. + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Note:** `optional_pre_call_checks` provides a flexible way to extend the pre-call validation logic of the proxy by enabling or disabling specific check plugins as needed. The available plugins and their configuration options might be expanded in future versions. Consult the documentation for your specific LiteLLM Proxy version for the most up-to-date list of supported pre-call check plugins and their usage. + +--- + +### Failover & Retry Policies + +Within `router_settings`, this section defines how the proxy should handle failures, including setting retry policies for different error types, configuring fallback models, and managing cooldown periods. + +#### `allowed_fails` + +**YAML Key:** `allowed_fails` + +**Type:** Integer + +**Environment Variable:** N/A + +**Default Value:** `1` (A model is cooled down after 1 failure per minute by default, but this might vary - check your version's documentation). + +**Description:** The `allowed_fails` parameter, within `router_settings`, sets the **maximum number of failed requests allowed per model deployment per minute** before that deployment is considered **unhealthy** and is **temporarily taken out of rotation** (put in a "cooldown" state). + +* **Failure Threshold for Cooldown:** `allowed_fails` defines the **tolerance for failures** at the model deployment level. It specifies how many errors a deployment can experience within a rolling 1-minute window before the proxy considers it to be potentially unhealthy or unstable. +* **Cooldown Trigger:** If a model deployment experiences more than `allowed_fails` number of failures within a minute, the proxy router will trigger a **cooldown** for that deployment. During cooldown, the proxy will **stop routing new requests to that deployment** for a specified duration (controlled by the `cooldown_time` parameter). 
+* **Preventing Repeated Calls to Failing Models:** The cooldown mechanism, triggered by `allowed_fails`, is designed to prevent the proxy from repeatedly sending requests to a backend model deployment that is experiencing issues (e.g., intermittent errors, performance degradation, or temporary unreachability). By taking a failing model out of rotation temporarily, the proxy can avoid wasting resources on calls that are likely to fail and improve overall system stability. +* **Rolling 1-Minute Window:** The failure count for `allowed_fails` is tracked within a **rolling 1-minute window**. This means that the proxy is continuously monitoring the failure rate for each model deployment over the past minute. If the failure count within the last minute exceeds `allowed_fails`, cooldown is triggered. Failures that occurred more than 1 minute ago are no longer counted towards the current 1-minute failure threshold. + +**Example YAML (Setting Allowed Fails to 3):** + +```yaml +router_settings: + allowed_fails: 3 # Allowing up to 3 failures per model per minute before cooldown + cooldown_time: 60 # Cooldown duration set to 60 seconds (1 minute) + # ... other router_settings ... +``` + +In this example, `allowed_fails: 3` means that if a model deployment experiences more than 3 failed requests within any 1-minute period, it will be put into cooldown. + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Recommendation:** The default `allowed_fails` value of `1` (or a similar low number) is often a good starting point for production environments. It provides a relatively **sensitive failure detection** mechanism, quickly taking models out of rotation if they experience even a single failure within a minute. You might consider **adjusting `allowed_fails`** if: + +* **You Observe Overly Aggressive Cooldowns:** If you find that models are being put into cooldown too frequently or for minor, transient errors, you can try *increasing* `allowed_fails` to a higher value (e.g., 3, 5, or more) to make the cooldown mechanism less sensitive and allow for a higher tolerance of transient failures. +* **You Need Very Strict Failure Handling:** If your application requires extremely high reliability and you want the proxy to be very aggressive in taking failing models out of rotation, you can *decrease* `allowed_fails` to `0` (meaning cooldown is triggered immediately on the *first* failure within a minute). However, be cautious with setting it to `0`, as even very brief, transient errors might then trigger cooldowns too readily. + +Monitor your proxy logs and model health metrics to determine the optimal `allowed_fails` value for your specific environment and reliability requirements. + +--- + +#### `cooldown_time` + +**YAML Key:** `cooldown_time` + +**Type:** Integer (representing seconds) + +**Environment Variable:** N/A + +**Default Value:** `None` (No default cooldown time is specified in the documentation snippets. You **must explicitly set `cooldown_time` if you are using `allowed_fails` or any cooldown mechanism**). + +**Description:** The `cooldown_time` parameter, within `router_settings`, specifies the **duration in seconds** for which a model deployment will be placed in **"cooldown"** after it has exceeded the `allowed_fails` threshold. Cooldown is a state in which the proxy router **temporarily stops routing new requests to a deployment** that is considered unhealthy or unstable due to excessive failures. 
+ +* **Cooldown Duration:** `cooldown_time` determines **how long** a model deployment will remain in the cooldown state. During the cooldown period, the proxy will **not send any new requests** to that deployment. +* **Recovery Period:** Cooldown provides a **temporary recovery period** for a potentially failing model deployment. It gives the backend service a chance to recover from transient issues, network glitches, overload, or other temporary problems that might have caused the failures. +* **Automatic Re-Entry into Rotation:** After the `cooldown_time` has elapsed, the proxy router will **automatically take the deployment out of cooldown** and **re-introduce it into rotation**. The proxy will then start sending new requests to the deployment again, assuming it has recovered. +* **Required with** `allowed_fails`:** `cooldown_time` is **typically used in conjunction with `allowed_fails`**. `allowed_fails` defines the failure threshold that triggers cooldown, and `cooldown_time` defines the duration of the cooldown period. If you are using `allowed_fails` to enable the cooldown mechanism, you **must also set `cooldown_time`** to specify how long the cooldown should last. If `cooldown_time` is not set, the proxy might not know how long to keep a model in cooldown, potentially leading to unexpected behavior. + +**Example YAML (Setting Cooldown Time to 60 Seconds):** + +```yaml +router_settings: + allowed_fails: 3 # Cooldown is triggered after 3 failures per minute + cooldown_time: 60 # Setting cooldown duration to 60 seconds (1 minute) + # ... other router_settings ... +``` + +In this example, if a model deployment exceeds the `allowed_fails` threshold (3 failures per minute), it will be placed in cooldown for 60 seconds (1 minute). During this 1-minute cooldown, no new requests will be routed to that deployment. After 1 minute, the deployment will automatically exit cooldown and be considered healthy again for routing. + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Recommendation:** You **must explicitly set `cooldown_time` if you are using `allowed_fails` or any cooldown mechanism**. A `cooldown_time` value around **60-300 seconds (1-5 minutes)** is often a reasonable starting point for many production environments. You might consider adjusting `cooldown_time` if: + +* **Cooldowns are Too Short:** If you find that models are going in and out of cooldown very frequently or that the cooldown period is too short for models to fully recover from transient issues, you can try *increasing* `cooldown_time` to a longer duration (e.g., 300 seconds or more). +* **You Need Faster Recovery from Cooldown:** If you want models to be taken out of cooldown and re-enter rotation more quickly after a failure event, you can *decrease* `cooldown_time` to a shorter duration (e.g., 30-60 seconds). + +Monitor your proxy logs and model health metrics to determine the optimal `cooldown_time` for your specific environment and the typical recovery time for your backend LLM deployments. + +--- + +#### `disable_cooldowns` + +**YAML Key:** `disable_cooldowns` + +**Type:** Boolean + +**Environment Variable:** N/A + +**Default Value:** `false` (Model cooldowns are enabled by default when `allowed_fails` is configured). + +**Description:** The `disable_cooldowns` parameter, within `router_settings`, is a **boolean flag** that, when set to `true`, **completely disables the model cooldown mechanism**. 
+ +* **Disabling Cooldown Functionality:** Setting `disable_cooldowns: true` will **override the `allowed_fails` and `cooldown_time` settings** for all model deployments. Even if you have configured `allowed_fails` to define a failure threshold, the proxy will **not trigger any cooldowns** for any model, regardless of how many errors they experience. +* **No Temporary Removal from Rotation:** With cooldowns disabled, if a model deployment starts failing or experiencing errors, the proxy will **continue to route requests to it**, even if it exceeds the `allowed_fails` threshold. The proxy will *not* temporarily remove the model from rotation or stop sending it traffic. +* **Use Cases for Disabling Cooldowns (Limited):** Disabling cooldowns is **generally not recommended for production environments** as it reduces the robustness and reliability of the proxy. You might consider disabling cooldowns only in very specific, controlled scenarios, such as: + * **Testing or Development:** For testing or development purposes, you might disable cooldowns to simplify debugging or to observe the raw error behavior of models without the proxy automatically intervening with cooldowns. + * **Highly Stable Environments (Rare):** In extremely rare scenarios where you are absolutely certain that your backend LLM deployments are exceptionally stable and never experience transient errors or failures, you *might* consider disabling cooldowns. However, even in highly reliable systems, transient issues can occur, and cooldowns provide a valuable safety net. +* **Reduced Reliability and Resilience (Generally Not Recommended):** Disabling cooldowns **reduces the proxy's ability to automatically handle failing models and maintain service reliability**. If a model deployment starts experiencing issues, the proxy will continue to send requests to it, potentially leading to increased error rates and a degraded user experience. + +**Example YAML (Disabling Model Cooldowns):** + +```yaml +router_settings: + allowed_fails: 3 # Allowed fails setting is configured, but will be ignored because cooldowns are disabled + cooldown_time: 60 # Cooldown time setting is configured, but will be ignored because cooldowns are disabled + disable_cooldowns: true # Disabling model cooldown mechanism entirely - models will never be put into cooldown + # ... other router_settings ... +``` + +In this example, even though `allowed_fails: 3` and `cooldown_time: 60` are configured, the setting `disable_cooldowns: true` will override them. The proxy will never put any model deployment into cooldown, regardless of how many failures it experiences. + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Recommendation:** **Avoid setting `disable_cooldowns: true` in most production environments.** Model cooldowns are a valuable reliability feature that helps the proxy automatically manage failing deployments and maintain service availability. Leave `disable_cooldowns: false` (the default) and configure `allowed_fails` and `cooldown_time` to fine-tune the cooldown mechanism for your specific needs. Only consider disabling cooldowns for testing or very specific, controlled scenarios where you are willing to trade off the benefits of automatic failover for some other reason (which is rarely justified in production). 
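+For contrast with the example above, a minimal sketch of the more typical production posture keeps cooldowns enabled and only tunes the thresholds (the specific numbers here are illustrative choices, not documented defaults):
+
+```yaml
+router_settings:
+  allowed_fails: 3          # Tolerate up to 3 failures per deployment per minute before cooldown
+  cooldown_time: 120        # Keep a failing deployment out of rotation for 2 minutes
+  # disable_cooldowns is omitted, so it keeps its default of false (cooldowns stay enabled)
+  # ... other router_settings ...
+```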
+
+---
+
+#### `retry_policy`
+
+**YAML Key:** `retry_policy`
+
+**Type:** Object (Mapping of error type to retry count)
+
+**Environment Variable:** N/A
+
+**Default Value:** A default retry policy is applied if not configured (see example below for default error types and retry counts).
+
+**Description:** The `retry_policy` parameter, within `router_settings`, is an **object (mapping)** that allows you to define **custom retry policies** for **different types of errors** that might occur during LLM API calls. This provides fine-grained control over how the proxy handles retries, allowing you to tailor retry behavior based on the specific error type.
+
+* **Error-Specific Retry Logic:** Instead of a single global `num_retries` value that applies to all errors, `retry_policy` lets you configure **different retry counts for different categories of errors**. This is important because different error types warrant different retry strategies. For example:
+  * **Transient Errors (e.g., `TimeoutError`, `RateLimitError`):** These are often temporary network issues or provider-side transient problems. For these error types, you might configure a **higher number of retries** to give the proxy more chances to recover and successfully complete the request.
+  * **Non-Transient Errors (e.g., `AuthenticationError`, `BadRequestError`):** These errors usually indicate fundamental issues with the request itself (e.g., an invalid API key or malformed request parameters). Retrying them is unlikely to succeed without modifying the request, so you might set a **lower number of retries** (or zero retries) to fail faster and avoid wasting resources on futile retries.
+* **Mapping of Error Type to Retry Count:** The `retry_policy` object is a dictionary where:
+  * **Keys:** Are **error type names** (strings). These names correspond to specific exception classes or error categories that LiteLLM recognizes (e.g., `"AuthenticationErrorRetries"`, `"TimeoutErrorRetries"`, `"RateLimitErrorRetries"`, `"ContentPolicyViolationErrorRetries"`, `"InternalServerErrorRetries"`, `"BadRequestErrorRetries"`). *Check the documentation for the exact list of supported error type names in your LiteLLM Proxy version.*
+  * **Values:** Are **integers** representing the **number of retries** to attempt for that specific error type. For example, `AuthenticationErrorRetries: 0` means "do not retry AuthenticationError errors", while `TimeoutErrorRetries: 3` means "retry TimeoutError errors up to 3 times".
+* **Default Retry Policy:** If `retry_policy` is not configured, the proxy applies a **default retry policy**, which typically includes retry counts for common error types (see example below). However, it's best practice to **explicitly define your `retry_policy`** in `config.yaml` so that retry behavior is under your clear control.
+
+**Example YAML (Custom Retry Policy):**
+
+```yaml
+router_settings:
+  retry_policy: # Custom retry policy per error type
+    AuthenticationErrorRetries: 0 # Do not retry authentication errors
+    TimeoutErrorRetries: 5 # Retry timeout errors up to 5 times
+    RateLimitErrorRetries: 4 # Retry rate limit errors up to 4 times
+    ContentPolicyViolationErrorRetries: 2 # Retry content policy errors up to 2 times (might try fallbacks instead)
+    InternalServerErrorRetries: 3 # Retry internal server errors up to 3 times
+    BadRequestErrorRetries: 0 # Do not retry bad request errors (likely client-side issue)
+  # ... other router_settings ...
+```
+
+This example defines a custom retry policy with different retry counts for various error types. Authentication errors and bad request errors are not retried (0 retries), while timeout errors get 5 retries, rate limit errors get 4 retries, content policy violation errors get 2 retries, and internal server errors get 3 retries.
+
+**Example Environment Variable:** N/A (YAML-only parameter)
+
+**Default Retry Policy (Example - might vary slightly in different versions):**
+
+If `retry_policy` is not explicitly configured, LiteLLM Proxy might use a default retry policy that is similar to the example above, with retry counts like:
+
+```yaml
+retry_policy:
+  AuthenticationErrorRetries: 3
+  TimeoutErrorRetries: 3
+  RateLimitErrorRetries: 3
+  ContentPolicyViolationErrorRetries: 4
+  InternalServerErrorRetries: 4
+  BadRequestErrorRetries: 0
+```
+
+**Recommendation:** It's highly recommended to **explicitly define your `retry_policy` in `router_settings`** to have fine-grained control over retry behavior and tailor it to your application's needs and error handling strategy. Review the default retry policy, then customize the retry counts for different error types based on the errors your LLM application is likely to encounter and how you want to handle them. For non-transient errors (like authentication errors or bad requests), it's often best to set retries to `0` to fail fast. For transient errors (like timeouts or rate limits), a retry count of 3-5 is often a reasonable starting point.
+
+---
+
+#### `allowed_fails_policy`
+
+**YAML Key:** `allowed_fails_policy`
+
+**Type:** Object (Mapping of error type to allowed fails count)
+
+**Environment Variable:** N/A
+
+**Default Value:** A default allowed fails policy is applied if not configured (see example below for default error types and allowed fails counts).
+
+**Description:** The `allowed_fails_policy` parameter, within `router_settings`, is an **object (mapping)** that allows you to customize the `allowed_fails` setting **for specific error types**. This provides even more granular control over the cooldown mechanism, letting you define different failure tolerances for different categories of errors.
+
+* **Error-Specific Failure Tolerance:** Instead of a single `allowed_fails` value that applies to all error types, `allowed_fails_policy` lets you configure **different allowed failure counts per minute for different error categories** before a model deployment is put into cooldown.
+* **Fine-Tuning Cooldown Sensitivity:** This is useful because some types of errors might be considered more critical or indicative of a serious issue than others. For example:
+  * **`InternalServerError` (Provider-Side Errors):** Internal server errors from the LLM provider might be considered more serious and warrant a lower `allowed_fails` threshold, triggering cooldown more aggressively, as they might indicate a problem with the backend service itself.
+  * **`RateLimitError` (Rate Limits Exceeded):** Rate limit errors might be more transient or less indicative of a model deployment failure. You might want to set a **higher `allowed_fails` threshold** for rate limit errors, allowing a model to experience more rate limit errors within a minute before going into cooldown, as rate limits are often temporary and might resolve quickly.
+  * **`BadRequestError` (Client-Side Errors):** Bad request errors (often caused by malformed requests from the client) are typically *not* indicative of a model deployment failure.
You might want to set a **very high `allowed_fails` threshold** (or effectively ignore them for cooldown purposes) for `BadRequestError` because they are usually client-side issues and not related to model health.
+* **Mapping of Error Type to Allowed Fails Count:** The `allowed_fails_policy` object is a dictionary where:
+  * **Keys:** Are **error type names** (strings), similar to `retry_policy`. These names correspond to specific exception classes or error categories (e.g., `"BadRequestErrorAllowedFails"`, `"AuthenticationErrorAllowedFails"`, `"TimeoutErrorAllowedFails"`, `"RateLimitErrorAllowedFails"`, `"ContentPolicyViolationErrorAllowedFails"`, `"InternalServerErrorAllowedFails"`). *Check the documentation for the exact list of supported error type names for `allowed_fails_policy` in your LiteLLM Proxy version.*
+  * **Values:** Are **integers** representing the **allowed number of failures per minute** for that specific error type before cooldown is triggered. For example, `BadRequestErrorAllowedFails: 1000` means "allow up to 1000 BadRequestErrors per minute before cooldown" (effectively ignoring BadRequestErrors for cooldown purposes), while `InternalServerErrorAllowedFails: 5` means "allow only 5 InternalServerErrors per minute before triggering cooldown" (very sensitive to internal server errors).
+* **Overrides Global `allowed_fails`:** If you define an `allowed_fails_policy` for a specific error type, that error-specific setting will **override** the global `allowed_fails` value for that particular error type. The global `allowed_fails` setting will still apply to error types that are *not* explicitly listed in `allowed_fails_policy`.
+
+**Example YAML (Custom Allowed Fails Policy per Error Type):**
+
+```yaml
+router_settings:
+  allowed_fails: 3 # Global allowed fails limit (applies to error types not in allowed_fails_policy)
+  allowed_fails_policy: # Custom allowed fails policy per error type
+    BadRequestErrorAllowedFails: 1000 # Allow up to 1000 BadRequestErrors per minute (effectively ignore for cooldown)
+    AuthenticationErrorAllowedFails: 5 # Allow only 5 AuthenticationErrors per minute
+    TimeoutErrorAllowedFails: 10 # Allow up to 10 TimeoutErrors per minute
+    RateLimitErrorAllowedFails: 10000 # Allow up to 10000 RateLimitErrors per minute
+    ContentPolicyViolationErrorAllowedFails: 15 # Allow up to 15 ContentPolicyViolationErrors per minute
+    InternalServerErrorAllowedFails: 20 # Allow up to 20 InternalServerErrors per minute
+  cooldown_time: 60 # Cooldown time of 60 seconds (1 minute)
+  # ... other router_settings ...
+```
+
+In this example, `allowed_fails_policy` defines different failure tolerances for various error types. `BadRequestError` and `RateLimitError` have very high allowed fails counts (effectively ignoring them for cooldown), while `AuthenticationError`, `TimeoutError`, `ContentPolicyViolationError`, and `InternalServerError` have lower, more restrictive allowed fails counts, making the cooldown mechanism more sensitive to these error types. The global `allowed_fails: 3` setting will apply to any other error types that are not explicitly listed in `allowed_fails_policy`.
+ +**Example Environment Variable:** N/A (YAML-only parameter) + +**Default Allowed Fails Policy (Example - might vary slightly in different versions):** + +If `allowed_fails_policy` is not explicitly configured, LiteLLM Proxy might use a default allowed fails policy that is similar to the example above, with values like: + +```yaml +allowed_fails_policy: + BadRequestErrorAllowedFails: 1000 + AuthenticationErrorAllowedFails: 10 + TimeoutErrorAllowedFails: 12 + RateLimitErrorAllowedFails: 10000 + ContentPolicyViolationErrorAllowedFails: 15 + InternalServerErrorAllowedFails: 20 +``` + +**Recommendation:** It's highly recommended to **explicitly define your `allowed_fails_policy` in `router_settings`** to have fine-grained control over the cooldown mechanism and tailor it to your specific error profiles and tolerance levels for different error types. Review the default policy (if available in your documentation) and then customize the allowed fails counts for each error type based on your understanding of what error types are more critical or indicative of model deployment health vs. transient or client-side issues. For client-side errors like `BadRequestError`, you can typically set a very high `AllowedFails` count (effectively ignoring them for cooldown). For provider-side errors like `InternalServerError`, you might want to set a lower `AllowedFails` count to trigger cooldown more readily. + +--- + +#### `fallbacks` (`router_settings`) + +**YAML Key:** `fallbacks` + +**Type:** Array of Objects (List of mappings) + +**Environment Variable:** N/A + +**Default Value:** `[]` (Empty list, no model-specific fallbacks configured by default). + +**Description:** The `fallbacks` parameter, within `router_settings`, is an **array of objects** that allows you to define **model-specific fallback models** for **general error conditions**. This is a more granular fallback mechanism compared to the global `default_fallbacks` (in `litellm_settings`). + +* **Model-Specific Failover:** `fallbacks` lets you specify a **fallback strategy for each individual model deployment or model alias** defined in your `model_list`. If a request to a particular model fails (due to any error that triggers fallback logic, *except* for `ContentPolicyViolationError` and `ContextWindowExceededError`, which have their own dedicated fallback settings), the proxy will attempt to use the fallback models you define specifically for that source model. +* **Custom Failover Paths:** You can define different fallback models for different source models, allowing you to create **custom failover paths** tailored to the characteristics of each model and your application's requirements. For example, you might want to fallback from a more powerful but potentially less reliable model to a more stable, but slightly less performant, model. +* **List of Fallback Models per Source Model:** Each object in the `fallbacks` list is a **mapping** that associates a *source model name* (or model group name) with a *list of fallback model names* to try if the source model fails. +* **Ordered Fallback Attempts:** For each source model, you can specify a list of fallback models to try in order. The proxy will attempt the fallbacks sequentially until one succeeds or the list is exhausted. +* **Model Names from** `model_list`:** The model names used as source models and fallback models must be valid `model_name` aliases defined in your `model_list`. 
+ +**Example YAML (Defining Model-Specific Fallbacks):** + +```yaml +router_settings: + fallbacks: # Defining model-specific fallbacks for general errors + - gpt-4: ["claude-2", "gpt-3.5-turbo-large"] # If gpt-4 fails, try Claude-2, then GPT-3.5 Turbo Large + - claude-2: ["gpt-3.5-turbo"] # If Claude-2 fails, try GPT-3.5 Turbo + - gpt-3.5-turbo-small: ["gpt-3.5-turbo"] # If gpt-3.5-turbo-small fails, try gpt-3.5-turbo + # ... other router_settings ... +``` + +In this example, if a request to `gpt-4` fails (for any error that triggers fallback), the proxy will first retry the request with `claude-2`. If that also fails, it will then try `gpt-3.5-turbo-large`. Separate fallback mappings are defined for `claude-2` (fallback to `gpt-3.5-turbo`) and `gpt-3.5-turbo-small` (fallback to `gpt-3.5-turbo`). + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Note:** `fallbacks` (in `router_settings`) provides **model-specific fallback strategies** for general errors. For handling `ContentPolicyViolationError` and `ContextWindowExceededError` specifically, use the dedicated `litellm_settings.content_policy_fallbacks` and `litellm_settings.context_window_fallbacks` parameters respectively. If no model-specific fallback is defined for a given source model, the proxy will fall back to using the global `default_fallbacks` list (if configured in `litellm_settings`). + +--- + +#### `content_policy_fallbacks` (`router_settings`) + +*(This parameter is also defined under `litellm_settings`. In `router_settings`, it likely serves the same purpose - to define model-specific fallbacks for ContentPolicyViolationError. It's documented here in `router_settings` because the source documentation lists it in the `router_settings` section. However, conceptually, it's very similar to `litellm_settings.content_policy_fallbacks`. )* + +**YAML Key:** `content_policy_fallbacks` (in `router_settings`) + +**Type:** Array of Objects (List of mappings) + +**Environment Variable:** N/A + +**Default Value:** `[]` (Empty list, no content policy fallbacks configured by default). + +**Description:** The `content_policy_fallbacks` parameter, within `router_settings` (and also available in `litellm_settings`), is an **array of objects** used to define **model-specific fallback models** that should be used *specifically* when a **`ContentPolicyViolationError`** occurs for a particular source model. This is a model-specific version of the general `litellm_settings.content_policy_fallbacks` setting. + +* **Model-Specific Content Policy Failover:** Similar to `router_settings.fallbacks`, `router_settings.content_policy_fallbacks` lets you define a fallback strategy for each individual model deployment or model alias, but **specifically for content policy violation errors**. +* **Overriding Global Content Policy Fallbacks:** If you define `content_policy_fallbacks` in `router_settings` for a given model, these **model-specific content policy fallbacks will take precedence over** the global `litellm_settings.content_policy_fallbacks` setting for that model. If no model-specific `content_policy_fallbacks` are defined in `router_settings` for a model, the proxy will then fall back to using the global `litellm_settings.content_policy_fallbacks` list (if configured). +* **Custom Failover for Content Policy Violations:** Use `router_settings.content_policy_fallbacks` to create **custom failover paths** for content policy errors, tailored to the characteristics of each model and your content moderation policies. 
You might want to fallback to a model that is known to be less sensitive to content filters, or to a model from a different provider with different content policies. +* **List of Fallback Models per Source Model:** The format is the same as `litellm_settings.content_policy_fallbacks` and `router_settings.fallbacks`. Each object in the `content_policy_fallbacks` list is a **mapping** that associates a *source model name* (or model group name) with a *list of fallback model names* to try if a `ContentPolicyViolationError` occurs for that source model. +* **Ordered Fallback Attempts:** For each source model, you can specify a list of fallback models to try in order. The proxy will attempt the fallbacks sequentially until one succeeds or the list is exhausted. +* **Model Names from** `model_list`:** The model names used as source models and fallback models must be valid `model_name` aliases defined in your `model_list`. + +**Example YAML (Model-Specific Content Policy Fallbacks in `router_settings`):** + +```yaml +router_settings: + content_policy_fallbacks: # Model-specific content policy fallbacks (router level) + - gpt-3.5-turbo-small: ["claude-instant-1"] # If gpt-3.5-turbo-small fails content policy, try Claude Instant-1 + - gpt-4: ["claude-2-opus"] # If gpt-4 fails content policy, try Claude-2 Opus + # ... other router_settings ... +# Global content policy fallbacks (litellm_settings) - these will be used if no router-level fallbacks are defined for a model +litellm_settings: + content_policy_fallbacks: + - gpt-3.5-turbo: ["claude-instant-1", "gpt-3.5-turbo-large"] # Global content policy fallbacks (used if no router-level fallbacks for gpt-3.5-turbo) + # ... other litellm_settings ... +``` + +In this example, `router_settings.content_policy_fallbacks` defines model-specific content policy fallbacks for `gpt-3.5-turbo-small` (fallback to `claude-instant-1`) and `gpt-4` (fallback to `claude-2-opus`). If a `ContentPolicyViolationError` occurs for either of these models, the router will first try the model-specific fallbacks defined here. If no model-specific fallback is defined for a given model, the proxy will then look at the global `litellm_settings.content_policy_fallbacks` list for a general content policy fallback strategy. + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Note:** `router_settings.content_policy_fallbacks` takes **precedence** over `litellm_settings.content_policy_fallbacks` for model-specific content policy failovers. Use `router_settings.content_policy_fallbacks` when you need to define different content policy fallback strategies for different models. If you want to define a *global* content policy fallback strategy that applies to all models that don't have model-specific fallbacks, use `litellm_settings.content_policy_fallbacks`. + +--- + +#### `default_fallbacks` (`router_settings`) + +*(This parameter is also defined under `litellm_settings`. In `router_settings`, it likely serves the same purpose - to define model-specific fallbacks for general errors. It's documented here in `router_settings` because the source documentation lists it in the `router_settings` section. However, conceptually, it's very similar to `litellm_settings.default_fallbacks`. )* + +**YAML Key:** `default_fallbacks` (in `router_settings`) + +**Type:** Array of Strings (List of model names) + +**Environment Variable:** N/A + +**Default Value:** `[]` (Empty list, no model-specific default fallbacks configured by default). 
+ +**Description:** The `default_fallbacks` parameter, within `router_settings` (and also available in `litellm_settings`), is a **list of model names** that allows you to define **model-specific default fallback models** for **general error conditions**. This is a model-specific version of the global `litellm_settings.default_fallbacks` setting. + +* **Model-Specific General Error Failover:** Similar to `router_settings.fallbacks` and `router_settings.content_policy_fallbacks`, `router_settings.default_fallbacks` lets you define a fallback strategy for each individual model deployment or model alias, but specifically for **general errors** (any error type that triggers fallback, *except* for `ContentPolicyViolationError` and `ContextWindowExceededError`, which have their own dedicated fallback settings). +* **Overriding Global Default Fallbacks:** If you define `default_fallbacks` in `router_settings` for a given model, these **model-specific default fallbacks will take precedence over** the global `litellm_settings.default_fallbacks` setting for that model. If no model-specific `default_fallbacks` are defined in `router_settings` for a model, the proxy will then fall back to using the global `litellm_settings.default_fallbacks` list (if configured). +* **Custom Failover Paths per Model:** Use `router_settings.default_fallbacks` to create **custom failover paths** for general errors, tailored to the characteristics of each model and your desired failover behavior. You might want to define different sets of fallback models for different primary models, based on their capabilities or intended use cases. +* **List of Fallback Models per Source Model:** The format is the same as `litellm_settings.default_fallbacks`, `router_settings.fallbacks`, and `router_settings.content_policy_fallbacks`. The value is a **list of strings**, where each string is a `model_name` alias of a fallback model to try. +* **Ordered Fallback Attempts:** For each source model, you can specify a list of fallback models to try in order. The proxy will attempt the fallbacks sequentially until one succeeds or the list is exhausted. +* **Model Names from** `model_list`:** The model names used as source models and fallback models must be valid `model_name` aliases defined in your `model_list`. + +**Example YAML (Model-Specific Default Fallbacks in `router_settings`):** + +```yaml +router_settings: + default_fallbacks: # Model-specific default fallbacks for general errors (router level) + - gpt-4: ["claude-2", "gpt-3.5-turbo-large"] # If gpt-4 fails for general errors, try Claude-2, then GPT-3.5 Turbo Large + - claude-2: ["gpt-3.5-turbo"] # If Claude-2 fails for general errors, try GPT-3.5 Turbo + - gpt-3.5-turbo-small: ["gpt-3.5-turbo"] # If gpt-3.5-turbo-small fails for general errors, try gpt-3.5-turbo + # ... other router_settings ... +# Global default fallbacks (litellm_settings) - these will be used if no router-level fallbacks are defined for a model +litellm_settings: + default_fallbacks: ["claude-instant-1"] # Global default fallbacks for any model without router-level fallbacks + # ... other litellm_settings ... +``` + +In this example, `router_settings.default_fallbacks` defines model-specific default fallbacks for `gpt-4`, `claude-2`, and `gpt-3.5-turbo-small`. For example, if `gpt-4` fails due to a general error, the proxy will first try `claude-2`, then `gpt-3.5-turbo-large`. 
If no model-specific fallback is defined for a given model, the proxy will then look at the global `litellm_settings.default_fallbacks` list for a general fallback strategy (in this example, it will try `"claude-instant-1"` as a last resort). + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Note:** `router_settings.default_fallbacks` takes **precedence** over `litellm_settings.default_fallbacks` for model-specific general error failovers. Use `router_settings.default_fallbacks` when you want to define different general error fallback strategies for different models. If you want to define a *global* default fallback list that applies to all models that don't have model-specific general error fallbacks, use `litellm_settings.default_fallbacks`. + +--- + +#### `max_fallbacks` + +**YAML Key:** `max_fallbacks` + +**Type:** Integer + +**Environment Variable:** N/A + +**Default Value:** `5` (Maximum 5 fallback attempts per request if not set). + +**Description:** The `max_fallbacks` parameter, within `router_settings`, sets the **maximum number of fallback models** that the proxy router will attempt to try **in sequence for a single original request** before giving up and returning an error to the client. + +* **Limit on Fallback Chain Length:** `max_fallbacks` prevents the proxy from getting into an **endless or excessively long chain of fallbacks** if all fallback models also fail. This acts as a safeguard to limit the overall latency and resource consumption of fallback attempts for a single request. +* **Fallback Attempt Limit:** The proxy will try **at most `max_fallbacks`** number of fallback models (in addition to the original model). If all these attempts fail, the proxy will stop trying fallbacks and return an error to the client, indicating that the request could not be fulfilled even after trying all available fallbacks. +* **Preventing Infinite Loops:** `max_fallbacks` is important to prevent scenarios where a request might keep getting routed from one fallback model to another in an infinite loop if all models in the fallback chain are also failing. It ensures that the proxy will eventually give up and report an error if no model in the chain can successfully process the request within a reasonable number of attempts. +* **Default Fallback Limit:** The default `max_fallbacks` value is `5`. This means that, by default, the proxy will attempt up to 5 fallback models (in addition to the original model) before failing a request. This default limit is often sufficient for most use cases. + +**Example YAML (Setting Max Fallback Attempts to 3):** + +```yaml +router_settings: + max_fallbacks: 3 # Limiting max fallback attempts to 3 per request + fallbacks: # Defining fallback models (example) + - gpt-4: ["claude-2", "gpt-3.5-turbo-large"] + # ... other fallback configurations ... + # ... other router_settings ... +``` + +In this example, even if the `fallbacks` configuration defines a long chain of fallback models, the `max_fallbacks: 3` setting will ensure that the proxy will attempt at most 3 fallback models for any original request that fails. After 3 failed fallback attempts (plus the initial attempt on the original model), the proxy will stop trying fallbacks and return an error to the client. + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Recommendation:** The default `max_fallbacks` value of 5 is generally a good balance between providing sufficient failover redundancy and preventing excessively long fallback chains. 
You might consider **adjusting `max_fallbacks`** if: + +* **You Have Very Long Fallback Chains:** If you have configured very long lists of fallback models in your `fallbacks`, `content_policy_fallbacks`, and `default_fallbacks` settings, and you want the proxy to try more fallback options before giving up, you can *increase* `max_fallbacks` to a higher value (e.g., 10 or more). However, be mindful that longer fallback chains can increase overall request latency and resource consumption. +* **You Want Faster Error Reporting:** If you want the proxy to fail faster and report errors to the client more quickly in case of persistent model issues, you can *decrease* `max_fallbacks` to a lower value (e.g., 1 or 2) or even `0` to disable fallbacks entirely (not generally recommended for production). + +Monitor your proxy's performance and error logs to determine an appropriate `max_fallbacks` value for your specific environment and reliability goals. + +--- + +#### `num_retries` (`router_settings`) + +*(This parameter is also defined under `general_settings`. In `router_settings`, it likely serves the same purpose - to set a default number of retry attempts. It's documented here in `router_settings` because the source documentation lists it in the `router_settings` section. However, conceptually, it's very similar to `general_settings.num_retries`. )* + +**YAML Key:** `num_retries` (in `router_settings`) + +**Type:** Integer + +**Environment Variable:** N/A + +**Default Value:** `3` (Proxy router will attempt 3 retries by default). + +**Description:** The `num_retries` parameter, within `router_settings` (and also available in `general_settings`), sets the **default number of retry attempts** that the proxy router will make for a failed request **in general**. This is a router-level default retry count that applies to all requests routed by the router, unless overridden by more specific retry policies (e.g., `retry_policy`). + +* **Router-Level Retry Attempts:** `router_settings.num_retries` defines a **default retry count that is specific to the proxy router**. If you have configured `num_retries` in both `general_settings` and `router_settings`, the `router_settings.num_retries` value will typically **take precedence** for requests handled by the router. The `general_settings.num_retries` might then act as a fallback for requests that are not routed through the router or for other proxy operations outside of the routing logic. *(Check documentation for your specific version to confirm the exact precedence rules if both are set.)* +* **Default Retry Attempts for Router Operations:** `router_settings.num_retries` is used as a **default retry count** for requests handled by the proxy router. This includes: + * Initial attempts to call a primary model deployment. + * Subsequent attempts to call fallback models (if fallbacks are configured). + * Potentially other router-level operations that might involve API calls or network interactions. +* **Overridden by** `retry_policy`:** The `num_retries` value set in `router_settings` (or `general_settings`) can be **overridden by the more granular `retry_policy` setting** in `router_settings`. If you define a `retry_policy` with specific retry counts for different error types, those error-specific retry counts from `retry_policy` will take precedence over the default `num_retries` for the corresponding error types. `num_retries` then acts as a more general fallback retry count if no error-specific retry policy is defined. 
+ +**Example YAML (Router-Level Default Retry Count):** + +```yaml +router_settings: + num_retries: 4 # Setting default retry attempts to 4 for all router operations + retry_policy: # Custom retry policy (overrides default num_retries for specific errors) + TimeoutErrorRetries: 5 # Timeout errors get 5 retries (overriding default num_retries for TimeoutError) + # ... other error-specific retry policies ... + # ... other router_settings ... +``` + +In this example, `router_settings.num_retries: 4` sets a default retry count of 4 for all router operations. However, the `retry_policy` also defines specific retry counts for different error types. For `TimeoutErrorRetries`, the retry count is set to 5, which will **override** the router's default `num_retries` of 4 for TimeoutError errors. For other error types not listed in `retry_policy`, the router's default `num_retries` of 4 will apply. + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Recommendation:** Similar to `general_settings.num_retries`, the default `num_retries` value of 3 (at the router level) is often a good starting point for most deployments. You might consider **adjusting `router_settings.num_retries`** if you want to change the **default retry behavior for all requests handled by the router**. However, for more granular and error-specific retry control, it is generally recommended to use the `router_settings.retry_policy` parameter, which allows you to define different retry counts for different error types, overriding the default `num_retries` value for specific error categories as needed. If you are not using `router_settings.retry_policy`, the `num_retries` value in `router_settings` (or `general_settings`) will act as a global default retry count for all retryable errors. + +--- + +#### `model_group_retry_policy` + +**YAML Key:** `model_group_retry_policy` + +**Type:** Object (Mapping of model group name to retry policy object) + +**Environment Variable:** N/A + +**Default Value:** `None` (No model group-specific retry policies are defined by default). + +**Description:** The `model_group_retry_policy` parameter, within `router_settings`, is an **advanced mapping** (object) that allows you to define **different retry policies for *different model groups***. This provides the most granular level of control over retry behavior, enabling you to tailor retry strategies to the specific characteristics and reliability of different model groups. + +* **Model Group-Specific Retry Policies:** `model_group_retry_policy` lets you configure **unique retry policies for each model group** that you have defined or are using in your proxy setup. This is useful when you have different model groups with varying levels of reliability, performance, or cost considerations. +* **Overriding Default Retry Policy:** If you define a retry policy for a specific model group in `model_group_retry_policy`, this **model-group-specific retry policy will override** the default `router_settings.retry_policy` (and `general_settings.num_retries`) for requests that are routed to models within that model group. For requests to models in groups that do *not* have a specific retry policy defined in `model_group_retry_policy`, the proxy will fall back to using the `router_settings.retry_policy` (or `general_settings.num_retries` if no router-level policy is set). +* **Mapping of Model Group Name to Retry Policy:** The `model_group_retry_policy` object is a dictionary where: + * **Keys:** Are **model group names** (strings). 
These names should correspond to model group names that you are using in your routing configurations or that are implicitly defined by your `model_name` aliases in `model_list`. + * **Values:** Are **retry policy objects**. Each retry policy object is a mapping (dictionary) that defines the retry counts for different error types, using the same structure as the `retry_policy` parameter in `router_settings`. (i.e., keys are error type names like `"TimeoutErrorRetries"`, values are retry counts as integers). + +**Example YAML (Model Group-Specific Retry Policies):** + +```yaml +router_settings: + retry_policy: # Default retry policy (applies to models without model-group-specific policy) + TimeoutErrorRetries: 3 + RateLimitErrorRetries: 3 + model_group_retry_policy: # Advanced: Model group-specific retry policies + gpt-4-group: # Retry policy for model group named "gpt-4-group" + TimeoutErrorRetries: 5 # GPT-4 group gets 5 retries for timeout errors (more aggressive retries) + RateLimitErrorRetries: 2 # GPT-4 group gets 2 retries for rate limit errors (less aggressive) + claude-group: # Retry policy for model group named "claude-group" + InternalServerErrorRetries: 0 # Claude group - no retries for internal server errors + RateLimitErrorRetries: 4 # Claude group - more retries for rate limit errors + # ... other router_settings ... +``` + +In this example, `model_group_retry_policy` defines custom retry policies for two model groups: `"gpt-4-group"` and `"claude-group"`. Requests routed to models within `"gpt-4-group"` will use the retry policy defined under `gpt-4-group` (e.g., 5 retries for `TimeoutErrorRetries`, 2 for `RateLimitErrorRetries`). Requests routed to models within `"claude-group"` will use the retry policy defined under `claude-group` (e.g., 0 retries for `InternalServerErrorRetries`, 4 for `RateLimitErrorRetries`). Requests to models that are *not* part of either `"gpt-4-group"` or `"claude-group"` will fall back to using the default `router_settings.retry_policy` (e.g., 3 retries for `TimeoutErrorRetries`, 3 for `RateLimitErrorRetries`, and so on). + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Note:** `model_group_retry_policy` is an **advanced configuration option** that is typically only needed in complex routing scenarios where you want to implement highly customized and differentiated retry strategies for different groups of models. For most deployments, the global `router_settings.retry_policy` (or even the default `num_retries`) is sufficient. Only use `model_group_retry_policy` if you have specific reasons to define different retry policies for different sets of models and understand the implications of this fine-grained retry control. The model group names used as keys in `model_group_retry_policy` should correspond to model group names that are meaningful within your proxy routing logic or custom routing strategies (if you are using custom routing). + +--- + +### Timeouts & Debugging + +This subsection of `router_settings` deals with configuring timeouts for request handling within the router and setting debug levels for routing-related logs. + +#### `timeout` (`router_settings`) + +**YAML Key:** `timeout` + +**Type:** Float (representing seconds) + +**Environment Variable:** N/A + +**Default Value:** `600.0` (600 seconds, or 10 minutes, if not set). + +**Description:** The `timeout` parameter, within `router_settings`, sets the **default timeout duration in seconds** for **overall request handling** by the proxy router. 
This timeout is different from `litellm_settings.request_timeout`, which is for individual LLM API calls. `router_settings.timeout` sets a **broader timeout** that encompasses the entire request processing flow within the router, including: + +* **Initial API Call Attempt:** The time spent making the initial request to the primary model. +* **Fallback Attempts (if configured):** The time spent attempting fallback models (including multiple fallback attempts if a chain of fallbacks is defined). +* **Retries (if configured):** The time spent on retry attempts for any failed API calls. +* **Overall Request Processing Time:** The total time from when the proxy receives the request to when it sends a response back to the client (or returns an error after exhausting retries and fallbacks). + +* **Overall Request Deadline:** `timeout` acts as a **deadline for the entire request processing flow**. If the proxy cannot successfully complete a request (including all retries and fallbacks, if applicable) within the specified `timeout` duration, it will **abort the entire request** and return a `Timeout` error to the client. +* **Preventing Long-Running Requests:** `timeout` is essential to prevent requests from hanging indefinitely or taking an excessively long time to complete, especially in scenarios with complex routing, fallbacks, or retries. +* **Resource Management and Responsiveness:** Timeouts help manage resources and ensure that the proxy remains responsive, even when dealing with slow or unreliable backend LLM providers. +* **Default Timeout (10 Minutes):** The default `timeout` value is `600.0 seconds` (10 minutes). This is a relatively long timeout, designed to accommodate potentially complex routing and fallback scenarios. + +**Example YAML (Setting Router Timeout to 5 Minutes):** + +```yaml +router_settings: + timeout: 300.0 # Setting overall router timeout to 300 seconds (5 minutes) + # ... other router_settings ... +``` + +In this example, the overall request handling timeout is set to 300 seconds (5 minutes). If a request takes longer than 5 minutes to process (including all retries and fallbacks), the proxy will abort it and return a timeout error. + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Recommendation:** The default `timeout` of 600 seconds (10 minutes) is often reasonable for many LLM applications that might involve complex processing or longer generation times. You might consider **adjusting `timeout`** if: + +* **You Need Faster Timeouts for Time-Sensitive Applications:** If your application requires very low latency and you want requests to fail faster if they take too long, you can *decrease* `timeout` to a smaller value (e.g., 30-60 seconds or less). +* **You Have Complex Routing or Fallback Scenarios:** If you have configured very complex routing rules, long fallback chains, or aggressive retry policies, and you expect the overall request processing time to potentially be longer, you might *increase* `timeout` to allow more time for the proxy to attempt all routing and failover strategies before timing out. +* **Upstream LLM Timeouts:** Be mindful of the timeouts configured at the backend LLM provider level (`litellm_settings.request_timeout`). The `router_settings.timeout` should generally be *longer* than `litellm_settings.request_timeout` to allow time for the proxy router to manage retries and fallbacks *after* an individual LLM call might have timed out. 
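+ +To make the last point concrete, here is a minimal sketch of how the two timeouts might be layered. The specific values are illustrative assumptions, not recommendations: + +```yaml +litellm_settings: + request_timeout: 120 # Per-LLM-API-call timeout (2 minutes) + +router_settings: + timeout: 300.0 # Overall per-request deadline, covering retries and fallbacks (5 minutes) + num_retries: 2 +``` + +With this layering, a single slow LLM call is cut off after 120 seconds, leaving the router up to 300 seconds in total to retry or fall back before the request as a whole times out.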
+ +Monitor your proxy logs and application behavior to determine the optimal `timeout` value for your specific workload and latency requirements. + +--- + +#### `stream_timeout` (`router_settings`) + +**YAML Key:** `stream_timeout` + +**Type:** Float (representing seconds) + +**Environment Variable:** N/A + +**Default Value:** If not set, it will **inherit the value of the `timeout` parameter** (which defaults to 600 seconds). + +**Description:** The `stream_timeout` parameter, within `router_settings`, sets a **default timeout duration in seconds specifically for streaming requests**. This timeout applies to requests made using streaming mode (e.g., `/chat/completions` requests with `stream: true`). + +* **Timeout for Streaming Responses:** `stream_timeout` is similar to `timeout`, but it applies **specifically to streaming API calls**. It sets a deadline for the *entire streaming response* from the LLM provider. If the proxy does not receive the complete streaming response within the `stream_timeout` duration, it will abort the stream and return a timeout error to the client. +* **Handling Long Streaming Responses:** Streaming responses can potentially take longer to complete than non-streaming responses, especially for long generations or complex conversations. `stream_timeout` allows you to set a separate, potentially longer, timeout for streaming requests to accommodate these longer-duration responses. +* **Defaults to the `timeout` Value:** If `stream_timeout` is **not explicitly set**, it will **default to using the value of the general `timeout` parameter**. This means that if you have set a general `timeout` for all requests, that same timeout will also apply to streaming requests unless you override it with `stream_timeout`. +* **Independent Timeout for Streams:** Setting `stream_timeout` allows you to define a **different timeout duration specifically for streaming requests**, giving you more granular control. You might want to set a *longer* `stream_timeout` than `timeout` if you expect streaming responses to take significantly longer than non-streaming responses. + +**Example YAML (Setting a Longer Timeout for Streaming Requests):** + +```yaml +router_settings: + timeout: 300.0 # General request timeout set to 300 seconds (5 minutes) + stream_timeout: 1200.0 # Streaming request timeout set to 1200 seconds (20 minutes) - longer for streams + # ... other router_settings ... +``` + +In this example, the general request timeout is set to 300 seconds (5 minutes), while the `stream_timeout` is set to 1200 seconds (20 minutes). This means that non-streaming requests will time out after 5 minutes, while streaming requests will be allowed to run for up to 20 minutes before timing out. + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Recommendation:** If you expect streaming responses to take significantly longer than non-streaming responses in your application, consider setting a `stream_timeout` value that is **longer than your general `timeout`**. If you are not using streaming or expect streaming response times to be similar to non-streaming responses, you can typically **omit `stream_timeout`**, and it will default to using the value of `timeout`. Adjust `stream_timeout` based on your expected streaming response durations and latency requirements for streaming interactions.
+ +--- + +#### `debug_level` + +**YAML Key:** `debug_level` + +**Type:** String (Must be either `"DEBUG"` or `"INFO"`) + +**Environment Variable:** N/A + +**Default Value:** `"INFO"` (Router logs at `INFO` level by default). + +**Description:** The `debug_level` parameter, within `router_settings`, controls the **logging verbosity level for the proxy router's internal logs**. These are logs specifically related to the router's operations, routing decisions, health checks, and internal processing logic, as opposed to the general LiteLLM library logs controlled by `litellm_settings.set_verbose`. + +* **Router Logging Verbosity:** `debug_level` determines how much detail the proxy router will output in its logs. You can choose between two levels: + * `"INFO"`: (Default). Sets the router logging level to **INFO**. This level typically logs informational messages about routing decisions, health check statuses, and key router events. `"INFO"` level logging is generally suitable for production environments, providing a reasonable level of operational visibility without generating excessive log volume. + * `"DEBUG"`: Sets the router logging level to **DEBUG**. This level enables **very verbose debug logging**. The router will output much more detailed logs, including granular information about routing decisions, load balancing choices, health check probes, retry attempts, internal state changes, and potentially request/response details at the routing layer. `"DEBUG"` logging is invaluable for **troubleshooting complex routing issues, diagnosing performance problems within the router, or understanding the detailed internal behavior of the proxy router**. However, it generates a **very large volume of logs** and should **not be enabled in production** unless absolutely necessary for debugging a specific issue. +* **Console/Stdout Logs:** Router logs, regardless of the `debug_level`, are typically output to the proxy server's console (stdout). If you have configured a logging integration (e.g., Langfuse, OpenTelemetry, etc.) via `litellm_settings.callbacks`, router logs might also be captured by those integrations, depending on the integration's capabilities and configuration. + +**Example YAML (Setting Router Debug Level to DEBUG):** + +```yaml +router_settings: + debug_level: "DEBUG" # Setting router debug level to DEBUG (verbose logging for troubleshooting) + # ... other router_settings ... +``` + +**Example Environment Variable:** N/A (YAML-only parameter) + +**Recommendation:** + +* **For Production: Use** `"INFO"` **(Default).** Leave `debug_level` at its default `"INFO"` value for production environments. This provides a good balance of operational visibility and reasonable log volume. +* **For Debugging: Use** `"DEBUG"` **(Temporarily).** When you need to diagnose complex routing issues, performance bottlenecks, or understand the detailed internal behavior of the proxy router, temporarily set `debug_level: "DEBUG"` for a short period to capture more granular logs. **Remember to revert back to `"INFO"` after debugging**, as `"DEBUG"` logging generates a very high volume of logs and can impact performance and potentially expose sensitive information in logs if not handled carefully. + +--- + +#### `set_verbose` (`router_settings`) + +**YAML Key:** `set_verbose` + +**Type:** Boolean + +**Environment Variable:** `LITELLM_VERBOSE=1` + +**Default Value:** `false` (Verbose logging is disabled by default). 
+ +**Description:** This parameter is marked as **DEPRECATED** within `router_settings`. It duplicates the functionality of `litellm_settings.set_verbose`; use `litellm_settings.set_verbose` to control verbose logging for the LiteLLM library. Setting this in `router_settings` likely has no effect. + +--- + +## Understanding Router Features and Load Balancing Examples + +This section shows various examples using different combinations of routing strategies, `model_list`, and `router_settings`. Note that the load-balancing strategies (examples 1-5) only distribute traffic across deployments that share the **same `model_name`**; that shared name is what defines the model group being balanced. + +**1. Simple Shuffle (Round Robin)** + +```yaml +model_list: + - model_name: gpt-3.5-turbo # OpenAI deployment + litellm_params: + model: openai/gpt-3.5-turbo + api_key: "YOUR_OPENAI_API_KEY" + - model_name: gpt-3.5-turbo # Azure deployment (same model_name groups the two for load balancing) + litellm_params: + model: azure/your-azure-deployment-name + api_key: "YOUR_AZURE_API_KEY" + api_base: "YOUR_AZURE_API_BASE" + api_version: "2023-05-15" + +router_settings: + routing_strategy: simple-shuffle # This is the default, so you could omit this line +``` + +This configuration uses the default `"simple-shuffle"` strategy (round-robin). Because both deployments share the same `model_name`, requests to `model: "gpt-3.5-turbo"` will be distributed evenly between the OpenAI and Azure deployments. This example shows two deployments from different providers, but it also works with multiple deployments of the *same* model from a *single* provider (e.g., multiple Azure deployments). + +**2. Weighted Shuffle** + +```yaml +model_list: + - model_name: gpt-3.5-turbo # OpenAI deployment + litellm_params: + model: openai/gpt-3.5-turbo + api_key: "YOUR_OPENAI_API_KEY" + weight: 7 # 70% of the traffic + - model_name: gpt-3.5-turbo # Azure deployment + litellm_params: + model: azure/your-azure-deployment-name + api_key: "YOUR_AZURE_API_KEY" + api_base: "YOUR_AZURE_API_BASE" + api_version: "2023-05-15" + weight: 3 # 30% of the traffic + +router_settings: + routing_strategy: simple-shuffle # Still uses shuffle, but with weights +``` + +Here, the OpenAI deployment will receive approximately 70% of the traffic for `"gpt-3.5-turbo"` and the Azure deployment will get 30%. The weights determine the *proportion* of requests each deployment receives, not the absolute number. + +**3. Least Busy** + +```yaml +model_list: + - model_name: gpt-3.5-turbo # OpenAI deployment + litellm_params: + model: openai/gpt-3.5-turbo + api_key: "YOUR_OPENAI_API_KEY" + - model_name: gpt-3.5-turbo # Azure deployment + litellm_params: + model: azure/your-azure-deployment-name + api_key: "YOUR_AZURE_API_KEY" + api_base: "YOUR_AZURE_API_BASE" + api_version: "2023-05-15" + +router_settings: + routing_strategy: least-busy +``` + +The proxy will route each `"gpt-3.5-turbo"` request to the deployment (either OpenAI or Azure in this case) that is currently handling the *fewest* active requests. + +**4. Latency-Based Routing** + +```yaml +model_list: + - model_name: gpt-3.5-turbo # OpenAI deployment + litellm_params: + model: openai/gpt-3.5-turbo + api_key: "YOUR_OPENAI_API_KEY" + - model_name: gpt-3.5-turbo # Azure deployment + litellm_params: + model: azure/your-azure-deployment-name + api_key: "YOUR_AZURE_API_KEY" + api_base: "YOUR_AZURE_API_BASE" + api_version: "2023-05-15" + +router_settings: + routing_strategy: latency-based-routing + routing_strategy_args: + ttl: 300 # Cache latency data for 5 minutes (300 seconds) + lowest_latency_buffer: 0.1 # Consider deployments within 10% of the lowest latency +``` + +Requests will be routed to the deployment with the lowest *recent* latency. The `ttl` argument determines how long latency data is considered valid (cached). The `lowest_latency_buffer` (optional) specifies a percentage.
Any deployment whose latency is within that percentage of the *lowest* observed latency is considered "low latency" and eligible for routing. + +**5. Usage-Based Routing (v2)** + +```yaml +model_list: + - model_name: gpt-3.5-turbo # OpenAI deployment + litellm_params: + model: openai/gpt-3.5-turbo + api_key: "YOUR_OPENAI_API_KEY" + tpm: 60000 # Tokens per minute (example) + rpm: 100 # Requests per minute (example) + - model_name: gpt-3.5-turbo # Azure deployment + litellm_params: + model: azure/your-azure-deployment-name + api_key: "YOUR_AZURE_API_KEY" + api_base: "YOUR_AZURE_API_BASE" + api_version: "2023-07-01-preview" + tpm: 120000 # Tokens per minute (example) + rpm: 200 # Requests per minute (example) + +router_settings: + routing_strategy: usage-based-routing-v2 + redis_host: your-redis-host # REQUIRED for usage-based-routing-v2 + redis_port: 6379 # REQUIRED for usage-based-routing-v2 + redis_password: your-redis-password # If your Redis requires a password +``` + +This strategy uses Redis to track usage (tokens per minute) across proxy instances and routes requests to the deployment with the *lowest* current TPM, while also respecting the `rpm` and `tpm` limits defined on each deployment. This requires a working Redis instance. + +**6. Fallbacks** + +```yaml +model_list: + - model_name: gpt-4 + litellm_params: + model: openai/gpt-4 + api_key: "YOUR_OPENAI_API_KEY" + - model_name: gpt-3.5-turbo + litellm_params: + model: openai/gpt-3.5-turbo + api_key: "YOUR_OPENAI_API_KEY" + +router_settings: + fallbacks: + - gpt-4: ["gpt-3.5-turbo"] # If gpt-4 fails, try gpt-3.5-turbo +``` + +If a request to `model: "gpt-4"` fails, the proxy will automatically retry the request using the `gpt-3.5-turbo` model. You can define fallbacks per model, and you can chain multiple fallbacks. + +**7. Content Policy Fallbacks** + +```yaml +model_list: + - model_name: gpt-4 + litellm_params: + model: openai/gpt-4 + api_key: "YOUR_OPENAI_API_KEY" + - model_name: claude-2 + litellm_params: + model: anthropic/claude-2 + api_key: "YOUR_ANTHROPIC_API_KEY" + +router_settings: + content_policy_fallbacks: + - gpt-4: ["claude-2"] # If gpt-4 fails with ContentPolicyViolationError, try claude-2 +``` + +If a request to `model: "gpt-4"` results in a `ContentPolicyViolationError`, the proxy will retry the request with `claude-2`. This is separate from general fallbacks. + +**8. Context Window Fallbacks** + +```yaml +model_list: + - model_name: gpt-3.5-turbo + litellm_params: + model: openai/gpt-3.5-turbo + api_key: "YOUR_OPENAI_API_KEY" + model_info: + max_tokens: 4096 # Context window of gpt-3.5-turbo + - model_name: gpt-3.5-turbo-16k + litellm_params: + model: openai/gpt-3.5-turbo-16k + api_key: "YOUR_OPENAI_API_KEY" + model_info: + max_tokens: 16384 # Context window of gpt-3.5-turbo-16k + +router_settings: + context_window_fallbacks: + - gpt-3.5-turbo: ["gpt-3.5-turbo-16k"] +``` + +If a request to `gpt-3.5-turbo` exceeds its 4096 token context window, the proxy will retry the request using `gpt-3.5-turbo-16k`. + +**9. 
Default Fallbacks** + +```yaml +model_list: + - model_name: gpt-4 + litellm_params: + model: openai/gpt-4 + api_key: "YOUR_OPENAI_API_KEY" + - model_name: gpt-3.5-turbo + litellm_params: + model: openai/gpt-3.5-turbo + api_key: "YOUR_OPENAI_API_KEY" + - model_name: claude-2 + litellm_params: + model: anthropic/claude-2 + api_key: "YOUR_ANTHROPIC_API_KEY" + +router_settings: + default_fallbacks: ["gpt-3.5-turbo", "claude-2"] # Fallback for ALL models if no specific fallback defined +``` + +If a request to `gpt-4` (or any model without a specific fallback defined) fails, the proxy will first try `gpt-3.5-turbo`. If that fails, it will try `claude-2`. + +**10. Caching Groups** + +```yaml +model_list: + - model_name: gpt-3.5-turbo-openai + litellm_params: + model: openai/gpt-3.5-turbo + api_key: "YOUR_OPENAI_API_KEY" + - model_name: gpt-3.5-turbo-azure + litellm_params: + model: azure/your-azure-deployment-name + api_key: "YOUR_AZURE_API_KEY" + api_base: "YOUR_AZURE_API_BASE" + api_version: "2023-07-01-preview" + +router_settings: + caching_groups: + - ["gpt-3.5-turbo-openai", "gpt-3.5-turbo-azure"] # Both model names share the same cache +``` + +If a request is made to `gpt-3.5-turbo-openai` and a response is cached, a subsequent request to `gpt-3.5-turbo-azure` *with the same prompt and parameters* will use the cached response, and vice versa. + +These examples cover a variety of routing and fallback scenarios. Combining these settings allows you to build a highly resilient, efficient, and cost-effective LLM gateway using the LiteLLM Proxy Server. Remember to test your configurations thoroughly under various conditions (load, failures, etc.) to ensure they behave as expected.
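+ +As a starting point for such testing, here is one way several of these settings could be combined into a single configuration. This is a minimal sketch; the model names, credentials, and numeric values are illustrative placeholders, not recommendations: + +```yaml +model_list: + - model_name: gpt-3.5-turbo # OpenAI deployment (shared model_name enables load balancing) + litellm_params: + model: openai/gpt-3.5-turbo + api_key: "YOUR_OPENAI_API_KEY" + - model_name: gpt-3.5-turbo # Azure deployment of the same model group + litellm_params: + model: azure/your-azure-deployment-name + api_key: "YOUR_AZURE_API_KEY" + api_base: "YOUR_AZURE_API_BASE" + api_version: "2023-07-01-preview" + - model_name: gpt-4 + litellm_params: + model: openai/gpt-4 + api_key: "YOUR_OPENAI_API_KEY" + +router_settings: + routing_strategy: simple-shuffle # Load-balance gpt-3.5-turbo across the two deployments + num_retries: 2 # Retry transient failures before falling back + timeout: 300.0 # Overall per-request deadline (5 minutes) + fallbacks: + - gpt-4: ["gpt-3.5-turbo"] # If gpt-4 fails, fall back to gpt-3.5-turbo + default_fallbacks: ["gpt-3.5-turbo"] # Last-resort fallback for models without a specific fallback + max_fallbacks: 2 # Cap the length of any fallback chain +``` + +Under this configuration, `gpt-3.5-turbo` traffic is shuffled across the OpenAI and Azure deployments, failed calls are retried up to twice, `gpt-4` failures fall back to `gpt-3.5-turbo`, any other model without a specific fallback uses the `default_fallbacks` list, and no request runs longer than 300 seconds in total.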