diff --git a/docs/my-website/docs/proxy/tag_routing.md b/docs/my-website/docs/proxy/tag_routing.md index 603c47fad1..4b2621fa8c 100644 --- a/docs/my-website/docs/proxy/tag_routing.md +++ b/docs/my-website/docs/proxy/tag_routing.md @@ -25,6 +25,13 @@ model_list: model: openai/gpt-4o api_key: os.environ/OPENAI_API_KEY tags: ["paid"] # πŸ‘ˆ Key Change + - model_name: gpt-4 + litellm_params: + model: openai/gpt-4o + api_key: os.environ/OPENAI_API_KEY + api_base: https://exampleopenaiendpoint-production.up.railway.app/ + tags: ["default"] # OPTIONAL - All untagged requests will get routed to this + router_settings: enable_tag_filtering: True # πŸ‘ˆ Key Change @@ -136,6 +143,46 @@ Response } ``` +## Setting Default Tags + +Use this if you want all untagged requests to be routed to specific deployments + +1. Set default tag on your yaml +```yaml + model_list: + - model_name: fake-openai-endpoint + litellm_params: + model: openai/fake + api_key: fake-key + api_base: https://exampleopenaiendpoint-production.up.railway.app/ + tags: ["default"] # πŸ‘ˆ Key Change - All untagged requests will get routed to this + model_info: + id: "default-model" # used for identifying model in response headers +``` + +2. Start proxy +```shell +$ litellm --config /path/to/config.yaml +``` + +3. Make request with no tags +```shell +curl -i http://localhost:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer sk-1234" \ + -d '{ + "model": "fake-openai-endpoint", + "messages": [ + {"role": "user", "content": "Hello, Claude gm!"} + ] + }' +``` + +Expect to see the following response header when this works +```shell +x-litellm-model-id: default-model +``` + ## ✨ Team based tag routing (Enterprise) LiteLLM Proxy supports team-based tag routing, allowing you to associate specific tags with teams and route requests accordingly. 
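As a quick orientation, the same behaviour can be exercised directly against `litellm.Router` (this mirrors the `test_default_tagged_deployments` test added later in this diff; the deployment ids are illustrative and an `OPENAI_API_KEY` is assumed in the environment):

```python
# Router-level sketch of the "default" tag behaviour documented above.
# Deployment ids ("default-model", "team-a-model") are illustrative.
import asyncio
import os

import litellm

router = litellm.Router(
    model_list=[
        {
            "model_name": "gpt-4",
            "litellm_params": {
                "model": "openai/gpt-4o",
                "api_key": os.environ.get("OPENAI_API_KEY"),
                "tags": ["default"],  # untagged requests land here
            },
            "model_info": {"id": "default-model"},
        },
        {
            "model_name": "gpt-4",
            "litellm_params": {
                "model": "openai/gpt-4o",
                "api_key": os.environ.get("OPENAI_API_KEY"),
                "tags": ["teamA"],  # only requests tagged "teamA" match this
            },
            "model_info": {"id": "team-a-model"},
        },
    ],
    enable_tag_filtering=True,  # same switch as router_settings.enable_tag_filtering
)


async def main():
    # Untagged request -> only the "default"-tagged deployment stays a candidate.
    untagged = await router.acompletion(
        model="gpt-4", messages=[{"role": "user", "content": "hello"}]
    )
    print(untagged._hidden_params["model_id"])  # expect: default-model

    # Explicitly tagged "default" -> same outcome.
    tagged_default = await router.acompletion(
        model="gpt-4",
        messages=[{"role": "user", "content": "hello"}],
        metadata={"tags": ["default"]},
    )
    print(tagged_default._hidden_params["model_id"])  # expect: default-model


asyncio.run(main())
```

The same tag filter also backs the team-based routing that follows.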
Example **Team A can access gpt-4 deployment A, Team B can access gpt-4 deployment B** (LLM Access Control For Teams) @@ -170,6 +217,12 @@ Here's how to set up and use team-based tag routing using curl commands: tags: ["teamB"] # πŸ‘ˆ Key Change model_info: id: "team-b-model" # used for identifying model in response headers + - model_name: fake-openai-endpoint + litellm_params: + model: openai/fake + api_key: fake-key + api_base: https://exampleopenaiendpoint-production.up.railway.app/ + tags: ["default"] # OPTIONAL - All untagged requests will get routed to this router_settings: enable_tag_filtering: True # πŸ‘ˆ Key Change diff --git a/docs/my-website/docs/proxy/team_logging.md b/docs/my-website/docs/proxy/team_logging.md index c593f23bf5..fb177da761 100644 --- a/docs/my-website/docs/proxy/team_logging.md +++ b/docs/my-website/docs/proxy/team_logging.md @@ -208,8 +208,8 @@ curl -X POST 'http://0.0.0.0:4000/key/generate' \ -d '{ "metadata": { "logging": [{ - "callback_name": "langfuse", # 'otel', 'langfuse', 'lunary' - "callback_type": "success" # set, if required by integration - future improvement, have logging tools work for success + failure by default + "callback_name": "langfuse", # "otel", "langfuse", "lunary" + "callback_type": "success", # "success", "failure", "success_and_failure" "callback_vars": { "langfuse_public_key": "os.environ/LANGFUSE_PUBLIC_KEY", # [RECOMMENDED] reference key in proxy environment "langfuse_secret_key": "os.environ/LANGFUSE_SECRET_KEY", # [RECOMMENDED] reference key in proxy environment diff --git a/litellm/__init__.py b/litellm/__init__.py index 25cae83282..cf13edce40 100644 --- a/litellm/__init__.py +++ b/litellm/__init__.py @@ -118,7 +118,7 @@ in_memory_llm_clients_cache: dict = {} safe_memory_mode: bool = False enable_azure_ad_token_refresh: Optional[bool] = False ### DEFAULT AZURE API VERSION ### -AZURE_DEFAULT_API_VERSION = "2024-07-01-preview" # this is updated to the latest +AZURE_DEFAULT_API_VERSION = "2024-08-01-preview" # this is updated to the latest ### COHERE EMBEDDINGS DEFAULT TYPE ### COHERE_DEFAULT_EMBEDDING_INPUT_TYPE = "search_document" ### GUARDRAILS ### @@ -483,7 +483,12 @@ openai_compatible_providers: List = [ "azure_ai", "github", ] - +openai_text_completion_compatible_providers: List = ( + [ # providers that support `/v1/completions` + "together_ai", + "fireworks_ai", + ] +) # well supported replicate llms replicate_models: List = [ @@ -863,7 +868,7 @@ from .llms.custom_llm import CustomLLM from .llms.huggingface_restapi import HuggingfaceConfig from .llms.anthropic.chat import AnthropicConfig from .llms.anthropic.completion import AnthropicTextConfig -from .llms.databricks import DatabricksConfig, DatabricksEmbeddingConfig +from .llms.databricks.chat import DatabricksConfig, DatabricksEmbeddingConfig from .llms.predibase import PredibaseConfig from .llms.replicate import ReplicateConfig from .llms.cohere.completion import CohereConfig diff --git a/litellm/cost_calculator.py b/litellm/cost_calculator.py index 1eb4d0eb94..bcec062de1 100644 --- a/litellm/cost_calculator.py +++ b/litellm/cost_calculator.py @@ -22,6 +22,9 @@ from litellm.litellm_core_utils.llm_cost_calc.utils import _generic_cost_per_cha from litellm.llms.anthropic.cost_calculation import ( cost_per_token as anthropic_cost_per_token, ) +from litellm.llms.databricks.cost_calculator import ( + cost_per_token as databricks_cost_per_token, +) from litellm.rerank_api.types import RerankResponse from litellm.types.llms.openai import HttpxBinaryResponseContent from 
litellm.types.router import SPECIAL_MODEL_INFO_PARAMS @@ -159,7 +162,7 @@ def cost_per_token( _, custom_llm_provider, _, _ = litellm.get_llm_provider(model=model) model_without_prefix = model - model_parts = model.split("/") + model_parts = model.split("/", 1) if len(model_parts) > 1: model_without_prefix = model_parts[1] else: @@ -212,6 +215,8 @@ def cost_per_token( ) elif custom_llm_provider == "anthropic": return anthropic_cost_per_token(model=model, usage=usage_block) + elif custom_llm_provider == "databricks": + return databricks_cost_per_token(model=model, usage=usage_block) elif custom_llm_provider == "gemini": return google_cost_per_token( model=model_without_prefix, diff --git a/litellm/integrations/opentelemetry.py b/litellm/integrations/opentelemetry.py index e2c3d6f3b8..84e45e16b1 100644 --- a/litellm/integrations/opentelemetry.py +++ b/litellm/integrations/opentelemetry.py @@ -649,7 +649,9 @@ class OpenTelemetry(CustomLogger): return BatchSpanProcessor( OTLPSpanExporterHTTP( endpoint=self.OTEL_ENDPOINT, headers=_split_otel_headers - ) + ), + max_queue_size=100, + max_export_batch_size=100, ) elif self.OTEL_EXPORTER == "otlp_grpc": verbose_logger.debug( @@ -659,7 +661,9 @@ class OpenTelemetry(CustomLogger): return BatchSpanProcessor( OTLPSpanExporterGRPC( endpoint=self.OTEL_ENDPOINT, headers=_split_otel_headers - ) + ), + max_queue_size=100, + max_export_batch_size=100, ) else: verbose_logger.debug( diff --git a/litellm/litellm_core_utils/litellm_logging.py b/litellm/litellm_core_utils/litellm_logging.py index b1db82a775..43273224cb 100644 --- a/litellm/litellm_core_utils/litellm_logging.py +++ b/litellm/litellm_core_utils/litellm_logging.py @@ -2333,6 +2333,8 @@ def get_standard_logging_object_payload( completion_start_time_float = completion_start_time.timestamp() elif isinstance(completion_start_time, float): completion_start_time_float = completion_start_time + else: + completion_start_time_float = end_time_float # clean up litellm hidden params clean_hidden_params = StandardLoggingHiddenParams( model_id=None, diff --git a/litellm/llms/AzureOpenAI/azure.py b/litellm/llms/AzureOpenAI/azure.py index 70f13375d2..8da9ee063b 100644 --- a/litellm/llms/AzureOpenAI/azure.py +++ b/litellm/llms/AzureOpenAI/azure.py @@ -245,7 +245,10 @@ class AzureOpenAIConfig: - You should set tool_choice (see Forcing tool use) to instruct the model to explicitly use that tool - Remember that the model will pass the input to the tool, so the name of the tool and description should be from the model’s perspective. 
""" - if json_schema is not None: + if json_schema is not None and ( + (api_version_year <= "2024" and api_version_month < "08") + or "gpt-4o" not in model + ): # azure api version "2024-08-01-preview" onwards supports 'json_schema' only for gpt-4o _tool_choice = ChatCompletionToolChoiceObjectParam( type="function", function=ChatCompletionToolChoiceFunctionParam( diff --git a/litellm/llms/OpenAI/openai.py b/litellm/llms/OpenAI/openai.py index 8021ccd59e..ed4d199f67 100644 --- a/litellm/llms/OpenAI/openai.py +++ b/litellm/llms/OpenAI/openai.py @@ -1263,6 +1263,7 @@ class OpenAIChatCompletion(BaseLLM): error_headers = getattr(e, "headers", None) if response is not None and hasattr(response, "text"): + error_headers = getattr(e, "headers", None) raise OpenAIError( status_code=500, message=f"{str(e)}\n\nOriginal Response: {response.text}", @@ -1800,12 +1801,11 @@ class OpenAITextCompletion(BaseLLM): headers: Optional[dict] = None, ): super().completion() - exception_mapping_worked = False try: if headers is None: headers = self.validate_environment(api_key=api_key) if model is None or messages is None: - raise OpenAIError(status_code=422, message=f"Missing model or messages") + raise OpenAIError(status_code=422, message="Missing model or messages") if ( len(messages) > 0 diff --git a/litellm/llms/azure_text.py b/litellm/llms/azure_text.py index 9a8d462e56..db8c516b26 100644 --- a/litellm/llms/azure_text.py +++ b/litellm/llms/azure_text.py @@ -162,11 +162,10 @@ class AzureTextCompletion(BaseLLM): client=None, ): super().completion() - exception_mapping_worked = False try: if model is None or messages is None: raise AzureOpenAIError( - status_code=422, message=f"Missing model or messages" + status_code=422, message="Missing model or messages" ) max_retries = optional_params.pop("max_retries", 2) @@ -293,7 +292,10 @@ class AzureTextCompletion(BaseLLM): "api-version", api_version ) - response = azure_client.completions.create(**data, timeout=timeout) # type: ignore + raw_response = azure_client.completions.with_raw_response.create( + **data, timeout=timeout + ) + response = raw_response.parse() stringified_response = response.model_dump() ## LOGGING logging_obj.post_call( @@ -380,13 +382,15 @@ class AzureTextCompletion(BaseLLM): "complete_input_dict": data, }, ) - response = await azure_client.completions.create(**data, timeout=timeout) + raw_response = await azure_client.completions.with_raw_response.create( + **data, timeout=timeout + ) + response = raw_response.parse() return openai_text_completion_config.convert_to_chat_model_response_object( response_object=response.model_dump(), model_response_object=model_response, ) except AzureOpenAIError as e: - exception_mapping_worked = True raise e except Exception as e: status_code = getattr(e, "status_code", 500) diff --git a/litellm/llms/bedrock/chat.py b/litellm/llms/bedrock/chat.py index 4192ae868c..ee09797ba2 100644 --- a/litellm/llms/bedrock/chat.py +++ b/litellm/llms/bedrock/chat.py @@ -736,7 +736,9 @@ class BedrockLLM(BaseAWSLLM): if (stream is not None and stream is True) and provider != "ai21": endpoint_url = f"{endpoint_url}/model/{modelId}/invoke-with-response-stream" - proxy_endpoint_url = f"{proxy_endpoint_url}/model/{modelId}/invoke-with-response-stream" + proxy_endpoint_url = ( + f"{proxy_endpoint_url}/model/{modelId}/invoke-with-response-stream" + ) else: endpoint_url = f"{endpoint_url}/model/{modelId}/invoke" proxy_endpoint_url = f"{proxy_endpoint_url}/model/{modelId}/invoke" @@ -1268,7 +1270,7 @@ class AmazonConverseConfig: if 
len(value) == 0: # converse raises error for empty strings continue value = [value] - optional_params["stop_sequences"] = value + optional_params["stopSequences"] = value if param == "temperature": optional_params["temperature"] = value if param == "top_p": diff --git a/litellm/llms/databricks.py b/litellm/llms/databricks/chat.py similarity index 98% rename from litellm/llms/databricks.py rename to litellm/llms/databricks/chat.py index 3cc1c24568..0421cd9e46 100644 --- a/litellm/llms/databricks.py +++ b/litellm/llms/databricks/chat.py @@ -29,8 +29,8 @@ from litellm.types.utils import ( ) from litellm.utils import CustomStreamWrapper, EmbeddingResponse, ModelResponse, Usage -from .base import BaseLLM -from .prompt_templates.factory import custom_prompt, prompt_factory +from ..base import BaseLLM +from ..prompt_templates.factory import custom_prompt, prompt_factory class DatabricksError(Exception): @@ -328,6 +328,7 @@ class DatabricksChatCompletion(BaseLLM): api_base: str, custom_prompt_dict: dict, model_response: ModelResponse, + custom_llm_provider: str, print_verbose: Callable, encoding, api_key, @@ -371,6 +372,8 @@ class DatabricksChatCompletion(BaseLLM): ) response = ModelResponse(**response_json) + response.model = custom_llm_provider + "/" + response.model + if base_model is not None: response._hidden_params["model"] = base_model return response @@ -472,6 +475,7 @@ class DatabricksChatCompletion(BaseLLM): data=data, api_base=api_base, custom_prompt_dict=custom_prompt_dict, + custom_llm_provider=custom_llm_provider, model_response=model_response, print_verbose=print_verbose, encoding=encoding, @@ -528,6 +532,8 @@ class DatabricksChatCompletion(BaseLLM): response = ModelResponse(**response_json) + response.model = custom_llm_provider + "/" + response.model + if base_model is not None: response._hidden_params["model"] = base_model diff --git a/litellm/llms/databricks/cost_calculator.py b/litellm/llms/databricks/cost_calculator.py new file mode 100644 index 0000000000..3d40f2aa62 --- /dev/null +++ b/litellm/llms/databricks/cost_calculator.py @@ -0,0 +1,39 @@ +""" +Helper util for handling databricks-specific cost calculation +- e.g.: handling 'dbrx-instruct-*' +""" + +from typing import Tuple + +from litellm.types.utils import Usage +from litellm.utils import get_model_info + + +def cost_per_token(model: str, usage: Usage) -> Tuple[float, float]: + """ + Calculates the cost per token for a given model, prompt tokens, and completion tokens. 
+ + Input: + - model: str, the model name without provider prefix + - usage: LiteLLM Usage block, containing anthropic caching information + + Returns: + Tuple[float, float] - prompt_cost_in_usd, completion_cost_in_usd + """ + base_model = model + if model.startswith("databricks/dbrx-instruct") or model.startswith( + "dbrx-instruct" + ): + base_model = "databricks-dbrx-instruct" + + ## GET MODEL INFO + model_info = get_model_info(model=base_model, custom_llm_provider="databricks") + + ## CALCULATE INPUT COST + + prompt_cost: float = usage["prompt_tokens"] * model_info["input_cost_per_token"] + + ## CALCULATE OUTPUT COST + completion_cost = usage["completion_tokens"] * model_info["output_cost_per_token"] + + return prompt_cost, completion_cost diff --git a/litellm/llms/sagemaker/sagemaker.py b/litellm/llms/sagemaker/sagemaker.py index cbf1a9f62b..a7b36134b5 100644 --- a/litellm/llms/sagemaker/sagemaker.py +++ b/litellm/llms/sagemaker/sagemaker.py @@ -273,7 +273,7 @@ class SagemakerLLM(BaseAWSLLM): model_id = optional_params.get("model_id", None) if use_messages_api is True: - from litellm.llms.databricks import DatabricksChatCompletion + from litellm.llms.databricks.chat import DatabricksChatCompletion openai_like_chat_completions = DatabricksChatCompletion() inference_params["stream"] = True if stream is True else False diff --git a/litellm/llms/vertex_ai_and_google_ai_studio/vertex_ai_partner_models/main.py b/litellm/llms/vertex_ai_and_google_ai_studio/vertex_ai_partner_models/main.py index 69909765e8..c30fa900f0 100644 --- a/litellm/llms/vertex_ai_and_google_ai_studio/vertex_ai_partner_models/main.py +++ b/litellm/llms/vertex_ai_and_google_ai_studio/vertex_ai_partner_models/main.py @@ -80,7 +80,7 @@ class VertexAIPartnerModels(BaseLLM): import vertexai from google.cloud import aiplatform - from litellm.llms.databricks import DatabricksChatCompletion + from litellm.llms.databricks.chat import DatabricksChatCompletion from litellm.llms.OpenAI.openai import OpenAIChatCompletion from litellm.llms.text_completion_codestral import CodestralTextCompletion from litellm.llms.vertex_ai_and_google_ai_studio.gemini.vertex_and_google_ai_studio_gemini import ( diff --git a/litellm/main.py b/litellm/main.py index cb35556191..1d20cf4240 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -92,7 +92,7 @@ from .llms.cohere import chat as cohere_chat from .llms.cohere import completion as cohere_completion # type: ignore from .llms.cohere import embed as cohere_embed from .llms.custom_llm import CustomLLM, custom_chat_llm_router -from .llms.databricks import DatabricksChatCompletion +from .llms.databricks.chat import DatabricksChatCompletion from .llms.huggingface_restapi import Huggingface from .llms.OpenAI.audio_transcriptions import OpenAIAudioTranscription from .llms.OpenAI.openai import OpenAIChatCompletion, OpenAITextCompletion @@ -1013,7 +1013,10 @@ def completion( api_base = api_base or litellm.api_base or get_secret("AZURE_API_BASE") api_version = ( - api_version or litellm.api_version or get_secret("AZURE_API_VERSION") + api_version + or litellm.api_version + or get_secret("AZURE_API_VERSION") + or litellm.AZURE_DEFAULT_API_VERSION ) api_key = ( @@ -1209,6 +1212,9 @@ def completion( custom_llm_provider == "text-completion-openai" or "ft:babbage-002" in model or "ft:davinci-002" in model # support for finetuned completion models + or custom_llm_provider + in litellm.openai_text_completion_compatible_providers + and kwargs.get("text_completion") is True ): openai.api_type = "openai" @@ -4099,8 
+4105,8 @@ def text_completion( kwargs.pop("prompt", None) - if ( - _model is not None and custom_llm_provider == "openai" + if _model is not None and ( + custom_llm_provider == "openai" ): # for openai compatible endpoints - e.g. vllm, call the native /v1/completions endpoint for text completion calls if _model not in litellm.open_ai_chat_completion_models: model = "text-completion-openai/" + _model diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json index 487e187a3c..912c968311 100644 --- a/litellm/model_prices_and_context_window_backup.json +++ b/litellm/model_prices_and_context_window_backup.json @@ -2512,16 +2512,16 @@ "max_audio_length_hours": 8.4, "max_audio_per_prompt": 1, "max_pdf_size_mb": 30, - "input_cost_per_token": 0.00000035, - "input_cost_per_token_above_128k_tokens": 0.0000007, - "output_cost_per_token": 0.00000105, - "output_cost_per_token_above_128k_tokens": 0.0000021, + "input_cost_per_token": 0.000000075, + "input_cost_per_token_above_128k_tokens": 0.00000015, + "output_cost_per_token": 0.0000003, + "output_cost_per_token_above_128k_tokens": 0.0000006, "litellm_provider": "gemini", "mode": "chat", "supports_system_messages": true, "supports_function_calling": true, "supports_vision": true, - "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models" + "source": "https://ai.google.dev/pricing" }, "gemini/gemini-1.5-flash-latest": { "max_tokens": 8192, @@ -2533,16 +2533,16 @@ "max_audio_length_hours": 8.4, "max_audio_per_prompt": 1, "max_pdf_size_mb": 30, - "input_cost_per_token": 0.00000035, - "input_cost_per_token_above_128k_tokens": 0.0000007, - "output_cost_per_token": 0.00000105, - "output_cost_per_token_above_128k_tokens": 0.0000021, + "input_cost_per_token": 0.000000075, + "input_cost_per_token_above_128k_tokens": 0.00000015, + "output_cost_per_token": 0.0000003, + "output_cost_per_token_above_128k_tokens": 0.0000006, "litellm_provider": "gemini", "mode": "chat", "supports_system_messages": true, "supports_function_calling": true, "supports_vision": true, - "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models" + "source": "https://ai.google.dev/pricing" }, "gemini/gemini-pro": { "max_tokens": 8192, diff --git a/litellm/proxy/_new_secret_config.yaml b/litellm/proxy/_new_secret_config.yaml index 335e934475..bf86da1e12 100644 --- a/litellm/proxy/_new_secret_config.yaml +++ b/litellm/proxy/_new_secret_config.yaml @@ -1,16 +1,9 @@ model_list: - - model_name: "anthropic/claude-3-5-sonnet-20240620" + - model_name: "gpt-turbo" litellm_params: - model: anthropic/claude-3-5-sonnet-20240620 - # api_base: http://0.0.0.0:9000 - - model_name: gpt-3.5-turbo - litellm_params: - model: openai/* + model: azure/chatgpt-v-2 + api_key: os.environ/AZURE_API_KEY + api_base: os.environ/AZURE_API_BASE -litellm_settings: - success_callback: ["s3"] - s3_callback_params: - s3_bucket_name: litellm-logs # AWS Bucket Name for S3 - s3_region_name: us-west-2 # AWS Region Name for S3 - s3_aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID # us os.environ/ to pass environment variables. 
This is AWS Access Key ID for S3 - s3_aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY # AWS Secret Access Key for S3 \ No newline at end of file +router_settings: + model_group_alias: {"gpt-4": "gpt-turbo"} \ No newline at end of file diff --git a/litellm/proxy/_types.py b/litellm/proxy/_types.py index c2b240ea65..3559a4792f 100644 --- a/litellm/proxy/_types.py +++ b/litellm/proxy/_types.py @@ -242,6 +242,9 @@ class LiteLLMRoutes(enum.Enum): "/v1/models", # token counter "/utils/token_counter", + # rerank + "/rerank", + "/v1/rerank", ] mapped_pass_through_routes: List = [ diff --git a/litellm/proxy/health_check.py b/litellm/proxy/health_check.py index ff5ed7bfb7..215d2d8d60 100644 --- a/litellm/proxy/health_check.py +++ b/litellm/proxy/health_check.py @@ -3,7 +3,7 @@ import asyncio import logging import random -from typing import Optional +from typing import List, Optional import litellm from litellm._logging import print_verbose @@ -36,6 +36,25 @@ def _clean_endpoint_data(endpoint_data: dict, details: Optional[bool] = True): ) +def filter_deployments_by_id( + model_list: List, +) -> List: + seen_ids = set() + filtered_deployments = [] + + for deployment in model_list: + _model_info = deployment.get("model_info") or {} + _id = _model_info.get("id") or None + if _id is None: + continue + + if _id not in seen_ids: + seen_ids.add(_id) + filtered_deployments.append(deployment) + + return filtered_deployments + + async def _perform_health_check(model_list: list, details: Optional[bool] = True): """ Perform a health check for each model in the list. @@ -105,6 +124,9 @@ async def perform_health_check( _new_model_list = [x for x in model_list if x["model_name"] == model] model_list = _new_model_list + model_list = filter_deployments_by_id( + model_list=model_list + ) # filter duplicate deployments (e.g. 
when model alias'es are used) healthy_endpoints, unhealthy_endpoints = await _perform_health_check( model_list, details ) diff --git a/litellm/proxy/litellm_pre_call_utils.py b/litellm/proxy/litellm_pre_call_utils.py index d41aae50f6..890c576c94 100644 --- a/litellm/proxy/litellm_pre_call_utils.py +++ b/litellm/proxy/litellm_pre_call_utils.py @@ -86,10 +86,11 @@ def convert_key_logging_metadata_to_callback( team_callback_settings_obj.success_callback = [] if team_callback_settings_obj.failure_callback is None: team_callback_settings_obj.failure_callback = [] + if data.callback_name not in team_callback_settings_obj.success_callback: team_callback_settings_obj.success_callback.append(data.callback_name) - if data.callback_name in team_callback_settings_obj.failure_callback: + if data.callback_name not in team_callback_settings_obj.failure_callback: team_callback_settings_obj.failure_callback.append(data.callback_name) for var, value in data.callback_vars.items(): diff --git a/litellm/proxy/management_helpers/utils.py b/litellm/proxy/management_helpers/utils.py index efbe667fb6..af8e852013 100644 --- a/litellm/proxy/management_helpers/utils.py +++ b/litellm/proxy/management_helpers/utils.py @@ -109,8 +109,8 @@ async def add_new_member( where={"user_id": user_info.user_id}, # type: ignore data={"teams": {"push": [team_id]}}, ) - - returned_user = LiteLLM_UserTable(**_returned_user.model_dump()) + if _returned_user is not None: + returned_user = LiteLLM_UserTable(**_returned_user.model_dump()) elif len(existing_user_row) > 1: raise HTTPException( status_code=400, diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml index e385a23d7e..ad7fbd384e 100644 --- a/litellm/proxy/proxy_config.yaml +++ b/litellm/proxy/proxy_config.yaml @@ -1,19 +1,19 @@ model_list: - model_name: openai/* litellm_params: - model: gpt-3.5-turbo + model: openai/* api_key: os.environ/OPENAI_API_KEY - -litellm_settings: - success_callback: ["prometheus"] - failure_callback: ["prometheus"] - -guardrails: - - guardrail_name: "presidio-pre-guard" + model_info: + id: "good-openai" + - model_name: openai/* litellm_params: - guardrail: presidio # supported values: "aporia", "lakera", "presidio" - mode: "pre_call" # pre_call, during_call, post_call - output_parse_pii: True + model: openai/* + api_key: os.environ/non-exsitent-env-var + tags: ["bad-model"] + model_info: + id: "test-openai" + + litellm_settings: callbacks: ["otel"] @@ -22,8 +22,16 @@ callback_settings: otel: message_logging: False +router_settings: + enable_tag_filtering: True # πŸ‘ˆ Key Chang + + general_settings: master_key: sk-1234 alerting: ["slack"] spend_report_frequency: "1d" + +litellm_settings: + success_callback: ["prometheus"] + failure_callback: ["prometheus"] \ No newline at end of file diff --git a/litellm/router.py b/litellm/router.py index bcd0b6221d..5a01f4f395 100644 --- a/litellm/router.py +++ b/litellm/router.py @@ -3690,7 +3690,7 @@ class Router: exception=original_exception, ) - allowed_fails = _allowed_fails or self.allowed_fails + allowed_fails = _allowed_fails if _allowed_fails is not None else self.allowed_fails dt = get_utc_datetime() current_minute = dt.strftime("%H-%M") @@ -4556,6 +4556,27 @@ class Router: ids.append(id) return ids + def _get_all_deployments( + self, model_name: str, model_alias: Optional[str] = None + ) -> List[DeploymentTypedDict]: + """ + Return all deployments of a model name + + Used for accurate 'get_model_list'. 
+ """ + + returned_models: List[DeploymentTypedDict] = [] + for model in self.model_list: + if model["model_name"] == model_name: + if model_alias is not None: + alias_model = copy.deepcopy(model) + alias_model["model_name"] = model_name + returned_models.append(alias_model) + else: + returned_models.append(model) + + return returned_models + def get_model_names(self) -> List[str]: """ Returns all possible model names for router. @@ -4567,15 +4588,18 @@ class Router: def get_model_list( self, model_name: Optional[str] = None ) -> Optional[List[DeploymentTypedDict]]: + """ + Includes router model_group_alias'es as well + """ if hasattr(self, "model_list"): returned_models: List[DeploymentTypedDict] = [] for model_alias, model_value in self.model_group_alias.items(): - model_alias_item = DeploymentTypedDict( - model_name=model_alias, - litellm_params=LiteLLMParamsTypedDict(model=model_value), + returned_models.extend( + self._get_all_deployments( + model_name=model_value, model_alias=model_alias + ) ) - returned_models.append(model_alias_item) if model_name is None: returned_models += self.model_list @@ -4583,8 +4607,7 @@ class Router: return returned_models for model in self.model_list: - if model["model_name"] == model_name: - returned_models.append(model) + returned_models.extend(self._get_all_deployments(model_name=model_name)) return returned_models return None diff --git a/litellm/router_strategy/tag_based_routing.py b/litellm/router_strategy/tag_based_routing.py index ed350109c1..78bc5e4f9f 100644 --- a/litellm/router_strategy/tag_based_routing.py +++ b/litellm/router_strategy/tag_based_routing.py @@ -1,5 +1,9 @@ """ -Use this to route requests between free and paid tiers +Use this to route requests between Teams + +- If tags in request is a subset of tags in deployment, return deployment +- if deployments are set with default tags, return all default deployment +- If no default_deployments are set, return all deployments """ from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, TypedDict, Union @@ -25,14 +29,14 @@ async def get_deployments_for_tag( if request_kwargs is None: verbose_logger.debug( - "get_deployments_for_tier: request_kwargs is None returning healthy_deployments: %s", + "get_deployments_for_tag: request_kwargs is None returning healthy_deployments: %s", healthy_deployments, ) return healthy_deployments if healthy_deployments is None: verbose_logger.debug( - "get_deployments_for_tier: healthy_deployments is None returning healthy_deployments" + "get_deployments_for_tag: healthy_deployments is None returning healthy_deployments" ) return healthy_deployments @@ -43,7 +47,9 @@ async def get_deployments_for_tag( new_healthy_deployments = [] if request_tags: - verbose_logger.debug("parameter routing: router_keys: %s", request_tags) + verbose_logger.debug( + "get_deployments_for_tag routing: router_keys: %s", request_tags + ) # example this can be router_keys=["free", "custom"] # get all deployments that have a superset of these router keys for deployment in healthy_deployments: @@ -66,9 +72,26 @@ async def get_deployments_for_tag( request_tags, ) new_healthy_deployments.append(deployment) + elif "default" in deployment_tags: + verbose_logger.debug( + "adding default deployment with tags: %s, request tags: %s", + deployment_tags, + request_tags, + ) + new_healthy_deployments.append(deployment) return new_healthy_deployments + # for Untagged requests use default deployments if set + _default_deployments_with_tags = [] + for deployment in healthy_deployments: + 
if "default" in deployment.get("litellm_params", {}).get("tags", []): + _default_deployments_with_tags.append(deployment) + + if len(_default_deployments_with_tags) > 0: + return _default_deployments_with_tags + + # if no default deployment is found, return healthy_deployments verbose_logger.debug( "no tier found in metadata, returning healthy_deployments: %s", healthy_deployments, diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index 619d2ab5d3..3adf3bbee9 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -626,6 +626,8 @@ async def test_model_function_invoke(model, sync_mode, api_key, api_base): response = await litellm.acompletion(**data) print(f"response: {response}") + except litellm.InternalServerError: + pass except litellm.RateLimitError as e: pass except Exception as e: @@ -889,18 +891,29 @@ def encode_image(image_path): return base64.b64encode(image_file.read()).decode("utf-8") -@pytest.mark.skip( - reason="we already test claude-3, this is just another way to pass images" -) -def test_completion_claude_3_base64(): +@pytest.mark.parametrize( + "model", + [ + "gpt-4o", + "azure/gpt-4o", + "anthropic/claude-3-opus-20240229", + ], +) # +def test_completion_base64(model): try: + import base64 + + import requests + litellm.set_verbose = True - litellm.num_retries = 3 - image_path = "../proxy/cached_logo.jpg" - # Getting the base64 string - base64_image = encode_image(image_path) + url = "https://dummyimage.com/100/100/fff&text=Test+image" + response = requests.get(url) + file_data = response.content + + encoded_file = base64.b64encode(file_data).decode("utf-8") + base64_image = f"data:image/png;base64,{encoded_file}" resp = litellm.completion( - model="anthropic/claude-3-opus-20240229", + model=model, messages=[ { "role": "user", @@ -908,9 +921,7 @@ def test_completion_claude_3_base64(): {"type": "text", "text": "Whats in this image?"}, { "type": "image_url", - "image_url": { - "url": "data:image/jpeg;base64," + base64_image - }, + "image_url": {"url": base64_image}, }, ], } @@ -919,7 +930,6 @@ def test_completion_claude_3_base64(): print(f"\nResponse: {resp}") prompt_tokens = resp.usage.prompt_tokens - raise Exception("it worked!") except Exception as e: if "500 Internal error encountered.'" in str(e): pass @@ -2174,15 +2184,16 @@ def test_completion_openai(): @pytest.mark.parametrize( - "model", + "model, api_version", [ - "gpt-4o-2024-08-06", - "azure/chatgpt-v-2", - "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", + ("gpt-4o-2024-08-06", None), + ("azure/chatgpt-v-2", None), + ("bedrock/anthropic.claude-3-sonnet-20240229-v1:0", None), + ("azure/gpt-4o", "2024-08-01-preview"), ], ) @pytest.mark.flaky(retries=3, delay=1) -def test_completion_openai_pydantic(model): +def test_completion_openai_pydantic(model, api_version): try: litellm.set_verbose = True from pydantic import BaseModel @@ -2207,6 +2218,7 @@ def test_completion_openai_pydantic(model): messages=messages, metadata={"hi": "bye"}, response_format=EventsList, + api_version=api_version, ) break except litellm.JSONSchemaValidationError: @@ -3469,14 +3481,14 @@ def response_format_tests(response: litellm.ModelResponse): @pytest.mark.parametrize( "model", [ - # "bedrock/cohere.command-r-plus-v1:0", + "bedrock/mistral.mistral-large-2407-v1:0", + "bedrock/cohere.command-r-plus-v1:0", "anthropic.claude-3-sonnet-20240229-v1:0", - # "anthropic.claude-instant-v1", - # "bedrock/ai21.j2-mid", - # "mistral.mistral-7b-instruct-v0:2", + "anthropic.claude-instant-v1", 
+ "mistral.mistral-7b-instruct-v0:2", # "bedrock/amazon.titan-tg1-large", - # "meta.llama3-8b-instruct-v1:0", - # "cohere.command-text-v14", + "meta.llama3-8b-instruct-v1:0", + "cohere.command-text-v14", ], ) @pytest.mark.parametrize("sync_mode", [True, False]) @@ -3491,6 +3503,7 @@ async def test_completion_bedrock_httpx_models(sync_mode, model): messages=[{"role": "user", "content": "Hey! how's it going?"}], temperature=0.2, max_tokens=200, + stop=["stop sequence"], ) assert isinstance(response, litellm.ModelResponse) @@ -3502,6 +3515,7 @@ async def test_completion_bedrock_httpx_models(sync_mode, model): messages=[{"role": "user", "content": "Hey! how's it going?"}], temperature=0.2, max_tokens=100, + stop=["stop sequence"], ) assert isinstance(response, litellm.ModelResponse) diff --git a/litellm/tests/test_completion_cost.py b/litellm/tests/test_completion_cost.py index 55a5abbdd7..ed9eebedb1 100644 --- a/litellm/tests/test_completion_cost.py +++ b/litellm/tests/test_completion_cost.py @@ -1219,3 +1219,13 @@ def test_completion_cost_anthropic_prompt_caching(): cost_2 = completion_cost(model=model, completion_response=response_2) assert cost_1 > cost_2 + + +def test_completion_cost_databricks(): + model, messages = "databricks/databricks-dbrx-instruct", [ + {"role": "user", "content": "What is 2+2?"} + ] + + resp = litellm.completion(model=model, messages=messages) # works fine + + cost = completion_cost(completion_response=resp) diff --git a/litellm/tests/test_exceptions.py b/litellm/tests/test_exceptions.py index 0388e026b9..a570692f6d 100644 --- a/litellm/tests/test_exceptions.py +++ b/litellm/tests/test_exceptions.py @@ -864,7 +864,7 @@ def _pre_call_utils( data["messages"] = [{"role": "user", "content": "Hello world"}] if streaming is True: data["stream"] = True - mapped_target = client.chat.completions.with_raw_response + mapped_target = client.chat.completions.with_raw_response # type: ignore if sync_mode: original_function = litellm.completion else: @@ -873,7 +873,7 @@ def _pre_call_utils( data["prompt"] = "Hello world" if streaming is True: data["stream"] = True - mapped_target = client.completions.with_raw_response + mapped_target = client.completions.with_raw_response # type: ignore if sync_mode: original_function = litellm.text_completion else: diff --git a/litellm/tests/test_function_calling.py b/litellm/tests/test_function_calling.py index 79db9f1623..f30f713ead 100644 --- a/litellm/tests/test_function_calling.py +++ b/litellm/tests/test_function_calling.py @@ -52,6 +52,7 @@ def get_current_weather(location, unit="fahrenheit"): # "anthropic.claude-3-sonnet-20240229-v1:0", ], ) +@pytest.mark.flaky(retries=3, delay=1) def test_aaparallel_function_call(model): try: litellm.set_verbose = True diff --git a/litellm/tests/test_proxy_server.py b/litellm/tests/test_proxy_server.py index fb1025ab26..102c126d11 100644 --- a/litellm/tests/test_proxy_server.py +++ b/litellm/tests/test_proxy_server.py @@ -1255,7 +1255,17 @@ async def test_add_callback_via_key(prisma_client): @pytest.mark.asyncio -async def test_add_callback_via_key_litellm_pre_call_utils(prisma_client): +@pytest.mark.parametrize( + "callback_type, expected_success_callbacks, expected_failure_callbacks", + [ + ("success", ["langfuse"], []), + ("failure", [], ["langfuse"]), + ("success_and_failure", ["langfuse"], ["langfuse"]), + ], +) +async def test_add_callback_via_key_litellm_pre_call_utils( + prisma_client, callback_type, expected_success_callbacks, expected_failure_callbacks +): import json from fastapi import 
HTTPException, Request, Response @@ -1312,7 +1322,7 @@ async def test_add_callback_via_key_litellm_pre_call_utils(prisma_client): "logging": [ { "callback_name": "langfuse", - "callback_type": "success", + "callback_type": callback_type, "callback_vars": { "langfuse_public_key": "my-mock-public-key", "langfuse_secret_key": "my-mock-secret-key", @@ -1359,14 +1369,21 @@ async def test_add_callback_via_key_litellm_pre_call_utils(prisma_client): } new_data = await add_litellm_data_to_request(**data) + print("NEW DATA: {}".format(new_data)) - assert "success_callback" in new_data - assert new_data["success_callback"] == ["langfuse"] assert "langfuse_public_key" in new_data assert new_data["langfuse_public_key"] == "my-mock-public-key" assert "langfuse_secret_key" in new_data assert new_data["langfuse_secret_key"] == "my-mock-secret-key" + if expected_success_callbacks: + assert "success_callback" in new_data + assert new_data["success_callback"] == expected_success_callbacks + + if expected_failure_callbacks: + assert "failure_callback" in new_data + assert new_data["failure_callback"] == expected_failure_callbacks + @pytest.mark.asyncio async def test_gemini_pass_through_endpoint(): diff --git a/litellm/tests/test_router_tag_routing.py b/litellm/tests/test_router_tag_routing.py index 67f100d794..f71a9b762d 100644 --- a/litellm/tests/test_router_tag_routing.py +++ b/litellm/tests/test_router_tag_routing.py @@ -91,3 +91,72 @@ async def test_router_free_paid_tier(): print("response_extra_info: ", response_extra_info) assert response_extra_info["model_id"] == "very-expensive-model" + + +@pytest.mark.asyncio() +async def test_default_tagged_deployments(): + """ + - only use default deployment for untagged requests + - if a request has tag "default", use default deployment + """ + + router = litellm.Router( + model_list=[ + { + "model_name": "gpt-4", + "litellm_params": { + "model": "gpt-4o", + "api_base": "https://exampleopenaiendpoint-production.up.railway.app/", + "tags": ["default"], + }, + "model_info": {"id": "default-model"}, + }, + { + "model_name": "gpt-4", + "litellm_params": { + "model": "gpt-4o", + "api_base": "https://exampleopenaiendpoint-production.up.railway.app/", + }, + "model_info": {"id": "default-model-2"}, + }, + { + "model_name": "gpt-4", + "litellm_params": { + "model": "gpt-4o-mini", + "api_base": "https://exampleopenaiendpoint-production.up.railway.app/", + "tags": ["teamA"], + }, + "model_info": {"id": "very-expensive-model"}, + }, + ], + enable_tag_filtering=True, + ) + + for _ in range(5): + # Untagged request, this should pick model with id == "default-model" + response = await router.acompletion( + model="gpt-4", + messages=[{"role": "user", "content": "Tell me a joke."}], + ) + + print("Response: ", response) + + response_extra_info = response._hidden_params + print("response_extra_info: ", response_extra_info) + + assert response_extra_info["model_id"] == "default-model" + + for _ in range(5): + # requests tagged with "default", this should pick model with id == "default-model" + response = await router.acompletion( + model="gpt-4", + messages=[{"role": "user", "content": "Tell me a joke."}], + metadata={"tags": ["default"]}, + ) + + print("Response: ", response) + + response_extra_info = response._hidden_params + print("response_extra_info: ", response_extra_info) + + assert response_extra_info["model_id"] == "default-model" diff --git a/litellm/tests/test_text_completion.py b/litellm/tests/test_text_completion.py index 70325c44e2..e6a4a0499c 100644 --- 
a/litellm/tests/test_text_completion.py +++ b/litellm/tests/test_text_completion.py @@ -4239,3 +4239,14 @@ def test_completion_vllm(): mock_call.assert_called_once() assert "hello" in mock_call.call_args.kwargs["extra_body"] + + +def test_completion_fireworks_ai_multiple_choices(): + litellm.set_verbose = True + response = litellm.text_completion( + model="fireworks_ai/llama-v3p1-8b-instruct", + prompt=["halo", "hi", "halo", "hi"], + ) + print(response.choices) + + assert len(response.choices) == 4 diff --git a/model_prices_and_context_window.json b/model_prices_and_context_window.json index 487e187a3c..912c968311 100644 --- a/model_prices_and_context_window.json +++ b/model_prices_and_context_window.json @@ -2512,16 +2512,16 @@ "max_audio_length_hours": 8.4, "max_audio_per_prompt": 1, "max_pdf_size_mb": 30, - "input_cost_per_token": 0.00000035, - "input_cost_per_token_above_128k_tokens": 0.0000007, - "output_cost_per_token": 0.00000105, - "output_cost_per_token_above_128k_tokens": 0.0000021, + "input_cost_per_token": 0.000000075, + "input_cost_per_token_above_128k_tokens": 0.00000015, + "output_cost_per_token": 0.0000003, + "output_cost_per_token_above_128k_tokens": 0.0000006, "litellm_provider": "gemini", "mode": "chat", "supports_system_messages": true, "supports_function_calling": true, "supports_vision": true, - "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models" + "source": "https://ai.google.dev/pricing" }, "gemini/gemini-1.5-flash-latest": { "max_tokens": 8192, @@ -2533,16 +2533,16 @@ "max_audio_length_hours": 8.4, "max_audio_per_prompt": 1, "max_pdf_size_mb": 30, - "input_cost_per_token": 0.00000035, - "input_cost_per_token_above_128k_tokens": 0.0000007, - "output_cost_per_token": 0.00000105, - "output_cost_per_token_above_128k_tokens": 0.0000021, + "input_cost_per_token": 0.000000075, + "input_cost_per_token_above_128k_tokens": 0.00000015, + "output_cost_per_token": 0.0000003, + "output_cost_per_token_above_128k_tokens": 0.0000006, "litellm_provider": "gemini", "mode": "chat", "supports_system_messages": true, "supports_function_calling": true, "supports_vision": true, - "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models" + "source": "https://ai.google.dev/pricing" }, "gemini/gemini-pro": { "max_tokens": 8192, diff --git a/proxy_server_config.yaml b/proxy_server_config.yaml index 57113d3509..b1d6b3dc66 100644 --- a/proxy_server_config.yaml +++ b/proxy_server_config.yaml @@ -148,6 +148,7 @@ router_settings: redis_password: os.environ/REDIS_PASSWORD redis_port: os.environ/REDIS_PORT enable_pre_call_checks: true + model_group_alias: {"my-special-fake-model-alias-name": "fake-openai-endpoint-3"} general_settings: master_key: sk-1234 # [OPTIONAL] Use to enforce auth on proxy. See - https://docs.litellm.ai/docs/proxy/virtual_keys diff --git a/pyproject.toml b/pyproject.toml index e07372d325..cf9a543095 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "litellm" -version = "1.44.22" +version = "1.44.23" description = "Library to easily interface with LLM API providers" authors = ["BerriAI"] license = "MIT" @@ -91,7 +91,7 @@ requires = ["poetry-core", "wheel"] build-backend = "poetry.core.masonry.api" [tool.commitizen] -version = "1.44.22" +version = "1.44.23" version_files = [ "pyproject.toml:^version" ]
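The selection order implemented by the `litellm/router_strategy/tag_based_routing.py` change above is: for tagged requests, keep every deployment whose tags cover the request tags, plus any deployment tagged `default`; for untagged requests, fall back to the `default`-tagged deployments when any exist; otherwise return all healthy deployments unchanged. A condensed, self-contained sketch of that order (simplified from the docstring and fragments visible in the hunk; not the actual async helper):

```python
from typing import Any, Dict, List, Optional


def select_deployments_for_tags(
    healthy_deployments: List[Dict[str, Any]],
    request_tags: Optional[List[str]],
) -> List[Dict[str, Any]]:
    """Condensed restatement of the selection order in get_deployments_for_tag."""
    if not healthy_deployments:
        return healthy_deployments

    if request_tags:
        selected = []
        for deployment in healthy_deployments:
            deployment_tags = deployment.get("litellm_params", {}).get("tags") or []
            # keep deployments whose tags cover every request tag,
            # plus anything explicitly tagged "default"
            if set(request_tags).issubset(deployment_tags) or "default" in deployment_tags:
                selected.append(deployment)
        return selected

    # untagged request: prefer "default"-tagged deployments when any exist
    default_deployments = [
        d
        for d in healthy_deployments
        if "default" in (d.get("litellm_params", {}).get("tags") or [])
    ]
    return default_deployments or healthy_deployments


deployments = [
    {"litellm_params": {"model": "openai/gpt-4o", "tags": ["default"]}},
    {"litellm_params": {"model": "openai/gpt-4o-mini", "tags": ["teamA"]}},
]
assert select_deployments_for_tags(deployments, None) == [deployments[0]]
# "teamA" match is kept, and the "default" deployment stays eligible too
assert select_deployments_for_tags(deployments, ["teamA"]) == deployments
```

This is why the tag_routing.md docs and the new `test_default_tagged_deployments` test both add a `default`-tagged deployment: untagged traffic now lands on it (asserted via the `default-model` id) instead of spreading across every deployment.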