From 09ca581620f1d738c9f3736e213ec7b5fd60f12a Mon Sep 17 00:00:00 2001 From: Krish Dholakia Date: Mon, 9 Sep 2024 21:56:12 -0700 Subject: [PATCH] LiteLLM Minor Fixes and Improvements (09/09/2024) (#5602) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix(main.py): pass default azure api version as alternative in completion call Fixes api error caused due to api version Closes https://github.com/BerriAI/litellm/issues/5584 * Fixed gemini-1.5-flash pricing (#5590) * add /key/list endpoint * bump: version 1.44.21 → 1.44.22 * docs architecture * Fixed gemini-1.5-flash pricing --------- Co-authored-by: Ishaan Jaff * fix(bedrock/chat.py): fix converse api stop sequence param mapping Fixes https://github.com/BerriAI/litellm/issues/5592 * fix(databricks/cost_calculator.py): handle databricks model name changes Fixes https://github.com/BerriAI/litellm/issues/5597 * fix(azure.py): support azure api version 2024-08-01-preview Closes https://github.com/BerriAI/litellm/issues/5377 * fix(proxy/_types.py): allow dev keys to call cohere /rerank endpoint Fixes issue where only admin could call rerank endpoint * fix(azure.py): check if model is gpt-4o * fix(proxy/_types.py): support /v1/rerank on non-admin routes as well * fix(cost_calculator.py): fix split on `/` logic in cost calculator --------- Co-authored-by: F1bos <44951186+F1bos@users.noreply.github.com> Co-authored-by: Ishaan Jaff --- litellm/__init__.py | 4 +- litellm/cost_calculator.py | 7 ++- litellm/llms/AzureOpenAI/azure.py | 5 +- litellm/llms/bedrock/chat.py | 6 +- .../{databricks.py => databricks/chat.py} | 10 +++- litellm/llms/databricks/cost_calculator.py | 39 ++++++++++++ litellm/llms/sagemaker/sagemaker.py | 2 +- .../vertex_ai_partner_models/main.py | 2 +- litellm/main.py | 7 ++- ...odel_prices_and_context_window_backup.json | 20 +++---- litellm/proxy/_types.py | 3 + litellm/tests/test_completion.py | 60 +++++++++++-------- litellm/tests/test_completion_cost.py | 10 ++++ model_prices_and_context_window.json | 20 +++---- 14 files changed, 139 insertions(+), 56 deletions(-) rename litellm/llms/{databricks.py => databricks/chat.py} (98%) create mode 100644 litellm/llms/databricks/cost_calculator.py diff --git a/litellm/__init__.py b/litellm/__init__.py index 0b2ec8820b..cf13edce40 100644 --- a/litellm/__init__.py +++ b/litellm/__init__.py @@ -118,7 +118,7 @@ in_memory_llm_clients_cache: dict = {} safe_memory_mode: bool = False enable_azure_ad_token_refresh: Optional[bool] = False ### DEFAULT AZURE API VERSION ### -AZURE_DEFAULT_API_VERSION = "2024-07-01-preview" # this is updated to the latest +AZURE_DEFAULT_API_VERSION = "2024-08-01-preview" # this is updated to the latest ### COHERE EMBEDDINGS DEFAULT TYPE ### COHERE_DEFAULT_EMBEDDING_INPUT_TYPE = "search_document" ### GUARDRAILS ### @@ -868,7 +868,7 @@ from .llms.custom_llm import CustomLLM from .llms.huggingface_restapi import HuggingfaceConfig from .llms.anthropic.chat import AnthropicConfig from .llms.anthropic.completion import AnthropicTextConfig -from .llms.databricks import DatabricksConfig, DatabricksEmbeddingConfig +from .llms.databricks.chat import DatabricksConfig, DatabricksEmbeddingConfig from .llms.predibase import PredibaseConfig from .llms.replicate import ReplicateConfig from .llms.cohere.completion import CohereConfig diff --git a/litellm/cost_calculator.py b/litellm/cost_calculator.py index 1eb4d0eb94..bcec062de1 100644 --- a/litellm/cost_calculator.py +++ b/litellm/cost_calculator.py @@ -22,6 +22,9 @@ from 
litellm.litellm_core_utils.llm_cost_calc.utils import _generic_cost_per_cha from litellm.llms.anthropic.cost_calculation import ( cost_per_token as anthropic_cost_per_token, ) +from litellm.llms.databricks.cost_calculator import ( + cost_per_token as databricks_cost_per_token, +) from litellm.rerank_api.types import RerankResponse from litellm.types.llms.openai import HttpxBinaryResponseContent from litellm.types.router import SPECIAL_MODEL_INFO_PARAMS @@ -159,7 +162,7 @@ def cost_per_token( _, custom_llm_provider, _, _ = litellm.get_llm_provider(model=model) model_without_prefix = model - model_parts = model.split("/") + model_parts = model.split("/", 1) if len(model_parts) > 1: model_without_prefix = model_parts[1] else: @@ -212,6 +215,8 @@ def cost_per_token( ) elif custom_llm_provider == "anthropic": return anthropic_cost_per_token(model=model, usage=usage_block) + elif custom_llm_provider == "databricks": + return databricks_cost_per_token(model=model, usage=usage_block) elif custom_llm_provider == "gemini": return google_cost_per_token( model=model_without_prefix, diff --git a/litellm/llms/AzureOpenAI/azure.py b/litellm/llms/AzureOpenAI/azure.py index 70f13375d2..8da9ee063b 100644 --- a/litellm/llms/AzureOpenAI/azure.py +++ b/litellm/llms/AzureOpenAI/azure.py @@ -245,7 +245,10 @@ class AzureOpenAIConfig: - You should set tool_choice (see Forcing tool use) to instruct the model to explicitly use that tool - Remember that the model will pass the input to the tool, so the name of the tool and description should be from the model’s perspective. """ - if json_schema is not None: + if json_schema is not None and ( + (api_version_year <= "2024" and api_version_month < "08") + or "gpt-4o" not in model + ): # azure api version "2024-08-01-preview" onwards supports 'json_schema' only for gpt-4o _tool_choice = ChatCompletionToolChoiceObjectParam( type="function", function=ChatCompletionToolChoiceFunctionParam( diff --git a/litellm/llms/bedrock/chat.py b/litellm/llms/bedrock/chat.py index 4192ae868c..ee09797ba2 100644 --- a/litellm/llms/bedrock/chat.py +++ b/litellm/llms/bedrock/chat.py @@ -736,7 +736,9 @@ class BedrockLLM(BaseAWSLLM): if (stream is not None and stream is True) and provider != "ai21": endpoint_url = f"{endpoint_url}/model/{modelId}/invoke-with-response-stream" - proxy_endpoint_url = f"{proxy_endpoint_url}/model/{modelId}/invoke-with-response-stream" + proxy_endpoint_url = ( + f"{proxy_endpoint_url}/model/{modelId}/invoke-with-response-stream" + ) else: endpoint_url = f"{endpoint_url}/model/{modelId}/invoke" proxy_endpoint_url = f"{proxy_endpoint_url}/model/{modelId}/invoke" @@ -1268,7 +1270,7 @@ class AmazonConverseConfig: if len(value) == 0: # converse raises error for empty strings continue value = [value] - optional_params["stop_sequences"] = value + optional_params["stopSequences"] = value if param == "temperature": optional_params["temperature"] = value if param == "top_p": diff --git a/litellm/llms/databricks.py b/litellm/llms/databricks/chat.py similarity index 98% rename from litellm/llms/databricks.py rename to litellm/llms/databricks/chat.py index 3cc1c24568..0421cd9e46 100644 --- a/litellm/llms/databricks.py +++ b/litellm/llms/databricks/chat.py @@ -29,8 +29,8 @@ from litellm.types.utils import ( ) from litellm.utils import CustomStreamWrapper, EmbeddingResponse, ModelResponse, Usage -from .base import BaseLLM -from .prompt_templates.factory import custom_prompt, prompt_factory +from ..base import BaseLLM +from ..prompt_templates.factory import custom_prompt, 
prompt_factory class DatabricksError(Exception): @@ -328,6 +328,7 @@ class DatabricksChatCompletion(BaseLLM): api_base: str, custom_prompt_dict: dict, model_response: ModelResponse, + custom_llm_provider: str, print_verbose: Callable, encoding, api_key, @@ -371,6 +372,8 @@ class DatabricksChatCompletion(BaseLLM): ) response = ModelResponse(**response_json) + response.model = custom_llm_provider + "/" + response.model + if base_model is not None: response._hidden_params["model"] = base_model return response @@ -472,6 +475,7 @@ class DatabricksChatCompletion(BaseLLM): data=data, api_base=api_base, custom_prompt_dict=custom_prompt_dict, + custom_llm_provider=custom_llm_provider, model_response=model_response, print_verbose=print_verbose, encoding=encoding, @@ -528,6 +532,8 @@ class DatabricksChatCompletion(BaseLLM): response = ModelResponse(**response_json) + response.model = custom_llm_provider + "/" + response.model + if base_model is not None: response._hidden_params["model"] = base_model diff --git a/litellm/llms/databricks/cost_calculator.py b/litellm/llms/databricks/cost_calculator.py new file mode 100644 index 0000000000..3d40f2aa62 --- /dev/null +++ b/litellm/llms/databricks/cost_calculator.py @@ -0,0 +1,39 @@ +""" +Helper util for handling databricks-specific cost calculation +- e.g.: handling 'dbrx-instruct-*' +""" + +from typing import Tuple + +from litellm.types.utils import Usage +from litellm.utils import get_model_info + + +def cost_per_token(model: str, usage: Usage) -> Tuple[float, float]: + """ + Calculates the cost per token for a given model, prompt tokens, and completion tokens. + + Input: + - model: str, the model name without provider prefix + - usage: LiteLLM Usage block, containing anthropic caching information + + Returns: + Tuple[float, float] - prompt_cost_in_usd, completion_cost_in_usd + """ + base_model = model + if model.startswith("databricks/dbrx-instruct") or model.startswith( + "dbrx-instruct" + ): + base_model = "databricks-dbrx-instruct" + + ## GET MODEL INFO + model_info = get_model_info(model=base_model, custom_llm_provider="databricks") + + ## CALCULATE INPUT COST + + prompt_cost: float = usage["prompt_tokens"] * model_info["input_cost_per_token"] + + ## CALCULATE OUTPUT COST + completion_cost = usage["completion_tokens"] * model_info["output_cost_per_token"] + + return prompt_cost, completion_cost diff --git a/litellm/llms/sagemaker/sagemaker.py b/litellm/llms/sagemaker/sagemaker.py index cbf1a9f62b..a7b36134b5 100644 --- a/litellm/llms/sagemaker/sagemaker.py +++ b/litellm/llms/sagemaker/sagemaker.py @@ -273,7 +273,7 @@ class SagemakerLLM(BaseAWSLLM): model_id = optional_params.get("model_id", None) if use_messages_api is True: - from litellm.llms.databricks import DatabricksChatCompletion + from litellm.llms.databricks.chat import DatabricksChatCompletion openai_like_chat_completions = DatabricksChatCompletion() inference_params["stream"] = True if stream is True else False diff --git a/litellm/llms/vertex_ai_and_google_ai_studio/vertex_ai_partner_models/main.py b/litellm/llms/vertex_ai_and_google_ai_studio/vertex_ai_partner_models/main.py index 69909765e8..c30fa900f0 100644 --- a/litellm/llms/vertex_ai_and_google_ai_studio/vertex_ai_partner_models/main.py +++ b/litellm/llms/vertex_ai_and_google_ai_studio/vertex_ai_partner_models/main.py @@ -80,7 +80,7 @@ class VertexAIPartnerModels(BaseLLM): import vertexai from google.cloud import aiplatform - from litellm.llms.databricks import DatabricksChatCompletion + from litellm.llms.databricks.chat import 
DatabricksChatCompletion from litellm.llms.OpenAI.openai import OpenAIChatCompletion from litellm.llms.text_completion_codestral import CodestralTextCompletion from litellm.llms.vertex_ai_and_google_ai_studio.gemini.vertex_and_google_ai_studio_gemini import ( diff --git a/litellm/main.py b/litellm/main.py index 2095322328..1d20cf4240 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -92,7 +92,7 @@ from .llms.cohere import chat as cohere_chat from .llms.cohere import completion as cohere_completion # type: ignore from .llms.cohere import embed as cohere_embed from .llms.custom_llm import CustomLLM, custom_chat_llm_router -from .llms.databricks import DatabricksChatCompletion +from .llms.databricks.chat import DatabricksChatCompletion from .llms.huggingface_restapi import Huggingface from .llms.OpenAI.audio_transcriptions import OpenAIAudioTranscription from .llms.OpenAI.openai import OpenAIChatCompletion, OpenAITextCompletion @@ -1013,7 +1013,10 @@ def completion( api_base = api_base or litellm.api_base or get_secret("AZURE_API_BASE") api_version = ( - api_version or litellm.api_version or get_secret("AZURE_API_VERSION") + api_version + or litellm.api_version + or get_secret("AZURE_API_VERSION") + or litellm.AZURE_DEFAULT_API_VERSION ) api_key = ( diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json index 487e187a3c..912c968311 100644 --- a/litellm/model_prices_and_context_window_backup.json +++ b/litellm/model_prices_and_context_window_backup.json @@ -2512,16 +2512,16 @@ "max_audio_length_hours": 8.4, "max_audio_per_prompt": 1, "max_pdf_size_mb": 30, - "input_cost_per_token": 0.00000035, - "input_cost_per_token_above_128k_tokens": 0.0000007, - "output_cost_per_token": 0.00000105, - "output_cost_per_token_above_128k_tokens": 0.0000021, + "input_cost_per_token": 0.000000075, + "input_cost_per_token_above_128k_tokens": 0.00000015, + "output_cost_per_token": 0.0000003, + "output_cost_per_token_above_128k_tokens": 0.0000006, "litellm_provider": "gemini", "mode": "chat", "supports_system_messages": true, "supports_function_calling": true, "supports_vision": true, - "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models" + "source": "https://ai.google.dev/pricing" }, "gemini/gemini-1.5-flash-latest": { "max_tokens": 8192, @@ -2533,16 +2533,16 @@ "max_audio_length_hours": 8.4, "max_audio_per_prompt": 1, "max_pdf_size_mb": 30, - "input_cost_per_token": 0.00000035, - "input_cost_per_token_above_128k_tokens": 0.0000007, - "output_cost_per_token": 0.00000105, - "output_cost_per_token_above_128k_tokens": 0.0000021, + "input_cost_per_token": 0.000000075, + "input_cost_per_token_above_128k_tokens": 0.00000015, + "output_cost_per_token": 0.0000003, + "output_cost_per_token_above_128k_tokens": 0.0000006, "litellm_provider": "gemini", "mode": "chat", "supports_system_messages": true, "supports_function_calling": true, "supports_vision": true, - "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models" + "source": "https://ai.google.dev/pricing" }, "gemini/gemini-pro": { "max_tokens": 8192, diff --git a/litellm/proxy/_types.py b/litellm/proxy/_types.py index c2b240ea65..3559a4792f 100644 --- a/litellm/proxy/_types.py +++ b/litellm/proxy/_types.py @@ -242,6 +242,9 @@ class LiteLLMRoutes(enum.Enum): "/v1/models", # token counter "/utils/token_counter", + # rerank + "/rerank", + "/v1/rerank", ] mapped_pass_through_routes: List = [ diff --git 
a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index 92310ae3cb..3adf3bbee9 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -891,18 +891,29 @@ def encode_image(image_path): return base64.b64encode(image_file.read()).decode("utf-8") -@pytest.mark.skip( - reason="we already test claude-3, this is just another way to pass images" -) -def test_completion_claude_3_base64(): +@pytest.mark.parametrize( + "model", + [ + "gpt-4o", + "azure/gpt-4o", + "anthropic/claude-3-opus-20240229", + ], +) # +def test_completion_base64(model): try: + import base64 + + import requests + litellm.set_verbose = True - litellm.num_retries = 3 - image_path = "../proxy/cached_logo.jpg" - # Getting the base64 string - base64_image = encode_image(image_path) + url = "https://dummyimage.com/100/100/fff&text=Test+image" + response = requests.get(url) + file_data = response.content + + encoded_file = base64.b64encode(file_data).decode("utf-8") + base64_image = f"data:image/png;base64,{encoded_file}" resp = litellm.completion( - model="anthropic/claude-3-opus-20240229", + model=model, messages=[ { "role": "user", @@ -910,9 +921,7 @@ def test_completion_claude_3_base64(): {"type": "text", "text": "Whats in this image?"}, { "type": "image_url", - "image_url": { - "url": "data:image/jpeg;base64," + base64_image - }, + "image_url": {"url": base64_image}, }, ], } @@ -921,7 +930,6 @@ def test_completion_claude_3_base64(): print(f"\nResponse: {resp}") prompt_tokens = resp.usage.prompt_tokens - raise Exception("it worked!") except Exception as e: if "500 Internal error encountered.'" in str(e): pass @@ -2176,15 +2184,16 @@ def test_completion_openai(): @pytest.mark.parametrize( - "model", + "model, api_version", [ - "gpt-4o-2024-08-06", - "azure/chatgpt-v-2", - "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", + ("gpt-4o-2024-08-06", None), + ("azure/chatgpt-v-2", None), + ("bedrock/anthropic.claude-3-sonnet-20240229-v1:0", None), + ("azure/gpt-4o", "2024-08-01-preview"), ], ) @pytest.mark.flaky(retries=3, delay=1) -def test_completion_openai_pydantic(model): +def test_completion_openai_pydantic(model, api_version): try: litellm.set_verbose = True from pydantic import BaseModel @@ -2209,6 +2218,7 @@ def test_completion_openai_pydantic(model): messages=messages, metadata={"hi": "bye"}, response_format=EventsList, + api_version=api_version, ) break except litellm.JSONSchemaValidationError: @@ -3471,14 +3481,14 @@ def response_format_tests(response: litellm.ModelResponse): @pytest.mark.parametrize( "model", [ - # "bedrock/cohere.command-r-plus-v1:0", + "bedrock/mistral.mistral-large-2407-v1:0", + "bedrock/cohere.command-r-plus-v1:0", "anthropic.claude-3-sonnet-20240229-v1:0", - # "anthropic.claude-instant-v1", - # "bedrock/ai21.j2-mid", - # "mistral.mistral-7b-instruct-v0:2", + "anthropic.claude-instant-v1", + "mistral.mistral-7b-instruct-v0:2", # "bedrock/amazon.titan-tg1-large", - # "meta.llama3-8b-instruct-v1:0", - # "cohere.command-text-v14", + "meta.llama3-8b-instruct-v1:0", + "cohere.command-text-v14", ], ) @pytest.mark.parametrize("sync_mode", [True, False]) @@ -3493,6 +3503,7 @@ async def test_completion_bedrock_httpx_models(sync_mode, model): messages=[{"role": "user", "content": "Hey! how's it going?"}], temperature=0.2, max_tokens=200, + stop=["stop sequence"], ) assert isinstance(response, litellm.ModelResponse) @@ -3504,6 +3515,7 @@ async def test_completion_bedrock_httpx_models(sync_mode, model): messages=[{"role": "user", "content": "Hey! 
how's it going?"}], temperature=0.2, max_tokens=100, + stop=["stop sequence"], ) assert isinstance(response, litellm.ModelResponse) diff --git a/litellm/tests/test_completion_cost.py b/litellm/tests/test_completion_cost.py index 55a5abbdd7..ed9eebedb1 100644 --- a/litellm/tests/test_completion_cost.py +++ b/litellm/tests/test_completion_cost.py @@ -1219,3 +1219,13 @@ def test_completion_cost_anthropic_prompt_caching(): cost_2 = completion_cost(model=model, completion_response=response_2) assert cost_1 > cost_2 + + +def test_completion_cost_databricks(): + model, messages = "databricks/databricks-dbrx-instruct", [ + {"role": "user", "content": "What is 2+2?"} + ] + + resp = litellm.completion(model=model, messages=messages) # works fine + + cost = completion_cost(completion_response=resp) diff --git a/model_prices_and_context_window.json b/model_prices_and_context_window.json index 487e187a3c..912c968311 100644 --- a/model_prices_and_context_window.json +++ b/model_prices_and_context_window.json @@ -2512,16 +2512,16 @@ "max_audio_length_hours": 8.4, "max_audio_per_prompt": 1, "max_pdf_size_mb": 30, - "input_cost_per_token": 0.00000035, - "input_cost_per_token_above_128k_tokens": 0.0000007, - "output_cost_per_token": 0.00000105, - "output_cost_per_token_above_128k_tokens": 0.0000021, + "input_cost_per_token": 0.000000075, + "input_cost_per_token_above_128k_tokens": 0.00000015, + "output_cost_per_token": 0.0000003, + "output_cost_per_token_above_128k_tokens": 0.0000006, "litellm_provider": "gemini", "mode": "chat", "supports_system_messages": true, "supports_function_calling": true, "supports_vision": true, - "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models" + "source": "https://ai.google.dev/pricing" }, "gemini/gemini-1.5-flash-latest": { "max_tokens": 8192, @@ -2533,16 +2533,16 @@ "max_audio_length_hours": 8.4, "max_audio_per_prompt": 1, "max_pdf_size_mb": 30, - "input_cost_per_token": 0.00000035, - "input_cost_per_token_above_128k_tokens": 0.0000007, - "output_cost_per_token": 0.00000105, - "output_cost_per_token_above_128k_tokens": 0.0000021, + "input_cost_per_token": 0.000000075, + "input_cost_per_token_above_128k_tokens": 0.00000015, + "output_cost_per_token": 0.0000003, + "output_cost_per_token_above_128k_tokens": 0.0000006, "litellm_provider": "gemini", "mode": "chat", "supports_system_messages": true, "supports_function_calling": true, "supports_vision": true, - "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models" + "source": "https://ai.google.dev/pricing" }, "gemini/gemini-pro": { "max_tokens": 8192,
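
Note on the litellm/main.py hunk above: the Azure api_version now falls back to litellm.AZURE_DEFAULT_API_VERSION ("2024-08-01-preview") when neither the call argument, the module-level setting, nor the AZURE_API_VERSION secret is set. A minimal sketch of that resolution order, assuming unset values arrive as None; resolve_azure_api_version and the plain env-var lookup below are illustrative stand-ins, not litellm APIs:

    import os
    from typing import Optional

    # Mirrors litellm.AZURE_DEFAULT_API_VERSION after this patch.
    AZURE_DEFAULT_API_VERSION = "2024-08-01-preview"

    def resolve_azure_api_version(
        api_version: Optional[str] = None,
        litellm_api_version: Optional[str] = None,
    ) -> str:
        # First non-empty value wins: call argument, module-level setting,
        # environment/secret store, then the library default added here.
        return (
            api_version
            or litellm_api_version
            or os.environ.get("AZURE_API_VERSION")
            or AZURE_DEFAULT_API_VERSION
        )

    # With nothing configured, the patched default is returned, which is what
    # avoids the "missing api version" error tracked in issue #5584.
    print(resolve_azure_api_version())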
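Note on the new litellm/llms/databricks/cost_calculator.py: it remaps "dbrx-instruct*" names (with or without the "databricks/" prefix) to the "databricks-dbrx-instruct" entry before the model-info lookup, which is the fix for issue #5597. A hypothetical direct call, assuming a litellm build that includes this patch and that the model entry is present in litellm's pricing map:

    from litellm.llms.databricks.cost_calculator import cost_per_token
    from litellm.types.utils import Usage

    usage = Usage(prompt_tokens=1000, completion_tokens=200, total_tokens=1200)

    # "dbrx-instruct" is remapped to "databricks-dbrx-instruct" internally,
    # so the lookup no longer fails after the Databricks model rename.
    prompt_usd, completion_usd = cost_per_token(model="dbrx-instruct", usage=usage)
    print(prompt_usd, completion_usd)

In normal use this helper is not called directly; completion_cost(completion_response=...) routes to it when custom_llm_provider is "databricks", as exercised by test_completion_cost_databricks in this patch.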