diff --git a/.circleci/config.yml b/.circleci/config.yml index e301bd96e..acd3a8058 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -230,6 +230,34 @@ jobs: # Store test results - store_test_results: path: test-results + llm_translation_testing: + docker: + - image: cimg/python:3.11 + working_directory: ~/project + + steps: + - checkout + - run: + name: Install Dependencies + command: | + python -m pip install --upgrade pip + python -m pip install -r requirements.txt + pip install "pytest==7.3.1" + pip install "pytest-retry==1.6.3" + pip install "pytest-asyncio==0.21.1" + pip install "respx==0.21.1" + # Run pytest and generate JUnit XML report + - run: + name: Run tests + command: | + pwd + ls + python -m pytest -vv tests/llm_translation -x -s -v --junitxml=test-results/junit.xml --durations=5 + no_output_timeout: 120m + + # Store test results + - store_test_results: + path: test-results installing_litellm_on_python: docker: @@ -370,7 +398,7 @@ jobs: command: | pwd ls - python -m pytest -s -vv tests/ -x --junitxml=test-results/junit.xml --durations=5 --ignore=tests/otel_tests --ignore=tests/pass_through_tests --ignore=tests/proxy_admin_ui_tests --ignore=tests/load_tests + python -m pytest -s -vv tests/ -x --junitxml=test-results/junit.xml --durations=5 --ignore=tests/otel_tests --ignore=tests/pass_through_tests --ignore=tests/proxy_admin_ui_tests --ignore=tests/load_tests --ignore=tests/llm_translation no_output_timeout: 120m # Store test results @@ -694,6 +722,12 @@ workflows: only: - main - /litellm_.*/ + - llm_translation_testing: + filters: + branches: + only: + - main + - /litellm_.*/ - installing_litellm_on_python: filters: branches: @@ -711,6 +745,7 @@ workflows: - local_testing - build_and_test - load_testing + - llm_translation_testing - litellm_router_testing - litellm_assistants_api_testing - ui_endpoint_testing diff --git a/litellm/llms/AI21/chat.py b/litellm/llms/AI21/chat.py index 4eabaaa87..7a60b1904 100644 --- a/litellm/llms/AI21/chat.py +++ b/litellm/llms/AI21/chat.py @@ -75,6 +75,7 @@ class AI21ChatConfig: "tools", "response_format", "max_tokens", + "max_completion_tokens", "temperature", "top_p", "stop", @@ -90,6 +91,8 @@ class AI21ChatConfig: ) -> dict: supported_openai_params = self.get_supported_openai_params(model=model) for param, value in non_default_params.items(): - if param in supported_openai_params: + if param == "max_completion_tokens": + optional_params["max_tokens"] = value + elif param in supported_openai_params: optional_params[param] = value return optional_params diff --git a/litellm/llms/AzureOpenAI/azure.py b/litellm/llms/AzureOpenAI/azure.py index 8da9ee063..b1a9e1549 100644 --- a/litellm/llms/AzureOpenAI/azure.py +++ b/litellm/llms/AzureOpenAI/azure.py @@ -156,6 +156,7 @@ class AzureOpenAIConfig: "stream", "stop", "max_tokens", + "max_completion_tokens", "tools", "tool_choice", "presence_penalty", @@ -268,6 +269,9 @@ class AzureOpenAIConfig: optional_params["json_mode"] = True else: optional_params["response_format"] = value + elif param == "max_completion_tokens": + # TODO - Azure OpenAI will probably add support for this, we should pass it through when Azure adds support + optional_params["max_tokens"] = value elif param in supported_openai_params: optional_params[param] = value diff --git a/litellm/llms/OpenAI/gpt_transformation.py b/litellm/llms/OpenAI/gpt_transformation.py index be14031bd..4ff4790c9 100644 --- a/litellm/llms/OpenAI/gpt_transformation.py +++ b/litellm/llms/OpenAI/gpt_transformation.py @@ -92,6 +92,7 @@ class 
OpenAIGPTConfig: "logprobs", "top_logprobs", "max_tokens", + "max_completion_tokens", "n", "presence_penalty", "seed", diff --git a/litellm/llms/OpenAI/openai.py b/litellm/llms/OpenAI/openai.py index 8504d5fe2..b37af10bf 100644 --- a/litellm/llms/OpenAI/openai.py +++ b/litellm/llms/OpenAI/openai.py @@ -190,6 +190,7 @@ class DeepInfraConfig: "functions", "logit_bias", "max_tokens", + "max_completion_tokens", "n", "presence_penalty", "stop", @@ -229,7 +230,9 @@ class DeepInfraConfig: ), status_code=400, ) - if param in supported_openai_params: + elif param == "max_completion_tokens": + optional_params["max_tokens"] = value + elif param in supported_openai_params: if value is not None: optional_params[param] = value return optional_params @@ -347,7 +350,9 @@ class OpenAIConfig: - `logit_bias` (map): This optional parameter modifies the likelihood of specified tokens appearing in the completion. - - `max_tokens` (integer or null): This optional parameter helps to set the maximum number of tokens to generate in the chat completion. + - `max_tokens` (integer or null): This optional parameter helps to set the maximum number of tokens to generate in the chat completion. OpenAI has now deprecated in favor of max_completion_tokens, and is not compatible with o1 series models. + + - `max_completion_tokens` (integer or null): An upper bound for the number of tokens that can be generated for a completion, including visible output tokens and reasoning tokens. - `n` (integer or null): This optional parameter helps to set how many chat completion choices to generate for each input message. @@ -364,6 +369,7 @@ class OpenAIConfig: function_call: Optional[Union[str, dict]] = None functions: Optional[list] = None logit_bias: Optional[dict] = None + max_completion_tokens: Optional[int] = None max_tokens: Optional[int] = None n: Optional[int] = None presence_penalty: Optional[int] = None @@ -378,6 +384,7 @@ class OpenAIConfig: function_call: Optional[Union[str, dict]] = None, functions: Optional[list] = None, logit_bias: Optional[dict] = None, + max_completion_tokens: Optional[int] = None, max_tokens: Optional[int] = None, n: Optional[int] = None, presence_penalty: Optional[int] = None, diff --git a/litellm/llms/anthropic/chat.py b/litellm/llms/anthropic/chat.py index 0520506bb..cf4f23905 100644 --- a/litellm/llms/anthropic/chat.py +++ b/litellm/llms/anthropic/chat.py @@ -158,6 +158,7 @@ class AnthropicConfig: "temperature", "top_p", "max_tokens", + "max_completion_tokens", "tools", "tool_choice", "extra_headers", @@ -173,6 +174,8 @@ class AnthropicConfig: for param, value in non_default_params.items(): if param == "max_tokens": optional_params["max_tokens"] = value + if param == "max_completion_tokens": + optional_params["max_tokens"] = value if param == "tools": optional_params["tools"] = value if param == "tool_choice": diff --git a/litellm/llms/azure_text.py b/litellm/llms/azure_text.py index db8c516b2..6defd58ff 100644 --- a/litellm/llms/azure_text.py +++ b/litellm/llms/azure_text.py @@ -94,16 +94,16 @@ class AzureOpenAIConfig(OpenAIConfig): top_p: Optional[int] = None, ) -> None: super().__init__( - frequency_penalty, - function_call, - functions, - logit_bias, - max_tokens, - n, - presence_penalty, - stop, - temperature, - top_p, + frequency_penalty=frequency_penalty, + function_call=function_call, + functions=functions, + logit_bias=logit_bias, + max_tokens=max_tokens, + n=n, + presence_penalty=presence_penalty, + stop=stop, + temperature=temperature, + top_p=top_p, ) diff --git 
a/litellm/llms/bedrock/chat.py b/litellm/llms/bedrock/chat.py index 8d6d98ba6..35f0c794a 100644 --- a/litellm/llms/bedrock/chat.py +++ b/litellm/llms/bedrock/chat.py @@ -161,6 +161,7 @@ class AmazonCohereChatConfig: def get_supported_openai_params(self) -> List[str]: return [ "max_tokens", + "max_completion_tokens", "stream", "stop", "temperature", @@ -177,7 +178,7 @@ class AmazonCohereChatConfig: self, non_default_params: dict, optional_params: dict ) -> dict: for param, value in non_default_params.items(): - if param == "max_tokens": + if param == "max_tokens" or param == "max_completion_tokens": optional_params["max_tokens"] = value if param == "stream": optional_params["stream"] = value @@ -1156,6 +1157,7 @@ class AmazonConverseConfig: def get_supported_openai_params(self, model: str) -> List[str]: supported_params = [ "max_tokens", + "max_completion_tokens", "stream", "stream_options", "stop", @@ -1263,7 +1265,7 @@ class AmazonConverseConfig: ), status_code=400, ) - if param == "max_tokens": + if param == "max_tokens" or param == "max_completion_tokens": optional_params["maxTokens"] = value if param == "stream": optional_params["stream"] = value diff --git a/litellm/llms/bedrock/common_utils.py b/litellm/llms/bedrock/common_utils.py index 25379474e..86fced96a 100644 --- a/litellm/llms/bedrock/common_utils.py +++ b/litellm/llms/bedrock/common_utils.py @@ -5,7 +5,7 @@ Common utilities used across bedrock chat/embedding/image generation import os import types from enum import Enum -from typing import List, Optional, Union, Tuple +from typing import List, Optional, Tuple, Union import httpx @@ -158,6 +158,7 @@ class AmazonAnthropicClaude3Config: def get_supported_openai_params(self): return [ "max_tokens", + "max_completion_tokens", "tools", "tool_choice", "stream", @@ -169,7 +170,7 @@ class AmazonAnthropicClaude3Config: def map_openai_params(self, non_default_params: dict, optional_params: dict): for param, value in non_default_params.items(): - if param == "max_tokens": + if param == "max_tokens" or param == "max_completion_tokens": optional_params["max_tokens"] = value if param == "tools": optional_params["tools"] = value @@ -240,11 +241,18 @@ class AmazonAnthropicConfig: def get_supported_openai_params( self, ): - return ["max_tokens", "temperature", "stop", "top_p", "stream"] + return [ + "max_tokens", + "max_completion_tokens", + "temperature", + "stop", + "top_p", + "stream", + ] def map_openai_params(self, non_default_params: dict, optional_params: dict): for param, value in non_default_params.items(): - if param == "max_tokens": + if param == "max_tokens" or param == "max_completion_tokens": optional_params["max_tokens_to_sample"] = value if param == "temperature": optional_params["temperature"] = value diff --git a/litellm/llms/cerebras/chat.py b/litellm/llms/cerebras/chat.py index 13b8f0ee9..0b885a599 100644 --- a/litellm/llms/cerebras/chat.py +++ b/litellm/llms/cerebras/chat.py @@ -70,6 +70,7 @@ class CerebrasConfig: return [ "max_tokens", + "max_completion_tokens", "response_format", "seed", "stop", @@ -86,6 +87,8 @@ class CerebrasConfig: ) -> dict: supported_openai_params = self.get_supported_openai_params(model=model) for param, value in non_default_params.items(): - if param in supported_openai_params: + if param == "max_completion_tokens": + optional_params["max_tokens"] = value + elif param in supported_openai_params: optional_params[param] = value return optional_params diff --git a/litellm/llms/databricks/chat.py b/litellm/llms/databricks/chat.py index 
0421cd9e4..739abb91f 100644 --- a/litellm/llms/databricks/chat.py +++ b/litellm/llms/databricks/chat.py @@ -106,11 +106,19 @@ class DatabricksConfig: ] def get_supported_openai_params(self): - return ["stream", "stop", "temperature", "top_p", "max_tokens", "n"] + return [ + "stream", + "stop", + "temperature", + "top_p", + "max_tokens", + "max_completion_tokens", + "n", + ] def map_openai_params(self, non_default_params: dict, optional_params: dict): for param, value in non_default_params.items(): - if param == "max_tokens": + if param == "max_tokens" or param == "max_completion_tokens": optional_params["max_tokens"] = value if param == "n": optional_params["n"] = value diff --git a/litellm/llms/fireworks_ai.py b/litellm/llms/fireworks_ai.py index e9caf887a..b6511689e 100644 --- a/litellm/llms/fireworks_ai.py +++ b/litellm/llms/fireworks_ai.py @@ -73,6 +73,7 @@ class FireworksAIConfig: "stream", "tools", "tool_choice", + "max_completion_tokens", "max_tokens", "temperature", "top_p", @@ -102,6 +103,8 @@ class FireworksAIConfig: else: # pass through the value of tool choice optional_params["tool_choice"] = value + elif param == "max_completion_tokens": + optional_params["max_tokens"] = value elif param in supported_openai_params: if value is not None: optional_params[param] = value diff --git a/litellm/llms/huggingface_restapi.py b/litellm/llms/huggingface_restapi.py index 69715167b..fe44ea0d3 100644 --- a/litellm/llms/huggingface_restapi.py +++ b/litellm/llms/huggingface_restapi.py @@ -139,6 +139,7 @@ class HuggingfaceConfig: "stream", "temperature", "max_tokens", + "max_completion_tokens", "top_p", "stop", "n", @@ -167,7 +168,7 @@ class HuggingfaceConfig: optional_params["stream"] = value if param == "stop": optional_params["stop"] = value - if param == "max_tokens": + if param == "max_tokens" or param == "max_completion_tokens": # HF TGI raises the following exception when max_new_tokens==0 # Failed: Error occurred: HuggingfaceException - Input validation error: `max_new_tokens` must be strictly positive if value == 0: diff --git a/litellm/llms/nvidia_nim.py b/litellm/llms/nvidia_nim.py index 6d2e4316b..99c88345e 100644 --- a/litellm/llms/nvidia_nim.py +++ b/litellm/llms/nvidia_nim.py @@ -78,6 +78,7 @@ class NvidiaNimConfig: "temperature", "top_p", "max_tokens", + "max_completion_tokens", ] elif model == "nvidia/nemotron-4-340b-reward": return [ @@ -92,6 +93,7 @@ class NvidiaNimConfig: "frequency_penalty", "presence_penalty", "max_tokens", + "max_completion_tokens", "stop", ] else: @@ -124,6 +126,7 @@ class NvidiaNimConfig: "frequency_penalty", "presence_penalty", "max_tokens", + "max_completion_tokens", "stop", "seed", ] @@ -133,6 +136,8 @@ class NvidiaNimConfig: ) -> dict: supported_openai_params = self.get_supported_openai_params(model=model) for param, value in non_default_params.items(): - if param in supported_openai_params: + if param == "max_completion_tokens": + optional_params["max_tokens"] = value + elif param in supported_openai_params: optional_params[param] = value return optional_params diff --git a/litellm/llms/ollama_chat.py b/litellm/llms/ollama_chat.py index 2fc44f9cd..0191ea082 100644 --- a/litellm/llms/ollama_chat.py +++ b/litellm/llms/ollama_chat.py @@ -140,6 +140,7 @@ class OllamaChatConfig: ): return [ "max_tokens", + "max_completion_tokens", "stream", "top_p", "temperature", @@ -156,7 +157,7 @@ class OllamaChatConfig: self, model: str, non_default_params: dict, optional_params: dict ): for param, value in non_default_params.items(): - if param == "max_tokens": + 
if param == "max_tokens" or param == "max_completion_tokens": optional_params["num_predict"] = value if param == "stream": optional_params["stream"] = value diff --git a/litellm/llms/predibase.py b/litellm/llms/predibase.py index 81e28934d..1ec5937c6 100644 --- a/litellm/llms/predibase.py +++ b/litellm/llms/predibase.py @@ -154,6 +154,7 @@ class PredibaseConfig: return [ "stream", "temperature", + "max_completion_tokens", "max_tokens", "top_p", "stop", @@ -181,7 +182,7 @@ class PredibaseConfig: optional_params["stream"] = value if param == "stop": optional_params["stop"] = value - if param == "max_tokens": + if param == "max_tokens" or param == "max_completion_tokens": # HF TGI raises the following exception when max_new_tokens==0 # Failed: Error occurred: HuggingfaceException - Input validation error: `max_new_tokens` must be strictly positive if value == 0: diff --git a/litellm/llms/text_completion_codestral.py b/litellm/llms/text_completion_codestral.py index 9bcd64631..feb3c6dac 100644 --- a/litellm/llms/text_completion_codestral.py +++ b/litellm/llms/text_completion_codestral.py @@ -141,6 +141,7 @@ class MistralTextCompletionConfig: "temperature", "top_p", "max_tokens", + "max_completion_tokens", "stream", "seed", "stop", @@ -154,7 +155,7 @@ class MistralTextCompletionConfig: optional_params["temperature"] = value if param == "top_p": optional_params["top_p"] = value - if param == "max_tokens": + if param == "max_tokens" or param == "max_completion_tokens": optional_params["max_tokens"] = value if param == "stream" and value == True: optional_params["stream"] = value diff --git a/litellm/llms/vertex_ai_and_google_ai_studio/gemini/vertex_and_google_ai_studio_gemini.py b/litellm/llms/vertex_ai_and_google_ai_studio/gemini/vertex_and_google_ai_studio_gemini.py index 712f51a98..858336bf0 100644 --- a/litellm/llms/vertex_ai_and_google_ai_studio/gemini/vertex_and_google_ai_studio_gemini.py +++ b/litellm/llms/vertex_ai_and_google_ai_studio/gemini/vertex_and_google_ai_studio_gemini.py @@ -158,6 +158,7 @@ class VertexAIConfig: "temperature", "top_p", "max_tokens", + "max_completion_tokens", "stream", "tools", "tool_choice", @@ -184,7 +185,7 @@ class VertexAIConfig: optional_params["stop_sequences"] = [value] elif isinstance(value, list): optional_params["stop_sequences"] = value - if param == "max_tokens": + if param == "max_tokens" or param == "max_completion_tokens": optional_params["max_output_tokens"] = value if param == "response_format" and value["type"] == "json_object": optional_params["response_mime_type"] = "application/json" @@ -319,6 +320,7 @@ class GoogleAIStudioGeminiConfig: # key diff from VertexAI - 'frequency_penalty "temperature", "top_p", "max_tokens", + "max_completion_tokens", "stream", "tools", "tool_choice", @@ -413,7 +415,7 @@ class GoogleAIStudioGeminiConfig: # key diff from VertexAI - 'frequency_penalty optional_params["stop_sequences"] = [value] elif isinstance(value, list): optional_params["stop_sequences"] = value - if param == "max_tokens": + if param == "max_tokens" or param == "max_completion_tokens": optional_params["max_output_tokens"] = value if param == "response_format": # type: ignore if value["type"] == "json_object": # type: ignore @@ -554,6 +556,7 @@ class VertexGeminiConfig: "temperature", "top_p", "max_tokens", + "max_completion_tokens", "stream", "tools", "functions", @@ -653,7 +656,7 @@ class VertexGeminiConfig: optional_params["stop_sequences"] = [value] elif isinstance(value, list): optional_params["stop_sequences"] = value - if param == 
"max_tokens": + if param == "max_tokens" or param == "max_completion_tokens": optional_params["max_output_tokens"] = value if param == "response_format" and isinstance(value, dict): # type: ignore if value["type"] == "json_object": diff --git a/litellm/llms/vertex_ai_and_google_ai_studio/vertex_ai_anthropic.py b/litellm/llms/vertex_ai_and_google_ai_studio/vertex_ai_anthropic.py index 13d8edeb9..ecb11e1c9 100644 --- a/litellm/llms/vertex_ai_and_google_ai_studio/vertex_ai_anthropic.py +++ b/litellm/llms/vertex_ai_and_google_ai_studio/vertex_ai_anthropic.py @@ -114,6 +114,7 @@ class VertexAIAnthropicConfig: def get_supported_openai_params(self): return [ "max_tokens", + "max_completion_tokens", "tools", "tool_choice", "stream", @@ -125,7 +126,7 @@ class VertexAIAnthropicConfig: def map_openai_params(self, non_default_params: dict, optional_params: dict): for param, value in non_default_params.items(): - if param == "max_tokens": + if param == "max_tokens" or param == "max_completion_tokens": optional_params["max_tokens"] = value if param == "tools": optional_params["tools"] = value diff --git a/litellm/llms/vertex_ai_and_google_ai_studio/vertex_ai_partner_models/ai21/transformation.py b/litellm/llms/vertex_ai_and_google_ai_studio/vertex_ai_partner_models/ai21/transformation.py index 83b8dfcfb..2d9d6076e 100644 --- a/litellm/llms/vertex_ai_and_google_ai_studio/vertex_ai_partner_models/ai21/transformation.py +++ b/litellm/llms/vertex_ai_and_google_ai_studio/vertex_ai_partner_models/ai21/transformation.py @@ -46,6 +46,10 @@ class VertexAIAi21Config: def map_openai_params( self, non_default_params: dict, optional_params: dict, model: str ): + if "max_completion_tokens" in non_default_params: + non_default_params["max_tokens"] = non_default_params.pop( + "max_completion_tokens" + ) return litellm.OpenAIConfig().map_openai_params( non_default_params=non_default_params, optional_params=optional_params, diff --git a/litellm/llms/vertex_ai_and_google_ai_studio/vertex_ai_partner_models/llama3/transformation.py b/litellm/llms/vertex_ai_and_google_ai_studio/vertex_ai_partner_models/llama3/transformation.py index 97223f1fd..683e0ff8e 100644 --- a/litellm/llms/vertex_ai_and_google_ai_studio/vertex_ai_partner_models/llama3/transformation.py +++ b/litellm/llms/vertex_ai_and_google_ai_studio/vertex_ai_partner_models/llama3/transformation.py @@ -52,6 +52,10 @@ class VertexAILlama3Config: def map_openai_params( self, non_default_params: dict, optional_params: dict, model: str ): + if "max_completion_tokens" in non_default_params: + non_default_params["max_tokens"] = non_default_params.pop( + "max_completion_tokens" + ) return litellm.OpenAIConfig().map_openai_params( non_default_params=non_default_params, optional_params=optional_params, diff --git a/litellm/llms/volcengine.py b/litellm/llms/volcengine.py index eb289d1c4..9b288c868 100644 --- a/litellm/llms/volcengine.py +++ b/litellm/llms/volcengine.py @@ -60,6 +60,7 @@ class VolcEngineConfig: "logit_bias", "logprobs", "top_logprobs", + "max_completion_tokens", "max_tokens", "n", "presence_penalty", @@ -82,6 +83,8 @@ class VolcEngineConfig: ) -> dict: supported_openai_params = self.get_supported_openai_params(model) for param, value in non_default_params.items(): - if param in supported_openai_params: + if param == "max_completion_tokens": + optional_params["max_tokens"] = value + elif param in supported_openai_params: optional_params[param] = value return optional_params diff --git a/litellm/main.py b/litellm/main.py index a50c908c6..80136e997 100644 --- 
a/litellm/main.py +++ b/litellm/main.py @@ -264,6 +264,7 @@ async def acompletion( stream_options: Optional[dict] = None, stop=None, max_tokens: Optional[int] = None, + max_completion_tokens: Optional[int] = None, presence_penalty: Optional[float] = None, frequency_penalty: Optional[float] = None, logit_bias: Optional[dict] = None, @@ -303,6 +304,7 @@ async def acompletion( stream_options (dict, optional): A dictionary containing options for the streaming response. Only use this if stream is True. stop(string/list, optional): - Up to 4 sequences where the LLM API will stop generating further tokens. max_tokens (integer, optional): The maximum number of tokens in the generated completion (default is infinity). + max_completion_tokens (integer, optional): An upper bound for the number of tokens that can be generated for a completion, including visible output tokens and reasoning tokens. presence_penalty (float, optional): It is used to penalize new tokens based on their existence in the text so far. frequency_penalty: It is used to penalize new tokens based on their frequency in the text so far. logit_bias (dict, optional): Used to modify the probability of specific tokens appearing in the completion. @@ -341,6 +343,7 @@ async def acompletion( "stream_options": stream_options, "stop": stop, "max_tokens": max_tokens, + "max_completion_tokens": max_completion_tokens, "presence_penalty": presence_penalty, "frequency_penalty": frequency_penalty, "logit_bias": logit_bias, @@ -633,6 +636,7 @@ def completion( stream: Optional[bool] = None, stream_options: Optional[dict] = None, stop=None, + max_completion_tokens: Optional[int] = None, max_tokens: Optional[int] = None, presence_penalty: Optional[float] = None, frequency_penalty: Optional[float] = None, @@ -675,6 +679,7 @@ def completion( stream_options (dict, optional): A dictionary containing options for the streaming response. Only set this when you set stream: true. stop(string/list, optional): - Up to 4 sequences where the LLM API will stop generating further tokens. max_tokens (integer, optional): The maximum number of tokens in the generated completion (default is infinity). + max_completion_tokens (integer, optional): An upper bound for the number of tokens that can be generated for a completion, including visible output tokens and reasoning tokens. presence_penalty (float, optional): It is used to penalize new tokens based on their existence in the text so far. frequency_penalty: It is used to penalize new tokens based on their frequency in the text so far. logit_bias (dict, optional): Used to modify the probability of specific tokens appearing in the completion. 
@@ -759,6 +764,7 @@ def completion( "stream", "stream_options", "stop", + "max_completion_tokens", "max_tokens", "presence_penalty", "frequency_penalty", @@ -917,6 +923,7 @@ def completion( stream_options=stream_options, stop=stop, max_tokens=max_tokens, + max_completion_tokens=max_completion_tokens, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, logit_bias=logit_bias, diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json index 1fb2997c0..d501d8572 100644 --- a/litellm/model_prices_and_context_window_backup.json +++ b/litellm/model_prices_and_context_window_backup.json @@ -6,7 +6,7 @@ "input_cost_per_token": 0.0000, "output_cost_per_token": 0.000, "litellm_provider": "one of https://docs.litellm.ai/docs/providers", - "mode": "one of chat, embedding, completion, image_generation, audio_transcription", + "mode": "one of chat, embedding, completion, image_generation, audio_transcription, audio_speech", "supports_function_calling": true, "supports_parallel_function_calling": true, "supports_vision": true diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index 6dd1bad5f..d6e424de5 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -1317,11 +1317,12 @@ import openai def test_completion_gpt4_turbo(): + litellm.set_verbose = True try: response = completion( model="gpt-4-1106-preview", messages=messages, - max_tokens=10, + max_completion_tokens=10, ) print(response) except openai.RateLimitError: diff --git a/litellm/utils.py b/litellm/utils.py index d3e757ae8..af0bbc98f 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -2765,6 +2765,7 @@ def get_optional_params( stream_options=None, stop=None, max_tokens=None, + max_completion_tokens=None, presence_penalty=None, frequency_penalty=None, logit_bias=None, @@ -2842,6 +2843,7 @@ def get_optional_params( "stream_options": None, "stop": None, "max_tokens": None, + "max_completion_tokens": None, "presence_penalty": None, "frequency_penalty": None, "logit_bias": None, diff --git a/tests/llm_translation/conftest.py b/tests/llm_translation/conftest.py new file mode 100644 index 000000000..eca0bc431 --- /dev/null +++ b/tests/llm_translation/conftest.py @@ -0,0 +1,54 @@ +# conftest.py + +import importlib +import os +import sys + +import pytest + +sys.path.insert( + 0, os.path.abspath("../..") +) # Adds the parent directory to the system path +import litellm + + +@pytest.fixture(scope="function", autouse=True) +def setup_and_teardown(): + """ + This fixture reloads litellm before every function. To speed up testing by removing callbacks being chained. 
+ """ + curr_dir = os.getcwd() # Get the current working directory + sys.path.insert( + 0, os.path.abspath("../..") + ) # Adds the project directory to the system path + + import litellm + from litellm import Router + + importlib.reload(litellm) + import asyncio + + loop = asyncio.get_event_loop_policy().new_event_loop() + asyncio.set_event_loop(loop) + print(litellm) + # from litellm import Router, completion, aembedding, acompletion, embedding + yield + + # Teardown code (executes after the yield point) + loop.close() # Close the loop created earlier + asyncio.set_event_loop(None) # Remove the reference to the loop + + +def pytest_collection_modifyitems(config, items): + # Separate tests in 'test_amazing_proxy_custom_logger.py' and other tests + custom_logger_tests = [ + item for item in items if "custom_logger" in item.parent.name + ] + other_tests = [item for item in items if "custom_logger" not in item.parent.name] + + # Sort tests based on their names + custom_logger_tests.sort(key=lambda x: x.name) + other_tests.sort(key=lambda x: x.name) + + # Reorder the items list + items[:] = custom_logger_tests + other_tests diff --git a/litellm/tests/test_fireworks_ai.py b/tests/llm_translation/test_fireworks_ai_translation.py similarity index 100% rename from litellm/tests/test_fireworks_ai.py rename to tests/llm_translation/test_fireworks_ai_translation.py diff --git a/tests/llm_translation/test_max_completion_tokens.py b/tests/llm_translation/test_max_completion_tokens.py new file mode 100644 index 000000000..d44a2db37 --- /dev/null +++ b/tests/llm_translation/test_max_completion_tokens.py @@ -0,0 +1,342 @@ +import json +import os +import sys + +sys.path.insert( + 0, os.path.abspath("../..") +) # Adds the parent directory to the system path +from datetime import datetime +from unittest.mock import AsyncMock +from dotenv import load_dotenv + +load_dotenv() +import httpx +import pytest +from respx import MockRouter + +import litellm +from litellm import Choices, Message, ModelResponse + +# Adds the parent directory to the system path + + +def return_mocked_response(model: str): + if model == "bedrock/mistral.mistral-large-2407-v1:0": + return { + "metrics": {"latencyMs": 316}, + "output": { + "message": { + "content": [{"text": "Hello! How are you doing today? 
How can"}], + "role": "assistant", + } + }, + "stopReason": "max_tokens", + "usage": {"inputTokens": 5, "outputTokens": 10, "totalTokens": 15}, + } + + +@pytest.mark.parametrize( + "model", + [ + "bedrock/mistral.mistral-large-2407-v1:0", + ], +) +@pytest.mark.respx +@pytest.mark.asyncio() +async def test_bedrock_max_completion_tokens(model: str, respx_mock: MockRouter): + """ + Tests that: + - max_completion_tokens is passed as max_tokens to bedrock models + """ + litellm.set_verbose = True + + mock_response = return_mocked_response(model) + _model = model.split("/")[1] + print("\n\nmock_response: ", mock_response) + url = f"https://bedrock-runtime.us-west-2.amazonaws.com/model/{_model}/converse" + mock_request = respx_mock.post(url).mock( + return_value=httpx.Response(200, json=mock_response) + ) + + response = await litellm.acompletion( + model=model, + max_completion_tokens=10, + messages=[{"role": "user", "content": "Hello!"}], + ) + + assert mock_request.called + request_body = json.loads(mock_request.calls[0].request.content) + + print("request_body: ", request_body) + + assert request_body == { + "messages": [{"role": "user", "content": [{"text": "Hello!"}]}], + "additionalModelRequestFields": {}, + "system": [], + "inferenceConfig": {"maxTokens": 10}, + } + print(f"response: {response}") + assert isinstance(response, ModelResponse) + + +@pytest.mark.parametrize( + "model", + ["anthropic/claude-3-sonnet-20240229", "anthropic/claude-3-opus-20240229,"], +) +@pytest.mark.respx +@pytest.mark.asyncio() +async def test_anthropic_api_max_completion_tokens(model: str, respx_mock: MockRouter): + """ + Tests that: + - max_completion_tokens is passed as max_tokens to anthropic models + """ + litellm.set_verbose = True + + mock_response = { + "content": [{"text": "Hi! 
My name is Claude.", "type": "text"}], + "id": "msg_013Zva2CMHLNnXjNJJKqJ2EF", + "model": "claude-3-5-sonnet-20240620", + "role": "assistant", + "stop_reason": "end_turn", + "stop_sequence": None, + "type": "message", + "usage": {"input_tokens": 2095, "output_tokens": 503}, + } + + print("\n\nmock_response: ", mock_response) + url = f"https://api.anthropic.com/v1/messages" + mock_request = respx_mock.post(url).mock( + return_value=httpx.Response(200, json=mock_response) + ) + + response = await litellm.acompletion( + model=model, + max_completion_tokens=10, + messages=[{"role": "user", "content": "Hello!"}], + ) + + assert mock_request.called + request_body = json.loads(mock_request.calls[0].request.content) + + print("request_body: ", request_body) + + assert request_body == { + "messages": [{"role": "user", "content": [{"type": "text", "text": "Hello!"}]}], + "max_tokens": 10, + "model": model.split("/")[-1], + } + print(f"response: {response}") + assert isinstance(response, ModelResponse) + + +def test_all_model_configs(): + from litellm.llms.vertex_ai_and_google_ai_studio.vertex_ai_partner_models.ai21.transformation import ( + VertexAIAi21Config, + ) + from litellm.llms.vertex_ai_and_google_ai_studio.vertex_ai_partner_models.llama3.transformation import ( + VertexAILlama3Config, + ) + + assert ( + "max_completion_tokens" in VertexAILlama3Config().get_supported_openai_params() + ) + assert VertexAILlama3Config().map_openai_params( + {"max_completion_tokens": 10}, {}, "llama3" + ) == {"max_tokens": 10} + + assert "max_completion_tokens" in VertexAIAi21Config().get_supported_openai_params() + assert VertexAIAi21Config().map_openai_params( + {"max_completion_tokens": 10}, {}, "llama3" + ) == {"max_tokens": 10} + + from litellm.llms.fireworks_ai import FireworksAIConfig + + assert "max_completion_tokens" in FireworksAIConfig().get_supported_openai_params() + assert FireworksAIConfig().map_openai_params( + {"max_completion_tokens": 10}, {}, "llama3" + ) == {"max_tokens": 10} + + from litellm.llms.huggingface_restapi import HuggingfaceConfig + + assert "max_completion_tokens" in HuggingfaceConfig().get_supported_openai_params() + assert HuggingfaceConfig().map_openai_params({"max_completion_tokens": 10}, {}) == { + "max_new_tokens": 10 + } + + from litellm.llms.nvidia_nim import NvidiaNimConfig + + assert "max_completion_tokens" in NvidiaNimConfig().get_supported_openai_params( + model="llama3" + ) + assert NvidiaNimConfig().map_openai_params( + model="llama3", + non_default_params={"max_completion_tokens": 10}, + optional_params={}, + ) == {"max_tokens": 10} + + from litellm.llms.ollama_chat import OllamaChatConfig + + assert "max_completion_tokens" in OllamaChatConfig().get_supported_openai_params() + assert OllamaChatConfig().map_openai_params( + model="llama3", + non_default_params={"max_completion_tokens": 10}, + optional_params={}, + ) == {"num_predict": 10} + + from litellm.llms.predibase import PredibaseConfig + + assert "max_completion_tokens" in PredibaseConfig().get_supported_openai_params() + assert PredibaseConfig().map_openai_params( + {"max_completion_tokens": 10}, + {}, + ) == {"max_new_tokens": 10} + + from litellm.llms.text_completion_codestral import MistralTextCompletionConfig + + assert ( + "max_completion_tokens" + in MistralTextCompletionConfig().get_supported_openai_params() + ) + assert MistralTextCompletionConfig().map_openai_params( + {"max_completion_tokens": 10}, + {}, + ) == {"max_tokens": 10} + + from litellm.llms.volcengine import VolcEngineConfig + + assert 
"max_completion_tokens" in VolcEngineConfig().get_supported_openai_params( + model="llama3" + ) + assert VolcEngineConfig().map_openai_params( + model="llama3", + non_default_params={"max_completion_tokens": 10}, + optional_params={}, + ) == {"max_tokens": 10} + + from litellm.llms.AI21.chat import AI21ChatConfig + + assert "max_completion_tokens" in AI21ChatConfig().get_supported_openai_params( + "jamba-1.5-mini@001" + ) + assert AI21ChatConfig().map_openai_params( + model="jamba-1.5-mini@001", + non_default_params={"max_completion_tokens": 10}, + optional_params={}, + ) == {"max_tokens": 10} + + from litellm.llms.AzureOpenAI.azure import AzureOpenAIConfig + + assert "max_completion_tokens" in AzureOpenAIConfig().get_supported_openai_params() + assert AzureOpenAIConfig().map_openai_params( + model="gpt-3.5-turbo", + non_default_params={"max_completion_tokens": 10}, + optional_params={}, + api_version="2022-12-01", + drop_params=False, + ) == {"max_tokens": 10} + + from litellm.llms.bedrock.chat import AmazonConverseConfig + + assert ( + "max_completion_tokens" + in AmazonConverseConfig().get_supported_openai_params( + model="anthropic.claude-3-sonnet-20240229-v1:0" + ) + ) + assert AmazonConverseConfig().map_openai_params( + model="anthropic.claude-3-sonnet-20240229-v1:0", + non_default_params={"max_completion_tokens": 10}, + optional_params={}, + drop_params=False, + ) == {"maxTokens": 10} + + from litellm.llms.text_completion_codestral import MistralTextCompletionConfig + + assert ( + "max_completion_tokens" + in MistralTextCompletionConfig().get_supported_openai_params() + ) + assert MistralTextCompletionConfig().map_openai_params( + non_default_params={"max_completion_tokens": 10}, + optional_params={}, + ) == {"max_tokens": 10} + + from litellm.llms.bedrock.common_utils import ( + AmazonAnthropicClaude3Config, + AmazonAnthropicConfig, + ) + + assert ( + "max_completion_tokens" + in AmazonAnthropicClaude3Config().get_supported_openai_params() + ) + + assert AmazonAnthropicClaude3Config().map_openai_params( + non_default_params={"max_completion_tokens": 10}, + optional_params={}, + ) == {"max_tokens": 10} + + assert ( + "max_completion_tokens" in AmazonAnthropicConfig().get_supported_openai_params() + ) + + assert AmazonAnthropicConfig().map_openai_params( + non_default_params={"max_completion_tokens": 10}, + optional_params={}, + ) == {"max_tokens_to_sample": 10} + + from litellm.llms.databricks.chat import DatabricksConfig + + assert "max_completion_tokens" in DatabricksConfig().get_supported_openai_params() + + assert DatabricksConfig().map_openai_params( + non_default_params={"max_completion_tokens": 10}, + optional_params={}, + ) == {"max_tokens": 10} + + from litellm.llms.vertex_ai_and_google_ai_studio.vertex_ai_anthropic import ( + VertexAIAnthropicConfig, + ) + + assert ( + "max_completion_tokens" + in VertexAIAnthropicConfig().get_supported_openai_params() + ) + + assert VertexAIAnthropicConfig().map_openai_params( + non_default_params={"max_completion_tokens": 10}, + optional_params={}, + ) == {"max_tokens": 10} + + from litellm.llms.vertex_ai_and_google_ai_studio.gemini.vertex_and_google_ai_studio_gemini import ( + VertexAIConfig, + GoogleAIStudioGeminiConfig, + VertexGeminiConfig, + ) + + assert "max_completion_tokens" in VertexAIConfig().get_supported_openai_params() + + assert VertexAIConfig().map_openai_params( + non_default_params={"max_completion_tokens": 10}, + optional_params={}, + ) == {"max_output_tokens": 10} + + assert ( + "max_completion_tokens" + in 
GoogleAIStudioGeminiConfig().get_supported_openai_params() + ) + + assert GoogleAIStudioGeminiConfig().map_openai_params( + model="gemini-1.0-pro", + non_default_params={"max_completion_tokens": 10}, + optional_params={}, + ) == {"max_output_tokens": 10} + + assert "max_completion_tokens" in VertexGeminiConfig().get_supported_openai_params() + + assert VertexGeminiConfig().map_openai_params( + model="gemini-1.0-pro", + non_default_params={"max_completion_tokens": 10}, + optional_params={}, + drop_params=False, + ) == {"max_output_tokens": 10} diff --git a/litellm/tests/test_openai_o1.py b/tests/llm_translation/test_openai_o1.py similarity index 50% rename from litellm/tests/test_openai_o1.py rename to tests/llm_translation/test_openai_o1.py index 39dadc96e..70fe346b1 100644 --- a/litellm/tests/test_openai_o1.py +++ b/tests/llm_translation/test_openai_o1.py @@ -1,7 +1,14 @@ import json +import os +import sys from datetime import datetime from unittest.mock import AsyncMock +sys.path.insert( + 0, os.path.abspath("../..") +) # Adds the parent directory to the system path + + import httpx import pytest from respx import MockRouter @@ -50,3 +57,45 @@ async def test_o1_handle_system_role(respx_mock: MockRouter): print(f"response: {response}") assert isinstance(response, ModelResponse) + + +@pytest.mark.asyncio +@pytest.mark.respx +@pytest.mark.parametrize("model", ["gpt-4", "gpt-4-0314", "gpt-4-32k", "o1-preview"]) +async def test_o1_max_completion_tokens(respx_mock: MockRouter, model: str): + """ + Tests that: + - max_completion_tokens is passed directly to OpenAI chat completion models + """ + litellm.set_verbose = True + + mock_response = ModelResponse( + id="cmpl-mock", + choices=[Choices(message=Message(content="Mocked response", role="assistant"))], + created=int(datetime.now().timestamp()), + model=model, + ) + + mock_request = respx_mock.post("https://api.openai.com/v1/chat/completions").mock( + return_value=httpx.Response(200, json=mock_response.dict()) + ) + + response = await litellm.acompletion( + model=model, + max_completion_tokens=10, + messages=[{"role": "user", "content": "Hello!"}], + ) + + assert mock_request.called + request_body = json.loads(mock_request.calls[0].request.content) + + print("request_body: ", request_body) + + assert request_body == { + "model": model, + "max_completion_tokens": 10, + "messages": [{"role": "user", "content": "Hello!"}], + } + + print(f"response: {response}") + assert isinstance(response, ModelResponse) diff --git a/litellm/tests/test_optional_params.py b/tests/llm_translation/test_optional_params.py similarity index 100% rename from litellm/tests/test_optional_params.py rename to tests/llm_translation/test_optional_params.py
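
Note (supplementary sketch, not part of the patch): every provider change above follows the same shape — max_completion_tokens is accepted as an OpenAI-compatible parameter and translated to the provider's native limit field (max_tokens, maxTokens, max_output_tokens, num_predict, or max_tokens_to_sample). The minimal Python sketch below shows both sides of that flow, mirroring the assertions in tests/llm_translation/test_max_completion_tokens.py; the Bedrock model name is taken from those tests and the call would need AWS credentials, which are assumed here.

import litellm
from litellm.llms.bedrock.chat import AmazonConverseConfig
from litellm.llms.databricks.chat import DatabricksConfig

# Caller side: pass max_completion_tokens like any other OpenAI parameter.
# Model name reused from the test suite; AWS credentials are assumed.
response = litellm.completion(
    model="bedrock/mistral.mistral-large-2407-v1:0",
    messages=[{"role": "user", "content": "Hello!"}],
    max_completion_tokens=10,
)

# Provider side: each config maps the parameter onto its own limit field,
# exactly as the new tests assert.
assert AmazonConverseConfig().map_openai_params(
    model="anthropic.claude-3-sonnet-20240229-v1:0",
    non_default_params={"max_completion_tokens": 10},
    optional_params={},
    drop_params=False,
) == {"maxTokens": 10}

assert DatabricksConfig().map_openai_params(
    non_default_params={"max_completion_tokens": 10},
    optional_params={},
) == {"max_tokens": 10}

Providers that later gain native support (e.g. Azure OpenAI, per the TODO in azure.py) only need to change their mapping branch; callers keep sending max_completion_tokens unchanged.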