[Feat] Add max_completion_tokens param (#5691)

* add max_completion_tokens

* add max_completion_tokens

* add max_completion_tokens support for OpenAI models

* add max_completion_tokens param

* add max_completion_tokens for bedrock converse models

* add test for converse maxTokens

* fix openai o1 param mapping test

* move test optional params

* add max_completion_tokens for anthropic api

* fix conftest

* add max_completion tokens for vertex ai partner models

* add max_completion_tokens for fireworks ai

* add max_completion_tokens for hf rest api

* add test for param mapping

* add param mapping for vertex, gemini + testing

* predibase is the most unstable and unusable llm api in prod, can't handle our ci/cd

* add max_completion_tokens to openai supported params

* fix fireworks ai param mapping
Ishaan Jaff 2024-09-14 14:57:01 -07:00 committed by GitHub
parent 415a3ede9e
commit 85acdb9193
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
31 changed files with 591 additions and 35 deletions
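
Below, as context for the diffs, is a minimal usage sketch (not part of the commit itself) of what the new parameter looks like from the caller's side; the model names are purely illustrative. max_completion_tokens is accepted by litellm.completion / litellm.acompletion, forwarded unchanged to OpenAI-style APIs, and translated to each provider's native limit (for example maxTokens for Bedrock Converse, max_output_tokens for Vertex AI / Gemini, num_predict for Ollama), as the per-provider changes below show.

    import litellm

    # OpenAI-style models: max_completion_tokens is forwarded to the API unchanged.
    response = litellm.completion(
        model="o1-preview",
        messages=[{"role": "user", "content": "Hello!"}],
        max_completion_tokens=10,
    )

    # Other providers: the same argument is mapped to the provider's native limit,
    # e.g. maxTokens for Bedrock Converse models.
    response = litellm.completion(
        model="bedrock/mistral.mistral-large-2407-v1:0",
        messages=[{"role": "user", "content": "Hello!"}],
        max_completion_tokens=10,
    )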


@@ -230,6 +230,34 @@ jobs:
       # Store test results
       - store_test_results:
           path: test-results
+  llm_translation_testing:
+    docker:
+      - image: cimg/python:3.11
+    working_directory: ~/project
+    steps:
+      - checkout
+      - run:
+          name: Install Dependencies
+          command: |
+            python -m pip install --upgrade pip
+            python -m pip install -r requirements.txt
+            pip install "pytest==7.3.1"
+            pip install "pytest-retry==1.6.3"
+            pip install "pytest-asyncio==0.21.1"
+            pip install "respx==0.21.1"
+      # Run pytest and generate JUnit XML report
+      - run:
+          name: Run tests
+          command: |
+            pwd
+            ls
+            python -m pytest -vv tests/llm_translation -x -s -v --junitxml=test-results/junit.xml --durations=5
+          no_output_timeout: 120m
+      # Store test results
+      - store_test_results:
+          path: test-results
   installing_litellm_on_python:
     docker:
@@ -370,7 +398,7 @@ jobs:
           command: |
             pwd
             ls
-            python -m pytest -s -vv tests/ -x --junitxml=test-results/junit.xml --durations=5 --ignore=tests/otel_tests --ignore=tests/pass_through_tests --ignore=tests/proxy_admin_ui_tests --ignore=tests/load_tests
+            python -m pytest -s -vv tests/ -x --junitxml=test-results/junit.xml --durations=5 --ignore=tests/otel_tests --ignore=tests/pass_through_tests --ignore=tests/proxy_admin_ui_tests --ignore=tests/load_tests --ignore=tests/llm_translation
           no_output_timeout: 120m
       # Store test results
@@ -694,6 +722,12 @@ workflows:
             only:
               - main
               - /litellm_.*/
+      - llm_translation_testing:
+          filters:
+            branches:
+              only:
+                - main
+                - /litellm_.*/
       - installing_litellm_on_python:
           filters:
             branches:
@@ -711,6 +745,7 @@ workflows:
             - local_testing
             - build_and_test
             - load_testing
+            - llm_translation_testing
             - litellm_router_testing
             - litellm_assistants_api_testing
             - ui_endpoint_testing


@@ -75,6 +75,7 @@ class AI21ChatConfig:
             "tools",
             "response_format",
             "max_tokens",
+            "max_completion_tokens",
             "temperature",
             "top_p",
             "stop",
@@ -90,6 +91,8 @@ class AI21ChatConfig:
     ) -> dict:
         supported_openai_params = self.get_supported_openai_params(model=model)
         for param, value in non_default_params.items():
-            if param in supported_openai_params:
+            if param == "max_completion_tokens":
+                optional_params["max_tokens"] = value
+            elif param in supported_openai_params:
                 optional_params[param] = value
         return optional_params


@@ -156,6 +156,7 @@ class AzureOpenAIConfig:
             "stream",
             "stop",
             "max_tokens",
+            "max_completion_tokens",
             "tools",
             "tool_choice",
             "presence_penalty",
@@ -268,6 +269,9 @@ class AzureOpenAIConfig:
                     optional_params["json_mode"] = True
                 else:
                     optional_params["response_format"] = value
+            elif param == "max_completion_tokens":
+                # TODO - Azure OpenAI will probably add support for this, we should pass it through when Azure adds support
+                optional_params["max_tokens"] = value
             elif param in supported_openai_params:
                 optional_params[param] = value


@@ -92,6 +92,7 @@ class OpenAIGPTConfig:
             "logprobs",
             "top_logprobs",
             "max_tokens",
+            "max_completion_tokens",
             "n",
             "presence_penalty",
             "seed",


@@ -190,6 +190,7 @@ class DeepInfraConfig:
             "functions",
             "logit_bias",
             "max_tokens",
+            "max_completion_tokens",
             "n",
             "presence_penalty",
             "stop",
@@ -229,7 +230,9 @@ class DeepInfraConfig:
                     ),
                     status_code=400,
                 )
-            if param in supported_openai_params:
+            elif param == "max_completion_tokens":
+                optional_params["max_tokens"] = value
+            elif param in supported_openai_params:
                 if value is not None:
                     optional_params[param] = value
         return optional_params
@@ -347,7 +350,9 @@ class OpenAIConfig:
     - `logit_bias` (map): This optional parameter modifies the likelihood of specified tokens appearing in the completion.
 
-    - `max_tokens` (integer or null): This optional parameter helps to set the maximum number of tokens to generate in the chat completion.
+    - `max_tokens` (integer or null): This optional parameter helps to set the maximum number of tokens to generate in the chat completion. OpenAI has now deprecated in favor of max_completion_tokens, and is not compatible with o1 series models.
+
+    - `max_completion_tokens` (integer or null): An upper bound for the number of tokens that can be generated for a completion, including visible output tokens and reasoning tokens.
 
     - `n` (integer or null): This optional parameter helps to set how many chat completion choices to generate for each input message.
@@ -364,6 +369,7 @@ class OpenAIConfig:
     function_call: Optional[Union[str, dict]] = None
     functions: Optional[list] = None
     logit_bias: Optional[dict] = None
+    max_completion_tokens: Optional[int] = None
     max_tokens: Optional[int] = None
     n: Optional[int] = None
     presence_penalty: Optional[int] = None
@@ -378,6 +384,7 @@ class OpenAIConfig:
         function_call: Optional[Union[str, dict]] = None,
         functions: Optional[list] = None,
         logit_bias: Optional[dict] = None,
+        max_completion_tokens: Optional[int] = None,
         max_tokens: Optional[int] = None,
         n: Optional[int] = None,
         presence_penalty: Optional[int] = None,


@@ -158,6 +158,7 @@ class AnthropicConfig:
             "temperature",
             "top_p",
             "max_tokens",
+            "max_completion_tokens",
             "tools",
             "tool_choice",
             "extra_headers",
@@ -173,6 +174,8 @@ class AnthropicConfig:
         for param, value in non_default_params.items():
             if param == "max_tokens":
                 optional_params["max_tokens"] = value
+            if param == "max_completion_tokens":
+                optional_params["max_tokens"] = value
             if param == "tools":
                 optional_params["tools"] = value
             if param == "tool_choice":


@@ -94,16 +94,16 @@ class AzureOpenAIConfig(OpenAIConfig):
         top_p: Optional[int] = None,
     ) -> None:
         super().__init__(
-            frequency_penalty,
-            function_call,
-            functions,
-            logit_bias,
-            max_tokens,
-            n,
-            presence_penalty,
-            stop,
-            temperature,
-            top_p,
+            frequency_penalty=frequency_penalty,
+            function_call=function_call,
+            functions=functions,
+            logit_bias=logit_bias,
+            max_tokens=max_tokens,
+            n=n,
+            presence_penalty=presence_penalty,
+            stop=stop,
+            temperature=temperature,
+            top_p=top_p,
         )


@@ -161,6 +161,7 @@ class AmazonCohereChatConfig:
     def get_supported_openai_params(self) -> List[str]:
         return [
             "max_tokens",
+            "max_completion_tokens",
             "stream",
             "stop",
             "temperature",
@@ -177,7 +178,7 @@ class AmazonCohereChatConfig:
         self, non_default_params: dict, optional_params: dict
     ) -> dict:
         for param, value in non_default_params.items():
-            if param == "max_tokens":
+            if param == "max_tokens" or param == "max_completion_tokens":
                 optional_params["max_tokens"] = value
             if param == "stream":
                 optional_params["stream"] = value
@@ -1156,6 +1157,7 @@ class AmazonConverseConfig:
     def get_supported_openai_params(self, model: str) -> List[str]:
         supported_params = [
             "max_tokens",
+            "max_completion_tokens",
             "stream",
             "stream_options",
             "stop",
@@ -1263,7 +1265,7 @@ class AmazonConverseConfig:
                     ),
                     status_code=400,
                 )
-            if param == "max_tokens":
+            if param == "max_tokens" or param == "max_completion_tokens":
                 optional_params["maxTokens"] = value
             if param == "stream":
                 optional_params["stream"] = value


@@ -5,7 +5,7 @@ Common utilities used across bedrock chat/embedding/image generation
 import os
 import types
 from enum import Enum
-from typing import List, Optional, Union, Tuple
+from typing import List, Optional, Tuple, Union
 
 import httpx
@@ -158,6 +158,7 @@ class AmazonAnthropicClaude3Config:
     def get_supported_openai_params(self):
         return [
             "max_tokens",
+            "max_completion_tokens",
             "tools",
             "tool_choice",
             "stream",
@@ -169,7 +170,7 @@ class AmazonAnthropicClaude3Config:
     def map_openai_params(self, non_default_params: dict, optional_params: dict):
         for param, value in non_default_params.items():
-            if param == "max_tokens":
+            if param == "max_tokens" or param == "max_completion_tokens":
                 optional_params["max_tokens"] = value
             if param == "tools":
                 optional_params["tools"] = value
@@ -240,11 +241,18 @@ class AmazonAnthropicConfig:
     def get_supported_openai_params(
         self,
     ):
-        return ["max_tokens", "temperature", "stop", "top_p", "stream"]
+        return [
+            "max_tokens",
+            "max_completion_tokens",
+            "temperature",
+            "stop",
+            "top_p",
+            "stream",
+        ]
 
     def map_openai_params(self, non_default_params: dict, optional_params: dict):
         for param, value in non_default_params.items():
-            if param == "max_tokens":
+            if param == "max_tokens" or param == "max_completion_tokens":
                 optional_params["max_tokens_to_sample"] = value
             if param == "temperature":
                 optional_params["temperature"] = value


@@ -70,6 +70,7 @@ class CerebrasConfig:
         return [
             "max_tokens",
+            "max_completion_tokens",
             "response_format",
             "seed",
             "stop",
@@ -86,6 +87,8 @@ class CerebrasConfig:
     ) -> dict:
         supported_openai_params = self.get_supported_openai_params(model=model)
         for param, value in non_default_params.items():
-            if param in supported_openai_params:
+            if param == "max_completion_tokens":
+                optional_params["max_tokens"] = value
+            elif param in supported_openai_params:
                 optional_params[param] = value
         return optional_params


@@ -106,11 +106,19 @@ class DatabricksConfig:
         ]
 
     def get_supported_openai_params(self):
-        return ["stream", "stop", "temperature", "top_p", "max_tokens", "n"]
+        return [
+            "stream",
+            "stop",
+            "temperature",
+            "top_p",
+            "max_tokens",
+            "max_completion_tokens",
+            "n",
+        ]
 
     def map_openai_params(self, non_default_params: dict, optional_params: dict):
         for param, value in non_default_params.items():
-            if param == "max_tokens":
+            if param == "max_tokens" or param == "max_completion_tokens":
                 optional_params["max_tokens"] = value
             if param == "n":
                 optional_params["n"] = value


@@ -73,6 +73,7 @@ class FireworksAIConfig:
             "stream",
             "tools",
             "tool_choice",
+            "max_completion_tokens",
             "max_tokens",
             "temperature",
             "top_p",
@@ -102,6 +103,8 @@ class FireworksAIConfig:
                 else:
                     # pass through the value of tool choice
                     optional_params["tool_choice"] = value
+            elif param == "max_completion_tokens":
+                optional_params["max_tokens"] = value
             elif param in supported_openai_params:
                 if value is not None:
                     optional_params[param] = value


@@ -139,6 +139,7 @@ class HuggingfaceConfig:
             "stream",
             "temperature",
             "max_tokens",
+            "max_completion_tokens",
             "top_p",
             "stop",
             "n",
@@ -167,7 +168,7 @@ class HuggingfaceConfig:
                 optional_params["stream"] = value
             if param == "stop":
                 optional_params["stop"] = value
-            if param == "max_tokens":
+            if param == "max_tokens" or param == "max_completion_tokens":
                 # HF TGI raises the following exception when max_new_tokens==0
                 # Failed: Error occurred: HuggingfaceException - Input validation error: `max_new_tokens` must be strictly positive
                 if value == 0:


@@ -78,6 +78,7 @@ class NvidiaNimConfig:
                 "temperature",
                 "top_p",
                 "max_tokens",
+                "max_completion_tokens",
             ]
         elif model == "nvidia/nemotron-4-340b-reward":
             return [
@@ -92,6 +93,7 @@ class NvidiaNimConfig:
                 "frequency_penalty",
                 "presence_penalty",
                 "max_tokens",
+                "max_completion_tokens",
                 "stop",
             ]
         else:
@@ -124,6 +126,7 @@ class NvidiaNimConfig:
                 "frequency_penalty",
                 "presence_penalty",
                 "max_tokens",
+                "max_completion_tokens",
                 "stop",
                 "seed",
             ]
@@ -133,6 +136,8 @@ class NvidiaNimConfig:
     ) -> dict:
         supported_openai_params = self.get_supported_openai_params(model=model)
         for param, value in non_default_params.items():
-            if param in supported_openai_params:
+            if param == "max_completion_tokens":
+                optional_params["max_tokens"] = value
+            elif param in supported_openai_params:
                 optional_params[param] = value
         return optional_params


@@ -140,6 +140,7 @@ class OllamaChatConfig:
     ):
         return [
             "max_tokens",
+            "max_completion_tokens",
             "stream",
             "top_p",
             "temperature",
@@ -156,7 +157,7 @@ class OllamaChatConfig:
         self, model: str, non_default_params: dict, optional_params: dict
     ):
         for param, value in non_default_params.items():
-            if param == "max_tokens":
+            if param == "max_tokens" or param == "max_completion_tokens":
                 optional_params["num_predict"] = value
             if param == "stream":
                 optional_params["stream"] = value


@@ -154,6 +154,7 @@ class PredibaseConfig:
         return [
             "stream",
             "temperature",
+            "max_completion_tokens",
             "max_tokens",
             "top_p",
             "stop",
@@ -181,7 +182,7 @@ class PredibaseConfig:
                 optional_params["stream"] = value
             if param == "stop":
                 optional_params["stop"] = value
-            if param == "max_tokens":
+            if param == "max_tokens" or param == "max_completion_tokens":
                 # HF TGI raises the following exception when max_new_tokens==0
                 # Failed: Error occurred: HuggingfaceException - Input validation error: `max_new_tokens` must be strictly positive
                 if value == 0:


@@ -141,6 +141,7 @@ class MistralTextCompletionConfig:
             "temperature",
             "top_p",
             "max_tokens",
+            "max_completion_tokens",
             "stream",
             "seed",
             "stop",
@@ -154,7 +155,7 @@ class MistralTextCompletionConfig:
                 optional_params["temperature"] = value
             if param == "top_p":
                 optional_params["top_p"] = value
-            if param == "max_tokens":
+            if param == "max_tokens" or param == "max_completion_tokens":
                 optional_params["max_tokens"] = value
             if param == "stream" and value == True:
                 optional_params["stream"] = value


@@ -158,6 +158,7 @@ class VertexAIConfig:
             "temperature",
             "top_p",
             "max_tokens",
+            "max_completion_tokens",
             "stream",
             "tools",
             "tool_choice",
@@ -184,7 +185,7 @@ class VertexAIConfig:
                     optional_params["stop_sequences"] = [value]
                 elif isinstance(value, list):
                     optional_params["stop_sequences"] = value
-            if param == "max_tokens":
+            if param == "max_tokens" or param == "max_completion_tokens":
                 optional_params["max_output_tokens"] = value
             if param == "response_format" and value["type"] == "json_object":
                 optional_params["response_mime_type"] = "application/json"
@@ -319,6 +320,7 @@ class GoogleAIStudioGeminiConfig: # key diff from VertexAI - 'frequency_penalty
             "temperature",
             "top_p",
             "max_tokens",
+            "max_completion_tokens",
             "stream",
             "tools",
             "tool_choice",
@@ -413,7 +415,7 @@ class GoogleAIStudioGeminiConfig: # key diff from VertexAI - 'frequency_penalty
                     optional_params["stop_sequences"] = [value]
                 elif isinstance(value, list):
                     optional_params["stop_sequences"] = value
-            if param == "max_tokens":
+            if param == "max_tokens" or param == "max_completion_tokens":
                 optional_params["max_output_tokens"] = value
             if param == "response_format":  # type: ignore
                 if value["type"] == "json_object":  # type: ignore
@@ -554,6 +556,7 @@ class VertexGeminiConfig:
             "temperature",
             "top_p",
             "max_tokens",
+            "max_completion_tokens",
             "stream",
             "tools",
             "functions",
@@ -653,7 +656,7 @@ class VertexGeminiConfig:
                     optional_params["stop_sequences"] = [value]
                 elif isinstance(value, list):
                     optional_params["stop_sequences"] = value
-            if param == "max_tokens":
+            if param == "max_tokens" or param == "max_completion_tokens":
                 optional_params["max_output_tokens"] = value
             if param == "response_format" and isinstance(value, dict):  # type: ignore
                 if value["type"] == "json_object":


@@ -114,6 +114,7 @@ class VertexAIAnthropicConfig:
     def get_supported_openai_params(self):
         return [
             "max_tokens",
+            "max_completion_tokens",
             "tools",
             "tool_choice",
             "stream",
@@ -125,7 +126,7 @@ class VertexAIAnthropicConfig:
     def map_openai_params(self, non_default_params: dict, optional_params: dict):
         for param, value in non_default_params.items():
-            if param == "max_tokens":
+            if param == "max_tokens" or param == "max_completion_tokens":
                 optional_params["max_tokens"] = value
             if param == "tools":
                 optional_params["tools"] = value


@@ -46,6 +46,10 @@ class VertexAIAi21Config:
     def map_openai_params(
         self, non_default_params: dict, optional_params: dict, model: str
     ):
+        if "max_completion_tokens" in non_default_params:
+            non_default_params["max_tokens"] = non_default_params.pop(
+                "max_completion_tokens"
+            )
         return litellm.OpenAIConfig().map_openai_params(
             non_default_params=non_default_params,
             optional_params=optional_params,


@@ -52,6 +52,10 @@ class VertexAILlama3Config:
     def map_openai_params(
         self, non_default_params: dict, optional_params: dict, model: str
     ):
+        if "max_completion_tokens" in non_default_params:
+            non_default_params["max_tokens"] = non_default_params.pop(
+                "max_completion_tokens"
+            )
         return litellm.OpenAIConfig().map_openai_params(
             non_default_params=non_default_params,
             optional_params=optional_params,


@@ -60,6 +60,7 @@ class VolcEngineConfig:
             "logit_bias",
             "logprobs",
             "top_logprobs",
+            "max_completion_tokens",
             "max_tokens",
             "n",
             "presence_penalty",
@@ -82,6 +83,8 @@ class VolcEngineConfig:
     ) -> dict:
         supported_openai_params = self.get_supported_openai_params(model)
         for param, value in non_default_params.items():
-            if param in supported_openai_params:
+            if param == "max_completion_tokens":
+                optional_params["max_tokens"] = value
+            elif param in supported_openai_params:
                 optional_params[param] = value
         return optional_params


@@ -264,6 +264,7 @@ async def acompletion(
     stream_options: Optional[dict] = None,
     stop=None,
     max_tokens: Optional[int] = None,
+    max_completion_tokens: Optional[int] = None,
     presence_penalty: Optional[float] = None,
     frequency_penalty: Optional[float] = None,
     logit_bias: Optional[dict] = None,
@@ -303,6 +304,7 @@ async def acompletion(
         stream_options (dict, optional): A dictionary containing options for the streaming response. Only use this if stream is True.
         stop(string/list, optional): - Up to 4 sequences where the LLM API will stop generating further tokens.
         max_tokens (integer, optional): The maximum number of tokens in the generated completion (default is infinity).
+        max_completion_tokens (integer, optional): An upper bound for the number of tokens that can be generated for a completion, including visible output tokens and reasoning tokens.
         presence_penalty (float, optional): It is used to penalize new tokens based on their existence in the text so far.
         frequency_penalty: It is used to penalize new tokens based on their frequency in the text so far.
         logit_bias (dict, optional): Used to modify the probability of specific tokens appearing in the completion.
@@ -341,6 +343,7 @@ async def acompletion(
         "stream_options": stream_options,
         "stop": stop,
         "max_tokens": max_tokens,
+        "max_completion_tokens": max_completion_tokens,
         "presence_penalty": presence_penalty,
         "frequency_penalty": frequency_penalty,
         "logit_bias": logit_bias,
@@ -633,6 +636,7 @@ def completion(
     stream: Optional[bool] = None,
     stream_options: Optional[dict] = None,
     stop=None,
+    max_completion_tokens: Optional[int] = None,
     max_tokens: Optional[int] = None,
     presence_penalty: Optional[float] = None,
     frequency_penalty: Optional[float] = None,
@@ -675,6 +679,7 @@ def completion(
         stream_options (dict, optional): A dictionary containing options for the streaming response. Only set this when you set stream: true.
         stop(string/list, optional): - Up to 4 sequences where the LLM API will stop generating further tokens.
         max_tokens (integer, optional): The maximum number of tokens in the generated completion (default is infinity).
+        max_completion_tokens (integer, optional): An upper bound for the number of tokens that can be generated for a completion, including visible output tokens and reasoning tokens.
         presence_penalty (float, optional): It is used to penalize new tokens based on their existence in the text so far.
         frequency_penalty: It is used to penalize new tokens based on their frequency in the text so far.
         logit_bias (dict, optional): Used to modify the probability of specific tokens appearing in the completion.
@@ -759,6 +764,7 @@ def completion(
         "stream",
         "stream_options",
         "stop",
+        "max_completion_tokens",
         "max_tokens",
         "presence_penalty",
         "frequency_penalty",
@@ -917,6 +923,7 @@ def completion(
         stream_options=stream_options,
         stop=stop,
         max_tokens=max_tokens,
+        max_completion_tokens=max_completion_tokens,
         presence_penalty=presence_penalty,
         frequency_penalty=frequency_penalty,
         logit_bias=logit_bias,


@@ -6,7 +6,7 @@
         "input_cost_per_token": 0.0000,
         "output_cost_per_token": 0.000,
         "litellm_provider": "one of https://docs.litellm.ai/docs/providers",
-        "mode": "one of chat, embedding, completion, image_generation, audio_transcription",
+        "mode": "one of chat, embedding, completion, image_generation, audio_transcription, audio_speech",
         "supports_function_calling": true,
         "supports_parallel_function_calling": true,
         "supports_vision": true


@@ -1317,11 +1317,12 @@ import openai
 def test_completion_gpt4_turbo():
+    litellm.set_verbose = True
     try:
         response = completion(
             model="gpt-4-1106-preview",
             messages=messages,
-            max_tokens=10,
+            max_completion_tokens=10,
         )
         print(response)
     except openai.RateLimitError:


@@ -2765,6 +2765,7 @@ def get_optional_params(
     stream_options=None,
     stop=None,
     max_tokens=None,
+    max_completion_tokens=None,
     presence_penalty=None,
     frequency_penalty=None,
     logit_bias=None,
@@ -2842,6 +2843,7 @@ def get_optional_params(
         "stream_options": None,
         "stop": None,
         "max_tokens": None,
+        "max_completion_tokens": None,
         "presence_penalty": None,
         "frequency_penalty": None,
         "logit_bias": None,


@@ -0,0 +1,54 @@
+# conftest.py
+
+import importlib
+import os
+import sys
+
+import pytest
+
+sys.path.insert(
+    0, os.path.abspath("../..")
+)  # Adds the parent directory to the system path
+import litellm
+
+
+@pytest.fixture(scope="function", autouse=True)
+def setup_and_teardown():
+    """
+    This fixture reloads litellm before every function. To speed up testing by removing callbacks being chained.
+    """
+    curr_dir = os.getcwd()  # Get the current working directory
+    sys.path.insert(
+        0, os.path.abspath("../..")
+    )  # Adds the project directory to the system path
+
+    import litellm
+    from litellm import Router
+
+    importlib.reload(litellm)
+    import asyncio
+
+    loop = asyncio.get_event_loop_policy().new_event_loop()
+    asyncio.set_event_loop(loop)
+    print(litellm)
+    # from litellm import Router, completion, aembedding, acompletion, embedding
+    yield
+
+    # Teardown code (executes after the yield point)
+    loop.close()  # Close the loop created earlier
+    asyncio.set_event_loop(None)  # Remove the reference to the loop
+
+
+def pytest_collection_modifyitems(config, items):
+    # Separate tests in 'test_amazing_proxy_custom_logger.py' and other tests
+    custom_logger_tests = [
+        item for item in items if "custom_logger" in item.parent.name
+    ]
+    other_tests = [item for item in items if "custom_logger" not in item.parent.name]
+
+    # Sort tests based on their names
+    custom_logger_tests.sort(key=lambda x: x.name)
+    other_tests.sort(key=lambda x: x.name)
+
+    # Reorder the items list
+    items[:] = custom_logger_tests + other_tests


@@ -0,0 +1,342 @@
+import json
+import os
+import sys
+
+sys.path.insert(
+    0, os.path.abspath("../..")
+)  # Adds the parent directory to the system path
+
+from datetime import datetime
+from unittest.mock import AsyncMock
+
+from dotenv import load_dotenv
+
+load_dotenv()
+import httpx
+import pytest
+from respx import MockRouter
+
+import litellm
+from litellm import Choices, Message, ModelResponse
+
+# Adds the parent directory to the system path
+
+
+def return_mocked_response(model: str):
+    if model == "bedrock/mistral.mistral-large-2407-v1:0":
+        return {
+            "metrics": {"latencyMs": 316},
+            "output": {
+                "message": {
+                    "content": [{"text": "Hello! How are you doing today? How can"}],
+                    "role": "assistant",
+                }
+            },
+            "stopReason": "max_tokens",
+            "usage": {"inputTokens": 5, "outputTokens": 10, "totalTokens": 15},
+        }
+
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        "bedrock/mistral.mistral-large-2407-v1:0",
+    ],
+)
+@pytest.mark.respx
+@pytest.mark.asyncio()
+async def test_bedrock_max_completion_tokens(model: str, respx_mock: MockRouter):
+    """
+    Tests that:
+    - max_completion_tokens is passed as max_tokens to bedrock models
+    """
+    litellm.set_verbose = True
+    mock_response = return_mocked_response(model)
+    _model = model.split("/")[1]
+    print("\n\nmock_response: ", mock_response)
+    url = f"https://bedrock-runtime.us-west-2.amazonaws.com/model/{_model}/converse"
+    mock_request = respx_mock.post(url).mock(
+        return_value=httpx.Response(200, json=mock_response)
+    )
+
+    response = await litellm.acompletion(
+        model=model,
+        max_completion_tokens=10,
+        messages=[{"role": "user", "content": "Hello!"}],
+    )
+
+    assert mock_request.called
+    request_body = json.loads(mock_request.calls[0].request.content)
+
+    print("request_body: ", request_body)
+
+    assert request_body == {
+        "messages": [{"role": "user", "content": [{"text": "Hello!"}]}],
+        "additionalModelRequestFields": {},
+        "system": [],
+        "inferenceConfig": {"maxTokens": 10},
+    }
+    print(f"response: {response}")
+    assert isinstance(response, ModelResponse)
+
+
+@pytest.mark.parametrize(
+    "model",
+    ["anthropic/claude-3-sonnet-20240229", "anthropic/claude-3-opus-20240229,"],
+)
+@pytest.mark.respx
+@pytest.mark.asyncio()
+async def test_anthropic_api_max_completion_tokens(model: str, respx_mock: MockRouter):
+    """
+    Tests that:
+    - max_completion_tokens is passed as max_tokens to anthropic models
+    """
+    litellm.set_verbose = True
+    mock_response = {
+        "content": [{"text": "Hi! My name is Claude.", "type": "text"}],
+        "id": "msg_013Zva2CMHLNnXjNJJKqJ2EF",
+        "model": "claude-3-5-sonnet-20240620",
+        "role": "assistant",
+        "stop_reason": "end_turn",
+        "stop_sequence": None,
+        "type": "message",
+        "usage": {"input_tokens": 2095, "output_tokens": 503},
+    }
+
+    print("\n\nmock_response: ", mock_response)
+    url = f"https://api.anthropic.com/v1/messages"
+    mock_request = respx_mock.post(url).mock(
+        return_value=httpx.Response(200, json=mock_response)
+    )
+
+    response = await litellm.acompletion(
+        model=model,
+        max_completion_tokens=10,
+        messages=[{"role": "user", "content": "Hello!"}],
+    )
+
+    assert mock_request.called
+    request_body = json.loads(mock_request.calls[0].request.content)
+
+    print("request_body: ", request_body)
+
+    assert request_body == {
+        "messages": [{"role": "user", "content": [{"type": "text", "text": "Hello!"}]}],
+        "max_tokens": 10,
+        "model": model.split("/")[-1],
+    }
+    print(f"response: {response}")
+    assert isinstance(response, ModelResponse)
+
+
+def test_all_model_configs():
+    from litellm.llms.vertex_ai_and_google_ai_studio.vertex_ai_partner_models.ai21.transformation import (
+        VertexAIAi21Config,
+    )
+    from litellm.llms.vertex_ai_and_google_ai_studio.vertex_ai_partner_models.llama3.transformation import (
+        VertexAILlama3Config,
+    )
+
+    assert (
+        "max_completion_tokens" in VertexAILlama3Config().get_supported_openai_params()
+    )
+    assert VertexAILlama3Config().map_openai_params(
+        {"max_completion_tokens": 10}, {}, "llama3"
+    ) == {"max_tokens": 10}
+
+    assert "max_completion_tokens" in VertexAIAi21Config().get_supported_openai_params()
+    assert VertexAIAi21Config().map_openai_params(
+        {"max_completion_tokens": 10}, {}, "llama3"
+    ) == {"max_tokens": 10}
+
+    from litellm.llms.fireworks_ai import FireworksAIConfig
+
+    assert "max_completion_tokens" in FireworksAIConfig().get_supported_openai_params()
+    assert FireworksAIConfig().map_openai_params(
+        {"max_completion_tokens": 10}, {}, "llama3"
+    ) == {"max_tokens": 10}
+
+    from litellm.llms.huggingface_restapi import HuggingfaceConfig
+
+    assert "max_completion_tokens" in HuggingfaceConfig().get_supported_openai_params()
+    assert HuggingfaceConfig().map_openai_params({"max_completion_tokens": 10}, {}) == {
+        "max_new_tokens": 10
+    }
+
+    from litellm.llms.nvidia_nim import NvidiaNimConfig
+
+    assert "max_completion_tokens" in NvidiaNimConfig().get_supported_openai_params(
+        model="llama3"
+    )
+    assert NvidiaNimConfig().map_openai_params(
+        model="llama3",
+        non_default_params={"max_completion_tokens": 10},
+        optional_params={},
+    ) == {"max_tokens": 10}
+
+    from litellm.llms.ollama_chat import OllamaChatConfig
+
+    assert "max_completion_tokens" in OllamaChatConfig().get_supported_openai_params()
+    assert OllamaChatConfig().map_openai_params(
+        model="llama3",
+        non_default_params={"max_completion_tokens": 10},
+        optional_params={},
+    ) == {"num_predict": 10}
+
+    from litellm.llms.predibase import PredibaseConfig
+
+    assert "max_completion_tokens" in PredibaseConfig().get_supported_openai_params()
+    assert PredibaseConfig().map_openai_params(
+        {"max_completion_tokens": 10},
+        {},
+    ) == {"max_new_tokens": 10}
+
+    from litellm.llms.text_completion_codestral import MistralTextCompletionConfig
+
+    assert (
+        "max_completion_tokens"
+        in MistralTextCompletionConfig().get_supported_openai_params()
+    )
+    assert MistralTextCompletionConfig().map_openai_params(
+        {"max_completion_tokens": 10},
+        {},
+    ) == {"max_tokens": 10}
+
+    from litellm.llms.volcengine import VolcEngineConfig
+
+    assert "max_completion_tokens" in VolcEngineConfig().get_supported_openai_params(
+        model="llama3"
+    )
+    assert VolcEngineConfig().map_openai_params(
+        model="llama3",
+        non_default_params={"max_completion_tokens": 10},
+        optional_params={},
+    ) == {"max_tokens": 10}
+
+    from litellm.llms.AI21.chat import AI21ChatConfig
+
+    assert "max_completion_tokens" in AI21ChatConfig().get_supported_openai_params(
+        "jamba-1.5-mini@001"
+    )
+    assert AI21ChatConfig().map_openai_params(
+        model="jamba-1.5-mini@001",
+        non_default_params={"max_completion_tokens": 10},
+        optional_params={},
+    ) == {"max_tokens": 10}
+
+    from litellm.llms.AzureOpenAI.azure import AzureOpenAIConfig
+
+    assert "max_completion_tokens" in AzureOpenAIConfig().get_supported_openai_params()
+    assert AzureOpenAIConfig().map_openai_params(
+        model="gpt-3.5-turbo",
+        non_default_params={"max_completion_tokens": 10},
+        optional_params={},
+        api_version="2022-12-01",
+        drop_params=False,
+    ) == {"max_tokens": 10}
+
+    from litellm.llms.bedrock.chat import AmazonConverseConfig
+
+    assert (
+        "max_completion_tokens"
+        in AmazonConverseConfig().get_supported_openai_params(
+            model="anthropic.claude-3-sonnet-20240229-v1:0"
+        )
+    )
+    assert AmazonConverseConfig().map_openai_params(
+        model="anthropic.claude-3-sonnet-20240229-v1:0",
+        non_default_params={"max_completion_tokens": 10},
+        optional_params={},
+        drop_params=False,
+    ) == {"maxTokens": 10}
+
+    from litellm.llms.text_completion_codestral import MistralTextCompletionConfig
+
+    assert (
+        "max_completion_tokens"
+        in MistralTextCompletionConfig().get_supported_openai_params()
+    )
+    assert MistralTextCompletionConfig().map_openai_params(
+        non_default_params={"max_completion_tokens": 10},
+        optional_params={},
+    ) == {"max_tokens": 10}
+
+    from litellm.llms.bedrock.common_utils import (
+        AmazonAnthropicClaude3Config,
+        AmazonAnthropicConfig,
+    )
+
+    assert (
+        "max_completion_tokens"
+        in AmazonAnthropicClaude3Config().get_supported_openai_params()
+    )
+    assert AmazonAnthropicClaude3Config().map_openai_params(
+        non_default_params={"max_completion_tokens": 10},
+        optional_params={},
+    ) == {"max_tokens": 10}
+
+    assert (
+        "max_completion_tokens" in AmazonAnthropicConfig().get_supported_openai_params()
+    )
+    assert AmazonAnthropicConfig().map_openai_params(
+        non_default_params={"max_completion_tokens": 10},
+        optional_params={},
+    ) == {"max_tokens_to_sample": 10}
+
+    from litellm.llms.databricks.chat import DatabricksConfig
+
+    assert "max_completion_tokens" in DatabricksConfig().get_supported_openai_params()
+    assert DatabricksConfig().map_openai_params(
+        non_default_params={"max_completion_tokens": 10},
+        optional_params={},
+    ) == {"max_tokens": 10}
+
+    from litellm.llms.vertex_ai_and_google_ai_studio.vertex_ai_anthropic import (
+        VertexAIAnthropicConfig,
+    )
+
+    assert (
+        "max_completion_tokens"
+        in VertexAIAnthropicConfig().get_supported_openai_params()
+    )
+    assert VertexAIAnthropicConfig().map_openai_params(
+        non_default_params={"max_completion_tokens": 10},
+        optional_params={},
+    ) == {"max_tokens": 10}
+
+    from litellm.llms.vertex_ai_and_google_ai_studio.gemini.vertex_and_google_ai_studio_gemini import (
+        VertexAIConfig,
+        GoogleAIStudioGeminiConfig,
+        VertexGeminiConfig,
+    )
+
+    assert "max_completion_tokens" in VertexAIConfig().get_supported_openai_params()
+    assert VertexAIConfig().map_openai_params(
+        non_default_params={"max_completion_tokens": 10},
+        optional_params={},
+    ) == {"max_output_tokens": 10}
+
+    assert (
+        "max_completion_tokens"
+        in GoogleAIStudioGeminiConfig().get_supported_openai_params()
+    )
+    assert GoogleAIStudioGeminiConfig().map_openai_params(
+        model="gemini-1.0-pro",
+        non_default_params={"max_completion_tokens": 10},
+        optional_params={},
+    ) == {"max_output_tokens": 10}
+
+    assert "max_completion_tokens" in VertexGeminiConfig().get_supported_openai_params()
+    assert VertexGeminiConfig().map_openai_params(
+        model="gemini-1.0-pro",
+        non_default_params={"max_completion_tokens": 10},
+        optional_params={},
+        drop_params=False,
+    ) == {"max_output_tokens": 10}


@@ -1,7 +1,14 @@
 import json
+import os
+import sys
 from datetime import datetime
 from unittest.mock import AsyncMock
 
+sys.path.insert(
+    0, os.path.abspath("../..")
+)  # Adds the parent directory to the system path
+
 import httpx
 import pytest
 from respx import MockRouter
@@ -50,3 +57,45 @@ async def test_o1_handle_system_role(respx_mock: MockRouter):
     print(f"response: {response}")
     assert isinstance(response, ModelResponse)
+
+
+@pytest.mark.asyncio
+@pytest.mark.respx
+@pytest.mark.parametrize("model", ["gpt-4", "gpt-4-0314", "gpt-4-32k", "o1-preview"])
+async def test_o1_max_completion_tokens(respx_mock: MockRouter, model: str):
+    """
+    Tests that:
+    - max_completion_tokens is passed directly to OpenAI chat completion models
+    """
+    litellm.set_verbose = True
+    mock_response = ModelResponse(
+        id="cmpl-mock",
+        choices=[Choices(message=Message(content="Mocked response", role="assistant"))],
+        created=int(datetime.now().timestamp()),
+        model=model,
+    )
+
+    mock_request = respx_mock.post("https://api.openai.com/v1/chat/completions").mock(
+        return_value=httpx.Response(200, json=mock_response.dict())
+    )
+
+    response = await litellm.acompletion(
+        model=model,
+        max_completion_tokens=10,
+        messages=[{"role": "user", "content": "Hello!"}],
+    )
+
+    assert mock_request.called
+    request_body = json.loads(mock_request.calls[0].request.content)
+
+    print("request_body: ", request_body)
+
+    assert request_body == {
+        "model": model,
+        "max_completion_tokens": 10,
+        "messages": [{"role": "user", "content": "Hello!"}],
+    }
+
+    print(f"response: {response}")
+    assert isinstance(response, ModelResponse)