forked from phoenix/litellm-mirror

[Feat] Add max_completion_tokens param (#5691)

* add max_completion_tokens
* add max_completion_tokens
* add max_completion_tokens support for OpenAI models
* add max_completion_tokens param
* add max_completion_tokens for bedrock converse models
* add test for converse maxTokens
* fix openai o1 param mapping test
* move test optional params
* add max_completion_tokens for anthropic api
* fix conftest
* add max_completion tokens for vertex ai partner models
* add max_completion_tokens for fireworks ai
* add max_completion_tokens for hf rest api
* add test for param mapping
* add param mapping for vertex, gemini + testing
* predibase is the most unstable and unusable llm api in prod, can't handle our ci/cd
* add max_completion_tokens to openai supported params
* fix fireworks ai param mapping

This commit is contained in:
parent 415a3ede9e
commit 85acdb9193

31 changed files with 591 additions and 35 deletions
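At the API surface, the change means max_completion_tokens can be passed to litellm like any other OpenAI-compatible parameter, and it is remapped to each provider's native limit where needed. A minimal usage sketch (the model name and credentials are illustrative, not part of this diff):

import litellm

# max_completion_tokens is forwarded as-is to OpenAI-style backends and remapped
# (e.g. to max_tokens / maxTokens / max_output_tokens) for other providers.
response = litellm.completion(
    model="gpt-4o",  # illustrative model name
    messages=[{"role": "user", "content": "Hello!"}],
    max_completion_tokens=10,
)
print(response.choices[0].message.content)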
@@ -230,6 +230,34 @@ jobs:
       # Store test results
       - store_test_results:
           path: test-results
+  llm_translation_testing:
+    docker:
+      - image: cimg/python:3.11
+    working_directory: ~/project
+
+    steps:
+      - checkout
+      - run:
+          name: Install Dependencies
+          command: |
+            python -m pip install --upgrade pip
+            python -m pip install -r requirements.txt
+            pip install "pytest==7.3.1"
+            pip install "pytest-retry==1.6.3"
+            pip install "pytest-asyncio==0.21.1"
+            pip install "respx==0.21.1"
+      # Run pytest and generate JUnit XML report
+      - run:
+          name: Run tests
+          command: |
+            pwd
+            ls
+            python -m pytest -vv tests/llm_translation -x -s -v --junitxml=test-results/junit.xml --durations=5
+          no_output_timeout: 120m
+
+      # Store test results
+      - store_test_results:
+          path: test-results
   installing_litellm_on_python:
     docker:
@@ -370,7 +398,7 @@ jobs:
           command: |
             pwd
             ls
-            python -m pytest -s -vv tests/ -x --junitxml=test-results/junit.xml --durations=5 --ignore=tests/otel_tests --ignore=tests/pass_through_tests --ignore=tests/proxy_admin_ui_tests --ignore=tests/load_tests
+            python -m pytest -s -vv tests/ -x --junitxml=test-results/junit.xml --durations=5 --ignore=tests/otel_tests --ignore=tests/pass_through_tests --ignore=tests/proxy_admin_ui_tests --ignore=tests/load_tests --ignore=tests/llm_translation
           no_output_timeout: 120m

       # Store test results
@@ -694,6 +722,12 @@ workflows:
              only:
                - main
                - /litellm_.*/
+      - llm_translation_testing:
+          filters:
+            branches:
+              only:
+                - main
+                - /litellm_.*/
       - installing_litellm_on_python:
           filters:
             branches:
@@ -711,6 +745,7 @@ workflows:
           - local_testing
           - build_and_test
          - load_testing
+          - llm_translation_testing
          - litellm_router_testing
          - litellm_assistants_api_testing
          - ui_endpoint_testing
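For reference, the new llm_translation_testing job can be reproduced locally with the same pytest flags (a sketch; it assumes the pinned test dependencies above are installed and the working directory is the repo root):

import pytest

# Mirrors the pytest invocation used by the new CI job above.
pytest.main(["-vv", "tests/llm_translation", "-x", "-s", "--durations=5"])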
@@ -75,6 +75,7 @@ class AI21ChatConfig:
             "tools",
             "response_format",
             "max_tokens",
+            "max_completion_tokens",
             "temperature",
             "top_p",
             "stop",
@@ -90,6 +91,8 @@ class AI21ChatConfig:
     ) -> dict:
         supported_openai_params = self.get_supported_openai_params(model=model)
         for param, value in non_default_params.items():
-            if param in supported_openai_params:
+            if param == "max_completion_tokens":
+                optional_params["max_tokens"] = value
+            elif param in supported_openai_params:
                 optional_params[param] = value
         return optional_params
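The AI21 mapping above is exercised by the test file added later in this commit; in isolation the call looks roughly like this (a sketch, same signature as in the new test):

from litellm.llms.AI21.chat import AI21ChatConfig

# max_completion_tokens is translated to AI21's max_tokens before the request is built.
mapped = AI21ChatConfig().map_openai_params(
    model="jamba-1.5-mini@001",
    non_default_params={"max_completion_tokens": 10},
    optional_params={},
)
assert mapped == {"max_tokens": 10}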
@@ -156,6 +156,7 @@ class AzureOpenAIConfig:
             "stream",
             "stop",
             "max_tokens",
+            "max_completion_tokens",
             "tools",
             "tool_choice",
             "presence_penalty",
@@ -268,6 +269,9 @@ class AzureOpenAIConfig:
                     optional_params["json_mode"] = True
                 else:
                     optional_params["response_format"] = value
+            elif param == "max_completion_tokens":
+                # TODO - Azure OpenAI will probably add support for this, we should pass it through when Azure adds support
+                optional_params["max_tokens"] = value
             elif param in supported_openai_params:
                 optional_params[param] = value
@@ -92,6 +92,7 @@ class OpenAIGPTConfig:
             "logprobs",
             "top_logprobs",
             "max_tokens",
+            "max_completion_tokens",
             "n",
             "presence_penalty",
             "seed",
@@ -190,6 +190,7 @@ class DeepInfraConfig:
             "functions",
             "logit_bias",
             "max_tokens",
+            "max_completion_tokens",
             "n",
             "presence_penalty",
             "stop",
@@ -229,7 +230,9 @@ class DeepInfraConfig:
                     ),
                     status_code=400,
                 )
-            if param in supported_openai_params:
+            elif param == "max_completion_tokens":
+                optional_params["max_tokens"] = value
+            elif param in supported_openai_params:
                 if value is not None:
                     optional_params[param] = value
         return optional_params
@@ -347,7 +350,9 @@ class OpenAIConfig:

     - `logit_bias` (map): This optional parameter modifies the likelihood of specified tokens appearing in the completion.

-    - `max_tokens` (integer or null): This optional parameter helps to set the maximum number of tokens to generate in the chat completion.
+    - `max_tokens` (integer or null): This optional parameter helps to set the maximum number of tokens to generate in the chat completion. OpenAI has now deprecated in favor of max_completion_tokens, and is not compatible with o1 series models.
+
+    - `max_completion_tokens` (integer or null): An upper bound for the number of tokens that can be generated for a completion, including visible output tokens and reasoning tokens.

     - `n` (integer or null): This optional parameter helps to set how many chat completion choices to generate for each input message.

@@ -364,6 +369,7 @@ class OpenAIConfig:
     function_call: Optional[Union[str, dict]] = None
     functions: Optional[list] = None
     logit_bias: Optional[dict] = None
+    max_completion_tokens: Optional[int] = None
     max_tokens: Optional[int] = None
     n: Optional[int] = None
     presence_penalty: Optional[int] = None
@@ -378,6 +384,7 @@ class OpenAIConfig:
         function_call: Optional[Union[str, dict]] = None,
         functions: Optional[list] = None,
         logit_bias: Optional[dict] = None,
+        max_completion_tokens: Optional[int] = None,
         max_tokens: Optional[int] = None,
         n: Optional[int] = None,
         presence_penalty: Optional[int] = None,
@@ -158,6 +158,7 @@ class AnthropicConfig:
             "temperature",
             "top_p",
             "max_tokens",
+            "max_completion_tokens",
             "tools",
             "tool_choice",
             "extra_headers",
@@ -173,6 +174,8 @@ class AnthropicConfig:
         for param, value in non_default_params.items():
             if param == "max_tokens":
                 optional_params["max_tokens"] = value
+            if param == "max_completion_tokens":
+                optional_params["max_tokens"] = value
             if param == "tools":
                 optional_params["tools"] = value
             if param == "tool_choice":
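Anthropic's Messages API only understands max_tokens, so the mapping above makes the OpenAI-style spelling work end to end. A sketch of the caller's view, matching the new test_anthropic_api_max_completion_tokens added later in this commit (the model name comes from that test):

import litellm

# The outgoing Anthropic request body will contain "max_tokens": 10 even though
# the caller used the OpenAI-style max_completion_tokens name.
response = litellm.completion(
    model="anthropic/claude-3-sonnet-20240229",
    messages=[{"role": "user", "content": "Hello!"}],
    max_completion_tokens=10,
)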
@@ -94,16 +94,16 @@ class AzureOpenAIConfig(OpenAIConfig):
         top_p: Optional[int] = None,
     ) -> None:
         super().__init__(
-            frequency_penalty,
-            function_call,
-            functions,
-            logit_bias,
-            max_tokens,
-            n,
-            presence_penalty,
-            stop,
-            temperature,
-            top_p,
+            frequency_penalty=frequency_penalty,
+            function_call=function_call,
+            functions=functions,
+            logit_bias=logit_bias,
+            max_tokens=max_tokens,
+            n=n,
+            presence_penalty=presence_penalty,
+            stop=stop,
+            temperature=temperature,
+            top_p=top_p,
         )
@@ -161,6 +161,7 @@ class AmazonCohereChatConfig:
     def get_supported_openai_params(self) -> List[str]:
         return [
             "max_tokens",
+            "max_completion_tokens",
             "stream",
             "stop",
             "temperature",
@@ -177,7 +178,7 @@ class AmazonCohereChatConfig:
         self, non_default_params: dict, optional_params: dict
     ) -> dict:
         for param, value in non_default_params.items():
-            if param == "max_tokens":
+            if param == "max_tokens" or param == "max_completion_tokens":
                 optional_params["max_tokens"] = value
             if param == "stream":
                 optional_params["stream"] = value
@@ -1156,6 +1157,7 @@ class AmazonConverseConfig:
     def get_supported_openai_params(self, model: str) -> List[str]:
         supported_params = [
             "max_tokens",
+            "max_completion_tokens",
             "stream",
             "stream_options",
             "stop",
@@ -1263,7 +1265,7 @@ class AmazonConverseConfig:
                     ),
                     status_code=400,
                 )
-            if param == "max_tokens":
+            if param == "max_tokens" or param == "max_completion_tokens":
                 optional_params["maxTokens"] = value
             if param == "stream":
                 optional_params["stream"] = value
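Bedrock's Converse API spells the limit maxTokens, and the AmazonConverseConfig change above now accepts both OpenAI spellings. This mirrors the assertion in the new test file added below (a sketch):

from litellm.llms.bedrock.chat import AmazonConverseConfig

mapped = AmazonConverseConfig().map_openai_params(
    model="anthropic.claude-3-sonnet-20240229-v1:0",
    non_default_params={"max_completion_tokens": 10},
    optional_params={},
    drop_params=False,
)
assert mapped == {"maxTokens": 10}  # Converse uses camelCase maxTokens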
@@ -5,7 +5,7 @@ Common utilities used across bedrock chat/embedding/image generation
 import os
 import types
 from enum import Enum
-from typing import List, Optional, Union, Tuple
+from typing import List, Optional, Tuple, Union

 import httpx

@@ -158,6 +158,7 @@ class AmazonAnthropicClaude3Config:
     def get_supported_openai_params(self):
         return [
             "max_tokens",
+            "max_completion_tokens",
             "tools",
             "tool_choice",
             "stream",
@@ -169,7 +170,7 @@ class AmazonAnthropicClaude3Config:

     def map_openai_params(self, non_default_params: dict, optional_params: dict):
         for param, value in non_default_params.items():
-            if param == "max_tokens":
+            if param == "max_tokens" or param == "max_completion_tokens":
                 optional_params["max_tokens"] = value
             if param == "tools":
                 optional_params["tools"] = value
@@ -240,11 +241,18 @@ class AmazonAnthropicConfig:
     def get_supported_openai_params(
         self,
     ):
-        return ["max_tokens", "temperature", "stop", "top_p", "stream"]
+        return [
+            "max_tokens",
+            "max_completion_tokens",
+            "temperature",
+            "stop",
+            "top_p",
+            "stream",
+        ]

     def map_openai_params(self, non_default_params: dict, optional_params: dict):
         for param, value in non_default_params.items():
-            if param == "max_tokens":
+            if param == "max_tokens" or param == "max_completion_tokens":
                 optional_params["max_tokens_to_sample"] = value
             if param == "temperature":
                 optional_params["temperature"] = value
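The legacy (non-Claude-3) Anthropic-on-Bedrock config uses yet another native name, max_tokens_to_sample; the new test file below asserts the mapping, roughly like this (sketch):

from litellm.llms.bedrock.common_utils import AmazonAnthropicConfig

mapped = AmazonAnthropicConfig().map_openai_params(
    non_default_params={"max_completion_tokens": 10},
    optional_params={},
)
assert mapped == {"max_tokens_to_sample": 10}  # legacy Anthropic-on-Bedrock parameter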
@@ -70,6 +70,7 @@ class CerebrasConfig:

         return [
             "max_tokens",
+            "max_completion_tokens",
             "response_format",
             "seed",
             "stop",
@@ -86,6 +87,8 @@ class CerebrasConfig:
     ) -> dict:
         supported_openai_params = self.get_supported_openai_params(model=model)
         for param, value in non_default_params.items():
-            if param in supported_openai_params:
+            if param == "max_completion_tokens":
+                optional_params["max_tokens"] = value
+            elif param in supported_openai_params:
                 optional_params[param] = value
         return optional_params
@@ -106,11 +106,19 @@ class DatabricksConfig:
         ]

     def get_supported_openai_params(self):
-        return ["stream", "stop", "temperature", "top_p", "max_tokens", "n"]
+        return [
+            "stream",
+            "stop",
+            "temperature",
+            "top_p",
+            "max_tokens",
+            "max_completion_tokens",
+            "n",
+        ]

     def map_openai_params(self, non_default_params: dict, optional_params: dict):
         for param, value in non_default_params.items():
-            if param == "max_tokens":
+            if param == "max_tokens" or param == "max_completion_tokens":
                 optional_params["max_tokens"] = value
             if param == "n":
                 optional_params["n"] = value
@@ -73,6 +73,7 @@ class FireworksAIConfig:
             "stream",
             "tools",
             "tool_choice",
+            "max_completion_tokens",
             "max_tokens",
             "temperature",
             "top_p",
@@ -102,6 +103,8 @@ class FireworksAIConfig:
                 else:
                     # pass through the value of tool choice
                     optional_params["tool_choice"] = value
+            elif param == "max_completion_tokens":
+                optional_params["max_tokens"] = value
             elif param in supported_openai_params:
                 if value is not None:
                     optional_params[param] = value
@@ -139,6 +139,7 @@ class HuggingfaceConfig:
             "stream",
             "temperature",
             "max_tokens",
+            "max_completion_tokens",
             "top_p",
             "stop",
             "n",
@@ -167,7 +168,7 @@ class HuggingfaceConfig:
                 optional_params["stream"] = value
             if param == "stop":
                 optional_params["stop"] = value
-            if param == "max_tokens":
+            if param == "max_tokens" or param == "max_completion_tokens":
                 # HF TGI raises the following exception when max_new_tokens==0
                 # Failed: Error occurred: HuggingfaceException - Input validation error: `max_new_tokens` must be strictly positive
                 if value == 0:
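For Hugging Face TGI the limit ultimately lands in max_new_tokens; the new test file below asserts that mapping, roughly (sketch):

from litellm.llms.huggingface_restapi import HuggingfaceConfig

mapped = HuggingfaceConfig().map_openai_params({"max_completion_tokens": 10}, {})
assert mapped == {"max_new_tokens": 10}  # TGI's native parameter name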
@@ -78,6 +78,7 @@ class NvidiaNimConfig:
                 "temperature",
                 "top_p",
                 "max_tokens",
+                "max_completion_tokens",
             ]
         elif model == "nvidia/nemotron-4-340b-reward":
             return [
@@ -92,6 +93,7 @@ class NvidiaNimConfig:
                 "frequency_penalty",
                 "presence_penalty",
                 "max_tokens",
+                "max_completion_tokens",
                 "stop",
             ]
         else:
@@ -124,6 +126,7 @@ class NvidiaNimConfig:
                 "frequency_penalty",
                 "presence_penalty",
                 "max_tokens",
+                "max_completion_tokens",
                 "stop",
                 "seed",
             ]
@@ -133,6 +136,8 @@ class NvidiaNimConfig:
     ) -> dict:
         supported_openai_params = self.get_supported_openai_params(model=model)
         for param, value in non_default_params.items():
-            if param in supported_openai_params:
+            if param == "max_completion_tokens":
+                optional_params["max_tokens"] = value
+            elif param in supported_openai_params:
                 optional_params[param] = value
         return optional_params
@@ -140,6 +140,7 @@ class OllamaChatConfig:
     ):
         return [
             "max_tokens",
+            "max_completion_tokens",
             "stream",
             "top_p",
             "temperature",
@@ -156,7 +157,7 @@ class OllamaChatConfig:
         self, model: str, non_default_params: dict, optional_params: dict
     ):
         for param, value in non_default_params.items():
-            if param == "max_tokens":
+            if param == "max_tokens" or param == "max_completion_tokens":
                 optional_params["num_predict"] = value
             if param == "stream":
                 optional_params["stream"] = value
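Ollama's native option is num_predict, so both OpenAI spellings now resolve to it; the mapping is covered by the new test file below and looks roughly like this (sketch):

from litellm.llms.ollama_chat import OllamaChatConfig

mapped = OllamaChatConfig().map_openai_params(
    model="llama3",
    non_default_params={"max_completion_tokens": 10},
    optional_params={},
)
assert mapped == {"num_predict": 10}  # Ollama's native token-limit option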
@@ -154,6 +154,7 @@ class PredibaseConfig:
         return [
             "stream",
             "temperature",
+            "max_completion_tokens",
             "max_tokens",
             "top_p",
             "stop",
@@ -181,7 +182,7 @@ class PredibaseConfig:
                 optional_params["stream"] = value
             if param == "stop":
                 optional_params["stop"] = value
-            if param == "max_tokens":
+            if param == "max_tokens" or param == "max_completion_tokens":
                 # HF TGI raises the following exception when max_new_tokens==0
                 # Failed: Error occurred: HuggingfaceException - Input validation error: `max_new_tokens` must be strictly positive
                 if value == 0:
@@ -141,6 +141,7 @@ class MistralTextCompletionConfig:
             "temperature",
             "top_p",
             "max_tokens",
+            "max_completion_tokens",
             "stream",
             "seed",
             "stop",
@@ -154,7 +155,7 @@ class MistralTextCompletionConfig:
                 optional_params["temperature"] = value
             if param == "top_p":
                 optional_params["top_p"] = value
-            if param == "max_tokens":
+            if param == "max_tokens" or param == "max_completion_tokens":
                 optional_params["max_tokens"] = value
             if param == "stream" and value == True:
                 optional_params["stream"] = value
@@ -158,6 +158,7 @@ class VertexAIConfig:
             "temperature",
             "top_p",
             "max_tokens",
+            "max_completion_tokens",
             "stream",
             "tools",
             "tool_choice",
@@ -184,7 +185,7 @@ class VertexAIConfig:
                     optional_params["stop_sequences"] = [value]
                 elif isinstance(value, list):
                     optional_params["stop_sequences"] = value
-            if param == "max_tokens":
+            if param == "max_tokens" or param == "max_completion_tokens":
                 optional_params["max_output_tokens"] = value
             if param == "response_format" and value["type"] == "json_object":
                 optional_params["response_mime_type"] = "application/json"
@@ -319,6 +320,7 @@ class GoogleAIStudioGeminiConfig:  # key diff from VertexAI - 'frequency_penalty'
             "temperature",
             "top_p",
             "max_tokens",
+            "max_completion_tokens",
             "stream",
             "tools",
             "tool_choice",
@@ -413,7 +415,7 @@ class GoogleAIStudioGeminiConfig:  # key diff from VertexAI - 'frequency_penalty'
                     optional_params["stop_sequences"] = [value]
                 elif isinstance(value, list):
                     optional_params["stop_sequences"] = value
-            if param == "max_tokens":
+            if param == "max_tokens" or param == "max_completion_tokens":
                 optional_params["max_output_tokens"] = value
             if param == "response_format":  # type: ignore
                 if value["type"] == "json_object":  # type: ignore
@@ -554,6 +556,7 @@ class VertexGeminiConfig:
             "temperature",
             "top_p",
             "max_tokens",
+            "max_completion_tokens",
             "stream",
             "tools",
             "functions",
@@ -653,7 +656,7 @@ class VertexGeminiConfig:
                     optional_params["stop_sequences"] = [value]
                 elif isinstance(value, list):
                     optional_params["stop_sequences"] = value
-            if param == "max_tokens":
+            if param == "max_tokens" or param == "max_completion_tokens":
                 optional_params["max_output_tokens"] = value
             if param == "response_format" and isinstance(value, dict):  # type: ignore
                 if value["type"] == "json_object":
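Vertex AI and Gemini use max_output_tokens in generationConfig, so all three Gemini-family configs above now accept the OpenAI-style name. The new test file below checks exactly this; a sketch of the Vertex Gemini case:

from litellm.llms.vertex_ai_and_google_ai_studio.gemini.vertex_and_google_ai_studio_gemini import (
    VertexGeminiConfig,
)

mapped = VertexGeminiConfig().map_openai_params(
    model="gemini-1.0-pro",
    non_default_params={"max_completion_tokens": 10},
    optional_params={},
    drop_params=False,
)
assert mapped == {"max_output_tokens": 10}  # Gemini's generationConfig field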
@@ -114,6 +114,7 @@ class VertexAIAnthropicConfig:
     def get_supported_openai_params(self):
         return [
             "max_tokens",
+            "max_completion_tokens",
             "tools",
             "tool_choice",
             "stream",
@@ -125,7 +126,7 @@ class VertexAIAnthropicConfig:

     def map_openai_params(self, non_default_params: dict, optional_params: dict):
         for param, value in non_default_params.items():
-            if param == "max_tokens":
+            if param == "max_tokens" or param == "max_completion_tokens":
                 optional_params["max_tokens"] = value
             if param == "tools":
                 optional_params["tools"] = value
@@ -46,6 +46,10 @@ class VertexAIAi21Config:
     def map_openai_params(
         self, non_default_params: dict, optional_params: dict, model: str
     ):
+        if "max_completion_tokens" in non_default_params:
+            non_default_params["max_tokens"] = non_default_params.pop(
+                "max_completion_tokens"
+            )
         return litellm.OpenAIConfig().map_openai_params(
             non_default_params=non_default_params,
             optional_params=optional_params,
@@ -52,6 +52,10 @@ class VertexAILlama3Config:
     def map_openai_params(
         self, non_default_params: dict, optional_params: dict, model: str
     ):
+        if "max_completion_tokens" in non_default_params:
+            non_default_params["max_tokens"] = non_default_params.pop(
+                "max_completion_tokens"
+            )
         return litellm.OpenAIConfig().map_openai_params(
             non_default_params=non_default_params,
             optional_params=optional_params,
@@ -60,6 +60,7 @@ class VolcEngineConfig:
             "logit_bias",
             "logprobs",
             "top_logprobs",
+            "max_completion_tokens",
             "max_tokens",
             "n",
             "presence_penalty",
@@ -82,6 +83,8 @@ class VolcEngineConfig:
     ) -> dict:
         supported_openai_params = self.get_supported_openai_params(model)
         for param, value in non_default_params.items():
-            if param in supported_openai_params:
+            if param == "max_completion_tokens":
+                optional_params["max_tokens"] = value
+            elif param in supported_openai_params:
                 optional_params[param] = value
         return optional_params
@@ -264,6 +264,7 @@ async def acompletion(
     stream_options: Optional[dict] = None,
     stop=None,
     max_tokens: Optional[int] = None,
+    max_completion_tokens: Optional[int] = None,
     presence_penalty: Optional[float] = None,
     frequency_penalty: Optional[float] = None,
     logit_bias: Optional[dict] = None,
@@ -303,6 +304,7 @@ async def acompletion(
         stream_options (dict, optional): A dictionary containing options for the streaming response. Only use this if stream is True.
         stop(string/list, optional): - Up to 4 sequences where the LLM API will stop generating further tokens.
         max_tokens (integer, optional): The maximum number of tokens in the generated completion (default is infinity).
+        max_completion_tokens (integer, optional): An upper bound for the number of tokens that can be generated for a completion, including visible output tokens and reasoning tokens.
         presence_penalty (float, optional): It is used to penalize new tokens based on their existence in the text so far.
         frequency_penalty: It is used to penalize new tokens based on their frequency in the text so far.
         logit_bias (dict, optional): Used to modify the probability of specific tokens appearing in the completion.
@@ -341,6 +343,7 @@ async def acompletion(
         "stream_options": stream_options,
         "stop": stop,
         "max_tokens": max_tokens,
+        "max_completion_tokens": max_completion_tokens,
         "presence_penalty": presence_penalty,
         "frequency_penalty": frequency_penalty,
         "logit_bias": logit_bias,
@@ -633,6 +636,7 @@ def completion(
     stream: Optional[bool] = None,
     stream_options: Optional[dict] = None,
     stop=None,
+    max_completion_tokens: Optional[int] = None,
     max_tokens: Optional[int] = None,
     presence_penalty: Optional[float] = None,
     frequency_penalty: Optional[float] = None,
@@ -675,6 +679,7 @@ def completion(
         stream_options (dict, optional): A dictionary containing options for the streaming response. Only set this when you set stream: true.
         stop(string/list, optional): - Up to 4 sequences where the LLM API will stop generating further tokens.
         max_tokens (integer, optional): The maximum number of tokens in the generated completion (default is infinity).
+        max_completion_tokens (integer, optional): An upper bound for the number of tokens that can be generated for a completion, including visible output tokens and reasoning tokens.
         presence_penalty (float, optional): It is used to penalize new tokens based on their existence in the text so far.
         frequency_penalty: It is used to penalize new tokens based on their frequency in the text so far.
         logit_bias (dict, optional): Used to modify the probability of specific tokens appearing in the completion.
@@ -759,6 +764,7 @@ def completion(
         "stream",
         "stream_options",
         "stop",
+        "max_completion_tokens",
         "max_tokens",
         "presence_penalty",
         "frequency_penalty",
@@ -917,6 +923,7 @@ def completion(
             stream_options=stream_options,
             stop=stop,
             max_tokens=max_tokens,
+            max_completion_tokens=max_completion_tokens,
             presence_penalty=presence_penalty,
             frequency_penalty=frequency_penalty,
             logit_bias=logit_bias,
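The async entrypoint picks up the new parameter as well; a sketch mirroring the mocked tests added later in this commit (o1-preview is one of the models parametrized there):

import asyncio
import litellm

async def main():
    # acompletion() now accepts max_completion_tokens and threads it through
    # get_optional_params() to the provider-specific mapping.
    response = await litellm.acompletion(
        model="o1-preview",
        messages=[{"role": "user", "content": "Hello!"}],
        max_completion_tokens=10,
    )
    print(response)

asyncio.run(main())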
@@ -6,7 +6,7 @@
         "input_cost_per_token": 0.0000,
         "output_cost_per_token": 0.000,
         "litellm_provider": "one of https://docs.litellm.ai/docs/providers",
-        "mode": "one of chat, embedding, completion, image_generation, audio_transcription",
+        "mode": "one of chat, embedding, completion, image_generation, audio_transcription, audio_speech",
         "supports_function_calling": true,
         "supports_parallel_function_calling": true,
         "supports_vision": true
@@ -1317,11 +1317,12 @@ import openai


 def test_completion_gpt4_turbo():
+    litellm.set_verbose = True
     try:
         response = completion(
             model="gpt-4-1106-preview",
             messages=messages,
-            max_tokens=10,
+            max_completion_tokens=10,
         )
         print(response)
     except openai.RateLimitError:
@@ -2765,6 +2765,7 @@ def get_optional_params(
     stream_options=None,
     stop=None,
     max_tokens=None,
+    max_completion_tokens=None,
     presence_penalty=None,
     frequency_penalty=None,
     logit_bias=None,
@@ -2842,6 +2843,7 @@ def get_optional_params(
         "stream_options": None,
         "stop": None,
         "max_tokens": None,
+        "max_completion_tokens": None,
         "presence_penalty": None,
         "frequency_penalty": None,
         "logit_bias": None,
tests/llm_translation/conftest.py (new file, 54 lines)
@@ -0,0 +1,54 @@
+# conftest.py
+
+import importlib
+import os
+import sys
+
+import pytest
+
+sys.path.insert(
+    0, os.path.abspath("../..")
+)  # Adds the parent directory to the system path
+import litellm
+
+
+@pytest.fixture(scope="function", autouse=True)
+def setup_and_teardown():
+    """
+    This fixture reloads litellm before every function. To speed up testing by removing callbacks being chained.
+    """
+    curr_dir = os.getcwd()  # Get the current working directory
+    sys.path.insert(
+        0, os.path.abspath("../..")
+    )  # Adds the project directory to the system path
+
+    import litellm
+    from litellm import Router
+
+    importlib.reload(litellm)
+    import asyncio
+
+    loop = asyncio.get_event_loop_policy().new_event_loop()
+    asyncio.set_event_loop(loop)
+    print(litellm)
+    # from litellm import Router, completion, aembedding, acompletion, embedding
+    yield
+
+    # Teardown code (executes after the yield point)
+    loop.close()  # Close the loop created earlier
+    asyncio.set_event_loop(None)  # Remove the reference to the loop
+
+
+def pytest_collection_modifyitems(config, items):
+    # Separate tests in 'test_amazing_proxy_custom_logger.py' and other tests
+    custom_logger_tests = [
+        item for item in items if "custom_logger" in item.parent.name
+    ]
+    other_tests = [item for item in items if "custom_logger" not in item.parent.name]
+
+    # Sort tests based on their names
+    custom_logger_tests.sort(key=lambda x: x.name)
+    other_tests.sort(key=lambda x: x.name)
+
+    # Reorder the items list
+    items[:] = custom_logger_tests + other_tests
tests/llm_translation/test_max_completion_tokens.py (new file, 342 lines)
@@ -0,0 +1,342 @@
+import json
+import os
+import sys
+
+sys.path.insert(
+    0, os.path.abspath("../..")
+)  # Adds the parent directory to the system path
+from datetime import datetime
+from unittest.mock import AsyncMock
+from dotenv import load_dotenv
+
+load_dotenv()
+import httpx
+import pytest
+from respx import MockRouter
+
+import litellm
+from litellm import Choices, Message, ModelResponse
+
+# Adds the parent directory to the system path
+
+
+def return_mocked_response(model: str):
+    if model == "bedrock/mistral.mistral-large-2407-v1:0":
+        return {
+            "metrics": {"latencyMs": 316},
+            "output": {
+                "message": {
+                    "content": [{"text": "Hello! How are you doing today? How can"}],
+                    "role": "assistant",
+                }
+            },
+            "stopReason": "max_tokens",
+            "usage": {"inputTokens": 5, "outputTokens": 10, "totalTokens": 15},
+        }
+
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        "bedrock/mistral.mistral-large-2407-v1:0",
+    ],
+)
+@pytest.mark.respx
+@pytest.mark.asyncio()
+async def test_bedrock_max_completion_tokens(model: str, respx_mock: MockRouter):
+    """
+    Tests that:
+    - max_completion_tokens is passed as max_tokens to bedrock models
+    """
+    litellm.set_verbose = True
+
+    mock_response = return_mocked_response(model)
+    _model = model.split("/")[1]
+    print("\n\nmock_response: ", mock_response)
+    url = f"https://bedrock-runtime.us-west-2.amazonaws.com/model/{_model}/converse"
+    mock_request = respx_mock.post(url).mock(
+        return_value=httpx.Response(200, json=mock_response)
+    )
+
+    response = await litellm.acompletion(
+        model=model,
+        max_completion_tokens=10,
+        messages=[{"role": "user", "content": "Hello!"}],
+    )
+
+    assert mock_request.called
+    request_body = json.loads(mock_request.calls[0].request.content)
+
+    print("request_body: ", request_body)
+
+    assert request_body == {
+        "messages": [{"role": "user", "content": [{"text": "Hello!"}]}],
+        "additionalModelRequestFields": {},
+        "system": [],
+        "inferenceConfig": {"maxTokens": 10},
+    }
+    print(f"response: {response}")
+    assert isinstance(response, ModelResponse)
+
+
+@pytest.mark.parametrize(
+    "model",
+    ["anthropic/claude-3-sonnet-20240229", "anthropic/claude-3-opus-20240229,"],
+)
+@pytest.mark.respx
+@pytest.mark.asyncio()
+async def test_anthropic_api_max_completion_tokens(model: str, respx_mock: MockRouter):
+    """
+    Tests that:
+    - max_completion_tokens is passed as max_tokens to anthropic models
+    """
+    litellm.set_verbose = True
+
+    mock_response = {
+        "content": [{"text": "Hi! My name is Claude.", "type": "text"}],
+        "id": "msg_013Zva2CMHLNnXjNJJKqJ2EF",
+        "model": "claude-3-5-sonnet-20240620",
+        "role": "assistant",
+        "stop_reason": "end_turn",
+        "stop_sequence": None,
+        "type": "message",
+        "usage": {"input_tokens": 2095, "output_tokens": 503},
+    }
+
+    print("\n\nmock_response: ", mock_response)
+    url = f"https://api.anthropic.com/v1/messages"
+    mock_request = respx_mock.post(url).mock(
+        return_value=httpx.Response(200, json=mock_response)
+    )
+
+    response = await litellm.acompletion(
+        model=model,
+        max_completion_tokens=10,
+        messages=[{"role": "user", "content": "Hello!"}],
+    )
+
+    assert mock_request.called
+    request_body = json.loads(mock_request.calls[0].request.content)
+
+    print("request_body: ", request_body)
+
+    assert request_body == {
+        "messages": [{"role": "user", "content": [{"type": "text", "text": "Hello!"}]}],
+        "max_tokens": 10,
+        "model": model.split("/")[-1],
+    }
+    print(f"response: {response}")
+    assert isinstance(response, ModelResponse)
+
+
+def test_all_model_configs():
+    from litellm.llms.vertex_ai_and_google_ai_studio.vertex_ai_partner_models.ai21.transformation import (
+        VertexAIAi21Config,
+    )
+    from litellm.llms.vertex_ai_and_google_ai_studio.vertex_ai_partner_models.llama3.transformation import (
+        VertexAILlama3Config,
+    )
+
+    assert (
+        "max_completion_tokens" in VertexAILlama3Config().get_supported_openai_params()
+    )
+    assert VertexAILlama3Config().map_openai_params(
+        {"max_completion_tokens": 10}, {}, "llama3"
+    ) == {"max_tokens": 10}
+
+    assert "max_completion_tokens" in VertexAIAi21Config().get_supported_openai_params()
+    assert VertexAIAi21Config().map_openai_params(
+        {"max_completion_tokens": 10}, {}, "llama3"
+    ) == {"max_tokens": 10}
+
+    from litellm.llms.fireworks_ai import FireworksAIConfig
+
+    assert "max_completion_tokens" in FireworksAIConfig().get_supported_openai_params()
+    assert FireworksAIConfig().map_openai_params(
+        {"max_completion_tokens": 10}, {}, "llama3"
+    ) == {"max_tokens": 10}
+
+    from litellm.llms.huggingface_restapi import HuggingfaceConfig
+
+    assert "max_completion_tokens" in HuggingfaceConfig().get_supported_openai_params()
+    assert HuggingfaceConfig().map_openai_params({"max_completion_tokens": 10}, {}) == {
+        "max_new_tokens": 10
+    }
+
+    from litellm.llms.nvidia_nim import NvidiaNimConfig
+
+    assert "max_completion_tokens" in NvidiaNimConfig().get_supported_openai_params(
+        model="llama3"
+    )
+    assert NvidiaNimConfig().map_openai_params(
+        model="llama3",
+        non_default_params={"max_completion_tokens": 10},
+        optional_params={},
+    ) == {"max_tokens": 10}
+
+    from litellm.llms.ollama_chat import OllamaChatConfig
+
+    assert "max_completion_tokens" in OllamaChatConfig().get_supported_openai_params()
+    assert OllamaChatConfig().map_openai_params(
+        model="llama3",
+        non_default_params={"max_completion_tokens": 10},
+        optional_params={},
+    ) == {"num_predict": 10}
+
+    from litellm.llms.predibase import PredibaseConfig
+
+    assert "max_completion_tokens" in PredibaseConfig().get_supported_openai_params()
+    assert PredibaseConfig().map_openai_params(
+        {"max_completion_tokens": 10},
+        {},
+    ) == {"max_new_tokens": 10}
+
+    from litellm.llms.text_completion_codestral import MistralTextCompletionConfig
+
+    assert (
+        "max_completion_tokens"
+        in MistralTextCompletionConfig().get_supported_openai_params()
+    )
+    assert MistralTextCompletionConfig().map_openai_params(
+        {"max_completion_tokens": 10},
+        {},
+    ) == {"max_tokens": 10}
+
+    from litellm.llms.volcengine import VolcEngineConfig
+
+    assert "max_completion_tokens" in VolcEngineConfig().get_supported_openai_params(
+        model="llama3"
+    )
+    assert VolcEngineConfig().map_openai_params(
+        model="llama3",
+        non_default_params={"max_completion_tokens": 10},
+        optional_params={},
+    ) == {"max_tokens": 10}
+
+    from litellm.llms.AI21.chat import AI21ChatConfig
+
+    assert "max_completion_tokens" in AI21ChatConfig().get_supported_openai_params(
+        "jamba-1.5-mini@001"
+    )
+    assert AI21ChatConfig().map_openai_params(
+        model="jamba-1.5-mini@001",
+        non_default_params={"max_completion_tokens": 10},
+        optional_params={},
+    ) == {"max_tokens": 10}
+
+    from litellm.llms.AzureOpenAI.azure import AzureOpenAIConfig
+
+    assert "max_completion_tokens" in AzureOpenAIConfig().get_supported_openai_params()
+    assert AzureOpenAIConfig().map_openai_params(
+        model="gpt-3.5-turbo",
+        non_default_params={"max_completion_tokens": 10},
+        optional_params={},
+        api_version="2022-12-01",
+        drop_params=False,
+    ) == {"max_tokens": 10}
+
+    from litellm.llms.bedrock.chat import AmazonConverseConfig
+
+    assert (
+        "max_completion_tokens"
+        in AmazonConverseConfig().get_supported_openai_params(
+            model="anthropic.claude-3-sonnet-20240229-v1:0"
+        )
+    )
+    assert AmazonConverseConfig().map_openai_params(
+        model="anthropic.claude-3-sonnet-20240229-v1:0",
+        non_default_params={"max_completion_tokens": 10},
+        optional_params={},
+        drop_params=False,
+    ) == {"maxTokens": 10}
+
+    from litellm.llms.text_completion_codestral import MistralTextCompletionConfig
+
+    assert (
+        "max_completion_tokens"
+        in MistralTextCompletionConfig().get_supported_openai_params()
+    )
+    assert MistralTextCompletionConfig().map_openai_params(
+        non_default_params={"max_completion_tokens": 10},
+        optional_params={},
+    ) == {"max_tokens": 10}
+
+    from litellm.llms.bedrock.common_utils import (
+        AmazonAnthropicClaude3Config,
+        AmazonAnthropicConfig,
+    )
+
+    assert (
+        "max_completion_tokens"
+        in AmazonAnthropicClaude3Config().get_supported_openai_params()
+    )
+
+    assert AmazonAnthropicClaude3Config().map_openai_params(
+        non_default_params={"max_completion_tokens": 10},
+        optional_params={},
+    ) == {"max_tokens": 10}
+
+    assert (
+        "max_completion_tokens" in AmazonAnthropicConfig().get_supported_openai_params()
+    )
+
+    assert AmazonAnthropicConfig().map_openai_params(
+        non_default_params={"max_completion_tokens": 10},
+        optional_params={},
+    ) == {"max_tokens_to_sample": 10}
+
+    from litellm.llms.databricks.chat import DatabricksConfig
+
+    assert "max_completion_tokens" in DatabricksConfig().get_supported_openai_params()
+
+    assert DatabricksConfig().map_openai_params(
+        non_default_params={"max_completion_tokens": 10},
+        optional_params={},
+    ) == {"max_tokens": 10}
+
+    from litellm.llms.vertex_ai_and_google_ai_studio.vertex_ai_anthropic import (
+        VertexAIAnthropicConfig,
+    )
+
+    assert (
+        "max_completion_tokens"
+        in VertexAIAnthropicConfig().get_supported_openai_params()
+    )
+
+    assert VertexAIAnthropicConfig().map_openai_params(
+        non_default_params={"max_completion_tokens": 10},
+        optional_params={},
+    ) == {"max_tokens": 10}
+
+    from litellm.llms.vertex_ai_and_google_ai_studio.gemini.vertex_and_google_ai_studio_gemini import (
+        VertexAIConfig,
+        GoogleAIStudioGeminiConfig,
+        VertexGeminiConfig,
+    )
+
+    assert "max_completion_tokens" in VertexAIConfig().get_supported_openai_params()
+
+    assert VertexAIConfig().map_openai_params(
+        non_default_params={"max_completion_tokens": 10},
+        optional_params={},
+    ) == {"max_output_tokens": 10}
+
+    assert (
+        "max_completion_tokens"
+        in GoogleAIStudioGeminiConfig().get_supported_openai_params()
+    )
+
+    assert GoogleAIStudioGeminiConfig().map_openai_params(
+        model="gemini-1.0-pro",
+        non_default_params={"max_completion_tokens": 10},
+        optional_params={},
+    ) == {"max_output_tokens": 10}
+
+    assert "max_completion_tokens" in VertexGeminiConfig().get_supported_openai_params()
+
+    assert VertexGeminiConfig().map_openai_params(
+        model="gemini-1.0-pro",
+        non_default_params={"max_completion_tokens": 10},
+        optional_params={},
+        drop_params=False,
+    ) == {"max_output_tokens": 10}
@@ -1,7 +1,14 @@
 import json
+import os
+import sys
 from datetime import datetime
 from unittest.mock import AsyncMock
+
+sys.path.insert(
+    0, os.path.abspath("../..")
+)  # Adds the parent directory to the system path
+
+
 import httpx
 import pytest
 from respx import MockRouter
@@ -50,3 +57,45 @@ async def test_o1_handle_system_role(respx_mock: MockRouter):

     print(f"response: {response}")
     assert isinstance(response, ModelResponse)
+
+
+@pytest.mark.asyncio
+@pytest.mark.respx
+@pytest.mark.parametrize("model", ["gpt-4", "gpt-4-0314", "gpt-4-32k", "o1-preview"])
+async def test_o1_max_completion_tokens(respx_mock: MockRouter, model: str):
+    """
+    Tests that:
+    - max_completion_tokens is passed directly to OpenAI chat completion models
+    """
+    litellm.set_verbose = True
+
+    mock_response = ModelResponse(
+        id="cmpl-mock",
+        choices=[Choices(message=Message(content="Mocked response", role="assistant"))],
+        created=int(datetime.now().timestamp()),
+        model=model,
+    )
+
+    mock_request = respx_mock.post("https://api.openai.com/v1/chat/completions").mock(
+        return_value=httpx.Response(200, json=mock_response.dict())
+    )
+
+    response = await litellm.acompletion(
+        model=model,
+        max_completion_tokens=10,
+        messages=[{"role": "user", "content": "Hello!"}],
+    )
+
+    assert mock_request.called
+    request_body = json.loads(mock_request.calls[0].request.content)
+
+    print("request_body: ", request_body)
+
+    assert request_body == {
+        "model": model,
+        "max_completion_tokens": 10,
+        "messages": [{"role": "user", "content": "Hello!"}],
+    }
+
+    print(f"response: {response}")
+    assert isinstance(response, ModelResponse)