[Feat] Add max_completion_tokens param (#5691)

* add max_completion_tokens

* add max_completion_tokens

* add max_completion_tokens support for OpenAI models

* add max_completion_tokens param

* add max_completion_tokens for bedrock converse models

* add test for converse maxTokens

* fix openai o1 param mapping test

* move test optional params

* add max_completion_tokens for anthropic api

* fix conftest

* add max_completion tokens for vertex ai partner models

* add max_completion_tokens for fireworks ai

* add max_completion_tokens for hf rest api

* add test for param mapping

* add param mapping for vertex, gemini + testing

* skip predibase tests in CI — the predibase API is currently too unstable for reliable CI/CD runs

* add max_completion_tokens to openai supported params

* fix fireworks ai param mapping
This commit is contained in:
Ishaan Jaff 2024-09-14 14:57:01 -07:00 committed by GitHub
parent 415a3ede9e
commit 85acdb9193
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
31 changed files with 591 additions and 35 deletions

View file

@ -230,6 +230,34 @@ jobs:
# Store test results
- store_test_results:
path: test-results
llm_translation_testing:
docker:
- image: cimg/python:3.11
working_directory: ~/project
steps:
- checkout
- run:
name: Install Dependencies
command: |
python -m pip install --upgrade pip
python -m pip install -r requirements.txt
pip install "pytest==7.3.1"
pip install "pytest-retry==1.6.3"
pip install "pytest-asyncio==0.21.1"
pip install "respx==0.21.1"
# Run pytest and generate JUnit XML report
- run:
name: Run tests
command: |
pwd
ls
python -m pytest -vv tests/llm_translation -x -s -v --junitxml=test-results/junit.xml --durations=5
no_output_timeout: 120m
# Store test results
- store_test_results:
path: test-results
installing_litellm_on_python:
docker:
@ -370,7 +398,7 @@ jobs:
command: |
pwd
ls
python -m pytest -s -vv tests/ -x --junitxml=test-results/junit.xml --durations=5 --ignore=tests/otel_tests --ignore=tests/pass_through_tests --ignore=tests/proxy_admin_ui_tests --ignore=tests/load_tests
python -m pytest -s -vv tests/ -x --junitxml=test-results/junit.xml --durations=5 --ignore=tests/otel_tests --ignore=tests/pass_through_tests --ignore=tests/proxy_admin_ui_tests --ignore=tests/load_tests --ignore=tests/llm_translation
no_output_timeout: 120m
# Store test results
@ -694,6 +722,12 @@ workflows:
only:
- main
- /litellm_.*/
- llm_translation_testing:
filters:
branches:
only:
- main
- /litellm_.*/
- installing_litellm_on_python:
filters:
branches:
@ -711,6 +745,7 @@ workflows:
- local_testing
- build_and_test
- load_testing
- llm_translation_testing
- litellm_router_testing
- litellm_assistants_api_testing
- ui_endpoint_testing

View file

@ -75,6 +75,7 @@ class AI21ChatConfig:
"tools",
"response_format",
"max_tokens",
"max_completion_tokens",
"temperature",
"top_p",
"stop",
@ -90,6 +91,8 @@ class AI21ChatConfig:
) -> dict:
supported_openai_params = self.get_supported_openai_params(model=model)
for param, value in non_default_params.items():
if param in supported_openai_params:
if param == "max_completion_tokens":
optional_params["max_tokens"] = value
elif param in supported_openai_params:
optional_params[param] = value
return optional_params

View file

@ -156,6 +156,7 @@ class AzureOpenAIConfig:
"stream",
"stop",
"max_tokens",
"max_completion_tokens",
"tools",
"tool_choice",
"presence_penalty",
@ -268,6 +269,9 @@ class AzureOpenAIConfig:
optional_params["json_mode"] = True
else:
optional_params["response_format"] = value
elif param == "max_completion_tokens":
# TODO - Azure OpenAI will probably add support for this, we should pass it through when Azure adds support
optional_params["max_tokens"] = value
elif param in supported_openai_params:
optional_params[param] = value

View file

@ -92,6 +92,7 @@ class OpenAIGPTConfig:
"logprobs",
"top_logprobs",
"max_tokens",
"max_completion_tokens",
"n",
"presence_penalty",
"seed",

View file

@ -190,6 +190,7 @@ class DeepInfraConfig:
"functions",
"logit_bias",
"max_tokens",
"max_completion_tokens",
"n",
"presence_penalty",
"stop",
@ -229,7 +230,9 @@ class DeepInfraConfig:
),
status_code=400,
)
if param in supported_openai_params:
elif param == "max_completion_tokens":
optional_params["max_tokens"] = value
elif param in supported_openai_params:
if value is not None:
optional_params[param] = value
return optional_params
@ -347,7 +350,9 @@ class OpenAIConfig:
- `logit_bias` (map): This optional parameter modifies the likelihood of specified tokens appearing in the completion.
- `max_tokens` (integer or null): This optional parameter helps to set the maximum number of tokens to generate in the chat completion.
- `max_tokens` (integer or null): This optional parameter helps to set the maximum number of tokens to generate in the chat completion. OpenAI has now deprecated in favor of max_completion_tokens, and is not compatible with o1 series models.
- `max_completion_tokens` (integer or null): An upper bound for the number of tokens that can be generated for a completion, including visible output tokens and reasoning tokens.
- `n` (integer or null): This optional parameter helps to set how many chat completion choices to generate for each input message.
@ -364,6 +369,7 @@ class OpenAIConfig:
function_call: Optional[Union[str, dict]] = None
functions: Optional[list] = None
logit_bias: Optional[dict] = None
max_completion_tokens: Optional[int] = None
max_tokens: Optional[int] = None
n: Optional[int] = None
presence_penalty: Optional[int] = None
@ -378,6 +384,7 @@ class OpenAIConfig:
function_call: Optional[Union[str, dict]] = None,
functions: Optional[list] = None,
logit_bias: Optional[dict] = None,
max_completion_tokens: Optional[int] = None,
max_tokens: Optional[int] = None,
n: Optional[int] = None,
presence_penalty: Optional[int] = None,

View file

@ -158,6 +158,7 @@ class AnthropicConfig:
"temperature",
"top_p",
"max_tokens",
"max_completion_tokens",
"tools",
"tool_choice",
"extra_headers",
@ -173,6 +174,8 @@ class AnthropicConfig:
for param, value in non_default_params.items():
if param == "max_tokens":
optional_params["max_tokens"] = value
if param == "max_completion_tokens":
optional_params["max_tokens"] = value
if param == "tools":
optional_params["tools"] = value
if param == "tool_choice":

View file

@ -94,16 +94,16 @@ class AzureOpenAIConfig(OpenAIConfig):
top_p: Optional[int] = None,
) -> None:
super().__init__(
frequency_penalty,
function_call,
functions,
logit_bias,
max_tokens,
n,
presence_penalty,
stop,
temperature,
top_p,
frequency_penalty=frequency_penalty,
function_call=function_call,
functions=functions,
logit_bias=logit_bias,
max_tokens=max_tokens,
n=n,
presence_penalty=presence_penalty,
stop=stop,
temperature=temperature,
top_p=top_p,
)

View file

@ -161,6 +161,7 @@ class AmazonCohereChatConfig:
def get_supported_openai_params(self) -> List[str]:
return [
"max_tokens",
"max_completion_tokens",
"stream",
"stop",
"temperature",
@ -177,7 +178,7 @@ class AmazonCohereChatConfig:
self, non_default_params: dict, optional_params: dict
) -> dict:
for param, value in non_default_params.items():
if param == "max_tokens":
if param == "max_tokens" or param == "max_completion_tokens":
optional_params["max_tokens"] = value
if param == "stream":
optional_params["stream"] = value
@ -1156,6 +1157,7 @@ class AmazonConverseConfig:
def get_supported_openai_params(self, model: str) -> List[str]:
supported_params = [
"max_tokens",
"max_completion_tokens",
"stream",
"stream_options",
"stop",
@ -1263,7 +1265,7 @@ class AmazonConverseConfig:
),
status_code=400,
)
if param == "max_tokens":
if param == "max_tokens" or param == "max_completion_tokens":
optional_params["maxTokens"] = value
if param == "stream":
optional_params["stream"] = value

View file

@ -5,7 +5,7 @@ Common utilities used across bedrock chat/embedding/image generation
import os
import types
from enum import Enum
from typing import List, Optional, Union, Tuple
from typing import List, Optional, Tuple, Union
import httpx
@ -158,6 +158,7 @@ class AmazonAnthropicClaude3Config:
def get_supported_openai_params(self):
return [
"max_tokens",
"max_completion_tokens",
"tools",
"tool_choice",
"stream",
@ -169,7 +170,7 @@ class AmazonAnthropicClaude3Config:
def map_openai_params(self, non_default_params: dict, optional_params: dict):
for param, value in non_default_params.items():
if param == "max_tokens":
if param == "max_tokens" or param == "max_completion_tokens":
optional_params["max_tokens"] = value
if param == "tools":
optional_params["tools"] = value
@ -240,11 +241,18 @@ class AmazonAnthropicConfig:
def get_supported_openai_params(
self,
):
return ["max_tokens", "temperature", "stop", "top_p", "stream"]
return [
"max_tokens",
"max_completion_tokens",
"temperature",
"stop",
"top_p",
"stream",
]
def map_openai_params(self, non_default_params: dict, optional_params: dict):
for param, value in non_default_params.items():
if param == "max_tokens":
if param == "max_tokens" or param == "max_completion_tokens":
optional_params["max_tokens_to_sample"] = value
if param == "temperature":
optional_params["temperature"] = value

View file

@ -70,6 +70,7 @@ class CerebrasConfig:
return [
"max_tokens",
"max_completion_tokens",
"response_format",
"seed",
"stop",
@ -86,6 +87,8 @@ class CerebrasConfig:
) -> dict:
supported_openai_params = self.get_supported_openai_params(model=model)
for param, value in non_default_params.items():
if param in supported_openai_params:
if param == "max_completion_tokens":
optional_params["max_tokens"] = value
elif param in supported_openai_params:
optional_params[param] = value
return optional_params

View file

@ -106,11 +106,19 @@ class DatabricksConfig:
]
def get_supported_openai_params(self):
return ["stream", "stop", "temperature", "top_p", "max_tokens", "n"]
return [
"stream",
"stop",
"temperature",
"top_p",
"max_tokens",
"max_completion_tokens",
"n",
]
def map_openai_params(self, non_default_params: dict, optional_params: dict):
for param, value in non_default_params.items():
if param == "max_tokens":
if param == "max_tokens" or param == "max_completion_tokens":
optional_params["max_tokens"] = value
if param == "n":
optional_params["n"] = value

View file

@ -73,6 +73,7 @@ class FireworksAIConfig:
"stream",
"tools",
"tool_choice",
"max_completion_tokens",
"max_tokens",
"temperature",
"top_p",
@ -102,6 +103,8 @@ class FireworksAIConfig:
else:
# pass through the value of tool choice
optional_params["tool_choice"] = value
elif param == "max_completion_tokens":
optional_params["max_tokens"] = value
elif param in supported_openai_params:
if value is not None:
optional_params[param] = value

View file

@ -139,6 +139,7 @@ class HuggingfaceConfig:
"stream",
"temperature",
"max_tokens",
"max_completion_tokens",
"top_p",
"stop",
"n",
@ -167,7 +168,7 @@ class HuggingfaceConfig:
optional_params["stream"] = value
if param == "stop":
optional_params["stop"] = value
if param == "max_tokens":
if param == "max_tokens" or param == "max_completion_tokens":
# HF TGI raises the following exception when max_new_tokens==0
# Failed: Error occurred: HuggingfaceException - Input validation error: `max_new_tokens` must be strictly positive
if value == 0:

View file

@ -78,6 +78,7 @@ class NvidiaNimConfig:
"temperature",
"top_p",
"max_tokens",
"max_completion_tokens",
]
elif model == "nvidia/nemotron-4-340b-reward":
return [
@ -92,6 +93,7 @@ class NvidiaNimConfig:
"frequency_penalty",
"presence_penalty",
"max_tokens",
"max_completion_tokens",
"stop",
]
else:
@ -124,6 +126,7 @@ class NvidiaNimConfig:
"frequency_penalty",
"presence_penalty",
"max_tokens",
"max_completion_tokens",
"stop",
"seed",
]
@ -133,6 +136,8 @@ class NvidiaNimConfig:
) -> dict:
supported_openai_params = self.get_supported_openai_params(model=model)
for param, value in non_default_params.items():
if param in supported_openai_params:
if param == "max_completion_tokens":
optional_params["max_tokens"] = value
elif param in supported_openai_params:
optional_params[param] = value
return optional_params

View file

@ -140,6 +140,7 @@ class OllamaChatConfig:
):
return [
"max_tokens",
"max_completion_tokens",
"stream",
"top_p",
"temperature",
@ -156,7 +157,7 @@ class OllamaChatConfig:
self, model: str, non_default_params: dict, optional_params: dict
):
for param, value in non_default_params.items():
if param == "max_tokens":
if param == "max_tokens" or param == "max_completion_tokens":
optional_params["num_predict"] = value
if param == "stream":
optional_params["stream"] = value

View file

@ -154,6 +154,7 @@ class PredibaseConfig:
return [
"stream",
"temperature",
"max_completion_tokens",
"max_tokens",
"top_p",
"stop",
@ -181,7 +182,7 @@ class PredibaseConfig:
optional_params["stream"] = value
if param == "stop":
optional_params["stop"] = value
if param == "max_tokens":
if param == "max_tokens" or param == "max_completion_tokens":
# HF TGI raises the following exception when max_new_tokens==0
# Failed: Error occurred: HuggingfaceException - Input validation error: `max_new_tokens` must be strictly positive
if value == 0:

View file

@ -141,6 +141,7 @@ class MistralTextCompletionConfig:
"temperature",
"top_p",
"max_tokens",
"max_completion_tokens",
"stream",
"seed",
"stop",
@ -154,7 +155,7 @@ class MistralTextCompletionConfig:
optional_params["temperature"] = value
if param == "top_p":
optional_params["top_p"] = value
if param == "max_tokens":
if param == "max_tokens" or param == "max_completion_tokens":
optional_params["max_tokens"] = value
if param == "stream" and value == True:
optional_params["stream"] = value

View file

@ -158,6 +158,7 @@ class VertexAIConfig:
"temperature",
"top_p",
"max_tokens",
"max_completion_tokens",
"stream",
"tools",
"tool_choice",
@ -184,7 +185,7 @@ class VertexAIConfig:
optional_params["stop_sequences"] = [value]
elif isinstance(value, list):
optional_params["stop_sequences"] = value
if param == "max_tokens":
if param == "max_tokens" or param == "max_completion_tokens":
optional_params["max_output_tokens"] = value
if param == "response_format" and value["type"] == "json_object":
optional_params["response_mime_type"] = "application/json"
@ -319,6 +320,7 @@ class GoogleAIStudioGeminiConfig: # key diff from VertexAI - 'frequency_penalty
"temperature",
"top_p",
"max_tokens",
"max_completion_tokens",
"stream",
"tools",
"tool_choice",
@ -413,7 +415,7 @@ class GoogleAIStudioGeminiConfig: # key diff from VertexAI - 'frequency_penalty
optional_params["stop_sequences"] = [value]
elif isinstance(value, list):
optional_params["stop_sequences"] = value
if param == "max_tokens":
if param == "max_tokens" or param == "max_completion_tokens":
optional_params["max_output_tokens"] = value
if param == "response_format": # type: ignore
if value["type"] == "json_object": # type: ignore
@ -554,6 +556,7 @@ class VertexGeminiConfig:
"temperature",
"top_p",
"max_tokens",
"max_completion_tokens",
"stream",
"tools",
"functions",
@ -653,7 +656,7 @@ class VertexGeminiConfig:
optional_params["stop_sequences"] = [value]
elif isinstance(value, list):
optional_params["stop_sequences"] = value
if param == "max_tokens":
if param == "max_tokens" or param == "max_completion_tokens":
optional_params["max_output_tokens"] = value
if param == "response_format" and isinstance(value, dict): # type: ignore
if value["type"] == "json_object":

View file

@ -114,6 +114,7 @@ class VertexAIAnthropicConfig:
def get_supported_openai_params(self):
return [
"max_tokens",
"max_completion_tokens",
"tools",
"tool_choice",
"stream",
@ -125,7 +126,7 @@ class VertexAIAnthropicConfig:
def map_openai_params(self, non_default_params: dict, optional_params: dict):
for param, value in non_default_params.items():
if param == "max_tokens":
if param == "max_tokens" or param == "max_completion_tokens":
optional_params["max_tokens"] = value
if param == "tools":
optional_params["tools"] = value

View file

@ -46,6 +46,10 @@ class VertexAIAi21Config:
def map_openai_params(
self, non_default_params: dict, optional_params: dict, model: str
):
if "max_completion_tokens" in non_default_params:
non_default_params["max_tokens"] = non_default_params.pop(
"max_completion_tokens"
)
return litellm.OpenAIConfig().map_openai_params(
non_default_params=non_default_params,
optional_params=optional_params,

View file

@ -52,6 +52,10 @@ class VertexAILlama3Config:
def map_openai_params(
self, non_default_params: dict, optional_params: dict, model: str
):
if "max_completion_tokens" in non_default_params:
non_default_params["max_tokens"] = non_default_params.pop(
"max_completion_tokens"
)
return litellm.OpenAIConfig().map_openai_params(
non_default_params=non_default_params,
optional_params=optional_params,

View file

@ -60,6 +60,7 @@ class VolcEngineConfig:
"logit_bias",
"logprobs",
"top_logprobs",
"max_completion_tokens",
"max_tokens",
"n",
"presence_penalty",
@ -82,6 +83,8 @@ class VolcEngineConfig:
) -> dict:
supported_openai_params = self.get_supported_openai_params(model)
for param, value in non_default_params.items():
if param in supported_openai_params:
if param == "max_completion_tokens":
optional_params["max_tokens"] = value
elif param in supported_openai_params:
optional_params[param] = value
return optional_params

View file

@ -264,6 +264,7 @@ async def acompletion(
stream_options: Optional[dict] = None,
stop=None,
max_tokens: Optional[int] = None,
max_completion_tokens: Optional[int] = None,
presence_penalty: Optional[float] = None,
frequency_penalty: Optional[float] = None,
logit_bias: Optional[dict] = None,
@ -303,6 +304,7 @@ async def acompletion(
stream_options (dict, optional): A dictionary containing options for the streaming response. Only use this if stream is True.
stop(string/list, optional): - Up to 4 sequences where the LLM API will stop generating further tokens.
max_tokens (integer, optional): The maximum number of tokens in the generated completion (default is infinity).
max_completion_tokens (integer, optional): An upper bound for the number of tokens that can be generated for a completion, including visible output tokens and reasoning tokens.
presence_penalty (float, optional): It is used to penalize new tokens based on their existence in the text so far.
frequency_penalty: It is used to penalize new tokens based on their frequency in the text so far.
logit_bias (dict, optional): Used to modify the probability of specific tokens appearing in the completion.
@ -341,6 +343,7 @@ async def acompletion(
"stream_options": stream_options,
"stop": stop,
"max_tokens": max_tokens,
"max_completion_tokens": max_completion_tokens,
"presence_penalty": presence_penalty,
"frequency_penalty": frequency_penalty,
"logit_bias": logit_bias,
@ -633,6 +636,7 @@ def completion(
stream: Optional[bool] = None,
stream_options: Optional[dict] = None,
stop=None,
max_completion_tokens: Optional[int] = None,
max_tokens: Optional[int] = None,
presence_penalty: Optional[float] = None,
frequency_penalty: Optional[float] = None,
@ -675,6 +679,7 @@ def completion(
stream_options (dict, optional): A dictionary containing options for the streaming response. Only set this when you set stream: true.
stop(string/list, optional): - Up to 4 sequences where the LLM API will stop generating further tokens.
max_tokens (integer, optional): The maximum number of tokens in the generated completion (default is infinity).
max_completion_tokens (integer, optional): An upper bound for the number of tokens that can be generated for a completion, including visible output tokens and reasoning tokens.
presence_penalty (float, optional): It is used to penalize new tokens based on their existence in the text so far.
frequency_penalty: It is used to penalize new tokens based on their frequency in the text so far.
logit_bias (dict, optional): Used to modify the probability of specific tokens appearing in the completion.
@ -759,6 +764,7 @@ def completion(
"stream",
"stream_options",
"stop",
"max_completion_tokens",
"max_tokens",
"presence_penalty",
"frequency_penalty",
@ -917,6 +923,7 @@ def completion(
stream_options=stream_options,
stop=stop,
max_tokens=max_tokens,
max_completion_tokens=max_completion_tokens,
presence_penalty=presence_penalty,
frequency_penalty=frequency_penalty,
logit_bias=logit_bias,

View file

@ -6,7 +6,7 @@
"input_cost_per_token": 0.0000,
"output_cost_per_token": 0.000,
"litellm_provider": "one of https://docs.litellm.ai/docs/providers",
"mode": "one of chat, embedding, completion, image_generation, audio_transcription",
"mode": "one of chat, embedding, completion, image_generation, audio_transcription, audio_speech",
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_vision": true

View file

@ -1317,11 +1317,12 @@ import openai
def test_completion_gpt4_turbo():
litellm.set_verbose = True
try:
response = completion(
model="gpt-4-1106-preview",
messages=messages,
max_tokens=10,
max_completion_tokens=10,
)
print(response)
except openai.RateLimitError:

View file

@ -2765,6 +2765,7 @@ def get_optional_params(
stream_options=None,
stop=None,
max_tokens=None,
max_completion_tokens=None,
presence_penalty=None,
frequency_penalty=None,
logit_bias=None,
@ -2842,6 +2843,7 @@ def get_optional_params(
"stream_options": None,
"stop": None,
"max_tokens": None,
"max_completion_tokens": None,
"presence_penalty": None,
"frequency_penalty": None,
"logit_bias": None,

View file

@ -0,0 +1,54 @@
# conftest.py
import importlib
import os
import sys
import pytest
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
import litellm
@pytest.fixture(scope="function", autouse=True)
def setup_and_teardown():
    """
    This fixture reloads litellm before every function. To speed up testing by removing callbacks being chained.

    Runs automatically (autouse) around every test in this directory: the
    reload resets module-level state (registered callbacks, verbose flags)
    so tests cannot leak configuration into each other.
    """
    curr_dir = os.getcwd()  # Get the current working directory
    sys.path.insert(
        0, os.path.abspath("../..")
    )  # Adds the project directory to the system path
    import litellm
    from litellm import Router  # NOTE(review): appears unused; presumably kept so litellm submodules are imported before the reload — confirm before removing
    importlib.reload(litellm)
    # Give each test a fresh event loop; the reload above can leave
    # asyncio state tied to a previous loop.
    import asyncio

    loop = asyncio.get_event_loop_policy().new_event_loop()
    asyncio.set_event_loop(loop)
    print(litellm)
    # from litellm import Router, completion, aembedding, acompletion, embedding
    yield
    # Teardown code (executes after the yield point)
    loop.close()  # Close the loop created earlier
    asyncio.set_event_loop(None)  # Remove the reference to the loop
def pytest_collection_modifyitems(config, items):
    """Reorder collected tests: custom-logger suites first, each group name-sorted.

    Tests whose parent (module) name contains ``custom_logger`` run before all
    other tests; within each partition the ordering is alphabetical by test
    name, so the overall run order is deterministic.
    """
    # Partition the collected items on their parent module's name.
    logger_suite = []
    remaining = []
    for item in items:
        target = logger_suite if "custom_logger" in item.parent.name else remaining
        target.append(item)

    # Rebuild the collection in place: sorted logger tests, then the rest.
    items[:] = sorted(logger_suite, key=lambda it: it.name) + sorted(
        remaining, key=lambda it: it.name
    )

View file

@ -0,0 +1,342 @@
import json
import os
import sys
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
from datetime import datetime
from unittest.mock import AsyncMock
from dotenv import load_dotenv
load_dotenv()
import httpx
import pytest
from respx import MockRouter
import litellm
from litellm import Choices, Message, ModelResponse
# Adds the parent directory to the system path
def return_mocked_response(model: str):
    """Return the canned Bedrock Converse JSON body for ``model``.

    Returns ``None`` when no mock is registered for the given model, matching
    the implicit fall-through of the original if-chain.
    """
    canned_responses = {
        "bedrock/mistral.mistral-large-2407-v1:0": {
            "metrics": {"latencyMs": 316},
            "output": {
                "message": {
                    "content": [{"text": "Hello! How are you doing today? How can"}],
                    "role": "assistant",
                }
            },
            "stopReason": "max_tokens",
            "usage": {"inputTokens": 5, "outputTokens": 10, "totalTokens": 15},
        },
    }
    return canned_responses.get(model)
@pytest.mark.parametrize(
    "model",
    [
        "bedrock/mistral.mistral-large-2407-v1:0",
    ],
)
@pytest.mark.respx
@pytest.mark.asyncio()
async def test_bedrock_max_completion_tokens(model: str, respx_mock: MockRouter):
    """
    Tests that:
    - max_completion_tokens is passed as max_tokens to bedrock models
    """
    litellm.set_verbose = True
    mock_response = return_mocked_response(model)
    _model = model.split("/")[1]  # strip the "bedrock/" provider prefix for the URL
    print("\n\nmock_response: ", mock_response)
    url = f"https://bedrock-runtime.us-west-2.amazonaws.com/model/{_model}/converse"
    # Intercept the outbound Converse API call so no real AWS request is made.
    mock_request = respx_mock.post(url).mock(
        return_value=httpx.Response(200, json=mock_response)
    )
    response = await litellm.acompletion(
        model=model,
        max_completion_tokens=10,
        messages=[{"role": "user", "content": "Hello!"}],
    )
    assert mock_request.called
    request_body = json.loads(mock_request.calls[0].request.content)
    print("request_body: ", request_body)
    # max_completion_tokens=10 must surface as inferenceConfig.maxTokens in
    # the Bedrock Converse request payload.
    assert request_body == {
        "messages": [{"role": "user", "content": [{"text": "Hello!"}]}],
        "additionalModelRequestFields": {},
        "system": [],
        "inferenceConfig": {"maxTokens": 10},
    }
    print(f"response: {response}")
    assert isinstance(response, ModelResponse)
@pytest.mark.parametrize(
    "model",
    # Fix: the opus model name previously had a stray trailing comma inside
    # the string literal ("...-20240229,"), producing an invalid model name.
    ["anthropic/claude-3-sonnet-20240229", "anthropic/claude-3-opus-20240229"],
)
@pytest.mark.respx
@pytest.mark.asyncio()
async def test_anthropic_api_max_completion_tokens(model: str, respx_mock: MockRouter):
    """
    Tests that:
    - max_completion_tokens is passed as max_tokens to anthropic models
    """
    litellm.set_verbose = True
    # Minimal valid Anthropic Messages API response body.
    mock_response = {
        "content": [{"text": "Hi! My name is Claude.", "type": "text"}],
        "id": "msg_013Zva2CMHLNnXjNJJKqJ2EF",
        "model": "claude-3-5-sonnet-20240620",
        "role": "assistant",
        "stop_reason": "end_turn",
        "stop_sequence": None,
        "type": "message",
        "usage": {"input_tokens": 2095, "output_tokens": 503},
    }
    print("\n\nmock_response: ", mock_response)
    url = "https://api.anthropic.com/v1/messages"
    # Intercept the outbound Anthropic API call so no real request is made.
    mock_request = respx_mock.post(url).mock(
        return_value=httpx.Response(200, json=mock_response)
    )
    response = await litellm.acompletion(
        model=model,
        max_completion_tokens=10,
        messages=[{"role": "user", "content": "Hello!"}],
    )
    assert mock_request.called
    request_body = json.loads(mock_request.calls[0].request.content)
    print("request_body: ", request_body)
    # max_completion_tokens=10 must surface as max_tokens in the Anthropic
    # request payload.
    assert request_body == {
        "messages": [{"role": "user", "content": [{"type": "text", "text": "Hello!"}]}],
        "max_tokens": 10,
        "model": model.split("/")[-1],
    }
    print(f"response: {response}")
    assert isinstance(response, ModelResponse)
def test_all_model_configs():
    """Verify every provider config advertises and maps max_completion_tokens.

    For each provider config class this checks:
    - ``max_completion_tokens`` appears in ``get_supported_openai_params()``
    - ``map_openai_params`` translates it to the provider-native key
      (``max_tokens``, ``max_new_tokens``, ``num_predict``, ``maxTokens``,
      ``max_tokens_to_sample``, or ``max_output_tokens``).
    """
    # --- Vertex AI partner models (AI21, Llama3): map to max_tokens ---
    from litellm.llms.vertex_ai_and_google_ai_studio.vertex_ai_partner_models.ai21.transformation import (
        VertexAIAi21Config,
    )
    from litellm.llms.vertex_ai_and_google_ai_studio.vertex_ai_partner_models.llama3.transformation import (
        VertexAILlama3Config,
    )

    assert (
        "max_completion_tokens" in VertexAILlama3Config().get_supported_openai_params()
    )
    assert VertexAILlama3Config().map_openai_params(
        {"max_completion_tokens": 10}, {}, "llama3"
    ) == {"max_tokens": 10}

    assert "max_completion_tokens" in VertexAIAi21Config().get_supported_openai_params()
    assert VertexAIAi21Config().map_openai_params(
        {"max_completion_tokens": 10}, {}, "llama3"
    ) == {"max_tokens": 10}

    # --- Fireworks AI: map to max_tokens ---
    from litellm.llms.fireworks_ai import FireworksAIConfig

    assert "max_completion_tokens" in FireworksAIConfig().get_supported_openai_params()
    assert FireworksAIConfig().map_openai_params(
        {"max_completion_tokens": 10}, {}, "llama3"
    ) == {"max_tokens": 10}

    # --- Hugging Face TGI: map to max_new_tokens ---
    from litellm.llms.huggingface_restapi import HuggingfaceConfig

    assert "max_completion_tokens" in HuggingfaceConfig().get_supported_openai_params()
    assert HuggingfaceConfig().map_openai_params({"max_completion_tokens": 10}, {}) == {
        "max_new_tokens": 10
    }

    # --- NVIDIA NIM: map to max_tokens ---
    from litellm.llms.nvidia_nim import NvidiaNimConfig

    assert "max_completion_tokens" in NvidiaNimConfig().get_supported_openai_params(
        model="llama3"
    )
    assert NvidiaNimConfig().map_openai_params(
        model="llama3",
        non_default_params={"max_completion_tokens": 10},
        optional_params={},
    ) == {"max_tokens": 10}

    # --- Ollama chat: map to num_predict ---
    from litellm.llms.ollama_chat import OllamaChatConfig

    assert "max_completion_tokens" in OllamaChatConfig().get_supported_openai_params()
    assert OllamaChatConfig().map_openai_params(
        model="llama3",
        non_default_params={"max_completion_tokens": 10},
        optional_params={},
    ) == {"num_predict": 10}

    # --- Predibase: map to max_new_tokens (TGI-style) ---
    from litellm.llms.predibase import PredibaseConfig

    assert "max_completion_tokens" in PredibaseConfig().get_supported_openai_params()
    assert PredibaseConfig().map_openai_params(
        {"max_completion_tokens": 10},
        {},
    ) == {"max_new_tokens": 10}

    # --- Mistral / Codestral text completion: map to max_tokens ---
    from litellm.llms.text_completion_codestral import MistralTextCompletionConfig

    assert (
        "max_completion_tokens"
        in MistralTextCompletionConfig().get_supported_openai_params()
    )
    assert MistralTextCompletionConfig().map_openai_params(
        {"max_completion_tokens": 10},
        {},
    ) == {"max_tokens": 10}

    # --- VolcEngine: map to max_tokens ---
    from litellm.llms.volcengine import VolcEngineConfig

    assert "max_completion_tokens" in VolcEngineConfig().get_supported_openai_params(
        model="llama3"
    )
    assert VolcEngineConfig().map_openai_params(
        model="llama3",
        non_default_params={"max_completion_tokens": 10},
        optional_params={},
    ) == {"max_tokens": 10}

    # --- AI21 chat: map to max_tokens ---
    from litellm.llms.AI21.chat import AI21ChatConfig

    assert "max_completion_tokens" in AI21ChatConfig().get_supported_openai_params(
        "jamba-1.5-mini@001"
    )
    assert AI21ChatConfig().map_openai_params(
        model="jamba-1.5-mini@001",
        non_default_params={"max_completion_tokens": 10},
        optional_params={},
    ) == {"max_tokens": 10}

    # --- Azure OpenAI: map to max_tokens (Azure lacks max_completion_tokens) ---
    from litellm.llms.AzureOpenAI.azure import AzureOpenAIConfig

    assert "max_completion_tokens" in AzureOpenAIConfig().get_supported_openai_params()
    assert AzureOpenAIConfig().map_openai_params(
        model="gpt-3.5-turbo",
        non_default_params={"max_completion_tokens": 10},
        optional_params={},
        api_version="2022-12-01",
        drop_params=False,
    ) == {"max_tokens": 10}

    # --- Bedrock Converse: map to maxTokens ---
    from litellm.llms.bedrock.chat import AmazonConverseConfig

    assert (
        "max_completion_tokens"
        in AmazonConverseConfig().get_supported_openai_params(
            model="anthropic.claude-3-sonnet-20240229-v1:0"
        )
    )
    assert AmazonConverseConfig().map_openai_params(
        model="anthropic.claude-3-sonnet-20240229-v1:0",
        non_default_params={"max_completion_tokens": 10},
        optional_params={},
        drop_params=False,
    ) == {"maxTokens": 10}

    # NOTE(review): the following MistralTextCompletionConfig block duplicates
    # the checks performed above — consider removing one of the two.
    from litellm.llms.text_completion_codestral import MistralTextCompletionConfig

    assert (
        "max_completion_tokens"
        in MistralTextCompletionConfig().get_supported_openai_params()
    )
    assert MistralTextCompletionConfig().map_openai_params(
        non_default_params={"max_completion_tokens": 10},
        optional_params={},
    ) == {"max_tokens": 10}

    # --- Bedrock invoke-API Anthropic configs ---
    from litellm.llms.bedrock.common_utils import (
        AmazonAnthropicClaude3Config,
        AmazonAnthropicConfig,
    )

    # Claude 3 (messages API): max_tokens
    assert (
        "max_completion_tokens"
        in AmazonAnthropicClaude3Config().get_supported_openai_params()
    )
    assert AmazonAnthropicClaude3Config().map_openai_params(
        non_default_params={"max_completion_tokens": 10},
        optional_params={},
    ) == {"max_tokens": 10}

    # Legacy Claude (text completion API): max_tokens_to_sample
    assert (
        "max_completion_tokens" in AmazonAnthropicConfig().get_supported_openai_params()
    )
    assert AmazonAnthropicConfig().map_openai_params(
        non_default_params={"max_completion_tokens": 10},
        optional_params={},
    ) == {"max_tokens_to_sample": 10}

    # --- Databricks: map to max_tokens ---
    from litellm.llms.databricks.chat import DatabricksConfig

    assert "max_completion_tokens" in DatabricksConfig().get_supported_openai_params()
    assert DatabricksConfig().map_openai_params(
        non_default_params={"max_completion_tokens": 10},
        optional_params={},
    ) == {"max_tokens": 10}

    # --- Vertex AI Anthropic: map to max_tokens ---
    from litellm.llms.vertex_ai_and_google_ai_studio.vertex_ai_anthropic import (
        VertexAIAnthropicConfig,
    )

    assert (
        "max_completion_tokens"
        in VertexAIAnthropicConfig().get_supported_openai_params()
    )
    assert VertexAIAnthropicConfig().map_openai_params(
        non_default_params={"max_completion_tokens": 10},
        optional_params={},
    ) == {"max_tokens": 10}

    # --- Vertex AI / Gemini configs: map to max_output_tokens ---
    from litellm.llms.vertex_ai_and_google_ai_studio.gemini.vertex_and_google_ai_studio_gemini import (
        VertexAIConfig,
        GoogleAIStudioGeminiConfig,
        VertexGeminiConfig,
    )

    assert "max_completion_tokens" in VertexAIConfig().get_supported_openai_params()
    assert VertexAIConfig().map_openai_params(
        non_default_params={"max_completion_tokens": 10},
        optional_params={},
    ) == {"max_output_tokens": 10}

    assert (
        "max_completion_tokens"
        in GoogleAIStudioGeminiConfig().get_supported_openai_params()
    )
    assert GoogleAIStudioGeminiConfig().map_openai_params(
        model="gemini-1.0-pro",
        non_default_params={"max_completion_tokens": 10},
        optional_params={},
    ) == {"max_output_tokens": 10}

    assert "max_completion_tokens" in VertexGeminiConfig().get_supported_openai_params()
    assert VertexGeminiConfig().map_openai_params(
        model="gemini-1.0-pro",
        non_default_params={"max_completion_tokens": 10},
        optional_params={},
        drop_params=False,
    ) == {"max_output_tokens": 10}

View file

@ -1,7 +1,14 @@
import json
import os
import sys
from datetime import datetime
from unittest.mock import AsyncMock
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
import httpx
import pytest
from respx import MockRouter
@ -50,3 +57,45 @@ async def test_o1_handle_system_role(respx_mock: MockRouter):
print(f"response: {response}")
assert isinstance(response, ModelResponse)
@pytest.mark.asyncio
@pytest.mark.respx
@pytest.mark.parametrize("model", ["gpt-4", "gpt-4-0314", "gpt-4-32k", "o1-preview"])
async def test_o1_max_completion_tokens(respx_mock: MockRouter, model: str):
    """
    Tests that:
    - max_completion_tokens is passed directly to OpenAI chat completion models
    """
    litellm.set_verbose = True
    # Build a minimal OpenAI-shaped completion response to return from the mock.
    mock_response = ModelResponse(
        id="cmpl-mock",
        choices=[Choices(message=Message(content="Mocked response", role="assistant"))],
        created=int(datetime.now().timestamp()),
        model=model,
    )
    # Intercept the outbound OpenAI API call so no real request is made.
    mock_request = respx_mock.post("https://api.openai.com/v1/chat/completions").mock(
        return_value=httpx.Response(200, json=mock_response.dict())
    )
    response = await litellm.acompletion(
        model=model,
        max_completion_tokens=10,
        messages=[{"role": "user", "content": "Hello!"}],
    )
    assert mock_request.called
    request_body = json.loads(mock_request.calls[0].request.content)
    print("request_body: ", request_body)
    # Unlike other providers, OpenAI accepts max_completion_tokens natively —
    # it must be forwarded unchanged, not remapped to max_tokens.
    assert request_body == {
        "model": model,
        "max_completion_tokens": 10,
        "messages": [{"role": "user", "content": "Hello!"}],
    }
    print(f"response: {response}")
    assert isinstance(response, ModelResponse)