mirror of
https://github.com/BerriAI/litellm.git
synced 2025-04-26 19:24:27 +00:00
* fix(health.md): add rerank model health check information * build(model_prices_and_context_window.json): add gemini 2.0 for google ai studio - pricing + commercial rate limits * build(model_prices_and_context_window.json): add gemini-2.0 supports audio output = true * docs(team_model_add.md): clarify allowing teams to add models is an enterprise feature * fix(o1_transformation.py): add support for 'n', 'response_format' and 'stop' params for o1 and 'stream_options' param for o1-mini * build(model_prices_and_context_window.json): add 'supports_system_message' to supporting openai models needed as o1-preview, and o1-mini models don't support 'system message * fix(o1_transformation.py): translate system message based on if o1 model supports it * fix(o1_transformation.py): return 'stream' param support if o1-mini/o1-preview o1 currently doesn't support streaming, but the other model versions do Fixes https://github.com/BerriAI/litellm/issues/7292 * fix(o1_transformation.py): return tool calling/response_format in supported params if model map says so Fixes https://github.com/BerriAI/litellm/issues/7292 * fix: fix linting errors * fix: update '_transform_messages' * fix(o1_transformation.py): fix provider passed for supported param checks * test(base_llm_unit_tests.py): skip test if api takes >5s to respond * fix(utils.py): return false in 'supports_factory' if can't find value * fix(o1_transformation.py): always return stream + stream_options as supported params + handle stream options being passed in for azure o1 * feat(openai.py): support stream faking natively in openai handler Allows o1 calls to be faked for just the "o1" model, allows native streaming for o1-mini, o1-preview Fixes https://github.com/BerriAI/litellm/issues/7292 * fix(openai.py): use inference param instead of original optional param
179 lines
5.2 KiB
Python
179 lines
5.2 KiB
Python
import json
|
|
import os
|
|
import sys
|
|
from datetime import datetime
|
|
from unittest.mock import AsyncMock, patch, MagicMock
|
|
|
|
sys.path.insert(
|
|
0, os.path.abspath("../..")
|
|
) # Adds the parent directory to the system path
|
|
|
|
|
|
import httpx
|
|
import pytest
|
|
from respx import MockRouter
|
|
|
|
import litellm
|
|
from litellm import Choices, Message, ModelResponse
|
|
|
|
|
|
@pytest.mark.parametrize("model", ["o1-preview", "o1-mini", "o1"])
@pytest.mark.asyncio
async def test_o1_handle_system_role(model):
    """
    Checks the request kwargs litellm hands to the OpenAI client for o1 models:

    - `max_tokens` is sent as `max_completion_tokens`
    - a 'system' message keeps its role when `supports_system_messages`
      reports support for the model, and is sent with role 'user' otherwise
    """
    from openai import AsyncOpenAI
    from litellm.utils import supports_system_messages

    # Reload the model cost map with the local-map flag set and an empty URL.
    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
    litellm.model_cost = litellm.get_model_cost_map(url="")

    litellm.set_verbose = True

    client = AsyncOpenAI(api_key="fake-api-key")
    system_prompt = "Be a good bot!"

    with patch.object(
        client.chat.completions.with_raw_response, "create"
    ) as create_mock:
        try:
            await litellm.acompletion(
                model=model,
                max_tokens=10,
                messages=[{"role": "system", "content": system_prompt}],
                client=client,
            )
        except Exception as e:
            # Only the outgoing request is inspected below, so any error
            # raised while handling the mocked response is just printed.
            print(f"Error: {e}")

        create_mock.assert_called_once()
        sent_kwargs = create_mock.call_args.kwargs

        print("request_body: ", sent_kwargs)

        assert sent_kwargs["model"] == model
        assert sent_kwargs["max_completion_tokens"] == 10

        expected_role = (
            "system" if supports_system_messages(model, "openai") else "user"
        )
        assert sent_kwargs["messages"] == [
            {"role": expected_role, "content": system_prompt}
        ]
|
|
|
|
|
|
@pytest.mark.parametrize(
    "model, expected_tool_calling_support",
    [("o1-preview", False), ("o1-mini", False), ("o1", True)],
)
@pytest.mark.asyncio
async def test_o1_handle_tool_calling_optional_params(
    model, expected_tool_calling_support
):
    """
    Tests that:
    - 'tools' appears in the supported OpenAI params returned by the
      provider chat config only for the models parametrized as supporting
      tool calling
    """
    from litellm.utils import ProviderConfigManager
    from litellm.types.utils import LlmProviders

    # Reload the model cost map with the local-map flag set and an empty URL.
    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
    litellm.model_cost = litellm.get_model_cost_map(url="")

    config = ProviderConfigManager.get_provider_chat_config(
        model=model, provider=LlmProviders.OPENAI
    )

    supported_params = config.get_supported_openai_params(model=model)

    assert expected_tool_calling_support == ("tools" in supported_params), (
        f"unexpected tool calling support for {model}: "
        f"supported_params={supported_params}"
    )
|
|
|
|
|
|
# @pytest.mark.parametrize(
|
|
# "model",
|
|
# ["o1"], # "o1-preview", "o1-mini",
|
|
# )
|
|
# @pytest.mark.asyncio
|
|
# async def test_o1_handle_streaming_e2e(model):
|
|
# """
|
|
# Tests that:
|
|
# - max_tokens is translated to 'max_completion_tokens'
|
|
# - role 'system' is translated to 'user'
|
|
# """
|
|
# from openai import AsyncOpenAI
|
|
# from litellm.utils import ProviderConfigManager
|
|
# from litellm.litellm_core_utils.streaming_handler import CustomStreamWrapper
|
|
# from litellm.types.utils import LlmProviders
|
|
|
|
# resp = litellm.completion(
|
|
# model=model,
|
|
# messages=[{"role": "user", "content": "Hello!"}],
|
|
# stream=True,
|
|
# )
|
|
# assert isinstance(resp, CustomStreamWrapper)
|
|
# for chunk in resp:
|
|
# print("chunk: ", chunk)
|
|
|
|
# assert True
|
|
|
|
|
|
@pytest.mark.asyncio
@pytest.mark.parametrize("model", ["gpt-4", "gpt-4-0314", "gpt-4-32k", "o1-preview"])
async def test_o1_max_completion_tokens(model: str):
    """
    Checks that a caller-supplied `max_completion_tokens` reaches the OpenAI
    chat completion call unchanged, together with the model name and the
    original messages.
    """
    from openai import AsyncOpenAI

    litellm.set_verbose = True

    client = AsyncOpenAI(api_key="fake-api-key")
    user_messages = [{"role": "user", "content": "Hello!"}]

    with patch.object(
        client.chat.completions.with_raw_response, "create"
    ) as create_mock:
        try:
            await litellm.acompletion(
                model=model,
                max_completion_tokens=10,
                messages=[{"role": "user", "content": "Hello!"}],
                client=client,
            )
        except Exception as e:
            # Only the outgoing request is inspected below, so any error
            # raised while handling the mocked response is just printed.
            print(f"Error: {e}")

        create_mock.assert_called_once()
        sent_kwargs = create_mock.call_args.kwargs

        print("request_body: ", sent_kwargs)

        assert sent_kwargs["model"] == model
        assert sent_kwargs["max_completion_tokens"] == 10
        assert sent_kwargs["messages"] == user_messages
|
|
|
|
|
|
def test_litellm_responses():
    """
    Checks that a plain-dict `completion_tokens_details` passed inside `usage`
    is exposed on the response as a `CompletionTokensDetails` instance.
    """
    from litellm import ModelResponse
    from litellm.types.utils import CompletionTokensDetails

    usage_payload = {
        "completion_tokens": 436,
        "prompt_tokens": 14,
        "total_tokens": 450,
        "completion_tokens_details": {"reasoning_tokens": 0},
    }
    response = ModelResponse(usage=usage_payload)

    print("response: ", response)

    token_details = response.usage.completion_tokens_details
    assert isinstance(token_details, CompletionTokensDetails)
|