fix(health.md): add rerank model health check information (#7295)

* fix(health.md): add rerank model health check information

* build(model_prices_and_context_window.json): add gemini 2.0 for google ai studio - pricing + commercial rate limits

* build(model_prices_and_context_window.json): add gemini-2.0 supports audio output = true

* docs(team_model_add.md): clarify allowing teams to add models is an enterprise feature

* fix(o1_transformation.py): add support for 'n', 'response_format' and 'stop' params for o1 and 'stream_options' param for o1-mini

* build(model_prices_and_context_window.json): add 'supports_system_message' to supporting openai models

Needed as the o1-preview and o1-mini models don't support the 'system' message role.
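
As a rough illustration of how the new flag is meant to be consumed (the supports_system_messages(model, provider) helper is imported from litellm.utils in the test diff below; the translate_system_message wrapper here is hypothetical, not the actual transformation code):

from litellm.utils import supports_system_messages

def translate_system_message(model: str, messages: list) -> list:
    # Hypothetical helper, for illustration only.
    if supports_system_messages(model, "openai"):
        return messages  # e.g. "o1" keeps the system message as-is
    # e.g. o1-preview / o1-mini: downgrade 'system' to 'user'
    return [
        {**m, "role": "user"} if m.get("role") == "system" else m
        for m in messages
    ]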

* fix(o1_transformation.py): translate system message based on if o1 model supports it

* fix(o1_transformation.py): return 'stream' param support if o1-mini/o1-preview

o1 currently doesn't support streaming, but the other model versions do

Fixes https://github.com/BerriAI/litellm/issues/7292
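
A quick way to sanity-check this from the Python SDK (a sketch using litellm's public get_supported_openai_params helper; the exact output depends on the local model map):

from litellm import get_supported_openai_params

for m in ["o1", "o1-mini", "o1-preview"]:
    params = get_supported_openai_params(model=m, custom_llm_provider="openai")
    # o1-mini / o1-preview stream natively; "o1" streaming is faked (see the stream-faking change below)
    print(m, "stream" in params)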

* fix(o1_transformation.py): return tool calling/response_format in supported params if model map says so

Fixes https://github.com/BerriAI/litellm/issues/7292
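
The new test in the diff below exercises this through the provider config; a condensed sketch of the same check:

from litellm.utils import ProviderConfigManager
from litellm.types.utils import LlmProviders

for m, expects_tools in [("o1", True), ("o1-mini", False), ("o1-preview", False)]:
    config = ProviderConfigManager.get_provider_chat_config(
        model=m, provider=LlmProviders.OPENAI
    )
    supported = config.get_supported_openai_params(model=m)
    assert expects_tools == ("tools" in supported)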

* fix: fix linting errors

* fix: update '_transform_messages'

* fix(o1_transformation.py): fix provider passed for supported param checks

* test(base_llm_unit_tests.py): skip test if api takes >5s to respond

* fix(utils.py): return false in 'supports_factory' if can't find value

* fix(o1_transformation.py): always return stream + stream_options as supported params + handle stream options being passed in for azure o1

* feat(openai.py): support stream faking natively in openai handler

Allows streaming to be faked for just the "o1" model, while allowing native streaming for o1-mini and o1-preview.

Fixes https://github.com/BerriAI/litellm/issues/7292
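
From the caller's side the faked stream looks the same as a native one; a sketch mirroring the (commented-out) e2e streaming test in the diff below, assuming a valid OPENAI_API_KEY:

import litellm
from litellm.litellm_core_utils.streaming_handler import CustomStreamWrapper

resp = litellm.completion(
    model="o1",  # upstream call is non-streaming; litellm fakes the stream
    messages=[{"role": "user", "content": "Hello!"}],
    stream=True,
)
assert isinstance(resp, CustomStreamWrapper)
for chunk in resp:
    print("chunk: ", chunk)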

* fix(openai.py): use inference param instead of original optional param
Krish Dholakia authored on 2024-12-18 19:18:10 -08:00, committed by GitHub
parent e95820367f
commit 1a4910f6c0
34 changed files with 800 additions and 515 deletions


@@ -17,14 +17,19 @@ import litellm
 from litellm import Choices, Message, ModelResponse
 
 
+@pytest.mark.parametrize("model", ["o1-preview", "o1-mini", "o1"])
 @pytest.mark.asyncio
-async def test_o1_handle_system_role():
+async def test_o1_handle_system_role(model):
     """
     Tests that:
     - max_tokens is translated to 'max_completion_tokens'
     - role 'system' is translated to 'user'
     """
     from openai import AsyncOpenAI
+    from litellm.utils import supports_system_messages
+
+    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+    litellm.model_cost = litellm.get_model_cost_map(url="")
 
     litellm.set_verbose = True
@@ -35,9 +40,9 @@ async def test_o1_handle_system_role():
     ) as mock_client:
         try:
             await litellm.acompletion(
-                model="o1-preview",
+                model=model,
                 max_tokens=10,
-                messages=[{"role": "system", "content": "Hello!"}],
+                messages=[{"role": "system", "content": "Be a good bot!"}],
                 client=client,
             )
         except Exception as e:
@@ -48,9 +53,73 @@ async def test_o1_handle_system_role():
         print("request_body: ", request_body)
 
-        assert request_body["model"] == "o1-preview"
+        assert request_body["model"] == model
         assert request_body["max_completion_tokens"] == 10
-        assert request_body["messages"] == [{"role": "user", "content": "Hello!"}]
+        if supports_system_messages(model, "openai"):
+            assert request_body["messages"] == [
+                {"role": "system", "content": "Be a good bot!"}
+            ]
+        else:
+            assert request_body["messages"] == [
+                {"role": "user", "content": "Be a good bot!"}
+            ]
+
+
+@pytest.mark.parametrize(
+    "model, expected_tool_calling_support",
+    [("o1-preview", False), ("o1-mini", False), ("o1", True)],
+)
+@pytest.mark.asyncio
+async def test_o1_handle_tool_calling_optional_params(
+    model, expected_tool_calling_support
+):
+    """
+    Tests that:
+    - 'tools' is returned as a supported param only when the model map
+      says the model supports function calling
+    """
+    from openai import AsyncOpenAI
+    from litellm.utils import ProviderConfigManager
+    from litellm.types.utils import LlmProviders
+
+    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+    litellm.model_cost = litellm.get_model_cost_map(url="")
+
+    config = ProviderConfigManager.get_provider_chat_config(
+        model=model, provider=LlmProviders.OPENAI
+    )
+
+    supported_params = config.get_supported_openai_params(model=model)
+
+    assert expected_tool_calling_support == ("tools" in supported_params)
+
+
+# @pytest.mark.parametrize(
+#     "model",
+#     ["o1"],  # "o1-preview", "o1-mini",
+# )
+# @pytest.mark.asyncio
+# async def test_o1_handle_streaming_e2e(model):
+#     """
+#     Tests that streaming works end-to-end for the given o1 model.
+#     """
+#     from openai import AsyncOpenAI
+#     from litellm.utils import ProviderConfigManager
+#     from litellm.litellm_core_utils.streaming_handler import CustomStreamWrapper
+#     from litellm.types.utils import LlmProviders
+
+#     resp = litellm.completion(
+#         model=model,
+#         messages=[{"role": "user", "content": "Hello!"}],
+#         stream=True,
+#     )
+#     assert isinstance(resp, CustomStreamWrapper)
+
+#     for chunk in resp:
+#         print("chunk: ", chunk)
+
+#     assert True
 
 
 @pytest.mark.asyncio