#### What this tests ####
# This tests litellm router with batch completion

import asyncio
import os
import sys
import time
import traceback

import openai
import pytest

sys.path.insert(
    0, os.path.abspath("../..")
)  # Adds the parent directory to the system path

from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor

import httpx
from dotenv import load_dotenv

import litellm
from litellm import Router
from litellm.router import Deployment, LiteLLM_Params, ModelInfo

load_dotenv()
@pytest.mark.parametrize("mode", ["all_responses", "fastest_response"])
@pytest.mark.asyncio
async def test_batch_completion_multiple_models(mode):
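    """
    Batch completion across two deployments (gpt-3.5-turbo and groq-llama).

    - "all_responses": router.abatch_completion returns one response per model.
    - "fastest_response": router.abatch_completion_fastest_response returns a single
      OpenAI-compatible ChatCompletion from whichever deployment answers first.
    """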
    litellm.set_verbose = True

    router = litellm.Router(
        model_list=[
            {
                "model_name": "gpt-3.5-turbo",
                "litellm_params": {
                    "model": "gpt-3.5-turbo",
                },
            },
            {
                "model_name": "groq-llama",
                "litellm_params": {
                    "model": "groq/llama3-8b-8192",
                },
            },
        ]
    )

    if mode == "all_responses":
        response = await router.abatch_completion(
            models=["gpt-3.5-turbo", "groq-llama"],
            messages=[
                {"role": "user", "content": "is litellm becoming a better product ?"}
            ],
            max_tokens=15,
        )

        print(response)
        assert len(response) == 2

        models_in_responses = []
        print(f"response: {response}")
        for individual_response in response:
            print(f"individual_response: {individual_response}")
            _model = individual_response["model"]
            models_in_responses.append(_model)

        # assert both models are different
        assert models_in_responses[0] != models_in_responses[1]
    elif mode == "fastest_response":
        from openai.types.chat.chat_completion import ChatCompletion

        response = await router.abatch_completion_fastest_response(
            model="gpt-3.5-turbo, groq-llama",
            messages=[
                {"role": "user", "content": "is litellm becoming a better product ?"}
            ],
            max_tokens=15,
        )

        ChatCompletion.model_validate(response.model_dump(), strict=True)


@pytest.mark.asyncio
async def test_batch_completion_fastest_response_unit_test():
    """
    Unit test to confirm fastest_response always returns the response that arrives earliest.

    2 models -> 1 is mocked (instant), the other is a real LLM API call
    => assert the mocked response is always returned.
    """
    litellm.set_verbose = True

    router = litellm.Router(
        model_list=[
            {
                "model_name": "gpt-4",
                "litellm_params": {
                    "model": "gpt-4",
                },
                "model_info": {"id": "1"},
            },
            {
                "model_name": "gpt-3.5-turbo",
                "litellm_params": {
                    "model": "gpt-3.5-turbo",
                    "mock_response": "This is a fake response",
                },
                "model_info": {"id": "2"},
            },
        ]
    )

    response = await router.abatch_completion_fastest_response(
        model="gpt-4, gpt-3.5-turbo",
        messages=[
            {"role": "user", "content": "is litellm becoming a better product ?"}
        ],
        max_tokens=500,
    )

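    # Deployment id "2" uses mock_response, so it returns near-instantly and should
    # always beat the real gpt-4 call in the fastest-response race.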
    assert response._hidden_params["model_id"] == "2"
    assert response.choices[0].message.content == "This is a fake response"
    print(f"response: {response}")


@pytest.mark.asyncio
async def test_batch_completion_fastest_response_streaming():
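    """
    Streaming variant of fastest-response batch completion: every chunk yielded by
    router.abatch_completion_fastest_response(..., stream=True) should validate as an
    OpenAI-compatible ChatCompletionChunk.
    """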
    litellm.set_verbose = True

    router = litellm.Router(
        model_list=[
            {
                "model_name": "gpt-3.5-turbo",
                "litellm_params": {
                    "model": "gpt-3.5-turbo",
                },
            },
            {
                "model_name": "groq-llama",
                "litellm_params": {
                    "model": "groq/llama3-8b-8192",
                },
            },
        ]
    )

    from openai.types.chat.chat_completion_chunk import ChatCompletionChunk

    response = await router.abatch_completion_fastest_response(
        model="gpt-3.5-turbo, groq-llama",
        messages=[
            {"role": "user", "content": "is litellm becoming a better product ?"}
        ],
        max_tokens=15,
        stream=True,
    )

    async for chunk in response:
        ChatCompletionChunk.model_validate(chunk.model_dump(), strict=True)


@pytest.mark.asyncio
async def test_batch_completion_multiple_models_multiple_messages():
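    """
    Batch completion with two models and two message threads: router.abatch_completion
    should return a nested list with one entry per (message thread, model) combination,
    where each leaf element is a litellm.ModelResponse.
    """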
    litellm.set_verbose = True

    router = litellm.Router(
        model_list=[
            {
                "model_name": "gpt-3.5-turbo",
                "litellm_params": {
                    "model": "gpt-3.5-turbo",
                },
            },
            {
                "model_name": "groq-llama",
                "litellm_params": {
                    "model": "groq/llama3-8b-8192",
                },
            },
        ]
    )

    response = await router.abatch_completion(
        models=["gpt-3.5-turbo", "groq-llama"],
        messages=[
            [{"role": "user", "content": "is litellm becoming a better product ?"}],
            [{"role": "user", "content": "who is this"}],
        ],
        max_tokens=15,
    )

    print("response from batches =", response)
    assert len(response) == 2
    assert len(response[0]) == 2
    assert isinstance(response[0][0], litellm.ModelResponse)

    # models_in_responses = []
    # for individual_response in response:
    #     _model = individual_response["model"]
    #     models_in_responses.append(_model)

    # # assert both models are different
    # assert models_in_responses[0] != models_in_responses[1]