litellm/tests/local_testing/test_router_batch_completion.py

#### What this tests ####
# This tests litellm router with batch completion

import asyncio
import os
import sys
import time
import traceback

import openai
import pytest

sys.path.insert(
    0, os.path.abspath("../..")
)  # Adds the parent directory to the system path
import os
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor

import httpx
from dotenv import load_dotenv

import litellm
from litellm import Router
from litellm.router import Deployment, LiteLLM_Params, ModelInfo

load_dotenv()


@pytest.mark.parametrize("mode", ["all_responses", "fastest_response"])
@pytest.mark.asyncio
async def test_batch_completion_multiple_models(mode):
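    """
    Batch completion against multiple deployments.

    - all_responses: abatch_completion returns one response per requested model.
    - fastest_response: abatch_completion_fastest_response returns a single
      OpenAI-compatible ChatCompletion from whichever deployment answers first.
    """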
    litellm.set_verbose = True
    router = litellm.Router(
        model_list=[
            {
                "model_name": "gpt-3.5-turbo",
                "litellm_params": {
                    "model": "gpt-3.5-turbo",
                },
            },
            {
                "model_name": "groq-llama",
                "litellm_params": {
                    "model": "groq/llama3-8b-8192",
                },
            },
        ]
    )

    if mode == "all_responses":
        response = await router.abatch_completion(
            models=["gpt-3.5-turbo", "groq-llama"],
            messages=[
                {"role": "user", "content": "is litellm becoming a better product ?"}
            ],
            max_tokens=15,
        )

        print(response)
        assert len(response) == 2

        models_in_responses = []
        print(f"response: {response}")
        for individual_response in response:
            print(f"individual_response: {individual_response}")
            _model = individual_response["model"]
            models_in_responses.append(_model)

        # assert both models are different
        assert models_in_responses[0] != models_in_responses[1]
    elif mode == "fastest_response":
        from openai.types.chat.chat_completion import ChatCompletion

        response = await router.abatch_completion_fastest_response(
            model="gpt-3.5-turbo, groq-llama",
            messages=[
                {"role": "user", "content": "is litellm becoming a better product ?"}
            ],
            max_tokens=15,
        )

        # the fastest response should still be a valid OpenAI chat completion
        ChatCompletion.model_validate(response.model_dump(), strict=True)


@pytest.mark.asyncio
async def test_batch_completion_fastest_response_unit_test():
    """
    Unit test to confirm fastest_response always returns whichever response arrives first.

    2 deployments -> one returns a mock response instantly, the other makes a real LLM API
    call => assert the mocked (instant) response is always the one returned.
    """
    litellm.set_verbose = True
    router = litellm.Router(
        model_list=[
            {
                "model_name": "gpt-4",
                "litellm_params": {
                    "model": "gpt-4",
                },
                "model_info": {"id": "1"},
            },
            {
                "model_name": "gpt-3.5-turbo",
                "litellm_params": {
                    "model": "gpt-3.5-turbo",
                    "mock_response": "This is a fake response",
                },
                "model_info": {"id": "2"},
            },
        ]
    )

    response = await router.abatch_completion_fastest_response(
        model="gpt-4, gpt-3.5-turbo",
        messages=[
            {"role": "user", "content": "is litellm becoming a better product ?"}
        ],
        max_tokens=500,
    )

    # the mocked deployment (id="2") responds instantly, so it should always win
    assert response._hidden_params["model_id"] == "2"
    assert response.choices[0].message.content == "This is a fake response"

    print(f"response: {response}")


@pytest.mark.asyncio
async def test_batch_completion_fastest_response_streaming():
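    """
    Streaming variant of fastest_response: every streamed chunk should validate
    against OpenAI's ChatCompletionChunk schema.
    """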
    litellm.set_verbose = True
    router = litellm.Router(
        model_list=[
            {
                "model_name": "gpt-3.5-turbo",
                "litellm_params": {
                    "model": "gpt-3.5-turbo",
                },
            },
            {
                "model_name": "groq-llama",
                "litellm_params": {
                    "model": "groq/llama3-8b-8192",
                },
            },
        ]
    )

    from openai.types.chat.chat_completion_chunk import ChatCompletionChunk

    response = await router.abatch_completion_fastest_response(
        model="gpt-3.5-turbo, groq-llama",
        messages=[
            {"role": "user", "content": "is litellm becoming a better product ?"}
        ],
        max_tokens=15,
        stream=True,
    )

    async for chunk in response:
        ChatCompletionChunk.model_validate(chunk.model_dump(), strict=True)


@pytest.mark.asyncio
async def test_batch_completion_multiple_models_multiple_messages():
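    """
    Batch completion with multiple models AND multiple message lists:
    expect a nested list of responses covering every (message list, model) pair.
    """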
    litellm.set_verbose = True
    router = litellm.Router(
        model_list=[
            {
                "model_name": "gpt-3.5-turbo",
                "litellm_params": {
                    "model": "gpt-3.5-turbo",
                },
            },
            {
                "model_name": "groq-llama",
                "litellm_params": {
                    "model": "groq/llama3-8b-8192",
                },
            },
        ]
    )

    response = await router.abatch_completion(
        models=["gpt-3.5-turbo", "groq-llama"],
        messages=[
            [{"role": "user", "content": "is litellm becoming a better product ?"}],
            [{"role": "user", "content": "who is this"}],
        ],
        max_tokens=15,
    )

    print("response from batches =", response)
    assert len(response) == 2
    assert len(response[0]) == 2
    assert isinstance(response[0][0], litellm.ModelResponse)

    # models_in_responses = []
    # for individual_response in response:
    #     _model = individual_response["model"]
    #     models_in_responses.append(_model)

    # # assert both models are different
    # assert models_in_responses[0] != models_in_responses[1]