Mirror of https://github.com/BerriAI/litellm.git (synced 2025-04-24 18:24:20 +00:00)
[Feat] Add Responses API - Routing Affinity logic for sessions (#10193)
* test for test_responses_api_routing_with_previous_response_id
* test_responses_api_routing_with_previous_response_id
* add ResponsesApiDeploymentCheck
* ResponsesApiDeploymentCheck
* ResponsesApiDeploymentCheck
* fix ResponsesApiDeploymentCheck
* test_responses_api_routing_with_previous_response_id
* ResponsesApiDeploymentCheck
* test_responses_api_deployment_check.py
* docs routing affinity
* simplify ResponsesApiDeploymentCheck
* test response id
* fix code quality check
This commit is contained in:
parent 9314c633ed
commit 043055c7b3

9 changed files with 862 additions and 29 deletions
@ -520,3 +520,121 @@ for event in response:

| `azure_ai` | [See supported parameters here](https://github.com/BerriAI/litellm/blob/f39d9178868662746f159d5ef642c7f34f9bfe5f/litellm/responses/litellm_completion_transformation/transformation.py#L57) |
| All other llm api providers | [See supported parameters here](https://github.com/BerriAI/litellm/blob/f39d9178868662746f159d5ef642c7f34f9bfe5f/litellm/responses/litellm_completion_transformation/transformation.py#L57) |
## Load Balancing with Routing Affinity

When using the Responses API with multiple deployments of the same model (e.g., multiple Azure OpenAI endpoints), LiteLLM provides routing affinity for conversations. This ensures that follow-up requests using a `previous_response_id` are routed to the same deployment that generated the original response.

#### Example Usage

<Tabs>
<TabItem value="python-sdk" label="Python SDK">

```python showLineNumbers title="Python SDK with Routing Affinity"
import litellm

# Set up router with multiple deployments of the same model
router = litellm.Router(
    model_list=[
        {
            "model_name": "azure-gpt4-turbo",
            "litellm_params": {
                "model": "azure/gpt-4-turbo",
                "api_key": "your-api-key-1",
                "api_version": "2024-06-01",
                "api_base": "https://endpoint1.openai.azure.com",
            },
        },
        {
            "model_name": "azure-gpt4-turbo",
            "litellm_params": {
                "model": "azure/gpt-4-turbo",
                "api_key": "your-api-key-2",
                "api_version": "2024-06-01",
                "api_base": "https://endpoint2.openai.azure.com",
            },
        },
    ],
    optional_pre_call_checks=["responses_api_deployment_check"],
)

# Initial request
response = await router.aresponses(
    model="azure-gpt4-turbo",
    input="Hello, who are you?",
    truncation="auto",
)

# Store the response ID
response_id = response.id

# Follow-up request - will be automatically routed to the same deployment
follow_up = await router.aresponses(
    model="azure-gpt4-turbo",
    input="Tell me more about yourself",
    truncation="auto",
    previous_response_id=response_id  # This ensures routing to the same deployment
)
```

</TabItem>
<TabItem value="proxy-server" label="Proxy Server">
|
||||
|
||||
#### 1. Setup routing affinity on proxy config.yaml
|
||||
|
||||
To enable routing affinity for Responses API in your LiteLLM proxy, set `optional_pre_call_checks: ["responses_api_deployment_check"]` in your proxy config.yaml.
|
||||
|
||||
```yaml showLineNumbers title="config.yaml with Responses API Routing Affinity"
model_list:
  - model_name: azure-gpt4-turbo
    litellm_params:
      model: azure/gpt-4-turbo
      api_key: your-api-key-1
      api_version: 2024-06-01
      api_base: https://endpoint1.openai.azure.com
  - model_name: azure-gpt4-turbo
    litellm_params:
      model: azure/gpt-4-turbo
      api_key: your-api-key-2
      api_version: 2024-06-01
      api_base: https://endpoint2.openai.azure.com

router_settings:
  optional_pre_call_checks: ["responses_api_deployment_check"]
```

#### 2. Use the OpenAI Python SDK to make requests to LiteLLM Proxy

```python showLineNumbers title="OpenAI Client with Proxy Server"
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:4000",
    api_key="your-api-key"
)

# Initial request
response = client.responses.create(
    model="azure-gpt4-turbo",
    input="Hello, who are you?"
)

response_id = response.id

# Follow-up request - will be automatically routed to the same deployment
follow_up = client.responses.create(
    model="azure-gpt4-turbo",
    input="Tell me more about yourself",
    previous_response_id=response_id  # This ensures routing to the same deployment
)
```

</TabItem>
</Tabs>

#### How It Works

1. When a user makes an initial request to the Responses API, LiteLLM records which model deployment returned that response. (The mapping is stored in Redis if you have connected LiteLLM to Redis.)
2. When a subsequent request includes `previous_response_id`, LiteLLM automatically routes it to the same deployment.
3. If the original deployment is unavailable, or if the `previous_response_id` isn't found, LiteLLM falls back to normal routing.

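The routing hint can also be recovered from the response id itself: the helpers added in `litellm/responses/utils.py` (shown later in this diff) base64-encode the router's deployment `model_id` together with the provider's response id. A minimal round-trip sketch, using those helpers with hypothetical ids:

```python
from litellm.responses.utils import ResponsesAPIRequestUtils

# Hypothetical ids: "my-deployment-id" stands in for a router deployment id,
# "resp_abc123" for the id returned by the upstream provider.
affinity_id = ResponsesAPIRequestUtils._build_responses_api_response_id(
    model_id="my-deployment-id",
    response_id="resp_abc123",
)
# affinity_id is "resp_" + base64("litellm:model_id:my-deployment-id;response_id:resp_abc123")

model_id, provider_response_id = (
    ResponsesAPIRequestUtils._decode_responses_api_response_id(
        response_id=affinity_id,
    )
)
assert model_id == "my-deployment-id"
assert provider_response_id == "resp_abc123"
```
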
@ -1,13 +1,16 @@
 model_list:
-  - model_name: openai/*
-    litellm_params:
-      model: openai/*
-  - model_name: anthropic/*
-    litellm_params:
-      model: anthropic/*
-  - model_name: gemini/*
-    litellm_params:
-      model: gemini/*
-
-litellm_settings:
-  drop_params: true
+  - model_name: azure-computer-use-preview
+    litellm_params:
+      model: azure/computer-use-preview
+      api_key: mock-api-key
+      api_version: mock-api-version
+      api_base: https://mock-endpoint.openai.azure.com
+  - model_name: azure-computer-use-preview
+    litellm_params:
+      model: azure/computer-use-preview-2
+      api_key: mock-api-key-2
+      api_version: mock-api-version-2
+      api_base: https://mock-endpoint-2.openai.azure.com
+
+router_settings:
+  optional_pre_call_checks: ["responses_api_deployment_check"]

@ -116,6 +116,13 @@ async def aresponses(
            response = await init_response
        else:
            response = init_response

        # Update the responses_api_response_id with the model_id
        if isinstance(response, ResponsesAPIResponse):
            response = ResponsesAPIRequestUtils._update_responses_api_response_id_with_model_id(
                responses_api_response=response,
                kwargs=kwargs,
            )
        return response
    except Exception as e:
        raise litellm.exception_type(

@ -248,6 +255,13 @@ def responses(
            ),
        )

        # Update the responses_api_response_id with the model_id
        if isinstance(response, ResponsesAPIResponse):
            response = ResponsesAPIRequestUtils._update_responses_api_response_id_with_model_id(
                responses_api_response=response,
                kwargs=kwargs,
            )

        return response
    except Exception as e:
        raise litellm.exception_type(

@ -1,12 +1,15 @@
from typing import Any, Dict, Union, cast, get_type_hints
import base64
from typing import Any, Dict, Optional, Tuple, Union, cast, get_type_hints

import litellm
from litellm._logging import verbose_logger
from litellm.llms.base_llm.responses.transformation import BaseResponsesAPIConfig
from litellm.types.llms.openai import (
    ResponseAPIUsage,
    ResponsesAPIOptionalRequestParams,
    ResponsesAPIResponse,
)
from litellm.types.utils import Usage
from litellm.types.utils import SpecialEnums, Usage


class ResponsesAPIRequestUtils:
@ -77,6 +80,66 @@ class ResponsesAPIRequestUtils:
        }
        return cast(ResponsesAPIOptionalRequestParams, filtered_params)

    @staticmethod
    def _update_responses_api_response_id_with_model_id(
        responses_api_response: ResponsesAPIResponse,
        kwargs: Dict[str, Any],
    ) -> ResponsesAPIResponse:
        """Update the responses_api_response_id with the model_id"""
        litellm_metadata: Dict[str, Any] = kwargs.get("litellm_metadata", {}) or {}
        model_info: Dict[str, Any] = litellm_metadata.get("model_info", {}) or {}
        model_id = model_info.get("id")
        updated_id = ResponsesAPIRequestUtils._build_responses_api_response_id(
            model_id=model_id,
            response_id=responses_api_response.id,
        )
        responses_api_response.id = updated_id
        return responses_api_response

    @staticmethod
    def _build_responses_api_response_id(
        model_id: Optional[str],
        response_id: str,
    ) -> str:
        """Build the responses_api_response_id"""
        if model_id is None:
            return response_id
        assembled_id: str = str(
            SpecialEnums.LITELLM_MANAGED_RESPONSE_COMPLETE_STR.value
        ).format(model_id, response_id)
        base64_encoded_id: str = base64.b64encode(assembled_id.encode("utf-8")).decode(
            "utf-8"
        )
        return f"resp_{base64_encoded_id}"

    @staticmethod
    def _decode_responses_api_response_id(
        response_id: str,
    ) -> Tuple[Optional[str], str]:
        """
        Decode the responses_api_response_id

        Returns:
            Tuple of model_id, response_id (from upstream provider)
        """
        try:
            # Remove prefix and decode
            cleaned_id = response_id.replace("resp_", "")
            decoded_id = base64.b64decode(cleaned_id.encode("utf-8")).decode("utf-8")

            # Parse components using known prefixes
            if ";" not in decoded_id:
                return None, response_id

            model_part, response_part = decoded_id.split(";", 1)
            model_id = model_part.replace("litellm:model_id:", "")
            decoded_response_id = response_part.replace("response_id:", "")

            return model_id, decoded_response_id
        except Exception as e:
            verbose_logger.debug(f"Error decoding response_id '{response_id}': {e}")
            return None, response_id


class ResponseAPILoggingUtils:
    @staticmethod

@ -98,6 +98,9 @@ from litellm.router_utils.handle_error import (
from litellm.router_utils.pre_call_checks.prompt_caching_deployment_check import (
    PromptCachingDeploymentCheck,
)
from litellm.router_utils.pre_call_checks.responses_api_deployment_check import (
    ResponsesApiDeploymentCheck,
)
from litellm.router_utils.router_callbacks.track_deployment_metrics import (
    increment_deployment_failures_for_current_minute,
    increment_deployment_successes_for_current_minute,
@ -339,9 +342,9 @@ class Router:
        ) # names of models under litellm_params. ex. azure/chatgpt-v-2
        self.deployment_latency_map = {}
        ### CACHING ###
        cache_type: Literal[
            "local", "redis", "redis-semantic", "s3", "disk"
        ] = "local"  # default to an in-memory cache
        cache_type: Literal["local", "redis", "redis-semantic", "s3", "disk"] = (
            "local"  # default to an in-memory cache
        )
        redis_cache = None
        cache_config: Dict[str, Any] = {}

@ -562,9 +565,9 @@ class Router:
            )
        )

        self.model_group_retry_policy: Optional[
            Dict[str, RetryPolicy]
        ] = model_group_retry_policy
        self.model_group_retry_policy: Optional[Dict[str, RetryPolicy]] = (
            model_group_retry_policy
        )

        self.allowed_fails_policy: Optional[AllowedFailsPolicy] = None
        if allowed_fails_policy is not None:

@ -765,6 +768,8 @@ class Router:
                    provider_budget_config=self.provider_budget_config,
                    model_list=self.model_list,
                )
            elif pre_call_check == "responses_api_deployment_check":
                _callback = ResponsesApiDeploymentCheck()
            if _callback is not None:
                litellm.logging_callback_manager.add_litellm_callback(_callback)

@ -3247,11 +3252,11 @@ class Router:

            if isinstance(e, litellm.ContextWindowExceededError):
                if context_window_fallbacks is not None:
                    fallback_model_group: Optional[
                        List[str]
                    ] = self._get_fallback_model_group_from_fallbacks(
                        fallbacks=context_window_fallbacks,
                        model_group=model_group,
                    fallback_model_group: Optional[List[str]] = (
                        self._get_fallback_model_group_from_fallbacks(
                            fallbacks=context_window_fallbacks,
                            model_group=model_group,
                        )
                    )
                    if fallback_model_group is None:
                        raise original_exception

@ -3283,11 +3288,11 @@ class Router:
                    e.message += "\n{}".format(error_message)
            elif isinstance(e, litellm.ContentPolicyViolationError):
                if content_policy_fallbacks is not None:
                    fallback_model_group: Optional[
                        List[str]
                    ] = self._get_fallback_model_group_from_fallbacks(
                        fallbacks=content_policy_fallbacks,
                        model_group=model_group,
                    fallback_model_group: Optional[List[str]] = (
                        self._get_fallback_model_group_from_fallbacks(
                            fallbacks=content_policy_fallbacks,
                            model_group=model_group,
                        )
                    )
                    if fallback_model_group is None:
                        raise original_exception

@ -0,0 +1,46 @@
"""
For Responses API, we need routing affinity when a user sends a previous_response_id.

eg. If proxy admins are load balancing between N gpt-4.1-turbo deployments, and a user sends a previous_response_id,
we want to route to the same gpt-4.1-turbo deployment.

This is different from the normal behavior of the router, which does not have routing affinity for previous_response_id.


If previous_response_id is provided, route to the deployment that returned the previous response
"""

from typing import List, Optional

from litellm.integrations.custom_logger import CustomLogger, Span
from litellm.responses.utils import ResponsesAPIRequestUtils
from litellm.types.llms.openai import AllMessageValues


class ResponsesApiDeploymentCheck(CustomLogger):
    async def async_filter_deployments(
        self,
        model: str,
        healthy_deployments: List,
        messages: Optional[List[AllMessageValues]],
        request_kwargs: Optional[dict] = None,
        parent_otel_span: Optional[Span] = None,
    ) -> List[dict]:
        request_kwargs = request_kwargs or {}
        previous_response_id = request_kwargs.get("previous_response_id", None)
        if previous_response_id is None:
            return healthy_deployments

        model_id, response_id = (
            ResponsesAPIRequestUtils._decode_responses_api_response_id(
                response_id=previous_response_id,
            )
        )
        if model_id is None:
            return healthy_deployments

        for deployment in healthy_deployments:
            if deployment["model_info"]["id"] == model_id:
                return [deployment]

        return healthy_deployments

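To make the check concrete, here is a minimal, hypothetical sketch (not part of the commit) showing how `async_filter_deployments` narrows the candidate deployments once a `previous_response_id` carries an embedded deployment id. The stand-in deployment dicts only contain the `model_info.id` field the check actually reads:

```python
import asyncio

from litellm.responses.utils import ResponsesAPIRequestUtils
from litellm.router_utils.pre_call_checks.responses_api_deployment_check import (
    ResponsesApiDeploymentCheck,
)

# Two stand-in deployments; only model_info.id is consulted by the check.
deployments = [
    {"model_info": {"id": "deployment-1"}},
    {"model_info": {"id": "deployment-2"}},
]

# Encode a hypothetical provider response id against deployment-2.
previous_id = ResponsesAPIRequestUtils._build_responses_api_response_id(
    model_id="deployment-2",
    response_id="resp_abc123",
)

check = ResponsesApiDeploymentCheck()
filtered = asyncio.run(
    check.async_filter_deployments(
        model="azure-computer-use-preview",
        healthy_deployments=deployments,
        messages=None,
        request_kwargs={"previous_response_id": previous_id},
    )
)
assert filtered == [{"model_info": {"id": "deployment-2"}}]
```
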
@ -709,7 +709,11 @@ class GenericBudgetWindowDetails(BaseModel):
    ttl_seconds: int


OptionalPreCallChecks = List[Literal["prompt_caching", "router_budget_limiting"]]
OptionalPreCallChecks = List[
    Literal[
        "prompt_caching", "router_budget_limiting", "responses_api_deployment_check"
    ]
]


class LiteLLM_RouterFileObject(TypedDict, total=False):

@ -2254,6 +2254,8 @@ class SpecialEnums(Enum):
    LITELM_MANAGED_FILE_ID_PREFIX = "litellm_proxy"
    LITELLM_MANAGED_FILE_COMPLETE_STR = "litellm_proxy:{};unified_id,{}"

    LITELLM_MANAGED_RESPONSE_COMPLETE_STR = "litellm:model_id:{};response_id:{}"


LLMResponseTypes = Union[
    ModelResponse, EmbeddingResponse, ImageResponse, OpenAIFileObject

@ -0,0 +1,578 @@
import asyncio
import os
import sys
from typing import Optional
from unittest.mock import AsyncMock, patch

import pytest

sys.path.insert(0, os.path.abspath("../.."))
import json

import litellm
from litellm.integrations.custom_logger import CustomLogger
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
from litellm.types.llms.openai import (
    IncompleteDetails,
    ResponseAPIUsage,
    ResponseCompletedEvent,
    ResponsesAPIResponse,
    ResponseTextConfig,
)
from litellm.types.utils import StandardLoggingPayload


@pytest.mark.asyncio
async def test_responses_api_routing_with_previous_response_id():
    """
    Test that when using a previous_response_id, the request is sent to the same model_id
    """
    # Create a mock response that simulates Azure responses API
    mock_response_id = "resp_mock-resp-456"

    mock_response_data = {
        "id": mock_response_id,
        "object": "response",
        "created_at": 1741476542,
        "status": "completed",
        "model": "azure/computer-use-preview",
        "output": [
            {
                "type": "message",
                "id": "msg_123",
                "status": "completed",
                "role": "assistant",
                "content": [
                    {
                        "type": "output_text",
                        "text": "I'm doing well, thank you for asking!",
                        "annotations": [],
                    }
                ],
            }
        ],
        "parallel_tool_calls": True,
        "usage": {
            "input_tokens": 10,
            "output_tokens": 20,
            "total_tokens": 30,
            "output_tokens_details": {"reasoning_tokens": 0},
        },
        "text": {"format": {"type": "text"}},
        "error": None,
        "incomplete_details": None,
        "instructions": None,
        "metadata": {},
        "temperature": 1.0,
        "tool_choice": "auto",
        "tools": [],
        "top_p": 1.0,
        "max_output_tokens": None,
        "previous_response_id": None,
        "reasoning": {"effort": None, "summary": None},
        "truncation": "disabled",
        "user": None,
    }

    class MockResponse:
        def __init__(self, json_data, status_code):
            self._json_data = json_data
            self.status_code = status_code
            self.text = json.dumps(json_data)

        def json(self):
            return self._json_data

    router = litellm.Router(
        model_list=[
            {
                "model_name": "azure-computer-use-preview",
                "litellm_params": {
                    "model": "azure/computer-use-preview",
                    "api_key": "mock-api-key",
                    "api_version": "mock-api-version",
                    "api_base": "https://mock-endpoint.openai.azure.com",
                },
            },
            {
                "model_name": "azure-computer-use-preview",
                "litellm_params": {
                    "model": "azure/computer-use-preview-2",
                    "api_key": "mock-api-key-2",
                    "api_version": "mock-api-version-2",
                    "api_base": "https://mock-endpoint-2.openai.azure.com",
                },
            },
        ],
        optional_pre_call_checks=["responses_api_deployment_check"],
    )
    MODEL = "azure-computer-use-preview"

    with patch(
        "litellm.llms.custom_httpx.http_handler.AsyncHTTPHandler.post",
        new_callable=AsyncMock,
    ) as mock_post:
        # Configure the mock to return our response
        mock_post.return_value = MockResponse(mock_response_data, 200)

        # Make the initial request
        # litellm._turn_on_debug()
        response = await router.aresponses(
            model=MODEL,
            input="Hello, how are you?",
            truncation="auto",
        )
        print("RESPONSE", response)

        # Store the model_id from the response
        expected_model_id = response._hidden_params["model_id"]
        response_id = response.id

        print("Response ID=", response_id, "came from model_id=", expected_model_id)

        # Make 10 other requests with previous_response_id, assert that they are sent to the same model_id
        for i in range(10):
            # Reset the mock for the next call
            mock_post.reset_mock()

            # Set up the mock to return our response again
            mock_post.return_value = MockResponse(mock_response_data, 200)

            response = await router.aresponses(
                model=MODEL,
                input=f"Follow-up question {i+1}",
                truncation="auto",
                previous_response_id=response_id,
            )

            # Assert the model_id is preserved
            assert response._hidden_params["model_id"] == expected_model_id


@pytest.mark.asyncio
async def test_routing_without_previous_response_id():
    """
    Test that normal routing (load balancing) works when no previous_response_id is provided
    """
    mock_response_data = {
        "id": "mock-resp-123",
        "object": "response",
        "created_at": 1741476542,
        "status": "completed",
        "model": "azure/computer-use-preview",
        "output": [
            {
                "type": "message",
                "id": "msg_123",
                "status": "completed",
                "role": "assistant",
                "content": [
                    {"type": "output_text", "text": "Hello there!", "annotations": []}
                ],
            }
        ],
        "parallel_tool_calls": True,
        "usage": {
            "input_tokens": 5,
            "output_tokens": 10,
            "total_tokens": 15,
            "output_tokens_details": {"reasoning_tokens": 0},
        },
        "text": {"format": {"type": "text"}},
        "error": None,
        "incomplete_details": None,
        "instructions": None,
        "metadata": {},
        "temperature": 1.0,
        "tool_choice": "auto",
        "tools": [],
        "top_p": 1.0,
        "max_output_tokens": None,
        "previous_response_id": None,
        "reasoning": {"effort": None, "summary": None},
        "truncation": "disabled",
        "user": None,
    }

    class MockResponse:
        def __init__(self, json_data, status_code):
            self._json_data = json_data
            self.status_code = status_code
            self.text = json.dumps(json_data)

        def json(self):
            return self._json_data

    # Create a router with four identical deployments to test load balancing
    router = litellm.Router(
        model_list=[
            {
                "model_name": "azure-computer-use-preview",
                "litellm_params": {
                    "model": "azure/computer-use-preview",
                    "api_key": "mock-api-key-1",
                    "api_version": "mock-api-version",
                    "api_base": "https://mock-endpoint-1.openai.azure.com",
                },
            },
            {
                "model_name": "azure-computer-use-preview",
                "litellm_params": {
                    "model": "azure/computer-use-preview",
                    "api_key": "mock-api-key-2",
                    "api_version": "mock-api-version",
                    "api_base": "https://mock-endpoint-2.openai.azure.com",
                },
            },
            {
                "model_name": "azure-computer-use-preview",
                "litellm_params": {
                    "model": "azure/computer-use-preview",
                    "api_key": "mock-api-key-3",
                    "api_version": "mock-api-version",
                    "api_base": "https://mock-endpoint-3.openai.azure.com",
                },
            },
            {
                "model_name": "azure-computer-use-preview",
                "litellm_params": {
                    "model": "azure/computer-use-preview",
                    "api_key": "mock-api-key-4",
                    "api_version": "mock-api-version",
                    "api_base": "https://mock-endpoint-4.openai.azure.com",
                },
            },
        ],
        optional_pre_call_checks=["responses_api_deployment_check"],
    )

    MODEL = "azure-computer-use-preview"

    with patch(
        "litellm.llms.custom_httpx.http_handler.AsyncHTTPHandler.post",
        new_callable=AsyncMock,
    ) as mock_post:
        # Configure the mock to return our response
        mock_post.return_value = MockResponse(mock_response_data, 200)

        # Make multiple requests and verify we're hitting different deployments
        used_model_ids = set()

        for i in range(20):
            response = await router.aresponses(
                model=MODEL,
                input=f"Question {i}",
                truncation="auto",
            )

            used_model_ids.add(response._hidden_params["model_id"])

        # We should have used more than one model_id if load balancing is working
        assert (
            len(used_model_ids) > 1
        ), "Load balancing isn't working, only one deployment was used"


@pytest.mark.asyncio
async def test_previous_response_id_not_in_cache():
    """
    Test behavior when a previous_response_id is provided but not found in cache
    """
    mock_response_data = {
        "id": "mock-resp-789",
        "object": "response",
        "created_at": 1741476542,
        "status": "completed",
        "model": "azure/computer-use-preview",
        "output": [
            {
                "type": "message",
                "id": "msg_123",
                "status": "completed",
                "role": "assistant",
                "content": [
                    {
                        "type": "output_text",
                        "text": "Nice to meet you!",
                        "annotations": [],
                    }
                ],
            }
        ],
        "parallel_tool_calls": True,
        "usage": {
            "input_tokens": 5,
            "output_tokens": 10,
            "total_tokens": 15,
            "output_tokens_details": {"reasoning_tokens": 0},
        },
        "text": {"format": {"type": "text"}},
        "error": None,
        "incomplete_details": None,
        "instructions": None,
        "metadata": {},
        "temperature": 1.0,
        "tool_choice": "auto",
        "tools": [],
        "top_p": 1.0,
        "max_output_tokens": None,
        "previous_response_id": None,
        "reasoning": {"effort": None, "summary": None},
        "truncation": "disabled",
        "user": None,
    }

    class MockResponse:
        def __init__(self, json_data, status_code):
            self._json_data = json_data
            self.status_code = status_code
            self.text = json.dumps(json_data)

        def json(self):
            return self._json_data

    router = litellm.Router(
        model_list=[
            {
                "model_name": "azure-computer-use-preview",
                "litellm_params": {
                    "model": "azure/computer-use-preview",
                    "api_key": "mock-api-key-1",
                    "api_version": "mock-api-version",
                    "api_base": "https://mock-endpoint-1.openai.azure.com",
                },
            },
            {
                "model_name": "azure-computer-use-preview",
                "litellm_params": {
                    "model": "azure/computer-use-preview",
                    "api_key": "mock-api-key-2",
                    "api_version": "mock-api-version",
                    "api_base": "https://mock-endpoint-2.openai.azure.com",
                },
            },
        ],
        optional_pre_call_checks=["responses_api_deployment_check"],
    )

    MODEL = "azure-computer-use-preview"

    with patch(
        "litellm.llms.custom_httpx.http_handler.AsyncHTTPHandler.post",
        new_callable=AsyncMock,
    ) as mock_post:
        # Configure the mock to return our response
        mock_post.return_value = MockResponse(mock_response_data, 200)

        # Make a request with a non-existent previous_response_id
        response = await router.aresponses(
            model=MODEL,
            input="Hello, this is a test",
            truncation="auto",
            previous_response_id="non-existent-response-id",
        )

        # Should still get a valid response
        assert response is not None
        assert response.id is not None

        # Since the previous_response_id wasn't found, routing should work normally
        # We can't assert exactly which deployment was chosen, but we can verify the basics
        assert response._hidden_params["model_id"] is not None


@pytest.mark.asyncio
async def test_multiple_response_ids_routing():
    """
    Test that different response IDs correctly route to their respective original deployments
    """
    # Create two different mock responses for our two different deployments
    mock_response_data_1 = {
        "id": "mock-resp-deployment-1",
        "object": "response",
        "created_at": 1741476542,
        "status": "completed",
        "model": "azure/computer-use-preview",
        "output": [
            {
                "type": "message",
                "id": "msg_123",
                "status": "completed",
                "role": "assistant",
                "content": [
                    {
                        "type": "output_text",
                        "text": "Response from deployment 1",
                        "annotations": [],
                    }
                ],
            }
        ],
        "parallel_tool_calls": True,
        "usage": {
            "input_tokens": 5,
            "output_tokens": 10,
            "total_tokens": 15,
            "output_tokens_details": {"reasoning_tokens": 0},
        },
        "text": {"format": {"type": "text"}},
        "error": None,
        "incomplete_details": None,
        "instructions": None,
        "metadata": {},
        "temperature": 1.0,
        "tool_choice": "auto",
        "tools": [],
        "top_p": 1.0,
        "max_output_tokens": None,
        "previous_response_id": None,
        "reasoning": {"effort": None, "summary": None},
        "truncation": "disabled",
        "user": None,
    }

    mock_response_data_2 = {
        "id": "mock-resp-deployment-2",
        "object": "response",
        "created_at": 1741476542,
        "status": "completed",
        "model": "azure/computer-use-preview",
        "output": [
            {
                "type": "message",
                "id": "msg_456",
                "status": "completed",
                "role": "assistant",
                "content": [
                    {
                        "type": "output_text",
                        "text": "Response from deployment 2",
                        "annotations": [],
                    }
                ],
            }
        ],
        "parallel_tool_calls": True,
        "usage": {
            "input_tokens": 5,
            "output_tokens": 10,
            "total_tokens": 15,
            "output_tokens_details": {"reasoning_tokens": 0},
        },
        "text": {"format": {"type": "text"}},
        "error": None,
        "incomplete_details": None,
        "instructions": None,
        "metadata": {},
        "temperature": 1.0,
        "tool_choice": "auto",
        "tools": [],
        "top_p": 1.0,
        "max_output_tokens": None,
        "previous_response_id": None,
        "reasoning": {"effort": None, "summary": None},
        "truncation": "disabled",
        "user": None,
    }

    class MockResponse:
        def __init__(self, json_data, status_code):
            self._json_data = json_data
            self.status_code = status_code
            self.text = json.dumps(json_data)

        def json(self):
            return self._json_data

    router = litellm.Router(
        model_list=[
            {
                "model_name": "azure-computer-use-preview",
                "litellm_params": {
                    "model": "azure/computer-use-preview-1",
                    "api_key": "mock-api-key-1",
                    "api_version": "mock-api-version",
                    "api_base": "https://mock-endpoint-1.openai.azure.com",
                },
            },
            {
                "model_name": "azure-computer-use-preview",
                "litellm_params": {
                    "model": "azure/computer-use-preview-2",
                    "api_key": "mock-api-key-2",
                    "api_version": "mock-api-version",
                    "api_base": "https://mock-endpoint-2.openai.azure.com",
                },
            },
        ],
        optional_pre_call_checks=["responses_api_deployment_check"],
    )

    MODEL = "azure-computer-use-preview"

    with patch(
        "litellm.llms.custom_httpx.http_handler.AsyncHTTPHandler.post",
        new_callable=AsyncMock,
    ) as mock_post:
        # For the first request, return response from deployment 1
        mock_post.return_value = MockResponse(mock_response_data_1, 200)

        # Make the first request to deployment 1
        response1 = await router.aresponses(
            model=MODEL,
            input="Request to deployment 1",
            truncation="auto",
        )

        # Store details from first response
        model_id_1 = response1._hidden_params["model_id"]
        response_id_1 = response1.id

        # For the second request, return response from deployment 2
        mock_post.return_value = MockResponse(mock_response_data_2, 200)

        # Make the second request to deployment 2
        response2 = await router.aresponses(
            model=MODEL,
            input="Request to deployment 2",
            truncation="auto",
        )

        # Store details from second response
        model_id_2 = response2._hidden_params["model_id"]
        response_id_2 = response2.id

        # Wait for cache updates
        await asyncio.sleep(1)

        # Now make follow-up requests using the previous response IDs

        # First, reset mock
        mock_post.reset_mock()
        mock_post.return_value = MockResponse(mock_response_data_1, 200)

        # Follow-up to response 1 should go to model_id_1
        follow_up_1 = await router.aresponses(
            model=MODEL,
            input="Follow up to deployment 1",
            truncation="auto",
            previous_response_id=response_id_1,
        )

        # Verify it went to the correct deployment
        assert follow_up_1._hidden_params["model_id"] == model_id_1

        # Reset mock again
        mock_post.reset_mock()
        mock_post.return_value = MockResponse(mock_response_data_2, 200)

        # Follow-up to response 2 should go to model_id_2
        follow_up_2 = await router.aresponses(
            model=MODEL,
            input="Follow up to deployment 2",
            truncation="auto",
            previous_response_id=response_id_2,
        )

        # Verify it went to the correct deployment
        assert follow_up_2._hidden_params["model_id"] == model_id_2