Merge pull request #4807 from BerriAI/litellm_return-response_headers

[Feat] Return response headers on `litellm.completion`, `litellm.embedding`

commit 00d431ea42
5 changed files with 397 additions and 10 deletions

@@ -238,6 +238,104 @@ response = completion(
## Advanced

### Getting OpenAI API Response Headers

Set `litellm.return_response_headers = True` to get the raw response headers from OpenAI.

When this is enabled, the `_response_headers` field is always populated on the objects returned by `litellm.completion()` and `litellm.embedding()`.

<Tabs>
<TabItem value="litellm.completion" label="litellm.completion">

```python
import litellm
from litellm import completion

litellm.return_response_headers = True

# /chat/completion
response = completion(
    model="gpt-4o-mini",
    messages=[
        {
            "role": "user",
            "content": "hi",
        }
    ],
)
print(f"response: {response}")
print("_response_headers=", response._response_headers)
```
</TabItem>

<TabItem value="litellm.completion - streaming" label="litellm.completion + stream">
|
||||
|
||||
```python
|
||||
litellm.return_response_headers = True
|
||||
|
||||
# /chat/completion
|
||||
response = completion(
|
||||
model="gpt-4o-mini",
|
||||
stream=True,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": "hi",
|
||||
}
|
||||
],
|
||||
)
|
||||
print(f"response: {response}")
|
||||
print("response_headers=", response._response_headers)
|
||||
for chunk in response:
|
||||
print(chunk)
|
||||
```
|
||||
</TabItem>
|
||||
|
||||
<TabItem value="litellm.embedding" label="litellm.embedding">
|
||||
|
||||
```python
|
||||
litellm.return_response_headers = True
|
||||
|
||||
# embedding
|
||||
embedding_response = litellm.embedding(
|
||||
model="text-embedding-ada-002",
|
||||
input="hello",
|
||||
)
|
||||
|
||||
embedding_response_headers = embedding_response._response_headers
|
||||
print("embedding_response_headers=", embedding_response_headers)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
Expected Response Headers from OpenAI:

```json
{
    "date": "Sat, 20 Jul 2024 22:05:23 GMT",
    "content-type": "application/json",
    "transfer-encoding": "chunked",
    "connection": "keep-alive",
    "access-control-allow-origin": "*",
    "openai-model": "text-embedding-ada-002",
    "openai-organization": "*****",
    "openai-processing-ms": "20",
    "openai-version": "2020-10-01",
    "strict-transport-security": "max-age=15552000; includeSubDomains; preload",
    "x-ratelimit-limit-requests": "5000",
    "x-ratelimit-limit-tokens": "5000000",
    "x-ratelimit-remaining-requests": "4999",
    "x-ratelimit-remaining-tokens": "4999999",
    "x-ratelimit-reset-requests": "12ms",
    "x-ratelimit-reset-tokens": "0s",
    "x-request-id": "req_cc37487bfd336358231a17034bcfb4d9",
    "cf-cache-status": "DYNAMIC",
    "set-cookie": "__cf_bm=E_FJY8fdAIMBzBE2RZI2.OkMIO3lf8Hz.ydBQJ9m3q8-1721513123-1.0.1.1-6OK0zXvtd5s9Jgqfz66cU9gzQYpcuh_RLaUZ9dOgxR9Qeq4oJlu.04C09hOTCFn7Hg.k.2tiKLOX24szUE2shw; path=/; expires=Sat, 20-Jul-24 22:35:23 GMT; domain=.api.openai.com; HttpOnly; Secure; SameSite=None, *cfuvid=SDndIImxiO3U0aBcVtoy1TBQqYeQtVDo1L6*Nlpp7EU-1721513123215-0.0.1.1-604800000; path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None",
    "x-content-type-options": "nosniff",
    "server": "cloudflare",
    "cf-ray": "8a66409b4f8acee9-SJC",
    "content-encoding": "br",
    "alt-svc": "h3=\":443\"; ma=86400"
}
```
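
Below is a small usage sketch (not part of this PR's diff) showing one way to act on these headers, e.g. backing off when the remaining request budget runs low. The threshold is illustrative; the header names are the ones shown above.

```python
import litellm
from litellm import completion

litellm.return_response_headers = True

response = completion(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": "hi"}],
)

# _response_headers is a plain dict of the raw HTTP headers shown above
headers = response._response_headers or {}
remaining = int(headers.get("x-ratelimit-remaining-requests", "0"))
if remaining < 100:  # illustrative threshold
    print(
        "Nearly rate limited; request window resets in",
        headers.get("x-ratelimit-reset-requests"),
    )
```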

### Parallel Function calling
See a detailed walkthrough of parallel function calling with litellm [here](https://docs.litellm.ai/docs/completion/function_call)
```python
@@ -784,6 +784,34 @@ class OpenAIChatCompletion(BaseLLM):
        except Exception as e:
            raise e

    def make_sync_openai_chat_completion_request(
        self,
        openai_client: OpenAI,
        data: dict,
        timeout: Union[float, httpx.Timeout],
    ):
        """
        Helper to:
        - call chat.completions.create.with_raw_response when litellm.return_response_headers is True
        - call chat.completions.create by default
        """
        try:
            if litellm.return_response_headers is True:
                raw_response = openai_client.chat.completions.with_raw_response.create(
                    **data, timeout=timeout
                )

                headers = dict(raw_response.headers)
                response = raw_response.parse()
                return headers, response
            else:
                response = openai_client.chat.completions.create(
                    **data, timeout=timeout
                )
                return None, response
        except Exception as e:
            raise e

    def completion(
        self,
        model_response: ModelResponse,
@@ -916,7 +944,15 @@ class OpenAIChatCompletion(BaseLLM):
                },
            )

            response = openai_client.chat.completions.create(**data, timeout=timeout)  # type: ignore
            headers, response = (
                self.make_sync_openai_chat_completion_request(
                    openai_client=openai_client,
                    data=data,
                    timeout=timeout,
                )
            )

            logging_obj.model_call_details["response_headers"] = headers
            stringified_response = response.model_dump()
            logging_obj.post_call(
                input=messages,
@@ -927,6 +963,7 @@ class OpenAIChatCompletion(BaseLLM):
            return convert_to_model_response_object(
                response_object=stringified_response,
                model_response_object=model_response,
                _response_headers=headers,
            )
        except openai.UnprocessableEntityError as e:
            ## check if body contains unprocessable params - related issue https://github.com/BerriAI/litellm/issues/4800
@@ -1043,6 +1080,25 @@ class OpenAIChatCompletion(BaseLLM):
                },
            )

            headers, response = await self.make_openai_chat_completion_request(
                openai_aclient=openai_aclient, data=data, timeout=timeout
            )
            stringified_response = response.model_dump()
            logging_obj.post_call(
                input=data["messages"],
                api_key=api_key,
                original_response=stringified_response,
                additional_args={"complete_input_dict": data},
            )
            logging_obj.model_call_details["response_headers"] = headers
            return convert_to_model_response_object(
                response_object=stringified_response,
                model_response_object=model_response,
                hidden_params={"headers": headers},
                _response_headers=headers,
            )
        except Exception as e:
            raise e
            headers, response = await self.make_openai_chat_completion_request(
                openai_aclient=openai_aclient, data=data, timeout=timeout
            )
@@ -1122,13 +1178,20 @@ class OpenAIChatCompletion(BaseLLM):
                    "complete_input_dict": data,
                },
            )
            response = openai_client.chat.completions.create(**data, timeout=timeout)
            headers, response = self.make_sync_openai_chat_completion_request(
                openai_client=openai_client,
                data=data,
                timeout=timeout,
            )

            logging_obj.model_call_details["response_headers"] = headers
            streamwrapper = CustomStreamWrapper(
                completion_stream=response,
                model=model,
                custom_llm_provider="openai",
                logging_obj=logging_obj,
                stream_options=data.get("stream_options", None),
                _response_headers=headers,
            )
            return streamwrapper
@@ -1174,6 +1237,28 @@ class OpenAIChatCompletion(BaseLLM):
                openai_aclient=openai_aclient, data=data, timeout=timeout
            )
            logging_obj.model_call_details["response_headers"] = headers
            streamwrapper = CustomStreamWrapper(
                completion_stream=response,
                model=model,
                custom_llm_provider="openai",
                logging_obj=logging_obj,
                stream_options=data.get("stream_options", None),
                _response_headers=headers,
            )
            return streamwrapper
        except (
            Exception
        ) as e:  # need to exception handle here. async exceptions don't get caught in sync functions.
            if response is not None and hasattr(response, "text"):
                raise OpenAIError(
                    status_code=500,
                    message=f"{str(e)}\n\nOriginal Response: {response.text}",

            headers, response = await self.make_openai_chat_completion_request(
                openai_aclient=openai_aclient, data=data, timeout=timeout
            )
            logging_obj.model_call_details["response_headers"] = headers
            streamwrapper = CustomStreamWrapper(
                completion_stream=response,
                model=model,
@@ -1252,6 +1337,32 @@ class OpenAIChatCompletion(BaseLLM):
        except Exception as e:
            raise e

    def make_sync_openai_embedding_request(
        self,
        openai_client: OpenAI,
        data: dict,
        timeout: Union[float, httpx.Timeout],
    ):
        """
        Helper to:
        - call embeddings.create.with_raw_response when litellm.return_response_headers is True
        - call embeddings.create by default
        """
        try:
            if litellm.return_response_headers is True:
                raw_response = openai_client.embeddings.with_raw_response.create(
                    **data, timeout=timeout
                )  # type: ignore

                headers = dict(raw_response.headers)
                response = raw_response.parse()
                return headers, response
            else:
                response = openai_client.embeddings.create(**data, timeout=timeout)  # type: ignore
                return None, response
        except Exception as e:
            raise e

    async def aembedding(
        self,
        input: list,
@@ -1286,7 +1397,12 @@ class OpenAIChatCompletion(BaseLLM):
                additional_args={"complete_input_dict": data},
                original_response=stringified_response,
            )
            return convert_to_model_response_object(response_object=stringified_response, model_response_object=model_response, response_type="embedding")  # type: ignore
            return convert_to_model_response_object(
                response_object=stringified_response,
                model_response_object=model_response,
                response_type="embedding",
                _response_headers=headers,
            )  # type: ignore
        except Exception as e:
            ## LOGGING
            logging_obj.post_call(
@@ -1347,17 +1463,26 @@ class OpenAIChatCompletion(BaseLLM):
                client=client,
            )

            ## COMPLETION CALL
            response = openai_client.embeddings.create(**data, timeout=timeout)  # type: ignore
            ## embedding CALL
            headers: Optional[Dict] = None
            headers, sync_embedding_response = self.make_sync_openai_embedding_request(
                openai_client=openai_client, data=data, timeout=timeout
            )  # type: ignore

            ## LOGGING
            logging_obj.model_call_details["response_headers"] = headers
            logging_obj.post_call(
                input=input,
                api_key=api_key,
                additional_args={"complete_input_dict": data},
                original_response=response,
                original_response=sync_embedding_response,
            )

            return convert_to_model_response_object(response_object=response.model_dump(), model_response_object=model_response, response_type="embedding")  # type: ignore
            return convert_to_model_response_object(
                response_object=sync_embedding_response.model_dump(),
                model_response_object=model_response,
                _response_headers=headers,
                response_type="embedding",
            )  # type: ignore
        except OpenAIError as e:
            exception_mapping_worked = True
            raise e
@@ -1520,6 +1645,33 @@ class OpenAIChatCompletion(BaseLLM):
        except Exception as e:
            raise e

    def make_sync_openai_audio_transcriptions_request(
        self,
        openai_client: OpenAI,
        data: dict,
        timeout: Union[float, httpx.Timeout],
    ):
        """
        Helper to:
        - call openai_aclient.audio.transcriptions.with_raw_response when litellm.return_response_headers is True
        - call openai_aclient.audio.transcriptions.create by default
        """
        try:
            if litellm.return_response_headers is True:
                raw_response = (
                    openai_client.audio.transcriptions.with_raw_response.create(
                        **data, timeout=timeout
                    )
                )  # type: ignore
                headers = dict(raw_response.headers)
                response = raw_response.parse()
                return headers, response
            else:
                response = openai_client.audio.transcriptions.create(**data, timeout=timeout)  # type: ignore
                return None, response
        except Exception as e:
            raise e

    def audio_transcriptions(
        self,
        model: str,
@@ -1555,8 +1707,10 @@ class OpenAIChatCompletion(BaseLLM):
                timeout=timeout,
                max_retries=max_retries,
            )
            response = openai_client.audio.transcriptions.create(
                **data, timeout=timeout  # type: ignore
            response = self.make_sync_openai_audio_transcriptions_request(
                openai_client=openai_client,
                data=data,
                timeout=timeout,
            )

            if isinstance(response, BaseModel):
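
For context (not part of the diff): the new `make_sync_openai_*_request` helpers above all wrap the same OpenAI Python SDK pattern. A minimal standalone sketch, assuming the `openai` v1 client and an `OPENAI_API_KEY` in the environment:

```python
from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

# .with_raw_response exposes the HTTP layer while still letting you
# recover the typed response via .parse()
raw = client.chat.completions.with_raw_response.create(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": "hi"}],
)

headers = dict(raw.headers)    # e.g. x-ratelimit-*, x-request-id
chat_completion = raw.parse()  # the usual ChatCompletion object

print(headers.get("x-ratelimit-remaining-requests"))
print(chat_completion.choices[0].message.content)
```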
@@ -1340,6 +1340,115 @@ def test_completion_azure_gpt4_vision():
# test_completion_azure_gpt4_vision()


def test_completion_openai_response_headers():
    """
    Tests if LiteLLM returns response headers
    """
    litellm.return_response_headers = True

    # /chat/completion
    messages = [
        {
            "role": "user",
            "content": "hi",
        }
    ]

    response = completion(
        model="gpt-4o-mini",
        messages=messages,
    )

    print(f"response: {response}")

    print("response_headers=", response._response_headers)
    assert response._response_headers is not None
    assert "x-ratelimit-remaining-tokens" in response._response_headers

    # /chat/completion - with streaming

    streaming_response = litellm.completion(
        model="gpt-4o-mini",
        messages=messages,
        stream=True,
    )
    response_headers = streaming_response._response_headers
    print("streaming response_headers=", response_headers)
    assert response_headers is not None
    assert "x-ratelimit-remaining-tokens" in response_headers

    for chunk in streaming_response:
        print("chunk=", chunk)

    # embedding
    embedding_response = litellm.embedding(
        model="text-embedding-ada-002",
        input="hello",
    )

    embedding_response_headers = embedding_response._response_headers
    print("embedding_response_headers=", embedding_response_headers)
    assert embedding_response_headers is not None
    assert "x-ratelimit-remaining-tokens" in embedding_response_headers

    litellm.return_response_headers = False


@pytest.mark.asyncio()
async def test_async_completion_openai_response_headers():
    """
    Tests if LiteLLM returns response headers
    """
    litellm.return_response_headers = True

    # /chat/completion
    messages = [
        {
            "role": "user",
            "content": "hi",
        }
    ]

    response = await litellm.acompletion(
        model="gpt-4o-mini",
        messages=messages,
    )

    print(f"response: {response}")

    print("response_headers=", response._response_headers)
    assert response._response_headers is not None
    assert "x-ratelimit-remaining-tokens" in response._response_headers

    # /chat/completion with streaming

    streaming_response = await litellm.acompletion(
        model="gpt-4o-mini",
        messages=messages,
        stream=True,
    )
    response_headers = streaming_response._response_headers
    print("streaming response_headers=", response_headers)
    assert response_headers is not None
    assert "x-ratelimit-remaining-tokens" in response_headers

    async for chunk in streaming_response:
        print("chunk=", chunk)

    # embedding
    embedding_response = await litellm.aembedding(
        model="text-embedding-ada-002",
        input="hello",
    )

    embedding_response_headers = embedding_response._response_headers
    print("embedding_response_headers=", embedding_response_headers)
    assert embedding_response_headers is not None
    assert "x-ratelimit-remaining-tokens" in embedding_response_headers

    litellm.return_response_headers = False


@pytest.mark.parametrize("model", ["gpt-3.5-turbo", "gpt-4", "gpt-4o"])
def test_completion_openai_params(model):
    litellm.drop_params = True
@@ -536,6 +536,8 @@ class ModelResponse(OpenAIObject):

    _hidden_params: dict = {}

    _response_headers: Optional[dict] = None

    def __init__(
        self,
        id=None,
@@ -549,6 +551,7 @@ class ModelResponse(OpenAIObject):
        stream_options=None,
        response_ms=None,
        hidden_params=None,
        _response_headers=None,
        **params,
    ) -> None:
        if stream is not None and stream is True:
@@ -598,6 +601,9 @@ class ModelResponse(OpenAIObject):
        if hidden_params:
            self._hidden_params = hidden_params

        if _response_headers:
            self._response_headers = _response_headers

        init_values = {
            "id": id,
            "choices": choices,
@@ -667,6 +673,7 @@ class EmbeddingResponse(OpenAIObject):
    """Usage statistics for the embedding request."""

    _hidden_params: dict = {}
    _response_headers: Optional[Dict] = None

    def __init__(
        self,
@@ -675,6 +682,8 @@ class EmbeddingResponse(OpenAIObject):
        stream=False,
        response_ms=None,
        data=None,
        hidden_params=None,
        _response_headers=None,
        **params,
    ):
        object = "list"
@@ -692,6 +701,9 @@ class EmbeddingResponse(OpenAIObject):
        else:
            usage = Usage()

        if _response_headers:
            self._response_headers = _response_headers

        model = model
        super().__init__(model=model, object=object, data=data, usage=usage)
@@ -974,6 +986,7 @@ class TranscriptionResponse(OpenAIObject):
    text: Optional[str] = None

    _hidden_params: dict = {}
    _response_headers: Optional[dict] = None

    def __init__(self, text=None):
        super().__init__(text=text)
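
Aside (not from the diff): `OpenAIObject` is pydantic-based, so an underscore-prefixed attribute like `_response_headers` travels with the Python object but stays out of the serialized payload. A minimal sketch of that behaviour, using a hypothetical `DemoResponse` model rather than litellm's own classes:

```python
from typing import Optional

from pydantic import BaseModel, PrivateAttr


class DemoResponse(BaseModel):  # hypothetical stand-in, not litellm's class
    id: str
    _response_headers: Optional[dict] = PrivateAttr(default=None)


resp = DemoResponse(id="chatcmpl-123")
resp._response_headers = {"x-ratelimit-remaining-tokens": "4999999"}

print(resp.model_dump())       # {'id': 'chatcmpl-123'} -- headers not serialized
print(resp._response_headers)  # {'x-ratelimit-remaining-tokens': '4999999'}
```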
@@ -5666,6 +5666,7 @@ def convert_to_model_response_object(
    start_time=None,
    end_time=None,
    hidden_params: Optional[dict] = None,
    _response_headers: Optional[dict] = None,
):
    received_args = locals()
    ### CHECK IF ERROR IN RESPONSE ### - openrouter returns these in the dictionary
@@ -5764,6 +5765,9 @@ def convert_to_model_response_object(
            if hidden_params is not None:
                model_response_object._hidden_params = hidden_params

            if _response_headers is not None:
                model_response_object._response_headers = _response_headers

            return model_response_object
        elif response_type == "embedding" and (
            model_response_object is None
@@ -5796,6 +5800,9 @@ def convert_to_model_response_object(
            if hidden_params is not None:
                model_response_object._hidden_params = hidden_params

            if _response_headers is not None:
                model_response_object._response_headers = _response_headers

            return model_response_object
        elif response_type == "image_generation" and (
            model_response_object is None
@@ -5837,6 +5844,10 @@ def convert_to_model_response_object(

            if hidden_params is not None:
                model_response_object._hidden_params = hidden_params

            if _response_headers is not None:
                model_response_object._response_headers = _response_headers

            return model_response_object
    except Exception as e:
        raise Exception(
@@ -8262,6 +8273,7 @@ class CustomStreamWrapper:
        logging_obj=None,
        stream_options=None,
        make_call: Optional[Callable] = None,
        _response_headers: Optional[dict] = None,
    ):
        self.model = model
        self.make_call = make_call
@@ -8293,6 +8305,7 @@ class CustomStreamWrapper:
        self._hidden_params = {
            "model_id": (_model_info.get("id", None))
        }  # returned as x-litellm-model-id response header in proxy
        self._response_headers = _response_headers
        self.response_id = None
        self.logging_loop = None
        self.rules = Rules()