diff --git a/docs/my-website/docs/providers/openai.md b/docs/my-website/docs/providers/openai.md
index d86263dd5..657e9c736 100644
--- a/docs/my-website/docs/providers/openai.md
+++ b/docs/my-website/docs/providers/openai.md
@@ -238,6 +238,104 @@ response = completion(
 
 ## Advanced
 
+### Getting OpenAI API Response Headers
+
+Set `litellm.return_response_headers = True` to get the raw response headers from OpenAI.
+
+When this is set, responses from `litellm.completion()` and `litellm.embedding()` always include a `_response_headers` field.
+
+```python
+import litellm
+from litellm import completion
+
+litellm.return_response_headers = True
+
+# /chat/completions
+response = completion(
+    model="gpt-4o-mini",
+    messages=[
+        {
+            "role": "user",
+            "content": "hi",
+        }
+    ],
+)
+print(f"response: {response}")
+print("_response_headers=", response._response_headers)
+```
+
+```python
+litellm.return_response_headers = True
+
+# /chat/completions - with streaming
+response = completion(
+    model="gpt-4o-mini",
+    stream=True,
+    messages=[
+        {
+            "role": "user",
+            "content": "hi",
+        }
+    ],
+)
+print(f"response: {response}")
+print("response_headers=", response._response_headers)
+for chunk in response:
+    print(chunk)
+```
+
+```python
+litellm.return_response_headers = True
+
+# embedding
+embedding_response = litellm.embedding(
+    model="text-embedding-ada-002",
+    input="hello",
+)
+
+embedding_response_headers = embedding_response._response_headers
+print("embedding_response_headers=", embedding_response_headers)
+```
+
+Expected response headers from OpenAI:
+
+```json
+{
+    "date": "Sat, 20 Jul 2024 22:05:23 GMT",
+    "content-type": "application/json",
+    "transfer-encoding": "chunked",
+    "connection": "keep-alive",
+    "access-control-allow-origin": "*",
+    "openai-model": "text-embedding-ada-002",
+    "openai-organization": "*****",
+    "openai-processing-ms": "20",
+    "openai-version": "2020-10-01",
+    "strict-transport-security": "max-age=15552000; includeSubDomains; preload",
+    "x-ratelimit-limit-requests": "5000",
+    "x-ratelimit-limit-tokens": "5000000",
+    "x-ratelimit-remaining-requests": "4999",
+    "x-ratelimit-remaining-tokens": "4999999",
+    "x-ratelimit-reset-requests": "12ms",
+    "x-ratelimit-reset-tokens": "0s",
+    "x-request-id": "req_cc37487bfd336358231a17034bcfb4d9",
+    "cf-cache-status": "DYNAMIC",
+    "set-cookie": "__cf_bm=E_FJY8fdAIMBzBE2RZI2.OkMIO3lf8Hz.ydBQJ9m3q8-1721513123-1.0.1.1-6OK0zXvtd5s9Jgqfz66cU9gzQYpcuh_RLaUZ9dOgxR9Qeq4oJlu.04C09hOTCFn7Hg.k.2tiKLOX24szUE2shw; path=/; expires=Sat, 20-Jul-24 22:35:23 GMT; domain=.api.openai.com; HttpOnly; Secure; SameSite=None, *cfuvid=SDndIImxiO3U0aBcVtoy1TBQqYeQtVDo1L6*Nlpp7EU-1721513123215-0.0.1.1-604800000; path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None",
+    "x-content-type-options": "nosniff",
+    "server": "cloudflare",
+    "cf-ray": "8a66409b4f8acee9-SJC",
+    "content-encoding": "br",
+    "alt-svc": "h3=\":443\"; ma=86400"
+}
+```
+
 ### Parallel Function calling
 See a detailed walkthrough of parallel function calling with litellm [here](https://docs.litellm.ai/docs/completion/function_call)
 ```python
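As a quick illustration of the docs change above (not part of the patch): once `litellm.return_response_headers` is enabled, the `_response_headers` dict can be used to watch OpenAI's rate-limit budget. This is a minimal sketch; the `remaining_requests` helper and the threshold are illustrative, and the header names come from the expected-response example shown in the docs.

```python
import litellm
from litellm import completion

litellm.return_response_headers = True


def remaining_requests(response) -> int:
    # Illustrative helper: read the remaining request budget from the raw headers.
    headers = response._response_headers or {}
    return int(headers.get("x-ratelimit-remaining-requests", "0"))


response = completion(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": "hi"}],
)

if remaining_requests(response) < 100:
    print("Close to the OpenAI request limit; consider backing off.")
```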
diff --git a/litellm/llms/openai.py b/litellm/llms/openai.py
index 2606c8f96..06c48cbc4 100644
--- a/litellm/llms/openai.py
+++ b/litellm/llms/openai.py
@@ -784,6 +784,34 @@ class OpenAIChatCompletion(BaseLLM):
         except Exception as e:
             raise e
 
+    def make_sync_openai_chat_completion_request(
+        self,
+        openai_client: OpenAI,
+        data: dict,
+        timeout: Union[float, httpx.Timeout],
+    ):
+        """
+        Helper to:
+        - call chat.completions.create.with_raw_response when litellm.return_response_headers is True
+        - call chat.completions.create by default
+        """
+        try:
+            if litellm.return_response_headers is True:
+                raw_response = openai_client.chat.completions.with_raw_response.create(
+                    **data, timeout=timeout
+                )
+
+                headers = dict(raw_response.headers)
+                response = raw_response.parse()
+                return headers, response
+            else:
+                response = openai_client.chat.completions.create(
+                    **data, timeout=timeout
+                )
+                return None, response
+        except Exception as e:
+            raise e
+
     def completion(
         self,
         model_response: ModelResponse,
@@ -916,7 +944,15 @@ class OpenAIChatCompletion(BaseLLM):
                             },
                         )
 
-                        response = openai_client.chat.completions.create(**data, timeout=timeout)  # type: ignore
+                        headers, response = (
+                            self.make_sync_openai_chat_completion_request(
+                                openai_client=openai_client,
+                                data=data,
+                                timeout=timeout,
+                            )
+                        )
+
+                        logging_obj.model_call_details["response_headers"] = headers
                         stringified_response = response.model_dump()
                         logging_obj.post_call(
                             input=messages,
@@ -927,6 +963,7 @@
                         return convert_to_model_response_object(
                             response_object=stringified_response,
                             model_response_object=model_response,
+                            _response_headers=headers,
                         )
                     except openai.UnprocessableEntityError as e:
                         ## check if body contains unprocessable params - related issue https://github.com/BerriAI/litellm/issues/4800
@@ -1043,6 +1080,25 @@ class OpenAIChatCompletion(BaseLLM):
                 },
             )
 
+            headers, response = await self.make_openai_chat_completion_request(
+                openai_aclient=openai_aclient, data=data, timeout=timeout
+            )
+            stringified_response = response.model_dump()
+            logging_obj.post_call(
+                input=data["messages"],
+                api_key=api_key,
+                original_response=stringified_response,
+                additional_args={"complete_input_dict": data},
+            )
+            logging_obj.model_call_details["response_headers"] = headers
+            return convert_to_model_response_object(
+                response_object=stringified_response,
+                model_response_object=model_response,
+                hidden_params={"headers": headers},
+                _response_headers=headers,
+            )
+        except Exception as e:
+            raise e
             headers, response = await self.make_openai_chat_completion_request(
                 openai_aclient=openai_aclient, data=data, timeout=timeout
             )
@@ -1122,13 +1178,20 @@ class OpenAIChatCompletion(BaseLLM):
                 "complete_input_dict": data,
             },
         )
-        response = openai_client.chat.completions.create(**data, timeout=timeout)
+        headers, response = self.make_sync_openai_chat_completion_request(
+            openai_client=openai_client,
+            data=data,
+            timeout=timeout,
+        )
+
+        logging_obj.model_call_details["response_headers"] = headers
         streamwrapper = CustomStreamWrapper(
             completion_stream=response,
             model=model,
             custom_llm_provider="openai",
             logging_obj=logging_obj,
             stream_options=data.get("stream_options", None),
+            _response_headers=headers,
         )
         return streamwrapper
@@ -1170,8 +1233,30 @@ class OpenAIChatCompletion(BaseLLM):
                 },
             )
 
+            headers, response = await self.make_openai_chat_completion_request(
+                openai_aclient=openai_aclient, data=data, timeout=timeout
+            )
+            logging_obj.model_call_details["response_headers"] = headers
+            streamwrapper = CustomStreamWrapper(
+                completion_stream=response,
+                model=model,
+                custom_llm_provider="openai",
+                logging_obj=logging_obj,
+                stream_options=data.get("stream_options", None),
+                _response_headers=headers,
+            )
+            return streamwrapper
+        except (
+            Exception
+        ) as e:  # need to exception handle here. async exceptions don't get caught in sync functions.
+            if response is not None and hasattr(response, "text"):
+                raise OpenAIError(
+                    status_code=500,
+                    message=f"{str(e)}\n\nOriginal Response: {response.text}",
             headers, response = await self.make_openai_chat_completion_request(
                 openai_aclient=openai_aclient, data=data, timeout=timeout
+            )
             logging_obj.model_call_details["response_headers"] = headers
             streamwrapper = CustomStreamWrapper(
@@ -1252,6 +1337,32 @@ class OpenAIChatCompletion(BaseLLM):
         except Exception as e:
             raise e
 
+    def make_sync_openai_embedding_request(
+        self,
+        openai_client: OpenAI,
+        data: dict,
+        timeout: Union[float, httpx.Timeout],
+    ):
+        """
+        Helper to:
+        - call embeddings.create.with_raw_response when litellm.return_response_headers is True
+        - call embeddings.create by default
+        """
+        try:
+            if litellm.return_response_headers is True:
+                raw_response = openai_client.embeddings.with_raw_response.create(
+                    **data, timeout=timeout
+                )  # type: ignore
+
+                headers = dict(raw_response.headers)
+                response = raw_response.parse()
+                return headers, response
+            else:
+                response = openai_client.embeddings.create(**data, timeout=timeout)  # type: ignore
+                return None, response
+        except Exception as e:
+            raise e
+
     async def aembedding(
         self,
         input: list,
@@ -1286,7 +1397,12 @@ class OpenAIChatCompletion(BaseLLM):
                 additional_args={"complete_input_dict": data},
                 original_response=stringified_response,
             )
-            return convert_to_model_response_object(response_object=stringified_response, model_response_object=model_response, response_type="embedding")  # type: ignore
+            return convert_to_model_response_object(
+                response_object=stringified_response,
+                model_response_object=model_response,
+                response_type="embedding",
+                _response_headers=headers,
+            )  # type: ignore
         except Exception as e:
             ## LOGGING
             logging_obj.post_call(
@@ -1347,17 +1463,26 @@ class OpenAIChatCompletion(BaseLLM):
                 client=client,
             )
 
-            ## COMPLETION CALL
-            response = openai_client.embeddings.create(**data, timeout=timeout)  # type: ignore
+            ## embedding CALL
+            headers: Optional[Dict] = None
+            headers, sync_embedding_response = self.make_sync_openai_embedding_request(
+                openai_client=openai_client, data=data, timeout=timeout
+            )  # type: ignore
+
+            ## LOGGING
+            logging_obj.model_call_details["response_headers"] = headers
             logging_obj.post_call(
                 input=input,
                 api_key=api_key,
                 additional_args={"complete_input_dict": data},
-                original_response=response,
+                original_response=sync_embedding_response,
             )
-
-            return convert_to_model_response_object(response_object=response.model_dump(), model_response_object=model_response, response_type="embedding")  # type: ignore
+            return convert_to_model_response_object(
+                response_object=sync_embedding_response.model_dump(),
+                model_response_object=model_response,
+                _response_headers=headers,
+                response_type="embedding",
+            )  # type: ignore
         except OpenAIError as e:
             exception_mapping_worked = True
             raise e
@@ -1520,6 +1645,33 @@ class OpenAIChatCompletion(BaseLLM):
         except Exception as e:
             raise e
 
+    def make_sync_openai_audio_transcriptions_request(
+        self,
+        openai_client: OpenAI,
+        data: dict,
+        timeout: Union[float, httpx.Timeout],
+    ):
+        """
+        Helper to:
+        - call openai_client.audio.transcriptions.with_raw_response when litellm.return_response_headers is True
+        - call openai_client.audio.transcriptions.create by default
+        """
+        try:
+            if litellm.return_response_headers is True:
+                raw_response = (
+                    openai_client.audio.transcriptions.with_raw_response.create(
+                        **data, timeout=timeout
+                    )
+                )  # type: ignore
+                headers = dict(raw_response.headers)
+                response = raw_response.parse()
+                return headers, response
+            else:
+                response = openai_client.audio.transcriptions.create(**data, timeout=timeout)  # type: ignore
+                return None, response
+        except Exception as e:
+            raise e
+
     def audio_transcriptions(
         self,
         model: str,
@@ -1555,8 +1707,10 @@ class OpenAIChatCompletion(BaseLLM):
             timeout=timeout,
             max_retries=max_retries,
         )
-        response = openai_client.audio.transcriptions.create(
-            **data, timeout=timeout  # type: ignore
+        headers, response = self.make_sync_openai_audio_transcriptions_request(
+            openai_client=openai_client,
+            data=data,
+            timeout=timeout,
         )
 
         if isinstance(response, BaseModel):
diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py
index 7eda96cb9..770498962 100644
--- a/litellm/tests/test_completion.py
+++ b/litellm/tests/test_completion.py
@@ -1340,6 +1340,115 @@ def test_completion_azure_gpt4_vision():
 
 # test_completion_azure_gpt4_vision()
 
 
+def test_completion_openai_response_headers():
+    """
+    Tests if LiteLLM returns response headers.
+    """
+    litellm.return_response_headers = True
+
+    # /chat/completions
+    messages = [
+        {
+            "role": "user",
+            "content": "hi",
+        }
+    ]
+
+    response = completion(
+        model="gpt-4o-mini",
+        messages=messages,
+    )
+
+    print(f"response: {response}")
+
+    print("response_headers=", response._response_headers)
+    assert response._response_headers is not None
+    assert "x-ratelimit-remaining-tokens" in response._response_headers
+
+    # /chat/completions - with streaming
+
+    streaming_response = litellm.completion(
+        model="gpt-4o-mini",
+        messages=messages,
+        stream=True,
+    )
+    response_headers = streaming_response._response_headers
+    print("streaming response_headers=", response_headers)
+    assert response_headers is not None
+    assert "x-ratelimit-remaining-tokens" in response_headers
+
+    for chunk in streaming_response:
+        print("chunk=", chunk)
+
+    # embedding
+    embedding_response = litellm.embedding(
+        model="text-embedding-ada-002",
+        input="hello",
+    )
+
+    embedding_response_headers = embedding_response._response_headers
+    print("embedding_response_headers=", embedding_response_headers)
+    assert embedding_response_headers is not None
+    assert "x-ratelimit-remaining-tokens" in embedding_response_headers
+
+    litellm.return_response_headers = False
+
+
+@pytest.mark.asyncio()
+async def test_async_completion_openai_response_headers():
+    """
+    Tests if LiteLLM returns response headers.
+    """
+    litellm.return_response_headers = True
+
+    # /chat/completions
+    messages = [
+        {
+            "role": "user",
+            "content": "hi",
+        }
+    ]
+
+    response = await litellm.acompletion(
+        model="gpt-4o-mini",
+        messages=messages,
+    )
+
+    print(f"response: {response}")
+
+    print("response_headers=", response._response_headers)
+    assert response._response_headers is not None
+    assert "x-ratelimit-remaining-tokens" in response._response_headers
+
+    # /chat/completions - with streaming
+
+    streaming_response = await litellm.acompletion(
+        model="gpt-4o-mini",
+        messages=messages,
+        stream=True,
+    )
+    response_headers = streaming_response._response_headers
+    print("streaming response_headers=", response_headers)
+    assert response_headers is not None
+    assert "x-ratelimit-remaining-tokens" in response_headers
+
+    async for chunk in streaming_response:
+        print("chunk=", chunk)
+
+    # embedding
+    embedding_response = await litellm.aembedding(
+        model="text-embedding-ada-002",
+        input="hello",
+    )
+
+    embedding_response_headers = embedding_response._response_headers
+    print("embedding_response_headers=", embedding_response_headers)
+    assert embedding_response_headers is not None
+    assert "x-ratelimit-remaining-tokens" in embedding_response_headers
+
+    litellm.return_response_headers = False
+
+
 @pytest.mark.parametrize("model", ["gpt-3.5-turbo", "gpt-4", "gpt-4o"])
 def test_completion_openai_params(model):
     litellm.drop_params = True
diff --git a/litellm/types/utils.py b/litellm/types/utils.py
index 4747a9a87..6581fea5f 100644
--- a/litellm/types/utils.py
+++ b/litellm/types/utils.py
@@ -536,6 +536,8 @@ class ModelResponse(OpenAIObject):
 
     _hidden_params: dict = {}
 
+    _response_headers: Optional[dict] = None
+
     def __init__(
         self,
         id=None,
@@ -549,6 +551,7 @@ class ModelResponse(OpenAIObject):
         stream_options=None,
         response_ms=None,
         hidden_params=None,
+        _response_headers=None,
         **params,
     ) -> None:
         if stream is not None and stream is True:
@@ -598,6 +601,9 @@ class ModelResponse(OpenAIObject):
         if hidden_params:
             self._hidden_params = hidden_params
 
+        if _response_headers:
+            self._response_headers = _response_headers
+
         init_values = {
             "id": id,
             "choices": choices,
@@ -667,6 +673,7 @@ class EmbeddingResponse(OpenAIObject):
     """Usage statistics for the embedding request."""
 
     _hidden_params: dict = {}
+    _response_headers: Optional[Dict] = None
 
     def __init__(
         self,
@@ -675,6 +682,8 @@ class EmbeddingResponse(OpenAIObject):
         stream=False,
         response_ms=None,
         data=None,
+        hidden_params=None,
+        _response_headers=None,
         **params,
     ):
         object = "list"
@@ -692,6 +701,9 @@ class EmbeddingResponse(OpenAIObject):
         else:
             usage = Usage()
 
+        if _response_headers:
+            self._response_headers = _response_headers
+
         model = model
         super().__init__(model=model, object=object, data=data, usage=usage)
 
@@ -974,6 +986,7 @@ class TranscriptionResponse(OpenAIObject):
     text: Optional[str] = None
 
     _hidden_params: dict = {}
+    _response_headers: Optional[dict] = None
 
     def __init__(self, text=None):
         super().__init__(text=text)
diff --git a/litellm/utils.py b/litellm/utils.py
index dd15aeb45..ef4daec5e 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -5666,6 +5666,7 @@ def convert_to_model_response_object(
     start_time=None,
     end_time=None,
     hidden_params: Optional[dict] = None,
+    _response_headers: Optional[dict] = None,
 ):
     received_args = locals()
     ### CHECK IF ERROR IN RESPONSE ### - openrouter returns these in the dictionary
@@ -5764,6 +5765,9 @@ def convert_to_model_response_object(
             if hidden_params is not None:
                 model_response_object._hidden_params = hidden_params
 
+            if _response_headers is not None:
+                model_response_object._response_headers = _response_headers
+
             return model_response_object
         elif response_type == "embedding" and (
             model_response_object is None
@@ -5796,6 +5800,9 @@ def convert_to_model_response_object(
             if hidden_params is not None:
                 model_response_object._hidden_params = hidden_params
 
+            if _response_headers is not None:
+                model_response_object._response_headers = _response_headers
+
             return model_response_object
         elif response_type == "image_generation" and (
             model_response_object is None
@@ -5837,6 +5844,10 @@ def convert_to_model_response_object(
 
             if hidden_params is not None:
                 model_response_object._hidden_params = hidden_params
+
+            if _response_headers is not None:
+                model_response_object._response_headers = _response_headers
+
             return model_response_object
     except Exception as e:
         raise Exception(
@@ -8262,6 +8273,7 @@ class CustomStreamWrapper:
         logging_obj=None,
         stream_options=None,
         make_call: Optional[Callable] = None,
+        _response_headers: Optional[dict] = None,
     ):
         self.model = model
         self.make_call = make_call
@@ -8293,6 +8305,7 @@ class CustomStreamWrapper:
         self._hidden_params = {
             "model_id": (_model_info.get("id", None))
         }  # returned as x-litellm-model-id response header in proxy
+        self._response_headers = _response_headers
         self.response_id = None
         self.logging_loop = None
         self.rules = Rules()
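The pattern this patch applies in each helper is the same: when `litellm.return_response_headers` is enabled, call the OpenAI SDK through `.with_raw_response`, capture `dict(raw_response.headers)`, and thread that dict through to the returned object (or `CustomStreamWrapper`). Below is a condensed, self-contained sketch of that pattern written against the OpenAI SDK directly rather than LiteLLM internals; `chat_completion_with_headers` is an illustrative name, not a function from either library.

```python
from typing import Optional, Tuple, Union

import httpx
from openai import OpenAI


def chat_completion_with_headers(
    client: OpenAI,
    data: dict,
    timeout: Union[float, httpx.Timeout],
    return_headers: bool = True,
) -> Tuple[Optional[dict], object]:
    """Stand-alone sketch of the headers-capture pattern used in this patch."""
    if return_headers:
        # .with_raw_response exposes the underlying HTTP response, including headers.
        raw = client.chat.completions.with_raw_response.create(**data, timeout=timeout)
        headers = dict(raw.headers)
        return headers, raw.parse()  # .parse() returns the usual ChatCompletion object
    return None, client.chat.completions.create(**data, timeout=timeout)


client = OpenAI()
headers, chat_completion = chat_completion_with_headers(
    client,
    data={"model": "gpt-4o-mini", "messages": [{"role": "user", "content": "hi"}]},
    timeout=30.0,
)
print(headers and headers.get("x-ratelimit-remaining-tokens"))
print(chat_completion.choices[0].message.content)
```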