From 64dbe07593f5acf066ee8bfab2a2de2743749c6f Mon Sep 17 00:00:00 2001
From: Ishaan Jaff
Date: Sat, 20 Jul 2024 14:07:41 -0700
Subject: [PATCH 01/11] openai return response headers

---
 litellm/llms/openai.py | 112 ++++++++++++++++++++++++++++++++++++++---
 1 file changed, 106 insertions(+), 6 deletions(-)

diff --git a/litellm/llms/openai.py b/litellm/llms/openai.py
index 0a40ab3fe..dbe178b30 100644
--- a/litellm/llms/openai.py
+++ b/litellm/llms/openai.py
@@ -784,6 +784,34 @@ class OpenAIChatCompletion(BaseLLM):
         except Exception as e:
             raise e
 
+    def make_sync_openai_chat_completion_request(
+        self,
+        openai_client: OpenAI,
+        data: dict,
+        timeout: Union[float, httpx.Timeout],
+    ):
+        """
+        Helper to:
+        - call chat.completions.create.with_raw_response when litellm.return_response_headers is True
+        - call chat.completions.create by default
+        """
+        try:
+            if litellm.return_response_headers is True:
+                raw_response = openai_client.chat.completions.with_raw_response.create(
+                    **data, timeout=timeout
+                )
+
+                headers = dict(raw_response.headers)
+                response = raw_response.parse()
+                return headers, response
+            else:
+                response = openai_client.chat.completions.create(
+                    **data, timeout=timeout
+                )
+                return None, response
+        except Exception as e:
+            raise e
+
     def completion(
         self,
         model_response: ModelResponse,
@@ -913,7 +941,15 @@ class OpenAIChatCompletion(BaseLLM):
                         },
                     )
 
-                    response = openai_client.chat.completions.create(**data, timeout=timeout)  # type: ignore
+                    headers, response = (
+                        self.make_sync_openai_chat_completion_request(
+                            openai_client=openai_client,
+                            data=data,
+                            timeout=timeout,
+                        )
+                    )
+
+                    logging_obj.model_call_details["response_headers"] = headers
                     stringified_response = response.model_dump()
                     logging_obj.post_call(
                         input=messages,
@@ -1059,7 +1095,13 @@ class OpenAIChatCompletion(BaseLLM):
                 "complete_input_dict": data,
             },
         )
-        response = openai_client.chat.completions.create(**data, timeout=timeout)
+        headers, response = self.make_sync_openai_chat_completion_request(
+            openai_client=openai_client,
+            data=data,
+            timeout=timeout,
+        )
+
+        logging_obj.model_call_details["response_headers"] = headers
         streamwrapper = CustomStreamWrapper(
             completion_stream=response,
             model=model,
@@ -1159,6 +1201,31 @@ class OpenAIChatCompletion(BaseLLM):
         except Exception as e:
             raise e
 
+    async def make_sync_openai_embedding_request(
+        self,
+        openai_client: OpenAI,
+        data: dict,
+        timeout: Union[float, httpx.Timeout],
+    ):
+        """
+        Helper to:
+        - call embeddings.create.with_raw_response when litellm.return_response_headers is True
+        - call embeddings.create by default
+        """
+        try:
+            if litellm.return_response_headers is True:
+                raw_response = openai_client.embeddings.with_raw_response.create(
+                    **data, timeout=timeout
+                )  # type: ignore
+                headers = dict(raw_response.headers)
+                response = raw_response.parse()
+                return headers, response
+            else:
+                response = openai_client.embeddings.create(**data, timeout=timeout)  # type: ignore
+                return None, response
+        except Exception as e:
+            raise e
+
     async def aembedding(
         self,
         input: list,
@@ -1255,15 +1322,19 @@ class OpenAIChatCompletion(BaseLLM):
             )
 
             ## COMPLETION CALL
-            response = openai_client.embeddings.create(**data, timeout=timeout)  # type: ignore
+            headers: Optional[Dict] = None
+            headers, response = self.make_sync_openai_embedding_request(
+                openai_client=openai_client, data=data, timeout=timeout
+            )  # type: ignore
+
             ## LOGGING
+            logging_obj.model_call_details["response_headers"] = headers
             logging_obj.post_call(
                 input=input,
                 api_key=api_key,
additional_args={"complete_input_dict": data}, original_response=response, ) - return convert_to_model_response_object(response_object=response.model_dump(), model_response_object=model_response, response_type="embedding") # type: ignore except OpenAIError as e: exception_mapping_worked = True @@ -1427,6 +1498,33 @@ class OpenAIChatCompletion(BaseLLM): except Exception as e: raise e + async def make_sync_openai_audio_transcriptions_request( + self, + openai_client: OpenAI, + data: dict, + timeout: Union[float, httpx.Timeout], + ): + """ + Helper to: + - call openai_aclient.audio.transcriptions.with_raw_response when litellm.return_response_headers is True + - call openai_aclient.audio.transcriptions.create by default + """ + try: + if litellm.return_response_headers is True: + raw_response = ( + openai_client.audio.transcriptions.with_raw_response.create( + **data, timeout=timeout + ) + ) # type: ignore + headers = dict(raw_response.headers) + response = raw_response.parse() + return headers, response + else: + response = openai_client.audio.transcriptions.create(**data, timeout=timeout) # type: ignore + return None, response + except Exception as e: + raise e + def audio_transcriptions( self, model: str, @@ -1462,8 +1560,10 @@ class OpenAIChatCompletion(BaseLLM): timeout=timeout, max_retries=max_retries, ) - response = openai_client.audio.transcriptions.create( - **data, timeout=timeout # type: ignore + response = self.make_sync_openai_audio_transcriptions_request( + openai_client=openai_client, + data=data, + timeout=timeout, ) if isinstance(response, BaseModel): From ca8012090c9f69a25bf6588a29b614810254fd9b Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Sat, 20 Jul 2024 14:58:14 -0700 Subject: [PATCH 02/11] return response_headers in response --- litellm/types/utils.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/litellm/types/utils.py b/litellm/types/utils.py index 4747a9a87..55e5335f3 100644 --- a/litellm/types/utils.py +++ b/litellm/types/utils.py @@ -536,6 +536,8 @@ class ModelResponse(OpenAIObject): _hidden_params: dict = {} + response_headers: Optional[dict] = None + def __init__( self, id=None, @@ -549,6 +551,7 @@ class ModelResponse(OpenAIObject): stream_options=None, response_ms=None, hidden_params=None, + response_headers=None, **params, ) -> None: if stream is not None and stream is True: @@ -598,6 +601,9 @@ class ModelResponse(OpenAIObject): if hidden_params: self._hidden_params = hidden_params + if response_headers: + self.response_headers = response_headers + init_values = { "id": id, "choices": choices, @@ -668,6 +674,8 @@ class EmbeddingResponse(OpenAIObject): _hidden_params: dict = {} + response_headers: Optional[dict] = None + def __init__( self, model=None, @@ -675,6 +683,8 @@ class EmbeddingResponse(OpenAIObject): stream=False, response_ms=None, data=None, + hidden_params=None, + response_headers=None, **params, ): object = "list" @@ -974,6 +984,7 @@ class TranscriptionResponse(OpenAIObject): text: Optional[str] = None _hidden_params: dict = {} + response_headers: Optional[dict] = None def __init__(self, text=None): super().__init__(text=text) From 46cf4f69ae3c4d680bf6fd4064a026c333f5f3fd Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Sat, 20 Jul 2024 14:59:08 -0700 Subject: [PATCH 03/11] return response headers in response --- litellm/utils.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/litellm/utils.py b/litellm/utils.py index dd15aeb45..bb173c133 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -5666,6 +5666,7 @@ 
     start_time=None,
     end_time=None,
     hidden_params: Optional[dict] = None,
+    response_headers: Optional[dict] = None,
 ):
     received_args = locals()
     ### CHECK IF ERROR IN RESPONSE ### - openrouter returns these in the dictionary
@@ -5764,6 +5765,9 @@ def convert_to_model_response_object(
             if hidden_params is not None:
                 model_response_object._hidden_params = hidden_params
 
+            if response_headers is not None:
+                model_response_object.response_headers = response_headers
+
             return model_response_object
         elif response_type == "embedding" and (
             model_response_object is None
@@ -5796,6 +5800,9 @@ def convert_to_model_response_object(
             if hidden_params is not None:
                 model_response_object._hidden_params = hidden_params
 
+            if response_headers is not None:
+                model_response_object.response_headers = response_headers
+
             return model_response_object
         elif response_type == "image_generation" and (
             model_response_object is None
@@ -5837,6 +5844,10 @@ def convert_to_model_response_object(
 
             if hidden_params is not None:
                 model_response_object._hidden_params = hidden_params
+
+            if response_headers is not None:
+                model_response_object.response_headers = response_headers
+
             return model_response_object
         except Exception as e:
             raise Exception(
@@ -8262,6 +8273,7 @@ class CustomStreamWrapper:
         logging_obj=None,
         stream_options=None,
         make_call: Optional[Callable] = None,
+        response_headers: Optional[dict] = None,
     ):
         self.model = model
         self.make_call = make_call
@@ -8293,6 +8305,7 @@ class CustomStreamWrapper:
         self._hidden_params = {
             "model_id": (_model_info.get("id", None))
         }  # returned as x-litellm-model-id response header in proxy
+        self.response_headers = response_headers
         self.response_id = None
         self.logging_loop = None
         self.rules = Rules()

From 3427838ce5f553ad630463542313249d63c93eb3 Mon Sep 17 00:00:00 2001
From: Ishaan Jaff
Date: Sat, 20 Jul 2024 15:04:27 -0700
Subject: [PATCH 04/11] openai - return response headers

---
 litellm/llms/openai.py | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/litellm/llms/openai.py b/litellm/llms/openai.py
index dbe178b30..1bbf3ca7c 100644
--- a/litellm/llms/openai.py
+++ b/litellm/llms/openai.py
@@ -960,6 +960,7 @@ class OpenAIChatCompletion(BaseLLM):
                     return convert_to_model_response_object(
                         response_object=stringified_response,
                         model_response_object=model_response,
+                        response_headers=headers,
                     )
                 except Exception as e:
                     if print_verbose is not None:
@@ -1108,6 +1109,7 @@ class OpenAIChatCompletion(BaseLLM):
             custom_llm_provider="openai",
             logging_obj=logging_obj,
             stream_options=data.get("stream_options", None),
+            response_headers=headers,
         )
         return streamwrapper
 
@@ -1201,7 +1203,7 @@ class OpenAIChatCompletion(BaseLLM):
         except Exception as e:
             raise e
 
-    async def make_sync_openai_embedding_request(
+    def make_sync_openai_embedding_request(
         self,
         openai_client: OpenAI,
         data: dict,
@@ -1217,6 +1219,7 @@ class OpenAIChatCompletion(BaseLLM):
                 raw_response = openai_client.embeddings.with_raw_response.create(
                     **data, timeout=timeout
                 )  # type: ignore
+
                 headers = dict(raw_response.headers)
                 response = raw_response.parse()
                 return headers, response
@@ -1321,9 +1324,9 @@ class OpenAIChatCompletion(BaseLLM):
                 client=client,
             )
 
-            ## COMPLETION CALL
+            ## embedding CALL
             headers: Optional[Dict] = None
-            headers, response = self.make_sync_openai_embedding_request(
+            headers, sync_embedding_response = self.make_sync_openai_embedding_request(
                 openai_client=openai_client, data=data, timeout=timeout
             )  # type: ignore
 
             ## LOGGING
             logging_obj.model_call_details["response_headers"] = headers
@@ -1333,9 +1336,14 @@ class OpenAIChatCompletion(BaseLLM):
                 input=input,
                 api_key=api_key,
                 additional_args={"complete_input_dict": data},
-                original_response=response,
+                original_response=sync_embedding_response,
             )
-            return convert_to_model_response_object(response_object=response.model_dump(), model_response_object=model_response, response_type="embedding")  # type: ignore
+            return convert_to_model_response_object(
+                response_object=sync_embedding_response.model_dump(),
+                model_response_object=model_response,
+                response_headers=headers,
+                response_type="embedding",
+            )  # type: ignore
         except OpenAIError as e:
             exception_mapping_worked = True
             raise e

From 6039e0b2a70bc6216343aa83fdc4e1fb72e1ff0c Mon Sep 17 00:00:00 2001
From: Ishaan Jaff
Date: Sat, 20 Jul 2024 15:08:54 -0700
Subject: [PATCH 05/11] test - response_headers

---
 litellm/tests/test_completion.py | 52 ++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)

diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py
index 34eebb712..779203259 100644
--- a/litellm/tests/test_completion.py
+++ b/litellm/tests/test_completion.py
@@ -1310,6 +1310,58 @@ def test_completion_azure_gpt4_vision():
 
 # test_completion_azure_gpt4_vision()
 
+def test_completion_openai_response_headers():
+    """
+    Tests if LiteLLM returns response headers
+    """
+    litellm.return_response_headers = True
+
+    # /chat/completion
+    messages = [
+        {
+            "role": "user",
+            "content": "hi",
+        }
+    ]
+
+    response = completion(
+        model="gpt-4o-mini",
+        messages=messages,
+    )
+
+    print(f"response: {response}")
+
+    print("response_headers=", response.response_headers)
+    assert response.response_headers is not None
+    assert "x-ratelimit-remaining-tokens" in response.response_headers
+
+    # /chat/completion with streaming
+
+    streaming_response = litellm.completion(
+        model="gpt-4o-mini",
+        messages=messages,
+        stream=True,
+    )
+    response_headers = streaming_response.response_headers
+    print("streaming response_headers=", response_headers)
+    assert response_headers is not None
+    assert "x-ratelimit-remaining-tokens" in response_headers
+
+    for chunk in streaming_response:
+        print("chunk=", chunk)
+
+    # embedding
+    embedding_response = litellm.embedding(
+        model="text-embedding-ada-002",
+        input="hello",
+    )
+
+    embedding_response_headers = embedding_response.response_headers
+    print("embedding_response_headers=", embedding_response_headers)
+    assert embedding_response_headers is not None
+    assert "x-ratelimit-remaining-tokens" in embedding_response_headers
+
+
 @pytest.mark.parametrize("model", ["gpt-3.5-turbo", "gpt-4", "gpt-4o"])
 def test_completion_openai_params(model):
     litellm.drop_params = True

From 2e9f1e8de20b44e0e5f30eb57b9a814119d3d476 Mon Sep 17 00:00:00 2001
From: Ishaan Jaff
Date: Sat, 20 Jul 2024 15:19:15 -0700
Subject: [PATCH 06/11] docs - response headers

---
 docs/my-website/docs/providers/openai.md | 98 ++++++++++++++++++++++++
 1 file changed, 98 insertions(+)

diff --git a/docs/my-website/docs/providers/openai.md b/docs/my-website/docs/providers/openai.md
index d86263dd5..a6276e332 100644
--- a/docs/my-website/docs/providers/openai.md
+++ b/docs/my-website/docs/providers/openai.md
@@ -238,6 +238,104 @@ response = completion(
 
 ## Advanced
 
+### Getting OpenAI API Response Headers
+
+Set `litellm.return_response_headers = True` to get raw response headers from OpenAI
+
+You can expect to always get the `response_headers` field from `litellm.completion()`, `litellm.embedding()` functions
+
+
+
+```python
+litellm.return_response_headers = True
+
+# /chat/completion
+response = completion(
completion( + model="gpt-4o-mini", + messages=[ + { + "role": "user", + "content": "hi", + } + ], +) +print(f"response: {response}") +print("response_headers=", response.response_headers) +``` + + + + +```python +litellm.return_response_headers = True + +# /chat/completion +response = completion( + model="gpt-4o-mini", + stream=True, + messages=[ + { + "role": "user", + "content": "hi", + } + ], +) +print(f"response: {response}") +print("response_headers=", response.response_headers) +for chunk in response: + print(chunk) +``` + + + + +```python +litellm.return_response_headers = True + +# embedding +embedding_response = litellm.embedding( + model="text-embedding-ada-002", + input="hello", +) + +embedding_response_headers = embedding_response.response_headers +print("embedding_response_headers=", embedding_response_headers) +``` + + + +Expected Response Headers from OpenAI + +```json +{ + "date": "Sat, 20 Jul 2024 22:05:23 GMT", + "content-type": "application/json", + "transfer-encoding": "chunked", + "connection": "keep-alive", + "access-control-allow-origin": "*", + "openai-model": "text-embedding-ada-002", + "openai-organization": "*****", + "openai-processing-ms": "20", + "openai-version": "2020-10-01", + "strict-transport-security": "max-age=15552000; includeSubDomains; preload", + "x-ratelimit-limit-requests": "5000", + "x-ratelimit-limit-tokens": "5000000", + "x-ratelimit-remaining-requests": "4999", + "x-ratelimit-remaining-tokens": "4999999", + "x-ratelimit-reset-requests": "12ms", + "x-ratelimit-reset-tokens": "0s", + "x-request-id": "req_cc37487bfd336358231a17034bcfb4d9", + "cf-cache-status": "DYNAMIC", + "set-cookie": "__cf_bm=E_FJY8fdAIMBzBE2RZI2.OkMIO3lf8Hz.ydBQJ9m3q8-1721513123-1.0.1.1-6OK0zXvtd5s9Jgqfz66cU9gzQYpcuh_RLaUZ9dOgxR9Qeq4oJlu.04C09hOTCFn7Hg.k.2tiKLOX24szUE2shw; path=/; expires=Sat, 20-Jul-24 22:35:23 GMT; domain=.api.openai.com; HttpOnly; Secure; SameSite=None, *cfuvid=SDndIImxiO3U0aBcVtoy1TBQqYeQtVDo1L6*Nlpp7EU-1721513123215-0.0.1.1-604800000; path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None", + "x-content-type-options": "nosniff", + "server": "cloudflare", + "cf-ray": "8a66409b4f8acee9-SJC", + "content-encoding": "br", + "alt-svc": "h3=\":443\"; ma=86400" +} +``` + ### Parallel Function calling See a detailed walthrough of parallel function calling with litellm [here](https://docs.litellm.ai/docs/completion/function_call) ```python From 5e52f50a82d1018c232dd570e37ebdc29ba04617 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Sat, 20 Jul 2024 15:26:44 -0700 Subject: [PATCH 07/11] return response headers --- litellm/llms/openai.py | 9 ++++- litellm/tests/test_completion.py | 57 ++++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 1 deletion(-) diff --git a/litellm/llms/openai.py b/litellm/llms/openai.py index 1bbf3ca7c..6c91bd15e 100644 --- a/litellm/llms/openai.py +++ b/litellm/llms/openai.py @@ -1059,6 +1059,7 @@ class OpenAIChatCompletion(BaseLLM): response_object=stringified_response, model_response_object=model_response, hidden_params={"headers": headers}, + response_headers=headers, ) except Exception as e: raise e @@ -1159,6 +1160,7 @@ class OpenAIChatCompletion(BaseLLM): custom_llm_provider="openai", logging_obj=logging_obj, stream_options=data.get("stream_options", None), + response_headers=headers, ) return streamwrapper except ( @@ -1263,7 +1265,12 @@ class OpenAIChatCompletion(BaseLLM): additional_args={"complete_input_dict": data}, original_response=stringified_response, ) - return 
+            return convert_to_model_response_object(
+                response_object=stringified_response,
+                model_response_object=model_response,
+                response_type="embedding",
+                response_headers=headers,
+            )  # type: ignore
         except Exception as e:
             ## LOGGING
             logging_obj.post_call(
diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py
index 779203259..baad4f3bf 100644
--- a/litellm/tests/test_completion.py
+++ b/litellm/tests/test_completion.py
@@ -1361,6 +1361,63 @@ def test_completion_openai_response_headers():
     assert embedding_response_headers is not None
     assert "x-ratelimit-remaining-tokens" in embedding_response_headers
 
+    litellm.return_response_headers = False
+
+
+@pytest.mark.asyncio()
+async def test_async_completion_openai_response_headers():
+    """
+    Tests if LiteLLM returns response headers
+    """
+    litellm.return_response_headers = True
+
+    # /chat/completion
+    messages = [
+        {
+            "role": "user",
+            "content": "hi",
+        }
+    ]
+
+    response = await litellm.acompletion(
+        model="gpt-4o-mini",
+        messages=messages,
+    )
+
+    print(f"response: {response}")
+
+    print("response_headers=", response.response_headers)
+    assert response.response_headers is not None
+    assert "x-ratelimit-remaining-tokens" in response.response_headers
+
+    # /chat/completion with streaming
+
+    streaming_response = await litellm.acompletion(
+        model="gpt-4o-mini",
+        messages=messages,
+        stream=True,
+    )
+    response_headers = streaming_response.response_headers
+    print("streaming response_headers=", response_headers)
+    assert response_headers is not None
+    assert "x-ratelimit-remaining-tokens" in response_headers
+
+    async for chunk in streaming_response:
+        print("chunk=", chunk)
+
+    # embedding
+    embedding_response = await litellm.aembedding(
+        model="text-embedding-ada-002",
+        input="hello",
+    )
+
+    embedding_response_headers = embedding_response.response_headers
+    print("embedding_response_headers=", embedding_response_headers)
+    assert embedding_response_headers is not None
+    assert "x-ratelimit-remaining-tokens" in embedding_response_headers
+
+    litellm.return_response_headers = False
+
 @pytest.mark.parametrize("model", ["gpt-3.5-turbo", "gpt-4", "gpt-4o"])
 def test_completion_openai_params(model):
     litellm.drop_params = True

From 5e4d2912444ec8e67f991f54edf298a5dd09ebb2 Mon Sep 17 00:00:00 2001
From: Ishaan Jaff
Date: Sat, 20 Jul 2024 17:31:16 -0700
Subject: [PATCH 08/11] rename to _response_headers

---
 litellm/llms/openai.py           | 12 ++++++------
 litellm/tests/test_completion.py | 20 ++++++++++----------
 litellm/types/utils.py           | 18 ++++++++++--------
 litellm/utils.py                 | 18 +++++++++---------
 4 files changed, 35 insertions(+), 33 deletions(-)

diff --git a/litellm/llms/openai.py b/litellm/llms/openai.py
index 6c91bd15e..ec5c55855 100644
--- a/litellm/llms/openai.py
+++ b/litellm/llms/openai.py
@@ -960,7 +960,7 @@ class OpenAIChatCompletion(BaseLLM):
                     return convert_to_model_response_object(
                         response_object=stringified_response,
                         model_response_object=model_response,
-                        response_headers=headers,
+                        _response_headers=headers,
                     )
                 except Exception as e:
                     if print_verbose is not None:
@@ -1059,7 +1059,7 @@ class OpenAIChatCompletion(BaseLLM):
                 response_object=stringified_response,
                 model_response_object=model_response,
                 hidden_params={"headers": headers},
-                response_headers=headers,
+                _response_headers=headers,
             )
         except Exception as e:
             raise e
@@ -1110,7 +1110,7 @@ class OpenAIChatCompletion(BaseLLM):
custom_llm_provider="openai", logging_obj=logging_obj, stream_options=data.get("stream_options", None), - response_headers=headers, + _response_headers=headers, ) return streamwrapper @@ -1160,7 +1160,7 @@ class OpenAIChatCompletion(BaseLLM): custom_llm_provider="openai", logging_obj=logging_obj, stream_options=data.get("stream_options", None), - response_headers=headers, + _response_headers=headers, ) return streamwrapper except ( @@ -1269,7 +1269,7 @@ class OpenAIChatCompletion(BaseLLM): response_object=stringified_response, model_response_object=model_response, response_type="embedding", - response_headers=headers, + _response_headers=headers, ) # type: ignore except Exception as e: ## LOGGING @@ -1348,7 +1348,7 @@ class OpenAIChatCompletion(BaseLLM): return convert_to_model_response_object( response_object=sync_embedding_response.model_dump(), model_response_object=model_response, - response_headers=headers, + _response_headers=headers, response_type="embedding", ) # type: ignore except OpenAIError as e: diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index baad4f3bf..8af8e51b2 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -1331,9 +1331,9 @@ def test_completion_openai_response_headers(): print(f"response: {response}") - print("response_headers=", response.response_headers) - assert response.response_headers is not None - assert "x-ratelimit-remaining-tokens" in response.response_headers + print("response_headers=", response._response_headers) + assert response._response_headers is not None + assert "x-ratelimit-remaining-tokens" in response._response_headers # /chat/completion with streaming @@ -1342,7 +1342,7 @@ def test_completion_openai_response_headers(): messages=messages, stream=True, ) - response_headers = streaming_response.response_headers + response_headers = streaming_response._response_headers print("streaming response_headers=", response_headers) assert response_headers is not None assert "x-ratelimit-remaining-tokens" in response_headers @@ -1356,7 +1356,7 @@ def test_completion_openai_response_headers(): input="hello", ) - embedding_response_headers = embedding_response.response_headers + embedding_response_headers = embedding_response._response_headers print("embedding_response_headers=", embedding_response_headers) assert embedding_response_headers is not None assert "x-ratelimit-remaining-tokens" in embedding_response_headers @@ -1386,9 +1386,9 @@ async def test_async_completion_openai_response_headers(): print(f"response: {response}") - print("response_headers=", response.response_headers) - assert response.response_headers is not None - assert "x-ratelimit-remaining-tokens" in response.response_headers + print("response_headers=", response._response_headers) + assert response._response_headers is not None + assert "x-ratelimit-remaining-tokens" in response._response_headers # /chat/completion with streaming @@ -1397,7 +1397,7 @@ async def test_async_completion_openai_response_headers(): messages=messages, stream=True, ) - response_headers = streaming_response.response_headers + response_headers = streaming_response._response_headers print("streaming response_headers=", response_headers) assert response_headers is not None assert "x-ratelimit-remaining-tokens" in response_headers @@ -1411,7 +1411,7 @@ async def test_async_completion_openai_response_headers(): input="hello", ) - embedding_response_headers = embedding_response.response_headers + embedding_response_headers = 
+    embedding_response_headers = embedding_response._response_headers
     print("embedding_response_headers=", embedding_response_headers)
     assert embedding_response_headers is not None
     assert "x-ratelimit-remaining-tokens" in embedding_response_headers
 
diff --git a/litellm/types/utils.py b/litellm/types/utils.py
index 55e5335f3..6581fea5f 100644
--- a/litellm/types/utils.py
+++ b/litellm/types/utils.py
@@ -536,7 +536,7 @@ class ModelResponse(OpenAIObject):
 
     _hidden_params: dict = {}
 
-    response_headers: Optional[dict] = None
+    _response_headers: Optional[dict] = None
 
     def __init__(
         self,
@@ -551,7 +551,7 @@ class ModelResponse(OpenAIObject):
         stream_options=None,
         response_ms=None,
         hidden_params=None,
-        response_headers=None,
+        _response_headers=None,
         **params,
     ) -> None:
         if stream is not None and stream is True:
@@ -601,8 +601,8 @@ class ModelResponse(OpenAIObject):
         if hidden_params:
             self._hidden_params = hidden_params
 
-        if response_headers:
-            self.response_headers = response_headers
+        if _response_headers:
+            self._response_headers = _response_headers
 
         init_values = {
             "id": id,
             "choices": choices,
@@ -673,8 +673,7 @@ class EmbeddingResponse(OpenAIObject):
     """Usage statistics for the embedding request."""
 
     _hidden_params: dict = {}
-
-    response_headers: Optional[dict] = None
+    _response_headers: Optional[Dict] = None
 
     def __init__(
         self,
@@ -684,7 +683,7 @@ class EmbeddingResponse(OpenAIObject):
         response_ms=None,
         data=None,
         hidden_params=None,
-        response_headers=None,
+        _response_headers=None,
         **params,
     ):
         object = "list"
@@ -702,6 +701,9 @@ class EmbeddingResponse(OpenAIObject):
         else:
             usage = Usage()
 
+        if _response_headers:
+            self._response_headers = _response_headers
+
         model = model
         super().__init__(model=model, object=object, data=data, usage=usage)
 
@@ -984,7 +986,7 @@ class TranscriptionResponse(OpenAIObject):
     text: Optional[str] = None
 
     _hidden_params: dict = {}
-    response_headers: Optional[dict] = None
+    _response_headers: Optional[dict] = None
 
     def __init__(self, text=None):
         super().__init__(text=text)
diff --git a/litellm/utils.py b/litellm/utils.py
index bb173c133..ef4daec5e 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -5666,7 +5666,7 @@ def convert_to_model_response_object(
     start_time=None,
     end_time=None,
     hidden_params: Optional[dict] = None,
-    response_headers: Optional[dict] = None,
+    _response_headers: Optional[dict] = None,
 ):
     received_args = locals()
     ### CHECK IF ERROR IN RESPONSE ### - openrouter returns these in the dictionary
@@ -5765,8 +5765,8 @@ def convert_to_model_response_object(
             if hidden_params is not None:
                 model_response_object._hidden_params = hidden_params
 
-            if response_headers is not None:
-                model_response_object.response_headers = response_headers
+            if _response_headers is not None:
+                model_response_object._response_headers = _response_headers
 
             return model_response_object
         elif response_type == "embedding" and (
@@ -5800,8 +5800,8 @@ def convert_to_model_response_object(
             if hidden_params is not None:
                 model_response_object._hidden_params = hidden_params
 
-            if response_headers is not None:
-                model_response_object.response_headers = response_headers
+            if _response_headers is not None:
+                model_response_object._response_headers = _response_headers
 
             return model_response_object
         elif response_type == "image_generation" and (
@@ -5845,8 +5845,8 @@ def convert_to_model_response_object(
             if hidden_params is not None:
                 model_response_object._hidden_params = hidden_params
 
-            if response_headers is not None:
-                model_response_object.response_headers = response_headers
+            if _response_headers is not None:
+                model_response_object._response_headers = _response_headers
 
             return model_response_object
         except Exception as e:
             raise Exception(
@@ -8273,7 +8273,7 @@ class CustomStreamWrapper:
         logging_obj=None,
         stream_options=None,
         make_call: Optional[Callable] = None,
-        response_headers: Optional[dict] = None,
+        _response_headers: Optional[dict] = None,
     ):
         self.model = model
         self.make_call = make_call
@@ -8305,7 +8305,7 @@ class CustomStreamWrapper:
         self._hidden_params = {
             "model_id": (_model_info.get("id", None))
         }  # returned as x-litellm-model-id response header in proxy
-        self.response_headers = response_headers
+        self._response_headers = _response_headers
         self.response_id = None
         self.logging_loop = None
         self.rules = Rules()

From 4e301658caf3c4b02f485c37a141d00b35f29a59 Mon Sep 17 00:00:00 2001
From: Ishaan Jaff
Date: Sat, 20 Jul 2024 17:32:34 -0700
Subject: [PATCH 09/11] docs _response_headers

---
 docs/my-website/docs/providers/openai.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/my-website/docs/providers/openai.md b/docs/my-website/docs/providers/openai.md
index a6276e332..657e9c736 100644
--- a/docs/my-website/docs/providers/openai.md
+++ b/docs/my-website/docs/providers/openai.md
@@ -242,7 +242,7 @@ response = completion(
 
 Set `litellm.return_response_headers = True` to get raw response headers from OpenAI
 
-You can expect to always get the `response_headers` field from `litellm.completion()`, `litellm.embedding()` functions
+You can expect to always get the `_response_headers` field from `litellm.completion()`, `litellm.embedding()` functions
 
 
 
@@ -261,7 +261,7 @@ response = completion(
     ],
 )
 print(f"response: {response}")
-print("response_headers=", response.response_headers)
+print("_response_headers=", response._response_headers)
 ```
 
 
@@ -282,7 +282,7 @@ response = completion(
     ],
 )
 print(f"response: {response}")
-print("response_headers=", response.response_headers)
+print("response_headers=", response._response_headers)
 for chunk in response:
     print(chunk)
 ```
@@ -299,7 +299,7 @@ embedding_response = litellm.embedding(
     input="hello",
 )
 
-embedding_response_headers = embedding_response.response_headers
+embedding_response_headers = embedding_response._response_headers
 print("embedding_response_headers=", embedding_response_headers)
 ```
 

From 2513b64ed402d5609c279f225405ff51e01d13a8 Mon Sep 17 00:00:00 2001
From: Ishaan Jaff
Date: Sat, 20 Jul 2024 17:44:12 -0700
Subject: [PATCH 10/11] ci/cd run tests again

---
 litellm/tests/test_completion.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py
index 8af8e51b2..f025414c3 100644
--- a/litellm/tests/test_completion.py
+++ b/litellm/tests/test_completion.py
@@ -1335,7 +1335,7 @@ def test_completion_openai_response_headers():
     assert response._response_headers is not None
     assert "x-ratelimit-remaining-tokens" in response._response_headers
 
-    # /chat/completion with streaming
+    # /chat/completion - with streaming
 
     streaming_response = litellm.completion(
         model="gpt-4o-mini",

From 82764d2cec938c469d33f607c97a349ce40b7196 Mon Sep 17 00:00:00 2001
From: Ishaan Jaff
Date: Sat, 20 Jul 2024 18:17:21 -0700
Subject: [PATCH 11/11] fix make_sync_openai_audio_transcriptions_request

---
 litellm/llms/openai.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/litellm/llms/openai.py b/litellm/llms/openai.py
index ec5c55855..7e32bceed 100644
--- a/litellm/llms/openai.py
+++ b/litellm/llms/openai.py
@@ -1513,7 +1513,7 @@ class OpenAIChatCompletion(BaseLLM):
         except Exception as e:
             raise e
 
-    async def make_sync_openai_audio_transcriptions_request(
+    def make_sync_openai_audio_transcriptions_request(
         self,
         openai_client: OpenAI,
         data: dict,