diff --git a/litellm/llms/openai.py b/litellm/llms/openai.py
index 1bbf3ca7c..6c91bd15e 100644
--- a/litellm/llms/openai.py
+++ b/litellm/llms/openai.py
@@ -1059,6 +1059,7 @@ class OpenAIChatCompletion(BaseLLM):
                 response_object=stringified_response,
                 model_response_object=model_response,
                 hidden_params={"headers": headers},
+                response_headers=headers,
             )
         except Exception as e:
             raise e
@@ -1159,6 +1160,7 @@ class OpenAIChatCompletion(BaseLLM):
                 custom_llm_provider="openai",
                 logging_obj=logging_obj,
                 stream_options=data.get("stream_options", None),
+                response_headers=headers,
             )
             return streamwrapper
         except (
@@ -1263,7 +1265,12 @@ class OpenAIChatCompletion(BaseLLM):
                 additional_args={"complete_input_dict": data},
                 original_response=stringified_response,
             )
-            return convert_to_model_response_object(response_object=stringified_response, model_response_object=model_response, response_type="embedding")  # type: ignore
+            return convert_to_model_response_object(
+                response_object=stringified_response,
+                model_response_object=model_response,
+                response_type="embedding",
+                response_headers=headers,
+            )  # type: ignore
         except Exception as e:
             ## LOGGING
             logging_obj.post_call(
diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py
index 779203259..baad4f3bf 100644
--- a/litellm/tests/test_completion.py
+++ b/litellm/tests/test_completion.py
@@ -1361,6 +1361,63 @@ def test_completion_openai_response_headers():
     assert embedding_response_headers is not None
     assert "x-ratelimit-remaining-tokens" in embedding_response_headers
 
+    litellm.return_response_headers = False
+
+
+@pytest.mark.asyncio()
+async def test_async_completion_openai_response_headers():
+    """
+    Tests if LiteLLM returns response headers for async requests
+    """
+    litellm.return_response_headers = True
+
+    # /chat/completion
+    messages = [
+        {
+            "role": "user",
+            "content": "hi",
+        }
+    ]
+
+    response = await litellm.acompletion(
+        model="gpt-4o-mini",
+        messages=messages,
+    )
+
+    print(f"response: {response}")
+
+    print("response_headers=", response.response_headers)
+    assert response.response_headers is not None
+    assert "x-ratelimit-remaining-tokens" in response.response_headers
+
+    # /chat/completion with streaming
+
+    streaming_response = await litellm.acompletion(
+        model="gpt-4o-mini",
+        messages=messages,
+        stream=True,
+    )
+    response_headers = streaming_response.response_headers
+    print("streaming response_headers=", response_headers)
+    assert response_headers is not None
+    assert "x-ratelimit-remaining-tokens" in response_headers
+
+    async for chunk in streaming_response:
+        print("chunk=", chunk)
+
+    # embedding
+    embedding_response = await litellm.aembedding(
+        model="text-embedding-ada-002",
+        input="hello",
+    )
+
+    embedding_response_headers = embedding_response.response_headers
+    print("embedding_response_headers=", embedding_response_headers)
+    assert embedding_response_headers is not None
+    assert "x-ratelimit-remaining-tokens" in embedding_response_headers
+
+    litellm.return_response_headers = False
+
 
 @pytest.mark.parametrize("model", ["gpt-3.5-turbo", "gpt-4", "gpt-4o"])
 def test_completion_openai_params(model):
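
Reviewer note (not part of the diff): a minimal sketch of how the plumbing above is consumed, based on the new tests in this PR. It assumes the `litellm.return_response_headers` flag and the `response_headers` attribute added here; the specific `x-ratelimit-*` keys are OpenAI response headers and may not be present for other providers.

import litellm

# Assumes OPENAI_API_KEY is set in the environment.
# Opt in to surfacing the raw provider HTTP headers on response objects.
litellm.return_response_headers = True

response = litellm.completion(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": "hi"}],
)

# With the flag set, rate-limit state can be read straight off the
# response instead of going through a hand-rolled HTTP client.
headers = response.response_headers
print("remaining tokens:", headers.get("x-ratelimit-remaining-tokens"))

litellm.return_response_headers = False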