diff --git a/litellm/main.py b/litellm/main.py
index 2da5795fa2..8326140fab 100644
--- a/litellm/main.py
+++ b/litellm/main.py
@@ -3947,6 +3947,7 @@ async def atext_completion(
                 ),
                 model=model,
                 custom_llm_provider=custom_llm_provider,
+                stream_options=kwargs.get("stream_options"),
             )
         else:
             ## OpenAI / Azure Text Completion Returns here
diff --git a/tests/llm_translation/test_text_completion.py b/tests/llm_translation/test_text_completion.py
index 50c96e6eb0..4a664eb370 100644
--- a/tests/llm_translation/test_text_completion.py
+++ b/tests/llm_translation/test_text_completion.py
@@ -139,3 +139,38 @@ def test_convert_chat_to_text_completion_multiple_choices():
         completion_tokens_details=None,
         prompt_tokens_details=None,
     )
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("sync_mode", [True, False])
+async def test_text_completion_include_usage(sync_mode):
+    """Test text completion with include_usage"""
+    last_chunk = None
+    if sync_mode:
+        response = litellm.text_completion(
+            model="gpt-3.5-turbo",
+            prompt="Hello, world!",
+            stream=True,
+            stream_options={"include_usage": True},
+        )
+
+        for chunk in response:
+            print(chunk)
+            last_chunk = chunk
+    else:
+        response = await litellm.atext_completion(
+            model="gpt-3.5-turbo",
+            prompt="Hello, world!",
+            stream=True,
+            stream_options={"include_usage": True},
+        )
+
+        async for chunk in response:
+            print(chunk)
+            last_chunk = chunk
+
+    assert last_chunk is not None
+    assert last_chunk.usage is not None
+    assert last_chunk.usage.prompt_tokens > 0
+    assert last_chunk.usage.completion_tokens > 0
+    assert last_chunk.usage.total_tokens > 0
diff --git a/tests/test_openai_endpoints.py b/tests/test_openai_endpoints.py
index ce0deb51a0..0faae9d333 100644
--- a/tests/test_openai_endpoints.py
+++ b/tests/test_openai_endpoints.py
@@ -378,6 +378,36 @@ async def test_chat_completion_streaming():
     print(f"response_str: {response_str}")
 
 
+@pytest.mark.asyncio
+async def test_completion_streaming_usage_metrics():
+    """
+    [PROD Test] Ensures usage metrics are returned correctly when `include_usage` is set to `True`
+    """
+    client = AsyncOpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")
+
+    response = await client.completions.create(
+        model="gpt-instruct",
+        prompt="hey",
+        stream=True,
+        stream_options={"include_usage": True},
+        max_tokens=4,
+        temperature=0.00000001,
+    )
+
+    last_chunk = None
+    async for chunk in response:
+        print("chunk", chunk)
+        last_chunk = chunk
+
+    assert last_chunk is not None, "No chunks were received"
+    assert last_chunk.usage is not None, "Usage information was not received"
+    assert last_chunk.usage.prompt_tokens > 0, "Prompt tokens should be greater than 0"
+    assert (
+        last_chunk.usage.completion_tokens > 0
+    ), "Completion tokens should be greater than 0"
+    assert last_chunk.usage.total_tokens > 0, "Total tokens should be greater than 0"
+
+
 @pytest.mark.asyncio
 async def test_chat_completion_anthropic_structured_output():
     """
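
For context, a minimal caller-side sketch of the behavior this change enables: once `stream_options` is forwarded to the text-completion stream wrapper, the final streamed chunk carries a populated `usage` object. The model name and arguments mirror the new unit test; the script itself is illustrative and not part of the change, and it assumes an OpenAI API key is configured in the environment.

```python
# Illustrative sketch only (not part of this PR): read token usage from a
# streamed text completion after stream_options is passed through.
import asyncio

import litellm


async def main() -> None:
    response = await litellm.atext_completion(
        model="gpt-3.5-turbo",
        prompt="Hello, world!",
        stream=True,
        stream_options={"include_usage": True},
    )

    text = ""
    usage = None
    async for chunk in response:
        # Accumulate streamed text; the usage-only final chunk may have no choices.
        if chunk.choices and chunk.choices[0].text:
            text += chunk.choices[0].text
        if getattr(chunk, "usage", None) is not None:
            usage = chunk.usage  # populated on the final chunk

    print(text)
    print(usage)  # prompt_tokens / completion_tokens / total_tokens


if __name__ == "__main__":
    asyncio.run(main())
```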