diff --git a/litellm/main.py b/litellm/main.py
index 278399d5f..1ddefc756 100644
--- a/litellm/main.py
+++ b/litellm/main.py
@@ -2472,22 +2472,22 @@ async def atext_completion(*args, **kwargs):
             or custom_llm_provider == "ollama"
             or custom_llm_provider == "vertex_ai"
         ):  # currently implemented aiohttp calls for just azure and openai, soon all.
-            if kwargs.get("stream", False):
-                response = text_completion(*args, **kwargs)
-            else:
-                # Await normally
-                response = await loop.run_in_executor(None, func_with_context)
-                if asyncio.iscoroutine(response):
-                    response = await response
+            # Await normally
+            response = await loop.run_in_executor(None, func_with_context)
+            if asyncio.iscoroutine(response):
+                response = await response
         else:
             # Call the synchronous function using run_in_executor
             response = await loop.run_in_executor(None, func_with_context)
-        if kwargs.get("stream", False):  # return an async generator
-            return _async_streaming(
-                response=response,
+        if kwargs.get("stream", False) == True:  # return an async generator
+            return TextCompletionStreamWrapper(
+                completion_stream=_async_streaming(
+                    response=response,
+                    model=model,
+                    custom_llm_provider=custom_llm_provider,
+                    args=args,
+                ),
                 model=model,
-                custom_llm_provider=custom_llm_provider,
-                args=args,
             )
         else:
             return response
@@ -2691,11 +2691,11 @@ def text_completion(
         **kwargs,
         **optional_params,
     )
+    if kwargs.get("acompletion", False) == True:
+        return response
     if stream == True or kwargs.get("stream", False) == True:
         response = TextCompletionStreamWrapper(completion_stream=response, model=model)
         return response
-    if kwargs.get("acompletion", False) == True:
-        return response
     transformed_logprobs = None
     # only supported for TGI models
     try:
diff --git a/litellm/tests/test_async_fn.py b/litellm/tests/test_async_fn.py
index 485e86e7f..547abb533 100644
--- a/litellm/tests/test_async_fn.py
+++ b/litellm/tests/test_async_fn.py
@@ -215,3 +215,30 @@ def test_get_response_non_openai_streaming():
 
 
 # test_get_response_non_openai_streaming()
+
+
+async def test_get_response():
+    try:
+        response = await litellm.atext_completion(
+            model="gpt-3.5-turbo",
+            prompt="good morning",
+            stream=True,
+            max_tokens=10,
+        )
+        print(f"response: {response}")
+
+        num_finish_reason = 0
+        async for chunk in response:
+            print(chunk)
+            if chunk["choices"][0].get("finish_reason") is not None:
+                num_finish_reason += 1
+                print("finish_reason", chunk["choices"][0].get("finish_reason"))
+
+        assert (
+            num_finish_reason == 1
+        ), f"expected only one finish reason. Got {num_finish_reason}"
+    except Exception as e:
+        pytest.fail(f"GOT exception for gpt-3.5 instruct In streaming {e}")
+
+
+# asyncio.run(test_get_response())