diff --git a/llama_stack/providers/utils/inference/litellm_openai_mixin.py b/llama_stack/providers/utils/inference/litellm_openai_mixin.py
index befb4b092..da2e634f6 100644
--- a/llama_stack/providers/utils/inference/litellm_openai_mixin.py
+++ b/llama_stack/providers/utils/inference/litellm_openai_mixin.py
@@ -158,9 +158,8 @@ class LiteLLMOpenAIMixin(
         params["model"] = self.get_litellm_model_name(params["model"])
         logger.debug(f"params to litellm (openai compat): {params}")
 
-        # unfortunately, we need to use synchronous litellm.completion here because litellm
-        # caches various httpx.client objects in a non-eventloop aware manner
-        response = litellm.completion(**params)
+        # see https://docs.litellm.ai/docs/completion/stream#async-completion
+        response = await litellm.acompletion(**params)
         if stream:
             return self._stream_chat_completion(response)
         else:
@@ -170,7 +169,7 @@ class LiteLLMOpenAIMixin(
         self, response: litellm.ModelResponse
     ) -> AsyncIterator[ChatCompletionResponseStreamChunk]:
         async def _stream_generator():
-            for chunk in response:
+            async for chunk in response:
                 yield chunk
 
         async for chunk in convert_openai_chat_completion_stream(
diff --git a/tests/integration/test_cases/inference/chat_completion.json b/tests/integration/test_cases/inference/chat_completion.json
index 1ae018397..203fc51a5 100644
--- a/tests/integration/test_cases/inference/chat_completion.json
+++ b/tests/integration/test_cases/inference/chat_completion.json
@@ -78,7 +78,7 @@
       },
       {
         "role": "user",
-        "content": "What's the weather like in San Francisco?"
+        "content": "What's the weather like in San Francisco, CA?"
       }
     ],
     "tools": [
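
For reference, a minimal standalone sketch of the async streaming pattern this diff adopts (per the linked litellm docs): `litellm.acompletion(..., stream=True)` must be awaited and its chunks consumed with `async for`, as in the updated `_stream_generator`. The model name and message below are illustrative, not taken from this repository, and assume litellm is configured with valid provider credentials.

```python
import asyncio

import litellm


async def main() -> None:
    # Awaiting acompletion with stream=True returns an async chunk iterator,
    # so it can be consumed inside the event loop without blocking it.
    response = await litellm.acompletion(
        model="openai/gpt-4o-mini",  # illustrative model name
        messages=[{"role": "user", "content": "What's the weather like in San Francisco, CA?"}],
        stream=True,
    )
    async for chunk in response:
        # Chunks are OpenAI-compatible; delta.content may be None on
        # role-only or finish chunks, so fall back to an empty string.
        print(chunk.choices[0].delta.content or "", end="", flush=True)
    print()


asyncio.run(main())
```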