From c6403706b41da188fd3e6904e60bb0a6888c60c7 Mon Sep 17 00:00:00 2001
From: Matthew Farrellee <matt@cs.wisc.edu>
Date: Fri, 12 Sep 2025 16:41:48 -0400
Subject: [PATCH] inference_recorder: use a task and async generator instead of a thread

---
 llama_stack/testing/inference_recorder.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/llama_stack/testing/inference_recorder.py b/llama_stack/testing/inference_recorder.py
index be1961ecc..e09513e4d 100644
--- a/llama_stack/testing/inference_recorder.py
+++ b/llama_stack/testing/inference_recorder.py
@@ -378,17 +378,17 @@ def patch_inference_clients():
 
     def patched_models_list(self, *args, **kwargs):
         import asyncio
-        import concurrent.futures
 
-        with concurrent.futures.ThreadPoolExecutor() as executor:
-            future = executor.submit(
-                lambda: asyncio.run(
-                    _patched_inference_method(
-                        _original_methods["models_list"], self, "openai", "/v1/models", *args, **kwargs
-                    )
-                )
-            )
-            return future.result()
+        task = asyncio.create_task(
+            _patched_inference_method(_original_methods["models_list"], self, "openai", "/v1/models", *args, **kwargs)
+        )
+
+        async def _iter():
+            result = await task
+            async for item in result:
+                yield item
+
+        return _iter()
 
     # Apply OpenAI patches
     AsyncChatCompletions.create = patched_chat_completions_create