From c6403706b41da188fd3e6904e60bb0a6888c60c7 Mon Sep 17 00:00:00 2001
From: Matthew Farrellee
Date: Fri, 12 Sep 2025 16:41:48 -0400
Subject: [PATCH] use task and async generator instead of a thread

---
 llama_stack/testing/inference_recorder.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/llama_stack/testing/inference_recorder.py b/llama_stack/testing/inference_recorder.py
index be1961ecc..e09513e4d 100644
--- a/llama_stack/testing/inference_recorder.py
+++ b/llama_stack/testing/inference_recorder.py
@@ -378,17 +378,17 @@ def patch_inference_clients():
 
     def patched_models_list(self, *args, **kwargs):
         import asyncio
-        import concurrent.futures
 
-        with concurrent.futures.ThreadPoolExecutor() as executor:
-            future = executor.submit(
-                lambda: asyncio.run(
-                    _patched_inference_method(
-                        _original_methods["models_list"], self, "openai", "/v1/models", *args, **kwargs
-                    )
-                )
-            )
-            return future.result()
+        task = asyncio.create_task(
+            _patched_inference_method(_original_methods["models_list"], self, "openai", "/v1/models", *args, **kwargs)
+        )
+
+        async def _iter():
+            result = await task
+            async for item in result:
+                yield item
+
+        return _iter()
 
     # Apply OpenAI patches
     AsyncChatCompletions.create = patched_chat_completions_create