From 6aedfc22016f3bfc388988324401d358500d9199 Mon Sep 17 00:00:00 2001
From: Ihar Hrachyshka
Date: Tue, 25 Mar 2025 13:04:24 -0400
Subject: [PATCH] test fixes

Signed-off-by: Ihar Hrachyshka
---
 llama_stack/providers/remote/inference/vllm/vllm.py    | 8 ++++----
 llama_stack/providers/utils/inference/openai_compat.py | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/llama_stack/providers/remote/inference/vllm/vllm.py b/llama_stack/providers/remote/inference/vllm/vllm.py
index ecf41e50d..26e429592 100644
--- a/llama_stack/providers/remote/inference/vllm/vllm.py
+++ b/llama_stack/providers/remote/inference/vllm/vllm.py
@@ -245,7 +245,7 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
         response_format: Optional[ResponseFormat] = None,
         stream: Optional[bool] = False,
         logprobs: Optional[LogProbConfig] = None,
-    ) -> CompletionResponse | AsyncGenerator[CompletionResponseStreamChunk]:
+    ) -> CompletionResponse | AsyncGenerator[CompletionResponseStreamChunk, None]:
         assert self.model_store is not None
         if sampling_params is None:
             sampling_params = SamplingParams()
@@ -275,7 +275,7 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
         stream: Optional[bool] = False,
         logprobs: Optional[LogProbConfig] = None,
         tool_config: Optional[ToolConfig] = None,
-    ) -> ChatCompletionResponse | AsyncGenerator[ChatCompletionResponseStreamChunk]:
+    ) -> ChatCompletionResponse | AsyncGenerator[ChatCompletionResponseStreamChunk, None]:
         assert self.model_store is not None
         if sampling_params is None:
             sampling_params = SamplingParams()
@@ -319,7 +319,7 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
 
     async def _stream_chat_completion(
         self, request: ChatCompletionRequest, client: AsyncOpenAI
-    ) -> AsyncGenerator[ChatCompletionResponseStreamChunk]:
+    ) -> AsyncGenerator[ChatCompletionResponseStreamChunk, None]:
         params = await self._get_params(request)
 
         stream = await client.chat.completions.create(**params)
@@ -336,7 +336,7 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
         r = await self.client.completions.create(**params)
         return process_completion_response(r)
 
-    async def _stream_completion(self, request: CompletionRequest) -> AsyncGenerator[CompletionResponseStreamChunk]:
+    async def _stream_completion(self, request: CompletionRequest) -> AsyncGenerator[CompletionResponseStreamChunk, None]:
         assert self.client is not None
         params = await self._get_params(request)
 
diff --git a/llama_stack/providers/utils/inference/openai_compat.py b/llama_stack/providers/utils/inference/openai_compat.py
index 9aedfade7..340ce8923 100644
--- a/llama_stack/providers/utils/inference/openai_compat.py
+++ b/llama_stack/providers/utils/inference/openai_compat.py
@@ -337,7 +337,7 @@ async def process_completion_stream_response(
 async def process_chat_completion_stream_response(
     stream: AsyncGenerator[OpenAICompatCompletionResponse, None],
     request: ChatCompletionRequest,
-) -> AsyncGenerator[ChatCompletionResponseStreamChunk]:
+) -> AsyncGenerator[ChatCompletionResponseStreamChunk, None]:
     yield ChatCompletionResponseStreamChunk(
         event=ChatCompletionResponseEvent(
             event_type=ChatCompletionResponseEventType.start,
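
Context for reviewers (not part of the patch): typing.AsyncGenerator is generic over two
parameters, the yield type and the send type, and some Python/type-checker combinations
reject the single-parameter form, which is what these annotation fixes address. Below is a
minimal, standalone sketch of the two-parameter annotation; the names (stream_chunks,
consume) are hypothetical and stand in for the adapter's actual streaming methods.

    import asyncio
    from typing import AsyncGenerator


    async def stream_chunks(texts: list[str]) -> AsyncGenerator[str, None]:
        # AsyncGenerator is parameterized as (YieldType, SendType); the send type
        # is None here because callers only iterate and never .asend() values in.
        for text in texts:
            yield text


    async def consume() -> None:
        async for chunk in stream_chunks(["hello", "world"]):
            print(chunk)


    if __name__ == "__main__":
        asyncio.run(consume())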