diff --git a/llama_stack/providers/inline/inference/vllm/vllm.py b/llama_stack/providers/inline/inference/vllm/vllm.py
index 691737c15..77c95cc7e 100644
--- a/llama_stack/providers/inline/inference/vllm/vllm.py
+++ b/llama_stack/providers/inline/inference/vllm/vllm.py
@@ -201,7 +201,7 @@ class VLLMInferenceImpl(Inference, ModelsProtocolPrivate):
         response = OpenAICompatCompletionResponse(
             choices=[choice],
         )
-        return process_chat_completion_response(response, self.formatter)
+        return process_chat_completion_response(response, self.formatter, request)

     async def _stream_chat_completion(
         self, request: ChatCompletionRequest, results_generator: AsyncGenerator
@@ -227,7 +227,7 @@ class VLLMInferenceImpl(Inference, ModelsProtocolPrivate):
                 )

         stream = _generate_and_convert_to_openai_compat()
-        async for chunk in process_chat_completion_stream_response(stream, self.formatter):
+        async for chunk in process_chat_completion_stream_response(stream, self.formatter, request):
             yield chunk

     async def embeddings(self, model_id: str, contents: List[InterleavedContent]) -> EmbeddingsResponse:
diff --git a/llama_stack/providers/remote/inference/bedrock/bedrock.py b/llama_stack/providers/remote/inference/bedrock/bedrock.py
index 03a0a40c3..54a674d7e 100644
--- a/llama_stack/providers/remote/inference/bedrock/bedrock.py
+++ b/llama_stack/providers/remote/inference/bedrock/bedrock.py
@@ -134,7 +134,7 @@ class BedrockInferenceAdapter(ModelRegistryHelper, Inference):
         )
         response = OpenAICompatCompletionResponse(choices=[choice])
-        return process_chat_completion_response(response, self.formatter)
+        return process_chat_completion_response(response, self.formatter, request)

     async def _stream_chat_completion(self, request: ChatCompletionRequest) -> AsyncGenerator:
         params = await self._get_params_for_chat_completion(request)
@@ -152,7 +152,7 @@ class BedrockInferenceAdapter(ModelRegistryHelper, Inference):
             yield OpenAICompatCompletionResponse(choices=[choice])

         stream = _generate_and_convert_to_openai_compat()
-        async for chunk in process_chat_completion_stream_response(stream, self.formatter):
+        async for chunk in process_chat_completion_stream_response(stream, self.formatter, request):
             yield chunk

     async def _get_params_for_chat_completion(self, request: ChatCompletionRequest) -> Dict:
diff --git a/llama_stack/providers/remote/inference/cerebras/cerebras.py b/llama_stack/providers/remote/inference/cerebras/cerebras.py
index bd12c56c8..47f208129 100644
--- a/llama_stack/providers/remote/inference/cerebras/cerebras.py
+++ b/llama_stack/providers/remote/inference/cerebras/cerebras.py
@@ -155,14 +155,14 @@ class CerebrasInferenceAdapter(ModelRegistryHelper, Inference):

         r = await self.client.completions.create(**params)

-        return process_chat_completion_response(r, self.formatter)
+        return process_chat_completion_response(r, self.formatter, request)

     async def _stream_chat_completion(self, request: CompletionRequest) -> AsyncGenerator:
         params = await self._get_params(request)

         stream = await self.client.completions.create(**params)

-        async for chunk in process_chat_completion_stream_response(stream, self.formatter):
+        async for chunk in process_chat_completion_stream_response(stream, self.formatter, request):
             yield chunk

     async def _get_params(self, request: Union[ChatCompletionRequest, CompletionRequest]) -> dict:
diff --git a/llama_stack/providers/remote/inference/databricks/databricks.py b/llama_stack/providers/remote/inference/databricks/databricks.py
index 37070b4ce..fe4865080 100644
--- a/llama_stack/providers/remote/inference/databricks/databricks.py
+++ b/llama_stack/providers/remote/inference/databricks/databricks.py
@@ -112,7 +112,7 @@ class DatabricksInferenceAdapter(ModelRegistryHelper, Inference):
     ) -> ChatCompletionResponse:
         params = self._get_params(request)
         r = client.completions.create(**params)
-        return process_chat_completion_response(r, self.formatter)
+        return process_chat_completion_response(r, self.formatter, request)

     async def _stream_chat_completion(self, request: ChatCompletionRequest, client: OpenAI) -> AsyncGenerator:
         params = self._get_params(request)
diff --git a/llama_stack/providers/remote/inference/ollama/ollama.py b/llama_stack/providers/remote/inference/ollama/ollama.py
index ecd195854..05a5d2d7a 100644
--- a/llama_stack/providers/remote/inference/ollama/ollama.py
+++ b/llama_stack/providers/remote/inference/ollama/ollama.py
@@ -304,7 +304,7 @@ class OllamaInferenceAdapter(Inference, ModelsProtocolPrivate):
         response = OpenAICompatCompletionResponse(
             choices=[choice],
         )
-        return process_chat_completion_response(response, self.formatter)
+        return process_chat_completion_response(response, self.formatter, request)

     async def _stream_chat_completion(self, request: ChatCompletionRequest) -> AsyncGenerator:
         params = await self._get_params(request)
@@ -330,7 +330,7 @@ class OllamaInferenceAdapter(Inference, ModelsProtocolPrivate):
                 )

         stream = _generate_and_convert_to_openai_compat()
-        async for chunk in process_chat_completion_stream_response(stream, self.formatter):
+        async for chunk in process_chat_completion_stream_response(stream, self.formatter, request):
             yield chunk

     async def embeddings(
diff --git a/llama_stack/providers/remote/inference/runpod/runpod.py b/llama_stack/providers/remote/inference/runpod/runpod.py
index a62b0c97f..c7b20b9a1 100644
--- a/llama_stack/providers/remote/inference/runpod/runpod.py
+++ b/llama_stack/providers/remote/inference/runpod/runpod.py
@@ -99,7 +99,7 @@ class RunpodInferenceAdapter(ModelRegistryHelper, Inference):
     ) -> ChatCompletionResponse:
         params = self._get_params(request)
         r = client.completions.create(**params)
-        return process_chat_completion_response(r, self.formatter)
+        return process_chat_completion_response(r, self.formatter, request)

     async def _stream_chat_completion(self, request: ChatCompletionRequest, client: OpenAI) -> AsyncGenerator:
         params = self._get_params(request)
@@ -110,7 +110,7 @@ class RunpodInferenceAdapter(ModelRegistryHelper, Inference):
                 yield chunk

         stream = _to_async_generator()
-        async for chunk in process_chat_completion_stream_response(stream, self.formatter):
+        async for chunk in process_chat_completion_stream_response(stream, self.formatter, request):
             yield chunk

     def _get_params(self, request: ChatCompletionRequest) -> dict:
diff --git a/llama_stack/providers/remote/inference/sambanova/sambanova.py b/llama_stack/providers/remote/inference/sambanova/sambanova.py
index dd697cd62..2f0ef816c 100644
--- a/llama_stack/providers/remote/inference/sambanova/sambanova.py
+++ b/llama_stack/providers/remote/inference/sambanova/sambanova.py
@@ -159,7 +159,7 @@ class SambaNovaInferenceAdapter(ModelRegistryHelper, Inference):
                 yield chunk

         stream = _to_async_generator()
-        async for chunk in process_chat_completion_stream_response(stream, self.formatter):
+        async for chunk in process_chat_completion_stream_response(stream, self.formatter, request):
             yield chunk

     async def embeddings(
diff --git a/llama_stack/providers/remote/inference/tgi/tgi.py b/llama_stack/providers/remote/inference/tgi/tgi.py
index 2281319b3..97a6621fb 100644
--- a/llama_stack/providers/remote/inference/tgi/tgi.py
+++ b/llama_stack/providers/remote/inference/tgi/tgi.py
@@ -236,7 +236,7 @@ class _HfAdapter(Inference, ModelsProtocolPrivate):
         response = OpenAICompatCompletionResponse(
             choices=[choice],
         )
-        return process_chat_completion_response(response, self.formatter)
+        return process_chat_completion_response(response, self.formatter, request)

     async def _stream_chat_completion(self, request: ChatCompletionRequest) -> AsyncGenerator:
         params = await self._get_params(request)
@@ -252,7 +252,7 @@ class _HfAdapter(Inference, ModelsProtocolPrivate):
                 )

         stream = _generate_and_convert_to_openai_compat()
-        async for chunk in process_chat_completion_stream_response(stream, self.formatter):
+        async for chunk in process_chat_completion_stream_response(stream, self.formatter, request):
             yield chunk

     async def _get_params(self, request: ChatCompletionRequest) -> dict:
diff --git a/llama_stack/providers/remote/inference/together/together.py b/llama_stack/providers/remote/inference/together/together.py
index cf24daf60..a165b01d9 100644
--- a/llama_stack/providers/remote/inference/together/together.py
+++ b/llama_stack/providers/remote/inference/together/together.py
@@ -220,7 +220,7 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi
             r = self._get_client().chat.completions.create(**params)
         else:
             r = self._get_client().completions.create(**params)
-        return process_chat_completion_response(r, self.formatter)
+        return process_chat_completion_response(r, self.formatter, request)

     async def _stream_chat_completion(self, request: ChatCompletionRequest) -> AsyncGenerator:
         params = await self._get_params(request)
@@ -235,7 +235,7 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi
                 yield chunk

         stream = _to_async_generator()
-        async for chunk in process_chat_completion_stream_response(stream, self.formatter):
+        async for chunk in process_chat_completion_stream_response(stream, self.formatter, request):
             yield chunk

     async def _get_params(self, request: Union[ChatCompletionRequest, CompletionRequest]) -> dict:
diff --git a/llama_stack/providers/remote/inference/vllm/vllm.py b/llama_stack/providers/remote/inference/vllm/vllm.py
index 482e6fa97..8db09dfd7 100644
--- a/llama_stack/providers/remote/inference/vllm/vllm.py
+++ b/llama_stack/providers/remote/inference/vllm/vllm.py
@@ -142,7 +142,7 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
     ) -> ChatCompletionResponse:
         params = await self._get_params(request)
         r = client.chat.completions.create(**params)
-        return process_chat_completion_response(r, self.formatter)
+        return process_chat_completion_response(r, self.formatter, request)

     async def _stream_chat_completion(self, request: ChatCompletionRequest, client: OpenAI) -> AsyncGenerator:
         params = await self._get_params(request)
@@ -155,7 +155,7 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
                 yield chunk

         stream = _to_async_generator()
-        async for chunk in process_chat_completion_stream_response(stream, self.formatter):
+        async for chunk in process_chat_completion_stream_response(stream, self.formatter, request):
             yield chunk

     async def _nonstream_completion(self, request: CompletionRequest) -> CompletionResponse:
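
The shared processors targeted by every call site above live in llama_stack/providers/utils/inference/openai_compat.py; their updated definitions are not part of this diff. What follows is only a hedged, self-contained sketch of the calling convention the diff adopts, assuming the new third argument is the originating ChatCompletionRequest so that response decoding can consult request context (for example, which tools were declared). The names and bodies below are stand-ins, not the project's actual implementations.

# Hedged sketch: stand-ins for the helpers in openai_compat.py, illustrating only the
# (response_or_stream, formatter, request) calling convention used at every call site
# in the diff above. The real implementations differ.
import asyncio
from dataclasses import dataclass, field
from typing import Any, AsyncGenerator, List


@dataclass
class FakeChatCompletionRequest:
    # Minimal stand-in for ChatCompletionRequest; only the fields this sketch touches.
    messages: List[Any]
    tools: List[Any] = field(default_factory=list)


def process_chat_completion_response(response: Any, formatter: Any, request: FakeChatCompletionRequest) -> Any:
    # With the request threaded through, decoding can take request-specific state
    # (such as request.tools) into account, not just the raw provider response.
    _ = (formatter, request.tools)
    return response


async def process_chat_completion_stream_response(
    stream: AsyncGenerator[Any, None], formatter: Any, request: FakeChatCompletionRequest
) -> AsyncGenerator[Any, None]:
    # Same convention on the streaming path: each chunk is processed with the
    # request available as context.
    async for chunk in stream:
        _ = (formatter, request.tools)
        yield chunk


async def _demo() -> None:
    request = FakeChatCompletionRequest(messages=["hello"])

    async def fake_stream() -> AsyncGenerator[str, None]:
        for piece in ("Hel", "lo"):
            yield piece

    print(process_chat_completion_response({"choices": []}, None, request))
    async for chunk in process_chat_completion_stream_response(fake_stream(), None, request):
        print(chunk)


if __name__ == "__main__":
    asyncio.run(_demo())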