Mirror of https://github.com/meta-llama/llama-stack.git, synced 2025-08-07 02:58:21 +00:00
Make utils non-public
Signed-off-by: Yuan Tang <terrytangyuan@gmail.com>
Parent: b2a86532a2
Commit: 34366f0b01
2 changed files with 9 additions and 7 deletions
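
Making the helpers "non-public" here just means renaming them with a leading underscore, Python's convention for module-internal names: such names are skipped by `from module import *` (when no `__all__` is defined) and signal to callers and linters that they are not part of the provider's supported surface. A minimal, self-contained sketch of the convention (the names below are illustrative, not from the repository):

    # underscore_convention.py -- illustrative sketch, not llama-stack code.

    def _normalize_reason(reason: str) -> str:
        """Leading underscore: internal helper, intended for use within this module only."""
        return reason.strip().lower()

    def describe_stop(reason: str) -> str:
        """Public function: the supported entry point that callers should import."""
        return f"model stopped because of: {_normalize_reason(reason)}"

    # `from underscore_convention import *` binds only `describe_stop`;
    # names starting with "_" are excluded when __all__ is not defined.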

First changed file:

@@ -69,7 +69,7 @@ def build_model_aliases():
     ]


-def convert_to_vllm_tool_calls_in_response(
+def _convert_to_vllm_tool_calls_in_response(
     tool_calls,
 ) -> List[ToolCall]:
     if not tool_calls:
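
The body of this helper is mostly elided in the hunk; only the signature and the empty-input check are visible. As a rough sketch of what a response-side converter like this typically does, the snippet below maps provider tool-call objects into the fields a llama-stack ToolCall carries, using plain dicts so as not to guess the ToolCall constructor; the `call.id` / `call.function.*` attribute names follow the OpenAI response schema and are assumptions, not taken from this diff:

    # Hedged sketch only -- provider-side field names are assumptions.
    import json
    from typing import List

    def _convert_tool_calls_sketch(tool_calls) -> List[dict]:
        """Map provider tool-call objects into ToolCall-shaped dicts."""
        if not tool_calls:
            return []
        return [
            {
                "call_id": call.id,                      # assumed OpenAI-style field
                "tool_name": call.function.name,         # assumed OpenAI-style field
                "arguments": json.loads(call.function.arguments),  # arguments arrive as a JSON string
            }
            for call in tool_calls
        ]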

@@ -89,7 +89,7 @@ def convert_to_vllm_tool_calls_in_response(
     ]


-def convert_to_vllm_tools_in_request(tools: List[ToolDefinition]) -> List[dict]:
+def _convert_to_vllm_tools_in_request(tools: List[ToolDefinition]) -> List[dict]:
     if tools is None:
         return tools

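
Going the other direction, the request-side converter turns llama-stack ToolDefinition objects into the list-of-dicts tool format that an OpenAI-compatible endpoint such as vLLM's expects. A minimal sketch of that target shape, with the parameter schema simplified and the ToolDefinition attribute names assumed rather than read from this diff:

    # Hedged sketch -- ToolDefinition attribute access is assumed, not from this diff.
    from typing import List, Optional

    def _tools_to_openai_sketch(tools: Optional[List]) -> Optional[List[dict]]:
        if tools is None:
            return tools
        compat = []
        for tool in tools:
            compat.append(
                {
                    "type": "function",
                    "function": {
                        "name": str(tool.tool_name),            # assumed attribute
                        "description": tool.description or "",  # assumed attribute
                        "parameters": {"type": "object", "properties": {}},  # simplified
                    },
                }
            )
        return compat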

@@ -128,7 +128,7 @@ def convert_to_vllm_tools_in_request(tools: List[ToolDefinition]) -> List[dict]:
     return None


-def convert_to_vllm_finish_reason(finish_reason: str) -> StopReason:
+def _convert_to_vllm_finish_reason(finish_reason: str) -> StopReason:
     return {
         "stop": StopReason.end_of_turn,
         "length": StopReason.out_of_tokens,
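
The hunk shows the first two entries of the finish-reason mapping but cuts off before the end of the function, so the fallback below is an assumption, not the repository's actual behavior; the stand-in StopReason enum is likewise only there to make the sketch self-contained:

    # Hedged sketch: the two mappings come from the hunk above; the .get() fallback is assumed.
    from enum import Enum

    class StopReason(Enum):  # stand-in for llama-stack's StopReason
        end_of_turn = "end_of_turn"
        out_of_tokens = "out_of_tokens"

    def _convert_finish_reason_sketch(finish_reason: str) -> StopReason:
        return {
            "stop": StopReason.end_of_turn,
            "length": StopReason.out_of_tokens,
        }.get(finish_reason, StopReason.end_of_turn)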

@@ -214,8 +214,8 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
         result = ChatCompletionResponse(
             completion_message=CompletionMessage(
                 content=choice.message.content or "",
-                stop_reason=convert_to_vllm_finish_reason(choice.finish_reason),
-                tool_calls=convert_to_vllm_tool_calls_in_response(choice.message.tool_calls),
+                stop_reason=_convert_to_vllm_finish_reason(choice.finish_reason),
+                tool_calls=_convert_to_vllm_tool_calls_in_response(choice.message.tool_calls),
             ),
             logprobs=None,
         )

@@ -269,7 +269,9 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
         if "max_tokens" not in options:
             options["max_tokens"] = self.config.max_tokens

-        input_dict = {"tools": convert_to_vllm_tools_in_request(request.tools)}
+        input_dict = {}
+        if isinstance(request, ChatCompletionRequest) and request.tools is not None:
+            input_dict = {"tools": _convert_to_vllm_tools_in_request(request.tools)}

         if isinstance(request, ChatCompletionRequest):
             input_dict["messages"] = [await convert_message_to_openai_dict(m, download=True) for m in request.messages]
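
This is the one hunk that changes behavior rather than just names: previously `input_dict` was built by reading `request.tools` unconditionally, while the new code adds a "tools" key only when the request is a ChatCompletionRequest that actually carries tools (the new guard suggests plain completion requests do not define a `tools` attribute at all). A small standalone sketch of that guard, using simplified stand-in request classes rather than llama-stack's real types:

    # Simplified stand-ins to illustrate the guard; not llama-stack's real classes.
    from dataclasses import dataclass
    from typing import List, Optional

    @dataclass
    class ChatRequestSketch:
        messages: List[str]
        tools: Optional[List[dict]] = None

    @dataclass
    class CompletionRequestSketch:
        content: str  # no `tools` attribute at all

    def build_params_sketch(request) -> dict:
        params = {}
        # Guarded access: only chat requests may define tools, and only a
        # non-None value should produce a "tools" key in the payload.
        if isinstance(request, ChatRequestSketch) and request.tools is not None:
            params["tools"] = request.tools
        return params

    assert build_params_sketch(CompletionRequestSketch("hi")) == {}
    assert build_params_sketch(ChatRequestSketch(["hi"])) == {}
    assert "tools" in build_params_sketch(ChatRequestSketch(["hi"], tools=[{"type": "function"}]))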

Second changed file:

@@ -174,7 +174,7 @@ def process_chat_completion_response(
 ) -> ChatCompletionResponse:
     choice = response.choices[0]

-    # TODO: This does not work well with tool calls (at least for vLLM remote)
+    # TODO: This does not work well with tool calls for vLLM remote provider
     raw_message = formatter.decode_assistant_message_from_content(
         text_from_choice(choice), get_stop_reason(choice.finish_reason)
     )
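
Only the TODO wording changes here, but the issue it points at is worth spelling out: this code path rebuilds the assistant message by parsing the choice's raw text, whereas an OpenAI-compatible server such as vLLM usually returns tool calls in a structured `tool_calls` field with little or no text content, leaving a text parser nothing to recover. The snippet below illustrates that response shape with plain dicts; the field names follow the OpenAI chat-completion schema, not llama-stack code:

    # Illustrative OpenAI-style chat completion choice; field names follow the
    # OpenAI response schema, not llama-stack code.
    choice = {
        "finish_reason": "tool_calls",
        "message": {
            "role": "assistant",
            "content": None,  # no raw text to decode when the model calls a tool
            "tool_calls": [
                {
                    "id": "call_0",
                    "type": "function",
                    "function": {"name": "get_weather", "arguments": "{\"city\": \"Paris\"}"},
                }
            ],
        },
    }

    # A text-based decoder would see an empty string here and miss the tool call;
    # that is why the vLLM adapter above converts `message.tool_calls` directly.
    text = choice["message"]["content"] or ""
    assert text == ""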