Support sys_prompt behavior in inference (#937)

# What does this PR do?

The current default system prompt for Llama 3.2 over-indexes on tool calling and doesn't work well when the prompt does not require tool calling.

This PR adds an option to override the default system prompt and consolidates the tool-related arguments into a new `ToolConfig` object.
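
For reference, here is a minimal sketch of the new call shape, based on the adapter changes in the diff below. `ToolConfig`, `ToolChoice.auto`, and `ToolPromptFormat.json` appear in the diff; the `UserMessage` import and the exact field used for the system-prompt override are assumptions for illustration, not confirmed by this diff.

```python
# Sketch only -- names not shown in the diff below are assumptions.
from llama_stack.apis.inference import (
    ChatCompletionRequest,
    ToolChoice,
    ToolConfig,
    ToolPromptFormat,
    UserMessage,
)

# Tool-related options now live on a single ToolConfig object.
tool_config = ToolConfig(
    tool_choice=ToolChoice.auto,
    tool_prompt_format=ToolPromptFormat.json,
    # The system-prompt override this PR adds would also be configured here;
    # its exact field name is not shown in this diff.
)

# Adapters now forward tool_config instead of passing the individual
# tool_choice / tool_prompt_format arguments at the top level.
request = ChatCompletionRequest(
    model="meta-llama/Llama-3.2-3B-Instruct",
    messages=[UserMessage(content="What is the capital of France?")],
    tools=[],
    tool_config=tool_config,
)
```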

- [ ] Addresses issue (#issue)


## Test Plan

python -m unittest llama_stack.providers.tests.inference.test_prompt_adapter


## Sources

Please link relevant resources if necessary.


## Before submitting

- [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case).
- [ ] Ran pre-commit to handle lint / formatting issues.
- [ ] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md), Pull Request section?
- [ ] Updated relevant documentation.
- [ ] Wrote necessary unit or integration tests.
---
[//]: # (BEGIN SAPLING FOOTER)
Stack created with [Sapling](https://sapling-scm.com). Best reviewed with [ReviewStack](https://reviewstack.dev/meta-llama/llama-stack/pull/937).
* #938
* __->__ #937
ehhuang 2025-02-03 23:35:16 -08:00 committed by GitHub
parent 62cd3c391e
commit c9ab72fa82
25 changed files with 308 additions and 48 deletions


@@ -24,6 +24,7 @@ from llama_stack.apis.inference import (
ResponseFormat,
SamplingParams,
ToolChoice,
ToolConfig,
ToolDefinition,
ToolPromptFormat,
)
@@ -102,6 +103,7 @@ class BedrockInferenceAdapter(ModelRegistryHelper, Inference):
tool_prompt_format: Optional[ToolPromptFormat] = None,
stream: Optional[bool] = False,
logprobs: Optional[LogProbConfig] = None,
tool_config: Optional[ToolConfig] = None,
) -> Union[ChatCompletionResponse, AsyncIterator[ChatCompletionResponseStreamChunk]]:
model = await self.model_store.get_model(model_id)
request = ChatCompletionRequest(
@@ -109,11 +111,10 @@ class BedrockInferenceAdapter(ModelRegistryHelper, Inference):
messages=messages,
sampling_params=sampling_params,
tools=tools or [],
tool_choice=tool_choice,
tool_prompt_format=tool_prompt_format,
response_format=response_format,
stream=stream,
logprobs=logprobs,
tool_config=tool_config,
)
if stream:


@@ -24,6 +24,7 @@ from llama_stack.apis.inference import (
ResponseFormat,
SamplingParams,
ToolChoice,
ToolConfig,
ToolDefinition,
ToolPromptFormat,
)
@@ -128,6 +129,7 @@ class CerebrasInferenceAdapter(ModelRegistryHelper, Inference):
response_format: Optional[ResponseFormat] = None,
stream: Optional[bool] = False,
logprobs: Optional[LogProbConfig] = None,
tool_config: Optional[ToolConfig] = None,
) -> AsyncGenerator:
model = await self.model_store.get_model(model_id)
request = ChatCompletionRequest(
@@ -140,6 +142,7 @@ class CerebrasInferenceAdapter(ModelRegistryHelper, Inference):
response_format=response_format,
stream=stream,
logprobs=logprobs,
tool_config=tool_config,
)
if stream:


@@ -89,16 +89,16 @@ class DatabricksInferenceAdapter(ModelRegistryHelper, Inference):
tool_prompt_format: Optional[ToolPromptFormat] = None,
stream: Optional[bool] = False,
logprobs: Optional[LogProbConfig] = None,
tool_config: Optional[ToolConfig] = None,
) -> AsyncGenerator:
request = ChatCompletionRequest(
model=model,
messages=messages,
sampling_params=sampling_params,
tools=tools or [],
tool_choice=tool_choice,
tool_prompt_format=tool_prompt_format,
stream=stream,
logprobs=logprobs,
tool_config=tool_config,
)
client = OpenAI(base_url=self.config.url, api_key=self.config.api_token)


@@ -25,6 +25,7 @@ from llama_stack.apis.inference import (
ResponseFormatType,
SamplingParams,
ToolChoice,
ToolConfig,
ToolDefinition,
ToolPromptFormat,
)
@@ -204,6 +205,7 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProv
response_format: Optional[ResponseFormat] = None,
stream: Optional[bool] = False,
logprobs: Optional[LogProbConfig] = None,
tool_config: Optional[ToolConfig] = None,
) -> AsyncGenerator:
model = await self.model_store.get_model(model_id)
request = ChatCompletionRequest(
@@ -211,11 +213,10 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProv
messages=messages,
sampling_params=sampling_params,
tools=tools or [],
tool_choice=tool_choice,
tool_prompt_format=tool_prompt_format,
response_format=response_format,
stream=stream,
logprobs=logprobs,
tool_config=tool_config,
)
if stream:


@@ -99,6 +99,7 @@ class GroqInferenceAdapter(Inference, ModelRegistryHelper, NeedsRequestProviderD
tool_prompt_format: Optional[ToolPromptFormat] = None,
stream: Optional[bool] = False,
logprobs: Optional[LogProbConfig] = None,
tool_config: Optional[ToolConfig] = None,
) -> Union[ChatCompletionResponse, AsyncIterator[ChatCompletionResponseStreamChunk]]:
model_id = self.get_provider_model_id(model_id)
if model_id == "llama-3.2-3b-preview":
@@ -115,10 +116,9 @@ class GroqInferenceAdapter(Inference, ModelRegistryHelper, NeedsRequestProviderD
sampling_params=sampling_params,
response_format=response_format,
tools=tools,
tool_choice=tool_choice,
tool_prompt_format=tool_prompt_format,
stream=stream,
logprobs=logprobs,
tool_config=tool_config,
)
)


@@ -79,7 +79,7 @@ def convert_chat_completion_request(
# so we exclude it for now
warnings.warn("repetition_penalty is not supported")
if request.tool_prompt_format != ToolPromptFormat.json:
if request.tool_config.tool_prompt_format != ToolPromptFormat.json:
warnings.warn("tool_prompt_format is not used by Groq. Ignoring.")
sampling_options = get_sampling_strategy_options(request.sampling_params)
@@ -93,7 +93,7 @@ def convert_chat_completion_request(
temperature=sampling_options.get("temperature", 1.0),
top_p=sampling_options.get("top_p", 1.0),
tools=[_convert_groq_tool_definition(tool) for tool in request.tools or []],
tool_choice=request.tool_choice.value if request.tool_choice else None,
tool_choice=(request.tool_config.tool_choice.value if request.tool_config.tool_choice else None),
)


@@ -171,6 +171,7 @@ class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper):
tool_prompt_format: Optional[ToolPromptFormat] = None,
stream: Optional[bool] = False,
logprobs: Optional[LogProbConfig] = None,
tool_config: Optional[ToolConfig] = None,
) -> Union[ChatCompletionResponse, AsyncIterator[ChatCompletionResponseStreamChunk]]:
if tool_prompt_format:
warnings.warn("tool_prompt_format is not supported by NVIDIA NIM, ignoring")
@@ -184,10 +185,9 @@ class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper):
sampling_params=sampling_params,
response_format=response_format,
tools=tools,
tool_choice=tool_choice,
tool_prompt_format=tool_prompt_format,
stream=stream,
logprobs=logprobs,
tool_config=tool_config,
),
n=1,
)


@@ -282,9 +282,9 @@ async def convert_chat_completion_request(
if request.tools:
payload.update(tools=[_convert_tooldef_to_openai_tool(tool) for tool in request.tools])
if request.tool_choice:
if request.tool_config.tool_choice:
payload.update(
tool_choice=request.tool_choice.value
tool_choice=request.tool_config.tool_choice.value
) # we cannot include tool_choice w/o tools, server will complain
if request.logprobs:


@@ -29,6 +29,7 @@ from llama_stack.apis.inference import (
ResponseFormat,
SamplingParams,
ToolChoice,
ToolConfig,
ToolDefinition,
ToolPromptFormat,
)
@@ -224,6 +225,7 @@ class OllamaInferenceAdapter(Inference, ModelsProtocolPrivate):
tool_prompt_format: Optional[ToolPromptFormat] = None,
stream: Optional[bool] = False,
logprobs: Optional[LogProbConfig] = None,
tool_config: Optional[ToolConfig] = None,
) -> AsyncGenerator:
model = await self.model_store.get_model(model_id)
request = ChatCompletionRequest(
@@ -231,11 +233,10 @@ class OllamaInferenceAdapter(Inference, ModelsProtocolPrivate):
messages=messages,
sampling_params=sampling_params,
tools=tools or [],
tool_choice=tool_choice,
tool_prompt_format=tool_prompt_format,
stream=stream,
logprobs=logprobs,
response_format=response_format,
tool_config=tool_config,
)
if stream:
return self._stream_chat_completion(request)


@@ -83,10 +83,9 @@ class RunpodInferenceAdapter(ModelRegistryHelper, Inference):
messages=messages,
sampling_params=sampling_params,
tools=tools or [],
tool_choice=tool_choice,
tool_prompt_format=tool_prompt_format,
stream=stream,
logprobs=logprobs,
tool_config=tool_config,
)
client = OpenAI(base_url=self.config.url, api_key=self.config.api_token)


@@ -125,10 +125,9 @@ class SambaNovaInferenceAdapter(ModelRegistryHelper, Inference):
messages=messages,
sampling_params=sampling_params,
tools=tools or [],
tool_choice=tool_choice,
tool_prompt_format=tool_prompt_format,
stream=stream,
logprobs=logprobs,
tool_config=tool_config,
)
request_sambanova = await self.convert_chat_completion_request(request)


@@ -26,6 +26,7 @@ from llama_stack.apis.inference import (
ResponseFormatType,
SamplingParams,
ToolChoice,
ToolConfig,
ToolDefinition,
ToolPromptFormat,
)
@@ -205,6 +206,7 @@ class _HfAdapter(Inference, ModelsProtocolPrivate):
response_format: Optional[ResponseFormat] = None,
stream: Optional[bool] = False,
logprobs: Optional[LogProbConfig] = None,
tool_config: Optional[ToolConfig] = None,
) -> AsyncGenerator:
model = await self.model_store.get_model(model_id)
request = ChatCompletionRequest(
@@ -212,11 +214,10 @@ class _HfAdapter(Inference, ModelsProtocolPrivate):
messages=messages,
sampling_params=sampling_params,
tools=tools or [],
tool_choice=tool_choice,
tool_prompt_format=tool_prompt_format,
response_format=response_format,
stream=stream,
logprobs=logprobs,
tool_config=tool_config,
)
if stream:


@@ -24,6 +24,7 @@ from llama_stack.apis.inference import (
ResponseFormatType,
SamplingParams,
ToolChoice,
ToolConfig,
ToolDefinition,
ToolPromptFormat,
)
@@ -194,6 +195,7 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi
response_format: Optional[ResponseFormat] = None,
stream: Optional[bool] = False,
logprobs: Optional[LogProbConfig] = None,
tool_config: Optional[ToolConfig] = None,
) -> AsyncGenerator:
model = await self.model_store.get_model(model_id)
request = ChatCompletionRequest(
@@ -201,11 +203,10 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi
messages=messages,
sampling_params=sampling_params,
tools=tools or [],
tool_choice=tool_choice,
tool_prompt_format=tool_prompt_format,
response_format=response_format,
stream=stream,
logprobs=logprobs,
tool_config=tool_config,
)
if stream:


@@ -27,6 +27,7 @@ from llama_stack.apis.inference import (
ResponseFormatType,
SamplingParams,
ToolChoice,
ToolConfig,
ToolDefinition,
ToolPromptFormat,
)
@@ -119,6 +120,7 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
tool_prompt_format: Optional[ToolPromptFormat] = None,
stream: Optional[bool] = False,
logprobs: Optional[LogProbConfig] = None,
tool_config: Optional[ToolConfig] = None,
) -> AsyncGenerator:
model = await self.model_store.get_model(model_id)
request = ChatCompletionRequest(
@@ -126,11 +128,10 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
messages=messages,
sampling_params=sampling_params,
tools=tools or [],
tool_choice=tool_choice,
tool_prompt_format=tool_prompt_format,
stream=stream,
logprobs=logprobs,
response_format=response_format,
tool_config=tool_config,
)
if stream:
return self._stream_chat_completion(request, self.client)