Update Strategy in SamplingParams to be a union

2026-01-02 10:24:31 +00:00 · 2025-01-14 15:56:02 -08:00 · 2025-01-14 15:56:02 -08:00 · dea575c994
commit dea575c994
parent 300e6e2702
28 changed files with 600 additions and 377 deletions
--- a/llama_stack/providers/remote/inference/bedrock/bedrock.py
+++ b/llama_stack/providers/remote/inference/bedrock/bedrock.py
@@ -34,6 +34,7 @@ from llama_stack.providers.utils.inference.model_registry import (
    ModelRegistryHelper,
 )
 from llama_stack.providers.utils.inference.openai_compat import (
+    get_sampling_strategy_options,
    OpenAICompatCompletionChoice,
    OpenAICompatCompletionResponse,
    process_chat_completion_response,
@@ -166,16 +167,13 @@ class BedrockInferenceAdapter(ModelRegistryHelper, Inference):
    ) -> Dict:
        bedrock_model = request.model

-        inference_config = {}
-        param_mapping = {
-            "max_tokens": "max_gen_len",
-            "temperature": "temperature",
-            "top_p": "top_p",
-        }
+        sampling_params = request.sampling_params
+        options = get_sampling_strategy_options(sampling_params)

-        for k, v in param_mapping.items():
-            if getattr(request.sampling_params, k):
-                inference_config[v] = getattr(request.sampling_params, k)
+        if sampling_params.max_tokens:
+            options["max_gen_len"] = sampling_params.max_tokens
+        if sampling_params.repetition_penalty > 0:
+            options["repetition_penalty"] = sampling_params.repetition_penalty

        prompt = await chat_completion_request_to_prompt(
            request, self.get_llama_model(request.model), self.formatter
@@ -185,7 +183,7 @@ class BedrockInferenceAdapter(ModelRegistryHelper, Inference):
            "body": json.dumps(
                {
                    "prompt": prompt,
-                    **inference_config,
+                    **options,
                }
            ),
        }
--- a/llama_stack/providers/remote/inference/cerebras/cerebras.py
+++ b/llama_stack/providers/remote/inference/cerebras/cerebras.py
@@ -9,6 +9,7 @@ from typing import AsyncGenerator, List, Optional, Union
 from cerebras.cloud.sdk import AsyncCerebras
 from llama_models.datatypes import CoreModelId
 from llama_models.llama3.api.chat_format import ChatFormat
+from llama_models.llama3.api.datatypes import TopKSamplingStrategy
 from llama_models.llama3.api.tokenizer import Tokenizer

 from llama_stack.apis.common.content_types import InterleavedContent
@@ -172,7 +173,9 @@ class CerebrasInferenceAdapter(ModelRegistryHelper, Inference):
    async def _get_params(
        self, request: Union[ChatCompletionRequest, CompletionRequest]
    ) -> dict:
-        if request.sampling_params and request.sampling_params.top_k:
+        if request.sampling_params and isinstance(
+            request.sampling_params.strategy, TopKSamplingStrategy
+        ):
            raise ValueError("`top_k` not supported by Cerebras")

        prompt = ""
--- a/llama_stack/providers/remote/inference/groq/groq_utils.py
+++ b/llama_stack/providers/remote/inference/groq/groq_utils.py
@@ -48,6 +48,9 @@ from llama_stack.apis.inference import (
    ToolDefinition,
    ToolPromptFormat,
 )
+from llama_stack.providers.utils.inference.openai_compat import (
+    get_sampling_strategy_options,
+)


 def convert_chat_completion_request(
@@ -77,6 +80,7 @@ def convert_chat_completion_request(
    if request.tool_prompt_format != ToolPromptFormat.json:
        warnings.warn("tool_prompt_format is not used by Groq. Ignoring.")

+    sampling_options = get_sampling_strategy_options(request.sampling_params)
    return CompletionCreateParams(
        model=request.model,
        messages=[_convert_message(message) for message in request.messages],
@@ -84,8 +88,8 @@ def convert_chat_completion_request(
        frequency_penalty=None,
        stream=request.stream,
        max_tokens=request.sampling_params.max_tokens or None,
-        temperature=request.sampling_params.temperature,
-        top_p=request.sampling_params.top_p,
+        temperature=sampling_options.get("temperature", 1.0),
+        top_p=sampling_options.get("top_p", 1.0),
        tools=[_convert_groq_tool_definition(tool) for tool in request.tools or []],
        tool_choice=request.tool_choice.value if request.tool_choice else None,
    )
--- a/llama_stack/providers/remote/inference/nvidia/openai_utils.py
+++ b/llama_stack/providers/remote/inference/nvidia/openai_utils.py
@@ -263,19 +263,18 @@ def convert_chat_completion_request(
        if request.sampling_params.max_tokens:
            payload.update(max_tokens=request.sampling_params.max_tokens)

-        if request.sampling_params.strategy == "top_p":
+        strategy = request.sampling_params.strategy
+        if isinstance(strategy, TopPSamplingStrategy):
            nvext.update(top_k=-1)
-            payload.update(top_p=request.sampling_params.top_p)
-        elif request.sampling_params.strategy == "top_k":
-            if (
-                request.sampling_params.top_k != -1
-                and request.sampling_params.top_k < 1
-            ):
+            payload.update(top_p=strategy.top_p)
+            payload.update(temperature=strategy.temperature)
+        elif isinstance(strategy, TopKSamplingStrategy):
+            if strategy.top_k != -1 and strategy.top_k < 1:
                warnings.warn("top_k must be -1 or >= 1")
-            nvext.update(top_k=request.sampling_params.top_k)
-        elif request.sampling_params.strategy == "greedy":
+            nvext.update(top_k=strategy.top_k)
+        elif strategy.strategy == "greedy":
            nvext.update(top_k=-1)
-            payload.update(temperature=request.sampling_params.temperature)
+            payload.update(temperature=strategy.temperature)

    return payload