Luis Tomas Bolivar 2025-10-03 12:12:16 -07:00 committed by GitHub
commit dba0a47a3f
8 changed files with 3 additions and 7 deletions


@@ -4218,7 +4218,6 @@
    },
    "max_tokens": {
        "type": "integer",
-       "default": 0,
        "description": "The maximum number of tokens that can be generated in the completion. The token count of your prompt plus max_tokens cannot exceed the model's context length."
    },
    "repetition_penalty": {


@@ -3068,7 +3068,6 @@ components:
        description: The sampling strategy.
      max_tokens:
        type: integer
-       default: 0
        description: >-
          The maximum number of tokens that can be generated in the completion.
          The token count of your prompt plus max_tokens cannot exceed the model's


@@ -2713,7 +2713,6 @@
    },
    "max_tokens": {
        "type": "integer",
-       "default": 0,
        "description": "The maximum number of tokens that can be generated in the completion. The token count of your prompt plus max_tokens cannot exceed the model's context length."
    },
    "repetition_penalty": {


@@ -1927,7 +1927,6 @@ components:
        description: The sampling strategy.
      max_tokens:
        type: integer
-       default: 0
        description: >-
          The maximum number of tokens that can be generated in the completion.
          The token count of your prompt plus max_tokens cannot exceed the model's


@@ -15437,7 +15437,6 @@
    },
    "max_tokens": {
        "type": "integer",
-       "default": 0,
        "description": "The maximum number of tokens that can be generated in the completion. The token count of your prompt plus max_tokens cannot exceed the model's context length."
    },
    "repetition_penalty": {


@@ -11477,7 +11477,6 @@ components:
        description: The sampling strategy.
      max_tokens:
        type: integer
-       default: 0
        description: >-
          The maximum number of tokens that can be generated in the completion.
          The token count of your prompt plus max_tokens cannot exceed the model's
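
The same one-line change lands in each of the generated specs above (JSON and YAML variants alike): with "default": 0 removed, max_tokens is genuinely optional rather than defaulting to a zero-token budget. A minimal sketch of the wire-level effect, using a hypothetical Pydantic v2 stand-in for the schema (not the generated client itself):

    from pydantic import BaseModel

    class CompletionRequest(BaseModel):
        # Mirrors the updated spec: no schema default, so unset stays None.
        temperature: float | None = None
        max_tokens: int | None = None

    # Under the old spec, a generated client could fill in max_tokens=0 and
    # send it on every request; now an unset field is simply omitted, letting
    # the serving provider apply its own limit.
    req = CompletionRequest(temperature=0.0)
    print(req.model_dump_json(exclude_none=True))  # {"temperature":0.0}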


@@ -96,7 +96,7 @@ class SamplingParams(BaseModel):
    strategy: SamplingStrategy = Field(default_factory=GreedySamplingStrategy)
-   max_tokens: int | None = 0
+   max_tokens: int | None = None
    repetition_penalty: float | None = 1.0
    stop: list[str] | None = None
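
Switching the Pydantic default from 0 to None lets the model distinguish "caller never set max_tokens" from "caller asked for zero tokens", so providers can fall back to their own limits. A sketch of how an adapter might consume the new default (the helper and its default value are illustrative, not llama-stack's actual code):

    def resolve_max_tokens(requested: int | None, provider_default: int = 4096) -> int:
        # None now means "unset": defer to the provider's configured limit.
        # The old default of 0 was forwarded literally, which providers treat
        # as a zero-token budget (or reject outright).
        return provider_default if requested is None else requested

    assert resolve_max_tokens(None) == 4096
    assert resolve_max_tokens(512) == 512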


@@ -55,6 +55,7 @@ def test_evaluate_rows(llama_stack_client, text_model_id, scoring_fn_id):
            "model": text_model_id,
            "sampling_params": {
                "temperature": 0.0,
+               "max_tokens": 512,
            },
        },
    },
@@ -88,6 +89,7 @@ def test_evaluate_benchmark(llama_stack_client, text_model_id, scoring_fn_id):
            "model": text_model_id,
            "sampling_params": {
                "temperature": 0.0,
+               "max_tokens": 512,
            },
        },
    },
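
With no default left at either the schema or model layer, the eval tests now pin max_tokens explicitly so generation length stays bounded regardless of any provider default. A quick check of the resulting params object, using a trimmed stand-in for SamplingParams (strategy omitted):

    from pydantic import BaseModel

    class SamplingParams(BaseModel):
        max_tokens: int | None = None
        temperature: float | None = None

    # What the updated tests send: an explicit cap instead of a schema default.
    params = SamplingParams(temperature=0.0, max_tokens=512)
    assert params.max_tokens == 512
    assert SamplingParams().max_tokens is None  # unset -> provider decides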