Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-10-03 19:57:35 +00:00)
Fix BadRequestError due to invalid max_tokens
This patch ensures that if max_tokens is not defined, it is set to None. This prevents failures in providers that do not guard against max_tokens being set to 0. Issue: #3666
This commit is contained in:
parent
4dfbe46954
commit
43fb18928b
8 changed files with 3 additions and 7 deletions
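For context, here is a minimal sketch (not part of this commit) of the failure mode the new default avoids. When SamplingParams.max_tokens defaulted to 0, an adapter that forwards the value verbatim would send max_tokens=0 to the backend, and some OpenAI-compatible providers reject that with a 400 BadRequestError; with a default of None the field can simply be omitted. The build_request_params helper below is hypothetical and only illustrates the pattern.

```python
from typing import Any


def build_request_params(model: str, prompt: str, max_tokens: int | None) -> dict[str, Any]:
    """Hypothetical adapter helper: only forward max_tokens when the caller set it."""
    params: dict[str, Any] = {"model": model, "prompt": prompt}
    if max_tokens is not None:
        # With the old default of 0, this branch would send max_tokens=0 and some
        # providers rejected the request; with a default of None it is skipped.
        params["max_tokens"] = max_tokens
    return params


# With the new default, an unset max_tokens is simply left out of the request body.
assert "max_tokens" not in build_request_params("my-model", "hello", None)
assert build_request_params("my-model", "hello", 512)["max_tokens"] == 512
```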
docs/static/deprecated-llama-stack-spec.html (vendored): 1 deletion

@@ -4218,7 +4218,6 @@
             },
             "max_tokens": {
               "type": "integer",
-              "default": 0,
               "description": "The maximum number of tokens that can be generated in the completion. The token count of your prompt plus max_tokens cannot exceed the model's context length."
             },
             "repetition_penalty": {
docs/static/deprecated-llama-stack-spec.yaml (vendored): 1 deletion

@@ -3068,7 +3068,6 @@ components:
           description: The sampling strategy.
         max_tokens:
           type: integer
-          default: 0
           description: >-
             The maximum number of tokens that can be generated in the completion.
             The token count of your prompt plus max_tokens cannot exceed the model's
@@ -2713,7 +2713,6 @@
             },
             "max_tokens": {
               "type": "integer",
-              "default": 0,
               "description": "The maximum number of tokens that can be generated in the completion. The token count of your prompt plus max_tokens cannot exceed the model's context length."
             },
             "repetition_penalty": {
@@ -1927,7 +1927,6 @@ components:
           description: The sampling strategy.
         max_tokens:
           type: integer
-          default: 0
           description: >-
             The maximum number of tokens that can be generated in the completion.
             The token count of your prompt plus max_tokens cannot exceed the model's
docs/static/stainless-llama-stack-spec.html (vendored): 1 deletion

@@ -14753,7 +14753,6 @@
             },
             "max_tokens": {
               "type": "integer",
-              "default": 0,
               "description": "The maximum number of tokens that can be generated in the completion. The token count of your prompt plus max_tokens cannot exceed the model's context length."
             },
             "repetition_penalty": {
docs/static/stainless-llama-stack-spec.yaml (vendored): 1 deletion

@@ -10909,7 +10909,6 @@ components:
           description: The sampling strategy.
         max_tokens:
           type: integer
-          default: 0
           description: >-
             The maximum number of tokens that can be generated in the completion.
             The token count of your prompt plus max_tokens cannot exceed the model's
@@ -96,7 +96,7 @@ class SamplingParams(BaseModel):
     strategy: SamplingStrategy = Field(default_factory=GreedySamplingStrategy)
 
-    max_tokens: int | None = 0
+    max_tokens: int | None = None
     repetition_penalty: float | None = 1.0
     stop: list[str] | None = None
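As a quick check of the new default, here is a trimmed sketch of the model (assuming pydantic, and omitting the strategy, penalty, and stop plumbing of the real class):

```python
from pydantic import BaseModel


class SamplingParams(BaseModel):
    # Trimmed reproduction of the relevant field after this change; the real
    # class also carries a sampling strategy, repetition_penalty, and stop list.
    max_tokens: int | None = None


params = SamplingParams()
assert params.max_tokens is None  # previously this defaulted to 0
assert SamplingParams(max_tokens=512).max_tokens == 512
```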
@@ -55,6 +55,7 @@ def test_evaluate_rows(llama_stack_client, text_model_id, scoring_fn_id):
                 "model": text_model_id,
                 "sampling_params": {
                     "temperature": 0.0,
+                    "max_tokens": 512,
                 },
             },
         },

@@ -88,6 +89,7 @@ def test_evaluate_benchmark(llama_stack_client, text_model_id, scoring_fn_id):
                 "model": text_model_id,
                 "sampling_params": {
                     "temperature": 0.0,
+                    "max_tokens": 512,
                 },
             },
         },