From f7243a1e2cf204ecf5d1b3a2d24b6e29b97759a1 Mon Sep 17 00:00:00 2001
From: Luis Tomas Bolivar
Date: Fri, 3 Oct 2025 17:37:04 +0200
Subject: [PATCH] Fix BadRequestError due to invalid max_tokens

This patch ensures that if max_tokens is not defined it is set to None.
This avoids failures with some providers, which have no protection
against it being set to 0.

Issue: #3666
---
 docs/static/deprecated-llama-stack-spec.html   | 1 -
 docs/static/deprecated-llama-stack-spec.yaml   | 1 -
 docs/static/experimental-llama-stack-spec.html | 1 -
 docs/static/experimental-llama-stack-spec.yaml | 1 -
 docs/static/stainless-llama-stack-spec.html    | 1 -
 docs/static/stainless-llama-stack-spec.yaml    | 1 -
 llama_stack/apis/inference/inference.py        | 2 +-
 tests/integration/eval/test_eval.py            | 2 ++
 8 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/docs/static/deprecated-llama-stack-spec.html b/docs/static/deprecated-llama-stack-spec.html
index e3e182dd7..8f789aa62 100644
--- a/docs/static/deprecated-llama-stack-spec.html
+++ b/docs/static/deprecated-llama-stack-spec.html
@@ -3901,7 +3901,6 @@
         },
         "max_tokens": {
           "type": "integer",
-          "default": 0,
           "description": "The maximum number of tokens that can be generated in the completion. The token count of your prompt plus max_tokens cannot exceed the model's context length."
         },
         "repetition_penalty": {
diff --git a/docs/static/deprecated-llama-stack-spec.yaml b/docs/static/deprecated-llama-stack-spec.yaml
index 6b5b8230a..afad17d98 100644
--- a/docs/static/deprecated-llama-stack-spec.yaml
+++ b/docs/static/deprecated-llama-stack-spec.yaml
@@ -2862,7 +2862,6 @@ components:
           description: The sampling strategy.
         max_tokens:
           type: integer
-          default: 0
           description: >-
             The maximum number of tokens that can be generated in the completion.
             The token count of your prompt plus max_tokens cannot exceed the model's
diff --git a/docs/static/experimental-llama-stack-spec.html b/docs/static/experimental-llama-stack-spec.html
index ab474180e..2ad81d4f2 100644
--- a/docs/static/experimental-llama-stack-spec.html
+++ b/docs/static/experimental-llama-stack-spec.html
@@ -2376,7 +2376,6 @@
         },
         "max_tokens": {
           "type": "integer",
-          "default": 0,
           "description": "The maximum number of tokens that can be generated in the completion. The token count of your prompt plus max_tokens cannot exceed the model's context length."
         },
         "repetition_penalty": {
diff --git a/docs/static/experimental-llama-stack-spec.yaml b/docs/static/experimental-llama-stack-spec.yaml
index dd9e43cc5..f15add8cf 100644
--- a/docs/static/experimental-llama-stack-spec.yaml
+++ b/docs/static/experimental-llama-stack-spec.yaml
@@ -1695,7 +1695,6 @@ components:
          description: The sampling strategy.
         max_tokens:
           type: integer
-          default: 0
           description: >-
             The maximum number of tokens that can be generated in the completion.
             The token count of your prompt plus max_tokens cannot exceed the model's
diff --git a/docs/static/stainless-llama-stack-spec.html b/docs/static/stainless-llama-stack-spec.html
index 77a64ced0..a3576b398 100644
--- a/docs/static/stainless-llama-stack-spec.html
+++ b/docs/static/stainless-llama-stack-spec.html
@@ -15453,7 +15453,6 @@
         },
         "max_tokens": {
           "type": "integer",
-          "default": 0,
           "description": "The maximum number of tokens that can be generated in the completion. The token count of your prompt plus max_tokens cannot exceed the model's context length."
}, "repetition_penalty": { diff --git a/docs/static/stainless-llama-stack-spec.yaml b/docs/static/stainless-llama-stack-spec.yaml index bd22f2129..bb3c038ba 100644 --- a/docs/static/stainless-llama-stack-spec.yaml +++ b/docs/static/stainless-llama-stack-spec.yaml @@ -11601,7 +11601,6 @@ components: description: The sampling strategy. max_tokens: type: integer - default: 0 description: >- The maximum number of tokens that can be generated in the completion. The token count of your prompt plus max_tokens cannot exceed the model's diff --git a/llama_stack/apis/inference/inference.py b/llama_stack/apis/inference/inference.py index 049482837..9da4f23cc 100644 --- a/llama_stack/apis/inference/inference.py +++ b/llama_stack/apis/inference/inference.py @@ -97,7 +97,7 @@ class SamplingParams(BaseModel): strategy: SamplingStrategy = Field(default_factory=GreedySamplingStrategy) - max_tokens: int | None = 0 + max_tokens: int | None = None repetition_penalty: float | None = 1.0 stop: list[str] | None = None diff --git a/tests/integration/eval/test_eval.py b/tests/integration/eval/test_eval.py index 98b3302e0..e042008dd 100644 --- a/tests/integration/eval/test_eval.py +++ b/tests/integration/eval/test_eval.py @@ -55,6 +55,7 @@ def test_evaluate_rows(llama_stack_client, text_model_id, scoring_fn_id): "model": text_model_id, "sampling_params": { "temperature": 0.0, + "max_tokens": 512, }, }, }, @@ -88,6 +89,7 @@ def test_evaluate_benchmark(llama_stack_client, text_model_id, scoring_fn_id): "model": text_model_id, "sampling_params": { "temperature": 0.0, + "max_tokens": 512, }, }, },