diff --git a/docs/static/deprecated-llama-stack-spec.html b/docs/static/deprecated-llama-stack-spec.html
index e3e182dd7..8f789aa62 100644
--- a/docs/static/deprecated-llama-stack-spec.html
+++ b/docs/static/deprecated-llama-stack-spec.html
@@ -3901,7 +3901,6 @@
         },
         "max_tokens": {
           "type": "integer",
-          "default": 0,
           "description": "The maximum number of tokens that can be generated in the completion. The token count of your prompt plus max_tokens cannot exceed the model's context length."
         },
         "repetition_penalty": {
diff --git a/docs/static/deprecated-llama-stack-spec.yaml b/docs/static/deprecated-llama-stack-spec.yaml
index 6b5b8230a..afad17d98 100644
--- a/docs/static/deprecated-llama-stack-spec.yaml
+++ b/docs/static/deprecated-llama-stack-spec.yaml
@@ -2862,7 +2862,6 @@ components:
           description: The sampling strategy.
         max_tokens:
           type: integer
-          default: 0
           description: >-
             The maximum number of tokens that can be generated in the completion.
             The token count of your prompt plus max_tokens cannot exceed the model's
diff --git a/docs/static/experimental-llama-stack-spec.html b/docs/static/experimental-llama-stack-spec.html
index ab474180e..2ad81d4f2 100644
--- a/docs/static/experimental-llama-stack-spec.html
+++ b/docs/static/experimental-llama-stack-spec.html
@@ -2376,7 +2376,6 @@
         },
         "max_tokens": {
           "type": "integer",
-          "default": 0,
           "description": "The maximum number of tokens that can be generated in the completion. The token count of your prompt plus max_tokens cannot exceed the model's context length."
         },
         "repetition_penalty": {
diff --git a/docs/static/experimental-llama-stack-spec.yaml b/docs/static/experimental-llama-stack-spec.yaml
index dd9e43cc5..f15add8cf 100644
--- a/docs/static/experimental-llama-stack-spec.yaml
+++ b/docs/static/experimental-llama-stack-spec.yaml
@@ -1695,7 +1695,6 @@ components:
           description: The sampling strategy.
         max_tokens:
           type: integer
-          default: 0
           description: >-
             The maximum number of tokens that can be generated in the completion.
             The token count of your prompt plus max_tokens cannot exceed the model's
diff --git a/docs/static/stainless-llama-stack-spec.html b/docs/static/stainless-llama-stack-spec.html
index 77a64ced0..a3576b398 100644
--- a/docs/static/stainless-llama-stack-spec.html
+++ b/docs/static/stainless-llama-stack-spec.html
@@ -15453,7 +15453,6 @@
         },
         "max_tokens": {
           "type": "integer",
-          "default": 0,
           "description": "The maximum number of tokens that can be generated in the completion. The token count of your prompt plus max_tokens cannot exceed the model's context length."
         },
         "repetition_penalty": {
diff --git a/docs/static/stainless-llama-stack-spec.yaml b/docs/static/stainless-llama-stack-spec.yaml
index bd22f2129..bb3c038ba 100644
--- a/docs/static/stainless-llama-stack-spec.yaml
+++ b/docs/static/stainless-llama-stack-spec.yaml
@@ -11601,7 +11601,6 @@ components:
           description: The sampling strategy.
         max_tokens:
           type: integer
-          default: 0
           description: >-
             The maximum number of tokens that can be generated in the completion.
             The token count of your prompt plus max_tokens cannot exceed the model's
diff --git a/llama_stack/apis/inference/inference.py b/llama_stack/apis/inference/inference.py
index 049482837..9da4f23cc 100644
--- a/llama_stack/apis/inference/inference.py
+++ b/llama_stack/apis/inference/inference.py
@@ -97,7 +97,7 @@ class SamplingParams(BaseModel):

     strategy: SamplingStrategy = Field(default_factory=GreedySamplingStrategy)

-    max_tokens: int | None = 0
+    max_tokens: int | None = None
     repetition_penalty: float | None = 1.0
     stop: list[str] | None = None
diff --git a/tests/integration/eval/test_eval.py b/tests/integration/eval/test_eval.py
index 98b3302e0..e042008dd 100644
--- a/tests/integration/eval/test_eval.py
+++ b/tests/integration/eval/test_eval.py
@@ -55,6 +55,7 @@ def test_evaluate_rows(llama_stack_client, text_model_id, scoring_fn_id):
                 "model": text_model_id,
                 "sampling_params": {
                     "temperature": 0.0,
+                    "max_tokens": 512,
                 },
             },
         },
@@ -88,6 +89,7 @@ def test_evaluate_benchmark(llama_stack_client, text_model_id, scoring_fn_id):
                 "model": text_model_id,
                 "sampling_params": {
                     "temperature": 0.0,
+                    "max_tokens": 512,
                 },
             },
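Note (not part of the diff): a minimal sketch of what the default change means for callers, assuming only the SamplingParams model shape shown above. With the default changed from 0 to None, an unset max_tokens no longer serializes as a zero-token budget, so callers that want a concrete cap must pass it explicitly, as the updated integration tests now do with max_tokens=512.

# Sketch only; SamplingParams and its fields are taken from the diff above.
from llama_stack.apis.inference.inference import SamplingParams

# Default construction: max_tokens is now None ("no explicit limit")
# rather than the old default of 0.
params = SamplingParams()
assert params.max_tokens is None

# Callers that need a hard cap set it explicitly, mirroring the test change.
explicit = SamplingParams(max_tokens=512)
assert explicit.max_tokens == 512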