diff --git a/docs/static/deprecated-llama-stack-spec.html b/docs/static/deprecated-llama-stack-spec.html
index 7edfe3f5d..74a94a64d 100644
--- a/docs/static/deprecated-llama-stack-spec.html
+++ b/docs/static/deprecated-llama-stack-spec.html
@@ -4218,7 +4218,6 @@
        },
        "max_tokens": {
          "type": "integer",
-          "default": 0,
          "description": "The maximum number of tokens that can be generated in the completion. The token count of your prompt plus max_tokens cannot exceed the model's context length."
        },
        "repetition_penalty": {
diff --git a/docs/static/deprecated-llama-stack-spec.yaml b/docs/static/deprecated-llama-stack-spec.yaml
index ca832d46b..af5b46941 100644
--- a/docs/static/deprecated-llama-stack-spec.yaml
+++ b/docs/static/deprecated-llama-stack-spec.yaml
@@ -3068,7 +3068,6 @@ components:
          description: The sampling strategy.
        max_tokens:
          type: integer
-          default: 0
          description: >-
            The maximum number of tokens that can be generated in the completion.
            The token count of your prompt plus max_tokens cannot exceed the model's
diff --git a/docs/static/experimental-llama-stack-spec.html b/docs/static/experimental-llama-stack-spec.html
index a84226c05..3ca170f61 100644
--- a/docs/static/experimental-llama-stack-spec.html
+++ b/docs/static/experimental-llama-stack-spec.html
@@ -2713,7 +2713,6 @@
        },
        "max_tokens": {
          "type": "integer",
-          "default": 0,
          "description": "The maximum number of tokens that can be generated in the completion. The token count of your prompt plus max_tokens cannot exceed the model's context length."
        },
        "repetition_penalty": {
diff --git a/docs/static/experimental-llama-stack-spec.yaml b/docs/static/experimental-llama-stack-spec.yaml
index a08c0cc87..b44af4bca 100644
--- a/docs/static/experimental-llama-stack-spec.yaml
+++ b/docs/static/experimental-llama-stack-spec.yaml
@@ -1927,7 +1927,6 @@ components:
          description: The sampling strategy.
        max_tokens:
          type: integer
-          default: 0
          description: >-
            The maximum number of tokens that can be generated in the completion.
            The token count of your prompt plus max_tokens cannot exceed the model's
diff --git a/docs/static/stainless-llama-stack-spec.html b/docs/static/stainless-llama-stack-spec.html
index 7ec48ef74..91a7e1d97 100644
--- a/docs/static/stainless-llama-stack-spec.html
+++ b/docs/static/stainless-llama-stack-spec.html
@@ -15437,7 +15437,6 @@
        },
        "max_tokens": {
          "type": "integer",
-          "default": 0,
          "description": "The maximum number of tokens that can be generated in the completion. The token count of your prompt plus max_tokens cannot exceed the model's context length."
        },
        "repetition_penalty": {
diff --git a/docs/static/stainless-llama-stack-spec.yaml b/docs/static/stainless-llama-stack-spec.yaml
index 3bede159b..a7a2d6359 100644
--- a/docs/static/stainless-llama-stack-spec.yaml
+++ b/docs/static/stainless-llama-stack-spec.yaml
@@ -11477,7 +11477,6 @@ components:
          description: The sampling strategy.
        max_tokens:
          type: integer
-          default: 0
          description: >-
            The maximum number of tokens that can be generated in the completion.
            The token count of your prompt plus max_tokens cannot exceed the model's
diff --git a/llama_stack/apis/inference/inference.py b/llama_stack/apis/inference/inference.py
index e88a16315..23b082a34 100644
--- a/llama_stack/apis/inference/inference.py
+++ b/llama_stack/apis/inference/inference.py
@@ -96,7 +96,7 @@ class SamplingParams(BaseModel):

     strategy: SamplingStrategy = Field(default_factory=GreedySamplingStrategy)

-    max_tokens: int | None = 0
+    max_tokens: int | None = None
     repetition_penalty: float | None = 1.0
     stop: list[str] | None = None

diff --git a/tests/integration/eval/test_eval.py b/tests/integration/eval/test_eval.py
index 01581e829..d735bc2a2 100644
--- a/tests/integration/eval/test_eval.py
+++ b/tests/integration/eval/test_eval.py
@@ -55,6 +55,7 @@ def test_evaluate_rows(llama_stack_client, text_model_id, scoring_fn_id):
                 "model": text_model_id,
                 "sampling_params": {
                     "temperature": 0.0,
+                    "max_tokens": 512,
                 },
             },
         },
@@ -88,6 +89,7 @@ def test_evaluate_benchmark(llama_stack_client, text_model_id, scoring_fn_id):
                 "model": text_model_id,
                 "sampling_params": {
                     "temperature": 0.0,
+                    "max_tokens": 512,
                 },
             },
         },
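For context, a minimal sketch of what the `inference.py` change means for callers, using a trimmed-down stand-in model (`SamplingParamsSketch` below is hypothetical, not the real `llama_stack` class): with the old default of `0`, a caller that omitted `max_tokens` ended up with an explicit zero-token budget that every provider had to interpret or special-case; with `None`, an omitted `max_tokens` reads as "no explicit limit" and a provider can apply its own default. The `4096` fallback in the sketch is purely illustrative, not a value taken from any provider.

```python
from pydantic import BaseModel


class SamplingParamsSketch(BaseModel):
    """Hypothetical, trimmed-down stand-in for llama_stack's SamplingParams."""

    # Field changed by this diff: previously `int | None = 0`, now unset by default.
    max_tokens: int | None = None
    repetition_penalty: float | None = 1.0
    stop: list[str] | None = None


params = SamplingParamsSketch()
assert params.max_tokens is None  # before this change, the default was 0

# A provider adapter can now tell "caller set no limit" apart from an explicit
# value and fall back to its own default (4096 here is an assumed example).
effective_max_tokens = params.max_tokens if params.max_tokens is not None else 4096
print(effective_max_tokens)
```

This is presumably also why the two eval tests now pass `"max_tokens": 512` explicitly: the generations they score stay bounded regardless of which provider default ends up applying.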