fix: Avoid BadRequestError due to invalid max_tokens (#3667)

This patch ensures that if `max_tokens` is not defined, it is set to None instead of 0 when calling openai_chat_completion. This way, providers (like Gemini) that cannot handle `max_tokens = 0` will not fail. Issue: #3666
2025-12-03 09:53:45 +00:00 · 2025-10-27 17:27:21 +01:00 · 2025-10-27 17:27:21 +01:00 · f18b5eb537
commit f18b5eb537
parent 00d8414597
171 changed files with 12728 additions and 8 deletions
--- a/docs/static/deprecated-llama-stack-spec.html
+++ b/docs/static/deprecated-llama-stack-spec.html
@ -3901,7 +3901,6 @@
                    },
                    "max_tokens": {
                        "type": "integer",
-                        "default": 0,
                        "description": "The maximum number of tokens that can be generated in the completion. The token count of your prompt plus max_tokens cannot exceed the model's context length."
                    },
                    "repetition_penalty": {
--- a/docs/static/deprecated-llama-stack-spec.yaml
+++ b/docs/static/deprecated-llama-stack-spec.yaml
@ -2862,7 +2862,6 @@ components:
          description: The sampling strategy.
        max_tokens:
          type: integer
-          default: 0
          description: >-
            The maximum number of tokens that can be generated in the completion.
            The token count of your prompt plus max_tokens cannot exceed the model's
--- a/docs/static/experimental-llama-stack-spec.html
+++ b/docs/static/experimental-llama-stack-spec.html
@ -2376,7 +2376,6 @@
                    },
                    "max_tokens": {
                        "type": "integer",
-                        "default": 0,
                        "description": "The maximum number of tokens that can be generated in the completion. The token count of your prompt plus max_tokens cannot exceed the model's context length."
                    },
                    "repetition_penalty": {
--- a/docs/static/experimental-llama-stack-spec.yaml
+++ b/docs/static/experimental-llama-stack-spec.yaml
@ -1695,7 +1695,6 @@ components:
          description: The sampling strategy.
        max_tokens:
          type: integer
-          default: 0
          description: >-
            The maximum number of tokens that can be generated in the completion.
            The token count of your prompt plus max_tokens cannot exceed the model's
--- a/docs/static/stainless-llama-stack-spec.html
+++ b/docs/static/stainless-llama-stack-spec.html
@ -15452,7 +15452,6 @@
                    },
                    "max_tokens": {
                        "type": "integer",
-                        "default": 0,
                        "description": "The maximum number of tokens that can be generated in the completion. The token count of your prompt plus max_tokens cannot exceed the model's context length."
                    },
                    "repetition_penalty": {
--- a/docs/static/stainless-llama-stack-spec.yaml
+++ b/docs/static/stainless-llama-stack-spec.yaml
@ -11600,7 +11600,6 @@ components:
          description: The sampling strategy.
        max_tokens:
          type: integer
-          default: 0
          description: >-
            The maximum number of tokens that can be generated in the completion.
            The token count of your prompt plus max_tokens cannot exceed the model's