[Feat] Add max_completion_tokens param (#5691)

* add max_completion_tokens

* add max_completion_tokens

* add max_completion_tokens support for OpenAI models

* add max_completion_tokens param

* add max_completion_tokens for bedrock converse models

* add test for converse maxTokens

* fix openai o1 param mapping test

* move test optional params

* add max_completion_tokens for anthropic api

* fix conftest

* add max_completion_tokens for vertex ai partner models

* add max_completion_tokens for fireworks ai

* add max_completion_tokens for hf rest api

* add test for param mapping

* add param mapping for vertex, gemini + testing

* predibase is the most unstable and unusable llm api in prod, can't handle our ci/cd

* add max_completion_tokens to openai supported params

* fix fireworks ai param mapping
Ishaan Jaff 2024-09-14 14:57:01 -07:00 committed by GitHub
parent 415a3ede9e
commit 85acdb9193
31 changed files with 591 additions and 35 deletions
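
With this change, callers can pass max_completion_tokens through litellm's standard entry points and have it mapped to whatever each provider's API expects. A minimal usage sketch; the model name and prompt are placeholders, not part of this commit:

    import litellm

    # max_completion_tokens is an upper bound on generated tokens, including
    # reasoning tokens (mirroring the OpenAI parameter of the same name).
    response = litellm.completion(
        model="gpt-4o-mini",  # placeholder model name
        messages=[{"role": "user", "content": "Write a haiku about CI/CD."}],
        max_completion_tokens=64,
    )
    print(response.choices[0].message.content)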


@@ -264,6 +264,7 @@ async def acompletion(
     stream_options: Optional[dict] = None,
     stop=None,
     max_tokens: Optional[int] = None,
+    max_completion_tokens: Optional[int] = None,
     presence_penalty: Optional[float] = None,
     frequency_penalty: Optional[float] = None,
     logit_bias: Optional[dict] = None,
@@ -303,6 +304,7 @@ async def acompletion(
         stream_options (dict, optional): A dictionary containing options for the streaming response. Only use this if stream is True.
         stop(string/list, optional): - Up to 4 sequences where the LLM API will stop generating further tokens.
         max_tokens (integer, optional): The maximum number of tokens in the generated completion (default is infinity).
+        max_completion_tokens (integer, optional): An upper bound for the number of tokens that can be generated for a completion, including visible output tokens and reasoning tokens.
         presence_penalty (float, optional): It is used to penalize new tokens based on their existence in the text so far.
         frequency_penalty: It is used to penalize new tokens based on their frequency in the text so far.
         logit_bias (dict, optional): Used to modify the probability of specific tokens appearing in the completion.
@@ -341,6 +343,7 @@ async def acompletion(
         "stream_options": stream_options,
         "stop": stop,
         "max_tokens": max_tokens,
+        "max_completion_tokens": max_completion_tokens,
         "presence_penalty": presence_penalty,
         "frequency_penalty": frequency_penalty,
         "logit_bias": logit_bias,
@@ -633,6 +636,7 @@ def completion(
     stream: Optional[bool] = None,
     stream_options: Optional[dict] = None,
     stop=None,
+    max_completion_tokens: Optional[int] = None,
     max_tokens: Optional[int] = None,
     presence_penalty: Optional[float] = None,
     frequency_penalty: Optional[float] = None,
@@ -675,6 +679,7 @@ def completion(
         stream_options (dict, optional): A dictionary containing options for the streaming response. Only set this when you set stream: true.
         stop(string/list, optional): - Up to 4 sequences where the LLM API will stop generating further tokens.
         max_tokens (integer, optional): The maximum number of tokens in the generated completion (default is infinity).
+        max_completion_tokens (integer, optional): An upper bound for the number of tokens that can be generated for a completion, including visible output tokens and reasoning tokens.
         presence_penalty (float, optional): It is used to penalize new tokens based on their existence in the text so far.
         frequency_penalty: It is used to penalize new tokens based on their frequency in the text so far.
         logit_bias (dict, optional): Used to modify the probability of specific tokens appearing in the completion.
@@ -759,6 +764,7 @@ def completion(
         "stream",
         "stream_options",
         "stop",
+        "max_completion_tokens",
         "max_tokens",
         "presence_penalty",
         "frequency_penalty",
@@ -917,6 +923,7 @@ def completion(
         stream_options=stream_options,
         stop=stop,
         max_tokens=max_tokens,
+        max_completion_tokens=max_completion_tokens,
         presence_penalty=presence_penalty,
         frequency_penalty=frequency_penalty,
         logit_bias=logit_bias,
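
For providers whose native APIs have no max_completion_tokens field (per the commits above: Anthropic, Bedrock Converse, Vertex AI partner models, Fireworks AI, the Hugging Face REST API, Gemini), the parameter is translated into each provider's own max-token setting; the "param mapping" tests added in this PR cover that translation. The helper below is purely illustrative, not code from this commit, and only sketches the kind of mapping the per-provider configs perform:

    from typing import Any, Dict, Optional

    def map_max_completion_tokens(
        provider: str, max_completion_tokens: Optional[int]
    ) -> Dict[str, Any]:
        """Illustrative only: translate the OpenAI-style param to a provider field."""
        if max_completion_tokens is None:
            return {}
        if provider == "openai":
            # Newer OpenAI models (e.g. o1) accept max_completion_tokens directly.
            return {"max_completion_tokens": max_completion_tokens}
        if provider == "bedrock_converse":
            # Bedrock Converse takes its limit as inferenceConfig.maxTokens.
            return {"inferenceConfig": {"maxTokens": max_completion_tokens}}
        if provider in ("anthropic", "fireworks_ai", "vertex_ai_partner"):
            # These APIs expose a max_tokens-style limit.
            return {"max_tokens": max_completion_tokens}
        # Other providers use their own field names (e.g. Gemini's maxOutputTokens,
        # Hugging Face's max_new_tokens); default to max_tokens in this sketch.
        return {"max_tokens": max_completion_tokens}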