fix(utils.py): fix openai-like api response format parsing (#7273)

* fix(utils.py): fix openai-like api response format parsing. Fixes issue passing structured output to litellm_proxy/ route
* fix(cost_calculator.py): fix whisper transcription cost calc to use file duration, not response time
* test: skip test if credentials not found
2025-04-25 18:54:30 +00:00 · 2024-12-17 12:49:09 -08:00 · 2024-12-17 12:49:09 -08:00 · 224ead1531
commit 224ead1531
parent 3addbf1f58
6 changed files with 134 additions and 90 deletions
--- a/litellm/cost_calculator.py
+++ b/litellm/cost_calculator.py
@ -111,6 +111,7 @@ def cost_per_token(  # noqa: PLR0915
    usage_object: Optional[Usage] = None,  # just read the usage object if provided
    ### CALL TYPE ###
    call_type: CallTypesLiteral = "completion",
    audio_transcription_file_duration: float = 0.0,  # for audio transcription calls - the file time in seconds
 ) -> Tuple[float, float]:  # type: ignore
    """
    Calculates the cost per token for a given model, prompt tokens, and completion tokens.
@ -236,6 +237,12 @@ def cost_per_token(  # noqa: PLR0915
            model=model,
            custom_llm_provider=custom_llm_provider,
        )
    elif call_type == "atranscription" or call_type == "transcription":
        return openai_cost_per_second(
            model=model,
            custom_llm_provider=custom_llm_provider,
            duration=audio_transcription_file_duration,
        )
    elif custom_llm_provider == "vertex_ai":
        cost_router = google_cost_router(
            model=model_without_prefix,
@ -261,13 +268,7 @@ def cost_per_token(  # noqa: PLR0915
    elif custom_llm_provider == "anthropic":
        return anthropic_cost_per_token(model=model, usage=usage_block)
    elif custom_llm_provider == "openai":
-        openai_cost_route = openai_cost_router(call_type=CallTypes(call_type))
+        return openai_cost_per_token(model=model, usage=usage_block)
        if openai_cost_route == "cost_per_token":
            return openai_cost_per_token(model=model, usage=usage_block)
        elif openai_cost_route == "cost_per_second":
            return openai_cost_per_second(
                model=model, usage=usage_block, response_time_ms=response_time_ms
            )
    elif custom_llm_provider == "databricks":
        return databricks_cost_per_token(model=model, usage=usage_block)
    elif custom_llm_provider == "fireworks_ai":
@ -484,6 +485,7 @@ def completion_cost(  # noqa: PLR0915
        completion_characters: Optional[int] = None
        cache_creation_input_tokens: Optional[int] = None
        cache_read_input_tokens: Optional[int] = None
        audio_transcription_file_duration: float = 0.0
        cost_per_token_usage_object: Optional[Usage] = _get_usage_object(
            completion_response=completion_response
        )
@ -632,6 +634,13 @@ def completion_cost(  # noqa: PLR0915
            call_type == CallTypes.speech.value or call_type == CallTypes.aspeech.value
        ):
            prompt_characters = litellm.utils._count_characters(text=prompt)
        elif (
            call_type == CallTypes.atranscription.value
            or call_type == CallTypes.transcription.value
        ):
            audio_transcription_file_duration = getattr(
                completion_response, "duration", 0.0
            )
        elif (
            call_type == CallTypes.rerank.value or call_type == CallTypes.arerank.value
        ):
@ -708,6 +717,7 @@ def completion_cost(  # noqa: PLR0915
            cache_read_input_tokens=cache_read_input_tokens,
            usage_object=cost_per_token_usage_object,
            call_type=call_type,
            audio_transcription_file_duration=audio_transcription_file_duration,
        )
        _final_cost = prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar
@ -814,3 +824,11 @@ def rerank_cost(
        )
    except Exception as e:
        raise e
 def transcription_cost(
    model: str, custom_llm_provider: Optional[str], duration: float
 ) -> Tuple[float, float]:
    """
    Cost of an audio transcription call, billed per second.

    Thin wrapper: forwards all three arguments unchanged to
    `openai_cost_per_second` and returns whatever it returns.

    Input:
        - model: str, the model name
        - custom_llm_provider: Optional[str], the provider used for the pricing lookup
        - duration: float, the audio duration in seconds
    Returns:
        Tuple[float, float] - prompt_cost_in_usd, completion_cost_in_usd
    """
    per_second_costs = openai_cost_per_second(
        duration=duration,
        custom_llm_provider=custom_llm_provider,
        model=model,
    )
    return per_second_costs
--- a/litellm/llms/openai/cost_calculation.py
+++ b/litellm/llms/openai/cost_calculation.py
@ -78,36 +78,44 @@ def cost_per_token(model: str, usage: Usage) -> Tuple[float, float]:
 def cost_per_second(
    model: str, custom_llm_provider: Optional[str], duration: float = 0.0
 ) -> Tuple[float, float]:
    """
    Calculates the per-second cost for a given model and billed duration.

    Input:
        - model: str, the model name without provider prefix
        - custom_llm_provider: str, the custom llm provider ("openai" is used
          for the model-info lookup when this is None or empty)
        - duration: float, the number of seconds to bill. For transcription
          calls this is the audio file duration, not the response latency.

    Returns:
        Tuple[float, float] - prompt_cost_in_usd, completion_cost_in_usd

    Only one side is ever non-zero: `output_cost_per_second` takes precedence
    over `input_cost_per_second` when the model info defines both.
    """
    ## GET MODEL INFO
    model_info = get_model_info(
        model=model, custom_llm_provider=custom_llm_provider or "openai"
    )
    prompt_cost = 0.0
    completion_cost = 0.0

    output_cost_per_second = model_info.get("output_cost_per_second")
    input_cost_per_second = model_info.get("input_cost_per_second")

    ## Speech / Audio cost calculation
    # The guards previously tested `response_time_ms`, which is no longer a
    # parameter of this function (NameError at call time). They now test
    # `duration`, so a missing duration yields a zero cost instead of a crash.
    # NOTE(review): presumably `duration` can be None when the response object
    # carries a `duration` attribute set to None - confirm against callers.
    if output_cost_per_second is not None and duration is not None:
        verbose_logger.debug(
            f"For model={model} - output_cost_per_second: {model_info.get('output_cost_per_second')}; duration: {duration}"
        )
        ## COST PER SECOND ##
        completion_cost = output_cost_per_second * duration
    elif input_cost_per_second is not None and duration is not None:
        verbose_logger.debug(
            f"For model={model} - input_cost_per_second: {model_info.get('input_cost_per_second')}; duration: {duration}"
        )
        ## COST PER SECOND ##
        prompt_cost = input_cost_per_second * duration
        completion_cost = 0.0

    return prompt_cost, completion_cost
--- a/litellm/utils.py
+++ b/litellm/utils.py
@ -3612,53 +3612,21 @@ def get_optional_params(  # noqa: PLR0915
                    else False
                ),
            )
-    else:  # assume passing in params for text-completion openai
+    else:  # assume passing in params for openai-like api
        supported_params = get_supported_openai_params(
            model=model, custom_llm_provider="custom_openai"
        )
        _check_valid_arg(supported_params=supported_params)
-        if functions is not None:
+        optional_params = litellm.OpenAILikeChatConfig().map_openai_params(
-            optional_params["functions"] = functions
+            non_default_params=non_default_params,
-        if function_call is not None:
+            optional_params=optional_params,
-            optional_params["function_call"] = function_call
+            model=model,
-        if temperature is not None:
+            drop_params=(
-            optional_params["temperature"] = temperature
+                drop_params
-        if top_p is not None:
+                if drop_params is not None and isinstance(drop_params, bool)
-            optional_params["top_p"] = top_p
+                else False
-        if n is not None:
+            ),
-            optional_params["n"] = n
+        )
        if stream is not None:
            optional_params["stream"] = stream
        if stream_options is not None:
            optional_params["stream_options"] = stream_options
        if stop is not None:
            optional_params["stop"] = stop
        if max_tokens is not None:
            optional_params["max_tokens"] = max_tokens
        if presence_penalty is not None:
            optional_params["presence_penalty"] = presence_penalty
        if frequency_penalty is not None:
            optional_params["frequency_penalty"] = frequency_penalty
        if logit_bias is not None:
            optional_params["logit_bias"] = logit_bias
        if user is not None:
            optional_params["user"] = user
        if response_format is not None:
            optional_params["response_format"] = response_format
        if seed is not None:
            optional_params["seed"] = seed
        if tools is not None:
            optional_params["tools"] = tools
        if tool_choice is not None:
            optional_params["tool_choice"] = tool_choice
        if max_retries is not None:
            optional_params["max_retries"] = max_retries
        if logprobs is not None:
            optional_params["logprobs"] = logprobs
        if top_logprobs is not None:
            optional_params["top_logprobs"] = top_logprobs
        if extra_headers is not None:
            optional_params["extra_headers"] = extra_headers
    if (
        custom_llm_provider
        in ["openai", "azure", "text-completion-openai"]
--- a/tests/local_testing/test_audio_speech.py
+++ b/tests/local_testing/test_audio_speech.py
@ -138,10 +138,14 @@ async def test_speech_litellm_vertex_async():
        mock_async_post.return_value = mock_response
        model = "vertex_ai/test"
-        response = await litellm.aspeech(
+        try:
-            model=model,
+            response = await litellm.aspeech(
-            input="async hello what llm guardrail do you have",
+                model=model,
-        )
+                input="async hello what llm guardrail do you have",
            )
        except litellm.APIConnectionError as e:
            if "Your default credentials were not found" in str(e):
                pytest.skip("skipping test, credentials not found")
        # Assert asynchronous call
        mock_async_post.assert_called_once()
@ -181,18 +185,22 @@ async def test_speech_litellm_vertex_async_with_voice():
        mock_async_post.return_value = mock_response
        model = "vertex_ai/test"
-        response = await litellm.aspeech(
+        try:
-            model=model,
+            response = await litellm.aspeech(
-            input="async hello what llm guardrail do you have",
+                model=model,
-            voice={
+                input="async hello what llm guardrail do you have",
-                "languageCode": "en-UK",
+                voice={
-                "name": "en-UK-Studio-O",
+                    "languageCode": "en-UK",
-            },
+                    "name": "en-UK-Studio-O",
-            audioConfig={
+                },
-                "audioEncoding": "LINEAR22",
+                audioConfig={
-                "speakingRate": "10",
+                    "audioEncoding": "LINEAR22",
-            },
+                    "speakingRate": "10",
-        )
+                },
            )
        except litellm.APIConnectionError as e:
            if "Your default credentials were not found" in str(e):
                pytest.skip("skipping test, credentials not found")
        # Assert asynchronous call
        mock_async_post.assert_called_once()
@ -239,18 +247,22 @@ async def test_speech_litellm_vertex_async_with_voice_ssml():
        mock_async_post.return_value = mock_response
        model = "vertex_ai/test"
-        response = await litellm.aspeech(
+        try:
-            input=ssml,
+            response = await litellm.aspeech(
-            model=model,
+                input=ssml,
-            voice={
+                model=model,
-                "languageCode": "en-UK",
+                voice={
-                "name": "en-UK-Studio-O",
+                    "languageCode": "en-UK",
-            },
+                    "name": "en-UK-Studio-O",
-            audioConfig={
+                },
-                "audioEncoding": "LINEAR22",
+                audioConfig={
-                "speakingRate": "10",
+                    "audioEncoding": "LINEAR22",
-            },
+                    "speakingRate": "10",
-        )
+                },
            )
        except litellm.APIConnectionError as e:
            if "Your default credentials were not found" in str(e):
                pytest.skip("skipping test, credentials not found")
        # Assert asynchronous call
        mock_async_post.assert_called_once()
--- a/tests/local_testing/test_completion.py
+++ b/tests/local_testing/test_completion.py
@ -1819,6 +1819,43 @@ async def test_litellm_gateway_from_sdk():
        assert "hello" in mock_call.call_args.kwargs["extra_body"]
@pytest.mark.asyncio
async def test_litellm_gateway_from_sdk_structured_output():
    """
    A pydantic model passed as `response_format` to a `litellm_proxy/` model
    must reach the OpenAI client's `chat.completions.create` call as a
    response format containing a "json_schema" entry.

    The OpenAI client's `create` is replaced with a MagicMock, so only the
    kwargs handed to the client are inspected.
    """
    from pydantic import BaseModel

    class Result(BaseModel):
        answer: str

    litellm.set_verbose = True
    from openai import OpenAI

    openai_client = OpenAI(api_key="fake-key")
    user_messages = [{"role": "user", "content": "What is the capital of France?"}]

    with patch.object(
        openai_client.chat.completions, "create", new=MagicMock()
    ) as mocked_create:
        # Any error raised by the call is printed and otherwise ignored; the
        # assertions below only look at what was sent to the client.
        # NOTE(review): presumably the MagicMock return value cannot be parsed
        # into a real response - confirm.
        try:
            litellm.completion(
                model="litellm_proxy/openai/gpt-4o",
                messages=user_messages,
                api_key="my-test-api-key",
                user="test",
                response_format=Result,
                base_url="https://litellm.ml-serving-internal.scale.com",
                client=openai_client,
            )
        except Exception as err:
            print(err)

        mocked_create.assert_called_once()

        sent_kwargs = mocked_create.call_args.kwargs
        print("Call KWARGS - {}".format(sent_kwargs))

        sent_response_format = sent_kwargs["response_format"]
        assert "json_schema" in sent_response_format
 # ################### Hugging Face Conversational models ########################
 # def hf_test_completion_conv():
 #     try:
--- a/tests/local_testing/test_completion_cost.py
+++ b/tests/local_testing/test_completion_cost.py
@ -393,6 +393,8 @@ def test_whisper_openai():
    transcription = TranscriptionResponse(
        text="Four score and seven years ago, our fathers brought forth on this continent a new nation, conceived in liberty and dedicated to the proposition that all men are created equal. Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure."
    )
    setattr(transcription, "duration", 3)
    transcription._hidden_params = {
        "model": "whisper-1",
        "custom_llm_provider": "openai",
@ -401,7 +403,6 @@ def test_whisper_openai():
    }
    _total_time_in_seconds = 3
    transcription._response_ms = _total_time_in_seconds * 1000
    cost = litellm.completion_cost(model="whisper-1", completion_response=transcription)
    print(f"cost: {cost}")
@ -411,7 +412,7 @@ def test_whisper_openai():
        * _total_time_in_seconds,
        5,
    )
-    assert cost == expected_cost
+    assert round(cost, 5) == round(expected_cost, 5)
 def test_whisper_azure():
@ -426,8 +427,8 @@ def test_whisper_azure():
        "model_id": None,
    }
    _total_time_in_seconds = 3
    setattr(transcription, "duration", _total_time_in_seconds)
    transcription._response_ms = _total_time_in_seconds * 1000
    cost = litellm.completion_cost(
        model="azure/azure-whisper", completion_response=transcription
    )
@ -439,7 +440,7 @@ def test_whisper_azure():
        * _total_time_in_seconds,
        5,
    )
-    assert cost == expected_cost
+    assert round(cost, 5) == round(expected_cost, 5)
 def test_dalle_3_azure_cost_tracking():