fix(utils.py): fix openai-like api response format parsing (#7273)

* fix(utils.py): fix openai-like api response format parsing

Fixes issue passing structured output to litellm_proxy/ route

* fix(cost_calculator.py): fix whisper transcription cost calc to use file duration, not response time

'

* test: skip test if credentials not found
This commit is contained in:
Krish Dholakia 2024-12-17 12:49:09 -08:00 committed by GitHub
parent 3addbf1f58
commit 224ead1531
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 134 additions and 90 deletions

View file

@ -111,6 +111,7 @@ def cost_per_token( # noqa: PLR0915
usage_object: Optional[Usage] = None, # just read the usage object if provided usage_object: Optional[Usage] = None, # just read the usage object if provided
### CALL TYPE ### ### CALL TYPE ###
call_type: CallTypesLiteral = "completion", call_type: CallTypesLiteral = "completion",
audio_transcription_file_duration: float = 0.0, # for audio transcription calls - the file time in seconds
) -> Tuple[float, float]: # type: ignore ) -> Tuple[float, float]: # type: ignore
""" """
Calculates the cost per token for a given model, prompt tokens, and completion tokens. Calculates the cost per token for a given model, prompt tokens, and completion tokens.
@ -236,6 +237,12 @@ def cost_per_token( # noqa: PLR0915
model=model, model=model,
custom_llm_provider=custom_llm_provider, custom_llm_provider=custom_llm_provider,
) )
elif call_type == "atranscription" or call_type == "transcription":
return openai_cost_per_second(
model=model,
custom_llm_provider=custom_llm_provider,
duration=audio_transcription_file_duration,
)
elif custom_llm_provider == "vertex_ai": elif custom_llm_provider == "vertex_ai":
cost_router = google_cost_router( cost_router = google_cost_router(
model=model_without_prefix, model=model_without_prefix,
@ -261,13 +268,7 @@ def cost_per_token( # noqa: PLR0915
elif custom_llm_provider == "anthropic": elif custom_llm_provider == "anthropic":
return anthropic_cost_per_token(model=model, usage=usage_block) return anthropic_cost_per_token(model=model, usage=usage_block)
elif custom_llm_provider == "openai": elif custom_llm_provider == "openai":
openai_cost_route = openai_cost_router(call_type=CallTypes(call_type)) return openai_cost_per_token(model=model, usage=usage_block)
if openai_cost_route == "cost_per_token":
return openai_cost_per_token(model=model, usage=usage_block)
elif openai_cost_route == "cost_per_second":
return openai_cost_per_second(
model=model, usage=usage_block, response_time_ms=response_time_ms
)
elif custom_llm_provider == "databricks": elif custom_llm_provider == "databricks":
return databricks_cost_per_token(model=model, usage=usage_block) return databricks_cost_per_token(model=model, usage=usage_block)
elif custom_llm_provider == "fireworks_ai": elif custom_llm_provider == "fireworks_ai":
@ -484,6 +485,7 @@ def completion_cost( # noqa: PLR0915
completion_characters: Optional[int] = None completion_characters: Optional[int] = None
cache_creation_input_tokens: Optional[int] = None cache_creation_input_tokens: Optional[int] = None
cache_read_input_tokens: Optional[int] = None cache_read_input_tokens: Optional[int] = None
audio_transcription_file_duration: float = 0.0
cost_per_token_usage_object: Optional[Usage] = _get_usage_object( cost_per_token_usage_object: Optional[Usage] = _get_usage_object(
completion_response=completion_response completion_response=completion_response
) )
@ -632,6 +634,13 @@ def completion_cost( # noqa: PLR0915
call_type == CallTypes.speech.value or call_type == CallTypes.aspeech.value call_type == CallTypes.speech.value or call_type == CallTypes.aspeech.value
): ):
prompt_characters = litellm.utils._count_characters(text=prompt) prompt_characters = litellm.utils._count_characters(text=prompt)
elif (
call_type == CallTypes.atranscription.value
or call_type == CallTypes.transcription.value
):
audio_transcription_file_duration = getattr(
completion_response, "duration", 0.0
)
elif ( elif (
call_type == CallTypes.rerank.value or call_type == CallTypes.arerank.value call_type == CallTypes.rerank.value or call_type == CallTypes.arerank.value
): ):
@ -708,6 +717,7 @@ def completion_cost( # noqa: PLR0915
cache_read_input_tokens=cache_read_input_tokens, cache_read_input_tokens=cache_read_input_tokens,
usage_object=cost_per_token_usage_object, usage_object=cost_per_token_usage_object,
call_type=call_type, call_type=call_type,
audio_transcription_file_duration=audio_transcription_file_duration,
) )
_final_cost = prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar _final_cost = prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar
@ -814,3 +824,11 @@ def rerank_cost(
) )
except Exception as e: except Exception as e:
raise e raise e
def transcription_cost(
model: str, custom_llm_provider: Optional[str], duration: float
) -> Tuple[float, float]:
return openai_cost_per_second(
model=model, custom_llm_provider=custom_llm_provider, duration=duration
)

View file

@ -78,36 +78,44 @@ def cost_per_token(model: str, usage: Usage) -> Tuple[float, float]:
def cost_per_second( def cost_per_second(
model: str, usage: Usage, response_time_ms: Optional[float] = 0.0 model: str, custom_llm_provider: Optional[str], duration: float = 0.0
) -> Tuple[float, float]: ) -> Tuple[float, float]:
""" """
Calculates the cost per second for a given model, prompt tokens, and completion tokens. Calculates the cost per second for a given model, prompt tokens, and completion tokens.
Input:
- model: str, the model name without provider prefix
- custom_llm_provider: str, the custom llm provider
- duration: float, the duration of the response in seconds
Returns:
Tuple[float, float] - prompt_cost_in_usd, completion_cost_in_usd
""" """
## GET MODEL INFO ## GET MODEL INFO
model_info = get_model_info(model=model, custom_llm_provider="openai") model_info = get_model_info(
model=model, custom_llm_provider=custom_llm_provider or "openai"
)
prompt_cost = 0.0 prompt_cost = 0.0
completion_cost = 0.0 completion_cost = 0.0
## Speech / Audio cost calculation ## Speech / Audio cost calculation
if ( if (
"output_cost_per_second" in model_info "output_cost_per_second" in model_info
and model_info["output_cost_per_second"] is not None and model_info["output_cost_per_second"] is not None
and response_time_ms is not None
): ):
verbose_logger.debug( verbose_logger.debug(
f"For model={model} - output_cost_per_second: {model_info.get('output_cost_per_second')}; response time: {response_time_ms}" f"For model={model} - output_cost_per_second: {model_info.get('output_cost_per_second')}; duration: {duration}"
) )
## COST PER SECOND ## ## COST PER SECOND ##
completion_cost = model_info["output_cost_per_second"] * response_time_ms / 1000 completion_cost = model_info["output_cost_per_second"] * duration
elif ( elif (
"input_cost_per_second" in model_info "input_cost_per_second" in model_info
and model_info["input_cost_per_second"] is not None and model_info["input_cost_per_second"] is not None
and response_time_ms is not None
): ):
verbose_logger.debug( verbose_logger.debug(
f"For model={model} - input_cost_per_second: {model_info.get('input_cost_per_second')}; response time: {response_time_ms}" f"For model={model} - input_cost_per_second: {model_info.get('input_cost_per_second')}; duration: {duration}"
) )
## COST PER SECOND ## ## COST PER SECOND ##
prompt_cost = model_info["input_cost_per_second"] * response_time_ms / 1000 prompt_cost = model_info["input_cost_per_second"] * duration
completion_cost = 0.0 completion_cost = 0.0
return prompt_cost, completion_cost return prompt_cost, completion_cost

View file

@ -3612,53 +3612,21 @@ def get_optional_params( # noqa: PLR0915
else False else False
), ),
) )
else: # assume passing in params for text-completion openai else: # assume passing in params for openai-like api
supported_params = get_supported_openai_params( supported_params = get_supported_openai_params(
model=model, custom_llm_provider="custom_openai" model=model, custom_llm_provider="custom_openai"
) )
_check_valid_arg(supported_params=supported_params) _check_valid_arg(supported_params=supported_params)
if functions is not None: optional_params = litellm.OpenAILikeChatConfig().map_openai_params(
optional_params["functions"] = functions non_default_params=non_default_params,
if function_call is not None: optional_params=optional_params,
optional_params["function_call"] = function_call model=model,
if temperature is not None: drop_params=(
optional_params["temperature"] = temperature drop_params
if top_p is not None: if drop_params is not None and isinstance(drop_params, bool)
optional_params["top_p"] = top_p else False
if n is not None: ),
optional_params["n"] = n )
if stream is not None:
optional_params["stream"] = stream
if stream_options is not None:
optional_params["stream_options"] = stream_options
if stop is not None:
optional_params["stop"] = stop
if max_tokens is not None:
optional_params["max_tokens"] = max_tokens
if presence_penalty is not None:
optional_params["presence_penalty"] = presence_penalty
if frequency_penalty is not None:
optional_params["frequency_penalty"] = frequency_penalty
if logit_bias is not None:
optional_params["logit_bias"] = logit_bias
if user is not None:
optional_params["user"] = user
if response_format is not None:
optional_params["response_format"] = response_format
if seed is not None:
optional_params["seed"] = seed
if tools is not None:
optional_params["tools"] = tools
if tool_choice is not None:
optional_params["tool_choice"] = tool_choice
if max_retries is not None:
optional_params["max_retries"] = max_retries
if logprobs is not None:
optional_params["logprobs"] = logprobs
if top_logprobs is not None:
optional_params["top_logprobs"] = top_logprobs
if extra_headers is not None:
optional_params["extra_headers"] = extra_headers
if ( if (
custom_llm_provider custom_llm_provider
in ["openai", "azure", "text-completion-openai"] in ["openai", "azure", "text-completion-openai"]

View file

@ -138,10 +138,14 @@ async def test_speech_litellm_vertex_async():
mock_async_post.return_value = mock_response mock_async_post.return_value = mock_response
model = "vertex_ai/test" model = "vertex_ai/test"
response = await litellm.aspeech( try:
model=model, response = await litellm.aspeech(
input="async hello what llm guardrail do you have", model=model,
) input="async hello what llm guardrail do you have",
)
except litellm.APIConnectionError as e:
if "Your default credentials were not found" in str(e):
pytest.skip("skipping test, credentials not found")
# Assert asynchronous call # Assert asynchronous call
mock_async_post.assert_called_once() mock_async_post.assert_called_once()
@ -181,18 +185,22 @@ async def test_speech_litellm_vertex_async_with_voice():
mock_async_post.return_value = mock_response mock_async_post.return_value = mock_response
model = "vertex_ai/test" model = "vertex_ai/test"
response = await litellm.aspeech( try:
model=model, response = await litellm.aspeech(
input="async hello what llm guardrail do you have", model=model,
voice={ input="async hello what llm guardrail do you have",
"languageCode": "en-UK", voice={
"name": "en-UK-Studio-O", "languageCode": "en-UK",
}, "name": "en-UK-Studio-O",
audioConfig={ },
"audioEncoding": "LINEAR22", audioConfig={
"speakingRate": "10", "audioEncoding": "LINEAR22",
}, "speakingRate": "10",
) },
)
except litellm.APIConnectionError as e:
if "Your default credentials were not found" in str(e):
pytest.skip("skipping test, credentials not found")
# Assert asynchronous call # Assert asynchronous call
mock_async_post.assert_called_once() mock_async_post.assert_called_once()
@ -239,18 +247,22 @@ async def test_speech_litellm_vertex_async_with_voice_ssml():
mock_async_post.return_value = mock_response mock_async_post.return_value = mock_response
model = "vertex_ai/test" model = "vertex_ai/test"
response = await litellm.aspeech( try:
input=ssml, response = await litellm.aspeech(
model=model, input=ssml,
voice={ model=model,
"languageCode": "en-UK", voice={
"name": "en-UK-Studio-O", "languageCode": "en-UK",
}, "name": "en-UK-Studio-O",
audioConfig={ },
"audioEncoding": "LINEAR22", audioConfig={
"speakingRate": "10", "audioEncoding": "LINEAR22",
}, "speakingRate": "10",
) },
)
except litellm.APIConnectionError as e:
if "Your default credentials were not found" in str(e):
pytest.skip("skipping test, credentials not found")
# Assert asynchronous call # Assert asynchronous call
mock_async_post.assert_called_once() mock_async_post.assert_called_once()

View file

@ -1819,6 +1819,43 @@ async def test_litellm_gateway_from_sdk():
assert "hello" in mock_call.call_args.kwargs["extra_body"] assert "hello" in mock_call.call_args.kwargs["extra_body"]
@pytest.mark.asyncio
async def test_litellm_gateway_from_sdk_structured_output():
from pydantic import BaseModel
class Result(BaseModel):
answer: str
litellm.set_verbose = True
from openai import OpenAI
openai_client = OpenAI(api_key="fake-key")
with patch.object(
openai_client.chat.completions, "create", new=MagicMock()
) as mock_call:
try:
litellm.completion(
model="litellm_proxy/openai/gpt-4o",
messages=[
{"role": "user", "content": "What is the capital of France?"}
],
api_key="my-test-api-key",
user="test",
response_format=Result,
base_url="https://litellm.ml-serving-internal.scale.com",
client=openai_client,
)
except Exception as e:
print(e)
mock_call.assert_called_once()
print("Call KWARGS - {}".format(mock_call.call_args.kwargs))
json_schema = mock_call.call_args.kwargs["response_format"]
assert "json_schema" in json_schema
# ################### Hugging Face Conversational models ######################## # ################### Hugging Face Conversational models ########################
# def hf_test_completion_conv(): # def hf_test_completion_conv():
# try: # try:

View file

@ -393,6 +393,8 @@ def test_whisper_openai():
transcription = TranscriptionResponse( transcription = TranscriptionResponse(
text="Four score and seven years ago, our fathers brought forth on this continent a new nation, conceived in liberty and dedicated to the proposition that all men are created equal. Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure." text="Four score and seven years ago, our fathers brought forth on this continent a new nation, conceived in liberty and dedicated to the proposition that all men are created equal. Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure."
) )
setattr(transcription, "duration", 3)
transcription._hidden_params = { transcription._hidden_params = {
"model": "whisper-1", "model": "whisper-1",
"custom_llm_provider": "openai", "custom_llm_provider": "openai",
@ -401,7 +403,6 @@ def test_whisper_openai():
} }
_total_time_in_seconds = 3 _total_time_in_seconds = 3
transcription._response_ms = _total_time_in_seconds * 1000
cost = litellm.completion_cost(model="whisper-1", completion_response=transcription) cost = litellm.completion_cost(model="whisper-1", completion_response=transcription)
print(f"cost: {cost}") print(f"cost: {cost}")
@ -411,7 +412,7 @@ def test_whisper_openai():
* _total_time_in_seconds, * _total_time_in_seconds,
5, 5,
) )
assert cost == expected_cost assert round(cost, 5) == round(expected_cost, 5)
def test_whisper_azure(): def test_whisper_azure():
@ -426,8 +427,8 @@ def test_whisper_azure():
"model_id": None, "model_id": None,
} }
_total_time_in_seconds = 3 _total_time_in_seconds = 3
setattr(transcription, "duration", _total_time_in_seconds)
transcription._response_ms = _total_time_in_seconds * 1000
cost = litellm.completion_cost( cost = litellm.completion_cost(
model="azure/azure-whisper", completion_response=transcription model="azure/azure-whisper", completion_response=transcription
) )
@ -439,7 +440,7 @@ def test_whisper_azure():
* _total_time_in_seconds, * _total_time_in_seconds,
5, 5,
) )
assert cost == expected_cost assert round(cost, 5) == round(expected_cost, 5)
def test_dalle_3_azure_cost_tracking(): def test_dalle_3_azure_cost_tracking():