Mirror of https://github.com/BerriAI/litellm.git (synced 2025-04-25 18:54:30 +00:00)
fix(utils.py): fix openai-like api response format parsing (#7273)

* fix(utils.py): fix openai-like api response format parsing — fixes issue passing structured output to the litellm_proxy/ route
* fix(cost_calculator.py): fix whisper transcription cost calc to use file duration, not response time
* test: skip test if credentials not found
parent 3addbf1f58 · commit 224ead1531
6 changed files with 134 additions and 90 deletions
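For context, a minimal sketch of the call path this commit fixes, based on the test added below (the base_url and api_key values here are illustrative placeholders, not part of the commit):

```python
import litellm
from pydantic import BaseModel


class Result(BaseModel):
    answer: str


# Passing a Pydantic model as response_format through the litellm_proxy/ route
# must be translated into an OpenAI json_schema payload before being forwarded.
resp = litellm.completion(
    model="litellm_proxy/openai/gpt-4o",
    messages=[{"role": "user", "content": "What is the capital of France?"}],
    response_format=Result,
    base_url="http://localhost:4000",  # illustrative proxy URL
    api_key="sk-...",                  # illustrative key
)
```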
```diff
@@ -111,6 +111,7 @@ def cost_per_token( # noqa: PLR0915
     usage_object: Optional[Usage] = None,  # just read the usage object if provided
     ### CALL TYPE ###
     call_type: CallTypesLiteral = "completion",
+    audio_transcription_file_duration: float = 0.0,  # for audio transcription calls - the file time in seconds
 ) -> Tuple[float, float]:  # type: ignore
     """
     Calculates the cost per token for a given model, prompt tokens, and completion tokens.
@@ -236,6 +237,12 @@ def cost_per_token( # noqa: PLR0915
             model=model,
             custom_llm_provider=custom_llm_provider,
         )
+    elif call_type == "atranscription" or call_type == "transcription":
+        return openai_cost_per_second(
+            model=model,
+            custom_llm_provider=custom_llm_provider,
+            duration=audio_transcription_file_duration,
+        )
     elif custom_llm_provider == "vertex_ai":
         cost_router = google_cost_router(
             model=model_without_prefix,
@@ -261,13 +268,7 @@ def cost_per_token( # noqa: PLR0915
     elif custom_llm_provider == "anthropic":
         return anthropic_cost_per_token(model=model, usage=usage_block)
     elif custom_llm_provider == "openai":
-        openai_cost_route = openai_cost_router(call_type=CallTypes(call_type))
-        if openai_cost_route == "cost_per_token":
-            return openai_cost_per_token(model=model, usage=usage_block)
-        elif openai_cost_route == "cost_per_second":
-            return openai_cost_per_second(
-                model=model, usage=usage_block, response_time_ms=response_time_ms
-            )
+        return openai_cost_per_token(model=model, usage=usage_block)
     elif custom_llm_provider == "databricks":
         return databricks_cost_per_token(model=model, usage=usage_block)
     elif custom_llm_provider == "fireworks_ai":
@@ -484,6 +485,7 @@ def completion_cost( # noqa: PLR0915
     completion_characters: Optional[int] = None
     cache_creation_input_tokens: Optional[int] = None
     cache_read_input_tokens: Optional[int] = None
+    audio_transcription_file_duration: float = 0.0
     cost_per_token_usage_object: Optional[Usage] = _get_usage_object(
         completion_response=completion_response
     )
@@ -632,6 +634,13 @@ def completion_cost( # noqa: PLR0915
             call_type == CallTypes.speech.value or call_type == CallTypes.aspeech.value
         ):
             prompt_characters = litellm.utils._count_characters(text=prompt)
+        elif (
+            call_type == CallTypes.atranscription.value
+            or call_type == CallTypes.transcription.value
+        ):
+            audio_transcription_file_duration = getattr(
+                completion_response, "duration", 0.0
+            )
         elif (
             call_type == CallTypes.rerank.value or call_type == CallTypes.arerank.value
         ):
@@ -708,6 +717,7 @@ def completion_cost( # noqa: PLR0915
             cache_read_input_tokens=cache_read_input_tokens,
             usage_object=cost_per_token_usage_object,
             call_type=call_type,
+            audio_transcription_file_duration=audio_transcription_file_duration,
         )
         _final_cost = prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar

@@ -814,3 +824,11 @@ def rerank_cost(
         )
     except Exception as e:
         raise e
+
+
+def transcription_cost(
+    model: str, custom_llm_provider: Optional[str], duration: float
+) -> Tuple[float, float]:
+    return openai_cost_per_second(
+        model=model, custom_llm_provider=custom_llm_provider, duration=duration
+    )
```
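With these changes, `completion_cost` reads the audio file's `duration` off the transcription response instead of the request's wall-clock time. A minimal sketch of the calling pattern, mirroring the updated whisper tests (the 3-second duration is an arbitrary example, and the `TranscriptionResponse` import path is assumed from the current package layout):

```python
import litellm
from litellm.types.utils import TranscriptionResponse

transcription = TranscriptionResponse(text="example transcript")
setattr(transcription, "duration", 3)  # audio file length in seconds (example value)
transcription._hidden_params = {"model": "whisper-1", "custom_llm_provider": "openai"}

# completion_cost() now picks up `duration` via getattr(completion_response, "duration", 0.0)
# and bills per second of audio rather than per millisecond of response time.
cost = litellm.completion_cost(model="whisper-1", completion_response=transcription)
```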
```diff
@@ -78,36 +78,44 @@ def cost_per_token(model: str, usage: Usage) -> Tuple[float, float]:


 def cost_per_second(
-    model: str, usage: Usage, response_time_ms: Optional[float] = 0.0
+    model: str, custom_llm_provider: Optional[str], duration: float = 0.0
 ) -> Tuple[float, float]:
     """
     Calculates the cost per second for a given model, prompt tokens, and completion tokens.
+
+    Input:
+        - model: str, the model name without provider prefix
+        - custom_llm_provider: str, the custom llm provider
+        - duration: float, the duration of the response in seconds
+
+    Returns:
+        Tuple[float, float] - prompt_cost_in_usd, completion_cost_in_usd
     """
     ## GET MODEL INFO
-    model_info = get_model_info(model=model, custom_llm_provider="openai")
+    model_info = get_model_info(
+        model=model, custom_llm_provider=custom_llm_provider or "openai"
+    )
     prompt_cost = 0.0
     completion_cost = 0.0
     ## Speech / Audio cost calculation
     if (
         "output_cost_per_second" in model_info
         and model_info["output_cost_per_second"] is not None
-        and response_time_ms is not None
     ):
         verbose_logger.debug(
-            f"For model={model} - output_cost_per_second: {model_info.get('output_cost_per_second')}; response time: {response_time_ms}"
+            f"For model={model} - output_cost_per_second: {model_info.get('output_cost_per_second')}; duration: {duration}"
         )
         ## COST PER SECOND ##
-        completion_cost = model_info["output_cost_per_second"] * response_time_ms / 1000
+        completion_cost = model_info["output_cost_per_second"] * duration
     elif (
         "input_cost_per_second" in model_info
         and model_info["input_cost_per_second"] is not None
-        and response_time_ms is not None
     ):
         verbose_logger.debug(
-            f"For model={model} - input_cost_per_second: {model_info.get('input_cost_per_second')}; response time: {response_time_ms}"
+            f"For model={model} - input_cost_per_second: {model_info.get('input_cost_per_second')}; duration: {duration}"
         )
         ## COST PER SECOND ##
-        prompt_cost = model_info["input_cost_per_second"] * response_time_ms / 1000
+        prompt_cost = model_info["input_cost_per_second"] * duration
         completion_cost = 0.0

     return prompt_cost, completion_cost
```
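The unit change is the substance of this hunk: cost now scales with audio-file seconds rather than request latency. With an assumed per-second rate of $0.0001 (illustrative, not the actual whisper-1 pricing), the difference works out as follows:

```python
output_cost_per_second = 0.0001  # illustrative rate, not real whisper-1 pricing

# Old behaviour: billed on response latency (ms / 1000)
response_time_ms = 450.0                                      # request returned in 450 ms
old_cost = output_cost_per_second * response_time_ms / 1000   # 0.000045

# New behaviour: billed on the transcribed file's duration in seconds
duration = 60.0                                               # a one-minute audio file
new_cost = output_cost_per_second * duration                  # 0.006
```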
```diff
@@ -3612,53 +3612,21 @@ def get_optional_params( # noqa: PLR0915
                 else False
             ),
         )
-    else:  # assume passing in params for text-completion openai
+    else:  # assume passing in params for openai-like api
         supported_params = get_supported_openai_params(
             model=model, custom_llm_provider="custom_openai"
         )
         _check_valid_arg(supported_params=supported_params)
-        if functions is not None:
-            optional_params["functions"] = functions
-        if function_call is not None:
-            optional_params["function_call"] = function_call
-        if temperature is not None:
-            optional_params["temperature"] = temperature
-        if top_p is not None:
-            optional_params["top_p"] = top_p
-        if n is not None:
-            optional_params["n"] = n
-        if stream is not None:
-            optional_params["stream"] = stream
-        if stream_options is not None:
-            optional_params["stream_options"] = stream_options
-        if stop is not None:
-            optional_params["stop"] = stop
-        if max_tokens is not None:
-            optional_params["max_tokens"] = max_tokens
-        if presence_penalty is not None:
-            optional_params["presence_penalty"] = presence_penalty
-        if frequency_penalty is not None:
-            optional_params["frequency_penalty"] = frequency_penalty
-        if logit_bias is not None:
-            optional_params["logit_bias"] = logit_bias
-        if user is not None:
-            optional_params["user"] = user
-        if response_format is not None:
-            optional_params["response_format"] = response_format
-        if seed is not None:
-            optional_params["seed"] = seed
-        if tools is not None:
-            optional_params["tools"] = tools
-        if tool_choice is not None:
-            optional_params["tool_choice"] = tool_choice
-        if max_retries is not None:
-            optional_params["max_retries"] = max_retries
-        if logprobs is not None:
-            optional_params["logprobs"] = logprobs
-        if top_logprobs is not None:
-            optional_params["top_logprobs"] = top_logprobs
-        if extra_headers is not None:
-            optional_params["extra_headers"] = extra_headers
+        optional_params = litellm.OpenAILikeChatConfig().map_openai_params(
+            non_default_params=non_default_params,
+            optional_params=optional_params,
+            model=model,
+            drop_params=(
+                drop_params
+                if drop_params is not None and isinstance(drop_params, bool)
+                else False
+            ),
+        )
     if (
         custom_llm_provider
         in ["openai", "azure", "text-completion-openai"]
```
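Instead of hand-copying each recognised kwarg, the openai-like fallback branch now delegates to the provider config, which is what lets a structured `response_format` survive the trip to the proxy. A condensed sketch of the new mapping call, using the same signature the diff introduces (parameter values here are placeholders):

```python
import litellm

# Placeholder inputs; in get_optional_params these are built from the caller's kwargs.
non_default_params = {"response_format": {"type": "json_object"}}
optional_params: dict = {}

# map_openai_params handles response_format (and the other supported params)
# instead of a long chain of `if x is not None` assignments.
optional_params = litellm.OpenAILikeChatConfig().map_openai_params(
    non_default_params=non_default_params,
    optional_params=optional_params,
    model="gpt-4o",
    drop_params=False,
)
```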
```diff
@@ -138,10 +138,14 @@ async def test_speech_litellm_vertex_async():
         mock_async_post.return_value = mock_response
         model = "vertex_ai/test"

-        response = await litellm.aspeech(
-            model=model,
-            input="async hello what llm guardrail do you have",
-        )
+        try:
+            response = await litellm.aspeech(
+                model=model,
+                input="async hello what llm guardrail do you have",
+            )
+        except litellm.APIConnectionError as e:
+            if "Your default credentials were not found" in str(e):
+                pytest.skip("skipping test, credentials not found")

         # Assert asynchronous call
         mock_async_post.assert_called_once()
@@ -181,18 +185,22 @@ async def test_speech_litellm_vertex_async_with_voice():
         mock_async_post.return_value = mock_response
         model = "vertex_ai/test"

-        response = await litellm.aspeech(
-            model=model,
-            input="async hello what llm guardrail do you have",
-            voice={
-                "languageCode": "en-UK",
-                "name": "en-UK-Studio-O",
-            },
-            audioConfig={
-                "audioEncoding": "LINEAR22",
-                "speakingRate": "10",
-            },
-        )
+        try:
+            response = await litellm.aspeech(
+                model=model,
+                input="async hello what llm guardrail do you have",
+                voice={
+                    "languageCode": "en-UK",
+                    "name": "en-UK-Studio-O",
+                },
+                audioConfig={
+                    "audioEncoding": "LINEAR22",
+                    "speakingRate": "10",
+                },
+            )
+        except litellm.APIConnectionError as e:
+            if "Your default credentials were not found" in str(e):
+                pytest.skip("skipping test, credentials not found")

         # Assert asynchronous call
         mock_async_post.assert_called_once()
@@ -239,18 +247,22 @@ async def test_speech_litellm_vertex_async_with_voice_ssml():
         mock_async_post.return_value = mock_response
         model = "vertex_ai/test"

-        response = await litellm.aspeech(
-            input=ssml,
-            model=model,
-            voice={
-                "languageCode": "en-UK",
-                "name": "en-UK-Studio-O",
-            },
-            audioConfig={
-                "audioEncoding": "LINEAR22",
-                "speakingRate": "10",
-            },
-        )
+        try:
+            response = await litellm.aspeech(
+                input=ssml,
+                model=model,
+                voice={
+                    "languageCode": "en-UK",
+                    "name": "en-UK-Studio-O",
+                },
+                audioConfig={
+                    "audioEncoding": "LINEAR22",
+                    "speakingRate": "10",
+                },
+            )
+        except litellm.APIConnectionError as e:
+            if "Your default credentials were not found" in str(e):
+                pytest.skip("skipping test, credentials not found")

         # Assert asynchronous call
         mock_async_post.assert_called_once()
```
```diff
@@ -1819,6 +1819,43 @@ async def test_litellm_gateway_from_sdk():
         assert "hello" in mock_call.call_args.kwargs["extra_body"]


+@pytest.mark.asyncio
+async def test_litellm_gateway_from_sdk_structured_output():
+    from pydantic import BaseModel
+
+    class Result(BaseModel):
+        answer: str
+
+    litellm.set_verbose = True
+    from openai import OpenAI
+
+    openai_client = OpenAI(api_key="fake-key")
+
+    with patch.object(
+        openai_client.chat.completions, "create", new=MagicMock()
+    ) as mock_call:
+        try:
+            litellm.completion(
+                model="litellm_proxy/openai/gpt-4o",
+                messages=[
+                    {"role": "user", "content": "What is the capital of France?"}
+                ],
+                api_key="my-test-api-key",
+                user="test",
+                response_format=Result,
+                base_url="https://litellm.ml-serving-internal.scale.com",
+                client=openai_client,
+            )
+        except Exception as e:
+            print(e)
+
+        mock_call.assert_called_once()
+
+        print("Call KWARGS - {}".format(mock_call.call_args.kwargs))
+        json_schema = mock_call.call_args.kwargs["response_format"]
+        assert "json_schema" in json_schema
+
+
 # ################### Hugging Face Conversational models ########################
 # def hf_test_completion_conv():
 #     try:
```
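The new test only asserts that the forwarded `response_format` carries a `json_schema` key. Roughly, the mocked client receives a payload in OpenAI's structured-outputs shape; everything below beyond the presence of `"json_schema"` is an assumption from that spec, not something the test checks:

```python
# Approximate shape of the forwarded payload (assumed from OpenAI's structured
# outputs format; only the "json_schema" key is asserted by the test).
forwarded_response_format = {
    "type": "json_schema",
    "json_schema": {
        "name": "Result",
        "schema": {
            "type": "object",
            "properties": {"answer": {"type": "string"}},
            "required": ["answer"],
        },
    },
}
assert "json_schema" in forwarded_response_format
```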
```diff
@@ -393,6 +393,8 @@ def test_whisper_openai():
     transcription = TranscriptionResponse(
         text="Four score and seven years ago, our fathers brought forth on this continent a new nation, conceived in liberty and dedicated to the proposition that all men are created equal. Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure."
     )
+
+    setattr(transcription, "duration", 3)
     transcription._hidden_params = {
         "model": "whisper-1",
         "custom_llm_provider": "openai",
@@ -401,7 +403,6 @@ def test_whisper_openai():
     }
     _total_time_in_seconds = 3

-    transcription._response_ms = _total_time_in_seconds * 1000
     cost = litellm.completion_cost(model="whisper-1", completion_response=transcription)

     print(f"cost: {cost}")
@@ -411,7 +412,7 @@ def test_whisper_openai():
         * _total_time_in_seconds,
         5,
     )
-    assert cost == expected_cost
+    assert round(cost, 5) == round(expected_cost, 5)


 def test_whisper_azure():
@@ -426,8 +427,8 @@ def test_whisper_azure():
         "model_id": None,
     }
     _total_time_in_seconds = 3
+    setattr(transcription, "duration", _total_time_in_seconds)

-    transcription._response_ms = _total_time_in_seconds * 1000
     cost = litellm.completion_cost(
         model="azure/azure-whisper", completion_response=transcription
     )
@@ -439,7 +440,7 @@ def test_whisper_azure():
         * _total_time_in_seconds,
         5,
     )
-    assert cost == expected_cost
+    assert round(cost, 5) == round(expected_cost, 5)


 def test_dalle_3_azure_cost_tracking():
```