(feat) add cost tracking for OpenAI prompt caching (#6055)
* add cache_read_input_token_cost for prompt caching models
* add prompt caching for latest models
* add openai cost calculator
* add openai prompt caching test
* fix lint check
* add note on how usage._cache_read_input_tokens is used
* fix cost calc whisper openai
* use output_cost_per_second
* add input_cost_per_second
parent 930606ad63
commit 3682f661d8
5 changed files with 202 additions and 32 deletions
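The commit message refers to per-model pricing fields (cache_read_input_token_cost, input_cost_per_second, output_cost_per_second). The following is a minimal sketch of the cost arithmetic the new test below asserts, not litellm's actual implementation: the helper names, the pricing dict, and the example prices are illustrative assumptions; only the field names come from the commit message.

# Sketch only -- mirrors the expected-cost math in the new test, not litellm code.


def prompt_caching_cost(prompt_tokens, completion_tokens, cached_tokens, pricing):
    # Matches the test's _expected_cost2: prompt_tokens here counts only the
    # non-cached prompt tokens (the mocked cache-hit usage reports
    # prompt_tokens=0 when all 14 prompt tokens are served from the cache).
    return (
        prompt_tokens * pricing["input_cost_per_token"]
        + completion_tokens * pricing["output_cost_per_token"]
        + cached_tokens * pricing["cache_read_input_token_cost"]
    )


def per_second_cost(duration_seconds, pricing):
    # For audio models such as whisper the commit switches to duration-based
    # pricing; which of the two per-second fields a given model sets is an
    # assumption here, so sum whichever are present.
    return duration_seconds * (
        pricing.get("input_cost_per_second", 0.0)
        + pricing.get("output_cost_per_second", 0.0)
    )


if __name__ == "__main__":
    # Placeholder prices (not real rates): cached prompt tokens billed at
    # half the normal input rate.
    pricing = {
        "input_cost_per_token": 1.0e-6,
        "output_cost_per_token": 2.0e-6,
        "cache_read_input_token_cost": 0.5e-6,
    }
    # Cache hit from the test: 0 non-cached prompt tokens, 14 cached, 10 completion.
    print(prompt_caching_cost(0, 10, 14, pricing))  # cheaper than the cold call
    print(prompt_caching_cost(14, 10, 0, pricing))  # cold call for comparison

In the diff below, the new test builds a mocked cache-hit response and asserts that completion_cost() bills usage.prompt_tokens_details.cached_tokens at the model's cache_read_input_token_cost, so the cached call comes out cheaper than the identical uncached call.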
@@ -24,6 +24,7 @@ from litellm import (
    model_cost,
    open_ai_chat_completion_models,
)
from litellm.types.utils import PromptTokensDetails
from litellm.litellm_core_utils.litellm_logging import CustomLogger

@@ -209,7 +210,9 @@ def test_cost_ft_gpt_35():
        usage=Usage(prompt_tokens=21, completion_tokens=17, total_tokens=38),
    )

-    cost = litellm.completion_cost(completion_response=resp)
+    cost = litellm.completion_cost(
+        completion_response=resp, custom_llm_provider="openai"
+    )
    print("\n Calculated Cost for ft:gpt-3.5", cost)
    input_cost = model_cost["ft:gpt-3.5-turbo"]["input_cost_per_token"]
    output_cost = model_cost["ft:gpt-3.5-turbo"]["output_cost_per_token"]

@@ -1330,6 +1333,90 @@ def test_completion_cost_vertex_llama3():
    assert cost == 0


def test_cost_openai_prompt_caching():
    from litellm.utils import Choices, Message, ModelResponse, Usage
    from litellm import get_model_info

    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
    litellm.model_cost = litellm.get_model_cost_map(url="")

    model = "gpt-4o-mini-2024-07-18"

    ## LLM API CALL ## (MORE EXPENSIVE)
    response_1 = ModelResponse(
        id="chatcmpl-3f427194-0840-4d08-b571-56bfe38a5424",
        choices=[
            Choices(
                finish_reason="length",
                index=0,
                message=Message(
                    content="Hello! I'm doing well, thank you for",
                    role="assistant",
                    tool_calls=None,
                    function_call=None,
                ),
            )
        ],
        created=1725036547,
        model=model,
        object="chat.completion",
        system_fingerprint=None,
        usage=Usage(
            completion_tokens=10,
            prompt_tokens=14,
            total_tokens=24,
        ),
    )

    ## PROMPT CACHE HIT ## (LESS EXPENSIVE)
    response_2 = ModelResponse(
        id="chatcmpl-3f427194-0840-4d08-b571-56bfe38a5424",
        choices=[
            Choices(
                finish_reason="length",
                index=0,
                message=Message(
                    content="Hello! I'm doing well, thank you for",
                    role="assistant",
                    tool_calls=None,
                    function_call=None,
                ),
            )
        ],
        created=1725036547,
        model=model,
        object="chat.completion",
        system_fingerprint=None,
        usage=Usage(
            completion_tokens=10,
            prompt_tokens=0,
            total_tokens=10,
            prompt_tokens_details=PromptTokensDetails(
                cached_tokens=14,
            ),
        ),
    )

    cost_1 = completion_cost(model=model, completion_response=response_1)
    cost_2 = completion_cost(model=model, completion_response=response_2)
    assert cost_1 > cost_2

    model_info = get_model_info(model=model, custom_llm_provider="openai")
    usage = response_2.usage

    _expected_cost2 = (
        usage.prompt_tokens * model_info["input_cost_per_token"]
        + usage.completion_tokens * model_info["output_cost_per_token"]
        + usage.prompt_tokens_details.cached_tokens
        * model_info["cache_read_input_token_cost"]
    )

    print("_expected_cost2", _expected_cost2)
    print("cost_2", cost_2)

    assert cost_2 == _expected_cost2


@pytest.mark.parametrize(
    "model",
    [