Mirror of https://github.com/BerriAI/litellm.git (synced 2025-04-25 02:34:29 +00:00)
(feat) openai prompt caching (non streaming) - add prompt_tokens_details in usage response (#6039)
* add prompt_tokens_details in usage response
* use _prompt_tokens_details as a param in Usage
* fix linting errors
* fix type error
* fix ci/cd deps
* bump deps for openai
* bump deps openai
* fix llm translation testing
* fix llm translation embedding
This commit is contained in:
parent 9fccb4a0da
commit 4e88fd65e1
10 changed files with 1515 additions and 1428 deletions
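In caller terms, the change means the usage block of a non-streaming completion can now surface prompt_tokens_details. A minimal sketch of reading it (not from the commit itself; the model, the message, and the cached_tokens field on the OpenAI PromptTokensDetails type are illustrative assumptions):

import litellm

response = litellm.completion(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": "hi"}],
)

# After this commit, usage carries prompt_tokens_details (PromptTokensDetails or None).
details = response.usage.prompt_tokens_details
if details is not None and details.cached_tokens:
    print(f"{details.cached_tokens} prompt tokens were served from OpenAI's prompt cache")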
@@ -43,7 +43,7 @@ jobs:
             pip install opentelemetry-api==1.25.0
             pip install opentelemetry-sdk==1.25.0
             pip install opentelemetry-exporter-otlp==1.25.0
-            pip install openai==1.45.0
+            pip install openai==1.51.0
             pip install prisma==0.11.0
             pip install "detect_secrets==1.5.0"
             pip install "httpx==0.24.1"
@@ -374,7 +374,7 @@ jobs:
             pip install "aiodynamo==23.10.1"
             pip install "asyncio==3.4.3"
             pip install "PyGithub==1.59.1"
-            pip install "openai==1.45.0"
+            pip install "openai==1.51.0"
             # Run pytest and generate JUnit XML report
       - run:
           name: Build Docker image
@@ -467,7 +467,7 @@ jobs:
             pip install "pytest-retry==1.6.3"
             pip install "pytest-asyncio==0.21.1"
             pip install aiohttp
-            pip install "openai==1.45.0"
+            pip install "openai==1.51.0"
             python -m pip install --upgrade pip
             pip install "pydantic==2.7.1"
             pip install "pytest==7.3.1"
@@ -575,7 +575,7 @@ jobs:
             pip install "pytest-asyncio==0.21.1"
             pip install "google-cloud-aiplatform==1.43.0"
             pip install aiohttp
-            pip install "openai==1.45.0"
+            pip install "openai==1.51.0"
             python -m pip install --upgrade pip
             pip install "pydantic==2.7.1"
             pip install "pytest==7.3.1"
@@ -743,7 +743,7 @@ jobs:
             pip install "pytest-retry==1.6.3"
             pip install "pytest-asyncio==0.21.1"
             pip install aiohttp
-            pip install "openai==1.45.0"
+            pip install "openai==1.51.0"
             python -m pip install --upgrade pip
             pip install "pydantic==2.7.1"
             pip install "pytest==7.3.1"
@@ -1,5 +1,5 @@
 # used by CI/CD testing
-openai==1.34.0
+openai==1.51.0
 python-dotenv
 tiktoken
 importlib_metadata
@@ -1022,10 +1022,11 @@ class Huggingface(BaseLLM):
             model_response,
             "usage",
             litellm.Usage(
-                **{
-                    "prompt_tokens": input_tokens,
-                    "total_tokens": input_tokens,
-                }
+                prompt_tokens=input_tokens,
+                completion_tokens=input_tokens,
+                total_tokens=input_tokens,
+                prompt_tokens_details=None,
+                completion_tokens_details=None,
             ),
         )
         return model_response
@@ -585,10 +585,11 @@ async def ollama_aembeddings(
         model_response,
         "usage",
         litellm.Usage(
-            **{
-                "prompt_tokens": total_input_tokens,
-                "total_tokens": total_input_tokens,
-            }
+            prompt_tokens=total_input_tokens,
+            completion_tokens=total_input_tokens,
+            total_tokens=total_input_tokens,
+            prompt_tokens_details=None,
+            completion_tokens_details=None,
         ),
     )
     return model_response
@@ -6,7 +6,11 @@ from typing import Any, Dict, List, Literal, Optional, Tuple, Union
 
 from openai._models import BaseModel as OpenAIObject
 from openai.types.audio.transcription_create_params import FileTypes  # type: ignore
-from openai.types.completion_usage import CompletionTokensDetails, CompletionUsage
+from openai.types.completion_usage import (
+    CompletionTokensDetails,
+    CompletionUsage,
+    PromptTokensDetails,
+)
 from pydantic import ConfigDict, PrivateAttr
 from typing_extensions import Callable, Dict, Required, TypedDict, override
 
@@ -347,7 +351,7 @@ class Message(OpenAIObject):
             ),
         }
         super(Message, self).__init__(
-            **init_values,
+            **init_values,  # type: ignore
             **params,
         )
 
@@ -478,6 +482,10 @@ class Usage(CompletionUsage):
         completion_tokens: Optional[int] = None,
         total_tokens: Optional[int] = None,
         reasoning_tokens: Optional[int] = None,
+        prompt_tokens_details: Optional[Union[PromptTokensDetails, dict]] = None,
+        completion_tokens_details: Optional[
+            Union[CompletionTokensDetails, dict]
+        ] = None,
         **params,
     ):
         ## DEEPSEEK PROMPT TOKEN HANDLING ## - follow the anthropic format, of having prompt tokens be just the non-cached token input. Enables accurate cost-tracking - Relevant issue: https://github.com/BerriAI/litellm/issues/5285
@@ -489,29 +497,35 @@
             prompt_tokens = params["prompt_cache_miss_tokens"]
 
         # handle reasoning_tokens
-        completion_tokens_details = None
+        _completion_tokens_details: Optional[CompletionTokensDetails] = None
         if reasoning_tokens:
             completion_tokens_details = CompletionTokensDetails(
                 reasoning_tokens=reasoning_tokens
             )
 
         # Ensure completion_tokens_details is properly handled
-        if "completion_tokens_details" in params:
-            if isinstance(params["completion_tokens_details"], dict):
-                completion_tokens_details = CompletionTokensDetails(
-                    **params["completion_tokens_details"]
+        if completion_tokens_details:
+            if isinstance(completion_tokens_details, dict):
+                _completion_tokens_details = CompletionTokensDetails(
+                    **completion_tokens_details
                 )
-            elif isinstance(
-                params["completion_tokens_details"], CompletionTokensDetails
-            ):
-                completion_tokens_details = params["completion_tokens_details"]
-            del params["completion_tokens_details"]
+            elif isinstance(completion_tokens_details, CompletionTokensDetails):
+                _completion_tokens_details = completion_tokens_details
+
+        # handle prompt_tokens_details
+        _prompt_tokens_details: Optional[PromptTokensDetails] = None
+        if prompt_tokens_details:
+            if isinstance(prompt_tokens_details, dict):
+                _prompt_tokens_details = PromptTokensDetails(**prompt_tokens_details)
+            elif isinstance(prompt_tokens_details, PromptTokensDetails):
+                _prompt_tokens_details = prompt_tokens_details
 
         super().__init__(
             prompt_tokens=prompt_tokens or 0,
             completion_tokens=completion_tokens or 0,
             total_tokens=total_tokens or 0,
-            completion_tokens_details=completion_tokens_details or None,
+            completion_tokens_details=_completion_tokens_details or None,
+            prompt_tokens_details=_prompt_tokens_details or None,
         )
 
         ## ANTHROPIC MAPPING ##
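As a standalone illustration of the coercion above (not part of the diff; the token counts and the cached_tokens value are made up), the constructor now accepts prompt_tokens_details either as a plain dict or as a PromptTokensDetails instance and normalizes both onto the typed field:

from litellm.types.utils import PromptTokensDetails, Usage

# dict form: coerced via PromptTokensDetails(**prompt_tokens_details)
u1 = Usage(
    prompt_tokens=268,
    completion_tokens=38,
    total_tokens=306,
    prompt_tokens_details={"cached_tokens": 128},
)

# typed form: passed through unchanged
u2 = Usage(
    prompt_tokens=268,
    completion_tokens=38,
    total_tokens=306,
    prompt_tokens_details=PromptTokensDetails(cached_tokens=128),
)

assert isinstance(u1.prompt_tokens_details, PromptTokensDetails)
assert u1.prompt_tokens_details == u2.prompt_tokens_details  # pydantic models compare by field values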
poetry.lock (generated, 2833 changed lines): file diff suppressed because it is too large.
@@ -17,7 +17,7 @@ documentation = "https://docs.litellm.ai"
 
 [tool.poetry.dependencies]
 python = ">=3.8.1,<4.0, !=3.9.7"
-openai = ">=1.45.0"
+openai = ">=1.51.0"
 python-dotenv = ">=0.2.0"
 tiktoken = ">=0.7.0"
 importlib-metadata = ">=6.8.0"
@@ -1,6 +1,6 @@
 # LITELLM PROXY DEPENDENCIES #
 anyio==4.4.0 # openai + http req.
-openai==1.45.0 # openai req.
+openai==1.51.0 # openai req.
 fastapi==0.111.0 # server dep
 backoff==2.2.1 # server dep
 pyyaml==6.0.0 # server dep
@@ -46,6 +46,7 @@ def mock_chat_response() -> Dict[str, Any]:
             "completion_tokens": 38,
             "completion_tokens_details": None,
             "total_tokens": 268,
+            "prompt_tokens_details": None,
         },
         "system_fingerprint": None,
     }
@@ -201,6 +202,7 @@ def mock_embedding_response() -> Dict[str, Any]:
             "total_tokens": 8,
             "completion_tokens": 0,
             "completion_tokens_details": None,
+            "prompt_tokens_details": None,
         },
     }
 
tests/llm_translation/test_prompt_caching.py (new file, 34 lines)
@@ -0,0 +1,34 @@
+import json
+import os
+import sys
+from datetime import datetime
+from unittest.mock import AsyncMock
+
+sys.path.insert(
+    0, os.path.abspath("../..")
+)  # Adds the parent directory to the system path
+
+
+import httpx
+import pytest
+from respx import MockRouter
+
+import litellm
+from litellm import Choices, Message, ModelResponse
+from litellm.types.utils import PromptTokensDetails
+
+
+@pytest.mark.asyncio
+async def test_prompt_caching():
+    """
+    Tests that:
+    - prompt_tokens_details is correctly handled and returned as PromptTokensDetails type
+    """
+    response1 = await litellm.acompletion(
+        model="gpt-4o-mini",
+        messages=[{"role": "user", "content": "hi"}],
+    )
+    print("response1", response1)
+    print("response1.usage", response1.usage)
+    print("type of prompt_tokens_details", type(response1.usage.prompt_tokens_details))
+    assert isinstance(response1.usage.prompt_tokens_details, PromptTokensDetails)
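The new test exercises the live OpenAI API, so it needs a real OPENAI_API_KEY. Since the file already imports httpx, pytest, and respx's MockRouter, a hermetic variant along the following lines is also possible; the endpoint URL, the payload values, and the fake key are illustrative assumptions, not part of the commit:

@pytest.mark.asyncio
async def test_prompt_caching_mocked(respx_mock: MockRouter):
    # Made-up OpenAI chat-completions payload carrying prompt_tokens_details.
    mock_response = {
        "id": "chatcmpl-123",
        "object": "chat.completion",
        "created": 1700000000,
        "model": "gpt-4o-mini",
        "choices": [
            {
                "index": 0,
                "message": {"role": "assistant", "content": "Hello!"},
                "finish_reason": "stop",
            }
        ],
        "usage": {
            "prompt_tokens": 1028,
            "completion_tokens": 5,
            "total_tokens": 1033,
            "prompt_tokens_details": {"cached_tokens": 1024},
            "completion_tokens_details": {"reasoning_tokens": 0},
        },
    }
    respx_mock.post("https://api.openai.com/v1/chat/completions").mock(
        return_value=httpx.Response(200, json=mock_response)
    )

    response = await litellm.acompletion(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": "hi"}],
        api_key="sk-test",  # never used for a real request; respx intercepts the call
    )
    assert isinstance(response.usage.prompt_tokens_details, PromptTokensDetails)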