(feat) openai prompt caching (non streaming) - add prompt_tokens_details in usage response (#6039)

* add prompt_tokens_details in usage response
* use _prompt_tokens_details as a param in Usage
* fix linting errors
* fix type error
* fix ci/cd deps
* bump deps for openai
* bump deps openai
* fix llm translation testing
* fix llm translation embedding

commit 4e88fd65e1 (parent 9fccb4a0da)
10 changed files with 1515 additions and 1428 deletions
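With this change, a non-streaming OpenAI completion made through litellm surfaces the prompt-caching breakdown on the usage object instead of dropping it. A minimal sketch of what a caller could do with it — the model name and the cached_tokens field on PromptTokensDetails come from the OpenAI SDK this PR pins, not from the diff itself:

import litellm

# Sketch: inspect the prompt-caching breakdown on a non-streaming response.
# Assumes OPENAI_API_KEY is set and the model/prompt actually triggers caching.
response = litellm.completion(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": "hi"}],
)

usage = response.usage
print(usage.prompt_tokens, usage.completion_tokens, usage.total_tokens)

# New in this commit: prompt_tokens_details is carried through instead of dropped.
if usage.prompt_tokens_details is not None:
    print("cached prompt tokens:", usage.prompt_tokens_details.cached_tokens)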
@@ -43,7 +43,7 @@ jobs:
                 pip install opentelemetry-api==1.25.0
                 pip install opentelemetry-sdk==1.25.0
                 pip install opentelemetry-exporter-otlp==1.25.0
-                pip install openai==1.45.0
+                pip install openai==1.51.0
                 pip install prisma==0.11.0
                 pip install "detect_secrets==1.5.0"
                 pip install "httpx==0.24.1"
@@ -374,7 +374,7 @@ jobs:
                 pip install "aiodynamo==23.10.1"
                 pip install "asyncio==3.4.3"
                 pip install "PyGithub==1.59.1"
-                pip install "openai==1.45.0"
+                pip install "openai==1.51.0"
             # Run pytest and generate JUnit XML report
       - run:
           name: Build Docker image
@@ -467,7 +467,7 @@ jobs:
                 pip install "pytest-retry==1.6.3"
                 pip install "pytest-asyncio==0.21.1"
                 pip install aiohttp
-                pip install "openai==1.45.0"
+                pip install "openai==1.51.0"
                 python -m pip install --upgrade pip
                 pip install "pydantic==2.7.1"
                 pip install "pytest==7.3.1"
@@ -575,7 +575,7 @@ jobs:
                 pip install "pytest-asyncio==0.21.1"
                 pip install "google-cloud-aiplatform==1.43.0"
                 pip install aiohttp
-                pip install "openai==1.45.0"
+                pip install "openai==1.51.0"
                 python -m pip install --upgrade pip
                 pip install "pydantic==2.7.1"
                 pip install "pytest==7.3.1"
@@ -743,7 +743,7 @@ jobs:
                 pip install "pytest-retry==1.6.3"
                 pip install "pytest-asyncio==0.21.1"
                 pip install aiohttp
-                pip install "openai==1.45.0"
+                pip install "openai==1.51.0"
                 python -m pip install --upgrade pip
                 pip install "pydantic==2.7.1"
                 pip install "pytest==7.3.1"
@@ -1,5 +1,5 @@
 # used by CI/CD testing
-openai==1.34.0
+openai==1.51.0
 python-dotenv
 tiktoken
 importlib_metadata
@@ -1022,10 +1022,11 @@ class Huggingface(BaseLLM):
             model_response,
             "usage",
             litellm.Usage(
-                **{
-                    "prompt_tokens": input_tokens,
-                    "total_tokens": input_tokens,
-                }
+                prompt_tokens=input_tokens,
+                completion_tokens=input_tokens,
+                total_tokens=input_tokens,
+                prompt_tokens_details=None,
+                completion_tokens_details=None,
             ),
         )
         return model_response
@@ -585,10 +585,11 @@ async def ollama_aembeddings(
         model_response,
         "usage",
         litellm.Usage(
-            **{
-                "prompt_tokens": total_input_tokens,
-                "total_tokens": total_input_tokens,
-            }
+            prompt_tokens=total_input_tokens,
+            completion_tokens=total_input_tokens,
+            total_tokens=total_input_tokens,
+            prompt_tokens_details=None,
+            completion_tokens_details=None,
        ),
    )
    return model_response
@@ -6,7 +6,11 @@ from typing import Any, Dict, List, Literal, Optional, Tuple, Union

 from openai._models import BaseModel as OpenAIObject
 from openai.types.audio.transcription_create_params import FileTypes  # type: ignore
-from openai.types.completion_usage import CompletionTokensDetails, CompletionUsage
+from openai.types.completion_usage import (
+    CompletionTokensDetails,
+    CompletionUsage,
+    PromptTokensDetails,
+)
 from pydantic import ConfigDict, PrivateAttr
 from typing_extensions import Callable, Dict, Required, TypedDict, override

@@ -347,7 +351,7 @@ class Message(OpenAIObject):
             ),
         }
         super(Message, self).__init__(
-            **init_values,
+            **init_values,  # type: ignore
             **params,
         )

@@ -478,6 +482,10 @@ class Usage(CompletionUsage):
         completion_tokens: Optional[int] = None,
         total_tokens: Optional[int] = None,
         reasoning_tokens: Optional[int] = None,
+        prompt_tokens_details: Optional[Union[PromptTokensDetails, dict]] = None,
+        completion_tokens_details: Optional[
+            Union[CompletionTokensDetails, dict]
+        ] = None,
         **params,
     ):
         ## DEEPSEEK PROMPT TOKEN HANDLING ## - follow the anthropic format, of having prompt tokens be just the non-cached token input. Enables accurate cost-tracking - Relevant issue: https://github.com/BerriAI/litellm/issues/5285
@@ -489,29 +497,35 @@ class Usage(CompletionUsage):
             prompt_tokens = params["prompt_cache_miss_tokens"]

         # handle reasoning_tokens
-        completion_tokens_details = None
+        _completion_tokens_details: Optional[CompletionTokensDetails] = None
         if reasoning_tokens:
             completion_tokens_details = CompletionTokensDetails(
                 reasoning_tokens=reasoning_tokens
             )

         # Ensure completion_tokens_details is properly handled
-        if "completion_tokens_details" in params:
-            if isinstance(params["completion_tokens_details"], dict):
-                completion_tokens_details = CompletionTokensDetails(
-                    **params["completion_tokens_details"]
-                )
-            elif isinstance(
-                params["completion_tokens_details"], CompletionTokensDetails
-            ):
-                completion_tokens_details = params["completion_tokens_details"]
-            del params["completion_tokens_details"]
+        if completion_tokens_details:
+            if isinstance(completion_tokens_details, dict):
+                _completion_tokens_details = CompletionTokensDetails(
+                    **completion_tokens_details
+                )
+            elif isinstance(completion_tokens_details, CompletionTokensDetails):
+                _completion_tokens_details = completion_tokens_details
+
+        # handle prompt_tokens_details
+        _prompt_tokens_details: Optional[PromptTokensDetails] = None
+        if prompt_tokens_details:
+            if isinstance(prompt_tokens_details, dict):
+                _prompt_tokens_details = PromptTokensDetails(**prompt_tokens_details)
+            elif isinstance(prompt_tokens_details, PromptTokensDetails):
+                _prompt_tokens_details = prompt_tokens_details

         super().__init__(
             prompt_tokens=prompt_tokens or 0,
             completion_tokens=completion_tokens or 0,
             total_tokens=total_tokens or 0,
-            completion_tokens_details=completion_tokens_details or None,
+            completion_tokens_details=_completion_tokens_details or None,
+            prompt_tokens_details=_prompt_tokens_details or None,
         )

         ## ANTHROPIC MAPPING ##

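Judging from the constructor above, prompt_tokens_details may arrive either as a plain dict (as deserialized from a provider's JSON usage block) or as an already-typed PromptTokensDetails, and both should end up stored as the typed object. A small illustrative sketch — the token values are made up; litellm.Usage and the litellm.types.utils import are taken from this diff and the new test below:

import litellm
from litellm.types.utils import PromptTokensDetails

# Dict input, e.g. straight from a provider's JSON "usage" block.
u1 = litellm.Usage(
    prompt_tokens=2048,
    completion_tokens=10,
    total_tokens=2058,
    prompt_tokens_details={"cached_tokens": 1024},
)

# Typed input, e.g. forwarded from the OpenAI SDK response object.
u2 = litellm.Usage(
    prompt_tokens=2048,
    completion_tokens=10,
    total_tokens=2058,
    prompt_tokens_details=PromptTokensDetails(cached_tokens=1024),
)

# Both paths normalize to the typed object.
assert isinstance(u1.prompt_tokens_details, PromptTokensDetails)
assert u1.prompt_tokens_details.cached_tokens == 1024
assert u1.prompt_tokens_details == u2.prompt_tokens_details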
poetry.lock (generated, 2833 lines changed): diff suppressed because it is too large.
@@ -17,7 +17,7 @@ documentation = "https://docs.litellm.ai"

 [tool.poetry.dependencies]
 python = ">=3.8.1,<4.0, !=3.9.7"
-openai = ">=1.45.0"
+openai = ">=1.51.0"
 python-dotenv = ">=0.2.0"
 tiktoken = ">=0.7.0"
 importlib-metadata = ">=6.8.0"
@@ -1,6 +1,6 @@
 # LITELLM PROXY DEPENDENCIES #
 anyio==4.4.0 # openai + http req.
-openai==1.45.0 # openai req.
+openai==1.51.0 # openai req.
 fastapi==0.111.0 # server dep
 backoff==2.2.1 # server dep
 pyyaml==6.0.0 # server dep
@@ -46,6 +46,7 @@ def mock_chat_response() -> Dict[str, Any]:
             "completion_tokens": 38,
             "completion_tokens_details": None,
             "total_tokens": 268,
+            "prompt_tokens_details": None,
         },
         "system_fingerprint": None,
     }
@@ -201,6 +202,7 @@ def mock_embedding_response() -> Dict[str, Any]:
             "total_tokens": 8,
             "completion_tokens": 0,
             "completion_tokens_details": None,
+            "prompt_tokens_details": None,
         },
     }

tests/llm_translation/test_prompt_caching.py (new file, 34 lines)
@@ -0,0 +1,34 @@
+import json
+import os
+import sys
+from datetime import datetime
+from unittest.mock import AsyncMock
+
+sys.path.insert(
+    0, os.path.abspath("../..")
+)  # Adds the parent directory to the system path
+
+
+import httpx
+import pytest
+from respx import MockRouter
+
+import litellm
+from litellm import Choices, Message, ModelResponse
+from litellm.types.utils import PromptTokensDetails
+
+
+@pytest.mark.asyncio
+async def test_prompt_caching():
+    """
+    Tests that:
+    - prompt_tokens_details is correctly handled and returned as PromptTokensDetails type
+    """
+    response1 = await litellm.acompletion(
+        model="gpt-4o-mini",
+        messages=[{"role": "user", "content": "hi"}],
+    )
+    print("response1", response1)
+    print("response1.usage", response1.usage)
+    print("type of prompt_tokens_details", type(response1.usage.prompt_tokens_details))
+    assert isinstance(response1.usage.prompt_tokens_details, PromptTokensDetails)
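The new test exercises the live API (MockRouter is imported but unused). A CI-friendly mocked variant could look roughly like the sketch below. This assumes litellm's OpenAI calls go through httpx (so respx can intercept them); the endpoint URL, payload shape, and token numbers are illustrative and not taken from the commit:

import httpx
import pytest
import respx

import litellm
from litellm.types.utils import PromptTokensDetails


@pytest.mark.asyncio
@respx.mock
async def test_prompt_caching_mocked():
    # Hypothetical mocked payload whose usage block carries prompt_tokens_details.
    mock_payload = {
        "id": "chatcmpl-123",
        "object": "chat.completion",
        "created": 1677652288,
        "model": "gpt-4o-mini",
        "choices": [
            {
                "index": 0,
                "message": {"role": "assistant", "content": "hi there"},
                "finish_reason": "stop",
            }
        ],
        "usage": {
            "prompt_tokens": 2048,
            "completion_tokens": 10,
            "total_tokens": 2058,
            "prompt_tokens_details": {"cached_tokens": 1024},
        },
    }
    # Intercept the OpenAI chat completions call instead of hitting the live API.
    respx.post("https://api.openai.com/v1/chat/completions").mock(
        return_value=httpx.Response(200, json=mock_payload)
    )

    response = await litellm.acompletion(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": "hi"}],
        api_key="sk-test",  # dummy key; the request never leaves respx
    )
    assert isinstance(response.usage.prompt_tokens_details, PromptTokensDetails)
    assert response.usage.prompt_tokens_details.cached_tokens == 1024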