(feat) openai prompt caching (non streaming) - add prompt_tokens_details in usage response (#6039)

* add prompt_tokens_details in usage response

* use _prompt_tokens_details as a param in Usage

* fix linting errors

* fix type error

* fix ci/cd deps

* bump deps for openai

* bump deps openai

* fix llm translation testing

* fix llm translation embedding
Ishaan Jaff 2024-10-03 11:01:10 -07:00 committed by GitHub
parent 9fccb4a0da
commit 4e88fd65e1
10 changed files with 1515 additions and 1428 deletions
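
Before the file-by-file diffs, a minimal sketch of what this change exposes to callers. This is illustrative and not part of the commit: it assumes openai>=1.51.0 and an OPENAI_API_KEY in the environment, the model name is only an example, and cached_tokens is populated only when the provider actually reports prompt caching.

# Hedged sketch, not part of the diff below.
import litellm
from litellm.types.utils import PromptTokensDetails

response = litellm.completion(
    model="gpt-4o-mini",  # example model
    messages=[{"role": "user", "content": "hi"}],
)

usage = response.usage
print(usage.prompt_tokens, usage.completion_tokens, usage.total_tokens)

# New in this commit: usage.prompt_tokens_details is a typed object, not a raw dict.
if usage.prompt_tokens_details is not None:
    assert isinstance(usage.prompt_tokens_details, PromptTokensDetails)
    print("cached prompt tokens:", usage.prompt_tokens_details.cached_tokens)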

View file

@@ -43,7 +43,7 @@ jobs:
 pip install opentelemetry-api==1.25.0
 pip install opentelemetry-sdk==1.25.0
 pip install opentelemetry-exporter-otlp==1.25.0
-pip install openai==1.45.0
+pip install openai==1.51.0
 pip install prisma==0.11.0
 pip install "detect_secrets==1.5.0"
 pip install "httpx==0.24.1"
@@ -374,7 +374,7 @@ jobs:
 pip install "aiodynamo==23.10.1"
 pip install "asyncio==3.4.3"
 pip install "PyGithub==1.59.1"
-pip install "openai==1.45.0"
+pip install "openai==1.51.0"
 # Run pytest and generate JUnit XML report
 - run:
     name: Build Docker image
@@ -467,7 +467,7 @@ jobs:
 pip install "pytest-retry==1.6.3"
 pip install "pytest-asyncio==0.21.1"
 pip install aiohttp
-pip install "openai==1.45.0"
+pip install "openai==1.51.0"
 python -m pip install --upgrade pip
 pip install "pydantic==2.7.1"
 pip install "pytest==7.3.1"
@@ -575,7 +575,7 @@ jobs:
 pip install "pytest-asyncio==0.21.1"
 pip install "google-cloud-aiplatform==1.43.0"
 pip install aiohttp
-pip install "openai==1.45.0"
+pip install "openai==1.51.0"
 python -m pip install --upgrade pip
 pip install "pydantic==2.7.1"
 pip install "pytest==7.3.1"
@@ -743,7 +743,7 @@ jobs:
 pip install "pytest-retry==1.6.3"
 pip install "pytest-asyncio==0.21.1"
 pip install aiohttp
-pip install "openai==1.45.0"
+pip install "openai==1.51.0"
 python -m pip install --upgrade pip
 pip install "pydantic==2.7.1"
 pip install "pytest==7.3.1"

View file

@@ -1,5 +1,5 @@
 # used by CI/CD testing
-openai==1.34.0
+openai==1.51.0
 python-dotenv
 tiktoken
 importlib_metadata

View file

@@ -1022,10 +1022,11 @@ class Huggingface(BaseLLM):
             model_response,
             "usage",
             litellm.Usage(
-                **{
-                    "prompt_tokens": input_tokens,
-                    "total_tokens": input_tokens,
-                }
+                prompt_tokens=input_tokens,
+                completion_tokens=input_tokens,
+                total_tokens=input_tokens,
+                prompt_tokens_details=None,
+                completion_tokens_details=None,
             ),
         )
         return model_response

View file

@@ -585,10 +585,11 @@ async def ollama_aembeddings(
         model_response,
         "usage",
         litellm.Usage(
-            **{
-                "prompt_tokens": total_input_tokens,
-                "total_tokens": total_input_tokens,
-            }
+            prompt_tokens=total_input_tokens,
+            completion_tokens=total_input_tokens,
+            total_tokens=total_input_tokens,
+            prompt_tokens_details=None,
+            completion_tokens_details=None,
         ),
     )
     return model_response

View file

@@ -6,7 +6,11 @@ from typing import Any, Dict, List, Literal, Optional, Tuple, Union
 from openai._models import BaseModel as OpenAIObject
 from openai.types.audio.transcription_create_params import FileTypes  # type: ignore
-from openai.types.completion_usage import CompletionTokensDetails, CompletionUsage
+from openai.types.completion_usage import (
+    CompletionTokensDetails,
+    CompletionUsage,
+    PromptTokensDetails,
+)
 from pydantic import ConfigDict, PrivateAttr
 from typing_extensions import Callable, Dict, Required, TypedDict, override
@@ -347,7 +351,7 @@ class Message(OpenAIObject):
             ),
         }
         super(Message, self).__init__(
-            **init_values,
+            **init_values,  # type: ignore
             **params,
         )
@@ -478,6 +482,10 @@ class Usage(CompletionUsage):
         completion_tokens: Optional[int] = None,
         total_tokens: Optional[int] = None,
         reasoning_tokens: Optional[int] = None,
+        prompt_tokens_details: Optional[Union[PromptTokensDetails, dict]] = None,
+        completion_tokens_details: Optional[
+            Union[CompletionTokensDetails, dict]
+        ] = None,
         **params,
     ):
         ## DEEPSEEK PROMPT TOKEN HANDLING ## - follow the anthropic format, of having prompt tokens be just the non-cached token input. Enables accurate cost-tracking - Relevant issue: https://github.com/BerriAI/litellm/issues/5285
@@ -489,29 +497,35 @@ class Usage(CompletionUsage):
             prompt_tokens = params["prompt_cache_miss_tokens"]

         # handle reasoning_tokens
-        completion_tokens_details = None
+        _completion_tokens_details: Optional[CompletionTokensDetails] = None
         if reasoning_tokens:
             completion_tokens_details = CompletionTokensDetails(
                 reasoning_tokens=reasoning_tokens
             )

         # Ensure completion_tokens_details is properly handled
-        if "completion_tokens_details" in params:
-            if isinstance(params["completion_tokens_details"], dict):
-                completion_tokens_details = CompletionTokensDetails(
-                    **params["completion_tokens_details"]
-                )
-            elif isinstance(
-                params["completion_tokens_details"], CompletionTokensDetails
-            ):
-                completion_tokens_details = params["completion_tokens_details"]
-            del params["completion_tokens_details"]
+        if completion_tokens_details:
+            if isinstance(completion_tokens_details, dict):
+                _completion_tokens_details = CompletionTokensDetails(
+                    **completion_tokens_details
+                )
+            elif isinstance(completion_tokens_details, CompletionTokensDetails):
+                _completion_tokens_details = completion_tokens_details
+
+        # handle prompt_tokens_details
+        _prompt_tokens_details: Optional[PromptTokensDetails] = None
+        if prompt_tokens_details:
+            if isinstance(prompt_tokens_details, dict):
+                _prompt_tokens_details = PromptTokensDetails(**prompt_tokens_details)
+            elif isinstance(prompt_tokens_details, PromptTokensDetails):
+                _prompt_tokens_details = prompt_tokens_details

         super().__init__(
             prompt_tokens=prompt_tokens or 0,
             completion_tokens=completion_tokens or 0,
             total_tokens=total_tokens or 0,
-            completion_tokens_details=completion_tokens_details or None,
+            completion_tokens_details=_completion_tokens_details or None,
+            prompt_tokens_details=_prompt_tokens_details or None,
         )

         ## ANTHROPIC MAPPING ##
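
As a usage note on the constructor change above (a hedged sketch; the token counts are illustrative): prompt_tokens_details may now be passed either as a plain dict or as an already-typed PromptTokensDetails, and both forms are normalized before being handed to CompletionUsage.

from openai.types.completion_usage import PromptTokensDetails

import litellm

# dict form: normalized into PromptTokensDetails inside Usage.__init__
usage = litellm.Usage(
    prompt_tokens=230,
    completion_tokens=38,
    total_tokens=268,
    prompt_tokens_details={"cached_tokens": 128},
)
assert isinstance(usage.prompt_tokens_details, PromptTokensDetails)
assert usage.prompt_tokens_details.cached_tokens == 128

# already-typed form: passed through as-is
usage2 = litellm.Usage(
    prompt_tokens=230,
    completion_tokens=38,
    total_tokens=268,
    prompt_tokens_details=PromptTokensDetails(cached_tokens=128),
)
assert usage2.prompt_tokens_details == usage.prompt_tokens_details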

poetry.lock (generated): 2833 changed lines. File diff suppressed because it is too large.

View file

@@ -17,7 +17,7 @@ documentation = "https://docs.litellm.ai"
 [tool.poetry.dependencies]
 python = ">=3.8.1,<4.0, !=3.9.7"
-openai = ">=1.45.0"
+openai = ">=1.51.0"
 python-dotenv = ">=0.2.0"
 tiktoken = ">=0.7.0"
 importlib-metadata = ">=6.8.0"

View file

@@ -1,6 +1,6 @@
 # LITELLM PROXY DEPENDENCIES #
 anyio==4.4.0 # openai + http req.
-openai==1.45.0 # openai req.
+openai==1.51.0 # openai req.
 fastapi==0.111.0 # server dep
 backoff==2.2.1 # server dep
 pyyaml==6.0.0 # server dep

View file

@@ -46,6 +46,7 @@ def mock_chat_response() -> Dict[str, Any]:
             "completion_tokens": 38,
             "completion_tokens_details": None,
             "total_tokens": 268,
+            "prompt_tokens_details": None,
         },
         "system_fingerprint": None,
     }
@@ -201,6 +202,7 @@ def mock_embedding_response() -> Dict[str, Any]:
             "total_tokens": 8,
             "completion_tokens": 0,
             "completion_tokens_details": None,
+            "prompt_tokens_details": None,
         },
     }

View file

@@ -0,0 +1,34 @@
+import json
+import os
+import sys
+from datetime import datetime
+from unittest.mock import AsyncMock
+
+sys.path.insert(
+    0, os.path.abspath("../..")
+)  # Adds the parent directory to the system path
+
+import httpx
+import pytest
+from respx import MockRouter
+
+import litellm
+from litellm import Choices, Message, ModelResponse
+from litellm.types.utils import PromptTokensDetails
+
+
+@pytest.mark.asyncio
+async def test_prompt_caching():
+    """
+    Tests that:
+    - prompt_tokens_details is correctly handled and returned as PromptTokensDetails type
+    """
+    response1 = await litellm.acompletion(
+        model="gpt-4o-mini",
+        messages=[{"role": "user", "content": "hi"}],
+    )
+    print("response1", response1)
+    print("response1.usage", response1.usage)
+    print("type of prompt_tokens_details", type(response1.usage.prompt_tokens_details))
+    assert isinstance(response1.usage.prompt_tokens_details, PromptTokensDetails)
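
The committed test only asserts the type. A hedged follow-on sketch, written as if it continued inside the same async test body, of how the field would be used to observe an actual cache hit on a repeated call; providers only report cached_tokens for sufficiently long prompts, so nothing is asserted about the value here, and the prompt and model are illustrative.

    # Hypothetical continuation, not part of the committed test file.
    long_prompt = "repeat this context " * 400  # long enough that the provider may cache it

    first = await litellm.acompletion(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": long_prompt}],
    )
    second = await litellm.acompletion(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": long_prompt}],
    )
    print("first call prompt tokens:", first.usage.prompt_tokens)

    # On a cache hit, the second call's prompt_tokens_details.cached_tokens should be > 0.
    if second.usage.prompt_tokens_details is not None:
        print("cached tokens on second call:", second.usage.prompt_tokens_details.cached_tokens)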