(feat) openai prompt caching (non streaming) - add prompt_tokens_details in usage response (#6039)

* add prompt_tokens_details in usage response

* use _prompt_tokens_details as a param in Usage

* fix linting errors

* fix type error

* fix ci/cd deps

* bump deps for openai

* bump deps openai

* fix llm translation testing

* fix llm translation embedding
Ishaan Jaff, 2024-10-03 11:01:10 -07:00, committed by GitHub
commit 4e88fd65e1 (parent 9fccb4a0da)
10 changed files with 1515 additions and 1428 deletions
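
The change, in short: for non-streaming OpenAI responses, the usage block now carries prompt_tokens_details (OpenAI's prompt-caching breakdown) alongside completion_tokens_details. A minimal sketch of what a caller sees after this commit (assumes a configured OpenAI key; actual token values depend on the provider):

import litellm

response = litellm.completion(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": "hi"}],
)

usage = response.usage
print(usage.prompt_tokens, usage.completion_tokens, usage.total_tokens)

# prompt_tokens_details is an openai.types.completion_usage.PromptTokensDetails
# instance (or None when the provider returns no caching data).
if usage.prompt_tokens_details is not None:
    print("cached prompt tokens:", usage.prompt_tokens_details.cached_tokens)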

@@ -43,7 +43,7 @@ jobs:
pip install opentelemetry-api==1.25.0
pip install opentelemetry-sdk==1.25.0
pip install opentelemetry-exporter-otlp==1.25.0
pip install openai==1.45.0
pip install openai==1.51.0
pip install prisma==0.11.0
pip install "detect_secrets==1.5.0"
pip install "httpx==0.24.1"
@@ -374,7 +374,7 @@ jobs:
pip install "aiodynamo==23.10.1"
pip install "asyncio==3.4.3"
pip install "PyGithub==1.59.1"
pip install "openai==1.45.0"
pip install "openai==1.51.0"
# Run pytest and generate JUnit XML report
- run:
name: Build Docker image
@@ -467,7 +467,7 @@ jobs:
pip install "pytest-retry==1.6.3"
pip install "pytest-asyncio==0.21.1"
pip install aiohttp
pip install "openai==1.45.0"
pip install "openai==1.51.0"
python -m pip install --upgrade pip
pip install "pydantic==2.7.1"
pip install "pytest==7.3.1"
@@ -575,7 +575,7 @@ jobs:
pip install "pytest-asyncio==0.21.1"
pip install "google-cloud-aiplatform==1.43.0"
pip install aiohttp
pip install "openai==1.45.0"
pip install "openai==1.51.0"
python -m pip install --upgrade pip
pip install "pydantic==2.7.1"
pip install "pytest==7.3.1"
@@ -743,7 +743,7 @@ jobs:
pip install "pytest-retry==1.6.3"
pip install "pytest-asyncio==0.21.1"
pip install aiohttp
pip install "openai==1.45.0"
pip install "openai==1.51.0"
python -m pip install --upgrade pip
pip install "pydantic==2.7.1"
pip install "pytest==7.3.1"

@@ -1,5 +1,5 @@
# used by CI/CD testing
openai==1.34.0
openai==1.51.0
python-dotenv
tiktoken
importlib_metadata

@@ -1022,10 +1022,11 @@ class Huggingface(BaseLLM):
model_response,
"usage",
litellm.Usage(
**{
"prompt_tokens": input_tokens,
"total_tokens": input_tokens,
}
prompt_tokens=input_tokens,
completion_tokens=input_tokens,
total_tokens=input_tokens,
prompt_tokens_details=None,
completion_tokens_details=None,
),
)
return model_response

@@ -585,10 +585,11 @@ async def ollama_aembeddings(
model_response,
"usage",
litellm.Usage(
**{
"prompt_tokens": total_input_tokens,
"total_tokens": total_input_tokens,
}
prompt_tokens=total_input_tokens,
completion_tokens=total_input_tokens,
total_tokens=total_input_tokens,
prompt_tokens_details=None,
completion_tokens_details=None,
),
)
return model_response
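
Both the Hugging Face and Ollama embedding paths above switch from unpacking a raw dict into litellm.Usage to passing explicit keyword arguments, so the new prompt_tokens_details / completion_tokens_details parameters are always supplied (here as None, since these providers report no cache or reasoning breakdown). A rough sketch of the pattern with a made-up token count:

import litellm

input_tokens = 42  # hypothetical count reported by the provider

usage = litellm.Usage(
    prompt_tokens=input_tokens,
    completion_tokens=input_tokens,
    total_tokens=input_tokens,
    prompt_tokens_details=None,      # no prompt-cache data from these providers
    completion_tokens_details=None,  # no reasoning-token data either
)
print(usage)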

@@ -6,7 +6,11 @@ from typing import Any, Dict, List, Literal, Optional, Tuple, Union
from openai._models import BaseModel as OpenAIObject
from openai.types.audio.transcription_create_params import FileTypes # type: ignore
from openai.types.completion_usage import CompletionTokensDetails, CompletionUsage
from openai.types.completion_usage import (
CompletionTokensDetails,
CompletionUsage,
PromptTokensDetails,
)
from pydantic import ConfigDict, PrivateAttr
from typing_extensions import Callable, Dict, Required, TypedDict, override
@@ -347,7 +351,7 @@ class Message(OpenAIObject):
),
}
super(Message, self).__init__(
**init_values,
**init_values, # type: ignore
**params,
)
@@ -478,6 +482,10 @@ class Usage(CompletionUsage):
completion_tokens: Optional[int] = None,
total_tokens: Optional[int] = None,
reasoning_tokens: Optional[int] = None,
prompt_tokens_details: Optional[Union[PromptTokensDetails, dict]] = None,
completion_tokens_details: Optional[
Union[CompletionTokensDetails, dict]
] = None,
**params,
):
## DEEPSEEK PROMPT TOKEN HANDLING ## - follow the anthropic format, of having prompt tokens be just the non-cached token input. Enables accurate cost-tracking - Relevant issue: https://github.com/BerriAI/litellm/issues/5285
@@ -489,29 +497,35 @@ class Usage(CompletionUsage):
prompt_tokens = params["prompt_cache_miss_tokens"]
# handle reasoning_tokens
completion_tokens_details = None
_completion_tokens_details: Optional[CompletionTokensDetails] = None
if reasoning_tokens:
completion_tokens_details = CompletionTokensDetails(
reasoning_tokens=reasoning_tokens
)
# Ensure completion_tokens_details is properly handled
if "completion_tokens_details" in params:
if isinstance(params["completion_tokens_details"], dict):
completion_tokens_details = CompletionTokensDetails(
**params["completion_tokens_details"]
if completion_tokens_details:
if isinstance(completion_tokens_details, dict):
_completion_tokens_details = CompletionTokensDetails(
**completion_tokens_details
)
elif isinstance(
params["completion_tokens_details"], CompletionTokensDetails
):
completion_tokens_details = params["completion_tokens_details"]
del params["completion_tokens_details"]
elif isinstance(completion_tokens_details, CompletionTokensDetails):
_completion_tokens_details = completion_tokens_details
# handle prompt_tokens_details
_prompt_tokens_details: Optional[PromptTokensDetails] = None
if prompt_tokens_details:
if isinstance(prompt_tokens_details, dict):
_prompt_tokens_details = PromptTokensDetails(**prompt_tokens_details)
elif isinstance(prompt_tokens_details, PromptTokensDetails):
_prompt_tokens_details = prompt_tokens_details
super().__init__(
prompt_tokens=prompt_tokens or 0,
completion_tokens=completion_tokens or 0,
total_tokens=total_tokens or 0,
completion_tokens_details=completion_tokens_details or None,
completion_tokens_details=_completion_tokens_details or None,
prompt_tokens_details=_prompt_tokens_details or None,
)
## ANTHROPIC MAPPING ##
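
Usage.__init__ now normalizes both detail fields: each may arrive as a plain dict (e.g. straight from a provider's JSON) or as the typed OpenAI object, and is coerced to CompletionTokensDetails / PromptTokensDetails before being handed to CompletionUsage. A small sketch of both call styles (token numbers are illustrative):

from openai.types.completion_usage import PromptTokensDetails
from litellm.types.utils import Usage

# Dict input, as a raw provider payload would supply it.
u1 = Usage(
    prompt_tokens=230,
    completion_tokens=38,
    total_tokens=268,
    prompt_tokens_details={"cached_tokens": 128},
)
assert isinstance(u1.prompt_tokens_details, PromptTokensDetails)

# An already-typed object is passed through unchanged.
u2 = Usage(
    prompt_tokens=230,
    completion_tokens=38,
    total_tokens=268,
    prompt_tokens_details=PromptTokensDetails(cached_tokens=128),
)
assert u2.prompt_tokens_details.cached_tokens == 128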

poetry.lock (generated, 2833 lines changed): file diff suppressed because it is too large.

@@ -17,7 +17,7 @@ documentation = "https://docs.litellm.ai"
[tool.poetry.dependencies]
python = ">=3.8.1,<4.0, !=3.9.7"
openai = ">=1.45.0"
openai = ">=1.51.0"
python-dotenv = ">=0.2.0"
tiktoken = ">=0.7.0"
importlib-metadata = ">=6.8.0"

@@ -1,6 +1,6 @@
# LITELLM PROXY DEPENDENCIES #
anyio==4.4.0 # openai + http req.
openai==1.45.0 # openai req.
openai==1.51.0 # openai req.
fastapi==0.111.0 # server dep
backoff==2.2.1 # server dep
pyyaml==6.0.0 # server dep
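
The openai pins throughout this commit move to 1.51.0 because PromptTokensDetails (and the prompt_tokens_details field on CompletionUsage) are not available in the older pinned releases. A quick sanity check, as a sketch:

import openai
from openai.types.completion_usage import PromptTokensDetails

print(openai.__version__)                    # expected to be >= 1.51.0 after the bump
print(PromptTokensDetails(cached_tokens=0))  # import and construction succeed on 1.51.0+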

@@ -46,6 +46,7 @@ def mock_chat_response() -> Dict[str, Any]:
"completion_tokens": 38,
"completion_tokens_details": None,
"total_tokens": 268,
"prompt_tokens_details": None,
},
"system_fingerprint": None,
}
@@ -201,6 +202,7 @@ def mock_embedding_response() -> Dict[str, Any]:
"total_tokens": 8,
"completion_tokens": 0,
"completion_tokens_details": None,
"prompt_tokens_details": None,
},
}

@@ -0,0 +1,34 @@
import json
import os
import sys
from datetime import datetime
from unittest.mock import AsyncMock
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
import httpx
import pytest
from respx import MockRouter
import litellm
from litellm import Choices, Message, ModelResponse
from litellm.types.utils import PromptTokensDetails
@pytest.mark.asyncio
async def test_prompt_caching():
"""
Tests that:
- prompt_tokens_details is correctly handled and returned as PromptTokensDetails type
"""
response1 = await litellm.acompletion(
model="gpt-4o-mini",
messages=[{"role": "user", "content": "hi"}],
)
print("response1", response1)
print("response1.usage", response1.usage)
print("type of prompt_tokens_details", type(response1.usage.prompt_tokens_details))
assert isinstance(response1.usage.prompt_tokens_details, PromptTokensDetails)
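
The new test only asserts the type of prompt_tokens_details. As a follow-on sketch (not part of the commit), cache hits could be inspected by repeating a call; OpenAI only reports non-zero cached_tokens for sufficiently long, repeated prompt prefixes, so a tiny prompt like the one above will normally show 0 or None:

import asyncio
import litellm

async def inspect_cache_hits() -> None:
    messages = [{"role": "user", "content": "hi"}]
    first = await litellm.acompletion(model="gpt-4o-mini", messages=messages)
    second = await litellm.acompletion(model="gpt-4o-mini", messages=messages)

    for name, resp in (("first", first), ("second", second)):
        details = resp.usage.prompt_tokens_details
        cached = details.cached_tokens if details is not None else None
        print(name, "call, cached prompt tokens:", cached)

asyncio.run(inspect_cache_hits())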