forked from phoenix/litellm-mirror
Litellm openai audio streaming (#6325)
* refactor(main.py): streaming_chunk_builder use <100 lines of code
  refactor each component into a separate function - easier to maintain + test
* fix(utils.py): handle choices being None
  openai pydantic schema updated
* fix(main.py): fix linting error
* feat(streaming_chunk_builder_utils.py): update stream chunk builder to support rebuilding audio chunks from openai
* test(test_custom_callback_input.py): test message redaction works for audio output
* fix(streaming_chunk_builder_utils.py): return anthropic token usage info directly
* fix(stream_chunk_builder_utils.py): run validation check before entering chunk processor
* fix(main.py): fix import
parent 979e8ea526
commit c58d542282
10 changed files with 638 additions and 282 deletions
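Context for the diffs below: the feat(streaming_chunk_builder_utils.py) change teaches the chunk builder to reassemble audio output that OpenAI streams in pieces. The sketch below only illustrates the idea, not LiteLLM's actual implementation; the helper name rebuild_audio_fields and the assumed delta shape (choices[0].delta.audio carrying base64 data and transcript fragments plus id and expires_at) are illustrative assumptions.

from typing import List, Optional


def rebuild_audio_fields(chunk_dicts: List[dict]) -> Optional[dict]:
    """Concatenate per-chunk audio deltas into the fields of one audio object."""
    data_parts: List[str] = []
    transcript_parts: List[str] = []
    audio_id, expires_at = None, None

    for chunk in chunk_dicts:
        delta = (chunk.get("choices") or [{}])[0].get("delta") or {}
        audio = delta.get("audio")  # assumed delta shape for gpt-4o-audio-preview
        if not audio:
            continue
        audio_id = audio.get("id") or audio_id
        expires_at = audio.get("expires_at") or expires_at
        data_parts.append(audio.get("data") or "")  # base64-encoded pcm16 fragment
        transcript_parts.append(audio.get("transcript") or "")

    if not data_parts:
        return None
    # LiteLLM's builder attaches the equivalent of this to message.audio
    # (a ChatCompletionAudioResponse); a plain dict keeps this sketch neutral.
    return {
        "id": audio_id,
        "expires_at": expires_at,
        "data": "".join(data_parts),
        "transcript": "".join(transcript_parts),
    }

The tests in this commit exercise the real builder through litellm.stream_chunk_builder and validate the reassembled message.audio via check_non_streaming_response.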
@@ -2365,3 +2365,32 @@ async def test_caching_kwargs_input(sync_mode):
     else:
         input["original_function"] = acompletion
         await llm_caching_handler.async_set_cache(**input)
+
+
+@pytest.mark.skip(reason="audio caching not supported yet")
+@pytest.mark.parametrize("stream", [False])  # True,
+@pytest.mark.asyncio()
+async def test_audio_caching(stream):
+    litellm.cache = Cache(type="local")
+
+    ## CALL 1 - no cache hit
+    completion = await litellm.acompletion(
+        model="gpt-4o-audio-preview",
+        modalities=["text", "audio"],
+        audio={"voice": "alloy", "format": "pcm16"},
+        messages=[{"role": "user", "content": "response in 1 word - yes or no"}],
+        stream=stream,
+    )
+
+    assert "cache_hit" not in completion._hidden_params
+
+    ## CALL 2 - cache hit
+    completion = await litellm.acompletion(
+        model="gpt-4o-audio-preview",
+        modalities=["text", "audio"],
+        audio={"voice": "alloy", "format": "pcm16"},
+        messages=[{"role": "user", "content": "response in 1 word - yes or no"}],
+        stream=stream,
+    )
+
+    assert "cache_hit" in completion._hidden_params
@@ -1267,6 +1267,100 @@ def test_standard_logging_payload(model, turn_off_message_logging):
             assert "redacted-by-litellm" == slobject["response"]
 
 
+@pytest.mark.parametrize(
+    "stream",
+    [True, False],
+)
+@pytest.mark.parametrize(
+    "turn_off_message_logging",
+    [
+        True,
+    ],
+)  # False
+def test_standard_logging_payload_audio(turn_off_message_logging, stream):
+    """
+    Ensure valid standard_logging_payload is passed for logging calls to s3
+
+    Motivation: provide a standard set of things that are logged to s3/gcs/future integrations across all llm calls
+    """
+    from litellm.types.utils import StandardLoggingPayload
+
+    # sync completion
+    customHandler = CompletionCustomHandler()
+    litellm.callbacks = [customHandler]
+
+    litellm.turn_off_message_logging = turn_off_message_logging
+
+    with patch.object(
+        customHandler, "log_success_event", new=MagicMock()
+    ) as mock_client:
+        response = litellm.completion(
+            model="gpt-4o-audio-preview",
+            modalities=["text", "audio"],
+            audio={"voice": "alloy", "format": "pcm16"},
+            messages=[{"role": "user", "content": "response in 1 word - yes or no"}],
+            stream=stream,
+        )
+
+        if stream:
+            for chunk in response:
+                continue
+
+        time.sleep(2)
+        mock_client.assert_called_once()
+
+        print(
+            f"mock_client_post.call_args: {mock_client.call_args.kwargs['kwargs'].keys()}"
+        )
+        assert "standard_logging_object" in mock_client.call_args.kwargs["kwargs"]
+        assert (
+            mock_client.call_args.kwargs["kwargs"]["standard_logging_object"]
+            is not None
+        )
+
+        print(
+            "Standard Logging Object - {}".format(
+                mock_client.call_args.kwargs["kwargs"]["standard_logging_object"]
+            )
+        )
+
+        keys_list = list(StandardLoggingPayload.__annotations__.keys())
+
+        for k in keys_list:
+            assert (
+                k in mock_client.call_args.kwargs["kwargs"]["standard_logging_object"]
+            )
+
+        ## json serializable
+        json_str_payload = json.dumps(
+            mock_client.call_args.kwargs["kwargs"]["standard_logging_object"]
+        )
+        json.loads(json_str_payload)
+
+        ## response cost
+        assert (
+            mock_client.call_args.kwargs["kwargs"]["standard_logging_object"][
+                "response_cost"
+            ]
+            > 0
+        )
+        assert (
+            mock_client.call_args.kwargs["kwargs"]["standard_logging_object"][
+                "model_map_information"
+            ]["model_map_value"]
+            is not None
+        )
+
+        ## turn off message logging
+        slobject: StandardLoggingPayload = mock_client.call_args.kwargs["kwargs"][
+            "standard_logging_object"
+        ]
+        if turn_off_message_logging:
+            print("checks redacted-by-litellm")
+            assert "redacted-by-litellm" == slobject["messages"][0]["content"]
+            assert "redacted-by-litellm" == slobject["response"]
+
+
 @pytest.mark.skip(reason="Works locally. Flaky on ci/cd")
 def test_aaastandard_logging_payload_cache_hit():
     from litellm.types.utils import StandardLoggingPayload
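The docstring of test_standard_logging_payload_audio above describes the consumer side: s3/gcs-style integrations are all handed the same standard_logging_object in the callback kwargs. Below is a hedged sketch of such a consumer; MyStorageLogger and the print-instead-of-upload are placeholders, not a real LiteLLM integration.

import json
from typing import Optional

from litellm.integrations.custom_logger import CustomLogger
from litellm.types.utils import StandardLoggingPayload


class MyStorageLogger(CustomLogger):
    """Hypothetical integration that persists the standard logging payload."""

    def log_success_event(self, kwargs, response_obj, start_time, end_time):
        payload: Optional[StandardLoggingPayload] = kwargs.get(
            "standard_logging_object"
        )
        if payload is None:
            return
        # The test above checks this dict is JSON-serializable, carries every
        # StandardLoggingPayload key, and redacts messages/response on request.
        record = json.dumps(payload)
        print(f"would upload {len(record)} bytes, cost={payload['response_cost']}")

It would be registered the same way the test registers its handler, i.e. litellm.callbacks = [MyStorageLogger()].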
@@ -6,6 +6,17 @@ import traceback
 
 import pytest
 from typing import List
+from litellm.types.utils import StreamingChoices, ChatCompletionAudioResponse
+
+
+def check_non_streaming_response(completion):
+    assert completion.choices[0].message.audio is not None, "Audio response is missing"
+    print("audio", completion.choices[0].message.audio)
+    assert isinstance(
+        completion.choices[0].message.audio, ChatCompletionAudioResponse
+    ), "Invalid audio response type"
+    assert len(completion.choices[0].message.audio.data) > 0, "Audio data is empty"
+
 
 sys.path.insert(
     0, os.path.abspath("../..")
@@ -656,12 +667,60 @@ def test_stream_chunk_builder_openai_prompt_caching():
     response = stream_chunk_builder(chunks=chunks)
     print(f"response: {response}")
     print(f"response usage: {response.usage}")
-    for k, v in usage_obj.model_dump().items():
+    for k, v in usage_obj.model_dump(exclude_none=True).items():
         print(k, v)
         response_usage_value = getattr(response.usage, k)  # type: ignore
         print(f"response_usage_value: {response_usage_value}")
         print(f"type: {type(response_usage_value)}")
         if isinstance(response_usage_value, BaseModel):
-            assert response_usage_value.model_dump() == v
+            assert response_usage_value.model_dump(exclude_none=True) == v
         else:
             assert response_usage_value == v
+
+
+def test_stream_chunk_builder_openai_audio_output_usage():
+    from pydantic import BaseModel
+    from openai import OpenAI
+    from typing import Optional
+
+    client = OpenAI(
+        # This is the default and can be omitted
+        api_key=os.getenv("OPENAI_API_KEY"),
+    )
+
+    completion = client.chat.completions.create(
+        model="gpt-4o-audio-preview",
+        modalities=["text", "audio"],
+        audio={"voice": "alloy", "format": "pcm16"},
+        messages=[{"role": "user", "content": "response in 1 word - yes or no"}],
+        stream=True,
+        stream_options={"include_usage": True},
+    )
+
+    chunks = []
+    for chunk in completion:
+        chunks.append(litellm.ModelResponse(**chunk.model_dump(), stream=True))
+
+    usage_obj: Optional[litellm.Usage] = None
+
+    for index, chunk in enumerate(chunks):
+        if hasattr(chunk, "usage"):
+            usage_obj = chunk.usage
+            print(f"chunk usage: {chunk.usage}")
+            print(f"index: {index}")
+            print(f"len chunks: {len(chunks)}")
+
+    print(f"usage_obj: {usage_obj}")
+    response = stream_chunk_builder(chunks=chunks)
+    print(f"response usage: {response.usage}")
+    check_non_streaming_response(response)
+    print(f"response: {response}")
+    for k, v in usage_obj.model_dump(exclude_none=True).items():
+        print(k, v)
+        response_usage_value = getattr(response.usage, k)  # type: ignore
+        print(f"response_usage_value: {response_usage_value}")
+        print(f"type: {type(response_usage_value)}")
+        if isinstance(response_usage_value, BaseModel):
+            assert response_usage_value.model_dump(exclude_none=True) == v
+        else:
+            assert response_usage_value == v
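The switch to model_dump(exclude_none=True) in both usage-comparison loops above matters because pydantic keeps unset Optional fields as explicit None in a plain model_dump(); with nested token-detail models on the usage object, that makes equality checks fail on None-versus-missing keys. A minimal, self-contained illustration (TokenDetails is a stand-in model, not LiteLLM's Usage types):

from typing import Optional

from pydantic import BaseModel


class TokenDetails(BaseModel):
    """Stand-in for a nested token-details model on a usage object."""

    audio_tokens: Optional[int] = None
    text_tokens: Optional[int] = None


details = TokenDetails(audio_tokens=6)  # text_tokens left unset -> None
expected = {"audio_tokens": 6}

assert details.model_dump() != expected  # {'audio_tokens': 6, 'text_tokens': None}
assert details.model_dump(exclude_none=True) == expected  # None fields dropped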