LiteLLM OpenAI audio streaming (#6325)

* refactor(main.py): streaming_chunk_builder

reduce it to under 100 lines of code

refactor each component into a separate function, making it easier to maintain and test (a sketch of the pattern follows)
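A minimal sketch of that refactor pattern, with hypothetical helper names (not the actual litellm functions): each piece of the rebuilt response is assembled by its own small, independently testable function, and the top-level builder only orchestrates.

```python
# Illustrative sketch only -- helper names are hypothetical, not the
# actual litellm functions. Each piece of the rebuilt response is
# assembled by its own small, testable function.
from typing import List, Optional


def combine_content(chunks: List[dict]) -> str:
    # Concatenate the text deltas from every streamed chunk.
    return "".join(c["choices"][0]["delta"].get("content") or "" for c in chunks)


def combine_finish_reason(chunks: List[dict]) -> Optional[str]:
    # The last non-null finish_reason wins.
    for c in reversed(chunks):
        reason = c["choices"][0].get("finish_reason")
        if reason is not None:
            return reason
    return None


def stream_chunk_builder(chunks: List[dict]) -> dict:
    # The orchestrator stays tiny; the heavy lifting lives in helpers.
    return {
        "choices": [
            {
                "message": {"role": "assistant", "content": combine_content(chunks)},
                "finish_reason": combine_finish_reason(chunks),
            }
        ]
    }
```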

* fix(utils.py): handle choices being None

the OpenAI pydantic schema was updated, so `choices` can now be `None`
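The shape of the fix, sketched under the assumption that the updated schema can emit chunks where `choices` is `None` (e.g. usage-only chunks); the real fix lives in litellm's utils.py, this just illustrates the guard:

```python
from typing import Optional


def first_delta_content(chunk) -> Optional[str]:
    # The updated OpenAI pydantic schema allows `choices` to be None
    # (e.g. on usage-only chunks), so guard before indexing.
    choices = getattr(chunk, "choices", None)
    if not choices:  # None or empty list: nothing to read
        return None
    return choices[0].delta.content
```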

* fix(main.py): fix linting error

* feat(streaming_chunk_builder_utils.py): update stream chunk builder to support rebuilding audio chunks from openai
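Roughly, rebuilding means accumulating the streamed audio deltas back into one object. A sketch assuming each delta can carry an `audio` dict with base64 `data`, `transcript`, and `id` fields (field names follow OpenAI's audio streaming format and are treated here as assumptions):

```python
# Sketch of rebuilding one audio object from streamed deltas. The
# base64 `data` fragments are assumed to concatenate into one valid
# base64 string, which holds when fragments are padding-free.
from typing import List


def rebuild_audio(chunks: List[dict]) -> dict:
    data_parts: List[str] = []
    transcript_parts: List[str] = []
    audio_id = None
    for c in chunks:
        audio = c["choices"][0]["delta"].get("audio") or {}
        if audio.get("id"):
            audio_id = audio["id"]
        if audio.get("data"):
            data_parts.append(audio["data"])
        if audio.get("transcript"):
            transcript_parts.append(audio["transcript"])
    return {
        "id": audio_id,
        "data": "".join(data_parts),
        "transcript": "".join(transcript_parts),
    }
```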

* test(test_custom_callback_input.py): test message redaction works for audio output

* fix(streaming_chunk_builder_utils.py): return anthropic token usage info directly
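The idea, sketched with hypothetical names: when the provider already reports token usage on a chunk, return it as-is instead of re-counting tokens with a local tokenizer.

```python
from typing import List, Optional


def combine_usage(chunks: List[dict]) -> Optional[dict]:
    # Prefer the provider-reported usage (e.g. Anthropic's own
    # prompt/completion token counts) over recomputing locally;
    # the last chunk carrying usage wins.
    for c in reversed(chunks):
        usage = c.get("usage")
        if usage is not None:
            return usage
    return None
```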

* fix(streaming_chunk_builder_utils.py): run validation check before entering chunk processor
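A sketch of the fail-fast shape (function name hypothetical): validate the chunk list up front so the processor never sees unusable input and stays free of defensive branching.

```python
from typing import List


def validate_chunks(chunks: List[dict]) -> None:
    # Hypothetical sketch: reject unusable input before the chunk
    # processor starts combining anything.
    if not chunks:
        raise ValueError("cannot rebuild a response from zero chunks")
    for i, c in enumerate(chunks):
        if "choices" not in c:
            raise ValueError(f"chunk {i} is missing 'choices'")
```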

* fix(main.py): fix import
Author: Krish Dholakia
Date: 2024-10-19 16:16:51 -07:00 (committed via GitHub)
Parent: 979e8ea526
Commit: c58d542282
10 changed files with 638 additions and 282 deletions


@@ -2365,3 +2365,32 @@ async def test_caching_kwargs_input(sync_mode):
    else:
        input["original_function"] = acompletion
    await llm_caching_handler.async_set_cache(**input)


@pytest.mark.skip(reason="audio caching not supported yet")
@pytest.mark.parametrize("stream", [False])  # True,
@pytest.mark.asyncio()
async def test_audio_caching(stream):
    litellm.cache = Cache(type="local")

    ## CALL 1 - no cache hit
    completion = await litellm.acompletion(
        model="gpt-4o-audio-preview",
        modalities=["text", "audio"],
        audio={"voice": "alloy", "format": "pcm16"},
        messages=[{"role": "user", "content": "response in 1 word - yes or no"}],
        stream=stream,
    )

    assert "cache_hit" not in completion._hidden_params

    ## CALL 2 - cache hit
    completion = await litellm.acompletion(
        model="gpt-4o-audio-preview",
        modalities=["text", "audio"],
        audio={"voice": "alloy", "format": "pcm16"},
        messages=[{"role": "user", "content": "response in 1 word - yes or no"}],
        stream=stream,
    )

    assert "cache_hit" in completion._hidden_params