Litellm openai audio streaming (#6325)

* refactor(main.py): streaming_chunk_builder

use <100 lines of code

refactor each component into a separate function - easier to maintain + test
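
a rough sketch of that shape, for illustration only (ChunkProcessor, combine_content and combine_usage are hypothetical names here, not the actual litellm helpers):

# Illustrative sketch only - helper names are hypothetical, not litellm's internals.
from typing import List


class ChunkProcessor:
    def __init__(self, chunks: List[dict]):
        self.chunks = chunks

    def combine_content(self) -> str:
        # concatenate the text deltas in arrival order
        return "".join(
            (c["choices"][0]["delta"].get("content") or "")
            for c in self.chunks
            if c.get("choices")
        )

    def combine_usage(self) -> dict:
        # prefer the provider-reported usage on the last chunk that carries one
        for c in reversed(self.chunks):
            if c.get("usage"):
                return c["usage"]
        return {}


def build_response(chunks: List[dict]) -> dict:
    processor = ChunkProcessor(chunks)
    return {"content": processor.combine_content(), "usage": processor.combine_usage()}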

* fix(utils.py): handle choices being None

openai pydantic schema updated
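
a minimal sketch of the defensive handling this implies, assuming a chunk whose choices field may now be None (e.g. a usage-only final chunk):

# Illustrative sketch - treat choices=None the same as an empty list
# instead of iterating over None.
def iter_choices(chunk: dict):
    for choice in chunk.get("choices") or []:
        yield choice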

* fix(main.py): fix linting error

* feat(streaming_chunk_builder_utils.py): update stream chunk builder to support rebuilding audio chunks from openai
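
the rebuild boils down to stitching per-chunk audio deltas back into one audio object; a sketch of the idea, assuming each streamed delta may carry an audio dict with incremental data and transcript fragments (handling here is illustrative, not the actual implementation):

# Illustrative sketch - assumes each delta may include an "audio" dict with
# incremental "data" and "transcript" fragments, plus id / expires_at.
from typing import List, Optional


def combine_audio_deltas(deltas: List[dict]) -> Optional[dict]:
    data_parts, transcript_parts = [], []
    audio_id, expires_at = None, None
    for delta in deltas:
        audio = delta.get("audio")
        if not audio:
            continue
        data_parts.append(audio.get("data") or "")
        transcript_parts.append(audio.get("transcript") or "")
        audio_id = audio.get("id") or audio_id
        expires_at = audio.get("expires_at") or expires_at
    if not data_parts and not transcript_parts:
        return None  # no audio in this stream
    return {
        "id": audio_id,
        "data": "".join(data_parts),  # fragments concatenated in arrival order
        "transcript": "".join(transcript_parts),
        "expires_at": expires_at,
    }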

* test(test_custom_callback_input.py): test message redaction works for audio output

* fix(streaming_chunk_builder_utils.py): return anthropic token usage info directly
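
i.e. when the provider already reports usage on the stream, pass that object through instead of re-counting tokens; a rough illustration only:

# Illustrative sketch - if any streamed chunk already carries a provider-reported
# usage object (Anthropic includes fields such as cache_creation_input_tokens),
# return that object as-is rather than re-tokenizing the text.
from typing import Any, List, Optional


def passthrough_usage(chunks: List[Any]) -> Optional[Any]:
    for chunk in reversed(chunks):
        usage = getattr(chunk, "usage", None)
        if usage is not None:
            return usage
    return None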

* fix(stream_chunk_builder_utils.py): run validation check before entering chunk processor
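
a minimal sketch of such a pre-flight check, assuming the processor should only ever see a non-empty list of chunks:

# Illustrative sketch - fail fast on bad input before any per-chunk work.
def validate_chunks(chunks):
    if not isinstance(chunks, list) or len(chunks) == 0:
        raise ValueError("stream_chunk_builder expects a non-empty list of chunks")
    return chunks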

* fix(main.py): fix import
Krish Dholakia 2024-10-19 16:16:51 -07:00 committed by GitHub
parent 979e8ea526
commit c58d542282
10 changed files with 638 additions and 282 deletions


@@ -2365,3 +2365,32 @@ async def test_caching_kwargs_input(sync_mode):
    else:
        input["original_function"] = acompletion
    await llm_caching_handler.async_set_cache(**input)


@pytest.mark.skip(reason="audio caching not supported yet")
@pytest.mark.parametrize("stream", [False])  # True,
@pytest.mark.asyncio()
async def test_audio_caching(stream):
    litellm.cache = Cache(type="local")

    ## CALL 1 - no cache hit
    completion = await litellm.acompletion(
        model="gpt-4o-audio-preview",
        modalities=["text", "audio"],
        audio={"voice": "alloy", "format": "pcm16"},
        messages=[{"role": "user", "content": "response in 1 word - yes or no"}],
        stream=stream,
    )

    assert "cache_hit" not in completion._hidden_params

    ## CALL 2 - cache hit
    completion = await litellm.acompletion(
        model="gpt-4o-audio-preview",
        modalities=["text", "audio"],
        audio={"voice": "alloy", "format": "pcm16"},
        messages=[{"role": "user", "content": "response in 1 word - yes or no"}],
        stream=stream,
    )

    assert "cache_hit" in completion._hidden_params


@@ -1267,6 +1267,100 @@ def test_standard_logging_payload(model, turn_off_message_logging):
        assert "redacted-by-litellm" == slobject["response"]


@pytest.mark.parametrize(
    "stream",
    [True, False],
)
@pytest.mark.parametrize(
    "turn_off_message_logging",
    [
        True,
    ],
)  # False
def test_standard_logging_payload_audio(turn_off_message_logging, stream):
    """
    Ensure valid standard_logging_payload is passed for logging calls to s3
    Motivation: provide a standard set of things that are logged to s3/gcs/future integrations across all llm calls
    """
    from litellm.types.utils import StandardLoggingPayload

    # sync completion
    customHandler = CompletionCustomHandler()
    litellm.callbacks = [customHandler]

    litellm.turn_off_message_logging = turn_off_message_logging

    with patch.object(
        customHandler, "log_success_event", new=MagicMock()
    ) as mock_client:
        response = litellm.completion(
            model="gpt-4o-audio-preview",
            modalities=["text", "audio"],
            audio={"voice": "alloy", "format": "pcm16"},
            messages=[{"role": "user", "content": "response in 1 word - yes or no"}],
            stream=stream,
        )

        if stream:
            for chunk in response:
                continue

        time.sleep(2)
        mock_client.assert_called_once()

        print(
            f"mock_client_post.call_args: {mock_client.call_args.kwargs['kwargs'].keys()}"
        )

        assert "standard_logging_object" in mock_client.call_args.kwargs["kwargs"]
        assert (
            mock_client.call_args.kwargs["kwargs"]["standard_logging_object"]
            is not None
        )

        print(
            "Standard Logging Object - {}".format(
                mock_client.call_args.kwargs["kwargs"]["standard_logging_object"]
            )
        )

        keys_list = list(StandardLoggingPayload.__annotations__.keys())

        for k in keys_list:
            assert (
                k in mock_client.call_args.kwargs["kwargs"]["standard_logging_object"]
            )

        ## json serializable
        json_str_payload = json.dumps(
            mock_client.call_args.kwargs["kwargs"]["standard_logging_object"]
        )
        json.loads(json_str_payload)

        ## response cost
        assert (
            mock_client.call_args.kwargs["kwargs"]["standard_logging_object"][
                "response_cost"
            ]
            > 0
        )
        assert (
            mock_client.call_args.kwargs["kwargs"]["standard_logging_object"][
                "model_map_information"
            ]["model_map_value"]
            is not None
        )

        ## turn off message logging
        slobject: StandardLoggingPayload = mock_client.call_args.kwargs["kwargs"][
            "standard_logging_object"
        ]
        if turn_off_message_logging:
            print("checks redacted-by-litellm")
            assert "redacted-by-litellm" == slobject["messages"][0]["content"]
            assert "redacted-by-litellm" == slobject["response"]


@pytest.mark.skip(reason="Works locally. Flaky on ci/cd")
def test_aaastandard_logging_payload_cache_hit():
    from litellm.types.utils import StandardLoggingPayload


@@ -6,6 +6,17 @@ import traceback
import pytest
from typing import List
from litellm.types.utils import StreamingChoices, ChatCompletionAudioResponse


def check_non_streaming_response(completion):
    assert completion.choices[0].message.audio is not None, "Audio response is missing"
    print("audio", completion.choices[0].message.audio)
    assert isinstance(
        completion.choices[0].message.audio, ChatCompletionAudioResponse
    ), "Invalid audio response type"
    assert len(completion.choices[0].message.audio.data) > 0, "Audio data is empty"


sys.path.insert(
    0, os.path.abspath("../..")
@@ -656,12 +667,60 @@ def test_stream_chunk_builder_openai_prompt_caching():
    response = stream_chunk_builder(chunks=chunks)
    print(f"response: {response}")
    print(f"response usage: {response.usage}")
    for k, v in usage_obj.model_dump().items():
    for k, v in usage_obj.model_dump(exclude_none=True).items():
        print(k, v)
        response_usage_value = getattr(response.usage, k)  # type: ignore
        print(f"response_usage_value: {response_usage_value}")
        print(f"type: {type(response_usage_value)}")
        if isinstance(response_usage_value, BaseModel):
            assert response_usage_value.model_dump() == v
            assert response_usage_value.model_dump(exclude_none=True) == v
        else:
            assert response_usage_value == v


def test_stream_chunk_builder_openai_audio_output_usage():
    from pydantic import BaseModel
    from openai import OpenAI
    from typing import Optional

    client = OpenAI(
        # This is the default and can be omitted
        api_key=os.getenv("OPENAI_API_KEY"),
    )

    completion = client.chat.completions.create(
        model="gpt-4o-audio-preview",
        modalities=["text", "audio"],
        audio={"voice": "alloy", "format": "pcm16"},
        messages=[{"role": "user", "content": "response in 1 word - yes or no"}],
        stream=True,
        stream_options={"include_usage": True},
    )

    chunks = []
    for chunk in completion:
        chunks.append(litellm.ModelResponse(**chunk.model_dump(), stream=True))

    usage_obj: Optional[litellm.Usage] = None

    for index, chunk in enumerate(chunks):
        if hasattr(chunk, "usage"):
            usage_obj = chunk.usage
            print(f"chunk usage: {chunk.usage}")
            print(f"index: {index}")
            print(f"len chunks: {len(chunks)}")

    print(f"usage_obj: {usage_obj}")
    response = stream_chunk_builder(chunks=chunks)
    print(f"response usage: {response.usage}")
    check_non_streaming_response(response)
    print(f"response: {response}")
    for k, v in usage_obj.model_dump(exclude_none=True).items():
        print(k, v)
        response_usage_value = getattr(response.usage, k)  # type: ignore
        print(f"response_usage_value: {response_usage_value}")
        print(f"type: {type(response_usage_value)}")
        if isinstance(response_usage_value, BaseModel):
            assert response_usage_value.model_dump(exclude_none=True) == v
        else:
            assert response_usage_value == v
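
For comparison, the same rebuild can be exercised through litellm's own streaming path; a sketch (not part of the diff), assuming OPENAI_API_KEY is set and that stream_chunk_builder is imported from litellm as in the test above:

# Sketch: stream an audio completion through litellm, then rebuild a single response.
import litellm
from litellm import stream_chunk_builder

response = litellm.completion(
    model="gpt-4o-audio-preview",
    modalities=["text", "audio"],
    audio={"voice": "alloy", "format": "pcm16"},
    messages=[{"role": "user", "content": "response in 1 word - yes or no"}],
    stream=True,
    stream_options={"include_usage": True},
)

chunks = [chunk for chunk in response]
rebuilt = stream_chunk_builder(chunks=chunks)

print(rebuilt.choices[0].message.audio)  # expected: a ChatCompletionAudioResponse
print(rebuilt.usage)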