fix(utils.py): fix the response object returned when n>1 for stream=true

Fixes https://github.com/BerriAI/litellm/issues/3276
This commit is contained in:
Krrish Dholakia 2024-04-25 13:27:29 -07:00
parent 5ad91e1277
commit 6c5c7cca3d
3 changed files with 310 additions and 55 deletions

View file

@@ -1,51 +1,18 @@
-environment_variables:
-  SLACK_WEBHOOK_URL: SQD2/FQHvDuj6Q9/Umyqi+EKLNKKLRCXETX2ncO0xCIQp6EHCKiYD7jPW0+1QdrsQ+pnEzhsfVY2r21SiQV901n/9iyJ2tSnEyWViP7FKQVtTvwutsAqSqbiVHxLHbpjPCu03fhS/idjZrtK7dJLbLBB3RgudjNjHg==
-general_settings:
-  alerting:
-  - slack
-  alerting_threshold: 300
-  database_connection_pool_limit: 100
-  database_connection_timeout: 60
-  health_check_interval: 300
-  proxy_batch_write_at: 10
-  ui_access_mode: all
-litellm_settings:
-  allowed_fails: 3
-  failure_callback:
-  - prometheus
-  fallbacks:
-  - gpt-3.5-turbo:
-    - fake-openai-endpoint
-    - gpt-4
-  num_retries: 3
-  service_callback:
-  - prometheus_system
-  success_callback:
-  - prometheus
-model_list:
-- litellm_params:
-    api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
-    api_key: my-fake-key
-    model: openai/my-fake-model
-  model_name: fake-openai-endpoint
-- litellm_params:
-    model: gpt-3.5-turbo
-  model_name: gpt-3.5-turbo
-- model_name: llama-3
-  litellm_params:
-    model: replicate/meta/meta-llama-3-8b-instruct
-router_settings:
-  allowed_fails: 3
-  context_window_fallbacks: null
-  cooldown_time: 1
-  fallbacks:
-  - gpt-3.5-turbo:
-    - fake-openai-endpoint
-    - gpt-4
-  - gpt-3.5-turbo-3:
-    - fake-openai-endpoint
-  num_retries: 3
-  retry_after: 0
-  routing_strategy: simple-shuffle
-  routing_strategy_args: {}
-  timeout: 6000
+model_list:
+- model_name: text-embedding-3-small
+  litellm_params:
+    model: text-embedding-3-small
+- model_name: whisper
+  litellm_params:
+    model: azure/azure-whisper
+    api_version: 2024-02-15-preview
+    api_base: os.environ/AZURE_EUROPE_API_BASE
+    api_key: os.environ/AZURE_EUROPE_API_KEY
+  model_info:
+    mode: audio_transcription
+- litellm_params:
+    model: gpt-4
+  model_name: gpt-4
+
+# litellm_settings:
+#   cache: True

View file

@@ -2446,6 +2446,34 @@ class ModelResponseIterator:
         return self.model_response
class ModelResponseListIterator:
    """Expose a pre-built list of model responses through both the sync and
    async iterator protocols, so tests can substitute it for a live stream.
    """

    def __init__(self, model_responses):
        self.model_responses = model_responses
        self.index = 0  # cursor shared by the sync and async protocols

    # --- sync iteration ---
    def __iter__(self):
        return self

    def __next__(self):
        try:
            response = self.model_responses[self.index]
        except IndexError:
            raise StopIteration
        self.index += 1
        return response

    # --- async iteration ---
    def __aiter__(self):
        return self

    async def __anext__(self):
        try:
            response = self.model_responses[self.index]
        except IndexError:
            raise StopAsyncIteration
        self.index += 1
        return response
def test_unit_test_custom_stream_wrapper():
    """
    Test if last streaming chunk ends with '?', if the message repeats itself.
@@ -2486,3 +2514,259 @@ def test_unit_test_custom_stream_wrapper():
            if "How are you?" in chunk.choices[0].delta.content:
                freq += 1
    assert freq == 1
# Mock OpenAI streaming chunks for a chat completion with n=2: choices arrive
# interleaved across chunks with index 0 and index 1, each carrying logprobs,
# and the stream ends with one finish_reason per choice ("length" for index 0,
# "stop" for index 1). Used to verify CustomStreamWrapper's translated output
# matches the raw OpenAI input exactly.
chunks = [
    {
        "id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
        "object": "chat.completion.chunk",
        "created": 1714075272,
        "model": "gpt-4-0613",
        "system_fingerprint": None,
        "choices": [
            {
                "index": 0,
                "delta": {"content": "It"},
                "logprobs": {
                    "content": [
                        {
                            "token": "It",
                            "logprob": -1.5952516,
                            "bytes": [73, 116],
                            "top_logprobs": [
                                {
                                    "token": "Brown",
                                    "logprob": -0.7358765,
                                    "bytes": [66, 114, 111, 119, 110],
                                }
                            ],
                        }
                    ]
                },
                "finish_reason": None,
            }
        ],
    },
    {
        "id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
        "object": "chat.completion.chunk",
        "created": 1714075272,
        "model": "gpt-4-0613",
        "system_fingerprint": None,
        "choices": [
            {
                "index": 1,
                "delta": {"content": "Brown"},
                "logprobs": {
                    "content": [
                        {
                            "token": "Brown",
                            "logprob": -0.7358765,
                            "bytes": [66, 114, 111, 119, 110],
                            "top_logprobs": [
                                {
                                    "token": "Brown",
                                    "logprob": -0.7358765,
                                    "bytes": [66, 114, 111, 119, 110],
                                }
                            ],
                        }
                    ]
                },
                "finish_reason": None,
            }
        ],
    },
    {
        "id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
        "object": "chat.completion.chunk",
        "created": 1714075272,
        "model": "gpt-4-0613",
        "system_fingerprint": None,
        "choices": [
            {
                "index": 0,
                "delta": {"content": "'s"},
                "logprobs": {
                    "content": [
                        {
                            "token": "'s",
                            "logprob": -0.006786893,
                            "bytes": [39, 115],
                            "top_logprobs": [
                                {
                                    "token": "'s",
                                    "logprob": -0.006786893,
                                    "bytes": [39, 115],
                                }
                            ],
                        }
                    ]
                },
                "finish_reason": None,
            }
        ],
    },
    {
        "id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
        "object": "chat.completion.chunk",
        "created": 1714075272,
        "model": "gpt-4-0613",
        "system_fingerprint": None,
        "choices": [
            {
                "index": 0,
                "delta": {"content": " impossible"},
                "logprobs": {
                    "content": [
                        {
                            "token": " impossible",
                            "logprob": -0.06528423,
                            "bytes": [
                                32,
                                105,
                                109,
                                112,
                                111,
                                115,
                                115,
                                105,
                                98,
                                108,
                                101,
                            ],
                            "top_logprobs": [
                                {
                                    "token": " impossible",
                                    "logprob": -0.06528423,
                                    "bytes": [
                                        32,
                                        105,
                                        109,
                                        112,
                                        111,
                                        115,
                                        115,
                                        105,
                                        98,
                                        108,
                                        101,
                                    ],
                                }
                            ],
                        }
                    ]
                },
                "finish_reason": None,
            }
        ],
    },
    {
        "id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
        "object": "chat.completion.chunk",
        "created": 1714075272,
        "model": "gpt-4-0613",
        "system_fingerprint": None,
        "choices": [
            {
                "index": 0,
                "delta": {"content": "—even"},
                "logprobs": {
                    "content": [
                        {
                            "token": "—even",
                            "logprob": -9999.0,
                            "bytes": [226, 128, 148, 101, 118, 101, 110],
                            "top_logprobs": [
                                {
                                    "token": " to",
                                    "logprob": -0.12302828,
                                    "bytes": [32, 116, 111],
                                }
                            ],
                        }
                    ]
                },
                "finish_reason": None,
            }
        ],
    },
    {
        "id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
        "object": "chat.completion.chunk",
        "created": 1714075272,
        "model": "gpt-4-0613",
        "system_fingerprint": None,
        "choices": [
            {"index": 0, "delta": {}, "logprobs": None, "finish_reason": "length"}
        ],
    },
    {
        "id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
        "object": "chat.completion.chunk",
        "created": 1714075272,
        "model": "gpt-4-0613",
        "system_fingerprint": None,
        "choices": [
            {"index": 1, "delta": {}, "logprobs": None, "finish_reason": "stop"}
        ],
    },
]
def test_unit_test_custom_stream_wrapper_n():
    """
    Test if the translated output maps exactly to the received openai input

    Relevant issue: https://github.com/BerriAI/litellm/issues/3276
    """
    litellm.set_verbose = False

    # Rehydrate the raw openai dicts into litellm streaming ModelResponse objects.
    chunk_list = []
    for chunk in chunks:
        _chunk = litellm.ModelResponse(**chunk, stream=True)
        chunk_list.append(_chunk)

    completion_stream = ModelResponseListIterator(model_responses=chunk_list)

    response = litellm.CustomStreamWrapper(
        completion_stream=completion_stream,
        model="gpt-4-0613",
        custom_llm_provider="cached_response",
        logging_obj=litellm.Logging(
            model="gpt-4-0613",
            messages=[{"role": "user", "content": "Hey"}],
            stream=True,
            call_type="completion",
            start_time=time.time(),
            litellm_call_id="12345",
            function_id="1245",
        ),
    )

    for idx, chunk in enumerate(response):
        chunk_dict = {}
        try:
            # pydantic v2
            chunk_dict = chunk.model_dump(exclude_none=True)
        except AttributeError:
            # pydantic v1 fallback; a bare `except:` here would also have
            # masked real model_dump() failures (and KeyboardInterrupt)
            chunk_dict = chunk.dict(exclude_none=True)

        # "created" is regenerated by the wrapper, so exclude it from comparison
        chunk_dict.pop("created")
        chunks[idx].pop("created")
        if chunks[idx]["system_fingerprint"] is None:
            chunks[idx].pop("system_fingerprint", None)
        if idx == 0:
            # the wrapper injects delta.role on the first chunk only
            for choice in chunk_dict["choices"]:
                if "role" in choice["delta"]:
                    choice["delta"].pop("role")

        for choice in chunks[idx]["choices"]:
            # ignore finish reason None - since our pydantic object is set to exclude_none = true
            if "finish_reason" in choice and choice["finish_reason"] is None:
                choice.pop("finish_reason")
            if "logprobs" in choice and choice["logprobs"] is None:
                choice.pop("logprobs")

        assert (
            chunk_dict == chunks[idx]
        ), f"idx={idx} translated chunk = {chunk_dict} != openai chunk = {chunks[idx]}"

View file

@@ -19,6 +19,7 @@ from functools import wraps
 import datetime, time
 import tiktoken
 import uuid
+from pydantic import BaseModel
 import aiohttp
 import textwrap
 import logging
@ -10120,12 +10121,15 @@ class CustomStreamWrapper:
model_response.id = original_chunk.id model_response.id = original_chunk.id
self.response_id = original_chunk.id self.response_id = original_chunk.id
if len(original_chunk.choices) > 0: if len(original_chunk.choices) > 0:
try: choices = []
delta = dict(original_chunk.choices[0].delta) for idx, choice in enumerate(original_chunk.choices):
print_verbose(f"original delta: {delta}") try:
model_response.choices[0].delta = Delta(**delta) if isinstance(choice, BaseModel):
except Exception as e: choice_json = choice.model_dump()
model_response.choices[0].delta = Delta() choices.append(StreamingChoices(**choice_json))
except Exception as e:
choices.append(StreamingChoices())
model_response.choices = choices
else: else:
return return
model_response.system_fingerprint = ( model_response.system_fingerprint = (