diff --git a/litellm/proxy/_super_secret_config.yaml b/litellm/proxy/_super_secret_config.yaml
index 9372d4ca8..0ea72c85b 100644
--- a/litellm/proxy/_super_secret_config.yaml
+++ b/litellm/proxy/_super_secret_config.yaml
@@ -1,51 +1,18 @@
-environment_variables:
-  SLACK_WEBHOOK_URL: SQD2/FQHvDuj6Q9/Umyqi+EKLNKKLRCXETX2ncO0xCIQp6EHCKiYD7jPW0+1QdrsQ+pnEzhsfVY2r21SiQV901n/9iyJ2tSnEyWViP7FKQVtTvwutsAqSqbiVHxLHbpjPCu03fhS/idjZrtK7dJLbLBB3RgudjNjHg==
-general_settings:
-  alerting:
-  - slack
-  alerting_threshold: 300
-  database_connection_pool_limit: 100
-  database_connection_timeout: 60
-  health_check_interval: 300
-  proxy_batch_write_at: 10
-  ui_access_mode: all
-litellm_settings:
-  allowed_fails: 3
-  failure_callback:
-  - prometheus
-  fallbacks:
-  - gpt-3.5-turbo:
-    - fake-openai-endpoint
-    - gpt-4
-  num_retries: 3
-  service_callback:
-  - prometheus_system
-  success_callback:
-  - prometheus
 model_list:
-- litellm_params:
-    api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
-    api_key: my-fake-key
-    model: openai/my-fake-model
-  model_name: fake-openai-endpoint
-- litellm_params:
-    model: gpt-3.5-turbo
-  model_name: gpt-3.5-turbo
-- model_name: llama-3
+- model_name: text-embedding-3-small
+  litellm_params:
+    model: text-embedding-3-small
+- model_name: whisper
   litellm_params:
-    model: replicate/meta/meta-llama-3-8b-instruct
-router_settings:
-  allowed_fails: 3
-  context_window_fallbacks: null
-  cooldown_time: 1
-  fallbacks:
-  - gpt-3.5-turbo:
-    - fake-openai-endpoint
-    - gpt-4
-  - gpt-3.5-turbo-3:
-    - fake-openai-endpoint
-  num_retries: 3
-  retry_after: 0
-  routing_strategy: simple-shuffle
-  routing_strategy_args: {}
-  timeout: 6000
+    model: azure/azure-whisper
+    api_version: 2024-02-15-preview
+    api_base: os.environ/AZURE_EUROPE_API_BASE
+    api_key: os.environ/AZURE_EUROPE_API_KEY
+  model_info:
+    mode: audio_transcription
+- litellm_params:
+    model: gpt-4
+  model_name: gpt-4
+
+# litellm_settings:
+#   cache: True
\ No newline at end of file
diff --git a/litellm/tests/test_streaming.py b/litellm/tests/test_streaming.py
index 08195fb94..e7db84a6e 100644
--- a/litellm/tests/test_streaming.py
+++ b/litellm/tests/test_streaming.py
@@ -2446,6 +2446,34 @@ class ModelResponseIterator:
         return self.model_response
 
 
+class ModelResponseListIterator:
+    def __init__(self, model_responses):
+        self.model_responses = model_responses
+        self.index = 0
+
+    # Sync iterator
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        if self.index >= len(self.model_responses):
+            raise StopIteration
+        model_response = self.model_responses[self.index]
+        self.index += 1
+        return model_response
+
+    # Async iterator
+    def __aiter__(self):
+        return self
+
+    async def __anext__(self):
+        if self.index >= len(self.model_responses):
+            raise StopAsyncIteration
+        model_response = self.model_responses[self.index]
+        self.index += 1
+        return model_response
+
+
 def test_unit_test_custom_stream_wrapper():
     """
     Test if last streaming chunk ends with '?', if the message repeats itself.
@@ -2486,3 +2514,259 @@ def test_unit_test_custom_stream_wrapper():
             if "How are you?" in chunk.choices[0].delta.content:
                 freq += 1
     assert freq == 1
+
+
+chunks = [
+    {
+        "id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
+        "object": "chat.completion.chunk",
+        "created": 1714075272,
+        "model": "gpt-4-0613",
+        "system_fingerprint": None,
+        "choices": [
+            {
+                "index": 0,
+                "delta": {"content": "It"},
+                "logprobs": {
+                    "content": [
+                        {
+                            "token": "It",
+                            "logprob": -1.5952516,
+                            "bytes": [73, 116],
+                            "top_logprobs": [
+                                {
+                                    "token": "Brown",
+                                    "logprob": -0.7358765,
+                                    "bytes": [66, 114, 111, 119, 110],
+                                }
+                            ],
+                        }
+                    ]
+                },
+                "finish_reason": None,
+            }
+        ],
+    },
+    {
+        "id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
+        "object": "chat.completion.chunk",
+        "created": 1714075272,
+        "model": "gpt-4-0613",
+        "system_fingerprint": None,
+        "choices": [
+            {
+                "index": 1,
+                "delta": {"content": "Brown"},
+                "logprobs": {
+                    "content": [
+                        {
+                            "token": "Brown",
+                            "logprob": -0.7358765,
+                            "bytes": [66, 114, 111, 119, 110],
+                            "top_logprobs": [
+                                {
+                                    "token": "Brown",
+                                    "logprob": -0.7358765,
+                                    "bytes": [66, 114, 111, 119, 110],
+                                }
+                            ],
+                        }
+                    ]
+                },
+                "finish_reason": None,
+            }
+        ],
+    },
+    {
+        "id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
+        "object": "chat.completion.chunk",
+        "created": 1714075272,
+        "model": "gpt-4-0613",
+        "system_fingerprint": None,
+        "choices": [
+            {
+                "index": 0,
+                "delta": {"content": "'s"},
+                "logprobs": {
+                    "content": [
+                        {
+                            "token": "'s",
+                            "logprob": -0.006786893,
+                            "bytes": [39, 115],
+                            "top_logprobs": [
+                                {
+                                    "token": "'s",
+                                    "logprob": -0.006786893,
+                                    "bytes": [39, 115],
+                                }
+                            ],
+                        }
+                    ]
+                },
+                "finish_reason": None,
+            }
+        ],
+    },
+    {
+        "id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
+        "object": "chat.completion.chunk",
+        "created": 1714075272,
+        "model": "gpt-4-0613",
+        "system_fingerprint": None,
+        "choices": [
+            {
+                "index": 0,
+                "delta": {"content": " impossible"},
+                "logprobs": {
+                    "content": [
+                        {
+                            "token": " impossible",
+                            "logprob": -0.06528423,
+                            "bytes": [
+                                32,
+                                105,
+                                109,
+                                112,
+                                111,
+                                115,
+                                115,
+                                105,
+                                98,
+                                108,
+                                101,
+                            ],
+                            "top_logprobs": [
+                                {
+                                    "token": " impossible",
+                                    "logprob": -0.06528423,
+                                    "bytes": [
+                                        32,
+                                        105,
+                                        109,
+                                        112,
+                                        111,
+                                        115,
+                                        115,
+                                        105,
+                                        98,
+                                        108,
+                                        101,
+                                    ],
+                                }
+                            ],
+                        }
+                    ]
+                },
+                "finish_reason": None,
+            }
+        ],
+    },
+    {
+        "id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
+        "object": "chat.completion.chunk",
+        "created": 1714075272,
+        "model": "gpt-4-0613",
+        "system_fingerprint": None,
+        "choices": [
+            {
+                "index": 0,
+                "delta": {"content": "—even"},
+                "logprobs": {
+                    "content": [
+                        {
+                            "token": "—even",
+                            "logprob": -9999.0,
+                            "bytes": [226, 128, 148, 101, 118, 101, 110],
+                            "top_logprobs": [
+                                {
+                                    "token": " to",
+                                    "logprob": -0.12302828,
+                                    "bytes": [32, 116, 111],
+                                }
+                            ],
+                        }
+                    ]
+                },
+                "finish_reason": None,
+            }
+        ],
+    },
+    {
+        "id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
+        "object": "chat.completion.chunk",
+        "created": 1714075272,
+        "model": "gpt-4-0613",
+        "system_fingerprint": None,
+        "choices": [
+            {"index": 0, "delta": {}, "logprobs": None, "finish_reason": "length"}
+        ],
+    },
+    {
+        "id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
+        "object": "chat.completion.chunk",
+        "created": 1714075272,
+        "model": "gpt-4-0613",
+        "system_fingerprint": None,
+        "choices": [
+            {"index": 1, "delta": {}, "logprobs": None, "finish_reason": "stop"}
+        ],
+    },
+]
+
+
+def test_unit_test_custom_stream_wrapper_n():
+    """
+    Test if the translated output maps exactly to the received openai input
+
+    Relevant issue: https://github.com/BerriAI/litellm/issues/3276
+    """
+    litellm.set_verbose = False
+
+    chunk_list = []
+    for chunk in chunks:
+        _chunk = litellm.ModelResponse(**chunk, stream=True)
+        chunk_list.append(_chunk)
+
+    completion_stream = ModelResponseListIterator(model_responses=chunk_list)
+
+    response = litellm.CustomStreamWrapper(
+        completion_stream=completion_stream,
+        model="gpt-4-0613",
+        custom_llm_provider="cached_response",
+        logging_obj=litellm.Logging(
+            model="gpt-4-0613",
+            messages=[{"role": "user", "content": "Hey"}],
+            stream=True,
+            call_type="completion",
+            start_time=time.time(),
+            litellm_call_id="12345",
+            function_id="1245",
+        ),
+    )
+
+    for idx, chunk in enumerate(response):
+        chunk_dict = {}
+        try:
+            chunk_dict = chunk.model_dump(exclude_none=True)
+        except:
+            chunk_dict = chunk.dict(exclude_none=True)
+
+        chunk_dict.pop("created")
+        chunks[idx].pop("created")
+        if chunks[idx]["system_fingerprint"] is None:
+            chunks[idx].pop("system_fingerprint", None)
+        if idx == 0:
+            for choice in chunk_dict["choices"]:
+                if "role" in choice["delta"]:
+                    choice["delta"].pop("role")
+
+        for choice in chunks[idx]["choices"]:
+            # ignore finish reason None - since our pydantic object is set to exclude_none = true
+            if "finish_reason" in choice and choice["finish_reason"] is None:
+                choice.pop("finish_reason")
+            if "logprobs" in choice and choice["logprobs"] is None:
+                choice.pop("logprobs")
+
+        assert (
+            chunk_dict == chunks[idx]
+        ), f"idx={idx} translated chunk = {chunk_dict} != openai chunk = {chunks[idx]}"
diff --git a/litellm/utils.py b/litellm/utils.py
index 7fd4dc5dc..f69426c87 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -19,6 +19,7 @@ from functools import wraps
 import datetime, time
 import tiktoken
 import uuid
+from pydantic import BaseModel
 import aiohttp
 import textwrap
 import logging
@@ -10120,12 +10121,15 @@ class CustomStreamWrapper:
                     model_response.id = original_chunk.id
                     self.response_id = original_chunk.id
                     if len(original_chunk.choices) > 0:
-                        try:
-                            delta = dict(original_chunk.choices[0].delta)
-                            print_verbose(f"original delta: {delta}")
-                            model_response.choices[0].delta = Delta(**delta)
-                        except Exception as e:
-                            model_response.choices[0].delta = Delta()
+                        choices = []
+                        for idx, choice in enumerate(original_chunk.choices):
+                            try:
+                                if isinstance(choice, BaseModel):
+                                    choice_json = choice.model_dump()
+                                    choices.append(StreamingChoices(**choice_json))
+                            except Exception as e:
+                                choices.append(StreamingChoices())
+                        model_response.choices = choices
                     else:
                         return
                     model_response.system_fingerprint = (