From cb2563e3c0e49e2c9187ccfc88fd05fbbbfe6303 Mon Sep 17 00:00:00 2001
From: Krish Dholakia
Date: Tue, 22 Oct 2024 21:18:54 -0700
Subject: [PATCH] Litellm dev 10 22 2024 (#6384)

* fix(utils.py): add 'disallowed_special' for token counting on .encode()

Fixes error when '<|endoftext|>' in string

* Revert "(fix) standard logging metadata + add unit testing (#6366)" (#6381)

This reverts commit 8359cb6fa9bf7b0bf4f3df630cf8666adffa2813.

* add new 35 mode lcard (#6378)

* Add claude 3 5 sonnet 20241022 models for all provides (#6380)

* Add Claude 3.5 v2 on Amazon Bedrock and Vertex AI.

* added anthropic/claude-3-5-sonnet-20241022

* add new 35 mode lcard

---------

Co-authored-by: Paul Gauthier
Co-authored-by: lowjiansheng <15527690+lowjiansheng@users.noreply.github.com>

* test(skip-flaky-google-context-caching-test): google is not reliable. their sample code is also not working

* Fix metadata being overwritten in speech() (#6295)

* fix: adding missing redis cluster kwargs (#6318)

Co-authored-by: Ali Arian

* Add support for `max_completion_tokens` in Azure OpenAI (#6376)

Now that Azure supports `max_completion_tokens`, no need for special handling for this param and let it pass thru. More details: https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models?tabs=python-secure#api-support

* build(model_prices_and_context_window.json): add voyage-finance-2 pricing

Closes https://github.com/BerriAI/litellm/issues/6371

* build(model_prices_and_context_window.json): fix llama3.1 pricing model name on map

Closes https://github.com/BerriAI/litellm/issues/6310

* feat(realtime_streaming.py): just log specific events

Closes https://github.com/BerriAI/litellm/issues/6267

* fix(utils.py): more robust checking if unmapped vertex anthropic model belongs to that family of models

Fixes https://github.com/BerriAI/litellm/issues/6383

* Fix Ollama stream handling for tool calls with None content (#6155)

* test(test_max_completions): update test now that azure supports 'max_completion_tokens'

* fix(handler.py): fix linting error

---------

Co-authored-by: Ishaan Jaff
Co-authored-by: Low Jian Sheng <15527690+lowjiansheng@users.noreply.github.com>
Co-authored-by: David Manouchehri
Co-authored-by: Paul Gauthier
Co-authored-by: John HU
Co-authored-by: Ali Arian <113945203+ali-arian@users.noreply.github.com>
Co-authored-by: Ali Arian
Co-authored-by: Anand Taralika <46954145+taralika@users.noreply.github.com>
Co-authored-by: Nolan Tremelling <34580718+NolanTrem@users.noreply.github.com>
---
 docs/my-website/docs/realtime.md              | 19 +++++++-
 litellm/__init__.py                           |  1 +
 litellm/_redis.py                             |  2 +
 .../litellm_core_utils/realtime_streaming.py  | 29 +++++++++++-
 .../AzureOpenAI/chat/gpt_transformation.py    |  3 --
 litellm/llms/AzureOpenAI/realtime/handler.py  |  4 +-
 litellm/llms/ollama.py                        |  1 +
 .../anthropic/transformation.py               | 13 +++++
 litellm/main.py                               |  1 -
 ...odel_prices_and_context_window_backup.json | 14 ++++--
 litellm/proxy/_new_secret_config.yaml         | 12 +++--
 litellm/utils.py                              | 10 ++--
 model_prices_and_context_window.json          | 14 ++++--
 .../test_max_completion_tokens.py             |  2 +-
 tests/llm_translation/test_optional_params.py |  9 ++++
 tests/local_testing/test_completion_cost.py   | 47 +++++++++++++++++++
 tests/local_testing/test_token_counter.py     |  4 ++
 17 files changed, 162 insertions(+), 23 deletions(-)

diff --git a/docs/my-website/docs/realtime.md b/docs/my-website/docs/realtime.md
index 2149387a6..28697f44b 100644
--- a/docs/my-website/docs/realtime.md
+++ b/docs/my-website/docs/realtime.md
@@ -83,4 +83,21 @@ ws.on("message",
function incoming(message) { ws.on("error", function handleError(error) { console.error("Error: ", error); }); -``` \ No newline at end of file +``` + +## Logging + +To prevent requests from being dropped, by default LiteLLM just logs these event types: + +- `session.created` +- `response.create` +- `response.done` + +You can override this by setting the `logged_real_time_event_types` parameter in the config. For example: + +```yaml +litellm_settings: + logged_real_time_event_types: "*" # Log all events + ## OR ## + logged_real_time_event_types: ["session.created", "response.create", "response.done"] # Log only these event types +``` diff --git a/litellm/__init__.py b/litellm/__init__.py index 357057e4c..2aa89a03c 100644 --- a/litellm/__init__.py +++ b/litellm/__init__.py @@ -56,6 +56,7 @@ _custom_logger_compatible_callbacks_literal = Literal[ "opik", "argilla", ] +logged_real_time_event_types: Optional[Union[List[str], Literal["*"]]] = None _known_custom_logger_compatible_callbacks: List = list( get_args(_custom_logger_compatible_callbacks_literal) ) diff --git a/litellm/_redis.py b/litellm/_redis.py index 289a7d4ae..c058a0d3a 100644 --- a/litellm/_redis.py +++ b/litellm/_redis.py @@ -69,6 +69,8 @@ def _get_redis_cluster_kwargs(client=None): available_args = [x for x in arg_spec.args if x not in exclude_args] available_args.append("password") + available_args.append("username") + available_args.append("ssl") return available_args diff --git a/litellm/litellm_core_utils/realtime_streaming.py b/litellm/litellm_core_utils/realtime_streaming.py index 922f90e36..440deac1c 100644 --- a/litellm/litellm_core_utils/realtime_streaming.py +++ b/litellm/litellm_core_utils/realtime_streaming.py @@ -26,15 +26,24 @@ async with websockets.connect( # type: ignore import asyncio import concurrent.futures +import json import traceback from asyncio import Task from typing import Any, Dict, List, Optional, Union +import litellm + from .litellm_logging import Logging as LiteLLMLogging # Create a thread pool with a maximum of 10 threads executor = concurrent.futures.ThreadPoolExecutor(max_workers=10) +DefaultLoggedRealTimeEventTypes = [ + "session.created", + "response.create", + "response.done", +] + class RealTimeStreaming: def __init__( @@ -49,9 +58,27 @@ class RealTimeStreaming: self.messages: List = [] self.input_message: Dict = {} + _logged_real_time_event_types = litellm.logged_real_time_event_types + + if _logged_real_time_event_types is None: + _logged_real_time_event_types = DefaultLoggedRealTimeEventTypes + self.logged_real_time_event_types = _logged_real_time_event_types + + def _should_store_message(self, message: Union[str, bytes]) -> bool: + if isinstance(message, bytes): + message = message.decode("utf-8") + message_obj = json.loads(message) + _msg_type = message_obj["type"] + if self.logged_real_time_event_types == "*": + return True + if _msg_type in self.logged_real_time_event_types: + return True + return False + def store_message(self, message: Union[str, bytes]): """Store message in list""" - self.messages.append(message) + if self._should_store_message(message): + self.messages.append(message) def store_input(self, message: dict): """Store input message""" diff --git a/litellm/llms/AzureOpenAI/chat/gpt_transformation.py b/litellm/llms/AzureOpenAI/chat/gpt_transformation.py index 271c5c467..fb0b21d3b 100644 --- a/litellm/llms/AzureOpenAI/chat/gpt_transformation.py +++ b/litellm/llms/AzureOpenAI/chat/gpt_transformation.py @@ -198,9 +198,6 @@ class AzureOpenAIConfig: 
optional_params["json_mode"] = True else: optional_params["response_format"] = value - elif param == "max_completion_tokens": - # TODO - Azure OpenAI will probably add support for this, we should pass it through when Azure adds support - optional_params["max_tokens"] = value elif param in supported_openai_params: optional_params[param] = value diff --git a/litellm/llms/AzureOpenAI/realtime/handler.py b/litellm/llms/AzureOpenAI/realtime/handler.py index bf45c53fb..a6c0f1967 100644 --- a/litellm/llms/AzureOpenAI/realtime/handler.py +++ b/litellm/llms/AzureOpenAI/realtime/handler.py @@ -72,5 +72,5 @@ class AzureOpenAIRealtime(AzureChatCompletion): except websockets.exceptions.InvalidStatusCode as e: # type: ignore await websocket.close(code=e.status_code, reason=str(e)) - except Exception as e: - await websocket.close(code=1011, reason=f"Internal server error: {str(e)}") + except Exception: + pass diff --git a/litellm/llms/ollama.py b/litellm/llms/ollama.py index e08e6f693..845d0e2dd 100644 --- a/litellm/llms/ollama.py +++ b/litellm/llms/ollama.py @@ -398,6 +398,7 @@ def ollama_completion_stream(url, data, logging_obj): isinstance(content_chunk, StreamingChoices) and hasattr(content_chunk, "delta") and hasattr(content_chunk.delta, "content") + and content_chunk.delta.content is not None ): content_chunks.append(content_chunk.delta.content) response_content = "".join(content_chunks) diff --git a/litellm/llms/vertex_ai_and_google_ai_studio/vertex_ai_partner_models/anthropic/transformation.py b/litellm/llms/vertex_ai_and_google_ai_studio/vertex_ai_partner_models/anthropic/transformation.py index 44b8af279..406314a59 100644 --- a/litellm/llms/vertex_ai_and_google_ai_studio/vertex_ai_partner_models/anthropic/transformation.py +++ b/litellm/llms/vertex_ai_and_google_ai_studio/vertex_ai_partner_models/anthropic/transformation.py @@ -177,3 +177,16 @@ class VertexAIAnthropicConfig: optional_params["json_mode"] = True return optional_params + + @classmethod + def is_supported_model( + cls, model: str, custom_llm_provider: Optional[str] = None + ) -> bool: + """ + Check if the model is supported by the VertexAI Anthropic API. 
+ """ + if custom_llm_provider == "vertex_ai" and "claude" in model.lower(): + return True + elif model in litellm.vertex_anthropic_models: + return True + return False diff --git a/litellm/main.py b/litellm/main.py index 0e62c9fa2..f239d2612 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -4986,7 +4986,6 @@ def speech( litellm_call_id: Optional[str] = kwargs.get("litellm_call_id", None) proxy_server_request = kwargs.get("proxy_server_request", None) model_info = kwargs.get("model_info", None) - metadata = kwargs.get("metadata", {}) model, custom_llm_provider, dynamic_api_key, api_base = get_llm_provider(model=model, custom_llm_provider=custom_llm_provider, api_base=api_base) # type: ignore kwargs.pop("tags", []) diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json index e63741804..890ef8688 100644 --- a/litellm/model_prices_and_context_window_backup.json +++ b/litellm/model_prices_and_context_window_backup.json @@ -1104,7 +1104,7 @@ "litellm_provider": "azure_ai", "mode": "chat" }, - "azure_ai/Meta-Llama-31-8B-Instruct": { + "azure_ai/Meta-Llama-3.1-8B-Instruct": { "max_tokens": 128000, "max_input_tokens": 128000, "max_output_tokens": 128000, @@ -1114,7 +1114,7 @@ "mode": "chat", "source":"https://azuremarketplace.microsoft.com/en-us/marketplace/apps/metagenai.meta-llama-3-1-8b-instruct-offer?tab=PlansAndPrice" }, - "azure_ai/Meta-Llama-31-70B-Instruct": { + "azure_ai/Meta-Llama-3.1-70B-Instruct": { "max_tokens": 128000, "max_input_tokens": 128000, "max_output_tokens": 128000, @@ -1124,7 +1124,7 @@ "mode": "chat", "source":"https://azuremarketplace.microsoft.com/en-us/marketplace/apps/metagenai.meta-llama-3-1-70b-instruct-offer?tab=PlansAndPrice" }, - "azure_ai/Meta-Llama-31-405B-Instruct": { + "azure_ai/Meta-Llama-3.1-405B-Instruct": { "max_tokens": 128000, "max_input_tokens": 128000, "max_output_tokens": 128000, @@ -6446,6 +6446,14 @@ "litellm_provider": "voyage", "mode": "embedding" }, + "voyage/voyage-finance-2": { + "max_tokens": 4000, + "max_input_tokens": 4000, + "input_cost_per_token": 0.00000012, + "output_cost_per_token": 0.000000, + "litellm_provider": "voyage", + "mode": "embedding" + }, "databricks/databricks-meta-llama-3-1-405b-instruct": { "max_tokens": 128000, "max_input_tokens": 128000, diff --git a/litellm/proxy/_new_secret_config.yaml b/litellm/proxy/_new_secret_config.yaml index 105fabbdd..00f4da8d9 100644 --- a/litellm/proxy/_new_secret_config.yaml +++ b/litellm/proxy/_new_secret_config.yaml @@ -1,8 +1,10 @@ model_list: - - model_name: gpt-3.5-turbo - litellm_params: - model: gpt-3.5-turbo - api_key: os.environ/OPENAI_API_KEY + - model_name: gpt-4o + litellm_params: + model: azure/gpt-4o-realtime-preview + api_key: os.environ/AZURE_SWEDEN_API_KEY + api_base: os.environ/AZURE_SWEDEN_API_BASE litellm_settings: - callbacks: ["prometheus"] \ No newline at end of file + success_callback: ["langfuse"] + # logged_real_time_event_types: "*" \ No newline at end of file diff --git a/litellm/utils.py b/litellm/utils.py index e82a937bb..6b3f0a80c 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -126,6 +126,7 @@ except (ImportError, AttributeError): os.environ["TIKTOKEN_CACHE_DIR"] = os.getenv( "CUSTOM_TIKTOKEN_CACHE_DIR", filename ) # use local copy of tiktoken b/c of - https://github.com/BerriAI/litellm/issues/1071 +from tiktoken import Encoding encoding = tiktoken.get_encoding("cl100k_base") from importlib import resources @@ -1278,7 +1279,10 @@ def encode(model="", text="", custom_tokenizer: 
Optional[dict] = None): enc: The encoded text. """ tokenizer_json = custom_tokenizer or _select_tokenizer(model=model) - enc = tokenizer_json["tokenizer"].encode(text) + if isinstance(tokenizer_json["tokenizer"], Encoding): + enc = tokenizer_json["tokenizer"].encode(text, disallowed_special=()) + else: + enc = tokenizer_json["tokenizer"].encode(text) return enc @@ -3045,8 +3049,8 @@ def get_optional_params( # noqa: PLR0915 ) if litellm.vertex_ai_safety_settings is not None: optional_params["safety_settings"] = litellm.vertex_ai_safety_settings - elif ( - custom_llm_provider == "vertex_ai" and model in litellm.vertex_anthropic_models + elif litellm.VertexAIAnthropicConfig.is_supported_model( + model=model, custom_llm_provider=custom_llm_provider ): supported_params = get_supported_openai_params( model=model, custom_llm_provider=custom_llm_provider diff --git a/model_prices_and_context_window.json b/model_prices_and_context_window.json index e63741804..890ef8688 100644 --- a/model_prices_and_context_window.json +++ b/model_prices_and_context_window.json @@ -1104,7 +1104,7 @@ "litellm_provider": "azure_ai", "mode": "chat" }, - "azure_ai/Meta-Llama-31-8B-Instruct": { + "azure_ai/Meta-Llama-3.1-8B-Instruct": { "max_tokens": 128000, "max_input_tokens": 128000, "max_output_tokens": 128000, @@ -1114,7 +1114,7 @@ "mode": "chat", "source":"https://azuremarketplace.microsoft.com/en-us/marketplace/apps/metagenai.meta-llama-3-1-8b-instruct-offer?tab=PlansAndPrice" }, - "azure_ai/Meta-Llama-31-70B-Instruct": { + "azure_ai/Meta-Llama-3.1-70B-Instruct": { "max_tokens": 128000, "max_input_tokens": 128000, "max_output_tokens": 128000, @@ -1124,7 +1124,7 @@ "mode": "chat", "source":"https://azuremarketplace.microsoft.com/en-us/marketplace/apps/metagenai.meta-llama-3-1-70b-instruct-offer?tab=PlansAndPrice" }, - "azure_ai/Meta-Llama-31-405B-Instruct": { + "azure_ai/Meta-Llama-3.1-405B-Instruct": { "max_tokens": 128000, "max_input_tokens": 128000, "max_output_tokens": 128000, @@ -6446,6 +6446,14 @@ "litellm_provider": "voyage", "mode": "embedding" }, + "voyage/voyage-finance-2": { + "max_tokens": 4000, + "max_input_tokens": 4000, + "input_cost_per_token": 0.00000012, + "output_cost_per_token": 0.000000, + "litellm_provider": "voyage", + "mode": "embedding" + }, "databricks/databricks-meta-llama-3-1-405b-instruct": { "max_tokens": 128000, "max_input_tokens": 128000, diff --git a/tests/llm_translation/test_max_completion_tokens.py b/tests/llm_translation/test_max_completion_tokens.py index d0819e544..de335a3c5 100644 --- a/tests/llm_translation/test_max_completion_tokens.py +++ b/tests/llm_translation/test_max_completion_tokens.py @@ -235,7 +235,7 @@ def test_all_model_configs(): optional_params={}, api_version="2022-12-01", drop_params=False, - ) == {"max_tokens": 10} + ) == {"max_completion_tokens": 10} from litellm.llms.bedrock.chat.converse_transformation import AmazonConverseConfig diff --git a/tests/llm_translation/test_optional_params.py b/tests/llm_translation/test_optional_params.py index 728ec1507..a0387ce1b 100644 --- a/tests/llm_translation/test_optional_params.py +++ b/tests/llm_translation/test_optional_params.py @@ -775,3 +775,12 @@ def test_hosted_vllm_tool_param(): ) assert "tools" not in optional_params assert "tool_choice" not in optional_params + + +def test_unmapped_vertex_anthropic_model(): + optional_params = get_optional_params( + model="claude-3-5-sonnet-v250@20241022", + custom_llm_provider="vertex_ai", + max_retries=10, + ) + assert "max_retries" not in optional_params diff --git 
a/tests/local_testing/test_completion_cost.py b/tests/local_testing/test_completion_cost.py index fcd80650a..21659a4c1 100644 --- a/tests/local_testing/test_completion_cost.py +++ b/tests/local_testing/test_completion_cost.py @@ -2587,3 +2587,50 @@ async def test_test_completion_cost_gpt4o_audio_output_from_model(stream): total_output_cost = output_audio_cost + output_text_cost assert round(cost, 2) == round(total_input_cost + total_output_cost, 2) + + +def test_completion_cost_azure_ai_meta(): + """ + Relevant issue: https://github.com/BerriAI/litellm/issues/6310 + """ + from litellm import ModelResponse + + os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True" + litellm.model_cost = litellm.get_model_cost_map(url="") + + litellm.set_verbose = True + response = { + "id": "cmpl-55db75e0b05344058b0bd8ee4e00bf84", + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "logprobs": None, + "message": { + "content": 'Here\'s one:\n\nWhy did the Linux kernel go to therapy?\n\nBecause it had a lot of "core" issues!\n\nHope that one made you laugh!', + "refusal": None, + "role": "assistant", + "audio": None, + "function_call": None, + "tool_calls": [], + }, + } + ], + "created": 1729243714, + "model": "azure_ai/Meta-Llama-3.1-70B-Instruct", + "object": "chat.completion", + "service_tier": None, + "system_fingerprint": None, + "usage": { + "completion_tokens": 32, + "prompt_tokens": 16, + "total_tokens": 48, + "completion_tokens_details": None, + "prompt_tokens_details": None, + }, + } + + model_response = ModelResponse(**response) + cost = completion_cost(model_response, custom_llm_provider="azure_ai") + + assert cost > 0 diff --git a/tests/local_testing/test_token_counter.py b/tests/local_testing/test_token_counter.py index 6dbf286e4..3ad73f2d8 100644 --- a/tests/local_testing/test_token_counter.py +++ b/tests/local_testing/test_token_counter.py @@ -375,3 +375,7 @@ def test_img_url_token_counter(img_url): assert width is not None assert height is not None + + +def test_token_encode_disallowed_special(): + encode(model="gpt-3.5-turbo", text="Hello, world! <|endoftext|>")
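
For readers unfamiliar with the tiktoken behavior that the new `test_token_encode_disallowed_special` test exercises: by default, `Encoding.encode()` raises a `ValueError` when the input contains a special token such as `<|endoftext|>`, and passing `disallowed_special=()` makes it tokenize the marker as plain text instead. Below is a minimal standalone sketch of that behavior, calling tiktoken directly rather than going through LiteLLM's `encode()` wrapper.

```python
import tiktoken

# Same base encoding LiteLLM falls back to for OpenAI-style models.
enc = tiktoken.get_encoding("cl100k_base")

text = "Hello, world! <|endoftext|>"

# Default behavior: special tokens in the input are disallowed and raise.
try:
    enc.encode(text)
except ValueError as err:
    print(f"plain encode() raised: {err}")

# With disallowed_special=(), the marker is encoded as ordinary text.
tokens = enc.encode(text, disallowed_special=())
print(f"token count with disallowed_special=(): {len(tokens)}")
```

Note that `disallowed_special=()` does not map `<|endoftext|>` to its special token id; the marker is simply byte-pair encoded as regular text, which is the safe behavior for token counting.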