Litellm dev 2024 12 20 p1 (#7335)

* fix(utils.py): e2e azure tts cost tracking working

moves tts response obj to include hidden params (allows for litellm call id, etc. to be sent in response headers); fixes spend_tracking_utils logging payload to account for the non-base-model use-case

Fixes https://github.com/BerriAI/litellm/issues/7223
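
A rough usage sketch (not part of this commit) of the TTS path whose response object now carries hidden params; the deployment name, voice, and the _hidden_params contents are assumptions:

    # assumes an Azure TTS deployment is configured via AZURE_API_KEY / AZURE_API_BASE / AZURE_API_VERSION
    import litellm

    response = litellm.speech(
        model="azure/tts-1",
        voice="alloy",
        input="hello from litellm",
    )
    response.stream_to_file("speech.mp3")  # HttpxBinaryResponseContent helper
    # the wrapped response can now carry litellm metadata (e.g. call id, cost) for header / cost tracking
    print(getattr(response, "_hidden_params", {}))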

* fix: fix linting errors

* build(model_prices_and_context_window.json): add bedrock llama 3.3

Closes https://github.com/BerriAI/litellm/issues/7329
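
A hedged example of calling the newly priced model through litellm (assumes AWS credentials and a Bedrock-enabled region are configured):

    import litellm

    resp = litellm.completion(
        model="bedrock/meta.llama3-3-70b-instruct-v1:0",
        messages=[{"role": "user", "content": "Say hello"}],
    )
    print(resp.choices[0].message.content)
    # cost should follow the new pricing entry: 0.00000072 USD per input and output token
    print(litellm.completion_cost(completion_response=resp))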

* fix(openai.py): fix return type for sync openai httpx response

* test: update test

* fix(spend_tracking_utils.py): fix if check

* fix(spend_tracking_utils.py): fix if check

* test: improve debugging for test

* fix: fix import
Krish Dholakia 2024-12-20 21:22:31 -08:00 committed by GitHub
parent 522da384b6
commit 404bf2974b
12 changed files with 63 additions and 36 deletions

.gitignore (vendored)

@@ -66,3 +66,4 @@ litellm/tests/langfuse.log
 litellm/tests/langfuse.log
 litellm/proxy/google-cloud-sdk/*
 tests/llm_translation/log.txt
+venv/


@@ -1386,7 +1386,7 @@ class AzureChatCompletion(BaseLLM):
             input=input,
             **optional_params,
         )
-        return response
+        return HttpxBinaryResponseContent(response=response.response)

     async def async_audio_speech(
         self,
@@ -1415,14 +1415,14 @@ class AzureChatCompletion(BaseLLM):
             client_type="async",
         )  # type: ignore

-        response = await azure_client.audio.speech.create(
+        azure_response = await azure_client.audio.speech.create(
             model=model,
             voice=voice,  # type: ignore
             input=input,
             **optional_params,
         )

-        return response
+        return HttpxBinaryResponseContent(response=azure_response.response)

     def get_headers(
         self,


@@ -1,4 +1,4 @@
-from typing import Any, Coroutine, Optional, Union
+from typing import Any, Coroutine, Optional, Union, cast

 import httpx
 from openai import AsyncAzureOpenAI, AzureOpenAI
@@ -111,7 +111,7 @@ class AzureOpenAIFilesAPI(BaseLLM):
         openai_client: AsyncAzureOpenAI,
     ) -> HttpxBinaryResponseContent:
         response = await openai_client.files.content(**file_content_request)
-        return response
+        return HttpxBinaryResponseContent(response=response.response)

     def file_content(
         self,
@@ -152,9 +152,11 @@ class AzureOpenAIFilesAPI(BaseLLM):
                 file_content_request=file_content_request,
                 openai_client=openai_client,
             )
-        response = openai_client.files.content(**file_content_request)
+        response = cast(AzureOpenAI, openai_client).files.content(
+            **file_content_request
+        )

-        return response
+        return HttpxBinaryResponseContent(response=response.response)

     async def aretrieve_file(
         self,


@@ -1253,13 +1253,13 @@ class OpenAIChatCompletion(BaseLLM):
             client=client,
         )

-        response = openai_client.audio.speech.create(
+        response = cast(OpenAI, openai_client).audio.speech.create(
             model=model,
             voice=voice,  # type: ignore
             input=input,
             **optional_params,
         )
-        return response  # type: ignore
+        return HttpxBinaryResponseContent(response=response.response)

     async def async_audio_speech(
         self,
@@ -1276,13 +1276,16 @@ class OpenAIChatCompletion(BaseLLM):
         client=None,
     ) -> HttpxBinaryResponseContent:
-        openai_client = self._get_openai_client(
+        openai_client = cast(
+            AsyncOpenAI,
+            self._get_openai_client(
                 is_async=True,
                 api_key=api_key,
                 api_base=api_base,
                 timeout=timeout,
                 max_retries=max_retries,
                 client=client,
+            ),
         )

         response = await openai_client.audio.speech.create(
@@ -1292,7 +1295,7 @@ class OpenAIChatCompletion(BaseLLM):
             **optional_params,
         )
-        return response
+        return HttpxBinaryResponseContent(response=response.response)

     async def ahealth_check(
         self,
@@ -1477,7 +1480,7 @@ class OpenAIFilesAPI(BaseLLM):
         openai_client: AsyncOpenAI,
     ) -> HttpxBinaryResponseContent:
         response = await openai_client.files.content(**file_content_request)
-        return response
+        return HttpxBinaryResponseContent(response=response.response)

     def file_content(
         self,
@@ -1515,9 +1518,9 @@ class OpenAIFilesAPI(BaseLLM):
                 file_content_request=file_content_request,
                 openai_client=openai_client,
             )
-        response = openai_client.files.content(**file_content_request)
+        response = cast(OpenAI, openai_client).files.content(**file_content_request)

-        return response
+        return HttpxBinaryResponseContent(response=response.response)

     async def aretrieve_file(
         self,


@@ -5986,6 +5986,15 @@
         "litellm_provider": "bedrock",
         "mode": "embedding"
     },
+    "meta.llama3-3-70b-instruct-v1:0": {
+        "max_tokens": 4096,
+        "max_input_tokens": 128000,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000072,
+        "output_cost_per_token": 0.00000072,
+        "litellm_provider": "bedrock",
+        "mode": "chat"
+    },
     "meta.llama2-13b-chat-v1": {
         "max_tokens": 4096,
         "max_input_tokens": 4096,


@@ -12,12 +12,6 @@ model_list:
       model: openai/gpt-4o
       api_key: os.environ/OPENAI_API_KEY
       num_retries: 3
-  - model_name: azure-tts
-    litellm_params:
-      model: azure/tts-1
-      api_key: "6a4ULQtTSBZi5TONpzGP2GkKDUSSReHFQYwjMpZaywNxY03FsplGJQQJ99ALACHrzpqXJ3w3AAAAACOGUsjy"
-      api_base: "https://krris-m4rve6fd-northcentralus.cognitiveservices.azure.com/openai/deployments/tts"
-      api_version: "2024-05-01-preview"

 litellm_settings:
   success_callback: ["langfuse"]


@@ -1,7 +1,7 @@
 import json
 import secrets
 from datetime import datetime as dt
-from typing import Optional
+from typing import Optional, cast

 from pydantic import BaseModel
@@ -40,7 +40,9 @@ def get_logging_payload(
     if kwargs is None:
         kwargs = {}
-    if response_obj is None:
+    if response_obj is None or (
+        not isinstance(response_obj, BaseModel) and not isinstance(response_obj, dict)
+    ):
         response_obj = {}
     # standardize this function to be used across, s3, dynamoDB, langfuse logging
     litellm_params = kwargs.get("litellm_params", {})
@@ -50,10 +52,10 @@ def get_logging_payload(
     completion_start_time = kwargs.get("completion_start_time", end_time)
     call_type = kwargs.get("call_type")
     cache_hit = kwargs.get("cache_hit", False)
-    usage = response_obj.get("usage", None) or {}
+    usage = cast(dict, response_obj).get("usage", None) or {}
     if isinstance(usage, litellm.Usage):
         usage = dict(usage)
-    id = response_obj.get("id") or kwargs.get("litellm_call_id")
+    id = cast(dict, response_obj).get("id") or kwargs.get("litellm_call_id")
     api_key = metadata.get("user_api_key", "")
     if api_key is not None and isinstance(api_key, str):
         if api_key.startswith("sk-"):


@@ -12,7 +12,7 @@ from litellm.llms.custom_httpx.llm_http_handler import BaseLLMHTTPHandler
 from litellm.llms.jina_ai.rerank.handler import JinaAIRerank
 from litellm.llms.together_ai.rerank.handler import TogetherAIRerank
 from litellm.rerank_api.rerank_utils import get_optional_rerank_params
-from litellm.secret_managers.main import get_secret
+from litellm.secret_managers.main import get_secret, get_secret_str
 from litellm.types.rerank import OptionalRerankParams, RerankResponse
 from litellm.types.router import *
 from litellm.utils import ProviderConfigManager, client, exception_type
@@ -211,7 +211,7 @@ def rerank( # noqa: PLR0915
             dynamic_api_base
             or optional_params.api_base
             or litellm.api_base
-            or get_secret("INFINITY_API_BASE")  # type: ignore
+            or get_secret_str("INFINITY_API_BASE")
         )

         if api_base is None:


@@ -1,7 +1,9 @@
 from os import PathLike
 from typing import IO, Any, Iterable, List, Literal, Mapping, Optional, Tuple, Union

-from openai._legacy_response import HttpxBinaryResponseContent
+from openai._legacy_response import (
+    HttpxBinaryResponseContent as _HttpxBinaryResponseContent,
+)
 from openai.lib.streaming._assistants import (
     AssistantEventHandler,
     AssistantStreamManager,
@@ -48,6 +50,11 @@ FileTypes = Union[
 EmbeddingInput = Union[str, List[str]]


+class HttpxBinaryResponseContent(_HttpxBinaryResponseContent):
+    _hidden_params: dict = {}
+    pass
+
+
 class NotGiven:
     """
     A sentinel singleton class used to distinguish omitted keyword arguments
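
A minimal sketch of what the new wrapper class enables (illustrative only; the stand-in httpx.Response and the _hidden_params key are assumptions, in the real handlers the response comes from the OpenAI/Azure SDK call):

    import httpx
    from litellm.types.llms.openai import HttpxBinaryResponseContent

    # build a fake binary response just to show the wrapping
    raw = httpx.Response(
        200,
        content=b"fake audio bytes",
        request=httpx.Request("POST", "https://example.invalid/audio/speech"),
    )
    wrapped = HttpxBinaryResponseContent(response=raw)
    wrapped._hidden_params = {"litellm_call_id": "abc-123"}  # metadata rides along with the binary payload
    print(wrapped.content[:9], wrapped._hidden_params)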


@@ -5986,6 +5986,15 @@
         "litellm_provider": "bedrock",
         "mode": "embedding"
     },
+    "meta.llama3-3-70b-instruct-v1:0": {
+        "max_tokens": 4096,
+        "max_input_tokens": 128000,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000072,
+        "output_cost_per_token": 0.00000072,
+        "litellm_provider": "bedrock",
+        "mode": "chat"
+    },
     "meta.llama2-13b-chat-v1": {
         "max_tokens": 4096,
         "max_input_tokens": 4096,


@@ -210,7 +210,7 @@ async def test_get_router_response():
 # reason="Local test. Vertex AI Quota is low. Leads to rate limit errors on ci/cd."
 # )
 # @pytest.mark.flaky(retries=3, delay=1)
-def test_vertex_ai_anthropic():
+def test_aavertex_ai_anthropic():
     model = "claude-3-sonnet@20240229"
     vertex_ai_project = "adroit-crow-413218"


@@ -61,7 +61,7 @@ async def test_audio_speech_litellm(sync_mode, model, api_base, api_key):
             optional_params={},
         )

-        from litellm.llms.openai.openai import HttpxBinaryResponseContent
+        from litellm.types.llms.openai import HttpxBinaryResponseContent

         assert isinstance(response, HttpxBinaryResponseContent)
     else: