Litellm dev 2024 12 20 p1 (#7335)

* fix(utils.py): e2e azure tts cost tracking working

moves the TTS response object to a wrapper that includes hidden params (allows litellm call id, etc. to be sent in response headers); fixes the spend_tracking_utils logging payload to account for the non-BaseModel response use-case

Fixes https://github.com/BerriAI/litellm/issues/7223
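
For context, the pattern underneath this fix: subclass the OpenAI SDK's binary response type so metadata can ride along with the audio bytes. A minimal sketch (the wiring comment at the end is hypothetical, not the exact proxy code):

from openai._legacy_response import (
    HttpxBinaryResponseContent as _HttpxBinaryResponseContent,
)

class HttpxBinaryResponseContent(_HttpxBinaryResponseContent):
    # slot the SDK class lacks; logging / response-header code can stash
    # metadata here without touching the binary payload
    _hidden_params: dict = {}

# hypothetical wiring after a TTS call:
#   wrapped = HttpxBinaryResponseContent(response=sdk_response.response)
#   wrapped._hidden_params = {"litellm_call_id": call_id}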

* fix: fix linting errors

* build(model_prices_and_context_window.json): add bedrock llama 3.3

Closes https://github.com/BerriAI/litellm/issues/7329
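
For reference, a hedged usage sketch once this pricing entry lands; it assumes standard AWS credentials in the environment and is not taken from the repo's docs:

import litellm

# the "bedrock/" prefix routes the request to the Bedrock provider
response = litellm.completion(
    model="bedrock/meta.llama3-3-70b-instruct-v1:0",
    messages=[{"role": "user", "content": "Hello from Llama 3.3"}],
)
print(response.choices[0].message.content)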

* fix(openai.py): fix return type for sync openai httpx response

* test: update test

* fix(spend_tracking_utils.py): fix if check

* fix(spend_tracking_utils.py): fix if check

* test: improve debugging for test

* fix: fix import
Krish Dholakia 2024-12-20 21:22:31 -08:00 committed by GitHub
parent 522da384b6
commit 404bf2974b
12 changed files with 63 additions and 36 deletions
.gitignore

@@ -66,3 +66,4 @@ litellm/tests/langfuse.log
 litellm/tests/langfuse.log
 litellm/proxy/google-cloud-sdk/*
 tests/llm_translation/log.txt
+venv/

@@ -1386,7 +1386,7 @@ class AzureChatCompletion(BaseLLM):
             input=input,
             **optional_params,
         )
-        return response
+        return HttpxBinaryResponseContent(response=response.response)

     async def async_audio_speech(
         self,
@@ -1415,14 +1415,14 @@ class AzureChatCompletion(BaseLLM):
             client_type="async",
         )  # type: ignore

-        response = await azure_client.audio.speech.create(
+        azure_response = await azure_client.audio.speech.create(
             model=model,
             voice=voice,  # type: ignore
             input=input,
             **optional_params,
         )

-        return response
+        return HttpxBinaryResponseContent(response=azure_response.response)

     def get_headers(
         self,

@@ -1,4 +1,4 @@
-from typing import Any, Coroutine, Optional, Union
+from typing import Any, Coroutine, Optional, Union, cast

 import httpx
 from openai import AsyncAzureOpenAI, AzureOpenAI
@@ -111,7 +111,7 @@ class AzureOpenAIFilesAPI(BaseLLM):
         openai_client: AsyncAzureOpenAI,
     ) -> HttpxBinaryResponseContent:
         response = await openai_client.files.content(**file_content_request)
-        return response
+        return HttpxBinaryResponseContent(response=response.response)

     def file_content(
         self,
@@ -152,9 +152,11 @@ class AzureOpenAIFilesAPI(BaseLLM):
                 file_content_request=file_content_request,
                 openai_client=openai_client,
             )
-        response = openai_client.files.content(**file_content_request)
+        response = cast(AzureOpenAI, openai_client).files.content(
+            **file_content_request
+        )

-        return response
+        return HttpxBinaryResponseContent(response=response.response)

     async def aretrieve_file(
         self,

@@ -1253,13 +1253,13 @@ class OpenAIChatCompletion(BaseLLM):
             client=client,
         )

-        response = openai_client.audio.speech.create(
+        response = cast(OpenAI, openai_client).audio.speech.create(
             model=model,
             voice=voice,  # type: ignore
             input=input,
             **optional_params,
         )
-        return response  # type: ignore
+        return HttpxBinaryResponseContent(response=response.response)

     async def async_audio_speech(
         self,
@@ -1276,13 +1276,16 @@ class OpenAIChatCompletion(BaseLLM):
         client=None,
     ) -> HttpxBinaryResponseContent:

-        openai_client = self._get_openai_client(
-            is_async=True,
-            api_key=api_key,
-            api_base=api_base,
-            timeout=timeout,
-            max_retries=max_retries,
-            client=client,
+        openai_client = cast(
+            AsyncOpenAI,
+            self._get_openai_client(
+                is_async=True,
+                api_key=api_key,
+                api_base=api_base,
+                timeout=timeout,
+                max_retries=max_retries,
+                client=client,
+            ),
         )

         response = await openai_client.audio.speech.create(
@@ -1292,7 +1295,7 @@ class OpenAIChatCompletion(BaseLLM):
             **optional_params,
         )

-        return response
+        return HttpxBinaryResponseContent(response=response.response)

     async def ahealth_check(
         self,
@@ -1477,7 +1480,7 @@ class OpenAIFilesAPI(BaseLLM):
         openai_client: AsyncOpenAI,
     ) -> HttpxBinaryResponseContent:
         response = await openai_client.files.content(**file_content_request)
-        return response
+        return HttpxBinaryResponseContent(response=response.response)

     def file_content(
         self,
@@ -1515,9 +1518,9 @@ class OpenAIFilesAPI(BaseLLM):
             file_content_request=file_content_request,
             openai_client=openai_client,
         )
-        response = openai_client.files.content(**file_content_request)
+        response = cast(OpenAI, openai_client).files.content(**file_content_request)

-        return response
+        return HttpxBinaryResponseContent(response=response.response)

     async def aretrieve_file(
         self,

@@ -5986,6 +5986,15 @@
         "litellm_provider": "bedrock",
         "mode": "embedding"
     },
+    "meta.llama3-3-70b-instruct-v1:0": {
+        "max_tokens": 4096,
+        "max_input_tokens": 128000,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000072,
+        "output_cost_per_token": 0.00000072,
+        "litellm_provider": "bedrock",
+        "mode": "chat"
+    },
     "meta.llama2-13b-chat-v1": {
         "max_tokens": 4096,
         "max_input_tokens": 4096,

@@ -12,12 +12,6 @@ model_list:
       model: openai/gpt-4o
       api_key: os.environ/OPENAI_API_KEY
       num_retries: 3
-  - model_name: azure-tts
-    litellm_params:
-      model: azure/tts-1
-      api_key: "6a4ULQtTSBZi5TONpzGP2GkKDUSSReHFQYwjMpZaywNxY03FsplGJQQJ99ALACHrzpqXJ3w3AAAAACOGUsjy"
-      api_base: "https://krris-m4rve6fd-northcentralus.cognitiveservices.azure.com/openai/deployments/tts"
-      api_version: "2024-05-01-preview"

 litellm_settings:
   success_callback: ["langfuse"]

@@ -1,7 +1,7 @@
 import json
 import secrets
 from datetime import datetime as dt
-from typing import Optional
+from typing import Optional, cast

 from pydantic import BaseModel
@@ -40,7 +40,9 @@ def get_logging_payload(
     if kwargs is None:
         kwargs = {}
-    if response_obj is None:
+    if response_obj is None or (
+        not isinstance(response_obj, BaseModel) and not isinstance(response_obj, dict)
+    ):
         response_obj = {}
     # standardize this function to be used across, s3, dynamoDB, langfuse logging
     litellm_params = kwargs.get("litellm_params", {})
@@ -50,10 +52,10 @@ def get_logging_payload(
     completion_start_time = kwargs.get("completion_start_time", end_time)
     call_type = kwargs.get("call_type")
     cache_hit = kwargs.get("cache_hit", False)
-    usage = response_obj.get("usage", None) or {}
+    usage = cast(dict, response_obj).get("usage", None) or {}
     if isinstance(usage, litellm.Usage):
         usage = dict(usage)
-    id = response_obj.get("id") or kwargs.get("litellm_call_id")
+    id = cast(dict, response_obj).get("id") or kwargs.get("litellm_call_id")
     api_key = metadata.get("user_api_key", "")
     if api_key is not None and isinstance(api_key, str):
         if api_key.startswith("sk-"):
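
The broadened guard matters because an audio response is neither a dict nor a pydantic BaseModel, so the old code fell through to .get() calls that don't exist on it. A minimal repro of the failure mode the check now avoids (FakeBinaryResponse is an illustrative stand-in, not the real response class):

from pydantic import BaseModel

class FakeBinaryResponse:
    # stand-in for a TTS-style binary response: no dict interface, not a BaseModel
    content = b"audio-bytes"

response_obj = FakeBinaryResponse()
# before this fix, only `response_obj is None` was checked, so the later
# response_obj.get("usage") raised AttributeError for binary responses
if response_obj is None or (
    not isinstance(response_obj, BaseModel) and not isinstance(response_obj, dict)
):
    response_obj = {}
usage = response_obj.get("usage", None) or {}  # now safely {}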

@@ -12,7 +12,7 @@ from litellm.llms.custom_httpx.llm_http_handler import BaseLLMHTTPHandler
 from litellm.llms.jina_ai.rerank.handler import JinaAIRerank
 from litellm.llms.together_ai.rerank.handler import TogetherAIRerank
 from litellm.rerank_api.rerank_utils import get_optional_rerank_params
-from litellm.secret_managers.main import get_secret
+from litellm.secret_managers.main import get_secret, get_secret_str
 from litellm.types.rerank import OptionalRerankParams, RerankResponse
 from litellm.types.router import *
 from litellm.utils import ProviderConfigManager, client, exception_type
@@ -211,7 +211,7 @@ def rerank(  # noqa: PLR0915
             dynamic_api_base
             or optional_params.api_base
             or litellm.api_base
-            or get_secret("INFINITY_API_BASE")  # type: ignore
+            or get_secret_str("INFINITY_API_BASE")
         )

         if api_base is None:
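
The get_secret -> get_secret_str swap is what drops the type: ignore: a str-narrowed getter lets the api_base expression type-check as Optional[str]. A hedged sketch of the difference; the return types here are inferred from the call sites, not copied from litellm's source:

from typing import Optional, Union

def get_secret(secret_name: str) -> Union[str, bool, None]:
    ...  # broad return type: assigning to an Optional[str] needs a type: ignore

def get_secret_str(secret_name: str) -> Optional[str]:
    ...  # str-narrowed variant: the INFINITY_API_BASE lookup type-checks cleanly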

@@ -1,7 +1,9 @@
 from os import PathLike
 from typing import IO, Any, Iterable, List, Literal, Mapping, Optional, Tuple, Union

-from openai._legacy_response import HttpxBinaryResponseContent
+from openai._legacy_response import (
+    HttpxBinaryResponseContent as _HttpxBinaryResponseContent,
+)
 from openai.lib.streaming._assistants import (
     AssistantEventHandler,
     AssistantStreamManager,
@@ -48,6 +50,11 @@ FileTypes = Union[

 EmbeddingInput = Union[str, List[str]]

+
+class HttpxBinaryResponseContent(_HttpxBinaryResponseContent):
+    _hidden_params: dict = {}
+    pass
+

 class NotGiven:
     """
     A sentinel singleton class used to distinguish omitted keyword arguments
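
Because the new class subclasses the SDK type, it satisfies isinstance checks and existing -> HttpxBinaryResponseContent annotations without touching callers; the updated test import further below relies on exactly that. A quick illustration:

from openai._legacy_response import (
    HttpxBinaryResponseContent as _HttpxBinaryResponseContent,
)
from litellm.types.llms.openai import HttpxBinaryResponseContent

# the wrapper is still the SDK type as far as isinstance / annotations go
assert issubclass(HttpxBinaryResponseContent, _HttpxBinaryResponseContent)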

@@ -5986,6 +5986,15 @@
         "litellm_provider": "bedrock",
         "mode": "embedding"
     },
+    "meta.llama3-3-70b-instruct-v1:0": {
+        "max_tokens": 4096,
+        "max_input_tokens": 128000,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000072,
+        "output_cost_per_token": 0.00000072,
+        "litellm_provider": "bedrock",
+        "mode": "chat"
+    },
     "meta.llama2-13b-chat-v1": {
         "max_tokens": 4096,
         "max_input_tokens": 4096,

@@ -210,7 +210,7 @@ async def test_get_router_response():
 # reason="Local test. Vertex AI Quota is low. Leads to rate limit errors on ci/cd."
 # )
 # @pytest.mark.flaky(retries=3, delay=1)
-def test_vertex_ai_anthropic():
+def test_aavertex_ai_anthropic():
     model = "claude-3-sonnet@20240229"

     vertex_ai_project = "adroit-crow-413218"

@@ -61,7 +61,7 @@ async def test_audio_speech_litellm(sync_mode, model, api_base, api_key):
             optional_params={},
         )

-        from litellm.llms.openai.openai import HttpxBinaryResponseContent
+        from litellm.types.llms.openai import HttpxBinaryResponseContent

         assert isinstance(response, HttpxBinaryResponseContent)
     else: