Litellm dev 2024 12 20 p1 (#7335)

* fix(utils.py): e2e azure tts cost tracking working

moves tts response obj to include hidden params (allows for litellm call id, etc. to be sent in response headers); fixes spend_tracking_utils logging payload to account for the non-base-model use-case

Fixes https://github.com/BerriAI/litellm/issues/7223
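
A rough usage sketch (not part of this commit) of the TTS path whose response object now carries hidden params; the deployment name, voice, and the _hidden_params contents are assumptions:

    # assumes an Azure TTS deployment is configured via AZURE_API_KEY / AZURE_API_BASE / AZURE_API_VERSION
    import litellm

    response = litellm.speech(
        model="azure/tts-1",
        voice="alloy",
        input="hello from litellm",
    )
    response.stream_to_file("speech.mp3")  # HttpxBinaryResponseContent helper
    # the wrapped response can now carry litellm metadata (e.g. call id, cost) for header / cost tracking
    print(getattr(response, "_hidden_params", {}))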

* fix: fix linting errors

* build(model_prices_and_context_window.json): add bedrock llama 3.3

Closes https://github.com/BerriAI/litellm/issues/7329
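
A hedged example of calling the newly priced model through litellm (assumes AWS credentials and a Bedrock-enabled region are configured):

    import litellm

    resp = litellm.completion(
        model="bedrock/meta.llama3-3-70b-instruct-v1:0",
        messages=[{"role": "user", "content": "Say hello"}],
    )
    print(resp.choices[0].message.content)
    # cost should follow the new pricing entry: 0.00000072 USD per input and output token
    print(litellm.completion_cost(completion_response=resp))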

* fix(openai.py): fix return type for sync openai httpx response

* test: update test

* fix(spend_tracking_utils.py): fix if check

* fix(spend_tracking_utils.py): fix if check

* test: improve debugging for test

* fix: fix import
Krish Dholakia 2024-12-20 21:22:31 -08:00 committed by GitHub
parent 522da384b6
commit 404bf2974b
12 changed files with 63 additions and 36 deletions

.gitignore (vendored)

@@ -66,3 +66,4 @@ litellm/tests/langfuse.log
 litellm/tests/langfuse.log
 litellm/proxy/google-cloud-sdk/*
 tests/llm_translation/log.txt
+venv/


@@ -1386,7 +1386,7 @@ class AzureChatCompletion(BaseLLM):
             input=input,
             **optional_params,
         )
-        return response
+        return HttpxBinaryResponseContent(response=response.response)

     async def async_audio_speech(
         self,
@@ -1415,14 +1415,14 @@ class AzureChatCompletion(BaseLLM):
             client_type="async",
         )  # type: ignore

-        response = await azure_client.audio.speech.create(
+        azure_response = await azure_client.audio.speech.create(
             model=model,
             voice=voice,  # type: ignore
             input=input,
             **optional_params,
         )

-        return response
+        return HttpxBinaryResponseContent(response=azure_response.response)

     def get_headers(
         self,


@@ -1,4 +1,4 @@
-from typing import Any, Coroutine, Optional, Union
+from typing import Any, Coroutine, Optional, Union, cast

 import httpx
 from openai import AsyncAzureOpenAI, AzureOpenAI
@@ -111,7 +111,7 @@ class AzureOpenAIFilesAPI(BaseLLM):
         openai_client: AsyncAzureOpenAI,
     ) -> HttpxBinaryResponseContent:
         response = await openai_client.files.content(**file_content_request)
-        return response
+        return HttpxBinaryResponseContent(response=response.response)

     def file_content(
         self,
@@ -152,9 +152,11 @@ class AzureOpenAIFilesAPI(BaseLLM):
                 file_content_request=file_content_request,
                 openai_client=openai_client,
             )
-        response = openai_client.files.content(**file_content_request)
+        response = cast(AzureOpenAI, openai_client).files.content(
+            **file_content_request
+        )

-        return response
+        return HttpxBinaryResponseContent(response=response.response)

     async def aretrieve_file(
         self,


@@ -1253,13 +1253,13 @@ class OpenAIChatCompletion(BaseLLM):
             client=client,
         )

-        response = openai_client.audio.speech.create(
+        response = cast(OpenAI, openai_client).audio.speech.create(
             model=model,
             voice=voice,  # type: ignore
             input=input,
             **optional_params,
         )
-        return response  # type: ignore
+        return HttpxBinaryResponseContent(response=response.response)

     async def async_audio_speech(
         self,
@@ -1276,13 +1276,16 @@ class OpenAIChatCompletion(BaseLLM):
         client=None,
     ) -> HttpxBinaryResponseContent:
-        openai_client = self._get_openai_client(
+        openai_client = cast(
+            AsyncOpenAI,
+            self._get_openai_client(
                 is_async=True,
                 api_key=api_key,
                 api_base=api_base,
                 timeout=timeout,
                 max_retries=max_retries,
                 client=client,
+            ),
         )

         response = await openai_client.audio.speech.create(
@@ -1292,7 +1295,7 @@ class OpenAIChatCompletion(BaseLLM):
             **optional_params,
         )
-        return response
+        return HttpxBinaryResponseContent(response=response.response)

     async def ahealth_check(
         self,
@@ -1477,7 +1480,7 @@ class OpenAIFilesAPI(BaseLLM):
         openai_client: AsyncOpenAI,
     ) -> HttpxBinaryResponseContent:
         response = await openai_client.files.content(**file_content_request)
-        return response
+        return HttpxBinaryResponseContent(response=response.response)

     def file_content(
         self,
@@ -1515,9 +1518,9 @@ class OpenAIFilesAPI(BaseLLM):
                 file_content_request=file_content_request,
                 openai_client=openai_client,
             )
-        response = openai_client.files.content(**file_content_request)
+        response = cast(OpenAI, openai_client).files.content(**file_content_request)

-        return response
+        return HttpxBinaryResponseContent(response=response.response)

     async def aretrieve_file(
         self,


@@ -5986,6 +5986,15 @@
         "litellm_provider": "bedrock",
         "mode": "embedding"
     },
+    "meta.llama3-3-70b-instruct-v1:0": {
+        "max_tokens": 4096,
+        "max_input_tokens": 128000,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000072,
+        "output_cost_per_token": 0.00000072,
+        "litellm_provider": "bedrock",
+        "mode": "chat"
+    },
     "meta.llama2-13b-chat-v1": {
         "max_tokens": 4096,
         "max_input_tokens": 4096,


@@ -12,12 +12,6 @@ model_list:
       model: openai/gpt-4o
       api_key: os.environ/OPENAI_API_KEY
       num_retries: 3
-  - model_name: azure-tts
-    litellm_params:
-      model: azure/tts-1
-      api_key: "6a4ULQtTSBZi5TONpzGP2GkKDUSSReHFQYwjMpZaywNxY03FsplGJQQJ99ALACHrzpqXJ3w3AAAAACOGUsjy"
-      api_base: "https://krris-m4rve6fd-northcentralus.cognitiveservices.azure.com/openai/deployments/tts"
-      api_version: "2024-05-01-preview"

 litellm_settings:
   success_callback: ["langfuse"]


@@ -1,7 +1,7 @@
 import json
 import secrets
 from datetime import datetime as dt
-from typing import Optional
+from typing import Optional, cast

 from pydantic import BaseModel
@@ -40,7 +40,9 @@ def get_logging_payload(
     if kwargs is None:
         kwargs = {}
-    if response_obj is None:
+    if response_obj is None or (
+        not isinstance(response_obj, BaseModel) and not isinstance(response_obj, dict)
+    ):
         response_obj = {}
     # standardize this function to be used across, s3, dynamoDB, langfuse logging
     litellm_params = kwargs.get("litellm_params", {})
@@ -50,10 +52,10 @@ def get_logging_payload(
     completion_start_time = kwargs.get("completion_start_time", end_time)
     call_type = kwargs.get("call_type")
     cache_hit = kwargs.get("cache_hit", False)
-    usage = response_obj.get("usage", None) or {}
+    usage = cast(dict, response_obj).get("usage", None) or {}
     if isinstance(usage, litellm.Usage):
         usage = dict(usage)
-    id = response_obj.get("id") or kwargs.get("litellm_call_id")
+    id = cast(dict, response_obj).get("id") or kwargs.get("litellm_call_id")
     api_key = metadata.get("user_api_key", "")
     if api_key is not None and isinstance(api_key, str):
         if api_key.startswith("sk-"):


@@ -12,7 +12,7 @@ from litellm.llms.custom_httpx.llm_http_handler import BaseLLMHTTPHandler
 from litellm.llms.jina_ai.rerank.handler import JinaAIRerank
 from litellm.llms.together_ai.rerank.handler import TogetherAIRerank
 from litellm.rerank_api.rerank_utils import get_optional_rerank_params
-from litellm.secret_managers.main import get_secret
+from litellm.secret_managers.main import get_secret, get_secret_str
 from litellm.types.rerank import OptionalRerankParams, RerankResponse
 from litellm.types.router import *
 from litellm.utils import ProviderConfigManager, client, exception_type
@@ -211,7 +211,7 @@ def rerank( # noqa: PLR0915
             dynamic_api_base
             or optional_params.api_base
             or litellm.api_base
-            or get_secret("INFINITY_API_BASE")  # type: ignore
+            or get_secret_str("INFINITY_API_BASE")
         )

         if api_base is None:


@@ -1,7 +1,9 @@
 from os import PathLike
 from typing import IO, Any, Iterable, List, Literal, Mapping, Optional, Tuple, Union

-from openai._legacy_response import HttpxBinaryResponseContent
+from openai._legacy_response import (
+    HttpxBinaryResponseContent as _HttpxBinaryResponseContent,
+)
 from openai.lib.streaming._assistants import (
     AssistantEventHandler,
     AssistantStreamManager,
@@ -48,6 +50,11 @@ FileTypes = Union[
 EmbeddingInput = Union[str, List[str]]


+class HttpxBinaryResponseContent(_HttpxBinaryResponseContent):
+    _hidden_params: dict = {}
+    pass
+
+
 class NotGiven:
     """
     A sentinel singleton class used to distinguish omitted keyword arguments
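
A minimal sketch of what the new wrapper class enables (illustrative only; the stand-in httpx.Response and the _hidden_params key are assumptions, in the real handlers the response comes from the OpenAI/Azure SDK call):

    import httpx
    from litellm.types.llms.openai import HttpxBinaryResponseContent

    # build a fake binary response just to show the wrapping
    raw = httpx.Response(
        200,
        content=b"fake audio bytes",
        request=httpx.Request("POST", "https://example.invalid/audio/speech"),
    )
    wrapped = HttpxBinaryResponseContent(response=raw)
    wrapped._hidden_params = {"litellm_call_id": "abc-123"}  # metadata rides along with the binary payload
    print(wrapped.content[:9], wrapped._hidden_params)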


@@ -5986,6 +5986,15 @@
         "litellm_provider": "bedrock",
         "mode": "embedding"
     },
+    "meta.llama3-3-70b-instruct-v1:0": {
+        "max_tokens": 4096,
+        "max_input_tokens": 128000,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000072,
+        "output_cost_per_token": 0.00000072,
+        "litellm_provider": "bedrock",
+        "mode": "chat"
+    },
     "meta.llama2-13b-chat-v1": {
         "max_tokens": 4096,
         "max_input_tokens": 4096,


@@ -210,7 +210,7 @@ async def test_get_router_response():
 # reason="Local test. Vertex AI Quota is low. Leads to rate limit errors on ci/cd."
 # )
 # @pytest.mark.flaky(retries=3, delay=1)
-def test_vertex_ai_anthropic():
+def test_aavertex_ai_anthropic():
     model = "claude-3-sonnet@20240229"
     vertex_ai_project = "adroit-crow-413218"


@@ -61,7 +61,7 @@ async def test_audio_speech_litellm(sync_mode, model, api_base, api_key):
             optional_params={},
         )

-        from litellm.llms.openai.openai import HttpxBinaryResponseContent
+        from litellm.types.llms.openai import HttpxBinaryResponseContent

         assert isinstance(response, HttpxBinaryResponseContent)
     else: