Litellm dev 10 22 2024 (#6384)

* fix(utils.py): add 'disallowed_special' for token counting on .encode()

Fixes error when '<|endoftext|>' is in the string
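
For context, this is roughly the underlying tiktoken behavior the fix works around; the snippet below is a standalone illustration, not litellm's own code path:

```python
import tiktoken

enc = tiktoken.get_encoding("cl100k_base")
text = "Hello, world! <|endoftext|>"

# By default every special token is disallowed, so encoding text that
# contains "<|endoftext|>" raises a ValueError.
try:
    enc.encode(text)
except ValueError as err:
    print(f"raises: {err}")

# With disallowed_special=() the check is skipped and the marker is encoded
# as plain text -- the same argument litellm's encode() helper now passes
# for tiktoken-backed tokenizers.
tokens = enc.encode(text, disallowed_special=())
print(len(tokens))
```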

* Revert "(fix) standard logging metadata + add unit testing  (#6366)" (#6381)

This reverts commit 8359cb6fa9.

* add new 3.5 model card (#6378)

* Add Claude 3.5 Sonnet 20241022 models for all providers (#6380)

* Add Claude 3.5 v2 on Amazon Bedrock and Vertex AI.

* added anthropic/claude-3-5-sonnet-20241022

* add new 3.5 model card

---------

Co-authored-by: Paul Gauthier <paul@paulg.com>
Co-authored-by: lowjiansheng <15527690+lowjiansheng@users.noreply.github.com>

* test(skip-flaky-google-context-caching-test): Google is not reliable. Their sample code is also not working.

* Fix metadata being overwritten in speech() (#6295)

* fix: adding missing redis cluster kwargs (#6318)

Co-authored-by: Ali Arian <ali.arian@breadfinancial.com>
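
A rough usage sketch of the cache setup this enables is below; the `Cache` entry point and the `redis_startup_nodes` parameter name are assumptions based on litellm's caching docs, not part of this diff:

```python
import litellm

# Hypothetical Redis Cluster cache setup. With the missing kwargs added,
# "username" and "ssl" are forwarded to the underlying redis client instead
# of being filtered out of the accepted argument list.
litellm.cache = litellm.Cache(
    type="redis",
    redis_startup_nodes=[{"host": "127.0.0.1", "port": "7001"}],
    username="default",      # assumed to be forwarded after this fix
    password="my-password",
    ssl=True,                # assumed to be forwarded after this fix
)
```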

* Add support for `max_completion_tokens` in Azure OpenAI (#6376)

Now that Azure supports `max_completion_tokens`, there is no need for special handling of this param; we let it pass through. More details: https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models?tabs=python-secure#api-support
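
For illustration, a minimal call that relies on this pass-through (deployment name, endpoint, and credentials are placeholders):

```python
import litellm

# Sketch: max_completion_tokens is now sent to Azure OpenAI as-is rather than
# being rewritten to max_tokens.
response = litellm.completion(
    model="azure/my-gpt-4o-deployment",                # placeholder deployment
    messages=[{"role": "user", "content": "Say hi"}],
    max_completion_tokens=10,
    api_base="https://my-endpoint.openai.azure.com",   # placeholder endpoint
    api_key="sk-...",                                  # placeholder key
    api_version="2024-08-01-preview",
)
print(response.choices[0].message.content)
```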

* build(model_prices_and_context_window.json): add voyage-finance-2 pricing

Closes https://github.com/BerriAI/litellm/issues/6371

* build(model_prices_and_context_window.json): fix llama3.1 pricing model name on map

Closes https://github.com/BerriAI/litellm/issues/6310

* feat(realtime_streaming.py): just log specific events

Closes https://github.com/BerriAI/litellm/issues/6267

* fix(utils.py): more robust checking if unmapped vertex anthropic model belongs to that family of models

Fixes https://github.com/BerriAI/litellm/issues/6383

* Fix Ollama stream handling for tool calls with None content (#6155)

* test(test_max_completions): update test now that azure supports 'max_completion_tokens'

* fix(handler.py): fix linting error

---------

Co-authored-by: Ishaan Jaff <ishaanjaffer0324@gmail.com>
Co-authored-by: Low Jian Sheng <15527690+lowjiansheng@users.noreply.github.com>
Co-authored-by: David Manouchehri <david.manouchehri@ai.moda>
Co-authored-by: Paul Gauthier <paul@paulg.com>
Co-authored-by: John HU <hszqqq12@gmail.com>
Co-authored-by: Ali Arian <113945203+ali-arian@users.noreply.github.com>
Co-authored-by: Ali Arian <ali.arian@breadfinancial.com>
Co-authored-by: Anand Taralika <46954145+taralika@users.noreply.github.com>
Co-authored-by: Nolan Tremelling <34580718+NolanTrem@users.noreply.github.com>
Krish Dholakia, 2024-10-22 21:18:54 -07:00, committed by GitHub
commit cb2563e3c0 (parent b75019c1a5)
17 changed files with 162 additions and 23 deletions


@@ -83,4 +83,21 @@ ws.on("message", function incoming(message) {
 ws.on("error", function handleError(error) {
   console.error("Error: ", error);
 });
 ```
+
+## Logging
+
+To prevent requests from being dropped, by default LiteLLM just logs these event types:
+
+- `session.created`
+- `response.create`
+- `response.done`
+
+You can override this by setting the `logged_real_time_event_types` parameter in the config. For example:
+
+```yaml
+litellm_settings:
+  logged_real_time_event_types: "*" # Log all events
+  ## OR ##
+  logged_real_time_event_types: ["session.created", "response.create", "response.done"] # Log only these event types
+```


@@ -56,6 +56,7 @@ _custom_logger_compatible_callbacks_literal = Literal[
     "opik",
     "argilla",
 ]
+logged_real_time_event_types: Optional[Union[List[str], Literal["*"]]] = None
 _known_custom_logger_compatible_callbacks: List = list(
     get_args(_custom_logger_compatible_callbacks_literal)
 )


@@ -69,6 +69,8 @@ def _get_redis_cluster_kwargs(client=None):
     available_args = [x for x in arg_spec.args if x not in exclude_args]
     available_args.append("password")
+    available_args.append("username")
+    available_args.append("ssl")

     return available_args


@@ -26,15 +26,24 @@ async with websockets.connect( # type: ignore
 import asyncio
 import concurrent.futures
+import json
 import traceback
 from asyncio import Task
 from typing import Any, Dict, List, Optional, Union

+import litellm
+
 from .litellm_logging import Logging as LiteLLMLogging

 # Create a thread pool with a maximum of 10 threads
 executor = concurrent.futures.ThreadPoolExecutor(max_workers=10)

+DefaultLoggedRealTimeEventTypes = [
+    "session.created",
+    "response.create",
+    "response.done",
+]
+

 class RealTimeStreaming:
     def __init__(
@@ -49,9 +58,27 @@ class RealTimeStreaming:
         self.messages: List = []
         self.input_message: Dict = {}

+        _logged_real_time_event_types = litellm.logged_real_time_event_types
+        if _logged_real_time_event_types is None:
+            _logged_real_time_event_types = DefaultLoggedRealTimeEventTypes
+        self.logged_real_time_event_types = _logged_real_time_event_types
+
+    def _should_store_message(self, message: Union[str, bytes]) -> bool:
+        if isinstance(message, bytes):
+            message = message.decode("utf-8")
+        message_obj = json.loads(message)
+        _msg_type = message_obj["type"]
+        if self.logged_real_time_event_types == "*":
+            return True
+        if _msg_type in self.logged_real_time_event_types:
+            return True
+        return False
+
     def store_message(self, message: Union[str, bytes]):
         """Store message in list"""
-        self.messages.append(message)
+        if self._should_store_message(message):
+            self.messages.append(message)

     def store_input(self, message: dict):
         """Store input message"""


@@ -198,9 +198,6 @@ class AzureOpenAIConfig:
                     optional_params["json_mode"] = True
                 else:
                     optional_params["response_format"] = value
-            elif param == "max_completion_tokens":
-                # TODO - Azure OpenAI will probably add support for this, we should pass it through when Azure adds support
-                optional_params["max_tokens"] = value
             elif param in supported_openai_params:
                 optional_params[param] = value


@@ -72,5 +72,5 @@ class AzureOpenAIRealtime(AzureChatCompletion):
         except websockets.exceptions.InvalidStatusCode as e: # type: ignore
             await websocket.close(code=e.status_code, reason=str(e))
-        except Exception as e:
-            await websocket.close(code=1011, reason=f"Internal server error: {str(e)}")
+        except Exception:
+            pass


@@ -398,6 +398,7 @@ def ollama_completion_stream(url, data, logging_obj):
                 isinstance(content_chunk, StreamingChoices)
                 and hasattr(content_chunk, "delta")
                 and hasattr(content_chunk.delta, "content")
+                and content_chunk.delta.content is not None
             ):
                 content_chunks.append(content_chunk.delta.content)
         response_content = "".join(content_chunks)


@@ -177,3 +177,16 @@ class VertexAIAnthropicConfig:
             optional_params["json_mode"] = True

         return optional_params
+
+    @classmethod
+    def is_supported_model(
+        cls, model: str, custom_llm_provider: Optional[str] = None
+    ) -> bool:
+        """
+        Check if the model is supported by the VertexAI Anthropic API.
+        """
+        if custom_llm_provider == "vertex_ai" and "claude" in model.lower():
+            return True
+        elif model in litellm.vertex_anthropic_models:
+            return True
+        return False


@@ -4986,7 +4986,6 @@ def speech(
     litellm_call_id: Optional[str] = kwargs.get("litellm_call_id", None)
     proxy_server_request = kwargs.get("proxy_server_request", None)
     model_info = kwargs.get("model_info", None)
-    metadata = kwargs.get("metadata", {})

     model, custom_llm_provider, dynamic_api_key, api_base = get_llm_provider(model=model, custom_llm_provider=custom_llm_provider, api_base=api_base) # type: ignore
     kwargs.pop("tags", [])


@@ -1104,7 +1104,7 @@
         "litellm_provider": "azure_ai",
         "mode": "chat"
     },
-    "azure_ai/Meta-Llama-31-8B-Instruct": {
+    "azure_ai/Meta-Llama-3.1-8B-Instruct": {
         "max_tokens": 128000,
         "max_input_tokens": 128000,
         "max_output_tokens": 128000,
@@ -1114,7 +1114,7 @@
         "mode": "chat",
         "source":"https://azuremarketplace.microsoft.com/en-us/marketplace/apps/metagenai.meta-llama-3-1-8b-instruct-offer?tab=PlansAndPrice"
     },
-    "azure_ai/Meta-Llama-31-70B-Instruct": {
+    "azure_ai/Meta-Llama-3.1-70B-Instruct": {
         "max_tokens": 128000,
         "max_input_tokens": 128000,
         "max_output_tokens": 128000,
@@ -1124,7 +1124,7 @@
         "mode": "chat",
         "source":"https://azuremarketplace.microsoft.com/en-us/marketplace/apps/metagenai.meta-llama-3-1-70b-instruct-offer?tab=PlansAndPrice"
     },
-    "azure_ai/Meta-Llama-31-405B-Instruct": {
+    "azure_ai/Meta-Llama-3.1-405B-Instruct": {
         "max_tokens": 128000,
         "max_input_tokens": 128000,
         "max_output_tokens": 128000,
@@ -6446,6 +6446,14 @@
         "litellm_provider": "voyage",
         "mode": "embedding"
     },
+    "voyage/voyage-finance-2": {
+        "max_tokens": 4000,
+        "max_input_tokens": 4000,
+        "input_cost_per_token": 0.00000012,
+        "output_cost_per_token": 0.000000,
+        "litellm_provider": "voyage",
+        "mode": "embedding"
+    },
     "databricks/databricks-meta-llama-3-1-405b-instruct": {
         "max_tokens": 128000,
         "max_input_tokens": 128000,


@@ -1,8 +1,10 @@
 model_list:
-  - model_name: gpt-3.5-turbo
+  - model_name: gpt-4o
     litellm_params:
-      model: gpt-3.5-turbo
-      api_key: os.environ/OPENAI_API_KEY
+      model: azure/gpt-4o-realtime-preview
+      api_key: os.environ/AZURE_SWEDEN_API_KEY
+      api_base: os.environ/AZURE_SWEDEN_API_BASE

 litellm_settings:
-  callbacks: ["prometheus"]
+  success_callback: ["langfuse"]
+  # logged_real_time_event_types: "*"


@@ -126,6 +126,7 @@ except (ImportError, AttributeError):
     os.environ["TIKTOKEN_CACHE_DIR"] = os.getenv(
         "CUSTOM_TIKTOKEN_CACHE_DIR", filename
     ) # use local copy of tiktoken b/c of - https://github.com/BerriAI/litellm/issues/1071
+from tiktoken import Encoding

 encoding = tiktoken.get_encoding("cl100k_base")
 from importlib import resources
@@ -1278,7 +1279,10 @@ def encode(model="", text="", custom_tokenizer: Optional[dict] = None):
         enc: The encoded text.
     """
     tokenizer_json = custom_tokenizer or _select_tokenizer(model=model)
-    enc = tokenizer_json["tokenizer"].encode(text)
+    if isinstance(tokenizer_json["tokenizer"], Encoding):
+        enc = tokenizer_json["tokenizer"].encode(text, disallowed_special=())
+    else:
+        enc = tokenizer_json["tokenizer"].encode(text)
     return enc
@@ -3045,8 +3049,8 @@ def get_optional_params( # noqa: PLR0915
         )
         if litellm.vertex_ai_safety_settings is not None:
             optional_params["safety_settings"] = litellm.vertex_ai_safety_settings
-    elif (
-        custom_llm_provider == "vertex_ai" and model in litellm.vertex_anthropic_models
+    elif litellm.VertexAIAnthropicConfig.is_supported_model(
+        model=model, custom_llm_provider=custom_llm_provider
     ):
         supported_params = get_supported_openai_params(
             model=model, custom_llm_provider=custom_llm_provider


@@ -1104,7 +1104,7 @@
         "litellm_provider": "azure_ai",
         "mode": "chat"
     },
-    "azure_ai/Meta-Llama-31-8B-Instruct": {
+    "azure_ai/Meta-Llama-3.1-8B-Instruct": {
         "max_tokens": 128000,
         "max_input_tokens": 128000,
         "max_output_tokens": 128000,
@@ -1114,7 +1114,7 @@
         "mode": "chat",
         "source":"https://azuremarketplace.microsoft.com/en-us/marketplace/apps/metagenai.meta-llama-3-1-8b-instruct-offer?tab=PlansAndPrice"
     },
-    "azure_ai/Meta-Llama-31-70B-Instruct": {
+    "azure_ai/Meta-Llama-3.1-70B-Instruct": {
         "max_tokens": 128000,
         "max_input_tokens": 128000,
         "max_output_tokens": 128000,
@@ -1124,7 +1124,7 @@
         "mode": "chat",
         "source":"https://azuremarketplace.microsoft.com/en-us/marketplace/apps/metagenai.meta-llama-3-1-70b-instruct-offer?tab=PlansAndPrice"
     },
-    "azure_ai/Meta-Llama-31-405B-Instruct": {
+    "azure_ai/Meta-Llama-3.1-405B-Instruct": {
         "max_tokens": 128000,
         "max_input_tokens": 128000,
         "max_output_tokens": 128000,
@@ -6446,6 +6446,14 @@
         "litellm_provider": "voyage",
         "mode": "embedding"
     },
+    "voyage/voyage-finance-2": {
+        "max_tokens": 4000,
+        "max_input_tokens": 4000,
+        "input_cost_per_token": 0.00000012,
+        "output_cost_per_token": 0.000000,
+        "litellm_provider": "voyage",
+        "mode": "embedding"
+    },
     "databricks/databricks-meta-llama-3-1-405b-instruct": {
         "max_tokens": 128000,
         "max_input_tokens": 128000,


@@ -235,7 +235,7 @@ def test_all_model_configs():
         optional_params={},
         api_version="2022-12-01",
         drop_params=False,
-    ) == {"max_tokens": 10}
+    ) == {"max_completion_tokens": 10}

     from litellm.llms.bedrock.chat.converse_transformation import AmazonConverseConfig


@@ -775,3 +775,12 @@ def test_hosted_vllm_tool_param():
     )
     assert "tools" not in optional_params
     assert "tool_choice" not in optional_params
+
+
+def test_unmapped_vertex_anthropic_model():
+    optional_params = get_optional_params(
+        model="claude-3-5-sonnet-v250@20241022",
+        custom_llm_provider="vertex_ai",
+        max_retries=10,
+    )
+    assert "max_retries" not in optional_params


@@ -2587,3 +2587,50 @@ async def test_test_completion_cost_gpt4o_audio_output_from_model(stream):
     total_output_cost = output_audio_cost + output_text_cost

     assert round(cost, 2) == round(total_input_cost + total_output_cost, 2)
+
+
+def test_completion_cost_azure_ai_meta():
+    """
+    Relevant issue: https://github.com/BerriAI/litellm/issues/6310
+    """
+    from litellm import ModelResponse
+
+    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+    litellm.model_cost = litellm.get_model_cost_map(url="")
+
+    litellm.set_verbose = True
+    response = {
+        "id": "cmpl-55db75e0b05344058b0bd8ee4e00bf84",
+        "choices": [
+            {
+                "finish_reason": "stop",
+                "index": 0,
+                "logprobs": None,
+                "message": {
+                    "content": 'Here\'s one:\n\nWhy did the Linux kernel go to therapy?\n\nBecause it had a lot of "core" issues!\n\nHope that one made you laugh!',
+                    "refusal": None,
+                    "role": "assistant",
+                    "audio": None,
+                    "function_call": None,
+                    "tool_calls": [],
+                },
+            }
+        ],
+        "created": 1729243714,
+        "model": "azure_ai/Meta-Llama-3.1-70B-Instruct",
+        "object": "chat.completion",
+        "service_tier": None,
+        "system_fingerprint": None,
+        "usage": {
+            "completion_tokens": 32,
+            "prompt_tokens": 16,
+            "total_tokens": 48,
+            "completion_tokens_details": None,
+            "prompt_tokens_details": None,
+        },
+    }
+
+    model_response = ModelResponse(**response)
+    cost = completion_cost(model_response, custom_llm_provider="azure_ai")
+
+    assert cost > 0


@@ -375,3 +375,7 @@ def test_img_url_token_counter(img_url):
     assert width is not None
     assert height is not None
+
+
+def test_token_encode_disallowed_special():
+    encode(model="gpt-3.5-turbo", text="Hello, world! <|endoftext|>")