diff --git a/litellm/integrations/prometheus.py b/litellm/integrations/prometheus.py
index 5d38afd390..da12d28c45 100644
--- a/litellm/integrations/prometheus.py
+++ b/litellm/integrations/prometheus.py
@@ -76,7 +76,7 @@ class PrometheusLogger(CustomLogger):
                     UserAPIKeyLabelNames.TEAM.value,
                     UserAPIKeyLabelNames.TEAM_ALIAS.value,
                     UserAPIKeyLabelNames.USER.value,
-                    UserAPIKeyLabelNames.LITELLM_MODEL.value,
+                    UserAPIKeyLabelNames.v1_LITELLM_MODEL_NAME.value,
                 ],
                 buckets=LATENCY_BUCKETS,
             )
@@ -85,7 +85,7 @@ class PrometheusLogger(CustomLogger):
                 "litellm_llm_api_latency_metric",
                 "Total latency (seconds) for a models LLM API call",
                 labelnames=[
-                    UserAPIKeyLabelNames.LITELLM_MODEL.value,
+                    UserAPIKeyLabelNames.v1_LITELLM_MODEL_NAME.value,
                     UserAPIKeyLabelNames.API_KEY_HASH.value,
                     UserAPIKeyLabelNames.API_KEY_ALIAS.value,
                     UserAPIKeyLabelNames.TEAM.value,
@@ -140,6 +140,14 @@ class PrometheusLogger(CustomLogger):
                 ],
             )

+            # Counter for tokens by tag
+            self.litellm_tokens_by_tag_metric = Counter(
+                "litellm_total_tokens_by_tag",
+                "Total number of input + output tokens from LLM requests by custom metadata tags",
+                labelnames=[
+                    UserAPIKeyLabelNames.TAG.value,
+                ],
+            )
             self.litellm_input_tokens_metric = Counter(
                 "litellm_input_tokens",
                 "Total number of input tokens from LLM requests",
@@ -153,6 +161,16 @@ class PrometheusLogger(CustomLogger):
                     "user",
                 ],
             )
+
+            # Counter for input tokens by tag
+            self.litellm_input_tokens_by_tag_metric = Counter(
+                "litellm_input_tokens_by_tag",
+                "Total number of input tokens from LLM requests by custom metadata tags",
+                labelnames=[
+                    UserAPIKeyLabelNames.TAG.value,
+                ],
+            )
+
             self.litellm_output_tokens_metric = Counter(
                 "litellm_output_tokens",
                 "Total number of output tokens from LLM requests",
@@ -167,6 +185,15 @@ class PrometheusLogger(CustomLogger):
                 ],
             )

+            # Counter for output tokens by tag
+            self.litellm_output_tokens_by_tag_metric = Counter(
+                "litellm_output_tokens_by_tag",
+                "Total number of output tokens from LLM requests by custom metadata tags",
+                labelnames=[
+                    UserAPIKeyLabelNames.TAG.value,
+                ],
+            )
+
             # Remaining Budget for Team
             self.litellm_remaining_team_budget_metric = Gauge(
                 "litellm_remaining_team_budget_metric",
@@ -237,10 +264,10 @@ class PrometheusLogger(CustomLogger):

             # Get all keys
             _logged_llm_labels = [
-                "litellm_model_name",
-                "model_id",
-                "api_base",
-                "api_provider",
+                UserAPIKeyLabelNames.v2_LITELLM_MODEL_NAME.value,
+                UserAPIKeyLabelNames.MODEL_ID.value,
+                UserAPIKeyLabelNames.API_BASE.value,
+                UserAPIKeyLabelNames.API_PROVIDER.value,
             ]
             team_and_key_labels = [
                 "hashed_api_key",
@@ -275,6 +302,16 @@ class PrometheusLogger(CustomLogger):
                 + EXCEPTION_LABELS
                 + team_and_key_labels,
             )
+            self.litellm_deployment_failure_by_tag_responses = Counter(
+                "litellm_deployment_failure_by_tag_responses",
+                "Total number of failed LLM API calls for a specific LLM deploymeny by custom metadata tags",
+                labelnames=[
+                    UserAPIKeyLabelNames.REQUESTED_MODEL.value,
+                    UserAPIKeyLabelNames.TAG.value,
+                ]
+                + _logged_llm_labels
+                + EXCEPTION_LABELS,
+            )
             self.litellm_deployment_total_requests = Counter(
                 name="litellm_deployment_total_requests",
                 documentation="LLM Deployment Analytics - Total number of LLM API calls via litellm - success + failure",
@@ -490,6 +527,14 @@ class PrometheusLogger(CustomLogger):
             user_id,
         ).inc(standard_logging_payload["total_tokens"])

+        _tags = standard_logging_payload["request_tags"]
+        for tag in _tags:
+            self.litellm_tokens_by_tag_metric.labels(
+                **{
+                    UserAPIKeyLabelNames.TAG.value: tag,
+                }
+            ).inc(standard_logging_payload["total_tokens"])
+
         self.litellm_input_tokens_metric.labels(
             end_user_id,
             user_api_key,
@@ -500,6 +545,13 @@ class PrometheusLogger(CustomLogger):
             user_id,
         ).inc(standard_logging_payload["prompt_tokens"])

+        for tag in _tags:
+            self.litellm_input_tokens_by_tag_metric.labels(
+                **{
+                    UserAPIKeyLabelNames.TAG.value: tag,
+                }
+            ).inc(standard_logging_payload["prompt_tokens"])
+
         self.litellm_output_tokens_metric.labels(
             end_user_id,
             user_api_key,
@@ -510,6 +562,13 @@ class PrometheusLogger(CustomLogger):
             user_id,
         ).inc(standard_logging_payload["completion_tokens"])

+        for tag in _tags:
+            self.litellm_output_tokens_by_tag_metric.labels(
+                **{
+                    UserAPIKeyLabelNames.TAG.value: tag,
+                }
+            ).inc(standard_logging_payload["completion_tokens"])
+
     def _increment_remaining_budget_metrics(
         self,
         user_api_team: Optional[str],
@@ -651,7 +710,7 @@ class PrometheusLogger(CustomLogger):
             api_call_total_time_seconds = api_call_total_time.total_seconds()
             self.litellm_llm_api_latency_metric.labels(
                 **{
-                    UserAPIKeyLabelNames.LITELLM_MODEL.value: model,
+                    UserAPIKeyLabelNames.v1_LITELLM_MODEL_NAME.value: model,
                     UserAPIKeyLabelNames.API_KEY_HASH.value: user_api_key,
                     UserAPIKeyLabelNames.API_KEY_ALIAS.value: user_api_key_alias,
                     UserAPIKeyLabelNames.TEAM.value: user_api_team,
@@ -686,7 +745,7 @@ class PrometheusLogger(CustomLogger):
                     UserAPIKeyLabelNames.USER.value: standard_logging_payload[
                         "metadata"
                     ]["user_api_key_user_id"],
-                    UserAPIKeyLabelNames.LITELLM_MODEL.value: model,
+                    UserAPIKeyLabelNames.v1_LITELLM_MODEL_NAME.value: model,
                 }
             ).observe(total_time_seconds)

@@ -862,6 +921,24 @@ class PrometheusLogger(CustomLogger):
                 ],
             ).inc()

+            # tag based tracking
+            _tags = standard_logging_payload["request_tags"]
+            for tag in _tags:
+                self.litellm_deployment_failure_by_tag_responses.labels(
+                    **{
+                        UserAPIKeyLabelNames.REQUESTED_MODEL.value: model_group,
+                        UserAPIKeyLabelNames.TAG.value: tag,
+                        UserAPIKeyLabelNames.v2_LITELLM_MODEL_NAME.value: litellm_model_name,
+                        UserAPIKeyLabelNames.MODEL_ID.value: model_id,
+                        UserAPIKeyLabelNames.API_BASE.value: api_base,
+                        UserAPIKeyLabelNames.API_PROVIDER.value: llm_provider,
+                        UserAPIKeyLabelNames.EXCEPTION_CLASS.value: exception.__class__.__name__,
+                        UserAPIKeyLabelNames.EXCEPTION_STATUS.value: str(
+                            getattr(exception, "status_code", None)
+                        ),
+                    }
+                ).inc()
+
             self.litellm_deployment_total_requests.labels(
                 litellm_model_name=litellm_model_name,
                 model_id=model_id,
@@ -881,8 +958,12 @@ class PrometheusLogger(CustomLogger):
             ).inc()

             pass
-        except Exception:
-            pass
+        except Exception as e:
+            verbose_logger.debug(
+                "Prometheus Error: set_llm_deployment_failure_metrics. Exception occured - {}".format(
+                    str(e)
+                )
+            )

     def set_llm_deployment_success_metrics(
         self,
diff --git a/litellm/llms/bedrock/chat/invoke_handler.py b/litellm/llms/bedrock/chat/invoke_handler.py
index d0ceea37b3..a1808d3427 100644
--- a/litellm/llms/bedrock/chat/invoke_handler.py
+++ b/litellm/llms/bedrock/chat/invoke_handler.py
@@ -9,7 +9,17 @@ import types
 import urllib.parse
 import uuid
 from functools import partial
-from typing import Any, AsyncIterator, Callable, Iterator, List, Optional, Tuple, Union
+from typing import (
+    Any,
+    AsyncIterator,
+    Callable,
+    Iterator,
+    List,
+    Optional,
+    Tuple,
+    Union,
+    cast,
+)

 import httpx  # type: ignore

@@ -36,8 +46,10 @@ from litellm.llms.custom_httpx.http_handler import (
 from litellm.types.llms.bedrock import *
 from litellm.types.llms.openai import (
     ChatCompletionToolCallChunk,
+    ChatCompletionToolCallFunctionChunk,
     ChatCompletionUsageBlock,
 )
+from litellm.types.utils import ChatCompletionMessageToolCall, Choices
 from litellm.types.utils import GenericStreamingChunk as GChunk
 from litellm.types.utils import ModelResponse, Usage
 from litellm.utils import CustomStreamWrapper, get_secret
@@ -1294,11 +1306,25 @@ class MockResponseIterator:  # for returning ai21 streaming responses
             chunk_usage: Usage = getattr(chunk_data, "usage")
             text = chunk_data.choices[0].message.content or ""  # type: ignore
             tool_use = None
+            _model_response_tool_call = cast(
+                Optional[List[ChatCompletionMessageToolCall]],
+                cast(Choices, chunk_data.choices[0]).message.tool_calls,
+            )
             if self.json_mode is True:
                 text, tool_use = self._handle_json_mode_chunk(
                     text=text,
                     tool_calls=chunk_data.choices[0].message.tool_calls,  # type: ignore
                 )
+            elif _model_response_tool_call is not None:
+                tool_use = ChatCompletionToolCallChunk(
+                    id=_model_response_tool_call[0].id,
+                    type="function",
+                    function=ChatCompletionToolCallFunctionChunk(
+                        name=_model_response_tool_call[0].function.name,
+                        arguments=_model_response_tool_call[0].function.arguments,
+                    ),
+                    index=0,
+                )
             processed_chunk = GChunk(
                 text=text,
                 tool_use=tool_use,
diff --git a/litellm/types/integrations/prometheus.py b/litellm/types/integrations/prometheus.py
index 22da0425e4..ec4125e520 100644
--- a/litellm/types/integrations/prometheus.py
+++ b/litellm/types/integrations/prometheus.py
@@ -53,4 +53,11 @@ class UserAPIKeyLabelNames(Enum):
     TEAM = "team"
     TEAM_ALIAS = "team_alias"
     REQUESTED_MODEL = REQUESTED_MODEL
-    LITELLM_MODEL = "model"
+    v1_LITELLM_MODEL_NAME = "model"
+    v2_LITELLM_MODEL_NAME = "litellm_model_name"
+    TAG = "tag"
+    MODEL_ID = "model_id"
+    API_BASE = "api_base"
+    API_PROVIDER = "api_provider"
+    EXCEPTION_STATUS = EXCEPTION_STATUS
+    EXCEPTION_CLASS = EXCEPTION_CLASS
diff --git a/tests/local_testing/test_stream_chunk_builder.py b/tests/local_testing/test_stream_chunk_builder.py
index 1056fad597..4540799186 100644
--- a/tests/local_testing/test_stream_chunk_builder.py
+++ b/tests/local_testing/test_stream_chunk_builder.py
@@ -745,3 +745,20 @@ def test_stream_chunk_builder_empty_initial_chunk():

     id = ChunkProcessor._get_chunk_id(chunks)
     assert id == "1"
+
+
+import json
+
+
+def get_current_weather(location, unit="fahrenheit"):
+    """Get the current weather in a given location"""
+    if "tokyo" in location.lower():
+        return json.dumps({"location": "Tokyo", "temperature": "10", "unit": "celsius"})
+    elif "san francisco" in location.lower():
+        return json.dumps(
+            {"location": "San Francisco", "temperature": "72", "unit": "fahrenheit"}
+        )
+    elif "paris" in location.lower():
+        return json.dumps({"location": "Paris", "temperature": "22", "unit": "celsius"})
+    else:
+        return json.dumps({"location": location, "temperature": "unknown"})
diff --git a/tests/local_testing/test_streaming.py b/tests/local_testing/test_streaming.py
index 5d553cae40..db39300e5e 100644
--- a/tests/local_testing/test_streaming.py
+++ b/tests/local_testing/test_streaming.py
@@ -3990,3 +3990,69 @@ def test_streaming_api_base():
         stream=True,
     )
     assert "https://api.openai.com" in stream._hidden_params["api_base"]
+
+
+def test_mock_response_iterator_tool_use():
+    """
+    Relevant Issue: https://github.com/BerriAI/litellm/issues/7364
+    """
+    from litellm.llms.bedrock.chat.invoke_handler import MockResponseIterator
+    from litellm.types.utils import (
+        ChatCompletionMessageToolCall,
+        Function,
+        Message,
+        Usage,
+        CompletionTokensDetailsWrapper,
+        PromptTokensDetailsWrapper,
+        Choices,
+    )
+
+    litellm.set_verbose = False
+    response = ModelResponse(
+        id="chatcmpl-Ai8KRI5vJPZXQ9SQvEJfTVuVqkyEZ",
+        created=1735081811,
+        model="o1-2024-12-17",
+        object="chat.completion",
+        system_fingerprint="fp_e6d02d4a78",
+        choices=[
+            Choices(
+                finish_reason="tool_calls",
+                index=0,
+                message=Message(
+                    content=None,
+                    role="assistant",
+                    tool_calls=[
+                        ChatCompletionMessageToolCall(
+                            function=Function(
+                                arguments='{"location":"San Francisco, CA","unit":"fahrenheit"}',
+                                name="get_current_weather",
+                            ),
+                            id="call_BfRX2S7YCKL0BtxbWMl89ZNk",
+                            type="function",
+                        )
+                    ],
+                    function_call=None,
+                ),
+            )
+        ],
+        usage=Usage(
+            completion_tokens=1955,
+            prompt_tokens=85,
+            total_tokens=2040,
+            completion_tokens_details=CompletionTokensDetailsWrapper(
+                accepted_prediction_tokens=0,
+                audio_tokens=0,
+                reasoning_tokens=1920,
+                rejected_prediction_tokens=0,
+                text_tokens=None,
+            ),
+            prompt_tokens_details=PromptTokensDetailsWrapper(
+                audio_tokens=0, cached_tokens=0, text_tokens=None, image_tokens=None
+            ),
+        ),
+        service_tier=None,
+    )
+    completion_stream = MockResponseIterator(model_response=response)
+    response_chunk = completion_stream._chunk_parser(chunk_data=response)
+
+    assert response_chunk["tool_use"] is not None