feat(databricks.py): support vertex mistral cost tracking

Krrish Dholakia 2024-07-27 20:22:35 -07:00
parent 6f9c29d39b
commit 6d5aedc48d
4 changed files with 56 additions and 87 deletions
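
In rough terms, the user-facing effect: Mistral models called through Vertex AI now get a response cost attached. A minimal sketch, assuming the "vertex_ai/mistral-large@2407" route and Vertex credentials are configured (the model string and message are illustrative, not from this commit); it mirrors the assertion added to test_partner_models_httpx below:

import litellm

# Hedged sketch: requires Vertex AI credentials; the model route is an assumption.
try:
    response = litellm.completion(
        model="vertex_ai/mistral-large@2407",
        messages=[{"role": "user", "content": "Hey, how's it going?"}],
    )
    assert response._hidden_params["response_cost"] > 0
except litellm.RateLimitError:
    pass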


@@ -509,7 +509,7 @@ def completion_cost(
         ):
             model = completion_response._hidden_params.get("model", model)
             custom_llm_provider = completion_response._hidden_params.get(
-                "custom_llm_provider", ""
+                "custom_llm_provider", custom_llm_provider or ""
             )
             region_name = completion_response._hidden_params.get(
                 "region_name", region_name
@@ -732,7 +732,7 @@ def response_cost_calculator(
             )
         return response_cost
     except litellm.NotFoundError as e:
-        print_verbose(
+        verbose_logger.debug(  # debug since it can be spammy in logs, for calls
             f"Model={model} for LLM Provider={custom_llm_provider} not found in completion cost map."
         )
         return None
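
A minimal sketch of what the fallback above changes, using a plain dict in place of _hidden_params: a provider passed explicitly into completion_cost is no longer discarded when the response did not record one.

# Plain-dict illustration of the new default expression; not litellm code.
hidden_params = {}                 # the response recorded no provider
custom_llm_provider = "vertex_ai"  # caller passed this into completion_cost
resolved = hidden_params.get("custom_llm_provider", custom_llm_provider or "")
assert resolved == "vertex_ai"     # the old default of "" dropped the caller's hint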


@@ -259,87 +259,6 @@ class DatabricksChatCompletion(BaseLLM):
             api_base = "{}/embeddings".format(api_base)
         return api_base, headers

-    def process_response(
-        self,
-        model: str,
-        response: Union[requests.Response, httpx.Response],
-        model_response: ModelResponse,
-        stream: bool,
-        logging_obj: litellm.litellm_core_utils.litellm_logging.Logging,
-        optional_params: dict,
-        api_key: str,
-        data: Union[dict, str],
-        messages: List,
-        print_verbose,
-        encoding,
-    ) -> ModelResponse:
-        ## LOGGING
-        logging_obj.post_call(
-            input=messages,
-            api_key=api_key,
-            original_response=response.text,
-            additional_args={"complete_input_dict": data},
-        )
-        print_verbose(f"raw model_response: {response.text}")
-        ## RESPONSE OBJECT
-        try:
-            completion_response = response.json()
-        except:
-            raise DatabricksError(
-                message=response.text, status_code=response.status_code
-            )
-        if "error" in completion_response:
-            raise DatabricksError(
-                message=str(completion_response["error"]),
-                status_code=response.status_code,
-            )
-        else:
-            text_content = ""
-            tool_calls = []
-            for content in completion_response["content"]:
-                if content["type"] == "text":
-                    text_content += content["text"]
-                ## TOOL CALLING
-                elif content["type"] == "tool_use":
-                    tool_calls.append(
-                        {
-                            "id": content["id"],
-                            "type": "function",
-                            "function": {
-                                "name": content["name"],
-                                "arguments": json.dumps(content["input"]),
-                            },
-                        }
-                    )
-            _message = litellm.Message(
-                tool_calls=tool_calls,
-                content=text_content or None,
-            )
-            model_response.choices[0].message = _message  # type: ignore
-            model_response._hidden_params["original_response"] = completion_response[
-                "content"
-            ]  # allow user to access raw anthropic tool calling response
-            model_response.choices[0].finish_reason = map_finish_reason(
-                completion_response["stop_reason"]
-            )
-        ## CALCULATING USAGE
-        prompt_tokens = completion_response["usage"]["input_tokens"]
-        completion_tokens = completion_response["usage"]["output_tokens"]
-        total_tokens = prompt_tokens + completion_tokens
-        model_response.created = int(time.time())
-        model_response.model = model
-        usage = Usage(
-            prompt_tokens=prompt_tokens,
-            completion_tokens=completion_tokens,
-            total_tokens=total_tokens,
-        )
-        setattr(model_response, "usage", usage)  # type: ignore
-        return model_response

     async def acompletion_stream_function(
         self,
         model: str,
@@ -392,6 +311,7 @@ class DatabricksChatCompletion(BaseLLM):
         logging_obj,
         stream,
         data: dict,
+        base_model: Optional[str],
         optional_params: dict,
         litellm_params=None,
         logger_fn=None,
@@ -420,7 +340,11 @@ class DatabricksChatCompletion(BaseLLM):
         except Exception as e:
             raise DatabricksError(status_code=500, message=str(e))

-        return ModelResponse(**response_json)
+        response = ModelResponse(**response_json)
+        if base_model is not None:
+            response._hidden_params["model"] = base_model
+        return response

     def completion(
         self,
@@ -443,6 +367,7 @@ class DatabricksChatCompletion(BaseLLM):
         client: Optional[Union[HTTPHandler, AsyncHTTPHandler]] = None,
     ):
         custom_endpoint: Optional[bool] = optional_params.pop("custom_endpoint", None)
+        base_model: Optional[str] = optional_params.pop("base_model", None)
         api_base, headers = self._validate_environment(
             api_base=api_base,
             api_key=api_key,
@@ -476,11 +401,11 @@ class DatabricksChatCompletion(BaseLLM):
                 "headers": headers,
             },
         )
-        if acompletion == True:
+        if acompletion is True:
             if client is not None and isinstance(client, HTTPHandler):
                 client = None
             if (
-                stream is not None and stream == True
+                stream is not None and stream is True
             ):  # if function call - fake the streaming (need complete blocks for output parsing in openai format)
                 print_verbose("makes async anthropic streaming POST request")
                 data["stream"] = stream
@@ -521,6 +446,7 @@ class DatabricksChatCompletion(BaseLLM):
                     logger_fn=logger_fn,
                     headers=headers,
                     timeout=timeout,
+                    base_model=base_model,
                 )
         else:
             if client is None or not isinstance(client, HTTPHandler):
@@ -562,7 +488,12 @@ class DatabricksChatCompletion(BaseLLM):
             except Exception as e:
                 raise DatabricksError(status_code=500, message=str(e))

-            return ModelResponse(**response_json)
+            response = ModelResponse(**response_json)
+            if base_model is not None:
+                response._hidden_params["model"] = base_model
+            return response

     async def aembedding(
         self,
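
A sketch of the pattern the databricks.py hunks introduce, assuming the Vertex AI partner-model path forwards a base_model through optional_params; build_response and response_json are names made up for illustration, not litellm's API:

from typing import Optional
from litellm import ModelResponse

def build_response(response_json: dict, base_model: Optional[str]) -> ModelResponse:
    # Parse the provider reply, then record the versioned base model so that
    # downstream cost lookup prices the base model rather than the request alias.
    response = ModelResponse(**response_json)
    if base_model is not None:
        response._hidden_params["model"] = base_model
    return response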


@@ -939,6 +939,8 @@ async def test_partner_models_httpx(model, sync_mode):
         response_format_tests(response=response)

         print(f"response: {response}")

         assert response._hidden_params["response_cost"] > 0
+    except litellm.RateLimitError as e:
+        pass
     except Exception as e:


@@ -918,6 +918,42 @@ def test_vertex_ai_llama_predict_cost():
     assert predictive_cost == 0


+def test_vertex_ai_mistral_predict_cost():
+    from litellm.types.utils import Choices, Message, ModelResponse, Usage
+
+    response_object = ModelResponse(
+        id="26c0ef045020429d9c5c9b078c01e564",
+        choices=[
+            Choices(
+                finish_reason="stop",
+                index=0,
+                message=Message(
+                    content="Hello! I'm Litellm Bot, your helpful assistant. While I can't provide real-time weather updates, I can help you find a reliable weather service or guide you on how to check the weather on your device. Would you like assistance with that?",
+                    role="assistant",
+                    tool_calls=None,
+                    function_call=None,
+                ),
+            )
+        ],
+        created=1722124652,
+        model="vertex_ai/mistral-large",
+        object="chat.completion",
+        system_fingerprint=None,
+        usage=Usage(prompt_tokens=32, completion_tokens=55, total_tokens=87),
+    )
+    model = "mistral-large@2407"
+    messages = [{"role": "user", "content": "Hey, hows it going???"}]
+    custom_llm_provider = "vertex_ai"
+    predictive_cost = completion_cost(
+        completion_response=response_object,
+        model=model,
+        messages=messages,
+        custom_llm_provider=custom_llm_provider,
+    )
+    assert predictive_cost > 0
+
+
 @pytest.mark.parametrize("model", ["openai/tts-1", "azure/tts-1"])
 def test_completion_cost_tts(model):
     os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
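
For reference, a rough sketch of the arithmetic completion_cost performs for the usage in the new test; the per-token prices below are invented for illustration, the real ones come from litellm's model cost map.

input_cost_per_token = 3e-06   # hypothetical price for mistral-large@2407 prompt tokens
output_cost_per_token = 9e-06  # hypothetical price for completion tokens
prompt_tokens, completion_tokens = 32, 55  # from the Usage object in the test
cost = prompt_tokens * input_cost_per_token + completion_tokens * output_cost_per_token
print(cost)  # ~0.000591 with these illustrative prices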