Merge branch 'main' into litellm_region_based_routing

Krish Dholakia, 2024-05-08 22:19:51 -07:00 (committed by GitHub)
commit 8ad979cdfe
85 changed files with 793 additions and 448 deletions

litellm/utils.py

@@ -14,7 +14,7 @@ import subprocess, os
 from os.path import abspath, join, dirname
 import litellm, openai
 import itertools
-import random, uuid, requests
+import random, uuid, requests  # type: ignore
 from functools import wraps
 import datetime, time
 import tiktoken
@@ -36,7 +36,7 @@ import litellm._service_logger  # for storing API inputs, outputs, and metadata
 try:
     # this works in python 3.8
-    import pkg_resources
+    import pkg_resources  # type: ignore
 
     filename = pkg_resources.resource_filename(__name__, "llms/tokenizers")
     # try:
@@ -4161,8 +4161,30 @@ def cost_per_token(
         model_with_provider_and_region in model_cost_ref
     ):  # use region based pricing, if it's available
         model_with_provider = model_with_provider_and_region
-    if model_with_provider in model_cost_ref:
+
+    model_without_prefix = model
+    model_parts = model.split("/")
+    if len(model_parts) > 1:
+        model_without_prefix = model_parts[1]
+    else:
+        model_without_prefix = model
+
+    """
+    Code block that formats model to lookup in litellm.model_cost
+    Option1. model = "bedrock/ap-northeast-1/anthropic.claude-instant-v1". This is the most accurate since it is region based. Should always be option 1
+    Option2. model = "openai/gpt-4" - model = provider/model
+    Option3. model = "anthropic.claude-3" - model = model
+    """
+    if (
+        model_with_provider in model_cost_ref
+    ):  # Option 2. use model with provider, model = "openai/gpt-4"
         model = model_with_provider
+    elif model in model_cost_ref:  # Option 1. use model passed, model="gpt-4"
+        model = model
+    elif (
+        model_without_prefix in model_cost_ref
+    ):  # Option 3. if user passed model="bedrock/anthropic.claude-3", use model="anthropic.claude-3"
+        model = model_without_prefix
     # see this https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models
     print_verbose(f"Looking up model={model} in model_cost_map")
     if model in model_cost_ref:
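
Note on the cost-lookup change: the three candidate keys are tried in the order shown in the hunk above. Below is a minimal standalone sketch of that resolution order; `model_cost_ref` is a hand-rolled example dict (not the real `litellm.model_cost` contents) and `resolve_cost_key` is an illustrative helper, not a litellm API.

```python
# Illustrative only: mimics the lookup order from the hunk above.
# Keys and prices are made-up examples, not real litellm.model_cost entries.
model_cost_ref = {
    "bedrock/ap-northeast-1/anthropic.claude-instant-v1": {"input_cost_per_token": 8e-07},
    "openai/gpt-4": {"input_cost_per_token": 3e-05},
    "anthropic.claude-instant-v1": {"input_cost_per_token": 8e-07},
}

def resolve_cost_key(model: str, model_with_provider: str) -> str:
    """Return the key used for pricing: region-qualified first, then provider/model, then bare model."""
    model_without_prefix = model.split("/", 1)[1] if "/" in model else model
    if model_with_provider in model_cost_ref:   # e.g. "bedrock/ap-northeast-1/anthropic.claude-instant-v1"
        return model_with_provider
    if model in model_cost_ref:                 # e.g. "gpt-4" or "openai/gpt-4"
        return model
    if model_without_prefix in model_cost_ref:  # e.g. "bedrock/anthropic.claude-instant-v1" -> "anthropic.claude-instant-v1"
        return model_without_prefix
    raise KeyError(f"{model} not found in cost map")

assert resolve_cost_key(
    "anthropic.claude-instant-v1", "bedrock/ap-northeast-1/anthropic.claude-instant-v1"
) == "bedrock/ap-northeast-1/anthropic.claude-instant-v1"
assert resolve_cost_key(
    "bedrock/anthropic.claude-instant-v1", "bedrock/eu-west-1/anthropic.claude-instant-v1"
) == "anthropic.claude-instant-v1"
```
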
@@ -7766,11 +7788,11 @@ def _calculate_retry_after(
         try:
             retry_after = int(retry_header)
         except Exception:
-            retry_date_tuple = email.utils.parsedate_tz(retry_header)
+            retry_date_tuple = email.utils.parsedate_tz(retry_header)  # type: ignore
             if retry_date_tuple is None:
                 retry_after = -1
             else:
-                retry_date = email.utils.mktime_tz(retry_date_tuple)
+                retry_date = email.utils.mktime_tz(retry_date_tuple)  # type: ignore
                 retry_after = int(retry_date - time.time())
     else:
         retry_after = -1
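
The `# type: ignore` hints above do not change behaviour; the surrounding logic still accepts either form of the `Retry-After` header. A small self-contained sketch of that parsing, with made-up example header values:

```python
# Standalone sketch of the Retry-After parsing shown above.
# The header is either delta-seconds ("120") or an HTTP-date.
import email.utils
import time

def parse_retry_after(retry_header: str) -> int:
    try:
        return int(retry_header)  # delta-seconds form, e.g. "120"
    except Exception:
        # HTTP-date form, e.g. "Wed, 08 May 2024 22:19:51 GMT"
        retry_date_tuple = email.utils.parsedate_tz(retry_header)
        if retry_date_tuple is None:
            return -1  # unparseable header
        retry_date = email.utils.mktime_tz(retry_date_tuple)
        return int(retry_date - time.time())

print(parse_retry_after("120"))  # -> 120
```
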
@@ -10545,6 +10567,18 @@ class CustomStreamWrapper:
             elif self.custom_llm_provider == "watsonx":
                 response_obj = self.handle_watsonx_stream(chunk)
                 completion_obj["content"] = response_obj["text"]
+                print_verbose(f"completion obj content: {completion_obj['content']}")
+                if getattr(model_response, "usage", None) is None:
+                    model_response.usage = Usage()
+                if response_obj.get("prompt_tokens") is not None:
+                    prompt_token_count = getattr(model_response.usage, "prompt_tokens", 0)
+                    model_response.usage.prompt_tokens = (prompt_token_count + response_obj["prompt_tokens"])
+                if response_obj.get("completion_tokens") is not None:
+                    model_response.usage.completion_tokens = response_obj["completion_tokens"]
+                model_response.usage.total_tokens = (
+                    getattr(model_response.usage, "prompt_tokens", 0)
+                    + getattr(model_response.usage, "completion_tokens", 0)
+                )
                 if response_obj["is_finished"]:
                     self.received_finish_reason = response_obj["finish_reason"]
             elif self.custom_llm_provider == "text-completion-openai":
@@ -10949,6 +10983,7 @@ class CustomStreamWrapper:
                 or self.custom_llm_provider == "sagemaker"
                 or self.custom_llm_provider == "gemini"
                 or self.custom_llm_provider == "cached_response"
+                or self.custom_llm_provider == "watsonx"
                 or self.custom_llm_provider in litellm.openai_compatible_endpoints
             ):
                 async for chunk in self.completion_stream:
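
Adding "watsonx" to this condition routes its stream through the `async for` path. A rough sketch of that dispatch; the provider set and helper below are illustrative, not litellm internals.

```python
# Illustrative dispatch: providers whose completion_stream is an async iterator
# are consumed with `async for`; others would go through a sync fallback.
ASYNC_STREAM_PROVIDERS = {"openai", "azure", "sagemaker", "gemini", "cached_response", "watsonx"}

async def collect_chunks(completion_stream, custom_llm_provider: str) -> list:
    chunks = []
    if custom_llm_provider in ASYNC_STREAM_PROVIDERS:
        async for chunk in completion_stream:  # native async stream
            chunks.append(chunk)
    else:
        for chunk in completion_stream:  # synchronous stream fallback
            chunks.append(chunk)
    return chunks
```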