forked from phoenix/litellm-mirror
Merge branch 'BerriAI:main' into main
commit 5439e72a6b
139 changed files with 8966 additions and 17524 deletions

litellm/utils.py (178 changes)
@@ -1376,15 +1376,7 @@ def client(original_function):
                 optional_params=kwargs,
             )
             result._hidden_params["response_cost"] = (
-                litellm.response_cost_calculator(
-                    response_object=result,
-                    model=getattr(logging_obj, "model", ""),
-                    custom_llm_provider=getattr(
-                        logging_obj, "custom_llm_provider", None
-                    ),
-                    call_type=getattr(logging_obj, "call_type", "completion"),
-                    optional_params=getattr(logging_obj, "optional_params", {}),
-                )
+                logging_obj._response_cost_calculator(result=result)
             )
             if (
                 isinstance(result, ModelResponse)
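For context, this refactor replaces an inline litellm.response_cost_calculator(...) call with a method on the logging object, which already holds the call's metadata. A minimal sketch of the delegation, assuming a simplified logging class (the attribute names mirror the getattr() calls in the removed lines; the class shape is illustrative, not litellm's actual Logging object):

    import litellm

    class LoggingObjSketch:
        # Illustrative stand-in for litellm's logging object.
        def __init__(self, model, custom_llm_provider, call_type, optional_params):
            self.model = model
            self.custom_llm_provider = custom_llm_provider
            self.call_type = call_type
            self.optional_params = optional_params

        def _response_cost_calculator(self, result):
            # Same public helper as before, but every argument comes from
            # state captured when the request was made.
            return litellm.response_cost_calculator(
                response_object=result,
                model=self.model,
                custom_llm_provider=self.custom_llm_provider,
                call_type=self.call_type,
                optional_params=self.optional_params,
            )

The call site shrinks to one line and can no longer drift out of sync with the logging object's fields.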
@@ -2156,6 +2148,13 @@ def supports_parallel_function_calling(model: str):
 
 
 ####### HELPER FUNCTIONS ################
+def _update_dictionary(existing_dict: Dict, new_dict: dict) -> dict:
+    for k, v in new_dict.items():
+        existing_dict[k] = v
+
+    return existing_dict
+
+
 def register_model(model_cost: Union[str, dict]):
     """
     Register new / Override existing models (and their pricing) to specific providers.
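The new helper is effectively dict.update that also returns the mutated dict, so it can be used inline as an expression:

    merged = _update_dictionary(
        {"max_tokens": 4096}, {"input_cost_per_token": 1e-06}
    )
    # merged == {"max_tokens": 4096, "input_cost_per_token": 1e-06}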
@@ -2178,8 +2177,17 @@ def register_model(model_cost: Union[str, dict]):
         loaded_model_cost = litellm.get_model_cost_map(url=model_cost)
 
     for key, value in loaded_model_cost.items():
+        ## get model info ##
+        try:
+            existing_model: Union[ModelInfo, dict] = get_model_info(model=key)
+            model_cost_key = existing_model["key"]
+        except Exception:
+            existing_model = {}
+            model_cost_key = key
         ## override / add new keys to the existing model cost dictionary
-        litellm.model_cost.setdefault(key, {}).update(value)
+        litellm.model_cost.setdefault(model_cost_key, {}).update(
+            _update_dictionary(existing_model, value)  # type: ignore
+        )
         verbose_logger.debug(f"{key} added to model cost map")
         # add new model names to provider lists
         if value.get("litellm_provider") == "openai":
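As a usage sketch (the model name and prices below are invented), register_model now merges an override with whatever get_model_info already knows about the entry instead of replacing it wholesale:

    import litellm

    # Hypothetical pricing override for an existing cost-map entry.
    litellm.register_model(
        {
            "gpt-4": {
                "input_cost_per_token": 3e-05,
                "output_cost_per_token": 6e-05,
                "litellm_provider": "openai",
            }
        }
    )
    # Fields not mentioned here (max_tokens, etc.) survive the update.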
@@ -2250,6 +2258,7 @@ def get_litellm_params(
     output_cost_per_token=None,
     output_cost_per_second=None,
     cooldown_time=None,
+    text_completion=None,
 ):
     litellm_params = {
         "acompletion": acompletion,
@@ -2273,6 +2282,7 @@ def get_litellm_params(
         "output_cost_per_token": output_cost_per_token,
         "output_cost_per_second": output_cost_per_second,
         "cooldown_time": cooldown_time,
+        "text_completion": text_completion,
     }
 
     return litellm_params
@@ -3119,10 +3129,15 @@ def get_optional_params(
             model=model, custom_llm_provider=custom_llm_provider
         )
         _check_valid_arg(supported_params=supported_params)
-        optional_params = litellm.MistralConfig().map_openai_params(
-            non_default_params=non_default_params,
-            optional_params=optional_params,
-        )
+        if "codestral" in model:
+            optional_params = litellm.MistralTextCompletionConfig().map_openai_params(
+                non_default_params=non_default_params, optional_params=optional_params
+            )
+        else:
+            optional_params = litellm.MistralConfig().map_openai_params(
+                non_default_params=non_default_params,
+                optional_params=optional_params,
+            )
     elif custom_llm_provider == "sagemaker":
         ## check if unsupported param passed in
         supported_params = get_supported_openai_params(
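The branch means Codestral models are mapped through Mistral's text-completion config while other Mistral models keep the chat config. A rough restatement of the dispatch (model strings are examples):

    def pick_mistral_config(model: str):
        # Mirrors the new branch: Codestral uses the text-completion mapper.
        if "codestral" in model:
            return litellm.MistralTextCompletionConfig()
        return litellm.MistralConfig()

    pick_mistral_config("codestral-latest")      # MistralTextCompletionConfig
    pick_mistral_config("mistral-large-latest")  # MistralConfig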
@@ -4231,6 +4246,10 @@ def get_supported_openai_params(
             return litellm.VertexAILlama3Config().get_supported_openai_params()
         if model.startswith("mistral"):
             return litellm.MistralConfig().get_supported_openai_params()
+        if model.startswith("codestral"):
+            return (
+                litellm.MistralTextCompletionConfig().get_supported_openai_params()
+            )
         return litellm.VertexAIConfig().get_supported_openai_params()
     elif request_type == "embeddings":
         return litellm.VertexAITextEmbeddingConfig().get_supported_openai_params()
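One way to observe the new branch (the returned list is version-dependent; the values shown are a plausible subset, not a guarantee):

    from litellm import get_supported_openai_params

    params = get_supported_openai_params(
        model="codestral", custom_llm_provider="vertex_ai"
    )
    # e.g. ["temperature", "top_p", "max_tokens", "stream", ...]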
@@ -4866,6 +4885,7 @@ def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> ModelInfo:
 
     Returns:
         dict: A dictionary containing the following information:
+            key: Required[str] # the key in litellm.model_cost which is returned
             max_tokens: Required[Optional[int]]
             max_input_tokens: Required[Optional[int]]
             max_output_tokens: Required[Optional[int]]
@@ -4946,6 +4966,8 @@ def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> ModelInfo:
     if custom_llm_provider is not None and custom_llm_provider == "vertex_ai":
         if "meta/" + model in litellm.vertex_llama3_models:
             model = "meta/" + model
+        elif model + "@latest" in litellm.vertex_mistral_models:
+            model = model + "@latest"
     ##########################
     if custom_llm_provider is None:
         # Get custom_llm_provider
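With this, a bare Vertex Mistral name resolves to its versioned cost-map key. A hedged example, assuming a "codestral@latest" entry exists in the cost map:

    from litellm import get_model_info

    info = get_model_info(model="codestral", custom_llm_provider="vertex_ai")
    # Expected: info["key"] == "codestral@latest"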
@@ -4965,6 +4987,7 @@ def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> ModelInfo:
     if custom_llm_provider == "huggingface":
         max_tokens = _get_max_position_embeddings(model_name=model)
         return ModelInfo(
+            key=model,
             max_tokens=max_tokens,  # type: ignore
             max_input_tokens=None,
             max_output_tokens=None,
@@ -4985,6 +5008,7 @@ def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> ModelInfo:
         3. 'split_model' in litellm.model_cost. Checks "llama3-8b-8192" in litellm.model_cost if model="groq/llama3-8b-8192"
         """
         if combined_model_name in litellm.model_cost:
+            key = combined_model_name
             _model_info = litellm.model_cost[combined_model_name]
             _model_info["supported_openai_params"] = supported_openai_params
             if (
@@ -4998,6 +5022,7 @@ def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> ModelInfo:
             else:
                 raise Exception
         elif model in litellm.model_cost:
+            key = model
             _model_info = litellm.model_cost[model]
             _model_info["supported_openai_params"] = supported_openai_params
             if (
@@ -5011,6 +5036,7 @@ def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> ModelInfo:
             else:
                 raise Exception
         elif split_model in litellm.model_cost:
+            key = split_model
             _model_info = litellm.model_cost[split_model]
             _model_info["supported_openai_params"] = supported_openai_params
             if (
@@ -5033,6 +5059,7 @@ def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> ModelInfo:
             _model_info["supports_response_schema"] = True
 
         return ModelInfo(
+            key=key,
             max_tokens=_model_info.get("max_tokens", None),
             max_input_tokens=_model_info.get("max_input_tokens", None),
             max_output_tokens=_model_info.get("max_output_tokens", None),
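Taken together, the get_model_info hunks thread a new key field through every ModelInfo return path, recording which litellm.model_cost entry actually matched: the combined provider/model name, the bare model name, or the split name. Illustration (assuming "groq/llama3-8b-8192" itself is absent from the cost map but "llama3-8b-8192" is present):

    info = get_model_info(model="groq/llama3-8b-8192")
    # info["key"] == "llama3-8b-8192" (the split_model entry that matched)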
@@ -5843,6 +5870,12 @@ def convert_to_model_response_object(
         if _response_headers is not None:
             model_response_object._response_headers = _response_headers
 
+        special_keys = list(litellm.ModelResponse.model_fields.keys())
+        special_keys.append("usage")
+        for k, v in response_object.items():
+            if k not in special_keys:
+                setattr(model_response_object, k, v)
+
         return model_response_object
     elif response_type == "embedding" and (
         model_response_object is None
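This loop preserves provider-specific top-level keys that are not ModelResponse fields. For instance, with a hypothetical extra key:

    # Illustrative raw provider payload with a non-standard top-level key.
    raw = {
        "id": "chatcmpl-123",
        "choices": [],
        "citations": ["https://example.com/source"],
    }
    # "citations" is not in litellm.ModelResponse.model_fields, so the loop
    # copies it over: model_response_object.citations == ["https://example.com/source"]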
@@ -7312,6 +7345,13 @@ def exception_type(
                     model=model,
                     response=original_exception.response,
                 )
+            elif original_exception.status_code == 408:
+                exception_mapping_worked = True
+                raise Timeout(
+                    message=f"CohereException - {original_exception.message}",
+                    llm_provider="cohere",
+                    model=model,
+                )
             elif original_exception.status_code == 500:
                 exception_mapping_worked = True
                 raise ServiceUnavailableError(
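With the new mapping, a Cohere HTTP 408 surfaces as litellm's provider-agnostic Timeout, so one handler covers timeouts from any provider (sketch; the model name is an example):

    import litellm

    try:
        litellm.completion(
            model="command-r",
            messages=[{"role": "user", "content": "hi"}],
        )
    except litellm.Timeout as err:
        # Cohere 408s now land here alongside other providers' timeouts.
        print(f"request timed out: {err}")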
@@ -8349,6 +8389,28 @@ def get_secret(
 ######## Streaming Class ############################
 # wraps the completion stream to return the correct format for the model
 # replicate/anthropic/cohere
+
+
+def calculate_total_usage(chunks: List[ModelResponse]) -> Usage:
+    """Assume most recent usage chunk has total usage uptil then."""
+    prompt_tokens: int = 0
+    completion_tokens: int = 0
+    for chunk in chunks:
+        if "usage" in chunk:
+            if "prompt_tokens" in chunk["usage"]:
+                prompt_tokens = chunk["usage"].get("prompt_tokens", 0) or 0
+            if "completion_tokens" in chunk["usage"]:
+                completion_tokens = chunk["usage"].get("completion_tokens", 0) or 0
+
+    returned_usage_chunk = Usage(
+        prompt_tokens=prompt_tokens,
+        completion_tokens=completion_tokens,
+        total_tokens=prompt_tokens + completion_tokens,
+    )
+
+    return returned_usage_chunk
+
+
 class CustomStreamWrapper:
     def __init__(
         self,
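Because each provider chunk is assumed to carry cumulative totals, only the last-seen values matter. A rough check of that logic, assuming chunks support dict-style in/[] access as litellm's ModelResponse does:

    fake_chunks = [
        {"usage": {"prompt_tokens": 10, "completion_tokens": 1}},
        {"usage": {"prompt_tokens": 10, "completion_tokens": 7}},
    ]
    # Running these through the same loop leaves prompt_tokens=10 and
    # completion_tokens=7, so total_tokens == 17: the last chunk wins.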
@@ -9238,7 +9300,9 @@ class CustomStreamWrapper:
             verbose_logger.debug(traceback.format_exc())
             return ""
 
-    def model_response_creator(self, chunk: Optional[dict] = None):
+    def model_response_creator(
+        self, chunk: Optional[dict] = None, hidden_params: Optional[dict] = None
+    ):
         _model = self.model
         _received_llm_provider = self.custom_llm_provider
         _logging_obj_llm_provider = self.logging_obj.model_call_details.get("custom_llm_provider", None)  # type: ignore
@@ -9252,6 +9316,7 @@ class CustomStreamWrapper:
         else:
             # pop model keyword
             chunk.pop("model", None)
+
         model_response = ModelResponse(
             stream=True, model=_model, stream_options=self.stream_options, **chunk
         )
@@ -9261,6 +9326,8 @@ class CustomStreamWrapper:
             self.response_id = model_response.id  # type: ignore
         if self.system_fingerprint is not None:
             model_response.system_fingerprint = self.system_fingerprint
+        if hidden_params is not None:
+            model_response._hidden_params = hidden_params
         model_response._hidden_params["custom_llm_provider"] = _logging_obj_llm_provider
         model_response._hidden_params["created_at"] = time.time()
 
@@ -9315,11 +9382,7 @@ class CustomStreamWrapper:
                         "finish_reason"
                     ]
 
-                if (
-                    self.stream_options
-                    and self.stream_options.get("include_usage", False) is True
-                    and anthropic_response_obj["usage"] is not None
-                ):
+                if anthropic_response_obj["usage"] is not None:
                     model_response.usage = litellm.Usage(
                         prompt_tokens=anthropic_response_obj["usage"]["prompt_tokens"],
                         completion_tokens=anthropic_response_obj["usage"][
@@ -9642,11 +9705,7 @@ class CustomStreamWrapper:
                 print_verbose(f"completion obj content: {completion_obj['content']}")
                 if response_obj["is_finished"]:
                     self.received_finish_reason = response_obj["finish_reason"]
-                if (
-                    self.stream_options
-                    and self.stream_options.get("include_usage", False) == True
-                    and response_obj["usage"] is not None
-                ):
+                if response_obj["usage"] is not None:
                     model_response.usage = litellm.Usage(
                         prompt_tokens=response_obj["usage"].prompt_tokens,
                         completion_tokens=response_obj["usage"].completion_tokens,
@@ -9660,11 +9719,7 @@ class CustomStreamWrapper:
                 print_verbose(f"completion obj content: {completion_obj['content']}")
                 if response_obj["is_finished"]:
                     self.received_finish_reason = response_obj["finish_reason"]
-                if (
-                    self.stream_options
-                    and self.stream_options.get("include_usage", False) == True
-                    and response_obj["usage"] is not None
-                ):
+                if "usage" in response_obj is not None:
                     model_response.usage = litellm.Usage(
                         prompt_tokens=response_obj["usage"].prompt_tokens,
                         completion_tokens=response_obj["usage"].completion_tokens,
@@ -9732,16 +9787,26 @@ class CustomStreamWrapper:
                 if response_obj["logprobs"] is not None:
                     model_response.choices[0].logprobs = response_obj["logprobs"]
 
-                if (
-                    self.stream_options is not None
-                    and self.stream_options["include_usage"] == True
-                    and response_obj["usage"] is not None
-                ):
-                    model_response.usage = litellm.Usage(
-                        prompt_tokens=response_obj["usage"].prompt_tokens,
-                        completion_tokens=response_obj["usage"].completion_tokens,
-                        total_tokens=response_obj["usage"].total_tokens,
-                    )
+                if response_obj["usage"] is not None:
+                    if isinstance(response_obj["usage"], dict):
+                        model_response.usage = litellm.Usage(
+                            prompt_tokens=response_obj["usage"].get(
+                                "prompt_tokens", None
+                            )
+                            or None,
+                            completion_tokens=response_obj["usage"].get(
+                                "completion_tokens", None
+                            )
+                            or None,
+                            total_tokens=response_obj["usage"].get("total_tokens", None)
+                            or None,
+                        )
+                    elif isinstance(response_obj["usage"], BaseModel):
+                        model_response.usage = litellm.Usage(
+                            prompt_tokens=response_obj["usage"].prompt_tokens,
+                            completion_tokens=response_obj["usage"].completion_tokens,
+                            total_tokens=response_obj["usage"].total_tokens,
+                        )
 
                 model_response.model = self.model
                 print_verbose(
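Providers report streaming usage either as a plain dict or as a Pydantic object; both shapes now normalize into litellm.Usage. A compact, illustrative restatement of the rule (not litellm code; assumes the model has the named fields):

    from pydantic import BaseModel

    def usage_counts(u):
        # Dicts go through .get(); Pydantic models through attributes.
        if isinstance(u, dict):
            return u.get("prompt_tokens") or None, u.get("completion_tokens") or None
        if isinstance(u, BaseModel):
            return u.prompt_tokens, u.completion_tokens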
@@ -9855,19 +9920,6 @@ class CustomStreamWrapper:
 
             ## RETURN ARG
-            if (
-                "content" in completion_obj
-                and isinstance(completion_obj["content"], str)
-                and len(completion_obj["content"]) == 0
-                and hasattr(model_response, "usage")
-                and hasattr(model_response.usage, "prompt_tokens")
-            ):
-                if self.sent_first_chunk is False:
-                    completion_obj["role"] = "assistant"
-                    self.sent_first_chunk = True
-                model_response.choices[0].delta = Delta(**completion_obj)
-                print_verbose(f"returning model_response: {model_response}")
-                return model_response
-            elif (
+            if (
                 "content" in completion_obj
                 and (
                     isinstance(completion_obj["content"], str)
@@ -9962,6 +10014,7 @@ class CustomStreamWrapper:
                 model_response.choices[0].finish_reason = map_finish_reason(
                     finish_reason=self.received_finish_reason
                 )  # ensure consistent output to openai
+
                 self.sent_last_chunk = True
 
                 return model_response
@@ -9974,6 +10027,8 @@ class CustomStreamWrapper:
                 self.sent_first_chunk = True
                 return model_response
             else:
+                if hasattr(model_response, "usage"):
+                    self.chunks.append(model_response)
                 return
         except StopIteration:
             raise StopIteration
@@ -10090,17 +10145,22 @@ class CustomStreamWrapper:
                     del obj_dict["usage"]
 
                     # Create a new object without the removed attribute
-                    response = self.model_response_creator(chunk=obj_dict)
+                    response = self.model_response_creator(
+                        chunk=obj_dict, hidden_params=response._hidden_params
+                    )
+                # add usage as hidden param
+                if self.sent_last_chunk is True and self.stream_options is None:
+                    usage = calculate_total_usage(chunks=self.chunks)
+                    response._hidden_params["usage"] = usage
                 # RETURN RESULT
                 return response
 
         except StopIteration:
             if self.sent_last_chunk is True:
                 if (
-                    self.sent_stream_usage == False
+                    self.sent_stream_usage is False
                     and self.stream_options is not None
-                    and self.stream_options.get("include_usage", False) == True
+                    and self.stream_options.get("include_usage", False) is True
                 ):
                     # send the final chunk with stream options
                     complete_streaming_response = litellm.stream_chunk_builder(
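Net effect for callers: with stream_options={"include_usage": True} the totals still arrive as a visible final chunk, while without stream_options they now ride along in _hidden_params of the last chunk. A hedged usage sketch:

    import litellm

    resp = litellm.completion(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "hi"}],
        stream=True,
        stream_options={"include_usage": True},
    )
    last = None
    for chunk in resp:
        last = chunk
    # With include_usage: the final chunk exposes usage directly.
    # Without stream_options: totals land in chunk._hidden_params["usage"].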
@@ -10108,6 +10168,7 @@ class CustomStreamWrapper:
                     )
                     response = self.model_response_creator()
                     response.usage = complete_streaming_response.usage  # type: ignore
+                    response._hidden_params["usage"] = complete_streaming_response.usage  # type: ignore
                     ## LOGGING
                     threading.Thread(
                         target=self.logging_obj.success_handler,
@@ -10119,6 +10180,9 @@ class CustomStreamWrapper:
             else:
                 self.sent_last_chunk = True
                 processed_chunk = self.finish_reason_handler()
+                if self.stream_options is None:  # add usage as hidden param
+                    usage = calculate_total_usage(chunks=self.chunks)
+                    processed_chunk._hidden_params["usage"] = usage
                 ## LOGGING
                 threading.Thread(
                     target=self.logging_obj.success_handler,