forked from phoenix/litellm-mirror
Merge branch 'BerriAI:main' into main
commit 5439e72a6b
139 changed files with 8966 additions and 17524 deletions

litellm/utils.py (178 changes)
@@ -1376,15 +1376,7 @@ def client(original_function):
                 optional_params=kwargs,
             )
             result._hidden_params["response_cost"] = (
-                litellm.response_cost_calculator(
-                    response_object=result,
-                    model=getattr(logging_obj, "model", ""),
-                    custom_llm_provider=getattr(
-                        logging_obj, "custom_llm_provider", None
-                    ),
-                    call_type=getattr(logging_obj, "call_type", "completion"),
-                    optional_params=getattr(logging_obj, "optional_params", {}),
-                )
+                logging_obj._response_cost_calculator(result=result)
             )
             if (
                 isinstance(result, ModelResponse)
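For context, this refactor replaces an inline litellm.response_cost_calculator(...) call with a method on the logging object, which already holds the call's metadata. A minimal sketch of the delegation, assuming a simplified logging class (the attribute names mirror the getattr() calls in the removed lines; the class shape is illustrative, not litellm's actual Logging object):

    import litellm

    class LoggingObjSketch:
        # Illustrative stand-in for litellm's logging object.
        def __init__(self, model, custom_llm_provider, call_type, optional_params):
            self.model = model
            self.custom_llm_provider = custom_llm_provider
            self.call_type = call_type
            self.optional_params = optional_params

        def _response_cost_calculator(self, result):
            # Same public helper as before, but every argument comes from
            # state captured when the request was made.
            return litellm.response_cost_calculator(
                response_object=result,
                model=self.model,
                custom_llm_provider=self.custom_llm_provider,
                call_type=self.call_type,
                optional_params=self.optional_params,
            )

The call site shrinks to one line and can no longer drift out of sync with the logging object's fields.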
@@ -2156,6 +2148,13 @@ def supports_parallel_function_calling(model: str):
 
 
 ####### HELPER FUNCTIONS ################
+def _update_dictionary(existing_dict: Dict, new_dict: dict) -> dict:
+    for k, v in new_dict.items():
+        existing_dict[k] = v
+
+    return existing_dict
+
+
 def register_model(model_cost: Union[str, dict]):
     """
     Register new / Override existing models (and their pricing) to specific providers.
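The new helper is effectively dict.update that also returns the mutated dict, so it can be used inline as an expression:

    merged = _update_dictionary(
        {"max_tokens": 4096}, {"input_cost_per_token": 1e-06}
    )
    # merged == {"max_tokens": 4096, "input_cost_per_token": 1e-06}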
@@ -2178,8 +2177,17 @@ def register_model(model_cost: Union[str, dict]):
         loaded_model_cost = litellm.get_model_cost_map(url=model_cost)
 
     for key, value in loaded_model_cost.items():
+        ## get model info ##
+        try:
+            existing_model: Union[ModelInfo, dict] = get_model_info(model=key)
+            model_cost_key = existing_model["key"]
+        except Exception:
+            existing_model = {}
+            model_cost_key = key
         ## override / add new keys to the existing model cost dictionary
-        litellm.model_cost.setdefault(key, {}).update(value)
+        litellm.model_cost.setdefault(model_cost_key, {}).update(
+            _update_dictionary(existing_model, value)  # type: ignore
+        )
         verbose_logger.debug(f"{key} added to model cost map")
         # add new model names to provider lists
         if value.get("litellm_provider") == "openai":
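As a usage sketch (the model name and prices below are invented), register_model now merges an override with whatever get_model_info already knows about the entry instead of replacing it wholesale:

    import litellm

    # Hypothetical pricing override for an existing cost-map entry.
    litellm.register_model(
        {
            "gpt-4": {
                "input_cost_per_token": 3e-05,
                "output_cost_per_token": 6e-05,
                "litellm_provider": "openai",
            }
        }
    )
    # Fields not mentioned here (max_tokens, etc.) survive the update.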
@@ -2250,6 +2258,7 @@ def get_litellm_params(
     output_cost_per_token=None,
     output_cost_per_second=None,
     cooldown_time=None,
+    text_completion=None,
 ):
     litellm_params = {
         "acompletion": acompletion,
@@ -2273,6 +2282,7 @@ def get_litellm_params(
         "output_cost_per_token": output_cost_per_token,
         "output_cost_per_second": output_cost_per_second,
         "cooldown_time": cooldown_time,
+        "text_completion": text_completion,
     }
 
     return litellm_params
@@ -3119,10 +3129,15 @@ def get_optional_params(
             model=model, custom_llm_provider=custom_llm_provider
         )
         _check_valid_arg(supported_params=supported_params)
-        optional_params = litellm.MistralConfig().map_openai_params(
-            non_default_params=non_default_params,
-            optional_params=optional_params,
-        )
+        if "codestral" in model:
+            optional_params = litellm.MistralTextCompletionConfig().map_openai_params(
+                non_default_params=non_default_params, optional_params=optional_params
+            )
+        else:
+            optional_params = litellm.MistralConfig().map_openai_params(
+                non_default_params=non_default_params,
+                optional_params=optional_params,
+            )
     elif custom_llm_provider == "sagemaker":
         ## check if unsupported param passed in
         supported_params = get_supported_openai_params(
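The branch means Codestral models are mapped through Mistral's text-completion config while other Mistral models keep the chat config. A rough restatement of the dispatch (model strings are examples):

    def pick_mistral_config(model: str):
        # Mirrors the new branch: Codestral uses the text-completion mapper.
        if "codestral" in model:
            return litellm.MistralTextCompletionConfig()
        return litellm.MistralConfig()

    pick_mistral_config("codestral-latest")      # MistralTextCompletionConfig
    pick_mistral_config("mistral-large-latest")  # MistralConfig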
@@ -4231,6 +4246,10 @@ def get_supported_openai_params(
             return litellm.VertexAILlama3Config().get_supported_openai_params()
         if model.startswith("mistral"):
             return litellm.MistralConfig().get_supported_openai_params()
+        if model.startswith("codestral"):
+            return (
+                litellm.MistralTextCompletionConfig().get_supported_openai_params()
+            )
         return litellm.VertexAIConfig().get_supported_openai_params()
     elif request_type == "embeddings":
         return litellm.VertexAITextEmbeddingConfig().get_supported_openai_params()
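One way to observe the new branch (the returned list is version-dependent; the values shown are a plausible subset, not a guarantee):

    from litellm import get_supported_openai_params

    params = get_supported_openai_params(
        model="codestral", custom_llm_provider="vertex_ai"
    )
    # e.g. ["temperature", "top_p", "max_tokens", "stream", ...]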
@@ -4866,6 +4885,7 @@ def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> ModelInfo:
 
     Returns:
         dict: A dictionary containing the following information:
+            key: Required[str] # the key in litellm.model_cost which is returned
             max_tokens: Required[Optional[int]]
             max_input_tokens: Required[Optional[int]]
             max_output_tokens: Required[Optional[int]]
@@ -4946,6 +4966,8 @@ def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> ModelInfo:
     if custom_llm_provider is not None and custom_llm_provider == "vertex_ai":
         if "meta/" + model in litellm.vertex_llama3_models:
             model = "meta/" + model
+        elif model + "@latest" in litellm.vertex_mistral_models:
+            model = model + "@latest"
     ##########################
     if custom_llm_provider is None:
         # Get custom_llm_provider
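With this, a bare Vertex Mistral name resolves to its versioned cost-map key. A hedged example, assuming a "codestral@latest" entry exists in the cost map:

    from litellm import get_model_info

    info = get_model_info(model="codestral", custom_llm_provider="vertex_ai")
    # Expected: info["key"] == "codestral@latest"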
@@ -4965,6 +4987,7 @@ def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> ModelInfo:
     if custom_llm_provider == "huggingface":
         max_tokens = _get_max_position_embeddings(model_name=model)
         return ModelInfo(
+            key=model,
             max_tokens=max_tokens,  # type: ignore
             max_input_tokens=None,
             max_output_tokens=None,
@@ -4985,6 +5008,7 @@ def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> ModelInfo:
         3. 'split_model' in litellm.model_cost. Checks "llama3-8b-8192" in litellm.model_cost if model="groq/llama3-8b-8192"
         """
         if combined_model_name in litellm.model_cost:
+            key = combined_model_name
             _model_info = litellm.model_cost[combined_model_name]
             _model_info["supported_openai_params"] = supported_openai_params
             if (
@@ -4998,6 +5022,7 @@ def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> ModelInfo:
             else:
                 raise Exception
         elif model in litellm.model_cost:
+            key = model
             _model_info = litellm.model_cost[model]
             _model_info["supported_openai_params"] = supported_openai_params
             if (
@@ -5011,6 +5036,7 @@ def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> ModelInfo:
             else:
                 raise Exception
         elif split_model in litellm.model_cost:
+            key = split_model
             _model_info = litellm.model_cost[split_model]
             _model_info["supported_openai_params"] = supported_openai_params
             if (
@@ -5033,6 +5059,7 @@ def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> ModelInfo:
             _model_info["supports_response_schema"] = True
 
         return ModelInfo(
+            key=key,
             max_tokens=_model_info.get("max_tokens", None),
             max_input_tokens=_model_info.get("max_input_tokens", None),
             max_output_tokens=_model_info.get("max_output_tokens", None),
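Taken together, the get_model_info hunks thread a new key field through every ModelInfo return path, recording which litellm.model_cost entry actually matched: the combined provider/model name, the bare model name, or the split name. Illustration (assuming "groq/llama3-8b-8192" itself is absent from the cost map but "llama3-8b-8192" is present):

    info = get_model_info(model="groq/llama3-8b-8192")
    # info["key"] == "llama3-8b-8192" (the split_model entry that matched)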
@@ -5843,6 +5870,12 @@ def convert_to_model_response_object(
         if _response_headers is not None:
             model_response_object._response_headers = _response_headers
 
+        special_keys = list(litellm.ModelResponse.model_fields.keys())
+        special_keys.append("usage")
+        for k, v in response_object.items():
+            if k not in special_keys:
+                setattr(model_response_object, k, v)
+
         return model_response_object
     elif response_type == "embedding" and (
         model_response_object is None
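This loop preserves provider-specific top-level keys that are not ModelResponse fields. For instance, with a hypothetical extra key:

    # Illustrative raw provider payload with a non-standard top-level key.
    raw = {
        "id": "chatcmpl-123",
        "choices": [],
        "citations": ["https://example.com/source"],
    }
    # "citations" is not in litellm.ModelResponse.model_fields, so the loop
    # copies it over: model_response_object.citations == ["https://example.com/source"]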
@@ -7312,6 +7345,13 @@ def exception_type(
                     model=model,
                     response=original_exception.response,
                 )
+            elif original_exception.status_code == 408:
+                exception_mapping_worked = True
+                raise Timeout(
+                    message=f"CohereException - {original_exception.message}",
+                    llm_provider="cohere",
+                    model=model,
+                )
             elif original_exception.status_code == 500:
                 exception_mapping_worked = True
                 raise ServiceUnavailableError(
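With the new mapping, a Cohere HTTP 408 surfaces as litellm's provider-agnostic Timeout, so one handler covers timeouts from any provider (sketch; the model name is an example):

    import litellm

    try:
        litellm.completion(
            model="command-r",
            messages=[{"role": "user", "content": "hi"}],
        )
    except litellm.Timeout as err:
        # Cohere 408s now land here alongside other providers' timeouts.
        print(f"request timed out: {err}")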
@@ -8349,6 +8389,28 @@ def get_secret(
 ######## Streaming Class ############################
 # wraps the completion stream to return the correct format for the model
 # replicate/anthropic/cohere
+
+
+def calculate_total_usage(chunks: List[ModelResponse]) -> Usage:
+    """Assume most recent usage chunk has total usage uptil then."""
+    prompt_tokens: int = 0
+    completion_tokens: int = 0
+    for chunk in chunks:
+        if "usage" in chunk:
+            if "prompt_tokens" in chunk["usage"]:
+                prompt_tokens = chunk["usage"].get("prompt_tokens", 0) or 0
+            if "completion_tokens" in chunk["usage"]:
+                completion_tokens = chunk["usage"].get("completion_tokens", 0) or 0
+
+    returned_usage_chunk = Usage(
+        prompt_tokens=prompt_tokens,
+        completion_tokens=completion_tokens,
+        total_tokens=prompt_tokens + completion_tokens,
+    )
+
+    return returned_usage_chunk
+
+
 class CustomStreamWrapper:
     def __init__(
         self,
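Because each provider chunk is assumed to carry cumulative totals, only the last-seen values matter. A rough check of that logic, assuming chunks support dict-style in/[] access as litellm's ModelResponse does:

    fake_chunks = [
        {"usage": {"prompt_tokens": 10, "completion_tokens": 1}},
        {"usage": {"prompt_tokens": 10, "completion_tokens": 7}},
    ]
    # Running these through the same loop leaves prompt_tokens=10 and
    # completion_tokens=7, so total_tokens == 17: the last chunk wins.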
@@ -9238,7 +9300,9 @@ class CustomStreamWrapper:
             verbose_logger.debug(traceback.format_exc())
             return ""
 
-    def model_response_creator(self, chunk: Optional[dict] = None):
+    def model_response_creator(
+        self, chunk: Optional[dict] = None, hidden_params: Optional[dict] = None
+    ):
         _model = self.model
         _received_llm_provider = self.custom_llm_provider
         _logging_obj_llm_provider = self.logging_obj.model_call_details.get("custom_llm_provider", None)  # type: ignore
@@ -9252,6 +9316,7 @@ class CustomStreamWrapper:
         else:
             # pop model keyword
             chunk.pop("model", None)
+
         model_response = ModelResponse(
             stream=True, model=_model, stream_options=self.stream_options, **chunk
         )
@@ -9261,6 +9326,8 @@ class CustomStreamWrapper:
             self.response_id = model_response.id  # type: ignore
         if self.system_fingerprint is not None:
             model_response.system_fingerprint = self.system_fingerprint
+        if hidden_params is not None:
+            model_response._hidden_params = hidden_params
         model_response._hidden_params["custom_llm_provider"] = _logging_obj_llm_provider
         model_response._hidden_params["created_at"] = time.time()
 
@@ -9315,11 +9382,7 @@ class CustomStreamWrapper:
                         "finish_reason"
                     ]
 
-                if (
-                    self.stream_options
-                    and self.stream_options.get("include_usage", False) is True
-                    and anthropic_response_obj["usage"] is not None
-                ):
+                if anthropic_response_obj["usage"] is not None:
                     model_response.usage = litellm.Usage(
                         prompt_tokens=anthropic_response_obj["usage"]["prompt_tokens"],
                         completion_tokens=anthropic_response_obj["usage"][
@@ -9642,11 +9705,7 @@ class CustomStreamWrapper:
                 print_verbose(f"completion obj content: {completion_obj['content']}")
                 if response_obj["is_finished"]:
                     self.received_finish_reason = response_obj["finish_reason"]
-                if (
-                    self.stream_options
-                    and self.stream_options.get("include_usage", False) == True
-                    and response_obj["usage"] is not None
-                ):
+                if response_obj["usage"] is not None:
                     model_response.usage = litellm.Usage(
                         prompt_tokens=response_obj["usage"].prompt_tokens,
                         completion_tokens=response_obj["usage"].completion_tokens,
@@ -9660,11 +9719,7 @@ class CustomStreamWrapper:
                 print_verbose(f"completion obj content: {completion_obj['content']}")
                 if response_obj["is_finished"]:
                     self.received_finish_reason = response_obj["finish_reason"]
-                if (
-                    self.stream_options
-                    and self.stream_options.get("include_usage", False) == True
-                    and response_obj["usage"] is not None
-                ):
+                if "usage" in response_obj is not None:
                     model_response.usage = litellm.Usage(
                         prompt_tokens=response_obj["usage"].prompt_tokens,
                         completion_tokens=response_obj["usage"].completion_tokens,
@@ -9732,16 +9787,26 @@ class CustomStreamWrapper:
                 if response_obj["logprobs"] is not None:
                     model_response.choices[0].logprobs = response_obj["logprobs"]
 
-                if (
-                    self.stream_options is not None
-                    and self.stream_options["include_usage"] == True
-                    and response_obj["usage"] is not None
-                ):
-                    model_response.usage = litellm.Usage(
-                        prompt_tokens=response_obj["usage"].prompt_tokens,
-                        completion_tokens=response_obj["usage"].completion_tokens,
-                        total_tokens=response_obj["usage"].total_tokens,
-                    )
+                if response_obj["usage"] is not None:
+                    if isinstance(response_obj["usage"], dict):
+                        model_response.usage = litellm.Usage(
+                            prompt_tokens=response_obj["usage"].get(
+                                "prompt_tokens", None
+                            )
+                            or None,
+                            completion_tokens=response_obj["usage"].get(
+                                "completion_tokens", None
+                            )
+                            or None,
+                            total_tokens=response_obj["usage"].get("total_tokens", None)
+                            or None,
+                        )
+                    elif isinstance(response_obj["usage"], BaseModel):
+                        model_response.usage = litellm.Usage(
+                            prompt_tokens=response_obj["usage"].prompt_tokens,
+                            completion_tokens=response_obj["usage"].completion_tokens,
+                            total_tokens=response_obj["usage"].total_tokens,
+                        )
 
                 model_response.model = self.model
                 print_verbose(
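Providers report streaming usage either as a plain dict or as a Pydantic object; both shapes now normalize into litellm.Usage. A compact, illustrative restatement of the rule (not litellm code; assumes the model has the named fields):

    from pydantic import BaseModel

    def usage_counts(u):
        # Dicts go through .get(); Pydantic models through attributes.
        if isinstance(u, dict):
            return u.get("prompt_tokens") or None, u.get("completion_tokens") or None
        if isinstance(u, BaseModel):
            return u.prompt_tokens, u.completion_tokens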
@@ -9855,19 +9920,6 @@ class CustomStreamWrapper:
 
             ## RETURN ARG
-            if (
-                "content" in completion_obj
-                and isinstance(completion_obj["content"], str)
-                and len(completion_obj["content"]) == 0
-                and hasattr(model_response, "usage")
-                and hasattr(model_response.usage, "prompt_tokens")
-            ):
-                if self.sent_first_chunk is False:
-                    completion_obj["role"] = "assistant"
-                    self.sent_first_chunk = True
-                model_response.choices[0].delta = Delta(**completion_obj)
-                print_verbose(f"returning model_response: {model_response}")
-                return model_response
-            elif (
+            if (
                 "content" in completion_obj
                 and (
                     isinstance(completion_obj["content"], str)
@@ -9962,6 +10014,7 @@ class CustomStreamWrapper:
                 model_response.choices[0].finish_reason = map_finish_reason(
                     finish_reason=self.received_finish_reason
                 )  # ensure consistent output to openai
+
                 self.sent_last_chunk = True
 
                 return model_response
@@ -9974,6 +10027,8 @@ class CustomStreamWrapper:
                 self.sent_first_chunk = True
                 return model_response
             else:
+                if hasattr(model_response, "usage"):
+                    self.chunks.append(model_response)
                 return
         except StopIteration:
             raise StopIteration
@@ -10090,17 +10145,22 @@ class CustomStreamWrapper:
                     del obj_dict["usage"]
 
                     # Create a new object without the removed attribute
-                    response = self.model_response_creator(chunk=obj_dict)
+                    response = self.model_response_creator(
+                        chunk=obj_dict, hidden_params=response._hidden_params
+                    )
+                # add usage as hidden param
+                if self.sent_last_chunk is True and self.stream_options is None:
+                    usage = calculate_total_usage(chunks=self.chunks)
+                    response._hidden_params["usage"] = usage
                 # RETURN RESULT
                 return response
 
         except StopIteration:
             if self.sent_last_chunk is True:
                 if (
-                    self.sent_stream_usage == False
+                    self.sent_stream_usage is False
                     and self.stream_options is not None
-                    and self.stream_options.get("include_usage", False) == True
+                    and self.stream_options.get("include_usage", False) is True
                 ):
                     # send the final chunk with stream options
                     complete_streaming_response = litellm.stream_chunk_builder(
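Net effect for callers: with stream_options={"include_usage": True} the totals still arrive as a visible final chunk, while without stream_options they now ride along in _hidden_params of the last chunk. A hedged usage sketch:

    import litellm

    resp = litellm.completion(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "hi"}],
        stream=True,
        stream_options={"include_usage": True},
    )
    last = None
    for chunk in resp:
        last = chunk
    # With include_usage: the final chunk exposes usage directly.
    # Without stream_options: totals land in chunk._hidden_params["usage"].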
@@ -10108,6 +10168,7 @@ class CustomStreamWrapper:
                     )
                     response = self.model_response_creator()
                     response.usage = complete_streaming_response.usage  # type: ignore
+                    response._hidden_params["usage"] = complete_streaming_response.usage  # type: ignore
                     ## LOGGING
                     threading.Thread(
                         target=self.logging_obj.success_handler,
@@ -10119,6 +10180,9 @@ class CustomStreamWrapper:
             else:
                 self.sent_last_chunk = True
                 processed_chunk = self.finish_reason_handler()
+                if self.stream_options is None:  # add usage as hidden param
+                    usage = calculate_total_usage(chunks=self.chunks)
+                    processed_chunk._hidden_params["usage"] = usage
                 ## LOGGING
                 threading.Thread(
                     target=self.logging_obj.success_handler,