forked from phoenix/litellm-mirror
LiteLLM Minor Fixes + Improvements (#5474)
* feat(proxy/_types.py): add lago billing to callbacks ui
  Closes https://github.com/BerriAI/litellm/issues/5472
* fix(anthropic.py): return anthropic prompt caching information
  Fixes https://github.com/BerriAI/litellm/issues/5364
* feat(bedrock/chat.py): support 'json_schema' for bedrock models
  Closes https://github.com/BerriAI/litellm/issues/5434
* fix(bedrock/embed/embeddings.py): support async embeddings for amazon titan models
* fix: linting fixes
* fix: handle key errors
* fix(bedrock/chat.py): fix bedrock ai21 streaming object
* feat(bedrock/embed): support bedrock embedding optional params
* fix(databricks.py): fix usage chunk
* fix(internal_user_endpoints.py): apply internal user defaults, if user role updated
  Fixes issue where user update wouldn't apply defaults
* feat(slack_alerting.py): provide multiple slack channels for a given alert type
  Multiple channels might be interested in receiving an alert for a given type
* docs(alerting.md): add multiple channel alerting to docs
This commit is contained in:
parent 02f288a8a3
commit f9e6507cd1
22 changed files with 720 additions and 209 deletions
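
For context on the headline `json_schema` change, here is a minimal, hypothetical sketch of exercising it through LiteLLM's OpenAI-style `response_format` parameter (the model name and schema below are illustrative, not taken from this commit):

```python
import litellm

# Hypothetical example: any Bedrock Converse model with tool-calling support should work here.
response = litellm.completion(
    model="bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
    messages=[{"role": "user", "content": "Return a short user profile for Jane."}],
    response_format={
        "type": "json_schema",
        "json_schema": {
            "name": "user_profile",
            "schema": {
                "type": "object",
                "properties": {"name": {"type": "string"}, "age": {"type": "integer"}},
                "required": ["name", "age"],
            },
        },
    },
)

# With the json_mode handling added in this commit, the forced tool call's
# arguments are surfaced as the message content (a JSON string matching the schema).
print(response.choices[0].message.content)
```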
@@ -1,12 +1,12 @@
repos:
- repo: local
  hooks:
    # - id: mypy
    # name: mypy
    # entry: python3 -m mypy --ignore-missing-imports
    # language: system
    # types: [python]
    # files: ^litellm/
    - id: mypy
      name: mypy
      entry: python3 -m mypy --ignore-missing-imports
      language: system
      types: [python]
      files: ^litellm/
    - id: isort
      name: isort
      entry: isort
@@ -190,6 +190,36 @@ curl -i http://localhost:4000/v1/chat/completions \
```


## Advanced - provide multiple slack channels for a given alert type

Just add it like this - `alert_type: [<hook_url_channel_1>, <hook_url_channel_2>]`.

1. Setup config.yaml

```yaml
general_settings:
  master_key: sk-1234
  alerting: ["slack"]
  alert_to_webhook_url: {
    "spend_reports": ["https://webhook.site/7843a980-a494-4967-80fb-d502dbc16886", "https://webhook.site/28cfb179-f4fb-4408-8129-729ff55cf213"]
  }
```

2. Start proxy

```bash
litellm --config /path/to/config.yaml
```

3. Test it!

```bash
curl -X GET 'http://0.0.0.0:4000/health/services?service=slack' \
-H 'Authorization: Bearer sk-1234'
```

In case of error, check server logs for the error message!

## Advanced - Using MS Teams Webhooks

MS Teams provides a Slack-compatible webhook URL that you can use for alerting.
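For reference (not part of this diff), a Teams incoming-webhook URL can be dropped into the same `alert_to_webhook_url` shape shown above; the URL below is a placeholder, not a real endpoint:

```yaml
general_settings:
  master_key: sk-1234
  alerting: ["slack"]
  alert_to_webhook_url: {
    "spend_reports": ["https://your-org.webhook.office.com/webhookb2/<id>/IncomingWebhook/<token>"]
  }
```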
@@ -900,6 +900,14 @@ from .llms.bedrock.common_utils import (
    AmazonMistralConfig,
    AmazonBedrockGlobalConfig,
)
from .llms.bedrock.embed.amazon_titan_g1_transformation import AmazonTitanG1Config
from .llms.bedrock.embed.amazon_titan_multimodal_transformation import (
    AmazonTitanMultimodalEmbeddingG1Config,
)
from .llms.bedrock.embed.amazon_titan_v2_transformation import (
    AmazonTitanV2Config,
)
from .llms.bedrock.embed.cohere_transformation import BedrockCohereEmbeddingConfig
from .llms.openai import (
    OpenAIConfig,
    OpenAITextCompletionConfig,
@@ -1514,7 +1514,9 @@ Model Info:
            self.alert_to_webhook_url is not None
            and alert_type in self.alert_to_webhook_url
        ):
            slack_webhook_url = self.alert_to_webhook_url[alert_type]
            slack_webhook_url: Optional[Union[str, List[str]]] = (
                self.alert_to_webhook_url[alert_type]
            )
        elif self.default_webhook_url is not None:
            slack_webhook_url = self.default_webhook_url
        else:

@@ -1525,18 +1527,39 @@ Model Info:
        payload = {"text": formatted_message}
        headers = {"Content-type": "application/json"}

        response = await self.async_http_handler.post(
            url=slack_webhook_url,
            headers=headers,
            data=json.dumps(payload),
        )
        if response.status_code == 200:
            pass
        else:
            verbose_proxy_logger.debug(
                "Error sending slack alert. Error={}".format(response.text)
        async def send_to_webhook(url: str):
            return await self.async_http_handler.post(
                url=url,
                headers=headers,
                data=json.dumps(payload),
            )

        if isinstance(slack_webhook_url, list):
            # Parallelize the calls if it's a list of URLs
            responses = await asyncio.gather(
                *[send_to_webhook(url) for url in slack_webhook_url]
            )

            for response, url in zip(responses, slack_webhook_url):
                if response.status_code == 200:
                    pass
                else:
                    verbose_proxy_logger.debug(
                        "Error sending slack alert to url={}. Error={}".format(
                            url, response.text
                        )
                    )
        else:
            # Single call if it's a single URL
            response = await send_to_webhook(slack_webhook_url)

            if response.status_code == 200:
                pass
            else:
                verbose_proxy_logger.debug(
                    "Error sending slack alert. Error={}".format(response.text)
                )

    async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
        """Log deployment latency"""
        try:

@@ -1718,7 +1741,9 @@ Model Info:
        try:
            from calendar import monthrange

            from litellm.proxy.proxy_server import _get_spend_report_for_time_range
            from litellm.proxy.spend_tracking.spend_management_endpoints import (
                _get_spend_report_for_time_range,
            )

            todays_date = datetime.datetime.now().date()
            first_day_of_month = todays_date.replace(day=1)

@@ -1763,7 +1788,7 @@ Model Info:
                alerting_metadata={},
            )
        except Exception as e:
            verbose_proxy_logger.error("Error sending weekly spend report %s", e)
            verbose_proxy_logger.exception("Error sending weekly spend report %s", e)

    async def send_fallback_stats_from_prometheus(self):
        """
@@ -30,6 +30,7 @@ from litellm.llms.custom_httpx.http_handler import (
)
from litellm.types.llms.anthropic import (
    AnthopicMessagesAssistantMessageParam,
    AnthropicChatCompletionUsageBlock,
    AnthropicFinishReason,
    AnthropicMessagesRequest,
    AnthropicMessagesTool,

@@ -1177,6 +1178,30 @@ class ModelResponseIterator:
            return True
        return False

    def _handle_usage(
        self, anthropic_usage_chunk: dict
    ) -> AnthropicChatCompletionUsageBlock:
        special_fields = ["input_tokens", "output_tokens"]

        usage_block = AnthropicChatCompletionUsageBlock(
            prompt_tokens=anthropic_usage_chunk.get("input_tokens", 0),
            completion_tokens=anthropic_usage_chunk.get("output_tokens", 0),
            total_tokens=anthropic_usage_chunk.get("input_tokens", 0)
            + anthropic_usage_chunk.get("output_tokens", 0),
        )

        if "cache_creation_input_tokens" in anthropic_usage_chunk:
            usage_block["cache_creation_input_tokens"] = anthropic_usage_chunk[
                "cache_creation_input_tokens"
            ]

        if "cache_read_input_tokens" in anthropic_usage_chunk:
            usage_block["cache_read_input_tokens"] = anthropic_usage_chunk[
                "cache_read_input_tokens"
            ]

        return usage_block

    def chunk_parser(self, chunk: dict) -> GenericStreamingChunk:
        try:
            type_chunk = chunk.get("type", "") or ""

@@ -1252,12 +1277,7 @@ class ModelResponseIterator:
                    finish_reason=message_delta["delta"].get("stop_reason", "stop")
                    or "stop"
                )
                usage = ChatCompletionUsageBlock(
                    prompt_tokens=message_delta["usage"].get("input_tokens", 0),
                    completion_tokens=message_delta["usage"].get("output_tokens", 0),
                    total_tokens=message_delta["usage"].get("input_tokens", 0)
                    + message_delta["usage"].get("output_tokens", 0),
                )
                usage = self._handle_usage(anthropic_usage_chunk=message_delta["usage"])
                is_finished = True
            elif type_chunk == "message_start":
                """

@@ -1280,19 +1300,8 @@ class ModelResponseIterator:
                }
                """
                message_start_block = MessageStartBlock(**chunk)  # type: ignore
                usage = ChatCompletionUsageBlock(
                    prompt_tokens=message_start_block["message"]
                    .get("usage", {})
                    .get("input_tokens", 0),
                    completion_tokens=message_start_block["message"]
                    .get("usage", {})
                    .get("output_tokens", 0),
                    total_tokens=message_start_block["message"]
                    .get("usage", {})
                    .get("input_tokens", 0)
                    + message_start_block["message"]
                    .get("usage", {})
                    .get("output_tokens", 0),
                usage = self._handle_usage(
                    anthropic_usage_chunk=message_start_block["message"]["usage"]
                )
            elif type_chunk == "error":
                """
@@ -43,6 +43,10 @@ from litellm.types.llms.openai import (
    ChatCompletionResponseMessage,
    ChatCompletionToolCallChunk,
    ChatCompletionToolCallFunctionChunk,
    ChatCompletionToolChoiceFunctionParam,
    ChatCompletionToolChoiceObjectParam,
    ChatCompletionToolParam,
    ChatCompletionToolParamFunctionChunk,
    ChatCompletionUsageBlock,
)
from litellm.types.utils import GenericStreamingChunk as GChunk

@@ -1152,6 +1156,7 @@ class AmazonConverseConfig:
            "temperature",
            "top_p",
            "extra_headers",
            "response_format",
        ]

        if (

@@ -1210,6 +1215,48 @@ class AmazonConverseConfig:
        drop_params: bool,
    ) -> dict:
        for param, value in non_default_params.items():
            if param == "response_format":
                json_schema: Optional[dict] = None
                schema_name: str = ""
                if "response_schema" in value:
                    json_schema = value["response_schema"]
                    schema_name = "json_tool_call"
                elif "json_schema" in value:
                    json_schema = value["json_schema"]["schema"]
                    schema_name = value["json_schema"]["name"]
                """
                Follow similar approach to anthropic - translate to a single tool call.

                When using tools in this way: - https://docs.anthropic.com/en/docs/build-with-claude/tool-use#json-mode
                - You usually want to provide a single tool
                - You should set tool_choice (see Forcing tool use) to instruct the model to explicitly use that tool
                - Remember that the model will pass the input to the tool, so the name of the tool and description should be from the model’s perspective.
                """
                if json_schema is not None:
                    _tool_choice = self.map_tool_choice_values(
                        model=model, tool_choice="required", drop_params=drop_params  # type: ignore
                    )

                    _tool = ChatCompletionToolParam(
                        type="function",
                        function=ChatCompletionToolParamFunctionChunk(
                            name=schema_name, parameters=json_schema
                        ),
                    )

                    optional_params["tools"] = [_tool]
                    optional_params["tool_choice"] = _tool_choice
                    optional_params["json_mode"] = True
                else:
                    if litellm.drop_params is True or drop_params is True:
                        pass
                    else:
                        raise litellm.utils.UnsupportedParamsError(
                            message="Bedrock doesn't support response_format={}. To drop it from the call, set `litellm.drop_params = True.".format(
                                value
                            ),
                            status_code=400,
                        )
            if param == "max_tokens":
                optional_params["maxTokens"] = value
            if param == "stream":
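To make the mapping above concrete: an OpenAI-style `response_format` carrying a JSON schema is rewritten into a single forced tool call plus a `json_mode` flag. A rough sketch of the resulting optional params is shown below; the names and schema are illustrative, and the exact `tool_choice` value comes from `map_tool_choice_values(...)`, which is not reproduced here:

```python
# Hypothetical input:
# response_format = {"type": "json_schema", "json_schema": {"name": "user_profile", "schema": {...}}}
json_schema = {"type": "object", "properties": {"name": {"type": "string"}}}

optional_params = {
    "tools": [
        {
            "type": "function",
            "function": {"name": "user_profile", "parameters": json_schema},
        }
    ],
    "tool_choice": "required",  # placeholder; the real value is whatever map_tool_choice_values returns
    "json_mode": True,  # later used to copy the tool call's arguments into the message content
}
```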
@@ -1263,7 +1310,7 @@ class BedrockConverseLLM(BaseAWSLLM):
            additional_args={"complete_input_dict": data},
        )
        print_verbose(f"raw model_response: {response.text}")

        json_mode: Optional[bool] = optional_params.pop("json_mode", None)
        ## RESPONSE OBJECT
        try:
            completion_response = ConverseResponseBlock(**response.json())  # type: ignore

@@ -1332,6 +1379,7 @@ class BedrockConverseLLM(BaseAWSLLM):
                    name=response_tool_name,
                    arguments=json.dumps(content["toolUse"]["input"]),
                )

                _tool_response_chunk = ChatCompletionToolCallChunk(
                    id=content["toolUse"]["toolUseId"],
                    type="function",

@@ -1340,7 +1388,14 @@ class BedrockConverseLLM(BaseAWSLLM):
                )
                tools.append(_tool_response_chunk)
        chat_completion_message["content"] = content_str
        chat_completion_message["tool_calls"] = tools

        if json_mode is True and tools is not None and len(tools) == 1:
            # to support 'json_schema' logic on bedrock models
            json_mode_content_str: Optional[str] = tools[0]["function"].get("arguments")
            if json_mode_content_str is not None:
                chat_completion_message["content"] = json_mode_content_str
        else:
            chat_completion_message["tool_calls"] = tools

        ## CALCULATING USAGE - bedrock returns usage in the headers
        input_tokens = completion_response["usage"]["inputTokens"]

@@ -1586,6 +1641,9 @@ class BedrockConverseLLM(BaseAWSLLM):
        supported_converse_params = AmazonConverseConfig.__annotations__.keys()
        supported_tool_call_params = ["tools", "tool_choice"]
        supported_guardrail_params = ["guardrailConfig"]
        json_mode: Optional[bool] = inference_params.pop(
            "json_mode", None
        )  # used for handling json_schema
        ## TRANSFORMATION ##

        bedrock_messages: List[MessageBlock] = _bedrock_converse_messages_pt(

@@ -2028,8 +2086,14 @@ class MockResponseIterator:  # for returning ai21 streaming responses
                text=chunk_data.choices[0].message.content or "",  # type: ignore
                tool_use=None,
                is_finished=True,
                finish_reason=chunk_data.choices[0].finish_reason,  # type: ignore
                usage=chunk_usage,  # type: ignore
                finish_reason=map_finish_reason(
                    finish_reason=chunk_data.choices[0].finish_reason or ""
                ),
                usage=ChatCompletionUsageBlock(
                    prompt_tokens=chunk_usage.prompt_tokens,
                    completion_tokens=chunk_usage.completion_tokens,
                    total_tokens=chunk_usage.total_tokens,
                ),
                index=0,
            )
            return processed_chunk
@@ -15,8 +15,6 @@ from typing import List, Optional
from litellm.types.llms.bedrock import (
    AmazonTitanG1EmbeddingRequest,
    AmazonTitanG1EmbeddingResponse,
    AmazonTitanV2EmbeddingRequest,
    AmazonTitanV2EmbeddingResponse,
)
from litellm.types.utils import Embedding, EmbeddingResponse, Usage

@@ -52,6 +50,14 @@ class AmazonTitanG1Config:
            and v is not None
        }

    def get_supported_openai_params(self) -> List[str]:
        return []

    def map_openai_params(
        self, non_default_params: dict, optional_params: dict
    ) -> dict:
        return optional_params

    def _transform_request(
        self, input: str, inference_params: dict
    ) -> AmazonTitanG1EmbeddingRequest:

@@ -80,70 +86,3 @@ class AmazonTitanG1Config:
            total_tokens=total_prompt_tokens,
        )
        return EmbeddingResponse(model=model, usage=usage, data=transformed_responses)


class AmazonTitanV2Config:
    """
    Reference: https://docs.aws.amazon.com/bedrock/latest/userguide/model-parameters-titan-embed-text.html

    normalize: boolean - flag indicating whether or not to normalize the output embeddings. Defaults to true
    dimensions: int - The number of dimensions the output embeddings should have. The following values are accepted: 1024 (default), 512, 256.
    """

    normalize: Optional[bool] = None
    dimensions: Optional[int] = None

    def __init__(
        self, normalize: Optional[bool] = None, dimensions: Optional[int] = None
    ) -> None:
        locals_ = locals()
        for key, value in locals_.items():
            if key != "self" and value is not None:
                setattr(self.__class__, key, value)

    @classmethod
    def get_config(cls):
        return {
            k: v
            for k, v in cls.__dict__.items()
            if not k.startswith("__")
            and not isinstance(
                v,
                (
                    types.FunctionType,
                    types.BuiltinFunctionType,
                    classmethod,
                    staticmethod,
                ),
            )
            and v is not None
        }

    def _transform_request(
        self, input: str, inference_params: dict
    ) -> AmazonTitanV2EmbeddingRequest:
        return AmazonTitanV2EmbeddingRequest(inputText=input, **inference_params)  # type: ignore

    def _transform_response(
        self, response_list: List[dict], model: str
    ) -> EmbeddingResponse:
        total_prompt_tokens = 0

        transformed_responses: List[Embedding] = []
        for index, response in enumerate(response_list):
            _parsed_response = AmazonTitanV2EmbeddingResponse(**response)  # type: ignore
            transformed_responses.append(
                Embedding(
                    embedding=_parsed_response["embedding"],
                    index=index,
                    object="embedding",
                )
            )
            total_prompt_tokens += _parsed_response["inputTextTokenCount"]

        usage = Usage(
            prompt_tokens=total_prompt_tokens,
            completion_tokens=0,
            total_tokens=total_prompt_tokens,
        )
        return EmbeddingResponse(model=model, usage=usage, data=transformed_responses)
@@ -17,38 +17,64 @@ from litellm.types.utils import Embedding, EmbeddingResponse, Usage
from litellm.utils import is_base64_encoded


def _transform_request(
    input: str, inference_params: dict
) -> AmazonTitanMultimodalEmbeddingRequest:
    ## check if b64 encoded str or not ##
    is_encoded = is_base64_encoded(input)
    if is_encoded:  # check if string is b64 encoded image or not
        transformed_request = AmazonTitanMultimodalEmbeddingRequest(inputImage=input)
    else:
        transformed_request = AmazonTitanMultimodalEmbeddingRequest(inputText=input)
class AmazonTitanMultimodalEmbeddingG1Config:
    """
    Reference - https://docs.aws.amazon.com/bedrock/latest/userguide/model-parameters-titan-embed-mm.html
    """

    for k, v in inference_params.items():
        transformed_request[k] = v  # type: ignore
    def __init__(self) -> None:
        pass

    return transformed_request
    def get_supported_openai_params(self) -> List[str]:
        return ["dimensions"]

    def map_openai_params(
        self, non_default_params: dict, optional_params: dict
    ) -> dict:
        for k, v in non_default_params.items():
            if k == "dimensions":
                optional_params["embeddingConfig"] = (
                    AmazonTitanMultimodalEmbeddingConfig(outputEmbeddingLength=v)
                )
        return optional_params


def _transform_response(response_list: List[dict], model: str) -> EmbeddingResponse:

    total_prompt_tokens = 0
    transformed_responses: List[Embedding] = []
    for index, response in enumerate(response_list):
        _parsed_response = AmazonTitanMultimodalEmbeddingResponse(**response)  # type: ignore
        transformed_responses.append(
            Embedding(
                embedding=_parsed_response["embedding"], index=index, object="embedding"
    def _transform_request(
        self, input: str, inference_params: dict
    ) -> AmazonTitanMultimodalEmbeddingRequest:
        ## check if b64 encoded str or not ##
        is_encoded = is_base64_encoded(input)
        if is_encoded:  # check if string is b64 encoded image or not
            transformed_request = AmazonTitanMultimodalEmbeddingRequest(
                inputImage=input
            )
            )
        total_prompt_tokens += _parsed_response["inputTextTokenCount"]
        else:
            transformed_request = AmazonTitanMultimodalEmbeddingRequest(inputText=input)

    usage = Usage(
        prompt_tokens=total_prompt_tokens,
        completion_tokens=0,
        total_tokens=total_prompt_tokens,
    )
    return EmbeddingResponse(model=model, usage=usage, data=transformed_responses)
        for k, v in inference_params.items():
            transformed_request[k] = v  # type: ignore

        return transformed_request

    def _transform_response(
        self, response_list: List[dict], model: str
    ) -> EmbeddingResponse:

        total_prompt_tokens = 0
        transformed_responses: List[Embedding] = []
        for index, response in enumerate(response_list):
            _parsed_response = AmazonTitanMultimodalEmbeddingResponse(**response)  # type: ignore
            transformed_responses.append(
                Embedding(
                    embedding=_parsed_response["embedding"],
                    index=index,
                    object="embedding",
                )
            )
            total_prompt_tokens += _parsed_response["inputTextTokenCount"]

        usage = Usage(
            prompt_tokens=total_prompt_tokens,
            completion_tokens=0,
            total_tokens=total_prompt_tokens,
        )
        return EmbeddingResponse(model=model, usage=usage, data=transformed_responses)
@@ -56,6 +56,17 @@ class AmazonTitanV2Config:
            and v is not None
        }

    def get_supported_openai_params(self) -> List[str]:
        return ["dimensions"]

    def map_openai_params(
        self, non_default_params: dict, optional_params: dict
    ) -> dict:
        for k, v in non_default_params.items():
            if k == "dimensions":
                optional_params["dimensions"] = v
        return optional_params

    def _transform_request(
        self, input: str, inference_params: dict
    ) -> AmazonTitanV2EmbeddingRequest:
@@ -11,15 +11,30 @@ from litellm.types.llms.bedrock import CohereEmbeddingRequest, CohereEmbeddingRe
from litellm.types.utils import Embedding, EmbeddingResponse


def _transform_request(
    input: List[str], inference_params: dict
) -> CohereEmbeddingRequest:
    transformed_request = CohereEmbeddingRequest(
        texts=input,
        input_type=litellm.COHERE_DEFAULT_EMBEDDING_INPUT_TYPE,  # type: ignore
    )
class BedrockCohereEmbeddingConfig:
    def __init__(self) -> None:
        pass

    for k, v in inference_params.items():
        transformed_request[k] = v  # type: ignore
    def get_supported_openai_params(self) -> List[str]:
        return ["encoding_format"]

    return transformed_request
    def map_openai_params(
        self, non_default_params: dict, optional_params: dict
    ) -> dict:
        for k, v in non_default_params.items():
            if k == "encoding_format":
                optional_params["embedding_types"] = v
        return optional_params

    def _transform_request(
        self, input: List[str], inference_params: dict
    ) -> CohereEmbeddingRequest:
        transformed_request = CohereEmbeddingRequest(
            texts=input,
            input_type=litellm.COHERE_DEFAULT_EMBEDDING_INPUT_TYPE,  # type: ignore
        )

        for k, v in inference_params.items():
            transformed_request[k] = v  # type: ignore

        return transformed_request
@@ -16,6 +16,7 @@ from litellm.llms.cohere.embed import embedding as cohere_embedding
from litellm.llms.custom_httpx.http_handler import (
    AsyncHTTPHandler,
    HTTPHandler,
    _get_async_httpx_client,
    _get_httpx_client,
)
from litellm.types.llms.bedrock import AmazonEmbeddingRequest, CohereEmbeddingRequest

@@ -25,13 +26,10 @@ from ...base_aws_llm import BaseAWSLLM
from ..common_utils import BedrockError, get_runtime_endpoint
from .amazon_titan_g1_transformation import AmazonTitanG1Config
from .amazon_titan_multimodal_transformation import (
    _transform_request as amazon_multimodal_transform_request,
)
from .amazon_titan_multimodal_transformation import (
    _transform_response as amazon_multimodal_transform_response,
    AmazonTitanMultimodalEmbeddingG1Config,
)
from .amazon_titan_v2_transformation import AmazonTitanV2Config
from .cohere_transformation import _transform_request as cohere_transform_request
from .cohere_transformation import BedrockCohereEmbeddingConfig


class BedrockEmbedding(BaseAWSLLM):

@@ -118,6 +116,35 @@ class BedrockEmbedding(BaseAWSLLM):

        return response.json()

    async def _make_async_call(
        self,
        client: Optional[AsyncHTTPHandler],
        timeout: Optional[Union[float, httpx.Timeout]],
        api_base: str,
        headers: dict,
        data: dict,
    ) -> dict:
        if client is None or not isinstance(client, AsyncHTTPHandler):
            _params = {}
            if timeout is not None:
                if isinstance(timeout, float) or isinstance(timeout, int):
                    timeout = httpx.Timeout(timeout)
                _params["timeout"] = timeout
            client = _get_async_httpx_client(_params)  # type: ignore
        else:
            client = client

        try:
            response = await client.post(url=api_base, headers=headers, data=json.dumps(data))  # type: ignore
            response.raise_for_status()
        except httpx.HTTPStatusError as err:
            error_code = err.response.status_code
            raise BedrockError(status_code=error_code, message=response.text)
        except httpx.TimeoutException:
            raise BedrockError(status_code=408, message="Timeout error occurred.")

        return response.json()

    def _single_func_embeddings(
        self,
        client: Optional[HTTPHandler],
@@ -186,9 +213,102 @@ class BedrockEmbedding(BaseAWSLLM):

        ## TRANSFORM RESPONSE ##
        if model == "amazon.titan-embed-image-v1":
            returned_response = amazon_multimodal_transform_response(
            returned_response = (
                AmazonTitanMultimodalEmbeddingG1Config()._transform_response(
                    response_list=responses, model=model
                )
            )
        elif model == "amazon.titan-embed-text-v1":
            returned_response = AmazonTitanG1Config()._transform_response(
                response_list=responses, model=model
            )
        elif model == "amazon.titan-embed-text-v2:0":
            returned_response = AmazonTitanV2Config()._transform_response(
                response_list=responses, model=model
            )

        if returned_response is None:
            raise Exception(
                "Unable to map model response to known provider format. model={}".format(
                    model
                )
            )

        return returned_response

    async def _async_single_func_embeddings(
        self,
        client: Optional[AsyncHTTPHandler],
        timeout: Optional[Union[float, httpx.Timeout]],
        batch_data: List[dict],
        credentials: Any,
        extra_headers: Optional[dict],
        endpoint_url: str,
        aws_region_name: str,
        model: str,
        logging_obj: Any,
    ):
        try:
            import boto3
            from botocore.auth import SigV4Auth
            from botocore.awsrequest import AWSRequest
            from botocore.credentials import Credentials
        except ImportError:
            raise ImportError("Missing boto3 to call bedrock. Run 'pip install boto3'.")

        responses: List[dict] = []
        for data in batch_data:
            sigv4 = SigV4Auth(credentials, "bedrock", aws_region_name)
            headers = {"Content-Type": "application/json"}
            if extra_headers is not None:
                headers = {"Content-Type": "application/json", **extra_headers}
            request = AWSRequest(
                method="POST", url=endpoint_url, data=json.dumps(data), headers=headers
            )
            sigv4.add_auth(request)
            if (
                extra_headers is not None and "Authorization" in extra_headers
            ):  # prevent sigv4 from overwriting the auth header
                request.headers["Authorization"] = extra_headers["Authorization"]
            prepped = request.prepare()

            ## LOGGING
            logging_obj.pre_call(
                input=data,
                api_key="",
                additional_args={
                    "complete_input_dict": data,
                    "api_base": prepped.url,
                    "headers": prepped.headers,
                },
            )
            response = await self._make_async_call(
                client=client,
                timeout=timeout,
                api_base=prepped.url,
                headers=prepped.headers,
                data=data,
            )

            ## LOGGING
            logging_obj.post_call(
                input=data,
                api_key="",
                original_response=response,
                additional_args={"complete_input_dict": data},
            )

            responses.append(response)

        returned_response: Optional[EmbeddingResponse] = None

        ## TRANSFORM RESPONSE ##
        if model == "amazon.titan-embed-image-v1":
            returned_response = (
                AmazonTitanMultimodalEmbeddingG1Config()._transform_response(
                    response_list=responses, model=model
                )
            )
        elif model == "amazon.titan-embed-text-v1":
            returned_response = AmazonTitanG1Config()._transform_response(
                response_list=responses, model=model

@@ -246,7 +366,7 @@ class BedrockEmbedding(BaseAWSLLM):
        data: Optional[CohereEmbeddingRequest] = None
        batch_data: Optional[List] = None
        if provider == "cohere":
            data = cohere_transform_request(
            data = BedrockCohereEmbeddingConfig()._transform_request(
                input=input, inference_params=inference_params
            )
        elif provider == "amazon" and model in [

@@ -257,10 +377,10 @@ class BedrockEmbedding(BaseAWSLLM):
            batch_data = []
            for i in input:
                if model == "amazon.titan-embed-image-v1":
                    transformed_request: AmazonEmbeddingRequest = (
                        amazon_multimodal_transform_request(
                            input=i, inference_params=inference_params
                        )
                    transformed_request: (
                        AmazonEmbeddingRequest
                    ) = AmazonTitanMultimodalEmbeddingG1Config()._transform_request(
                        input=i, inference_params=inference_params
                    )
                elif model == "amazon.titan-embed-text-v1":
                    transformed_request = AmazonTitanG1Config()._transform_request(

@@ -283,6 +403,22 @@ class BedrockEmbedding(BaseAWSLLM):
            endpoint_url = f"{endpoint_url}/model/{modelId}/invoke"

        if batch_data is not None:
            if aembedding:
                return self._async_single_func_embeddings(  # type: ignore
                    client=(
                        client
                        if client is not None and isinstance(client, AsyncHTTPHandler)
                        else None
                    ),
                    timeout=timeout,
                    batch_data=batch_data,
                    credentials=credentials,
                    extra_headers=extra_headers,
                    endpoint_url=endpoint_url,
                    aws_region_name=aws_region_name,
                    model=model,
                    logging_obj=logging_obj,
                )
            return self._single_func_embeddings(
                client=(
                    client
@@ -703,8 +703,16 @@ class ModelResponseIterator:
                is_finished = True
                finish_reason = processed_chunk.choices[0].finish_reason

            if hasattr(processed_chunk, "usage"):
                usage = processed_chunk.usage  # type: ignore
            if hasattr(processed_chunk, "usage") and isinstance(
                processed_chunk.usage, litellm.Usage
            ):
                usage_chunk: litellm.Usage = processed_chunk.usage

                usage = ChatCompletionUsageBlock(
                    prompt_tokens=usage_chunk.prompt_tokens,
                    completion_tokens=usage_chunk.completion_tokens,
                    total_tokens=usage_chunk.total_tokens,
                )

            return GenericStreamingChunk(
                text=text,
@@ -5431,6 +5431,9 @@ def stream_chunk_builder(
    # # Update usage information if needed
    prompt_tokens = 0
    completion_tokens = 0
    ## anthropic prompt caching information ##
    cache_creation_input_tokens: Optional[int] = None
    cache_read_input_tokens: Optional[int] = None
    for chunk in chunks:
        usage_chunk: Optional[Usage] = None
        if "usage" in chunk:

@@ -5442,6 +5445,13 @@ def stream_chunk_builder(
                prompt_tokens = usage_chunk.get("prompt_tokens", 0) or 0
            if "completion_tokens" in usage_chunk:
                completion_tokens = usage_chunk.get("completion_tokens", 0) or 0
            if "cache_creation_input_tokens" in usage_chunk:
                cache_creation_input_tokens = usage_chunk.get(
                    "cache_creation_input_tokens"
                )
            if "cache_read_input_tokens" in usage_chunk:
                cache_read_input_tokens = usage_chunk.get("cache_read_input_tokens")

    try:
        response["usage"]["prompt_tokens"] = prompt_tokens or token_counter(
            model=model, messages=messages

@@ -5460,6 +5470,13 @@ def stream_chunk_builder(
        response["usage"]["prompt_tokens"] + response["usage"]["completion_tokens"]
    )

    if cache_creation_input_tokens is not None:
        response["usage"][
            "cache_creation_input_tokens"
        ] = cache_creation_input_tokens
    if cache_read_input_tokens is not None:
        response["usage"]["cache_read_input_tokens"] = cache_read_input_tokens

    return convert_to_model_response_object(
        response_object=response,
        model_response_object=model_response,
@@ -2,3 +2,16 @@ model_list:
  - model_name: "gpt-3.5-turbo"
    litellm_params:
      model: "gpt-3.5-turbo"

litellm_settings:
  max_internal_user_budget: 0.02 # amount in USD
  internal_user_budget_duration: "1s" # reset every second

general_settings:
  master_key: sk-1234
  alerting: ["slack"]
  alerting_threshold: 0.0001 # (seconds) set an artificially low threshold for testing alerting
  alert_to_webhook_url: {
    "spend_reports": ["https://webhook.site/7843a980-a494-4967-80fb-d502dbc16886", "https://webhook.site/28cfb179-f4fb-4408-8129-729ff55cf213"]
  }
@@ -1632,6 +1632,16 @@ class AllCallbacks(LiteLLMBase):
        ui_callback_name="Langsmith",
    )

    lago: CallbackOnUI = CallbackOnUI(
        litellm_callback_name="lago",
        litellm_callback_params=[
            "LAGO_API_BASE",
            "LAGO_API_KEY",
            "LAGO_API_EVENT_CODE",
        ],
        ui_callback_name="Lago Billing",
    )


class SpendLogsMetadata(TypedDict):
    """
@@ -505,6 +505,10 @@ async def user_update(
            ):  # models default to [], spend defaults to 0, we should not reset these values
                non_default_values[k] = v

        is_internal_user = False
        if data.user_role == LitellmUserRoles.INTERNAL_USER:
            is_internal_user = True

        if "budget_duration" in non_default_values:
            duration_s = _duration_in_seconds(
                duration=non_default_values["budget_duration"]

@@ -512,6 +516,20 @@ async def user_update(
            user_reset_at = datetime.now(timezone.utc) + timedelta(seconds=duration_s)
            non_default_values["budget_reset_at"] = user_reset_at

        if "max_budget" not in non_default_values:
            if (
                is_internal_user and litellm.max_internal_user_budget is not None
            ):  # applies internal user limits, if user role updated
                non_default_values["max_budget"] = litellm.max_internal_user_budget

        if (
            "budget_duration" not in non_default_values
        ):  # applies internal user limits, if user role updated
            if is_internal_user and litellm.internal_user_budget_duration is not None:
                non_default_values["budget_duration"] = (
                    litellm.internal_user_budget_duration
                )

        ## ADD USER, IF NEW ##
        verbose_proxy_logger.debug("/user/update: Received data = %s", data)
        if data.user_id is not None and len(data.user_id) > 0:
@@ -282,6 +282,82 @@ async def test_anthropic_api_prompt_caching_no_headers():
    )


@pytest.mark.asyncio()
async def test_anthropic_api_prompt_caching_streaming():
    from litellm.tests.test_streaming import streaming_format_tests

    response = await litellm.acompletion(
        model="anthropic/claude-3-5-sonnet-20240620",
        messages=[
            # System Message
            {
                "role": "system",
                "content": [
                    {
                        "type": "text",
                        "text": "Here is the full text of a complex legal agreement"
                        * 400,
                        "cache_control": {"type": "ephemeral"},
                    }
                ],
            },
            # marked for caching with the cache_control parameter, so that this checkpoint can read from the previous cache.
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "What are the key terms and conditions in this agreement?",
                        "cache_control": {"type": "ephemeral"},
                    }
                ],
            },
            {
                "role": "assistant",
                "content": "Certainly! the key terms and conditions are the following: the contract is 1 year long for $10/mo",
            },
            # The final turn is marked with cache-control, for continuing in followups.
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "What are the key terms and conditions in this agreement?",
                        "cache_control": {"type": "ephemeral"},
                    }
                ],
            },
        ],
        temperature=0.2,
        max_tokens=10,
        stream=True,
        stream_options={"include_usage": True},
    )

    idx = 0
    is_cache_read_input_tokens_in_usage = False
    is_cache_creation_input_tokens_in_usage = False
    async for chunk in response:
        streaming_format_tests(idx=idx, chunk=chunk)
        # Assert either a cache entry was created or cache was read - changes depending on the anthropic api ttl
        if hasattr(chunk, "usage"):
            print("Received final usage - {}".format(chunk.usage))
        if hasattr(chunk, "usage") and hasattr(chunk.usage, "cache_read_input_tokens"):
            is_cache_read_input_tokens_in_usage = True
        if hasattr(chunk, "usage") and hasattr(
            chunk.usage, "cache_creation_input_tokens"
        ):
            is_cache_creation_input_tokens_in_usage = True

        idx += 1

    print("response=", response)

    assert (
        is_cache_read_input_tokens_in_usage and is_cache_creation_input_tokens_in_usage
    )


@pytest.mark.asyncio
async def test_litellm_anthropic_prompt_caching_system():
    # https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching#prompt-caching-examples
@@ -2172,7 +2172,14 @@ def test_completion_openai():
        pytest.fail(f"Error occurred: {e}")


@pytest.mark.parametrize("model", ["gpt-4o-2024-08-06", "azure/chatgpt-v-2"])
@pytest.mark.parametrize(
    "model",
    [
        "gpt-4o-2024-08-06",
        "azure/chatgpt-v-2",
        "bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
    ],
)
def test_completion_openai_pydantic(model):
    try:
        litellm.set_verbose = True

@@ -2201,7 +2208,7 @@ def test_completion_openai_pydantic(model):
                )
                break
            except litellm.JSONSchemaValidationError:
                print("ERROR OCCURRED! INVALID JSON")
                pytest.fail("ERROR OCCURRED! INVALID JSON")

        print("This is the response object\n", response)
@@ -319,9 +319,52 @@ async def test_cohere_embedding3(custom_llm_provider):
        "bedrock/amazon.titan-embed-text-v2:0",
    ],
)
@pytest.mark.parametrize("sync_mode", [True])
@pytest.mark.parametrize("sync_mode", [True, False])  # ,
@pytest.mark.asyncio
async def test_bedrock_embedding_titan(model, sync_mode):
    try:
        # this tests if we support str input for bedrock embedding
        litellm.set_verbose = True
        litellm.enable_cache()
        import time

        current_time = str(time.time())
        # DO NOT MAKE THE INPUT A LIST in this test
        if sync_mode:
            response = embedding(
                model=model,
                input=f"good morning from litellm, attempting to embed data {current_time}",  # input should always be a string in this test
                aws_region_name="us-west-2",
            )
        else:
            response = await litellm.aembedding(
                model=model,
                input=f"good morning from litellm, attempting to embed data {current_time}",  # input should always be a string in this test
                aws_region_name="us-west-2",
            )
        print("response:", response)
        assert isinstance(
            response["data"][0]["embedding"], list
        ), "Expected response to be a list"
        print("type of first embedding:", type(response["data"][0]["embedding"][0]))
        assert all(
            isinstance(x, float) for x in response["data"][0]["embedding"]
        ), "Expected response to be a list of floats"
    except Exception as e:
        pytest.fail(f"Error occurred: {e}")


@pytest.mark.parametrize(
    "model",
    [
        "bedrock/amazon.titan-embed-text-v1",
        "bedrock/amazon.titan-embed-image-v1",
        "bedrock/amazon.titan-embed-text-v2:0",
    ],
)
@pytest.mark.parametrize("sync_mode", [True])  # True,
@pytest.mark.asyncio
async def test_bedrock_embedding_titan_caching(model, sync_mode):
    try:
        # this tests if we support str input for bedrock embedding
        litellm.set_verbose = True
@@ -70,13 +70,43 @@ def test_anthropic_optional_params(stop_sequence, expected_count):
def test_bedrock_optional_params_embeddings():
    litellm.drop_params = True
    optional_params = get_optional_params_embeddings(
        user="John", encoding_format=None, custom_llm_provider="bedrock"
        model="", user="John", encoding_format=None, custom_llm_provider="bedrock"
    )
    assert len(optional_params) == 0


@pytest.mark.parametrize(
    "model, expected_dimensions, dimensions_kwarg",
    [
        ("bedrock/amazon.titan-embed-text-v1", False, None),
        ("bedrock/amazon.titan-embed-image-v1", True, "embeddingConfig"),
        ("bedrock/amazon.titan-embed-text-v2:0", True, "dimensions"),
        ("bedrock/cohere.embed-multilingual-v3", False, None),
    ],
)
def test_bedrock_optional_params_embeddings_dimension(
    model, expected_dimensions, dimensions_kwarg
):
    litellm.drop_params = True
    optional_params = get_optional_params_embeddings(
        model=model,
        user="John",
        encoding_format=None,
        dimensions=20,
        custom_llm_provider="bedrock",
    )
    if expected_dimensions:
        assert len(optional_params) == 1
    else:
        assert len(optional_params) == 0

    if dimensions_kwarg is not None:
        assert dimensions_kwarg in optional_params


def test_google_ai_studio_optional_params_embeddings():
    optional_params = get_optional_params_embeddings(
        model="",
        user="John",
        encoding_format=None,
        custom_llm_provider="gemini",

@@ -88,7 +118,7 @@ def test_google_ai_studio_optional_params_embeddings():
def test_openai_optional_params_embeddings():
    litellm.drop_params = True
    optional_params = get_optional_params_embeddings(
        user="John", encoding_format=None, custom_llm_provider="openai"
        model="", user="John", encoding_format=None, custom_llm_provider="openai"
    )
    assert len(optional_params) == 1
    assert optional_params["user"] == "John"

@@ -97,7 +127,10 @@ def test_openai_optional_params_embeddings():
def test_azure_optional_params_embeddings():
    litellm.drop_params = True
    optional_params = get_optional_params_embeddings(
        user="John", encoding_format=None, custom_llm_provider="azure"
        model="chatgpt-v-2",
        user="John",
        encoding_format=None,
        custom_llm_provider="azure",
    )
    assert len(optional_params) == 1
    assert optional_params["user"] == "John"

@@ -455,6 +488,7 @@ def test_get_optional_params_image_gen():

def test_bedrock_optional_params_embeddings_provider_specific_params():
    optional_params = get_optional_params_embeddings(
        model="my-custom-model",
        custom_llm_provider="huggingface",
        wait_for_model=True,
    )
@@ -287,3 +287,11 @@ class AnthropicResponse(BaseModel):

    usage: AnthropicResponseUsageBlock
    """Billing and rate-limit usage."""


class AnthropicChatCompletionUsageBlock(TypedDict, total=False):
    prompt_tokens: Required[int]
    completion_tokens: Required[int]
    total_tokens: Required[int]
    cache_creation_input_tokens: int
    cache_read_input_tokens: int
@@ -2550,7 +2550,7 @@ def get_optional_params_image_gen(

def get_optional_params_embeddings(
    # 2 optional params
    model=None,
    model: str,
    user=None,
    encoding_format=None,
    dimensions=None,

@@ -2606,7 +2606,7 @@ def get_optional_params_embeddings(
        ):
            raise UnsupportedParamsError(
                status_code=500,
                message=f"Setting dimensions is not supported for OpenAI `text-embedding-3` and later models. To drop it from the call, set `litellm.drop_params = True`.",
                message="Setting dimensions is not supported for OpenAI `text-embedding-3` and later models. To drop it from the call, set `litellm.drop_params = True`.",
            )
    if custom_llm_provider == "triton":
        keys = list(non_default_params.keys())

@@ -2641,39 +2641,57 @@ def get_optional_params_embeddings(
        )
        final_params = {**optional_params, **kwargs}
        return final_params
    if custom_llm_provider == "vertex_ai":
        if len(non_default_params.keys()) > 0:
            if litellm.drop_params is True:  # drop the unsupported non-default values
                keys = list(non_default_params.keys())
                for k in keys:
                    non_default_params.pop(k, None)
                final_params = {**non_default_params, **kwargs}
                return final_params
            raise UnsupportedParamsError(
                status_code=500,
                message=f"Setting user/encoding format is not supported by {custom_llm_provider}. To drop it from the call, set `litellm.drop_params = True`.",
            )
    if custom_llm_provider == "bedrock":
        # if dimensions is in non_default_params -> pass it for model=bedrock/amazon.titan-embed-text-v2
        if (
            "dimensions" in non_default_params.keys()
            and "amazon.titan-embed-text-v2" in model
        ):
            kwargs["dimensions"] = non_default_params["dimensions"]
            non_default_params.pop("dimensions", None)
        if "amazon.titan-embed-text-v1" in model:
            object: Any = litellm.AmazonTitanG1Config()
        elif "amazon.titan-embed-image-v1" in model:
            object = litellm.AmazonTitanMultimodalEmbeddingG1Config()
        elif "amazon.titan-embed-text-v2:0" in model:
            object = litellm.AmazonTitanV2Config()
        elif "cohere.embed-multilingual-v3" in model:
            object = litellm.BedrockCohereEmbeddingConfig()
        else:  # unmapped model
            supported_params = []
            _check_valid_arg(supported_params=supported_params)
            final_params = {**kwargs}
            return final_params

        if len(non_default_params.keys()) > 0:
            if litellm.drop_params is True:  # drop the unsupported non-default values
                keys = list(non_default_params.keys())
                for k in keys:
                    non_default_params.pop(k, None)
                final_params = {**non_default_params, **kwargs}
                return final_params
            raise UnsupportedParamsError(
                status_code=500,
                message=f"Setting user/encoding format is not supported by {custom_llm_provider}. To drop it from the call, set `litellm.drop_params = True`.",
            )
        return {**non_default_params, **kwargs}
        supported_params = object.get_supported_openai_params()
        _check_valid_arg(supported_params=supported_params)
        optional_params = object.map_openai_params(
            non_default_params=non_default_params, optional_params={}
        )
        final_params = {**optional_params, **kwargs}
        return final_params
        # elif model == "amazon.titan-embed-image-v1":
        #     supported_params = litellm.AmazonTitanG1Config().get_supported_openai_params()
        #     _check_valid_arg(supported_params=supported_params)
        #     optional_params = litellm.AmazonTitanG1Config().map_openai_params(
        #         non_default_params=non_default_params, optional_params={}
        #     )
        #     final_params = {**optional_params, **kwargs}
        #     return final_params

        # if (
        #     "dimensions" in non_default_params.keys()
        #     and "amazon.titan-embed-text-v2" in model
        # ):
        #     kwargs["dimensions"] = non_default_params["dimensions"]
        #     non_default_params.pop("dimensions", None)

        # if len(non_default_params.keys()) > 0:
        #     if litellm.drop_params is True:  # drop the unsupported non-default values
        #         keys = list(non_default_params.keys())
        #         for k in keys:
        #             non_default_params.pop(k, None)
        #         final_params = {**non_default_params, **kwargs}
        #         return final_params
        #     raise UnsupportedParamsError(
        #         status_code=500,
        #         message=f"Setting user/encoding format is not supported by {custom_llm_provider}. To drop it from the call, set `litellm.drop_params = True`.",
        #     )
        # return {**non_default_params, **kwargs}
    if custom_llm_provider == "mistral":
        supported_params = get_supported_openai_params(
            model=model,

@@ -9888,11 +9906,7 @@ class CustomStreamWrapper:

                    if anthropic_response_obj["usage"] is not None:
                        model_response.usage = litellm.Usage(
                            prompt_tokens=anthropic_response_obj["usage"]["prompt_tokens"],
                            completion_tokens=anthropic_response_obj["usage"][
                                "completion_tokens"
                            ],
                            total_tokens=anthropic_response_obj["usage"]["total_tokens"],
                            **anthropic_response_obj["usage"]
                        )

                    if (

@@ -10507,10 +10521,10 @@ class CustomStreamWrapper:
                            original_chunk.system_fingerprint
                        )
                        print_verbose(f"self.sent_first_chunk: {self.sent_first_chunk}")
                        if self.sent_first_chunk == False:
                        if self.sent_first_chunk is False:
                            model_response.choices[0].delta["role"] = "assistant"
                            self.sent_first_chunk = True
                        elif self.sent_first_chunk == True and hasattr(
                        elif self.sent_first_chunk is True and hasattr(
                            model_response.choices[0].delta, "role"
                        ):
                            _initial_delta = model_response.choices[

@@ -10575,7 +10589,7 @@ class CustomStreamWrapper:
                    model_response.choices[0].delta.tool_calls is not None
                    or model_response.choices[0].delta.function_call is not None
                ):
                    if self.sent_first_chunk == False:
                    if self.sent_first_chunk is False:
                        model_response.choices[0].delta["role"] = "assistant"
                        self.sent_first_chunk = True
                    return model_response