mirror of https://github.com/BerriAI/litellm.git
synced 2025-04-26 19:24:27 +00:00

Merge branch 'main' into litellm_allow_turning_off_message_logging_for_callbacks

This commit is contained in: commit 02325f33d7

34 changed files with 442 additions and 117 deletions

@@ -25,6 +25,13 @@ model_list:
       model: openai/gpt-4o
       api_key: os.environ/OPENAI_API_KEY
       tags: ["paid"] # 👈 Key Change
+  - model_name: gpt-4
+    litellm_params:
+      model: openai/gpt-4o
+      api_key: os.environ/OPENAI_API_KEY
+      api_base: https://exampleopenaiendpoint-production.up.railway.app/
+      tags: ["default"] # OPTIONAL - All untagged requests will get routed to this
+
 router_settings:
   enable_tag_filtering: True # 👈 Key Change

@@ -136,6 +143,46 @@ Response
 }
 ```
 
+## Setting Default Tags
+
+Use this if you want all untagged requests to be routed to specific deployments
+
+1. Set default tag on your yaml
+```yaml
+  model_list:
+    - model_name: fake-openai-endpoint
+      litellm_params:
+        model: openai/fake
+        api_key: fake-key
+        api_base: https://exampleopenaiendpoint-production.up.railway.app/
+        tags: ["default"] # 👈 Key Change - All untagged requests will get routed to this
+      model_info:
+        id: "default-model" # used for identifying model in response headers
+```
+
+2. Start proxy
+```shell
+$ litellm --config /path/to/config.yaml
+```
+
+3. Make request with no tags
+```shell
+curl -i http://localhost:4000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer sk-1234" \
+  -d '{
+    "model": "fake-openai-endpoint",
+    "messages": [
+      {"role": "user", "content": "Hello, Claude gm!"}
+    ]
+  }'
+```
+
+Expect to see the following response header when this works
+```shell
+x-litellm-model-id: default-model
+```
+
 ## ✨ Team based tag routing (Enterprise)
 
 LiteLLM Proxy supports team-based tag routing, allowing you to associate specific tags with teams and route requests accordingly. Example **Team A can access gpt-4 deployment A, Team B can access gpt-4 deployment B** (LLM Access Control For Teams)

@@ -170,6 +217,12 @@ Here's how to set up and use team-based tag routing using curl commands:
       tags: ["teamB"] # 👈 Key Change
     model_info:
       id: "team-b-model" # used for identifying model in response headers
+  - model_name: fake-openai-endpoint
+    litellm_params:
+      model: openai/fake
+      api_key: fake-key
+      api_base: https://exampleopenaiendpoint-production.up.railway.app/
+      tags: ["default"] # OPTIONAL - All untagged requests will get routed to this
 
 router_settings:
   enable_tag_filtering: True # 👈 Key Change

@@ -208,8 +208,8 @@ curl -X POST 'http://0.0.0.0:4000/key/generate' \
 -d '{
     "metadata": {
         "logging": [{
-            "callback_name": "langfuse", # 'otel', 'langfuse', 'lunary'
-            "callback_type": "success" # set, if required by integration - future improvement, have logging tools work for success + failure by default
+            "callback_name": "langfuse", # "otel", "langfuse", "lunary"
+            "callback_type": "success", # "success", "failure", "success_and_failure"
             "callback_vars": {
                 "langfuse_public_key": "os.environ/LANGFUSE_PUBLIC_KEY", # [RECOMMENDED] reference key in proxy environment
                 "langfuse_secret_key": "os.environ/LANGFUSE_SECRET_KEY", # [RECOMMENDED] reference key in proxy environment

@@ -118,7 +118,7 @@ in_memory_llm_clients_cache: dict = {}
 safe_memory_mode: bool = False
 enable_azure_ad_token_refresh: Optional[bool] = False
 ### DEFAULT AZURE API VERSION ###
-AZURE_DEFAULT_API_VERSION = "2024-07-01-preview"  # this is updated to the latest
+AZURE_DEFAULT_API_VERSION = "2024-08-01-preview"  # this is updated to the latest
 ### COHERE EMBEDDINGS DEFAULT TYPE ###
 COHERE_DEFAULT_EMBEDDING_INPUT_TYPE = "search_document"
 ### GUARDRAILS ###

@@ -483,7 +483,12 @@ openai_compatible_providers: List = [
     "azure_ai",
     "github",
 ]
+openai_text_completion_compatible_providers: List = (
+    [  # providers that support `/v1/completions`
+        "together_ai",
+        "fireworks_ai",
+    ]
+)
 
 # well supported replicate llms
 replicate_models: List = [

@@ -863,7 +868,7 @@ from .llms.custom_llm import CustomLLM
 from .llms.huggingface_restapi import HuggingfaceConfig
 from .llms.anthropic.chat import AnthropicConfig
 from .llms.anthropic.completion import AnthropicTextConfig
-from .llms.databricks import DatabricksConfig, DatabricksEmbeddingConfig
+from .llms.databricks.chat import DatabricksConfig, DatabricksEmbeddingConfig
 from .llms.predibase import PredibaseConfig
 from .llms.replicate import ReplicateConfig
 from .llms.cohere.completion import CohereConfig

@@ -22,6 +22,9 @@ from litellm.litellm_core_utils.llm_cost_calc.utils import _generic_cost_per_cha
 from litellm.llms.anthropic.cost_calculation import (
     cost_per_token as anthropic_cost_per_token,
 )
+from litellm.llms.databricks.cost_calculator import (
+    cost_per_token as databricks_cost_per_token,
+)
 from litellm.rerank_api.types import RerankResponse
 from litellm.types.llms.openai import HttpxBinaryResponseContent
 from litellm.types.router import SPECIAL_MODEL_INFO_PARAMS

@@ -159,7 +162,7 @@ def cost_per_token(
         _, custom_llm_provider, _, _ = litellm.get_llm_provider(model=model)
 
     model_without_prefix = model
-    model_parts = model.split("/")
+    model_parts = model.split("/", 1)
     if len(model_parts) > 1:
         model_without_prefix = model_parts[1]
     else:

@@ -212,6 +215,8 @@ def cost_per_token(
         )
     elif custom_llm_provider == "anthropic":
         return anthropic_cost_per_token(model=model, usage=usage_block)
+    elif custom_llm_provider == "databricks":
+        return databricks_cost_per_token(model=model, usage=usage_block)
     elif custom_llm_provider == "gemini":
         return google_cost_per_token(
             model=model_without_prefix,

@@ -649,7 +649,9 @@ class OpenTelemetry(CustomLogger):
             return BatchSpanProcessor(
                 OTLPSpanExporterHTTP(
                     endpoint=self.OTEL_ENDPOINT, headers=_split_otel_headers
-                )
+                ),
+                max_queue_size=100,
+                max_export_batch_size=100,
             )
         elif self.OTEL_EXPORTER == "otlp_grpc":
             verbose_logger.debug(

@@ -659,7 +661,9 @@ class OpenTelemetry(CustomLogger):
             return BatchSpanProcessor(
                 OTLPSpanExporterGRPC(
                     endpoint=self.OTEL_ENDPOINT, headers=_split_otel_headers
-                )
+                ),
+                max_queue_size=100,
+                max_export_batch_size=100,
             )
         else:
             verbose_logger.debug(

@@ -2333,6 +2333,8 @@ def get_standard_logging_object_payload(
         completion_start_time_float = completion_start_time.timestamp()
     elif isinstance(completion_start_time, float):
         completion_start_time_float = completion_start_time
+    else:
+        completion_start_time_float = end_time_float
     # clean up litellm hidden params
     clean_hidden_params = StandardLoggingHiddenParams(
         model_id=None,

@@ -245,7 +245,10 @@ class AzureOpenAIConfig:
             - You should set tool_choice (see Forcing tool use) to instruct the model to explicitly use that tool
             - Remember that the model will pass the input to the tool, so the name of the tool and description should be from the model’s perspective.
             """
-            if json_schema is not None:
+            if json_schema is not None and (
+                (api_version_year <= "2024" and api_version_month < "08")
+                or "gpt-4o" not in model
+            ):  # azure api version "2024-08-01-preview" onwards supports 'json_schema' only for gpt-4o
                 _tool_choice = ChatCompletionToolChoiceObjectParam(
                     type="function",
                     function=ChatCompletionToolChoiceFunctionParam(

@@ -1263,6 +1263,7 @@ class OpenAIChatCompletion(BaseLLM):
 
             error_headers = getattr(e, "headers", None)
             if response is not None and hasattr(response, "text"):
+                error_headers = getattr(e, "headers", None)
                 raise OpenAIError(
                     status_code=500,
                     message=f"{str(e)}\n\nOriginal Response: {response.text}",

@@ -1800,12 +1801,11 @@ class OpenAITextCompletion(BaseLLM):
         headers: Optional[dict] = None,
     ):
         super().completion()
-        exception_mapping_worked = False
         try:
             if headers is None:
                 headers = self.validate_environment(api_key=api_key)
             if model is None or messages is None:
-                raise OpenAIError(status_code=422, message=f"Missing model or messages")
+                raise OpenAIError(status_code=422, message="Missing model or messages")
 
             if (
                 len(messages) > 0

@@ -162,11 +162,10 @@ class AzureTextCompletion(BaseLLM):
         client=None,
     ):
         super().completion()
-        exception_mapping_worked = False
         try:
             if model is None or messages is None:
                 raise AzureOpenAIError(
-                    status_code=422, message=f"Missing model or messages"
+                    status_code=422, message="Missing model or messages"
                 )
 
             max_retries = optional_params.pop("max_retries", 2)

@@ -293,7 +292,10 @@ class AzureTextCompletion(BaseLLM):
                     "api-version", api_version
                 )
 
-            response = azure_client.completions.create(**data, timeout=timeout)  # type: ignore
+            raw_response = azure_client.completions.with_raw_response.create(
+                **data, timeout=timeout
+            )
+            response = raw_response.parse()
             stringified_response = response.model_dump()
             ## LOGGING
             logging_obj.post_call(

@@ -380,13 +382,15 @@ class AzureTextCompletion(BaseLLM):
                     "complete_input_dict": data,
                 },
             )
-            response = await azure_client.completions.create(**data, timeout=timeout)
+            raw_response = await azure_client.completions.with_raw_response.create(
+                **data, timeout=timeout
+            )
+            response = raw_response.parse()
             return openai_text_completion_config.convert_to_chat_model_response_object(
                 response_object=response.model_dump(),
                 model_response_object=model_response,
             )
         except AzureOpenAIError as e:
-            exception_mapping_worked = True
             raise e
         except Exception as e:
             status_code = getattr(e, "status_code", 500)

@@ -736,7 +736,9 @@ class BedrockLLM(BaseAWSLLM):
 
         if (stream is not None and stream is True) and provider != "ai21":
             endpoint_url = f"{endpoint_url}/model/{modelId}/invoke-with-response-stream"
-            proxy_endpoint_url = f"{proxy_endpoint_url}/model/{modelId}/invoke-with-response-stream"
+            proxy_endpoint_url = (
+                f"{proxy_endpoint_url}/model/{modelId}/invoke-with-response-stream"
+            )
         else:
             endpoint_url = f"{endpoint_url}/model/{modelId}/invoke"
             proxy_endpoint_url = f"{proxy_endpoint_url}/model/{modelId}/invoke"

@@ -1268,7 +1270,7 @@ class AmazonConverseConfig:
                 if len(value) == 0:  # converse raises error for empty strings
                     continue
                 value = [value]
-            optional_params["stop_sequences"] = value
+            optional_params["stopSequences"] = value
         if param == "temperature":
             optional_params["temperature"] = value
         if param == "top_p":

@@ -29,8 +29,8 @@ from litellm.types.utils import (
 )
 from litellm.utils import CustomStreamWrapper, EmbeddingResponse, ModelResponse, Usage
 
-from .base import BaseLLM
-from .prompt_templates.factory import custom_prompt, prompt_factory
+from ..base import BaseLLM
+from ..prompt_templates.factory import custom_prompt, prompt_factory
 
 
 class DatabricksError(Exception):

@@ -328,6 +328,7 @@ class DatabricksChatCompletion(BaseLLM):
         api_base: str,
         custom_prompt_dict: dict,
         model_response: ModelResponse,
+        custom_llm_provider: str,
         print_verbose: Callable,
         encoding,
         api_key,

@@ -371,6 +372,8 @@ class DatabricksChatCompletion(BaseLLM):
             )
         response = ModelResponse(**response_json)
 
+        response.model = custom_llm_provider + "/" + response.model
+
         if base_model is not None:
             response._hidden_params["model"] = base_model
         return response

@@ -472,6 +475,7 @@ class DatabricksChatCompletion(BaseLLM):
                 data=data,
                 api_base=api_base,
                 custom_prompt_dict=custom_prompt_dict,
+                custom_llm_provider=custom_llm_provider,
                 model_response=model_response,
                 print_verbose=print_verbose,
                 encoding=encoding,

@@ -528,6 +532,8 @@ class DatabricksChatCompletion(BaseLLM):
 
         response = ModelResponse(**response_json)
 
+        response.model = custom_llm_provider + "/" + response.model
+
         if base_model is not None:
             response._hidden_params["model"] = base_model

39  litellm/llms/databricks/cost_calculator.py  (new file)
@@ -0,0 +1,39 @@
+"""
+Helper util for handling databricks-specific cost calculation
+- e.g.: handling 'dbrx-instruct-*'
+"""
+
+from typing import Tuple
+
+from litellm.types.utils import Usage
+from litellm.utils import get_model_info
+
+
+def cost_per_token(model: str, usage: Usage) -> Tuple[float, float]:
+    """
+    Calculates the cost per token for a given model, prompt tokens, and completion tokens.
+
+    Input:
+        - model: str, the model name without provider prefix
+        - usage: LiteLLM Usage block, containing anthropic caching information
+
+    Returns:
+        Tuple[float, float] - prompt_cost_in_usd, completion_cost_in_usd
+    """
+    base_model = model
+    if model.startswith("databricks/dbrx-instruct") or model.startswith(
+        "dbrx-instruct"
+    ):
+        base_model = "databricks-dbrx-instruct"
+
+    ## GET MODEL INFO
+    model_info = get_model_info(model=base_model, custom_llm_provider="databricks")
+
+    ## CALCULATE INPUT COST
+
+    prompt_cost: float = usage["prompt_tokens"] * model_info["input_cost_per_token"]
+
+    ## CALCULATE OUTPUT COST
+    completion_cost = usage["completion_tokens"] * model_info["output_cost_per_token"]
+
+    return prompt_cost, completion_cost

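A minimal usage sketch for the helper added above (illustrative only: the token counts are made up, and it assumes `get_model_info` has databricks pricing loaded):

```python
from litellm.types.utils import Usage
from litellm.llms.databricks.cost_calculator import cost_per_token

# Illustrative usage block; real calls pass the Usage object from a completion response.
usage = Usage(prompt_tokens=100, completion_tokens=25, total_tokens=125)

# 'dbrx-instruct' style names are normalized to "databricks-dbrx-instruct"
# before the model-info lookup, so both spellings price identically.
prompt_cost, completion_cost = cost_per_token(model="databricks/dbrx-instruct", usage=usage)
print(prompt_cost, completion_cost)
```
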
@@ -273,7 +273,7 @@ class SagemakerLLM(BaseAWSLLM):
         model_id = optional_params.get("model_id", None)
 
         if use_messages_api is True:
-            from litellm.llms.databricks import DatabricksChatCompletion
+            from litellm.llms.databricks.chat import DatabricksChatCompletion
 
             openai_like_chat_completions = DatabricksChatCompletion()
             inference_params["stream"] = True if stream is True else False

@@ -80,7 +80,7 @@ class VertexAIPartnerModels(BaseLLM):
             import vertexai
             from google.cloud import aiplatform
 
-            from litellm.llms.databricks import DatabricksChatCompletion
+            from litellm.llms.databricks.chat import DatabricksChatCompletion
             from litellm.llms.OpenAI.openai import OpenAIChatCompletion
             from litellm.llms.text_completion_codestral import CodestralTextCompletion
             from litellm.llms.vertex_ai_and_google_ai_studio.gemini.vertex_and_google_ai_studio_gemini import (

@@ -92,7 +92,7 @@ from .llms.cohere import chat as cohere_chat
 from .llms.cohere import completion as cohere_completion  # type: ignore
 from .llms.cohere import embed as cohere_embed
 from .llms.custom_llm import CustomLLM, custom_chat_llm_router
-from .llms.databricks import DatabricksChatCompletion
+from .llms.databricks.chat import DatabricksChatCompletion
 from .llms.huggingface_restapi import Huggingface
 from .llms.OpenAI.audio_transcriptions import OpenAIAudioTranscription
 from .llms.OpenAI.openai import OpenAIChatCompletion, OpenAITextCompletion

@@ -1013,7 +1013,10 @@ def completion(
             api_base = api_base or litellm.api_base or get_secret("AZURE_API_BASE")
 
             api_version = (
-                api_version or litellm.api_version or get_secret("AZURE_API_VERSION")
+                api_version
+                or litellm.api_version
+                or get_secret("AZURE_API_VERSION")
+                or litellm.AZURE_DEFAULT_API_VERSION
             )
 
             api_key = (

|
||||||
custom_llm_provider == "text-completion-openai"
|
custom_llm_provider == "text-completion-openai"
|
||||||
or "ft:babbage-002" in model
|
or "ft:babbage-002" in model
|
||||||
or "ft:davinci-002" in model # support for finetuned completion models
|
or "ft:davinci-002" in model # support for finetuned completion models
|
||||||
|
or custom_llm_provider
|
||||||
|
in litellm.openai_text_completion_compatible_providers
|
||||||
|
and kwargs.get("text_completion") is True
|
||||||
):
|
):
|
||||||
openai.api_type = "openai"
|
openai.api_type = "openai"
|
||||||
|
|
||||||
|
@@ -4099,8 +4105,8 @@ def text_completion(
 
     kwargs.pop("prompt", None)
 
-    if (
-        _model is not None and custom_llm_provider == "openai"
+    if _model is not None and (
+        custom_llm_provider == "openai"
     ):  # for openai compatible endpoints - e.g. vllm, call the native /v1/completions endpoint for text completion calls
         if _model not in litellm.open_ai_chat_completion_models:
             model = "text-completion-openai/" + _model

@@ -2512,16 +2512,16 @@
         "max_audio_length_hours": 8.4,
         "max_audio_per_prompt": 1,
         "max_pdf_size_mb": 30,
-        "input_cost_per_token": 0.00000035,
-        "input_cost_per_token_above_128k_tokens": 0.0000007,
-        "output_cost_per_token": 0.00000105,
-        "output_cost_per_token_above_128k_tokens": 0.0000021,
+        "input_cost_per_token": 0.000000075,
+        "input_cost_per_token_above_128k_tokens": 0.00000015,
+        "output_cost_per_token": 0.0000003,
+        "output_cost_per_token_above_128k_tokens": 0.0000006,
         "litellm_provider": "gemini",
         "mode": "chat",
         "supports_system_messages": true,
         "supports_function_calling": true,
         "supports_vision": true,
-        "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
+        "source": "https://ai.google.dev/pricing"
     },
     "gemini/gemini-1.5-flash-latest": {
         "max_tokens": 8192,

@@ -2533,16 +2533,16 @@
         "max_audio_length_hours": 8.4,
         "max_audio_per_prompt": 1,
         "max_pdf_size_mb": 30,
-        "input_cost_per_token": 0.00000035,
-        "input_cost_per_token_above_128k_tokens": 0.0000007,
-        "output_cost_per_token": 0.00000105,
-        "output_cost_per_token_above_128k_tokens": 0.0000021,
+        "input_cost_per_token": 0.000000075,
+        "input_cost_per_token_above_128k_tokens": 0.00000015,
+        "output_cost_per_token": 0.0000003,
+        "output_cost_per_token_above_128k_tokens": 0.0000006,
         "litellm_provider": "gemini",
         "mode": "chat",
         "supports_system_messages": true,
         "supports_function_calling": true,
         "supports_vision": true,
-        "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
+        "source": "https://ai.google.dev/pricing"
     },
     "gemini/gemini-pro": {
         "max_tokens": 8192,

@@ -1,16 +1,9 @@
 model_list:
-  - model_name: "anthropic/claude-3-5-sonnet-20240620"
+  - model_name: "gpt-turbo"
     litellm_params:
-      model: anthropic/claude-3-5-sonnet-20240620
-      # api_base: http://0.0.0.0:9000
-  - model_name: gpt-3.5-turbo
-    litellm_params:
-      model: openai/*
+      model: azure/chatgpt-v-2
+      api_key: os.environ/AZURE_API_KEY
+      api_base: os.environ/AZURE_API_BASE
 
-litellm_settings:
-  success_callback: ["s3"]
-  s3_callback_params:
-    s3_bucket_name: litellm-logs # AWS Bucket Name for S3
-    s3_region_name: us-west-2 # AWS Region Name for S3
-    s3_aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID # us os.environ/<variable name> to pass environment variables. This is AWS Access Key ID for S3
-    s3_aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY # AWS Secret Access Key for S3
+router_settings:
+  model_group_alias: {"gpt-4": "gpt-turbo"}

@@ -242,6 +242,9 @@ class LiteLLMRoutes(enum.Enum):
         "/v1/models",
         # token counter
         "/utils/token_counter",
+        # rerank
+        "/rerank",
+        "/v1/rerank",
     ]
 
     mapped_pass_through_routes: List = [

@@ -3,7 +3,7 @@
 import asyncio
 import logging
 import random
-from typing import Optional
+from typing import List, Optional
 
 import litellm
 from litellm._logging import print_verbose

@@ -36,6 +36,25 @@ def _clean_endpoint_data(endpoint_data: dict, details: Optional[bool] = True):
     )
 
 
+def filter_deployments_by_id(
+    model_list: List,
+) -> List:
+    seen_ids = set()
+    filtered_deployments = []
+
+    for deployment in model_list:
+        _model_info = deployment.get("model_info") or {}
+        _id = _model_info.get("id") or None
+        if _id is None:
+            continue
+
+        if _id not in seen_ids:
+            seen_ids.add(_id)
+            filtered_deployments.append(deployment)
+
+    return filtered_deployments
+
+
 async def _perform_health_check(model_list: list, details: Optional[bool] = True):
     """
     Perform a health check for each model in the list.

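A quick sketch of how the new de-duplication helper behaves. The deployment dicts and the import path are illustrative assumptions, not taken from the commit:

```python
from litellm.proxy.health_check import filter_deployments_by_id  # assumed import path

model_list = [
    {"model_name": "gpt-4", "model_info": {"id": "deploy-1"}},
    {"model_name": "my-alias", "model_info": {"id": "deploy-1"}},  # same id via an alias
    {"model_name": "gpt-4", "model_info": {"id": "deploy-2"}},
    {"model_name": "gpt-4"},  # no model_info/id -> skipped
]

# Only the first occurrence of each id is kept, so aliased duplicates
# are health-checked once instead of twice.
assert [d["model_info"]["id"] for d in filter_deployments_by_id(model_list)] == [
    "deploy-1",
    "deploy-2",
]
```
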
@@ -105,6 +124,9 @@ async def perform_health_check(
         _new_model_list = [x for x in model_list if x["model_name"] == model]
         model_list = _new_model_list
 
+    model_list = filter_deployments_by_id(
+        model_list=model_list
+    )  # filter duplicate deployments (e.g. when model alias'es are used)
     healthy_endpoints, unhealthy_endpoints = await _perform_health_check(
         model_list, details
     )

@@ -86,10 +86,11 @@ def convert_key_logging_metadata_to_callback(
             team_callback_settings_obj.success_callback = []
         if team_callback_settings_obj.failure_callback is None:
             team_callback_settings_obj.failure_callback = []
 
         if data.callback_name not in team_callback_settings_obj.success_callback:
             team_callback_settings_obj.success_callback.append(data.callback_name)
-        if data.callback_name in team_callback_settings_obj.failure_callback:
+
+        if data.callback_name not in team_callback_settings_obj.failure_callback:
             team_callback_settings_obj.failure_callback.append(data.callback_name)
 
     for var, value in data.callback_vars.items():

@@ -109,8 +109,8 @@ async def add_new_member(
             where={"user_id": user_info.user_id},  # type: ignore
             data={"teams": {"push": [team_id]}},
         )
+        if _returned_user is not None:
             returned_user = LiteLLM_UserTable(**_returned_user.model_dump())
     elif len(existing_user_row) > 1:
         raise HTTPException(
             status_code=400,

@@ -1,19 +1,19 @@
 model_list:
   - model_name: openai/*
     litellm_params:
-      model: gpt-3.5-turbo
+      model: openai/*
       api_key: os.environ/OPENAI_API_KEY
-
-litellm_settings:
-  success_callback: ["prometheus"]
-  failure_callback: ["prometheus"]
-
-guardrails:
-  - guardrail_name: "presidio-pre-guard"
+    model_info:
+      id: "good-openai"
+  - model_name: openai/*
     litellm_params:
-      guardrail: presidio # supported values: "aporia", "lakera", "presidio"
-      mode: "pre_call" # pre_call, during_call, post_call
-      output_parse_pii: True
+      model: openai/*
+      api_key: os.environ/non-exsitent-env-var
+      tags: ["bad-model"]
+    model_info:
+      id: "test-openai"
+
+
 
 litellm_settings:
   callbacks: ["otel"]

@@ -22,8 +22,16 @@ callback_settings:
   otel:
     message_logging: False
 
+router_settings:
+  enable_tag_filtering: True # 👈 Key Chang
+
+
 general_settings:
   master_key: sk-1234
   alerting: ["slack"]
   spend_report_frequency: "1d"
 
+
+litellm_settings:
+  success_callback: ["prometheus"]
+  failure_callback: ["prometheus"]

@@ -3690,7 +3690,7 @@ class Router:
                     exception=original_exception,
                 )
 
-            allowed_fails = _allowed_fails or self.allowed_fails
+            allowed_fails = _allowed_fails if _allowed_fails is not None else self.allowed_fails
 
             dt = get_utc_datetime()
             current_minute = dt.strftime("%H-%M")

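For context on why the `or` was dropped in the hunk above: with `or`, an explicit `_allowed_fails=0` is falsy and silently falls back to the router-level default. A small illustration with toy values (not router code):

```python
_allowed_fails = 0          # caller explicitly wants zero tolerated failures
default_allowed_fails = 3   # router-level default

# old behaviour: 0 is falsy, so the explicit override is ignored
old = _allowed_fails or default_allowed_fails
assert old == 3

# new behaviour: only None falls back to the default
new = _allowed_fails if _allowed_fails is not None else default_allowed_fails
assert new == 0
```
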
@@ -4556,6 +4556,27 @@ class Router:
                     ids.append(id)
         return ids
 
+    def _get_all_deployments(
+        self, model_name: str, model_alias: Optional[str] = None
+    ) -> List[DeploymentTypedDict]:
+        """
+        Return all deployments of a model name
+
+        Used for accurate 'get_model_list'.
+        """
+
+        returned_models: List[DeploymentTypedDict] = []
+        for model in self.model_list:
+            if model["model_name"] == model_name:
+                if model_alias is not None:
+                    alias_model = copy.deepcopy(model)
+                    alias_model["model_name"] = model_name
+                    returned_models.append(alias_model)
+                else:
+                    returned_models.append(model)
+
+        return returned_models
+
     def get_model_names(self) -> List[str]:
         """
         Returns all possible model names for router.

@@ -4567,15 +4588,18 @@ class Router:
     def get_model_list(
         self, model_name: Optional[str] = None
     ) -> Optional[List[DeploymentTypedDict]]:
+        """
+        Includes router model_group_alias'es as well
+        """
         if hasattr(self, "model_list"):
             returned_models: List[DeploymentTypedDict] = []
 
             for model_alias, model_value in self.model_group_alias.items():
-                model_alias_item = DeploymentTypedDict(
-                    model_name=model_alias,
-                    litellm_params=LiteLLMParamsTypedDict(model=model_value),
+                returned_models.extend(
+                    self._get_all_deployments(
+                        model_name=model_value, model_alias=model_alias
+                    )
                 )
-                returned_models.append(model_alias_item)
 
             if model_name is None:
                 returned_models += self.model_list

@@ -4583,8 +4607,7 @@ class Router:
                 return returned_models
 
             for model in self.model_list:
-                if model["model_name"] == model_name:
-                    returned_models.append(model)
+                returned_models.extend(self._get_all_deployments(model_name=model_name))
 
             return returned_models
         return None

@@ -1,5 +1,9 @@
 """
-Use this to route requests between free and paid tiers
+Use this to route requests between Teams
+
+- If tags in request is a subset of tags in deployment, return deployment
+- if deployments are set with default tags, return all default deployment
+- If no default_deployments are set, return all deployments
 """
 
 from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, TypedDict, Union

@@ -25,14 +29,14 @@ async def get_deployments_for_tag(
 
     if request_kwargs is None:
         verbose_logger.debug(
-            "get_deployments_for_tier: request_kwargs is None returning healthy_deployments: %s",
+            "get_deployments_for_tag: request_kwargs is None returning healthy_deployments: %s",
             healthy_deployments,
         )
         return healthy_deployments
 
     if healthy_deployments is None:
         verbose_logger.debug(
-            "get_deployments_for_tier: healthy_deployments is None returning healthy_deployments"
+            "get_deployments_for_tag: healthy_deployments is None returning healthy_deployments"
         )
         return healthy_deployments
 

@@ -43,7 +47,9 @@ async def get_deployments_for_tag(
 
     new_healthy_deployments = []
     if request_tags:
-        verbose_logger.debug("parameter routing: router_keys: %s", request_tags)
+        verbose_logger.debug(
+            "get_deployments_for_tag routing: router_keys: %s", request_tags
+        )
         # example this can be router_keys=["free", "custom"]
         # get all deployments that have a superset of these router keys
         for deployment in healthy_deployments:

|
@ -66,9 +72,26 @@ async def get_deployments_for_tag(
|
||||||
request_tags,
|
request_tags,
|
||||||
)
|
)
|
||||||
new_healthy_deployments.append(deployment)
|
new_healthy_deployments.append(deployment)
|
||||||
|
elif "default" in deployment_tags:
|
||||||
|
verbose_logger.debug(
|
||||||
|
"adding default deployment with tags: %s, request tags: %s",
|
||||||
|
deployment_tags,
|
||||||
|
request_tags,
|
||||||
|
)
|
||||||
|
new_healthy_deployments.append(deployment)
|
||||||
|
|
||||||
return new_healthy_deployments
|
return new_healthy_deployments
|
||||||
|
|
||||||
|
# for Untagged requests use default deployments if set
|
||||||
|
_default_deployments_with_tags = []
|
||||||
|
for deployment in healthy_deployments:
|
||||||
|
if "default" in deployment.get("litellm_params", {}).get("tags", []):
|
||||||
|
_default_deployments_with_tags.append(deployment)
|
||||||
|
|
||||||
|
if len(_default_deployments_with_tags) > 0:
|
||||||
|
return _default_deployments_with_tags
|
||||||
|
|
||||||
|
# if no default deployment is found, return healthy_deployments
|
||||||
verbose_logger.debug(
|
verbose_logger.debug(
|
||||||
"no tier found in metadata, returning healthy_deployments: %s",
|
"no tier found in metadata, returning healthy_deployments: %s",
|
||||||
healthy_deployments,
|
healthy_deployments,
|
||||||
|
|
|
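The routing rules described in the updated docstring, sketched with plain sets. This is illustrative only, not the proxy's implementation; the deployment names are made up:

```python
request_tags = {"teamA"}

deployments = {
    "team-a-model": {"teamA"},
    "team-b-model": {"teamB"},
    "default-model": {"default"},
}

# Rule 1: a deployment is eligible when the request tags are a subset of its tags.
eligible = [name for name, tags in deployments.items() if request_tags.issubset(tags)]
assert eligible == ["team-a-model"]

# Rule 2: an untagged request falls back to deployments tagged "default", if any exist.
untagged_eligible = [name for name, tags in deployments.items() if "default" in tags]
assert untagged_eligible == ["default-model"]
```
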
@@ -626,6 +626,8 @@ async def test_model_function_invoke(model, sync_mode, api_key, api_base):
             response = await litellm.acompletion(**data)
 
         print(f"response: {response}")
+    except litellm.InternalServerError:
+        pass
     except litellm.RateLimitError as e:
         pass
     except Exception as e:

@@ -889,18 +891,29 @@ def encode_image(image_path):
         return base64.b64encode(image_file.read()).decode("utf-8")
 
 
-@pytest.mark.skip(
-    reason="we already test claude-3, this is just another way to pass images"
-)
-def test_completion_claude_3_base64():
+@pytest.mark.parametrize(
+    "model",
+    [
+        "gpt-4o",
+        "azure/gpt-4o",
+        "anthropic/claude-3-opus-20240229",
+    ],
+)  #
+def test_completion_base64(model):
     try:
+        import base64
+
+        import requests
+
         litellm.set_verbose = True
-        litellm.num_retries = 3
-        image_path = "../proxy/cached_logo.jpg"
-        # Getting the base64 string
-        base64_image = encode_image(image_path)
+        url = "https://dummyimage.com/100/100/fff&text=Test+image"
+        response = requests.get(url)
+        file_data = response.content
+
+        encoded_file = base64.b64encode(file_data).decode("utf-8")
+        base64_image = f"data:image/png;base64,{encoded_file}"
         resp = litellm.completion(
-            model="anthropic/claude-3-opus-20240229",
+            model=model,
             messages=[
                 {
                     "role": "user",

@@ -908,9 +921,7 @@ def test_completion_claude_3_base64():
                         {"type": "text", "text": "Whats in this image?"},
                         {
                             "type": "image_url",
-                            "image_url": {
-                                "url": "data:image/jpeg;base64," + base64_image
-                            },
+                            "image_url": {"url": base64_image},
                         },
                     ],
                 }

@@ -919,7 +930,6 @@ def test_completion_claude_3_base64():
         print(f"\nResponse: {resp}")
 
         prompt_tokens = resp.usage.prompt_tokens
-        raise Exception("it worked!")
     except Exception as e:
         if "500 Internal error encountered.'" in str(e):
             pass

@@ -2174,15 +2184,16 @@ def test_completion_openai():
 
 
 @pytest.mark.parametrize(
-    "model",
+    "model, api_version",
     [
-        "gpt-4o-2024-08-06",
-        "azure/chatgpt-v-2",
-        "bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
+        ("gpt-4o-2024-08-06", None),
+        ("azure/chatgpt-v-2", None),
+        ("bedrock/anthropic.claude-3-sonnet-20240229-v1:0", None),
+        ("azure/gpt-4o", "2024-08-01-preview"),
     ],
 )
 @pytest.mark.flaky(retries=3, delay=1)
-def test_completion_openai_pydantic(model):
+def test_completion_openai_pydantic(model, api_version):
     try:
         litellm.set_verbose = True
         from pydantic import BaseModel

@@ -2207,6 +2218,7 @@ def test_completion_openai_pydantic(model):
                 messages=messages,
                 metadata={"hi": "bye"},
                 response_format=EventsList,
+                api_version=api_version,
             )
             break
         except litellm.JSONSchemaValidationError:

|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"model",
|
"model",
|
||||||
[
|
[
|
||||||
# "bedrock/cohere.command-r-plus-v1:0",
|
"bedrock/mistral.mistral-large-2407-v1:0",
|
||||||
|
"bedrock/cohere.command-r-plus-v1:0",
|
||||||
"anthropic.claude-3-sonnet-20240229-v1:0",
|
"anthropic.claude-3-sonnet-20240229-v1:0",
|
||||||
# "anthropic.claude-instant-v1",
|
"anthropic.claude-instant-v1",
|
||||||
# "bedrock/ai21.j2-mid",
|
"mistral.mistral-7b-instruct-v0:2",
|
||||||
# "mistral.mistral-7b-instruct-v0:2",
|
|
||||||
# "bedrock/amazon.titan-tg1-large",
|
# "bedrock/amazon.titan-tg1-large",
|
||||||
# "meta.llama3-8b-instruct-v1:0",
|
"meta.llama3-8b-instruct-v1:0",
|
||||||
# "cohere.command-text-v14",
|
"cohere.command-text-v14",
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
@pytest.mark.parametrize("sync_mode", [True, False])
|
@pytest.mark.parametrize("sync_mode", [True, False])
|
||||||
|
@@ -3491,6 +3503,7 @@ async def test_completion_bedrock_httpx_models(sync_mode, model):
             messages=[{"role": "user", "content": "Hey! how's it going?"}],
             temperature=0.2,
             max_tokens=200,
+            stop=["stop sequence"],
         )
 
         assert isinstance(response, litellm.ModelResponse)

|
||||||
messages=[{"role": "user", "content": "Hey! how's it going?"}],
|
messages=[{"role": "user", "content": "Hey! how's it going?"}],
|
||||||
temperature=0.2,
|
temperature=0.2,
|
||||||
max_tokens=100,
|
max_tokens=100,
|
||||||
|
stop=["stop sequence"],
|
||||||
)
|
)
|
||||||
|
|
||||||
assert isinstance(response, litellm.ModelResponse)
|
assert isinstance(response, litellm.ModelResponse)
|
||||||
|
|
|
@@ -1219,3 +1219,13 @@ def test_completion_cost_anthropic_prompt_caching():
     cost_2 = completion_cost(model=model, completion_response=response_2)
 
     assert cost_1 > cost_2
+
+
+def test_completion_cost_databricks():
+    model, messages = "databricks/databricks-dbrx-instruct", [
+        {"role": "user", "content": "What is 2+2?"}
+    ]
+
+    resp = litellm.completion(model=model, messages=messages)  # works fine
+
+    cost = completion_cost(completion_response=resp)

@@ -864,7 +864,7 @@ def _pre_call_utils(
         data["messages"] = [{"role": "user", "content": "Hello world"}]
         if streaming is True:
             data["stream"] = True
-        mapped_target = client.chat.completions.with_raw_response
+        mapped_target = client.chat.completions.with_raw_response  # type: ignore
         if sync_mode:
             original_function = litellm.completion
         else:

@@ -873,7 +873,7 @@ def _pre_call_utils(
         data["prompt"] = "Hello world"
         if streaming is True:
             data["stream"] = True
-        mapped_target = client.completions.with_raw_response
+        mapped_target = client.completions.with_raw_response  # type: ignore
         if sync_mode:
             original_function = litellm.text_completion
         else:

@@ -52,6 +52,7 @@ def get_current_weather(location, unit="fahrenheit"):
         # "anthropic.claude-3-sonnet-20240229-v1:0",
     ],
 )
+@pytest.mark.flaky(retries=3, delay=1)
 def test_aaparallel_function_call(model):
     try:
         litellm.set_verbose = True

@@ -1255,7 +1255,17 @@ async def test_add_callback_via_key(prisma_client):
 
 
 @pytest.mark.asyncio
-async def test_add_callback_via_key_litellm_pre_call_utils(prisma_client):
+@pytest.mark.parametrize(
+    "callback_type, expected_success_callbacks, expected_failure_callbacks",
+    [
+        ("success", ["langfuse"], []),
+        ("failure", [], ["langfuse"]),
+        ("success_and_failure", ["langfuse"], ["langfuse"]),
+    ],
+)
+async def test_add_callback_via_key_litellm_pre_call_utils(
+    prisma_client, callback_type, expected_success_callbacks, expected_failure_callbacks
+):
     import json
 
     from fastapi import HTTPException, Request, Response

@@ -1312,7 +1322,7 @@ async def test_add_callback_via_key_litellm_pre_call_utils(prisma_client):
                 "logging": [
                     {
                         "callback_name": "langfuse",
-                        "callback_type": "success",
+                        "callback_type": callback_type,
                         "callback_vars": {
                             "langfuse_public_key": "my-mock-public-key",
                             "langfuse_secret_key": "my-mock-secret-key",

@@ -1359,14 +1369,21 @@ async def test_add_callback_via_key_litellm_pre_call_utils(prisma_client):
     }
 
     new_data = await add_litellm_data_to_request(**data)
-    assert "success_callback" in new_data
-    assert new_data["success_callback"] == ["langfuse"]
+    print("NEW DATA: {}".format(new_data))
+
     assert "langfuse_public_key" in new_data
     assert new_data["langfuse_public_key"] == "my-mock-public-key"
     assert "langfuse_secret_key" in new_data
     assert new_data["langfuse_secret_key"] == "my-mock-secret-key"
 
+    if expected_success_callbacks:
+        assert "success_callback" in new_data
+        assert new_data["success_callback"] == expected_success_callbacks
+
+    if expected_failure_callbacks:
+        assert "failure_callback" in new_data
+        assert new_data["failure_callback"] == expected_failure_callbacks
+
 
 @pytest.mark.asyncio
 async def test_gemini_pass_through_endpoint():

@@ -91,3 +91,72 @@ async def test_router_free_paid_tier():
     print("response_extra_info: ", response_extra_info)
 
     assert response_extra_info["model_id"] == "very-expensive-model"
+
+
+@pytest.mark.asyncio()
+async def test_default_tagged_deployments():
+    """
+    - only use default deployment for untagged requests
+    - if a request has tag "default", use default deployment
+    """
+
+    router = litellm.Router(
+        model_list=[
+            {
+                "model_name": "gpt-4",
+                "litellm_params": {
+                    "model": "gpt-4o",
+                    "api_base": "https://exampleopenaiendpoint-production.up.railway.app/",
+                    "tags": ["default"],
+                },
+                "model_info": {"id": "default-model"},
+            },
+            {
+                "model_name": "gpt-4",
+                "litellm_params": {
+                    "model": "gpt-4o",
+                    "api_base": "https://exampleopenaiendpoint-production.up.railway.app/",
+                },
+                "model_info": {"id": "default-model-2"},
+            },
+            {
+                "model_name": "gpt-4",
+                "litellm_params": {
+                    "model": "gpt-4o-mini",
+                    "api_base": "https://exampleopenaiendpoint-production.up.railway.app/",
+                    "tags": ["teamA"],
+                },
+                "model_info": {"id": "very-expensive-model"},
+            },
+        ],
+        enable_tag_filtering=True,
+    )
+
+    for _ in range(5):
+        # Untagged request, this should pick model with id == "default-model"
+        response = await router.acompletion(
+            model="gpt-4",
+            messages=[{"role": "user", "content": "Tell me a joke."}],
+        )
+
+        print("Response: ", response)
+
+        response_extra_info = response._hidden_params
+        print("response_extra_info: ", response_extra_info)
+
+        assert response_extra_info["model_id"] == "default-model"
+
+    for _ in range(5):
+        # requests tagged with "default", this should pick model with id == "default-model"
+        response = await router.acompletion(
+            model="gpt-4",
+            messages=[{"role": "user", "content": "Tell me a joke."}],
+            metadata={"tags": ["default"]},
+        )
+
+        print("Response: ", response)
+
+        response_extra_info = response._hidden_params
+        print("response_extra_info: ", response_extra_info)
+
+        assert response_extra_info["model_id"] == "default-model"

@@ -4239,3 +4239,14 @@ def test_completion_vllm():
         mock_call.assert_called_once()
 
         assert "hello" in mock_call.call_args.kwargs["extra_body"]
+
+
+def test_completion_fireworks_ai_multiple_choices():
+    litellm.set_verbose = True
+    response = litellm.text_completion(
+        model="fireworks_ai/llama-v3p1-8b-instruct",
+        prompt=["halo", "hi", "halo", "hi"],
+    )
+    print(response.choices)
+
+    assert len(response.choices) == 4

@@ -2512,16 +2512,16 @@
         "max_audio_length_hours": 8.4,
         "max_audio_per_prompt": 1,
         "max_pdf_size_mb": 30,
-        "input_cost_per_token": 0.00000035,
-        "input_cost_per_token_above_128k_tokens": 0.0000007,
-        "output_cost_per_token": 0.00000105,
-        "output_cost_per_token_above_128k_tokens": 0.0000021,
+        "input_cost_per_token": 0.000000075,
+        "input_cost_per_token_above_128k_tokens": 0.00000015,
+        "output_cost_per_token": 0.0000003,
+        "output_cost_per_token_above_128k_tokens": 0.0000006,
         "litellm_provider": "gemini",
         "mode": "chat",
         "supports_system_messages": true,
         "supports_function_calling": true,
         "supports_vision": true,
-        "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
+        "source": "https://ai.google.dev/pricing"
     },
     "gemini/gemini-1.5-flash-latest": {
         "max_tokens": 8192,

@@ -2533,16 +2533,16 @@
         "max_audio_length_hours": 8.4,
         "max_audio_per_prompt": 1,
         "max_pdf_size_mb": 30,
-        "input_cost_per_token": 0.00000035,
-        "input_cost_per_token_above_128k_tokens": 0.0000007,
-        "output_cost_per_token": 0.00000105,
-        "output_cost_per_token_above_128k_tokens": 0.0000021,
+        "input_cost_per_token": 0.000000075,
+        "input_cost_per_token_above_128k_tokens": 0.00000015,
+        "output_cost_per_token": 0.0000003,
+        "output_cost_per_token_above_128k_tokens": 0.0000006,
         "litellm_provider": "gemini",
         "mode": "chat",
         "supports_system_messages": true,
         "supports_function_calling": true,
         "supports_vision": true,
-        "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
+        "source": "https://ai.google.dev/pricing"
     },
     "gemini/gemini-pro": {
         "max_tokens": 8192,

@@ -148,6 +148,7 @@ router_settings:
   redis_password: os.environ/REDIS_PASSWORD
   redis_port: os.environ/REDIS_PORT
   enable_pre_call_checks: true
+  model_group_alias: {"my-special-fake-model-alias-name": "fake-openai-endpoint-3"}
 
 general_settings:
   master_key: sk-1234 # [OPTIONAL] Use to enforce auth on proxy. See - https://docs.litellm.ai/docs/proxy/virtual_keys

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "litellm"
-version = "1.44.22"
+version = "1.44.23"
 description = "Library to easily interface with LLM API providers"
 authors = ["BerriAI"]
 license = "MIT"

@@ -91,7 +91,7 @@ requires = ["poetry-core", "wheel"]
 build-backend = "poetry.core.masonry.api"
 
 [tool.commitizen]
-version = "1.44.22"
+version = "1.44.23"
 version_files = [
     "pyproject.toml:^version"
 ]