Merge branch 'main' into litellm_allow_turning_off_message_logging_for_callbacks

Ishaan Jaff 2024-09-09 21:59:36 -07:00 committed by GitHub
commit 02325f33d7
34 changed files with 442 additions and 117 deletions


@ -25,6 +25,13 @@ model_list:
model: openai/gpt-4o
api_key: os.environ/OPENAI_API_KEY
tags: ["paid"] # 👈 Key Change
- model_name: gpt-4
litellm_params:
model: openai/gpt-4o
api_key: os.environ/OPENAI_API_KEY
api_base: https://exampleopenaiendpoint-production.up.railway.app/
tags: ["default"] # OPTIONAL - All untagged requests will get routed to this
router_settings:
enable_tag_filtering: True # 👈 Key Change
@ -136,6 +143,46 @@ Response
}
```
## Setting Default Tags
Use this if you want all untagged requests to be routed to specific deployments.
1. Set a default tag in your yaml
```yaml
model_list:
- model_name: fake-openai-endpoint
litellm_params:
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
tags: ["default"] # 👈 Key Change - All untagged requests will get routed to this
model_info:
id: "default-model" # used for identifying model in response headers
```
2. Start proxy
```shell
$ litellm --config /path/to/config.yaml
```
3. Make a request with no tags
```shell
curl -i http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"model": "fake-openai-endpoint",
"messages": [
{"role": "user", "content": "Hello, Claude gm!"}
]
}'
```
Expect to see the following response header when this works
```shell
x-litellm-model-id: default-model
```
## ✨ Team based tag routing (Enterprise)
LiteLLM Proxy supports team-based tag routing, allowing you to associate specific tags with teams and route requests accordingly. Example **Team A can access gpt-4 deployment A, Team B can access gpt-4 deployment B** (LLM Access Control For Teams)
@ -170,6 +217,12 @@ Here's how to set up and use team-based tag routing using curl commands:
tags: ["teamB"] # 👈 Key Change
model_info:
id: "team-b-model" # used for identifying model in response headers
- model_name: fake-openai-endpoint
litellm_params:
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
tags: ["default"] # OPTIONAL - All untagged requests will get routed to this
router_settings:
enable_tag_filtering: True # 👈 Key Change


@ -208,8 +208,8 @@ curl -X POST 'http://0.0.0.0:4000/key/generate' \
-d '{
"metadata": {
"logging": [{
"callback_name": "langfuse", # 'otel', 'langfuse', 'lunary'
"callback_type": "success" # set, if required by integration - future improvement, have logging tools work for success + failure by default
"callback_name": "langfuse", # "otel", "langfuse", "lunary"
"callback_type": "success", # "success", "failure", "success_and_failure"
"callback_vars": {
"langfuse_public_key": "os.environ/LANGFUSE_PUBLIC_KEY", # [RECOMMENDED] reference key in proxy environment
"langfuse_secret_key": "os.environ/LANGFUSE_SECRET_KEY", # [RECOMMENDED] reference key in proxy environment

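For reference, the new `callback_type` values above can be exercised end to end. A hedged Python sketch of the same `/key/generate` call using `success_and_failure`; the proxy URL and the `sk-1234` master key mirror the examples elsewhere in these docs and are assumptions here:

```python
import requests

# Sketch: generate a key whose Langfuse logging fires on both success and failure.
resp = requests.post(
    "http://0.0.0.0:4000/key/generate",
    headers={"Authorization": "Bearer sk-1234", "Content-Type": "application/json"},
    json={
        "metadata": {
            "logging": [
                {
                    "callback_name": "langfuse",            # "otel", "langfuse", "lunary"
                    "callback_type": "success_and_failure", # "success", "failure", "success_and_failure"
                    "callback_vars": {
                        "langfuse_public_key": "os.environ/LANGFUSE_PUBLIC_KEY",
                        "langfuse_secret_key": "os.environ/LANGFUSE_SECRET_KEY",
                    },
                }
            ]
        }
    },
)
print(resp.json())
```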

@ -118,7 +118,7 @@ in_memory_llm_clients_cache: dict = {}
safe_memory_mode: bool = False
enable_azure_ad_token_refresh: Optional[bool] = False
### DEFAULT AZURE API VERSION ###
AZURE_DEFAULT_API_VERSION = "2024-07-01-preview" # this is updated to the latest
AZURE_DEFAULT_API_VERSION = "2024-08-01-preview" # this is updated to the latest
### COHERE EMBEDDINGS DEFAULT TYPE ###
COHERE_DEFAULT_EMBEDDING_INPUT_TYPE = "search_document"
### GUARDRAILS ###
@ -483,7 +483,12 @@ openai_compatible_providers: List = [
"azure_ai",
"github",
]
openai_text_completion_compatible_providers: List = (
[ # providers that support `/v1/completions`
"together_ai",
"fireworks_ai",
]
)
# well supported replicate llms
replicate_models: List = [
@ -863,7 +868,7 @@ from .llms.custom_llm import CustomLLM
from .llms.huggingface_restapi import HuggingfaceConfig
from .llms.anthropic.chat import AnthropicConfig
from .llms.anthropic.completion import AnthropicTextConfig
from .llms.databricks import DatabricksConfig, DatabricksEmbeddingConfig
from .llms.databricks.chat import DatabricksConfig, DatabricksEmbeddingConfig
from .llms.predibase import PredibaseConfig
from .llms.replicate import ReplicateConfig
from .llms.cohere.completion import CohereConfig


@ -22,6 +22,9 @@ from litellm.litellm_core_utils.llm_cost_calc.utils import _generic_cost_per_cha
from litellm.llms.anthropic.cost_calculation import (
cost_per_token as anthropic_cost_per_token,
)
from litellm.llms.databricks.cost_calculator import (
cost_per_token as databricks_cost_per_token,
)
from litellm.rerank_api.types import RerankResponse
from litellm.types.llms.openai import HttpxBinaryResponseContent
from litellm.types.router import SPECIAL_MODEL_INFO_PARAMS
@ -159,7 +162,7 @@ def cost_per_token(
_, custom_llm_provider, _, _ = litellm.get_llm_provider(model=model)
model_without_prefix = model
model_parts = model.split("/")
model_parts = model.split("/", 1)
if len(model_parts) > 1:
model_without_prefix = model_parts[1]
else:
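A quick note on the `split("/", 1)` change above: provider-prefixed model names can themselves contain slashes, and the old unbounded split dropped everything after the second one. A minimal illustration (the model name is just an example):

```python
model = "vertex_ai/meta/llama3-405b-instruct-maas"

# old: split on every "/", take parts[1] -> "meta" (wrong lookup key)
print(model.split("/")[1])      # meta

# new: split only on the first "/", take parts[1] -> full model path
print(model.split("/", 1)[1])   # meta/llama3-405b-instruct-maas
```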
@ -212,6 +215,8 @@ def cost_per_token(
)
elif custom_llm_provider == "anthropic":
return anthropic_cost_per_token(model=model, usage=usage_block)
elif custom_llm_provider == "databricks":
return databricks_cost_per_token(model=model, usage=usage_block)
elif custom_llm_provider == "gemini":
return google_cost_per_token(
model=model_without_prefix,


@ -649,7 +649,9 @@ class OpenTelemetry(CustomLogger):
return BatchSpanProcessor(
OTLPSpanExporterHTTP(
endpoint=self.OTEL_ENDPOINT, headers=_split_otel_headers
)
),
max_queue_size=100,
max_export_batch_size=100,
)
elif self.OTEL_EXPORTER == "otlp_grpc":
verbose_logger.debug(
@ -659,7 +661,9 @@ class OpenTelemetry(CustomLogger):
return BatchSpanProcessor(
OTLPSpanExporterGRPC(
endpoint=self.OTEL_ENDPOINT, headers=_split_otel_headers
)
),
max_queue_size=100,
max_export_batch_size=100,
)
else:
verbose_logger.debug(


@ -2333,6 +2333,8 @@ def get_standard_logging_object_payload(
completion_start_time_float = completion_start_time.timestamp()
elif isinstance(completion_start_time, float):
completion_start_time_float = completion_start_time
else:
completion_start_time_float = end_time_float
# clean up litellm hidden params
clean_hidden_params = StandardLoggingHiddenParams(
model_id=None,


@ -245,7 +245,10 @@ class AzureOpenAIConfig:
- You should set tool_choice (see Forcing tool use) to instruct the model to explicitly use that tool
- Remember that the model will pass the input to the tool, so the name of the tool and description should be from the model's perspective.
"""
if json_schema is not None:
if json_schema is not None and (
(api_version_year <= "2024" and api_version_month < "08")
or "gpt-4o" not in model
): # azure api version "2024-08-01-preview" onwards supports 'json_schema' only for gpt-4o
_tool_choice = ChatCompletionToolChoiceObjectParam(
type="function",
function=ChatCompletionToolChoiceFunctionParam(
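For context on the version gate added here: per the inline comment, Azure only honours `json_schema` from api version `2024-08-01-preview` onwards, and only for gpt-4o models, so anything older (or any non-gpt-4o model) keeps the forced tool-call emulation. A rough sketch of how the zero-padded string comparison behaves; splitting the year and month out of the version string is an assumption, since the real code receives `api_version_year` and `api_version_month` already parsed:

```python
model, api_version = "azure/gpt-4o", "2024-08-01-preview"
year, month = api_version.split("-")[:2]

# True -> fall back to the forced-tool-call path; False -> send json_schema natively
fallback = (year <= "2024" and month < "08") or "gpt-4o" not in model
print(fallback)  # False: 2024-08+ and a gpt-4o model

year, month = "2024", "05"  # an older preview version
print((year <= "2024" and month < "08") or "gpt-4o" not in model)  # True: emulate via tools
```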


@ -1263,6 +1263,7 @@ class OpenAIChatCompletion(BaseLLM):
error_headers = getattr(e, "headers", None)
if response is not None and hasattr(response, "text"):
error_headers = getattr(e, "headers", None)
raise OpenAIError(
status_code=500,
message=f"{str(e)}\n\nOriginal Response: {response.text}",
@ -1800,12 +1801,11 @@ class OpenAITextCompletion(BaseLLM):
headers: Optional[dict] = None,
):
super().completion()
exception_mapping_worked = False
try:
if headers is None:
headers = self.validate_environment(api_key=api_key)
if model is None or messages is None:
raise OpenAIError(status_code=422, message=f"Missing model or messages")
raise OpenAIError(status_code=422, message="Missing model or messages")
if (
len(messages) > 0


@ -162,11 +162,10 @@ class AzureTextCompletion(BaseLLM):
client=None,
):
super().completion()
exception_mapping_worked = False
try:
if model is None or messages is None:
raise AzureOpenAIError(
status_code=422, message=f"Missing model or messages"
status_code=422, message="Missing model or messages"
)
max_retries = optional_params.pop("max_retries", 2)
@ -293,7 +292,10 @@ class AzureTextCompletion(BaseLLM):
"api-version", api_version
)
response = azure_client.completions.create(**data, timeout=timeout) # type: ignore
raw_response = azure_client.completions.with_raw_response.create(
**data, timeout=timeout
)
response = raw_response.parse()
stringified_response = response.model_dump()
## LOGGING
logging_obj.post_call(
@ -380,13 +382,15 @@ class AzureTextCompletion(BaseLLM):
"complete_input_dict": data,
},
)
response = await azure_client.completions.create(**data, timeout=timeout)
raw_response = await azure_client.completions.with_raw_response.create(
**data, timeout=timeout
)
response = raw_response.parse()
return openai_text_completion_config.convert_to_chat_model_response_object(
response_object=response.model_dump(),
model_response_object=model_response,
)
except AzureOpenAIError as e:
exception_mapping_worked = True
raise e
except Exception as e:
status_code = getattr(e, "status_code", 500)


@ -736,7 +736,9 @@ class BedrockLLM(BaseAWSLLM):
if (stream is not None and stream is True) and provider != "ai21":
endpoint_url = f"{endpoint_url}/model/{modelId}/invoke-with-response-stream"
proxy_endpoint_url = f"{proxy_endpoint_url}/model/{modelId}/invoke-with-response-stream"
proxy_endpoint_url = (
f"{proxy_endpoint_url}/model/{modelId}/invoke-with-response-stream"
)
else:
endpoint_url = f"{endpoint_url}/model/{modelId}/invoke"
proxy_endpoint_url = f"{proxy_endpoint_url}/model/{modelId}/invoke"
@ -1268,7 +1270,7 @@ class AmazonConverseConfig:
if len(value) == 0: # converse raises error for empty strings
continue
value = [value]
optional_params["stop_sequences"] = value
optional_params["stopSequences"] = value
if param == "temperature":
optional_params["temperature"] = value
if param == "top_p":


@ -29,8 +29,8 @@ from litellm.types.utils import (
)
from litellm.utils import CustomStreamWrapper, EmbeddingResponse, ModelResponse, Usage
from .base import BaseLLM
from .prompt_templates.factory import custom_prompt, prompt_factory
from ..base import BaseLLM
from ..prompt_templates.factory import custom_prompt, prompt_factory
class DatabricksError(Exception):
@ -328,6 +328,7 @@ class DatabricksChatCompletion(BaseLLM):
api_base: str,
custom_prompt_dict: dict,
model_response: ModelResponse,
custom_llm_provider: str,
print_verbose: Callable,
encoding,
api_key,
@ -371,6 +372,8 @@ class DatabricksChatCompletion(BaseLLM):
)
response = ModelResponse(**response_json)
response.model = custom_llm_provider + "/" + response.model
if base_model is not None:
response._hidden_params["model"] = base_model
return response
@ -472,6 +475,7 @@ class DatabricksChatCompletion(BaseLLM):
data=data,
api_base=api_base,
custom_prompt_dict=custom_prompt_dict,
custom_llm_provider=custom_llm_provider,
model_response=model_response,
print_verbose=print_verbose,
encoding=encoding,
@ -528,6 +532,8 @@ class DatabricksChatCompletion(BaseLLM):
response = ModelResponse(**response_json)
response.model = custom_llm_provider + "/" + response.model
if base_model is not None:
response._hidden_params["model"] = base_model


@ -0,0 +1,39 @@
"""
Helper util for handling databricks-specific cost calculation
- e.g.: handling 'dbrx-instruct-*'
"""
from typing import Tuple
from litellm.types.utils import Usage
from litellm.utils import get_model_info
def cost_per_token(model: str, usage: Usage) -> Tuple[float, float]:
"""
Calculates the cost per token for a given model, prompt tokens, and completion tokens.
Input:
- model: str, the model name without provider prefix
- usage: LiteLLM Usage block, containing anthropic caching information
Returns:
Tuple[float, float] - prompt_cost_in_usd, completion_cost_in_usd
"""
base_model = model
if model.startswith("databricks/dbrx-instruct") or model.startswith(
"dbrx-instruct"
):
base_model = "databricks-dbrx-instruct"
## GET MODEL INFO
model_info = get_model_info(model=base_model, custom_llm_provider="databricks")
## CALCULATE INPUT COST
prompt_cost: float = usage["prompt_tokens"] * model_info["input_cost_per_token"]
## CALCULATE OUTPUT COST
completion_cost = usage["completion_tokens"] * model_info["output_cost_per_token"]
return prompt_cost, completion_cost
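A hedged usage sketch for the new helper. The `Usage` keyword arguments are assumed to match the standard constructor, and the resulting prices depend on whatever `get_model_info` returns for `databricks-dbrx-instruct`:

```python
from litellm.types.utils import Usage
from litellm.llms.databricks.cost_calculator import cost_per_token

usage = Usage(prompt_tokens=1_000, completion_tokens=200, total_tokens=1_200)

# "dbrx-instruct*" names are remapped to "databricks-dbrx-instruct" before the
# model-info lookup, per the prefix check above.
prompt_cost, completion_cost = cost_per_token(model="dbrx-instruct", usage=usage)
print(prompt_cost, completion_cost)
```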


@ -273,7 +273,7 @@ class SagemakerLLM(BaseAWSLLM):
model_id = optional_params.get("model_id", None)
if use_messages_api is True:
from litellm.llms.databricks import DatabricksChatCompletion
from litellm.llms.databricks.chat import DatabricksChatCompletion
openai_like_chat_completions = DatabricksChatCompletion()
inference_params["stream"] = True if stream is True else False


@ -80,7 +80,7 @@ class VertexAIPartnerModels(BaseLLM):
import vertexai
from google.cloud import aiplatform
from litellm.llms.databricks import DatabricksChatCompletion
from litellm.llms.databricks.chat import DatabricksChatCompletion
from litellm.llms.OpenAI.openai import OpenAIChatCompletion
from litellm.llms.text_completion_codestral import CodestralTextCompletion
from litellm.llms.vertex_ai_and_google_ai_studio.gemini.vertex_and_google_ai_studio_gemini import (


@ -92,7 +92,7 @@ from .llms.cohere import chat as cohere_chat
from .llms.cohere import completion as cohere_completion # type: ignore
from .llms.cohere import embed as cohere_embed
from .llms.custom_llm import CustomLLM, custom_chat_llm_router
from .llms.databricks import DatabricksChatCompletion
from .llms.databricks.chat import DatabricksChatCompletion
from .llms.huggingface_restapi import Huggingface
from .llms.OpenAI.audio_transcriptions import OpenAIAudioTranscription
from .llms.OpenAI.openai import OpenAIChatCompletion, OpenAITextCompletion
@ -1013,7 +1013,10 @@ def completion(
api_base = api_base or litellm.api_base or get_secret("AZURE_API_BASE")
api_version = (
api_version or litellm.api_version or get_secret("AZURE_API_VERSION")
api_version
or litellm.api_version
or get_secret("AZURE_API_VERSION")
or litellm.AZURE_DEFAULT_API_VERSION
)
api_key = (
@ -1209,6 +1212,9 @@ def completion(
custom_llm_provider == "text-completion-openai"
or "ft:babbage-002" in model
or "ft:davinci-002" in model # support for finetuned completion models
or custom_llm_provider
in litellm.openai_text_completion_compatible_providers
and kwargs.get("text_completion") is True
):
openai.api_type = "openai"
@ -4099,8 +4105,8 @@ def text_completion(
kwargs.pop("prompt", None)
if (
_model is not None and custom_llm_provider == "openai"
if _model is not None and (
custom_llm_provider == "openai"
): # for openai compatible endpoints - e.g. vllm, call the native /v1/completions endpoint for text completion calls
if _model not in litellm.open_ai_chat_completion_models:
model = "text-completion-openai/" + _model


@ -2512,16 +2512,16 @@
"max_audio_length_hours": 8.4,
"max_audio_per_prompt": 1,
"max_pdf_size_mb": 30,
"input_cost_per_token": 0.00000035,
"input_cost_per_token_above_128k_tokens": 0.0000007,
"output_cost_per_token": 0.00000105,
"output_cost_per_token_above_128k_tokens": 0.0000021,
"input_cost_per_token": 0.000000075,
"input_cost_per_token_above_128k_tokens": 0.00000015,
"output_cost_per_token": 0.0000003,
"output_cost_per_token_above_128k_tokens": 0.0000006,
"litellm_provider": "gemini",
"mode": "chat",
"supports_system_messages": true,
"supports_function_calling": true,
"supports_vision": true,
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
"source": "https://ai.google.dev/pricing"
},
"gemini/gemini-1.5-flash-latest": {
"max_tokens": 8192,
@ -2533,16 +2533,16 @@
"max_audio_length_hours": 8.4,
"max_audio_per_prompt": 1,
"max_pdf_size_mb": 30,
"input_cost_per_token": 0.00000035,
"input_cost_per_token_above_128k_tokens": 0.0000007,
"output_cost_per_token": 0.00000105,
"output_cost_per_token_above_128k_tokens": 0.0000021,
"input_cost_per_token": 0.000000075,
"input_cost_per_token_above_128k_tokens": 0.00000015,
"output_cost_per_token": 0.0000003,
"output_cost_per_token_above_128k_tokens": 0.0000006,
"litellm_provider": "gemini",
"mode": "chat",
"supports_system_messages": true,
"supports_function_calling": true,
"supports_vision": true,
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
"source": "https://ai.google.dev/pricing"
},
"gemini/gemini-pro": {
"max_tokens": 8192,


@ -1,16 +1,9 @@
model_list:
- model_name: "anthropic/claude-3-5-sonnet-20240620"
- model_name: "gpt-turbo"
litellm_params:
model: anthropic/claude-3-5-sonnet-20240620
# api_base: http://0.0.0.0:9000
- model_name: gpt-3.5-turbo
litellm_params:
model: openai/*
model: azure/chatgpt-v-2
api_key: os.environ/AZURE_API_KEY
api_base: os.environ/AZURE_API_BASE
litellm_settings:
success_callback: ["s3"]
s3_callback_params:
s3_bucket_name: litellm-logs # AWS Bucket Name for S3
s3_region_name: us-west-2 # AWS Region Name for S3
s3_aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID # use os.environ/<variable name> to pass environment variables. This is AWS Access Key ID for S3
s3_aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY # AWS Secret Access Key for S3
router_settings:
model_group_alias: {"gpt-4": "gpt-turbo"}


@ -242,6 +242,9 @@ class LiteLLMRoutes(enum.Enum):
"/v1/models",
# token counter
"/utils/token_counter",
# rerank
"/rerank",
"/v1/rerank",
]
mapped_pass_through_routes: List = [


@ -3,7 +3,7 @@
import asyncio
import logging
import random
from typing import Optional
from typing import List, Optional
import litellm
from litellm._logging import print_verbose
@ -36,6 +36,25 @@ def _clean_endpoint_data(endpoint_data: dict, details: Optional[bool] = True):
)
def filter_deployments_by_id(
model_list: List,
) -> List:
seen_ids = set()
filtered_deployments = []
for deployment in model_list:
_model_info = deployment.get("model_info") or {}
_id = _model_info.get("id") or None
if _id is None:
continue
if _id not in seen_ids:
seen_ids.add(_id)
filtered_deployments.append(deployment)
return filtered_deployments
async def _perform_health_check(model_list: list, details: Optional[bool] = True):
"""
Perform a health check for each model in the list.
@ -105,6 +124,9 @@ async def perform_health_check(
_new_model_list = [x for x in model_list if x["model_name"] == model]
model_list = _new_model_list
model_list = filter_deployments_by_id(
model_list=model_list
) # filter duplicate deployments (e.g. when model alias'es are used)
healthy_endpoints, unhealthy_endpoints = await _perform_health_check(
model_list, details
)
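A small sketch of what `filter_deployments_by_id` (defined above) does when model aliases produce two health-check entries that share a `model_info.id`; the deployment dicts are made up for illustration:

```python
# filter_deployments_by_id is the helper added in the diff above
model_list = [
    {"model_name": "gpt-4", "model_info": {"id": "deployment-1"}},
    {"model_name": "my-gpt-4-alias", "model_info": {"id": "deployment-1"}},  # alias duplicate
    {"model_name": "gpt-4o", "model_info": {"id": "deployment-2"}},
]

filtered = filter_deployments_by_id(model_list=model_list)

# only the first entry per id survives, so each deployment is health-checked once
assert [d["model_info"]["id"] for d in filtered] == ["deployment-1", "deployment-2"]
```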


@ -86,10 +86,11 @@ def convert_key_logging_metadata_to_callback(
team_callback_settings_obj.success_callback = []
if team_callback_settings_obj.failure_callback is None:
team_callback_settings_obj.failure_callback = []
if data.callback_name not in team_callback_settings_obj.success_callback:
team_callback_settings_obj.success_callback.append(data.callback_name)
if data.callback_name in team_callback_settings_obj.failure_callback:
if data.callback_name not in team_callback_settings_obj.failure_callback:
team_callback_settings_obj.failure_callback.append(data.callback_name)
for var, value in data.callback_vars.items():


@ -109,7 +109,7 @@ async def add_new_member(
where={"user_id": user_info.user_id}, # type: ignore
data={"teams": {"push": [team_id]}},
)
if _returned_user is not None:
returned_user = LiteLLM_UserTable(**_returned_user.model_dump())
elif len(existing_user_row) > 1:
raise HTTPException(


@ -1,19 +1,19 @@
model_list:
- model_name: openai/*
litellm_params:
model: gpt-3.5-turbo
model: openai/*
api_key: os.environ/OPENAI_API_KEY
litellm_settings:
success_callback: ["prometheus"]
failure_callback: ["prometheus"]
guardrails:
- guardrail_name: "presidio-pre-guard"
model_info:
id: "good-openai"
- model_name: openai/*
litellm_params:
guardrail: presidio # supported values: "aporia", "lakera", "presidio"
mode: "pre_call" # pre_call, during_call, post_call
output_parse_pii: True
model: openai/*
api_key: os.environ/non-exsitent-env-var
tags: ["bad-model"]
model_info:
id: "test-openai"
litellm_settings:
callbacks: ["otel"]
@ -22,8 +22,16 @@ callback_settings:
otel:
message_logging: False
router_settings:
enable_tag_filtering: True # 👈 Key Change
general_settings:
master_key: sk-1234
alerting: ["slack"]
spend_report_frequency: "1d"
litellm_settings:
success_callback: ["prometheus"]
failure_callback: ["prometheus"]


@ -3690,7 +3690,7 @@ class Router:
exception=original_exception,
)
allowed_fails = _allowed_fails or self.allowed_fails
allowed_fails = _allowed_fails if _allowed_fails is not None else self.allowed_fails
dt = get_utc_datetime()
current_minute = dt.strftime("%H-%M")
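The `allowed_fails` change above fixes a falsy-value bug: with `or`, a deployment configured with `allowed_fails=0` was silently replaced by the router default. A two-line illustration:

```python
_allowed_fails, router_default = 0, 3

print(_allowed_fails or router_default)                                  # 3 (0 is falsy, default wins)
print(_allowed_fails if _allowed_fails is not None else router_default)  # 0 (explicit value respected)
```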
@ -4556,6 +4556,27 @@ class Router:
ids.append(id)
return ids
def _get_all_deployments(
self, model_name: str, model_alias: Optional[str] = None
) -> List[DeploymentTypedDict]:
"""
Return all deployments of a model name
Used for accurate 'get_model_list'.
"""
returned_models: List[DeploymentTypedDict] = []
for model in self.model_list:
if model["model_name"] == model_name:
if model_alias is not None:
alias_model = copy.deepcopy(model)
alias_model["model_name"] = model_name
returned_models.append(alias_model)
else:
returned_models.append(model)
return returned_models
def get_model_names(self) -> List[str]:
"""
Returns all possible model names for router.
@ -4567,15 +4588,18 @@ class Router:
def get_model_list(
self, model_name: Optional[str] = None
) -> Optional[List[DeploymentTypedDict]]:
"""
Includes router model_group_alias'es as well
"""
if hasattr(self, "model_list"):
returned_models: List[DeploymentTypedDict] = []
for model_alias, model_value in self.model_group_alias.items():
model_alias_item = DeploymentTypedDict(
model_name=model_alias,
litellm_params=LiteLLMParamsTypedDict(model=model_value),
returned_models.extend(
self._get_all_deployments(
model_name=model_value, model_alias=model_alias
)
)
returned_models.append(model_alias_item)
if model_name is None:
returned_models += self.model_list
@ -4583,8 +4607,7 @@ class Router:
return returned_models
for model in self.model_list:
if model["model_name"] == model_name:
returned_models.append(model)
returned_models.extend(self._get_all_deployments(model_name=model_name))
return returned_models
return None
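A hedged sketch of how the reworked `get_model_list` resolves a `model_group_alias` into the real deployments behind it rather than a single synthetic entry. The model names, fake keys, and the alias mapping are illustrative:

```python
import litellm

router = litellm.Router(
    model_list=[
        {"model_name": "gpt-turbo", "litellm_params": {"model": "openai/gpt-4o", "api_key": "sk-fake-1"}},
        {"model_name": "gpt-turbo", "litellm_params": {"model": "openai/gpt-4o-mini", "api_key": "sk-fake-2"}},
    ],
    model_group_alias={"gpt-4": "gpt-turbo"},
)

# Both underlying "gpt-turbo" deployments are returned for the alias,
# instead of one placeholder entry as before.
print(len(router.get_model_list(model_name="gpt-4")))  # expected: 2
```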


@ -1,5 +1,9 @@
"""
Use this to route requests between free and paid tiers
Use this to route requests between Teams
- If the tags in the request are a subset of a deployment's tags, return that deployment
- If deployments are set with default tags, return all default deployments
- If no default deployments are set, return all deployments
"""
from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, TypedDict, Union
@ -25,14 +29,14 @@ async def get_deployments_for_tag(
if request_kwargs is None:
verbose_logger.debug(
"get_deployments_for_tier: request_kwargs is None returning healthy_deployments: %s",
"get_deployments_for_tag: request_kwargs is None returning healthy_deployments: %s",
healthy_deployments,
)
return healthy_deployments
if healthy_deployments is None:
verbose_logger.debug(
"get_deployments_for_tier: healthy_deployments is None returning healthy_deployments"
"get_deployments_for_tag: healthy_deployments is None returning healthy_deployments"
)
return healthy_deployments
@ -43,7 +47,9 @@ async def get_deployments_for_tag(
new_healthy_deployments = []
if request_tags:
verbose_logger.debug("parameter routing: router_keys: %s", request_tags)
verbose_logger.debug(
"get_deployments_for_tag routing: router_keys: %s", request_tags
)
# example this can be router_keys=["free", "custom"]
# get all deployments that have a superset of these router keys
for deployment in healthy_deployments:
@ -66,9 +72,26 @@ async def get_deployments_for_tag(
request_tags,
)
new_healthy_deployments.append(deployment)
elif "default" in deployment_tags:
verbose_logger.debug(
"adding default deployment with tags: %s, request tags: %s",
deployment_tags,
request_tags,
)
new_healthy_deployments.append(deployment)
return new_healthy_deployments
# for Untagged requests use default deployments if set
_default_deployments_with_tags = []
for deployment in healthy_deployments:
if "default" in deployment.get("litellm_params", {}).get("tags", []):
_default_deployments_with_tags.append(deployment)
if len(_default_deployments_with_tags) > 0:
return _default_deployments_with_tags
# if no default deployment is found, return healthy_deployments
verbose_logger.debug(
"no tier found in metadata, returning healthy_deployments: %s",
healthy_deployments,
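To make the three branches of the tag filter easier to follow, here is a condensed, synchronous sketch of the selection rules implemented above; it is an approximation of the logic, not the actual `get_deployments_for_tag` coroutine:

```python
def pick_deployments(request_tags, healthy_deployments):
    def tags_of(d):
        return d.get("litellm_params", {}).get("tags", [])

    if request_tags:
        # tagged request: deployments whose tags cover the request's tags,
        # plus anything explicitly tagged "default"
        return [
            d for d in healthy_deployments
            if set(request_tags).issubset(tags_of(d)) or "default" in tags_of(d)
        ]

    # untagged request: prefer "default"-tagged deployments when any exist,
    # otherwise fall back to all healthy deployments
    defaults = [d for d in healthy_deployments if "default" in tags_of(d)]
    return defaults or healthy_deployments
```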


@ -626,6 +626,8 @@ async def test_model_function_invoke(model, sync_mode, api_key, api_base):
response = await litellm.acompletion(**data)
print(f"response: {response}")
except litellm.InternalServerError:
pass
except litellm.RateLimitError as e:
pass
except Exception as e:
@ -889,18 +891,29 @@ def encode_image(image_path):
return base64.b64encode(image_file.read()).decode("utf-8")
@pytest.mark.skip(
reason="we already test claude-3, this is just another way to pass images"
)
def test_completion_claude_3_base64():
@pytest.mark.parametrize(
"model",
[
"gpt-4o",
"azure/gpt-4o",
"anthropic/claude-3-opus-20240229",
],
) #
def test_completion_base64(model):
try:
import base64
import requests
litellm.set_verbose = True
litellm.num_retries = 3
image_path = "../proxy/cached_logo.jpg"
# Getting the base64 string
base64_image = encode_image(image_path)
url = "https://dummyimage.com/100/100/fff&text=Test+image"
response = requests.get(url)
file_data = response.content
encoded_file = base64.b64encode(file_data).decode("utf-8")
base64_image = f"data:image/png;base64,{encoded_file}"
resp = litellm.completion(
model="anthropic/claude-3-opus-20240229",
model=model,
messages=[
{
"role": "user",
@ -908,9 +921,7 @@ def test_completion_claude_3_base64():
{"type": "text", "text": "Whats in this image?"},
{
"type": "image_url",
"image_url": {
"url": "data:image/jpeg;base64," + base64_image
},
"image_url": {"url": base64_image},
},
],
}
@ -919,7 +930,6 @@ def test_completion_claude_3_base64():
print(f"\nResponse: {resp}")
prompt_tokens = resp.usage.prompt_tokens
raise Exception("it worked!")
except Exception as e:
if "500 Internal error encountered.'" in str(e):
pass
@ -2174,15 +2184,16 @@ def test_completion_openai():
@pytest.mark.parametrize(
"model",
"model, api_version",
[
"gpt-4o-2024-08-06",
"azure/chatgpt-v-2",
"bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
("gpt-4o-2024-08-06", None),
("azure/chatgpt-v-2", None),
("bedrock/anthropic.claude-3-sonnet-20240229-v1:0", None),
("azure/gpt-4o", "2024-08-01-preview"),
],
)
@pytest.mark.flaky(retries=3, delay=1)
def test_completion_openai_pydantic(model):
def test_completion_openai_pydantic(model, api_version):
try:
litellm.set_verbose = True
from pydantic import BaseModel
@ -2207,6 +2218,7 @@ def test_completion_openai_pydantic(model):
messages=messages,
metadata={"hi": "bye"},
response_format=EventsList,
api_version=api_version,
)
break
except litellm.JSONSchemaValidationError:
@ -3469,14 +3481,14 @@ def response_format_tests(response: litellm.ModelResponse):
@pytest.mark.parametrize(
"model",
[
# "bedrock/cohere.command-r-plus-v1:0",
"bedrock/mistral.mistral-large-2407-v1:0",
"bedrock/cohere.command-r-plus-v1:0",
"anthropic.claude-3-sonnet-20240229-v1:0",
# "anthropic.claude-instant-v1",
# "bedrock/ai21.j2-mid",
# "mistral.mistral-7b-instruct-v0:2",
"anthropic.claude-instant-v1",
"mistral.mistral-7b-instruct-v0:2",
# "bedrock/amazon.titan-tg1-large",
# "meta.llama3-8b-instruct-v1:0",
# "cohere.command-text-v14",
"meta.llama3-8b-instruct-v1:0",
"cohere.command-text-v14",
],
)
@pytest.mark.parametrize("sync_mode", [True, False])
@ -3491,6 +3503,7 @@ async def test_completion_bedrock_httpx_models(sync_mode, model):
messages=[{"role": "user", "content": "Hey! how's it going?"}],
temperature=0.2,
max_tokens=200,
stop=["stop sequence"],
)
assert isinstance(response, litellm.ModelResponse)
@ -3502,6 +3515,7 @@ async def test_completion_bedrock_httpx_models(sync_mode, model):
messages=[{"role": "user", "content": "Hey! how's it going?"}],
temperature=0.2,
max_tokens=100,
stop=["stop sequence"],
)
assert isinstance(response, litellm.ModelResponse)


@ -1219,3 +1219,13 @@ def test_completion_cost_anthropic_prompt_caching():
cost_2 = completion_cost(model=model, completion_response=response_2)
assert cost_1 > cost_2
def test_completion_cost_databricks():
model, messages = "databricks/databricks-dbrx-instruct", [
{"role": "user", "content": "What is 2+2?"}
]
resp = litellm.completion(model=model, messages=messages) # works fine
cost = completion_cost(completion_response=resp)


@ -864,7 +864,7 @@ def _pre_call_utils(
data["messages"] = [{"role": "user", "content": "Hello world"}]
if streaming is True:
data["stream"] = True
mapped_target = client.chat.completions.with_raw_response
mapped_target = client.chat.completions.with_raw_response # type: ignore
if sync_mode:
original_function = litellm.completion
else:
@ -873,7 +873,7 @@ def _pre_call_utils(
data["prompt"] = "Hello world"
if streaming is True:
data["stream"] = True
mapped_target = client.completions.with_raw_response
mapped_target = client.completions.with_raw_response # type: ignore
if sync_mode:
original_function = litellm.text_completion
else:


@ -52,6 +52,7 @@ def get_current_weather(location, unit="fahrenheit"):
# "anthropic.claude-3-sonnet-20240229-v1:0",
],
)
@pytest.mark.flaky(retries=3, delay=1)
def test_aaparallel_function_call(model):
try:
litellm.set_verbose = True


@ -1255,7 +1255,17 @@ async def test_add_callback_via_key(prisma_client):
@pytest.mark.asyncio
async def test_add_callback_via_key_litellm_pre_call_utils(prisma_client):
@pytest.mark.parametrize(
"callback_type, expected_success_callbacks, expected_failure_callbacks",
[
("success", ["langfuse"], []),
("failure", [], ["langfuse"]),
("success_and_failure", ["langfuse"], ["langfuse"]),
],
)
async def test_add_callback_via_key_litellm_pre_call_utils(
prisma_client, callback_type, expected_success_callbacks, expected_failure_callbacks
):
import json
from fastapi import HTTPException, Request, Response
@ -1312,7 +1322,7 @@ async def test_add_callback_via_key_litellm_pre_call_utils(prisma_client):
"logging": [
{
"callback_name": "langfuse",
"callback_type": "success",
"callback_type": callback_type,
"callback_vars": {
"langfuse_public_key": "my-mock-public-key",
"langfuse_secret_key": "my-mock-secret-key",
@ -1359,14 +1369,21 @@ async def test_add_callback_via_key_litellm_pre_call_utils(prisma_client):
}
new_data = await add_litellm_data_to_request(**data)
print("NEW DATA: {}".format(new_data))
assert "success_callback" in new_data
assert new_data["success_callback"] == ["langfuse"]
assert "langfuse_public_key" in new_data
assert new_data["langfuse_public_key"] == "my-mock-public-key"
assert "langfuse_secret_key" in new_data
assert new_data["langfuse_secret_key"] == "my-mock-secret-key"
if expected_success_callbacks:
assert "success_callback" in new_data
assert new_data["success_callback"] == expected_success_callbacks
if expected_failure_callbacks:
assert "failure_callback" in new_data
assert new_data["failure_callback"] == expected_failure_callbacks
@pytest.mark.asyncio
async def test_gemini_pass_through_endpoint():


@ -91,3 +91,72 @@ async def test_router_free_paid_tier():
print("response_extra_info: ", response_extra_info)
assert response_extra_info["model_id"] == "very-expensive-model"
@pytest.mark.asyncio()
async def test_default_tagged_deployments():
"""
- only use default deployment for untagged requests
- if a request has tag "default", use default deployment
"""
router = litellm.Router(
model_list=[
{
"model_name": "gpt-4",
"litellm_params": {
"model": "gpt-4o",
"api_base": "https://exampleopenaiendpoint-production.up.railway.app/",
"tags": ["default"],
},
"model_info": {"id": "default-model"},
},
{
"model_name": "gpt-4",
"litellm_params": {
"model": "gpt-4o",
"api_base": "https://exampleopenaiendpoint-production.up.railway.app/",
},
"model_info": {"id": "default-model-2"},
},
{
"model_name": "gpt-4",
"litellm_params": {
"model": "gpt-4o-mini",
"api_base": "https://exampleopenaiendpoint-production.up.railway.app/",
"tags": ["teamA"],
},
"model_info": {"id": "very-expensive-model"},
},
],
enable_tag_filtering=True,
)
for _ in range(5):
# Untagged request, this should pick model with id == "default-model"
response = await router.acompletion(
model="gpt-4",
messages=[{"role": "user", "content": "Tell me a joke."}],
)
print("Response: ", response)
response_extra_info = response._hidden_params
print("response_extra_info: ", response_extra_info)
assert response_extra_info["model_id"] == "default-model"
for _ in range(5):
# requests tagged with "default", this should pick model with id == "default-model"
response = await router.acompletion(
model="gpt-4",
messages=[{"role": "user", "content": "Tell me a joke."}],
metadata={"tags": ["default"]},
)
print("Response: ", response)
response_extra_info = response._hidden_params
print("response_extra_info: ", response_extra_info)
assert response_extra_info["model_id"] == "default-model"


@ -4239,3 +4239,14 @@ def test_completion_vllm():
mock_call.assert_called_once()
assert "hello" in mock_call.call_args.kwargs["extra_body"]
def test_completion_fireworks_ai_multiple_choices():
litellm.set_verbose = True
response = litellm.text_completion(
model="fireworks_ai/llama-v3p1-8b-instruct",
prompt=["halo", "hi", "halo", "hi"],
)
print(response.choices)
assert len(response.choices) == 4


@ -2512,16 +2512,16 @@
"max_audio_length_hours": 8.4,
"max_audio_per_prompt": 1,
"max_pdf_size_mb": 30,
"input_cost_per_token": 0.00000035,
"input_cost_per_token_above_128k_tokens": 0.0000007,
"output_cost_per_token": 0.00000105,
"output_cost_per_token_above_128k_tokens": 0.0000021,
"input_cost_per_token": 0.000000075,
"input_cost_per_token_above_128k_tokens": 0.00000015,
"output_cost_per_token": 0.0000003,
"output_cost_per_token_above_128k_tokens": 0.0000006,
"litellm_provider": "gemini",
"mode": "chat",
"supports_system_messages": true,
"supports_function_calling": true,
"supports_vision": true,
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
"source": "https://ai.google.dev/pricing"
},
"gemini/gemini-1.5-flash-latest": {
"max_tokens": 8192,
@ -2533,16 +2533,16 @@
"max_audio_length_hours": 8.4,
"max_audio_per_prompt": 1,
"max_pdf_size_mb": 30,
"input_cost_per_token": 0.00000035,
"input_cost_per_token_above_128k_tokens": 0.0000007,
"output_cost_per_token": 0.00000105,
"output_cost_per_token_above_128k_tokens": 0.0000021,
"input_cost_per_token": 0.000000075,
"input_cost_per_token_above_128k_tokens": 0.00000015,
"output_cost_per_token": 0.0000003,
"output_cost_per_token_above_128k_tokens": 0.0000006,
"litellm_provider": "gemini",
"mode": "chat",
"supports_system_messages": true,
"supports_function_calling": true,
"supports_vision": true,
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
"source": "https://ai.google.dev/pricing"
},
"gemini/gemini-pro": {
"max_tokens": 8192,


@ -148,6 +148,7 @@ router_settings:
redis_password: os.environ/REDIS_PASSWORD
redis_port: os.environ/REDIS_PORT
enable_pre_call_checks: true
model_group_alias: {"my-special-fake-model-alias-name": "fake-openai-endpoint-3"}
general_settings:
master_key: sk-1234 # [OPTIONAL] Use to enforce auth on proxy. See - https://docs.litellm.ai/docs/proxy/virtual_keys


@ -1,6 +1,6 @@
[tool.poetry]
name = "litellm"
version = "1.44.22"
version = "1.44.23"
description = "Library to easily interface with LLM API providers"
authors = ["BerriAI"]
license = "MIT"
@ -91,7 +91,7 @@ requires = ["poetry-core", "wheel"]
build-backend = "poetry.core.masonry.api"
[tool.commitizen]
version = "1.44.22"
version = "1.44.23"
version_files = [
"pyproject.toml:^version"
]