forked from phoenix/litellm-mirror
* LiteLLM Minor Fixes & Improvements (09/26/2024) (#5925)
* fix(litellm_logging.py): don't initialize prometheus_logger if non premium user
Prevents bad error messages in logs
Fixes https://github.com/BerriAI/litellm/issues/5897
* Add Support for Custom Providers in Vision and Function Call Utils (#5688)
* Add Support for Custom Providers in Vision and Function Call Utils Lookup
* Remove parallel function call due to missing model info param
* Add Unit Tests for Vision and Function Call Changes
* fix-#5920: set header value to string to fix "'int' object has no att… (#5922)
* LiteLLM Minor Fixes & Improvements (09/24/2024) (#5880)
* LiteLLM Minor Fixes & Improvements (09/23/2024) (#5842)
* feat(auth_utils.py): enable admin to allow client-side credentials to be passed
Makes it easier for devs to experiment with finetuned fireworks ai models
* feat(router.py): allow setting configurable_clientside_auth_params for a model
Closes https://github.com/BerriAI/litellm/issues/5843
* build(model_prices_and_context_window.json): fix anthropic claude-3-5-sonnet max output token limit
Fixes https://github.com/BerriAI/litellm/issues/5850
* fix(azure_ai/): support content list for azure ai
Fixes https://github.com/BerriAI/litellm/issues/4237
* fix(litellm_logging.py): always set saved_cache_cost
Set to 0 by default
* fix(fireworks_ai/cost_calculator.py): add fireworks ai default pricing
handles calling 405b+ size models
* fix(slack_alerting.py): fix error alerting for failed spend tracking
Fixes regression with slack alerting error monitoring
* fix(vertex_and_google_ai_studio_gemini.py): handle gemini no candidates in streaming chunk error
* docs(bedrock.md): add llama3-1 models
* test: fix tests
* fix(azure_ai/chat): fix transformation for azure ai calls
* feat(azure_ai/embed): Add azure ai embeddings support
Closes https://github.com/BerriAI/litellm/issues/5861
* fix(azure_ai/embed): enable async embedding
* feat(azure_ai/embed): support azure ai multimodal embeddings
* fix(azure_ai/embed): support async multi modal embeddings
* feat(together_ai/embed): support together ai embedding calls
* feat(rerank/main.py): log source documents for rerank endpoints to langfuse
improves rerank endpoint logging
* fix(langfuse.py): support logging `/audio/speech` input to langfuse
* test(test_embedding.py): fix test
* test(test_completion_cost.py): fix helper util
* fix-#5920: set header value to string to fix "'int' object has no attribute 'encode'"
---------
Co-authored-by: Krish Dholakia <krrishdholakia@gmail.com>
* Revert "fix-#5920: set header value to string to fix "'int' object has no att…" (#5926)
This reverts commit a554ae2695.
* build(model_prices_and_context_window.json): add azure ai cohere rerank model pricing
Enables cost tracking for azure ai cohere rerank models
* fix(litellm_logging.py): fix debug log to be clearer
Closes https://github.com/BerriAI/litellm/issues/5909
* test(test_utils.py): fix test name
* fix(azure_ai/cost_calculator.py): support cost tracking for azure ai rerank models
* fix(azure_ai): fix azure ai base model cost tracking for rerank endpoints
* fix(converse_handler.py): support new llama 3-2 models
Fixes https://github.com/BerriAI/litellm/issues/5901
* fix(litellm_logging.py): ensure response is redacted for standard message logging
Fixes https://github.com/BerriAI/litellm/issues/5890#issuecomment-2378242360
* fix(cost_calculator.py): use 'get_model_info' for cohere rerank cost calculation
allows user to set custom cost for model
* fix(config.yml): fix docker hub auth
* build(config.yml): add docker auth to all tests
* fix(db/create_views.py): fix linting error
* fix(main.py): fix circular import
* fix(azure_ai/__init__.py): fix circular import
* fix(main.py): fix import
* fix: fix linting errors
* test: fix test
* fix(proxy_server.py): pass premium user value on startup
used for prometheus init
---------
Co-authored-by: Cole Murray <colemurray.cs@gmail.com>
Co-authored-by: bravomark <62681807+bravomark@users.noreply.github.com>
* handle streaming for azure ai studio error
* [Perf Proxy] parallel request limiter - use one cache update call (#5932)
* fix parallel request limiter - use one cache update call
* ci/cd run again
* run ci/cd again
* use docker username password
* fix config.yml
* fix config
* fix config
* fix config.yml
* ci/cd run again
* use correct typing for batch set cache
* fix async_set_cache_pipeline
* fix only check user id tpm / rpm limits when limits set
* fix test_openai_azure_embedding_with_oidc_and_cf
* test: fix test
* test(test_rerank.py): fix test
---------
Co-authored-by: Cole Murray <colemurray.cs@gmail.com>
Co-authored-by: bravomark <62681807+bravomark@users.noreply.github.com>
Co-authored-by: Ishaan Jaff <ishaanjaffer0324@gmail.com>
This commit is contained in: parent 789ce6b747, commit bd17424c4b
29 changed files with 564 additions and 104 deletions
@@ -280,6 +280,9 @@ jobs:
   installing_litellm_on_python:
     docker:
       - image: circleci/python:3.8
+        auth:
+          username: ${DOCKERHUB_USERNAME}
+          password: ${DOCKERHUB_PASSWORD}
     working_directory: ~/project

     steps:
@@ -22,6 +22,12 @@ from litellm.litellm_core_utils.llm_cost_calc.utils import _generic_cost_per_cha
 from litellm.llms.anthropic.cost_calculation import (
     cost_per_token as anthropic_cost_per_token,
 )
+from litellm.llms.azure_ai.cost_calculator import (
+    cost_per_query as azure_ai_rerank_cost_per_query,
+)
+from litellm.llms.cohere.cost_calculator import (
+    cost_per_query as cohere_rerank_cost_per_query,
+)
 from litellm.llms.databricks.cost_calculator import (
     cost_per_token as databricks_cost_per_token,
 )
@@ -85,6 +91,8 @@ def cost_per_token(
     ### CUSTOM PRICING ###
     custom_cost_per_token: Optional[CostPerToken] = None,
     custom_cost_per_second: Optional[float] = None,
+    ### NUMBER OF QUERIES ###
+    number_of_queries: Optional[int] = None,
     ### CALL TYPE ###
     call_type: Literal[
         "embedding",
@@ -190,7 +198,6 @@ def cost_per_token(
-
     # see this https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models
     print_verbose(f"Looking up model={model} in model_cost_map")

     if custom_llm_provider == "vertex_ai":
         cost_router = google_cost_router(
             model=model_without_prefix,
@@ -252,12 +259,10 @@ def cost_per_token(
         )
         return prompt_cost, completion_cost
     elif call_type == "arerank" or call_type == "rerank":
-        completion_tokens_cost_usd_dollar = rerank_cost(
+        return rerank_cost(
             model=model,
             custom_llm_provider=custom_llm_provider,
         )
-        prompt_tokens_cost_usd_dollar = 0
-        return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
     elif model in model_cost_ref:
         print_verbose(f"Success: model={model} in model_cost_map")
         print_verbose(
@@ -793,7 +798,6 @@ def response_cost_calculator(
         if custom_pricing is True:  # override defaults if custom pricing is set
             base_model = model
             # base_model defaults to None if not set on model_info
-
         response_cost = completion_cost(
             completion_response=response_object,
             call_type=call_type,
@@ -808,23 +812,27 @@ def response_cost_calculator(
 def rerank_cost(
     model: str,
     custom_llm_provider: Optional[str],
-) -> float:
+) -> Tuple[float, float]:
     """
     Returns
     - float or None: cost of response OR none if error.
     """
+    default_num_queries = 1
     _, custom_llm_provider, _, _ = litellm.get_llm_provider(
         model=model, custom_llm_provider=custom_llm_provider
     )

     try:
         if custom_llm_provider == "cohere":
-            return 0.002
+            return cohere_rerank_cost_per_query(
+                model=model, num_queries=default_num_queries
+            )
+        elif custom_llm_provider == "azure_ai":
+            return azure_ai_rerank_cost_per_query(
+                model=model, num_queries=default_num_queries
+            )
         raise ValueError(
             f"invalid custom_llm_provider for rerank model: {model}, custom_llm_provider: {custom_llm_provider}"
         )
     except Exception as e:
-        verbose_logger.exception(
-            f"litellm.cost_calculator.py::rerank_cost - Exception occurred - {str(e)}"
-        )
         raise e
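For context on the hunk above: rerank_cost now returns a (prompt_cost, completion_cost) tuple and delegates to provider-specific cost_per_query helpers instead of the hard-coded $0.002. The standalone sketch below is illustrative only (the names and the $0.002/query price are assumptions, not litellm's actual helper); it shows the per-query arithmetic those helpers perform.

from typing import Tuple

PRICE_PER_QUERY_USD = 0.002  # assumed price; real values come from the pricing JSON

def rerank_cost_sketch(num_queries: int = 1) -> Tuple[float, float]:
    # rerank endpoints are billed per search query, so the whole cost lands on the
    # "prompt" side of the tuple and the completion side stays at zero
    prompt_cost = PRICE_PER_QUERY_USD * num_queries
    return prompt_cost, 0.0

assert rerank_cost_sketch(1) == (0.002, 0.0)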
@@ -31,6 +31,7 @@ from litellm.litellm_core_utils.redact_messages import (
     redact_message_input_output_from_custom_logger,
     redact_message_input_output_from_logging,
 )
+from litellm.proxy._types import CommonProxyErrors
 from litellm.rerank_api.types import RerankResponse
 from litellm.types.llms.openai import HttpxBinaryResponseContent
 from litellm.types.router import SPECIAL_MODEL_INFO_PARAMS
@@ -97,7 +98,9 @@ try:
         GenericAPILogger,
     )
 except Exception as e:
-    verbose_logger.debug(f"Exception import enterprise features {str(e)}")
+    verbose_logger.debug(
+        f"[Non-Blocking] Unable to import GenericAPILogger - LiteLLM Enterprise Feature - {str(e)}"
+    )

 _in_memory_loggers: List[Any] = []

@@ -2140,7 +2143,8 @@ def _init_custom_logger_compatible_class(
     llm_router: Optional[
         Any
     ],  # expect litellm.Router, but typing errors due to circular import
-) -> CustomLogger:
+    premium_user: bool = False,
+) -> Optional[CustomLogger]:
     if logging_integration == "lago":
         for callback in _in_memory_loggers:
             if isinstance(callback, LagoLogger):
@@ -2174,6 +2178,7 @@ def _init_custom_logger_compatible_class(
         _in_memory_loggers.append(_langsmith_logger)
         return _langsmith_logger  # type: ignore
     elif logging_integration == "prometheus":
+        if premium_user:
             for callback in _in_memory_loggers:
                 if isinstance(callback, PrometheusLogger):
                     return callback  # type: ignore
@@ -2181,6 +2186,11 @@ def _init_custom_logger_compatible_class(
             _prometheus_logger = PrometheusLogger()
             _in_memory_loggers.append(_prometheus_logger)
             return _prometheus_logger  # type: ignore
+        else:
+            verbose_logger.warning(
+                f"🚨🚨🚨 Prometheus Metrics is on LiteLLM Enterprise\n🚨 {CommonProxyErrors.not_premium_user.value}"
+            )
+            return None
     elif logging_integration == "datadog":
         for callback in _in_memory_loggers:
             if isinstance(callback, DataDogLogger):
@@ -2411,6 +2421,7 @@ def get_standard_logging_object_payload(
         response_obj = init_response_obj
     else:
         response_obj = {}
+
     # standardize this function to be used across, s3, dynamoDB, langfuse logging
     litellm_params = kwargs.get("litellm_params", {})
     proxy_server_request = litellm_params.get("proxy_server_request") or {}
@@ -2546,6 +2557,16 @@ def get_standard_logging_object_payload(

     response_cost: float = kwargs.get("response_cost", 0) or 0.0

+    if response_obj is not None:
+        final_response_obj: Optional[Union[dict, str, list]] = response_obj
+    elif isinstance(init_response_obj, list) or isinstance(init_response_obj, str):
+        final_response_obj = init_response_obj
+    else:
+        final_response_obj = None
+
+    if litellm.turn_off_message_logging:
+        final_response_obj = "redacted-by-litellm"
+
     payload: StandardLoggingPayload = StandardLoggingPayload(
         id=str(id),
         call_type=call_type or "",
@@ -2569,9 +2590,7 @@ def get_standard_logging_object_payload(
         model_id=_model_id,
         requester_ip_address=clean_metadata.get("requester_ip_address", None),
         messages=kwargs.get("messages"),
-        response=(  # type: ignore
-            response_obj if len(response_obj.keys()) > 0 else init_response_obj  # type: ignore
-        ),
+        response=final_response_obj,
         model_parameters=kwargs.get("optional_params", None),
         hidden_params=clean_hidden_params,
         model_map_information=model_cost_information,
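The control flow the premium gate above introduces, as a stand-alone sketch (class and function names are stand-ins, not litellm's real signatures): the initializer now returns None for non-premium users instead of always constructing a PrometheusLogger, so callers have to tolerate a missing callback.

from typing import Optional

class FakePrometheusLogger:  # stand-in for the real integration class
    pass

def init_prometheus_logger(premium_user: bool) -> Optional[FakePrometheusLogger]:
    if premium_user:
        return FakePrometheusLogger()
    # non-premium: warn and hand back None instead of raising
    print("Prometheus metrics is a LiteLLM Enterprise feature")
    return None

callback = init_prometheus_logger(premium_user=False)
if callback is None:  # mirrors the new `if callback is None: continue` guards
    print("skipping callback registration")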
@@ -1,3 +0,0 @@
-from .chat.handler import AzureAIChatCompletion
-from .embed.handler import AzureAIEmbedding
-from .rerank.handler import AzureAIRerank

litellm/llms/azure_ai/chat/__init__.py (new file, 1 line)
@@ -0,0 +1 @@
+from .handler import AzureAIChatCompletion
@@ -0,0 +1,33 @@
+"""
+Handles custom cost calculation for Azure AI models.
+
+Custom cost calculation for Azure AI models only requied for rerank.
+"""
+
+from typing import Tuple
+
+from litellm.types.utils import Usage
+from litellm.utils import get_model_info
+
+
+def cost_per_query(model: str, num_queries: int = 1) -> Tuple[float, float]:
+    """
+    Calculates the cost per query for a given rerank model.
+
+    Input:
+        - model: str, the model name without provider prefix
+
+    Returns:
+        Tuple[float, float] - prompt_cost_in_usd, completion_cost_in_usd
+    """
+    model_info = get_model_info(model=model, custom_llm_provider="azure_ai")
+
+    if (
+        "input_cost_per_query" not in model_info
+        or model_info["input_cost_per_query"] is None
+    ):
+        return 0.0, 0.0
+
+    prompt_cost = model_info["input_cost_per_query"] * num_queries
+
+    return prompt_cost, 0.0
litellm/llms/azure_ai/embed/__init__.py (new file, 1 line)
@@ -0,0 +1 @@
+from .handler import AzureAIEmbedding
@@ -8,6 +8,57 @@ from litellm.rerank_api.types import RerankResponse


 class AzureAIRerank(CohereRerank):
+
+    def get_base_model(self, azure_model_group: Optional[str]) -> Optional[str]:
+        if azure_model_group is None:
+            return None
+        if azure_model_group == "offer-cohere-rerank-mul-paygo":
+            return "azure_ai/cohere-rerank-v3-multilingual"
+        if azure_model_group == "offer-cohere-rerank-eng-paygo":
+            return "azure_ai/cohere-rerank-v3-english"
+        return azure_model_group
+
+    async def async_azure_rerank(
+        self,
+        model: str,
+        api_key: str,
+        api_base: str,
+        query: str,
+        documents: List[Union[str, Dict[str, Any]]],
+        headers: Optional[dict],
+        litellm_logging_obj: LiteLLMLoggingObj,
+        top_n: Optional[int] = None,
+        rank_fields: Optional[List[str]] = None,
+        return_documents: Optional[bool] = True,
+        max_chunks_per_doc: Optional[int] = None,
+    ):
+        returned_response: RerankResponse = await super().rerank(  # type: ignore
+            model=model,
+            api_key=api_key,
+            api_base=api_base,
+            query=query,
+            documents=documents,
+            top_n=top_n,
+            rank_fields=rank_fields,
+            return_documents=return_documents,
+            max_chunks_per_doc=max_chunks_per_doc,
+            _is_async=True,
+            headers=headers,
+            litellm_logging_obj=litellm_logging_obj,
+        )
+
+        # get base model
+        additional_headers = (
+            returned_response._hidden_params.get("additional_headers") or {}
+        )
+
+        base_model = self.get_base_model(
+            additional_headers.get("llm_provider-azureml-model-group")
+        )
+        returned_response._hidden_params["model"] = base_model
+
+        return returned_response
+
     def rerank(
         self,
         model: str,
@@ -36,7 +87,22 @@ class AzureAIRerank(CohereRerank):
         if not api_base_url.path.endswith("/v1/rerank"):
             api_base = str(api_base_url.copy_with(path="/v1/rerank"))

-        return super().rerank(
+        if _is_async:
+            return self.async_azure_rerank(  # type: ignore
+                model=model,
+                api_key=api_key,
+                api_base=api_base,
+                query=query,
+                documents=documents,
+                top_n=top_n,
+                rank_fields=rank_fields,
+                return_documents=return_documents,
+                max_chunks_per_doc=max_chunks_per_doc,
+                headers=headers,
+                litellm_logging_obj=litellm_logging_obj,
+            )
+        else:
+            returned_response = super().rerank(
                 model=model,
                 api_key=api_key,
                 api_base=api_base,
@@ -50,3 +116,10 @@ class AzureAIRerank(CohereRerank):
                 headers=headers,
                 litellm_logging_obj=litellm_logging_obj,
             )
+
+            # get base model
+            base_model = self.get_base_model(
+                returned_response._hidden_params.get("llm_provider-azureml-model-group")
+            )
+            returned_response._hidden_params["model"] = base_model
+            return returned_response
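The get_base_model mapping above exists because Azure AI reports a marketplace offer id (via the azureml-model-group response header) rather than a litellm model name; mapping it back lets cost tracking hit the right pricing entry. A dictionary-based sketch with illustrative names, not litellm's API:

from typing import Optional

AZURE_RERANK_OFFERS = {
    "offer-cohere-rerank-mul-paygo": "azure_ai/cohere-rerank-v3-multilingual",
    "offer-cohere-rerank-eng-paygo": "azure_ai/cohere-rerank-v3-english",
}

def resolve_base_model(azure_model_group: Optional[str]) -> Optional[str]:
    if azure_model_group is None:
        return None
    # unknown offer ids fall through unchanged
    return AZURE_RERANK_OFFERS.get(azure_model_group, azure_model_group)

assert resolve_base_model("offer-cohere-rerank-eng-paygo") == "azure_ai/cohere-rerank-v3-english"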
@@ -20,17 +20,9 @@ from .invoke_handler import AWSEventStreamDecoder, MockResponseIterator, make_ca

 BEDROCK_CONVERSE_MODELS = [
     "anthropic.claude-3-5-sonnet-20240620-v1:0",
-    "us.anthropic.claude-3-5-sonnet-20240620-v1:0",
-    "eu.anthropic.claude-3-5-sonnet-20240620-v1:0",
     "anthropic.claude-3-opus-20240229-v1:0",
-    "us.anthropic.claude-3-opus-20240229-v1:0",
-    "eu.anthropic.claude-3-opus-20240229-v1:0",
     "anthropic.claude-3-sonnet-20240229-v1:0",
-    "us.anthropic.claude-3-sonnet-20240229-v1:0",
-    "eu.anthropic.claude-3-sonnet-20240229-v1:0",
     "anthropic.claude-3-haiku-20240307-v1:0",
-    "us.anthropic.claude-3-haiku-20240307-v1:0",
-    "eu.anthropic.claude-3-haiku-20240307-v1:0",
     "anthropic.claude-v2",
     "anthropic.claude-v2:1",
     "anthropic.claude-v1",
@@ -43,6 +35,11 @@ BEDROCK_CONVERSE_MODELS = [
     "meta.llama3-1-405b-instruct-v1:0",
     "meta.llama3-70b-instruct-v1:0",
     "mistral.mistral-large-2407-v1:0",
+    "meta.llama3-2-1b-instruct-v1:0",
+    "meta.llama3-2-3b-instruct-v1:0",
+    "meta.llama3-2-11b-instruct-v1:0",
+    "meta.llama3-2-90b-instruct-v1:0",
+    "meta.llama3-2-405b-instruct-v1:0",
 ]

@@ -430,3 +430,22 @@ class AmazonConverseConfig:
             setattr(model_response, "trace", completion_response["trace"])

         return model_response
+
+    def _supported_cross_region_inference_region(self) -> List[str]:
+        """
+        Abbreviations of regions AWS Bedrock supports for cross region inference
+        """
+        return ["us", "eu"]
+
+    def _get_base_model(self, model: str) -> str:
+        """
+        Get the base model from the given model name.
+
+        Handle model names like - "us.meta.llama3-2-11b-instruct-v1:0" -> "meta.llama3-2-11b-instruct-v1"
+        AND "meta.llama3-2-11b-instruct-v1:0" -> "meta.llama3-2-11b-instruct-v1"
+        """
+
+        potential_region = model.split(".", 1)[0]
+        if potential_region in self._supported_cross_region_inference_region():
+            return model.split(".", 1)[1]
+        return model
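The new _get_base_model helper strips the cross-region inference prefix ("us." or "eu.") so ids like us.meta.llama3-2-11b-instruct-v1:0 can be matched against BEDROCK_CONVERSE_MODELS. A minimal standalone sketch of the same split logic (names are illustrative):

SUPPORTED_CROSS_REGION_PREFIXES = ["us", "eu"]

def get_base_model(model: str) -> str:
    # "us.meta.llama3-2-11b-instruct-v1:0" -> "meta.llama3-2-11b-instruct-v1:0"
    potential_region = model.split(".", 1)[0]
    if potential_region in SUPPORTED_CROSS_REGION_PREFIXES:
        return model.split(".", 1)[1]
    return model

assert get_base_model("us.meta.llama3-2-11b-instruct-v1:0") == "meta.llama3-2-11b-instruct-v1:0"
assert get_base_model("meta.llama3-2-11b-instruct-v1:0") == "meta.llama3-2-11b-instruct-v1:0"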
litellm/llms/cohere/cost_calculator.py (new file, 31 lines)
@@ -0,0 +1,31 @@
+"""
+Custom cost calculator for Cohere rerank models
+"""
+
+from typing import Tuple
+
+from litellm.utils import get_model_info
+
+
+def cost_per_query(model: str, num_queries: int = 1) -> Tuple[float, float]:
+    """
+    Calculates the cost per query for a given rerank model.
+
+    Input:
+        - model: str, the model name without provider prefix
+
+    Returns:
+        Tuple[float, float] - prompt_cost_in_usd, completion_cost_in_usd
+    """
+
+    model_info = get_model_info(model=model, custom_llm_provider="cohere")
+
+    if (
+        "input_cost_per_query" not in model_info
+        or model_info["input_cost_per_query"] is None
+    ):
+        return 0.0, 0.0
+
+    prompt_cost = model_info["input_cost_per_query"] * num_queries
+
+    return prompt_cost, 0.0
@@ -6,9 +6,6 @@ LiteLLM supports the re rank API format, no paramter transformation occurs

 from typing import Any, Dict, List, Optional, Union

-import httpx
-from pydantic import BaseModel
-
 import litellm
 from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
 from litellm.llms.base import BaseLLM
@@ -65,7 +62,6 @@ class CohereRerank(BaseLLM):
         )

         request_data_dict = request_data.dict(exclude_none=True)
-
         ## LOGGING
         litellm_logging_obj.pre_call(
             input=request_data_dict,
@@ -78,7 +74,7 @@ class CohereRerank(BaseLLM):
         )

         if _is_async:
-            return self.async_rerank(request_data_dict=request_data_dict, api_key=api_key, api_base=api_base, headers=headers)  # type: ignore # Call async method
+            return self.async_rerank(request_data=request_data, api_key=api_key, api_base=api_base, headers=headers)  # type: ignore # Call async method

         client = _get_httpx_client()
         response = client.post(
@@ -87,15 +83,26 @@ class CohereRerank(BaseLLM):
             json=request_data_dict,
         )

-        return RerankResponse(**response.json())
+        returned_response = RerankResponse(**response.json())
+
+        _response_headers = response.headers
+
+        llm_response_headers = {
+            "{}-{}".format("llm_provider", k): v for k, v in _response_headers.items()
+        }
+        returned_response._hidden_params["additional_headers"] = llm_response_headers
+
+        return returned_response

     async def async_rerank(
         self,
-        request_data_dict: Dict[str, Any],
+        request_data: RerankRequest,
         api_key: str,
         api_base: str,
         headers: dict,
     ) -> RerankResponse:
+        request_data_dict = request_data.dict(exclude_none=True)
+
         client = get_async_httpx_client(llm_provider=litellm.LlmProviders.COHERE)

         response = await client.post(
@@ -104,4 +111,14 @@ class CohereRerank(BaseLLM):
             json=request_data_dict,
         )

-        return RerankResponse(**response.json())
+        returned_response = RerankResponse(**response.json())
+
+        _response_headers = dict(response.headers)
+
+        llm_response_headers = {
+            "{}-{}".format("llm_provider", k): v for k, v in _response_headers.items()
+        }
+        returned_response._hidden_params["additional_headers"] = llm_response_headers
+        returned_response._hidden_params["model"] = request_data.model
+
+        return returned_response
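As context for the header capture above: raw provider response headers are namespaced with an "llm_provider-" prefix before being stored in _hidden_params["additional_headers"], which is what the Azure AI rerank handler later reads back for base-model resolution. A tiny sketch with made-up header values:

raw_headers = {
    "x-request-id": "abc123",  # illustrative values only
    "azureml-model-group": "offer-cohere-rerank-eng-paygo",
}

# same dict-comprehension shape as the hunk above: prefix every provider header
llm_response_headers = {
    "{}-{}".format("llm_provider", k): v for k, v in raw_headers.items()
}

assert llm_response_headers["llm_provider-azureml-model-group"] == "offer-cohere-rerank-eng-paygo"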
@@ -83,7 +83,8 @@ from .llms import (
 from .llms.AI21 import completion as ai21
 from .llms.anthropic.chat import AnthropicChatCompletion
 from .llms.anthropic.completion import AnthropicTextCompletion
-from .llms.azure_ai import AzureAIChatCompletion, AzureAIEmbedding
+from .llms.azure_ai.chat import AzureAIChatCompletion
+from .llms.azure_ai.embed import AzureAIEmbedding
 from .llms.azure_text import AzureTextCompletion
 from .llms.AzureOpenAI.audio_transcriptions import AzureAudioTranscription
 from .llms.AzureOpenAI.azure import AzureChatCompletion, _check_dynamic_azure_params
@@ -2411,8 +2412,9 @@ def completion(
                     aws_bedrock_client.meta.region_name
                 )

-            if model in litellm.BEDROCK_CONVERSE_MODELS:
+            base_model = litellm.AmazonConverseConfig()._get_base_model(model)

+            if base_model in litellm.BEDROCK_CONVERSE_MODELS:
                 response = bedrock_converse_chat_completion.completion(
                     model=model,
                     messages=messages,
@@ -990,6 +990,28 @@
         "mode": "chat",
         "source":"https://azuremarketplace.microsoft.com/en-us/marketplace/apps/metagenai.meta-llama-3-1-405b-instruct-offer?tab=PlansAndPrice"
     },
+    "azure_ai/cohere-rerank-v3-multilingual": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "max_query_tokens": 2048,
+        "input_cost_per_token": 0.0,
+        "input_cost_per_query": 0.002,
+        "output_cost_per_token": 0.0,
+        "litellm_provider": "azure_ai",
+        "mode": "rerank"
+    },
+    "azure_ai/cohere-rerank-v3-english": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "max_query_tokens": 2048,
+        "input_cost_per_token": 0.0,
+        "input_cost_per_query": 0.002,
+        "output_cost_per_token": 0.0,
+        "litellm_provider": "azure_ai",
+        "mode": "rerank"
+    },
     "azure_ai/Cohere-embed-v3-english": {
         "max_tokens": 512,
         "max_input_tokens": 512,
@@ -3114,6 +3136,50 @@
         "litellm_provider": "cohere",
         "mode": "completion"
     },
+    "rerank-english-v3.0": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "max_query_tokens": 2048,
+        "input_cost_per_token": 0.0,
+        "input_cost_per_query": 0.002,
+        "output_cost_per_token": 0.0,
+        "litellm_provider": "cohere",
+        "mode": "rerank"
+    },
+    "rerank-multilingual-v3.0": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "max_query_tokens": 2048,
+        "input_cost_per_token": 0.0,
+        "input_cost_per_query": 0.002,
+        "output_cost_per_token": 0.0,
+        "litellm_provider": "cohere",
+        "mode": "rerank"
+    },
+    "rerank-english-v2.0": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "max_query_tokens": 2048,
+        "input_cost_per_token": 0.0,
+        "input_cost_per_query": 0.002,
+        "output_cost_per_token": 0.0,
+        "litellm_provider": "cohere",
+        "mode": "rerank"
+    },
+    "rerank-multilingual-v2.0": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "max_query_tokens": 2048,
+        "input_cost_per_token": 0.0,
+        "input_cost_per_query": 0.002,
+        "output_cost_per_token": 0.0,
+        "litellm_provider": "cohere",
+        "mode": "rerank"
+    },
     "embed-english-v3.0": {
         "max_tokens": 512,
         "max_input_tokens": 512,
@@ -11,7 +11,11 @@ model_list:
       api_base: https://exampleopenaiendpoint-production.up.railway.app/v1/projects/adroit-crow-413218/locations/us-central1/publishers/google/models/gemini-1.0-pro-vision-001
       vertex_project: "adroit-crow-413218"
      vertex_location: "us-central1"
+  - model_name: fake-azure-endpoint
+    litellm_params:
+      model: openai/429
+      api_key: fake-key
+      api_base: https://exampleopenaiendpoint-production.up.railway.app
   - model_name: fake-openai-endpoint
     litellm_params:
       model: gpt-3.5-turbo
@@ -23,6 +27,11 @@ model_list:
     litellm_params:
       model: cohere/rerank-english-v3.0
       api_key: os.environ/COHERE_API_KEY
+  - model_name: azure-rerank-english-v3.0
+    litellm_params:
+      model: azure_ai/rerank-english-v3.0
+      api_base: os.environ/AZURE_AI_COHERE_API_BASE
+      api_key: os.environ/AZURE_AI_COHERE_API_KEY
   - model_name: "databricks/*"
     litellm_params:
       model: "databricks/*"
@@ -43,9 +52,19 @@ model_list:
       model: "vertex_ai/gemini-flash-experimental"

 litellm_settings:
-  success_callback: ["langfuse", "prometheus"]
-  failure_callback: ["prometheus"]
+  callbacks: ["prometheus"]
+  redact_user_api_key_info: true
+
+  default_team_settings:
+    - team_id: "09ae376d-f6c8-42cd-88be-59717135684d" # team 1
+      success_callbacks: ["langfuse"]
+      langfuse_public_key: "pk-lf-1"
+      langfuse_secret: "sk-lf-1"
+      langfuse_host: ""
+
+    - team_id: "e5db79db-d623-4a5b-afd5-162be56074df" # team2
+      success_callback: ["langfuse"]
+      langfuse_public_key: "pk-lf-2"
+      langfuse_secret: "sk-lf-2"
+      langfuse_host: ""

-general_settings:
-  proxy_budget_rescheduler_min_time: 1
-  proxy_budget_rescheduler_max_time: 1
@@ -1,13 +1,8 @@
-from typing import TYPE_CHECKING, Any
+from typing import Any

 from litellm import verbose_logger

-if TYPE_CHECKING:
-    from prisma import Prisma
-
-    _db = Prisma
-else:
-    _db = Any
+_db = Any


 async def create_missing_views(db: _db):
@@ -505,7 +505,9 @@ prompt_injection_detection_obj: Optional[_OPTIONAL_PromptInjectionDetection] = N
 store_model_in_db: bool = False
 open_telemetry_logger = None
 ### INITIALIZE GLOBAL LOGGING OBJECT ###
-proxy_logging_obj = ProxyLogging(user_api_key_cache=user_api_key_cache)
+proxy_logging_obj = ProxyLogging(
+    user_api_key_cache=user_api_key_cache, premium_user=premium_user
+)
 ### REDIS QUEUE ###
 async_result = None
 celery_app_conn = None
@@ -567,7 +569,9 @@ def get_custom_headers(

     try:
         return {
-            key: value for key, value in headers.items() if value not in exclude_values
+            key: str(value)
+            for key, value in headers.items()
+            if value not in exclude_values
         }
     except Exception as e:
         verbose_proxy_logger.error(f"Error setting custom headers: {e}")
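The str(value) coercion above is the fix referenced by #5920: HTTP response header values must be strings, and an int such as a remaining-request count fails with "'int' object has no attribute 'encode'" when the response is serialized. A small sketch of the same comprehension, using an illustrative headers dict (not the proxy's real payload):

headers = {"x-litellm-remaining-requests": 42, "x-litellm-model-id": "gpt-3.5-turbo"}  # illustrative
exclude_values = {"", None}

safe_headers = {
    key: str(value)  # coerce ints/floats so the response can encode them
    for key, value in headers.items()
    if value not in exclude_values
}

assert safe_headers["x-litellm-remaining-requests"] == "42"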
@@ -86,7 +86,7 @@ async def rerank(
         model_id = hidden_params.get("model_id", None) or ""
         cache_key = hidden_params.get("cache_key", None) or ""
         api_base = hidden_params.get("api_base", None) or ""
+        additional_headers = hidden_params.get("additional_headers", None) or {}
         fastapi_response.headers.update(
             get_custom_headers(
                 user_api_key_dict=user_api_key_dict,
@@ -96,6 +96,7 @@ async def rerank(
                 version=version,
                 model_region=getattr(user_api_key_dict, "allowed_model_region", ""),
                 request_data=data,
+                **additional_headers,
             )
         )

@@ -312,6 +312,7 @@ class ProxyLogging:
     def __init__(
         self,
         user_api_key_cache: DualCache,
+        premium_user: bool = False,
     ):
         ## INITIALIZE LITELLM CALLBACKS ##
         self.call_details: dict = {}
@@ -334,6 +335,7 @@ class ProxyLogging:
             alert_types=self.alert_types,
             internal_usage_cache=self.internal_usage_cache.dual_cache,
         )
+        self.premium_user = premium_user

     def update_values(
         self,
@@ -394,7 +396,10 @@ class ProxyLogging:
                     callback,
                     internal_usage_cache=self.internal_usage_cache.dual_cache,
                     llm_router=llm_router,
+                    premium_user=self.premium_user,
                 )
+                if callback is None:
+                    continue
                 if callback not in litellm.input_callback:
                     litellm.input_callback.append(callback)  # type: ignore
                 if callback not in litellm.success_callback:
@@ -1226,10 +1226,17 @@ def test_not_found_error():
         )


-def test_bedrock_cross_region_inference():
+@pytest.mark.parametrize(
+    "model",
+    [
+        # "bedrock/us.anthropic.claude-3-haiku-20240307-v1:0",
+        "bedrock/us.meta.llama3-2-11b-instruct-v1:0",
+    ],
+)
+def test_bedrock_cross_region_inference(model):
     litellm.set_verbose = True
     response = completion(
-        model="bedrock/us.anthropic.claude-3-haiku-20240307-v1:0",
+        model=model,
         messages=messages,
         max_tokens=10,
         temperature=0.1,
@@ -1328,6 +1328,41 @@ def test_completion_cost_vertex_llama3():
     assert cost == 0


+@pytest.mark.parametrize(
+    "model",
+    [
+        "cohere/rerank-english-v3.0",
+        "azure_ai/cohere-rerank-v3-english",
+    ],
+)
+def test_completion_cost_azure_ai_rerank(model):
+    from litellm import RerankResponse, rerank
+
+    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+    litellm.model_cost = litellm.get_model_cost_map(url="")
+
+    response = RerankResponse(
+        id="b01dbf2e-63c8-4981-9e69-32241da559ed",
+        results=[
+            {
+                "document": {
+                    "id": "1",
+                    "text": "Paris is the capital of France.",
+                },
+                "index": 0,
+                "relevance_score": 0.990732,
+            },
+        ],
+        meta={},
+    )
+    print("response", response)
+    model = model
+    cost = completion_cost(
+        model=model, completion_response=response, call_type="arerank"
+    )
+    assert cost > 0
+
+
 def test_together_ai_embedding_completion_cost():
     from litellm.utils import Choices, EmbeddingResponse, Message, ModelResponse, Usage

@@ -1254,6 +1254,7 @@ def test_standard_logging_payload(model, turn_off_message_logging):
         ]
         if turn_off_message_logging:
             assert "redacted-by-litellm" == slobject["messages"][0]["content"]
+            assert "redacted-by-litellm" == slobject["response"]


 @pytest.mark.skip(reason="Works locally. Flaky on ci/cd")
@@ -23,12 +23,16 @@ litellm.set_verbose = True
 import time


+@pytest.mark.skip(reason="duplicate test of logging with callbacks")
 @pytest.mark.asyncio()
 async def test_async_prometheus_success_logging():
+    from litellm.integrations.prometheus import PrometheusLogger
+
+    pl = PrometheusLogger()
     run_id = str(uuid.uuid4())

     litellm.set_verbose = True
-    litellm.success_callback = ["prometheus"]
-    litellm.failure_callback = ["prometheus"]
+    litellm.callbacks = [pl]

     response = await litellm.acompletion(
         model="claude-instant-1.2",
@@ -54,12 +58,7 @@ async def test_async_prometheus_success_logging():
     await asyncio.sleep(3)

     # get prometheus logger
-    from litellm.litellm_core_utils.litellm_logging import _in_memory_loggers
-
-    for callback in _in_memory_loggers:
-        if isinstance(callback, PrometheusLogger):
-            test_prometheus_logger = callback
+    test_prometheus_logger = pl
     print("done with success request")

     print(
@@ -83,12 +82,15 @@ async def test_async_prometheus_success_logging():

 @pytest.mark.asyncio()
 async def test_async_prometheus_success_logging_with_callbacks():
+
+    pl = PrometheusLogger()
+
     run_id = str(uuid.uuid4())
     litellm.set_verbose = True

     litellm.success_callback = []
     litellm.failure_callback = []
-    litellm.callbacks = ["prometheus"]
+    litellm.callbacks = [pl]

     # Get initial metric values
     initial_metrics = {}
@@ -120,11 +122,7 @@ async def test_async_prometheus_success_logging_with_callbacks():
     await asyncio.sleep(3)

     # get prometheus logger
-    from litellm.litellm_core_utils.litellm_logging import _in_memory_loggers
-
-    for callback in _in_memory_loggers:
-        if isinstance(callback, PrometheusLogger):
-            test_prometheus_logger = callback
+    test_prometheus_logger = pl

     print("done with success request")

@@ -185,6 +185,7 @@ async def test_rerank_custom_api_base():
     }

     mock_response.json = return_val
+    mock_response.headers = {"key": "value"}
     mock_response.status_code = 200

     expected_payload = {
@@ -238,6 +239,9 @@ class TestLogger(CustomLogger):

 @pytest.mark.asyncio()
 async def test_rerank_custom_callbacks():
+    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+    litellm.model_cost = litellm.get_model_cost_map(url="")
+
     custom_logger = TestLogger()
     litellm.callbacks = [custom_logger]
     response = await litellm.arerank(
@@ -763,6 +763,45 @@ def test_supports_response_schema(model, expected_bool):
     assert expected_bool == response


+@pytest.mark.parametrize(
+    "model, expected_bool",
+    [
+        ("gpt-3.5-turbo", True),
+        ("gpt-4", True),
+        ("command-nightly", False),
+        ("gemini-pro", True),
+    ],
+)
+def test_supports_function_calling_v2(model, expected_bool):
+    """
+    Unit test for 'supports_function_calling' helper function.
+    """
+    from litellm.utils import supports_function_calling
+
+    response = supports_function_calling(model=model, custom_llm_provider=None)
+    assert expected_bool == response
+
+
+@pytest.mark.parametrize(
+    "model, expected_bool",
+    [
+        ("gpt-4-vision-preview", True),
+        ("gpt-3.5-turbo", False),
+        ("claude-3-opus-20240229", True),
+        ("gemini-pro-vision", True),
+        ("command-nightly", False),
+    ],
+)
+def test_supports_vision(model, expected_bool):
+    """
+    Unit test for 'supports_vision' helper function.
+    """
+    from litellm.utils import supports_vision
+
+    response = supports_vision(model=model, custom_llm_provider=None)
+    assert expected_bool == response
+
+
 def test_usage_object_null_tokens():
     """
     Unit test.
@@ -59,6 +59,7 @@ class ModelInfo(TypedDict, total=False):
     input_cost_per_character_above_128k_tokens: Optional[
         float
     ]  # only for vertex ai models
+    input_cost_per_query: Optional[float]  # only for rerank models
     input_cost_per_image: Optional[float]  # only for vertex ai models
     input_cost_per_audio_per_second: Optional[float]  # only for vertex ai models
     input_cost_per_video_per_second: Optional[float]  # only for vertex ai models
@@ -367,7 +367,7 @@ def function_setup(
             callback = litellm.litellm_core_utils.litellm_logging._init_custom_logger_compatible_class(  # type: ignore
                 callback, internal_usage_cache=None, llm_router=None
             )
-            if any(
+            if callback is None or any(
                 isinstance(cb, type(callback))
                 for cb in litellm._async_success_callback
             ):  # don't double add a callback
@@ -431,7 +431,7 @@ def function_setup(
                 )

                 # don't double add a callback
-                if not any(
+                if callback_class is not None and not any(
                     isinstance(cb, type(callback_class)) for cb in litellm.callbacks
                 ):
                     litellm.callbacks.append(callback_class)  # type: ignore
@@ -2148,50 +2148,67 @@ def supports_response_schema(model: str, custom_llm_provider: Optional[str]) ->
     return False


-def supports_function_calling(model: str) -> bool:
+def supports_function_calling(
+    model: str, custom_llm_provider: Optional[str] = None
+) -> bool:
     """
     Check if the given model supports function calling and return a boolean value.

     Parameters:
     model (str): The model name to be checked.
+    custom_llm_provider (Optional[str]): The provider to be checked.

     Returns:
     bool: True if the model supports function calling, False otherwise.

     Raises:
-    Exception: If the given model is not found in model_prices_and_context_window.json.
+    Exception: If the given model is not found or there's an error in retrieval.
     """
-    if model in litellm.model_cost:
-        model_info = litellm.model_cost[model]
+    try:
+        model, custom_llm_provider, _, _ = litellm.get_llm_provider(
+            model=model, custom_llm_provider=custom_llm_provider
+        )
+
+        model_info = litellm.get_model_info(
+            model=model, custom_llm_provider=custom_llm_provider
+        )
+
         if model_info.get("supports_function_calling", False) is True:
             return True
         return False
-    else:
+    except Exception as e:
         raise Exception(
-            f"Model not supports function calling. You passed model={model}."
+            f"Model not found or error in checking function calling support. You passed model={model}, custom_llm_provider={custom_llm_provider}. Error: {str(e)}"
         )


-def supports_vision(model: str):
+def supports_vision(model: str, custom_llm_provider: Optional[str] = None) -> bool:
     """
     Check if the given model supports vision and return a boolean value.

     Parameters:
     model (str): The model name to be checked.
+    custom_llm_provider (Optional[str]): The provider to be checked.

     Returns:
     bool: True if the model supports vision, False otherwise.
-
-    Raises:
-    Exception: If the given model is not found in model_prices_and_context_window.json.
     """
-    if model in litellm.model_cost:
-        model_info = litellm.model_cost[model]
+    try:
+        model, custom_llm_provider, _, _ = litellm.get_llm_provider(
+            model=model, custom_llm_provider=custom_llm_provider
+        )
+
+        model_info = litellm.get_model_info(
+            model=model, custom_llm_provider=custom_llm_provider
+        )
+
         if model_info.get("supports_vision", False) is True:
             return True
         return False
-    else:
+    except Exception as e:
+        verbose_logger.error(
+            f"Model not found or error in checking vision support. You passed model={model}, custom_llm_provider={custom_llm_provider}. Error: {str(e)}"
+        )
         return False


@@ -4755,6 +4772,7 @@ def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> Mod
     input_cost_per_character_above_128k_tokens: Optional[
         float
     ]  # only for vertex ai models
+    input_cost_per_query: Optional[float]  # only for rerank models
     input_cost_per_image: Optional[float]  # only for vertex ai models
     input_cost_per_audio_per_second: Optional[float]  # only for vertex ai models
     input_cost_per_video_per_second: Optional[float]  # only for vertex ai models
@@ -5000,6 +5018,7 @@ def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> Mod
         input_cost_per_token_above_128k_tokens=_model_info.get(
             "input_cost_per_token_above_128k_tokens", None
         ),
+        input_cost_per_query=_model_info.get("input_cost_per_query", None),
         output_cost_per_token=_output_cost_per_token,
         output_cost_per_character=_model_info.get(
             "output_cost_per_character", None
@@ -990,6 +990,28 @@
(identical to the -990,6 +990,28 hunk shown earlier, applied to a second copy of the model-prices JSON: adds the "azure_ai/cohere-rerank-v3-multilingual" and "azure_ai/cohere-rerank-v3-english" pricing entries)
@@ -3114,6 +3136,50 @@
(identical to the -3114,6 +3136,50 hunk shown earlier: adds the "rerank-english-v3.0", "rerank-multilingual-v3.0", "rerank-english-v2.0" and "rerank-multilingual-v2.0" Cohere pricing entries)
@@ -125,7 +125,6 @@ async def test_regenerate_api_key(prisma_client):
     setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client)
     setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
     await litellm.proxy.proxy_server.prisma_client.connect()
-    import uuid

     # generate new key
     key_alias = f"test_alias_regenerate_key-{uuid.uuid4()}"