Merge branch 'main' into litellm_tts_pricing
commit f7ebb84488
22 changed files with 586 additions and 117 deletions
@@ -0,0 +1,6 @@
## This folder contains the `json` for creating the following Grafana Dashboard

### Pre-Requisites

- Setup LiteLLM Proxy Prometheus Metrics https://docs.litellm.ai/docs/proxy/prometheus
@@ -1,6 +1,6 @@
## This folder contains the `json` for creating the following Grafana Dashboard
## Contains example Grafana Dashboard made for LiteLLM Proxy Server

This folder contains the `json` for creating Grafana Dashboards

### Pre-Requisites

- Setup LiteLLM Proxy Prometheus Metrics https://docs.litellm.ai/docs/proxy/prometheus
@@ -39,7 +39,7 @@ Use `litellm.supports_vision(model="")` -> returns `True` if model supports `vision`

```python
assert litellm.supports_vision(model="gpt-4-vision-preview") == True
assert litellm.supports_vision(model="gemini-1.0-pro-visionn") == True
assert litellm.supports_vision(model="gemini-1.0-pro-vision") == True
assert litellm.supports_vision(model="gpt-3.5-turbo") == False
```
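As an aside, a minimal sketch of how `supports_vision` can gate image inputs before a request is sent; the `build_vision_messages` helper and the example image URL are illustrative, not part of LiteLLM:

```python
import litellm

def build_vision_messages(model: str, text: str, image_url: str | None = None) -> list:
    """Illustrative helper: attach image content only when the target model supports vision."""
    content = [{"type": "text", "text": text}]
    if image_url is not None and litellm.supports_vision(model=model):
        content.append({"type": "image_url", "image_url": {"url": image_url}})
    return [{"role": "user", "content": content}]

messages = build_vision_messages(
    model="gpt-4-vision-preview",
    text="What is in this image?",
    image_url="https://example.com/cat.png",  # hypothetical URL
)
```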
@@ -29,6 +29,9 @@ litellm_settings:
  - prompt_injection: # your custom name for guardrail
      callbacks: [lakera_prompt_injection] # litellm callbacks to use
      default_on: true # will run on all llm requests when true
  - pii_masking: # your custom name for guardrail
      callbacks: [presidio] # use the litellm presidio callback
      default_on: false # by default this is off for all requests
  - hide_secrets_guard:
      callbacks: [hide_secrets]
      default_on: false
@@ -37,6 +40,12 @@ litellm_settings:
      default_on: false
```

:::info

Since `pii_masking` is default Off for all requests, [you can switch it on per API Key](#switch-guardrails-onoff-per-api-key)

:::

### 2. Test it

Run litellm proxy
@@ -185,6 +194,85 @@ print(response)

</Tabs>

## Switch Guardrails On/Off Per API Key

❓ Use this when you need to switch guardrails on/off per API Key

**Step 1** Create a key with `pii_masking` on

**NOTE:** We defined `pii_masking` [in step 1](#1-setup-guardrails-on-litellm-proxy-configyaml)

👉 Set `"permissions": {"pii_masking": true}` with either `/key/generate` or `/key/update`

This means the `pii_masking` guardrail is on for all requests made with this API key

:::info

If you need to switch `pii_masking` off for an API key, set `"permissions": {"pii_masking": false}` with either `/key/generate` or `/key/update`

:::

<Tabs>
<TabItem value="/key/generate" label="/key/generate">

```shell
curl --location 'http://0.0.0.0:4000/key/generate' \
    --header 'Authorization: Bearer sk-1234' \
    --header 'Content-Type: application/json' \
    --data '{
        "permissions": {"pii_masking": true}
    }'
```

```shell
# {"permissions":{"pii_masking":true},"key":"sk-jNm1Zar7XfNdZXp49Z1kSQ"}
```

</TabItem>
<TabItem value="/key/update" label="/key/update">

```shell
curl --location 'http://0.0.0.0:4000/key/update' \
    --header 'Authorization: Bearer sk-1234' \
    --header 'Content-Type: application/json' \
    --data '{
        "key": "sk-jNm1Zar7XfNdZXp49Z1kSQ",
        "permissions": {"pii_masking": true}
    }'
```

```shell
# {"permissions":{"pii_masking":true},"key":"sk-jNm1Zar7XfNdZXp49Z1kSQ"}
```

</TabItem>
</Tabs>
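If you prefer Python to curl, a minimal `requests` sketch of the same `/key/generate` call; the proxy URL and master key mirror the curl example above, so adjust them for your deployment:

```python
import requests

# Mirrors the /key/generate curl above: create a key with the pii_masking guardrail switched on.
resp = requests.post(
    "http://0.0.0.0:4000/key/generate",
    headers={"Authorization": "Bearer sk-1234", "Content-Type": "application/json"},
    json={"permissions": {"pii_masking": True}},
)
print(resp.json())  # expect a "key" field plus the permissions you set
```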
**Step 2** Test it with the new key

```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
    --header 'Authorization: Bearer sk-jNm1Zar7XfNdZXp49Z1kSQ' \
    --header 'Content-Type: application/json' \
    --data '{
        "model": "llama3",
        "messages": [
            {
                "role": "user",
                "content": "does my phone number look correct - +1 412-612-9992"
            }
        ]
    }'
```

Expect to NOT see `+1 412-612-9992` in the logs sent to your callback.

:::info

The `pii_masking` guardrail ran on this request because API key `sk-jNm1Zar7XfNdZXp49Z1kSQ` has `"permissions": {"pii_masking": true}`

:::

## Spec for `guardrails` on litellm config
@@ -208,9 +296,9 @@ litellm_settings:

#### Guardrail: `prompt_injection`: Configuration for detecting and preventing prompt injection attacks.

- `callbacks`: List of LiteLLM callbacks used for this guardrail. [Can be one of `[lakera_prompt_injection, hide_secrets, llmguard_moderations, llamaguard_moderations, google_text_moderation]`](enterprise#content-moderation)
- `callbacks`: List of LiteLLM callbacks used for this guardrail. [Can be one of `[lakera_prompt_injection, hide_secrets, presidio, llmguard_moderations, llamaguard_moderations, google_text_moderation]`](enterprise#content-moderation)
- `default_on`: Boolean flag determining if this guardrail runs on all LLM requests by default.

#### Guardrail: `your-custom-guardrail`: Configuration for a user-defined custom guardrail.

- `callbacks`: List of callbacks for this custom guardrail. Can be one of `[lakera_prompt_injection, hide_secrets, llmguard_moderations, llamaguard_moderations, google_text_moderation]`
- `callbacks`: List of callbacks for this custom guardrail. Can be one of `[lakera_prompt_injection, hide_secrets, presidio, llmguard_moderations, llamaguard_moderations, google_text_moderation]`
- `default_on`: Boolean flag determining if this custom guardrail runs by default, set to false.
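To make the spec concrete, a small illustrative sketch (not part of LiteLLM) that loads a proxy config and reports which guardrails are on by default; the `config.yaml` path and the PyYAML dependency are assumptions:

```python
import yaml  # assumes PyYAML is installed

# Illustrative only: walk the guardrails spec shown above and report each guardrail's settings.
with open("config.yaml") as f:  # hypothetical path to your proxy config
    config = yaml.safe_load(f)

for entry in config.get("litellm_settings", {}).get("guardrails", []):
    for name, settings in entry.items():
        print(name, "callbacks:", settings.get("callbacks"), "default_on:", settings.get("default_on"))
```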
@@ -7,10 +7,13 @@ import TabItem from '@theme/TabItem';

Log Proxy Input, Output, Exceptions using Langfuse, OpenTelemetry, Custom Callbacks, DataDog, DynamoDB, s3 Bucket

## Table of Contents

- [Logging to Langfuse](#logging-proxy-inputoutput---langfuse)
- [Logging with OpenTelemetry](#logging-proxy-inputoutput-in-opentelemetry-format)
- [Async Custom Callbacks](#custom-callback-class-async)
- [Async Custom Callback APIs](#custom-callback-apis-async)
- [Logging to Galileo](#logging-llm-io-to-galileo)
- [Logging to OpenMeter](#logging-proxy-cost--usage---openmeter)
- [Logging to s3 Buckets](#logging-proxy-inputoutput---s3-buckets)
- [Logging to DataDog](#logging-proxy-inputoutput---datadog)
@@ -1056,6 +1059,68 @@ litellm_settings:

Start the LiteLLM Proxy and make a test request to verify the logs reached your callback API


## Logging LLM IO to Galileo

[BETA]

Log LLM I/O on [www.rungalileo.io](https://www.rungalileo.io/)

:::info

Beta Integration

:::

**Required Env Variables**

```bash
export GALILEO_BASE_URL=""  # For most users, this is the same as their console URL except with the word 'console' replaced by 'api' (e.g. http://www.console.galileo.myenterprise.com -> http://www.api.galileo.myenterprise.com)
export GALILEO_PROJECT_ID=""
export GALILEO_USERNAME=""
export GALILEO_PASSWORD=""
```

### Quick Start

1. Add to Config.yaml

```yaml
model_list:
  - litellm_params:
      api_base: https://exampleopenaiendpoint-production.up.railway.app/
      api_key: my-fake-key
      model: openai/my-fake-model
    model_name: fake-openai-endpoint

litellm_settings:
  success_callback: ["galileo"] # 👈 KEY CHANGE
```

2. Start Proxy

```
litellm --config /path/to/config.yaml
```

3. Test it!

```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
    --header 'Content-Type: application/json' \
    --data '{
        "model": "fake-openai-endpoint",
        "messages": [
            {
                "role": "user",
                "content": "what llm are you"
            }
        ]
    }'
```

🎉 That's it - Expect to see your logs on your Galileo Dashboard
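If you call LiteLLM from Python directly instead of through the proxy, a hedged sketch of the same integration, assuming the `galileo` callback name registered in this commit also works via `litellm.success_callback` and that the Galileo env variables above are exported:

```python
import litellm

# Assumes GALILEO_BASE_URL, GALILEO_PROJECT_ID, GALILEO_USERNAME, GALILEO_PASSWORD are set in the environment.
litellm.success_callback = ["galileo"]  # same callback name as the proxy config above

response = litellm.completion(
    model="gpt-3.5-turbo",  # any model you have credentials for
    messages=[{"role": "user", "content": "what llm are you"}],
)
print(response.choices[0].message.content)
```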
## Logging Proxy Cost + Usage - OpenMeter

Bill customers according to their LLM API usage with [OpenMeter](../observability/openmeter.md)
@@ -132,3 +132,9 @@ litellm_settings:
| `litellm_redis_latency` | Histogram latency for redis calls |
| `litellm_redis_fails` | Number of failed redis calls |
| `litellm_self_latency` | Histogram latency for successful litellm api calls |

## 🔥 Community Maintained Grafana Dashboards

Link to Grafana Dashboards made by the LiteLLM community

https://github.com/BerriAI/litellm/tree/main/cookbook/litellm_proxy_server/grafana_dashboard
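For a quick manual check, a small sketch that pulls the metrics above straight from the proxy; it assumes your LiteLLM Proxy runs locally with Prometheus metrics enabled and serves them on its `/metrics` endpoint as described in the Prometheus docs linked earlier:

```python
import requests

# Pull the raw Prometheus exposition text from the proxy and print the redis-related series.
metrics_text = requests.get("http://0.0.0.0:4000/metrics").text  # assumed local proxy URL
for line in metrics_text.splitlines():
    if line.startswith(("litellm_redis_latency", "litellm_redis_fails", "litellm_self_latency")):
        print(line)
```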
@@ -248,8 +248,14 @@ class RedisCache(BaseCache):
            # asyncio.get_running_loop().create_task(self.ping())
            result = asyncio.get_running_loop().create_task(self.ping())
        except Exception as e:
            if "no running event loop" in str(e):
                verbose_logger.debug(
                    "Ignoring async redis ping. No running event loop."
                )
            else:
                verbose_logger.error(
                    "Error connecting to Async Redis client", extra={"error": str(e)}
                    "Error connecting to Async Redis client - {}".format(str(e)),
                    extra={"error": str(e)},
                )

        ### SYNC HEALTH PING ###
litellm/integrations/galileo.py (new file, 159 lines)
@@ -0,0 +1,159 @@
import os
from datetime import datetime
from typing import Any, Dict, List, Optional

import httpx
from pydantic import BaseModel, Field

import litellm
from litellm._logging import verbose_logger
from litellm.integrations.custom_logger import CustomLogger
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler


# from here: https://docs.rungalileo.io/galileo/gen-ai-studio-products/galileo-observe/how-to/logging-data-via-restful-apis#structuring-your-records
class LLMResponse(BaseModel):
    latency_ms: int
    status_code: int
    input_text: str
    output_text: str
    node_type: str
    model: str
    num_input_tokens: int
    num_output_tokens: int
    output_logprobs: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Optional. When available, logprobs are used to compute Uncertainty.",
    )
    created_at: str = Field(
        ..., description='timestamp constructed in "%Y-%m-%dT%H:%M:%S" format'
    )
    tags: Optional[List[str]] = None
    user_metadata: Optional[Dict[str, Any]] = None


class GalileoObserve(CustomLogger):
    def __init__(self) -> None:
        self.in_memory_records: List[dict] = []
        self.batch_size = 1
        self.base_url = os.getenv("GALILEO_BASE_URL", None)
        self.project_id = os.getenv("GALILEO_PROJECT_ID", None)
        self.headers = None
        self.async_httpx_handler = AsyncHTTPHandler(
            timeout=httpx.Timeout(timeout=600.0, connect=5.0)
        )
        pass

    def set_galileo_headers(self):
        # following https://docs.rungalileo.io/galileo/gen-ai-studio-products/galileo-observe/how-to/logging-data-via-restful-apis#logging-your-records

        headers = {
            "accept": "application/json",
            "Content-Type": "application/x-www-form-urlencoded",
        }
        galileo_login_response = self.async_httpx_handler.post(
            url=f"{self.base_url}/login",
            headers=headers,
            data={
                "username": os.getenv("GALILEO_USERNAME"),
                "password": os.getenv("GALILEO_PASSWORD"),
            },
        )

        access_token = galileo_login_response.json()["access_token"]

        self.headers = {
            "accept": "application/json",
            "Content-Type": "application/json",
            "Authorization": f"Bearer {access_token}",
        }

    def get_output_str_from_response(self, response_obj, kwargs):
        output = None
        if response_obj is not None and (
            kwargs.get("call_type", None) == "embedding"
            or isinstance(response_obj, litellm.EmbeddingResponse)
        ):
            output = None
        elif response_obj is not None and isinstance(
            response_obj, litellm.ModelResponse
        ):
            output = response_obj["choices"][0]["message"].json()
        elif response_obj is not None and isinstance(
            response_obj, litellm.TextCompletionResponse
        ):
            output = response_obj.choices[0].text
        elif response_obj is not None and isinstance(
            response_obj, litellm.ImageResponse
        ):
            output = response_obj["data"]

        return output

    async def async_log_success_event(
        self,
        kwargs,
        start_time,
        end_time,
        response_obj,
    ):
        verbose_logger.debug(f"On Async Success")

        _latency_ms = int((end_time - start_time).total_seconds() * 1000)
        _call_type = kwargs.get("call_type", "litellm")
        input_text = litellm.utils.get_formatted_prompt(
            data=kwargs, call_type=_call_type
        )

        _usage = response_obj.get("usage", {}) or {}
        num_input_tokens = _usage.get("prompt_tokens", 0)
        num_output_tokens = _usage.get("completion_tokens", 0)

        output_text = self.get_output_str_from_response(
            response_obj=response_obj, kwargs=kwargs
        )

        request_record = LLMResponse(
            latency_ms=_latency_ms,
            status_code=200,
            input_text=input_text,
            output_text=output_text,
            node_type=_call_type,
            model=kwargs.get("model", "-"),
            num_input_tokens=num_input_tokens,
            num_output_tokens=num_output_tokens,
            created_at=start_time.strftime(
                "%Y-%m-%dT%H:%M:%S"
            ),  # timestamp str constructed in "%Y-%m-%dT%H:%M:%S" format
        )

        # dump to dict
        request_dict = request_record.model_dump()
        self.in_memory_records.append(request_dict)

        if len(self.in_memory_records) >= self.batch_size:
            await self.flush_in_memory_records()

    async def flush_in_memory_records(self):
        verbose_logger.debug("flushing in memory records")
        response = await self.async_httpx_handler.post(
            url=f"{self.base_url}/projects/{self.project_id}/observe/ingest",
            headers=self.headers,
            json={"records": self.in_memory_records},
        )

        if response.status_code == 200:
            verbose_logger.debug(
                "Galileo Logger:successfully flushed in memory records"
            )
            self.in_memory_records = []
        else:
            verbose_logger.debug("Galileo Logger: failed to flush in memory records")
            verbose_logger.debug(
                "Galileo Logger error=%s, status code=%s",
                response.text,
                response.status_code,
            )

    async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
        verbose_logger.debug(f"On Async Failure")
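To make the ingest payload concrete, a short sketch that builds one record the way `async_log_success_event` does and shows the body `flush_in_memory_records` POSTs to Galileo; the token counts and texts are made-up values:

```python
from datetime import datetime

from litellm.integrations.galileo import LLMResponse

# Hypothetical values, shaped like the fields async_log_success_event fills in.
record = LLMResponse(
    latency_ms=420,
    status_code=200,
    input_text="what llm are you",
    output_text="I am an example response.",
    node_type="completion",
    model="gpt-3.5-turbo",
    num_input_tokens=5,
    num_output_tokens=7,
    created_at=datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S"),
)

payload = {"records": [record.model_dump()]}  # body POSTed to /projects/{project_id}/observe/ingest
```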
@@ -56,6 +56,7 @@ from ..integrations.clickhouse import ClickhouseLogger
from ..integrations.custom_logger import CustomLogger
from ..integrations.datadog import DataDogLogger
from ..integrations.dynamodb import DyanmoDBLogger
from ..integrations.galileo import GalileoObserve
from ..integrations.greenscale import GreenscaleLogger
from ..integrations.helicone import HeliconeLogger
from ..integrations.lago import LagoLogger
@@ -153,11 +154,6 @@ class Logging:
        langfuse_secret=None,
        langfuse_host=None,
    ):
        if call_type not in [item.value for item in CallTypes]:
            allowed_values = ", ".join([item.value for item in CallTypes])
            raise ValueError(
                f"Invalid call_type {call_type}. Allowed values: {allowed_values}"
            )
        if messages is not None:
            if isinstance(messages, str):
                messages = [
@@ -604,8 +600,7 @@ class Logging:
            verbose_logger.error(
                "LiteLLM.LoggingError: [Non-Blocking] Exception occurred while building complete streaming response in success logging {}\n{}".format(
                    str(e), traceback.format_exc()
                ),
                log_level="ERROR",
            )
            )
            complete_streaming_response = None
        else:
@@ -1612,6 +1607,7 @@ class Logging:
                    )
                    == False
                ):  # custom logger class

                    callback.log_failure_event(
                        start_time=start_time,
                        end_time=end_time,
@@ -1929,6 +1925,15 @@ def _init_custom_logger_compatible_class(
        _openmeter_logger = OpenMeterLogger()
        _in_memory_loggers.append(_openmeter_logger)
        return _openmeter_logger  # type: ignore

    elif logging_integration == "galileo":
        for callback in _in_memory_loggers:
            if isinstance(callback, GalileoObserve):
                return callback  # type: ignore

        galileo_logger = GalileoObserve()
        _in_memory_loggers.append(galileo_logger)
        return galileo_logger  # type: ignore
    elif logging_integration == "logfire":
        if "LOGFIRE_TOKEN" not in os.environ:
            raise ValueError("LOGFIRE_TOKEN not found in environment variables")
@@ -1985,6 +1990,10 @@ def get_custom_logger_compatible_class(
        for callback in _in_memory_loggers:
            if isinstance(callback, OpenMeterLogger):
                return callback
    elif logging_integration == "galileo":
        for callback in _in_memory_loggers:
            if isinstance(callback, GalileoObserve):
                return callback
    elif logging_integration == "logfire":
        if "LOGFIRE_TOKEN" not in os.environ:
            raise ValueError("LOGFIRE_TOKEN not found in environment variables")
@@ -49,7 +49,7 @@ class AnthropicConstants(Enum):
class AnthropicError(Exception):
    def __init__(self, status_code, message):
        self.status_code = status_code
        self.message = message
        self.message: str = message
        self.request = httpx.Request(
            method="POST", url="https://api.anthropic.com/v1/messages"
        )
@@ -830,6 +830,16 @@ class ModelResponseIterator:
                .get("usage", {})
                .get("output_tokens", 0),
            )
        elif type_chunk == "error":
            """
            {"type":"error","error":{"details":null,"type":"api_error","message":"Internal server error"} }
            """
            _error_dict = chunk.get("error", {}) or {}
            message = _error_dict.get("message", None) or str(chunk)
            raise AnthropicError(
                message=message,
                status_code=500,  # it looks like Anthropic API does not return a status code in the chunk error - default to 500
            )
        returned_chunk = GenericStreamingChunk(
            text=text,
            tool_use=tool_use,
@@ -58,7 +58,33 @@ class NvidiaNimConfig:
            and v is not None
        }

    def get_supported_openai_params(self):
    def get_supported_openai_params(self, model: str) -> list:
        """
        Get the supported OpenAI params for the given model


        Updated on July 5th, 2024 - based on https://docs.api.nvidia.com/nim/reference
        """
        if model in [
            "google/recurrentgemma-2b",
            "google/gemma-2-27b-it",
            "google/gemma-2-9b-it",
            "gemma-2-9b-it",
        ]:
            return ["stream", "temperature", "top_p", "max_tokens", "stop", "seed"]
        elif model == "nvidia/nemotron-4-340b-instruct":
            return [
                "stream",
                "temperature",
                "top_p",
                "max_tokens",
            ]
        elif model == "nvidia/nemotron-4-340b-reward":
            return [
                "stream",
            ]
        elif model in ["google/codegemma-1.1-7b"]:
            # most params - but no 'seed' :(
            return [
                "stream",
                "temperature",
@@ -68,11 +94,44 @@ class NvidiaNimConfig:
                "max_tokens",
                "stop",
            ]
        else:
            # DEFAULT Case - The vast majority of Nvidia NIM Models lie here
            # "upstage/solar-10.7b-instruct",
            # "snowflake/arctic",
            # "seallms/seallm-7b-v2.5",
            # "nvidia/llama3-chatqa-1.5-8b",
            # "nvidia/llama3-chatqa-1.5-70b",
            # "mistralai/mistral-large",
            # "mistralai/mixtral-8x22b-instruct-v0.1",
            # "mistralai/mixtral-8x7b-instruct-v0.1",
            # "mistralai/mistral-7b-instruct-v0.3",
            # "mistralai/mistral-7b-instruct-v0.2",
            # "mistralai/codestral-22b-instruct-v0.1",
            # "microsoft/phi-3-small-8k-instruct",
            # "microsoft/phi-3-small-128k-instruct",
            # "microsoft/phi-3-mini-4k-instruct",
            # "microsoft/phi-3-mini-128k-instruct",
            # "microsoft/phi-3-medium-4k-instruct",
            # "microsoft/phi-3-medium-128k-instruct",
            # "meta/llama3-70b-instruct",
            # "meta/llama3-8b-instruct",
            # "meta/llama2-70b",
            # "meta/codellama-70b",
            return [
                "stream",
                "temperature",
                "top_p",
                "frequency_penalty",
                "presence_penalty",
                "max_tokens",
                "stop",
                "seed",
            ]

    def map_openai_params(
        self, non_default_params: dict, optional_params: dict
        self, model: str, non_default_params: dict, optional_params: dict
    ) -> dict:
        supported_openai_params = self.get_supported_openai_params()
        supported_openai_params = self.get_supported_openai_params(model=model)
        for param, value in non_default_params.items():
            if param in supported_openai_params:
                optional_params[param] = value
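For a quick sense of the per-model lookup added here, a hedged usage example of the two methods above; the chosen model and parameter values are only for illustration:

```python
import litellm

config = litellm.NvidiaNimConfig()

# Per the mapping above, the reward model only supports "stream".
print(config.get_supported_openai_params(model="nvidia/nemotron-4-340b-reward"))

# Params not in the supported list (e.g. temperature here) are simply not copied over.
optional_params = config.map_openai_params(
    model="nvidia/nemotron-4-340b-reward",
    non_default_params={"stream": True, "temperature": 0.2},
    optional_params={},
)
print(optional_params)  # expected: {"stream": True}
```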
@@ -2022,10 +2022,10 @@
        "max_tokens": 8192,
        "max_input_tokens": 2097152,
        "max_output_tokens": 8192,
        "input_cost_per_token": 0.00000035,
        "input_cost_per_token_above_128k_tokens": 0.0000007,
        "output_cost_per_token": 0.00000105,
        "output_cost_per_token_above_128k_tokens": 0.0000021,
        "input_cost_per_token": 0.0000035,
        "input_cost_per_token_above_128k_tokens": 0.000007,
        "output_cost_per_token": 0.0000105,
        "output_cost_per_token_above_128k_tokens": 0.000021,
        "litellm_provider": "gemini",
        "mode": "chat",
        "supports_system_messages": true,
@@ -2033,16 +2033,16 @@
        "supports_vision": true,
        "supports_tool_choice": true,
        "supports_response_schema": true,
        "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
        "source": "https://ai.google.dev/pricing"
    },
    "gemini/gemini-1.5-pro-latest": {
        "max_tokens": 8192,
        "max_input_tokens": 1048576,
        "max_output_tokens": 8192,
        "input_cost_per_token": 0.00000035,
        "input_cost_per_token_above_128k_tokens": 0.0000007,
        "input_cost_per_token": 0.0000035,
        "input_cost_per_token_above_128k_tokens": 0.000007,
        "output_cost_per_token": 0.00000105,
        "output_cost_per_token_above_128k_tokens": 0.0000021,
        "output_cost_per_token_above_128k_tokens": 0.000021,
        "litellm_provider": "gemini",
        "mode": "chat",
        "supports_system_messages": true,
@@ -2050,7 +2050,7 @@
        "supports_vision": true,
        "supports_tool_choice": true,
        "supports_response_schema": true,
        "source": "https://ai.google.dev/models/gemini"
        "source": "https://ai.google.dev/pricing"
    },
    "gemini/gemini-pro-vision": {
        "max_tokens": 2048,
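As a sanity check on the corrected prices, a small worked example with the new `gemini/gemini-1.5-pro-latest` per-token rates; the request sizes are made up and the below-128k rates are assumed to apply:

```python
# New rates from the pricing entries above (per token, below the 128k threshold).
input_cost_per_token = 0.0000035
output_cost_per_token = 0.0000105

prompt_tokens = 1000       # hypothetical request
completion_tokens = 500

cost = prompt_tokens * input_cost_per_token + completion_tokens * output_cost_per_token
print(f"${cost:.6f}")  # 1000 * 0.0000035 + 500 * 0.0000105 = $0.008750
```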
@@ -1,5 +1,6 @@
from litellm._logging import verbose_proxy_logger
from litellm.proxy.guardrails.init_guardrails import guardrail_name_config_map
from litellm.proxy.proxy_server import UserAPIKeyAuth
from litellm.types.guardrails import *
@@ -47,3 +48,44 @@ async def should_proceed_based_on_metadata(data: dict, guardrail_name: str) -> bool:
        return False

    return True


async def should_proceed_based_on_api_key(
    user_api_key_dict: UserAPIKeyAuth, guardrail_name: str
) -> bool:
    """
    checks if this guardrail should be applied to this call
    """
    if user_api_key_dict.permissions is not None:
        # { prompt_injection: true, rail_2: false }
        verbose_proxy_logger.debug(
            "Guardrails valid for API Key= %s - checking which to apply",
            user_api_key_dict.permissions,
        )

        if not isinstance(user_api_key_dict.permissions, dict):
            verbose_proxy_logger.error(
                "API Key permissions must be a dict - %s running guardrail %s",
                user_api_key_dict,
                guardrail_name,
            )
            return True

        for _guardrail_name, should_run in user_api_key_dict.permissions.items():
            if should_run is False:
                verbose_proxy_logger.debug(
                    "Guardrail %s skipped because request set to False",
                    _guardrail_name,
                )
                continue

            # lookup the guardrail in guardrail_name_config_map
            guardrail_item: GuardrailItem = guardrail_name_config_map[_guardrail_name]

            guardrail_callbacks = guardrail_item.callbacks
            if guardrail_name in guardrail_callbacks:
                return True

        # Do not proceed if - "metadata": { "guardrails": { "lakera_prompt_injection": false } }
        return False
    return True
@@ -7,6 +7,7 @@ import os
import re
import smtplib
import subprocess
import threading
import time
import traceback
from datetime import datetime, timedelta
@@ -49,6 +50,7 @@ from litellm.proxy.hooks.max_budget_limiter import _PROXY_MaxBudgetLimiter
from litellm.proxy.hooks.parallel_request_limiter import (
    _PROXY_MaxParallelRequestsHandler,
)
from litellm.types.utils import CallTypes

if TYPE_CHECKING:
    from opentelemetry.trace import Span as _Span
@@ -354,35 +356,6 @@ class ProxyLogging:
            print_verbose(f"final data being sent to {call_type} call: {data}")
            return data
        except Exception as e:
            if "litellm_logging_obj" in data:
                logging_obj: litellm.litellm_core_utils.litellm_logging.Logging = data[
                    "litellm_logging_obj"
                ]

                ## ASYNC FAILURE HANDLER ##
                error_message = ""
                if isinstance(e, HTTPException):
                    if isinstance(e.detail, str):
                        error_message = e.detail
                    elif isinstance(e.detail, dict):
                        error_message = json.dumps(e.detail)
                    else:
                        error_message = str(e)
                else:
                    error_message = str(e)
                error_raised = Exception(f"{error_message}")
                await logging_obj.async_failure_handler(
                    exception=error_raised,
                    traceback_exception=traceback.format_exc(),
                )

                ## SYNC FAILURE HANDLER ##
                try:
                    logging_obj.failure_handler(
                        error_raised, traceback.format_exc()
                    )  # DO NOT MAKE THREADED - router retry fallback relies on this!
                except Exception as error_val:
                    pass
            raise e

    async def during_call_hook(
@@ -597,18 +570,22 @@ class ProxyLogging:
        )

        ### LOGGING ###
        if isinstance(original_exception, HTTPException):
        litellm_logging_obj: Optional[Logging] = request_data.get(
            "litellm_logging_obj", None
        )

        if isinstance(original_exception, HTTPException):
            if litellm_logging_obj is None:
                import uuid

                request_data["litellm_call_id"] = str(uuid.uuid4())
                litellm_logging_obj, data = litellm.utils.function_setup(
                    original_function="IGNORE_THIS",
                    rules_obj=litellm.utils.Rules(),
                    start_time=datetime.now(),
                    **request_data,
                )

            if litellm_logging_obj is not None:
                # log the custom exception
                await litellm_logging_obj.async_failure_handler(
                    exception=original_exception,
@@ -617,6 +594,16 @@ class ProxyLogging:
                    end_time=time.time(),
                )

                threading.Thread(
                    target=litellm_logging_obj.failure_handler,
                    args=(
                        original_exception,
                        traceback.format_exc(),
                        time.time(),
                        time.time(),
                    ),
                ).start()

        for callback in litellm.callbacks:
            try:
                _callback: Optional[CustomLogger] = None
@@ -1607,7 +1607,17 @@ def test_caching_redis_simple(caplog):
        print(m)
    print(time.time() - s2)

    redis_async_caching_error = False
    redis_service_logging_error = False
    captured_logs = [rec.message for rec in caplog.records]

    assert "LiteLLM Redis Caching: async set" not in captured_logs
    assert "ServiceLogging.async_service_success_hook" not in captured_logs
    print(f"captured_logs: {captured_logs}")
    for item in captured_logs:
        if "Error connecting to Async Redis client" in item:
            redis_async_caching_error = True

        if "ServiceLogging.async_service_success_hook" in item:
            redis_service_logging_error = True

    assert redis_async_caching_error is False
    assert redis_service_logging_error is False
@@ -3602,6 +3602,8 @@ def test_completion_nvidia_nim():
                    "content": "What's the weather like in Boston today in Fahrenheit?",
                }
            ],
            presence_penalty=0.5,
            frequency_penalty=0.1,
        )
        # Add any assertions here to check the response
        print(response)
@@ -23,6 +23,8 @@ import os
sys.path.insert(
    0, os.path.abspath("../..")
)  # Adds the parent directory to the system path
from typing import Literal

import pytest
from fastapi import Request, Response
from starlette.datastructures import URL
@@ -49,18 +51,32 @@ from litellm.router import Router
class testLogger(CustomLogger):

    def __init__(self):
        self.reaches_failure_event = False
        self.reaches_sync_failure_event = False
        self.reaches_async_failure_event = False

    async def async_pre_call_check(self, deployment: dict):
    async def async_pre_call_hook(
        self,
        user_api_key_dict: UserAPIKeyAuth,
        cache: DualCache,
        data: dict,
        call_type: Literal[
            "completion",
            "text_completion",
            "embeddings",
            "image_generation",
            "moderation",
            "audio_transcription",
        ],
    ):
        raise HTTPException(
            status_code=429, detail={"error": "Max parallel request limit reached"}
        )

    async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
        self.reaches_failure_event = True
        return await super().async_log_failure_event(
            kwargs, response_obj, start_time, end_time
        )
        self.reaches_async_failure_event = True

    def log_failure_event(self, kwargs, response_obj, start_time, end_time):
        self.reaches_sync_failure_event = True


router = Router(
@@ -92,15 +108,15 @@ router = Router(
            ],
        },
    ),
    # ("/v1/completions", {"model": "fake-model", "prompt": "ping"}),
    # (
    #     "/v1/embeddings",
    #     {
    #         "input": "The food was delicious and the waiter...",
    #         "model": "text-embedding-ada-002",
    #         "encoding_format": "float",
    #     },
    # ),
    ("/v1/completions", {"model": "fake-model", "prompt": "ping"}),
    (
        "/v1/embeddings",
        {
            "input": "The food was delicious and the waiter...",
            "model": "text-embedding-ada-002",
            "encoding_format": "float",
        },
    ),
    ],
)
@pytest.mark.asyncio
@@ -169,4 +185,6 @@ async def test_chat_completion_request_with_redaction(route, body):
        pass
    await asyncio.sleep(3)

    assert _test_logger.reaches_failure_event is True
    assert _test_logger.reaches_async_failure_event is True

    assert _test_logger.reaches_sync_failure_event is True
@@ -531,6 +531,8 @@ def function_setup(
            call_type == CallTypes.aspeech.value or call_type == CallTypes.speech.value
        ):
            messages = kwargs.get("input", "speech")
        else:
            messages = "default-message-value"
        stream = True if "stream" in kwargs and kwargs["stream"] == True else False
        logging_obj = litellm.litellm_core_utils.litellm_logging.Logging(
            model=model,
@@ -561,10 +563,8 @@ def function_setup(
        )
        return logging_obj, kwargs
    except Exception as e:
        import logging

        logging.debug(
            f"[Non-Blocking] {traceback.format_exc()}; args - {args}; kwargs - {kwargs}"
        verbose_logger.error(
            f"litellm.utils.py::function_setup() - [Non-Blocking] {traceback.format_exc()}; args - {args}; kwargs - {kwargs}"
        )
        raise e
@@ -3184,7 +3184,9 @@ def get_optional_params(
        )
        _check_valid_arg(supported_params=supported_params)
        optional_params = litellm.NvidiaNimConfig().map_openai_params(
            non_default_params=non_default_params, optional_params=optional_params
            model=model,
            non_default_params=non_default_params,
            optional_params=optional_params,
        )
    elif custom_llm_provider == "fireworks_ai":
        supported_params = get_supported_openai_params(
@@ -3776,7 +3778,7 @@ def get_supported_openai_params(
    elif custom_llm_provider == "fireworks_ai":
        return litellm.FireworksAIConfig().get_supported_openai_params()
    elif custom_llm_provider == "nvidia_nim":
        return litellm.NvidiaNimConfig().get_supported_openai_params()
        return litellm.NvidiaNimConfig().get_supported_openai_params(model=model)
    elif custom_llm_provider == "volcengine":
        return litellm.VolcEngineConfig().get_supported_openai_params(model=model)
    elif custom_llm_provider == "groq":
@@ -2022,10 +2022,10 @@
        "max_tokens": 8192,
        "max_input_tokens": 2097152,
        "max_output_tokens": 8192,
        "input_cost_per_token": 0.00000035,
        "input_cost_per_token_above_128k_tokens": 0.0000007,
        "output_cost_per_token": 0.00000105,
        "output_cost_per_token_above_128k_tokens": 0.0000021,
        "input_cost_per_token": 0.0000035,
        "input_cost_per_token_above_128k_tokens": 0.000007,
        "output_cost_per_token": 0.0000105,
        "output_cost_per_token_above_128k_tokens": 0.000021,
        "litellm_provider": "gemini",
        "mode": "chat",
        "supports_system_messages": true,
"output_cost_per_token": 0.00000105,
|
||||
"output_cost_per_token_above_128k_tokens": 0.0000021,
|
||||
"input_cost_per_token": 0.0000035,
|
||||
"input_cost_per_token_above_128k_tokens": 0.000007,
|
||||
"output_cost_per_token": 0.0000105,
|
||||
"output_cost_per_token_above_128k_tokens": 0.000021,
|
||||
"litellm_provider": "gemini",
|
||||
"mode": "chat",
|
||||
"supports_system_messages": true,
|
||||
|
@ -2033,16 +2033,16 @@
|
|||
"supports_vision": true,
|
||||
"supports_tool_choice": true,
|
||||
"supports_response_schema": true,
|
||||
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
|
||||
"source": "https://ai.google.dev/pricing"
|
||||
},
|
||||
"gemini/gemini-1.5-pro-latest": {
|
||||
"max_tokens": 8192,
|
||||
"max_input_tokens": 1048576,
|
||||
"max_output_tokens": 8192,
|
||||
"input_cost_per_token": 0.00000035,
|
||||
"input_cost_per_token_above_128k_tokens": 0.0000007,
|
||||
"input_cost_per_token": 0.0000035,
|
||||
"input_cost_per_token_above_128k_tokens": 0.000007,
|
||||
"output_cost_per_token": 0.00000105,
|
||||
"output_cost_per_token_above_128k_tokens": 0.0000021,
|
||||
"output_cost_per_token_above_128k_tokens": 0.000021,
|
||||
"litellm_provider": "gemini",
|
||||
"mode": "chat",
|
||||
"supports_system_messages": true,
|
||||
|
@ -2050,7 +2050,7 @@
|
|||
"supports_vision": true,
|
||||
"supports_tool_choice": true,
|
||||
"supports_response_schema": true,
|
||||
"source": "https://ai.google.dev/models/gemini"
|
||||
"source": "https://ai.google.dev/pricing"
|
||||
},
|
||||
"gemini/gemini-pro-vision": {
|
||||
"max_tokens": 2048,
|
||||
|
|
|
@@ -1,6 +1,6 @@
[tool.poetry]
name = "litellm"
version = "1.41.7"
version = "1.41.8"
description = "Library to easily interface with LLM API providers"
authors = ["BerriAI"]
license = "MIT"
@@ -90,7 +90,7 @@ requires = ["poetry-core", "wheel"]
build-backend = "poetry.core.masonry.api"

[tool.commitizen]
version = "1.41.7"
version = "1.41.8"
version_files = [
    "pyproject.toml:^version"
]
@@ -41,7 +41,7 @@ importlib-metadata==6.8.0 # for random utils
tokenizers==0.14.0 # for calculating usage
click==8.1.7 # for proxy cli
jinja2==3.1.4 # for prompt templates
certifi==2023.7.22 # [TODO] clean up
certifi==2024.7.4 # [TODO] clean up
aiohttp==3.9.0 # for network calls
aioboto3==12.3.0 # for async sagemaker calls
tenacity==8.2.3 # for retrying requests, when litellm.num_retries set