Merge branch 'main' into fix/health-check-interval

Commit 322c7cd353 by Florian Greinacher, 2024-10-28 21:27:03 +01:00 (committed by GitHub)
GPG key ID: B5690EEEBB952194 (no known key found for this signature in database)
136 changed files with 5845 additions and 3096 deletions


@ -119,7 +119,7 @@ jobs:
paths:
- local_testing_coverage.xml
- local_testing_coverage
ui_endpoint_testing:
auth_ui_unit_tests:
docker:
- image: cimg/python:3.11
auth:
@ -161,8 +161,8 @@ jobs:
- run:
name: Rename the coverage files
command: |
mv coverage.xml ui_endpoint_testing_coverage.xml
mv .coverage ui_endpoint_testing_coverage
mv coverage.xml auth_ui_unit_tests_coverage.xml
mv .coverage auth_ui_unit_tests_coverage
# Store test results
- store_test_results:
@ -171,8 +171,8 @@ jobs:
- persist_to_workspace:
root: .
paths:
- ui_endpoint_testing_coverage.xml
- ui_endpoint_testing_coverage
- auth_ui_unit_tests_coverage.xml
- auth_ui_unit_tests_coverage
litellm_router_testing: # Runs all tests with the "router" keyword
docker:
- image: cimg/python:3.11
@ -416,15 +416,17 @@ jobs:
command: |
python -m pip install --upgrade pip
pip install ruff
pip install pylint
pip install pylint
pip install pyright
pip install .
curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
- run: python -c "from litellm import *" || (echo '🚨 import failed, this means you introduced unprotected imports! 🚨'; exit 1)
- run: ruff check ./litellm
- run: python ./tests/documentation_tests/test_general_setting_keys.py
- run: python ./tests/code_coverage_tests/router_code_coverage.py
- run: python ./tests/documentation_tests/test_env_keys.py
- run: helm lint ./deploy/charts/litellm-helm
db_migration_disable_update_check:
machine:
image: ubuntu-2204:2023.10.1
@ -811,7 +813,7 @@ jobs:
python -m venv venv
. venv/bin/activate
pip install coverage
coverage combine llm_translation_coverage logging_coverage litellm_router_coverage local_testing_coverage litellm_assistants_api_coverage ui_endpoint_testing_coverage
coverage combine llm_translation_coverage logging_coverage litellm_router_coverage local_testing_coverage litellm_assistants_api_coverage auth_ui_unit_tests_coverage
coverage xml
- codecov/upload:
file: ./coverage.xml
@ -1011,7 +1013,7 @@ workflows:
only:
- main
- /litellm_.*/
- ui_endpoint_testing:
- auth_ui_unit_tests:
filters:
branches:
only:
@ -1060,7 +1062,7 @@ workflows:
- litellm_router_testing
- local_testing
- litellm_assistants_api_testing
- ui_endpoint_testing
- auth_ui_unit_tests
- db_migration_disable_update_check:
filters:
branches:
@ -1088,7 +1090,7 @@ workflows:
- logging_testing
- litellm_router_testing
- litellm_assistants_api_testing
- ui_endpoint_testing
- auth_ui_unit_tests
- db_migration_disable_update_check
- e2e_ui_testing
- installing_litellm_on_python
@ -1099,4 +1101,4 @@ workflows:
branches:
only:
- main


@ -50,6 +50,9 @@ jobs:
current-version: ${{ steps.current_version.outputs.current-version || '0.1.0' }}
version-fragment: 'bug'
- name: Lint helm chart
run: helm lint deploy/charts/litellm-helm
- uses: ./.github/actions/helm-oci-chart-releaser
with:
name: litellm-helm
@ -61,4 +64,4 @@ jobs:
registry_username: ${{ github.actor }}
registry_password: ${{ secrets.GITHUB_TOKEN }}
update_dependencies: true


@ -18,4 +18,15 @@ component_management:
paths:
- "*/proxy/auth/**"
comment:
layout: "header, diff, flags, components" # show component info in the PR comment
layout: "header, diff, flags, components" # show component info in the PR comment
coverage:
status:
project:
default:
target: auto
threshold: 1% # at maximum allow project coverage to drop by 1%
patch:
default:
target: auto
threshold: 0% # patch coverage should be 100%


@ -1,72 +0,0 @@
import clickhouse_connect
import datetime as datetime
import os
client = clickhouse_connect.get_client(
host=os.getenv("CLICKHOUSE_HOST"),
port=int(os.getenv("CLICKHOUSE_PORT")),
username=os.getenv("CLICKHOUSE_USERNAME"),
password=os.getenv("CLICKHOUSE_PASSWORD"),
)
import clickhouse_connect
row1 = [
"ishaan", # request_id
"GET", # call_type
"api_key_123", # api_key
50.00, # spend
1000, # total_tokens
800, # prompt_tokens
200, # completion_tokens
datetime.datetime.now(), # startTime (replace with the actual timestamp)
datetime.datetime.now(), # endTime (replace with the actual timestamp)
"gpt-3.5", # model
"user123", # user
'{"key": "value"}', # metadata (replace with valid JSON)
"True", # cache_hit
"cache_key_123", # cache_key
"tag1,tag2", # request_tags
]
row2 = [
"jaffer", # request_id
"POST", # call_type
"api_key_456", # api_key
30.50, # spend
800, # total_tokens
600, # prompt_tokens
200, # completion_tokens
datetime.datetime.now(), # startTime (replace with the actual timestamp)
datetime.datetime.now(), # endTime (replace with the actual timestamp)
"gpt-4.0", # model
"user456", # user
'{"key": "value"}', # metadata (replace with valid JSON)
"False", # cache_hit
"cache_key_789", # cache_key
"tag3,tag4", # request_tags
]
data = [row1, row2]
resp = client.insert(
"spend_logs",
data,
column_names=[
"request_id",
"call_type",
"api_key",
"spend",
"total_tokens",
"prompt_tokens",
"completion_tokens",
"startTime",
"endTime",
"model",
"user",
"metadata",
"cache_hit",
"cache_key",
"request_tags",
],
)
print(resp)


@ -1,39 +0,0 @@
# insert data into clickhouse
# response = client.command(
# """
# CREATE TEMPORARY TABLE temp_spend_logs AS (
# SELECT
# generateUUIDv4() AS request_id,
# arrayElement(['TypeA', 'TypeB', 'TypeC'], rand() % 3 + 1) AS call_type,
# 'ishaan' as api_key,
# rand() * 1000 AS spend,
# rand() * 100 AS total_tokens,
# rand() * 50 AS prompt_tokens,
# rand() * 50 AS completion_tokens,
# toDate('2024-02-01') + toIntervalDay(rand()%27) AS startTime,
# now() AS endTime,
# arrayElement(['azure/gpt-4', 'gpt-3.5', 'vertexai/gemini-pro', 'mistral/mistral-small', 'ollama/llama2'], rand() % 3 + 1) AS model,
# 'ishaan-insert-rand' as user,
# 'data' as metadata,
# 'true'AS cache_hit,
# 'ishaan' as cache_key,
# '{"tag1": "value1", "tag2": "value2"}' AS request_tags
# FROM numbers(1, 1000000)
# );
# """
# )
# client.command(
# """
# -- Insert data into spend_logs table
# INSERT INTO spend_logs
# SELECT * FROM temp_spend_logs;
# """
# )
# client.command(
# """
# DROP TABLE IF EXISTS temp_spend_logs;
# """
# )


@ -24,7 +24,7 @@ version: 0.3.0
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes.
appVersion: v1.46.6
appVersion: v1.50.2
dependencies:
- name: "postgresql"


@ -28,14 +28,13 @@ If `db.useStackgresOperator` is used (not yet implemented):
| `image.repository` | LiteLLM Proxy image repository | `ghcr.io/berriai/litellm` |
| `image.pullPolicy` | LiteLLM Proxy image pull policy | `IfNotPresent` |
| `image.tag` | Overrides the image tag, whose default is the latest version of LiteLLM at the time this chart was published. | `""` |
| `image.dbReadyImage` | On Pod startup, an initContainer is used to make sure the Postgres database is available before attempting to start LiteLLM. This field specifies the image to use as that initContainer. | `docker.io/bitnami/postgresql` |
| `image.dbReadyTag` | Tag for the above image. If not specified, "latest" is used. | `""` |
| `imagePullSecrets` | Registry credentials for the LiteLLM and initContainer images. | `[]` |
| `serviceAccount.create` | Whether or not to create a Kubernetes Service Account for this deployment. The default is `false` because LiteLLM has no need to access the Kubernetes API. | `false` |
| `service.type` | Kubernetes Service type (e.g. `LoadBalancer`, `ClusterIP`, etc.) | `ClusterIP` |
| `service.port` | TCP port that the Kubernetes Service will listen on. Also the TCP port within the Pod that the proxy will listen on. | `4000` |
| `ingress.*` | See [values.yaml](./values.yaml) for example settings | N/A |
| `proxy_config.*` | See [values.yaml](./values.yaml) for default settings. See [example_config_yaml](../../../litellm/proxy/example_config_yaml/) for configuration examples. | N/A |
| `extraContainers[]` | An array of additional containers to be deployed as sidecars alongside the LiteLLM Proxy. | `[]` |
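For illustration, a minimal sketch of what the new `extraContainers` value might look like in `values.yaml`; the sidecar name, image, and command below are hypothetical placeholders, not part of the chart:

```yaml
# Hypothetical example: run a simple sidecar next to the LiteLLM Proxy container
extraContainers:
  - name: metrics-sidecar            # placeholder name
    image: busybox:1.36              # placeholder image
    command: ["sh", "-c", "sleep infinity"]
```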
#### Example `environmentSecrets` Secret
@ -127,4 +126,4 @@ kubectl -n litellm get secret <RELEASE>-litellm-masterkey -o jsonpath="{.data.ma
At the time of writing, the Admin UI is unable to add models. This is because
it would need to update the `config.yaml` file, which is an exposed ConfigMap and
therefore read-only. This is a limitation of this helm chart, not the Admin UI
itself.
itself.


@ -31,71 +31,6 @@ spec:
serviceAccountName: {{ include "litellm.serviceAccountName" . }}
securityContext:
{{- toYaml .Values.podSecurityContext | nindent 8 }}
initContainers:
- name: db-ready
securityContext:
{{- toYaml .Values.securityContext | nindent 12 }}
image: "{{ .Values.image.dbReadyImage }}:{{ .Values.image.dbReadyTag | default("16.1.0-debian-11-r20") }}"
imagePullPolicy: {{ .Values.image.pullPolicy }}
env:
{{- if .Values.db.deployStandalone }}
- name: DATABASE_USERNAME
valueFrom:
secretKeyRef:
name: {{ include "litellm.fullname" . }}-dbcredentials
key: username
- name: PGPASSWORD
valueFrom:
secretKeyRef:
name: {{ include "litellm.fullname" . }}-dbcredentials
key: password
- name: DATABASE_HOST
value: {{ .Release.Name }}-postgresql
- name: DATABASE_NAME
value: litellm
{{- else if .Values.db.useExisting }}
- name: DATABASE_USERNAME
valueFrom:
secretKeyRef:
name: {{ .Values.db.secret.name }}
key: {{ .Values.db.secret.usernameKey }}
- name: PGPASSWORD
valueFrom:
secretKeyRef:
name: {{ .Values.db.secret.name }}
key: {{ .Values.db.secret.passwordKey }}
- name: DATABASE_HOST
value: {{ .Values.db.endpoint }}
- name: DATABASE_NAME
value: {{ .Values.db.database }}
{{- end }}
command:
- sh
- -c
- |
# Maximum wait time will be (limit * 2) seconds.
limit=60
current=0
ret=1
while [ $current -lt $limit ] && [ $ret -ne 0 ]; do
echo "Waiting for database to be ready $current"
psql -U $(DATABASE_USERNAME) -h $(DATABASE_HOST) -l
ret=$?
current=$(( $current + 1 ))
sleep 2
done
if [ $ret -eq 0 ]; then
echo "Database is ready"
else
echo "Database failed to become ready before we gave up waiting."
fi
resources:
{{- toYaml .Values.resources | nindent 12 }}
{{ if .Values.securityContext.readOnlyRootFilesystem }}
volumeMounts:
- name: tmp
mountPath: /tmp
{{ end }}
containers:
- name: {{ include "litellm.name" . }}
securityContext:
@ -203,6 +138,9 @@ spec:
{{- with .Values.volumeMounts }}
{{- toYaml . | nindent 12 }}
{{- end }}
{{- with .Values.extraContainers }}
{{- toYaml . | nindent 8 }}
{{- end }}
volumes:
{{ if .Values.securityContext.readOnlyRootFilesystem }}
- name: tmp
@ -235,4 +173,4 @@ spec:
{{- with .Values.tolerations }}
tolerations:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}


@ -7,16 +7,11 @@ replicaCount: 1
image:
# Use "ghcr.io/berriai/litellm-database" for optimized image with database
repository: ghcr.io/berriai/litellm-database
pullPolicy: IfNotPresent
pullPolicy: Always
# Overrides the image tag whose default is the chart appVersion.
# tag: "main-latest"
tag: ""
# Image and tag used for the init container to check and wait for the
# readiness of the postgres database.
dbReadyImage: docker.io/bitnami/postgresql
dbReadyTag: ""
imagePullSecrets: []
nameOverride: "litellm"
fullnameOverride: ""


@ -84,6 +84,60 @@ print(query_result[:5])
</TabItem>
</Tabs>
## Image Embeddings
For models that support image embeddings, you can pass in a base64 encoded image string to the `input` param.
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import embedding
import os
# set your api key
os.environ["COHERE_API_KEY"] = ""
response = embedding(model="cohere/embed-english-v3.0", input=["<base64 encoded image>"])
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: cohere-embed
litellm_params:
model: cohere/embed-english-v3.0
api_key: os.environ/COHERE_API_KEY
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml
# RUNNING on http://0.0.0.0:4000
```
3. Test it!
```bash
curl -X POST 'http://0.0.0.0:4000/v1/embeddings' \
-H 'Authorization: Bearer sk-54d77cd67b9febbb' \
-H 'Content-Type: application/json' \
-d '{
"model": "cohere/embed-english-v3.0",
"input": ["<base64 encoded image>"]
}'
```
</TabItem>
</Tabs>
## Input Params for `litellm.embedding()`


@ -62,7 +62,8 @@ litellm_settings:
environment_variables:
ARIZE_SPACE_KEY: "d0*****"
ARIZE_API_KEY: "141a****"
ARIZE_ENDPOINT: "https://otlp.arize.com/v1" # OPTIONAL - your custom arize api endpoint
ARIZE_ENDPOINT: "https://otlp.arize.com/v1" # OPTIONAL - your custom arize GRPC api endpoint
ARIZE_HTTP_ENDPOINT: "https://otlp.arize.com/v1" # OPTIONAL - your custom arize HTTP api endpoint. Set either this or ARIZE_ENDPOINT
```
## Support & Talk to Founders


@ -9,12 +9,11 @@ LiteLLM requires `boto3` to be installed on your system for Bedrock requests
pip install boto3>=1.28.57
```
## Required Environment Variables
```python
os.environ["AWS_ACCESS_KEY_ID"] = "" # Access key
os.environ["AWS_SECRET_ACCESS_KEY"] = "" # Secret access key
os.environ["AWS_REGION_NAME"] = "" # us-east-1, us-east-2, us-west-1, us-west-2
```
:::info
LiteLLM uses boto3 to handle authentication. All these options are supported - https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html#credentials.
:::
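As a point of reference, the same credentials can also be supplied per model in the proxy `config.yaml` (set up later on this page). A hedged sketch only; the model entry and `aws_*` parameter names are assumptions for illustration, reusing the `os.environ/` secret syntax shown elsewhere in these docs:

```yaml
# Illustrative sketch - model id and parameter names are assumptions
model_list:
  - model_name: bedrock-claude
    litellm_params:
      model: bedrock/anthropic.claude-3-sonnet-20240229-v1:0   # example Bedrock model id
      aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID
      aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY
      aws_region_name: os.environ/AWS_REGION_NAME
```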
## Usage
@ -22,6 +21,7 @@ os.environ["AWS_REGION_NAME"] = "" # us-east-1, us-east-2, us-west-1, us-west-2
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>
```python
import os
from litellm import completion
@ -38,7 +38,7 @@ response = completion(
## LiteLLM Proxy Usage
Here's how to call Anthropic with the LiteLLM Proxy Server
Here's how to call Bedrock with the LiteLLM Proxy Server
### 1. Setup config.yaml


@ -135,7 +135,7 @@ Cli arguments, --host, --port, --num_workers
```
## --request_timeout
- **Default:** `600`
- **Default:** `6000`
- **Type:** `int`
- Set the timeout in seconds for completion calls.
- **Usage:**


@ -625,6 +625,7 @@ litellm_settings:
redact_user_api_key_info: boolean # Redact information about the user api key (hashed token, user_id, team id, etc.), from logs. Currently supported for Langfuse, OpenTelemetry, Logfire, ArizeAI logging.
langfuse_default_tags: ["cache_hit", "cache_key", "proxy_base_url", "user_api_key_alias", "user_api_key_user_id", "user_api_key_user_email", "user_api_key_team_alias", "semantic-similarity", "proxy_base_url"] # default tags for Langfuse Logging
request_timeout: 10 # (int) llm request timeout in seconds. Raise Timeout error if call takes longer than 10s. Sets litellm.request_timeout
set_verbose: boolean # sets litellm.set_verbose=True to view verbose debug logs. DO NOT LEAVE THIS ON IN PRODUCTION
json_logs: boolean # if true, logs will be in json format
@ -721,6 +722,7 @@ general_settings:
| set_verbose | boolean | If true, sets litellm.set_verbose=True to view verbose debug logs. DO NOT LEAVE THIS ON IN PRODUCTION |
| json_logs | boolean | If true, logs will be in json format. If you need to store the logs as JSON, just set the `litellm.json_logs = True`. We currently just log the raw POST request from litellm as a JSON [Further docs](./debugging) |
| default_fallbacks | array of strings | List of fallback models to use if a specific model group is misconfigured / bad. [Further docs](./reliability#default-fallbacks) |
| request_timeout | integer | The timeout for requests in seconds. If not set, the default value is `6000 seconds`. [For reference, the OpenAI Python SDK defaults to `600 seconds`.](https://github.com/openai/openai-python/blob/main/src/openai/_constants.py) |
| content_policy_fallbacks | array of objects | Fallbacks to use when a ContentPolicyViolationError is encountered. [Further docs](./reliability#content-policy-fallbacks) |
| context_window_fallbacks | array of objects | Fallbacks to use when a ContextWindowExceededError is encountered. [Further docs](./reliability#context-window-fallbacks) |
| cache | boolean | If true, enables caching. [Further docs](./caching) |
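To make the `request_timeout` row above concrete, a minimal `litellm_settings` sketch (the value shown is only an example, matching the production-setup docs later in this commit):

```yaml
litellm_settings:
  request_timeout: 600  # raise a Timeout error if an LLM call takes longer than 600 seconds
```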
@ -812,6 +814,7 @@ general_settings:
| pass_through_endpoints | List[Dict[str, Any]] | Define the pass through endpoints. [Docs](./pass_through) |
| enable_oauth2_proxy_auth | boolean | (Enterprise Feature) If true, enables oauth2.0 authentication |
| forward_openai_org_id | boolean | If true, forwards the OpenAI Organization ID to the backend LLM call (if it's OpenAI). |
| forward_client_headers_to_llm_api | boolean | If true, forwards the client headers (any `x-` headers) to the backend LLM call |
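And for the new `forward_client_headers_to_llm_api` flag above, a hedged `general_settings` sketch:

```yaml
general_settings:
  forward_client_headers_to_llm_api: true  # forward client `x-*` headers to the backend LLM call
```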
### router_settings - Reference
@ -898,10 +901,6 @@ router_settings:
| BRAINTRUST_API_KEY | API key for Braintrust integration
| CIRCLE_OIDC_TOKEN | OpenID Connect token for CircleCI
| CIRCLE_OIDC_TOKEN_V2 | Version 2 of the OpenID Connect token for CircleCI
| CLICKHOUSE_HOST | Host for ClickHouse database
| CLICKHOUSE_PASSWORD | Password for ClickHouse authentication
| CLICKHOUSE_PORT | Port for ClickHouse database connection
| CLICKHOUSE_USERNAME | Username for ClickHouse authentication
| CONFIG_FILE_PATH | File path for configuration file
| CUSTOM_TIKTOKEN_CACHE_DIR | Custom directory for Tiktoken cache
| DATABASE_HOST | Hostname for the database server
@ -919,6 +918,7 @@ router_settings:
| DD_API_KEY | API key for Datadog integration
| DD_SITE | Site URL for Datadog (e.g., datadoghq.com)
| DD_SOURCE | Source identifier for Datadog logs
| DD_ENV | Environment identifier for Datadog logs. Only supported for `datadog_llm_observability` callback
| DEBUG_OTEL | Enable debug mode for OpenTelemetry
| DIRECT_URL | Direct URL for service endpoint
| DISABLE_ADMIN_UI | Toggle to disable the admin UI


@ -57,4 +57,34 @@ model_list:
api_version: os.environ/AZURE_API_VERSION
input_cost_per_token: 0.000421 # 👈 ONLY to track cost per token
output_cost_per_token: 0.000520 # 👈 ONLY to track cost per token
```
```
### Debugging
If your custom pricing is not being used or you're seeing errors, please check the following:
1. Run the proxy with `LITELLM_LOG="DEBUG"` or the `--detailed_debug` cli flag
```bash
litellm --config /path/to/config.yaml --detailed_debug
```
2. Check logs for this line:
```
LiteLLM:DEBUG: utils.py:263 - litellm.acompletion
```
3. Check that 'input_cost_per_token' and 'output_cost_per_token' are passed as top-level keys to the acompletion function:
```python
acompletion(
    ...,
    input_cost_per_token=my_custom_price,
    output_cost_per_token=my_custom_price,
)
```
If these keys are not present, LiteLLM will not use your custom pricing.
If the problem persists, please file an issue on [GitHub](https://github.com/BerriAI/litellm/issues).


@ -1279,7 +1279,8 @@ litellm_settings:
environment_variables:
ARIZE_SPACE_KEY: "d0*****"
ARIZE_API_KEY: "141a****"
ARIZE_ENDPOINT: "https://otlp.arize.com/v1" # OPTIONAL - your custom arize api endpoint
ARIZE_ENDPOINT: "https://otlp.arize.com/v1" # OPTIONAL - your custom arize GRPC api endpoint
ARIZE_HTTP_ENDPOINT: "https://otlp.arize.com/v1" # OPTIONAL - your custom arize HTTP api endpoint. Set either this or ARIZE_ENDPOINT
```
2. Start Proxy
@ -1467,6 +1468,13 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
## Logging Proxy Input/Output - DataDog
LiteLLM supports logging to the following Datadog integrations:
- `datadog` [Datadog Logs](https://docs.datadoghq.com/logs/)
- `datadog_llm_observability` [Datadog LLM Observability](https://www.datadoghq.com/product/llm-observability/)
<Tabs>
<TabItem value="datadog" label="Datadog Logs">
We will use the `--config` to set `litellm.success_callback = ["datadog"]`. This will log all successful LLM calls to DataDog.
**Step 1**: Create a `config.yaml` file and set `litellm_settings`: `success_callback`
@ -1481,6 +1489,21 @@ litellm_settings:
service_callback: ["datadog"] # logs redis, postgres failures on datadog
```
</TabItem>
<TabItem value="datadog_llm_observability" label="Datadog LLM Observability">
```yaml
model_list:
- model_name: gpt-3.5-turbo
litellm_params:
model: gpt-3.5-turbo
litellm_settings:
callbacks: ["datadog_llm_observability"] # logs llm success logs on datadog
```
</TabItem>
</Tabs>
**Step 2**: Set Required env variables for datadog
```shell


@ -21,6 +21,7 @@ general_settings:
database_connection_pool_limit: 10 # limit the number of database connections to = MAX Number of DB Connections/Number of instances of litellm proxy (Around 10-20 is good number)
litellm_settings:
request_timeout: 600 # raise Timeout error if call takes longer than 600 seconds. Default value is 6000 seconds if not set
set_verbose: False # Switch off Debug Logging, ensure your logs do not have any debugging on
json_logs: true # Get debug logs in json format
```


@ -83,4 +83,21 @@ ws.on("message", function incoming(message) {
ws.on("error", function handleError(error) {
console.error("Error: ", error);
});
```
```
## Logging
To prevent requests from being dropped, by default LiteLLM just logs these event types:
- `session.created`
- `response.create`
- `response.done`
You can override this by setting the `logged_real_time_event_types` parameter in the config. For example:
```yaml
litellm_settings:
logged_real_time_event_types: "*" # Log all events
## OR ##
logged_real_time_event_types: ["session.created", "response.create", "response.done"] # Log only these event types
```


@ -1312,7 +1312,7 @@ LiteLLM proxy adds **0.00325 seconds** latency as compared to using the Raw Open
```
#### --request_timeout
- **Default:** `600`
- **Default:** `6000`
- **Type:** `int`
- Set the timeout in seconds for completion calls.
- **Usage:**


@ -12447,9 +12447,9 @@
}
},
"node_modules/http-proxy-middleware": {
"version": "2.0.6",
"resolved": "https://registry.npmjs.org/http-proxy-middleware/-/http-proxy-middleware-2.0.6.tgz",
"integrity": "sha512-ya/UeJ6HVBYxrgYotAZo1KvPWlgB48kUJLDePFeneHsVujFaW5WNj2NgWCAE//B1Dl02BIfYlpNgBy8Kf8Rjmw==",
"version": "2.0.7",
"resolved": "https://registry.npmjs.org/http-proxy-middleware/-/http-proxy-middleware-2.0.7.tgz",
"integrity": "sha512-fgVY8AV7qU7z/MmXJ/rxwbrtQH4jBQ9m7kp3llF0liB7glmFeVZFBepQb32T3y8n8k2+AEYuMPCpinYW+/CuRA==",
"dependencies": {
"@types/http-proxy": "^1.17.8",
"http-proxy": "^1.18.1",


@ -8,6 +8,7 @@ import os
from typing import Callable, List, Optional, Dict, Union, Any, Literal, get_args
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from litellm.caching.caching import Cache, DualCache, RedisCache, InMemoryCache
from litellm.types.llms.bedrock import COHERE_EMBEDDING_INPUT_TYPES
from litellm._logging import (
set_verbose,
_turn_on_debug,
@ -48,6 +49,7 @@ _custom_logger_compatible_callbacks_literal = Literal[
"langsmith",
"prometheus",
"datadog",
"datadog_llm_observability",
"galileo",
"braintrust",
"arize",
@ -56,6 +58,7 @@ _custom_logger_compatible_callbacks_literal = Literal[
"opik",
"argilla",
]
logged_real_time_event_types: Optional[Union[List[str], Literal["*"]]] = None
_known_custom_logger_compatible_callbacks: List = list(
get_args(_custom_logger_compatible_callbacks_literal)
)
@ -79,6 +82,9 @@ turn_off_message_logging: Optional[bool] = False
log_raw_request_response: bool = False
redact_messages_in_exceptions: Optional[bool] = False
redact_user_api_key_info: Optional[bool] = False
add_user_information_to_llm_headers: Optional[bool] = (
None # adds user_id, team_id, token hash (params from StandardLoggingMetadata) to request headers
)
store_audit_logs = False # Enterprise feature, allow users to see audit logs
## end of callbacks #############
@ -132,7 +138,7 @@ enable_azure_ad_token_refresh: Optional[bool] = False
### DEFAULT AZURE API VERSION ###
AZURE_DEFAULT_API_VERSION = "2024-08-01-preview" # this is updated to the latest
### COHERE EMBEDDINGS DEFAULT TYPE ###
COHERE_DEFAULT_EMBEDDING_INPUT_TYPE = "search_document"
COHERE_DEFAULT_EMBEDDING_INPUT_TYPE: COHERE_EMBEDDING_INPUT_TYPES = "search_document"
### GUARDRAILS ###
llamaguard_model_name: Optional[str] = None
openai_moderations_model_name: Optional[str] = None
@ -159,9 +165,6 @@ enable_caching_on_provider_specific_optional_params: bool = (
caching: bool = (
False # Not used anymore, will be removed in next MAJOR release - https://github.com/BerriAI/litellm/discussions/648
)
always_read_redis: bool = (
True # always use redis for rate limiting logic on litellm proxy
)
caching_with_models: bool = (
False # # Not used anymore, will be removed in next MAJOR release - https://github.com/BerriAI/litellm/discussions/648
)


@ -69,6 +69,8 @@ def _get_redis_cluster_kwargs(client=None):
available_args = [x for x in arg_spec.args if x not in exclude_args]
available_args.append("password")
available_args.append("username")
available_args.append("ssl")
return available_args


@ -233,7 +233,7 @@ class Cache:
if self.namespace is not None and isinstance(self.cache, RedisCache):
self.cache.namespace = self.namespace
def get_cache_key(self, *args, **kwargs) -> str: # noqa: PLR0915
def get_cache_key(self, *args, **kwargs) -> str:
"""
Get the cache key for the given arguments.


@ -32,7 +32,6 @@ class DualCache(BaseCache):
redis_cache: Optional[RedisCache] = None,
default_in_memory_ttl: Optional[float] = None,
default_redis_ttl: Optional[float] = None,
always_read_redis: Optional[bool] = True,
) -> None:
super().__init__()
# If in_memory_cache is not provided, use the default InMemoryCache
@ -44,7 +43,6 @@ class DualCache(BaseCache):
default_in_memory_ttl or litellm.default_in_memory_ttl
)
self.default_redis_ttl = default_redis_ttl or litellm.default_redis_ttl
self.always_read_redis = always_read_redis
def update_cache_ttl(
self, default_in_memory_ttl: Optional[float], default_redis_ttl: Optional[float]
@ -102,12 +100,8 @@ class DualCache(BaseCache):
if in_memory_result is not None:
result = in_memory_result
if (
(self.always_read_redis is True)
and self.redis_cache is not None
and local_only is False
):
# If not found in in-memory cache or always_read_redis is True, try fetching from Redis
if result is None and self.redis_cache is not None and local_only is False:
# If not found in in-memory cache, try fetching from Redis
redis_result = self.redis_cache.get_cache(key, **kwargs)
if redis_result is not None:


@ -1,167 +0,0 @@
#### What this does ####
# On success + failure, log events to aispend.io
import datetime
import os
import traceback
import dotenv
model_cost = {
"gpt-3.5-turbo": {
"max_tokens": 4000,
"input_cost_per_token": 0.0000015,
"output_cost_per_token": 0.000002,
},
"gpt-35-turbo": {
"max_tokens": 4000,
"input_cost_per_token": 0.0000015,
"output_cost_per_token": 0.000002,
}, # azure model name
"gpt-3.5-turbo-0613": {
"max_tokens": 4000,
"input_cost_per_token": 0.0000015,
"output_cost_per_token": 0.000002,
},
"gpt-3.5-turbo-0301": {
"max_tokens": 4000,
"input_cost_per_token": 0.0000015,
"output_cost_per_token": 0.000002,
},
"gpt-3.5-turbo-16k": {
"max_tokens": 16000,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000004,
},
"gpt-35-turbo-16k": {
"max_tokens": 16000,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000004,
}, # azure model name
"gpt-3.5-turbo-16k-0613": {
"max_tokens": 16000,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000004,
},
"gpt-4": {
"max_tokens": 8000,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.00006,
},
"gpt-4-0613": {
"max_tokens": 8000,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.00006,
},
"gpt-4-32k": {
"max_tokens": 8000,
"input_cost_per_token": 0.00006,
"output_cost_per_token": 0.00012,
},
"claude-instant-1": {
"max_tokens": 100000,
"input_cost_per_token": 0.00000163,
"output_cost_per_token": 0.00000551,
},
"claude-2": {
"max_tokens": 100000,
"input_cost_per_token": 0.00001102,
"output_cost_per_token": 0.00003268,
},
"text-bison-001": {
"max_tokens": 8192,
"input_cost_per_token": 0.000004,
"output_cost_per_token": 0.000004,
},
"chat-bison-001": {
"max_tokens": 4096,
"input_cost_per_token": 0.000002,
"output_cost_per_token": 0.000002,
},
"command-nightly": {
"max_tokens": 4096,
"input_cost_per_token": 0.000015,
"output_cost_per_token": 0.000015,
},
}
class AISpendLogger:
# Class variables or attributes
def __init__(self):
# Instance variables
self.account_id = os.getenv("AISPEND_ACCOUNT_ID")
self.api_key = os.getenv("AISPEND_API_KEY")
def price_calculator(self, model, response_obj, start_time, end_time):
# try and find if the model is in the model_cost map
# else default to the average of the costs
prompt_tokens_cost_usd_dollar = 0
completion_tokens_cost_usd_dollar = 0
if model in model_cost:
prompt_tokens_cost_usd_dollar = (
model_cost[model]["input_cost_per_token"]
* response_obj["usage"]["prompt_tokens"]
)
completion_tokens_cost_usd_dollar = (
model_cost[model]["output_cost_per_token"]
* response_obj["usage"]["completion_tokens"]
)
elif "replicate" in model:
# replicate models are charged based on time
# llama 2 runs on an nvidia a100 which costs $0.0032 per second - https://replicate.com/replicate/llama-2-70b-chat
model_run_time = end_time - start_time # assuming time in seconds
cost_usd_dollar = model_run_time * 0.0032
prompt_tokens_cost_usd_dollar = cost_usd_dollar / 2
completion_tokens_cost_usd_dollar = cost_usd_dollar / 2
else:
# calculate average input cost
input_cost_sum = 0
output_cost_sum = 0
for model in model_cost:
input_cost_sum += model_cost[model]["input_cost_per_token"]
output_cost_sum += model_cost[model]["output_cost_per_token"]
prompt_tokens_cost_usd_dollar = (
model_cost[model]["input_cost_per_token"]
* response_obj["usage"]["prompt_tokens"]
)
completion_tokens_cost_usd_dollar = (
model_cost[model]["output_cost_per_token"]
* response_obj["usage"]["completion_tokens"]
)
return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
def log_event(self, model, response_obj, start_time, end_time, print_verbose):
# Method definition
try:
print_verbose(
f"AISpend Logging - Enters logging function for model {model}"
)
response_timestamp = datetime.datetime.fromtimestamp(
int(response_obj["created"])
).strftime("%Y-%m-%d")
(
prompt_tokens_cost_usd_dollar,
completion_tokens_cost_usd_dollar,
) = self.price_calculator(model, response_obj, start_time, end_time)
prompt_tokens_cost_usd_cent = prompt_tokens_cost_usd_dollar * 100
completion_tokens_cost_usd_cent = completion_tokens_cost_usd_dollar * 100
data = [
{
"requests": 1,
"requests_context": 1,
"context_tokens": response_obj["usage"]["prompt_tokens"],
"requests_generated": 1,
"generated_tokens": response_obj["usage"]["completion_tokens"],
"recorded_date": response_timestamp,
"model_id": response_obj["model"],
"generated_tokens_cost_usd_cent": prompt_tokens_cost_usd_cent,
"context_tokens_cost_usd_cent": completion_tokens_cost_usd_cent,
}
]
print_verbose(f"AISpend Logging - final data object: {data}")
except Exception:
print_verbose(f"AISpend Logging Error - {traceback.format_exc()}")
pass


@ -7,135 +7,208 @@ this file has Arize ai specific helper functions
import json
from typing import TYPE_CHECKING, Any, Optional, Union
from litellm._logging import verbose_proxy_logger
from litellm._logging import verbose_logger
if TYPE_CHECKING:
from opentelemetry.trace import Span as _Span
from .opentelemetry import OpenTelemetryConfig as _OpenTelemetryConfig
Span = _Span
OpenTelemetryConfig = _OpenTelemetryConfig
else:
Span = Any
OpenTelemetryConfig = Any
import os
from litellm.types.integrations.arize import *
def make_json_serializable(payload: dict) -> dict:
for key, value in payload.items():
class ArizeLogger:
@staticmethod
def set_arize_ai_attributes(span: Span, kwargs, response_obj):
from litellm.integrations._types.open_inference import (
MessageAttributes,
MessageContentAttributes,
OpenInferenceSpanKindValues,
SpanAttributes,
)
try:
if isinstance(value, dict):
# recursively sanitize dicts
payload[key] = make_json_serializable(value.copy())
elif not isinstance(value, (str, int, float, bool, type(None))):
# everything else becomes a string
payload[key] = str(value)
except Exception:
# non blocking if it can't cast to a str
optional_params = kwargs.get("optional_params", {})
# litellm_params = kwargs.get("litellm_params", {}) or {}
#############################################
############ LLM CALL METADATA ##############
#############################################
# commented out for now - looks like Arize AI could not log this
# metadata = litellm_params.get("metadata", {}) or {}
# span.set_attribute(SpanAttributes.METADATA, str(metadata))
#############################################
########## LLM Request Attributes ###########
#############################################
# The name of the LLM a request is being made to
if kwargs.get("model"):
span.set_attribute(SpanAttributes.LLM_MODEL_NAME, kwargs.get("model"))
span.set_attribute(
SpanAttributes.OPENINFERENCE_SPAN_KIND,
OpenInferenceSpanKindValues.LLM.value,
)
messages = kwargs.get("messages")
# for /chat/completions
# https://docs.arize.com/arize/large-language-models/tracing/semantic-conventions
if messages:
span.set_attribute(
SpanAttributes.INPUT_VALUE,
messages[-1].get("content", ""), # get the last message for input
)
# LLM_INPUT_MESSAGES shows up under `input_messages` tab on the span page
for idx, msg in enumerate(messages):
# Set the role per message
span.set_attribute(
f"{SpanAttributes.LLM_INPUT_MESSAGES}.{idx}.{MessageAttributes.MESSAGE_ROLE}",
msg["role"],
)
# Set the content per message
span.set_attribute(
f"{SpanAttributes.LLM_INPUT_MESSAGES}.{idx}.{MessageAttributes.MESSAGE_CONTENT}",
msg.get("content", ""),
)
# The Generative AI Provider: Azure, OpenAI, etc.
_optional_params = ArizeLogger.make_json_serializable(optional_params)
_json_optional_params = json.dumps(_optional_params)
span.set_attribute(
SpanAttributes.LLM_INVOCATION_PARAMETERS, _json_optional_params
)
if optional_params.get("user"):
span.set_attribute(SpanAttributes.USER_ID, optional_params.get("user"))
#############################################
########## LLM Response Attributes ##########
# https://docs.arize.com/arize/large-language-models/tracing/semantic-conventions
#############################################
for choice in response_obj.get("choices"):
response_message = choice.get("message", {})
span.set_attribute(
SpanAttributes.OUTPUT_VALUE, response_message.get("content", "")
)
# This shows up under `output_messages` tab on the span page
# This code assumes a single response
span.set_attribute(
f"{SpanAttributes.LLM_OUTPUT_MESSAGES}.0.{MessageAttributes.MESSAGE_ROLE}",
response_message["role"],
)
span.set_attribute(
f"{SpanAttributes.LLM_OUTPUT_MESSAGES}.0.{MessageAttributes.MESSAGE_CONTENT}",
response_message.get("content", ""),
)
usage = response_obj.get("usage")
if usage:
span.set_attribute(
SpanAttributes.LLM_TOKEN_COUNT_TOTAL,
usage.get("total_tokens"),
)
# The number of tokens used in the LLM response (completion).
span.set_attribute(
SpanAttributes.LLM_TOKEN_COUNT_COMPLETION,
usage.get("completion_tokens"),
)
# The number of tokens used in the LLM prompt.
span.set_attribute(
SpanAttributes.LLM_TOKEN_COUNT_PROMPT,
usage.get("prompt_tokens"),
)
pass
return payload
except Exception as e:
verbose_logger.error(f"Error setting arize attributes: {e}")
###################### Helper functions ######################
def set_arize_ai_attributes(span: Span, kwargs, response_obj):
from litellm.integrations._types.open_inference import (
MessageAttributes,
MessageContentAttributes,
OpenInferenceSpanKindValues,
SpanAttributes,
)
@staticmethod
def _get_arize_config() -> ArizeConfig:
"""
Helper function to get Arize configuration.
try:
Returns:
ArizeConfig: A Pydantic model containing Arize configuration.
optional_params = kwargs.get("optional_params", {})
# litellm_params = kwargs.get("litellm_params", {}) or {}
Raises:
ValueError: If required environment variables are not set.
"""
space_key = os.environ.get("ARIZE_SPACE_KEY")
api_key = os.environ.get("ARIZE_API_KEY")
#############################################
############ LLM CALL METADATA ##############
#############################################
# commented out for now - looks like Arize AI could not log this
# metadata = litellm_params.get("metadata", {}) or {}
# span.set_attribute(SpanAttributes.METADATA, str(metadata))
if not space_key:
raise ValueError("ARIZE_SPACE_KEY not found in environment variables")
if not api_key:
raise ValueError("ARIZE_API_KEY not found in environment variables")
#############################################
########## LLM Request Attributes ###########
#############################################
# The name of the LLM a request is being made to
if kwargs.get("model"):
span.set_attribute(SpanAttributes.LLM_MODEL_NAME, kwargs.get("model"))
span.set_attribute(
SpanAttributes.OPENINFERENCE_SPAN_KIND,
OpenInferenceSpanKindValues.LLM.value,
)
messages = kwargs.get("messages")
# for /chat/completions
# https://docs.arize.com/arize/large-language-models/tracing/semantic-conventions
if messages:
span.set_attribute(
SpanAttributes.INPUT_VALUE,
messages[-1].get("content", ""), # get the last message for input
grpc_endpoint = os.environ.get("ARIZE_ENDPOINT")
http_endpoint = os.environ.get("ARIZE_HTTP_ENDPOINT")
if grpc_endpoint is None and http_endpoint is None:
# use default arize grpc endpoint
verbose_logger.debug(
"No ARIZE_ENDPOINT or ARIZE_HTTP_ENDPOINT found, using default endpoint: https://otlp.arize.com/v1"
)
grpc_endpoint = "https://otlp.arize.com/v1"
# LLM_INPUT_MESSAGES shows up under `input_messages` tab on the span page
for idx, msg in enumerate(messages):
# Set the role per message
span.set_attribute(
f"{SpanAttributes.LLM_INPUT_MESSAGES}.{idx}.{MessageAttributes.MESSAGE_ROLE}",
msg["role"],
)
# Set the content per message
span.set_attribute(
f"{SpanAttributes.LLM_INPUT_MESSAGES}.{idx}.{MessageAttributes.MESSAGE_CONTENT}",
msg.get("content", ""),
)
# The Generative AI Provider: Azure, OpenAI, etc.
_optional_params = make_json_serializable(optional_params)
_json_optional_params = json.dumps(_optional_params)
span.set_attribute(
SpanAttributes.LLM_INVOCATION_PARAMETERS, _json_optional_params
return ArizeConfig(
space_key=space_key,
api_key=api_key,
grpc_endpoint=grpc_endpoint,
http_endpoint=http_endpoint,
)
if optional_params.get("user"):
span.set_attribute(SpanAttributes.USER_ID, optional_params.get("user"))
@staticmethod
def get_arize_opentelemetry_config() -> Optional[OpenTelemetryConfig]:
"""
Helper function to get OpenTelemetry configuration for Arize.
#############################################
########## LLM Response Attributes ##########
# https://docs.arize.com/arize/large-language-models/tracing/semantic-conventions
#############################################
for choice in response_obj.get("choices"):
response_message = choice.get("message", {})
span.set_attribute(
SpanAttributes.OUTPUT_VALUE, response_message.get("content", "")
Args:
arize_config (ArizeConfig): Arize configuration object.
Returns:
OpenTelemetryConfig: Configuration for OpenTelemetry.
"""
from .opentelemetry import OpenTelemetryConfig
arize_config = ArizeLogger._get_arize_config()
if arize_config.http_endpoint:
return OpenTelemetryConfig(
exporter="otlp_http",
endpoint=arize_config.http_endpoint,
)
# This shows up under `output_messages` tab on the span page
# This code assumes a single response
span.set_attribute(
f"{SpanAttributes.LLM_OUTPUT_MESSAGES}.0.{MessageAttributes.MESSAGE_ROLE}",
response_message["role"],
)
span.set_attribute(
f"{SpanAttributes.LLM_OUTPUT_MESSAGES}.0.{MessageAttributes.MESSAGE_CONTENT}",
response_message.get("content", ""),
)
# use default arize grpc endpoint
return OpenTelemetryConfig(
exporter="otlp_grpc",
endpoint=arize_config.grpc_endpoint,
)
usage = response_obj.get("usage")
if usage:
span.set_attribute(
SpanAttributes.LLM_TOKEN_COUNT_TOTAL,
usage.get("total_tokens"),
)
# The number of tokens used in the LLM response (completion).
span.set_attribute(
SpanAttributes.LLM_TOKEN_COUNT_COMPLETION,
usage.get("completion_tokens"),
)
# The number of tokens used in the LLM prompt.
span.set_attribute(
SpanAttributes.LLM_TOKEN_COUNT_PROMPT,
usage.get("prompt_tokens"),
)
pass
except Exception as e:
verbose_proxy_logger.error(f"Error setting arize attributes: {e}")
@staticmethod
def make_json_serializable(payload: dict) -> dict:
for key, value in payload.items():
try:
if isinstance(value, dict):
# recursively sanitize dicts
payload[key] = ArizeLogger.make_json_serializable(value.copy())
elif not isinstance(value, (str, int, float, bool, type(None))):
# everything else becomes a string
payload[key] = str(value)
except Exception:
# non blocking if it can't cast to a str
pass
return payload


@ -1,104 +0,0 @@
#### What this does ####
# On success + failure, log events to aispend.io
import datetime
import os
import traceback
import dotenv
import requests # type: ignore
model_cost = {
"gpt-3.5-turbo": {
"max_tokens": 4000,
"input_cost_per_token": 0.0000015,
"output_cost_per_token": 0.000002,
},
"gpt-35-turbo": {
"max_tokens": 4000,
"input_cost_per_token": 0.0000015,
"output_cost_per_token": 0.000002,
}, # azure model name
"gpt-3.5-turbo-0613": {
"max_tokens": 4000,
"input_cost_per_token": 0.0000015,
"output_cost_per_token": 0.000002,
},
"gpt-3.5-turbo-0301": {
"max_tokens": 4000,
"input_cost_per_token": 0.0000015,
"output_cost_per_token": 0.000002,
},
"gpt-3.5-turbo-16k": {
"max_tokens": 16000,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000004,
},
"gpt-35-turbo-16k": {
"max_tokens": 16000,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000004,
}, # azure model name
"gpt-3.5-turbo-16k-0613": {
"max_tokens": 16000,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000004,
},
"gpt-4": {
"max_tokens": 8000,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.00006,
},
"gpt-4-0613": {
"max_tokens": 8000,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.00006,
},
"gpt-4-32k": {
"max_tokens": 8000,
"input_cost_per_token": 0.00006,
"output_cost_per_token": 0.00012,
},
"claude-instant-1": {
"max_tokens": 100000,
"input_cost_per_token": 0.00000163,
"output_cost_per_token": 0.00000551,
},
"claude-2": {
"max_tokens": 100000,
"input_cost_per_token": 0.00001102,
"output_cost_per_token": 0.00003268,
},
"text-bison-001": {
"max_tokens": 8192,
"input_cost_per_token": 0.000004,
"output_cost_per_token": 0.000004,
},
"chat-bison-001": {
"max_tokens": 4096,
"input_cost_per_token": 0.000002,
"output_cost_per_token": 0.000002,
},
"command-nightly": {
"max_tokens": 4096,
"input_cost_per_token": 0.000015,
"output_cost_per_token": 0.000015,
},
}
class BerriSpendLogger:
# Class variables or attributes
def __init__(self):
# Instance variables
self.account_id = os.getenv("BERRISPEND_ACCOUNT_ID")
def price_calculator(self, model, response_obj, start_time, end_time):
return
def log_event(
self, model, messages, response_obj, start_time, end_time, print_verbose
):
"""
This integration is not implemented yet.
"""
return


@ -1,334 +0,0 @@
# callback to make a request to an API endpoint
#### What this does ####
# On success, logs events to Promptlayer
import datetime
import json
import os
import traceback
from typing import Literal, Optional, Union
import dotenv
import requests
import litellm
from litellm._logging import verbose_logger
from litellm.caching.caching import DualCache
from litellm.proxy._types import UserAPIKeyAuth
from litellm.types.utils import StandardLoggingPayload
#### What this does ####
# On success + failure, log events to Supabase
def create_client():
try:
import clickhouse_connect
port = os.getenv("CLICKHOUSE_PORT")
clickhouse_host = os.getenv("CLICKHOUSE_HOST")
if clickhouse_host is not None:
verbose_logger.debug("setting up clickhouse")
port = os.getenv("CLICKHOUSE_PORT")
if port is not None and isinstance(port, str):
port = int(port)
host: Optional[str] = os.getenv("CLICKHOUSE_HOST")
if host is None:
raise ValueError("CLICKHOUSE_HOST is not set")
username: Optional[str] = os.getenv("CLICKHOUSE_USERNAME")
if username is None:
raise ValueError("CLICKHOUSE_USERNAME is not set")
password: Optional[str] = os.getenv("CLICKHOUSE_PASSWORD")
if password is None:
raise ValueError("CLICKHOUSE_PASSWORD is not set")
if port is None:
raise ValueError("CLICKHOUSE_PORT is not set")
client = clickhouse_connect.get_client(
host=host,
port=port,
username=username,
password=password,
)
return client
else:
raise Exception("Clickhouse: Clickhouse host not set")
except Exception as e:
raise ValueError(f"Clickhouse: {e}")
def build_daily_metrics():
click_house_client = create_client()
# get daily spend
daily_spend = click_house_client.query_df(
"""
SELECT sumMerge(DailySpend) as daily_spend, day FROM daily_aggregated_spend GROUP BY day
"""
)
# get daily spend per model
daily_spend_per_model = click_house_client.query_df(
"""
SELECT sumMerge(DailySpend) as daily_spend, day, model FROM daily_aggregated_spend_per_model GROUP BY day, model
"""
)
new_df = daily_spend_per_model.to_dict(orient="records")
import pandas as pd
df = pd.DataFrame(new_df)
# Group by 'day' and create a dictionary for each group
result_dict = {}
for day, group in df.groupby("day"):
models = group["model"].tolist()
spend = group["daily_spend"].tolist()
spend_per_model = {model: spend for model, spend in zip(models, spend)}
result_dict[day] = spend_per_model
# Display the resulting dictionary
# get daily spend per API key
daily_spend_per_api_key = click_house_client.query_df(
"""
SELECT
daily_spend,
day,
api_key
FROM (
SELECT
sumMerge(DailySpend) as daily_spend,
day,
api_key,
RANK() OVER (PARTITION BY day ORDER BY sumMerge(DailySpend) DESC) as spend_rank
FROM
daily_aggregated_spend_per_api_key
GROUP BY
day,
api_key
) AS ranked_api_keys
WHERE
spend_rank <= 5
AND day IS NOT NULL
ORDER BY
day,
daily_spend DESC
"""
)
new_df = daily_spend_per_api_key.to_dict(orient="records")
import pandas as pd
df = pd.DataFrame(new_df)
# Group by 'day' and create a dictionary for each group
api_key_result_dict = {}
for day, group in df.groupby("day"):
api_keys = group["api_key"].tolist()
spend = group["daily_spend"].tolist()
spend_per_api_key = {api_key: spend for api_key, spend in zip(api_keys, spend)}
api_key_result_dict[day] = spend_per_api_key
# Display the resulting dictionary
# Calculate total spend across all days
total_spend = daily_spend["daily_spend"].sum()
# Identify top models and top API keys with the highest spend across all days
top_models = {}
top_api_keys = {}
for day, spend_per_model in result_dict.items():
for model, model_spend in spend_per_model.items():
if model not in top_models or model_spend > top_models[model]:
top_models[model] = model_spend
for day, spend_per_api_key in api_key_result_dict.items():
for api_key, api_key_spend in spend_per_api_key.items():
if api_key not in top_api_keys or api_key_spend > top_api_keys[api_key]:
top_api_keys[api_key] = api_key_spend
# for each day in daily spend, look up the day in result_dict and api_key_result_dict
# Assuming daily_spend DataFrame has 'day' column
result = []
for index, row in daily_spend.iterrows():
day = row["day"]
data_day = row.to_dict()
# Look up in result_dict
if day in result_dict:
spend_per_model = result_dict[day]
# Assuming there is a column named 'model' in daily_spend
data_day["spend_per_model"] = spend_per_model # Assign 0 if model not found
# Look up in api_key_result_dict
if day in api_key_result_dict:
spend_per_api_key = api_key_result_dict[day]
# Assuming there is a column named 'api_key' in daily_spend
data_day["spend_per_api_key"] = spend_per_api_key
result.append(data_day)
data_to_return = {}
data_to_return["daily_spend"] = result
data_to_return["total_spend"] = total_spend
data_to_return["top_models"] = top_models
data_to_return["top_api_keys"] = top_api_keys
return data_to_return
# build_daily_metrics()
def _start_clickhouse():
import clickhouse_connect
port = os.getenv("CLICKHOUSE_PORT")
clickhouse_host = os.getenv("CLICKHOUSE_HOST")
if clickhouse_host is not None:
verbose_logger.debug("setting up clickhouse")
if port is not None and isinstance(port, str):
port = int(port)
port = os.getenv("CLICKHOUSE_PORT")
if port is not None and isinstance(port, str):
port = int(port)
host: Optional[str] = os.getenv("CLICKHOUSE_HOST")
if host is None:
raise ValueError("CLICKHOUSE_HOST is not set")
username: Optional[str] = os.getenv("CLICKHOUSE_USERNAME")
if username is None:
raise ValueError("CLICKHOUSE_USERNAME is not set")
password: Optional[str] = os.getenv("CLICKHOUSE_PASSWORD")
if password is None:
raise ValueError("CLICKHOUSE_PASSWORD is not set")
if port is None:
raise ValueError("CLICKHOUSE_PORT is not set")
client = clickhouse_connect.get_client(
host=host,
port=port,
username=username,
password=password,
)
# view all tables in DB
response = client.query("SHOW TABLES")
verbose_logger.debug(
f"checking if litellm spend logs exists, all tables={response.result_rows}"
)
# all tables is returned like this: all tables = [('new_table',), ('spend_logs',)]
# check if spend_logs in all tables
table_names = [all_tables[0] for all_tables in response.result_rows]
if "spend_logs" not in table_names:
verbose_logger.debug(
"Clickhouse: spend logs table does not exist... creating it"
)
response = client.command(
"""
CREATE TABLE default.spend_logs
(
`request_id` String,
`call_type` String,
`api_key` String,
`spend` Float64,
`total_tokens` Int256,
`prompt_tokens` Int256,
`completion_tokens` Int256,
`startTime` DateTime,
`endTime` DateTime,
`model` String,
`user` String,
`metadata` String,
`cache_hit` String,
`cache_key` String,
`request_tags` String
)
ENGINE = MergeTree
ORDER BY tuple();
"""
)
else:
# check if spend logs exist, if it does then return the schema
response = client.query("DESCRIBE default.spend_logs")
verbose_logger.debug(f"spend logs schema ={response.result_rows}")
class ClickhouseLogger:
# Class variables or attributes
def __init__(self, endpoint=None, headers=None):
import clickhouse_connect
_start_clickhouse()
verbose_logger.debug(
f"ClickhouseLogger init, host {os.getenv('CLICKHOUSE_HOST')}, port {os.getenv('CLICKHOUSE_PORT')}, username {os.getenv('CLICKHOUSE_USERNAME')}"
)
port = os.getenv("CLICKHOUSE_PORT")
if port is not None and isinstance(port, str):
port = int(port)
host: Optional[str] = os.getenv("CLICKHOUSE_HOST")
if host is None:
raise ValueError("CLICKHOUSE_HOST is not set")
username: Optional[str] = os.getenv("CLICKHOUSE_USERNAME")
if username is None:
raise ValueError("CLICKHOUSE_USERNAME is not set")
password: Optional[str] = os.getenv("CLICKHOUSE_PASSWORD")
if password is None:
raise ValueError("CLICKHOUSE_PASSWORD is not set")
if port is None:
raise ValueError("CLICKHOUSE_PORT is not set")
client = clickhouse_connect.get_client(
host=host,
port=port,
username=username,
password=password,
)
self.client = client
# This is sync, because we run this in a separate thread. Running in a sepearate thread ensures it will never block an LLM API call
# Experience with s3, Langfuse shows that async logging events are complicated and can block LLM calls
def log_event(
self, kwargs, response_obj, start_time, end_time, user_id, print_verbose
):
try:
verbose_logger.debug(
f"ClickhouseLogger Logging - Enters logging function for model {kwargs}"
)
# follows the same params as langfuse.py
payload: Optional[StandardLoggingPayload] = kwargs.get(
"standard_logging_object"
)
if payload is None:
return
# Build the initial payload
verbose_logger.debug(f"\nClickhouse Logger - Logging payload = {payload}")
# just get the payload items in one array and payload keys in 2nd array
values = []
keys = []
for key, value in payload.items():
keys.append(key)
values.append(value)
data = [values]
response = self.client.insert("default.spend_logs", data, column_names=keys)
# make request to endpoint with payload
verbose_logger.debug(f"Clickhouse Logger - final response = {response}")
except Exception as e:
verbose_logger.debug(f"Clickhouse - {str(e)}\n{traceback.format_exc()}")
pass


@ -0,0 +1,169 @@
"""
Implements logging integration with Datadog's LLM Observability Service
API Reference: https://docs.datadoghq.com/llm_observability/setup/api/?tab=example#api-standards
"""
import asyncio
import os
import traceback
import uuid
from datetime import datetime
from typing import Any, Dict, List, Optional, Union
from httpx import Response
import litellm
from litellm._logging import verbose_logger
from litellm.integrations.custom_batch_logger import CustomBatchLogger
from litellm.llms.custom_httpx.http_handler import (
get_async_httpx_client,
httpxSpecialProvider,
)
from litellm.types.integrations.datadog_llm_obs import *
from litellm.types.utils import StandardLoggingPayload
class DataDogLLMObsLogger(CustomBatchLogger):
def __init__(self, **kwargs):
try:
verbose_logger.debug("DataDogLLMObs: Initializing logger")
if os.getenv("DD_API_KEY", None) is None:
raise Exception("DD_API_KEY is not set, set 'DD_API_KEY=<>'")
if os.getenv("DD_SITE", None) is None:
raise Exception(
"DD_SITE is not set, set 'DD_SITE=<>', example sit = `us5.datadoghq.com`"
)
self.async_client = get_async_httpx_client(
llm_provider=httpxSpecialProvider.LoggingCallback
)
self.DD_API_KEY = os.getenv("DD_API_KEY")
self.DD_SITE = os.getenv("DD_SITE")
self.intake_url = (
f"https://api.{self.DD_SITE}/api/intake/llm-obs/v1/trace/spans"
)
# testing base url
dd_base_url = os.getenv("DD_BASE_URL")
if dd_base_url:
self.intake_url = f"{dd_base_url}/api/intake/llm-obs/v1/trace/spans"
asyncio.create_task(self.periodic_flush())
self.flush_lock = asyncio.Lock()
self.log_queue: List[LLMObsPayload] = []
super().__init__(**kwargs, flush_lock=self.flush_lock)
except Exception as e:
verbose_logger.exception(f"DataDogLLMObs: Error initializing - {str(e)}")
raise e
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
try:
verbose_logger.debug(
f"DataDogLLMObs: Logging success event for model {kwargs.get('model', 'unknown')}"
)
payload = self.create_llm_obs_payload(
kwargs, response_obj, start_time, end_time
)
verbose_logger.debug(f"DataDogLLMObs: Payload: {payload}")
self.log_queue.append(payload)
if len(self.log_queue) >= self.batch_size:
await self.async_send_batch()
except Exception as e:
verbose_logger.exception(
f"DataDogLLMObs: Error logging success event - {str(e)}"
)
async def async_send_batch(self):
try:
if not self.log_queue:
return
verbose_logger.debug(
f"DataDogLLMObs: Flushing {len(self.log_queue)} events"
)
# Prepare the payload
payload = {
"data": DDIntakePayload(
type="span",
attributes=DDSpanAttributes(
ml_app="litellm",
tags=[
"service:litellm",
f"env:{os.getenv('DD_ENV', 'production')}",
],
spans=self.log_queue,
),
),
}
response = await self.async_client.post(
url=self.intake_url,
json=payload,
headers={
"DD-API-KEY": self.DD_API_KEY,
"Content-Type": "application/json",
},
)
response.raise_for_status()
if response.status_code != 202:
raise Exception(
f"DataDogLLMObs: Unexpected response - status_code: {response.status_code}, text: {response.text}"
)
verbose_logger.debug(
f"DataDogLLMObs: Successfully sent batch - status_code: {response.status_code}"
)
self.log_queue.clear()
except Exception as e:
verbose_logger.exception(f"DataDogLLMObs: Error sending batch - {str(e)}")
def create_llm_obs_payload(
self, kwargs: Dict, response_obj: Any, start_time: datetime, end_time: datetime
) -> LLMObsPayload:
standard_logging_payload: Optional[StandardLoggingPayload] = kwargs.get(
"standard_logging_object"
)
if standard_logging_payload is None:
raise Exception("DataDogLLMObs: standard_logging_object is not set")
messages = standard_logging_payload["messages"]
metadata = kwargs.get("litellm_params", {}).get("metadata", {})
input_meta = InputMeta(messages=messages) # type: ignore
output_meta = OutputMeta(messages=self._get_response_messages(response_obj))
meta = Meta(kind="llm", input=input_meta, output=output_meta)
# Calculate metrics (you may need to adjust these based on available data)
metrics = LLMMetrics(
input_tokens=float(standard_logging_payload.get("prompt_tokens", 0)),
output_tokens=float(standard_logging_payload.get("completion_tokens", 0)),
total_tokens=float(standard_logging_payload.get("total_tokens", 0)),
)
return LLMObsPayload(
parent_id=metadata.get("parent_id", "undefined"),
trace_id=metadata.get("trace_id", str(uuid.uuid4())),
span_id=metadata.get("span_id", str(uuid.uuid4())),
name=metadata.get("name", "litellm_llm_call"),
meta=meta,
start_ns=int(start_time.timestamp() * 1e9),
duration=int((end_time - start_time).total_seconds() * 1e9),
metrics=metrics,
)
def _get_response_messages(self, response_obj: Any) -> List[Any]:
"""
Get the messages from the response object
for now this handles logging /chat/completions responses
"""
if isinstance(response_obj, litellm.ModelResponse):
return [response_obj["choices"][0]["message"].json()]
return []
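A minimal usage sketch for this logger, assuming the `datadog_llm_observability` callback name registered later in this diff and the DD_API_KEY / DD_SITE environment variables read in __init__; the exact wiring may differ by litellm version.

# Hedged sketch, not part of this diff: enable DataDog LLM Observability logging.
import os
import litellm

os.environ["DD_API_KEY"] = "your-datadog-api-key"   # placeholder
os.environ["DD_SITE"] = "us5.datadoghq.com"         # placeholder site

litellm.callbacks = ["datadog_llm_observability"]    # assumption: string callback name

response = litellm.completion(
    model="gpt-4o-mini",                             # placeholder model
    messages=[{"role": "user", "content": "ping"}],
)
# Successful calls are queued by async_log_success_event and flushed to the
# llm-obs intake endpoint in batches once batch_size events accumulate.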

View file

@@ -4,7 +4,7 @@ import copy
import inspect
import os
import traceback
from typing import Optional
from typing import TYPE_CHECKING, Any, Dict, Optional
from packaging.version import Version
from pydantic import BaseModel
@@ -13,7 +13,13 @@ import litellm
from litellm._logging import verbose_logger
from litellm.litellm_core_utils.redact_messages import redact_user_api_key_info
from litellm.secret_managers.main import str_to_bool
from litellm.types.utils import StandardLoggingPayload
from litellm.types.integrations.langfuse import *
from litellm.types.utils import StandardCallbackDynamicParams, StandardLoggingPayload
if TYPE_CHECKING:
from litellm.litellm_core_utils.litellm_logging import DynamicLoggingCache
else:
DynamicLoggingCache = Any
class LangFuseLogger:
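For context, a self-contained sketch of the TYPE_CHECKING pattern added above: the real class is imported only for static type checkers, while at runtime the name degrades to Any so no import cycle is created. The module and class names below are illustrative, not from this diff.

# Illustrative sketch of the TYPE_CHECKING / Any fallback pattern.
from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
    # seen only by mypy/pyright, never executed at runtime
    from heavy_module import HeavyClass  # hypothetical import
else:
    # the interpreter resolves the annotation to Any, avoiding a circular import
    HeavyClass = Any


def describe(obj: "HeavyClass") -> str:
    # type checkers get the precise type; runtime behavior is unchanged
    return repr(obj)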

View file

@@ -0,0 +1,168 @@
"""
This file contains the LangFuseHandler class
Used to get the LangFuseLogger for a given request
Handles Key/Team Based Langfuse Logging
"""
from typing import TYPE_CHECKING, Any, Dict, Optional
from litellm.litellm_core_utils.litellm_logging import StandardCallbackDynamicParams
from .langfuse import LangFuseLogger, LangfuseLoggingConfig
if TYPE_CHECKING:
from litellm.litellm_core_utils.litellm_logging import DynamicLoggingCache
else:
DynamicLoggingCache = Any
class LangFuseHandler:
@staticmethod
def get_langfuse_logger_for_request(
standard_callback_dynamic_params: StandardCallbackDynamicParams,
in_memory_dynamic_logger_cache: DynamicLoggingCache,
globalLangfuseLogger: Optional[LangFuseLogger] = None,
) -> LangFuseLogger:
"""
This function is used to get the LangFuseLogger for a given request
1. If dynamic credentials are passed
- check if a LangFuseLogger is cached for the dynamic credentials
- if cached LangFuseLogger is not found, create a new LangFuseLogger and cache it
2. If dynamic credentials are not passed, return the globalLangfuseLogger
"""
temp_langfuse_logger: Optional[LangFuseLogger] = globalLangfuseLogger
if (
LangFuseHandler._dynamic_langfuse_credentials_are_passed(
standard_callback_dynamic_params
)
is False
):
return LangFuseHandler._return_global_langfuse_logger(
globalLangfuseLogger=globalLangfuseLogger,
in_memory_dynamic_logger_cache=in_memory_dynamic_logger_cache,
)
# get langfuse logging config to use for this request, based on standard_callback_dynamic_params
_credentials = LangFuseHandler.get_dynamic_langfuse_logging_config(
globalLangfuseLogger=globalLangfuseLogger,
standard_callback_dynamic_params=standard_callback_dynamic_params,
)
credentials_dict = dict(_credentials)
# check if langfuse logger is already cached
temp_langfuse_logger = in_memory_dynamic_logger_cache.get_cache(
credentials=credentials_dict, service_name="langfuse"
)
# if not cached, create a new langfuse logger and cache it
if temp_langfuse_logger is None:
temp_langfuse_logger = (
LangFuseHandler._create_langfuse_logger_from_credentials(
credentials=credentials_dict,
in_memory_dynamic_logger_cache=in_memory_dynamic_logger_cache,
)
)
return temp_langfuse_logger
@staticmethod
def _return_global_langfuse_logger(
globalLangfuseLogger: Optional[LangFuseLogger],
in_memory_dynamic_logger_cache: DynamicLoggingCache,
) -> LangFuseLogger:
"""
Returns the Global LangfuseLogger set on litellm
(this is the default langfuse logger - used when no dynamic credentials are passed)
If no Global LangfuseLogger is set, it falls back to a LangFuseLogger cached in in_memory_dynamic_logger_cache, creating and caching one if needed.
"""
if globalLangfuseLogger is not None:
return globalLangfuseLogger
credentials_dict: Dict[str, Any] = (
{}
) # the global langfuse logger uses Environment Variables, there are no dynamic credentials
globalLangfuseLogger = in_memory_dynamic_logger_cache.get_cache(
credentials=credentials_dict,
service_name="langfuse",
)
if globalLangfuseLogger is None:
globalLangfuseLogger = (
LangFuseHandler._create_langfuse_logger_from_credentials(
credentials=credentials_dict,
in_memory_dynamic_logger_cache=in_memory_dynamic_logger_cache,
)
)
return globalLangfuseLogger
@staticmethod
def _create_langfuse_logger_from_credentials(
credentials: Dict,
in_memory_dynamic_logger_cache: DynamicLoggingCache,
) -> LangFuseLogger:
"""
This function is used to
1. create a LangFuseLogger from the credentials
2. cache the LangFuseLogger to prevent re-creating it for the same credentials
"""
langfuse_logger = LangFuseLogger(
langfuse_public_key=credentials.get("langfuse_public_key"),
langfuse_secret=credentials.get("langfuse_secret"),
langfuse_host=credentials.get("langfuse_host"),
)
in_memory_dynamic_logger_cache.set_cache(
credentials=credentials,
service_name="langfuse",
logging_obj=langfuse_logger,
)
return langfuse_logger
@staticmethod
def get_dynamic_langfuse_logging_config(
standard_callback_dynamic_params: StandardCallbackDynamicParams,
globalLangfuseLogger: Optional[LangFuseLogger] = None,
) -> LangfuseLoggingConfig:
"""
This function is used to get the Langfuse logging config to use for a given request.
It checks if the dynamic parameters are provided in the standard_callback_dynamic_params and uses them to get the Langfuse logging config.
If no dynamic parameters are provided, it uses the `globalLangfuseLogger` values
"""
# only use dynamic params if langfuse credentials are passed dynamically
return LangfuseLoggingConfig(
langfuse_secret=standard_callback_dynamic_params.get("langfuse_secret")
or standard_callback_dynamic_params.get("langfuse_secret_key"),
langfuse_public_key=standard_callback_dynamic_params.get(
"langfuse_public_key"
),
langfuse_host=standard_callback_dynamic_params.get("langfuse_host"),
)
@staticmethod
def _dynamic_langfuse_credentials_are_passed(
standard_callback_dynamic_params: StandardCallbackDynamicParams,
) -> bool:
"""
This function is used to check if the dynamic langfuse credentials are passed in standard_callback_dynamic_params
Returns:
bool: True if the dynamic langfuse credentials are passed, False otherwise
"""
if (
standard_callback_dynamic_params.get("langfuse_host") is not None
or standard_callback_dynamic_params.get("langfuse_public_key") is not None
or standard_callback_dynamic_params.get("langfuse_secret") is not None
or standard_callback_dynamic_params.get("langfuse_secret_key") is not None
):
return True
return False
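A hedged sketch of exercising this handler directly, using only the signatures above; the import paths mirror the TYPE_CHECKING import in this file and the handler import used elsewhere in this diff, and the credential values are placeholders.

# Sketch (assumed import paths; placeholder credentials).
from litellm.litellm_core_utils.litellm_logging import DynamicLoggingCache
from litellm.integrations.langfuse.langfuse_handler import LangFuseHandler

cache = DynamicLoggingCache()

# Request-scoped credentials; if none are set, the global logger is returned instead.
dynamic_params = {
    "langfuse_public_key": "pk-lf-...",
    "langfuse_secret": "sk-lf-...",
    "langfuse_host": "https://cloud.langfuse.com",
}

langfuse_logger = LangFuseHandler.get_langfuse_logger_for_request(
    standard_callback_dynamic_params=dynamic_params,  # type: ignore[arg-type]
    in_memory_dynamic_logger_cache=cache,
    globalLangfuseLogger=None,
)
# Calling again with the same credentials returns the cached LangFuseLogger
# rather than constructing a new Langfuse client.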

View file

@@ -1,77 +0,0 @@
import json
import os
import traceback
import types
import requests
class LiteDebugger:
user_email = None
dashboard_url = None
def __init__(self, email=None):
self.api_url = "https://api.litellm.ai/debugger"
self.validate_environment(email)
pass
def validate_environment(self, email):
try:
self.user_email = (
email or os.getenv("LITELLM_TOKEN") or os.getenv("LITELLM_EMAIL")
)
if (
self.user_email is None
): # if users are trying to use_client=True but token not set
raise ValueError(
"litellm.use_client = True but no token or email passed. Please set it in litellm.token"
)
self.dashboard_url = "https://admin.litellm.ai/" + self.user_email
if self.user_email is None:
raise ValueError(
"[Non-Blocking Error] LiteLLMDebugger: Missing LITELLM_TOKEN. Set it in your environment. Eg.: os.environ['LITELLM_TOKEN']= <your_email>"
)
except Exception:
raise ValueError(
"[Non-Blocking Error] LiteLLMDebugger: Missing LITELLM_TOKEN. Set it in your environment. Eg.: os.environ['LITELLM_TOKEN']= <your_email>"
)
def input_log_event(
self,
model,
messages,
end_user,
litellm_call_id,
call_type,
print_verbose,
litellm_params,
optional_params,
):
"""
This integration is not implemented yet.
"""
return
def post_call_log_event(
self, original_response, litellm_call_id, print_verbose, call_type, stream
):
"""
This integration is not implemented yet.
"""
return
def log_event(
self,
end_user,
response_obj,
start_time,
end_time,
litellm_call_id,
print_verbose,
call_type,
stream=False,
):
"""
This integration is not implemented yet.
"""
return

View file

@@ -171,7 +171,7 @@ class OpenTelemetry(CustomLogger):
try:
value = str(value)
except Exception:
value = "litllm logging error - could_not_json_serialize"
value = "litellm logging error - could_not_json_serialize"
self.safe_set_attribute(
span=service_logging_span,
key=key,
@@ -396,9 +396,9 @@ class OpenTelemetry(CustomLogger):
def set_attributes(self, span: Span, kwargs, response_obj): # noqa: PLR0915
try:
if self.callback_name == "arize":
from litellm.integrations.arize_ai import set_arize_ai_attributes
from litellm.integrations.arize_ai import ArizeLogger
set_arize_ai_attributes(span, kwargs, response_obj)
ArizeLogger.set_arize_ai_attributes(span, kwargs, response_obj)
return
elif self.callback_name == "langtrace":
from litellm.integrations.langtrace import LangtraceAttributes

View file

@@ -6,7 +6,7 @@ import subprocess
import sys
import traceback
import uuid
from datetime import datetime, timedelta
from datetime import date, datetime, timedelta
from typing import Optional, TypedDict, Union
import dotenv
@@ -334,13 +334,8 @@ class PrometheusLogger(CustomLogger):
print_verbose(f"Got exception on init prometheus client {str(e)}")
raise e
async def async_log_success_event( # noqa: PLR0915
self, kwargs, response_obj, start_time, end_time
):
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
# Define prometheus client
from litellm.proxy.common_utils.callback_utils import (
get_model_group_from_litellm_kwargs,
)
from litellm.types.utils import StandardLoggingPayload
verbose_logger.debug(
@@ -351,14 +346,19 @@ class PrometheusLogger(CustomLogger):
standard_logging_payload: Optional[StandardLoggingPayload] = kwargs.get(
"standard_logging_object"
)
if standard_logging_payload is None:
raise ValueError("standard_logging_object is required")
if standard_logging_payload is None or not isinstance(
standard_logging_payload, dict
):
raise ValueError(
f"standard_logging_object is required, got={standard_logging_payload}"
)
model = kwargs.get("model", "")
litellm_params = kwargs.get("litellm_params", {}) or {}
_metadata = litellm_params.get("metadata", {})
proxy_server_request = litellm_params.get("proxy_server_request") or {}
end_user_id = proxy_server_request.get("body", {}).get("user", None)
model_parameters: dict = standard_logging_payload["model_parameters"]
user_id = standard_logging_payload["metadata"]["user_api_key_user_id"]
user_api_key = standard_logging_payload["metadata"]["user_api_key_hash"]
user_api_key_alias = standard_logging_payload["metadata"]["user_api_key_alias"]
@@ -369,25 +369,6 @@ class PrometheusLogger(CustomLogger):
output_tokens = standard_logging_payload["completion_tokens"]
tokens_used = standard_logging_payload["total_tokens"]
response_cost = standard_logging_payload["response_cost"]
_team_spend = litellm_params.get("metadata", {}).get(
"user_api_key_team_spend", None
)
_team_max_budget = litellm_params.get("metadata", {}).get(
"user_api_key_team_max_budget", None
)
_remaining_team_budget = safe_get_remaining_budget(
max_budget=_team_max_budget, spend=_team_spend
)
_api_key_spend = litellm_params.get("metadata", {}).get(
"user_api_key_spend", None
)
_api_key_max_budget = litellm_params.get("metadata", {}).get(
"user_api_key_max_budget", None
)
_remaining_api_key_budget = safe_get_remaining_budget(
max_budget=_api_key_max_budget, spend=_api_key_spend
)
print_verbose(
f"inside track_prometheus_metrics, model {model}, response_cost {response_cost}, tokens_used {tokens_used}, end_user_id {end_user_id}, user_api_key {user_api_key}"
@@ -402,24 +383,82 @@ class PrometheusLogger(CustomLogger):
user_api_key = hash_token(user_api_key)
self.litellm_requests_metric.labels(
end_user_id,
user_api_key,
user_api_key_alias,
model,
user_api_team,
user_api_team_alias,
user_id,
).inc()
self.litellm_spend_metric.labels(
end_user_id,
user_api_key,
user_api_key_alias,
model,
user_api_team,
user_api_team_alias,
user_id,
).inc(response_cost)
# increment total LLM requests and spend metric
self._increment_top_level_request_and_spend_metrics(
end_user_id=end_user_id,
user_api_key=user_api_key,
user_api_key_alias=user_api_key_alias,
model=model,
user_api_team=user_api_team,
user_api_team_alias=user_api_team_alias,
user_id=user_id,
response_cost=response_cost,
)
# input, output, total token metrics
self._increment_token_metrics(
# why type ignore below?
# 1. We just checked if isinstance(standard_logging_payload, dict). Pyright complains.
# 2. Pyright does not allow us to run isinstance(standard_logging_payload, StandardLoggingPayload) <- this would be ideal
standard_logging_payload=standard_logging_payload, # type: ignore
end_user_id=end_user_id,
user_api_key=user_api_key,
user_api_key_alias=user_api_key_alias,
model=model,
user_api_team=user_api_team,
user_api_team_alias=user_api_team_alias,
user_id=user_id,
)
# remaining budget metrics
self._increment_remaining_budget_metrics(
user_api_team=user_api_team,
user_api_team_alias=user_api_team_alias,
user_api_key=user_api_key,
user_api_key_alias=user_api_key_alias,
litellm_params=litellm_params,
)
# set proxy virtual key rpm/tpm metrics
self._set_virtual_key_rate_limit_metrics(
user_api_key=user_api_key,
user_api_key_alias=user_api_key_alias,
kwargs=kwargs,
metadata=_metadata,
)
# set latency metrics
self._set_latency_metrics(
kwargs=kwargs,
model=model,
user_api_key=user_api_key,
user_api_key_alias=user_api_key_alias,
user_api_team=user_api_team,
user_api_team_alias=user_api_team_alias,
# why type ignore below?
# 1. We just checked if isinstance(standard_logging_payload, dict). Pyright complains.
# 2. Pyright does not allow us to run isinstance(standard_logging_payload, StandardLoggingPayload) <- this would be ideal
standard_logging_payload=standard_logging_payload, # type: ignore
)
# set x-ratelimit headers
self.set_llm_deployment_success_metrics(
kwargs, start_time, end_time, output_tokens
)
pass
def _increment_token_metrics(
self,
standard_logging_payload: StandardLoggingPayload,
end_user_id: Optional[str],
user_api_key: Optional[str],
user_api_key_alias: Optional[str],
model: Optional[str],
user_api_team: Optional[str],
user_api_team_alias: Optional[str],
user_id: Optional[str],
):
# token metrics
self.litellm_tokens_metric.labels(
end_user_id,
user_api_key,
@@ -450,6 +489,34 @@ class PrometheusLogger(CustomLogger):
user_id,
).inc(standard_logging_payload["completion_tokens"])
def _increment_remaining_budget_metrics(
self,
user_api_team: Optional[str],
user_api_team_alias: Optional[str],
user_api_key: Optional[str],
user_api_key_alias: Optional[str],
litellm_params: dict,
):
_team_spend = litellm_params.get("metadata", {}).get(
"user_api_key_team_spend", None
)
_team_max_budget = litellm_params.get("metadata", {}).get(
"user_api_key_team_max_budget", None
)
_remaining_team_budget = self._safe_get_remaining_budget(
max_budget=_team_max_budget, spend=_team_spend
)
_api_key_spend = litellm_params.get("metadata", {}).get(
"user_api_key_spend", None
)
_api_key_max_budget = litellm_params.get("metadata", {}).get(
"user_api_key_max_budget", None
)
_remaining_api_key_budget = self._safe_get_remaining_budget(
max_budget=_api_key_max_budget, spend=_api_key_spend
)
# Remaining Budget Metrics
self.litellm_remaining_team_budget_metric.labels(
user_api_team, user_api_team_alias
).set(_remaining_team_budget)
@@ -458,6 +525,47 @@ class PrometheusLogger(CustomLogger):
user_api_key, user_api_key_alias
).set(_remaining_api_key_budget)
def _increment_top_level_request_and_spend_metrics(
self,
end_user_id: Optional[str],
user_api_key: Optional[str],
user_api_key_alias: Optional[str],
model: Optional[str],
user_api_team: Optional[str],
user_api_team_alias: Optional[str],
user_id: Optional[str],
response_cost: float,
):
self.litellm_requests_metric.labels(
end_user_id,
user_api_key,
user_api_key_alias,
model,
user_api_team,
user_api_team_alias,
user_id,
).inc()
self.litellm_spend_metric.labels(
end_user_id,
user_api_key,
user_api_key_alias,
model,
user_api_team,
user_api_team_alias,
user_id,
).inc(response_cost)
def _set_virtual_key_rate_limit_metrics(
self,
user_api_key: Optional[str],
user_api_key_alias: Optional[str],
kwargs: dict,
metadata: dict,
):
from litellm.proxy.common_utils.callback_utils import (
get_model_group_from_litellm_kwargs,
)
# Set remaining rpm/tpm for API Key + model
# see parallel_request_limiter.py - variables are set there
model_group = get_model_group_from_litellm_kwargs(kwargs)
@@ -466,10 +574,8 @@ class PrometheusLogger(CustomLogger):
)
remaining_tokens_variable_name = f"litellm-key-remaining-tokens-{model_group}"
remaining_requests = _metadata.get(
remaining_requests_variable_name, sys.maxsize
)
remaining_tokens = _metadata.get(remaining_tokens_variable_name, sys.maxsize)
remaining_requests = metadata.get(remaining_requests_variable_name, sys.maxsize)
remaining_tokens = metadata.get(remaining_tokens_variable_name, sys.maxsize)
self.litellm_remaining_api_key_requests_for_model.labels(
user_api_key, user_api_key_alias, model_group
@@ -479,9 +585,20 @@ class PrometheusLogger(CustomLogger):
user_api_key, user_api_key_alias, model_group
).set(remaining_tokens)
def _set_latency_metrics(
self,
kwargs: dict,
model: Optional[str],
user_api_key: Optional[str],
user_api_key_alias: Optional[str],
user_api_team: Optional[str],
user_api_team_alias: Optional[str],
standard_logging_payload: StandardLoggingPayload,
):
# latency metrics
total_time: timedelta = kwargs.get("end_time") - kwargs.get("start_time")
total_time_seconds = total_time.total_seconds()
model_parameters: dict = standard_logging_payload["model_parameters"]
end_time: datetime = kwargs.get("end_time") or datetime.now()
start_time: Optional[datetime] = kwargs.get("start_time")
api_call_start_time = kwargs.get("api_call_start_time", None)
completion_start_time = kwargs.get("completion_start_time", None)
@@ -509,9 +626,7 @@ class PrometheusLogger(CustomLogger):
if api_call_start_time is not None and isinstance(
api_call_start_time, datetime
):
api_call_total_time: timedelta = (
kwargs.get("end_time") - api_call_start_time
)
api_call_total_time: timedelta = end_time - api_call_start_time
api_call_total_time_seconds = api_call_total_time.total_seconds()
self.litellm_llm_api_latency_metric.labels(
model,
@@ -521,20 +636,17 @@ class PrometheusLogger(CustomLogger):
user_api_team_alias,
).observe(api_call_total_time_seconds)
# log metrics
self.litellm_request_total_latency_metric.labels(
model,
user_api_key,
user_api_key_alias,
user_api_team,
user_api_team_alias,
).observe(total_time_seconds)
# set x-ratelimit headers
self.set_llm_deployment_success_metrics(
kwargs, start_time, end_time, output_tokens
)
pass
# total request latency
if start_time is not None and isinstance(start_time, datetime):
total_time: timedelta = end_time - start_time
total_time_seconds = total_time.total_seconds()
self.litellm_request_total_latency_metric.labels(
model,
user_api_key,
user_api_key_alias,
user_api_team,
user_api_team_alias,
).observe(total_time_seconds)
async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
from litellm.types.utils import StandardLoggingPayload
@@ -651,24 +763,31 @@ class PrometheusLogger(CustomLogger):
pass
def set_llm_deployment_failure_metrics(self, request_kwargs: dict):
"""
Sets Failure metrics when an LLM API call fails
- mark the deployment as partial outage
- increment deployment failure responses metric
- increment deployment total requests metric
Args:
request_kwargs: dict
"""
try:
verbose_logger.debug("setting remaining tokens requests metric")
standard_logging_payload: StandardLoggingPayload = request_kwargs.get(
"standard_logging_object", {}
)
_response_headers = request_kwargs.get("response_headers")
_litellm_params = request_kwargs.get("litellm_params", {}) or {}
_metadata = _litellm_params.get("metadata", {})
litellm_model_name = request_kwargs.get("model", None)
api_base = _metadata.get("api_base", None)
model_group = _metadata.get("model_group", None)
if api_base is None:
api_base = _litellm_params.get("api_base", None)
llm_provider = _litellm_params.get("custom_llm_provider", None)
_model_info = _metadata.get("model_info") or {}
model_id = _model_info.get("id", None)
model_group = standard_logging_payload.get("model_group", None)
api_base = standard_logging_payload.get("api_base", None)
model_id = standard_logging_payload.get("model_id", None)
exception: Exception = request_kwargs.get("exception", None)
llm_provider = _litellm_params.get("custom_llm_provider", None)
"""
log these labels
["litellm_model_name", "model_id", "api_base", "api_provider"]
@@ -891,7 +1010,7 @@ class PrometheusLogger(CustomLogger):
"""
from litellm.litellm_core_utils.litellm_logging import (
StandardLoggingMetadata,
get_standard_logging_metadata,
StandardLoggingPayloadSetup,
)
verbose_logger.debug(
@@ -900,8 +1019,10 @@ class PrometheusLogger(CustomLogger):
kwargs,
)
_metadata = kwargs.get("metadata", {})
standard_metadata: StandardLoggingMetadata = get_standard_logging_metadata(
metadata=_metadata
standard_metadata: StandardLoggingMetadata = (
StandardLoggingPayloadSetup.get_standard_logging_metadata(
metadata=_metadata
)
)
_new_model = kwargs.get("model")
self.litellm_deployment_successful_fallbacks.labels(
@@ -923,7 +1044,7 @@ class PrometheusLogger(CustomLogger):
"""
from litellm.litellm_core_utils.litellm_logging import (
StandardLoggingMetadata,
get_standard_logging_metadata,
StandardLoggingPayloadSetup,
)
verbose_logger.debug(
@@ -933,8 +1054,10 @@ class PrometheusLogger(CustomLogger):
)
_new_model = kwargs.get("model")
_metadata = kwargs.get("metadata", {})
standard_metadata: StandardLoggingMetadata = get_standard_logging_metadata(
metadata=_metadata
standard_metadata: StandardLoggingMetadata = (
StandardLoggingPayloadSetup.get_standard_logging_metadata(
metadata=_metadata
)
)
self.litellm_deployment_failed_fallbacks.labels(
requested_model=original_model_group,
@@ -951,8 +1074,8 @@ class PrometheusLogger(CustomLogger):
self,
state: int,
litellm_model_name: str,
model_id: str,
api_base: str,
model_id: Optional[str],
api_base: Optional[str],
api_provider: str,
):
self.litellm_deployment_state.labels(
@@ -973,8 +1096,8 @@ class PrometheusLogger(CustomLogger):
def set_deployment_partial_outage(
self,
litellm_model_name: str,
model_id: str,
api_base: str,
model_id: Optional[str],
api_base: Optional[str],
api_provider: str,
):
self.set_litellm_deployment_state(
@@ -984,8 +1107,8 @@ class PrometheusLogger(CustomLogger):
def set_deployment_complete_outage(
self,
litellm_model_name: str,
model_id: str,
api_base: str,
model_id: Optional[str],
api_base: Optional[str],
api_provider: str,
):
self.set_litellm_deployment_state(
@@ -1007,14 +1130,13 @@ class PrometheusLogger(CustomLogger):
litellm_model_name, model_id, api_base, api_provider, exception_status
).inc()
def _safe_get_remaining_budget(
self, max_budget: Optional[float], spend: Optional[float]
) -> float:
if max_budget is None:
return float("inf")
def safe_get_remaining_budget(
max_budget: Optional[float], spend: Optional[float]
) -> float:
if max_budget is None:
return float("inf")
if spend is None:
return max_budget
if spend is None:
return max_budget
return max_budget - spend
return max_budget - spend
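A small standalone sketch of the budget arithmetic moved into _safe_get_remaining_budget above; the semantics are unchanged: no configured max budget means infinite headroom, and no recorded spend means the full budget remains.

# Standalone reproduction of the remaining-budget logic (no Prometheus required).
from typing import Optional


def remaining_budget(max_budget: Optional[float], spend: Optional[float]) -> float:
    if max_budget is None:
        return float("inf")  # no cap configured
    if spend is None:
        return max_budget    # nothing spent yet
    return max_budget - spend


assert remaining_budget(None, 12.5) == float("inf")
assert remaining_budget(100.0, None) == 100.0
assert remaining_budget(100.0, 12.5) == 87.5
# PrometheusLogger feeds these values into the litellm_remaining_team_budget_metric
# and litellm_remaining_api_key_budget_metric gauges.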

View file

@@ -333,6 +333,14 @@ def _get_openai_compatible_provider_info( # noqa: PLR0915
api_key: Optional[str],
dynamic_api_key: Optional[str],
) -> Tuple[str, str, Optional[str], Optional[str]]:
"""
Returns:
Tuple[str, str, Optional[str], Optional[str]]:
model: str
custom_llm_provider: str
dynamic_api_key: Optional[str]
api_base: Optional[str]
"""
custom_llm_provider = model.split("/", 1)[0]
model = model.split("/", 1)[1]
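The return contract documented above reduces to a single prefix split; a tiny sketch with a placeholder model string:

# Sketch of the provider-prefix split described in the new docstring.
model = "openrouter/anthropic/claude-3-haiku"  # placeholder
custom_llm_provider = model.split("/", 1)[0]   # "openrouter"
model = model.split("/", 1)[1]                 # "anthropic/claude-3-haiku"
print(custom_llm_provider, model)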

View file

@@ -12,7 +12,7 @@ import time
import traceback
import uuid
from datetime import datetime as dt_object
from typing import Any, Callable, Dict, List, Literal, Optional, Union
from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union
from pydantic import BaseModel
@@ -51,6 +51,7 @@ from litellm.types.utils import (
StandardPassThroughResponseObject,
TextCompletionResponse,
TranscriptionResponse,
Usage,
)
from litellm.utils import (
_get_base_model_from_metadata,
@@ -58,22 +59,21 @@ from litellm.utils import (
prompt_token_calculator,
)
from ..integrations.aispend import AISpendLogger
from ..integrations.argilla import ArgillaLogger
from ..integrations.arize_ai import ArizeLogger
from ..integrations.athina import AthinaLogger
from ..integrations.berrispend import BerriSpendLogger
from ..integrations.braintrust_logging import BraintrustLogger
from ..integrations.clickhouse import ClickhouseLogger
from ..integrations.datadog.datadog import DataDogLogger
from ..integrations.datadog.datadog_llm_obs import DataDogLLMObsLogger
from ..integrations.dynamodb import DyanmoDBLogger
from ..integrations.galileo import GalileoObserve
from ..integrations.gcs_bucket.gcs_bucket import GCSBucketLogger
from ..integrations.greenscale import GreenscaleLogger
from ..integrations.helicone import HeliconeLogger
from ..integrations.lago import LagoLogger
from ..integrations.langfuse import LangFuseLogger
from ..integrations.langfuse.langfuse import LangFuseLogger
from ..integrations.langfuse.langfuse_handler import LangFuseHandler
from ..integrations.langsmith import LangsmithLogger
from ..integrations.litedebugger import LiteDebugger
from ..integrations.literal_ai import LiteralAILogger
from ..integrations.logfire_logger import LogfireLevel, LogfireLogger
from ..integrations.lunary import LunaryLogger
@@ -122,13 +122,9 @@ prometheusLogger = None
dynamoLogger = None
s3Logger = None
genericAPILogger = None
clickHouseLogger = None
greenscaleLogger = None
lunaryLogger = None
aispendLogger = None
berrispendLogger = None
supabaseClient = None
liteDebuggerClient = None
callback_list: Optional[List[str]] = []
user_logger_fn = None
additional_details: Optional[Dict[str, str]] = {}
@@ -191,7 +187,7 @@ in_memory_dynamic_logger_cache = DynamicLoggingCache()
class Logging:
global supabaseClient, liteDebuggerClient, promptLayerLogger, weightsBiasesLogger, logfireLogger, capture_exception, add_breadcrumb, lunaryLogger, logfireLogger, prometheusLogger, slack_app
global supabaseClient, promptLayerLogger, weightsBiasesLogger, logfireLogger, capture_exception, add_breadcrumb, lunaryLogger, logfireLogger, prometheusLogger, slack_app
custom_pricing: bool = False
stream_options = None
@@ -970,22 +966,6 @@ class Logging:
):
print_verbose("no-log request, skipping logging")
continue
if callback == "lite_debugger" and liteDebuggerClient is not None:
print_verbose("reaches lite_debugger for logging!")
print_verbose(f"liteDebuggerClient: {liteDebuggerClient}")
print_verbose(
f"liteDebuggerClient details function {self.call_type} and stream set to {self.stream}"
)
liteDebuggerClient.log_event(
end_user=kwargs.get("user", "default"),
response_obj=result,
start_time=start_time,
end_time=end_time,
litellm_call_id=self.litellm_call_id,
print_verbose=print_verbose,
call_type=self.call_type,
stream=self.stream,
)
if callback == "promptlayer" and promptLayerLogger is not None:
print_verbose("reaches promptlayer for logging!")
promptLayerLogger.log_event(
@@ -1136,74 +1116,13 @@ class Logging:
print_verbose("reaches langfuse for streaming logging!")
result = kwargs["complete_streaming_response"]
temp_langfuse_logger = langFuseLogger
if langFuseLogger is None or (
(
self.standard_callback_dynamic_params.get(
"langfuse_public_key"
)
is not None
and self.standard_callback_dynamic_params.get(
"langfuse_public_key"
)
!= langFuseLogger.public_key
)
or (
self.standard_callback_dynamic_params.get(
"langfuse_secret"
)
is not None
and self.standard_callback_dynamic_params.get(
"langfuse_secret"
)
!= langFuseLogger.secret_key
)
or (
self.standard_callback_dynamic_params.get(
"langfuse_host"
)
is not None
and self.standard_callback_dynamic_params.get(
"langfuse_host"
)
!= langFuseLogger.langfuse_host
)
):
credentials = {
"langfuse_public_key": self.standard_callback_dynamic_params.get(
"langfuse_public_key"
),
"langfuse_secret": self.standard_callback_dynamic_params.get(
"langfuse_secret"
),
"langfuse_host": self.standard_callback_dynamic_params.get(
"langfuse_host"
),
}
temp_langfuse_logger = (
in_memory_dynamic_logger_cache.get_cache(
credentials=credentials, service_name="langfuse"
)
)
if temp_langfuse_logger is None:
temp_langfuse_logger = LangFuseLogger(
langfuse_public_key=self.standard_callback_dynamic_params.get(
"langfuse_public_key"
),
langfuse_secret=self.standard_callback_dynamic_params.get(
"langfuse_secret"
),
langfuse_host=self.standard_callback_dynamic_params.get(
"langfuse_host"
),
)
in_memory_dynamic_logger_cache.set_cache(
credentials=credentials,
service_name="langfuse",
logging_obj=temp_langfuse_logger,
)
if temp_langfuse_logger is not None:
_response = temp_langfuse_logger.log_event(
langfuse_logger_to_use = LangFuseHandler.get_langfuse_logger_for_request(
globalLangfuseLogger=langFuseLogger,
standard_callback_dynamic_params=self.standard_callback_dynamic_params,
in_memory_dynamic_logger_cache=in_memory_dynamic_logger_cache,
)
if langfuse_logger_to_use is not None:
_response = langfuse_logger_to_use.log_event(
kwargs=kwargs,
response_obj=result,
start_time=start_time,
@@ -1248,37 +1167,6 @@ class Logging:
user_id=kwargs.get("user", None),
print_verbose=print_verbose,
)
if callback == "clickhouse":
global clickHouseLogger
verbose_logger.debug("reaches clickhouse for success logging!")
kwargs = {}
for k, v in self.model_call_details.items():
if (
k != "original_response"
): # copy.deepcopy raises errors as this could be a coroutine
kwargs[k] = v
# this only logs streaming once, complete_streaming_response exists i.e when stream ends
if self.stream:
verbose_logger.debug(
f"is complete_streaming_response in kwargs: {kwargs.get('complete_streaming_response', None)}"
)
if complete_streaming_response is None:
continue
else:
print_verbose(
"reaches clickhouse for streaming logging!"
)
result = kwargs["complete_streaming_response"]
if clickHouseLogger is None:
clickHouseLogger = ClickhouseLogger()
clickHouseLogger.log_event(
kwargs=kwargs,
response_obj=result,
start_time=start_time,
end_time=end_time,
user_id=kwargs.get("user", None),
print_verbose=print_verbose,
)
if callback == "greenscale" and greenscaleLogger is not None:
kwargs = {}
for k, v in self.model_call_details.items():
@@ -1874,9 +1762,7 @@ class Logging:
)
for callback in callbacks:
try:
if callback == "lite_debugger" and liteDebuggerClient is not None:
pass
elif callback == "lunary" and lunaryLogger is not None:
if callback == "lunary" and lunaryLogger is not None:
print_verbose("reaches lunary for logging error!")
model = self.model
@@ -1962,50 +1848,12 @@ class Logging:
): # copy.deepcopy raises errors as this could be a coroutine
kwargs[k] = v
# this only logs streaming once, complete_streaming_response exists i.e when stream ends
if langFuseLogger is None or (
(
self.standard_callback_dynamic_params.get(
"langfuse_public_key"
)
is not None
and self.standard_callback_dynamic_params.get(
"langfuse_public_key"
)
!= langFuseLogger.public_key
)
or (
self.standard_callback_dynamic_params.get(
"langfuse_public_key"
)
is not None
and self.standard_callback_dynamic_params.get(
"langfuse_public_key"
)
!= langFuseLogger.public_key
)
or (
self.standard_callback_dynamic_params.get(
"langfuse_host"
)
is not None
and self.standard_callback_dynamic_params.get(
"langfuse_host"
)
!= langFuseLogger.langfuse_host
)
):
langFuseLogger = LangFuseLogger(
langfuse_public_key=self.standard_callback_dynamic_params.get(
"langfuse_public_key"
),
langfuse_secret=self.standard_callback_dynamic_params.get(
"langfuse_secret"
),
langfuse_host=self.standard_callback_dynamic_params.get(
"langfuse_host"
),
)
_response = langFuseLogger.log_event(
langfuse_logger_to_use = LangFuseHandler.get_langfuse_logger_for_request(
globalLangfuseLogger=langFuseLogger,
standard_callback_dynamic_params=self.standard_callback_dynamic_params,
in_memory_dynamic_logger_cache=in_memory_dynamic_logger_cache,
)
_response = langfuse_logger_to_use.log_event(
start_time=start_time,
end_time=end_time,
response_obj=None,
@@ -2195,7 +2043,7 @@ def set_callbacks(callback_list, function_id=None): # noqa: PLR0915
"""
Globally sets the callback client
"""
global sentry_sdk_instance, capture_exception, add_breadcrumb, posthog, slack_app, alerts_channel, traceloopLogger, athinaLogger, heliconeLogger, aispendLogger, berrispendLogger, supabaseClient, liteDebuggerClient, lunaryLogger, promptLayerLogger, langFuseLogger, customLogger, weightsBiasesLogger, logfireLogger, dynamoLogger, s3Logger, dataDogLogger, prometheusLogger, greenscaleLogger, openMeterLogger
global sentry_sdk_instance, capture_exception, add_breadcrumb, posthog, slack_app, alerts_channel, traceloopLogger, athinaLogger, heliconeLogger, supabaseClient, lunaryLogger, promptLayerLogger, langFuseLogger, customLogger, weightsBiasesLogger, logfireLogger, dynamoLogger, s3Logger, dataDogLogger, prometheusLogger, greenscaleLogger, openMeterLogger
try:
for callback in callback_list:
@@ -2275,26 +2123,12 @@ def set_callbacks(callback_list, function_id=None): # noqa: PLR0915
weightsBiasesLogger = WeightsBiasesLogger()
elif callback == "logfire":
logfireLogger = LogfireLogger()
elif callback == "aispend":
aispendLogger = AISpendLogger()
elif callback == "berrispend":
berrispendLogger = BerriSpendLogger()
elif callback == "supabase":
print_verbose("instantiating supabase")
supabaseClient = Supabase()
elif callback == "greenscale":
greenscaleLogger = GreenscaleLogger()
print_verbose("Initialized Greenscale Logger")
elif callback == "lite_debugger":
print_verbose("instantiating lite_debugger")
if function_id:
liteDebuggerClient = LiteDebugger(email=function_id)
elif litellm.token:
liteDebuggerClient = LiteDebugger(email=litellm.token)
elif litellm.email:
liteDebuggerClient = LiteDebugger(email=litellm.email)
else:
liteDebuggerClient = LiteDebugger(email=str(uuid.uuid4()))
elif callable(callback):
customLogger = CustomLogger()
except Exception as e:
@@ -2372,6 +2206,10 @@ def _init_custom_logger_compatible_class( # noqa: PLR0915
_datadog_logger = DataDogLogger()
_in_memory_loggers.append(_datadog_logger)
return _datadog_logger # type: ignore
elif logging_integration == "datadog_llm_observability":
_datadog_llm_obs_logger = DataDogLLMObsLogger()
_in_memory_loggers.append(_datadog_llm_obs_logger)
return _datadog_llm_obs_logger # type: ignore
elif logging_integration == "gcs_bucket":
for callback in _in_memory_loggers:
if isinstance(callback, GCSBucketLogger):
@@ -2389,22 +2227,16 @@ def _init_custom_logger_compatible_class( # noqa: PLR0915
_in_memory_loggers.append(_opik_logger)
return _opik_logger # type: ignore
elif logging_integration == "arize":
if "ARIZE_SPACE_KEY" not in os.environ:
raise ValueError("ARIZE_SPACE_KEY not found in environment variables")
if "ARIZE_API_KEY" not in os.environ:
raise ValueError("ARIZE_API_KEY not found in environment variables")
from litellm.integrations.opentelemetry import (
OpenTelemetry,
OpenTelemetryConfig,
)
arize_endpoint = (
os.environ.get("ARIZE_ENDPOINT", None) or "https://otlp.arize.com/v1"
)
otel_config = OpenTelemetryConfig(
exporter="otlp_grpc",
endpoint=arize_endpoint,
)
otel_config = ArizeLogger.get_arize_opentelemetry_config()
if otel_config is None:
raise ValueError(
"No valid endpoint found for Arize, please set 'ARIZE_ENDPOINT' to your GRPC endpoint or 'ARIZE_HTTP_ENDPOINT' to your HTTP endpoint"
)
os.environ["OTEL_EXPORTER_OTLP_TRACES_HEADERS"] = (
f"space_key={os.getenv('ARIZE_SPACE_KEY')},api_key={os.getenv('ARIZE_API_KEY')}"
)
@@ -2417,7 +2249,6 @@ def _init_custom_logger_compatible_class( # noqa: PLR0915
_otel_logger = OpenTelemetry(config=otel_config, callback_name="arize")
_in_memory_loggers.append(_otel_logger)
return _otel_logger # type: ignore
elif logging_integration == "otel":
from litellm.integrations.opentelemetry import OpenTelemetry
@@ -2546,6 +2377,10 @@ def get_custom_logger_compatible_class(
for callback in _in_memory_loggers:
if isinstance(callback, DataDogLogger):
return callback
elif logging_integration == "datadog_llm_observability":
for callback in _in_memory_loggers:
if isinstance(callback, DataDogLLMObsLogger):
return callback
elif logging_integration == "gcs_bucket":
for callback in _in_memory_loggers:
if isinstance(callback, GCSBucketLogger):
@@ -2629,7 +2464,184 @@ def is_valid_sha256_hash(value: str) -> bool:
return bool(re.fullmatch(r"[a-fA-F0-9]{64}", value))
def get_standard_logging_object_payload( # noqa: PLR0915
class StandardLoggingPayloadSetup:
@staticmethod
def cleanup_timestamps(
start_time: Union[dt_object, float],
end_time: Union[dt_object, float],
completion_start_time: Union[dt_object, float],
) -> Tuple[float, float, float]:
"""
Convert datetime objects to floats
"""
if isinstance(start_time, datetime.datetime):
start_time_float = start_time.timestamp()
elif isinstance(start_time, float):
start_time_float = start_time
else:
raise ValueError(
f"start_time is required, got={start_time} of type {type(start_time)}"
)
if isinstance(end_time, datetime.datetime):
end_time_float = end_time.timestamp()
elif isinstance(end_time, float):
end_time_float = end_time
else:
raise ValueError(
f"end_time is required, got={end_time} of type {type(end_time)}"
)
if isinstance(completion_start_time, datetime.datetime):
completion_start_time_float = completion_start_time.timestamp()
elif isinstance(completion_start_time, float):
completion_start_time_float = completion_start_time
else:
completion_start_time_float = end_time_float
return start_time_float, end_time_float, completion_start_time_float
@staticmethod
def get_standard_logging_metadata(
metadata: Optional[Dict[str, Any]]
) -> StandardLoggingMetadata:
"""
Clean and filter the metadata dictionary to include only the specified keys in StandardLoggingMetadata.
Args:
metadata (Optional[Dict[str, Any]]): The original metadata dictionary.
Returns:
StandardLoggingMetadata: A StandardLoggingMetadata object containing the cleaned metadata.
Note:
- If the input metadata is None or not a dictionary, an empty StandardLoggingMetadata object is returned.
- If 'user_api_key' is present in metadata and is a valid SHA256 hash, it's stored as 'user_api_key_hash'.
"""
# Initialize with default values
clean_metadata = StandardLoggingMetadata(
user_api_key_hash=None,
user_api_key_alias=None,
user_api_key_team_id=None,
user_api_key_org_id=None,
user_api_key_user_id=None,
user_api_key_team_alias=None,
spend_logs_metadata=None,
requester_ip_address=None,
requester_metadata=None,
)
if isinstance(metadata, dict):
# Filter the metadata dictionary to include only the specified keys
clean_metadata = StandardLoggingMetadata(
**{ # type: ignore
key: metadata[key]
for key in StandardLoggingMetadata.__annotations__.keys()
if key in metadata
}
)
if metadata.get("user_api_key") is not None:
if is_valid_sha256_hash(str(metadata.get("user_api_key"))):
clean_metadata["user_api_key_hash"] = metadata.get(
"user_api_key"
) # this is the hash
return clean_metadata
@staticmethod
def get_usage_from_response_obj(response_obj: Optional[dict]) -> Usage:
## BASE CASE ##
if response_obj is None:
return Usage(
prompt_tokens=0,
completion_tokens=0,
total_tokens=0,
)
usage = response_obj.get("usage", None) or {}
if usage is None or (
not isinstance(usage, dict) and not isinstance(usage, Usage)
):
return Usage(
prompt_tokens=0,
completion_tokens=0,
total_tokens=0,
)
elif isinstance(usage, Usage):
return usage
elif isinstance(usage, dict):
return Usage(**usage)
raise ValueError(f"usage is required, got={usage} of type {type(usage)}")
@staticmethod
def get_model_cost_information(
base_model: Optional[str],
custom_pricing: Optional[bool],
custom_llm_provider: Optional[str],
init_response_obj: Union[Any, BaseModel, dict],
) -> StandardLoggingModelInformation:
model_cost_name = _select_model_name_for_cost_calc(
model=None,
completion_response=init_response_obj, # type: ignore
base_model=base_model,
custom_pricing=custom_pricing,
)
if model_cost_name is None:
model_cost_information = StandardLoggingModelInformation(
model_map_key="", model_map_value=None
)
else:
try:
_model_cost_information = litellm.get_model_info(
model=model_cost_name, custom_llm_provider=custom_llm_provider
)
model_cost_information = StandardLoggingModelInformation(
model_map_key=model_cost_name,
model_map_value=_model_cost_information,
)
except Exception:
verbose_logger.debug( # keep in debug otherwise it will trigger on every call
"Model={} is not mapped in model cost map. Defaulting to None model_cost_information for standard_logging_payload".format(
model_cost_name
)
)
model_cost_information = StandardLoggingModelInformation(
model_map_key=model_cost_name, model_map_value=None
)
return model_cost_information
@staticmethod
def get_final_response_obj(
response_obj: dict, init_response_obj: Union[Any, BaseModel, dict], kwargs: dict
) -> Optional[Union[dict, str, list]]:
"""
Get final response object after redacting the message input/output from logging
"""
if response_obj is not None:
final_response_obj: Optional[Union[dict, str, list]] = response_obj
elif isinstance(init_response_obj, list) or isinstance(init_response_obj, str):
final_response_obj = init_response_obj
else:
final_response_obj = None
modified_final_response_obj = redact_message_input_output_from_logging(
model_call_details=kwargs,
result=final_response_obj,
)
if modified_final_response_obj is not None and isinstance(
modified_final_response_obj, BaseModel
):
final_response_obj = modified_final_response_obj.model_dump()
else:
final_response_obj = modified_final_response_obj
return final_response_obj
def get_standard_logging_object_payload(
kwargs: Optional[dict],
init_response_obj: Union[Any, BaseModel, dict],
start_time: dt_object,
@@ -2677,9 +2689,9 @@ def get_standard_logging_object_payload( # noqa: PLR0915
completion_start_time = kwargs.get("completion_start_time", end_time)
call_type = kwargs.get("call_type")
cache_hit = kwargs.get("cache_hit", False)
usage = response_obj.get("usage", None) or {}
if type(usage) is litellm.Usage:
usage = dict(usage)
usage = StandardLoggingPayloadSetup.get_usage_from_response_obj(
response_obj=response_obj
)
id = response_obj.get("id", kwargs.get("litellm_call_id"))
_model_id = metadata.get("model_info", {}).get("id", "")
@@ -2692,20 +2704,13 @@ def get_standard_logging_object_payload( # noqa: PLR0915
)
# cleanup timestamps
if isinstance(start_time, datetime.datetime):
start_time_float = start_time.timestamp()
elif isinstance(start_time, float):
start_time_float = start_time
if isinstance(end_time, datetime.datetime):
end_time_float = end_time.timestamp()
elif isinstance(end_time, float):
end_time_float = end_time
if isinstance(completion_start_time, datetime.datetime):
completion_start_time_float = completion_start_time.timestamp()
elif isinstance(completion_start_time, float):
completion_start_time_float = completion_start_time
else:
completion_start_time_float = end_time_float
start_time_float, end_time_float, completion_start_time_float = (
StandardLoggingPayloadSetup.cleanup_timestamps(
start_time=start_time,
end_time=end_time,
completion_start_time=completion_start_time,
)
)
# clean up litellm hidden params
clean_hidden_params = StandardLoggingHiddenParams(
model_id=None,
@@ -2723,7 +2728,9 @@ def get_standard_logging_object_payload( # noqa: PLR0915
}
)
# clean up litellm metadata
clean_metadata = get_standard_logging_metadata(metadata=metadata)
clean_metadata = StandardLoggingPayloadSetup.get_standard_logging_metadata(
metadata=metadata
)
if litellm.cache is not None:
cache_key = litellm.cache.get_cache_key(**kwargs)
@@ -2745,58 +2752,21 @@ def get_standard_logging_object_payload( # noqa: PLR0915
## Get model cost information ##
base_model = _get_base_model_from_metadata(model_call_details=kwargs)
custom_pricing = use_custom_pricing_for_model(litellm_params=litellm_params)
model_cost_name = _select_model_name_for_cost_calc(
model=None,
completion_response=init_response_obj, # type: ignore
model_cost_information = StandardLoggingPayloadSetup.get_model_cost_information(
base_model=base_model,
custom_pricing=custom_pricing,
custom_llm_provider=kwargs.get("custom_llm_provider"),
init_response_obj=init_response_obj,
)
if model_cost_name is None:
model_cost_information = StandardLoggingModelInformation(
model_map_key="", model_map_value=None
)
else:
custom_llm_provider = kwargs.get("custom_llm_provider", None)
try:
_model_cost_information = litellm.get_model_info(
model=model_cost_name, custom_llm_provider=custom_llm_provider
)
model_cost_information = StandardLoggingModelInformation(
model_map_key=model_cost_name,
model_map_value=_model_cost_information,
)
except Exception:
verbose_logger.debug( # keep in debug otherwise it will trigger on every call
"Model={} is not mapped in model cost map. Defaulting to None model_cost_information for standard_logging_payload".format(
model_cost_name
)
)
model_cost_information = StandardLoggingModelInformation(
model_map_key=model_cost_name, model_map_value=None
)
response_cost: float = kwargs.get("response_cost", 0) or 0.0
if response_obj is not None:
final_response_obj: Optional[Union[dict, str, list]] = response_obj
elif isinstance(init_response_obj, list) or isinstance(init_response_obj, str):
final_response_obj = init_response_obj
else:
final_response_obj = None
modified_final_response_obj = redact_message_input_output_from_logging(
model_call_details=kwargs,
result=final_response_obj,
## get final response object ##
final_response_obj = StandardLoggingPayloadSetup.get_final_response_obj(
response_obj=response_obj,
init_response_obj=init_response_obj,
kwargs=kwargs,
)
if modified_final_response_obj is not None and isinstance(
modified_final_response_obj, BaseModel
):
final_response_obj = modified_final_response_obj.model_dump()
else:
final_response_obj = modified_final_response_obj
payload: StandardLoggingPayload = StandardLoggingPayload(
id=str(id),
call_type=call_type or "",
@@ -2810,9 +2780,9 @@ def get_standard_logging_object_payload( # noqa: PLR0915
metadata=clean_metadata,
cache_key=cache_key,
response_cost=response_cost,
total_tokens=usage.get("total_tokens", 0),
prompt_tokens=usage.get("prompt_tokens", 0),
completion_tokens=usage.get("completion_tokens", 0),
total_tokens=usage.total_tokens,
prompt_tokens=usage.prompt_tokens,
completion_tokens=usage.completion_tokens,
request_tags=request_tags,
end_user=end_user_id or "",
api_base=litellm_params.get("api_base", ""),
@@ -2859,6 +2829,7 @@ def get_standard_logging_metadata(
user_api_key_hash=None,
user_api_key_alias=None,
user_api_key_team_id=None,
user_api_key_org_id=None,
user_api_key_user_id=None,
user_api_key_team_alias=None,
spend_logs_metadata=None,
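A brief hedged sketch of two helpers introduced by the StandardLoggingPayloadSetup refactor above, assuming this file's import path (litellm/litellm_core_utils/litellm_logging.py): datetimes collapse to epoch floats, and missing usage normalizes to an all-zero Usage object.

# Sketch: exercising two StandardLoggingPayloadSetup helpers from this diff.
from datetime import datetime

from litellm.litellm_core_utils.litellm_logging import StandardLoggingPayloadSetup

start = datetime(2024, 10, 28, 12, 0, 0)
end = datetime(2024, 10, 28, 12, 0, 2)

start_f, end_f, completion_f = StandardLoggingPayloadSetup.cleanup_timestamps(
    start_time=start,
    end_time=end,
    completion_start_time=end,  # datetimes/floats are converted; anything else falls back to end_time
)
assert end_f - start_f == 2.0

# A missing or malformed usage block degrades to zeros instead of raising.
usage = StandardLoggingPayloadSetup.get_usage_from_response_obj(response_obj=None)
assert usage.total_tokens == 0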

View file

@@ -0,0 +1,508 @@
import asyncio
import json
import time
import traceback
import uuid
from typing import Dict, Iterable, List, Literal, Optional, Union
import litellm
from litellm._logging import verbose_logger
from litellm.types.utils import (
ChatCompletionDeltaToolCall,
ChatCompletionMessageToolCall,
Choices,
Delta,
EmbeddingResponse,
Function,
ImageResponse,
Message,
ModelResponse,
RerankResponse,
StreamingChoices,
TranscriptionResponse,
Usage,
)
from .get_headers import get_response_headers
async def convert_to_streaming_response_async(response_object: Optional[dict] = None):
"""
Asynchronously converts a response object to a streaming response.
Args:
response_object (Optional[dict]): The response object to be converted. Defaults to None.
Raises:
Exception: If the response object is None.
Yields:
ModelResponse: The converted streaming response object.
Returns:
None
"""
if response_object is None:
raise Exception("Error in response object format")
model_response_object = ModelResponse(stream=True)
if model_response_object is None:
raise Exception("Error in response creating model response object")
choice_list = []
for idx, choice in enumerate(response_object["choices"]):
if (
choice["message"].get("tool_calls", None) is not None
and isinstance(choice["message"]["tool_calls"], list)
and len(choice["message"]["tool_calls"]) > 0
and isinstance(choice["message"]["tool_calls"][0], dict)
):
pydantic_tool_calls = []
for index, t in enumerate(choice["message"]["tool_calls"]):
if "index" not in t:
t["index"] = index
pydantic_tool_calls.append(ChatCompletionDeltaToolCall(**t))
choice["message"]["tool_calls"] = pydantic_tool_calls
delta = Delta(
content=choice["message"].get("content", None),
role=choice["message"]["role"],
function_call=choice["message"].get("function_call", None),
tool_calls=choice["message"].get("tool_calls", None),
)
finish_reason = choice.get("finish_reason", None)
if finish_reason is None:
finish_reason = choice.get("finish_details")
logprobs = choice.get("logprobs", None)
choice = StreamingChoices(
finish_reason=finish_reason, index=idx, delta=delta, logprobs=logprobs
)
choice_list.append(choice)
model_response_object.choices = choice_list
if "usage" in response_object and response_object["usage"] is not None:
setattr(
model_response_object,
"usage",
Usage(
completion_tokens=response_object["usage"].get("completion_tokens", 0),
prompt_tokens=response_object["usage"].get("prompt_tokens", 0),
total_tokens=response_object["usage"].get("total_tokens", 0),
),
)
if "id" in response_object:
model_response_object.id = response_object["id"]
if "created" in response_object:
model_response_object.created = response_object["created"]
if "system_fingerprint" in response_object:
model_response_object.system_fingerprint = response_object["system_fingerprint"]
if "model" in response_object:
model_response_object.model = response_object["model"]
yield model_response_object
await asyncio.sleep(0)
def convert_to_streaming_response(response_object: Optional[dict] = None):
# used for yielding Cache hits when stream == True
if response_object is None:
raise Exception("Error in response object format")
model_response_object = ModelResponse(stream=True)
choice_list = []
for idx, choice in enumerate(response_object["choices"]):
delta = Delta(
content=choice["message"].get("content", None),
role=choice["message"]["role"],
function_call=choice["message"].get("function_call", None),
tool_calls=choice["message"].get("tool_calls", None),
)
finish_reason = choice.get("finish_reason", None)
if finish_reason is None:
# gpt-4 vision can return 'finish_reason' or 'finish_details'
finish_reason = choice.get("finish_details")
logprobs = choice.get("logprobs", None)
enhancements = choice.get("enhancements", None)
choice = StreamingChoices(
finish_reason=finish_reason,
index=idx,
delta=delta,
logprobs=logprobs,
enhancements=enhancements,
)
choice_list.append(choice)
model_response_object.choices = choice_list
if "usage" in response_object and response_object["usage"] is not None:
setattr(model_response_object, "usage", Usage())
model_response_object.usage.completion_tokens = response_object["usage"].get("completion_tokens", 0) # type: ignore
model_response_object.usage.prompt_tokens = response_object["usage"].get("prompt_tokens", 0) # type: ignore
model_response_object.usage.total_tokens = response_object["usage"].get("total_tokens", 0) # type: ignore
if "id" in response_object:
model_response_object.id = response_object["id"]
if "created" in response_object:
model_response_object.created = response_object["created"]
if "system_fingerprint" in response_object:
model_response_object.system_fingerprint = response_object["system_fingerprint"]
if "model" in response_object:
model_response_object.model = response_object["model"]
yield model_response_object
from collections import defaultdict
def _handle_invalid_parallel_tool_calls(
tool_calls: List[ChatCompletionMessageToolCall],
):
"""
Handle hallucinated parallel tool call from openai - https://community.openai.com/t/model-tries-to-call-unknown-function-multi-tool-use-parallel/490653
Code modified from: https://github.com/phdowling/openai_multi_tool_use_parallel_patch/blob/main/openai_multi_tool_use_parallel_patch.py
"""
if tool_calls is None:
return
try:
replacements: Dict[int, List[ChatCompletionMessageToolCall]] = defaultdict(list)
for i, tool_call in enumerate(tool_calls):
current_function = tool_call.function.name
function_args = json.loads(tool_call.function.arguments)
if current_function == "multi_tool_use.parallel":
verbose_logger.debug(
"OpenAI did a weird pseudo-multi-tool-use call, fixing call structure.."
)
for _fake_i, _fake_tool_use in enumerate(function_args["tool_uses"]):
_function_args = _fake_tool_use["parameters"]
_current_function = _fake_tool_use["recipient_name"]
if _current_function.startswith("functions."):
_current_function = _current_function[len("functions.") :]
fixed_tc = ChatCompletionMessageToolCall(
id=f"{tool_call.id}_{_fake_i}",
type="function",
function=Function(
name=_current_function, arguments=json.dumps(_function_args)
),
)
replacements[i].append(fixed_tc)
shift = 0
for i, replacement in replacements.items():
tool_calls[:] = (
tool_calls[: i + shift] + replacement + tool_calls[i + shift + 1 :]
)
shift += len(replacement)
return tool_calls
except json.JSONDecodeError:
# if there is a JSONDecodeError, return the original tool_calls
return tool_calls
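# Illustrative example (comment only, not executed): a hallucinated call like
#   ChatCompletionMessageToolCall(
#       id="call_0",
#       type="function",
#       function=Function(
#           name="multi_tool_use.parallel",
#           arguments='{"tool_uses": [{"recipient_name": "functions.get_weather",
#                                       "parameters": {"city": "Paris"}}]}',
#       ),
#   )
# is rewritten in place by the helper above into an ordinary per-function call:
#   ChatCompletionMessageToolCall(
#       id="call_0_0",
#       type="function",
#       function=Function(name="get_weather", arguments='{"city": "Paris"}'),
#   )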
class LiteLLMResponseObjectHandler:
@staticmethod
def convert_to_image_response(
response_object: dict,
model_response_object: Optional[ImageResponse] = None,
hidden_params: Optional[dict] = None,
) -> ImageResponse:
response_object.update({"hidden_params": hidden_params})
if model_response_object is None:
model_response_object = ImageResponse(**response_object)
return model_response_object
else:
model_response_dict = model_response_object.model_dump()
model_response_dict.update(response_object)
model_response_object = ImageResponse(**model_response_dict)
return model_response_object
def convert_to_model_response_object( # noqa: PLR0915
response_object: Optional[dict] = None,
model_response_object: Optional[
Union[
ModelResponse,
EmbeddingResponse,
ImageResponse,
TranscriptionResponse,
RerankResponse,
]
] = None,
response_type: Literal[
"completion", "embedding", "image_generation", "audio_transcription", "rerank"
] = "completion",
stream=False,
start_time=None,
end_time=None,
hidden_params: Optional[dict] = None,
_response_headers: Optional[dict] = None,
convert_tool_call_to_json_mode: Optional[
bool
] = None, # used for supporting 'json_schema' on older models
):
received_args = locals()
additional_headers = get_response_headers(_response_headers)
if hidden_params is None:
hidden_params = {}
hidden_params["additional_headers"] = additional_headers
### CHECK IF ERROR IN RESPONSE ### - openrouter returns these in the dictionary
if (
response_object is not None
and "error" in response_object
and response_object["error"] is not None
):
error_args = {"status_code": 422, "message": "Error in response object"}
if isinstance(response_object["error"], dict):
if "code" in response_object["error"]:
error_args["status_code"] = response_object["error"]["code"]
if "message" in response_object["error"]:
if isinstance(response_object["error"]["message"], dict):
message_str = json.dumps(response_object["error"]["message"])
else:
message_str = str(response_object["error"]["message"])
error_args["message"] = message_str
raised_exception = Exception()
setattr(raised_exception, "status_code", error_args["status_code"])
setattr(raised_exception, "message", error_args["message"])
raise raised_exception
try:
if response_type == "completion" and (
model_response_object is None
or isinstance(model_response_object, ModelResponse)
):
if response_object is None or model_response_object is None:
raise Exception("Error in response object format")
if stream is True:
# for returning cached responses, we need to yield a generator
return convert_to_streaming_response(response_object=response_object)
choice_list = []
assert response_object["choices"] is not None and isinstance(
response_object["choices"], Iterable
)
for idx, choice in enumerate(response_object["choices"]):
## HANDLE JSON MODE - anthropic returns single function call
tool_calls = choice["message"].get("tool_calls", None)
if tool_calls is not None:
_openai_tool_calls = []
for _tc in tool_calls:
_openai_tc = ChatCompletionMessageToolCall(**_tc)
_openai_tool_calls.append(_openai_tc)
fixed_tool_calls = _handle_invalid_parallel_tool_calls(
_openai_tool_calls
)
if fixed_tool_calls is not None:
tool_calls = fixed_tool_calls
message: Optional[Message] = None
finish_reason: Optional[str] = None
if (
convert_tool_call_to_json_mode
and tool_calls is not None
and len(tool_calls) == 1
):
# to support 'json_schema' logic on older models
json_mode_content_str: Optional[str] = tool_calls[0][
"function"
].get("arguments")
if json_mode_content_str is not None:
message = litellm.Message(content=json_mode_content_str)
finish_reason = "stop"
if message is None:
message = Message(
content=choice["message"].get("content", None),
role=choice["message"]["role"] or "assistant",
function_call=choice["message"].get("function_call", None),
tool_calls=tool_calls,
audio=choice["message"].get("audio", None),
)
finish_reason = choice.get("finish_reason", None)
if finish_reason is None:
# gpt-4 vision can return 'finish_reason' or 'finish_details'
finish_reason = choice.get("finish_details") or "stop"
logprobs = choice.get("logprobs", None)
enhancements = choice.get("enhancements", None)
choice = Choices(
finish_reason=finish_reason,
index=idx,
message=message,
logprobs=logprobs,
enhancements=enhancements,
)
choice_list.append(choice)
model_response_object.choices = choice_list
if "usage" in response_object and response_object["usage"] is not None:
usage_object = litellm.Usage(**response_object["usage"])
setattr(model_response_object, "usage", usage_object)
if "created" in response_object:
model_response_object.created = response_object["created"] or int(
time.time()
)
if "id" in response_object:
model_response_object.id = response_object["id"] or str(uuid.uuid4())
if "system_fingerprint" in response_object:
model_response_object.system_fingerprint = response_object[
"system_fingerprint"
]
if "model" in response_object:
if model_response_object.model is None:
model_response_object.model = response_object["model"]
elif (
"/" in model_response_object.model
and response_object["model"] is not None
):
openai_compatible_provider = model_response_object.model.split("/")[
0
]
model_response_object.model = (
openai_compatible_provider + "/" + response_object["model"]
)
if start_time is not None and end_time is not None:
if isinstance(start_time, type(end_time)):
model_response_object._response_ms = ( # type: ignore
end_time - start_time
).total_seconds() * 1000
if hidden_params is not None:
if model_response_object._hidden_params is None:
model_response_object._hidden_params = {}
model_response_object._hidden_params.update(hidden_params)
if _response_headers is not None:
model_response_object._response_headers = _response_headers
special_keys = list(litellm.ModelResponse.model_fields.keys())
special_keys.append("usage")
for k, v in response_object.items():
if k not in special_keys:
setattr(model_response_object, k, v)
return model_response_object
elif response_type == "embedding" and (
model_response_object is None
or isinstance(model_response_object, EmbeddingResponse)
):
if response_object is None:
raise Exception("Error in response object format")
if model_response_object is None:
model_response_object = EmbeddingResponse()
if "model" in response_object:
model_response_object.model = response_object["model"]
if "object" in response_object:
model_response_object.object = response_object["object"]
model_response_object.data = response_object["data"]
if "usage" in response_object and response_object["usage"] is not None:
model_response_object.usage.completion_tokens = response_object["usage"].get("completion_tokens", 0) # type: ignore
model_response_object.usage.prompt_tokens = response_object["usage"].get("prompt_tokens", 0) # type: ignore
model_response_object.usage.total_tokens = response_object["usage"].get("total_tokens", 0) # type: ignore
if start_time is not None and end_time is not None:
model_response_object._response_ms = ( # type: ignore
end_time - start_time
).total_seconds() * 1000 # return response latency in ms like openai
if hidden_params is not None:
model_response_object._hidden_params = hidden_params
if _response_headers is not None:
model_response_object._response_headers = _response_headers
return model_response_object
elif response_type == "image_generation" and (
model_response_object is None
or isinstance(model_response_object, ImageResponse)
):
if response_object is None:
raise Exception("Error in response object format")
return LiteLLMResponseObjectHandler.convert_to_image_response(
response_object=response_object,
model_response_object=model_response_object,
hidden_params=hidden_params,
)
elif response_type == "audio_transcription" and (
model_response_object is None
or isinstance(model_response_object, TranscriptionResponse)
):
if response_object is None:
raise Exception("Error in response object format")
if model_response_object is None:
model_response_object = TranscriptionResponse()
if "text" in response_object:
model_response_object.text = response_object["text"]
optional_keys = ["language", "task", "duration", "words", "segments"]
for key in optional_keys: # not guaranteed to be in response
if key in response_object:
setattr(model_response_object, key, response_object[key])
if hidden_params is not None:
model_response_object._hidden_params = hidden_params
if _response_headers is not None:
model_response_object._response_headers = _response_headers
return model_response_object
elif response_type == "rerank" and (
model_response_object is None
or isinstance(model_response_object, RerankResponse)
):
if response_object is None:
raise Exception("Error in response object format")
if model_response_object is None:
model_response_object = RerankResponse(**response_object)
return model_response_object
if "id" in response_object:
model_response_object.id = response_object["id"]
if "meta" in response_object:
model_response_object.meta = response_object["meta"]
if "results" in response_object:
model_response_object.results = response_object["results"]
return model_response_object
except Exception:
raise Exception(
f"Invalid response object {traceback.format_exc()}\n\nreceived_args={received_args}"
)
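
A minimal, self-contained sketch of the error check at the top of convert_to_model_response_object, re-implemented in plain Python so it can run on its own. The sample payload is an assumption based on the "openrouter returns these in the dictionary" comment above, not a captured response.

# Sketch: mirrors the error-extraction logic above (status_code/message attributes
# attached to a bare Exception). Sample payload is illustrative only.
import json


def raise_if_error(response_object: dict) -> None:
    error = response_object.get("error")
    if error is None:
        return
    status_code = 422
    message = "Error in response object"
    if isinstance(error, dict):
        if "code" in error:
            status_code = error["code"]
        if "message" in error:
            msg = error["message"]
            message = json.dumps(msg) if isinstance(msg, dict) else str(msg)
    exc = Exception()
    setattr(exc, "status_code", status_code)
    setattr(exc, "message", message)
    raise exc


try:
    raise_if_error({"error": {"code": 429, "message": "rate limited"}})
except Exception as e:
    print(getattr(e, "status_code", None), getattr(e, "message", None))  # 429 rate limited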

View file

@ -15,27 +15,28 @@ def get_response_headers(_response_headers: Optional[dict] = None) -> dict:
dict: _response_headers with OpenAI headers and llm_provider-{header}
"""
if _response_headers is not None:
openai_headers = {}
if "x-ratelimit-limit-requests" in _response_headers:
openai_headers["x-ratelimit-limit-requests"] = _response_headers[
"x-ratelimit-limit-requests"
]
if "x-ratelimit-remaining-requests" in _response_headers:
openai_headers["x-ratelimit-remaining-requests"] = _response_headers[
"x-ratelimit-remaining-requests"
]
if "x-ratelimit-limit-tokens" in _response_headers:
openai_headers["x-ratelimit-limit-tokens"] = _response_headers[
"x-ratelimit-limit-tokens"
]
if "x-ratelimit-remaining-tokens" in _response_headers:
openai_headers["x-ratelimit-remaining-tokens"] = _response_headers[
"x-ratelimit-remaining-tokens"
]
llm_provider_headers = _get_llm_provider_headers(_response_headers)
return {**llm_provider_headers, **openai_headers}
return {}
if _response_headers is None:
return {}
openai_headers = {}
if "x-ratelimit-limit-requests" in _response_headers:
openai_headers["x-ratelimit-limit-requests"] = _response_headers[
"x-ratelimit-limit-requests"
]
if "x-ratelimit-remaining-requests" in _response_headers:
openai_headers["x-ratelimit-remaining-requests"] = _response_headers[
"x-ratelimit-remaining-requests"
]
if "x-ratelimit-limit-tokens" in _response_headers:
openai_headers["x-ratelimit-limit-tokens"] = _response_headers[
"x-ratelimit-limit-tokens"
]
if "x-ratelimit-remaining-tokens" in _response_headers:
openai_headers["x-ratelimit-remaining-tokens"] = _response_headers[
"x-ratelimit-remaining-tokens"
]
llm_provider_headers = _get_llm_provider_headers(_response_headers)
return {**llm_provider_headers, **openai_headers}
def _get_llm_provider_headers(response_headers: dict) -> dict:
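
The refactor above replaces the nested if-block with an early return; the merged dict keeps the four OpenAI rate-limit headers as-is and layers them over provider-prefixed copies. Below is a standalone sketch of that merge; the "llm_provider-" prefixing is an assumption based on the docstring ("llm_provider-{header}"), since _get_llm_provider_headers itself is not shown in this diff.

# Standalone sketch of the header merge done by get_response_headers.
# The prefixing helper is a stand-in for _get_llm_provider_headers (assumed behavior).
from typing import Optional

OPENAI_RATE_LIMIT_HEADERS = [
    "x-ratelimit-limit-requests",
    "x-ratelimit-remaining-requests",
    "x-ratelimit-limit-tokens",
    "x-ratelimit-remaining-tokens",
]


def merge_response_headers(_response_headers: Optional[dict] = None) -> dict:
    if _response_headers is None:
        return {}
    openai_headers = {
        k: _response_headers[k] for k in OPENAI_RATE_LIMIT_HEADERS if k in _response_headers
    }
    llm_provider_headers = {f"llm_provider-{k}": v for k, v in _response_headers.items()}
    return {**llm_provider_headers, **openai_headers}


print(merge_response_headers({"x-ratelimit-remaining-requests": "99", "retry-after": "1"}))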

View file

@ -26,15 +26,24 @@ async with websockets.connect( # type: ignore
import asyncio
import concurrent.futures
import json
import traceback
from asyncio import Task
from typing import Any, Dict, List, Optional, Union
import litellm
from .litellm_logging import Logging as LiteLLMLogging
# Create a thread pool with a maximum of 10 threads
executor = concurrent.futures.ThreadPoolExecutor(max_workers=10)
DefaultLoggedRealTimeEventTypes = [
"session.created",
"response.create",
"response.done",
]
class RealTimeStreaming:
def __init__(
@ -49,9 +58,27 @@ class RealTimeStreaming:
self.messages: List = []
self.input_message: Dict = {}
_logged_real_time_event_types = litellm.logged_real_time_event_types
if _logged_real_time_event_types is None:
_logged_real_time_event_types = DefaultLoggedRealTimeEventTypes
self.logged_real_time_event_types = _logged_real_time_event_types
def _should_store_message(self, message: Union[str, bytes]) -> bool:
if isinstance(message, bytes):
message = message.decode("utf-8")
message_obj = json.loads(message)
_msg_type = message_obj["type"]
if self.logged_real_time_event_types == "*":
return True
if _msg_type in self.logged_real_time_event_types:
return True
return False
def store_message(self, message: Union[str, bytes]):
"""Store message in list"""
self.messages.append(message)
if self._should_store_message(message):
self.messages.append(message)
def store_input(self, message: dict):
"""Store input message"""

View file

@ -198,9 +198,6 @@ class AzureOpenAIConfig:
optional_params["json_mode"] = True
else:
optional_params["response_format"] = value
elif param == "max_completion_tokens":
# TODO - Azure OpenAI will probably add support for this, we should pass it through when Azure adds support
optional_params["max_tokens"] = value
elif param in supported_openai_params:
optional_params[param] = value

View file

@ -72,5 +72,5 @@ class AzureOpenAIRealtime(AzureChatCompletion):
except websockets.exceptions.InvalidStatusCode as e: # type: ignore
await websocket.close(code=e.status_code, reason=str(e))
except Exception as e:
await websocket.close(code=1011, reason=f"Internal server error: {str(e)}")
except Exception:
pass

View file

@ -1349,7 +1349,7 @@ class OpenAIChatCompletion(BaseLLM):
if aimg_generation is True:
return self.aimage_generation(data=data, prompt=prompt, logging_obj=logging_obj, model_response=model_response, api_base=api_base, api_key=api_key, timeout=timeout, client=client, max_retries=max_retries) # type: ignore
openai_client = self._get_openai_client(
openai_client: OpenAI = self._get_openai_client( # type: ignore
is_async=False,
api_key=api_key,
api_base=api_base,
@ -1371,8 +1371,9 @@ class OpenAIChatCompletion(BaseLLM):
)
## COMPLETION CALL
response = openai_client.images.generate(**data, timeout=timeout) # type: ignore
response = response.model_dump() # type: ignore
_response = openai_client.images.generate(**data, timeout=timeout) # type: ignore
response = _response.model_dump()
## LOGGING
logging_obj.post_call(
input=prompt,
@ -1380,7 +1381,6 @@ class OpenAIChatCompletion(BaseLLM):
additional_args={"complete_input_dict": data},
original_response=response,
)
# return response
return convert_to_model_response_object(response_object=response, model_response_object=model_response, response_type="image_generation") # type: ignore
except OpenAIError as e:

View file

@ -398,6 +398,8 @@ class AnthropicChatCompletion(BaseLLM):
error_response = getattr(e, "response", None)
if error_headers is None and error_response:
error_headers = getattr(error_response, "headers", None)
if error_response and hasattr(error_response, "text"):
error_text = getattr(error_response, "text", error_text)
raise AnthropicError(
message=error_text,
status_code=status_code,

View file

@ -9,7 +9,7 @@ import httpx
from openai import OpenAI
import litellm
from litellm.llms.cohere.embed import embedding as cohere_embedding
from litellm.llms.cohere.embed.handler import embedding as cohere_embedding
from litellm.llms.custom_httpx.http_handler import (
AsyncHTTPHandler,
HTTPHandler,

View file

@ -19,6 +19,7 @@ from ..common_utils import BedrockError
from .invoke_handler import AWSEventStreamDecoder, MockResponseIterator, make_call
BEDROCK_CONVERSE_MODELS = [
"anthropic.claude-3-5-sonnet-20241022-v2:0",
"anthropic.claude-3-5-sonnet-20240620-v1:0",
"anthropic.claude-3-opus-20240229-v1:0",
"anthropic.claude-3-sonnet-20240229-v1:0",

View file

@ -7,6 +7,7 @@ Why separate file? Make it easy to see how transformation works
from typing import List
import litellm
from litellm.llms.cohere.embed.transformation import CohereEmbeddingConfig
from litellm.types.llms.bedrock import CohereEmbeddingRequest, CohereEmbeddingResponse
from litellm.types.utils import Embedding, EmbeddingResponse
@ -26,15 +27,21 @@ class BedrockCohereEmbeddingConfig:
optional_params["embedding_types"] = v
return optional_params
def _is_v3_model(self, model: str) -> bool:
return "3" in model
def _transform_request(
self, input: List[str], inference_params: dict
self, model: str, input: List[str], inference_params: dict
) -> CohereEmbeddingRequest:
transformed_request = CohereEmbeddingRequest(
texts=input,
input_type=litellm.COHERE_DEFAULT_EMBEDDING_INPUT_TYPE, # type: ignore
transformed_request = CohereEmbeddingConfig()._transform_request(
model, input, inference_params
)
for k, v in inference_params.items():
transformed_request[k] = v # type: ignore
new_transformed_request = CohereEmbeddingRequest(
input_type=transformed_request["input_type"],
)
for k in CohereEmbeddingRequest.__annotations__.keys():
if k in transformed_request:
new_transformed_request[k] = transformed_request[k] # type: ignore
return transformed_request
return new_transformed_request
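
The new request builder above delegates to CohereEmbeddingConfig and then copies only the keys declared on CohereEmbeddingRequest, so provider-specific fields (like "model") never leak into the Bedrock payload. The sketch below shows that key-filtering pattern with a hypothetical TypedDict stand-in rather than the real request type.

# Sketch of filtering a dict down to a TypedDict's declared keys.
# ExampleRequest is a hypothetical stand-in for CohereEmbeddingRequest.
from typing import List, TypedDict


class ExampleRequest(TypedDict, total=False):
    texts: List[str]
    input_type: str
    embedding_types: List[str]


def filter_to_request_keys(raw: dict) -> ExampleRequest:
    filtered = ExampleRequest()
    for key in ExampleRequest.__annotations__.keys():
        if key in raw:
            filtered[key] = raw[key]  # type: ignore
    return filtered


raw = {"texts": ["hello"], "input_type": "search_document", "model": "embed-english-v3.0"}
print(filter_to_request_keys(raw))  # "model" is dropped; only declared keys survive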

View file

@ -11,7 +11,7 @@ from typing import Any, Callable, List, Literal, Optional, Tuple, Union
import httpx
import litellm
from litellm.llms.cohere.embed import embedding as cohere_embedding
from litellm.llms.cohere.embed.handler import embedding as cohere_embedding
from litellm.llms.custom_httpx.http_handler import (
AsyncHTTPHandler,
HTTPHandler,
@ -369,7 +369,7 @@ class BedrockEmbedding(BaseAWSLLM):
batch_data: Optional[List] = None
if provider == "cohere":
data = BedrockCohereEmbeddingConfig()._transform_request(
input=input, inference_params=inference_params
model=model, input=input, inference_params=inference_params
)
elif provider == "amazon" and model in [
"amazon.titan-embed-image-v1",

View file

@ -12,8 +12,11 @@ import requests # type: ignore
import litellm
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from litellm.types.llms.bedrock import CohereEmbeddingRequest
from litellm.utils import Choices, Message, ModelResponse, Usage
from .transformation import CohereEmbeddingConfig
def validate_environment(api_key, headers: dict):
headers.update(
@ -41,39 +44,9 @@ class CohereError(Exception):
) # Call the base class constructor with the parameters it needs
def _process_embedding_response(
embeddings: list,
model_response: litellm.EmbeddingResponse,
model: str,
encoding: Any,
input: list,
) -> litellm.EmbeddingResponse:
output_data = []
for idx, embedding in enumerate(embeddings):
output_data.append(
{"object": "embedding", "index": idx, "embedding": embedding}
)
model_response.object = "list"
model_response.data = output_data
model_response.model = model
input_tokens = 0
for text in input:
input_tokens += len(encoding.encode(text))
setattr(
model_response,
"usage",
Usage(
prompt_tokens=input_tokens, completion_tokens=0, total_tokens=input_tokens
),
)
return model_response
async def async_embedding(
model: str,
data: dict,
data: Union[dict, CohereEmbeddingRequest],
input: list,
model_response: litellm.utils.EmbeddingResponse,
timeout: Optional[Union[float, httpx.Timeout]],
@ -121,19 +94,12 @@ async def async_embedding(
)
raise e
## LOGGING
logging_obj.post_call(
input=input,
api_key=api_key,
additional_args={"complete_input_dict": data},
original_response=response.text,
)
embeddings = response.json()["embeddings"]
## PROCESS RESPONSE ##
return _process_embedding_response(
embeddings=embeddings,
return CohereEmbeddingConfig()._transform_response(
response=response,
api_key=api_key,
logging_obj=logging_obj,
data=data,
model_response=model_response,
model=model,
encoding=encoding,
@ -149,7 +115,7 @@ def embedding(
optional_params: dict,
headers: dict,
encoding: Any,
data: Optional[dict] = None,
data: Optional[Union[dict, CohereEmbeddingRequest]] = None,
complete_api_base: Optional[str] = None,
api_key: Optional[str] = None,
aembedding: Optional[bool] = None,
@ -159,11 +125,10 @@ def embedding(
headers = validate_environment(api_key, headers=headers)
embed_url = complete_api_base or "https://api.cohere.ai/v1/embed"
model = model
data = data or {"model": model, "texts": input, **optional_params}
if "3" in model and "input_type" not in data:
# cohere v3 embedding models require input_type, if no input_type is provided, default to "search_document"
data["input_type"] = "search_document"
data = data or CohereEmbeddingConfig()._transform_request(
model=model, input=input, inference_params=optional_params
)
## ROUTING
if aembedding is True:
@ -193,30 +158,12 @@ def embedding(
client = HTTPHandler(concurrent_limit=1)
response = client.post(embed_url, headers=headers, data=json.dumps(data))
## LOGGING
logging_obj.post_call(
input=input,
api_key=api_key,
additional_args={"complete_input_dict": data},
original_response=response,
)
"""
response
{
'object': "list",
'data': [
]
'model',
'usage'
}
"""
if response.status_code != 200:
raise CohereError(message=response.text, status_code=response.status_code)
embeddings = response.json()["embeddings"]
return _process_embedding_response(
embeddings=embeddings,
return CohereEmbeddingConfig()._transform_response(
response=response,
api_key=api_key,
logging_obj=logging_obj,
data=data,
model_response=model_response,
model=model,
encoding=encoding,

View file

@ -0,0 +1,160 @@
"""
Transformation logic from OpenAI /v1/embeddings format to Cohere's /v1/embed format.
Why separate file? Make it easy to see how transformation works
Covers:
- v3 embedding models
- v2 embedding models
Docs - https://docs.cohere.com/v2/reference/embed
"""
import types
from typing import Any, List, Optional, Union
import httpx
from litellm import COHERE_DEFAULT_EMBEDDING_INPUT_TYPE
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
from litellm.types.llms.bedrock import (
COHERE_EMBEDDING_INPUT_TYPES,
CohereEmbeddingRequest,
CohereEmbeddingRequestWithModel,
)
from litellm.types.utils import (
Embedding,
EmbeddingResponse,
PromptTokensDetailsWrapper,
Usage,
)
from litellm.utils import is_base64_encoded
class CohereEmbeddingConfig:
"""
Reference: https://docs.cohere.com/v2/reference/embed
"""
def __init__(self) -> None:
pass
def get_supported_openai_params(self) -> List[str]:
return ["encoding_format"]
def map_openai_params(
self, non_default_params: dict, optional_params: dict
) -> dict:
for k, v in non_default_params.items():
if k == "encoding_format":
optional_params["embedding_types"] = v
return optional_params
def _is_v3_model(self, model: str) -> bool:
return "3" in model
def _transform_request(
self, model: str, input: List[str], inference_params: dict
) -> CohereEmbeddingRequestWithModel:
is_encoded = False
for input_str in input:
is_encoded = is_base64_encoded(input_str)
if is_encoded: # check if string is b64 encoded image or not
transformed_request = CohereEmbeddingRequestWithModel(
model=model,
images=input,
input_type="image",
)
else:
transformed_request = CohereEmbeddingRequestWithModel(
model=model,
texts=input,
input_type=COHERE_DEFAULT_EMBEDDING_INPUT_TYPE,
)
for k, v in inference_params.items():
transformed_request[k] = v # type: ignore
return transformed_request
def _calculate_usage(self, input: List[str], encoding: Any, meta: dict) -> Usage:
input_tokens = 0
text_tokens: Optional[int] = meta.get("billed_units", {}).get("input_tokens")
image_tokens: Optional[int] = meta.get("billed_units", {}).get("images")
prompt_tokens_details: Optional[PromptTokensDetailsWrapper] = None
if image_tokens is None and text_tokens is None:
for text in input:
input_tokens += len(encoding.encode(text))
else:
prompt_tokens_details = PromptTokensDetailsWrapper(
image_tokens=image_tokens,
text_tokens=text_tokens,
)
if image_tokens:
input_tokens += image_tokens
if text_tokens:
input_tokens += text_tokens
return Usage(
prompt_tokens=input_tokens,
completion_tokens=0,
total_tokens=input_tokens,
prompt_tokens_details=prompt_tokens_details,
)
def _transform_response(
self,
response: httpx.Response,
api_key: Optional[str],
logging_obj: LiteLLMLoggingObj,
data: Union[dict, CohereEmbeddingRequest],
model_response: EmbeddingResponse,
model: str,
encoding: Any,
input: list,
) -> EmbeddingResponse:
response_json = response.json()
## LOGGING
logging_obj.post_call(
input=input,
api_key=api_key,
additional_args={"complete_input_dict": data},
original_response=response_json,
)
"""
response
{
'object': "list",
'data': [
]
'model',
'usage'
}
"""
embeddings = response_json["embeddings"]
output_data = []
for idx, embedding in enumerate(embeddings):
output_data.append(
{"object": "embedding", "index": idx, "embedding": embedding}
)
model_response.object = "list"
model_response.data = output_data
model_response.model = model
input_tokens = 0
for text in input:
input_tokens += len(encoding.encode(text))
setattr(
model_response,
"usage",
self._calculate_usage(input, encoding, response_json.get("meta", {})),
)
return model_response
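
A quick way to exercise the new transformation in isolation is to build a request with it directly; the import path comes from the Bedrock Cohere config diff above, so this only runs against a litellm install that already contains this file. The commented output is an expectation based on the code above, not captured output.

# Sketch: build a Cohere /v1/embed request body via the new config class.
from litellm.llms.cohere.embed.transformation import CohereEmbeddingConfig

config = CohereEmbeddingConfig()
request = config._transform_request(
    model="embed-english-v3.0",
    input=["hello world"],
    inference_params={"embedding_types": ["float"]},
)
print(request)
# expected shape: {'model': 'embed-english-v3.0', 'texts': ['hello world'],
#                  'input_type': 'search_document', 'embedding_types': ['float']}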

View file

@ -1,7 +1,7 @@
import asyncio
import os
import traceback
from typing import TYPE_CHECKING, Any, Mapping, Optional, Union
from typing import TYPE_CHECKING, Any, Callable, List, Mapping, Optional, Union
import httpx
from httpx import USE_CLIENT_DEFAULT
@ -32,15 +32,20 @@ class AsyncHTTPHandler:
def __init__(
self,
timeout: Optional[Union[float, httpx.Timeout]] = None,
event_hooks: Optional[Mapping[str, List[Callable[..., Any]]]] = None,
concurrent_limit=1000,
):
self.timeout = timeout
self.event_hooks = event_hooks
self.client = self.create_client(
timeout=timeout, concurrent_limit=concurrent_limit
timeout=timeout, concurrent_limit=concurrent_limit, event_hooks=event_hooks
)
def create_client(
self, timeout: Optional[Union[float, httpx.Timeout]], concurrent_limit: int
self,
timeout: Optional[Union[float, httpx.Timeout]],
concurrent_limit: int,
event_hooks: Optional[Mapping[str, List[Callable[..., Any]]]],
) -> httpx.AsyncClient:
# SSL certificates (a.k.a CA bundle) used to verify the identity of requested hosts.
@ -55,6 +60,7 @@ class AsyncHTTPHandler:
# Create a client with a connection pool
return httpx.AsyncClient(
event_hooks=event_hooks,
timeout=timeout,
limits=httpx.Limits(
max_connections=concurrent_limit,
@ -114,7 +120,9 @@ class AsyncHTTPHandler:
return response
except (httpx.RemoteProtocolError, httpx.ConnectError):
# Retry the request with a new session if there is a connection error
new_client = self.create_client(timeout=timeout, concurrent_limit=1)
new_client = self.create_client(
timeout=timeout, concurrent_limit=1, event_hooks=self.event_hooks
)
try:
return await self.single_connection_post_request(
url=url,
@ -144,8 +152,10 @@ class AsyncHTTPHandler:
setattr(e, "status_code", e.response.status_code)
if stream is True:
setattr(e, "message", await e.response.aread())
setattr(e, "text", await e.response.aread())
else:
setattr(e, "message", e.response.text)
setattr(e, "text", e.response.text)
raise e
except Exception as e:
raise e
@ -172,7 +182,9 @@ class AsyncHTTPHandler:
return response
except (httpx.RemoteProtocolError, httpx.ConnectError):
# Retry the request with a new session if there is a connection error
new_client = self.create_client(timeout=timeout, concurrent_limit=1)
new_client = self.create_client(
timeout=timeout, concurrent_limit=1, event_hooks=self.event_hooks
)
try:
return await self.single_connection_post_request(
url=url,
@ -229,7 +241,9 @@ class AsyncHTTPHandler:
return response
except (httpx.RemoteProtocolError, httpx.ConnectError):
# Retry the request with a new session if there is a connection error
new_client = self.create_client(timeout=timeout, concurrent_limit=1)
new_client = self.create_client(
timeout=timeout, concurrent_limit=1, event_hooks=self.event_hooks
)
try:
return await self.single_connection_post_request(
url=url,
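
The event_hooks mapping threaded through AsyncHTTPHandler above is httpx's own hook mechanism: a dict of async callables keyed by "request" and "response", passed straight to httpx.AsyncClient (including the retry client created after connection errors). A minimal sketch using httpx directly:

# Sketch: request/response logging hooks, the same mapping the handler above forwards.
import asyncio

import httpx


async def log_request(request: httpx.Request) -> None:
    print(f"-> {request.method} {request.url}")


async def log_response(response: httpx.Response) -> None:
    print(f"<- {response.status_code} from {response.request.url}")


async def main() -> None:
    hooks = {"request": [log_request], "response": [log_response]}
    async with httpx.AsyncClient(event_hooks=hooks) as client:
        await client.get("https://example.com")


asyncio.run(main())

The same dict can be handed to AsyncHTTPHandler(event_hooks=...) as added above; the handler simply forwards it to every client it creates.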

View file

@ -398,6 +398,7 @@ def ollama_completion_stream(url, data, logging_obj):
isinstance(content_chunk, StreamingChoices)
and hasattr(content_chunk, "delta")
and hasattr(content_chunk.delta, "content")
and content_chunk.delta.content is not None
):
content_chunks.append(content_chunk.delta.content)
response_content = "".join(content_chunks)

View file

@ -2429,6 +2429,15 @@ def _bedrock_converse_messages_pt( # noqa: PLR0915
contents: List[BedrockMessageBlock] = []
msg_i = 0
## BASE CASE ##
if len(messages) == 0:
raise litellm.BadRequestError(
message=BAD_MESSAGE_ERROR_STR
+ "bedrock requires at least one non-system message",
model=model,
llm_provider=llm_provider,
)
# if initial message is assistant message
if messages[0].get("role") is not None and messages[0]["role"] == "assistant":
if user_continue_message is not None:

View file

@ -177,3 +177,16 @@ class VertexAIAnthropicConfig:
optional_params["json_mode"] = True
return optional_params
@classmethod
def is_supported_model(
cls, model: str, custom_llm_provider: Optional[str] = None
) -> bool:
"""
Check if the model is supported by the VertexAI Anthropic API.
"""
if custom_llm_provider == "vertex_ai" and "claude" in model.lower():
return True
elif model in litellm.vertex_anthropic_models:
return True
return False

View file

@ -113,7 +113,7 @@ from .llms.bedrock.chat import BedrockConverseLLM, BedrockLLM
from .llms.bedrock.embed.embedding import BedrockEmbedding
from .llms.cohere import chat as cohere_chat
from .llms.cohere import completion as cohere_completion # type: ignore
from .llms.cohere import embed as cohere_embed
from .llms.cohere.embed import handler as cohere_embed
from .llms.custom_llm import CustomLLM, custom_chat_llm_router
from .llms.databricks.chat import DatabricksChatCompletion
from .llms.groq.chat.handler import GroqChatCompletion
@ -4986,7 +4986,6 @@ def speech(
litellm_call_id: Optional[str] = kwargs.get("litellm_call_id", None)
proxy_server_request = kwargs.get("proxy_server_request", None)
model_info = kwargs.get("model_info", None)
metadata = kwargs.get("metadata", {})
model, custom_llm_provider, dynamic_api_key, api_base = get_llm_provider(model=model, custom_llm_provider=custom_llm_provider, api_base=api_base) # type: ignore
kwargs.pop("tags", [])

View file

@ -1104,7 +1104,7 @@
"litellm_provider": "azure_ai",
"mode": "chat"
},
"azure_ai/Meta-Llama-31-8B-Instruct": {
"azure_ai/Meta-Llama-3.1-8B-Instruct": {
"max_tokens": 128000,
"max_input_tokens": 128000,
"max_output_tokens": 128000,
@ -1114,7 +1114,7 @@
"mode": "chat",
"source":"https://azuremarketplace.microsoft.com/en-us/marketplace/apps/metagenai.meta-llama-3-1-8b-instruct-offer?tab=PlansAndPrice"
},
"azure_ai/Meta-Llama-31-70B-Instruct": {
"azure_ai/Meta-Llama-3.1-70B-Instruct": {
"max_tokens": 128000,
"max_input_tokens": 128000,
"max_output_tokens": 128000,
@ -1124,7 +1124,7 @@
"mode": "chat",
"source":"https://azuremarketplace.microsoft.com/en-us/marketplace/apps/metagenai.meta-llama-3-1-70b-instruct-offer?tab=PlansAndPrice"
},
"azure_ai/Meta-Llama-31-405B-Instruct": {
"azure_ai/Meta-Llama-3.1-405B-Instruct": {
"max_tokens": 128000,
"max_input_tokens": 128000,
"max_output_tokens": 128000,
@ -1751,6 +1751,22 @@
"supports_assistant_prefill": true,
"supports_prompt_caching": true
},
"claude-3-5-sonnet-20241022": {
"max_tokens": 8192,
"max_input_tokens": 200000,
"max_output_tokens": 8192,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000015,
"cache_creation_input_token_cost": 0.00000375,
"cache_read_input_token_cost": 0.0000003,
"litellm_provider": "anthropic",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true,
"tool_use_system_prompt_tokens": 159,
"supports_assistant_prefill": true,
"supports_prompt_caching": true
},
"text-bison": {
"max_tokens": 2048,
"max_input_tokens": 8192,
@ -2578,6 +2594,18 @@
"supports_vision": true,
"supports_assistant_prefill": true
},
"vertex_ai/claude-3-5-sonnet-v2@20241022": {
"max_tokens": 8192,
"max_input_tokens": 200000,
"max_output_tokens": 8192,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000015,
"litellm_provider": "vertex_ai-anthropic_models",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true,
"supports_assistant_prefill": true
},
"vertex_ai/claude-3-haiku@20240307": {
"max_tokens": 4096,
"max_input_tokens": 200000,
@ -3336,54 +3364,56 @@
"litellm_provider": "cohere",
"mode": "rerank"
},
"embed-english-v3.0": {
"max_tokens": 512,
"max_input_tokens": 512,
"input_cost_per_token": 0.00000010,
"output_cost_per_token": 0.00000,
"litellm_provider": "cohere",
"mode": "embedding"
},
"embed-english-light-v3.0": {
"max_tokens": 512,
"max_input_tokens": 512,
"max_tokens": 1024,
"max_input_tokens": 1024,
"input_cost_per_token": 0.00000010,
"output_cost_per_token": 0.00000,
"litellm_provider": "cohere",
"mode": "embedding"
},
"embed-multilingual-v3.0": {
"max_tokens": 512,
"max_input_tokens": 512,
"max_tokens": 1024,
"max_input_tokens": 1024,
"input_cost_per_token": 0.00000010,
"output_cost_per_token": 0.00000,
"litellm_provider": "cohere",
"mode": "embedding"
},
"embed-english-v2.0": {
"max_tokens": 512,
"max_input_tokens": 512,
"max_tokens": 4096,
"max_input_tokens": 4096,
"input_cost_per_token": 0.00000010,
"output_cost_per_token": 0.00000,
"litellm_provider": "cohere",
"mode": "embedding"
},
"embed-english-light-v2.0": {
"max_tokens": 512,
"max_input_tokens": 512,
"max_tokens": 1024,
"max_input_tokens": 1024,
"input_cost_per_token": 0.00000010,
"output_cost_per_token": 0.00000,
"litellm_provider": "cohere",
"mode": "embedding"
},
"embed-multilingual-v2.0": {
"max_tokens": 256,
"max_input_tokens": 256,
"max_tokens": 768,
"max_input_tokens": 768,
"input_cost_per_token": 0.00000010,
"output_cost_per_token": 0.00000,
"litellm_provider": "cohere",
"mode": "embedding"
},
"embed-english-v3.0": {
"max_tokens": 1024,
"max_input_tokens": 1024,
"input_cost_per_token": 0.00000010,
"input_cost_per_image": 0.0001,
"output_cost_per_token": 0.00000,
"litellm_provider": "cohere",
"mode": "embedding",
"supports_image_input": true
},
"replicate/meta/llama-2-13b": {
"max_tokens": 4096,
"max_input_tokens": 4096,
@ -3572,6 +3602,22 @@
"supports_vision": true,
"tool_use_system_prompt_tokens": 264
},
"anthropic/claude-3-5-sonnet-20241022": {
"max_tokens": 8192,
"max_input_tokens": 200000,
"max_output_tokens": 8192,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000015,
"cache_creation_input_token_cost": 0.00000375,
"cache_read_input_token_cost": 0.0000003,
"litellm_provider": "anthropic",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true,
"tool_use_system_prompt_tokens": 159,
"supports_assistant_prefill": true,
"supports_prompt_caching": true
},
"openrouter/anthropic/claude-3.5-sonnet": {
"max_tokens": 8192,
"max_input_tokens": 200000,
@ -4093,6 +4139,18 @@
"litellm_provider": "bedrock",
"mode": "embedding"
},
"amazon.titan-embed-image-v1": {
"max_tokens": 128,
"max_input_tokens": 128,
"output_vector_size": 1024,
"input_cost_per_token": 0.0000008,
"input_cost_per_image": 0.00006,
"output_cost_per_token": 0.0,
"litellm_provider": "bedrock",
"supports_image_input": true,
"mode": "embedding",
"source": "https://us-east-1.console.aws.amazon.com/bedrock/home?region=us-east-1#/providers?model=amazon.titan-image-generator-v1"
},
"mistral.mistral-7b-instruct-v0:2": {
"max_tokens": 8191,
"max_input_tokens": 32000,
@ -4246,6 +4304,17 @@
"supports_function_calling": true,
"supports_vision": true
},
"anthropic.claude-3-5-sonnet-20241022-v2:0": {
"max_tokens": 4096,
"max_input_tokens": 200000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000015,
"litellm_provider": "bedrock",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true
},
"anthropic.claude-3-haiku-20240307-v1:0": {
"max_tokens": 4096,
"max_input_tokens": 200000,
@ -4290,6 +4359,17 @@
"supports_function_calling": true,
"supports_vision": true
},
"us.anthropic.claude-3-5-sonnet-20241022-v2:0": {
"max_tokens": 4096,
"max_input_tokens": 200000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000015,
"litellm_provider": "bedrock",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true
},
"us.anthropic.claude-3-haiku-20240307-v1:0": {
"max_tokens": 4096,
"max_input_tokens": 200000,
@ -4334,6 +4414,17 @@
"supports_function_calling": true,
"supports_vision": true
},
"eu.anthropic.claude-3-5-sonnet-20241022-v2:0": {
"max_tokens": 4096,
"max_input_tokens": 200000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000015,
"litellm_provider": "bedrock",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true
},
"eu.anthropic.claude-3-haiku-20240307-v1:0": {
"max_tokens": 4096,
"max_input_tokens": 200000,
@ -6369,6 +6460,14 @@
"litellm_provider": "voyage",
"mode": "embedding"
},
"voyage/voyage-finance-2": {
"max_tokens": 4000,
"max_input_tokens": 4000,
"input_cost_per_token": 0.00000012,
"output_cost_per_token": 0.000000,
"litellm_provider": "voyage",
"mode": "embedding"
},
"databricks/databricks-meta-llama-3-1-405b-instruct": {
"max_tokens": 128000,
"max_input_tokens": 128000,

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View file

@ -1 +1 @@
<!DOCTYPE html><html id="__next_error__"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width, initial-scale=1"/><link rel="preload" as="script" fetchPriority="low" href="/ui/_next/static/chunks/webpack-e8ad0a25b0c46e0b.js" crossorigin=""/><script src="/ui/_next/static/chunks/fd9d1056-f593049e31b05aeb.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/69-8316d07d1f41e39f.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/main-app-9b4fb13a7db53edf.js" async="" crossorigin=""></script><title>LiteLLM Dashboard</title><meta name="description" content="LiteLLM Proxy Admin UI"/><link rel="icon" href="/ui/favicon.ico" type="image/x-icon" sizes="16x16"/><meta name="next-size-adjust"/><script src="/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js" crossorigin="" noModule=""></script></head><body><script src="/ui/_next/static/chunks/webpack-e8ad0a25b0c46e0b.js" crossorigin="" async=""></script><script>(self.__next_f=self.__next_f||[]).push([0]);self.__next_f.push([2,null])</script><script>self.__next_f.push([1,"1:HL[\"/ui/_next/static/media/a34f9d1faa5f3315-s.p.woff2\",\"font\",{\"crossOrigin\":\"\",\"type\":\"font/woff2\"}]\n2:HL[\"/ui/_next/static/css/00256a1984d35914.css\",\"style\",{\"crossOrigin\":\"\"}]\n0:\"$L3\"\n"])</script><script>self.__next_f.push([1,"4:I[47690,[],\"\"]\n6:I[77831,[],\"\"]\n7:I[68031,[\"665\",\"static/chunks/3014691f-b24e8254c7593934.js\",\"936\",\"static/chunks/2f6dbc85-cac2949a76539886.js\",\"902\",\"static/chunks/902-58bf23027703b2e8.js\",\"131\",\"static/chunks/131-3d2257b0ff5aadb2.js\",\"684\",\"static/chunks/684-16b194c83a169f6d.js\",\"626\",\"static/chunks/626-fc3969bfc35ead00.js\",\"777\",\"static/chunks/777-a81b45dec53652df.js\",\"931\",\"static/chunks/app/page-7c218fb97a2a9817.js\"],\"\"]\n8:I[5613,[],\"\"]\n9:I[31778,[],\"\"]\nb:I[48955,[],\"\"]\nc:[]\n"])</script><script>self.__next_f.push([1,"3:[[[\"$\",\"link\",\"0\",{\"rel\":\"stylesheet\",\"href\":\"/ui/_next/static/css/00256a1984d35914.css\",\"precedence\":\"next\",\"crossOrigin\":\"\"}]],[\"$\",\"$L4\",null,{\"buildId\":\"Q5YcBgN0qLD3pcZcx1fRm\",\"assetPrefix\":\"/ui\",\"initialCanonicalUrl\":\"/\",\"initialTree\":[\"\",{\"children\":[\"__PAGE__\",{}]},\"$undefined\",\"$undefined\",true],\"initialSeedData\":[\"\",{\"children\":[\"__PAGE__\",{},[\"$L5\",[\"$\",\"$L6\",null,{\"propsForComponent\":{\"params\":{}},\"Component\":\"$7\",\"isStaticGeneration\":true}],null]]},[null,[\"$\",\"html\",null,{\"lang\":\"en\",\"children\":[\"$\",\"body\",null,{\"className\":\"__className_86ef86\",\"children\":[\"$\",\"$L8\",null,{\"parallelRouterKey\":\"children\",\"segmentPath\":[\"children\"],\"loading\":\"$undefined\",\"loadingStyles\":\"$undefined\",\"loadingScripts\":\"$undefined\",\"hasLoading\":false,\"error\":\"$undefined\",\"errorStyles\":\"$undefined\",\"errorScripts\":\"$undefined\",\"template\":[\"$\",\"$L9\",null,{}],\"templateStyles\":\"$undefined\",\"templateScripts\":\"$undefined\",\"notFound\":[[\"$\",\"title\",null,{\"children\":\"404: This page could not be found.\"}],[\"$\",\"div\",null,{\"style\":{\"fontFamily\":\"system-ui,\\\"Segoe UI\\\",Roboto,Helvetica,Arial,sans-serif,\\\"Apple Color Emoji\\\",\\\"Segoe UI 
Emoji\\\"\",\"height\":\"100vh\",\"textAlign\":\"center\",\"display\":\"flex\",\"flexDirection\":\"column\",\"alignItems\":\"center\",\"justifyContent\":\"center\"},\"children\":[\"$\",\"div\",null,{\"children\":[[\"$\",\"style\",null,{\"dangerouslySetInnerHTML\":{\"__html\":\"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}\"}}],[\"$\",\"h1\",null,{\"className\":\"next-error-h1\",\"style\":{\"display\":\"inline-block\",\"margin\":\"0 20px 0 0\",\"padding\":\"0 23px 0 0\",\"fontSize\":24,\"fontWeight\":500,\"verticalAlign\":\"top\",\"lineHeight\":\"49px\"},\"children\":\"404\"}],[\"$\",\"div\",null,{\"style\":{\"display\":\"inline-block\"},\"children\":[\"$\",\"h2\",null,{\"style\":{\"fontSize\":14,\"fontWeight\":400,\"lineHeight\":\"49px\",\"margin\":0},\"children\":\"This page could not be found.\"}]}]]}]}]],\"notFoundStyles\":[],\"styles\":null}]}]}],null]],\"initialHead\":[false,\"$La\"],\"globalErrorComponent\":\"$b\",\"missingSlots\":\"$Wc\"}]]\n"])</script><script>self.__next_f.push([1,"a:[[\"$\",\"meta\",\"0\",{\"name\":\"viewport\",\"content\":\"width=device-width, initial-scale=1\"}],[\"$\",\"meta\",\"1\",{\"charSet\":\"utf-8\"}],[\"$\",\"title\",\"2\",{\"children\":\"LiteLLM Dashboard\"}],[\"$\",\"meta\",\"3\",{\"name\":\"description\",\"content\":\"LiteLLM Proxy Admin UI\"}],[\"$\",\"link\",\"4\",{\"rel\":\"icon\",\"href\":\"/ui/favicon.ico\",\"type\":\"image/x-icon\",\"sizes\":\"16x16\"}],[\"$\",\"meta\",\"5\",{\"name\":\"next-size-adjust\"}]]\n5:null\n"])</script><script>self.__next_f.push([1,""])</script></body></html>
<!DOCTYPE html><html id="__next_error__"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width, initial-scale=1"/><link rel="preload" as="script" fetchPriority="low" href="/ui/_next/static/chunks/webpack-e8ad0a25b0c46e0b.js" crossorigin=""/><script src="/ui/_next/static/chunks/fd9d1056-f593049e31b05aeb.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/69-8316d07d1f41e39f.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/main-app-9b4fb13a7db53edf.js" async="" crossorigin=""></script><title>LiteLLM Dashboard</title><meta name="description" content="LiteLLM Proxy Admin UI"/><link rel="icon" href="/ui/favicon.ico" type="image/x-icon" sizes="16x16"/><meta name="next-size-adjust"/><script src="/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js" crossorigin="" noModule=""></script></head><body><script src="/ui/_next/static/chunks/webpack-e8ad0a25b0c46e0b.js" crossorigin="" async=""></script><script>(self.__next_f=self.__next_f||[]).push([0]);self.__next_f.push([2,null])</script><script>self.__next_f.push([1,"1:HL[\"/ui/_next/static/media/a34f9d1faa5f3315-s.p.woff2\",\"font\",{\"crossOrigin\":\"\",\"type\":\"font/woff2\"}]\n2:HL[\"/ui/_next/static/css/00256a1984d35914.css\",\"style\",{\"crossOrigin\":\"\"}]\n0:\"$L3\"\n"])</script><script>self.__next_f.push([1,"4:I[47690,[],\"\"]\n6:I[77831,[],\"\"]\n7:I[68031,[\"665\",\"static/chunks/3014691f-b24e8254c7593934.js\",\"936\",\"static/chunks/2f6dbc85-cac2949a76539886.js\",\"902\",\"static/chunks/902-58bf23027703b2e8.js\",\"131\",\"static/chunks/131-3d2257b0ff5aadb2.js\",\"684\",\"static/chunks/684-16b194c83a169f6d.js\",\"626\",\"static/chunks/626-fc3969bfc35ead00.js\",\"777\",\"static/chunks/777-a81b45dec53652df.js\",\"931\",\"static/chunks/app/page-7b75dc53f1c6e449.js\"],\"\"]\n8:I[5613,[],\"\"]\n9:I[31778,[],\"\"]\nb:I[48955,[],\"\"]\nc:[]\n"])</script><script>self.__next_f.push([1,"3:[[[\"$\",\"link\",\"0\",{\"rel\":\"stylesheet\",\"href\":\"/ui/_next/static/css/00256a1984d35914.css\",\"precedence\":\"next\",\"crossOrigin\":\"\"}]],[\"$\",\"$L4\",null,{\"buildId\":\"ffXp7j1jzMKpweBFKW_w2\",\"assetPrefix\":\"/ui\",\"initialCanonicalUrl\":\"/\",\"initialTree\":[\"\",{\"children\":[\"__PAGE__\",{}]},\"$undefined\",\"$undefined\",true],\"initialSeedData\":[\"\",{\"children\":[\"__PAGE__\",{},[\"$L5\",[\"$\",\"$L6\",null,{\"propsForComponent\":{\"params\":{}},\"Component\":\"$7\",\"isStaticGeneration\":true}],null]]},[null,[\"$\",\"html\",null,{\"lang\":\"en\",\"children\":[\"$\",\"body\",null,{\"className\":\"__className_86ef86\",\"children\":[\"$\",\"$L8\",null,{\"parallelRouterKey\":\"children\",\"segmentPath\":[\"children\"],\"loading\":\"$undefined\",\"loadingStyles\":\"$undefined\",\"loadingScripts\":\"$undefined\",\"hasLoading\":false,\"error\":\"$undefined\",\"errorStyles\":\"$undefined\",\"errorScripts\":\"$undefined\",\"template\":[\"$\",\"$L9\",null,{}],\"templateStyles\":\"$undefined\",\"templateScripts\":\"$undefined\",\"notFound\":[[\"$\",\"title\",null,{\"children\":\"404: This page could not be found.\"}],[\"$\",\"div\",null,{\"style\":{\"fontFamily\":\"system-ui,\\\"Segoe UI\\\",Roboto,Helvetica,Arial,sans-serif,\\\"Apple Color Emoji\\\",\\\"Segoe UI 
Emoji\\\"\",\"height\":\"100vh\",\"textAlign\":\"center\",\"display\":\"flex\",\"flexDirection\":\"column\",\"alignItems\":\"center\",\"justifyContent\":\"center\"},\"children\":[\"$\",\"div\",null,{\"children\":[[\"$\",\"style\",null,{\"dangerouslySetInnerHTML\":{\"__html\":\"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}\"}}],[\"$\",\"h1\",null,{\"className\":\"next-error-h1\",\"style\":{\"display\":\"inline-block\",\"margin\":\"0 20px 0 0\",\"padding\":\"0 23px 0 0\",\"fontSize\":24,\"fontWeight\":500,\"verticalAlign\":\"top\",\"lineHeight\":\"49px\"},\"children\":\"404\"}],[\"$\",\"div\",null,{\"style\":{\"display\":\"inline-block\"},\"children\":[\"$\",\"h2\",null,{\"style\":{\"fontSize\":14,\"fontWeight\":400,\"lineHeight\":\"49px\",\"margin\":0},\"children\":\"This page could not be found.\"}]}]]}]}]],\"notFoundStyles\":[],\"styles\":null}]}]}],null]],\"initialHead\":[false,\"$La\"],\"globalErrorComponent\":\"$b\",\"missingSlots\":\"$Wc\"}]]\n"])</script><script>self.__next_f.push([1,"a:[[\"$\",\"meta\",\"0\",{\"name\":\"viewport\",\"content\":\"width=device-width, initial-scale=1\"}],[\"$\",\"meta\",\"1\",{\"charSet\":\"utf-8\"}],[\"$\",\"title\",\"2\",{\"children\":\"LiteLLM Dashboard\"}],[\"$\",\"meta\",\"3\",{\"name\":\"description\",\"content\":\"LiteLLM Proxy Admin UI\"}],[\"$\",\"link\",\"4\",{\"rel\":\"icon\",\"href\":\"/ui/favicon.ico\",\"type\":\"image/x-icon\",\"sizes\":\"16x16\"}],[\"$\",\"meta\",\"5\",{\"name\":\"next-size-adjust\"}]]\n5:null\n"])</script><script>self.__next_f.push([1,""])</script></body></html>

View file

@ -1,7 +1,7 @@
2:I[77831,[],""]
3:I[68031,["665","static/chunks/3014691f-b24e8254c7593934.js","936","static/chunks/2f6dbc85-cac2949a76539886.js","902","static/chunks/902-58bf23027703b2e8.js","131","static/chunks/131-3d2257b0ff5aadb2.js","684","static/chunks/684-16b194c83a169f6d.js","626","static/chunks/626-fc3969bfc35ead00.js","777","static/chunks/777-a81b45dec53652df.js","931","static/chunks/app/page-7c218fb97a2a9817.js"],""]
3:I[68031,["665","static/chunks/3014691f-b24e8254c7593934.js","936","static/chunks/2f6dbc85-cac2949a76539886.js","902","static/chunks/902-58bf23027703b2e8.js","131","static/chunks/131-3d2257b0ff5aadb2.js","684","static/chunks/684-16b194c83a169f6d.js","626","static/chunks/626-fc3969bfc35ead00.js","777","static/chunks/777-a81b45dec53652df.js","931","static/chunks/app/page-7b75dc53f1c6e449.js"],""]
4:I[5613,[],""]
5:I[31778,[],""]
0:["Q5YcBgN0qLD3pcZcx1fRm",[[["",{"children":["__PAGE__",{}]},"$undefined","$undefined",true],["",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_86ef86","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/00256a1984d35914.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
0:["ffXp7j1jzMKpweBFKW_w2",[[["",{"children":["__PAGE__",{}]},"$undefined","$undefined",true],["",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_86ef86","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/00256a1984d35914.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
6:[["$","meta","0",{"name":"viewport","content":"width=device-width, initial-scale=1"}],["$","meta","1",{"charSet":"utf-8"}],["$","title","2",{"children":"LiteLLM Dashboard"}],["$","meta","3",{"name":"description","content":"LiteLLM Proxy Admin UI"}],["$","link","4",{"rel":"icon","href":"/ui/favicon.ico","type":"image/x-icon","sizes":"16x16"}],["$","meta","5",{"name":"next-size-adjust"}]]
1:null

File diff suppressed because one or more lines are too long

View file

@ -2,6 +2,6 @@
3:I[87494,["902","static/chunks/902-58bf23027703b2e8.js","131","static/chunks/131-3d2257b0ff5aadb2.js","777","static/chunks/777-a81b45dec53652df.js","418","static/chunks/app/model_hub/page-8ed460f3f33c0bf2.js"],""]
4:I[5613,[],""]
5:I[31778,[],""]
0:["Q5YcBgN0qLD3pcZcx1fRm",[[["",{"children":["model_hub",{"children":["__PAGE__",{}]}]},"$undefined","$undefined",true],["",{"children":["model_hub",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children","model_hub","children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":"$undefined","notFoundStyles":"$undefined","styles":null}]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_86ef86","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/00256a1984d35914.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
0:["ffXp7j1jzMKpweBFKW_w2",[[["",{"children":["model_hub",{"children":["__PAGE__",{}]}]},"$undefined","$undefined",true],["",{"children":["model_hub",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children","model_hub","children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":"$undefined","notFoundStyles":"$undefined","styles":null}]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_86ef86","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/00256a1984d35914.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
6:[["$","meta","0",{"name":"viewport","content":"width=device-width, initial-scale=1"}],["$","meta","1",{"charSet":"utf-8"}],["$","title","2",{"children":"LiteLLM Dashboard"}],["$","meta","3",{"name":"description","content":"LiteLLM Proxy Admin UI"}],["$","link","4",{"rel":"icon","href":"/ui/favicon.ico","type":"image/x-icon","sizes":"16x16"}],["$","meta","5",{"name":"next-size-adjust"}]]
1:null

File diff suppressed because one or more lines are too long

View file

@ -2,6 +2,6 @@
3:I[667,["665","static/chunks/3014691f-b24e8254c7593934.js","902","static/chunks/902-58bf23027703b2e8.js","684","static/chunks/684-16b194c83a169f6d.js","777","static/chunks/777-a81b45dec53652df.js","461","static/chunks/app/onboarding/page-cba59362096ed469.js"],""]
4:I[5613,[],""]
5:I[31778,[],""]
0:["Q5YcBgN0qLD3pcZcx1fRm",[[["",{"children":["onboarding",{"children":["__PAGE__",{}]}]},"$undefined","$undefined",true],["",{"children":["onboarding",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children","onboarding","children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":"$undefined","notFoundStyles":"$undefined","styles":null}]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_86ef86","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/00256a1984d35914.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
0:["ffXp7j1jzMKpweBFKW_w2",[[["",{"children":["onboarding",{"children":["__PAGE__",{}]}]},"$undefined","$undefined",true],["",{"children":["onboarding",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children","onboarding","children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":"$undefined","notFoundStyles":"$undefined","styles":null}]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_86ef86","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/00256a1984d35914.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
6:[["$","meta","0",{"name":"viewport","content":"width=device-width, initial-scale=1"}],["$","meta","1",{"charSet":"utf-8"}],["$","title","2",{"children":"LiteLLM Dashboard"}],["$","meta","3",{"name":"description","content":"LiteLLM Proxy Admin UI"}],["$","link","4",{"rel":"icon","href":"/ui/favicon.ico","type":"image/x-icon","sizes":"16x16"}],["$","meta","5",{"name":"next-size-adjust"}]]
1:null

View file

@ -1,8 +1,50 @@
model_list:
- model_name: gpt-3.5-turbo
litellm_params:
model: gpt-3.5-turbo
api_key: os.environ/OPENAI_API_KEY
- model_name: gpt-4o
litellm_params:
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
litellm_settings:
callbacks: ["prometheus"]
callbacks: ["prometheus", "otel"]
general_settings:
user_api_key_cache_ttl: 3600
router_settings:
routing_strategy: latency-based-routing
routing_strategy_args:
# only assign 40% of traffic to the fastest deployment to avoid overloading it
lowest_latency_buffer: 0.4
# consider last five minutes of calls for latency calculation
ttl: 300
# model_group_alias:
# gpt-4o: gpt-4o-128k-2024-05-13
# gpt-4o-mini: gpt-4o-mini-128k-2024-07-18
enable_tag_filtering: True
# retry call 3 times on each model_name (we don't use fallbacks, so this would be 3 times total)
num_retries: 3
# -- cooldown settings --
# see https://github.com/BerriAI/litellm/blob/main/litellm/router_utils/cooldown_handlers.py#L265
# cooldown model if it fails > n calls in a minute.
allowed_fails: 2
# (in seconds) how long to cooldown model if fails/min > allowed_fails
cooldown_time: 60
allowed_fails_policy:
InternalServerErrorAllowedFails: 1
RateLimitErrorAllowedFails: 2
TimeoutErrorAllowedFails: 3
# -- end cooldown settings --
# see https://docs.litellm.ai/docs/proxy/prod#3-use-redis-porthost-password-not-redis_url
redis_host: os.environ/REDIS_HOST
redis_port: os.environ/REDIS_PORT
redis_password: os.environ/REDIS_PASSWORD
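
For reference, a minimal sketch (not part of this commit) of how the router_settings above would map onto litellm.Router when constructed directly in Python; the model list entry is a placeholder and the keyword names are assumed to mirror the YAML keys.

import os
from litellm import Router

router = Router(
    model_list=[
        {
            "model_name": "gpt-4o",
            "litellm_params": {"model": "openai/fake", "api_key": "fake-key"},
        }
    ],
    routing_strategy="latency-based-routing",
    # only send 40% of traffic to the fastest deployment; use the last 5 minutes of calls
    routing_strategy_args={"lowest_latency_buffer": 0.4, "ttl": 300},
    num_retries=3,
    # cooldown a deployment after 2 failures/minute, for 60 seconds
    allowed_fails=2,
    cooldown_time=60,
    # shared Redis so cooldowns and latency stats are visible across proxy instances
    redis_host=os.environ.get("REDIS_HOST"),
    redis_port=int(os.environ.get("REDIS_PORT", "6379")),
    redis_password=os.environ.get("REDIS_PASSWORD"),
)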

View file

@ -104,7 +104,7 @@ class LitellmUserRoles(str, enum.Enum):
return ui_labels.get(self.value, "")
class LitellmTableNames(enum.Enum):
class LitellmTableNames(str, enum.Enum):
"""
Enum for Table Names used by LiteLLM
"""
@ -340,6 +340,7 @@ class LiteLLMRoutes(enum.Enum):
"/sso/get/ui_settings",
"/login",
"/key/generate",
"/key/{token_id}/regenerate",
"/key/update",
"/key/info",
"/key/delete",
@ -1371,6 +1372,8 @@ class LiteLLM_VerificationToken(LiteLLMBase):
blocked: Optional[bool] = None
litellm_budget_table: Optional[dict] = None
org_id: Optional[str] = None # org id for a given key
created_at: Optional[datetime] = None
updated_at: Optional[datetime] = None
model_config = ConfigDict(protected_namespaces=())
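
The `str` mixin added to LitellmTableNames above is the standard way to make enum members behave as plain strings (so they compare and JSON-serialize as their values); a small illustration, independent of litellm:

import enum
import json

class TableNamesPlain(enum.Enum):
    KEY_TABLE = "LiteLLM_VerificationToken"

class TableNamesStr(str, enum.Enum):
    KEY_TABLE = "LiteLLM_VerificationToken"

json.dumps({"table": TableNamesStr.KEY_TABLE})      # '{"table": "LiteLLM_VerificationToken"}'
# json.dumps({"table": TableNamesPlain.KEY_TABLE})  # TypeError: not JSON serializable
TableNamesStr.KEY_TABLE == "LiteLLM_VerificationToken"  # True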

View file

@ -28,7 +28,7 @@ from litellm.proxy._types import (
LitellmUserRoles,
UserAPIKeyAuth,
)
from litellm.proxy.auth.route_checks import is_llm_api_route
from litellm.proxy.auth.route_checks import RouteChecks
from litellm.proxy.utils import PrismaClient, ProxyLogging, log_to_opentelemetry
from litellm.types.services import ServiceLoggerPayload, ServiceTypes
@ -138,7 +138,7 @@ def common_checks( # noqa: PLR0915
general_settings.get("enforce_user_param", None) is not None
and general_settings["enforce_user_param"] is True
):
if is_llm_api_route(route=route) and "user" not in request_body:
if RouteChecks.is_llm_api_route(route=route) and "user" not in request_body:
raise Exception(
f"'user' param not passed in. 'enforce_user_param'={general_settings['enforce_user_param']}"
)
@ -154,7 +154,7 @@ def common_checks( # noqa: PLR0915
+ CommonProxyErrors.not_premium_user.value
)
if is_llm_api_route(route=route):
if RouteChecks.is_llm_api_route(route=route):
# loop through each enforced param
# example enforced_params ['user', 'metadata', 'metadata.generation_name']
for enforced_param in general_settings["enforced_params"]:
@ -182,7 +182,7 @@ def common_checks( # noqa: PLR0915
and global_proxy_spend is not None
# only run global budget checks for OpenAI routes
# Reason - the Admin UI should continue working if the proxy crosses its global budget
and is_llm_api_route(route=route)
and RouteChecks.is_llm_api_route(route=route)
and route != "/v1/models"
and route != "/models"
):
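
As context for the enforce_user_param check in the first hunk of this file, a hypothetical client call against the proxy (base URL and key are placeholders); without the `user` field, common_checks rejects LLM API routes when `enforce_user_param` is set.

import openai

client = openai.OpenAI(base_url="http://localhost:4000", api_key="sk-1234")  # placeholder proxy + key
client.chat.completions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": "hi"}],
    user="customer-123",  # required on LLM API routes when enforce_user_param is enabled
)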

View file

@ -17,175 +17,199 @@ from .auth_checks_organization import _user_is_org_admin
from .auth_utils import _has_user_setup_sso
def non_proxy_admin_allowed_routes_check(
user_obj: Optional[LiteLLM_UserTable],
_user_role: Optional[LitellmUserRoles],
route: str,
request: Request,
valid_token: UserAPIKeyAuth,
api_key: str,
request_data: dict,
):
"""
Checks if Non Proxy Admin User is allowed to access the route
"""
class RouteChecks:
# Check user has defined custom admin routes
custom_admin_only_route_check(
route=route,
)
if is_llm_api_route(route=route):
pass
elif (
route in LiteLLMRoutes.info_routes.value
): # check if user allowed to call an info route
if route == "/key/info":
# check if user can access this route
query_params = request.query_params
key = query_params.get("key")
if key is not None and hash_token(token=key) != api_key:
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="user not allowed to access this key's info",
)
elif route == "/user/info":
# check if user can access this route
query_params = request.query_params
user_id = query_params.get("user_id")
verbose_proxy_logger.debug(
f"user_id: {user_id} & valid_token.user_id: {valid_token.user_id}"
)
if user_id and user_id != valid_token.user_id:
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="key not allowed to access this user's info. user_id={}, key's user_id={}".format(
user_id, valid_token.user_id
),
)
elif route == "/model/info":
# /model/info just shows models user has access to
pass
elif route == "/team/info":
pass # handled by function itself
elif _has_user_setup_sso() and route in LiteLLMRoutes.sso_only_routes.value:
pass
elif (
route in LiteLLMRoutes.global_spend_tracking_routes.value
and getattr(valid_token, "permissions", None) is not None
and "get_spend_routes" in getattr(valid_token, "permissions", [])
@staticmethod
def non_proxy_admin_allowed_routes_check(
user_obj: Optional[LiteLLM_UserTable],
_user_role: Optional[LitellmUserRoles],
route: str,
request: Request,
valid_token: UserAPIKeyAuth,
api_key: str,
request_data: dict,
):
"""
Checks if Non Proxy Admin User is allowed to access the route
"""
pass
elif _user_role == LitellmUserRoles.PROXY_ADMIN_VIEW_ONLY.value:
if is_llm_api_route(route=route):
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail=f"user not allowed to access this OpenAI routes, role= {_user_role}",
)
if route in LiteLLMRoutes.management_routes.value:
# the Admin Viewer is only allowed to call /user/update for their own user_id and can only update
if route == "/user/update":
# Check the Request params are valid for PROXY_ADMIN_VIEW_ONLY
if request_data is not None and isinstance(request_data, dict):
_params_updated = request_data.keys()
for param in _params_updated:
if param not in ["user_email", "password"]:
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail=f"user not allowed to access this route, role= {_user_role}. Trying to access: {route} and updating invalid param: {param}. only user_email and password can be updated",
)
else:
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail=f"user not allowed to access this route, role= {_user_role}. Trying to access: {route}",
)
elif (
_user_role == LitellmUserRoles.INTERNAL_USER.value
and route in LiteLLMRoutes.internal_user_routes.value
):
pass
elif (
_user_is_org_admin(request_data=request_data, user_object=user_obj)
and route in LiteLLMRoutes.org_admin_allowed_routes.value
):
pass
elif (
_user_role == LitellmUserRoles.INTERNAL_USER_VIEW_ONLY.value
and route in LiteLLMRoutes.internal_user_view_only_routes.value
):
pass
elif (
route in LiteLLMRoutes.self_managed_routes.value
): # routes that manage their own allowed/disallowed logic
pass
else:
user_role = "unknown"
user_id = "unknown"
if user_obj is not None:
user_role = user_obj.user_role or "unknown"
user_id = user_obj.user_id or "unknown"
raise Exception(
f"Only proxy admin can be used to generate, delete, update info for new keys/users/teams. Route={route}. Your role={user_role}. Your user_id={user_id}"
# Check user has defined custom admin routes
RouteChecks.custom_admin_only_route_check(
route=route,
)
if RouteChecks.is_llm_api_route(route=route):
pass
elif (
route in LiteLLMRoutes.info_routes.value
): # check if user allowed to call an info route
if route == "/key/info":
# check if user can access this route
query_params = request.query_params
key = query_params.get("key")
if key is not None and hash_token(token=key) != api_key:
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="user not allowed to access this key's info",
)
elif route == "/user/info":
# check if user can access this route
query_params = request.query_params
user_id = query_params.get("user_id")
verbose_proxy_logger.debug(
f"user_id: {user_id} & valid_token.user_id: {valid_token.user_id}"
)
if user_id and user_id != valid_token.user_id:
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="key not allowed to access this user's info. user_id={}, key's user_id={}".format(
user_id, valid_token.user_id
),
)
elif route == "/model/info":
# /model/info just shows models user has access to
pass
elif route == "/team/info":
pass # handled by function itself
elif _has_user_setup_sso() and route in LiteLLMRoutes.sso_only_routes.value:
pass
elif (
route in LiteLLMRoutes.global_spend_tracking_routes.value
and getattr(valid_token, "permissions", None) is not None
and "get_spend_routes" in getattr(valid_token, "permissions", [])
):
def custom_admin_only_route_check(route: str):
from litellm.proxy.proxy_server import general_settings, premium_user
pass
elif _user_role == LitellmUserRoles.PROXY_ADMIN_VIEW_ONLY.value:
if RouteChecks.is_llm_api_route(route=route):
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail=f"user not allowed to access this OpenAI routes, role= {_user_role}",
)
if route in LiteLLMRoutes.management_routes.value:
# the Admin Viewer is only allowed to call /user/update for their own user_id and can only update
if route == "/user/update":
if "admin_only_routes" in general_settings:
if premium_user is not True:
verbose_proxy_logger.error(
f"Trying to use 'admin_only_routes' this is an Enterprise only feature. {CommonProxyErrors.not_premium_user.value}"
# Check the Request params are valid for PROXY_ADMIN_VIEW_ONLY
if request_data is not None and isinstance(request_data, dict):
_params_updated = request_data.keys()
for param in _params_updated:
if param not in ["user_email", "password"]:
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail=f"user not allowed to access this route, role= {_user_role}. Trying to access: {route} and updating invalid param: {param}. only user_email and password can be updated",
)
else:
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail=f"user not allowed to access this route, role= {_user_role}. Trying to access: {route}",
)
elif (
_user_role == LitellmUserRoles.INTERNAL_USER.value
and route in LiteLLMRoutes.internal_user_routes.value
):
pass
elif (
_user_is_org_admin(request_data=request_data, user_object=user_obj)
and route in LiteLLMRoutes.org_admin_allowed_routes.value
):
pass
elif (
_user_role == LitellmUserRoles.INTERNAL_USER_VIEW_ONLY.value
and route in LiteLLMRoutes.internal_user_view_only_routes.value
):
pass
elif (
route in LiteLLMRoutes.self_managed_routes.value
): # routes that manage their own allowed/disallowed logic
pass
else:
user_role = "unknown"
user_id = "unknown"
if user_obj is not None:
user_role = user_obj.user_role or "unknown"
user_id = user_obj.user_id or "unknown"
raise Exception(
f"Only proxy admin can be used to generate, delete, update info for new keys/users/teams. Route={route}. Your role={user_role}. Your user_id={user_id}"
)
return
if route in general_settings["admin_only_routes"]:
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail=f"user not allowed to access this route. Route={route} is an admin only route",
)
pass
@staticmethod
def custom_admin_only_route_check(route: str):
from litellm.proxy.proxy_server import general_settings, premium_user
if "admin_only_routes" in general_settings:
if premium_user is not True:
verbose_proxy_logger.error(
f"Trying to use 'admin_only_routes' this is an Enterprise only feature. {CommonProxyErrors.not_premium_user.value}"
)
return
if route in general_settings["admin_only_routes"]:
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail=f"user not allowed to access this route. Route={route} is an admin only route",
)
pass
@staticmethod
def is_llm_api_route(route: str) -> bool:
"""
Helper to check if the provided route is an OpenAI route
def is_llm_api_route(route: str) -> bool:
"""
Helper to check if the provided route is an OpenAI route
Returns:
- True: if route is an OpenAI route
- False: if route is not an OpenAI route
"""
if route in LiteLLMRoutes.openai_routes.value:
return True
if route in LiteLLMRoutes.anthropic_routes.value:
return True
# fuzzy match routes like "/v1/threads/thread_49EIN5QF32s4mH20M7GFKdlZ"
# Check for routes with placeholders
for openai_route in LiteLLMRoutes.openai_routes.value:
# Replace placeholders with regex pattern
# placeholders are written as "/threads/{thread_id}"
if "{" in openai_route:
if RouteChecks._route_matches_pattern(
route=route, pattern=openai_route
):
return True
# Pass through Bedrock, VertexAI, and Cohere Routes
if "/bedrock/" in route:
return True
if "/vertex-ai/" in route:
return True
if "/gemini/" in route:
return True
if "/cohere/" in route:
return True
if "/langfuse/" in route:
return True
return False
@staticmethod
def _route_matches_pattern(route: str, pattern: str) -> bool:
"""
Check if route matches the pattern placed in proxy/_types.py
Example:
- pattern: "/threads/{thread_id}"
- route: "/threads/thread_49EIN5QF32s4mH20M7GFKdlZ"
- returns: True
Returns:
- True: if route is an OpenAI route
- False: if route is not an OpenAI route
"""
if route in LiteLLMRoutes.openai_routes.value:
return True
if route in LiteLLMRoutes.anthropic_routes.value:
return True
# fuzzy match routes like "/v1/threads/thread_49EIN5QF32s4mH20M7GFKdlZ"
# Check for routes with placeholders
for openai_route in LiteLLMRoutes.openai_routes.value:
# Replace placeholders with regex pattern
# placeholders are written as "/threads/{thread_id}"
if "{" in openai_route:
pattern = re.sub(r"\{[^}]+\}", r"[^/]+", openai_route)
# Anchor the pattern to match the entire string
pattern = f"^{pattern}$"
if re.match(pattern, route):
return True
# Pass through Bedrock, VertexAI, and Cohere Routes
if "/bedrock/" in route:
return True
if "/vertex-ai/" in route:
return True
if "/gemini/" in route:
return True
if "/cohere/" in route:
return True
if "/langfuse/" in route:
return True
return False
- pattern: "/key/{token_id}/regenerate"
- route: "/key/regenerate/82akk800000000jjsk"
- returns: False, pattern is "/key/{token_id}/regenerate"
"""
pattern = re.sub(r"\{[^}]+\}", r"[^/]+", pattern)
# Anchor the pattern to match the entire string
pattern = f"^{pattern}$"
if re.match(pattern, route):
return True
return False
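
A standalone illustration (no litellm imports) of the placeholder substitution that _route_matches_pattern performs, matching the docstring examples above:

import re

def route_matches_pattern(route: str, pattern: str) -> bool:
    # "/key/{token_id}/regenerate" -> "^/key/[^/]+/regenerate$"
    regex = "^" + re.sub(r"\{[^}]+\}", r"[^/]+", pattern) + "$"
    return re.match(regex, route) is not None

assert route_matches_pattern("/key/88704/regenerate", "/key/{token_id}/regenerate") is True
assert route_matches_pattern("/key/regenerate/82akk800000000jjsk", "/key/{token_id}/regenerate") is False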

View file

@ -69,7 +69,7 @@ from litellm.proxy.auth.auth_utils import (
)
from litellm.proxy.auth.oauth2_check import check_oauth2_token
from litellm.proxy.auth.oauth2_proxy_hook import handle_oauth2_proxy_request
from litellm.proxy.auth.route_checks import non_proxy_admin_allowed_routes_check
from litellm.proxy.auth.route_checks import RouteChecks
from litellm.proxy.auth.service_account_checks import service_account_checks
from litellm.proxy.common_utils.http_parsing_utils import _read_request_body
from litellm.proxy.utils import _to_ns
@ -122,6 +122,11 @@ def _is_ui_route_allowed(
):
# Do something if the current route starts with any of the allowed routes
return True
elif any(
RouteChecks._route_matches_pattern(route=route, pattern=allowed_route)
for allowed_route in allowed_routes
):
return True
else:
if user_obj is not None and _is_user_proxy_admin(user_obj=user_obj):
return True
@ -150,7 +155,7 @@ def _is_api_route_allowed(
raise Exception("Invalid proxy server token passed")
if not _is_user_proxy_admin(user_obj=user_obj): # if non-admin
non_proxy_admin_allowed_routes_check(
RouteChecks.non_proxy_admin_allowed_routes_check(
user_obj=user_obj,
_user_role=_user_role,
route=route,

View file

@ -120,7 +120,7 @@ async def health_services_endpoint( # noqa: PLR0915
}
if service == "langfuse":
from litellm.integrations.langfuse import LangFuseLogger
from litellm.integrations.langfuse.langfuse import LangFuseLogger
langfuse_logger = LangFuseLogger()
langfuse_logger.Langfuse.auth_check()
@ -372,6 +372,11 @@ async def _db_health_readiness_check():
return db_health_cache
@router.get(
"/settings",
tags=["health"],
dependencies=[Depends(user_api_key_auth)],
)
@router.get(
"/active/callbacks",
tags=["health"],
@ -379,8 +384,29 @@ async def _db_health_readiness_check():
)
async def active_callbacks():
"""
Returns a list of active callbacks on litellm.callbacks, litellm.input_callback, litellm.failure_callback, litellm.success_callback
Returns a list of litellm level settings
This is useful for debugging and ensuring the proxy server is configured correctly.
Response schema:
```
{
"alerting": _alerting,
"litellm.callbacks": litellm_callbacks,
"litellm.input_callback": litellm_input_callbacks,
"litellm.failure_callback": litellm_failure_callbacks,
"litellm.success_callback": litellm_success_callbacks,
"litellm._async_success_callback": litellm_async_success_callbacks,
"litellm._async_failure_callback": litellm_async_failure_callbacks,
"litellm._async_input_callback": litellm_async_input_callbacks,
"all_litellm_callbacks": all_litellm_callbacks,
"num_callbacks": len(all_litellm_callbacks),
"num_alerting": _num_alerting,
"litellm.request_timeout": litellm.request_timeout,
}
```
"""
from litellm.proxy.proxy_server import general_settings, proxy_logging_obj
_alerting = str(general_settings.get("alerting"))
@ -421,6 +447,7 @@ async def active_callbacks():
"all_litellm_callbacks": all_litellm_callbacks,
"num_callbacks": len(all_litellm_callbacks),
"num_alerting": _num_alerting,
"litellm.request_timeout": litellm.request_timeout,
}
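
A quick, hypothetical smoke test of the debug endpoints touched above (proxy URL and key are placeholders):

import requests

headers = {"Authorization": "Bearer sk-1234"}  # placeholder admin key
settings = requests.get("http://localhost:4000/settings", headers=headers, timeout=10).json()
print(settings.get("num_callbacks"), settings.get("litellm.request_timeout"))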

View file

@ -16,7 +16,10 @@ from litellm.proxy._types import (
UserAPIKeyAuth,
)
from litellm.proxy.auth.auth_utils import get_request_route
from litellm.types.utils import SupportedCacheControls
from litellm.types.utils import (
StandardLoggingUserAPIKeyMetadata,
SupportedCacheControls,
)
if TYPE_CHECKING:
from litellm.proxy.proxy_server import ProxyConfig as _ProxyConfig
@ -159,56 +162,111 @@ def clean_headers(
return clean_headers
def get_forwardable_headers(
headers: Union[Headers, dict],
):
"""
Get the headers that should be forwarded to the LLM Provider.
Looks for any `x-` headers and sends them to the LLM Provider.
"""
forwarded_headers = {}
for header, value in headers.items():
if header.lower().startswith("x-") and not header.lower().startswith(
"x-stainless"
): # causes openai sdk to fail
forwarded_headers[header] = value
return forwarded_headers
def get_openai_org_id_from_headers(
headers: dict, general_settings: Optional[Dict] = None
) -> Optional[str]:
"""
Get the OpenAI Org ID from the headers.
"""
if (
general_settings is not None
and general_settings.get("forward_openai_org_id") is not True
class LiteLLMProxyRequestSetup:
@staticmethod
def _get_forwardable_headers(
headers: Union[Headers, dict],
):
"""
Get the headers that should be forwarded to the LLM Provider.
Looks for any `x-` headers and sends them to the LLM Provider.
"""
forwarded_headers = {}
for header, value in headers.items():
if header.lower().startswith("x-") and not header.lower().startswith(
"x-stainless"
): # causes openai sdk to fail
forwarded_headers[header] = value
return forwarded_headers
@staticmethod
def get_openai_org_id_from_headers(
headers: dict, general_settings: Optional[Dict] = None
) -> Optional[str]:
"""
Get the OpenAI Org ID from the headers.
"""
if (
general_settings is not None
and general_settings.get("forward_openai_org_id") is not True
):
return None
for header, value in headers.items():
if header.lower() == "openai-organization":
return value
return None
for header, value in headers.items():
if header.lower() == "openai-organization":
return value
return None
@staticmethod
def add_headers_to_llm_call(
headers: dict, user_api_key_dict: UserAPIKeyAuth
) -> dict:
"""
Add headers to the LLM call
def add_litellm_data_for_backend_llm_call(
headers: dict, general_settings: Optional[Dict[str, Any]] = None
) -> LitellmDataForBackendLLMCall:
"""
- Adds forwardable headers
- Adds org id
"""
data = LitellmDataForBackendLLMCall()
_headers = get_forwardable_headers(headers)
if _headers != {}:
data["headers"] = _headers
_organization = get_openai_org_id_from_headers(headers, general_settings)
if _organization is not None:
data["organization"] = _organization
return data
- Checks request headers for forwardable headers
- Checks if user information should be added to the headers
"""
from litellm.litellm_core_utils.litellm_logging import (
get_standard_logging_metadata,
)
returned_headers = LiteLLMProxyRequestSetup._get_forwardable_headers(headers)
if litellm.add_user_information_to_llm_headers is True:
litellm_logging_metadata_headers = (
LiteLLMProxyRequestSetup.get_sanitized_user_information_from_key(
user_api_key_dict=user_api_key_dict
)
)
for k, v in litellm_logging_metadata_headers.items():
if v is not None:
returned_headers["x-litellm-{}".format(k)] = v
return returned_headers
@staticmethod
def add_litellm_data_for_backend_llm_call(
*,
headers: dict,
user_api_key_dict: UserAPIKeyAuth,
general_settings: Optional[Dict[str, Any]] = None,
) -> LitellmDataForBackendLLMCall:
"""
- Adds forwardable headers
- Adds org id
"""
data = LitellmDataForBackendLLMCall()
if (
general_settings
and general_settings.get("forward_client_headers_to_llm_api") is True
):
_headers = LiteLLMProxyRequestSetup.add_headers_to_llm_call(
headers, user_api_key_dict
)
if _headers != {}:
data["headers"] = _headers
_organization = LiteLLMProxyRequestSetup.get_openai_org_id_from_headers(
headers, general_settings
)
if _organization is not None:
data["organization"] = _organization
return data
@staticmethod
def get_sanitized_user_information_from_key(
user_api_key_dict: UserAPIKeyAuth,
) -> StandardLoggingUserAPIKeyMetadata:
user_api_key_logged_metadata = StandardLoggingUserAPIKeyMetadata(
user_api_key_hash=user_api_key_dict.api_key, # just the hashed token
user_api_key_alias=user_api_key_dict.key_alias,
user_api_key_team_id=user_api_key_dict.team_id,
user_api_key_user_id=user_api_key_dict.user_id,
user_api_key_org_id=user_api_key_dict.org_id,
user_api_key_team_alias=user_api_key_dict.team_alias,
)
return user_api_key_logged_metadata
async def add_litellm_data_to_request( # noqa: PLR0915
@ -246,7 +304,13 @@ async def add_litellm_data_to_request( # noqa: PLR0915
),
)
data.update(add_litellm_data_for_backend_llm_call(_headers, general_settings))
data.update(
LiteLLMProxyRequestSetup.add_litellm_data_for_backend_llm_call(
headers=_headers,
user_api_key_dict=user_api_key_dict,
general_settings=general_settings,
)
)
# Include original request and headers in the data
data["proxy_server_request"] = {
@ -294,13 +358,22 @@ async def add_litellm_data_to_request( # noqa: PLR0915
data["metadata"]
)
data[_metadata_variable_name]["user_api_key"] = user_api_key_dict.api_key
data[_metadata_variable_name]["user_api_key_alias"] = getattr(
user_api_key_dict, "key_alias", None
user_api_key_logged_metadata = (
LiteLLMProxyRequestSetup.get_sanitized_user_information_from_key(
user_api_key_dict=user_api_key_dict
)
)
data[_metadata_variable_name].update(user_api_key_logged_metadata)
data[_metadata_variable_name][
"user_api_key"
] = (
user_api_key_dict.api_key
) # this is just the hashed token. [TODO]: replace variable name in repo.
data[_metadata_variable_name]["user_api_end_user_max_budget"] = getattr(
user_api_key_dict, "end_user_max_budget", None
)
data[_metadata_variable_name]["litellm_api_version"] = version
if general_settings is not None:
@ -308,15 +381,6 @@ async def add_litellm_data_to_request( # noqa: PLR0915
general_settings.get("global_max_parallel_requests", None)
)
data[_metadata_variable_name]["user_api_key_user_id"] = user_api_key_dict.user_id
data[_metadata_variable_name]["user_api_key_org_id"] = user_api_key_dict.org_id
data[_metadata_variable_name]["user_api_key_team_id"] = getattr(
user_api_key_dict, "team_id", None
)
data[_metadata_variable_name]["user_api_key_team_alias"] = getattr(
user_api_key_dict, "team_alias", None
)
### KEY-LEVEL Controls
key_metadata = user_api_key_dict.metadata
if "cache" in key_metadata:

View file

@ -0,0 +1,43 @@
"""
Functions to create audit logs for LiteLLM Proxy
"""
import json
import litellm
from litellm._logging import verbose_proxy_logger
from litellm.proxy._types import LiteLLM_AuditLogs
async def create_audit_log_for_update(request_data: LiteLLM_AuditLogs):
from litellm.proxy.proxy_server import premium_user, prisma_client
if premium_user is not True:
return
if litellm.store_audit_logs is not True:
return
if prisma_client is None:
raise Exception("prisma_client is None, no DB connected")
verbose_proxy_logger.debug("creating audit log for %s", request_data)
if isinstance(request_data.updated_values, dict):
request_data.updated_values = json.dumps(request_data.updated_values)
if isinstance(request_data.before_value, dict):
request_data.before_value = json.dumps(request_data.before_value)
_request_data = request_data.model_dump(exclude_none=True)
try:
await prisma_client.db.litellm_auditlog.create(
data={
**_request_data, # type: ignore
}
)
except Exception as e:
# [Non-Blocking Exception. Do not allow blocking LLM API call]
verbose_proxy_logger.error(f"Failed Creating audit log {e}")
return
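
A rough caller sketch for the new helper (the audit_entry object is assumed to be an already-populated LiteLLM_AuditLogs instance; its fields are not shown in this diff):

import asyncio
from litellm.proxy._types import LiteLLM_AuditLogs
from litellm.proxy.management_helpers.audit_logs import create_audit_log_for_update

async def record_update(audit_entry: LiteLLM_AuditLogs) -> None:
    # fire-and-forget: failures are logged inside the helper and never block the API call
    asyncio.create_task(create_audit_log_for_update(request_data=audit_entry))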

View file

@ -125,7 +125,7 @@ def is_port_in_use(port):
)
@click.option(
"--request_timeout",
default=600,
default=6000,
type=int,
help="Set timeout in seconds for completion calls",
)

View file

@ -1,48 +1,20 @@
model_list:
################################################################################
# Azure
- model_name: gpt-4o-mini
litellm_params:
model: azure/gpt-4o-mini
api_base: https://amazin-prod.openai.azure.com
api_key: "os.environ/AZURE_GPT_4O"
deployment_id: gpt-4o-mini
- model_name: gpt-4o
litellm_params:
model: azure/gpt-4o
api_base: https://very-cool-prod.openai.azure.com
api_key: "os.environ/AZURE_GPT_4O"
deployment_id: gpt-4o
model: gpt-4o
api_key: os.environ/OPENAI_API_KEY
tpm: 1000000
rpm: 10000
################################################################################
# Fireworks
- model_name: fireworks-llama-v3p1-405b-instruct
litellm_params:
model: fireworks_ai/accounts/fireworks/models/llama-v3p1-405b-instruct
api_key: "os.environ/FIREWORKS"
- model_name: fireworks-llama-v3p1-70b-instruct
litellm_params:
model: fireworks_ai/accounts/fireworks/models/llama-v3p1-70b-instruct
api_key: "os.environ/FIREWORKS"
general_settings:
alerting_threshold: 300 # sends alerts if requests hang for 5min+ and responses take 5min+
litellm_settings: # module level litellm settings - https://github.com/BerriAI/litellm/blob/main/litellm/__init__.py
success_callback: ["prometheus"]
service_callback: ["prometheus_system"]
drop_params: False # Raise an exception if the openai param being passed in isn't supported.
cache: false
default_internal_user_params:
user_role: os.environ/DEFAULT_USER_ROLE
# master key is set via env var
# master_key: #######
proxy_batch_write_at: 60 # Batch write spend updates every 60s
success_callback: ["s3"]
s3_callback_params:
s3_bucket_name: logs-bucket-litellm # AWS Bucket Name for S3
s3_region_name: us-west-2 # AWS Region Name for S3
s3_aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID # use os.environ/<variable name> to pass environment variables. This is AWS Access Key ID for S3
s3_aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY # AWS Secret Access Key for S3
s3_path: my-test-path # [OPTIONAL] set path in bucket you want to write logs to
s3_endpoint_url: https://s3.amazonaws.com # [OPTIONAL] S3 endpoint URL, if you want to use Backblaze/cloudflare s3 buckets
litellm_settings:
store_audit_logs: true
router_settings:
routing_strategy: simple-shuffle # "simple-shuffle" shown to result in highest throughput. https://docs.litellm.ai/docs/proxy/configs#load-balancing
# https://docs.litellm.ai/docs/proxy/reliability#default-fallbacks
default_fallbacks: ["gpt-4o-2024-08-06", "claude-3-5-sonnet-20240620"]
fallbacks: [{"gpt-4o-2024-08-06": ["claude-3-5-sonnet-20240620"]}, {"gpt-4o-2024-05-13": ["claude-3-5-sonnet-20240620"]}]
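
A rough Python equivalent (not part of this commit) of the fallback settings in the updated config above; the model list entries are placeholders.

from litellm import Router

router = Router(
    model_list=[
        {"model_name": "gpt-4o-2024-08-06", "litellm_params": {"model": "gpt-4o-2024-08-06"}},
        {"model_name": "claude-3-5-sonnet-20240620", "litellm_params": {"model": "claude-3-5-sonnet-20240620"}},
    ],
    routing_strategy="simple-shuffle",
    default_fallbacks=["claude-3-5-sonnet-20240620"],
    fallbacks=[{"gpt-4o-2024-08-06": ["claude-3-5-sonnet-20240620"]}],
)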

View file

@ -194,6 +194,7 @@ from litellm.proxy.management_endpoints.team_callback_endpoints import (
)
from litellm.proxy.management_endpoints.team_endpoints import router as team_router
from litellm.proxy.management_endpoints.ui_sso import router as ui_sso_router
from litellm.proxy.management_helpers.audit_logs import create_audit_log_for_update
from litellm.proxy.openai_files_endpoints.files_endpoints import is_known_model
from litellm.proxy.openai_files_endpoints.files_endpoints import (
router as openai_files_router,
@ -638,18 +639,6 @@ def _resolve_pydantic_type(typ) -> List:
return typs
def prisma_setup(database_url: Optional[str]):
global prisma_client, proxy_logging_obj, user_api_key_cache
if database_url is not None:
try:
prisma_client = PrismaClient(
database_url=database_url, proxy_logging_obj=proxy_logging_obj
)
except Exception as e:
raise e
def load_from_azure_key_vault(use_azure_key_vault: bool = False):
if use_azure_key_vault is False:
return
@ -1548,7 +1537,7 @@ class ProxyConfig:
## INIT PROXY REDIS USAGE CLIENT ##
redis_usage_cache = litellm.cache.cache
async def get_config(self, config_file_path: Optional[str] = None) -> dict:
"""
Load config file
@ -2801,137 +2790,55 @@ def giveup(e):
return result
@router.on_event("startup")
async def startup_event(): # noqa: PLR0915
global prisma_client, master_key, use_background_health_checks, llm_router, llm_model_list, general_settings, proxy_budget_rescheduler_min_time, proxy_budget_rescheduler_max_time, litellm_proxy_admin_name, db_writer_client, store_model_in_db, premium_user, _license_check
import json
class ProxyStartupEvent:
@classmethod
def _initialize_startup_logging(
cls,
llm_router: Optional[litellm.Router],
proxy_logging_obj: ProxyLogging,
redis_usage_cache: Optional[RedisCache],
):
"""Initialize logging and alerting on startup"""
## COST TRACKING ##
cost_tracking()
init_verbose_loggers()
## Error Tracking ##
error_tracking()
### LOAD MASTER KEY ###
# check if master key set in environment - load from there
master_key = get_secret("LITELLM_MASTER_KEY", None) # type: ignore
# check if DATABASE_URL in environment - load from there
if prisma_client is None:
_db_url: Optional[str] = get_secret("DATABASE_URL", None) # type: ignore
prisma_setup(database_url=_db_url)
proxy_logging_obj.startup_event(
llm_router=llm_router, redis_usage_cache=redis_usage_cache
)
### LOAD CONFIG ###
worker_config: Optional[Union[str, dict]] = get_secret("WORKER_CONFIG") # type: ignore
env_config_yaml: Optional[str] = get_secret_str("CONFIG_FILE_PATH")
verbose_proxy_logger.debug("worker_config: %s", worker_config)
# check if it's a valid file path
if env_config_yaml is not None:
if os.path.isfile(env_config_yaml) and proxy_config.is_yaml(
config_file_path=env_config_yaml
):
(
llm_router,
llm_model_list,
general_settings,
) = await proxy_config.load_config(
router=llm_router, config_file_path=env_config_yaml
)
elif worker_config is not None:
if (
isinstance(worker_config, str)
and os.path.isfile(worker_config)
and proxy_config.is_yaml(config_file_path=worker_config)
):
(
llm_router,
llm_model_list,
general_settings,
) = await proxy_config.load_config(
router=llm_router, config_file_path=worker_config
)
elif os.environ.get("LITELLM_CONFIG_BUCKET_NAME") is not None and isinstance(
worker_config, str
):
(
llm_router,
llm_model_list,
general_settings,
) = await proxy_config.load_config(
router=llm_router, config_file_path=worker_config
)
elif isinstance(worker_config, dict):
await initialize(**worker_config)
@classmethod
def _initialize_jwt_auth(
cls,
general_settings: dict,
prisma_client: Optional[PrismaClient],
user_api_key_cache: DualCache,
):
"""Initialize JWT auth on startup"""
if general_settings.get("litellm_jwtauth", None) is not None:
for k, v in general_settings["litellm_jwtauth"].items():
if isinstance(v, str) and v.startswith("os.environ/"):
general_settings["litellm_jwtauth"][k] = get_secret(v)
litellm_jwtauth = LiteLLM_JWTAuth(**general_settings["litellm_jwtauth"])
else:
# if not, assume it's a json string
worker_config = json.loads(worker_config)
if isinstance(worker_config, dict):
await initialize(**worker_config)
## CHECK PREMIUM USER
verbose_proxy_logger.debug(
"litellm.proxy.proxy_server.py::startup() - CHECKING PREMIUM USER - {}".format(
premium_user
litellm_jwtauth = LiteLLM_JWTAuth()
jwt_handler.update_environment(
prisma_client=prisma_client,
user_api_key_cache=user_api_key_cache,
litellm_jwtauth=litellm_jwtauth,
)
)
if premium_user is False:
premium_user = _license_check.is_premium()
verbose_proxy_logger.debug(
"litellm.proxy.proxy_server.py::startup() - PREMIUM USER value - {}".format(
premium_user
)
)
## COST TRACKING ##
cost_tracking()
## Error Tracking ##
error_tracking()
## UPDATE SLACK ALERTING ##
proxy_logging_obj.slack_alerting_instance.update_values(llm_router=llm_router)
db_writer_client = HTTPHandler()
## UPDATE INTERNAL USAGE CACHE ##
proxy_logging_obj.update_values(
redis_cache=redis_usage_cache
) # used by parallel request limiter for rate limiting keys across instances
proxy_logging_obj._init_litellm_callbacks(
llm_router=llm_router
) # INITIALIZE LITELLM CALLBACKS ON SERVER STARTUP <- do this to catch any logging errors on startup, not when calls are being made
if "daily_reports" in proxy_logging_obj.slack_alerting_instance.alert_types:
asyncio.create_task(
proxy_logging_obj.slack_alerting_instance._run_scheduled_daily_report(
llm_router=llm_router
)
) # RUN DAILY REPORT (if scheduled)
## JWT AUTH ##
if general_settings.get("litellm_jwtauth", None) is not None:
for k, v in general_settings["litellm_jwtauth"].items():
if isinstance(v, str) and v.startswith("os.environ/"):
general_settings["litellm_jwtauth"][k] = get_secret(v)
litellm_jwtauth = LiteLLM_JWTAuth(**general_settings["litellm_jwtauth"])
else:
litellm_jwtauth = LiteLLM_JWTAuth()
jwt_handler.update_environment(
prisma_client=prisma_client,
user_api_key_cache=user_api_key_cache,
litellm_jwtauth=litellm_jwtauth,
)
if use_background_health_checks:
asyncio.create_task(
_run_background_health_check()
) # start the background health check coroutine.
if prompt_injection_detection_obj is not None:
prompt_injection_detection_obj.update_environment(router=llm_router)
verbose_proxy_logger.debug("prisma_client: %s", prisma_client)
if prisma_client is not None:
await prisma_client.connect()
if prisma_client is not None and master_key is not None:
@classmethod
def _add_master_key_hash_to_db(
cls,
master_key: str,
prisma_client: PrismaClient,
litellm_proxy_admin_name: str,
general_settings: dict,
):
"""Adds master key hash to db for cost tracking"""
if os.getenv("PROXY_ADMIN_ID", None) is not None:
litellm_proxy_admin_name = os.getenv(
"PROXY_ADMIN_ID", litellm_proxy_admin_name
@ -2956,7 +2863,9 @@ async def startup_event(): # noqa: PLR0915
)
asyncio.create_task(task_1)
if prisma_client is not None and litellm.max_budget > 0:
@classmethod
def _add_proxy_budget_to_db(cls, litellm_proxy_budget_name: str):
"""Adds a global proxy budget to db"""
if litellm.budget_duration is None:
raise Exception(
"budget_duration not set on Proxy. budget_duration is required to use max_budget."
@ -2982,8 +2891,18 @@ async def startup_event(): # noqa: PLR0915
)
)
### START BATCH WRITING DB + CHECKING NEW MODELS###
if prisma_client is not None:
@classmethod
async def initialize_scheduled_background_jobs(
cls,
general_settings: dict,
prisma_client: PrismaClient,
proxy_budget_rescheduler_min_time: int,
proxy_budget_rescheduler_max_time: int,
proxy_batch_write_at: int,
proxy_logging_obj: ProxyLogging,
store_model_in_db: bool,
):
"""Initializes scheduled background jobs"""
scheduler = AsyncIOScheduler()
interval = random.randint(
proxy_budget_rescheduler_min_time, proxy_budget_rescheduler_max_time
@ -3072,6 +2991,165 @@ async def startup_event(): # noqa: PLR0915
scheduler.start()
@classmethod
def _setup_prisma_client(
cls,
database_url: Optional[str],
proxy_logging_obj: ProxyLogging,
user_api_key_cache: DualCache,
) -> Optional[PrismaClient]:
"""
- Sets up prisma client
- Adds necessary views to proxy
"""
prisma_client: Optional[PrismaClient] = None
if database_url is not None:
try:
prisma_client = PrismaClient(
database_url=database_url, proxy_logging_obj=proxy_logging_obj
)
except Exception as e:
raise e
## Add necessary views to proxy ##
asyncio.create_task(
prisma_client.check_view_exists()
) # check if all necessary views exist. Don't block execution
return prisma_client
@router.on_event("startup")
async def startup_event():
global prisma_client, master_key, use_background_health_checks, llm_router, llm_model_list, general_settings, proxy_budget_rescheduler_min_time, proxy_budget_rescheduler_max_time, litellm_proxy_admin_name, db_writer_client, store_model_in_db, premium_user, _license_check
import json
init_verbose_loggers()
### LOAD MASTER KEY ###
# check if master key set in environment - load from there
master_key = get_secret("LITELLM_MASTER_KEY", None) # type: ignore
# check if DATABASE_URL in environment - load from there
if prisma_client is None:
_db_url: Optional[str] = get_secret("DATABASE_URL", None) # type: ignore
prisma_client = ProxyStartupEvent._setup_prisma_client(
database_url=_db_url,
proxy_logging_obj=proxy_logging_obj,
user_api_key_cache=user_api_key_cache,
)
### LOAD CONFIG ###
worker_config: Optional[Union[str, dict]] = get_secret("WORKER_CONFIG") # type: ignore
env_config_yaml: Optional[str] = get_secret_str("CONFIG_FILE_PATH")
verbose_proxy_logger.debug("worker_config: %s", worker_config)
# check if it's a valid file path
if env_config_yaml is not None:
if os.path.isfile(env_config_yaml) and proxy_config.is_yaml(
config_file_path=env_config_yaml
):
(
llm_router,
llm_model_list,
general_settings,
) = await proxy_config.load_config(
router=llm_router, config_file_path=env_config_yaml
)
elif worker_config is not None:
if (
isinstance(worker_config, str)
and os.path.isfile(worker_config)
and proxy_config.is_yaml(config_file_path=worker_config)
):
(
llm_router,
llm_model_list,
general_settings,
) = await proxy_config.load_config(
router=llm_router, config_file_path=worker_config
)
elif os.environ.get("LITELLM_CONFIG_BUCKET_NAME") is not None and isinstance(
worker_config, str
):
(
llm_router,
llm_model_list,
general_settings,
) = await proxy_config.load_config(
router=llm_router, config_file_path=worker_config
)
elif isinstance(worker_config, dict):
await initialize(**worker_config)
else:
# if not, assume it's a json string
worker_config = json.loads(worker_config)
if isinstance(worker_config, dict):
await initialize(**worker_config)
## CHECK PREMIUM USER
verbose_proxy_logger.debug(
"litellm.proxy.proxy_server.py::startup() - CHECKING PREMIUM USER - {}".format(
premium_user
)
)
if premium_user is False:
premium_user = _license_check.is_premium()
verbose_proxy_logger.debug(
"litellm.proxy.proxy_server.py::startup() - PREMIUM USER value - {}".format(
premium_user
)
)
ProxyStartupEvent._initialize_startup_logging(
llm_router=llm_router,
proxy_logging_obj=proxy_logging_obj,
redis_usage_cache=redis_usage_cache,
)
## JWT AUTH ##
ProxyStartupEvent._initialize_jwt_auth(
general_settings=general_settings,
prisma_client=prisma_client,
user_api_key_cache=user_api_key_cache,
)
if use_background_health_checks:
asyncio.create_task(
_run_background_health_check()
) # start the background health check coroutine.
if prompt_injection_detection_obj is not None: # [TODO] - REFACTOR THIS
prompt_injection_detection_obj.update_environment(router=llm_router)
verbose_proxy_logger.debug("prisma_client: %s", prisma_client)
if prisma_client is not None:
await prisma_client.connect()
if prisma_client is not None and master_key is not None:
ProxyStartupEvent._add_master_key_hash_to_db(
master_key=master_key,
prisma_client=prisma_client,
litellm_proxy_admin_name=litellm_proxy_admin_name,
general_settings=general_settings,
)
if prisma_client is not None and litellm.max_budget > 0:
ProxyStartupEvent._add_proxy_budget_to_db(
litellm_proxy_budget_name=litellm_proxy_admin_name
)
### START BATCH WRITING DB + CHECKING NEW MODELS###
if prisma_client is not None:
await ProxyStartupEvent.initialize_scheduled_background_jobs(
general_settings=general_settings,
prisma_client=prisma_client,
proxy_budget_rescheduler_min_time=proxy_budget_rescheduler_min_time,
proxy_budget_rescheduler_max_time=proxy_budget_rescheduler_max_time,
proxy_batch_write_at=proxy_batch_write_at,
proxy_logging_obj=proxy_logging_obj,
store_model_in_db=store_model_in_db,
)
#### API ENDPOINTS ####
@router.get(
@ -6327,11 +6405,7 @@ async def list_end_user(
--header 'Authorization: Bearer sk-1234'
```
"""
from litellm.proxy.proxy_server import (
create_audit_log_for_update,
litellm_proxy_admin_name,
prisma_client,
)
from litellm.proxy.proxy_server import litellm_proxy_admin_name, prisma_client
if (
user_api_key_dict.user_role != LitellmUserRoles.PROXY_ADMIN
@ -6362,38 +6436,6 @@ async def list_end_user(
return returned_response
async def create_audit_log_for_update(request_data: LiteLLM_AuditLogs):
if premium_user is not True:
return
if litellm.store_audit_logs is not True:
return
if prisma_client is None:
raise Exception("prisma_client is None, no DB connected")
verbose_proxy_logger.debug("creating audit log for %s", request_data)
if isinstance(request_data.updated_values, dict):
request_data.updated_values = json.dumps(request_data.updated_values)
if isinstance(request_data.before_value, dict):
request_data.before_value = json.dumps(request_data.before_value)
_request_data = request_data.dict(exclude_none=True)
try:
await prisma_client.db.litellm_auditlog.create(
data={
**_request_data, # type: ignore
}
)
except Exception as e:
# [Non-Blocking Exception. Do not allow blocking LLM API call]
verbose_proxy_logger.error(f"Failed Creating audit log {e}")
return
#### BUDGET TABLE MANAGEMENT ####
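
For orientation, a comment-only outline (derived from the hunks above, not executable wiring) of the order in which the refactored startup_event now delegates to ProxyStartupEvent:

# 1. ProxyStartupEvent._setup_prisma_client(...)        - build PrismaClient, schedule view checks
# 2. proxy_config.load_config(...)                      - resolve CONFIG_FILE_PATH / WORKER_CONFIG
# 3. ProxyStartupEvent._initialize_startup_logging(...) - cost tracking, callbacks, slack alerting
# 4. ProxyStartupEvent._initialize_jwt_auth(...)        - litellm_jwtauth from general_settings
# 5. prisma_client.connect(), then _add_master_key_hash_to_db(...) / _add_proxy_budget_to_db(...)
# 6. ProxyStartupEvent.initialize_scheduled_background_jobs(...) - budget rescheduler, batch DB writes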

View file

@ -154,6 +154,8 @@ model LiteLLM_VerificationToken {
model_spend Json @default("{}")
model_max_budget Json @default("{}")
budget_id String?
created_at DateTime? @default(now()) @map("created_at")
updated_at DateTime? @default(now()) @updatedAt @map("updated_at")
litellm_budget_table LiteLLM_BudgetTable? @relation(fields: [budget_id], references: [budget_id])
}

View file

@ -349,6 +349,31 @@ class ProxyLogging:
)
self.premium_user = premium_user
def startup_event(
self,
llm_router: Optional[litellm.Router],
redis_usage_cache: Optional[RedisCache],
):
"""Initialize logging and alerting on proxy startup"""
## UPDATE SLACK ALERTING ##
self.slack_alerting_instance.update_values(llm_router=llm_router)
## UPDATE INTERNAL USAGE CACHE ##
self.update_values(
redis_cache=redis_usage_cache
) # used by parallel request limiter for rate limiting keys across instances
self._init_litellm_callbacks(
llm_router=llm_router
) # INITIALIZE LITELLM CALLBACKS ON SERVER STARTUP <- do this to catch any logging errors on startup, not when calls are being made
if "daily_reports" in self.slack_alerting_instance.alert_types:
asyncio.create_task(
self.slack_alerting_instance._run_scheduled_daily_report(
llm_router=llm_router
)
) # RUN DAILY REPORT (if scheduled)
def update_values(
self,
alerting: Optional[List] = None,

View file

@ -63,10 +63,7 @@ from litellm.router_utils.batch_utils import (
_get_router_metadata_variable_name,
replace_model_in_jsonl,
)
from litellm.router_utils.client_initalization_utils import (
set_client,
should_initialize_sync_client,
)
from litellm.router_utils.client_initalization_utils import InitalizeOpenAISDKClient
from litellm.router_utils.cooldown_cache import CooldownCache
from litellm.router_utils.cooldown_callbacks import router_cooldown_event_callback
from litellm.router_utils.cooldown_handlers import (
@ -3951,7 +3948,7 @@ class Router:
raise Exception(f"Unsupported provider - {custom_llm_provider}")
# init OpenAI, Azure clients
set_client(
InitalizeOpenAISDKClient.set_client(
litellm_router_instance=self, model=deployment.to_json(exclude_none=True)
)
@ -4661,7 +4658,9 @@ class Router:
"""
Re-initialize the client
"""
set_client(litellm_router_instance=self, model=deployment)
InitalizeOpenAISDKClient.set_client(
litellm_router_instance=self, model=deployment
)
client = self.cache.get_cache(key=cache_key, local_only=True)
return client
else:
@ -4671,7 +4670,9 @@ class Router:
"""
Re-initialize the client
"""
set_client(litellm_router_instance=self, model=deployment)
InitalizeOpenAISDKClient.set_client(
litellm_router_instance=self, model=deployment
)
client = self.cache.get_cache(key=cache_key, local_only=True)
return client
else:
@ -4682,7 +4683,9 @@ class Router:
"""
Re-initialize the client
"""
set_client(litellm_router_instance=self, model=deployment)
InitalizeOpenAISDKClient.set_client(
litellm_router_instance=self, model=deployment
)
client = self.cache.get_cache(key=cache_key)
return client
else:
@ -4692,7 +4695,9 @@ class Router:
"""
Re-initialize the client
"""
set_client(litellm_router_instance=self, model=deployment)
InitalizeOpenAISDKClient.set_client(
litellm_router_instance=self, model=deployment
)
client = self.cache.get_cache(key=cache_key)
return client

View file

@ -23,236 +23,227 @@ else:
LitellmRouter = Any
def should_initialize_sync_client(
litellm_router_instance: LitellmRouter,
) -> bool:
"""
Returns if Sync OpenAI, Azure Clients should be initialized.
class InitalizeOpenAISDKClient:
@staticmethod
def should_initialize_sync_client(
litellm_router_instance: LitellmRouter,
) -> bool:
"""
Returns if Sync OpenAI, Azure Clients should be initialized.
Do not init sync clients when router.router_general_settings.async_only_mode is True
Do not init sync clients when router.router_general_settings.async_only_mode is True
"""
if litellm_router_instance is None:
return False
if litellm_router_instance.router_general_settings is not None:
if (
hasattr(litellm_router_instance, "router_general_settings")
and hasattr(
litellm_router_instance.router_general_settings, "async_only_mode"
)
and litellm_router_instance.router_general_settings.async_only_mode is True
):
"""
if litellm_router_instance is None:
return False
return True
if litellm_router_instance.router_general_settings is not None:
if (
hasattr(litellm_router_instance, "router_general_settings")
and hasattr(
litellm_router_instance.router_general_settings, "async_only_mode"
)
and litellm_router_instance.router_general_settings.async_only_mode
is True
):
return False
return True
def set_client(litellm_router_instance: LitellmRouter, model: dict): # noqa: PLR0915
"""
- Initializes Azure/OpenAI clients. Stores them in cache, b/c of this - https://github.com/BerriAI/litellm/issues/1278
- Initializes Semaphore for client w/ rpm. Stores them in cache. b/c of this - https://github.com/BerriAI/litellm/issues/2994
"""
client_ttl = litellm_router_instance.client_ttl
litellm_params = model.get("litellm_params", {})
model_name = litellm_params.get("model")
model_id = model["model_info"]["id"]
# ### IF RPM SET - initialize a semaphore ###
rpm = litellm_params.get("rpm", None)
tpm = litellm_params.get("tpm", None)
max_parallel_requests = litellm_params.get("max_parallel_requests", None)
calculated_max_parallel_requests = calculate_max_parallel_requests(
rpm=rpm,
max_parallel_requests=max_parallel_requests,
tpm=tpm,
default_max_parallel_requests=litellm_router_instance.default_max_parallel_requests,
)
if calculated_max_parallel_requests:
semaphore = asyncio.Semaphore(calculated_max_parallel_requests)
cache_key = f"{model_id}_max_parallel_requests_client"
litellm_router_instance.cache.set_cache(
key=cache_key,
value=semaphore,
local_only=True,
)
#### for OpenAI / Azure we need to initialize the Client for High Traffic ########
custom_llm_provider = litellm_params.get("custom_llm_provider")
custom_llm_provider = custom_llm_provider or model_name.split("/", 1)[0] or ""
default_api_base = None
default_api_key = None
if custom_llm_provider in litellm.openai_compatible_providers:
_, custom_llm_provider, api_key, api_base = litellm.get_llm_provider(
model=model_name
)
default_api_base = api_base
default_api_key = api_key
if (
model_name in litellm.open_ai_chat_completion_models
or custom_llm_provider in litellm.openai_compatible_providers
or custom_llm_provider == "azure"
or custom_llm_provider == "azure_text"
or custom_llm_provider == "custom_openai"
or custom_llm_provider == "openai"
or custom_llm_provider == "text-completion-openai"
or "ft:gpt-3.5-turbo" in model_name
or model_name in litellm.open_ai_embedding_models
@staticmethod
def set_client( # noqa: PLR0915
litellm_router_instance: LitellmRouter, model: dict
):
is_azure_ai_studio_model: bool = False
if custom_llm_provider == "azure":
if litellm.utils._is_non_openai_azure_model(model_name):
is_azure_ai_studio_model = True
custom_llm_provider = "openai"
# remove azure prefix from model_name
model_name = model_name.replace("azure/", "")
# glorified / complicated reading of configs
# user can pass vars directly or they can pass os.environ/AZURE_API_KEY, in which case we will read the env
# we do this here because we init clients for Azure, OpenAI and we need to set the right key
api_key = litellm_params.get("api_key") or default_api_key
if api_key and isinstance(api_key, str) and api_key.startswith("os.environ/"):
api_key_env_name = api_key.replace("os.environ/", "")
api_key = get_secret_str(api_key_env_name)
litellm_params["api_key"] = api_key
api_base = litellm_params.get("api_base")
base_url: Optional[str] = litellm_params.get("base_url")
api_base = (
api_base or base_url or default_api_base
) # allow users to pass in `api_base` or `base_url` for azure
if api_base and api_base.startswith("os.environ/"):
api_base_env_name = api_base.replace("os.environ/", "")
api_base = get_secret_str(api_base_env_name)
litellm_params["api_base"] = api_base
## AZURE AI STUDIO MISTRAL CHECK ##
"""
Make sure api base ends in /v1/
if not, add it - https://github.com/BerriAI/litellm/issues/2279
- Initializes Azure/OpenAI clients. Stores them in cache, b/c of this - https://github.com/BerriAI/litellm/issues/1278
- Initializes Semaphore for client w/ rpm. Stores them in cache. b/c of this - https://github.com/BerriAI/litellm/issues/2994
"""
if (
is_azure_ai_studio_model is True
and api_base is not None
and isinstance(api_base, str)
and not api_base.endswith("/v1/")
):
# check if it ends with a trailing slash
if api_base.endswith("/"):
api_base += "v1/"
elif api_base.endswith("/v1"):
api_base += "/"
else:
api_base += "/v1/"
api_version = litellm_params.get("api_version")
if api_version and api_version.startswith("os.environ/"):
api_version_env_name = api_version.replace("os.environ/", "")
api_version = get_secret_str(api_version_env_name)
litellm_params["api_version"] = api_version
timeout: Optional[float] = (
litellm_params.pop("timeout", None) or litellm.request_timeout
client_ttl = litellm_router_instance.client_ttl
litellm_params = model.get("litellm_params", {})
model_name = litellm_params.get("model")
model_id = model["model_info"]["id"]
# ### IF RPM SET - initialize a semaphore ###
rpm = litellm_params.get("rpm", None)
tpm = litellm_params.get("tpm", None)
max_parallel_requests = litellm_params.get("max_parallel_requests", None)
calculated_max_parallel_requests = calculate_max_parallel_requests(
rpm=rpm,
max_parallel_requests=max_parallel_requests,
tpm=tpm,
default_max_parallel_requests=litellm_router_instance.default_max_parallel_requests,
)
if isinstance(timeout, str) and timeout.startswith("os.environ/"):
timeout_env_name = timeout.replace("os.environ/", "")
timeout = get_secret(timeout_env_name) # type: ignore
litellm_params["timeout"] = timeout
stream_timeout: Optional[float] = litellm_params.pop(
"stream_timeout", timeout
) # if no stream_timeout is set, default to timeout
if isinstance(stream_timeout, str) and stream_timeout.startswith("os.environ/"):
stream_timeout_env_name = stream_timeout.replace("os.environ/", "")
stream_timeout = get_secret(stream_timeout_env_name) # type: ignore
litellm_params["stream_timeout"] = stream_timeout
max_retries: Optional[int] = litellm_params.pop(
"max_retries", 0
) # router handles retry logic
if isinstance(max_retries, str) and max_retries.startswith("os.environ/"):
max_retries_env_name = max_retries.replace("os.environ/", "")
max_retries = get_secret(max_retries_env_name) # type: ignore
litellm_params["max_retries"] = max_retries
organization = litellm_params.get("organization", None)
if isinstance(organization, str) and organization.startswith("os.environ/"):
organization_env_name = organization.replace("os.environ/", "")
organization = get_secret_str(organization_env_name)
litellm_params["organization"] = organization
azure_ad_token_provider: Optional[Callable[[], str]] = None
if litellm_params.get("tenant_id"):
verbose_router_logger.debug("Using Azure AD Token Provider for Azure Auth")
azure_ad_token_provider = get_azure_ad_token_from_entrata_id(
tenant_id=litellm_params.get("tenant_id"),
client_id=litellm_params.get("client_id"),
client_secret=litellm_params.get("client_secret"),
if calculated_max_parallel_requests:
semaphore = asyncio.Semaphore(calculated_max_parallel_requests)
cache_key = f"{model_id}_max_parallel_requests_client"
litellm_router_instance.cache.set_cache(
key=cache_key,
value=semaphore,
local_only=True,
)
if custom_llm_provider == "azure" or custom_llm_provider == "azure_text":
if api_base is None or not isinstance(api_base, str):
filtered_litellm_params = {
k: v for k, v in model["litellm_params"].items() if k != "api_key"
}
_filtered_model = {
"model_name": model["model_name"],
"litellm_params": filtered_litellm_params,
}
raise ValueError(
f"api_base is required for Azure OpenAI. Set it on your config. Model - {_filtered_model}"
)
azure_ad_token = litellm_params.get("azure_ad_token")
if azure_ad_token is not None:
if azure_ad_token.startswith("oidc/"):
azure_ad_token = get_azure_ad_token_from_oidc(azure_ad_token)
elif (
azure_ad_token_provider is None
and litellm.enable_azure_ad_token_refresh is True
#### for OpenAI / Azure we need to initialize the Client for High Traffic ########
custom_llm_provider = litellm_params.get("custom_llm_provider")
custom_llm_provider = custom_llm_provider or model_name.split("/", 1)[0] or ""
default_api_base = None
default_api_key = None
if custom_llm_provider in litellm.openai_compatible_providers:
_, custom_llm_provider, api_key, api_base = litellm.get_llm_provider(
model=model_name
)
default_api_base = api_base
default_api_key = api_key
if (
model_name in litellm.open_ai_chat_completion_models
or custom_llm_provider in litellm.openai_compatible_providers
or custom_llm_provider == "azure"
or custom_llm_provider == "azure_text"
or custom_llm_provider == "custom_openai"
or custom_llm_provider == "openai"
or custom_llm_provider == "text-completion-openai"
or "ft:gpt-3.5-turbo" in model_name
or model_name in litellm.open_ai_embedding_models
):
is_azure_ai_studio_model: bool = False
if custom_llm_provider == "azure":
if litellm.utils._is_non_openai_azure_model(model_name):
is_azure_ai_studio_model = True
custom_llm_provider = "openai"
# remove azure prefix from model_name
model_name = model_name.replace("azure/", "")
# glorified / complicated reading of configs
# user can pass vars directly or they can pass os.environ/AZURE_API_KEY, in which case we will read the env
# we do this here because we init clients for Azure, OpenAI and we need to set the right key
api_key = litellm_params.get("api_key") or default_api_key
if (
api_key
and isinstance(api_key, str)
and api_key.startswith("os.environ/")
):
try:
azure_ad_token_provider = get_azure_ad_token_provider()
except ValueError:
verbose_router_logger.debug(
"Azure AD Token Provider could not be used."
)
if api_version is None:
api_version = os.getenv(
"AZURE_API_VERSION", litellm.AZURE_DEFAULT_API_VERSION
)
api_key_env_name = api_key.replace("os.environ/", "")
api_key = get_secret_str(api_key_env_name)
litellm_params["api_key"] = api_key
if "gateway.ai.cloudflare.com" in api_base:
if not api_base.endswith("/"):
api_base = litellm_params.get("api_base")
base_url: Optional[str] = litellm_params.get("base_url")
api_base = (
api_base or base_url or default_api_base
) # allow users to pass in `api_base` or `base_url` for azure
if api_base and api_base.startswith("os.environ/"):
api_base_env_name = api_base.replace("os.environ/", "")
api_base = get_secret_str(api_base_env_name)
litellm_params["api_base"] = api_base
## AZURE AI STUDIO MISTRAL CHECK ##
"""
Make sure api base ends in /v1/
if not, add it - https://github.com/BerriAI/litellm/issues/2279
"""
if (
is_azure_ai_studio_model is True
and api_base is not None
and isinstance(api_base, str)
and not api_base.endswith("/v1/")
):
# check if it ends with a trailing slash
if api_base.endswith("/"):
api_base += "v1/"
elif api_base.endswith("/v1"):
api_base += "/"
azure_model = model_name.replace("azure/", "")
api_base += f"{azure_model}"
cache_key = f"{model_id}_async_client"
_client = openai.AsyncAzureOpenAI(
api_key=api_key,
azure_ad_token=azure_ad_token,
azure_ad_token_provider=azure_ad_token_provider,
base_url=api_base,
api_version=api_version,
timeout=timeout, # type: ignore
max_retries=max_retries, # type: ignore
http_client=httpx.AsyncClient(
limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
), # type: ignore
)
litellm_router_instance.cache.set_cache(
key=cache_key,
value=_client,
ttl=client_ttl,
local_only=True,
) # cache for 1 hr
else:
api_base += "/v1/"
if should_initialize_sync_client(
litellm_router_instance=litellm_router_instance
api_version = litellm_params.get("api_version")
if api_version and api_version.startswith("os.environ/"):
api_version_env_name = api_version.replace("os.environ/", "")
api_version = get_secret_str(api_version_env_name)
litellm_params["api_version"] = api_version
timeout: Optional[float] = (
litellm_params.pop("timeout", None) or litellm.request_timeout
)
if isinstance(timeout, str) and timeout.startswith("os.environ/"):
timeout_env_name = timeout.replace("os.environ/", "")
timeout = get_secret(timeout_env_name) # type: ignore
litellm_params["timeout"] = timeout
stream_timeout: Optional[float] = litellm_params.pop(
"stream_timeout", timeout
) # if no stream_timeout is set, default to timeout
if isinstance(stream_timeout, str) and stream_timeout.startswith(
"os.environ/"
):
stream_timeout_env_name = stream_timeout.replace("os.environ/", "")
stream_timeout = get_secret(stream_timeout_env_name) # type: ignore
litellm_params["stream_timeout"] = stream_timeout
max_retries: Optional[int] = litellm_params.pop(
"max_retries", 0
) # router handles retry logic
if isinstance(max_retries, str) and max_retries.startswith("os.environ/"):
max_retries_env_name = max_retries.replace("os.environ/", "")
max_retries = get_secret(max_retries_env_name) # type: ignore
litellm_params["max_retries"] = max_retries
organization = litellm_params.get("organization", None)
if isinstance(organization, str) and organization.startswith("os.environ/"):
organization_env_name = organization.replace("os.environ/", "")
organization = get_secret_str(organization_env_name)
litellm_params["organization"] = organization
azure_ad_token_provider: Optional[Callable[[], str]] = None
if litellm_params.get("tenant_id"):
verbose_router_logger.debug(
"Using Azure AD Token Provider for Azure Auth"
)
azure_ad_token_provider = (
InitalizeOpenAISDKClient.get_azure_ad_token_from_entrata_id(
tenant_id=litellm_params.get("tenant_id"),
client_id=litellm_params.get("client_id"),
client_secret=litellm_params.get("client_secret"),
)
)
if custom_llm_provider == "azure" or custom_llm_provider == "azure_text":
if api_base is None or not isinstance(api_base, str):
filtered_litellm_params = {
k: v
for k, v in model["litellm_params"].items()
if k != "api_key"
}
_filtered_model = {
"model_name": model["model_name"],
"litellm_params": filtered_litellm_params,
}
raise ValueError(
f"api_base is required for Azure OpenAI. Set it on your config. Model - {_filtered_model}"
)
azure_ad_token = litellm_params.get("azure_ad_token")
if azure_ad_token is not None:
if azure_ad_token.startswith("oidc/"):
azure_ad_token = get_azure_ad_token_from_oidc(azure_ad_token)
elif (
azure_ad_token_provider is None
and litellm.enable_azure_ad_token_refresh is True
):
cache_key = f"{model_id}_client"
_client = openai.AzureOpenAI( # type: ignore
try:
azure_ad_token_provider = get_azure_ad_token_provider()
except ValueError:
verbose_router_logger.debug(
"Azure AD Token Provider could not be used."
)
if api_version is None:
api_version = os.getenv(
"AZURE_API_VERSION", litellm.AZURE_DEFAULT_API_VERSION
)
if "gateway.ai.cloudflare.com" in api_base:
if not api_base.endswith("/"):
api_base += "/"
azure_model = model_name.replace("azure/", "")
api_base += f"{azure_model}"
cache_key = f"{model_id}_async_client"
_client = openai.AsyncAzureOpenAI(
api_key=api_key,
azure_ad_token=azure_ad_token,
azure_ad_token_provider=azure_ad_token_provider,
@ -260,7 +251,7 @@ def set_client(litellm_router_instance: LitellmRouter, model: dict): # noqa: PL
api_version=api_version,
timeout=timeout, # type: ignore
max_retries=max_retries, # type: ignore
http_client=httpx.Client(
http_client=httpx.AsyncClient(
limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100
),
@ -273,35 +264,35 @@ def set_client(litellm_router_instance: LitellmRouter, model: dict): # noqa: PL
ttl=client_ttl,
local_only=True,
) # cache for 1 hr
# streaming clients can have diff timeouts
cache_key = f"{model_id}_stream_async_client"
_client = openai.AsyncAzureOpenAI( # type: ignore
api_key=api_key,
azure_ad_token=azure_ad_token,
azure_ad_token_provider=azure_ad_token_provider,
base_url=api_base,
api_version=api_version,
timeout=stream_timeout, # type: ignore
max_retries=max_retries, # type: ignore
http_client=httpx.AsyncClient(
limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
), # type: ignore
)
litellm_router_instance.cache.set_cache(
key=cache_key,
value=_client,
ttl=client_ttl,
local_only=True,
) # cache for 1 hr
if should_initialize_sync_client(
litellm_router_instance=litellm_router_instance
):
cache_key = f"{model_id}_stream_client"
_client = openai.AzureOpenAI( # type: ignore
if InitalizeOpenAISDKClient.should_initialize_sync_client(
litellm_router_instance=litellm_router_instance
):
cache_key = f"{model_id}_client"
_client = openai.AzureOpenAI( # type: ignore
api_key=api_key,
azure_ad_token=azure_ad_token,
azure_ad_token_provider=azure_ad_token_provider,
base_url=api_base,
api_version=api_version,
timeout=timeout, # type: ignore
max_retries=max_retries, # type: ignore
http_client=httpx.Client(
limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
), # type: ignore
)
litellm_router_instance.cache.set_cache(
key=cache_key,
value=_client,
ttl=client_ttl,
local_only=True,
) # cache for 1 hr
# streaming clients can have diff timeouts
cache_key = f"{model_id}_stream_async_client"
_client = openai.AsyncAzureOpenAI( # type: ignore
api_key=api_key,
azure_ad_token=azure_ad_token,
azure_ad_token_provider=azure_ad_token_provider,
@ -309,7 +300,7 @@ def set_client(litellm_router_instance: LitellmRouter, model: dict): # noqa: PL
api_version=api_version,
timeout=stream_timeout, # type: ignore
max_retries=max_retries, # type: ignore
http_client=httpx.Client(
http_client=httpx.AsyncClient(
limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100
),
@ -322,41 +313,159 @@ def set_client(litellm_router_instance: LitellmRouter, model: dict): # noqa: PL
ttl=client_ttl,
local_only=True,
) # cache for 1 hr
if InitalizeOpenAISDKClient.should_initialize_sync_client(
litellm_router_instance=litellm_router_instance
):
cache_key = f"{model_id}_stream_client"
_client = openai.AzureOpenAI( # type: ignore
api_key=api_key,
azure_ad_token=azure_ad_token,
azure_ad_token_provider=azure_ad_token_provider,
base_url=api_base,
api_version=api_version,
timeout=stream_timeout, # type: ignore
max_retries=max_retries, # type: ignore
http_client=httpx.Client(
limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
), # type: ignore
)
litellm_router_instance.cache.set_cache(
key=cache_key,
value=_client,
ttl=client_ttl,
local_only=True,
) # cache for 1 hr
else:
_api_key = api_key
if _api_key is not None and isinstance(_api_key, str):
# only show the first 8 chars of the api_key
_api_key = _api_key[:8] + "*" * 15
verbose_router_logger.debug(
f"Initializing Azure OpenAI Client for {model_name}, Api Base: {str(api_base)}, Api Key:{_api_key}"
)
azure_client_params = {
"api_key": api_key,
"azure_endpoint": api_base,
"api_version": api_version,
"azure_ad_token": azure_ad_token,
"azure_ad_token_provider": azure_ad_token_provider,
}
if azure_ad_token_provider is not None:
azure_client_params["azure_ad_token_provider"] = (
azure_ad_token_provider
)
from litellm.llms.AzureOpenAI.azure import (
select_azure_base_url_or_endpoint,
)
# this decides if we should set azure_endpoint or base_url on Azure OpenAI Client
# required to support GPT-4 vision enhancements, since base_url needs to be set on Azure OpenAI Client
azure_client_params = select_azure_base_url_or_endpoint(
azure_client_params
)
cache_key = f"{model_id}_async_client"
_client = openai.AsyncAzureOpenAI( # type: ignore
**azure_client_params,
timeout=timeout, # type: ignore
max_retries=max_retries, # type: ignore
http_client=httpx.AsyncClient(
limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
), # type: ignore
)
litellm_router_instance.cache.set_cache(
key=cache_key,
value=_client,
ttl=client_ttl,
local_only=True,
) # cache for 1 hr
if InitalizeOpenAISDKClient.should_initialize_sync_client(
litellm_router_instance=litellm_router_instance
):
cache_key = f"{model_id}_client"
_client = openai.AzureOpenAI( # type: ignore
**azure_client_params,
timeout=timeout, # type: ignore
max_retries=max_retries, # type: ignore
http_client=httpx.Client(
limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
), # type: ignore
)
litellm_router_instance.cache.set_cache(
key=cache_key,
value=_client,
ttl=client_ttl,
local_only=True,
) # cache for 1 hr
# streaming clients should have diff timeouts
cache_key = f"{model_id}_stream_async_client"
_client = openai.AsyncAzureOpenAI( # type: ignore
**azure_client_params,
timeout=stream_timeout, # type: ignore
max_retries=max_retries, # type: ignore
http_client=httpx.AsyncClient(
limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
),
)
litellm_router_instance.cache.set_cache(
key=cache_key,
value=_client,
ttl=client_ttl,
local_only=True,
) # cache for 1 hr
if InitalizeOpenAISDKClient.should_initialize_sync_client(
litellm_router_instance=litellm_router_instance
):
cache_key = f"{model_id}_stream_client"
_client = openai.AzureOpenAI( # type: ignore
**azure_client_params,
timeout=stream_timeout, # type: ignore
max_retries=max_retries, # type: ignore
http_client=httpx.Client(
limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
),
)
litellm_router_instance.cache.set_cache(
key=cache_key,
value=_client,
ttl=client_ttl,
local_only=True,
) # cache for 1 hr
else:
_api_key = api_key
_api_key = api_key # type: ignore
if _api_key is not None and isinstance(_api_key, str):
# only show the first 8 chars of the api_key
_api_key = _api_key[:8] + "*" * 15
verbose_router_logger.debug(
f"Initializing Azure OpenAI Client for {model_name}, Api Base: {str(api_base)}, Api Key:{_api_key}"
f"Initializing OpenAI Client for {model_name}, Api Base:{str(api_base)}, Api Key:{_api_key}"
)
azure_client_params = {
"api_key": api_key,
"azure_endpoint": api_base,
"api_version": api_version,
"azure_ad_token": azure_ad_token,
"azure_ad_token_provider": azure_ad_token_provider,
}
if azure_ad_token_provider is not None:
azure_client_params["azure_ad_token_provider"] = (
azure_ad_token_provider
)
from litellm.llms.AzureOpenAI.azure import (
select_azure_base_url_or_endpoint,
)
# this decides if we should set azure_endpoint or base_url on Azure OpenAI Client
# required to support GPT-4 vision enhancements, since base_url needs to be set on Azure OpenAI Client
azure_client_params = select_azure_base_url_or_endpoint(
azure_client_params
)
cache_key = f"{model_id}_async_client"
_client = openai.AsyncAzureOpenAI( # type: ignore
**azure_client_params,
_client = openai.AsyncOpenAI( # type: ignore
api_key=api_key,
base_url=api_base,
timeout=timeout, # type: ignore
max_retries=max_retries, # type: ignore
organization=organization,
http_client=httpx.AsyncClient(
limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100
@ -370,14 +479,17 @@ def set_client(litellm_router_instance: LitellmRouter, model: dict): # noqa: PL
ttl=client_ttl,
local_only=True,
) # cache for 1 hr
if should_initialize_sync_client(
if InitalizeOpenAISDKClient.should_initialize_sync_client(
litellm_router_instance=litellm_router_instance
):
cache_key = f"{model_id}_client"
_client = openai.AzureOpenAI( # type: ignore
**azure_client_params,
_client = openai.OpenAI( # type: ignore
api_key=api_key,
base_url=api_base,
timeout=timeout, # type: ignore
max_retries=max_retries, # type: ignore
organization=organization,
http_client=httpx.Client(
limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100
@ -394,16 +506,18 @@ def set_client(litellm_router_instance: LitellmRouter, model: dict): # noqa: PL
# streaming clients should have diff timeouts
cache_key = f"{model_id}_stream_async_client"
_client = openai.AsyncAzureOpenAI( # type: ignore
**azure_client_params,
_client = openai.AsyncOpenAI( # type: ignore
api_key=api_key,
base_url=api_base,
timeout=stream_timeout, # type: ignore
max_retries=max_retries, # type: ignore
organization=organization,
http_client=httpx.AsyncClient(
limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
),
), # type: ignore
)
litellm_router_instance.cache.set_cache(
key=cache_key,
@ -412,20 +526,23 @@ def set_client(litellm_router_instance: LitellmRouter, model: dict): # noqa: PL
local_only=True,
) # cache for 1 hr
if should_initialize_sync_client(
if InitalizeOpenAISDKClient.should_initialize_sync_client(
litellm_router_instance=litellm_router_instance
):
# streaming clients should have diff timeouts
cache_key = f"{model_id}_stream_client"
_client = openai.AzureOpenAI( # type: ignore
**azure_client_params,
_client = openai.OpenAI( # type: ignore
api_key=api_key,
base_url=api_base,
timeout=stream_timeout, # type: ignore
max_retries=max_retries, # type: ignore
organization=organization,
http_client=httpx.Client(
limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
),
), # type: ignore
)
litellm_router_instance.cache.set_cache(
key=cache_key,
@ -434,149 +551,49 @@ def set_client(litellm_router_instance: LitellmRouter, model: dict): # noqa: PL
local_only=True,
) # cache for 1 hr
@staticmethod
def get_azure_ad_token_from_entrata_id(
tenant_id: str, client_id: str, client_secret: str
) -> Callable[[], str]:
from azure.identity import (
ClientSecretCredential,
DefaultAzureCredential,
get_bearer_token_provider,
)
verbose_router_logger.debug("Getting Azure AD Token from Entrata ID")
if tenant_id.startswith("os.environ/"):
_tenant_id = get_secret_str(tenant_id)
else:
_api_key = api_key # type: ignore
if _api_key is not None and isinstance(_api_key, str):
# only show the first 8 chars of the api_key
_api_key = _api_key[:8] + "*" * 15
verbose_router_logger.debug(
f"Initializing OpenAI Client for {model_name}, Api Base:{str(api_base)}, Api Key:{_api_key}"
)
cache_key = f"{model_id}_async_client"
_client = openai.AsyncOpenAI( # type: ignore
api_key=api_key,
base_url=api_base,
timeout=timeout, # type: ignore
max_retries=max_retries, # type: ignore
organization=organization,
http_client=httpx.AsyncClient(
limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
), # type: ignore
)
litellm_router_instance.cache.set_cache(
key=cache_key,
value=_client,
ttl=client_ttl,
local_only=True,
) # cache for 1 hr
_tenant_id = tenant_id
if should_initialize_sync_client(
litellm_router_instance=litellm_router_instance
):
cache_key = f"{model_id}_client"
_client = openai.OpenAI( # type: ignore
api_key=api_key,
base_url=api_base,
timeout=timeout, # type: ignore
max_retries=max_retries, # type: ignore
organization=organization,
http_client=httpx.Client(
limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
), # type: ignore
)
litellm_router_instance.cache.set_cache(
key=cache_key,
value=_client,
ttl=client_ttl,
local_only=True,
) # cache for 1 hr
if client_id.startswith("os.environ/"):
_client_id = get_secret_str(client_id)
else:
_client_id = client_id
# streaming clients should have diff timeouts
cache_key = f"{model_id}_stream_async_client"
_client = openai.AsyncOpenAI( # type: ignore
api_key=api_key,
base_url=api_base,
timeout=stream_timeout, # type: ignore
max_retries=max_retries, # type: ignore
organization=organization,
http_client=httpx.AsyncClient(
limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
), # type: ignore
)
litellm_router_instance.cache.set_cache(
key=cache_key,
value=_client,
ttl=client_ttl,
local_only=True,
) # cache for 1 hr
if client_secret.startswith("os.environ/"):
_client_secret = get_secret_str(client_secret)
else:
_client_secret = client_secret
if should_initialize_sync_client(
litellm_router_instance=litellm_router_instance
):
# streaming clients should have diff timeouts
cache_key = f"{model_id}_stream_client"
_client = openai.OpenAI( # type: ignore
api_key=api_key,
base_url=api_base,
timeout=stream_timeout, # type: ignore
max_retries=max_retries, # type: ignore
organization=organization,
http_client=httpx.Client(
limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
), # type: ignore
)
litellm_router_instance.cache.set_cache(
key=cache_key,
value=_client,
ttl=client_ttl,
local_only=True,
) # cache for 1 hr
verbose_router_logger.debug(
"tenant_id %s, client_id %s, client_secret %s",
_tenant_id,
_client_id,
_client_secret,
)
if _tenant_id is None or _client_id is None or _client_secret is None:
raise ValueError("tenant_id, client_id, and client_secret must be provided")
credential = ClientSecretCredential(_tenant_id, _client_id, _client_secret)
verbose_router_logger.debug("credential %s", credential)
def get_azure_ad_token_from_entrata_id(
tenant_id: str, client_id: str, client_secret: str
) -> Callable[[], str]:
from azure.identity import (
ClientSecretCredential,
DefaultAzureCredential,
get_bearer_token_provider,
)
token_provider = get_bearer_token_provider(
credential, "https://cognitiveservices.azure.com/.default"
)
verbose_router_logger.debug("Getting Azure AD Token from Entrata ID")
verbose_router_logger.debug("token_provider %s", token_provider)
if tenant_id.startswith("os.environ/"):
_tenant_id = get_secret_str(tenant_id)
else:
_tenant_id = tenant_id
if client_id.startswith("os.environ/"):
_client_id = get_secret_str(client_id)
else:
_client_id = client_id
if client_secret.startswith("os.environ/"):
_client_secret = get_secret_str(client_secret)
else:
_client_secret = client_secret
verbose_router_logger.debug(
"tenant_id %s, client_id %s, client_secret %s",
_tenant_id,
_client_id,
_client_secret,
)
if _tenant_id is None or _client_id is None or _client_secret is None:
raise ValueError("tenant_id, client_id, and client_secret must be provided")
credential = ClientSecretCredential(_tenant_id, _client_id, _client_secret)
verbose_router_logger.debug("credential %s", credential)
token_provider = get_bearer_token_provider(
credential, "https://cognitiveservices.azure.com/.default"
)
verbose_router_logger.debug("token_provider %s", token_provider)
return token_provider
return token_provider
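# --- Hedged usage sketch (not part of the diff above): how these Entra ID credentials
# --- are typically supplied via `litellm_params`, using the `os.environ/` indirection
# --- that set_client resolves. Key names follow the reads shown above; the env var
# --- names and model alias are illustrative assumptions.
example_model_entry = {
    "model_name": "azure-gpt-4o",                      # illustrative alias
    "litellm_params": {
        "model": "azure/my-deployment",                # illustrative deployment name
        "api_base": "os.environ/AZURE_API_BASE",
        "api_version": "os.environ/AZURE_API_VERSION",
        # Entra ID client-credential auth instead of an api_key:
        "tenant_id": "os.environ/AZURE_TENANT_ID",
        "client_id": "os.environ/AZURE_CLIENT_ID",
        "client_secret": "os.environ/AZURE_CLIENT_SECRET",
    },
}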
View file
@ -4,6 +4,7 @@ Class to handle llm wildcard routing and regex pattern matching
import copy
import re
from re import Match
from typing import Dict, List, Optional
from litellm import get_llm_provider
@ -53,11 +54,12 @@ class PatternMatchRouter:
Returns:
str: regex pattern
"""
# Replace '*' with '.*' for regex matching
regex = pattern.replace("*", ".*")
# Escape other special characters
regex = re.escape(regex).replace(r"\.\*", ".*")
return f"^{regex}$"
# # Replace '*' with '.*' for regex matching
# regex = pattern.replace("*", ".*")
# # Escape other special characters
# regex = re.escape(regex).replace(r"\.\*", ".*")
# return f"^{regex}$"
return re.escape(pattern).replace(r"\*", "(.*)")
def route(self, request: Optional[str]) -> Optional[List[Dict]]:
"""
@ -84,6 +86,44 @@ class PatternMatchRouter:
return None # No matching pattern found
@staticmethod
def set_deployment_model_name(
matched_pattern: Match,
litellm_deployment_litellm_model: str,
) -> str:
"""
Set the model name for the matched pattern llm deployment
E.g.:
model_name: llmengine/* (can be any regex pattern or wildcard pattern)
litellm_params:
model: openai/*
if model_name = "llmengine/foo" -> model = "openai/foo"
"""
## BASE CASE: if the deployment model name does not contain a wildcard, return the deployment model name
if "*" not in litellm_deployment_litellm_model:
return litellm_deployment_litellm_model
wildcard_count = litellm_deployment_litellm_model.count("*")
# Extract all dynamic segments from the request
dynamic_segments = matched_pattern.groups()
if len(dynamic_segments) > wildcard_count:
raise ValueError(
f"More wildcards in the deployment model name than the pattern. Wildcard count: {wildcard_count}, dynamic segments count: {len(dynamic_segments)}"
)
# Replace the corresponding wildcards in the litellm model pattern with extracted segments
for segment in dynamic_segments:
litellm_deployment_litellm_model = litellm_deployment_litellm_model.replace(
"*", segment, 1
)
return litellm_deployment_litellm_model
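# --- Hedged standalone sketch of the wildcard flow above (plain `re`, not the router's
# --- public API): the pattern "llmengine/*" compiles to "llmengine/(.*)", the request
# --- "llmengine/foo" yields one dynamic segment, and that segment is substituted into
# --- the deployment's "openai/*" model name. Names are illustrative.
import re

regex = re.escape("llmengine/*").replace(r"\*", "(.*)")   # -> "llmengine/(.*)"
matched = re.match(f"^{regex}$", "llmengine/foo")
assert matched is not None and matched.groups() == ("foo",)

litellm_model = "openai/*"
for segment in matched.groups():
    litellm_model = litellm_model.replace("*", segment, 1)
assert litellm_model == "openai/foo"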
def get_pattern(
self, model: str, custom_llm_provider: Optional[str] = None
) -> Optional[List[Dict]]:
View file
@ -0,0 +1,10 @@
from typing import Optional
from pydantic import BaseModel
class ArizeConfig(BaseModel):
space_key: str
api_key: str
grpc_endpoint: Optional[str] = None
http_endpoint: Optional[str] = None
View file
@ -0,0 +1,52 @@
"""
Payloads for Datadog LLM Observability Service (LLMObs)
API Reference: https://docs.datadoghq.com/llm_observability/setup/api/?tab=example#api-standards
"""
from typing import Any, List, Literal, Optional, TypedDict
class InputMeta(TypedDict):
messages: List[Any]
class OutputMeta(TypedDict):
messages: List[Any]
class Meta(TypedDict):
# The span kind: "agent", "workflow", "llm", "tool", "task", "embedding", or "retrieval".
kind: Literal["llm", "tool", "task", "embedding", "retrieval"]
input: InputMeta # The span's input information.
output: OutputMeta # The span's output information.
class LLMMetrics(TypedDict, total=False):
input_tokens: float
output_tokens: float
total_tokens: float
time_to_first_token: float
time_per_output_token: float
class LLMObsPayload(TypedDict):
parent_id: str
trace_id: str
span_id: str
name: str
meta: Meta
start_ns: int
duration: int
metrics: LLMMetrics
class DDSpanAttributes(TypedDict):
ml_app: str
tags: List[str]
spans: List[LLMObsPayload]
class DDIntakePayload(TypedDict):
type: str
attributes: DDSpanAttributes
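# --- Hedged illustration of how the TypedDicts above compose into a single-span intake
# --- payload. Field names come from the definitions above; every value (app name, ids,
# --- timestamps, and the "span" type string) is an invented placeholder.
example_intake: DDIntakePayload = {
    "type": "span",                                    # assumed intake type
    "attributes": {
        "ml_app": "my-llm-app",
        "tags": ["env:dev"],
        "spans": [
            {
                "parent_id": "undefined",
                "trace_id": "trace-123",
                "span_id": "span-456",
                "name": "litellm_request",
                "meta": {
                    "kind": "llm",
                    "input": {"messages": [{"role": "user", "content": "hi"}]},
                    "output": {"messages": [{"role": "assistant", "content": "hello"}]},
                },
                "start_ns": 1_700_000_000_000_000_000,
                "duration": 250_000_000,               # nanoseconds
                "metrics": {"input_tokens": 1.0, "output_tokens": 2.0, "total_tokens": 3.0},
            }
        ],
    },
}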
View file
@ -0,0 +1,7 @@
from typing import Optional, TypedDict
class LangfuseLoggingConfig(TypedDict):
langfuse_secret: Optional[str]
langfuse_public_key: Optional[str]
langfuse_host: Optional[str]
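# --- Hedged example of a populated config; all values are placeholder strings.
example_config: LangfuseLoggingConfig = {
    "langfuse_secret": "sk-lf-PLACEHOLDER",
    "langfuse_public_key": "pk-lf-PLACEHOLDER",
    "langfuse_host": "https://cloud.langfuse.com",
}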
View file
@ -210,15 +210,23 @@ class ServerSentEvent:
return f"ServerSentEvent(event={self.event}, data={self.data}, id={self.id}, retry={self.retry})"
COHERE_EMBEDDING_INPUT_TYPES = Literal[
"search_document", "search_query", "classification", "clustering", "image"
]
class CohereEmbeddingRequest(TypedDict, total=False):
texts: Required[List[str]]
input_type: Required[
Literal["search_document", "search_query", "classification", "clustering"]
]
texts: List[str]
images: List[str]
input_type: Required[COHERE_EMBEDDING_INPUT_TYPES]
truncate: Literal["NONE", "START", "END"]
embedding_types: Literal["float", "int8", "uint8", "binary", "ubinary"]
class CohereEmbeddingRequestWithModel(CohereEmbeddingRequest):
model: Required[str]
class CohereEmbeddingResponse(TypedDict):
embeddings: List[List[float]]
id: str
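# --- Hedged example of the widened request type: with the shared input-type Literal,
# --- an image embedding request can be expressed next to a text one. The model name
# --- and data URI are illustrative placeholders.
text_request: CohereEmbeddingRequestWithModel = {
    "model": "embed-english-v3.0",
    "texts": ["hello world"],
    "input_type": "search_document",
}
image_request: CohereEmbeddingRequestWithModel = {
    "model": "embed-english-v3.0",
    "images": ["data:image/png;base64,<BASE64_BYTES>"],
    "input_type": "image",
}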
View file
@ -970,9 +970,9 @@ class EmbeddingResponse(OpenAIObject):
class Logprobs(OpenAIObject):
text_offset: List[int]
token_logprobs: List[float]
token_logprobs: List[Union[float, None]]
tokens: List[str]
top_logprobs: List[Dict[str, float]]
top_logprobs: List[Union[Dict[str, float], None]]
class TextChoices(OpenAIObject):
@ -1177,12 +1177,15 @@ from openai.types.images_response import ImagesResponse as OpenAIImageResponse
class ImageResponse(OpenAIImageResponse):
_hidden_params: dict = {}
usage: Usage
def __init__(
self,
created: Optional[int] = None,
data: Optional[List[ImageObject]] = None,
response_ms=None,
usage: Optional[Usage] = None,
hidden_params: Optional[dict] = None,
):
if response_ms:
_response_ms = response_ms
@ -1204,8 +1207,13 @@ class ImageResponse(OpenAIImageResponse):
_data.append(ImageObject(**d))
elif isinstance(d, BaseModel):
_data.append(ImageObject(**d.model_dump()))
super().__init__(created=created, data=_data)
self.usage = {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}
_usage = usage or Usage(
prompt_tokens=0,
completion_tokens=0,
total_tokens=0,
)
super().__init__(created=created, data=_data, usage=_usage) # type: ignore
self._hidden_params = hidden_params or {}
def __contains__(self, key):
# Define custom behavior for the 'in' operator
@ -1404,16 +1412,20 @@ class AdapterCompletionStreamWrapper:
raise StopAsyncIteration
class StandardLoggingMetadata(TypedDict):
class StandardLoggingUserAPIKeyMetadata(TypedDict):
user_api_key_hash: Optional[str] # hash of the litellm virtual key used
user_api_key_alias: Optional[str]
user_api_key_org_id: Optional[str]
user_api_key_team_id: Optional[str]
user_api_key_user_id: Optional[str]
user_api_key_team_alias: Optional[str]
class StandardLoggingMetadata(StandardLoggingUserAPIKeyMetadata):
"""
Specific metadata k,v pairs logged to integration for easier cost tracking
"""
user_api_key_hash: Optional[str] # hash of the litellm virtual key used
user_api_key_alias: Optional[str]
user_api_key_team_id: Optional[str]
user_api_key_user_id: Optional[str]
user_api_key_team_alias: Optional[str]
spend_logs_metadata: Optional[
dict
] # special param to log k,v pairs to spendlogs for a call
View file
@ -70,6 +70,12 @@ from litellm.litellm_core_utils.get_llm_provider_logic import (
get_llm_provider,
)
from litellm.litellm_core_utils.llm_request_utils import _ensure_extra_body_is_safe
from litellm.litellm_core_utils.llm_response_utils.convert_dict_to_response import (
_handle_invalid_parallel_tool_calls,
convert_to_model_response_object,
convert_to_streaming_response,
convert_to_streaming_response_async,
)
from litellm.litellm_core_utils.llm_response_utils.get_headers import (
get_response_headers,
)
@ -126,6 +132,7 @@ except (ImportError, AttributeError):
os.environ["TIKTOKEN_CACHE_DIR"] = os.getenv(
"CUSTOM_TIKTOKEN_CACHE_DIR", filename
) # use local copy of tiktoken b/c of - https://github.com/BerriAI/litellm/issues/1071
from tiktoken import Encoding
encoding = tiktoken.get_encoding("cl100k_base")
from importlib import resources
@ -213,13 +220,10 @@ prometheusLogger = None
dynamoLogger = None
s3Logger = None
genericAPILogger = None
clickHouseLogger = None
greenscaleLogger = None
lunaryLogger = None
aispendLogger = None
berrispendLogger = None
supabaseClient = None
liteDebuggerClient = None
callback_list: Optional[List[str]] = []
user_logger_fn = None
additional_details: Optional[Dict[str, str]] = {}
@ -609,7 +613,6 @@ def function_setup( # noqa: PLR0915
def client(original_function): # noqa: PLR0915
global liteDebuggerClient
rules_obj = Rules()
def check_coroutine(value) -> bool:
@ -1282,7 +1285,10 @@ def encode(model="", text="", custom_tokenizer: Optional[dict] = None):
enc: The encoded text.
"""
tokenizer_json = custom_tokenizer or _select_tokenizer(model=model)
enc = tokenizer_json["tokenizer"].encode(text)
if isinstance(tokenizer_json["tokenizer"], Encoding):
enc = tokenizer_json["tokenizer"].encode(text, disallowed_special=())
else:
enc = tokenizer_json["tokenizer"].encode(text)
return enc
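# --- Hedged sketch of the behavior the isinstance branch above guards against:
# --- tiktoken's Encoding.encode defaults to disallowed_special="all" and raises
# --- ValueError when the text contains a special token such as "<|endoftext|>";
# --- passing disallowed_special=() encodes that text as ordinary characters.
import tiktoken

_enc = tiktoken.get_encoding("cl100k_base")
_text = "prompt containing <|endoftext|> verbatim"
try:
    _enc.encode(_text)                       # raises ValueError by default
except ValueError:
    pass
_tokens = _enc.encode(_text, disallowed_special=())
assert len(_tokens) > 0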
@ -3049,8 +3055,8 @@ def get_optional_params( # noqa: PLR0915
)
if litellm.vertex_ai_safety_settings is not None:
optional_params["safety_settings"] = litellm.vertex_ai_safety_settings
elif (
custom_llm_provider == "vertex_ai" and model in litellm.vertex_anthropic_models
elif litellm.VertexAIAnthropicConfig.is_supported_model(
model=model, custom_llm_provider=custom_llm_provider
):
supported_params = get_supported_openai_params(
model=model, custom_llm_provider=custom_llm_provider
@ -5191,7 +5197,9 @@ def create_proxy_transport_and_mounts():
def validate_environment( # noqa: PLR0915
model: Optional[str] = None, api_key: Optional[str] = None
model: Optional[str] = None,
api_key: Optional[str] = None,
api_base: Optional[str] = None,
) -> dict:
"""
Checks if the environment variables are valid for the given model.
@ -5218,11 +5226,6 @@ def validate_environment( # noqa: PLR0915
_, custom_llm_provider, _, _ = get_llm_provider(model=model)
except Exception:
custom_llm_provider = None
# # check if llm provider part of model name
# if model.split("/",1)[0] in litellm.provider_list:
# custom_llm_provider = model.split("/", 1)[0]
# model = model.split("/", 1)[1]
# custom_llm_provider_passed_in = True
if custom_llm_provider:
if custom_llm_provider == "openai":
@ -5491,476 +5494,20 @@ def validate_environment( # noqa: PLR0915
if "api_key" not in key.lower():
new_missing_keys.append(key)
missing_keys = new_missing_keys
if api_base is not None:
new_missing_keys = []
for key in missing_keys:
if "api_base" not in key.lower():
new_missing_keys.append(key)
missing_keys = new_missing_keys
if len(missing_keys) == 0: # no missing keys
keys_in_environment = True
return {"keys_in_environment": keys_in_environment, "missing_keys": missing_keys}
async def convert_to_streaming_response_async(response_object: Optional[dict] = None):
"""
Asynchronously converts a response object to a streaming response.
Args:
response_object (Optional[dict]): The response object to be converted. Defaults to None.
Raises:
Exception: If the response object is None.
Yields:
ModelResponse: The converted streaming response object.
Returns:
None
"""
if response_object is None:
raise Exception("Error in response object format")
model_response_object = ModelResponse(stream=True)
if model_response_object is None:
raise Exception("Error in response creating model response object")
choice_list = []
for idx, choice in enumerate(response_object["choices"]):
if (
choice["message"].get("tool_calls", None) is not None
and isinstance(choice["message"]["tool_calls"], list)
and len(choice["message"]["tool_calls"]) > 0
and isinstance(choice["message"]["tool_calls"][0], dict)
):
pydantic_tool_calls = []
for index, t in enumerate(choice["message"]["tool_calls"]):
if "index" not in t:
t["index"] = index
pydantic_tool_calls.append(ChatCompletionDeltaToolCall(**t))
choice["message"]["tool_calls"] = pydantic_tool_calls
delta = Delta(
content=choice["message"].get("content", None),
role=choice["message"]["role"],
function_call=choice["message"].get("function_call", None),
tool_calls=choice["message"].get("tool_calls", None),
)
finish_reason = choice.get("finish_reason", None)
if finish_reason is None:
finish_reason = choice.get("finish_details")
logprobs = choice.get("logprobs", None)
choice = StreamingChoices(
finish_reason=finish_reason, index=idx, delta=delta, logprobs=logprobs
)
choice_list.append(choice)
model_response_object.choices = choice_list
if "usage" in response_object and response_object["usage"] is not None:
setattr(
model_response_object,
"usage",
Usage(
completion_tokens=response_object["usage"].get("completion_tokens", 0),
prompt_tokens=response_object["usage"].get("prompt_tokens", 0),
total_tokens=response_object["usage"].get("total_tokens", 0),
),
)
if "id" in response_object:
model_response_object.id = response_object["id"]
if "created" in response_object:
model_response_object.created = response_object["created"]
if "system_fingerprint" in response_object:
model_response_object.system_fingerprint = response_object["system_fingerprint"]
if "model" in response_object:
model_response_object.model = response_object["model"]
yield model_response_object
await asyncio.sleep(0)
def convert_to_streaming_response(response_object: Optional[dict] = None):
# used for yielding Cache hits when stream == True
if response_object is None:
raise Exception("Error in response object format")
model_response_object = ModelResponse(stream=True)
choice_list = []
for idx, choice in enumerate(response_object["choices"]):
delta = Delta(
content=choice["message"].get("content", None),
role=choice["message"]["role"],
function_call=choice["message"].get("function_call", None),
tool_calls=choice["message"].get("tool_calls", None),
)
finish_reason = choice.get("finish_reason", None)
if finish_reason is None:
# gpt-4 vision can return 'finish_reason' or 'finish_details'
finish_reason = choice.get("finish_details")
logprobs = choice.get("logprobs", None)
enhancements = choice.get("enhancements", None)
choice = StreamingChoices(
finish_reason=finish_reason,
index=idx,
delta=delta,
logprobs=logprobs,
enhancements=enhancements,
)
choice_list.append(choice)
model_response_object.choices = choice_list
if "usage" in response_object and response_object["usage"] is not None:
setattr(model_response_object, "usage", Usage())
model_response_object.usage.completion_tokens = response_object["usage"].get("completion_tokens", 0) # type: ignore
model_response_object.usage.prompt_tokens = response_object["usage"].get("prompt_tokens", 0) # type: ignore
model_response_object.usage.total_tokens = response_object["usage"].get("total_tokens", 0) # type: ignore
if "id" in response_object:
model_response_object.id = response_object["id"]
if "created" in response_object:
model_response_object.created = response_object["created"]
if "system_fingerprint" in response_object:
model_response_object.system_fingerprint = response_object["system_fingerprint"]
if "model" in response_object:
model_response_object.model = response_object["model"]
yield model_response_object
from collections import defaultdict
def _handle_invalid_parallel_tool_calls(
tool_calls: List[ChatCompletionMessageToolCall],
):
"""
Handle hallucinated parallel tool call from openai - https://community.openai.com/t/model-tries-to-call-unknown-function-multi-tool-use-parallel/490653
Code modified from: https://github.com/phdowling/openai_multi_tool_use_parallel_patch/blob/main/openai_multi_tool_use_parallel_patch.py
"""
if tool_calls is None:
return
try:
replacements: Dict[int, List[ChatCompletionMessageToolCall]] = defaultdict(list)
for i, tool_call in enumerate(tool_calls):
current_function = tool_call.function.name
function_args = json.loads(tool_call.function.arguments)
if current_function == "multi_tool_use.parallel":
verbose_logger.debug(
"OpenAI did a weird pseudo-multi-tool-use call, fixing call structure.."
)
for _fake_i, _fake_tool_use in enumerate(function_args["tool_uses"]):
_function_args = _fake_tool_use["parameters"]
_current_function = _fake_tool_use["recipient_name"]
if _current_function.startswith("functions."):
_current_function = _current_function[len("functions.") :]
fixed_tc = ChatCompletionMessageToolCall(
id=f"{tool_call.id}_{_fake_i}",
type="function",
function=Function(
name=_current_function, arguments=json.dumps(_function_args)
),
)
replacements[i].append(fixed_tc)
shift = 0
for i, replacement in replacements.items():
tool_calls[:] = (
tool_calls[: i + shift] + replacement + tool_calls[i + shift + 1 :]
)
shift += len(replacement)
return tool_calls
except json.JSONDecodeError:
# if there is a JSONDecodeError, return the original tool_calls
return tool_calls
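# --- Hedged sketch of the repair performed above: one hallucinated
# --- "multi_tool_use.parallel" call is expanded into a tool call per entry in
# --- "tool_uses". The function names and arguments are invented for illustration;
# --- the import path follows the new llm_response_utils module referenced earlier.
import json

from litellm.litellm_core_utils.llm_response_utils.convert_dict_to_response import (
    _handle_invalid_parallel_tool_calls,
)
from litellm.types.utils import ChatCompletionMessageToolCall, Function

bad_call = ChatCompletionMessageToolCall(
    id="call_abc",
    type="function",
    function=Function(
        name="multi_tool_use.parallel",
        arguments=json.dumps(
            {
                "tool_uses": [
                    {"recipient_name": "functions.get_weather", "parameters": {"city": "Paris"}},
                    {"recipient_name": "functions.get_time", "parameters": {"tz": "CET"}},
                ]
            }
        ),
    ),
)
fixed = _handle_invalid_parallel_tool_calls([bad_call])
assert [tc.function.name for tc in fixed] == ["get_weather", "get_time"]
assert [tc.id for tc in fixed] == ["call_abc_0", "call_abc_1"]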
def convert_to_model_response_object( # noqa: PLR0915
response_object: Optional[dict] = None,
model_response_object: Optional[
Union[
ModelResponse,
EmbeddingResponse,
ImageResponse,
TranscriptionResponse,
RerankResponse,
]
] = None,
response_type: Literal[
"completion", "embedding", "image_generation", "audio_transcription", "rerank"
] = "completion",
stream=False,
start_time=None,
end_time=None,
hidden_params: Optional[dict] = None,
_response_headers: Optional[dict] = None,
convert_tool_call_to_json_mode: Optional[
bool
] = None, # used for supporting 'json_schema' on older models
):
received_args = locals()
additional_headers = get_response_headers(_response_headers)
if hidden_params is None:
hidden_params = {}
hidden_params["additional_headers"] = additional_headers
### CHECK IF ERROR IN RESPONSE ### - openrouter returns these in the dictionary
if (
response_object is not None
and "error" in response_object
and response_object["error"] is not None
):
error_args = {"status_code": 422, "message": "Error in response object"}
if isinstance(response_object["error"], dict):
if "code" in response_object["error"]:
error_args["status_code"] = response_object["error"]["code"]
if "message" in response_object["error"]:
if isinstance(response_object["error"]["message"], dict):
message_str = json.dumps(response_object["error"]["message"])
else:
message_str = str(response_object["error"]["message"])
error_args["message"] = message_str
raised_exception = Exception()
setattr(raised_exception, "status_code", error_args["status_code"])
setattr(raised_exception, "message", error_args["message"])
raise raised_exception
try:
if response_type == "completion" and (
model_response_object is None
or isinstance(model_response_object, ModelResponse)
):
if response_object is None or model_response_object is None:
raise Exception("Error in response object format")
if stream is True:
# for returning cached responses, we need to yield a generator
return convert_to_streaming_response(response_object=response_object)
choice_list = []
assert response_object["choices"] is not None and isinstance(
response_object["choices"], Iterable
)
for idx, choice in enumerate(response_object["choices"]):
## HANDLE JSON MODE - anthropic returns single function call
tool_calls = choice["message"].get("tool_calls", None)
if tool_calls is not None:
_openai_tool_calls = []
for _tc in tool_calls:
_openai_tc = ChatCompletionMessageToolCall(**_tc)
_openai_tool_calls.append(_openai_tc)
fixed_tool_calls = _handle_invalid_parallel_tool_calls(
_openai_tool_calls
)
if fixed_tool_calls is not None:
tool_calls = fixed_tool_calls
message: Optional[Message] = None
finish_reason: Optional[str] = None
if (
convert_tool_call_to_json_mode
and tool_calls is not None
and len(tool_calls) == 1
):
# to support 'json_schema' logic on older models
json_mode_content_str: Optional[str] = tool_calls[0][
"function"
].get("arguments")
if json_mode_content_str is not None:
message = litellm.Message(content=json_mode_content_str)
finish_reason = "stop"
if message is None:
message = Message(
content=choice["message"].get("content", None),
role=choice["message"]["role"] or "assistant",
function_call=choice["message"].get("function_call", None),
tool_calls=tool_calls,
audio=choice["message"].get("audio", None),
)
finish_reason = choice.get("finish_reason", None)
if finish_reason is None:
# gpt-4 vision can return 'finish_reason' or 'finish_details'
finish_reason = choice.get("finish_details") or "stop"
logprobs = choice.get("logprobs", None)
enhancements = choice.get("enhancements", None)
choice = Choices(
finish_reason=finish_reason,
index=idx,
message=message,
logprobs=logprobs,
enhancements=enhancements,
)
choice_list.append(choice)
model_response_object.choices = choice_list
if "usage" in response_object and response_object["usage"] is not None:
usage_object = litellm.Usage(**response_object["usage"])
setattr(model_response_object, "usage", usage_object)
if "created" in response_object:
model_response_object.created = response_object["created"] or int(
time.time()
)
if "id" in response_object:
model_response_object.id = response_object["id"] or str(uuid.uuid4())
if "system_fingerprint" in response_object:
model_response_object.system_fingerprint = response_object[
"system_fingerprint"
]
if "model" in response_object:
if model_response_object.model is None:
model_response_object.model = response_object["model"]
elif (
"/" in model_response_object.model
and response_object["model"] is not None
):
openai_compatible_provider = model_response_object.model.split("/")[
0
]
model_response_object.model = (
openai_compatible_provider + "/" + response_object["model"]
)
if start_time is not None and end_time is not None:
if isinstance(start_time, type(end_time)):
model_response_object._response_ms = ( # type: ignore
end_time - start_time
).total_seconds() * 1000
if hidden_params is not None:
if model_response_object._hidden_params is None:
model_response_object._hidden_params = {}
model_response_object._hidden_params.update(hidden_params)
if _response_headers is not None:
model_response_object._response_headers = _response_headers
special_keys = list(litellm.ModelResponse.model_fields.keys())
special_keys.append("usage")
for k, v in response_object.items():
if k not in special_keys:
setattr(model_response_object, k, v)
return model_response_object
elif response_type == "embedding" and (
model_response_object is None
or isinstance(model_response_object, EmbeddingResponse)
):
if response_object is None:
raise Exception("Error in response object format")
if model_response_object is None:
model_response_object = EmbeddingResponse()
if "model" in response_object:
model_response_object.model = response_object["model"]
if "object" in response_object:
model_response_object.object = response_object["object"]
model_response_object.data = response_object["data"]
if "usage" in response_object and response_object["usage"] is not None:
model_response_object.usage.completion_tokens = response_object["usage"].get("completion_tokens", 0) # type: ignore
model_response_object.usage.prompt_tokens = response_object["usage"].get("prompt_tokens", 0) # type: ignore
model_response_object.usage.total_tokens = response_object["usage"].get("total_tokens", 0) # type: ignore
if start_time is not None and end_time is not None:
model_response_object._response_ms = ( # type: ignore
end_time - start_time
).total_seconds() * 1000 # return response latency in ms like openai
if hidden_params is not None:
model_response_object._hidden_params = hidden_params
if _response_headers is not None:
model_response_object._response_headers = _response_headers
return model_response_object
elif response_type == "image_generation" and (
model_response_object is None
or isinstance(model_response_object, ImageResponse)
):
if response_object is None:
raise Exception("Error in response object format")
if model_response_object is None:
model_response_object = ImageResponse()
if "created" in response_object:
model_response_object.created = response_object["created"]
if "data" in response_object:
model_response_object.data = response_object["data"]
if hidden_params is not None:
model_response_object._hidden_params = hidden_params
return model_response_object
elif response_type == "audio_transcription" and (
model_response_object is None
or isinstance(model_response_object, TranscriptionResponse)
):
if response_object is None:
raise Exception("Error in response object format")
if model_response_object is None:
model_response_object = TranscriptionResponse()
if "text" in response_object:
model_response_object.text = response_object["text"]
optional_keys = ["language", "task", "duration", "words", "segments"]
for key in optional_keys: # not guaranteed to be in response
if key in response_object:
setattr(model_response_object, key, response_object[key])
if hidden_params is not None:
model_response_object._hidden_params = hidden_params
if _response_headers is not None:
model_response_object._response_headers = _response_headers
return model_response_object
elif response_type == "rerank" and (
model_response_object is None
or isinstance(model_response_object, RerankResponse)
):
if response_object is None:
raise Exception("Error in response object format")
if model_response_object is None:
model_response_object = RerankResponse(**response_object)
return model_response_object
if "id" in response_object:
model_response_object.id = response_object["id"]
if "meta" in response_object:
model_response_object.meta = response_object["meta"]
if "results" in response_object:
model_response_object.results = response_object["results"]
return model_response_object
except Exception:
raise Exception(
f"Invalid response object {traceback.format_exc()}\n\nreceived_args={received_args}"
)
def acreate(*args, **kwargs): ## Thin client to handle the acreate langchain call
return litellm.acompletion(*args, **kwargs)
View file
@ -1104,7 +1104,7 @@
"litellm_provider": "azure_ai",
"mode": "chat"
},
"azure_ai/Meta-Llama-31-8B-Instruct": {
"azure_ai/Meta-Llama-3.1-8B-Instruct": {
"max_tokens": 128000,
"max_input_tokens": 128000,
"max_output_tokens": 128000,
@ -1114,7 +1114,7 @@
"mode": "chat",
"source":"https://azuremarketplace.microsoft.com/en-us/marketplace/apps/metagenai.meta-llama-3-1-8b-instruct-offer?tab=PlansAndPrice"
},
"azure_ai/Meta-Llama-31-70B-Instruct": {
"azure_ai/Meta-Llama-3.1-70B-Instruct": {
"max_tokens": 128000,
"max_input_tokens": 128000,
"max_output_tokens": 128000,
@ -1124,7 +1124,7 @@
"mode": "chat",
"source":"https://azuremarketplace.microsoft.com/en-us/marketplace/apps/metagenai.meta-llama-3-1-70b-instruct-offer?tab=PlansAndPrice"
},
"azure_ai/Meta-Llama-31-405B-Instruct": {
"azure_ai/Meta-Llama-3.1-405B-Instruct": {
"max_tokens": 128000,
"max_input_tokens": 128000,
"max_output_tokens": 128000,
@ -1751,6 +1751,22 @@
"supports_assistant_prefill": true,
"supports_prompt_caching": true
},
"claude-3-5-sonnet-20241022": {
"max_tokens": 8192,
"max_input_tokens": 200000,
"max_output_tokens": 8192,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000015,
"cache_creation_input_token_cost": 0.00000375,
"cache_read_input_token_cost": 0.0000003,
"litellm_provider": "anthropic",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true,
"tool_use_system_prompt_tokens": 159,
"supports_assistant_prefill": true,
"supports_prompt_caching": true
},
"text-bison": {
"max_tokens": 2048,
"max_input_tokens": 8192,
@ -2578,6 +2594,18 @@
"supports_vision": true,
"supports_assistant_prefill": true
},
"vertex_ai/claude-3-5-sonnet-v2@20241022": {
"max_tokens": 8192,
"max_input_tokens": 200000,
"max_output_tokens": 8192,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000015,
"litellm_provider": "vertex_ai-anthropic_models",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true,
"supports_assistant_prefill": true
},
"vertex_ai/claude-3-haiku@20240307": {
"max_tokens": 4096,
"max_input_tokens": 200000,
@ -3336,54 +3364,56 @@
"litellm_provider": "cohere",
"mode": "rerank"
},
"embed-english-v3.0": {
"max_tokens": 512,
"max_input_tokens": 512,
"input_cost_per_token": 0.00000010,
"output_cost_per_token": 0.00000,
"litellm_provider": "cohere",
"mode": "embedding"
},
"embed-english-light-v3.0": {
"max_tokens": 512,
"max_input_tokens": 512,
"max_tokens": 1024,
"max_input_tokens": 1024,
"input_cost_per_token": 0.00000010,
"output_cost_per_token": 0.00000,
"litellm_provider": "cohere",
"mode": "embedding"
},
"embed-multilingual-v3.0": {
"max_tokens": 512,
"max_input_tokens": 512,
"max_tokens": 1024,
"max_input_tokens": 1024,
"input_cost_per_token": 0.00000010,
"output_cost_per_token": 0.00000,
"litellm_provider": "cohere",
"mode": "embedding"
},
"embed-english-v2.0": {
"max_tokens": 512,
"max_input_tokens": 512,
"max_tokens": 4096,
"max_input_tokens": 4096,
"input_cost_per_token": 0.00000010,
"output_cost_per_token": 0.00000,
"litellm_provider": "cohere",
"mode": "embedding"
},
"embed-english-light-v2.0": {
"max_tokens": 512,
"max_input_tokens": 512,
"max_tokens": 1024,
"max_input_tokens": 1024,
"input_cost_per_token": 0.00000010,
"output_cost_per_token": 0.00000,
"litellm_provider": "cohere",
"mode": "embedding"
},
"embed-multilingual-v2.0": {
"max_tokens": 256,
"max_input_tokens": 256,
"max_tokens": 768,
"max_input_tokens": 768,
"input_cost_per_token": 0.00000010,
"output_cost_per_token": 0.00000,
"litellm_provider": "cohere",
"mode": "embedding"
},
"embed-english-v3.0": {
"max_tokens": 1024,
"max_input_tokens": 1024,
"input_cost_per_token": 0.00000010,
"input_cost_per_image": 0.0001,
"output_cost_per_token": 0.00000,
"litellm_provider": "cohere",
"mode": "embedding",
"supports_image_input": true
},
"replicate/meta/llama-2-13b": {
"max_tokens": 4096,
"max_input_tokens": 4096,
@ -3572,6 +3602,22 @@
"supports_vision": true,
"tool_use_system_prompt_tokens": 264
},
"anthropic/claude-3-5-sonnet-20241022": {
"max_tokens": 8192,
"max_input_tokens": 200000,
"max_output_tokens": 8192,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000015,
"cache_creation_input_token_cost": 0.00000375,
"cache_read_input_token_cost": 0.0000003,
"litellm_provider": "anthropic",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true,
"tool_use_system_prompt_tokens": 159,
"supports_assistant_prefill": true,
"supports_prompt_caching": true
},
"openrouter/anthropic/claude-3.5-sonnet": {
"max_tokens": 8192,
"max_input_tokens": 200000,
@ -4093,6 +4139,18 @@
"litellm_provider": "bedrock",
"mode": "embedding"
},
"amazon.titan-embed-image-v1": {
"max_tokens": 128,
"max_input_tokens": 128,
"output_vector_size": 1024,
"input_cost_per_token": 0.0000008,
"input_cost_per_image": 0.00006,
"output_cost_per_token": 0.0,
"litellm_provider": "bedrock",
"supports_image_input": true,
"mode": "embedding",
"source": "https://us-east-1.console.aws.amazon.com/bedrock/home?region=us-east-1#/providers?model=amazon.titan-image-generator-v1"
},
"mistral.mistral-7b-instruct-v0:2": {
"max_tokens": 8191,
"max_input_tokens": 32000,
@ -4246,6 +4304,17 @@
"supports_function_calling": true,
"supports_vision": true
},
"anthropic.claude-3-5-sonnet-20241022-v2:0": {
"max_tokens": 4096,
"max_input_tokens": 200000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000015,
"litellm_provider": "bedrock",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true
},
"anthropic.claude-3-haiku-20240307-v1:0": {
"max_tokens": 4096,
"max_input_tokens": 200000,
@ -4290,6 +4359,17 @@
"supports_function_calling": true,
"supports_vision": true
},
"us.anthropic.claude-3-5-sonnet-20241022-v2:0": {
"max_tokens": 4096,
"max_input_tokens": 200000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000015,
"litellm_provider": "bedrock",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true
},
"us.anthropic.claude-3-haiku-20240307-v1:0": {
"max_tokens": 4096,
"max_input_tokens": 200000,
@ -4334,6 +4414,17 @@
"supports_function_calling": true,
"supports_vision": true
},
"eu.anthropic.claude-3-5-sonnet-20241022-v2:0": {
"max_tokens": 4096,
"max_input_tokens": 200000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000015,
"litellm_provider": "bedrock",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true
},
"eu.anthropic.claude-3-haiku-20240307-v1:0": {
"max_tokens": 4096,
"max_input_tokens": 200000,
@ -6369,6 +6460,14 @@
"litellm_provider": "voyage",
"mode": "embedding"
},
"voyage/voyage-finance-2": {
"max_tokens": 4000,
"max_input_tokens": 4000,
"input_cost_per_token": 0.00000012,
"output_cost_per_token": 0.000000,
"litellm_provider": "voyage",
"mode": "embedding"
},
"databricks/databricks-meta-llama-3-1-405b-instruct": {
"max_tokens": 128000,
"max_input_tokens": 128000,
View file
@ -1,6 +1,6 @@
[tool.poetry]
name = "litellm"
version = "1.50.2"
version = "1.51.0"
description = "Library to easily interface with LLM API providers"
authors = ["BerriAI"]
license = "MIT"
@ -91,7 +91,7 @@ requires = ["poetry-core", "wheel"]
build-backend = "poetry.core.masonry.api"
[tool.commitizen]
version = "1.50.2"
version = "1.51.0"
version_files = [
"pyproject.toml:^version"
]
View file
@ -154,6 +154,8 @@ model LiteLLM_VerificationToken {
model_spend Json @default("{}")
model_max_budget Json @default("{}")
budget_id String?
created_at DateTime? @default(now()) @map("created_at")
updated_at DateTime? @default(now()) @updatedAt @map("updated_at")
litellm_budget_table LiteLLM_BudgetTable? @relation(fields: [budget_id], references: [budget_id])
}
View file
@ -695,3 +695,41 @@ def test_convert_to_model_response_object_error():
_response_headers=None,
convert_tool_call_to_json_mode=False,
)
def test_image_generation_openai_with_pydantic_warning(caplog):
try:
import logging
from litellm.types.utils import ImageResponse, ImageObject
convert_response_args = {
"response_object": {
"created": 1729709945,
"data": [
{
"b64_json": None,
"revised_prompt": "Generate an image of a baby sea otter. It should look incredibly cute, with big, soulful eyes and a fluffy, wet fur coat. The sea otter should be on its back, as sea otters often do, with its tiny hands holding onto a shell as if it is its precious toy. The background should be a tranquil sea under a clear sky, with soft sunlight reflecting off the waters. The color palette should be soothing with blues, browns, and white.",
"url": "https://oaidalleapiprodscus.blob.core.windows.net/private/org-ikDc4ex8NB5ZzfTf8m5WYVB7/user-JpwZsbIXubBZvan3Y3GchiiB/img-LL0uoOv4CFJIvNYxoNCKB8oc.png?st=2024-10-23T17%3A59%3A05Z&se=2024-10-23T19%3A59%3A05Z&sp=r&sv=2024-08-04&sr=b&rscd=inline&rsct=image/png&skoid=d505667d-d6c1-4a0a-bac7-5c84a87759f8&sktid=a48cca56-e6da-484e-a814-9c849652bcb3&skt=2024-10-22T19%3A26%3A22Z&ske=2024-10-23T19%3A26%3A22Z&sks=b&skv=2024-08-04&sig=Hl4wczJ3H2vZNdLRt/7JvNi6NvQGDnbNkDy15%2Bl3k5s%3D",
}
],
},
"model_response_object": ImageResponse(
created=1729709929,
data=[],
),
"response_type": "image_generation",
"stream": False,
"start_time": None,
"end_time": None,
"hidden_params": None,
"_response_headers": None,
"convert_tool_call_to_json_mode": None,
}
resp: ImageResponse = convert_to_model_response_object(**convert_response_args)
assert resp is not None
assert resp.data is not None
assert len(resp.data) == 1
assert isinstance(resp.data[0], ImageObject)
except Exception as e:
pytest.fail(f"Test failed with exception: {e}")
View file
@ -235,7 +235,7 @@ def test_all_model_configs():
optional_params={},
api_version="2022-12-01",
drop_params=False,
) == {"max_tokens": 10}
) == {"max_completion_tokens": 10}
from litellm.llms.bedrock.chat.converse_transformation import AmazonConverseConfig

View file

@ -775,3 +775,12 @@ def test_hosted_vllm_tool_param():
)
assert "tools" not in optional_params
assert "tool_choice" not in optional_params
def test_unmapped_vertex_anthropic_model():
optional_params = get_optional_params(
model="claude-3-5-sonnet-v250@20241022",
custom_llm_provider="vertex_ai",
max_retries=10,
)
assert "max_retries" not in optional_params
View file
@ -0,0 +1,64 @@
import json
import os
import sys
from datetime import datetime
from unittest.mock import AsyncMock
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
from litellm.types.utils import TextCompletionResponse
def test_convert_dict_to_text_completion_response():
input_dict = {
"id": "cmpl-ALVLPJgRkqpTomotoOMi3j0cAaL4L",
"choices": [
{
"finish_reason": "length",
"index": 0,
"logprobs": {
"text_offset": [0, 5],
"token_logprobs": [None, -12.203847],
"tokens": ["hello", " crisp"],
"top_logprobs": [None, {",": -2.1568563}],
},
"text": "hello crisp",
}
],
"created": 1729688739,
"model": "davinci-002",
"object": "text_completion",
"system_fingerprint": None,
"usage": {
"completion_tokens": 1,
"prompt_tokens": 1,
"total_tokens": 2,
"completion_tokens_details": None,
"prompt_tokens_details": None,
},
}
response = TextCompletionResponse(**input_dict)
assert response.id == "cmpl-ALVLPJgRkqpTomotoOMi3j0cAaL4L"
assert len(response.choices) == 1
assert response.choices[0].finish_reason == "length"
assert response.choices[0].index == 0
assert response.choices[0].text == "hello crisp"
assert response.created == 1729688739
assert response.model == "davinci-002"
assert response.object == "text_completion"
assert response.system_fingerprint is None
assert response.usage.completion_tokens == 1
assert response.usage.prompt_tokens == 1
assert response.usage.total_tokens == 2
assert response.usage.completion_tokens_details is None
assert response.usage.prompt_tokens_details is None
# Test logprobs
assert response.choices[0].logprobs.text_offset == [0, 5]
assert response.choices[0].logprobs.token_logprobs == [None, -12.203847]
assert response.choices[0].logprobs.tokens == ["hello", " crisp"]
assert response.choices[0].logprobs.top_logprobs == [None, {",": -2.1568563}]
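
The dictionary above mirrors the shape of an OpenAI text-completion payload, and the test confirms TextCompletionResponse can be hydrated from it directly. For context, a hedged sketch of producing a comparable object through litellm itself (this performs a real API call and requires OPENAI_API_KEY; the model choice and logprobs value are assumptions, not part of the test):

import litellm

resp = litellm.text_completion(
    model="davinci-002",
    prompt="hello",
    max_tokens=1,
    logprobs=1,
)
print(resp.choices[0].text, resp.usage.total_tokens)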

View file

@@ -428,11 +428,16 @@ async def test_aaalangfuse_logging_metadata(langfuse_client):
await asyncio.sleep(2)
langfuse_client.flush()
# await asyncio.sleep(10)
await asyncio.sleep(4)
# Tests the metadata filtering and the override of the output to be the last generation
for trace_id, generation_ids in trace_identifiers.items():
trace = langfuse_client.get_trace(id=trace_id)
try:
trace = langfuse_client.get_trace(id=trace_id)
except Exception as e:
if "Trace not found within authorized project" in str(e):
print(f"Trace {trace_id} not found")
continue
assert trace.id == trace_id
assert trace.session_id == session_id
assert trace.metadata != trace_metadata
@@ -620,7 +625,7 @@ def test_aaalangfuse_existing_trace_id():
import datetime
import litellm
from litellm.integrations.langfuse import LangFuseLogger
from litellm.integrations.langfuse.langfuse import LangFuseLogger
langfuse_Logger = LangFuseLogger(
langfuse_public_key=os.getenv("LANGFUSE_PROJECT2_PUBLIC"),
@@ -1120,7 +1125,7 @@ generation_params = {
)
def test_langfuse_prompt_type(prompt):
from litellm.integrations.langfuse import _add_prompt_to_generation_params
from litellm.integrations.langfuse.langfuse import _add_prompt_to_generation_params
clean_metadata = {
"prompt": {
@@ -1227,7 +1232,7 @@ def test_langfuse_prompt_type(prompt):
def test_langfuse_logging_metadata():
from litellm.integrations.langfuse import log_requester_metadata
from litellm.integrations.langfuse.langfuse import log_requester_metadata
metadata = {"key": "value", "requester_metadata": {"key": "value"}}

View file

@@ -10,9 +10,9 @@ from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanE
import litellm
from litellm._logging import verbose_logger, verbose_proxy_logger
from litellm.integrations.opentelemetry import OpenTelemetry, OpenTelemetryConfig
from litellm.integrations.arize_ai import ArizeConfig, ArizeLogger
load_dotenv()
import logging
@pytest.mark.asyncio()
@@ -32,3 +32,57 @@ async def test_async_otel_callback():
)
await asyncio.sleep(2)
@pytest.fixture
def mock_env_vars(monkeypatch):
monkeypatch.setenv("ARIZE_SPACE_KEY", "test_space_key")
monkeypatch.setenv("ARIZE_API_KEY", "test_api_key")
def test_get_arize_config(mock_env_vars):
"""
Use Arize default endpoint when no endpoints are provided
"""
config = ArizeLogger._get_arize_config()
assert isinstance(config, ArizeConfig)
assert config.space_key == "test_space_key"
assert config.api_key == "test_api_key"
assert config.grpc_endpoint == "https://otlp.arize.com/v1"
assert config.http_endpoint is None
def test_get_arize_config_with_endpoints(mock_env_vars, monkeypatch):
"""
Use provided endpoints when they are set
"""
monkeypatch.setenv("ARIZE_ENDPOINT", "grpc://test.endpoint")
monkeypatch.setenv("ARIZE_HTTP_ENDPOINT", "http://test.endpoint")
config = ArizeLogger._get_arize_config()
assert config.grpc_endpoint == "grpc://test.endpoint"
assert config.http_endpoint == "http://test.endpoint"
def test_get_arize_opentelemetry_config_grpc(mock_env_vars, monkeypatch):
"""
Use provided GRPC endpoint when it is set
"""
monkeypatch.setenv("ARIZE_ENDPOINT", "grpc://test.endpoint")
config = ArizeLogger.get_arize_opentelemetry_config()
assert isinstance(config, OpenTelemetryConfig)
assert config.exporter == "otlp_grpc"
assert config.endpoint == "grpc://test.endpoint"
def test_get_arize_opentelemetry_config_http(mock_env_vars, monkeypatch):
"""
Use provided HTTP endpoint when it is set
"""
monkeypatch.setenv("ARIZE_HTTP_ENDPOINT", "http://test.endpoint")
config = ArizeLogger.get_arize_opentelemetry_config()
assert isinstance(config, OpenTelemetryConfig)
assert config.exporter == "otlp_http"
assert config.endpoint == "http://test.endpoint"

View file

@@ -0,0 +1,152 @@
import os
import sys
import traceback
import uuid
from datetime import datetime
from dotenv import load_dotenv
from fastapi import Request
from fastapi.routing import APIRoute
import io
import os
import time
# this file tests litellm/proxy
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
import asyncio
import logging
load_dotenv()
import pytest
import uuid
import litellm
from litellm._logging import verbose_proxy_logger
from litellm.proxy.proxy_server import (
LitellmUserRoles,
audio_transcriptions,
chat_completion,
completion,
embeddings,
image_generation,
model_list,
moderations,
new_end_user,
user_api_key_auth,
)
from litellm.proxy.utils import PrismaClient, ProxyLogging, hash_token, update_spend
verbose_proxy_logger.setLevel(level=logging.DEBUG)
from starlette.datastructures import URL
from litellm.proxy.management_helpers.audit_logs import create_audit_log_for_update
from litellm.proxy._types import LiteLLM_AuditLogs, LitellmTableNames
from litellm.caching.caching import DualCache
from unittest.mock import patch, AsyncMock
proxy_logging_obj = ProxyLogging(user_api_key_cache=DualCache())
import json
@pytest.mark.asyncio
async def test_create_audit_log_for_update_premium_user():
"""
Basic unit test for create_audit_log_for_update
Test that the audit log is created when a premium user updates a team
"""
with patch("litellm.proxy.proxy_server.premium_user", True), patch(
"litellm.store_audit_logs", True
), patch("litellm.proxy.proxy_server.prisma_client") as mock_prisma:
mock_prisma.db.litellm_auditlog.create = AsyncMock()
request_data = LiteLLM_AuditLogs(
id="test_id",
updated_at=datetime.now(),
changed_by="test_changed_by",
action="updated",
table_name=LitellmTableNames.TEAM_TABLE_NAME,
object_id="test_object_id",
updated_values=json.dumps({"key": "value"}),
before_value=json.dumps({"old_key": "old_value"}),
)
await create_audit_log_for_update(request_data)
mock_prisma.db.litellm_auditlog.create.assert_called_once_with(
data={
"id": "test_id",
"updated_at": request_data.updated_at,
"changed_by": request_data.changed_by,
"action": request_data.action,
"table_name": request_data.table_name,
"object_id": request_data.object_id,
"updated_values": request_data.updated_values,
"before_value": request_data.before_value,
}
)
@pytest.fixture
def prisma_client():
from litellm.proxy.proxy_cli import append_query_params
### add connection pool + pool timeout args
params = {"connection_limit": 100, "pool_timeout": 60}
database_url = os.getenv("DATABASE_URL")
modified_url = append_query_params(database_url, params)
os.environ["DATABASE_URL"] = modified_url
# Assuming PrismaClient is a class that needs to be instantiated
prisma_client = PrismaClient(
database_url=os.environ["DATABASE_URL"], proxy_logging_obj=proxy_logging_obj
)
return prisma_client
@pytest.mark.asyncio()
async def test_create_audit_log_in_db(prisma_client):
print("prisma client=", prisma_client)
setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client)
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
setattr(litellm.proxy.proxy_server, "premium_user", True)
setattr(litellm, "store_audit_logs", True)
await litellm.proxy.proxy_server.prisma_client.connect()
audit_log_id = f"audit_log_id_{uuid.uuid4()}"
# create an audit log entry for a team update
request_data = LiteLLM_AuditLogs(
id=audit_log_id,
updated_at=datetime.now(),
changed_by="test_changed_by",
action="updated",
table_name=LitellmTableNames.TEAM_TABLE_NAME,
object_id="test_object_id",
updated_values=json.dumps({"key": "value"}),
before_value=json.dumps({"old_key": "old_value"}),
)
await create_audit_log_for_update(request_data)
await asyncio.sleep(1)
# now read the last log from the db
last_log = await prisma_client.db.litellm_auditlog.find_first(
where={"id": audit_log_id}
)
assert last_log.id == audit_log_id
setattr(litellm, "store_audit_logs", False)

View file

@@ -2066,46 +2066,6 @@ async def test_cache_default_off_acompletion():
assert response3.id == response4.id
@pytest.mark.asyncio()
@pytest.mark.skip(reason="dual caching should first prioritze local cache")
async def test_dual_cache_uses_redis():
"""
- Store diff values in redis and in memory cache
- call get cache
- Assert that value from redis is used
"""
litellm.set_verbose = True
from litellm.caching.caching import DualCache, RedisCache
current_usage = uuid.uuid4()
_cache_obj = DualCache(redis_cache=RedisCache(), always_read_redis=True)
# set cache
await _cache_obj.async_set_cache(key=f"current_usage: {current_usage}", value=10)
# modify value of in memory cache
_cache_obj.in_memory_cache.cache_dict[f"current_usage: {current_usage}"] = 1
# get cache
value = await _cache_obj.async_get_cache(key=f"current_usage: {current_usage}")
print("value from dual cache", value)
assert value == 10
@pytest.mark.asyncio()
async def test_proxy_logging_setup():
"""
Assert always_read_redis is True when used by internal usage cache
"""
from litellm.caching.caching import DualCache
from litellm.proxy.utils import ProxyLogging
pl_obj = ProxyLogging(user_api_key_cache=DualCache())
assert pl_obj.internal_usage_cache.dual_cache.always_read_redis is True
@pytest.mark.skip(reason="local test. Requires sentinel setup.")
@pytest.mark.asyncio
async def test_redis_sentinel_caching():

View file

@@ -1,42 +0,0 @@
import sys
import os
import io, asyncio
# import logging
# logging.basicConfig(level=logging.DEBUG)
sys.path.insert(0, os.path.abspath("../.."))
print("Modified sys.path:", sys.path)
from litellm import completion
import litellm
from litellm._logging import verbose_logger
import logging
litellm.num_retries = 3
import time, random
import pytest
@pytest.mark.asyncio
@pytest.mark.skip(reason="beta test - this is a new feature")
async def test_custom_api_logging():
try:
litellm.success_callback = ["clickhouse"]
litellm.set_verbose = True
verbose_logger.setLevel(logging.DEBUG)
await litellm.acompletion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": f"This is a test"}],
max_tokens=10,
temperature=0.7,
user="ishaan-2",
)
except Exception as e:
pytest.fail(f"An exception occurred - {e}")
finally:
# post, close log file and verify
# Reset stdout to the original value
print("Passed!")

Some files were not shown because too many files have changed in this diff.