Merge branch 'main' into main
|
@ -28,8 +28,9 @@ jobs:
|
|||
pip install "pytest==7.3.1"
|
||||
pip install "pytest-asyncio==0.21.1"
|
||||
pip install mypy
|
||||
pip install "google-generativeai>=0.3.2"
|
||||
pip install "google-cloud-aiplatform>=1.38.0"
|
||||
pip install "google-generativeai==0.3.2"
|
||||
pip install "google-cloud-aiplatform==1.43.0"
|
||||
pip install pyarrow
|
||||
pip install "boto3>=1.28.57"
|
||||
pip install "aioboto3>=12.3.0"
|
||||
pip install langchain
|
||||
|
@ -48,6 +49,7 @@ jobs:
|
|||
pip install argon2-cffi
|
||||
pip install "pytest-mock==3.12.0"
|
||||
pip install python-multipart
|
||||
pip install google-cloud-aiplatform
|
||||
- save_cache:
|
||||
paths:
|
||||
- ./venv
|
||||
|
@ -152,10 +154,11 @@ jobs:
|
|||
pip install "pytest-mock==3.12.0"
|
||||
pip install "pytest-asyncio==0.21.1"
|
||||
pip install mypy
|
||||
pip install "google-generativeai>=0.3.2"
|
||||
pip install "google-cloud-aiplatform>=1.38.0"
|
||||
pip install "boto3>=1.28.57"
|
||||
pip install "aioboto3>=12.3.0"
|
||||
pip install "google-generativeai==0.3.2"
|
||||
pip install "google-cloud-aiplatform==1.43.0"
|
||||
pip install pyarrow
|
||||
pip install "boto3==1.34.34"
|
||||
pip install "aioboto3==12.3.0"
|
||||
pip install langchain
|
||||
pip install "langfuse>=2.0.0"
|
||||
pip install numpydoc
|
||||
|
|
|
@ -7,8 +7,7 @@ baseten
|
|||
cohere
|
||||
redis
|
||||
anthropic
|
||||
boto3
|
||||
orjson
|
||||
pydantic
|
||||
google-cloud-aiplatform
|
||||
google-cloud-aiplatform==1.43.0
|
||||
redisvl==0.0.7 # semantic caching
|
.github/workflows/ghcr_deploy.yml (vendored, 45 lines changed)
|
@ -43,6 +43,13 @@ jobs:
|
|||
push: true
|
||||
file: Dockerfile.database
|
||||
tags: litellm/litellm-database:${{ github.event.inputs.tag || 'latest' }}
|
||||
-
|
||||
name: Build and push litellm-spend-logs image
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
push: true
|
||||
file: ./litellm-js/spend-logs/Dockerfile
|
||||
tags: litellm/litellm-spend_logs:${{ github.event.inputs.tag || 'latest' }}
|
||||
|
||||
build-and-push-image:
|
||||
runs-on: ubuntu-latest
|
||||
|
@ -120,6 +127,44 @@ jobs:
|
|||
tags: ${{ steps.meta-database.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta-database.outputs.tags }}-latest
|
||||
labels: ${{ steps.meta-database.outputs.labels }}
|
||||
platforms: local,linux/amd64,linux/arm64,linux/arm64/v8
|
||||
|
||||
build-and-push-image-spend-logs:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
contents: read
|
||||
packages: write
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Log in to the Container registry
|
||||
uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1
|
||||
with:
|
||||
registry: ${{ env.REGISTRY }}
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Extract metadata (tags, labels) for spend-logs Dockerfile
|
||||
id: meta-spend-logs
|
||||
uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7
|
||||
with:
|
||||
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}-spend_logs
|
||||
# Configure multi platform Docker builds
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@e0e4588fad221d38ee467c0bffd91115366dc0c5
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@edfb0fe6204400c56fbfd3feba3fe9ad1adfa345
|
||||
|
||||
- name: Build and push Database Docker image
|
||||
uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4
|
||||
with:
|
||||
context: .
|
||||
file: ./litellm-js/spend-logs/Dockerfile
|
||||
push: true
|
||||
tags: ${{ steps.meta-spend-logs.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta-spend-logs.outputs.tags }}-latest
|
||||
labels: ${{ steps.meta-spend-logs.outputs.labels }}
|
||||
platforms: local,linux/amd64,linux/arm64,linux/arm64/v8
|
||||
|
||||
build-and-push-helm-chart:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
# Base image for building
|
||||
ARG LITELLM_BUILD_IMAGE=python:3.9
|
||||
ARG LITELLM_BUILD_IMAGE=python:3.11.8-slim
|
||||
|
||||
# Runtime image
|
||||
ARG LITELLM_RUNTIME_IMAGE=python:3.9-slim
|
||||
ARG LITELLM_RUNTIME_IMAGE=python:3.11.8-slim
|
||||
# Builder stage
|
||||
FROM $LITELLM_BUILD_IMAGE as builder
|
||||
|
||||
|
@ -70,5 +70,5 @@ EXPOSE 4000/tcp
|
|||
ENTRYPOINT ["litellm"]
|
||||
|
||||
# Append "--detailed_debug" to the end of CMD to view detailed debug logs
|
||||
# CMD ["--port", "4000", "--config", "./proxy_server_config.yaml", "--run_gunicorn", "--detailed_debug"]
|
||||
CMD ["--port", "4000", "--config", "./proxy_server_config.yaml", "--run_gunicorn", "--num_workers", "4"]
|
||||
# CMD ["--port", "4000", "--config", "./proxy_server_config.yaml"]
|
||||
CMD ["--port", "4000", "--config", "./proxy_server_config.yaml"]
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
# Base image for building
|
||||
ARG LITELLM_BUILD_IMAGE=python:3.9
|
||||
ARG LITELLM_BUILD_IMAGE=python:3.11.8-slim
|
||||
|
||||
# Runtime image
|
||||
ARG LITELLM_RUNTIME_IMAGE=python:3.9-slim
|
||||
ARG LITELLM_RUNTIME_IMAGE=python:3.11.8-slim
|
||||
# Builder stage
|
||||
FROM $LITELLM_BUILD_IMAGE as builder
|
||||
|
||||
|
@ -72,5 +72,5 @@ EXPOSE 4000/tcp
|
|||
ENTRYPOINT ["litellm"]
|
||||
|
||||
# Append "--detailed_debug" to the end of CMD to view detailed debug logs
|
||||
# CMD ["--port", "4000","--run_gunicorn", "--detailed_debug"]
|
||||
CMD ["--port", "4000", "--run_gunicorn"]
|
||||
# CMD ["--port", "4000", "--detailed_debug"]
|
||||
CMD ["--port", "4000"]
|
||||
|
|
|
@ -31,11 +31,11 @@ LiteLLM manages:
|
|||
- Retry/fallback logic across multiple deployments (e.g. Azure/OpenAI) - [Router](https://docs.litellm.ai/docs/routing)
|
||||
- Set Budgets & Rate limits per project, api key, model [OpenAI Proxy Server](https://docs.litellm.ai/docs/simple_proxy)
|
||||
|
||||
**Stable Release**: v`1.30.2` 👈 Recommended stable version of proxy.
|
||||
|
||||
[**Jump to OpenAI Proxy Docs**](https://github.com/BerriAI/litellm?tab=readme-ov-file#openai-proxy---docs) <br>
|
||||
[**Jump to Supported LLM Providers**](https://github.com/BerriAI/litellm?tab=readme-ov-file#supported-provider-docs)
|
||||
|
||||
🚨 **Stable Release:** v1.34.1
|
||||
|
||||
Support for more providers. Missing a provider or LLM Platform, raise a [feature request](https://github.com/BerriAI/litellm/issues/new?assignees=&labels=enhancement&projects=&template=feature_request.yml&title=%5BFeature%5D%3A+).
|
||||
|
||||
# Usage ([**Docs**](https://docs.litellm.ai/docs/))
|
||||
|
|
deploy/kubernetes/kub.yaml (new file, 55 lines)
|
@ -0,0 +1,55 @@
|
|||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: litellm-deployment
|
||||
spec:
|
||||
replicas: 3
|
||||
selector:
|
||||
matchLabels:
|
||||
app: litellm
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: litellm
|
||||
spec:
|
||||
containers:
|
||||
- name: litellm-container
|
||||
image: ghcr.io/berriai/litellm:main-latest
|
||||
env:
|
||||
- name: AZURE_API_KEY
|
||||
value: "d6f****"
|
||||
- name: AZURE_API_BASE
|
||||
value: "https://openai
|
||||
- name: LITELLM_MASTER_KEY
|
||||
value: "sk-1234"
|
||||
- name: DATABASE_URL
|
||||
value: "postgresql://ishaan:*********""
|
||||
args:
|
||||
- "--config"
|
||||
- "/app/proxy_config.yaml" # Update the path to mount the config file
|
||||
volumeMounts: # Define volume mount for proxy_config.yaml
|
||||
- name: config-volume
|
||||
mountPath: /app
|
||||
readOnly: true
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /health/liveliness
|
||||
port: 4000
|
||||
initialDelaySeconds: 120
|
||||
periodSeconds: 15
|
||||
successThreshold: 1
|
||||
failureThreshold: 3
|
||||
timeoutSeconds: 10
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /health/readiness
|
||||
port: 4000
|
||||
initialDelaySeconds: 120
|
||||
periodSeconds: 15
|
||||
successThreshold: 1
|
||||
failureThreshold: 3
|
||||
timeoutSeconds: 10
|
||||
volumes: # Define volume to mount proxy_config.yaml
|
||||
- name: config-volume
|
||||
configMap:
|
||||
name: litellm-config
|
deploy/kubernetes/service.yaml (new file, 12 lines)
|
@ -0,0 +1,12 @@
|
|||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: litellm-service
|
||||
spec:
|
||||
selector:
|
||||
app: litellm
|
||||
ports:
|
||||
- protocol: TCP
|
||||
port: 4000
|
||||
targetPort: 4000
|
||||
type: LoadBalancer
|
|
@ -76,7 +76,6 @@ Click on your personal dashboard link. Here's how you can find it 👇
|
|||
|
||||
Oh! Looks like our request was made successfully. Let's click on it and see exactly what got sent to the LLM provider.
|
||||
|
||||
<Image img={require('../../img/dashboard_log_row.png')} alt="Dashboard Log Row" />
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -41,6 +41,35 @@ response = completion(
|
|||
)
|
||||
```
|
||||
|
||||
## Additional information in metadata
|
||||
You can send some additional information to Athina by using the `metadata` field in completion. This can be useful for sending metadata about the request, such as the customer_id, prompt_slug, or any other information you want to track.
|
||||
|
||||
```python
|
||||
#openai call with additional metadata
|
||||
response = completion(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=[
|
||||
{"role": "user", "content": "Hi 👋 - i'm openai"}
|
||||
],
|
||||
metadata={
|
||||
"environment": "staging",
|
||||
"prompt_slug": "my_prompt_slug/v1"
|
||||
}
|
||||
)
|
||||
```
|
||||
|
||||
Following are the allowed fields in metadata, their types, and their descriptions:
|
||||
|
||||
* `environment: Optional[str]` - Environment your app is running in (ex: production, staging, etc). This is useful for segmenting inference calls by environment.
|
||||
* `prompt_slug: Optional[str]` - Identifier for the prompt used for inference. This is useful for segmenting inference calls by prompt.
|
||||
* `customer_id: Optional[str]` - This is your customer ID. This is useful for segmenting inference calls by customer.
|
||||
* `customer_user_id: Optional[str]` - This is the end user ID. This is useful for segmenting inference calls by the end user.
|
||||
* `session_id: Optional[str]` - is the session or conversation ID. This is used for grouping different inferences into a conversation or chain. [Read more].(https://docs.athina.ai/logging/grouping_inferences)
|
||||
* `external_reference_id: Optional[str]` - This is useful if you want to associate your own internal identifier with the inference logged to Athina.
|
||||
* `context: Optional[Union[dict, str]]` - This is the context used as information for the prompt. For RAG applications, this is the "retrieved" data. You may log context as a string or as an object (dictionary).
|
||||
* `expected_response: Optional[str]` - This is the reference response to compare against for evaluation purposes. This is useful for segmenting inference calls by expected response.
|
||||
* `user_query: Optional[str]` - This is the user's query. For conversational applications, this is the user's last message.
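Putting a few of these fields together, here is a minimal sketch of a logged call (the model, key values, and metadata values are placeholders; the `success_callback` setup mirrors the snippet earlier on this page):

```python
import os
import litellm
from litellm import completion

os.environ["ATHINA_API_KEY"] = "your-athina-api-key"   # placeholder
os.environ["OPENAI_API_KEY"] = "your-openai-api-key"   # placeholder

litellm.success_callback = ["athina"]  # log successful calls to Athina, as configured earlier on this page

response = completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "What is our refund policy?"}],
    metadata={
        "environment": "production",
        "customer_id": "cust_123",      # segment by customer
        "session_id": "sess_456",       # group inferences into one conversation
        "context": {"retrieved_docs": ["Refunds are accepted within 30 days."]},  # RAG context
    },
)
```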
|
||||
|
||||
## Support & Talk with Athina Team
|
||||
|
||||
- [Schedule Demo 👋](https://cal.com/shiv-athina/30min)
|
||||
|
|
|
@ -60,11 +60,30 @@ export ANTHROPIC_API_KEY="your-api-key"
|
|||
|
||||
### 2. Start the proxy
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="cli" label="cli">
|
||||
|
||||
```bash
|
||||
$ litellm --model claude-3-opus-20240229
|
||||
|
||||
# Server running on http://0.0.0.0:4000
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="config" label="config.yaml">
|
||||
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: claude-3 ### RECEIVED MODEL NAME ###
|
||||
litellm_params: # all params accepted by litellm.completion() - https://docs.litellm.ai/docs/completion/input
|
||||
model: claude-3-opus-20240229 ### MODEL NAME sent to `litellm.completion()` ###
|
||||
api_key: "os.environ/ANTHROPIC_API_KEY" # does os.getenv("AZURE_API_KEY_EU")
|
||||
```
|
||||
|
||||
```bash
|
||||
litellm --config /path/to/config.yaml
|
||||
```
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
### 3. Test it
|
||||
|
||||
|
@ -76,7 +95,7 @@ $ litellm --model claude-3-opus-20240229
|
|||
curl --location 'http://0.0.0.0:4000/chat/completions' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data ' {
|
||||
"model": "gpt-3.5-turbo",
|
||||
"model": "claude-3",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
|
@ -97,7 +116,7 @@ client = openai.OpenAI(
|
|||
)
|
||||
|
||||
# request sent to model set on litellm proxy, `litellm --model`
|
||||
response = client.chat.completions.create(model="gpt-3.5-turbo", messages = [
|
||||
response = client.chat.completions.create(model="claude-3", messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "this is a test request, write a short poem"
|
||||
|
@ -121,7 +140,7 @@ from langchain.schema import HumanMessage, SystemMessage
|
|||
|
||||
chat = ChatOpenAI(
|
||||
openai_api_base="http://0.0.0.0:4000", # set openai_api_base to the LiteLLM Proxy
|
||||
model = "gpt-3.5-turbo",
|
||||
model = "claude-3",
|
||||
temperature=0.1
|
||||
)
|
||||
|
||||
|
@ -238,7 +257,7 @@ resp = litellm.completion(
|
|||
print(f"\nResponse: {resp}")
|
||||
```
|
||||
|
||||
### Usage - "Assistant Pre-fill"
|
||||
## Usage - "Assistant Pre-fill"
|
||||
|
||||
You can "put words in Claude's mouth" by including an `assistant` role message as the last item in the `messages` array.
|
||||
|
||||
|
@ -271,8 +290,8 @@ Human: How do you say 'Hello' in German? Return your answer as a JSON object, li
|
|||
Assistant: {
|
||||
```
|
||||
|
||||
### Usage - "System" messages
|
||||
If you're using Anthropic's Claude 2.1 with Bedrock, `system` role messages are properly formatted for you.
|
||||
## Usage - "System" messages
|
||||
If you're using Anthropic's Claude 2.1, `system` role messages are properly formatted for you.
|
||||
|
||||
```python
|
||||
import os
|
||||
|
|
|
@ -20,7 +20,28 @@ os.environ["AWS_SECRET_ACCESS_KEY"] = ""
|
|||
os.environ["AWS_REGION_NAME"] = ""
|
||||
|
||||
response = completion(
|
||||
model="sagemaker/jumpstart-dft-meta-textgeneration-llama-2-7b",
|
||||
model="sagemaker/<your-endpoint-name>",
|
||||
messages=[{ "content": "Hello, how are you?","role": "user"}],
|
||||
temperature=0.2,
|
||||
max_tokens=80
|
||||
)
|
||||
```
|
||||
|
||||
### Passing Inference Component Name
|
||||
|
||||
If you have multiple models on an endpoint, you'll need to specify the individual model names, do this via `model_id`.
|
||||
|
||||
```python
|
||||
import os
|
||||
from litellm import completion
|
||||
|
||||
os.environ["AWS_ACCESS_KEY_ID"] = ""
|
||||
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
|
||||
os.environ["AWS_REGION_NAME"] = ""
|
||||
|
||||
response = completion(
|
||||
model="sagemaker/<your-endpoint-name>",
|
||||
model_id="<your-model-name",
|
||||
messages=[{ "content": "Hello, how are you?","role": "user"}],
|
||||
temperature=0.2,
|
||||
max_tokens=80
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
|
||||
## Pre-requisites
|
||||
* `pip install -q google-generativeai`
|
||||
* Get API Key - https://aistudio.google.com/
|
||||
|
||||
# Gemini-Pro
|
||||
## Sample Usage
|
||||
|
@ -97,6 +98,6 @@ print(content)
|
|||
| Model Name | Function Call | Required OS Variables |
|
||||
|------------------|--------------------------------------|-------------------------|
|
||||
| gemini-pro | `completion('gemini/gemini-pro', messages)` | `os.environ['GEMINI_API_KEY']` |
|
||||
| gemini-1.5-pro | `completion('gemini/gemini-pro', messages)` | `os.environ['GEMINI_API_KEY']` |
|
||||
| gemini-1.5-pro | `completion('gemini/gemini-1.5-pro', messages)` | `os.environ['GEMINI_API_KEY']` |
|
||||
| gemini-pro-vision | `completion('gemini/gemini-pro-vision', messages)` | `os.environ['GEMINI_API_KEY']` |
|
||||
| gemini-1.5-pro-vision | `completion('gemini/gemini-pro-vision', messages)` | `os.environ['GEMINI_API_KEY']` |
|
||||
| gemini-1.5-pro-vision | `completion('gemini/gemini-1.5-pro-vision', messages)` | `os.environ['GEMINI_API_KEY']` |
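For reference, a minimal call for the corrected `gemini-1.5-pro` row above (the API key value is a placeholder):

```python
import os
from litellm import completion

os.environ["GEMINI_API_KEY"] = "your-gemini-api-key"  # placeholder

response = completion(
    model="gemini/gemini-1.5-pro",  # matches the gemini-1.5-pro row in the table above
    messages=[{"role": "user", "content": "Say hello in one short sentence."}],
)
print(response.choices[0].message.content)
```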
|
||||
|
|
|
@ -1,61 +0,0 @@
|
|||
import Image from '@theme/IdealImage';
|
||||
|
||||
# 🚨 Budget Alerting
|
||||
|
||||
**Alerts when a project will exceed it’s planned limit**
|
||||
|
||||
<Image img={require('../../img/budget_alerts.png')} />
|
||||
|
||||
## Quick Start
|
||||
|
||||
### 1. Setup Slack Alerting on your Proxy Config.yaml
|
||||
|
||||
**Add Slack Webhook to your env**
|
||||
Get a slack webhook url from https://api.slack.com/messaging/webhooks
|
||||
|
||||
|
||||
Set `SLACK_WEBHOOK_URL` in your proxy env
|
||||
|
||||
```shell
|
||||
export SLACK_WEBHOOK_URL="https://hooks.slack.com/services/<>/<>/<>"
|
||||
```
|
||||
|
||||
**Update proxy config.yaml with slack alerting**
|
||||
|
||||
Add `general_settings:alerting`
|
||||
```yaml
|
||||
model_list:
|
||||
model_name: "azure-model"
|
||||
litellm_params:
|
||||
model: "azure/gpt-35-turbo"
|
||||
|
||||
general_settings:
|
||||
alerting: ["slack"]
|
||||
```
|
||||
|
||||
|
||||
|
||||
Start proxy
|
||||
```bash
|
||||
$ litellm --config /path/to/config.yaml
|
||||
```
|
||||
|
||||
|
||||
### 2. Create API Key on Proxy Admin UI
|
||||
The Admin UI is found on `your-litellm-proxy-endpoint/ui`, example `http://localhost:4000/ui/`
|
||||
|
||||
- Set a key name
|
||||
- Set a Soft Budget on when to get alerted
|
||||
|
||||
<Image img={require('../../img/create_key.png')} />
|
||||
|
||||
|
||||
### 3. Test Slack Alerting on Admin UI
|
||||
After creating a key on the Admin UI, click on "Test Slack Alert" to send a test alert to your Slack channel
|
||||
<Image img={require('../../img/test_alert.png')} />
|
||||
|
||||
### 4. Check Slack
|
||||
|
||||
When the test alert works, you should expect to see this on your alerts slack channel
|
||||
|
||||
<Image img={require('../../img/budget_alerts.png')} />
|
|
@ -32,8 +32,9 @@ litellm_settings:
|
|||
cache: True # set cache responses to True, litellm defaults to using a redis cache
|
||||
```
|
||||
|
||||
#### [OPTIONAL] Step 1.5: Add redis namespaces
|
||||
#### [OPTIONAL] Step 1.5: Add redis namespaces, default ttl
|
||||
|
||||
## Namespace
|
||||
If you want to create some folder for your keys, you can set a namespace, like this:
|
||||
|
||||
```yaml
|
||||
|
@ -50,6 +51,16 @@ and keys will be stored like:
|
|||
litellm_caching:<hash>
|
||||
```
|
||||
|
||||
## TTL
|
||||
|
||||
```yaml
|
||||
litellm_settings:
|
||||
cache: true
|
||||
cache_params: # set cache params for redis
|
||||
type: redis
|
||||
ttl: 600 # will be cached on redis for 600s
|
||||
```
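Once the proxy is running (see the remaining steps below), a hedged way to sanity-check the TTL is to send the same request twice through any OpenAI-compatible client; within the 600s window the second response should be served from the redis cache. The base URL, key, and model below are placeholders:

```python
import openai

# Point the standard OpenAI client at your LiteLLM proxy (placeholder base URL / key)
client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")

def ask() -> str:
    resp = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "What is 2 + 2?"}],
    )
    return resp.choices[0].message.content

first = ask()
second = ask()  # within the 600s ttl, this should be answered from the redis cache
print(first == second)
```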
|
||||
|
||||
#### Step 2: Add Redis Credentials to .env
|
||||
Set either `REDIS_URL` or the `REDIS_HOST` in your os environment, to enable caching.
|
||||
|
||||
|
|
|
@ -1,7 +1,10 @@
|
|||
import Image from '@theme/IdealImage';
|
||||
|
||||
# Modify / Reject Incoming Requests
|
||||
|
||||
- Modify data before making llm api calls on proxy
|
||||
- Reject data before making llm api calls / before returning the response
|
||||
- Enforce 'user' param for all openai endpoint calls
|
||||
|
||||
See a complete example with our [parallel request rate limiter](https://github.com/BerriAI/litellm/blob/main/litellm/proxy/hooks/parallel_request_limiter.py)
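As a rough sketch of what such a hook looks like (the class and method names follow the linked examples; treat the exact signature as an assumption and verify it against those files), a pre-call hook that rejects requests missing a 'user' field might be:

```python
from litellm.integrations.custom_logger import CustomLogger


class MyRequestGuard(CustomLogger):
    # Pre-call hook: runs on the proxy before the LLM API call is made
    # (method name and arguments approximated from the linked examples - verify there)
    async def async_pre_call_hook(self, user_api_key_dict, cache, data, call_type):
        if "user" not in data:
            # Raising here rejects the request before it reaches the provider
            raise ValueError("Rejected: request is missing the 'user' param")
        # Returning the (possibly modified) request dict lets the call proceed
        data.setdefault("metadata", {})["guarded"] = True
        return data


proxy_handler_instance = MyRequestGuard()  # referenced from config.yaml (see the setup steps on this page)
```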
|
||||
|
||||
|
@ -95,7 +98,7 @@ We might need to update the function schema in the future, to support multiple e
|
|||
|
||||
:::
|
||||
|
||||
See a complete example with our [Llama Guard content moderation hook](https://github.com/BerriAI/litellm/blob/main/enterprise/hooks/llama_guard.py)
|
||||
See a complete example with our [Llama Guard content moderation hook](https://github.com/BerriAI/litellm/blob/main/enterprise/enterprise_hooks/llm_guard.py)
|
||||
|
||||
```python
|
||||
from litellm.integrations.custom_logger import CustomLogger
|
||||
|
@ -172,4 +175,19 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
|
|||
}
|
||||
],
|
||||
}'
|
||||
```
|
||||
|
||||
|
||||
## Advanced - Enforce 'user' param
|
||||
|
||||
Set `enforce_user_param` to true, to require all calls to the openai endpoints to have the 'user' param.
|
||||
|
||||
[**See Code**](https://github.com/BerriAI/litellm/blob/4777921a31c4c70e4d87b927cb233b6a09cd8b51/litellm/proxy/auth/auth_checks.py#L72)
|
||||
|
||||
```yaml
|
||||
general_settings:
|
||||
enforce_user_param: True
|
||||
```
|
||||
|
||||
**Result**
|
||||
|
||||
<Image img={require('../../img/end_user_enforcement.png')}/>
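For completeness, a hedged sketch of a request that satisfies the enforcement above; the `user` field is part of the standard OpenAI chat-completions schema, and the base URL, key, and model are placeholders:

```python
import openai

# Placeholder base URL / key / model - adjust to your proxy setup
client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")

response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hello!"}],
    user="end-user-42",  # required once enforce_user_param is True
)
print(response.choices[0].message.content)
```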
|
|
@ -62,7 +62,6 @@ model_list:
|
|||
|
||||
litellm_settings: # module level litellm settings - https://github.com/BerriAI/litellm/blob/main/litellm/__init__.py
|
||||
drop_params: True
|
||||
set_verbose: True
|
||||
|
||||
general_settings:
|
||||
master_key: sk-1234 # [OPTIONAL] Only use this if you to require all calls to contain this key (Authorization: Bearer sk-1234)
|
||||
|
@ -558,6 +557,16 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
|
|||
}'
|
||||
```
|
||||
|
||||
## Disable Swagger UI
|
||||
|
||||
To disable the Swagger docs from the base url, set
|
||||
|
||||
```env
|
||||
NO_DOCS="True"
|
||||
```
|
||||
|
||||
in your environment, and restart the proxy.
|
||||
|
||||
|
||||
## Configure DB Pool Limits + Connection Timeouts
|
||||
|
||||
|
@ -592,7 +601,9 @@ general_settings:
|
|||
"completion_model": "string",
|
||||
"disable_spend_logs": "boolean", # turn off writing each transaction to the db
|
||||
"disable_reset_budget": "boolean", # turn off reset budget scheduled task
|
||||
"enable_jwt_auth": "boolean", # allow proxy admin to auth in via jwt tokens with 'litellm_proxy_admin' in claims
|
||||
"enable_jwt_auth": "boolean", # allow proxy admin to auth in via jwt tokens with 'litellm_proxy_admin' in claims
|
||||
"enforce_user_param": "boolean", # requires all openai endpoint requests to have a 'user' param
|
||||
"allowed_routes": "list", # list of allowed proxy API routes - a user can access. (currently JWT-Auth only)
|
||||
"key_management_system": "google_kms", # either google_kms or azure_kms
|
||||
"master_key": "string",
|
||||
"database_url": "string",
|
||||
|
|
|
@ -103,7 +103,10 @@ RUN chmod +x entrypoint.sh
|
|||
EXPOSE 4000/tcp
|
||||
|
||||
# Override the CMD instruction with your desired command and arguments
|
||||
CMD ["--port", "4000", "--config", "config.yaml", "--detailed_debug", "--run_gunicorn"]
|
||||
# WARNING: FOR PROD DO NOT USE `--detailed_debug` it slows down response times, instead use the following CMD
|
||||
# CMD ["--port", "4000", "--config", "config.yaml"]
|
||||
|
||||
CMD ["--port", "4000", "--config", "config.yaml", "--detailed_debug"]
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
|
@ -232,7 +235,6 @@ Your OpenAI proxy server is now running on `http://127.0.0.1:4000`.
|
|||
| [LiteLLM container + Redis](#litellm-container--redis) | + load balance across multiple litellm containers |
|
||||
| [LiteLLM Database container + PostgresDB + Redis](#litellm-database-container--postgresdb--redis) | + use Virtual Keys + Track Spend + load balance across multiple litellm containers |
|
||||
|
||||
|
||||
## Deploy with Database
|
||||
### Docker, Kubernetes, Helm Chart
|
||||
|
||||
|
@ -474,25 +476,6 @@ docker run --name litellm-proxy \
|
|||
ghcr.io/berriai/litellm-database:main-latest --config your_config.yaml
|
||||
```
|
||||
|
||||
## Best Practices for Deploying to Production
|
||||
### 1. Switch of debug logs in production
|
||||
don't use [`--detailed-debug`, `--debug`](https://docs.litellm.ai/docs/proxy/debugging#detailed-debug) or `litellm.set_verbose=True`. We found using debug logs can add 5-10% latency per LLM API call
|
||||
|
||||
### 2. Use `run_gunicorn` and `num_workers`
|
||||
|
||||
Example setting `--run_gunicorn` and `--num_workers`
|
||||
```shell
|
||||
docker run ghcr.io/berriai/litellm-database:main-latest --run_gunicorn --num_workers 4
|
||||
```
|
||||
|
||||
Why `Gunicorn`?
|
||||
- Gunicorn takes care of running multiple instances of your web application
|
||||
- Gunicorn is ideal for running litellm proxy on cluster of machines with Kubernetes
|
||||
|
||||
Why `num_workers`?
|
||||
Setting `num_workers` to the number of CPUs available ensures optimal utilization of system resources by matching the number of worker processes to the available CPU cores.
|
||||
|
||||
|
||||
## Advanced Deployment Settings
|
||||
|
||||
### Customization of the server root path
|
||||
|
@ -525,6 +508,57 @@ Provide an ssl certificate when starting litellm proxy server
|
|||
## Platform-specific Guide
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="AWS EKS" label="AWS EKS - Kubernetes">
|
||||
|
||||
### Kubernetes - Deploy on EKS
|
||||
|
||||
Step 1. Create an EKS Cluster with the following spec
|
||||
|
||||
```shell
|
||||
eksctl create cluster --name=litellm-cluster --region=us-west-2 --node-type=t2.small
|
||||
```
|
||||
|
||||
Step 2. Mount the litellm proxy config on the Kubernetes cluster
|
||||
|
||||
This mounts your local `proxy_config.yaml` file on the Kubernetes cluster
|
||||
|
||||
```shell
|
||||
kubectl create configmap litellm-config --from-file=proxy_config.yaml
|
||||
```
|
||||
|
||||
Step 3. Apply `kub.yaml` and `service.yaml`
|
||||
Clone the following `kub.yaml` and `service.yaml` files and apply locally
|
||||
|
||||
- Use this `kub.yaml` file - [litellm kub.yaml](https://github.com/BerriAI/litellm/blob/main/deploy/kubernetes/kub.yaml)
|
||||
|
||||
- Use this `service.yaml` file - [litellm service.yaml](https://github.com/BerriAI/litellm/blob/main/deploy/kubernetes/service.yaml)
|
||||
|
||||
Apply `kub.yaml`
|
||||
```
|
||||
kubectl apply -f kub.yaml
|
||||
```
|
||||
|
||||
Apply `service.yaml` - creates an AWS load balancer to expose the proxy
|
||||
```
|
||||
kubectl apply -f service.yaml
|
||||
|
||||
# service/litellm-service created
|
||||
```
|
||||
|
||||
Step 4. Get Proxy Base URL
|
||||
|
||||
```shell
|
||||
kubectl get services
|
||||
|
||||
# litellm-service LoadBalancer 10.100.6.31 a472dc7c273fd47fd******.us-west-2.elb.amazonaws.com 4000:30374/TCP 63m
|
||||
```
|
||||
|
||||
Proxy Base URL = `a472dc7c273fd47fd******.us-west-2.elb.amazonaws.com:4000`
|
||||
|
||||
That's it, now you can start using LiteLLM Proxy
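For example, a hedged smoke test against the load balancer from Step 4 (substitute your ELB hostname, master key, and a model from your `proxy_config.yaml`):

```python
import openai

# Placeholder values - use the ELB hostname from `kubectl get services`,
# your LITELLM_MASTER_KEY, and a model defined in proxy_config.yaml
client = openai.OpenAI(
    api_key="sk-1234",
    base_url="http://<your-elb-hostname>:4000",
)

response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "ping"}],
)
print(response.choices[0].message.content)
```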
|
||||
|
||||
</TabItem>
|
||||
|
||||
|
||||
<TabItem value="aws-stack" label="AWS Cloud Formation Stack">
|
||||
|
||||
|
|
|
@ -12,9 +12,9 @@ Features here are behind a commercial license in our `/enterprise` folder. [**Se
|
|||
:::
|
||||
|
||||
Features:
|
||||
- ✅ Content Moderation with LLM Guard
|
||||
- ✅ Content Moderation with LlamaGuard
|
||||
- ✅ Content Moderation with Google Text Moderations
|
||||
- ✅ Content Moderation with LLM Guard
|
||||
- ✅ Reject calls from Blocked User list
|
||||
- ✅ Reject calls (incoming / outgoing) with Banned Keywords (e.g. competitors)
|
||||
- ✅ Don't log/store specific requests (eg confidential LLM requests)
|
||||
|
@ -23,6 +23,71 @@ Features:
|
|||
|
||||
|
||||
## Content Moderation
|
||||
### Content Moderation with LLM Guard
|
||||
|
||||
Set the LLM Guard API Base in your environment
|
||||
|
||||
```env
|
||||
LLM_GUARD_API_BASE = "http://0.0.0.0:8192" # deployed llm guard api
|
||||
```
|
||||
|
||||
Add `llmguard_moderations` as a callback
|
||||
|
||||
```yaml
|
||||
litellm_settings:
|
||||
callbacks: ["llmguard_moderations"]
|
||||
```
|
||||
|
||||
Now you can easily test it
|
||||
|
||||
- Make a regular /chat/completion call
|
||||
|
||||
- Check your proxy logs for any statement with `LLM Guard:`
|
||||
|
||||
Expected results:
|
||||
|
||||
```
|
||||
LLM Guard: Received response - {"sanitized_prompt": "hello world", "is_valid": true, "scanners": { "Regex": 0.0 }}
|
||||
```
|
||||
#### Turn on/off per key
|
||||
|
||||
**1. Update config**
|
||||
```yaml
|
||||
litellm_settings:
|
||||
callbacks: ["llmguard_moderations"]
|
||||
llm_guard_mode: "key-specific"
|
||||
```
|
||||
|
||||
**2. Create new key**
|
||||
|
||||
```bash
|
||||
curl --location 'http://localhost:4000/key/generate' \
|
||||
--header 'Authorization: Bearer sk-1234' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data '{
|
||||
"models": ["fake-openai-endpoint"],
|
||||
"permissions": {
|
||||
"enable_llm_guard_check": true # 👈 KEY CHANGE
|
||||
}
|
||||
}'
|
||||
|
||||
# Returns {..'key': 'my-new-key'}
|
||||
```
|
||||
|
||||
**3. Test it!**
|
||||
|
||||
```bash
|
||||
curl --location 'http://0.0.0.0:4000/v1/chat/completions' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--header 'Authorization: Bearer my-new-key' \ # 👈 TEST KEY
|
||||
--data '{"model": "fake-openai-endpoint", "messages": [
|
||||
{"role": "system", "content": "Be helpful"},
|
||||
{"role": "user", "content": "What do you know?"}
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
|
||||
### Content Moderation with LlamaGuard
|
||||
|
||||
Currently works with Sagemaker's LlamaGuard endpoint.
|
||||
|
@ -55,32 +120,7 @@ callbacks: ["llamaguard_moderations"]
|
|||
llamaguard_unsafe_content_categories: /path/to/llamaguard_prompt.txt
|
||||
```
|
||||
|
||||
### Content Moderation with LLM Guard
|
||||
|
||||
Set the LLM Guard API Base in your environment
|
||||
|
||||
```env
|
||||
LLM_GUARD_API_BASE = "http://0.0.0.0:4000"
|
||||
```
|
||||
|
||||
Add `llmguard_moderations` as a callback
|
||||
|
||||
```yaml
|
||||
litellm_settings:
|
||||
callbacks: ["llmguard_moderations"]
|
||||
```
|
||||
|
||||
Now you can easily test it
|
||||
|
||||
- Make a regular /chat/completion call
|
||||
|
||||
- Check your proxy logs for any statement with `LLM Guard:`
|
||||
|
||||
Expected results:
|
||||
|
||||
```
|
||||
LLM Guard: Received response - {"sanitized_prompt": "hello world", "is_valid": true, "scanners": { "Regex": 0.0 }}
|
||||
```
|
||||
|
||||
### Content Moderation with Google Text Moderation
|
||||
|
||||
|
|
docs/my-website/docs/proxy/grafana_metrics.md (new file, 53 lines)
|
@ -0,0 +1,53 @@
|
|||
# Grafana, Prometheus metrics [BETA]
|
||||
|
||||
LiteLLM exposes a `/metrics` endpoint for Prometheus to poll
|
||||
|
||||
## Quick Start
|
||||
|
||||
If you're using the LiteLLM CLI with `litellm --config proxy_config.yaml` then you need to `pip install prometheus_client==0.20.0`. **This is already pre-installed on the litellm Docker image**
|
||||
|
||||
Add this to your proxy config.yaml
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: gpt-3.5-turbo
|
||||
litellm_params:
|
||||
model: gpt-3.5-turbo
|
||||
litellm_settings:
|
||||
success_callback: ["prometheus"]
|
||||
```
|
||||
|
||||
Start the proxy
|
||||
```shell
|
||||
litellm --config config.yaml --debug
|
||||
```
|
||||
|
||||
Test Request
|
||||
```shell
|
||||
curl --location 'http://0.0.0.0:4000/chat/completions' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data '{
|
||||
"model": "gpt-3.5-turbo",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "what llm are you"
|
||||
}
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
View Metrics on `/metrics`, Visit `http://localhost:4000/metrics`
|
||||
```shell
|
||||
http://localhost:4000/metrics
|
||||
|
||||
# <proxy_base_url>/metrics
|
||||
```
|
||||
|
||||
## Metrics Tracked
|
||||
|
||||
|
||||
| Metric Name | Description |
|
||||
|----------------------|--------------------------------------|
|
||||
| `litellm_requests_metric` | Number of requests made, per `"user", "key", "model"` |
|
||||
| `litellm_spend_metric` | Total Spend, per `"user", "key", "model"` |
|
||||
| `litellm_total_tokens` | input + output tokens per `"user", "key", "model"` |
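A quick, hedged way to confirm these series are being exported is to fetch `/metrics` and filter for the `litellm_` prefix (the proxy URL is a placeholder):

```python
import requests

# Placeholder proxy base URL - adjust to wherever your proxy is running
resp = requests.get("http://localhost:4000/metrics", timeout=10)
resp.raise_for_status()

# Print only the litellm_* series from the Prometheus text output
for line in resp.text.splitlines():
    if line.startswith("litellm_"):
        print(line)
```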
|
docs/my-website/docs/proxy/prod.md (new file, 249 lines)
|
@ -0,0 +1,249 @@
|
|||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
# ⚡ Best Practices for Production
|
||||
|
||||
Expected Performance in Production
|
||||
|
||||
1 LiteLLM Uvicorn Worker on Kubernetes
|
||||
|
||||
| Description | Value |
|
||||
|--------------|-------|
|
||||
| Avg latency | `50ms` |
|
||||
| Median latency | `51ms` |
|
||||
| `/chat/completions` Requests/second | `35` |
|
||||
| `/chat/completions` Requests/minute | `2100` |
|
||||
| `/chat/completions` Requests/hour | `126K` |
|
||||
|
||||
|
||||
## 1. Switch off Debug Logging
|
||||
|
||||
Remove `set_verbose: True` from your config.yaml
|
||||
```yaml
|
||||
litellm_settings:
|
||||
set_verbose: True
|
||||
```
|
||||
|
||||
You should only see the following level of details in logs on the proxy server
|
||||
```shell
|
||||
# INFO: 192.168.2.205:11774 - "POST /chat/completions HTTP/1.1" 200 OK
|
||||
# INFO: 192.168.2.205:34717 - "POST /chat/completions HTTP/1.1" 200 OK
|
||||
# INFO: 192.168.2.205:29734 - "POST /chat/completions HTTP/1.1" 200 OK
|
||||
```
|
||||
|
||||
## 2. On Kubernetes - Use 1 Uvicorn worker [Suggested CMD]
|
||||
|
||||
Use this Docker `CMD`. This will start the proxy with 1 Uvicorn Async Worker
|
||||
|
||||
(Ensure that you're not setting `run_gunicorn` or `num_workers` in the CMD).
|
||||
```shell
|
||||
CMD ["--port", "4000", "--config", "./proxy_server_config.yaml"]
|
||||
```
|
||||
|
||||
## 3. Batch write spend updates every 60s
|
||||
|
||||
The default proxy batch write is 10s. This is to make it easy to see spend when debugging locally.
|
||||
|
||||
In production, we recommend using a longer interval period of 60s. This reduces the number of connections used to make DB writes.
|
||||
|
||||
```yaml
|
||||
general_settings:
|
||||
master_key: sk-1234
|
||||
proxy_batch_write_at: 60 # 👈 Frequency of batch writing logs to server (in seconds)
|
||||
```
|
||||
|
||||
|
||||
## 4. Move spend logs to a separate server
|
||||
|
||||
Writing each spend log to the db can slow down your proxy. In testing we saw a 70% improvement in median response time, by moving writing spend logs to a separate server.
|
||||
|
||||
👉 [LiteLLM Spend Logs Server](https://github.com/BerriAI/litellm/tree/main/litellm-js/spend-logs)
|
||||
|
||||
|
||||
**Spend Logs**
|
||||
This is a log of the key, tokens, model, and latency for each call on the proxy.
|
||||
|
||||
[**Full Payload**](https://github.com/BerriAI/litellm/blob/8c9623a6bc4ad9da0a2dac64249a60ed8da719e8/litellm/proxy/utils.py#L1769)
|
||||
|
||||
|
||||
**1. Start the spend logs server**
|
||||
|
||||
```bash
|
||||
docker run -p 3000:3000 \
|
||||
-e DATABASE_URL="postgres://.." \
|
||||
ghcr.io/berriai/litellm-spend_logs:main-latest
|
||||
|
||||
# RUNNING on http://0.0.0.0:3000
|
||||
```
|
||||
|
||||
**2. Connect to proxy**
|
||||
|
||||
|
||||
Example litellm_config.yaml
|
||||
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: fake-openai-endpoint
|
||||
litellm_params:
|
||||
model: openai/my-fake-model
|
||||
api_key: my-fake-key
|
||||
api_base: https://exampleopenaiendpoint-production.up.railway.app/
|
||||
|
||||
general_settings:
|
||||
master_key: sk-1234
|
||||
proxy_batch_write_at: 5 # 👈 Frequency of batch writing logs to server (in seconds)
|
||||
```
|
||||
|
||||
Add `SPEND_LOGS_URL` as an environment variable when starting the proxy
|
||||
|
||||
```bash
|
||||
docker run \
|
||||
-v $(pwd)/litellm_config.yaml:/app/config.yaml \
|
||||
-e DATABASE_URL="postgresql://.." \
|
||||
-e SPEND_LOGS_URL="http://host.docker.internal:3000" \ # 👈 KEY CHANGE
|
||||
-p 4000:4000 \
|
||||
ghcr.io/berriai/litellm:main-latest \
|
||||
--config /app/config.yaml --detailed_debug
|
||||
|
||||
# Running on http://0.0.0.0:4000
|
||||
```
|
||||
|
||||
**3. Test Proxy!**
|
||||
|
||||
|
||||
```bash
|
||||
curl --location 'http://0.0.0.0:4000/v1/chat/completions' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--header 'Authorization: Bearer sk-1234' \
|
||||
--data '{
|
||||
"model": "fake-openai-endpoint",
|
||||
"messages": [
|
||||
{"role": "system", "content": "Be helpful"},
|
||||
{"role": "user", "content": "What do you know?"}
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
In your LiteLLM Spend Logs Server, you should see
|
||||
|
||||
**Expected Response**
|
||||
|
||||
```
|
||||
Received and stored 1 logs. Total logs in memory: 1
|
||||
...
|
||||
Flushed 1 log to the DB.
|
||||
```
|
||||
|
||||
|
||||
### Machine Specification
|
||||
|
||||
A t2.micro should be sufficient to handle 1k logs / minute on this server.
|
||||
|
||||
This consumes at max 120MB, and <0.1 vCPU.
|
||||
|
||||
## 5. Switch off spend logs and resetting budgets
|
||||
|
||||
Add this to your config.yaml. (Only spend per Key, User and Team will be tracked - spend per API Call will not be written to the LiteLLM Database)
|
||||
```yaml
|
||||
general_settings:
|
||||
disable_spend_logs: true
|
||||
disable_reset_budget: true
|
||||
```
|
||||
|
||||
## 6. Switch off `litellm.telemetry`
|
||||
|
||||
Switch off all telemetry tracking done by litellm
|
||||
|
||||
```yaml
|
||||
litellm_settings:
|
||||
telemetry: False
|
||||
```
|
||||
|
||||
## Machine Specifications to Deploy LiteLLM
|
||||
|
||||
| Service | Spec | CPUs | Memory | Architecture | Version|
|
||||
| --- | --- | --- | --- | --- | --- |
|
||||
| Server | `t2.small` | `1 vCPU` | `8GB` | `x86` | - |
|
||||
| Redis Cache | - | - | - | - | 7.0+ Redis Engine|
|
||||
|
||||
|
||||
## Reference Kubernetes Deployment YAML
|
||||
|
||||
Reference Kubernetes `deployment.yaml` that was load tested by us
|
||||
|
||||
```yaml
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: litellm-deployment
|
||||
spec:
|
||||
replicas: 3
|
||||
selector:
|
||||
matchLabels:
|
||||
app: litellm
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: litellm
|
||||
spec:
|
||||
containers:
|
||||
- name: litellm-container
|
||||
image: ghcr.io/berriai/litellm:main-latest
|
||||
imagePullPolicy: Always
|
||||
env:
|
||||
- name: AZURE_API_KEY
|
||||
value: "d6******"
|
||||
- name: AZURE_API_BASE
|
||||
value: "https://ope******"
|
||||
- name: LITELLM_MASTER_KEY
|
||||
value: "sk-1234"
|
||||
- name: DATABASE_URL
|
||||
value: "po**********"
|
||||
args:
|
||||
- "--config"
|
||||
- "/app/proxy_config.yaml" # Update the path to mount the config file
|
||||
volumeMounts: # Define volume mount for proxy_config.yaml
|
||||
- name: config-volume
|
||||
mountPath: /app
|
||||
readOnly: true
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /health/liveliness
|
||||
port: 4000
|
||||
initialDelaySeconds: 120
|
||||
periodSeconds: 15
|
||||
successThreshold: 1
|
||||
failureThreshold: 3
|
||||
timeoutSeconds: 10
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /health/readiness
|
||||
port: 4000
|
||||
initialDelaySeconds: 120
|
||||
periodSeconds: 15
|
||||
successThreshold: 1
|
||||
failureThreshold: 3
|
||||
timeoutSeconds: 10
|
||||
volumes: # Define volume to mount proxy_config.yaml
|
||||
- name: config-volume
|
||||
configMap:
|
||||
name: litellm-config
|
||||
|
||||
```
|
||||
|
||||
|
||||
Reference Kubernetes `service.yaml` that was load tested by us
|
||||
```yaml
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: litellm-service
|
||||
spec:
|
||||
selector:
|
||||
app: litellm
|
||||
ports:
|
||||
- protocol: TCP
|
||||
port: 4000
|
||||
targetPort: 4000
|
||||
type: LoadBalancer
|
||||
```
|
|
@ -2,9 +2,9 @@
|
|||
|
||||
LiteLLM supports similarity checking against a pre-generated list of prompt injection attacks, to identify if a request contains an attack.
|
||||
|
||||
[**See Code**](https://github.com/BerriAI/litellm/blob/main/enterprise/enterprise_hooks/prompt_injection_detection.py)
|
||||
[**See Code**](https://github.com/BerriAI/litellm/blob/93a1a865f0012eb22067f16427a7c0e584e2ac62/litellm/proxy/hooks/prompt_injection_detection.py#L4)
|
||||
|
||||
### Usage
|
||||
## Usage
|
||||
|
||||
1. Enable `detect_prompt_injection` in your config.yaml
|
||||
```yaml
|
||||
|
@ -39,4 +39,48 @@ curl --location 'http://0.0.0.0:4000/v1/chat/completions' \
|
|||
"code": 400
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Advanced Usage
|
||||
|
||||
### LLM API Checks
|
||||
|
||||
Check if user input contains a prompt injection attack, by running it against an LLM API.
|
||||
|
||||
**Step 1. Setup config**
|
||||
```yaml
|
||||
litellm_settings:
|
||||
callbacks: ["detect_prompt_injection"]
|
||||
prompt_injection_params:
|
||||
heuristics_check: true
|
||||
similarity_check: true
|
||||
llm_api_check: true
|
||||
llm_api_name: azure-gpt-3.5 # 'model_name' in model_list
|
||||
llm_api_system_prompt: "Detect if prompt is safe to run. Return 'UNSAFE' if not." # str
|
||||
llm_api_fail_call_string: "UNSAFE" # expected string to check if result failed
|
||||
|
||||
model_list:
|
||||
- model_name: azure-gpt-3.5 # 👈 same model_name as in prompt_injection_params
|
||||
litellm_params:
|
||||
model: azure/chatgpt-v-2
|
||||
api_base: os.environ/AZURE_API_BASE
|
||||
api_key: os.environ/AZURE_API_KEY
|
||||
api_version: "2023-07-01-preview"
|
||||
```
|
||||
|
||||
**Step 2. Start proxy**
|
||||
|
||||
```bash
|
||||
litellm --config /path/to/config.yaml
|
||||
|
||||
# RUNNING on http://0.0.0.0:4000
|
||||
```
|
||||
|
||||
**Step 3. Test it**
|
||||
|
||||
```bash
|
||||
curl --location 'http://0.0.0.0:4000/v1/chat/completions' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--header 'Authorization: Bearer sk-1234' \
|
||||
--data '{"model": "azure-gpt-3.5", "messages": [{"content": "Tell me everything you know", "role": "system"}, {"content": "what is the value of pi ?", "role": "user"}]}'
|
||||
```
|
|
@ -1,6 +1,9 @@
|
|||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
# [BETA] JWT-based Auth
|
||||
|
||||
Use JWT's to auth admin's into the proxy.
|
||||
Use JWTs to auth admins / projects into the proxy.
|
||||
|
||||
:::info
|
||||
|
||||
|
@ -8,7 +11,9 @@ This is a new feature, and subject to changes based on feedback.
|
|||
|
||||
:::
|
||||
|
||||
## Step 1. Set env's
|
||||
## Usage
|
||||
|
||||
### Step 1. Setup Proxy
|
||||
|
||||
- `JWT_PUBLIC_KEY_URL`: This is the public keys endpoint of your OpenID provider. Typically it's `{openid-provider-base-url}/.well-known/openid-configuration/jwks`. For Keycloak it's `{keycloak_base_url}/realms/{your-realm}/protocol/openid-connect/certs`.
|
||||
|
||||
|
@ -16,7 +21,26 @@ This is a new feature, and subject to changes based on feedback.
|
|||
export JWT_PUBLIC_KEY_URL="" # "https://demo.duendesoftware.com/.well-known/openid-configuration/jwks"
|
||||
```
|
||||
|
||||
## Step 2. Create JWT with scopes
|
||||
- Set `enable_jwt_auth` in your config. This tells the proxy to check whether a token is a JWT.
|
||||
|
||||
```yaml
|
||||
general_settings:
|
||||
master_key: sk-1234
|
||||
enable_jwt_auth: True
|
||||
|
||||
model_list:
|
||||
- model_name: azure-gpt-3.5
|
||||
litellm_params:
|
||||
model: azure/<your-deployment-name>
|
||||
api_base: os.environ/AZURE_API_BASE
|
||||
api_key: os.environ/AZURE_API_KEY
|
||||
api_version: "2023-07-01-preview"
|
||||
```
|
||||
|
||||
### Step 2. Create JWT with scopes
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="admin" label="admin">
|
||||
|
||||
Create a client scope called `litellm_proxy_admin` in your OpenID provider (e.g. Keycloak).
|
||||
|
||||
|
@ -32,12 +56,159 @@ curl --location ' 'https://demo.duendesoftware.com/connect/token'' \
|
|||
--data-urlencode 'grant_type=password' \
|
||||
--data-urlencode 'scope=litellm_proxy_admin' # 👈 grant this scope
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="project" label="project">
|
||||
|
||||
## Step 3. Create a proxy key with JWT
|
||||
Create a JWT for your project on your OpenID provider (e.g. Keycloak).
|
||||
|
||||
```bash
|
||||
curl --location 'https://demo.duendesoftware.com/connect/token' \
|
||||
--header 'Content-Type: application/x-www-form-urlencoded' \
|
||||
--data-urlencode 'client_id={CLIENT_ID}' \ # 👈 project id
|
||||
--data-urlencode 'client_secret={CLIENT_SECRET}' \
|
||||
--data-urlencode 'grant_type=client_credentials'
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
### Step 3. Test your JWT
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="key" label="/key/generate">
|
||||
|
||||
```bash
|
||||
curl --location '{proxy_base_url}/key/generate' \
|
||||
--header 'Authorization: Bearer eyJhbGciOiJSUzI1NiI...' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data '{}'
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="llm_call" label="/chat/completions">
|
||||
|
||||
```bash
|
||||
curl --location 'http://0.0.0.0:4000/v1/chat/completions' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--header 'Authorization: Bearer eyJhbGciOiJSUzI1...' \
|
||||
--data '{"model": "azure-gpt-3.5", "messages": [ { "role": "user", "content": "What's the weather like in Boston today?" } ]}'
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
## Advanced - Set Accepted JWT Scope Names
|
||||
|
||||
Change the string in the JWT 'scope' claim that litellm evaluates to decide whether a user has admin access.
|
||||
|
||||
```yaml
|
||||
general_settings:
|
||||
master_key: sk-1234
|
||||
enable_jwt_auth: True
|
||||
litellm_jwtauth:
|
||||
admin_jwt_scope: "litellm-proxy-admin"
|
||||
```
|
||||
### JWT Scopes
|
||||
|
||||
Here's what scopes on JWT-Auth tokens look like
|
||||
|
||||
**Can be a list**
|
||||
```
|
||||
scope: ["litellm-proxy-admin",...]
|
||||
```
|
||||
|
||||
**Can be a space-separated string**
|
||||
```
|
||||
scope: "litellm-proxy-admin ..."
|
||||
```
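To check which form your OpenID provider issues, a hedged sketch that decodes a token locally with PyJWT and prints the claims litellm reads; signature verification is skipped because this is inspection only, not authentication:

```python
import jwt  # pip install pyjwt

token = "eyJhbGciOiJSUzI1..."  # placeholder - paste a token from your OpenID provider

# Decode WITHOUT verifying the signature - for local inspection only
claims = jwt.decode(token, options={"verify_signature": False})

print("scope:", claims.get("scope"))          # list or space-separated string, as shown above
print("client_id:", claims.get("client_id"))  # read as the team id by default (see 'Custom JWT Field' below)
```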
|
||||
|
||||
## Advanced - Allowed Routes
|
||||
|
||||
Configure which routes a JWT can access via the config.
|
||||
|
||||
By default:
|
||||
|
||||
- Admins: can access only management routes (`/team/*`, `/key/*`, `/user/*`)
|
||||
- Teams: can access only openai routes (`/chat/completions`, etc.) + info routes (`/*/info`)
|
||||
|
||||
[**See Code**](https://github.com/BerriAI/litellm/blob/b204f0c01c703317d812a1553363ab0cb989d5b6/litellm/proxy/_types.py#L95)
|
||||
|
||||
**Admin Routes**
|
||||
```yaml
|
||||
general_settings:
|
||||
master_key: sk-1234
|
||||
enable_jwt_auth: True
|
||||
litellm_jwtauth:
|
||||
admin_jwt_scope: "litellm-proxy-admin"
|
||||
admin_allowed_routes: ["/v1/embeddings"]
|
||||
```
|
||||
|
||||
**Team Routes**
|
||||
```yaml
|
||||
general_settings:
|
||||
master_key: sk-1234
|
||||
enable_jwt_auth: True
|
||||
litellm_jwtauth:
|
||||
...
|
||||
team_jwt_scope: "litellm-team" # 👈 Set JWT Scope string
|
||||
team_allowed_routes: ["/v1/chat/completions"] # 👈 Set accepted routes
|
||||
```
|
||||
|
||||
## Advanced - Caching Public Keys
|
||||
|
||||
Control how long public keys are cached for (in seconds).
|
||||
|
||||
```yaml
|
||||
general_settings:
|
||||
master_key: sk-1234
|
||||
enable_jwt_auth: True
|
||||
litellm_jwtauth:
|
||||
admin_jwt_scope: "litellm-proxy-admin"
|
||||
admin_allowed_routes: ["/v1/embeddings"]
|
||||
public_key_ttl: 600 # 👈 KEY CHANGE
|
||||
```
|
||||
|
||||
## Advanced - Custom JWT Field
|
||||
|
||||
Set a custom field in which the team_id exists. By default, the 'client_id' field is checked.
|
||||
|
||||
```yaml
|
||||
general_settings:
|
||||
master_key: sk-1234
|
||||
enable_jwt_auth: True
|
||||
litellm_jwtauth:
|
||||
team_id_jwt_field: "client_id" # 👈 KEY CHANGE
|
||||
```
|
||||
|
||||
## All Params
|
||||
|
||||
[**See Code**](https://github.com/BerriAI/litellm/blob/b204f0c01c703317d812a1553363ab0cb989d5b6/litellm/proxy/_types.py#L95)
|
||||
|
||||
|
||||
|
||||
|
||||
## Advanced - Block Teams
|
||||
|
||||
To block all requests for a certain team id, use `/team/block`
|
||||
|
||||
**Block Team**
|
||||
|
||||
```bash
|
||||
curl --location 'http://0.0.0.0:4000/team/block' \
|
||||
--header 'Authorization: Bearer <admin-token>' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data '{
|
||||
"team_id": "litellm-test-client-id-new" # 👈 set team id
|
||||
}'
|
||||
```
|
||||
|
||||
**Unblock Team**
|
||||
|
||||
```bash
|
||||
curl --location 'http://0.0.0.0:4000/team/unblock' \
|
||||
--header 'Authorization: Bearer <admin-token>' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data '{
|
||||
"team_id": "litellm-test-client-id-new" # 👈 set team id
|
||||
}'
|
||||
```
|
||||
|
||||
|
|
|
@ -47,8 +47,9 @@ Your Proxy Swagger is available on the root of the Proxy: e.g.: `http://localhos
|
|||
Set the following in your .env on the Proxy
|
||||
|
||||
```shell
|
||||
UI_USERNAME=ishaan-litellm
|
||||
UI_PASSWORD=langchain
|
||||
LITELLM_MASTER_KEY="sk-1234" # this is your master key for using the proxy server
|
||||
UI_USERNAME=ishaan-litellm # username to sign in on UI
|
||||
UI_PASSWORD=langchain # password to sign in on UI
|
||||
```
|
||||
|
||||
On accessing the LiteLLM UI, you will be prompted to enter your username, password
|
||||
|
|
|
@ -1,14 +1,14 @@
|
|||
# 🔑 Virtual Keys, Users
|
||||
Track Spend, Set budgets and create virtual keys for the proxy
|
||||
|
||||
Grant other's temporary access to your proxy, with keys that expire after a set duration.
|
||||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
# 🔑 Virtual Keys
|
||||
Track Spend, and control model access via virtual keys for the proxy
|
||||
|
||||
:::info
|
||||
|
||||
- 🔑 [UI to Generate, Edit, Delete Keys (with SSO)](https://docs.litellm.ai/docs/proxy/ui)
|
||||
- [Deploy LiteLLM Proxy with Key Management](https://docs.litellm.ai/docs/proxy/deploy#deploy-with-database)
|
||||
- Dockerfile.database for LiteLLM Proxy + Key Management [here](https://github.com/BerriAI/litellm/blob/main/Dockerfile.database)
|
||||
- [Dockerfile.database for LiteLLM Proxy + Key Management](https://github.com/BerriAI/litellm/blob/main/Dockerfile.database)
|
||||
|
||||
|
||||
:::
|
||||
|
@ -30,7 +30,7 @@ export DATABASE_URL=postgresql://<user>:<password>@<host>:<port>/<dbname>
|
|||
```
|
||||
|
||||
|
||||
You can then generate temporary keys by hitting the `/key/generate` endpoint.
|
||||
You can then generate keys by hitting the `/key/generate` endpoint.
|
||||
|
||||
[**See code**](https://github.com/BerriAI/litellm/blob/7a669a36d2689c7f7890bc9c93e04ff3c2641299/litellm/proxy/proxy_server.py#L672)
|
||||
|
||||
|
@ -46,8 +46,8 @@ model_list:
|
|||
model: ollama/llama2
|
||||
|
||||
general_settings:
|
||||
master_key: sk-1234 # [OPTIONAL] if set all calls to proxy will require either this key or a valid generated token
|
||||
database_url: "postgresql://<user>:<password>@<host>:<port>/<dbname>"
|
||||
master_key: sk-1234
|
||||
database_url: "postgresql://<user>:<password>@<host>:<port>/<dbname>" # 👈 KEY CHANGE
|
||||
```
|
||||
|
||||
**Step 2: Start litellm**
|
||||
|
@ -56,62 +56,220 @@ general_settings:
|
|||
litellm --config /path/to/config.yaml
|
||||
```
|
||||
|
||||
**Step 3: Generate temporary keys**
|
||||
**Step 3: Generate keys**
|
||||
|
||||
```shell
|
||||
curl 'http://0.0.0.0:4000/key/generate' \
|
||||
--header 'Authorization: Bearer <your-master-key>' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data-raw '{"models": ["gpt-3.5-turbo", "gpt-4", "claude-2"], "duration": "20m","metadata": {"user": "ishaan@berri.ai"}}'
|
||||
--data-raw '{"models": ["gpt-3.5-turbo", "gpt-4"], "metadata": {"user": "ishaan@berri.ai"}}'
|
||||
```
|
||||
|
||||
## Advanced - Spend Tracking
|
||||
|
||||
## /key/generate
|
||||
Get spend per:
|
||||
- key - via `/key/info` [Swagger](https://litellm-api.up.railway.app/#/key%20management/info_key_fn_key_info_get)
|
||||
- user - via `/user/info` [Swagger](https://litellm-api.up.railway.app/#/user%20management/user_info_user_info_get)
|
||||
- team - via `/team/info` [Swagger](https://litellm-api.up.railway.app/#/team%20management/team_info_team_info_get)
|
||||
- ⏳ end-users - via `/end_user/info` - [Comment on this issue for end-user cost tracking](https://github.com/BerriAI/litellm/issues/2633)
|
||||
|
||||
### Request
|
||||
```shell
|
||||
curl 'http://0.0.0.0:4000/key/generate' \
|
||||
--header 'Authorization: Bearer <your-master-key>' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data-raw '{
|
||||
"models": ["gpt-3.5-turbo", "gpt-4", "claude-2"],
|
||||
"duration": "20m",
|
||||
"metadata": {"user": "ishaan@berri.ai"},
|
||||
"team_id": "core-infra",
|
||||
"max_budget": 10,
|
||||
"soft_budget": 5,
|
||||
}'
|
||||
**How is it calculated?**
|
||||
|
||||
The cost per model is stored [here](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json) and calculated by the [`completion_cost`](https://github.com/BerriAI/litellm/blob/db7974f9f216ee50b53c53120d1e3fc064173b60/litellm/utils.py#L3771) function.
|
||||
|
||||
**How is it tracked?**
|
||||
|
||||
Spend is automatically tracked for the key in the "LiteLLM_VerificationTokenTable". If the key has an attached 'user_id' or 'team_id', the spend for that user is tracked in the "LiteLLM_UserTable", and team in the "LiteLLM_TeamTable".
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="key-info" label="Key Spend">
|
||||
|
||||
You can get spend for a key by using the `/key/info` endpoint.
|
||||
|
||||
```bash
|
||||
curl 'http://0.0.0.0:4000/key/info?key=<user-key>' \
|
||||
-X GET \
|
||||
-H 'Authorization: Bearer <your-master-key>'
|
||||
```
|
||||
|
||||
This is automatically updated (in USD) when calls are made to /completions, /chat/completions, /embeddings using litellm's completion_cost() function. [**See Code**](https://github.com/BerriAI/litellm/blob/1a6ea20a0bb66491968907c2bfaabb7fe45fc064/litellm/utils.py#L1654).
|
||||
|
||||
Request Params:
|
||||
|
||||
- `duration`: *Optional[str]* - Specify the length of time the token is valid for. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
|
||||
- `key_alias`: *Optional[str]* - User defined key alias
|
||||
- `team_id`: *Optional[str]* - The team id of the user
|
||||
- `models`: *Optional[list]* - Model_name's a user is allowed to call. (if empty, key is allowed to call all models)
|
||||
- `aliases`: *Optional[dict]* - Any alias mappings, on top of anything in the config.yaml model list. - https://docs.litellm.ai/docs/proxy/virtual_keys#managing-auth---upgradedowngrade-models
|
||||
- `config`: *Optional[dict]* - any key-specific configs, overrides config in config.yaml
|
||||
- `spend`: *Optional[int]* - Amount spent by key. Default is 0. Will be updated by proxy whenever key is used. https://docs.litellm.ai/docs/proxy/virtual_keys#managing-auth---tracking-spend
|
||||
- `max_budget`: *Optional[float]* - Specify max budget for a given key.
|
||||
- `soft_budget`: *Optional[float]* - Specify soft limit budget for a given key. Get Alerts when key hits its soft budget
|
||||
- `model_max_budget`: *Optional[dict[str, float]]* - Specify max budget for each model, `model_max_budget={"gpt4": 0.5, "gpt-5": 0.01}`
|
||||
- `max_parallel_requests`: *Optional[int]* - Rate limit a user based on the number of parallel requests. Raises 429 error, if user's parallel requests > x.
|
||||
- `metadata`: *Optional[dict]* - Metadata for key, store information for key. Example metadata = {"team": "core-infra", "app": "app2", "email": "ishaan@berri.ai" }
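Pulling a few of these parameters together, a hedged Python equivalent of the `/key/generate` call (the proxy URL, master key, and values are placeholders):

```python
import requests

# Placeholder proxy URL and master key
PROXY_BASE = "http://0.0.0.0:4000"
headers = {"Authorization": "Bearer sk-1234", "Content-Type": "application/json"}

payload = {
    "models": ["gpt-3.5-turbo", "gpt-4"],                            # models this key may call
    "duration": "30d",                                               # key expiry
    "max_budget": 10,                                                # hard budget (USD)
    "metadata": {"team": "core-infra", "email": "ishaan@berri.ai"},  # arbitrary key metadata
}

resp = requests.post(f"{PROXY_BASE}/key/generate", headers=headers, json=payload, timeout=30)
resp.raise_for_status()
print(resp.json()["key"])  # e.g. "sk-..."
```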
|
||||
|
||||
|
||||
### Response
|
||||
**Sample response**
|
||||
|
||||
```python
|
||||
{
|
||||
"key": "sk-kdEXbIqZRwEeEiHwdg7sFA", # Bearer token
|
||||
"expires": "2023-11-19T01:38:25.834000+00:00" # datetime object
|
||||
"key_name": "sk-...7sFA" # abbreviated key string, ONLY stored in db if `allow_user_auth: true` set - [see](./ui.md)
|
||||
...
|
||||
"key": "sk-tXL0wt5-lOOVK9sfY2UacA",
|
||||
"info": {
|
||||
"token": "sk-tXL0wt5-lOOVK9sfY2UacA",
|
||||
"spend": 0.0001065, # 👈 SPEND
|
||||
"expires": "2023-11-24T23:19:11.131000Z",
|
||||
"models": [
|
||||
"gpt-3.5-turbo",
|
||||
"gpt-4",
|
||||
"claude-2"
|
||||
],
|
||||
"aliases": {
|
||||
"mistral-7b": "gpt-3.5-turbo"
|
||||
},
|
||||
"config": {}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Upgrade/Downgrade Models
|
||||
</TabItem>
|
||||
<TabItem value="user-info" label="User Spend">
|
||||
|
||||
**1. Create a user**
|
||||
|
||||
```bash
|
||||
curl --location 'http://localhost:4000/user/new' \
|
||||
--header 'Authorization: Bearer <your-master-key>' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data-raw '{"user_email": "krrish@berri.ai"}'
|
||||
```
|
||||
|
||||
**Expected Response**
|
||||
|
||||
```bash
|
||||
{
|
||||
...
|
||||
"expires": "2023-12-22T09:53:13.861000Z",
|
||||
"user_id": "my-unique-id", # 👈 unique id
|
||||
"max_budget": 0.0
|
||||
}
|
||||
```
|
||||
|
||||
**2. Create a key for that user**
|
||||
|
||||
```bash
|
||||
curl 'http://0.0.0.0:4000/key/generate' \
|
||||
--header 'Authorization: Bearer <your-master-key>' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data-raw '{"models": ["gpt-3.5-turbo", "gpt-4"], "user_id": "my-unique-id"}'
|
||||
```
|
||||
|
||||
Returns a key - `sk-...`.
|
||||
|
||||
**3. See spend for user**
|
||||
|
||||
```bash
|
||||
curl 'http://0.0.0.0:4000/user/info?user_id=my-unique-id' \
|
||||
-X GET \
|
||||
-H 'Authorization: Bearer <your-master-key>'
|
||||
```
|
||||
|
||||
Expected Response
|
||||
|
||||
```bash
|
||||
{
|
||||
...
|
||||
"spend": 0 # 👈 SPEND
|
||||
}
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="team-info" label="Team Spend">
|
||||
|
||||
Use teams if you want keys to be owned by multiple people (e.g. for a production app).
|
||||
|
||||
**1. Create a team**
|
||||
|
||||
```bash
|
||||
curl --location 'http://localhost:4000/team/new' \
|
||||
--header 'Authorization: Bearer <your-master-key>' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data-raw '{"team_alias": "my-awesome-team"}'
|
||||
```
|
||||
|
||||
**Expected Response**
|
||||
|
||||
```bash
|
||||
{
|
||||
...
|
||||
"expires": "2023-12-22T09:53:13.861000Z",
|
||||
"team_id": "my-unique-id", # 👈 unique id
|
||||
"max_budget": 0.0
|
||||
}
|
||||
```
|
||||
|
||||
**2. Create a key for that team**
|
||||
|
||||
```bash
|
||||
curl 'http://0.0.0.0:4000/key/generate' \
|
||||
--header 'Authorization: Bearer <your-master-key>' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data-raw '{"models": ["gpt-3.5-turbo", "gpt-4"], "team_id": "my-unique-id"}'
|
||||
```
|
||||
|
||||
Returns a key - `sk-...`.
|
||||
|
||||
**3. See spend for team**
|
||||
|
||||
```bash
|
||||
curl 'http://0.0.0.0:4000/team/info?team_id=my-unique-id' \
|
||||
-X GET \
|
||||
-H 'Authorization: Bearer <your-master-key>'
|
||||
```
|
||||
|
||||
Expected Response
|
||||
|
||||
```bash
|
||||
{
|
||||
...
|
||||
"spend": 0 # 👈 SPEND
|
||||
}
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
## Advanced - Model Access
|
||||
|
||||
### Restrict models by `team_id`
|
||||
`litellm-dev` can only access `azure-gpt-3.5`
|
||||
|
||||
**1. Create a team via `/team/new`**
|
||||
```shell
|
||||
curl --location 'http://localhost:4000/team/new' \
|
||||
--header 'Authorization: Bearer <your-master-key>' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data-raw '{
|
||||
"team_alias": "litellm-dev",
|
||||
"models": ["azure-gpt-3.5"]
|
||||
}'
|
||||
|
||||
# returns {...,"team_id": "my-unique-id"}
|
||||
```
|
||||
|
||||
**2. Create a key for team**
|
||||
```shell
|
||||
curl --location 'http://localhost:4000/key/generate' \
|
||||
--header 'Authorization: Bearer sk-1234' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data-raw '{"team_id": "my-unique-id"}'
|
||||
```
|
||||
|
||||
**3. Test it**
|
||||
```shell
|
||||
curl --location 'http://0.0.0.0:4000/chat/completions' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--header 'Authorization: Bearer sk-qo992IjKOC2CHKZGRoJIGA' \
|
||||
--data '{
|
||||
"model": "BEDROCK_GROUP",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "hi"
|
||||
}
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
```shell
|
||||
{"error":{"message":"Invalid model for team litellm-dev: BEDROCK_GROUP. Valid models for team are: ['azure-gpt-3.5']\n\n\nTraceback (most recent call last):\n File \"/Users/ishaanjaffer/Github/litellm/litellm/proxy/proxy_server.py\", line 2298, in chat_completion\n _is_valid_team_configs(\n File \"/Users/ishaanjaffer/Github/litellm/litellm/proxy/utils.py\", line 1296, in _is_valid_team_configs\n raise Exception(\nException: Invalid model for team litellm-dev: BEDROCK_GROUP. Valid models for team are: ['azure-gpt-3.5']\n\n","type":"None","param":"None","code":500}}%
|
||||
```
|
||||
|
||||
### Model Aliases
|
||||
|
||||
If a user is expected to use a given model (e.g. gpt-3.5), and you want to:
|
||||
|
||||
|
@ -189,421 +347,9 @@ curl --location 'http://localhost:4000/key/generate' \
|
|||
"max_budget": 0,}'
|
||||
```
|
||||
|
||||
## Advanced - Custom Auth
|
||||
|
||||
## /key/info
|
||||
|
||||
### Request
|
||||
```shell
|
||||
curl -X GET "http://0.0.0.0:4000/key/info?key=sk-02Wr4IAlN3NvPXvL5JVvDA" \
|
||||
-H "Authorization: Bearer sk-1234"
|
||||
```
|
||||
|
||||
Request Params:
|
||||
- key: str - The key you want the info for
|
||||
|
||||
### Response
|
||||
|
||||
`token` is the hashed key (The DB stores the hashed key for security)
|
||||
```json
|
||||
{
|
||||
"key": "sk-02Wr4IAlN3NvPXvL5JVvDA",
|
||||
"info": {
|
||||
"token": "80321a12d03412c527f2bd9db5fabd746abead2e1d50b435a534432fbaca9ef5",
|
||||
"spend": 0.0,
|
||||
"expires": "2024-01-18T23:52:09.125000+00:00",
|
||||
"models": ["azure-gpt-3.5", "azure-embedding-model"],
|
||||
"aliases": {},
|
||||
"config": {},
|
||||
"user_id": "ishaan2@berri.ai",
|
||||
"team_id": "None",
|
||||
"max_parallel_requests": null,
|
||||
"metadata": {}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
```
|
||||
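
As a rough sketch (assuming the proxy stores the SHA-256 hex digest of the raw key, which matches the 64-character `token` above), you could reproduce the stored value locally like this:

```python
import hashlib

raw_key = "sk-02Wr4IAlN3NvPXvL5JVvDA"  # example key from the request above

# assumption: the DB `token` column is the SHA-256 hex digest of the raw key
hashed_token = hashlib.sha256(raw_key.encode("utf-8")).hexdigest()
print(hashed_token)
```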
|
||||
## /key/update
|
||||
|
||||
### Request
|
||||
```shell
|
||||
curl 'http://0.0.0.0:4000/key/update' \
|
||||
--header 'Authorization: Bearer <your-master-key>' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data-raw '{
|
||||
"key": "sk-kdEXbIqZRwEeEiHwdg7sFA",
|
||||
"models": ["gpt-3.5-turbo", "gpt-4", "claude-2"],
|
||||
"metadata": {"user": "ishaan@berri.ai"},
|
||||
"team_id": "core-infra"
|
||||
}'
|
||||
```
|
||||
|
||||
Request Params:
|
||||
- key: str - The key that needs to be updated.
|
||||
|
||||
- models: list or null (optional) - Specify the models a token has access to. If null, then the token has access to all models on the server.
|
||||
|
||||
- metadata: dict or null (optional) - Pass metadata for the updated token. If null, defaults to an empty dictionary.
|
||||
|
||||
- team_id: str or null (optional) - Specify the team_id for the associated key.
|
||||
|
||||
### Response
|
||||
|
||||
```json
|
||||
{
|
||||
"key": "sk-kdEXbIqZRwEeEiHwdg7sFA",
|
||||
"models": ["gpt-3.5-turbo", "gpt-4", "claude-2"],
|
||||
"metadata": {
|
||||
"user": "ishaan@berri.ai"
|
||||
}
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
|
||||
## /key/delete
|
||||
|
||||
### Request
|
||||
```shell
|
||||
curl 'http://0.0.0.0:4000/key/delete' \
|
||||
--header 'Authorization: Bearer <your-master-key>' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data-raw '{
|
||||
"keys": ["sk-kdEXbIqZRwEeEiHwdg7sFA"]
|
||||
}'
|
||||
```
|
||||
|
||||
Request Params:
|
||||
- keys: List[str] - List of keys to delete
|
||||
|
||||
### Response
|
||||
|
||||
```json
|
||||
{
|
||||
"deleted_keys": ["sk-kdEXbIqZRwEeEiHwdg7sFA"]
|
||||
}
|
||||
```
|
||||
|
||||
## /user/new
|
||||
|
||||
### Request
|
||||
|
||||
All [`/key/generate` params are supported](#keygenerate) when creating a user
|
||||
```shell
|
||||
curl 'http://0.0.0.0:4000/user/new' \
|
||||
--header 'Authorization: Bearer sk-1234' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data-raw '{
|
||||
"user_id": "ishaan1",
|
||||
"user_email": "ishaan@litellm.ai",
|
||||
"user_role": "admin",
|
||||
"team_id": "cto-team",
|
||||
"max_budget": 20,
|
||||
"budget_duration": "1h"
|
||||
|
||||
}'
|
||||
```
|
||||
|
||||
Request Params:
|
||||
|
||||
- user_id: str (optional - defaults to uuid) - The unique identifier for the user.
|
||||
- user_email: str (optional - defaults to "") - The email address associated with the user.
|
||||
- user_role: str (optional - defaults to "app_user") - The role assigned to the user. Can be "admin", "app_owner", "app_user"
|
||||
|
||||
**Possible `user_role` values**
|
||||
```
|
||||
"admin" - Maintaining the proxy and owning the overall budget
|
||||
"app_owner" - employees maintaining the apps, each owner may own more than one app
|
||||
"app_user" - users who know nothing about the proxy. These users get created when you pass `user` to /chat/completions
|
||||
```
|
||||
- team_id: str (optional - defaults to "") - The identifier for the team to which the user belongs.
|
||||
- max_budget: float (optional - defaults to `null`) - The maximum budget allocated for the user. No budget checks are done if `max_budget==null`
|
||||
- budget_duration: str (optional - defaults to `null`) - The duration for which the budget is valid, e.g., "1h", "1d"
|
||||
|
||||
### Response
|
||||
A key will be generated for the newly created user
|
||||
|
||||
```shell
|
||||
{
|
||||
"models": [],
|
||||
"spend": 0.0,
|
||||
"max_budget": null,
|
||||
"user_id": "ishaan1",
|
||||
"team_id": null,
|
||||
"max_parallel_requests": null,
|
||||
"metadata": {},
|
||||
"tpm_limit": null,
|
||||
"rpm_limit": null,
|
||||
"budget_duration": null,
|
||||
"allowed_cache_controls": [],
|
||||
"key_alias": null,
|
||||
"duration": null,
|
||||
"aliases": {},
|
||||
"config": {},
|
||||
"key": "sk-JflB33ucTqc2NYvNAgiBCA",
|
||||
"key_name": null,
|
||||
"expires": null
|
||||
}
|
||||
```
|
||||
|
||||
|
||||
## /user/info
|
||||
|
||||
### Request
|
||||
|
||||
#### View all Users
|
||||
If you're trying to view all users, we recommend using pagination with the following args
|
||||
- `view_all=true`
|
||||
- `page=0` Optional(int) min = 0, default=0
|
||||
- `page_size=25` Optional(int) min = 1, default = 25
|
||||
```shell
|
||||
curl -X GET "http://0.0.0.0:4000/user/info?view_all=true&page=0&page_size=25" -H "Authorization: Bearer sk-1234"
|
||||
```
|
||||
|
||||
#### View specific user_id
|
||||
```shell
|
||||
curl -X GET "http://0.0.0.0:4000/user/info?user_id=228da235-eef0-4c30-bf53-5d6ac0d278c2" -H "Authorization: Bearer sk-1234"
|
||||
```
|
||||
|
||||
### Response
|
||||
View user spend, budget, models, keys and teams
|
||||
|
||||
```json
|
||||
{
|
||||
"user_id": "228da235-eef0-4c30-bf53-5d6ac0d278c2",
|
||||
"user_info": {
|
||||
"user_id": "228da235-eef0-4c30-bf53-5d6ac0d278c2",
|
||||
"team_id": null,
|
||||
"teams": [],
|
||||
"user_role": "app_user",
|
||||
"max_budget": null,
|
||||
"spend": 200000.0,
|
||||
"user_email": null,
|
||||
"models": [],
|
||||
"max_parallel_requests": null,
|
||||
"tpm_limit": null,
|
||||
"rpm_limit": null,
|
||||
"budget_duration": null,
|
||||
"budget_reset_at": null,
|
||||
"allowed_cache_controls": [],
|
||||
"model_spend": {
|
||||
"chatgpt-v-2": 200000
|
||||
},
|
||||
"model_max_budget": {}
|
||||
},
|
||||
"keys": [
|
||||
{
|
||||
"token": "16c337f9df00a0e6472627e39a2ed02e67bc9a8a760c983c4e9b8cad7954f3c0",
|
||||
"key_name": null,
|
||||
"key_alias": null,
|
||||
"spend": 200000.0,
|
||||
"expires": null,
|
||||
"models": [],
|
||||
"aliases": {},
|
||||
"config": {},
|
||||
"user_id": "228da235-eef0-4c30-bf53-5d6ac0d278c2",
|
||||
"team_id": null,
|
||||
"permissions": {},
|
||||
"max_parallel_requests": null,
|
||||
"metadata": {},
|
||||
"tpm_limit": null,
|
||||
"rpm_limit": null,
|
||||
"max_budget": null,
|
||||
"budget_duration": null,
|
||||
"budget_reset_at": null,
|
||||
"allowed_cache_controls": [],
|
||||
"model_spend": {
|
||||
"chatgpt-v-2": 200000
|
||||
},
|
||||
"model_max_budget": {}
|
||||
}
|
||||
],
|
||||
"teams": []
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
## Advanced
|
||||
### Upperbound /key/generate params
|
||||
Use this if you need to control the upper bound that users can set for `max_budget`, `budget_duration`, or any other `/key/generate` param, per key.
|
||||
|
||||
Set `litellm_settings:upperbound_key_generate_params`:
|
||||
```yaml
|
||||
litellm_settings:
|
||||
upperbound_key_generate_params:
|
||||
max_budget: 100 # upperbound of $100, for all /key/generate requests
|
||||
duration: "30d" # upperbound of 30 days for all /key/generate requests
|
||||
```
|
||||
|
||||
**Expected Behavior**
|
||||
|
||||
- Send a `/key/generate` request with `max_budget=200`
|
||||
- Key will be created with `max_budget=100` since 100 is the upper bound
|
||||
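
For example, with the config above, a request asking for a larger budget is capped (illustrative values):

```shell
curl 'http://0.0.0.0:4000/key/generate' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{"max_budget": 200}'

# the created key comes back with "max_budget": 100.0 (the configured upper bound)
```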
|
||||
### Default /key/generate params
|
||||
Use this if you need to control the default `max_budget` or any other `/key/generate` param per key.
|
||||
|
||||
When a `/key/generate` request does not specify `max_budget`, it will use the `max_budget` specified in `default_key_generate_params`
|
||||
|
||||
Set `litellm_settings:default_key_generate_params`:
|
||||
```yaml
|
||||
litellm_settings:
|
||||
default_key_generate_params:
|
||||
max_budget: 1.5000
|
||||
models: ["azure-gpt-3.5"]
|
||||
duration: # blank means `null`
|
||||
metadata: {"setting":"default"}
|
||||
team_id: "core-infra"
|
||||
```
|
||||
|
||||
### Restrict models by `team_id`
|
||||
`litellm-dev` can only access `azure-gpt-3.5`
|
||||
|
||||
```yaml
|
||||
litellm_settings:
|
||||
default_team_settings:
|
||||
- team_id: litellm-dev
|
||||
models: ["azure-gpt-3.5"]
|
||||
```
|
||||
|
||||
#### Create key with team_id="litellm-dev"
|
||||
```shell
|
||||
curl --location 'http://localhost:4000/key/generate' \
|
||||
--header 'Authorization: Bearer sk-1234' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data-raw '{"team_id": "litellm-dev"}'
|
||||
```
|
||||
|
||||
#### Use Key to call invalid model - Fails
|
||||
```shell
|
||||
curl --location 'http://0.0.0.0:4000/chat/completions' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--header 'Authorization: Bearer sk-qo992IjKOC2CHKZGRoJIGA' \
|
||||
--data '{
|
||||
"model": "BEDROCK_GROUP",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "hi"
|
||||
}
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
```shell
|
||||
{"error":{"message":"Invalid model for team litellm-dev: BEDROCK_GROUP. Valid models for team are: ['azure-gpt-3.5']\n\n\nTraceback (most recent call last):\n File \"/Users/ishaanjaffer/Github/litellm/litellm/proxy/proxy_server.py\", line 2298, in chat_completion\n _is_valid_team_configs(\n File \"/Users/ishaanjaffer/Github/litellm/litellm/proxy/utils.py\", line 1296, in _is_valid_team_configs\n raise Exception(\nException: Invalid model for team litellm-dev: BEDROCK_GROUP. Valid models for team are: ['azure-gpt-3.5']\n\n","type":"None","param":"None","code":500}}%
|
||||
```
|
||||
|
||||
### Set Budgets - Per Key
|
||||
|
||||
Set the `max_budget` param (in USD) in the `/key/generate` request. By default `max_budget` is set to `null` and no budget checks are done for the key.
|
||||
|
||||
```shell
|
||||
curl 'http://0.0.0.0:4000/key/generate' \
|
||||
--header 'Authorization: Bearer <your-master-key>' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data-raw '{
|
||||
"metadata": {"user": "ishaan@berri.ai"},
|
||||
"team_id": "core-infra",
|
||||
"max_budget": 10,
|
||||
}'
|
||||
```
|
||||
|
||||
#### Expected Behaviour
|
||||
- Costs per key are auto-populated in the `LiteLLM_VerificationToken` table
|
||||
- After the key crosses its `max_budget`, requests fail
|
||||
|
||||
Example Request to `/chat/completions` when key has crossed budget
|
||||
|
||||
```shell
|
||||
curl --location 'http://0.0.0.0:4000/chat/completions' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--header 'Authorization: Bearer sk-ULl_IKCVFy2EZRzQB16RUA' \
|
||||
--data ' {
|
||||
"model": "azure-gpt-3.5",
|
||||
"user": "e09b4da8-ed80-4b05-ac93-e16d9eb56fca",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "respond in 50 lines"
|
||||
}
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
|
||||
Expected Response from `/chat/completions` when key has crossed budget
|
||||
```shell
|
||||
{
|
||||
"detail":"Authentication Error, ExceededTokenBudget: Current spend for token: 7.2e-05; Max Budget for Token: 2e-07"
|
||||
}
|
||||
```
|
||||
|
||||
|
||||
### Set Budgets - Per User
|
||||
|
||||
LiteLLM exposes a `/user/new` endpoint to create budgets for users that persist across multiple keys.
|
||||
|
||||
This is documented in the swagger (live on your server root endpoint - e.g. `http://0.0.0.0:4000/`). Here's an example request.
|
||||
|
||||
```shell
|
||||
curl --location 'http://localhost:4000/user/new' \
|
||||
--header 'Authorization: Bearer <your-master-key>' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data-raw '{"models": ["azure-models"], "max_budget": 0, "user_id": "krrish3@berri.ai"}'
|
||||
```
|
||||
The request is a normal `/key/generate` request body + a `max_budget` field.
|
||||
|
||||
**Sample Response**
|
||||
|
||||
```shell
|
||||
{
|
||||
"key": "sk-YF2OxDbrgd1y2KgwxmEA2w",
|
||||
"expires": "2023-12-22T09:53:13.861000Z",
|
||||
"user_id": "krrish3@berri.ai",
|
||||
"max_budget": 0.0
|
||||
}
|
||||
```
|
||||
|
||||
### Tracking Spend
|
||||
|
||||
You can get spend for a key by using the `/key/info` endpoint.
|
||||
|
||||
```bash
|
||||
curl 'http://0.0.0.0:4000/key/info?key=<user-key>' \
|
||||
-X GET \
|
||||
-H 'Authorization: Bearer <your-master-key>'
|
||||
```
|
||||
|
||||
Spend is automatically updated (in USD) when calls are made to `/completions`, `/chat/completions`, or `/embeddings`, using litellm's `completion_cost()` function. [**See Code**](https://github.com/BerriAI/litellm/blob/1a6ea20a0bb66491968907c2bfaabb7fe45fc064/litellm/utils.py#L1654).
|
||||
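
For reference, the same cost calculation is exposed in the SDK; a minimal sketch of calling `completion_cost()` directly (model and message here are illustrative):

```python
from litellm import completion, completion_cost

# make a normal completion call
response = completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hi"}],
)

# compute the USD cost of that response - the same helper the proxy
# uses to increment `spend` on the key
cost = completion_cost(completion_response=response)
print(f"cost (USD): {cost}")
```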
|
||||
**Sample response**
|
||||
|
||||
```python
|
||||
{
|
||||
"key": "sk-tXL0wt5-lOOVK9sfY2UacA",
|
||||
"info": {
|
||||
"token": "sk-tXL0wt5-lOOVK9sfY2UacA",
|
||||
"spend": 0.0001065,
|
||||
"expires": "2023-11-24T23:19:11.131000Z",
|
||||
"models": [
|
||||
"gpt-3.5-turbo",
|
||||
"gpt-4",
|
||||
"claude-2"
|
||||
],
|
||||
"aliases": {
|
||||
"mistral-7b": "gpt-3.5-turbo"
|
||||
},
|
||||
"config": {}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
|
||||
### Custom Auth
|
||||
|
||||
You can now override the default api key auth.
|
||||
|
||||
Here's how:
|
||||
|
||||
|
@ -737,4 +483,56 @@ litellm_settings:
|
|||
|
||||
general_settings:
|
||||
custom_key_generate: custom_auth.custom_generate_key_fn
|
||||
```
|
||||
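
The referenced `custom_auth.custom_generate_key_fn` is a function you provide in a `custom_auth.py` module. A minimal sketch, assuming the proxy passes the parsed key-generation request to an async function and expects a dict with a `"decision"` flag back (the exact request type and return contract here are assumptions, not the definitive interface):

```python
# custom_auth.py - module name matches the config above
from typing import Optional


async def custom_generate_key_fn(data) -> dict:
    """
    Decide whether an incoming /key/generate request should be allowed.

    Assumptions: `data` carries the request fields (e.g. `team_id`), and the
    proxy expects {"decision": bool, "message": Optional[str]} in return.
    """
    team_id: Optional[str] = getattr(data, "team_id", None)
    if team_id is not None:
        return {"decision": True}
    return {
        "decision": False,
        "message": "Rejected: requests to /key/generate must include a team_id.",
    }
```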
|
||||
|
||||
## Upperbound /key/generate params
|
||||
Use this if you need to set upper bounds for `max_budget`, `budget_duration`, or any other `/key/generate` param, per key.
|
||||
|
||||
Set `litellm_settings:upperbound_key_generate_params`:
|
||||
```yaml
|
||||
litellm_settings:
|
||||
upperbound_key_generate_params:
|
||||
max_budget: 100 # upperbound of $100, for all /key/generate requests
|
||||
duration: "30d" # upperbound of 30 days for all /key/generate requests
|
||||
```
|
||||
|
||||
**Expected Behavior**
|
||||
|
||||
- Send a `/key/generate` request with `max_budget=200`
|
||||
- Key will be created with `max_budget=100` since 100 is the upper bound
|
||||
|
||||
## Default /key/generate params
|
||||
Use this if you need to control the default `max_budget` or any other `/key/generate` param per key.
|
||||
|
||||
When a `/key/generate` request does not specify `max_budget`, it will use the `max_budget` specified in `default_key_generate_params`
|
||||
|
||||
Set `litellm_settings:default_key_generate_params`:
|
||||
```yaml
|
||||
litellm_settings:
|
||||
default_key_generate_params:
|
||||
max_budget: 1.5000
|
||||
models: ["azure-gpt-3.5"]
|
||||
duration: # blank means `null`
|
||||
metadata: {"setting":"default"}
|
||||
team_id: "core-infra"
|
||||
```
|
||||
|
||||
## Endpoints
|
||||
|
||||
### Keys
|
||||
|
||||
#### [**👉 API REFERENCE DOCS**](https://litellm-api.up.railway.app/#/key%20management/)
|
||||
|
||||
### Users
|
||||
|
||||
#### [**👉 API REFERENCE DOCS**](https://litellm-api.up.railway.app/#/user%20management/)
|
||||
|
||||
|
||||
### Teams
|
||||
|
||||
#### [**👉 API REFERENCE DOCS**](https://litellm-api.up.railway.app/#/team%20management)
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -442,6 +442,8 @@ If a call fails after num_retries, fall back to another model group.
|
|||
|
||||
If the error is a context window exceeded error, fall back to a larger model group (if given).
|
||||
|
||||
Fallbacks are done in order - given ["gpt-3.5-turbo", "gpt-4", "gpt-4-32k"], the router will try 'gpt-3.5-turbo' first, then 'gpt-4', etc.
|
||||
|
||||
```python
|
||||
from litellm import Router
|
||||
|
||||
|
@ -551,6 +553,156 @@ router = Router(model_list: Optional[list] = None,
|
|||
cache_responses=True)
|
||||
```
|
||||
|
||||
## Pre-Call Checks (Context Window)
|
||||
|
||||
Enable pre-call checks to filter out deployments whose context window is smaller than the messages for a given call.
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
**1. Enable pre-call checks**
|
||||
```python
|
||||
from litellm import Router
|
||||
# ...
|
||||
router = Router(model_list=model_list, enable_pre_call_checks=True) # 👈 Set to True
|
||||
```
|
||||
|
||||
**2. (Azure-only) Set base model**
|
||||
|
||||
For Azure deployments, set the base model. Pick the base model from [this list](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json); all the Azure models start with `azure/`.
|
||||
|
||||
```python
|
||||
model_list = [
|
||||
{
|
||||
"model_name": "gpt-3.5-turbo", # model group name
|
||||
"litellm_params": { # params for litellm completion/embedding call
|
||||
"model": "azure/chatgpt-v-2",
|
||||
"api_key": os.getenv("AZURE_API_KEY"),
|
||||
"api_version": os.getenv("AZURE_API_VERSION"),
|
||||
"api_base": os.getenv("AZURE_API_BASE"),
|
||||
},
|
||||
"model_info": {
|
||||
"base_model": "azure/gpt-35-turbo", # 👈 SET BASE MODEL
|
||||
}
|
||||
},
|
||||
{
|
||||
"model_name": "gpt-3.5-turbo", # model group name
|
||||
"litellm_params": { # params for litellm completion/embedding call
|
||||
"model": "gpt-3.5-turbo-1106",
|
||||
"api_key": os.getenv("OPENAI_API_KEY"),
|
||||
},
|
||||
},
|
||||
]
|
||||
```
|
||||
|
||||
**3. Test it!**
|
||||
|
||||
```python
|
||||
"""
|
||||
- Give a gpt-3.5-turbo model group with different context windows (4k vs. 16k)
|
||||
- Send a 5k prompt
|
||||
- Assert it works
|
||||
"""
|
||||
from litellm import Router
|
||||
import os
|
||||
|
||||
try:
|
||||
model_list = [
|
||||
{
|
||||
"model_name": "gpt-3.5-turbo", # model group name
|
||||
"litellm_params": { # params for litellm completion/embedding call
|
||||
"model": "azure/chatgpt-v-2",
|
||||
"api_key": os.getenv("AZURE_API_KEY"),
|
||||
"api_version": os.getenv("AZURE_API_VERSION"),
|
||||
"api_base": os.getenv("AZURE_API_BASE"),
|
||||
},
|
||||
"model_info": {
|
||||
"base_model": "azure/gpt-35-turbo",
|
||||
}
|
||||
},
|
||||
{
|
||||
"model_name": "gpt-3.5-turbo", # model group name
|
||||
"litellm_params": { # params for litellm completion/embedding call
|
||||
"model": "gpt-3.5-turbo-1106",
|
||||
"api_key": os.getenv("OPENAI_API_KEY"),
|
||||
},
|
||||
},
|
||||
]
|
||||
|
||||
router = Router(model_list=model_list, enable_pre_call_checks=True)
|
||||
|
||||
text = "What is the meaning of 42?" * 5000
|
||||
|
||||
response = router.completion(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=[
|
||||
{"role": "system", "content": text},
|
||||
{"role": "user", "content": "Who was Alexander?"},
|
||||
],
|
||||
)
|
||||
|
||||
print(f"response: {response}")
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="proxy" label="Proxy">
|
||||
|
||||
**1. Setup config**
|
||||
|
||||
For Azure deployments, set the base model. Pick the base model from [this list](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json); all the Azure models start with `azure/`.
|
||||
|
||||
```yaml
|
||||
router_settings:
|
||||
enable_pre_call_checks: true # 1. Enable pre-call checks
|
||||
|
||||
model_list:
|
||||
- model_name: gpt-3.5-turbo
|
||||
litellm_params:
|
||||
model: azure/chatgpt-v-2
|
||||
api_base: os.environ/AZURE_API_BASE
|
||||
api_key: os.environ/AZURE_API_KEY
|
||||
api_version: "2023-07-01-preview"
|
||||
model_info:
|
||||
base_model: azure/gpt-4-1106-preview # 2. 👈 (azure-only) SET BASE MODEL
|
||||
|
||||
- model_name: gpt-3.5-turbo
|
||||
litellm_params:
|
||||
model: gpt-3.5-turbo-1106
|
||||
api_key: os.environ/OPENAI_API_KEY
|
||||
```
|
||||
|
||||
**2. Start proxy**
|
||||
|
||||
```bash
|
||||
litellm --config /path/to/config.yaml
|
||||
|
||||
# RUNNING on http://0.0.0.0:4000
|
||||
```
|
||||
|
||||
**3. Test it!**
|
||||
|
||||
```python
|
||||
import openai
|
||||
client = openai.OpenAI(
|
||||
api_key="anything",
|
||||
base_url="http://0.0.0.0:4000"
|
||||
)
|
||||
|
||||
text = "What is the meaning of 42?" * 5000
|
||||
|
||||
# request sent to model set on litellm proxy, `litellm --model`
|
||||
response = client.chat.completions.create(
|
||||
model="gpt-3.5-turbo",
|
||||
messages = [
|
||||
{"role": "system", "content": text},
|
||||
{"role": "user", "content": "Who was Alexander?"},
|
||||
],
|
||||
)
|
||||
|
||||
print(response)
|
||||
```
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
## Caching across model groups
|
||||
|
||||
If you want to cache across two different model groups (e.g. Azure deployments and OpenAI), use caching groups.
|
||||
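
A minimal sketch of what this might look like with the Router's `caching_groups` parameter (the model group names and env vars here are illustrative, following the examples above):

```python
from litellm import Router
import os

model_list = [
    {
        "model_name": "openai-gpt-3.5-turbo",
        "litellm_params": {
            "model": "gpt-3.5-turbo",
            "api_key": os.getenv("OPENAI_API_KEY"),
        },
    },
    {
        "model_name": "azure-gpt-3.5-turbo",
        "litellm_params": {
            "model": "azure/chatgpt-v-2",
            "api_key": os.getenv("AZURE_API_KEY"),
            "api_version": os.getenv("AZURE_API_VERSION"),
            "api_base": os.getenv("AZURE_API_BASE"),
        },
    },
]

# cache hits are shared between the model groups listed in the same tuple
router = Router(
    model_list=model_list,
    cache_responses=True,
    caching_groups=[("openai-gpt-3.5-turbo", "azure-gpt-3.5-turbo")],
)
```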
|
|
|
@ -95,5 +95,4 @@ completion_with_split_tests(
|
|||
)
|
||||
```
|
||||
|
||||
### A/B Testing Dashboard after running code - https://admin.litellm.ai/
|
||||
<Image img={require('../../img/ab_test_logs.png')} />
|
||||
|
||||
|
|
95
docs/my-website/docs/tutorials/instructor.md
Normal file
|
@ -0,0 +1,95 @@
|
|||
# Instructor - Function Calling
|
||||
|
||||
Use LiteLLM Router with [jxnl's instructor library](https://github.com/jxnl/instructor) for function calling in prod.
|
||||
|
||||
## Usage
|
||||
|
||||
```python
|
||||
import litellm
|
||||
from litellm import Router
|
||||
import instructor
import os
|
||||
from pydantic import BaseModel
|
||||
|
||||
litellm.set_verbose = True # 👈 print DEBUG LOGS
|
||||
|
||||
client = instructor.patch(
|
||||
Router(
|
||||
model_list=[
|
||||
{
|
||||
"model_name": "gpt-3.5-turbo", openai model name
|
||||
"litellm_params": { # params for litellm completion/embedding call - e.g.: https://github.com/BerriAI/litellm/blob/62a591f90c99120e1a51a8445f5c3752586868ea/litellm/router.py#L111
|
||||
"model": "azure/chatgpt-v-2",
|
||||
"api_key": os.getenv("AZURE_API_KEY"),
|
||||
"api_version": os.getenv("AZURE_API_VERSION"),
|
||||
"api_base": os.getenv("AZURE_API_BASE"),
|
||||
},
|
||||
}
|
||||
]
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
class UserDetail(BaseModel):
|
||||
name: str
|
||||
age: int
|
||||
|
||||
|
||||
user = client.chat.completions.create(
|
||||
model="gpt-3.5-turbo",
|
||||
response_model=UserDetail,
|
||||
messages=[
|
||||
{"role": "user", "content": "Extract Jason is 25 years old"},
|
||||
],
|
||||
)
|
||||
|
||||
assert isinstance(user, UserDetail)
|
||||
assert user.name == "Jason"
|
||||
assert user.age == 25
|
||||
|
||||
print(f"user: {user}")
|
||||
```
|
||||
|
||||
## Async Calls
|
||||
|
||||
```python
|
||||
import litellm
|
||||
from litellm import Router
|
||||
import instructor, asyncio, os
|
||||
from pydantic import BaseModel
|
||||
|
||||
aclient = instructor.apatch(
|
||||
Router(
|
||||
model_list=[
|
||||
{
|
||||
"model_name": "gpt-3.5-turbo",
|
||||
"litellm_params": {
|
||||
"model": "azure/chatgpt-v-2",
|
||||
"api_key": os.getenv("AZURE_API_KEY"),
|
||||
"api_version": os.getenv("AZURE_API_VERSION"),
|
||||
"api_base": os.getenv("AZURE_API_BASE"),
|
||||
},
|
||||
}
|
||||
],
|
||||
default_litellm_params={"acompletion": True}, # 👈 IMPORTANT - tells litellm to route to async completion function.
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
class UserExtract(BaseModel):
|
||||
name: str
|
||||
age: int
|
||||
|
||||
|
||||
async def main():
|
||||
model = await aclient.chat.completions.create(
|
||||
model="gpt-3.5-turbo",
|
||||
response_model=UserExtract,
|
||||
messages=[
|
||||
{"role": "user", "content": "Extract jason is 25 years old"},
|
||||
],
|
||||
)
|
||||
print(f"model: {model}")
|
||||
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
Before Width: | Height: | Size: 263 KiB |
Before Width: | Height: | Size: 449 KiB |
Before Width: | Height: | Size: 66 KiB |
Before Width: | Height: | Size: 73 KiB |
Before Width: | Height: | Size: 89 KiB |
Before Width: | Height: | Size: 140 KiB |
Before Width: | Height: | Size: 386 KiB |
Before Width: | Height: | Size: 20 KiB |
BIN
docs/my-website/img/end_user_enforcement.png
Normal file
After Width: | Height: | Size: 180 KiB |
Before Width: | Height: | Size: 429 KiB |
Before Width: | Height: | Size: 505 KiB |
Before Width: | Height: | Size: 468 KiB |
Before Width: | Height: | Size: 123 KiB |
Before Width: | Height: | Size: 203 KiB |
Before Width: | Height: | Size: 81 KiB |
Before Width: | Height: | Size: 82 KiB |
38
docs/my-website/package-lock.json
generated
|
@ -5561,12 +5561,12 @@
|
|||
}
|
||||
},
|
||||
"node_modules/body-parser": {
|
||||
"version": "1.20.1",
|
||||
"resolved": "https://registry.npmjs.org/body-parser/-/body-parser-1.20.1.tgz",
|
||||
"integrity": "sha512-jWi7abTbYwajOytWCQc37VulmWiRae5RyTpaCyDcS5/lMdtwSz5lOpDE67srw/HYe35f1z3fDQw+3txg7gNtWw==",
|
||||
"version": "1.20.2",
|
||||
"resolved": "https://registry.npmjs.org/body-parser/-/body-parser-1.20.2.tgz",
|
||||
"integrity": "sha512-ml9pReCu3M61kGlqoTm2umSXTlRTuGTx0bfYj+uIUKKYycG5NtSbeetV3faSU6R7ajOPw0g/J1PvK4qNy7s5bA==",
|
||||
"dependencies": {
|
||||
"bytes": "3.1.2",
|
||||
"content-type": "~1.0.4",
|
||||
"content-type": "~1.0.5",
|
||||
"debug": "2.6.9",
|
||||
"depd": "2.0.0",
|
||||
"destroy": "1.2.0",
|
||||
|
@ -5574,7 +5574,7 @@
|
|||
"iconv-lite": "0.4.24",
|
||||
"on-finished": "2.4.1",
|
||||
"qs": "6.11.0",
|
||||
"raw-body": "2.5.1",
|
||||
"raw-body": "2.5.2",
|
||||
"type-is": "~1.6.18",
|
||||
"unpipe": "1.0.0"
|
||||
},
|
||||
|
@ -6707,9 +6707,9 @@
|
|||
"integrity": "sha512-ASFBup0Mz1uyiIjANan1jzLQami9z1PoYSZCiiYW2FczPbenXc45FZdBZLzOT+r6+iciuEModtmCti+hjaAk0A=="
|
||||
},
|
||||
"node_modules/cookie": {
|
||||
"version": "0.5.0",
|
||||
"resolved": "https://registry.npmjs.org/cookie/-/cookie-0.5.0.tgz",
|
||||
"integrity": "sha512-YZ3GUyn/o8gfKJlnlX7g7xq4gyO6OSuhGPKaaGssGB2qgDUS0gPgtTvoyZLTt9Ab6dC4hfc9dV5arkvc/OCmrw==",
|
||||
"version": "0.6.0",
|
||||
"resolved": "https://registry.npmjs.org/cookie/-/cookie-0.6.0.tgz",
|
||||
"integrity": "sha512-U71cyTamuh1CRNCfpGY6to28lxvNwPG4Guz/EVjgf3Jmzv0vlDp1atT9eS5dDjMYHucpHbWns6Lwf3BKz6svdw==",
|
||||
"engines": {
|
||||
"node": ">= 0.6"
|
||||
}
|
||||
|
@ -10411,16 +10411,16 @@
|
|||
}
|
||||
},
|
||||
"node_modules/express": {
|
||||
"version": "4.18.2",
|
||||
"resolved": "https://registry.npmjs.org/express/-/express-4.18.2.tgz",
|
||||
"integrity": "sha512-5/PsL6iGPdfQ/lKM1UuielYgv3BUoJfz1aUwU9vHZ+J7gyvwdQXFEBIEIaxeGf0GIcreATNyBExtalisDbuMqQ==",
|
||||
"version": "4.19.2",
|
||||
"resolved": "https://registry.npmjs.org/express/-/express-4.19.2.tgz",
|
||||
"integrity": "sha512-5T6nhjsT+EOMzuck8JjBHARTHfMht0POzlA60WV2pMD3gyXw2LZnZ+ueGdNxG+0calOJcWKbpFcuzLZ91YWq9Q==",
|
||||
"dependencies": {
|
||||
"accepts": "~1.3.8",
|
||||
"array-flatten": "1.1.1",
|
||||
"body-parser": "1.20.1",
|
||||
"body-parser": "1.20.2",
|
||||
"content-disposition": "0.5.4",
|
||||
"content-type": "~1.0.4",
|
||||
"cookie": "0.5.0",
|
||||
"cookie": "0.6.0",
|
||||
"cookie-signature": "1.0.6",
|
||||
"debug": "2.6.9",
|
||||
"depd": "2.0.0",
|
||||
|
@ -17016,9 +17016,9 @@
|
|||
}
|
||||
},
|
||||
"node_modules/raw-body": {
|
||||
"version": "2.5.1",
|
||||
"resolved": "https://registry.npmjs.org/raw-body/-/raw-body-2.5.1.tgz",
|
||||
"integrity": "sha512-qqJBtEyVgS0ZmPGdCFPWJ3FreoqvG4MVQln/kCgF7Olq95IbOp0/BWyMwbdtn4VTvkM8Y7khCQ2Xgk/tcrCXig==",
|
||||
"version": "2.5.2",
|
||||
"resolved": "https://registry.npmjs.org/raw-body/-/raw-body-2.5.2.tgz",
|
||||
"integrity": "sha512-8zGqypfENjCIqGhgXToC8aB2r7YrBX+AQAfIPs/Mlk+BtPTztOvTS01NRW/3Eh60J+a48lt8qsCzirQ6loCVfA==",
|
||||
"dependencies": {
|
||||
"bytes": "3.1.2",
|
||||
"http-errors": "2.0.0",
|
||||
|
@ -21554,9 +21554,9 @@
|
|||
}
|
||||
},
|
||||
"node_modules/webpack-dev-middleware": {
|
||||
"version": "5.3.3",
|
||||
"resolved": "https://registry.npmjs.org/webpack-dev-middleware/-/webpack-dev-middleware-5.3.3.tgz",
|
||||
"integrity": "sha512-hj5CYrY0bZLB+eTO+x/j67Pkrquiy7kWepMHmUMoPsmcUaeEnQJqFzHJOyxgWlq746/wUuA64p9ta34Kyb01pA==",
|
||||
"version": "5.3.4",
|
||||
"resolved": "https://registry.npmjs.org/webpack-dev-middleware/-/webpack-dev-middleware-5.3.4.tgz",
|
||||
"integrity": "sha512-BVdTqhhs+0IfoeAf7EoH5WE+exCmqGerHfDM0IL096Px60Tq2Mn9MAbnaGUe6HiMa41KMCYF19gyzZmBcq/o4Q==",
|
||||
"dependencies": {
|
||||
"colorette": "^2.0.10",
|
||||
"memfs": "^3.4.3",
|
||||
|
|
|
@ -28,8 +28,9 @@ const sidebars = {
|
|||
slug: "/simple_proxy",
|
||||
},
|
||||
items: [
|
||||
"proxy/quick_start",
|
||||
"proxy/deploy",
|
||||
"proxy/quick_start",
|
||||
"proxy/deploy",
|
||||
"proxy/prod",
|
||||
"proxy/configs",
|
||||
{
|
||||
type: "link",
|
||||
|
@ -42,7 +43,6 @@ const sidebars = {
|
|||
"proxy/users",
|
||||
"proxy/team_based_routing",
|
||||
"proxy/ui",
|
||||
"proxy/budget_alerts",
|
||||
"proxy/cost_tracking",
|
||||
"proxy/token_auth",
|
||||
{
|
||||
|
@ -61,6 +61,7 @@ const sidebars = {
|
|||
label: "Logging, Alerting",
|
||||
items: ["proxy/logging", "proxy/alerting", "proxy/streaming_logging"],
|
||||
},
|
||||
"proxy/grafana_metrics",
|
||||
"proxy/call_hooks",
|
||||
"proxy/rules",
|
||||
"proxy/cli",
|
||||
|
@ -180,8 +181,9 @@ const sidebars = {
|
|||
type: "category",
|
||||
label: "Tutorials",
|
||||
items: [
|
||||
"tutorials/azure_openai",
|
||||
"tutorials/oobabooga",
|
||||
'tutorials/azure_openai',
|
||||
'tutorials/instructor',
|
||||
'tutorials/oobabooga',
|
||||
"tutorials/gradio_integration",
|
||||
"tutorials/huggingface_codellama",
|
||||
"tutorials/huggingface_tutorial",
|
||||
|
|
|
@ -3138,13 +3138,13 @@ bluebird@~3.4.1:
|
|||
resolved "https://registry.npmjs.org/bluebird/-/bluebird-3.4.7.tgz"
|
||||
integrity sha512-iD3898SR7sWVRHbiQv+sHUtHnMvC1o3nW5rAcqnq3uOn07DSAppZYUkIGslDz6gXC7HfunPe7YVBgoEJASPcHA==
|
||||
|
||||
body-parser@1.20.1:
|
||||
version "1.20.1"
|
||||
resolved "https://registry.npmjs.org/body-parser/-/body-parser-1.20.1.tgz"
|
||||
integrity sha512-jWi7abTbYwajOytWCQc37VulmWiRae5RyTpaCyDcS5/lMdtwSz5lOpDE67srw/HYe35f1z3fDQw+3txg7gNtWw==
|
||||
body-parser@1.20.2:
|
||||
version "1.20.2"
|
||||
resolved "https://registry.yarnpkg.com/body-parser/-/body-parser-1.20.2.tgz#6feb0e21c4724d06de7ff38da36dad4f57a747fd"
|
||||
integrity sha512-ml9pReCu3M61kGlqoTm2umSXTlRTuGTx0bfYj+uIUKKYycG5NtSbeetV3faSU6R7ajOPw0g/J1PvK4qNy7s5bA==
|
||||
dependencies:
|
||||
bytes "3.1.2"
|
||||
content-type "~1.0.4"
|
||||
content-type "~1.0.5"
|
||||
debug "2.6.9"
|
||||
depd "2.0.0"
|
||||
destroy "1.2.0"
|
||||
|
@ -3152,7 +3152,7 @@ body-parser@1.20.1:
|
|||
iconv-lite "0.4.24"
|
||||
on-finished "2.4.1"
|
||||
qs "6.11.0"
|
||||
raw-body "2.5.1"
|
||||
raw-body "2.5.2"
|
||||
type-is "~1.6.18"
|
||||
unpipe "1.0.0"
|
||||
|
||||
|
@ -3921,7 +3921,7 @@ content-disposition@0.5.4:
|
|||
dependencies:
|
||||
safe-buffer "5.2.1"
|
||||
|
||||
content-type@~1.0.4:
|
||||
content-type@~1.0.4, content-type@~1.0.5:
|
||||
version "1.0.5"
|
||||
resolved "https://registry.npmjs.org/content-type/-/content-type-1.0.5.tgz"
|
||||
integrity sha512-nTjqfcBFEipKdXCv4YDQWCfmcLZKm81ldF0pAopTvyrFGVbcR6P/VAAd5G7N+0tTr8QqiU0tFadD6FK4NtJwOA==
|
||||
|
@ -3941,10 +3941,10 @@ cookie-signature@1.0.6:
|
|||
resolved "https://registry.npmjs.org/cookie-signature/-/cookie-signature-1.0.6.tgz"
|
||||
integrity sha512-QADzlaHc8icV8I7vbaJXJwod9HWYp8uCqf1xa4OfNu1T7JVxQIrUgOWtHdNDtPiywmFbiS12VjotIXLrKM3orQ==
|
||||
|
||||
cookie@0.5.0:
|
||||
version "0.5.0"
|
||||
resolved "https://registry.npmjs.org/cookie/-/cookie-0.5.0.tgz"
|
||||
integrity sha512-YZ3GUyn/o8gfKJlnlX7g7xq4gyO6OSuhGPKaaGssGB2qgDUS0gPgtTvoyZLTt9Ab6dC4hfc9dV5arkvc/OCmrw==
|
||||
cookie@0.6.0:
|
||||
version "0.6.0"
|
||||
resolved "https://registry.yarnpkg.com/cookie/-/cookie-0.6.0.tgz#2798b04b071b0ecbff0dbb62a505a8efa4e19051"
|
||||
integrity sha512-U71cyTamuh1CRNCfpGY6to28lxvNwPG4Guz/EVjgf3Jmzv0vlDp1atT9eS5dDjMYHucpHbWns6Lwf3BKz6svdw==
|
||||
|
||||
copy-descriptor@^0.1.0:
|
||||
version "0.1.1"
|
||||
|
@ -5325,16 +5325,16 @@ expand-template@^2.0.3:
|
|||
integrity sha512-XYfuKMvj4O35f/pOXLObndIRvyQ+/+6AhODh+OKWj9S9498pHHn/IMszH+gt0fBCRWMNfk1ZSp5x3AifmnI2vg==
|
||||
|
||||
express@^4.17.1, express@^4.17.3:
|
||||
version "4.18.2"
|
||||
resolved "https://registry.npmjs.org/express/-/express-4.18.2.tgz"
|
||||
integrity sha512-5/PsL6iGPdfQ/lKM1UuielYgv3BUoJfz1aUwU9vHZ+J7gyvwdQXFEBIEIaxeGf0GIcreATNyBExtalisDbuMqQ==
|
||||
version "4.19.2"
|
||||
resolved "https://registry.yarnpkg.com/express/-/express-4.19.2.tgz#e25437827a3aa7f2a827bc8171bbbb664a356465"
|
||||
integrity sha512-5T6nhjsT+EOMzuck8JjBHARTHfMht0POzlA60WV2pMD3gyXw2LZnZ+ueGdNxG+0calOJcWKbpFcuzLZ91YWq9Q==
|
||||
dependencies:
|
||||
accepts "~1.3.8"
|
||||
array-flatten "1.1.1"
|
||||
body-parser "1.20.1"
|
||||
body-parser "1.20.2"
|
||||
content-disposition "0.5.4"
|
||||
content-type "~1.0.4"
|
||||
cookie "0.5.0"
|
||||
cookie "0.6.0"
|
||||
cookie-signature "1.0.6"
|
||||
debug "2.6.9"
|
||||
depd "2.0.0"
|
||||
|
@ -9924,10 +9924,10 @@ range-parser@^1.2.1, range-parser@~1.2.1:
|
|||
resolved "https://registry.npmjs.org/range-parser/-/range-parser-1.2.1.tgz"
|
||||
integrity sha512-Hrgsx+orqoygnmhFbKaHE6c296J+HTAQXoxEF6gNupROmmGJRoyzfG3ccAveqCBrwr/2yxQ5BVd/GTl5agOwSg==
|
||||
|
||||
raw-body@2.5.1:
|
||||
version "2.5.1"
|
||||
resolved "https://registry.npmjs.org/raw-body/-/raw-body-2.5.1.tgz"
|
||||
integrity sha512-qqJBtEyVgS0ZmPGdCFPWJ3FreoqvG4MVQln/kCgF7Olq95IbOp0/BWyMwbdtn4VTvkM8Y7khCQ2Xgk/tcrCXig==
|
||||
raw-body@2.5.2:
|
||||
version "2.5.2"
|
||||
resolved "https://registry.yarnpkg.com/raw-body/-/raw-body-2.5.2.tgz#99febd83b90e08975087e8f1f9419a149366b68a"
|
||||
integrity sha512-8zGqypfENjCIqGhgXToC8aB2r7YrBX+AQAfIPs/Mlk+BtPTztOvTS01NRW/3Eh60J+a48lt8qsCzirQ6loCVfA==
|
||||
dependencies:
|
||||
bytes "3.1.2"
|
||||
http-errors "2.0.0"
|
||||
|
@ -12406,9 +12406,9 @@ webpack-bundle-analyzer@^4.5.0:
|
|||
ws "^7.3.1"
|
||||
|
||||
webpack-dev-middleware@^5.3.1:
|
||||
version "5.3.3"
|
||||
resolved "https://registry.npmjs.org/webpack-dev-middleware/-/webpack-dev-middleware-5.3.3.tgz"
|
||||
integrity sha512-hj5CYrY0bZLB+eTO+x/j67Pkrquiy7kWepMHmUMoPsmcUaeEnQJqFzHJOyxgWlq746/wUuA64p9ta34Kyb01pA==
|
||||
version "5.3.4"
|
||||
resolved "https://registry.yarnpkg.com/webpack-dev-middleware/-/webpack-dev-middleware-5.3.4.tgz#eb7b39281cbce10e104eb2b8bf2b63fce49a3517"
|
||||
integrity sha512-BVdTqhhs+0IfoeAf7EoH5WE+exCmqGerHfDM0IL096Px60Tq2Mn9MAbnaGUe6HiMa41KMCYF19gyzZmBcq/o4Q==
|
||||
dependencies:
|
||||
colorette "^2.0.10"
|
||||
memfs "^3.4.3"
|
||||
|
|
|
@ -96,6 +96,8 @@ class _ENTERPRISE_GoogleTextModeration(CustomLogger):
|
|||
async def async_moderation_hook(
|
||||
self,
|
||||
data: dict,
|
||||
user_api_key_dict: UserAPIKeyAuth,
|
||||
call_type: Literal["completion", "embeddings", "image_generation"],
|
||||
):
|
||||
"""
|
||||
- Calls Google's Text Moderation API
|
||||
|
|
|
@ -99,6 +99,8 @@ class _ENTERPRISE_LlamaGuard(CustomLogger):
|
|||
async def async_moderation_hook(
|
||||
self,
|
||||
data: dict,
|
||||
user_api_key_dict: UserAPIKeyAuth,
|
||||
call_type: Literal["completion", "embeddings", "image_generation"],
|
||||
):
|
||||
"""
|
||||
- Calls the Llama Guard Endpoint
|
||||
|
|
|
@ -22,6 +22,7 @@ from litellm.utils import (
|
|||
)
|
||||
from datetime import datetime
|
||||
import aiohttp, asyncio
|
||||
from litellm.utils import get_formatted_prompt
|
||||
|
||||
litellm.set_verbose = True
|
||||
|
||||
|
@ -29,9 +30,12 @@ litellm.set_verbose = True
|
|||
class _ENTERPRISE_LLMGuard(CustomLogger):
|
||||
# Class variables or attributes
|
||||
def __init__(
|
||||
self, mock_testing: bool = False, mock_redacted_text: Optional[dict] = None
|
||||
self,
|
||||
mock_testing: bool = False,
|
||||
mock_redacted_text: Optional[dict] = None,
|
||||
):
|
||||
self.mock_redacted_text = mock_redacted_text
|
||||
self.llm_guard_mode = litellm.llm_guard_mode
|
||||
if mock_testing == True: # for testing purposes only
|
||||
return
|
||||
self.llm_guard_api_base = litellm.get_secret("LLM_GUARD_API_BASE", None)
|
||||
|
@ -59,7 +63,7 @@ class _ENTERPRISE_LLMGuard(CustomLogger):
|
|||
else:
|
||||
# Make the first request to /analyze
|
||||
analyze_url = f"{self.llm_guard_api_base}analyze/prompt"
|
||||
verbose_proxy_logger.debug(f"Making request to: {analyze_url}")
|
||||
verbose_proxy_logger.debug("Making request to: %s", analyze_url)
|
||||
analyze_payload = {"prompt": text}
|
||||
redacted_text = None
|
||||
async with session.post(
|
||||
|
@ -72,7 +76,7 @@ class _ENTERPRISE_LLMGuard(CustomLogger):
|
|||
if redacted_text is not None:
|
||||
if (
|
||||
redacted_text.get("is_valid", None) is not None
|
||||
and redacted_text["is_valid"] == "True"
|
||||
and redacted_text["is_valid"] != True
|
||||
):
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
|
@ -91,9 +95,26 @@ class _ENTERPRISE_LLMGuard(CustomLogger):
|
|||
traceback.print_exc()
|
||||
raise e
|
||||
|
||||
def should_proceed(self, user_api_key_dict: UserAPIKeyAuth) -> bool:
|
||||
if self.llm_guard_mode == "key-specific":
|
||||
# check if llm guard enabled for specific keys only
|
||||
self.print_verbose(
|
||||
f"user_api_key_dict.permissions: {user_api_key_dict.permissions}"
|
||||
)
|
||||
if (
|
||||
user_api_key_dict.permissions.get("enable_llm_guard_check", False)
|
||||
== True
|
||||
):
|
||||
return True
|
||||
elif self.llm_guard_mode == "all":
|
||||
return True
|
||||
return False
|
||||
|
||||
async def async_moderation_hook(
|
||||
self,
|
||||
data: dict,
|
||||
user_api_key_dict: UserAPIKeyAuth,
|
||||
call_type: Literal["completion", "embeddings", "image_generation"],
|
||||
):
|
||||
"""
|
||||
- Calls the LLM Guard Endpoint
|
||||
|
@ -101,7 +122,32 @@ class _ENTERPRISE_LLMGuard(CustomLogger):
|
|||
- Use the sanitized prompt returned
|
||||
- LLM Guard can handle things like PII Masking, etc.
|
||||
"""
|
||||
return data
|
||||
self.print_verbose(
|
||||
f"Inside LLM Guard Pre-Call Hook - llm_guard_mode={self.llm_guard_mode}"
|
||||
)
|
||||
|
||||
_proceed = self.should_proceed(user_api_key_dict=user_api_key_dict)
|
||||
if _proceed == False:
|
||||
return
|
||||
|
||||
self.print_verbose("Makes LLM Guard Check")
|
||||
try:
|
||||
assert call_type in [
|
||||
"completion",
|
||||
"embeddings",
|
||||
"image_generation",
|
||||
"moderation",
|
||||
"audio_transcription",
|
||||
]
|
||||
except Exception as e:
|
||||
self.print_verbose(
|
||||
f"Call Type - {call_type}, not in accepted list - ['completion','embeddings','image_generation','moderation','audio_transcription']"
|
||||
)
|
||||
return data
|
||||
|
||||
formatted_prompt = get_formatted_prompt(data=data, call_type=call_type) # type: ignore
|
||||
self.print_verbose(f"LLM Guard, formatted_prompt: {formatted_prompt}")
|
||||
return await self.moderation_check(text=formatted_prompt)
|
||||
|
||||
async def async_post_call_streaming_hook(
|
||||
self, user_api_key_dict: UserAPIKeyAuth, response: str
|
||||
|
|
8
litellm-js/proxy/README.md
Normal file
|
@ -0,0 +1,8 @@
|
|||
```
|
||||
npm install
|
||||
npm run dev
|
||||
```
|
||||
|
||||
```
|
||||
npm run deploy
|
||||
```
|
14
litellm-js/proxy/package.json
Normal file
|
@ -0,0 +1,14 @@
|
|||
{
|
||||
"scripts": {
|
||||
"dev": "wrangler dev src/index.ts",
|
||||
"deploy": "wrangler deploy --minify src/index.ts"
|
||||
},
|
||||
"dependencies": {
|
||||
"hono": "^4.1.4",
|
||||
"openai": "^4.29.2"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@cloudflare/workers-types": "^4.20240208.0",
|
||||
"wrangler": "^3.32.0"
|
||||
}
|
||||
}
|
59
litellm-js/proxy/src/index.ts
Normal file
|
@ -0,0 +1,59 @@
|
|||
import { Hono } from 'hono'
|
||||
import { Context } from 'hono';
|
||||
import { bearerAuth } from 'hono/bearer-auth'
|
||||
import OpenAI from "openai";
|
||||
|
||||
const openai = new OpenAI({
|
||||
apiKey: "sk-1234",
|
||||
baseURL: "https://openai-endpoint.ishaanjaffer0324.workers.dev"
|
||||
});
|
||||
|
||||
async function call_proxy() {
|
||||
const completion = await openai.chat.completions.create({
|
||||
messages: [{ role: "system", content: "You are a helpful assistant." }],
|
||||
model: "gpt-3.5-turbo",
|
||||
});
|
||||
|
||||
return completion
|
||||
}
|
||||
|
||||
const app = new Hono()
|
||||
|
||||
// Middleware for API Key Authentication
|
||||
const apiKeyAuth = async (c: Context, next: Function) => {
|
||||
const apiKey = c.req.header('Authorization');
|
||||
if (!apiKey || apiKey !== 'Bearer sk-1234') {
|
||||
return c.text('Unauthorized', 401);
|
||||
}
|
||||
await next();
|
||||
};
|
||||
|
||||
|
||||
app.use('/*', apiKeyAuth)
|
||||
|
||||
|
||||
app.get('/', (c) => {
|
||||
return c.text('Hello Hono!')
|
||||
})
|
||||
|
||||
|
||||
|
||||
|
||||
// Handler for chat completions
|
||||
const chatCompletionHandler = async (c: Context) => {
|
||||
// Assuming your logic for handling chat completion goes here
|
||||
// For demonstration, just returning a simple JSON response
|
||||
const response = await call_proxy()
|
||||
return c.json(response);
|
||||
};
|
||||
|
||||
// Register the above handler for different POST routes with the apiKeyAuth middleware
|
||||
app.post('/v1/chat/completions', chatCompletionHandler);
|
||||
app.post('/chat/completions', chatCompletionHandler);
|
||||
|
||||
// Example showing how you might handle dynamic segments within the URL
|
||||
// Here, using ':model*' to capture the rest of the path as a parameter 'model'
|
||||
app.post('/openai/deployments/:model*/chat/completions', chatCompletionHandler);
|
||||
|
||||
|
||||
export default app
|
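
Once the worker is running (e.g. via `npm run dev`, or after `npm run deploy`), you could exercise the bearer-auth middleware and the chat route with something like the following (the local URL/port is illustrative):

```shell
curl -X POST 'http://localhost:8787/v1/chat/completions' \
  -H 'Authorization: Bearer sk-1234' \
  -H 'Content-Type: application/json' \
  -d '{"model": "gpt-3.5-turbo", "messages": [{"role": "user", "content": "hi"}]}'

# requests without `Authorization: Bearer sk-1234` are rejected with 401 Unauthorized
```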
16
litellm-js/proxy/tsconfig.json
Normal file
|
@ -0,0 +1,16 @@
|
|||
{
|
||||
"compilerOptions": {
|
||||
"target": "ESNext",
|
||||
"module": "ESNext",
|
||||
"moduleResolution": "Bundler",
|
||||
"strict": true,
|
||||
"lib": [
|
||||
"ESNext"
|
||||
],
|
||||
"types": [
|
||||
"@cloudflare/workers-types"
|
||||
],
|
||||
"jsx": "react-jsx",
|
||||
"jsxImportSource": "hono/jsx"
|
||||
},
|
||||
}
|
18
litellm-js/proxy/wrangler.toml
Normal file
|
@ -0,0 +1,18 @@
|
|||
name = "my-app"
|
||||
compatibility_date = "2023-12-01"
|
||||
|
||||
# [vars]
|
||||
# MY_VAR = "my-variable"
|
||||
|
||||
# [[kv_namespaces]]
|
||||
# binding = "MY_KV_NAMESPACE"
|
||||
# id = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
|
||||
|
||||
# [[r2_buckets]]
|
||||
# binding = "MY_BUCKET"
|
||||
# bucket_name = "my-bucket"
|
||||
|
||||
# [[d1_databases]]
|
||||
# binding = "DB"
|
||||
# database_name = "my-database"
|
||||
# database_id = ""
|
26
litellm-js/spend-logs/Dockerfile
Normal file
|
@ -0,0 +1,26 @@
|
|||
# Use the specific Node.js v20.11.0 image
|
||||
FROM node:20.11.0
|
||||
|
||||
# Set the working directory inside the container
|
||||
WORKDIR /app
|
||||
|
||||
# Copy package.json and package-lock.json to the working directory
|
||||
COPY ./litellm-js/spend-logs/package*.json ./
|
||||
|
||||
# Install dependencies
|
||||
RUN npm install
|
||||
|
||||
# Install Prisma globally
|
||||
RUN npm install -g prisma
|
||||
|
||||
# Copy the rest of the application code
|
||||
COPY ./litellm-js/spend-logs .
|
||||
|
||||
# Generate Prisma client
|
||||
RUN npx prisma generate
|
||||
|
||||
# Expose the port that the Node.js server will run on
|
||||
EXPOSE 3000
|
||||
|
||||
# Command to run the Node.js app with npm run dev
|
||||
CMD ["npm", "run", "dev"]
|
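
Since the COPY paths above are relative to the repository root, the image is built with the repo root as the build context; a sketch of building and running it locally:

```shell
# run from the root of the litellm repo
docker build -f litellm-js/spend-logs/Dockerfile -t litellm-spend-logs .

# the app listens on port 3000 inside the container
# (additional env vars, e.g. a database URL for Prisma, may be required)
docker run -p 3000:3000 litellm-spend-logs
```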
8
litellm-js/spend-logs/README.md
Normal file
|
@ -0,0 +1,8 @@
|
|||
```
|
||||
npm install
|
||||
npm run dev
|
||||
```
|
||||
|
||||
```
|
||||
open http://localhost:3000
|
||||
```
|
508
litellm-js/spend-logs/package-lock.json
generated
Normal file
|
@ -0,0 +1,508 @@
|
|||
{
|
||||
"name": "spend-logs",
|
||||
"lockfileVersion": 3,
|
||||
"requires": true,
|
||||
"packages": {
|
||||
"": {
|
||||
"dependencies": {
|
||||
"@hono/node-server": "^1.9.0",
|
||||
"hono": "^4.1.5"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/node": "^20.11.17",
|
||||
"tsx": "^4.7.1"
|
||||
}
|
||||
},
|
||||
"node_modules/@esbuild/aix-ppc64": {
|
||||
"version": "0.19.12",
|
||||
"resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.19.12.tgz",
|
||||
"integrity": "sha512-bmoCYyWdEL3wDQIVbcyzRyeKLgk2WtWLTWz1ZIAZF/EGbNOwSA6ew3PftJ1PqMiOOGu0OyFMzG53L0zqIpPeNA==",
|
||||
"cpu": [
|
||||
"ppc64"
|
||||
],
|
||||
"dev": true,
|
||||
"optional": true,
|
||||
"os": [
|
||||
"aix"
|
||||
],
|
||||
"engines": {
|
||||
"node": ">=12"
|
||||
}
|
||||
},
|
||||
"node_modules/@esbuild/android-arm": {
|
||||
"version": "0.19.12",
|
||||
"resolved": "https://registry.npmjs.org/@esbuild/android-arm/-/android-arm-0.19.12.tgz",
|
||||
"integrity": "sha512-qg/Lj1mu3CdQlDEEiWrlC4eaPZ1KztwGJ9B6J+/6G+/4ewxJg7gqj8eVYWvao1bXrqGiW2rsBZFSX3q2lcW05w==",
|
||||
"cpu": [
|
||||
"arm"
|
||||
],
|
||||
"dev": true,
|
||||
"optional": true,
|
||||
"os": [
|
||||
"android"
|
||||
],
|
||||
"engines": {
|
||||
"node": ">=12"
|
||||
}
|
||||
},
|
||||
"node_modules/@esbuild/android-arm64": {
|
||||
"version": "0.19.12",
|
||||
"resolved": "https://registry.npmjs.org/@esbuild/android-arm64/-/android-arm64-0.19.12.tgz",
|
||||
"integrity": "sha512-P0UVNGIienjZv3f5zq0DP3Nt2IE/3plFzuaS96vihvD0Hd6H/q4WXUGpCxD/E8YrSXfNyRPbpTq+T8ZQioSuPA==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
"dev": true,
|
||||
"optional": true,
|
||||
"os": [
|
||||
"android"
|
||||
],
|
||||
"engines": {
|
||||
"node": ">=12"
|
||||
}
|
||||
},
|
||||
"node_modules/@esbuild/android-x64": {
|
||||
"version": "0.19.12",
|
||||
"resolved": "https://registry.npmjs.org/@esbuild/android-x64/-/android-x64-0.19.12.tgz",
|
||||
"integrity": "sha512-3k7ZoUW6Q6YqhdhIaq/WZ7HwBpnFBlW905Fa4s4qWJyiNOgT1dOqDiVAQFwBH7gBRZr17gLrlFCRzF6jFh7Kew==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
"dev": true,
|
||||
"optional": true,
|
||||
"os": [
|
||||
"android"
|
||||
],
|
||||
"engines": {
|
||||
"node": ">=12"
|
||||
}
|
||||
},
|
||||
"node_modules/@esbuild/darwin-arm64": {
|
||||
"version": "0.19.12",
|
||||
"resolved": "https://registry.npmjs.org/@esbuild/darwin-arm64/-/darwin-arm64-0.19.12.tgz",
|
||||
"integrity": "sha512-B6IeSgZgtEzGC42jsI+YYu9Z3HKRxp8ZT3cqhvliEHovq8HSX2YX8lNocDn79gCKJXOSaEot9MVYky7AKjCs8g==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
"dev": true,
|
||||
"optional": true,
|
||||
"os": [
|
||||
"darwin"
|
||||
],
|
||||
"engines": {
|
||||
"node": ">=12"
|
||||
}
|
||||
},
|
||||
"node_modules/@esbuild/darwin-x64": {
|
||||
"version": "0.19.12",
|
||||
"resolved": "https://registry.npmjs.org/@esbuild/darwin-x64/-/darwin-x64-0.19.12.tgz",
|
||||
"integrity": "sha512-hKoVkKzFiToTgn+41qGhsUJXFlIjxI/jSYeZf3ugemDYZldIXIxhvwN6erJGlX4t5h417iFuheZ7l+YVn05N3A==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
"dev": true,
|
||||
"optional": true,
|
||||
"os": [
|
||||
"darwin"
|
||||
],
|
||||
"engines": {
|
||||
"node": ">=12"
|
||||
}
|
||||
},
|
||||
"node_modules/@esbuild/freebsd-arm64": {
|
||||
"version": "0.19.12",
|
||||
"resolved": "https://registry.npmjs.org/@esbuild/freebsd-arm64/-/freebsd-arm64-0.19.12.tgz",
|
||||
"integrity": "sha512-4aRvFIXmwAcDBw9AueDQ2YnGmz5L6obe5kmPT8Vd+/+x/JMVKCgdcRwH6APrbpNXsPz+K653Qg8HB/oXvXVukA==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
"dev": true,
|
||||
"optional": true,
|
||||
"os": [
|
||||
"freebsd"
|
||||
],
|
||||
"engines": {
|
||||
"node": ">=12"
|
||||
}
|
||||
},
|
||||
"node_modules/@esbuild/freebsd-x64": {
|
||||
"version": "0.19.12",
|
||||
"resolved": "https://registry.npmjs.org/@esbuild/freebsd-x64/-/freebsd-x64-0.19.12.tgz",
|
||||
"integrity": "sha512-EYoXZ4d8xtBoVN7CEwWY2IN4ho76xjYXqSXMNccFSx2lgqOG/1TBPW0yPx1bJZk94qu3tX0fycJeeQsKovA8gg==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
"dev": true,
|
||||
"optional": true,
|
||||
"os": [
|
||||
"freebsd"
|
||||
],
|
||||
"engines": {
|
||||
"node": ">=12"
|
||||
}
|
||||
},
|
||||
"node_modules/@esbuild/linux-arm": {
|
||||
"version": "0.19.12",
|
||||
"resolved": "https://registry.npmjs.org/@esbuild/linux-arm/-/linux-arm-0.19.12.tgz",
|
||||
"integrity": "sha512-J5jPms//KhSNv+LO1S1TX1UWp1ucM6N6XuL6ITdKWElCu8wXP72l9MM0zDTzzeikVyqFE6U8YAV9/tFyj0ti+w==",
|
||||
"cpu": [
|
||||
"arm"
|
||||
],
|
||||
"dev": true,
|
||||
"optional": true,
|
||||
"os": [
|
||||
"linux"
|
||||
],
|
||||
"engines": {
|
||||
"node": ">=12"
|
||||
}
|
||||
},
|
||||
"node_modules/@esbuild/linux-arm64": {
|
||||
"version": "0.19.12",
|
||||
"resolved": "https://registry.npmjs.org/@esbuild/linux-arm64/-/linux-arm64-0.19.12.tgz",
|
||||
"integrity": "sha512-EoTjyYyLuVPfdPLsGVVVC8a0p1BFFvtpQDB/YLEhaXyf/5bczaGeN15QkR+O4S5LeJ92Tqotve7i1jn35qwvdA==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
"dev": true,
|
||||
"optional": true,
|
||||
"os": [
|
||||
"linux"
|
||||
],
|
||||
"engines": {
|
||||
"node": ">=12"
|
||||
}
|
||||
},
|
||||
"node_modules/@esbuild/linux-ia32": {
|
||||
"version": "0.19.12",
|
||||
"resolved": "https://registry.npmjs.org/@esbuild/linux-ia32/-/linux-ia32-0.19.12.tgz",
|
||||
"integrity": "sha512-Thsa42rrP1+UIGaWz47uydHSBOgTUnwBwNq59khgIwktK6x60Hivfbux9iNR0eHCHzOLjLMLfUMLCypBkZXMHA==",
|
||||
"cpu": [
|
||||
"ia32"
|
||||
],
|
||||
"dev": true,
|
||||
"optional": true,
|
||||
"os": [
|
||||
"linux"
|
||||
],
|
||||
"engines": {
|
||||
"node": ">=12"
|
||||
}
|
||||
},
|
||||
"node_modules/@esbuild/linux-loong64": {
|
||||
"version": "0.19.12",
|
||||
"resolved": "https://registry.npmjs.org/@esbuild/linux-loong64/-/linux-loong64-0.19.12.tgz",
|
||||
"integrity": "sha512-LiXdXA0s3IqRRjm6rV6XaWATScKAXjI4R4LoDlvO7+yQqFdlr1Bax62sRwkVvRIrwXxvtYEHHI4dm50jAXkuAA==",
|
||||
"cpu": [
|
||||
"loong64"
|
||||
],
|
||||
"dev": true,
|
||||
"optional": true,
|
||||
"os": [
|
||||
"linux"
|
||||
],
|
||||
"engines": {
|
||||
"node": ">=12"
|
||||
}
|
||||
},
|
||||
"node_modules/@esbuild/linux-mips64el": {
|
||||
"version": "0.19.12",
|
||||
"resolved": "https://registry.npmjs.org/@esbuild/linux-mips64el/-/linux-mips64el-0.19.12.tgz",
|
||||
"integrity": "sha512-fEnAuj5VGTanfJ07ff0gOA6IPsvrVHLVb6Lyd1g2/ed67oU1eFzL0r9WL7ZzscD+/N6i3dWumGE1Un4f7Amf+w==",
|
||||
"cpu": [
|
||||
"mips64el"
|
||||
],
|
||||
"dev": true,
|
||||
"optional": true,
|
||||
"os": [
|
||||
"linux"
|
||||
],
|
||||
"engines": {
|
||||
"node": ">=12"
|
||||
}
|
||||
},
|
||||
"node_modules/@esbuild/linux-ppc64": {
|
||||
"version": "0.19.12",
|
||||
"resolved": "https://registry.npmjs.org/@esbuild/linux-ppc64/-/linux-ppc64-0.19.12.tgz",
|
||||
"integrity": "sha512-nYJA2/QPimDQOh1rKWedNOe3Gfc8PabU7HT3iXWtNUbRzXS9+vgB0Fjaqr//XNbd82mCxHzik2qotuI89cfixg==",
|
||||
"cpu": [
|
||||
"ppc64"
|
||||
],
|
||||
"dev": true,
|
||||
"optional": true,
|
||||
"os": [
|
||||
"linux"
|
||||
],
|
||||
"engines": {
|
||||
"node": ">=12"
|
||||
}
|
||||
},
|
||||
"node_modules/@esbuild/linux-riscv64": {
|
||||
"version": "0.19.12",
|
||||
"resolved": "https://registry.npmjs.org/@esbuild/linux-riscv64/-/linux-riscv64-0.19.12.tgz",
|
||||
"integrity": "sha512-2MueBrlPQCw5dVJJpQdUYgeqIzDQgw3QtiAHUC4RBz9FXPrskyyU3VI1hw7C0BSKB9OduwSJ79FTCqtGMWqJHg==",
|
||||
"cpu": [
|
||||
"riscv64"
|
||||
],
|
||||
"dev": true,
|
||||
"optional": true,
|
||||
"os": [
|
||||
"linux"
|
||||
],
|
||||
"engines": {
|
||||
"node": ">=12"
|
||||
}
|
||||
},
|
||||
"node_modules/@esbuild/linux-s390x": {
|
||||
"version": "0.19.12",
|
||||
"resolved": "https://registry.npmjs.org/@esbuild/linux-s390x/-/linux-s390x-0.19.12.tgz",
|
||||
"integrity": "sha512-+Pil1Nv3Umes4m3AZKqA2anfhJiVmNCYkPchwFJNEJN5QxmTs1uzyy4TvmDrCRNT2ApwSari7ZIgrPeUx4UZDg==",
|
||||
"cpu": [
|
||||
"s390x"
|
||||
],
|
||||
"dev": true,
|
||||
"optional": true,
|
||||
"os": [
|
||||
"linux"
|
||||
],
|
||||
"engines": {
|
||||
"node": ">=12"
|
||||
}
|
||||
},
|
||||
"node_modules/@esbuild/linux-x64": {
|
||||
"version": "0.19.12",
|
||||
"resolved": "https://registry.npmjs.org/@esbuild/linux-x64/-/linux-x64-0.19.12.tgz",
|
||||
"integrity": "sha512-B71g1QpxfwBvNrfyJdVDexenDIt1CiDN1TIXLbhOw0KhJzE78KIFGX6OJ9MrtC0oOqMWf+0xop4qEU8JrJTwCg==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
"dev": true,
|
||||
"optional": true,
|
||||
"os": [
|
||||
"linux"
|
||||
],
|
||||
"engines": {
|
||||
"node": ">=12"
|
||||
}
|
||||
},
|
||||
"node_modules/@esbuild/netbsd-x64": {
|
||||
"version": "0.19.12",
|
||||
"resolved": "https://registry.npmjs.org/@esbuild/netbsd-x64/-/netbsd-x64-0.19.12.tgz",
|
||||
"integrity": "sha512-3ltjQ7n1owJgFbuC61Oj++XhtzmymoCihNFgT84UAmJnxJfm4sYCiSLTXZtE00VWYpPMYc+ZQmB6xbSdVh0JWA==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
"dev": true,
|
||||
"optional": true,
|
||||
"os": [
|
||||
"netbsd"
|
||||
],
|
||||
"engines": {
|
||||
"node": ">=12"
|
||||
}
|
||||
},
|
||||
"node_modules/@esbuild/openbsd-x64": {
|
||||
"version": "0.19.12",
|
||||
"resolved": "https://registry.npmjs.org/@esbuild/openbsd-x64/-/openbsd-x64-0.19.12.tgz",
|
||||
"integrity": "sha512-RbrfTB9SWsr0kWmb9srfF+L933uMDdu9BIzdA7os2t0TXhCRjrQyCeOt6wVxr79CKD4c+p+YhCj31HBkYcXebw==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
"dev": true,
|
||||
"optional": true,
|
||||
"os": [
|
||||
"openbsd"
|
||||
],
|
||||
"engines": {
|
||||
"node": ">=12"
|
||||
}
|
||||
},
|
||||
"node_modules/@esbuild/sunos-x64": {
|
||||
"version": "0.19.12",
|
||||
"resolved": "https://registry.npmjs.org/@esbuild/sunos-x64/-/sunos-x64-0.19.12.tgz",
|
||||
"integrity": "sha512-HKjJwRrW8uWtCQnQOz9qcU3mUZhTUQvi56Q8DPTLLB+DawoiQdjsYq+j+D3s9I8VFtDr+F9CjgXKKC4ss89IeA==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
"dev": true,
|
||||
"optional": true,
|
||||
"os": [
|
||||
"sunos"
|
||||
],
|
||||
"engines": {
|
||||
"node": ">=12"
|
||||
}
|
||||
},
|
||||
"node_modules/@esbuild/win32-arm64": {
|
||||
"version": "0.19.12",
|
||||
"resolved": "https://registry.npmjs.org/@esbuild/win32-arm64/-/win32-arm64-0.19.12.tgz",
|
||||
"integrity": "sha512-URgtR1dJnmGvX864pn1B2YUYNzjmXkuJOIqG2HdU62MVS4EHpU2946OZoTMnRUHklGtJdJZ33QfzdjGACXhn1A==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
"dev": true,
|
||||
"optional": true,
|
||||
"os": [
|
||||
"win32"
|
||||
],
|
||||
"engines": {
|
||||
"node": ">=12"
|
||||
}
|
||||
},
|
||||
"node_modules/@esbuild/win32-ia32": {
|
||||
"version": "0.19.12",
|
||||
"resolved": "https://registry.npmjs.org/@esbuild/win32-ia32/-/win32-ia32-0.19.12.tgz",
|
||||
"integrity": "sha512-+ZOE6pUkMOJfmxmBZElNOx72NKpIa/HFOMGzu8fqzQJ5kgf6aTGrcJaFsNiVMH4JKpMipyK+7k0n2UXN7a8YKQ==",
|
||||
"cpu": [
|
||||
"ia32"
|
||||
],
|
||||
"dev": true,
|
||||
"optional": true,
|
||||
"os": [
|
||||
"win32"
|
||||
],
|
||||
"engines": {
|
||||
"node": ">=12"
|
||||
}
|
||||
},
|
||||
"node_modules/@esbuild/win32-x64": {
|
||||
"version": "0.19.12",
|
||||
"resolved": "https://registry.npmjs.org/@esbuild/win32-x64/-/win32-x64-0.19.12.tgz",
|
||||
"integrity": "sha512-T1QyPSDCyMXaO3pzBkF96E8xMkiRYbUEZADd29SyPGabqxMViNoii+NcK7eWJAEoU6RZyEm5lVSIjTmcdoB9HA==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
"dev": true,
|
||||
"optional": true,
|
||||
"os": [
|
||||
"win32"
|
||||
],
|
||||
"engines": {
|
||||
"node": ">=12"
|
||||
}
|
||||
},
|
||||
"node_modules/@hono/node-server": {
|
||||
"version": "1.9.0",
|
||||
"resolved": "https://registry.npmjs.org/@hono/node-server/-/node-server-1.9.0.tgz",
|
||||
"integrity": "sha512-oJjk7WXBlENeHhWiMqSyxPIZ3Kmf5ZYxqdlcSIXyN8Rn50bNJsPl99G4POBS03Jxh56FdfRJ0SEnC8mAVIiavQ==",
|
||||
"engines": {
|
||||
"node": ">=18.14.1"
|
||||
}
|
||||
},
|
||||
"node_modules/@types/node": {
|
||||
"version": "20.11.30",
|
||||
"resolved": "https://registry.npmjs.org/@types/node/-/node-20.11.30.tgz",
|
||||
"integrity": "sha512-dHM6ZxwlmuZaRmUPfv1p+KrdD1Dci04FbdEm/9wEMouFqxYoFl5aMkt0VMAUtYRQDyYvD41WJLukhq/ha3YuTw==",
|
||||
"dev": true,
|
||||
"dependencies": {
|
||||
"undici-types": "~5.26.4"
|
||||
}
|
||||
},
|
||||
"node_modules/esbuild": {
|
||||
"version": "0.19.12",
|
||||
"resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.19.12.tgz",
|
||||
"integrity": "sha512-aARqgq8roFBj054KvQr5f1sFu0D65G+miZRCuJyJ0G13Zwx7vRar5Zhn2tkQNzIXcBrNVsv/8stehpj+GAjgbg==",
|
||||
"dev": true,
|
||||
"hasInstallScript": true,
|
||||
"bin": {
|
||||
"esbuild": "bin/esbuild"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=12"
|
||||
},
|
||||
"optionalDependencies": {
|
||||
"@esbuild/aix-ppc64": "0.19.12",
|
||||
"@esbuild/android-arm": "0.19.12",
|
||||
"@esbuild/android-arm64": "0.19.12",
|
||||
"@esbuild/android-x64": "0.19.12",
|
||||
"@esbuild/darwin-arm64": "0.19.12",
|
||||
"@esbuild/darwin-x64": "0.19.12",
|
||||
"@esbuild/freebsd-arm64": "0.19.12",
|
||||
"@esbuild/freebsd-x64": "0.19.12",
|
||||
"@esbuild/linux-arm": "0.19.12",
|
||||
"@esbuild/linux-arm64": "0.19.12",
|
||||
"@esbuild/linux-ia32": "0.19.12",
|
||||
"@esbuild/linux-loong64": "0.19.12",
|
||||
"@esbuild/linux-mips64el": "0.19.12",
|
||||
"@esbuild/linux-ppc64": "0.19.12",
|
||||
"@esbuild/linux-riscv64": "0.19.12",
|
||||
"@esbuild/linux-s390x": "0.19.12",
|
||||
"@esbuild/linux-x64": "0.19.12",
|
||||
"@esbuild/netbsd-x64": "0.19.12",
|
||||
"@esbuild/openbsd-x64": "0.19.12",
|
||||
"@esbuild/sunos-x64": "0.19.12",
|
||||
"@esbuild/win32-arm64": "0.19.12",
|
||||
"@esbuild/win32-ia32": "0.19.12",
|
||||
"@esbuild/win32-x64": "0.19.12"
|
||||
}
|
||||
},
|
||||
"node_modules/fsevents": {
|
||||
"version": "2.3.3",
|
||||
"resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz",
|
||||
"integrity": "sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==",
|
||||
"dev": true,
|
||||
"hasInstallScript": true,
|
||||
"optional": true,
|
||||
"os": [
|
||||
"darwin"
|
||||
],
|
||||
"engines": {
|
||||
"node": "^8.16.0 || ^10.6.0 || >=11.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/get-tsconfig": {
|
||||
"version": "4.7.3",
|
||||
"resolved": "https://registry.npmjs.org/get-tsconfig/-/get-tsconfig-4.7.3.tgz",
|
||||
"integrity": "sha512-ZvkrzoUA0PQZM6fy6+/Hce561s+faD1rsNwhnO5FelNjyy7EMGJ3Rz1AQ8GYDWjhRs/7dBLOEJvhK8MiEJOAFg==",
|
||||
"dev": true,
|
||||
"dependencies": {
|
||||
"resolve-pkg-maps": "^1.0.0"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://github.com/privatenumber/get-tsconfig?sponsor=1"
|
||||
}
|
||||
},
|
||||
"node_modules/hono": {
|
||||
"version": "4.1.5",
|
||||
"resolved": "https://registry.npmjs.org/hono/-/hono-4.1.5.tgz",
|
||||
"integrity": "sha512-3ChJiIoeCxvkt6vnkxJagplrt1YZg3NyNob7ssVeK2PUqEINp4q1F94HzFnvY9QE8asVmbW5kkTDlyWylfg2vg==",
|
||||
"engines": {
|
||||
"node": ">=16.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/resolve-pkg-maps": {
|
||||
"version": "1.0.0",
|
||||
"resolved": "https://registry.npmjs.org/resolve-pkg-maps/-/resolve-pkg-maps-1.0.0.tgz",
|
||||
"integrity": "sha512-seS2Tj26TBVOC2NIc2rOe2y2ZO7efxITtLZcGSOnHHNOQ7CkiUBfw0Iw2ck6xkIhPwLhKNLS8BO+hEpngQlqzw==",
|
||||
"dev": true,
|
||||
"funding": {
|
||||
"url": "https://github.com/privatenumber/resolve-pkg-maps?sponsor=1"
|
||||
}
|
||||
},
|
||||
"node_modules/tsx": {
|
||||
"version": "4.7.1",
|
||||
"resolved": "https://registry.npmjs.org/tsx/-/tsx-4.7.1.tgz",
|
||||
"integrity": "sha512-8d6VuibXHtlN5E3zFkgY8u4DX7Y3Z27zvvPKVmLon/D4AjuKzarkUBTLDBgj9iTQ0hg5xM7c/mYiRVM+HETf0g==",
|
||||
"dev": true,
|
||||
"dependencies": {
|
||||
"esbuild": "~0.19.10",
|
||||
"get-tsconfig": "^4.7.2"
|
||||
},
|
||||
"bin": {
|
||||
"tsx": "dist/cli.mjs"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=18.0.0"
|
||||
},
|
||||
"optionalDependencies": {
|
||||
"fsevents": "~2.3.3"
|
||||
}
|
||||
},
|
||||
"node_modules/undici-types": {
|
||||
"version": "5.26.5",
|
||||
"resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz",
|
||||
"integrity": "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==",
|
||||
"dev": true
|
||||
}
|
||||
}
|
||||
}
|
13
litellm-js/spend-logs/package.json
Normal file
|
@ -0,0 +1,13 @@
|
|||
{
|
||||
"scripts": {
|
||||
"dev": "tsx watch src/index.ts"
|
||||
},
|
||||
"dependencies": {
|
||||
"@hono/node-server": "^1.9.0",
|
||||
"hono": "^4.1.5"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/node": "^20.11.17",
|
||||
"tsx": "^4.7.1"
|
||||
}
|
||||
}
|
29
litellm-js/spend-logs/schema.prisma
Normal file
|
@ -0,0 +1,29 @@
|
|||
generator client {
|
||||
provider = "prisma-client-js"
|
||||
}
|
||||
|
||||
datasource client {
|
||||
provider = "postgresql"
|
||||
url = env("DATABASE_URL")
|
||||
}
|
||||
|
||||
model LiteLLM_SpendLogs {
|
||||
request_id String @id
|
||||
call_type String
|
||||
api_key String @default("")
|
||||
spend Float @default(0.0)
|
||||
total_tokens Int @default(0)
|
||||
prompt_tokens Int @default(0)
|
||||
completion_tokens Int @default(0)
|
||||
startTime DateTime
|
||||
endTime DateTime
|
||||
model String @default("")
|
||||
api_base String @default("")
|
||||
user String @default("")
|
||||
metadata Json @default("{}")
|
||||
cache_hit String @default("")
|
||||
cache_key String @default("")
|
||||
request_tags Json @default("[]")
|
||||
team_id String?
|
||||
end_user String?
|
||||
}
|
32
litellm-js/spend-logs/src/_types.ts
Normal file
|
@ -0,0 +1,32 @@
|
|||
export type LiteLLM_IncrementSpend = {
|
||||
key_transactions: Array<LiteLLM_IncrementObject>, // [{"key": spend},..]
|
||||
user_transactions: Array<LiteLLM_IncrementObject>,
|
||||
team_transactions: Array<LiteLLM_IncrementObject>,
|
||||
spend_logs_transactions: Array<LiteLLM_SpendLogs>
|
||||
}
|
||||
|
||||
export type LiteLLM_IncrementObject = {
|
||||
key: string,
|
||||
spend: number
|
||||
}
|
||||
|
||||
export type LiteLLM_SpendLogs = {
|
||||
request_id: string; // @id means it's a unique identifier
|
||||
call_type: string;
|
||||
api_key: string; // @default("") means it defaults to an empty string if not provided
|
||||
spend: number; // Float in Prisma corresponds to number in TypeScript
|
||||
total_tokens: number; // Int in Prisma corresponds to number in TypeScript
|
||||
prompt_tokens: number;
|
||||
completion_tokens: number;
|
||||
startTime: Date; // DateTime in Prisma corresponds to Date in TypeScript
|
||||
endTime: Date;
|
||||
model: string; // @default("") means it defaults to an empty string if not provided
|
||||
api_base: string;
|
||||
user: string;
|
||||
metadata: any; // Json type in Prisma is represented by any in TypeScript; could also use a more specific type if the structure of JSON is known
|
||||
cache_hit: string;
|
||||
cache_key: string;
|
||||
request_tags: any; // Similarly, this could be an array or a more specific type depending on the expected structure
|
||||
team_id?: string | null; // ? indicates it's optional and can be undefined, but could also be null if not provided
|
||||
end_user?: string | null;
|
||||
};
|
84
litellm-js/spend-logs/src/index.ts
Normal file
|
@ -0,0 +1,84 @@
|
|||
import { serve } from '@hono/node-server'
|
||||
import { Hono } from 'hono'
|
||||
import { PrismaClient } from '@prisma/client'
|
||||
import {LiteLLM_SpendLogs, LiteLLM_IncrementSpend, LiteLLM_IncrementObject} from './_types'
|
||||
|
||||
const app = new Hono()
|
||||
const prisma = new PrismaClient()
|
||||
// In-memory storage for logs
|
||||
let spend_logs: LiteLLM_SpendLogs[] = [];
|
||||
const key_logs: LiteLLM_IncrementObject[] = [];
|
||||
const user_logs: LiteLLM_IncrementObject[] = [];
|
||||
const transaction_logs: LiteLLM_IncrementObject[] = [];
|
||||
|
||||
|
||||
app.get('/', (c) => {
|
||||
return c.text('Hello Hono!')
|
||||
})
|
||||
|
||||
const MIN_LOGS = 1; // Minimum number of logs needed to initiate a flush
|
||||
const FLUSH_INTERVAL = 5000; // Time in ms to wait before trying to flush again
|
||||
const BATCH_SIZE = 100; // Preferred size of each batch to write to the database
|
||||
const MAX_LOGS_PER_INTERVAL = 1000; // Maximum number of logs to flush in a single interval
|
||||
|
||||
const flushLogsToDb = async () => {
|
||||
if (spend_logs.length >= MIN_LOGS) {
|
||||
// Limit the logs to process in this interval to MAX_LOGS_PER_INTERVAL or less
|
||||
const logsToProcess = spend_logs.slice(0, MAX_LOGS_PER_INTERVAL);
|
||||
|
||||
for (let i = 0; i < logsToProcess.length; i += BATCH_SIZE) {
|
||||
// Create subarray for current batch, ensuring it doesn't exceed the BATCH_SIZE
|
||||
const batch = logsToProcess.slice(i, i + BATCH_SIZE);
|
||||
|
||||
// Convert datetime strings to Date objects
|
||||
const batchWithDates = batch.map(entry => ({
|
||||
...entry,
|
||||
startTime: new Date(entry.startTime),
|
||||
endTime: new Date(entry.endTime),
|
||||
// Repeat for any other DateTime fields you may have
|
||||
}));
|
||||
|
||||
await prisma.liteLLM_SpendLogs.createMany({
|
||||
data: batchWithDates,
|
||||
});
|
||||
|
||||
console.log(`Flushed ${batch.length} logs to the DB.`);
|
||||
}
|
||||
|
||||
// Remove the processed logs from spend_logs
|
||||
spend_logs = spend_logs.slice(logsToProcess.length);
|
||||
|
||||
console.log(`${logsToProcess.length} logs processed. Remaining in queue: ${spend_logs.length}`);
|
||||
} else {
|
||||
// This will ensure it doesn't falsely claim "No logs to flush." when it's merely below the MIN_LOGS threshold.
|
||||
if(spend_logs.length > 0) {
|
||||
console.log(`Accumulating logs. Currently at ${spend_logs.length}, waiting for at least ${MIN_LOGS}.`);
|
||||
} else {
|
||||
console.log("No logs to flush.");
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Setup interval for attempting to flush the logs
|
||||
setInterval(flushLogsToDb, FLUSH_INTERVAL);
|
||||
|
||||
// Route to receive log messages
|
||||
app.post('/spend/update', async (c) => {
|
||||
const incomingLogs = await c.req.json<LiteLLM_SpendLogs[]>();
|
||||
|
||||
spend_logs.push(...incomingLogs);
|
||||
|
||||
console.log(`Received and stored ${incomingLogs.length} logs. Total logs in memory: ${spend_logs.length}`);
|
||||
|
||||
return c.json({ message: `Successfully stored ${incomingLogs.length} logs` });
|
||||
});
|
||||
|
||||
|
||||
|
||||
const port = 3000
|
||||
console.log(`Server is running on port ${port}`)
|
||||
|
||||
serve({
|
||||
fetch: app.fetch,
|
||||
port
|
||||
})
|
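The service above accepts a JSON array of `LiteLLM_SpendLogs` objects on `POST /spend/update` and buffers them in memory until the periodic flush writes them to Postgres via Prisma. Below is a hypothetical client sketch (assuming the server is running locally on port 3000; all field values are placeholders) showing the payload shape it expects:

```python
# Hypothetical client for the spend-logs service above; values are illustrative.
import datetime
import requests

now = datetime.datetime.now(datetime.timezone.utc).isoformat()

log_entry = {
    "request_id": "req-123",
    "call_type": "acompletion",
    "api_key": "hashed-key",
    "spend": 0.00042,
    "total_tokens": 120,
    "prompt_tokens": 100,
    "completion_tokens": 20,
    "startTime": now,
    "endTime": now,
    "model": "gpt-3.5-turbo",
    "api_base": "",
    "user": "",
    "metadata": {},
    "cache_hit": "False",
    "cache_key": "",
    "request_tags": [],
}

# The endpoint expects a JSON array of log objects; they are buffered in memory
# until the interval-based flush writes them to the database in batches.
resp = requests.post("http://localhost:3000/spend/update", json=[log_entry])
print(resp.json())
```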
13
litellm-js/spend-logs/tsconfig.json
Normal file
|
@ -0,0 +1,13 @@
|
|||
{
|
||||
"compilerOptions": {
|
||||
"target": "ESNext",
|
||||
"module": "ESNext",
|
||||
"moduleResolution": "Bundler",
|
||||
"strict": true,
|
||||
"types": [
|
||||
"node"
|
||||
],
|
||||
"jsx": "react-jsx",
|
||||
"jsxImportSource": "hono/jsx",
|
||||
}
|
||||
}
|
|
@ -1,6 +1,6 @@
|
|||
### INIT VARIABLES ###
|
||||
import threading, requests, os
|
||||
from typing import Callable, List, Optional, Dict, Union, Any
|
||||
from typing import Callable, List, Optional, Dict, Union, Any, Literal
|
||||
from litellm.caching import Cache
|
||||
from litellm._logging import set_verbose, _turn_on_debug, verbose_logger
|
||||
from litellm.proxy._types import KeyManagementSystem, KeyManagementSettings
|
||||
|
@ -56,6 +56,7 @@ baseten_key: Optional[str] = None
|
|||
aleph_alpha_key: Optional[str] = None
|
||||
nlp_cloud_key: Optional[str] = None
|
||||
use_client: bool = False
|
||||
disable_streaming_logging: bool = False
|
||||
### GUARDRAILS ###
|
||||
llamaguard_model_name: Optional[str] = None
|
||||
presidio_ad_hoc_recognizers: Optional[str] = None
|
||||
|
@ -63,6 +64,7 @@ google_moderation_confidence_threshold: Optional[float] = None
|
|||
llamaguard_unsafe_content_categories: Optional[str] = None
|
||||
blocked_user_list: Optional[Union[str, List]] = None
|
||||
banned_keywords_list: Optional[Union[str, List]] = None
|
||||
llm_guard_mode: Literal["all", "key-specific"] = "all"
|
||||
##################
|
||||
logging: bool = True
|
||||
caching: bool = (
|
||||
|
@ -172,6 +174,7 @@ upperbound_key_generate_params: Optional[Dict] = None
|
|||
default_user_params: Optional[Dict] = None
|
||||
default_team_settings: Optional[List] = None
|
||||
max_user_budget: Optional[float] = None
|
||||
max_end_user_budget: Optional[float] = None
|
||||
#### RELIABILITY ####
|
||||
request_timeout: Optional[float] = 6000
|
||||
num_retries: Optional[int] = None # per model endpoint
|
||||
|
|
|
@ -38,6 +38,9 @@ class BaseCache:
|
|||
async def async_get_cache(self, key, **kwargs):
|
||||
raise NotImplementedError
|
||||
|
||||
async def batch_cache_write(self, result, *args, **kwargs):
|
||||
raise NotImplementedError
|
||||
|
||||
async def disconnect(self):
|
||||
raise NotImplementedError
|
||||
|
||||
|
@ -96,7 +99,9 @@ class InMemoryCache(BaseCache):
|
|||
class RedisCache(BaseCache):
|
||||
# if users don't provide one, use the default litellm cache
|
||||
|
||||
def __init__(self, host=None, port=None, password=None, **kwargs):
|
||||
def __init__(
|
||||
self, host=None, port=None, password=None, redis_flush_size=100, **kwargs
|
||||
):
|
||||
from ._redis import get_redis_client, get_redis_connection_pool
|
||||
|
||||
redis_kwargs = {}
|
||||
|
@ -111,6 +116,10 @@ class RedisCache(BaseCache):
|
|||
self.redis_client = get_redis_client(**redis_kwargs)
|
||||
self.redis_kwargs = redis_kwargs
|
||||
self.async_redis_conn_pool = get_redis_connection_pool(**redis_kwargs)
|
||||
|
||||
# for high traffic, we buffer redis writes in memory and then batch write to redis
|
||||
self.redis_batch_writing_buffer = []
|
||||
self.redis_flush_size = redis_flush_size
|
||||
self.redis_version = "Unknown"
|
||||
try:
|
||||
self.redis_version = self.redis_client.info()["redis_version"]
|
||||
|
@ -161,8 +170,10 @@ class RedisCache(BaseCache):
|
|||
)
|
||||
except Exception as e:
|
||||
# NON blocking - notify users Redis is throwing an exception
|
||||
print_verbose(
|
||||
f"LiteLLM Redis Caching: async set() - Got exception from REDIS : {str(e)}"
|
||||
verbose_logger.error(
|
||||
"LiteLLM Redis Caching: async set() - Got exception from REDIS %s, Writing value=%s",
|
||||
str(e),
|
||||
value,
|
||||
)
|
||||
traceback.print_exc()
|
||||
|
||||
|
@ -191,7 +202,27 @@ class RedisCache(BaseCache):
|
|||
# Optionally, you could process 'results' to make sure that all set operations were successful.
|
||||
return results
|
||||
except Exception as e:
|
||||
print_verbose(f"Error occurred in pipeline write - {str(e)}")
|
||||
verbose_logger.error(
|
||||
"LiteLLM Redis Caching: async set_cache_pipeline() - Got exception from REDIS %s, Writing value=%s",
|
||||
str(e),
|
||||
cache_value,
|
||||
)
|
||||
traceback.print_exc()
|
||||
|
||||
async def batch_cache_write(self, key, value, **kwargs):
|
||||
print_verbose(
|
||||
f"in batch cache writing for redis buffer size={len(self.redis_batch_writing_buffer)}",
|
||||
)
|
||||
self.redis_batch_writing_buffer.append((key, value))
|
||||
if len(self.redis_batch_writing_buffer) >= self.redis_flush_size:
|
||||
await self.flush_cache_buffer()
|
||||
|
||||
async def flush_cache_buffer(self):
|
||||
print_verbose(
|
||||
f"flushing to redis....reached size of buffer {len(self.redis_batch_writing_buffer)}"
|
||||
)
|
||||
await self.async_set_cache_pipeline(self.redis_batch_writing_buffer)
|
||||
self.redis_batch_writing_buffer = []
|
||||
|
||||
def _get_cache_logic(self, cached_response: Any):
|
||||
"""
|
||||
|
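The hunk above adds a write buffer to `RedisCache`: `batch_cache_write()` appends `(key, value)` tuples in memory and only calls `async_set_cache_pipeline()` once the buffer reaches `redis_flush_size`. A minimal standalone sketch of the same pattern (illustrative names, no real Redis involved):

```python
# Standalone sketch of the buffered-write pattern introduced above.
import asyncio


class BufferedWriter:
    def __init__(self, flush_size: int = 100):
        self.flush_size = flush_size
        self.buffer: list = []

    async def write(self, key: str, value: str) -> None:
        # Accumulate writes in memory instead of hitting the backend per call.
        self.buffer.append((key, value))
        if len(self.buffer) >= self.flush_size:
            await self.flush()

    async def flush(self) -> None:
        # In RedisCache this is a single pipelined SET for the whole buffer.
        batch, self.buffer = self.buffer, []
        print(f"flushing {len(batch)} entries in one round trip")


async def main():
    writer = BufferedWriter(flush_size=3)
    for i in range(7):
        await writer.write(f"key-{i}", f"value-{i}")
    await writer.flush()  # flush whatever is left over


asyncio.run(main())
```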
@ -287,6 +318,9 @@ class RedisCache(BaseCache):
|
|||
def flush_cache(self):
|
||||
self.redis_client.flushall()
|
||||
|
||||
def flushall(self):
|
||||
self.redis_client.flushall()
|
||||
|
||||
async def disconnect(self):
|
||||
await self.async_redis_conn_pool.disconnect(inuse_connections=True)
|
||||
|
||||
|
@ -874,6 +908,7 @@ class Cache:
|
|||
port: Optional[str] = None,
|
||||
password: Optional[str] = None,
|
||||
namespace: Optional[str] = None,
|
||||
ttl: Optional[float] = None,
|
||||
similarity_threshold: Optional[float] = None,
|
||||
supported_call_types: Optional[
|
||||
List[
|
||||
|
@ -908,6 +943,7 @@ class Cache:
|
|||
s3_path: Optional[str] = None,
|
||||
redis_semantic_cache_use_async=False,
|
||||
redis_semantic_cache_embedding_model="text-embedding-ada-002",
|
||||
redis_flush_size=None,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
|
@ -930,7 +966,9 @@ class Cache:
|
|||
None. Cache is set as a litellm param
|
||||
"""
|
||||
if type == "redis":
|
||||
self.cache: BaseCache = RedisCache(host, port, password, **kwargs)
|
||||
self.cache: BaseCache = RedisCache(
|
||||
host, port, password, redis_flush_size, **kwargs
|
||||
)
|
||||
elif type == "redis-semantic":
|
||||
self.cache = RedisSemanticCache(
|
||||
host,
|
||||
|
@ -967,6 +1005,8 @@ class Cache:
|
|||
self.supported_call_types = supported_call_types # default to ["completion", "acompletion", "embedding", "aembedding"]
|
||||
self.type = type
|
||||
self.namespace = namespace
|
||||
self.redis_flush_size = redis_flush_size
|
||||
self.ttl = ttl
|
||||
|
||||
def get_cache_key(self, *args, **kwargs):
|
||||
"""
|
||||
|
@ -1206,6 +1246,9 @@ class Cache:
|
|||
if isinstance(result, OpenAIObject):
|
||||
result = result.model_dump_json()
|
||||
|
||||
## DEFAULT TTL ##
|
||||
if self.ttl is not None:
|
||||
kwargs["ttl"] = self.ttl
|
||||
## Get Cache-Controls ##
|
||||
if kwargs.get("cache", None) is not None and isinstance(
|
||||
kwargs.get("cache"), dict
|
||||
|
@ -1213,6 +1256,7 @@ class Cache:
|
|||
for k, v in kwargs.get("cache").items():
|
||||
if k == "ttl":
|
||||
kwargs["ttl"] = v
|
||||
|
||||
cached_data = {"timestamp": time.time(), "response": result}
|
||||
return cache_key, cached_data, kwargs
|
||||
else:
|
||||
|
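The two hunks above add TTL handling: a default `ttl` set on the `Cache` object is applied to every write, and a per-request `cache={"ttl": ...}` dict overrides it. A hedged usage example (host/port are placeholders for your own Redis instance):

```python
# Default TTL on the cache object, overridden per request via the `cache` kwarg.
import litellm
from litellm.caching import Cache

litellm.cache = Cache(type="redis", host="localhost", port="6379", ttl=600)

response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hello"}],
    cache={"ttl": 60},  # this request's cache entry expires after 60s instead of 600
)
```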
@ -1246,10 +1290,14 @@ class Cache:
|
|||
Async implementation of add_cache
|
||||
"""
|
||||
try:
|
||||
cache_key, cached_data, kwargs = self._add_cache_logic(
|
||||
result=result, *args, **kwargs
|
||||
)
|
||||
await self.cache.async_set_cache(cache_key, cached_data, **kwargs)
|
||||
if self.type == "redis" and self.redis_flush_size is not None:
|
||||
# high traffic - fill in results in memory and then flush
|
||||
await self.batch_cache_write(result, *args, **kwargs)
|
||||
else:
|
||||
cache_key, cached_data, kwargs = self._add_cache_logic(
|
||||
result=result, *args, **kwargs
|
||||
)
|
||||
await self.cache.async_set_cache(cache_key, cached_data, **kwargs)
|
||||
except Exception as e:
|
||||
print_verbose(f"LiteLLM Cache: Excepton add_cache: {str(e)}")
|
||||
traceback.print_exc()
|
||||
|
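Taken together with the `RedisCache` changes, the buffered path is opt-in: `async_add_cache` only routes through `batch_cache_write()` when the cache type is `redis` and `redis_flush_size` is set. A hedged configuration sketch (connection details are placeholders):

```python
# Opt into buffered Redis writes; without redis_flush_size each async cache
# add issues its own Redis SET.
import litellm
from litellm.caching import Cache

litellm.cache = Cache(
    type="redis",
    host="localhost",
    port="6379",
    password=None,
    redis_flush_size=100,  # buffer 100 cache writes, then flush in one pipeline call
)
```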
@ -1287,6 +1335,12 @@ class Cache:
|
|||
print_verbose(f"LiteLLM Cache: Excepton add_cache: {str(e)}")
|
||||
traceback.print_exc()
|
||||
|
||||
async def batch_cache_write(self, result, *args, **kwargs):
|
||||
cache_key, cached_data, kwargs = self._add_cache_logic(
|
||||
result=result, *args, **kwargs
|
||||
)
|
||||
await self.cache.batch_cache_write(cache_key, cached_data, **kwargs)
|
||||
|
||||
async def ping(self):
|
||||
if hasattr(self.cache, "ping"):
|
||||
return await self.cache.ping()
|
||||
|
|
|
@ -10,7 +10,7 @@ class AthinaLogger:
|
|||
"Content-Type": "application/json"
|
||||
}
|
||||
self.athina_logging_url = "https://log.athina.ai/api/v1/log/inference"
|
||||
self.additional_keys = ["environment", "prompt_slug", "customer_id", "customer_user_id", "session_id", "external_reference_id", "context", "expected_response"]
|
||||
self.additional_keys = ["environment", "prompt_slug", "customer_id", "customer_user_id", "session_id", "external_reference_id", "context", "expected_response", "user_query"]
|
||||
|
||||
def log_event(self, kwargs, response_obj, start_time, end_time, print_verbose):
|
||||
import requests
|
||||
|
@ -32,8 +32,6 @@ class AthinaLogger:
|
|||
|
||||
if "messages" in kwargs:
|
||||
data["prompt"] = kwargs.get("messages", None)
|
||||
if kwargs.get("messages") and len(kwargs.get("messages")) > 0:
|
||||
data["user_query"] = kwargs.get("messages")[0].get("content", None)
|
||||
|
||||
# Directly add tools or functions if present
|
||||
optional_params = kwargs.get("optional_params", {})
|
||||
|
|
|
@ -72,7 +72,12 @@ class CustomLogger: # https://docs.litellm.ai/docs/observability/custom_callbac
|
|||
):
|
||||
pass
|
||||
|
||||
async def async_moderation_hook(self, data: dict):
|
||||
async def async_moderation_hook(
|
||||
self,
|
||||
data: dict,
|
||||
user_api_key_dict: UserAPIKeyAuth,
|
||||
call_type: Literal["completion", "embeddings", "image_generation"],
|
||||
):
|
||||
pass
|
||||
|
||||
async def async_post_call_streaming_hook(
|
||||
|
|
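`async_moderation_hook` now receives the caller's `UserAPIKeyAuth` and a `call_type`, so existing custom callbacks need their signatures updated. A sketch of a subclass using the new signature (the banned-word check is illustrative only, and the import paths should be treated as assumptions about your litellm version):

```python
# Custom callback updated for the new async_moderation_hook signature.
from typing import Literal

from litellm.integrations.custom_logger import CustomLogger
from litellm.proxy._types import UserAPIKeyAuth


class MyModerationLogger(CustomLogger):
    async def async_moderation_hook(
        self,
        data: dict,
        user_api_key_dict: UserAPIKeyAuth,
        call_type: Literal["completion", "embeddings", "image_generation"],
    ):
        # Only inspect chat payloads; embeddings/image calls pass through.
        if call_type != "completion":
            return
        for message in data.get("messages", []):
            if "forbidden" in str(message.get("content", "")).lower():
                raise ValueError("Blocked by moderation hook")
```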
|
@ -246,13 +246,13 @@ class LangFuseLogger:
|
|||
metadata_tags = metadata.get("tags", [])
|
||||
tags = metadata_tags
|
||||
|
||||
generation_name = metadata.get("generation_name", None)
|
||||
if generation_name is None:
|
||||
# just log `litellm-{call_type}` as the generation name
|
||||
generation_name = f"litellm-{kwargs.get('call_type', 'completion')}"
|
||||
trace_name = metadata.get("trace_name", None)
|
||||
if trace_name is None:
|
||||
# just log `litellm-{call_type}` as the trace name
|
||||
trace_name = f"litellm-{kwargs.get('call_type', 'completion')}"
|
||||
|
||||
trace_params = {
|
||||
"name": generation_name,
|
||||
"name": trace_name,
|
||||
"input": input,
|
||||
"user_id": metadata.get("trace_user_id", user_id),
|
||||
"id": metadata.get("trace_id", None),
|
||||
|
@ -311,6 +311,11 @@ class LangFuseLogger:
|
|||
"completion_tokens": response_obj["usage"]["completion_tokens"],
|
||||
"total_cost": cost if supports_costs else None,
|
||||
}
|
||||
generation_name = metadata.get("generation_name", None)
|
||||
if generation_name is None:
|
||||
# just log `litellm-{call_type}` as the generation name
|
||||
generation_name = f"litellm-{kwargs.get('call_type', 'completion')}"
|
||||
|
||||
generation_params = {
|
||||
"name": generation_name,
|
||||
"id": metadata.get("generation_id", generation_id),
|
||||
|
|
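With this change the Langfuse trace name and generation name come from separate metadata keys (`trace_name` and `generation_name`), each falling back to `litellm-{call_type}`. A hedged example of passing both from a completion call (model and values are placeholders):

```python
# Metadata keys the Langfuse logger now reads for trace vs. generation naming.
import litellm

litellm.success_callback = ["langfuse"]

response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hello"}],
    metadata={
        "trace_name": "my-app-trace",            # used as the Langfuse trace name
        "generation_name": "my-app-generation",  # used as the generation name
        "tags": ["prod"],
    },
)
```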
|
@ -131,18 +131,24 @@ def completion(
|
|||
)
|
||||
else:
|
||||
# Separate system prompt from rest of message
|
||||
system_prompt_idx: Optional[int] = None
|
||||
system_prompt_indices = []
|
||||
system_prompt = ""
|
||||
for idx, message in enumerate(messages):
|
||||
if message["role"] == "system":
|
||||
optional_params["system"] = message["content"]
|
||||
system_prompt_idx = idx
|
||||
break
|
||||
if system_prompt_idx is not None:
|
||||
messages.pop(system_prompt_idx)
|
||||
system_prompt += message["content"]
|
||||
system_prompt_indices.append(idx)
|
||||
if len(system_prompt_indices) > 0:
|
||||
for idx in reversed(system_prompt_indices):
|
||||
messages.pop(idx)
|
||||
if len(system_prompt) > 0:
|
||||
optional_params["system"] = system_prompt
|
||||
# Format rest of message according to anthropic guidelines
|
||||
messages = prompt_factory(
|
||||
model=model, messages=messages, custom_llm_provider="anthropic"
|
||||
)
|
||||
try:
|
||||
messages = prompt_factory(
|
||||
model=model, messages=messages, custom_llm_provider="anthropic"
|
||||
)
|
||||
except Exception as e:
|
||||
raise AnthropicError(status_code=400, message=str(e))
|
||||
|
||||
## Load Config
|
||||
config = litellm.AnthropicConfig.get_config()
|
||||
|
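Previously only the first system message made it into `optional_params["system"]`; the loop above now concatenates every system message. A hedged example of the behavior (model id is illustrative, and an Anthropic API key is assumed to be configured):

```python
# Multiple system messages are now merged into the single Anthropic `system` field
# instead of all but the first being dropped.
import litellm

response = litellm.completion(
    model="claude-3-haiku-20240307",
    messages=[
        {"role": "system", "content": "You are concise."},
        {"role": "system", "content": "Always answer in French."},
        {"role": "user", "content": "What is the capital of France?"},
    ],
)
```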
@ -295,7 +301,7 @@ def completion(
|
|||
)
|
||||
streaming_choice.delta = delta_obj
|
||||
streaming_model_response.choices = [streaming_choice]
|
||||
completion_stream = model_response_iterator(
|
||||
completion_stream = ModelResponseIterator(
|
||||
model_response=streaming_model_response
|
||||
)
|
||||
print_verbose(
|
||||
|
@ -324,8 +330,30 @@ def completion(
|
|||
return model_response
|
||||
|
||||
|
||||
def model_response_iterator(model_response):
|
||||
yield model_response
|
||||
class ModelResponseIterator:
|
||||
def __init__(self, model_response):
|
||||
self.model_response = model_response
|
||||
self.is_done = False
|
||||
|
||||
# Sync iterator
|
||||
def __iter__(self):
|
||||
return self
|
||||
|
||||
def __next__(self):
|
||||
if self.is_done:
|
||||
raise StopIteration
|
||||
self.is_done = True
|
||||
return self.model_response
|
||||
|
||||
# Async iterator
|
||||
def __aiter__(self):
|
||||
return self
|
||||
|
||||
async def __anext__(self):
|
||||
if self.is_done:
|
||||
raise StopAsyncIteration
|
||||
self.is_done = True
|
||||
return self.model_response
|
||||
|
||||
|
||||
def embedding():
|
||||
|
|
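Replacing the one-shot generator with `ModelResponseIterator` lets the same cached-response stream be consumed from both sync and async code paths. The short demo below (standalone, not litellm code) shows why the plain generator was not enough:

```python
# A sync generator cannot be consumed with `async for`, which the cached-response
# streaming path needs; the wrapper class above supports both protocols.
import asyncio


def model_response_iterator(model_response):
    yield model_response


async def consume(stream):
    return [chunk async for chunk in stream]


try:
    asyncio.run(consume(model_response_iterator("chunk")))
except TypeError as e:
    print("sync generator:", e)  # 'async for' requires __aiter__
```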
|
@ -11,6 +11,7 @@ from .prompt_templates.factory import (
|
|||
construct_tool_use_system_prompt,
|
||||
extract_between_tags,
|
||||
parse_xml_params,
|
||||
contains_tag,
|
||||
)
|
||||
import httpx
|
||||
|
||||
|
@ -78,11 +79,13 @@ class AmazonTitanConfig:
|
|||
|
||||
class AmazonAnthropicClaude3Config:
|
||||
"""
|
||||
Reference: https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=claude
|
||||
Reference:
|
||||
https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=claude
|
||||
https://docs.anthropic.com/claude/docs/models-overview#model-comparison
|
||||
|
||||
Supported Params for the Amazon / Anthropic Claude 3 models:
|
||||
|
||||
- `max_tokens` Required (integer) max tokens,
|
||||
- `max_tokens` Required (integer) max tokens. Default is 4096
|
||||
- `anthropic_version` Required (string) version of anthropic for bedrock - e.g. "bedrock-2023-05-31"
|
||||
- `system` Optional (string) the system prompt, conversion from openai format to this is handled in factory.py
|
||||
- `temperature` Optional (float) The amount of randomness injected into the response
|
||||
|
@ -91,7 +94,7 @@ class AmazonAnthropicClaude3Config:
|
|||
- `stop_sequences` Optional (List[str]) Custom text sequences that cause the model to stop generating
|
||||
"""
|
||||
|
||||
max_tokens: Optional[int] = litellm.max_tokens
|
||||
max_tokens: Optional[int] = 4096 # Opus, Sonnet, and Haiku default
|
||||
anthropic_version: Optional[str] = "bedrock-2023-05-31"
|
||||
system: Optional[str] = None
|
||||
temperature: Optional[float] = None
|
||||
|
@ -128,7 +131,15 @@ class AmazonAnthropicClaude3Config:
|
|||
}
|
||||
|
||||
def get_supported_openai_params(self):
|
||||
return ["max_tokens", "tools", "tool_choice", "stream"]
|
||||
return [
|
||||
"max_tokens",
|
||||
"tools",
|
||||
"tool_choice",
|
||||
"stream",
|
||||
"stop",
|
||||
"temperature",
|
||||
"top_p",
|
||||
]
|
||||
|
||||
def map_openai_params(self, non_default_params: dict, optional_params: dict):
|
||||
for param, value in non_default_params.items():
|
||||
|
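`get_supported_openai_params` for Bedrock Claude 3 now maps `stop`, `temperature`, and `top_p` in addition to `max_tokens`, `tools`, `tool_choice`, and `stream`. A hedged call using the expanded set (model id and values are illustrative; AWS credentials are assumed to be configured):

```python
# OpenAI-style params that now map through for Bedrock Claude 3.
import litellm

response = litellm.completion(
    model="bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
    messages=[{"role": "user", "content": "Write a haiku about Redis."}],
    max_tokens=256,
    temperature=0.2,
    top_p=0.9,
    stop=["\n\n"],
)
print(response.choices[0].message.content)
```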
@ -679,6 +690,7 @@ def completion(
|
|||
timeout=None,
|
||||
):
|
||||
exception_mapping_worked = False
|
||||
_is_function_call = False
|
||||
try:
|
||||
# pop aws_secret_access_key, aws_access_key_id, aws_region_name from kwargs, since completion calls fail with them
|
||||
aws_secret_access_key = optional_params.pop("aws_secret_access_key", None)
|
||||
|
@ -727,8 +739,10 @@ def completion(
|
|||
system_messages.append(message["content"])
|
||||
system_prompt_idx.append(idx)
|
||||
if len(system_prompt_idx) > 0:
|
||||
inference_params["system"] = '\n'.join(system_messages)
|
||||
messages = [i for j, i in enumerate(messages) if j not in system_prompt_idx]
|
||||
inference_params["system"] = "\n".join(system_messages)
|
||||
messages = [
|
||||
i for j, i in enumerate(messages) if j not in system_prompt_idx
|
||||
]
|
||||
# Format rest of message according to anthropic guidelines
|
||||
messages = prompt_factory(
|
||||
model=model, messages=messages, custom_llm_provider="anthropic"
|
||||
|
@ -742,6 +756,7 @@ def completion(
|
|||
inference_params[k] = v
|
||||
## Handle Tool Calling
|
||||
if "tools" in inference_params:
|
||||
_is_function_call = True
|
||||
tool_calling_system_prompt = construct_tool_use_system_prompt(
|
||||
tools=inference_params["tools"]
|
||||
)
|
||||
|
@ -823,7 +838,7 @@ def completion(
|
|||
## COMPLETION CALL
|
||||
accept = "application/json"
|
||||
contentType = "application/json"
|
||||
if stream == True:
|
||||
if stream == True and _is_function_call == False:
|
||||
if provider == "ai21":
|
||||
## LOGGING
|
||||
request_str = f"""
|
||||
|
@ -918,7 +933,9 @@ def completion(
|
|||
elif provider == "anthropic":
|
||||
if model.startswith("anthropic.claude-3"):
|
||||
outputText = response_body.get("content")[0].get("text", None)
|
||||
if "<invoke>" in outputText: # OUTPUT PARSE FUNCTION CALL
|
||||
if outputText is not None and contains_tag(
|
||||
"invoke", outputText
|
||||
): # OUTPUT PARSE FUNCTION CALL
|
||||
function_name = extract_between_tags("tool_name", outputText)[0]
|
||||
function_arguments_str = extract_between_tags("invoke", outputText)[
|
||||
0
|
||||
|
@ -941,6 +958,56 @@ def completion(
|
|||
content=None,
|
||||
)
|
||||
model_response.choices[0].message = _message # type: ignore
|
||||
if _is_function_call == True and stream is not None and stream == True:
|
||||
print_verbose(
|
||||
f"INSIDE BEDROCK STREAMING TOOL CALLING CONDITION BLOCK"
|
||||
)
|
||||
# return an iterator
|
||||
streaming_model_response = ModelResponse(stream=True)
|
||||
streaming_model_response.choices[0].finish_reason = (
|
||||
model_response.choices[0].finish_reason
|
||||
)
|
||||
# streaming_model_response.choices = [litellm.utils.StreamingChoices()]
|
||||
streaming_choice = litellm.utils.StreamingChoices()
|
||||
streaming_choice.index = model_response.choices[0].index
|
||||
_tool_calls = []
|
||||
print_verbose(
|
||||
f"type of model_response.choices[0]: {type(model_response.choices[0])}"
|
||||
)
|
||||
print_verbose(f"type of streaming_choice: {type(streaming_choice)}")
|
||||
if isinstance(model_response.choices[0], litellm.Choices):
|
||||
if getattr(
|
||||
model_response.choices[0].message, "tool_calls", None
|
||||
) is not None and isinstance(
|
||||
model_response.choices[0].message.tool_calls, list
|
||||
):
|
||||
for tool_call in model_response.choices[
|
||||
0
|
||||
].message.tool_calls:
|
||||
_tool_call = {**tool_call.dict(), "index": 0}
|
||||
_tool_calls.append(_tool_call)
|
||||
delta_obj = litellm.utils.Delta(
|
||||
content=getattr(
|
||||
model_response.choices[0].message, "content", None
|
||||
),
|
||||
role=model_response.choices[0].message.role,
|
||||
tool_calls=_tool_calls,
|
||||
)
|
||||
streaming_choice.delta = delta_obj
|
||||
streaming_model_response.choices = [streaming_choice]
|
||||
completion_stream = model_response_iterator(
|
||||
model_response=streaming_model_response
|
||||
)
|
||||
print_verbose(
|
||||
f"Returns anthropic CustomStreamWrapper with 'cached_response' streaming object"
|
||||
)
|
||||
return litellm.CustomStreamWrapper(
|
||||
completion_stream=completion_stream,
|
||||
model=model,
|
||||
custom_llm_provider="cached_response",
|
||||
logging_obj=logging_obj,
|
||||
)
|
||||
|
||||
model_response["finish_reason"] = response_body["stop_reason"]
|
||||
_usage = litellm.Usage(
|
||||
prompt_tokens=response_body["usage"]["input_tokens"],
|
||||
|
@ -1029,6 +1096,10 @@ def completion(
|
|||
raise BedrockError(status_code=500, message=traceback.format_exc())
|
||||
|
||||
|
||||
async def model_response_iterator(model_response):
|
||||
yield model_response
|
||||
|
||||
|
||||
def _embedding_func_single(
|
||||
model: str,
|
||||
input: str,
|
||||
|
|
38
litellm/llms/custom_httpx/httpx_handler.py
Normal file
|
@ -0,0 +1,38 @@
|
|||
from typing import Optional
|
||||
import httpx
|
||||
|
||||
|
||||
class HTTPHandler:
|
||||
def __init__(self, concurrent_limit=1000):
|
||||
# Create a client with a connection pool
|
||||
self.client = httpx.AsyncClient(
|
||||
limits=httpx.Limits(
|
||||
max_connections=concurrent_limit,
|
||||
max_keepalive_connections=concurrent_limit,
|
||||
)
|
||||
)
|
||||
|
||||
async def close(self):
|
||||
# Close the client when you're done with it
|
||||
await self.client.aclose()
|
||||
|
||||
async def get(
|
||||
self, url: str, params: Optional[dict] = None, headers: Optional[dict] = None
|
||||
):
|
||||
response = await self.client.get(url, params=params, headers=headers)
|
||||
return response
|
||||
|
||||
async def post(
|
||||
self,
|
||||
url: str,
|
||||
data: Optional[dict] = None,
|
||||
params: Optional[dict] = None,
|
||||
headers: Optional[dict] = None,
|
||||
):
|
||||
try:
|
||||
response = await self.client.post(
|
||||
url, data=data, params=params, headers=headers
|
||||
)
|
||||
return response
|
||||
except Exception as e:
|
||||
raise e
|
|
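A usage sketch for the new handler, assuming the module is importable as `litellm.llms.custom_httpx.httpx_handler` and the URL is a placeholder. Despite the sync-sounding name, the class wraps an `httpx.AsyncClient`, so `get`/`post` must be awaited and the handler closed when done:

```python
# Hypothetical usage of the new HTTPHandler wrapper.
import asyncio

from litellm.llms.custom_httpx.httpx_handler import HTTPHandler


async def main():
    handler = HTTPHandler(concurrent_limit=50)
    try:
        resp = await handler.get("https://example.com", params={"q": "ping"})
        print(resp.status_code)
    finally:
        await handler.close()


asyncio.run(main())
```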
@ -118,7 +118,7 @@ def completion(
|
|||
logger_fn=None,
|
||||
):
|
||||
try:
|
||||
import google.generativeai as genai
|
||||
import google.generativeai as genai # type: ignore
|
||||
except:
|
||||
raise Exception(
|
||||
"Importing google.generativeai failed, please run 'pip install -q google-generativeai"
|
||||
|
@ -308,7 +308,7 @@ async def async_completion(
|
|||
messages,
|
||||
encoding,
|
||||
):
|
||||
import google.generativeai as genai
|
||||
import google.generativeai as genai # type: ignore
|
||||
|
||||
response = await _model.generate_content_async(
|
||||
contents=prompt,
|
||||
|
|
|
@ -68,9 +68,9 @@ class OllamaConfig:
|
|||
repeat_last_n: Optional[int] = None
|
||||
repeat_penalty: Optional[float] = None
|
||||
temperature: Optional[float] = None
|
||||
stop: Optional[
|
||||
list
|
||||
] = None # stop is a list based on this - https://github.com/jmorganca/ollama/pull/442
|
||||
stop: Optional[list] = (
|
||||
None # stop is a list based on this - https://github.com/jmorganca/ollama/pull/442
|
||||
)
|
||||
tfs_z: Optional[float] = None
|
||||
num_predict: Optional[int] = None
|
||||
top_k: Optional[int] = None
|
||||
|
@ -344,9 +344,9 @@ async def ollama_acompletion(url, data, model_response, encoding, logging_obj):
|
|||
|
||||
|
||||
async def ollama_aembeddings(
|
||||
api_base="http://localhost:11434",
|
||||
model="llama2",
|
||||
prompt="Why is the sky blue?",
|
||||
api_base: str,
|
||||
model: str,
|
||||
prompts: list,
|
||||
optional_params=None,
|
||||
logging_obj=None,
|
||||
model_response=None,
|
||||
|
@ -365,51 +365,56 @@ async def ollama_aembeddings(
|
|||
): # completion(top_k=3) > cohere_config(top_k=3) <- allows for dynamic variables to be passed in
|
||||
optional_params[k] = v
|
||||
|
||||
data = {
|
||||
"model": model,
|
||||
"prompt": prompt,
|
||||
}
|
||||
## LOGGING
|
||||
logging_obj.pre_call(
|
||||
input=None,
|
||||
api_key=None,
|
||||
additional_args={"api_base": url, "complete_input_dict": data, "headers": {}},
|
||||
)
|
||||
total_input_tokens = 0
|
||||
output_data = []
|
||||
timeout = aiohttp.ClientTimeout(total=litellm.request_timeout) # 10 minutes
|
||||
async with aiohttp.ClientSession(timeout=timeout) as session:
|
||||
response = await session.post(url, json=data)
|
||||
|
||||
if response.status != 200:
|
||||
text = await response.text()
|
||||
raise OllamaError(status_code=response.status, message=text)
|
||||
|
||||
## LOGGING
|
||||
logging_obj.post_call(
|
||||
input=prompt,
|
||||
api_key="",
|
||||
original_response=response.text,
|
||||
additional_args={
|
||||
"headers": None,
|
||||
"api_base": api_base,
|
||||
},
|
||||
)
|
||||
|
||||
response_json = await response.json()
|
||||
embeddings = response_json["embedding"]
|
||||
## RESPONSE OBJECT
|
||||
output_data = []
|
||||
for idx, embedding in enumerate(embeddings):
|
||||
output_data.append(
|
||||
{"object": "embedding", "index": idx, "embedding": embedding}
|
||||
for idx, prompt in enumerate(prompts):
|
||||
data = {
|
||||
"model": model,
|
||||
"prompt": prompt,
|
||||
}
|
||||
## LOGGING
|
||||
logging_obj.pre_call(
|
||||
input=None,
|
||||
api_key=None,
|
||||
additional_args={
|
||||
"api_base": url,
|
||||
"complete_input_dict": data,
|
||||
"headers": {},
|
||||
},
|
||||
)
|
||||
model_response["object"] = "list"
|
||||
model_response["data"] = output_data
|
||||
model_response["model"] = model
|
||||
|
||||
input_tokens = len(encoding.encode(prompt))
|
||||
response = await session.post(url, json=data)
|
||||
if response.status != 200:
|
||||
text = await response.text()
|
||||
raise OllamaError(status_code=response.status, message=text)
|
||||
|
||||
model_response["usage"] = {
|
||||
"prompt_tokens": input_tokens,
|
||||
"total_tokens": input_tokens,
|
||||
}
|
||||
return model_response
|
||||
## LOGGING
|
||||
logging_obj.post_call(
|
||||
input=prompt,
|
||||
api_key="",
|
||||
original_response=response.text,
|
||||
additional_args={
|
||||
"headers": None,
|
||||
"api_base": api_base,
|
||||
},
|
||||
)
|
||||
|
||||
response_json = await response.json()
|
||||
embeddings: list[float] = response_json["embedding"]
|
||||
output_data.append(
|
||||
{"object": "embedding", "index": idx, "embedding": embeddings}
|
||||
)
|
||||
|
||||
input_tokens = len(encoding.encode(prompt))
|
||||
total_input_tokens += input_tokens
|
||||
|
||||
model_response["object"] = "list"
|
||||
model_response["data"] = output_data
|
||||
model_response["model"] = model
|
||||
model_response["usage"] = {
|
||||
"prompt_tokens": total_input_tokens,
|
||||
"total_tokens": total_input_tokens,
|
||||
}
|
||||
return model_response
|
||||
|
|
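The rewritten `ollama_aembeddings` loops over a list of prompts, calling the local Ollama server once per prompt and summing the token counts into a single usage block. A hedged example of the batch path this enables (the model name and local server URL are assumptions):

```python
# Batch embeddings against a local Ollama server, now one request per prompt.
import asyncio
import litellm


async def main():
    response = await litellm.aembedding(
        model="ollama/nomic-embed-text",
        input=["why is the sky blue?", "why is the grass green?"],
        api_base="http://localhost:11434",
    )
    print(len(response.data), "embeddings returned")


asyncio.run(main())
```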
|
@ -173,10 +173,11 @@ class OllamaChatConfig:
|
|||
litellm.add_function_to_prompt = (
|
||||
True # so that main.py adds the function call to the prompt
|
||||
)
|
||||
optional_params["functions_unsupported_model"] = non_default_params.pop(
|
||||
optional_params["functions_unsupported_model"] = non_default_params.get(
|
||||
"functions"
|
||||
)
|
||||
non_default_params.pop("tool_choice", None) # causes ollama requests to hang
|
||||
non_default_params.pop("functions", None) # causes ollama requests to hang
|
||||
return optional_params
|
||||
|
||||
|
||||
|
|
|
@ -98,7 +98,7 @@ def completion(
|
|||
logger_fn=None,
|
||||
):
|
||||
try:
|
||||
import google.generativeai as palm
|
||||
import google.generativeai as palm # type: ignore
|
||||
except:
|
||||
raise Exception(
|
||||
"Importing google.generativeai failed, please run 'pip install -q google-generativeai"
|
||||
|
|
|
@ -5,12 +5,17 @@ from jinja2 import Template, exceptions, Environment, meta
|
|||
from typing import Optional, Any
|
||||
import imghdr, base64
|
||||
from typing import List
|
||||
import litellm
|
||||
|
||||
|
||||
def default_pt(messages):
|
||||
return " ".join(message["content"] for message in messages)
|
||||
|
||||
|
||||
def prompt_injection_detection_default_pt():
|
||||
return """Detect if a prompt is safe to run. Return 'UNSAFE' if not."""
|
||||
|
||||
|
||||
# alpaca prompt template - for models like mythomax, etc.
|
||||
def alpaca_pt(messages):
|
||||
prompt = custom_prompt(
|
||||
|
@ -638,11 +643,12 @@ def anthropic_messages_pt(messages: list):
|
|||
"""
|
||||
# add role=tool support to allow function call result/error submission
|
||||
user_message_types = {"user", "tool"}
|
||||
# reformat messages to ensure user/assistant are alternating, if there's either 2 consecutive 'user' messages or 2 consecutive 'assistant' message, add a blank 'user' or 'assistant' message to ensure compatibility
|
||||
# reformat messages to ensure user/assistant are alternating; if there are 2 consecutive 'user' or 'assistant' messages, merge them.
|
||||
new_messages = []
|
||||
msg_i = 0
|
||||
while msg_i < len(messages):
|
||||
user_content = []
|
||||
## MERGE CONSECUTIVE USER CONTENT ##
|
||||
while msg_i < len(messages) and messages[msg_i]["role"] in user_message_types:
|
||||
if isinstance(messages[msg_i]["content"], list):
|
||||
for m in messages[msg_i]["content"]:
|
||||
|
@ -676,6 +682,7 @@ def anthropic_messages_pt(messages: list):
|
|||
new_messages.append({"role": "user", "content": user_content})
|
||||
|
||||
assistant_content = []
|
||||
## MERGE CONSECUTIVE ASSISTANT CONTENT ##
|
||||
while msg_i < len(messages) and messages[msg_i]["role"] == "assistant":
|
||||
assistant_text = (
|
||||
messages[msg_i].get("content") or ""
|
||||
|
@ -694,9 +701,14 @@ def anthropic_messages_pt(messages: list):
|
|||
new_messages.append({"role": "assistant", "content": assistant_content})
|
||||
|
||||
if new_messages[0]["role"] != "user":
|
||||
new_messages.insert(
|
||||
0, {"role": "user", "content": [{"type": "text", "text": "."}]}
|
||||
)
|
||||
if litellm.modify_params:
|
||||
new_messages.insert(
|
||||
0, {"role": "user", "content": [{"type": "text", "text": "."}]}
|
||||
)
|
||||
else:
|
||||
raise Exception(
|
||||
"Invalid first message. Should always start with 'role'='user' for Anthropic. System prompt is sent separately for Anthropic. set 'litellm.modify_params = True' or 'litellm_settings:modify_params = True' on proxy, to insert a placeholder user message - '.' as the first message, "
|
||||
)
|
||||
|
||||
if new_messages[-1]["role"] == "assistant":
|
||||
for content in new_messages[-1]["content"]:
|
||||
|
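With this change, a conversation whose first non-system message is not from the user either gets a placeholder `"."` user turn (when `litellm.modify_params` is enabled) or raises immediately with the error above. A hedged example of opting in (model id is illustrative):

```python
# Opt into automatic insertion of a placeholder user message for Anthropic.
import litellm

litellm.modify_params = True  # or `litellm_settings: modify_params: true` on the proxy

response = litellm.completion(
    model="claude-3-sonnet-20240229",
    messages=[
        {"role": "system", "content": "You are terse."},
        {"role": "assistant", "content": "Previously I said hello."},
        {"role": "user", "content": "Continue."},
    ],
)
```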
@ -714,17 +726,23 @@ def extract_between_tags(tag: str, string: str, strip: bool = False) -> List[str
|
|||
ext_list = [e.strip() for e in ext_list]
|
||||
return ext_list
|
||||
|
||||
|
||||
def contains_tag(tag: str, string: str) -> bool:
|
||||
return bool(re.search(f"<{tag}>(.+?)</{tag}>", string, re.DOTALL))
|
||||
|
||||
|
||||
def parse_xml_params(xml_content):
|
||||
root = ET.fromstring(xml_content)
|
||||
params = {}
|
||||
for child in root.findall(".//parameters/*"):
|
||||
params[child.tag] = child.text
|
||||
try:
|
||||
# Attempt to decode the element's text as JSON
|
||||
params[child.tag] = json.loads(child.text)
|
||||
except json.JSONDecodeError:
|
||||
# If JSON decoding fails, use the original text
|
||||
params[child.tag] = child.text
|
||||
return params
|
||||
|
||||
|
||||
###
|
||||
|
||||
|
||||
|
@ -917,7 +935,7 @@ def gemini_text_image_pt(messages: list):
|
|||
}
|
||||
"""
|
||||
try:
|
||||
import google.generativeai as genai
|
||||
import google.generativeai as genai # type: ignore
|
||||
except:
|
||||
raise Exception(
|
||||
"Importing google.generativeai failed, please run 'pip install -q google-generativeai"
|
||||
|
@ -958,9 +976,7 @@ def azure_text_pt(messages: list):
|
|||
|
||||
# Function call template
|
||||
def function_call_prompt(messages: list, functions: list):
|
||||
function_prompt = (
|
||||
"""Produce JSON OUTPUT ONLY! Adhere to this format {"name": "function_name", "arguments":{"argument_name": "argument_value"}} The following functions are available to you:"""
|
||||
)
|
||||
function_prompt = """Produce JSON OUTPUT ONLY! Adhere to this format {"name": "function_name", "arguments":{"argument_name": "argument_value"}} The following functions are available to you:"""
|
||||
for function in functions:
|
||||
function_prompt += f"""\n{function}\n"""
|
||||
|
||||
|
|
|
@ -166,6 +166,7 @@ def completion(
|
|||
aws_secret_access_key = optional_params.pop("aws_secret_access_key", None)
|
||||
aws_access_key_id = optional_params.pop("aws_access_key_id", None)
|
||||
aws_region_name = optional_params.pop("aws_region_name", None)
|
||||
model_id = optional_params.pop("model_id", None)
|
||||
|
||||
if aws_access_key_id != None:
|
||||
# uses auth params passed to completion
|
||||
|
@ -245,15 +246,28 @@ def completion(
|
|||
model=model,
|
||||
logging_obj=logging_obj,
|
||||
data=data,
|
||||
model_id=model_id,
|
||||
aws_secret_access_key=aws_secret_access_key,
|
||||
aws_access_key_id=aws_access_key_id,
|
||||
aws_region_name=aws_region_name,
|
||||
)
|
||||
return response
|
||||
response = client.invoke_endpoint_with_response_stream(
|
||||
EndpointName=model,
|
||||
ContentType="application/json",
|
||||
Body=data,
|
||||
CustomAttributes="accept_eula=true",
|
||||
)
|
||||
|
||||
if model_id is not None:
|
||||
response = client.invoke_endpoint_with_response_stream(
|
||||
EndpointName=model,
|
||||
InferenceComponentName=model_id,
|
||||
ContentType="application/json",
|
||||
Body=data,
|
||||
CustomAttributes="accept_eula=true",
|
||||
)
|
||||
else:
|
||||
response = client.invoke_endpoint_with_response_stream(
|
||||
EndpointName=model,
|
||||
ContentType="application/json",
|
||||
Body=data,
|
||||
CustomAttributes="accept_eula=true",
|
||||
)
|
||||
return response["Body"]
|
||||
elif acompletion == True:
|
||||
_data = {"inputs": prompt, "parameters": inference_params}
|
||||
|
@ -264,36 +278,68 @@ def completion(
|
|||
model=model,
|
||||
logging_obj=logging_obj,
|
||||
data=_data,
|
||||
model_id=model_id,
|
||||
aws_secret_access_key=aws_secret_access_key,
|
||||
aws_access_key_id=aws_access_key_id,
|
||||
aws_region_name=aws_region_name,
|
||||
)
|
||||
data = json.dumps({"inputs": prompt, "parameters": inference_params}).encode(
|
||||
"utf-8"
|
||||
)
|
||||
## LOGGING
|
||||
request_str = f"""
|
||||
response = client.invoke_endpoint(
|
||||
EndpointName={model},
|
||||
ContentType="application/json",
|
||||
Body={data},
|
||||
CustomAttributes="accept_eula=true",
|
||||
)
|
||||
""" # type: ignore
|
||||
logging_obj.pre_call(
|
||||
input=prompt,
|
||||
api_key="",
|
||||
additional_args={
|
||||
"complete_input_dict": data,
|
||||
"request_str": request_str,
|
||||
"hf_model_name": hf_model_name,
|
||||
},
|
||||
)
|
||||
## COMPLETION CALL
|
||||
try:
|
||||
response = client.invoke_endpoint(
|
||||
EndpointName=model,
|
||||
ContentType="application/json",
|
||||
Body=data,
|
||||
CustomAttributes="accept_eula=true",
|
||||
)
|
||||
if model_id is not None:
|
||||
## LOGGING
|
||||
request_str = f"""
|
||||
response = client.invoke_endpoint(
|
||||
EndpointName={model},
|
||||
InferenceComponentName={model_id},
|
||||
ContentType="application/json",
|
||||
Body={data},
|
||||
CustomAttributes="accept_eula=true",
|
||||
)
|
||||
""" # type: ignore
|
||||
logging_obj.pre_call(
|
||||
input=prompt,
|
||||
api_key="",
|
||||
additional_args={
|
||||
"complete_input_dict": data,
|
||||
"request_str": request_str,
|
||||
"hf_model_name": hf_model_name,
|
||||
},
|
||||
)
|
||||
response = client.invoke_endpoint(
|
||||
EndpointName=model,
|
||||
InferenceComponentName=model_id,
|
||||
ContentType="application/json",
|
||||
Body=data,
|
||||
CustomAttributes="accept_eula=true",
|
||||
)
|
||||
else:
|
||||
## LOGGING
|
||||
request_str = f"""
|
||||
response = client.invoke_endpoint(
|
||||
EndpointName={model},
|
||||
ContentType="application/json",
|
||||
Body={data},
|
||||
CustomAttributes="accept_eula=true",
|
||||
)
|
||||
""" # type: ignore
|
||||
logging_obj.pre_call(
|
||||
input=prompt,
|
||||
api_key="",
|
||||
additional_args={
|
||||
"complete_input_dict": data,
|
||||
"request_str": request_str,
|
||||
"hf_model_name": hf_model_name,
|
||||
},
|
||||
)
|
||||
response = client.invoke_endpoint(
|
||||
EndpointName=model,
|
||||
ContentType="application/json",
|
||||
Body=data,
|
||||
CustomAttributes="accept_eula=true",
|
||||
)
|
||||
except Exception as e:
|
||||
status_code = (
|
||||
getattr(e, "response", {})
|
||||
|
@ -303,6 +349,8 @@ def completion(
|
|||
error_message = (
|
||||
getattr(e, "response", {}).get("Error", {}).get("Message", str(e))
|
||||
)
|
||||
if "Inference Component Name header is required" in error_message:
|
||||
error_message += "\n pass in via `litellm.completion(..., model_id={InferenceComponentName})`"
|
||||
raise SagemakerError(status_code=status_code, message=error_message)
|
||||
|
||||
response = response["Body"].read().decode("utf8")
|
||||
|
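The new `model_id` parameter is forwarded to SageMaker as `InferenceComponentName` on both the sync and async invoke calls, which is required for endpoints that host multiple inference components. A hedged example (endpoint and component names are placeholders; AWS credentials are assumed to be configured):

```python
# Pass the inference component name through to a multi-component SageMaker endpoint.
import litellm

response = litellm.completion(
    model="sagemaker/my-llama2-endpoint",       # SageMaker endpoint name
    model_id="my-llama2-inference-component",   # forwarded as InferenceComponentName
    messages=[{"role": "user", "content": "Hello"}],
)
```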
@ -357,8 +405,12 @@ async def async_streaming(
|
|||
encoding,
|
||||
model_response: ModelResponse,
|
||||
model: str,
|
||||
model_id: Optional[str],
|
||||
logging_obj: Any,
|
||||
data,
|
||||
aws_secret_access_key: Optional[str],
|
||||
aws_access_key_id: Optional[str],
|
||||
aws_region_name: Optional[str],
|
||||
):
|
||||
"""
|
||||
Use aioboto3
|
||||
|
@ -367,11 +419,6 @@ async def async_streaming(
|
|||
|
||||
session = aioboto3.Session()
|
||||
|
||||
# pop aws_secret_access_key, aws_access_key_id, aws_region_name from kwargs, since completion calls fail with them
|
||||
aws_secret_access_key = optional_params.pop("aws_secret_access_key", None)
|
||||
aws_access_key_id = optional_params.pop("aws_access_key_id", None)
|
||||
aws_region_name = optional_params.pop("aws_region_name", None)
|
||||
|
||||
if aws_access_key_id != None:
|
||||
# uses auth params passed to completion
|
||||
# aws_access_key_id is not None, assume user is trying to auth using litellm.completion
|
||||
|
@ -398,12 +445,21 @@ async def async_streaming(
|
|||
|
||||
async with _client as client:
|
||||
try:
|
||||
response = await client.invoke_endpoint_with_response_stream(
|
||||
EndpointName=model,
|
||||
ContentType="application/json",
|
||||
Body=data,
|
||||
CustomAttributes="accept_eula=true",
|
||||
)
|
||||
if model_id is not None:
|
||||
response = await client.invoke_endpoint_with_response_stream(
|
||||
EndpointName=model,
|
||||
InferenceComponentName=model_id,
|
||||
ContentType="application/json",
|
||||
Body=data,
|
||||
CustomAttributes="accept_eula=true",
|
||||
)
|
||||
else:
|
||||
response = await client.invoke_endpoint_with_response_stream(
|
||||
EndpointName=model,
|
||||
ContentType="application/json",
|
||||
Body=data,
|
||||
CustomAttributes="accept_eula=true",
|
||||
)
|
||||
except Exception as e:
|
||||
raise SagemakerError(status_code=500, message=f"{str(e)}")
|
||||
response = response["Body"]
|
||||
|
@ -418,6 +474,10 @@ async def async_completion(
|
|||
model: str,
|
||||
logging_obj: Any,
|
||||
data: dict,
|
||||
model_id: Optional[str],
|
||||
aws_secret_access_key: Optional[str],
|
||||
aws_access_key_id: Optional[str],
|
||||
aws_region_name: Optional[str],
|
||||
):
|
||||
"""
|
||||
Use aioboto3
|
||||
|
@ -426,11 +486,6 @@ async def async_completion(
|
|||
|
||||
session = aioboto3.Session()
|
||||
|
||||
# pop aws_secret_access_key, aws_access_key_id, aws_region_name from kwargs, since completion calls fail with them
|
||||
aws_secret_access_key = optional_params.pop("aws_secret_access_key", None)
|
||||
aws_access_key_id = optional_params.pop("aws_access_key_id", None)
|
||||
aws_region_name = optional_params.pop("aws_region_name", None)
|
||||
|
||||
if aws_access_key_id != None:
|
||||
# uses auth params passed to completion
|
||||
# aws_access_key_id is not None, assume user is trying to auth using litellm.completion
|
||||
|
@ -456,33 +511,63 @@ async def async_completion(
|
|||
)
|
||||
|
||||
async with _client as client:
|
||||
## LOGGING
|
||||
request_str = f"""
|
||||
response = client.invoke_endpoint(
|
||||
EndpointName={model},
|
||||
ContentType="application/json",
|
||||
Body={data},
|
||||
CustomAttributes="accept_eula=true",
|
||||
)
|
||||
""" # type: ignore
|
||||
logging_obj.pre_call(
|
||||
input=data["inputs"],
|
||||
api_key="",
|
||||
additional_args={
|
||||
"complete_input_dict": data,
|
||||
"request_str": request_str,
|
||||
},
|
||||
)
|
||||
encoded_data = json.dumps(data).encode("utf-8")
|
||||
try:
|
||||
response = await client.invoke_endpoint(
|
||||
EndpointName=model,
|
||||
ContentType="application/json",
|
||||
Body=encoded_data,
|
||||
CustomAttributes="accept_eula=true",
|
||||
)
|
||||
if model_id is not None:
|
||||
## LOGGING
|
||||
request_str = f"""
|
||||
response = client.invoke_endpoint(
|
||||
EndpointName={model},
|
||||
InferenceComponentName={model_id},
|
||||
ContentType="application/json",
|
||||
Body={data},
|
||||
CustomAttributes="accept_eula=true",
|
||||
)
|
||||
""" # type: ignore
|
||||
logging_obj.pre_call(
|
||||
input=data["inputs"],
|
||||
api_key="",
|
||||
additional_args={
|
||||
"complete_input_dict": data,
|
||||
"request_str": request_str,
|
||||
},
|
||||
)
|
||||
response = await client.invoke_endpoint(
|
||||
EndpointName=model,
|
||||
InferenceComponentName=model_id,
|
||||
ContentType="application/json",
|
||||
Body=encoded_data,
|
||||
CustomAttributes="accept_eula=true",
|
||||
)
|
||||
else:
|
||||
## LOGGING
|
||||
request_str = f"""
|
||||
response = client.invoke_endpoint(
|
||||
EndpointName={model},
|
||||
ContentType="application/json",
|
||||
Body={data},
|
||||
CustomAttributes="accept_eula=true",
|
||||
)
|
||||
""" # type: ignore
|
||||
logging_obj.pre_call(
|
||||
input=data["inputs"],
|
||||
api_key="",
|
||||
additional_args={
|
||||
"complete_input_dict": data,
|
||||
"request_str": request_str,
|
||||
},
|
||||
)
|
||||
response = await client.invoke_endpoint(
|
||||
EndpointName=model,
|
||||
ContentType="application/json",
|
||||
Body=encoded_data,
|
||||
CustomAttributes="accept_eula=true",
|
||||
)
|
||||
except Exception as e:
|
||||
raise SagemakerError(status_code=500, message=f"{str(e)}")
|
||||
error_message = f"{str(e)}"
|
||||
if "Inference Component Name header is required" in error_message:
|
||||
error_message += "\n pass in via `litellm.completion(..., model_id={InferenceComponentName})`"
|
||||
raise SagemakerError(status_code=500, message=error_message)
|
||||
response = await response["Body"].read()
|
||||
response = response.decode("utf8")
|
||||
## LOGGING
|
||||
|
|
|
@ -289,11 +289,11 @@ def completion(
|
|||
Part,
|
||||
GenerationConfig,
|
||||
)
|
||||
from google.cloud import aiplatform
|
||||
from google.cloud import aiplatform # type: ignore
|
||||
from google.protobuf import json_format # type: ignore
|
||||
from google.protobuf.struct_pb2 import Value # type: ignore
|
||||
from google.cloud.aiplatform_v1beta1.types import content as gapic_content_types
|
||||
import google.auth
|
||||
from google.cloud.aiplatform_v1beta1.types import content as gapic_content_types # type: ignore
|
||||
import google.auth # type: ignore
|
||||
|
||||
## Load credentials with the correct quota project ref: https://github.com/googleapis/python-aiplatform/issues/2557#issuecomment-1709284744
|
||||
print_verbose(
|
||||
|
@ -783,7 +783,7 @@ async def async_completion(
|
|||
"""
|
||||
Vertex AI Model Garden
|
||||
"""
|
||||
from google.cloud import aiplatform
|
||||
from google.cloud import aiplatform # type: ignore
|
||||
|
||||
## LOGGING
|
||||
logging_obj.pre_call(
|
||||
|
@ -969,7 +969,7 @@ async def async_streaming(
|
|||
)
|
||||
response = llm_model.predict_streaming_async(prompt, **optional_params)
|
||||
elif mode == "custom":
|
||||
from google.cloud import aiplatform
|
||||
from google.cloud import aiplatform # type: ignore
|
||||
|
||||
stream = optional_params.pop("stream", None)
|
||||
|
||||
|
@ -1059,7 +1059,7 @@ def embedding(
|
|||
)
|
||||
|
||||
from vertexai.language_models import TextEmbeddingModel
|
||||
import google.auth
|
||||
import google.auth # type: ignore
|
||||
|
||||
## Load credentials with the correct quota project ref: https://github.com/googleapis/python-aiplatform/issues/2557#issuecomment-1709284744
|
||||
try:
|
||||
|
|
|
@ -115,24 +115,54 @@ class LiteLLM:
|
|||
default_headers: Optional[Mapping[str, str]] = None,
|
||||
):
|
||||
self.params = locals()
|
||||
self.chat = Chat(self.params)
|
||||
self.chat = Chat(self.params, router_obj=None)
|
||||
|
||||
|
||||
class Chat:
|
||||
def __init__(self, params):
|
||||
def __init__(self, params, router_obj: Optional[Any]):
|
||||
self.params = params
|
||||
self.completions = Completions(self.params)
|
||||
if self.params.get("acompletion", False) == True:
|
||||
self.params.pop("acompletion")
|
||||
self.completions: Union[AsyncCompletions, Completions] = AsyncCompletions(
|
||||
self.params, router_obj=router_obj
|
||||
)
|
||||
else:
|
||||
self.completions = Completions(self.params, router_obj=router_obj)
|
||||
|
||||
|
||||
class Completions:
|
||||
def __init__(self, params):
|
||||
def __init__(self, params, router_obj: Optional[Any]):
|
||||
self.params = params
|
||||
self.router_obj = router_obj
|
||||
|
||||
def create(self, messages, model=None, **kwargs):
|
||||
for k, v in kwargs.items():
|
||||
self.params[k] = v
|
||||
model = model or self.params.get("model")
|
||||
response = completion(model=model, messages=messages, **self.params)
|
||||
if self.router_obj is not None:
|
||||
response = self.router_obj.completion(
|
||||
model=model, messages=messages, **self.params
|
||||
)
|
||||
else:
|
||||
response = completion(model=model, messages=messages, **self.params)
|
||||
return response
|
||||
|
||||
|
||||
class AsyncCompletions:
|
||||
def __init__(self, params, router_obj: Optional[Any]):
|
||||
self.params = params
|
||||
self.router_obj = router_obj
|
||||
|
||||
async def create(self, messages, model=None, **kwargs):
|
||||
for k, v in kwargs.items():
|
||||
self.params[k] = v
|
||||
model = model or self.params.get("model")
|
||||
if self.router_obj is not None:
|
||||
response = await self.router_obj.acompletion(
|
||||
model=model, messages=messages, **self.params
|
||||
)
|
||||
else:
|
||||
response = await acompletion(model=model, messages=messages, **self.params)
|
||||
return response
|
||||
|
||||
|
||||
|
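The hunk above threads an optional `router_obj` through the `Chat` / `Completions` / `AsyncCompletions` wrappers: when a Router constructs the client, calls go through `router_obj.completion` / `router_obj.acompletion`; otherwise they fall back to plain `completion()` / `acompletion()`. A minimal sketch of the standalone path, assuming the class is exported at package level as `litellm.LiteLLM`:

import litellm

client = litellm.LiteLLM(api_key="sk-...")   # router_obj=None -> falls back to litellm.completion
response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hi"}],
)
print(response.choices[0].message.content)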
@ -571,6 +601,7 @@ def completion(
|
|||
"ttl",
|
||||
"cache",
|
||||
"no-log",
|
||||
"base_model",
|
||||
]
|
||||
default_params = openai_params + litellm_params
|
||||
non_default_params = {
|
||||
|
@ -639,7 +670,7 @@ def completion(
|
|||
elif (
|
||||
input_cost_per_second is not None
|
||||
): # time based pricing just needs cost in place
|
||||
output_cost_per_second = output_cost_per_second or 0.0
|
||||
output_cost_per_second = output_cost_per_second
|
||||
litellm.register_model(
|
||||
{
|
||||
f"{custom_llm_provider}/{model}": {
|
||||
|
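The pricing branch above only needs `input_cost_per_second`; the output cost defaults to 0.0 before `litellm.register_model` is called. A sketch of what it effectively registers, with an illustrative provider/model and price:

import litellm

# Roughly what the time-based pricing branch registers for a hypothetical deployment.
litellm.register_model(
    {
        "azure/whisper-1": {                  # f"{custom_llm_provider}/{model}"
            "input_cost_per_second": 0.0001,  # illustrative price
            "output_cost_per_second": 0.0,    # defaulted when only the input cost is given
        }
    }
)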
@ -1752,7 +1783,11 @@ def completion(
|
|||
timeout=timeout,
|
||||
)
|
||||
|
||||
if "stream" in optional_params and optional_params["stream"] == True:
|
||||
if (
|
||||
"stream" in optional_params
|
||||
and optional_params["stream"] == True
|
||||
and not isinstance(response, CustomStreamWrapper)
|
||||
):
|
||||
# don't try to access stream object,
|
||||
if "ai21" in model:
|
||||
response = CustomStreamWrapper(
|
||||
|
@ -2754,28 +2789,25 @@ def embedding(
|
|||
model_response=EmbeddingResponse(),
|
||||
)
|
||||
elif custom_llm_provider == "ollama":
|
||||
ollama_input = None
|
||||
if isinstance(input, list) and len(input) > 1:
|
||||
raise litellm.BadRequestError(
|
||||
message=f"Ollama Embeddings don't support batch embeddings",
|
||||
model=model, # type: ignore
|
||||
llm_provider="ollama", # type: ignore
|
||||
)
|
||||
if isinstance(input, list) and len(input) == 1:
|
||||
ollama_input = "".join(input[0])
|
||||
elif isinstance(input, str):
|
||||
ollama_input = input
|
||||
else:
|
||||
api_base = (
|
||||
litellm.api_base
|
||||
or api_base
|
||||
or get_secret("OLLAMA_API_BASE")
|
||||
or "http://localhost:11434"
|
||||
)
|
||||
if isinstance(input, str):
|
||||
input = [input]
|
||||
if not all(isinstance(item, str) for item in input):
|
||||
raise litellm.BadRequestError(
|
||||
message=f"Invalid input for ollama embeddings. input={input}",
|
||||
model=model, # type: ignore
|
||||
llm_provider="ollama", # type: ignore
|
||||
)
|
||||
|
||||
if aembedding == True:
|
||||
if aembedding:
|
||||
response = ollama.ollama_aembeddings(
|
||||
api_base=api_base,
|
||||
model=model,
|
||||
prompt=ollama_input,
|
||||
prompts=input,
|
||||
encoding=encoding,
|
||||
logging_obj=logging,
|
||||
optional_params=optional_params,
|
||||
|
|
|
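With the hunk above, the Ollama embedding path normalizes `input` to a list of strings and passes it through as `prompts`, so batch requests are accepted. A minimal sketch; the model name is illustrative, the api_base default comes from the hunk:

import litellm

response = litellm.embedding(
    model="ollama/nomic-embed-text",               # illustrative Ollama embedding model
    input=["first document", "second document"],   # list input is accepted after this change
    api_base="http://localhost:11434",             # default resolved in the hunk above
)
print(len(response.data))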
@ -1 +1 @@
|
|||
!function(){"use strict";var e,t,n,r,o,u,i,c,f,a={},l={};function d(e){var t=l[e];if(void 0!==t)return t.exports;var n=l[e]={id:e,loaded:!1,exports:{}},r=!0;try{a[e](n,n.exports,d),r=!1}finally{r&&delete l[e]}return n.loaded=!0,n.exports}d.m=a,e=[],d.O=function(t,n,r,o){if(n){o=o||0;for(var u=e.length;u>0&&e[u-1][2]>o;u--)e[u]=e[u-1];e[u]=[n,r,o];return}for(var i=1/0,u=0;u<e.length;u++){for(var n=e[u][0],r=e[u][1],o=e[u][2],c=!0,f=0;f<n.length;f++)i>=o&&Object.keys(d.O).every(function(e){return d.O[e](n[f])})?n.splice(f--,1):(c=!1,o<i&&(i=o));if(c){e.splice(u--,1);var a=r();void 0!==a&&(t=a)}}return t},d.n=function(e){var t=e&&e.__esModule?function(){return e.default}:function(){return e};return d.d(t,{a:t}),t},n=Object.getPrototypeOf?function(e){return Object.getPrototypeOf(e)}:function(e){return e.__proto__},d.t=function(e,r){if(1&r&&(e=this(e)),8&r||"object"==typeof e&&e&&(4&r&&e.__esModule||16&r&&"function"==typeof e.then))return e;var o=Object.create(null);d.r(o);var u={};t=t||[null,n({}),n([]),n(n)];for(var i=2&r&&e;"object"==typeof i&&!~t.indexOf(i);i=n(i))Object.getOwnPropertyNames(i).forEach(function(t){u[t]=function(){return e[t]}});return u.default=function(){return e},d.d(o,u),o},d.d=function(e,t){for(var n in t)d.o(t,n)&&!d.o(e,n)&&Object.defineProperty(e,n,{enumerable:!0,get:t[n]})},d.f={},d.e=function(e){return Promise.all(Object.keys(d.f).reduce(function(t,n){return d.f[n](e,t),t},[]))},d.u=function(e){},d.miniCssF=function(e){return"static/css/68a21c6e6697f7ca.css"},d.g=function(){if("object"==typeof globalThis)return globalThis;try{return this||Function("return this")()}catch(e){if("object"==typeof window)return window}}(),d.o=function(e,t){return Object.prototype.hasOwnProperty.call(e,t)},r={},o="_N_E:",d.l=function(e,t,n,u){if(r[e]){r[e].push(t);return}if(void 0!==n)for(var i,c,f=document.getElementsByTagName("script"),a=0;a<f.length;a++){var l=f[a];if(l.getAttribute("src")==e||l.getAttribute("data-webpack")==o+n){i=l;break}}i||(c=!0,(i=document.createElement("script")).charset="utf-8",i.timeout=120,d.nc&&i.setAttribute("nonce",d.nc),i.setAttribute("data-webpack",o+n),i.src=d.tu(e)),r[e]=[t];var s=function(t,n){i.onerror=i.onload=null,clearTimeout(p);var o=r[e];if(delete r[e],i.parentNode&&i.parentNode.removeChild(i),o&&o.forEach(function(e){return e(n)}),t)return t(n)},p=setTimeout(s.bind(null,void 0,{type:"timeout",target:i}),12e4);i.onerror=s.bind(null,i.onerror),i.onload=s.bind(null,i.onload),c&&document.head.appendChild(i)},d.r=function(e){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},d.nmd=function(e){return e.paths=[],e.children||(e.children=[]),e},d.tt=function(){return void 0===u&&(u={createScriptURL:function(e){return e}},"undefined"!=typeof trustedTypes&&trustedTypes.createPolicy&&(u=trustedTypes.createPolicy("nextjs#bundler",u))),u},d.tu=function(e){return d.tt().createScriptURL(e)},d.p="/ui/_next/",i={272:0},d.f.j=function(e,t){var n=d.o(i,e)?i[e]:void 0;if(0!==n){if(n)t.push(n[2]);else if(272!=e){var r=new Promise(function(t,r){n=i[e]=[t,r]});t.push(n[2]=r);var o=d.p+d.u(e),u=Error();d.l(o,function(t){if(d.o(i,e)&&(0!==(n=i[e])&&(i[e]=void 0),n)){var r=t&&("load"===t.type?"missing":t.type),o=t&&t.target&&t.target.src;u.message="Loading chunk "+e+" failed.\n("+r+": "+o+")",u.name="ChunkLoadError",u.type=r,u.request=o,n[1](u)}},"chunk-"+e,e)}else i[e]=0}},d.O.j=function(e){return 0===i[e]},c=function(e,t){var 
n,r,o=t[0],u=t[1],c=t[2],f=0;if(o.some(function(e){return 0!==i[e]})){for(n in u)d.o(u,n)&&(d.m[n]=u[n]);if(c)var a=c(d)}for(e&&e(t);f<o.length;f++)r=o[f],d.o(i,r)&&i[r]&&i[r][0](),i[r]=0;return d.O(a)},(f=self.webpackChunk_N_E=self.webpackChunk_N_E||[]).forEach(c.bind(null,0)),f.push=c.bind(null,f.push.bind(f))}();
|
||||
!function(){"use strict";var e,t,n,r,o,u,i,c,f,a={},l={};function d(e){var t=l[e];if(void 0!==t)return t.exports;var n=l[e]={id:e,loaded:!1,exports:{}},r=!0;try{a[e](n,n.exports,d),r=!1}finally{r&&delete l[e]}return n.loaded=!0,n.exports}d.m=a,e=[],d.O=function(t,n,r,o){if(n){o=o||0;for(var u=e.length;u>0&&e[u-1][2]>o;u--)e[u]=e[u-1];e[u]=[n,r,o];return}for(var i=1/0,u=0;u<e.length;u++){for(var n=e[u][0],r=e[u][1],o=e[u][2],c=!0,f=0;f<n.length;f++)i>=o&&Object.keys(d.O).every(function(e){return d.O[e](n[f])})?n.splice(f--,1):(c=!1,o<i&&(i=o));if(c){e.splice(u--,1);var a=r();void 0!==a&&(t=a)}}return t},d.n=function(e){var t=e&&e.__esModule?function(){return e.default}:function(){return e};return d.d(t,{a:t}),t},n=Object.getPrototypeOf?function(e){return Object.getPrototypeOf(e)}:function(e){return e.__proto__},d.t=function(e,r){if(1&r&&(e=this(e)),8&r||"object"==typeof e&&e&&(4&r&&e.__esModule||16&r&&"function"==typeof e.then))return e;var o=Object.create(null);d.r(o);var u={};t=t||[null,n({}),n([]),n(n)];for(var i=2&r&&e;"object"==typeof i&&!~t.indexOf(i);i=n(i))Object.getOwnPropertyNames(i).forEach(function(t){u[t]=function(){return e[t]}});return u.default=function(){return e},d.d(o,u),o},d.d=function(e,t){for(var n in t)d.o(t,n)&&!d.o(e,n)&&Object.defineProperty(e,n,{enumerable:!0,get:t[n]})},d.f={},d.e=function(e){return Promise.all(Object.keys(d.f).reduce(function(t,n){return d.f[n](e,t),t},[]))},d.u=function(e){},d.miniCssF=function(e){return"static/css/f8da5a6a5b29d249.css"},d.g=function(){if("object"==typeof globalThis)return globalThis;try{return this||Function("return this")()}catch(e){if("object"==typeof window)return window}}(),d.o=function(e,t){return Object.prototype.hasOwnProperty.call(e,t)},r={},o="_N_E:",d.l=function(e,t,n,u){if(r[e]){r[e].push(t);return}if(void 0!==n)for(var i,c,f=document.getElementsByTagName("script"),a=0;a<f.length;a++){var l=f[a];if(l.getAttribute("src")==e||l.getAttribute("data-webpack")==o+n){i=l;break}}i||(c=!0,(i=document.createElement("script")).charset="utf-8",i.timeout=120,d.nc&&i.setAttribute("nonce",d.nc),i.setAttribute("data-webpack",o+n),i.src=d.tu(e)),r[e]=[t];var s=function(t,n){i.onerror=i.onload=null,clearTimeout(p);var o=r[e];if(delete r[e],i.parentNode&&i.parentNode.removeChild(i),o&&o.forEach(function(e){return e(n)}),t)return t(n)},p=setTimeout(s.bind(null,void 0,{type:"timeout",target:i}),12e4);i.onerror=s.bind(null,i.onerror),i.onload=s.bind(null,i.onload),c&&document.head.appendChild(i)},d.r=function(e){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},d.nmd=function(e){return e.paths=[],e.children||(e.children=[]),e},d.tt=function(){return void 0===u&&(u={createScriptURL:function(e){return e}},"undefined"!=typeof trustedTypes&&trustedTypes.createPolicy&&(u=trustedTypes.createPolicy("nextjs#bundler",u))),u},d.tu=function(e){return d.tt().createScriptURL(e)},d.p="/ui/_next/",i={272:0},d.f.j=function(e,t){var n=d.o(i,e)?i[e]:void 0;if(0!==n){if(n)t.push(n[2]);else if(272!=e){var r=new Promise(function(t,r){n=i[e]=[t,r]});t.push(n[2]=r);var o=d.p+d.u(e),u=Error();d.l(o,function(t){if(d.o(i,e)&&(0!==(n=i[e])&&(i[e]=void 0),n)){var r=t&&("load"===t.type?"missing":t.type),o=t&&t.target&&t.target.src;u.message="Loading chunk "+e+" failed.\n("+r+": "+o+")",u.name="ChunkLoadError",u.type=r,u.request=o,n[1](u)}},"chunk-"+e,e)}else i[e]=0}},d.O.j=function(e){return 0===i[e]},c=function(e,t){var 
n,r,o=t[0],u=t[1],c=t[2],f=0;if(o.some(function(e){return 0!==i[e]})){for(n in u)d.o(u,n)&&(d.m[n]=u[n]);if(c)var a=c(d)}for(e&&e(t);f<o.length;f++)r=o[f],d.o(i,r)&&i[r]&&i[r][0](),i[r]=0;return d.O(a)},(f=self.webpackChunk_N_E=self.webpackChunk_N_E||[]).forEach(c.bind(null,0)),f.push=c.bind(null,f.push.bind(f))}();
|
|
@ -1 +1 @@
|
|||
<!DOCTYPE html><html id="__next_error__"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width, initial-scale=1"/><link rel="preload" as="script" fetchPriority="low" href="/ui/_next/static/chunks/webpack-3b0d290a8fe6941d.js" crossorigin=""/><script src="/ui/_next/static/chunks/fd9d1056-a85b2c176012d8e5.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/69-e1b183dda365ec86.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/main-app-9b4fb13a7db53edf.js" async="" crossorigin=""></script><title>🚅 LiteLLM</title><meta name="description" content="LiteLLM Proxy Admin UI"/><link rel="icon" href="/ui/favicon.ico" type="image/x-icon" sizes="16x16"/><meta name="next-size-adjust"/><script src="/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js" crossorigin="" noModule=""></script></head><body><script src="/ui/_next/static/chunks/webpack-3b0d290a8fe6941d.js" crossorigin="" async=""></script><script>(self.__next_f=self.__next_f||[]).push([0]);self.__next_f.push([2,null])</script><script>self.__next_f.push([1,"1:HL[\"/ui/_next/static/media/c9a5bc6a7c948fb0-s.p.woff2\",\"font\",{\"crossOrigin\":\"\",\"type\":\"font/woff2\"}]\n2:HL[\"/ui/_next/static/css/68a21c6e6697f7ca.css\",\"style\",{\"crossOrigin\":\"\"}]\n0:\"$L3\"\n"])</script><script>self.__next_f.push([1,"4:I[47690,[],\"\"]\n6:I[77831,[],\"\"]\n7:I[19914,[\"730\",\"static/chunks/730-1411b729a1c79695.js\",\"931\",\"static/chunks/app/page-df9015da04018cc1.js\"],\"\"]\n8:I[5613,[],\"\"]\n9:I[31778,[],\"\"]\nb:I[48955,[],\"\"]\nc:[]\n"])</script><script>self.__next_f.push([1,"3:[[[\"$\",\"link\",\"0\",{\"rel\":\"stylesheet\",\"href\":\"/ui/_next/static/css/68a21c6e6697f7ca.css\",\"precedence\":\"next\",\"crossOrigin\":\"\"}]],[\"$\",\"$L4\",null,{\"buildId\":\"tXZFkeqtgh-goIRVbw_9q\",\"assetPrefix\":\"/ui\",\"initialCanonicalUrl\":\"/\",\"initialTree\":[\"\",{\"children\":[\"__PAGE__\",{}]},\"$undefined\",\"$undefined\",true],\"initialSeedData\":[\"\",{\"children\":[\"__PAGE__\",{},[\"$L5\",[\"$\",\"$L6\",null,{\"propsForComponent\":{\"params\":{}},\"Component\":\"$7\",\"isStaticGeneration\":true}],null]]},[null,[\"$\",\"html\",null,{\"lang\":\"en\",\"children\":[\"$\",\"body\",null,{\"className\":\"__className_c23dc8\",\"children\":[\"$\",\"$L8\",null,{\"parallelRouterKey\":\"children\",\"segmentPath\":[\"children\"],\"loading\":\"$undefined\",\"loadingStyles\":\"$undefined\",\"loadingScripts\":\"$undefined\",\"hasLoading\":false,\"error\":\"$undefined\",\"errorStyles\":\"$undefined\",\"errorScripts\":\"$undefined\",\"template\":[\"$\",\"$L9\",null,{}],\"templateStyles\":\"$undefined\",\"templateScripts\":\"$undefined\",\"notFound\":[[\"$\",\"title\",null,{\"children\":\"404: This page could not be found.\"}],[\"$\",\"div\",null,{\"style\":{\"fontFamily\":\"system-ui,\\\"Segoe UI\\\",Roboto,Helvetica,Arial,sans-serif,\\\"Apple Color Emoji\\\",\\\"Segoe UI Emoji\\\"\",\"height\":\"100vh\",\"textAlign\":\"center\",\"display\":\"flex\",\"flexDirection\":\"column\",\"alignItems\":\"center\",\"justifyContent\":\"center\"},\"children\":[\"$\",\"div\",null,{\"children\":[[\"$\",\"style\",null,{\"dangerouslySetInnerHTML\":{\"__html\":\"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}\"}}],[\"$\",\"h1\",null,{\"className\":\"next-error-h1\",\"style\":{\"display\":\"inline-block\",\"margin\":\"0 20px 0 
0\",\"padding\":\"0 23px 0 0\",\"fontSize\":24,\"fontWeight\":500,\"verticalAlign\":\"top\",\"lineHeight\":\"49px\"},\"children\":\"404\"}],[\"$\",\"div\",null,{\"style\":{\"display\":\"inline-block\"},\"children\":[\"$\",\"h2\",null,{\"style\":{\"fontSize\":14,\"fontWeight\":400,\"lineHeight\":\"49px\",\"margin\":0},\"children\":\"This page could not be found.\"}]}]]}]}]],\"notFoundStyles\":[],\"styles\":null}]}]}],null]],\"initialHead\":[false,\"$La\"],\"globalErrorComponent\":\"$b\",\"missingSlots\":\"$Wc\"}]]\n"])</script><script>self.__next_f.push([1,"a:[[\"$\",\"meta\",\"0\",{\"name\":\"viewport\",\"content\":\"width=device-width, initial-scale=1\"}],[\"$\",\"meta\",\"1\",{\"charSet\":\"utf-8\"}],[\"$\",\"title\",\"2\",{\"children\":\"🚅 LiteLLM\"}],[\"$\",\"meta\",\"3\",{\"name\":\"description\",\"content\":\"LiteLLM Proxy Admin UI\"}],[\"$\",\"link\",\"4\",{\"rel\":\"icon\",\"href\":\"/ui/favicon.ico\",\"type\":\"image/x-icon\",\"sizes\":\"16x16\"}],[\"$\",\"meta\",\"5\",{\"name\":\"next-size-adjust\"}]]\n5:null\n"])</script><script>self.__next_f.push([1,""])</script></body></html>
|
||||
<!DOCTYPE html><html id="__next_error__"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width, initial-scale=1"/><link rel="preload" as="script" fetchPriority="low" href="/ui/_next/static/chunks/webpack-d1ad37b1875df240.js" crossorigin=""/><script src="/ui/_next/static/chunks/fd9d1056-a507ee9e75a3be72.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/69-589b47e7a69d316f.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/main-app-9b4fb13a7db53edf.js" async="" crossorigin=""></script><title>🚅 LiteLLM</title><meta name="description" content="LiteLLM Proxy Admin UI"/><link rel="icon" href="/ui/favicon.ico" type="image/x-icon" sizes="16x16"/><meta name="next-size-adjust"/><script src="/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js" crossorigin="" noModule=""></script></head><body><script src="/ui/_next/static/chunks/webpack-d1ad37b1875df240.js" crossorigin="" async=""></script><script>(self.__next_f=self.__next_f||[]).push([0]);self.__next_f.push([2,null])</script><script>self.__next_f.push([1,"1:HL[\"/ui/_next/static/media/c9a5bc6a7c948fb0-s.p.woff2\",\"font\",{\"crossOrigin\":\"\",\"type\":\"font/woff2\"}]\n2:HL[\"/ui/_next/static/css/f8da5a6a5b29d249.css\",\"style\",{\"crossOrigin\":\"\"}]\n0:\"$L3\"\n"])</script><script>self.__next_f.push([1,"4:I[47690,[],\"\"]\n6:I[77831,[],\"\"]\n7:I[90177,[\"798\",\"static/chunks/798-4baed68da0c5497d.js\",\"931\",\"static/chunks/app/page-a5a04da2a9356785.js\"],\"\"]\n8:I[5613,[],\"\"]\n9:I[31778,[],\"\"]\nb:I[48955,[],\"\"]\nc:[]\n"])</script><script>self.__next_f.push([1,"3:[[[\"$\",\"link\",\"0\",{\"rel\":\"stylesheet\",\"href\":\"/ui/_next/static/css/f8da5a6a5b29d249.css\",\"precedence\":\"next\",\"crossOrigin\":\"\"}]],[\"$\",\"$L4\",null,{\"buildId\":\"DptMjzo5xd96cx0b56k4u\",\"assetPrefix\":\"/ui\",\"initialCanonicalUrl\":\"/\",\"initialTree\":[\"\",{\"children\":[\"__PAGE__\",{}]},\"$undefined\",\"$undefined\",true],\"initialSeedData\":[\"\",{\"children\":[\"__PAGE__\",{},[\"$L5\",[\"$\",\"$L6\",null,{\"propsForComponent\":{\"params\":{}},\"Component\":\"$7\",\"isStaticGeneration\":true}],null]]},[null,[\"$\",\"html\",null,{\"lang\":\"en\",\"children\":[\"$\",\"body\",null,{\"className\":\"__className_c23dc8\",\"children\":[\"$\",\"$L8\",null,{\"parallelRouterKey\":\"children\",\"segmentPath\":[\"children\"],\"loading\":\"$undefined\",\"loadingStyles\":\"$undefined\",\"loadingScripts\":\"$undefined\",\"hasLoading\":false,\"error\":\"$undefined\",\"errorStyles\":\"$undefined\",\"errorScripts\":\"$undefined\",\"template\":[\"$\",\"$L9\",null,{}],\"templateStyles\":\"$undefined\",\"templateScripts\":\"$undefined\",\"notFound\":[[\"$\",\"title\",null,{\"children\":\"404: This page could not be found.\"}],[\"$\",\"div\",null,{\"style\":{\"fontFamily\":\"system-ui,\\\"Segoe UI\\\",Roboto,Helvetica,Arial,sans-serif,\\\"Apple Color Emoji\\\",\\\"Segoe UI Emoji\\\"\",\"height\":\"100vh\",\"textAlign\":\"center\",\"display\":\"flex\",\"flexDirection\":\"column\",\"alignItems\":\"center\",\"justifyContent\":\"center\"},\"children\":[\"$\",\"div\",null,{\"children\":[[\"$\",\"style\",null,{\"dangerouslySetInnerHTML\":{\"__html\":\"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}\"}}],[\"$\",\"h1\",null,{\"className\":\"next-error-h1\",\"style\":{\"display\":\"inline-block\",\"margin\":\"0 20px 0 
0\",\"padding\":\"0 23px 0 0\",\"fontSize\":24,\"fontWeight\":500,\"verticalAlign\":\"top\",\"lineHeight\":\"49px\"},\"children\":\"404\"}],[\"$\",\"div\",null,{\"style\":{\"display\":\"inline-block\"},\"children\":[\"$\",\"h2\",null,{\"style\":{\"fontSize\":14,\"fontWeight\":400,\"lineHeight\":\"49px\",\"margin\":0},\"children\":\"This page could not be found.\"}]}]]}]}]],\"notFoundStyles\":[],\"styles\":null}]}]}],null]],\"initialHead\":[false,\"$La\"],\"globalErrorComponent\":\"$b\",\"missingSlots\":\"$Wc\"}]]\n"])</script><script>self.__next_f.push([1,"a:[[\"$\",\"meta\",\"0\",{\"name\":\"viewport\",\"content\":\"width=device-width, initial-scale=1\"}],[\"$\",\"meta\",\"1\",{\"charSet\":\"utf-8\"}],[\"$\",\"title\",\"2\",{\"children\":\"🚅 LiteLLM\"}],[\"$\",\"meta\",\"3\",{\"name\":\"description\",\"content\":\"LiteLLM Proxy Admin UI\"}],[\"$\",\"link\",\"4\",{\"rel\":\"icon\",\"href\":\"/ui/favicon.ico\",\"type\":\"image/x-icon\",\"sizes\":\"16x16\"}],[\"$\",\"meta\",\"5\",{\"name\":\"next-size-adjust\"}]]\n5:null\n"])</script><script>self.__next_f.push([1,""])</script></body></html>
|
|
@ -1,7 +1,7 @@
|
|||
2:I[77831,[],""]
|
||||
3:I[19914,["730","static/chunks/730-1411b729a1c79695.js","931","static/chunks/app/page-df9015da04018cc1.js"],""]
|
||||
3:I[90177,["798","static/chunks/798-4baed68da0c5497d.js","931","static/chunks/app/page-a5a04da2a9356785.js"],""]
|
||||
4:I[5613,[],""]
|
||||
5:I[31778,[],""]
|
||||
0:["tXZFkeqtgh-goIRVbw_9q",[[["",{"children":["__PAGE__",{}]},"$undefined","$undefined",true],["",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_c23dc8","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/68a21c6e6697f7ca.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
|
||||
0:["DptMjzo5xd96cx0b56k4u",[[["",{"children":["__PAGE__",{}]},"$undefined","$undefined",true],["",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_c23dc8","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/f8da5a6a5b29d249.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
|
||||
6:[["$","meta","0",{"name":"viewport","content":"width=device-width, initial-scale=1"}],["$","meta","1",{"charSet":"utf-8"}],["$","title","2",{"children":"🚅 LiteLLM"}],["$","meta","3",{"name":"description","content":"LiteLLM Proxy Admin UI"}],["$","link","4",{"rel":"icon","href":"/ui/favicon.ico","type":"image/x-icon","sizes":"16x16"}],["$","meta","5",{"name":"next-size-adjust"}]]
|
||||
1:null
|
||||
|
|
|
@ -1,21 +1,20 @@
|
|||
model_list:
|
||||
- model_name: fake_openai
|
||||
- model_name: fake-openai-endpoint
|
||||
litellm_params:
|
||||
model: openai/my-fake-model
|
||||
api_key: my-fake-key
|
||||
api_base: http://0.0.0.0:8080
|
||||
- model_name: gpt-3.5-turbo
|
||||
litellm_params:
|
||||
model: gpt-3.5-turbo-1106
|
||||
api_key: os.environ/OPENAI_API_KEY
|
||||
api_base: https://exampleopenaiendpoint-production.up.railway.app/
|
||||
|
||||
litellm_settings:
|
||||
cache: true
|
||||
cache_params:
|
||||
type: redis
|
||||
callbacks: ["batch_redis_requests"]
|
||||
# success_callbacks: ["langfuse"]
|
||||
max_budget: 600020
|
||||
budget_duration: 30d
|
||||
|
||||
general_settings:
|
||||
master_key: sk-1234
|
||||
database_url: "postgresql://neondb_owner:hz8tyUlJ5ivV@ep-cool-sunset-a5ywubeh.us-east-2.aws.neon.tech/neondb?sslmode=require"
|
||||
proxy_batch_write_at: 60 # 👈 Frequency of batch writing logs to server (in seconds)
|
||||
enable_jwt_auth: True
|
||||
alerting: ["slack"]
|
||||
litellm_jwtauth:
|
||||
admin_jwt_scope: "litellm_proxy_admin"
|
||||
team_jwt_scope: "litellm_team"
|
||||
public_key_ttl: 600
|
|
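The config above exposes `fake-openai-endpoint` and `gpt-3.5-turbo` behind the proxy with Redis caching and JWT auth enabled. A minimal sketch of calling it with the OpenAI SDK, assuming the proxy is running locally on port 4000 and the master key from the config is used:

import openai

client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")
response = client.chat.completions.create(
    model="fake-openai-endpoint",   # model_name alias from the config above
    messages=[{"role": "user", "content": "ping"}],
)
print(response.choices[0].message.content)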
@ -1,4 +1,5 @@
|
|||
from pydantic import BaseModel, Extra, Field, root_validator, Json
|
||||
from pydantic import BaseModel, Extra, Field, root_validator, Json, validator
|
||||
from dataclasses import fields
|
||||
import enum
|
||||
from typing import Optional, List, Union, Dict, Literal, Any
|
||||
from datetime import datetime
|
||||
|
@ -14,11 +15,6 @@ def hash_token(token: str):
|
|||
return hashed_token
|
||||
|
||||
|
||||
class LiteLLMProxyRoles(enum.Enum):
|
||||
PROXY_ADMIN = "litellm_proxy_admin"
|
||||
USER = "litellm_user"
|
||||
|
||||
|
||||
class LiteLLMBase(BaseModel):
|
||||
"""
|
||||
Implements default functions, all pydantic objects should have.
|
||||
|
@ -42,6 +38,135 @@ class LiteLLMBase(BaseModel):
|
|||
protected_namespaces = ()
|
||||
|
||||
|
||||
class LiteLLMRoutes(enum.Enum):
|
||||
openai_routes: List = [ # chat completions
|
||||
"/openai/deployments/{model}/chat/completions",
|
||||
"/chat/completions",
|
||||
"/v1/chat/completions",
|
||||
# completions
|
||||
"/openai/deployments/{model}/completions",
|
||||
"/completions",
|
||||
"/v1/completions",
|
||||
# embeddings
|
||||
"/openai/deployments/{model}/embeddings",
|
||||
"/embeddings",
|
||||
"/v1/embeddings",
|
||||
# image generation
|
||||
"/images/generations",
|
||||
"/v1/images/generations",
|
||||
# audio transcription
|
||||
"/audio/transcriptions",
|
||||
"/v1/audio/transcriptions",
|
||||
# moderations
|
||||
"/moderations",
|
||||
"/v1/moderations",
|
||||
# models
|
||||
"/models",
|
||||
"/v1/models",
|
||||
]
|
||||
|
||||
info_routes: List = ["/key/info", "/team/info", "/user/info", "/model/info"]
|
||||
|
||||
management_routes: List = [ # key
|
||||
"/key/generate",
|
||||
"/key/update",
|
||||
"/key/delete",
|
||||
"/key/info",
|
||||
# user
|
||||
"/user/new",
|
||||
"/user/update",
|
||||
"/user/delete",
|
||||
"/user/info",
|
||||
# team
|
||||
"/team/new",
|
||||
"/team/update",
|
||||
"/team/delete",
|
||||
"/team/info",
|
||||
"/team/block",
|
||||
"/team/unblock",
|
||||
# model
|
||||
"/model/new",
|
||||
"/model/update",
|
||||
"/model/delete",
|
||||
"/model/info",
|
||||
]
|
||||
|
||||
|
||||
class LiteLLM_JWTAuth(LiteLLMBase):
|
||||
"""
|
||||
A class to define the roles and permissions for a LiteLLM Proxy w/ JWT Auth.
|
||||
|
||||
Attributes:
|
||||
- admin_jwt_scope: The JWT scope required for proxy admin roles.
|
||||
- admin_allowed_routes: list of allowed routes for proxy admin roles.
|
||||
- team_jwt_scope: The JWT scope required for proxy team roles.
|
||||
- team_id_jwt_field: The field in the JWT token that stores the team ID. Default - `client_id`.
|
||||
- team_allowed_routes: list of allowed routes for proxy team roles.
|
||||
- end_user_id_jwt_field: Default - `sub`. The field in the JWT token that stores the end-user ID. Turn this off by setting to `None`. Enables end-user cost tracking.
|
||||
- public_key_ttl: Default - 600s. TTL for caching public JWT keys.
|
||||
|
||||
See `auth_checks.py` for the specific routes
|
||||
"""
|
||||
|
||||
admin_jwt_scope: str = "litellm_proxy_admin"
|
||||
admin_allowed_routes: List[
|
||||
Literal["openai_routes", "info_routes", "management_routes"]
|
||||
] = ["management_routes"]
|
||||
team_jwt_scope: str = "litellm_team"
|
||||
team_id_jwt_field: str = "client_id"
|
||||
team_allowed_routes: List[
|
||||
Literal["openai_routes", "info_routes", "management_routes"]
|
||||
] = ["openai_routes", "info_routes"]
|
||||
end_user_id_jwt_field: Optional[str] = "sub"
|
||||
public_key_ttl: float = 600
|
||||
|
||||
def __init__(self, **kwargs: Any) -> None:
|
||||
# get the attribute names for this Pydantic model
|
||||
allowed_keys = self.__annotations__.keys()
|
||||
|
||||
invalid_keys = set(kwargs.keys()) - allowed_keys
|
||||
|
||||
if invalid_keys:
|
||||
raise ValueError(
|
||||
f"Invalid arguments provided: {', '.join(invalid_keys)}. Allowed arguments are: {', '.join(allowed_keys)}."
|
||||
)
|
||||
|
||||
super().__init__(**kwargs)
|
||||
|
||||
|
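`LiteLLM_JWTAuth.__init__` rejects any keyword that is not a declared field, so typos in `general_settings.litellm_jwtauth` fail fast. A minimal sketch using the defaults shown above:

from litellm.proxy._types import LiteLLM_JWTAuth

jwt_auth = LiteLLM_JWTAuth(
    admin_jwt_scope="litellm_proxy_admin",
    team_jwt_scope="litellm_team",
    team_allowed_routes=["openai_routes", "info_routes"],
    public_key_ttl=600,
)
# LiteLLM_JWTAuth(bad_key=True) would raise:
#   ValueError: Invalid arguments provided: bad_key. Allowed arguments are: ...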
||||
class LiteLLMPromptInjectionParams(LiteLLMBase):
|
||||
heuristics_check: bool = False
|
||||
vector_db_check: bool = False
|
||||
llm_api_check: bool = False
|
||||
llm_api_name: Optional[str] = None
|
||||
llm_api_system_prompt: Optional[str] = None
|
||||
llm_api_fail_call_string: Optional[str] = None
|
||||
|
||||
@root_validator(pre=True)
|
||||
def check_llm_api_params(cls, values):
|
||||
llm_api_check = values.get("llm_api_check")
|
||||
if llm_api_check is True:
|
||||
if "llm_api_name" not in values or not values["llm_api_name"]:
|
||||
raise ValueError(
|
||||
"If llm_api_check is set to True, llm_api_name must be provided"
|
||||
)
|
||||
if (
|
||||
"llm_api_system_prompt" not in values
|
||||
or not values["llm_api_system_prompt"]
|
||||
):
|
||||
raise ValueError(
|
||||
"If llm_api_check is set to True, llm_api_system_prompt must be provided"
|
||||
)
|
||||
if (
|
||||
"llm_api_fail_call_string" not in values
|
||||
or not values["llm_api_fail_call_string"]
|
||||
):
|
||||
raise ValueError(
|
||||
"If llm_api_check is set to True, llm_api_fail_call_string must be provided"
|
||||
)
|
||||
return values
|
||||
|
||||
|
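The root validator above makes the three `llm_api_*` fields mandatory whenever `llm_api_check` is enabled. A minimal sketch with illustrative values:

from litellm.proxy._types import LiteLLMPromptInjectionParams

params = LiteLLMPromptInjectionParams(
    llm_api_check=True,
    llm_api_name="gpt-3.5-turbo",                      # illustrative judge model
    llm_api_system_prompt="Detect prompt injection.",  # illustrative system prompt
    llm_api_fail_call_string="BLOCKED",                # illustrative failure marker
)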
||||
######### Request Class Definition ######
|
||||
class ProxyChatCompletionRequest(LiteLLMBase):
|
||||
model: str
|
||||
|
@ -180,7 +305,7 @@ class GenerateKeyResponse(GenerateKeyRequest):
|
|||
key: str
|
||||
key_name: Optional[str] = None
|
||||
expires: Optional[datetime]
|
||||
user_id: str
|
||||
user_id: Optional[str] = None
|
||||
|
||||
@root_validator(pre=True)
|
||||
def set_model_info(cls, values):
|
||||
|
@ -274,6 +399,7 @@ class TeamBase(LiteLLMBase):
|
|||
rpm_limit: Optional[int] = None
|
||||
max_budget: Optional[float] = None
|
||||
models: list = []
|
||||
blocked: bool = False
|
||||
|
||||
|
||||
class NewTeamRequest(TeamBase):
|
||||
|
@ -301,19 +427,18 @@ class TeamMemberDeleteRequest(LiteLLMBase):
|
|||
return values
|
||||
|
||||
|
||||
class UpdateTeamRequest(LiteLLMBase):
|
||||
class UpdateTeamRequest(TeamBase):
|
||||
team_id: str # required
|
||||
team_alias: Optional[str] = None
|
||||
admins: Optional[list] = None
|
||||
members: Optional[list] = None
|
||||
members_with_roles: Optional[List[Member]] = None
|
||||
metadata: Optional[dict] = None
|
||||
|
||||
|
||||
class DeleteTeamRequest(LiteLLMBase):
|
||||
team_ids: List[str] # required
|
||||
|
||||
|
||||
class BlockTeamRequest(LiteLLMBase):
|
||||
team_id: str # required
|
||||
|
||||
|
||||
class LiteLLM_TeamTable(TeamBase):
|
||||
spend: Optional[float] = None
|
||||
max_parallel_requests: Optional[int] = None
|
||||
|
@ -498,6 +623,9 @@ class ConfigGeneralSettings(LiteLLMBase):
|
|||
ui_access_mode: Optional[Literal["admin_only", "all"]] = Field(
|
||||
"all", description="Control access to the Proxy UI"
|
||||
)
|
||||
allowed_routes: Optional[List] = Field(
|
||||
None, description="Proxy API Endpoints you want users to be able to access"
|
||||
)
|
||||
|
||||
|
||||
class ConfigYAML(LiteLLMBase):
|
||||
|
@ -565,6 +693,8 @@ class LiteLLM_VerificationTokenView(LiteLLM_VerificationToken):
|
|||
team_tpm_limit: Optional[int] = None
|
||||
team_rpm_limit: Optional[int] = None
|
||||
team_max_budget: Optional[float] = None
|
||||
team_models: List = []
|
||||
team_blocked: bool = False
|
||||
soft_budget: Optional[float] = None
|
||||
team_model_aliases: Optional[Dict] = None
|
||||
|
||||
|
|
|
@ -8,45 +8,160 @@ Run checks for:
|
|||
2. If user is in budget
|
||||
3. If end_user ('user' passed to /chat/completions, /embeddings endpoint) is in budget
|
||||
"""
|
||||
from litellm.proxy._types import LiteLLM_UserTable, LiteLLM_EndUserTable
|
||||
from typing import Optional
|
||||
from litellm.proxy._types import (
|
||||
LiteLLM_UserTable,
|
||||
LiteLLM_EndUserTable,
|
||||
LiteLLM_JWTAuth,
|
||||
LiteLLM_TeamTable,
|
||||
LiteLLMRoutes,
|
||||
)
|
||||
from typing import Optional, Literal, Union
|
||||
from litellm.proxy.utils import PrismaClient
|
||||
from litellm.caching import DualCache
|
||||
import litellm
|
||||
|
||||
all_routes = LiteLLMRoutes.openai_routes.value + LiteLLMRoutes.management_routes.value
|
||||
|
||||
|
||||
def common_checks(
|
||||
request_body: dict,
|
||||
user_object: LiteLLM_UserTable,
|
||||
team_object: LiteLLM_TeamTable,
|
||||
end_user_object: Optional[LiteLLM_EndUserTable],
|
||||
global_proxy_spend: Optional[float],
|
||||
general_settings: dict,
|
||||
route: str,
|
||||
) -> bool:
|
||||
"""
|
||||
Common checks across jwt + key-based auth.
|
||||
|
||||
1. If team is blocked
|
||||
2. If team can call model
|
||||
3. If team is in budget
|
||||
4. If end_user ('user' passed to /chat/completions, /embeddings endpoint) is in budget
|
||||
5. [OPTIONAL] If 'enforce_end_user' enabled - did developer pass in 'user' param for openai endpoints
|
||||
6. [OPTIONAL] If 'litellm.max_budget' is set (>0), is proxy under budget
|
||||
"""
|
||||
_model = request_body.get("model", None)
|
||||
# 1. If user can call model
|
||||
if team_object.blocked == True:
|
||||
raise Exception(
|
||||
f"Team={team_object.team_id} is blocked. Update via `/team/unblock` if your admin."
|
||||
)
|
||||
# 2. If user can call model
|
||||
if (
|
||||
_model is not None
|
||||
and len(user_object.models) > 0
|
||||
and _model not in user_object.models
|
||||
and len(team_object.models) > 0
|
||||
and _model not in team_object.models
|
||||
):
|
||||
raise Exception(
|
||||
f"User={user_object.user_id} not allowed to call model={_model}. Allowed user models = {user_object.models}"
|
||||
f"Team={team_object.team_id} not allowed to call model={_model}. Allowed team models = {team_object.models}"
|
||||
)
|
||||
# 2. If user is in budget
|
||||
# 3. If team is in budget
|
||||
if (
|
||||
user_object.max_budget is not None
|
||||
and user_object.spend > user_object.max_budget
|
||||
team_object.max_budget is not None
|
||||
and team_object.spend is not None
|
||||
and team_object.spend > team_object.max_budget
|
||||
):
|
||||
raise Exception(
|
||||
f"User={user_object.user_id} over budget. Spend={user_object.spend}, Budget={user_object.max_budget}"
|
||||
f"Team={team_object.team_id} over budget. Spend={team_object.spend}, Budget={team_object.max_budget}"
|
||||
)
|
||||
# 3. If end_user ('user' passed to /chat/completions, /embeddings endpoint) is in budget
|
||||
# 4. If end_user ('user' passed to /chat/completions, /embeddings endpoint) is in budget
|
||||
if end_user_object is not None and end_user_object.litellm_budget_table is not None:
|
||||
end_user_budget = end_user_object.litellm_budget_table.max_budget
|
||||
if end_user_budget is not None and end_user_object.spend > end_user_budget:
|
||||
raise Exception(
|
||||
f"End User={end_user_object.user_id} over budget. Spend={end_user_object.spend}, Budget={end_user_budget}"
|
||||
f"ExceededBudget: End User={end_user_object.user_id} over budget. Spend={end_user_object.spend}, Budget={end_user_budget}"
|
||||
)
|
||||
# 5. [OPTIONAL] If 'enforce_user_param' enabled - did developer pass in 'user' param for openai endpoints
|
||||
if (
|
||||
general_settings.get("enforce_user_param", None) is not None
|
||||
and general_settings["enforce_user_param"] == True
|
||||
):
|
||||
if route in LiteLLMRoutes.openai_routes.value and "user" not in request_body:
|
||||
raise Exception(
|
||||
f"'user' param not passed in. 'enforce_user_param'={general_settings['enforce_user_param']}"
|
||||
)
|
||||
# 6. [OPTIONAL] If 'litellm.max_budget' is set (>0), is proxy under budget
|
||||
if litellm.max_budget > 0 and global_proxy_spend is not None:
|
||||
if global_proxy_spend > litellm.max_budget:
|
||||
raise Exception(
|
||||
f"ExceededBudget: LiteLLM Proxy has exceeded its budget. Current spend: {global_proxy_spend}; Max Budget: {litellm.max_budget}"
|
||||
)
|
||||
return True
|
||||
|
||||
|
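A sketch of how an auth layer might invoke `common_checks` for a /chat/completions request; `user_row` and `team_row` stand in for previously loaded `LiteLLM_UserTable` / `LiteLLM_TeamTable` rows:

common_checks(
    request_body={"model": "gpt-3.5-turbo", "user": "end-user-1"},
    user_object=user_row,                           # hypothetical LiteLLM_UserTable row
    team_object=team_row,                           # hypothetical LiteLLM_TeamTable row
    end_user_object=None,                           # skips the end-user budget check
    global_proxy_spend=None,                        # skips the global proxy budget check
    general_settings={"enforce_user_param": True},
    route="/chat/completions",
)  # raises on the first failed check, otherwise returns True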
||||
def _allowed_routes_check(user_route: str, allowed_routes: list) -> bool:
|
||||
for allowed_route in allowed_routes:
|
||||
if (
|
||||
allowed_route == LiteLLMRoutes.openai_routes.name
|
||||
and user_route in LiteLLMRoutes.openai_routes.value
|
||||
):
|
||||
return True
|
||||
elif (
|
||||
allowed_route == LiteLLMRoutes.info_routes.name
|
||||
and user_route in LiteLLMRoutes.info_routes.value
|
||||
):
|
||||
return True
|
||||
elif (
|
||||
allowed_route == LiteLLMRoutes.management_routes.name
|
||||
and user_route in LiteLLMRoutes.management_routes.value
|
||||
):
|
||||
return True
|
||||
elif allowed_route == user_route:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def allowed_routes_check(
|
||||
user_role: Literal["proxy_admin", "team"],
|
||||
user_route: str,
|
||||
litellm_proxy_roles: LiteLLM_JWTAuth,
|
||||
) -> bool:
|
||||
"""
|
||||
Check whether a user with the given role is allowed to access this route.
|
||||
"""
|
||||
|
||||
if user_role == "proxy_admin":
|
||||
if litellm_proxy_roles.admin_allowed_routes is None:
|
||||
is_allowed = _allowed_routes_check(
|
||||
user_route=user_route, allowed_routes=["management_routes"]
|
||||
)
|
||||
return is_allowed
|
||||
elif litellm_proxy_roles.admin_allowed_routes is not None:
|
||||
is_allowed = _allowed_routes_check(
|
||||
user_route=user_route,
|
||||
allowed_routes=litellm_proxy_roles.admin_allowed_routes,
|
||||
)
|
||||
return is_allowed
|
||||
|
||||
elif user_role == "team":
|
||||
if litellm_proxy_roles.team_allowed_routes is None:
|
||||
"""
|
||||
By default allow a team to call openai + info routes
|
||||
"""
|
||||
is_allowed = _allowed_routes_check(
|
||||
user_route=user_route, allowed_routes=["openai_routes", "info_routes"]
|
||||
)
|
||||
return is_allowed
|
||||
elif litellm_proxy_roles.team_allowed_routes is not None:
|
||||
is_allowed = _allowed_routes_check(
|
||||
user_route=user_route,
|
||||
allowed_routes=litellm_proxy_roles.team_allowed_routes,
|
||||
)
|
||||
return is_allowed
|
||||
return False
|
||||
|
||||
|
||||
def get_actual_routes(allowed_routes: list) -> list:
|
||||
actual_routes: list = []
|
||||
for route_name in allowed_routes:
|
||||
try:
|
||||
route_value = LiteLLMRoutes[route_name].value
|
||||
actual_routes = actual_routes + route_value
|
||||
except KeyError:
|
||||
actual_routes.append(route_name)
|
||||
return actual_routes
|
||||
|
||||
|
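A short sketch of the two route helpers above, using the defaults from `LiteLLM_JWTAuth`:

from litellm.proxy._types import LiteLLM_JWTAuth

allowed_routes_check(
    user_role="team",
    user_route="/chat/completions",
    litellm_proxy_roles=LiteLLM_JWTAuth(),
)  # True - teams default to openai_routes + info_routes

get_actual_routes(["info_routes", "/custom/route"])
# ["/key/info", "/team/info", "/user/info", "/model/info", "/custom/route"]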
||||
async def get_end_user_object(
|
||||
end_user_id: Optional[str],
|
||||
prisma_client: Optional[PrismaClient],
|
||||
|
@ -82,3 +197,75 @@ async def get_end_user_object(
|
|||
return LiteLLM_EndUserTable(**response.dict())
|
||||
except Exception as e: # if end-user not in db
|
||||
return None
|
||||
|
||||
|
||||
async def get_user_object(self, user_id: str) -> LiteLLM_UserTable:
|
||||
"""
|
||||
- Check if user id in proxy User Table
|
||||
- if valid, return LiteLLM_UserTable object with defined limits
|
||||
- if not, then raise an error
|
||||
"""
|
||||
if self.prisma_client is None:
|
||||
raise Exception(
|
||||
"No DB Connected. See - https://docs.litellm.ai/docs/proxy/virtual_keys"
|
||||
)
|
||||
|
||||
# check if in cache
|
||||
cached_user_obj = await self.user_api_key_cache.async_get_cache(key=user_id)
|
||||
if cached_user_obj is not None:
|
||||
if isinstance(cached_user_obj, dict):
|
||||
return LiteLLM_UserTable(**cached_user_obj)
|
||||
elif isinstance(cached_user_obj, LiteLLM_UserTable):
|
||||
return cached_user_obj
|
||||
# else, check db
|
||||
try:
|
||||
response = await self.prisma_client.db.litellm_usertable.find_unique(
|
||||
where={"user_id": user_id}
|
||||
)
|
||||
|
||||
if response is None:
|
||||
raise Exception
|
||||
|
||||
return LiteLLM_UserTable(**response.dict())
|
||||
except Exception as e:
|
||||
raise Exception(
|
||||
f"User doesn't exist in db. User={user_id}. Create user via `/user/new` call."
|
||||
)
|
||||
|
||||
|
||||
async def get_team_object(
|
||||
team_id: str,
|
||||
prisma_client: Optional[PrismaClient],
|
||||
user_api_key_cache: DualCache,
|
||||
) -> LiteLLM_TeamTable:
|
||||
"""
|
||||
- Check if team id in proxy Team Table
|
||||
- if valid, return LiteLLM_TeamTable object with defined limits
|
||||
- if not, then raise an error
|
||||
"""
|
||||
if prisma_client is None:
|
||||
raise Exception(
|
||||
"No DB Connected. See - https://docs.litellm.ai/docs/proxy/virtual_keys"
|
||||
)
|
||||
|
||||
# check if in cache
|
||||
cached_team_obj = await user_api_key_cache.async_get_cache(key=team_id)
|
||||
if cached_team_obj is not None:
|
||||
if isinstance(cached_team_obj, dict):
|
||||
return LiteLLM_TeamTable(**cached_team_obj)
|
||||
elif isinstance(cached_team_obj, LiteLLM_TeamTable):
|
||||
return cached_team_obj
|
||||
# else, check db
|
||||
try:
|
||||
response = await prisma_client.db.litellm_teamtable.find_unique(
|
||||
where={"team_id": team_id}
|
||||
)
|
||||
|
||||
if response is None:
|
||||
raise Exception
|
||||
|
||||
return LiteLLM_TeamTable(**response.dict())
|
||||
except Exception as e:
|
||||
raise Exception(
|
||||
f"Team doesn't exist in db. Team={team_id}. Create team via `/team/new` call."
|
||||
)
|
||||
|
|
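A sketch of resolving the team for a JWT-authenticated request with the helpers above; `jwt_handler`, `payload`, `prisma_client`, and `user_api_key_cache` are assumed to be initialized elsewhere in the proxy:

team_id = jwt_handler.get_team_id(payload, default_value=None)
if team_id is not None:
    team_object = await get_team_object(
        team_id=team_id,
        prisma_client=prisma_client,
        user_api_key_cache=user_api_key_cache,
    )  # raises if the team is not in the DB - create it via /team/new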
|
@ -6,50 +6,17 @@ Currently only supports admin.
|
|||
JWT token must have 'litellm_proxy_admin' in scope.
|
||||
"""
|
||||
|
||||
import httpx
|
||||
import jwt
|
||||
from jwt.algorithms import RSAAlgorithm
|
||||
import json
|
||||
import os
|
||||
from litellm.caching import DualCache
|
||||
from litellm.proxy._types import LiteLLMProxyRoles, LiteLLM_UserTable
|
||||
from litellm._logging import verbose_proxy_logger
|
||||
from litellm.proxy._types import LiteLLM_JWTAuth, LiteLLM_UserTable
|
||||
from litellm.proxy.utils import PrismaClient
|
||||
from litellm.llms.custom_httpx.httpx_handler import HTTPHandler
|
||||
from typing import Optional
|
||||
|
||||
|
||||
class HTTPHandler:
|
||||
def __init__(self, concurrent_limit=1000):
|
||||
# Create a client with a connection pool
|
||||
self.client = httpx.AsyncClient(
|
||||
limits=httpx.Limits(
|
||||
max_connections=concurrent_limit,
|
||||
max_keepalive_connections=concurrent_limit,
|
||||
)
|
||||
)
|
||||
|
||||
async def close(self):
|
||||
# Close the client when you're done with it
|
||||
await self.client.aclose()
|
||||
|
||||
async def get(
|
||||
self, url: str, params: Optional[dict] = None, headers: Optional[dict] = None
|
||||
):
|
||||
response = await self.client.get(url, params=params, headers=headers)
|
||||
return response
|
||||
|
||||
async def post(
|
||||
self,
|
||||
url: str,
|
||||
data: Optional[dict] = None,
|
||||
params: Optional[dict] = None,
|
||||
headers: Optional[dict] = None,
|
||||
):
|
||||
response = await self.client.post(
|
||||
url, data=data, params=params, headers=headers
|
||||
)
|
||||
return response
|
||||
|
||||
|
||||
class JWTHandler:
|
||||
"""
|
||||
- treat the sub id passed in as the user id
|
||||
|
@ -67,113 +34,139 @@ class JWTHandler:
|
|||
self.http_handler = HTTPHandler()
|
||||
|
||||
def update_environment(
|
||||
self, prisma_client: Optional[PrismaClient], user_api_key_cache: DualCache
|
||||
self,
|
||||
prisma_client: Optional[PrismaClient],
|
||||
user_api_key_cache: DualCache,
|
||||
litellm_jwtauth: LiteLLM_JWTAuth,
|
||||
) -> None:
|
||||
self.prisma_client = prisma_client
|
||||
self.user_api_key_cache = user_api_key_cache
|
||||
self.litellm_jwtauth = litellm_jwtauth
|
||||
|
||||
def is_jwt(self, token: str):
|
||||
parts = token.split(".")
|
||||
return len(parts) == 3
|
||||
|
||||
def is_admin(self, scopes: list) -> bool:
|
||||
if LiteLLMProxyRoles.PROXY_ADMIN.value in scopes:
|
||||
if self.litellm_jwtauth.admin_jwt_scope in scopes:
|
||||
return True
|
||||
return False
|
||||
|
||||
def get_user_id(self, token: dict, default_value: str) -> str:
|
||||
def is_team(self, scopes: list) -> bool:
|
||||
if self.litellm_jwtauth.team_jwt_scope in scopes:
|
||||
return True
|
||||
return False
|
||||
|
||||
def get_end_user_id(self, token: dict, default_value: Optional[str]) -> str:
|
||||
try:
|
||||
user_id = token["sub"]
|
||||
if self.litellm_jwtauth.end_user_id_jwt_field is not None:
|
||||
user_id = token[self.litellm_jwtauth.end_user_id_jwt_field]
|
||||
else:
|
||||
user_id = None
|
||||
except KeyError:
|
||||
user_id = default_value
|
||||
return user_id
|
||||
|
||||
def get_team_id(self, token: dict, default_value: Optional[str]) -> Optional[str]:
|
||||
try:
|
||||
team_id = token["azp"]
|
||||
team_id = token[self.litellm_jwtauth.team_id_jwt_field]
|
||||
except KeyError:
|
||||
team_id = default_value
|
||||
return team_id
|
||||
|
||||
async def get_user_object(self, user_id: str) -> LiteLLM_UserTable:
|
||||
"""
|
||||
- Check if user id in proxy User Table
|
||||
- if valid, return LiteLLM_UserTable object with defined limits
|
||||
- if not, then raise an error
|
||||
"""
|
||||
if self.prisma_client is None:
|
||||
raise Exception(
|
||||
"No DB Connected. See - https://docs.litellm.ai/docs/proxy/virtual_keys"
|
||||
)
|
||||
|
||||
# check if in cache
|
||||
cached_user_obj = self.user_api_key_cache.async_get_cache(key=user_id)
|
||||
if cached_user_obj is not None:
|
||||
if isinstance(cached_user_obj, dict):
|
||||
return LiteLLM_UserTable(**cached_user_obj)
|
||||
elif isinstance(cached_user_obj, LiteLLM_UserTable):
|
||||
return cached_user_obj
|
||||
# else, check db
|
||||
try:
|
||||
response = await self.prisma_client.db.litellm_usertable.find_unique(
|
||||
where={"user_id": user_id}
|
||||
)
|
||||
|
||||
if response is None:
|
||||
raise Exception
|
||||
|
||||
return LiteLLM_UserTable(**response.dict())
|
||||
except Exception as e:
|
||||
raise Exception(
|
||||
f"User doesn't exist in db. User={user_id}. Create user via `/user/new` call."
|
||||
)
|
||||
|
||||
def get_scopes(self, token: dict) -> list:
|
||||
try:
|
||||
# Assuming the scopes are stored in 'scope' claim and are space-separated
|
||||
scopes = token["scope"].split()
|
||||
if isinstance(token["scope"], str):
|
||||
# Assuming the scopes are stored in 'scope' claim and are space-separated
|
||||
scopes = token["scope"].split()
|
||||
elif isinstance(token["scope"], list):
|
||||
scopes = token["scope"]
|
||||
else:
|
||||
raise Exception(
|
||||
f"Unmapped scope type - {type(token['scope'])}. Supported types - list, str."
|
||||
)
|
||||
except KeyError:
|
||||
scopes = []
|
||||
return scopes
|
||||
|
||||
async def auth_jwt(self, token: str) -> dict:
|
||||
async def get_public_key(self, kid: Optional[str]) -> dict:
|
||||
keys_url = os.getenv("JWT_PUBLIC_KEY_URL")
|
||||
|
||||
if keys_url is None:
|
||||
raise Exception("Missing JWT Public Key URL from environment.")
|
||||
|
||||
response = await self.http_handler.get(keys_url)
|
||||
cached_keys = await self.user_api_key_cache.async_get_cache(
|
||||
"litellm_jwt_auth_keys"
|
||||
)
|
||||
if cached_keys is None:
|
||||
response = await self.http_handler.get(keys_url)
|
||||
|
||||
keys = response.json()["keys"]
|
||||
keys = response.json()["keys"]
|
||||
|
||||
await self.user_api_key_cache.async_set_cache(
|
||||
key="litellm_jwt_auth_keys",
|
||||
value=keys,
|
||||
ttl=self.litellm_jwtauth.public_key_ttl, # default 600s (10 mins), configurable via public_key_ttl
|
||||
)
|
||||
else:
|
||||
keys = cached_keys
|
||||
|
||||
public_key: Optional[dict] = None
|
||||
|
||||
if len(keys) == 1:
|
||||
if kid is None or key["kid"] == kid:
|
||||
public_key = keys[0]
|
||||
elif len(keys) > 1:
|
||||
for key in keys:
|
||||
if kid is not None and key["kid"] == kid:
|
||||
public_key = key
|
||||
|
||||
if public_key is None:
|
||||
raise Exception(
|
||||
f"No matching public key found. kid={kid}, keys_url={keys_url}, cached_keys={cached_keys}"
|
||||
)
|
||||
|
||||
return public_key
|
||||
|
||||
async def auth_jwt(self, token: str) -> dict:
|
||||
from jwt.algorithms import RSAAlgorithm
|
||||
|
||||
header = jwt.get_unverified_header(token)
|
||||
kid = header["kid"]
|
||||
|
||||
for key in keys:
|
||||
if key["kid"] == kid:
|
||||
jwk = {
|
||||
"kty": key["kty"],
|
||||
"kid": key["kid"],
|
||||
"n": key["n"],
|
||||
"e": key["e"],
|
||||
}
|
||||
public_key = RSAAlgorithm.from_jwk(json.dumps(jwk))
|
||||
verbose_proxy_logger.debug("header: %s", header)
|
||||
|
||||
try:
|
||||
# decode the token using the public key
|
||||
payload = jwt.decode(
|
||||
token,
|
||||
public_key, # type: ignore
|
||||
algorithms=["RS256"],
|
||||
audience="account",
|
||||
)
|
||||
return payload
|
||||
kid = header.get("kid", None)
|
||||
|
||||
except jwt.ExpiredSignatureError:
|
||||
# the token is expired, do something to refresh it
|
||||
raise Exception("Token Expired")
|
||||
except Exception as e:
|
||||
raise Exception(f"Validation fails: {str(e)}")
|
||||
public_key = await self.get_public_key(kid=kid)
|
||||
|
||||
if public_key is not None and isinstance(public_key, dict):
|
||||
jwk = {}
|
||||
if "kty" in public_key:
|
||||
jwk["kty"] = public_key["kty"]
|
||||
if "kid" in public_key:
|
||||
jwk["kid"] = public_key["kid"]
|
||||
if "n" in public_key:
|
||||
jwk["n"] = public_key["n"]
|
||||
if "e" in public_key:
|
||||
jwk["e"] = public_key["e"]
|
||||
|
||||
public_key_rsa = RSAAlgorithm.from_jwk(json.dumps(jwk))
|
||||
|
||||
try:
|
||||
# decode the token using the public key
|
||||
payload = jwt.decode(
|
||||
token,
|
||||
public_key_rsa, # type: ignore
|
||||
algorithms=["RS256"],
|
||||
options={"verify_aud": False},
|
||||
)
|
||||
return payload
|
||||
|
||||
except jwt.ExpiredSignatureError:
|
||||
# the token is expired - reject the request with a clear error
|
||||
raise Exception("Token Expired")
|
||||
except Exception as e:
|
||||
raise Exception(f"Validation fails: {str(e)}")
|
||||
|
||||
raise Exception("Invalid JWT Submitted")
|
||||
|
||||
|
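Taken together, a hedged sketch of how the proxy might drive `JWTHandler` for an incoming bearer token; the surrounding objects are assumed to exist, and `JWT_PUBLIC_KEY_URL` must point at the issuer's JWKS endpoint:

jwt_handler = JWTHandler()
jwt_handler.update_environment(
    prisma_client=prisma_client,            # assumed initialized elsewhere
    user_api_key_cache=user_api_key_cache,
    litellm_jwtauth=LiteLLM_JWTAuth(),
)

async def authenticate(token: str):
    if not jwt_handler.is_jwt(token):
        raise Exception("Not a JWT")
    payload = await jwt_handler.auth_jwt(token)   # JWKS keys cached for public_key_ttl seconds
    scopes = jwt_handler.get_scopes(payload)
    if jwt_handler.is_admin(scopes):
        ...  # admin_allowed_routes apply
    elif jwt_handler.is_team(scopes):
        team_id = jwt_handler.get_team_id(payload, default_value=None)
        ...  # team_allowed_routes apply; team loaded via get_team_object()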
|