Merge branch 'main' into litellm_embedding_caching_updates

This commit is contained in:
Krish Dholakia 2024-02-03 18:08:47 -08:00 committed by GitHub
commit 9ab59045a3
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
236 changed files with 24483 additions and 2014 deletions

View file

@ -40,7 +40,9 @@ jobs:
pip install "httpx==0.24.1"
pip install "gunicorn==21.2.0"
pip install "anyio==3.7.1"
pip install "aiodynamo==23.10.1"
pip install "asyncio==3.4.3"
pip install "apscheduler==3.10.4"
pip install "PyGithub==1.59.1"
- save_cache:
paths:
@ -96,6 +98,43 @@ jobs:
command: |
sudo apt-get update
sudo apt-get install -y docker-ce docker-ce-cli containerd.io
- run:
name: Install Python 3.9
command: |
curl https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh --output miniconda.sh
bash miniconda.sh -b -p $HOME/miniconda
export PATH="$HOME/miniconda/bin:$PATH"
conda init bash
source ~/.bashrc
conda create -n myenv python=3.9 -y
conda activate myenv
python --version
- run:
name: Install Dependencies
command: |
pip install "pytest==7.3.1"
pip install "pytest-asyncio==0.21.1"
pip install aiohttp
pip install openai
python -m pip install --upgrade pip
python -m pip install -r .circleci/requirements.txt
pip install "pytest==7.3.1"
pip install "pytest-asyncio==0.21.1"
pip install mypy
pip install "google-generativeai>=0.3.2"
pip install "google-cloud-aiplatform>=1.38.0"
pip install "boto3>=1.28.57"
pip install langchain
pip install "langfuse>=2.0.0"
pip install numpydoc
pip install prisma
pip install "httpx==0.24.1"
pip install "gunicorn==21.2.0"
pip install "anyio==3.7.1"
pip install "aiodynamo==23.10.1"
pip install "asyncio==3.4.3"
pip install "PyGithub==1.59.1"
# Run pytest and generate JUnit XML report
- run:
name: Build Docker image
command: docker build -t my-app:latest -f Dockerfile.database .
@ -105,15 +144,20 @@ jobs:
docker run -d \
-p 4000:4000 \
-e DATABASE_URL=$PROXY_DOCKER_DB_URL \
-e AZURE_API_KEY=$AZURE_FRANCE_API_KEY \
-e AZURE_API_KEY=$AZURE_API_KEY \
-e AZURE_FRANCE_API_KEY=$AZURE_FRANCE_API_KEY \
-e AZURE_EUROPE_API_KEY=$AZURE_EUROPE_API_KEY \
-e AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
-e AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
-e AWS_REGION_NAME=$AWS_REGION_NAME \
--name my-app \
-v $(pwd)/proxy_server_config.yaml:/app/config.yaml \
my-app:latest \
--config /app/config.yaml \
--port 4000 \
--num_workers 8
--num_workers 8 \
--detailed_debug \
--run_gunicorn \
- run:
name: Install curl and dockerize
command: |
@ -124,63 +168,22 @@ jobs:
sudo rm dockerize-linux-amd64-v0.6.1.tar.gz
- run:
name: Start outputting logs
command: |
while true; do
docker logs my-app
sleep 10
done
command: docker logs -f my-app
background: true
- run:
name: Wait for app to be ready
command: dockerize -wait http://localhost:4000 -timeout 1m
- run:
name: Test the application
name: Run tests
command: |
mkdir -p /tmp/responses
for i in {1..10}; do
status_file="/tmp/responses/status_${i}.txt"
response_file="/tmp/responses/response_${i}.json"
pwd
ls
python -m pytest -vv tests/ -x --junitxml=test-results/junit.xml --durations=5
no_output_timeout: 120m
(curl --location --request POST 'http://0.0.0.0:4000/key/generate' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{"models": ["azure-models"], "aliases": {"mistral-7b": "gpt-3.5-turbo"}, "duration": null}' \
--silent --output "${response_file}" --write-out '%{http_code}' > "${status_file}") &
# Capture PIDs of background processes
pids[${i}]=$!
done
# Wait for all background processes to finish
for pid in ${pids[*]}; do
wait $pid
done
# Check all responses and status codes
fail=false
for i in {1..10}; do
status=$(cat "/tmp/responses/status_${i}.txt")
# Here, we need to set the correct response file path for each iteration
response_file="/tmp/responses/response_${i}.json" # This was missing in the provided script
response=$(cat "${response_file}")
echo "Response ${i} (Status code: ${status}):"
echo "${response}" # Use echo here to print the contents
echo # Additional newline for readability
if [ "$status" -ne 200 ]; then
echo "A request did not return a 200 status code: $status"
fail=true
fi
done
# If any request did not return status code 200, fail the job
if [ "$fail" = true ]; then
exit 1
fi
echo "All requests returned a 200 status code."
# Store test results
- store_test_results:
path: test-results
publish_to_pypi:
docker:

View file

@ -41,6 +41,7 @@ jobs:
push: true
file: Dockerfile.database
tags: litellm/litellm-database:${{ github.event.inputs.tag || 'latest' }}
build-and-push-image:
runs-on: ubuntu-latest
# Sets the permissions granted to the `GITHUB_TOKEN` for the actions in this job.
@ -74,7 +75,9 @@ jobs:
push: true
tags: ${{ steps.meta.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta.outputs.tags }}-latest # if a tag is provided, use that, otherwise use the release tag, and if neither is available, use 'latest'
labels: ${{ steps.meta.outputs.labels }}
build-and-push-image-alpine:
platform: local, linux/amd64,linux/arm64,linux/arm64/v8
build-and-push-image-ui:
runs-on: ubuntu-latest
permissions:
contents: read
@ -90,20 +93,21 @@ jobs:
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Extract metadata (tags, labels) for Alpine Dockerfile
id: meta-alpine
- name: Extract metadata (tags, labels) for UI Dockerfile
id: meta-ui
uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7
with:
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}-alpine
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}-ui
- name: Build and push Alpine Docker image
- name: Build and push UI Docker image
uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4
with:
context: .
file: Dockerfile.alpine
context: ui/
file: ui/Dockerfile
push: true
tags: ${{ steps.meta-alpine.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta-alpine.outputs.tags }}-latest
labels: ${{ steps.meta-alpine.outputs.labels }}
tags: ${{ steps.meta-ui.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta-ui.outputs.tags }}-latest
labels: ${{ steps.meta-ui.outputs.labels }}
platform: local, linux/amd64,linux/arm64,linux/arm64/v8
build-and-push-image-database:
runs-on: ubuntu-latest
permissions:
@ -168,3 +172,14 @@ jobs:
} catch (error) {
core.setFailed(error.message);
}
- name: Github Releases To Discord
uses: SethCohen/github-releases-to-discord@v1.13.1
with:
webhook_url: ${{ secrets.WEBHOOK_URL }}
color: "2105893"
username: "Release Changelog"
avatar_url: "https://cdn.discordapp.com/avatars/487431320314576937/bd64361e4ba6313d561d54e78c9e7171.png"
content: "||@everyone||"
footer_title: "Changelog"
footer_icon_url: "https://cdn.discordapp.com/avatars/487431320314576937/bd64361e4ba6313d561d54e78c9e7171.png"
footer_timestamp: true

8
.gitignore vendored
View file

@ -32,3 +32,11 @@ proxy_server_config_@.yaml
proxy_server_config_2.yaml
litellm/proxy/secret_managers/credentials.json
hosted_config.yaml
litellm/proxy/tests/node_modules
litellm/proxy/tests/package.json
litellm/proxy/tests/package-lock.json
ui/litellm-dashboard/.next
ui/litellm-dashboard/node_modules
ui/litellm-dashboard/next-env.d.ts
ui/litellm-dashboard/package.json
ui/litellm-dashboard/package-lock.json

View file

@ -52,4 +52,4 @@ RUN chmod +x entrypoint.sh
EXPOSE 4000/tcp
ENTRYPOINT ["litellm"]
CMD ["--port", "4000"]
CMD ["--port", "4000", "--config", "./proxy_server_config.yaml", "--detailed_debug", "--run_gunicorn"]

View file

@ -56,4 +56,4 @@ EXPOSE 4000/tcp
# # Set your entrypoint and command
ENTRYPOINT ["litellm"]
CMD ["--port", "4000"]
CMD ["--port", "4000", "--run_gunicorn"]

View file

@ -0,0 +1,2 @@
python3 -m build
twine upload --verbose dist/litellm-1.18.13.dev4.tar.gz -u __token__ -

View file

@ -0,0 +1,34 @@
import os
from openai import OpenAI
from dotenv import load_dotenv
import httpx
import concurrent.futures
load_dotenv()
client = OpenAI(
# This is the default and can be omitted
api_key=os.environ.get("OPENAI_API_KEY"),
)
def create_chat_completion():
return client.chat.completions.create(
messages=[
{
"role": "user",
"content": "Say this is a test. Respond in 20 lines",
}
],
model="gpt-3.5-turbo",
)
with concurrent.futures.ThreadPoolExecutor() as executor:
# Submit the request, then wait on it with a deliberately tiny timeout to exercise the timeout path
future = executor.submit(create_chat_completion)
try:
chat_completion = future.result(timeout=0.00001)
print(chat_completion)
except concurrent.futures.TimeoutError:
print("Operation timed out.")

View file

@ -0,0 +1,61 @@
# Notes - on how to do sagemaker streaming using boto3
import json
import boto3
import sys, os
import traceback
from dotenv import load_dotenv
load_dotenv()
import os, io
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
import pytest
import litellm
import io
import json
class TokenIterator:
def __init__(self, stream):
self.byte_iterator = iter(stream)
self.buffer = io.BytesIO()
self.read_pos = 0
def __iter__(self):
return self
def __next__(self):
while True:
self.buffer.seek(self.read_pos)
line = self.buffer.readline()
if line and line[-1] == ord("\n"):
self.read_pos += len(line) + 1
full_line = line[:-1].decode("utf-8")
line_data = json.loads(full_line.lstrip("data:").rstrip("\n"))
return line_data["token"]["text"]
chunk = next(self.byte_iterator)
self.buffer.seek(0, io.SEEK_END)
self.buffer.write(chunk["PayloadPart"]["Bytes"])
payload = {
"inputs": "How do I build a website?",
"parameters": {"max_new_tokens": 256},
"stream": True,
}
import boto3
client = boto3.client("sagemaker-runtime", region_name="us-west-2")
response = client.invoke_endpoint_with_response_stream(
EndpointName="berri-benchmarking-Llama-2-70b-chat-hf-4",
Body=json.dumps(payload),
ContentType="application/json",
)
# for token in TokenIterator(response["Body"]):
# print(token)

Binary file not shown.

BIN
dist/litellm-1.16.21.dev1.tar.gz vendored Normal file

Binary file not shown.

Binary file not shown.

BIN
dist/litellm-1.16.21.dev2.tar.gz vendored Normal file

Binary file not shown.

Binary file not shown.

BIN
dist/litellm-1.16.21.dev3.tar.gz vendored Normal file

Binary file not shown.

View file

@ -1,12 +0,0 @@
version: "3.9"
services:
litellm:
image: ghcr.io/berriai/litellm:main
ports:
- "8000:8000" # Map the container port to the host, change the host port if necessary
volumes:
- ./litellm-config.yaml:/app/config.yaml # Mount the local configuration file
# You can change the port or number of workers as per your requirements or pass any new supported CLI argument. Make sure the port passed here matches the container port defined above under `ports`
command: [ "--config", "/app/config.yaml", "--port", "8000", "--num_workers", "8" ]
# ...rest of your docker-compose config if any

15
docker-compose.yml Normal file
View file

@ -0,0 +1,15 @@
version: "3.9"
services:
litellm:
image: ghcr.io/berriai/litellm:main-latest
volumes:
- ./proxy_server_config.yaml:/app/proxy_server_config.yaml # mount your litellm config.yaml
ports:
- "4000:4000"
environment:
- AZURE_API_KEY=sk-123
litellm-ui:
image: ghcr.io/berriai/litellm-ui:main-latest

View file

@ -204,6 +204,7 @@ def __init__(
s3_bucket_name: Optional[str] = None,
s3_region_name: Optional[str] = None,
s3_api_version: Optional[str] = None,
s3_path: Optional[str] = None, # if you wish to save to a specific path
s3_use_ssl: Optional[bool] = True,
s3_verify: Optional[Union[bool, str]] = None,
s3_endpoint_url: Optional[str] = None,
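If these are the S3 parameters on `litellm.Cache` (an assumption based on the surrounding parameter names; check the caching docs for the exact class), the new `s3_path` option could be used like this, with placeholder bucket and prefix values:
```python
import litellm
from litellm.caching import Cache

# Assumption: these kwargs map to the __init__ params shown in the diff above.
litellm.cache = Cache(
    type="s3",
    s3_bucket_name="my-litellm-cache-bucket",  # placeholder bucket name
    s3_region_name="us-west-2",
    s3_path="litellm-cache",  # save cache entries under this key prefix
)
```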

View file

@ -150,5 +150,12 @@ litellm.register_model(model_cost=
"https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json")
```
**Don't pull hosted model_cost_map**
If you're behind a firewall and want to use only the local copy of the model cost map, you can do so like this:
```bash
export LITELLM_LOCAL_MODEL_COST_MAP="True"
```
Note: this means you will need to upgrade litellm to pick up updated pricing and newer models.
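For example (a minimal sketch; it assumes the environment variable is read when `litellm` is imported and that `litellm.model_cost` exposes the loaded map):
```python
import os

# Must be set before importing litellm so the bundled cost map is used
os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"

import litellm

# litellm.model_cost maps model names to pricing / context-window info
print(litellm.model_cost["gpt-3.5-turbo"])
```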

View file

@ -13,8 +13,8 @@ response = embedding(model='text-embedding-ada-002', input=["good morning from l
- `model`: *string* - ID of the model to use. `model='text-embedding-ada-002'`
- `input`: *array* - Input text to embed, encoded as a string or array of tokens. To embed multiple inputs in a single request, pass an array of strings or array of token arrays. The input must not exceed the max input tokens for the model (8192 tokens for text-embedding-ada-002), cannot be an empty string, and any array must be 2048 dimensions or less.
```
- `input`: *string or array* - Input text to embed, encoded as a string or array of tokens. To embed multiple inputs in a single request, pass an array of strings or array of token arrays. The input must not exceed the max input tokens for the model (8192 tokens for text-embedding-ada-002), cannot be an empty string, and any array must be 2048 dimensions or less.
```python
input=["good morning from litellm"]
```
@ -22,7 +22,11 @@ input=["good morning from litellm"]
- `user`: *string (optional)* A unique identifier representing your end-user,
- `timeout`: *integer* - The maximum time, in seconds, to wait for the API to respond. Defaults to 600 seconds (10 minutes).
- `dimensions`: *integer (Optional)* The number of dimensions the resulting output embeddings should have. Only supported in OpenAI/Azure text-embedding-3 and later models.
- `encoding_format`: *string (Optional)* The format to return the embeddings in. Can be either `"float"` or `"base64"`. Defaults to `encoding_format="float"`
- `timeout`: *integer (Optional)* - The maximum time, in seconds, to wait for the API to respond. Defaults to 600 seconds (10 minutes).
- `api_base`: *string (optional)* - The api endpoint you want to call the model with
@ -66,11 +70,18 @@ input=["good morning from litellm"]
from litellm import embedding
import os
os.environ['OPENAI_API_KEY'] = ""
response = embedding('text-embedding-ada-002', input=["good morning from litellm"])
response = embedding(
model="text-embedding-3-small",
input=["good morning from litellm", "this is another item"],
metadata={"anything": "good day"},
dimensions=5 # Only supported in text-embedding-3 and later models.
)
```
| Model Name | Function Call | Required OS Variables |
|----------------------|---------------------------------------------|--------------------------------------|
| text-embedding-3-small | `embedding('text-embedding-3-small', input)` | `os.environ['OPENAI_API_KEY']` |
| text-embedding-3-large | `embedding('text-embedding-3-large', input)` | `os.environ['OPENAI_API_KEY']` |
| text-embedding-ada-002 | `embedding('text-embedding-ada-002', input)` | `os.environ['OPENAI_API_KEY']` |
## Azure OpenAI Embedding Models

View file

@ -57,7 +57,7 @@ print(f"response: {response}")
- `api_type`: *string (optional)* - The type of API to use.
### Output from `litellm.embedding()`
### Output from `litellm.image_generation()`
```json
@ -85,7 +85,7 @@ response = image_generation(model='dall-e-2', prompt="cute baby otter")
| Model Name | Function Call | Required OS Variables |
|----------------------|---------------------------------------------|--------------------------------------|
| dall-e-2 | `image_generation(model='dall-e-2', prompt="cute baby otter")` | `os.environ['OPENAI_API_KEY']` |
| dall-e-3 | `image_generation(model='dall-e-2', prompt="cute baby otter")` | `os.environ['OPENAI_API_KEY']` |
| dall-e-3 | `image_generation(model='dall-e-3', prompt="cute baby otter")` | `os.environ['OPENAI_API_KEY']` |
## Azure OpenAI Image Generation Models
@ -131,3 +131,23 @@ response = image_generation(
prompt="cute baby otter"
)
```
## Bedrock - Stable Diffusion
Use this for stable diffusion on bedrock
### Usage
```python
import os
from litellm import image_generation
os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""
response = image_generation(
prompt="A cute baby sea otter",
model="bedrock/stability.stable-diffusion-xl-v0",
)
print(f"response: {response}")
```

View file

@ -28,6 +28,8 @@ import litellm
import os
os.environ["LANGSMITH_API_KEY"] = ""
os.environ["LANGSMITH_PROJECT"] = "" # defaults to litellm-completion
os.environ["LANGSMITH_DEFAULT_RUN_NAME"] = "" # defaults to LLMRun
# LLM API Keys
os.environ['OPENAI_API_KEY']=""
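A full call with Langsmith logging enabled might look like this (a sketch assuming the standard `litellm.success_callback = ["langsmith"]` pattern used by LiteLLM's logging integrations; the project and run names are placeholders):
```python
import os
import litellm
from litellm import completion

os.environ["LANGSMITH_API_KEY"] = ""
os.environ["LANGSMITH_PROJECT"] = "litellm-demo"       # placeholder project name
os.environ["LANGSMITH_DEFAULT_RUN_NAME"] = "demo-run"  # placeholder run name
os.environ["OPENAI_API_KEY"] = ""

# log successful calls to Langsmith
litellm.success_callback = ["langsmith"]

response = completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hi, this is a test request"}],
)
print(response)
```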

View file

@ -116,6 +116,57 @@ response = completion(
```
### Usage - with Azure Vision enhancements
Note: **Azure requires the `base_url` to be set with `/extensions`**
Example
```python
base_url="https://gpt-4-vision-resource.openai.azure.com/openai/deployments/gpt-4-vision/extensions"
# base_url="{azure_endpoint}/openai/deployments/{azure_deployment}/extensions"
```
**Usage**
```python
import os
from litellm import completion
os.environ["AZURE_API_KEY"] = "your-api-key"
# azure call
response = completion(
model="azure/gpt-4-vision",
timeout=5,
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "Whats in this image?"},
{
"type": "image_url",
"image_url": {
"url": "https://avatars.githubusercontent.com/u/29436595?v=4"
},
},
],
}
],
base_url="https://gpt-4-vision-resource.openai.azure.com/openai/deployments/gpt-4-vision/extensions",
api_key=os.getenv("AZURE_VISION_API_KEY"),
enhancements={"ocr": {"enabled": True}, "grounding": {"enabled": True}},
dataSources=[
{
"type": "AzureComputerVision",
"parameters": {
"endpoint": "https://gpt-4-vision-enhancement.cognitiveservices.azure.com/",
"key": os.environ["AZURE_VISION_ENHANCE_KEY"],
},
}
],
)
```
## Advanced
### Azure API Load-Balancing

View file

@ -195,6 +195,81 @@ response = completion(
)
```
### SSO Login (AWS Profile)
- Set `AWS_PROFILE` environment variable
- Make bedrock completion call
```python
import os
from litellm import completion
response = completion(
model="bedrock/anthropic.claude-instant-v1",
messages=[{ "content": "Hello, how are you?","role": "user"}]
)
```
### STS based Auth
- Set `aws_role_name` and `aws_session_name` in completion() / embedding() function
Make the bedrock completion call
```python
from litellm import completion
response = completion(
model="bedrock/anthropic.claude-instant-v1",
messages=messages,
max_tokens=10,
temperature=0.1,
aws_role_name=aws_role_name,
aws_session_name="my-test-session",
)
```
If you also need to dynamically set the aws user accessing the role, add the additional args in the completion()/embedding() function
```python
from litellm import completion
response = completion(
model="bedrock/anthropic.claude-instant-v1",
messages=messages,
max_tokens=10,
temperature=0.1,
aws_region_name=aws_region_name,
aws_access_key_id=aws_access_key_id,
aws_secret_access_key=aws_secret_access_key,
aws_role_name=aws_role_name,
aws_session_name="my-test-session",
)
```
## Provisioned throughput models
To use provisioned throughput Bedrock models pass
- `model=bedrock/<base-model>`, example `model=bedrock/anthropic.claude-v2`. Set `model` to any of the [Supported AWS models](#supported-aws-bedrock-models)
- `model_id=provisioned-model-arn`
Completion
```python
import litellm
response = litellm.completion(
model="bedrock/anthropic.claude-instant-v1",
model_id="provisioned-model-arn",
messages=[{"content": "Hello, how are you?", "role": "user"}]
)
```
Embedding
```python
import litellm
response = litellm.embedding(
model="bedrock/amazon.titan-embed-text-v1",
model_id="provisioned-model-arn",
input=["hi"],
)
```
## Supported AWS Bedrock Models
Here's an example of using a bedrock model with LiteLLM
@ -240,3 +315,50 @@ print(response)
| Titan Embeddings - G1 | `embedding(model="bedrock/amazon.titan-embed-text-v1", input=input)` |
| Cohere Embeddings - English | `embedding(model="bedrock/cohere.embed-english-v3", input=input)` |
| Cohere Embeddings - Multilingual | `embedding(model="bedrock/cohere.embed-multilingual-v3", input=input)` |
## Image Generation
Use this for stable diffusion on bedrock
### Usage
```python
import os
from litellm import image_generation
os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""
response = image_generation(
prompt="A cute baby sea otter",
model="bedrock/stability.stable-diffusion-xl-v0",
)
print(f"response: {response}")
```
**Set optional params**
```python
import os
from litellm import image_generation
os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""
response = image_generation(
prompt="A cute baby sea otter",
model="bedrock/stability.stable-diffusion-xl-v0",
### OPENAI-COMPATIBLE ###
size="128x512", # width=128, height=512
### PROVIDER-SPECIFIC ### see `AmazonStabilityConfig` in bedrock.py for all params
seed=30
)
print(f"response: {response}")
```
## Supported AWS Bedrock Image Generation Models
| Model Name | Function Call |
|----------------------|---------------------------------------------|
| Stable Diffusion - v0 | `image_generation(model="bedrock/stability.stable-diffusion-xl-v0", prompt=prompt)` |
| Stable Diffusion - v1 | `image_generation(model="bedrock/stability.stable-diffusion-xl-v1", prompt=prompt)` |

View file

@ -6,7 +6,7 @@
# Gemini-Pro
## Sample Usage
```python
import litellm
from litellm import completion
import os
os.environ['GEMINI_API_KEY'] = ""

View file

@ -34,6 +34,7 @@ os.environ["OPENAI_API_BASE"] = "openaiai-api-base" # OPTIONAL
| Model Name | Function Call |
|-----------------------|-----------------------------------------------------------------|
| gpt-4-0125-preview | `response = completion(model="gpt-4-0125-preview", messages=messages)` |
| gpt-4-1106-preview | `response = completion(model="gpt-4-1106-preview", messages=messages)` |
| gpt-3.5-turbo-1106 | `response = completion(model="gpt-3.5-turbo-1106", messages=messages)` |
| gpt-3.5-turbo | `response = completion(model="gpt-3.5-turbo", messages=messages)` |
@ -173,6 +174,31 @@ response = completion(
messages=[{ "content": "Hello, how are you?","role": "user"}]
)
```
### Set `ssl_verify=False`
This is done by setting your own `httpx.Client`
- For `litellm.completion` set `litellm.client_session=httpx.Client(verify=False)`
- For `litellm.acompletion` set `litellm.aclient_session=httpx.AsyncClient(verify=False)`
```python
import litellm, httpx
# for completion
litellm.client_session = httpx.Client(verify=False)
response = litellm.completion(
model="gpt-3.5-turbo",
messages=messages,
)
# for acompletion
litellm.aclient_session = httpx.AsyncClient(verify=False)
response = litellm.acompletion(
model="gpt-3.5-turbo",
messages=messages,
)
```
### Using Helicone Proxy with LiteLLM
```python
import os

View file

@ -5,7 +5,7 @@
## Sample Usage
```python
import litellm
from litellm import completion
import os
os.environ['PALM_API_KEY'] = ""
@ -17,7 +17,7 @@ response = completion(
## Sample Usage - Streaming
```python
import litellm
from litellm import completion
import os
os.environ['PALM_API_KEY'] = ""

View file

@ -1,4 +1,4 @@
# VertexAI - Google [Gemini]
# VertexAI - Google [Gemini, Model Garden]
<a target="_blank" href="https://colab.research.google.com/github/BerriAI/litellm/blob/main/cookbook/liteLLM_VertextAI_Example.ipynb">
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
@ -17,7 +17,28 @@ import litellm
litellm.vertex_project = "hardy-device-38811" # Your Project ID
litellm.vertex_location = "us-central1" # proj location
response = completion(model="gemini-pro", messages=[{"role": "user", "content": "write code for saying hi from LiteLLM"}])
response = litellm.completion(model="gemini-pro", messages=[{"role": "user", "content": "write code for saying hi from LiteLLM"}])
```
## OpenAI Proxy Usage
1. Modify the config.yaml
```yaml
litellm_settings:
vertex_project: "hardy-device-38811" # Your Project ID
vertex_location: "us-central1" # proj location
model_list:
- model_name: team1-gemini-pro
litellm_params:
model: gemini-pro
```
2. Start the proxy
```bash
$ litellm --config /path/to/config.yaml
```
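3. Send a test request to the proxy (a sketch reusing the OpenAI-client pattern shown elsewhere in these docs; the port and key are placeholders):
```python
import openai

client = openai.OpenAI(
    api_key="anything",             # or your proxy key, if a master_key is set
    base_url="http://0.0.0.0:8000"  # placeholder proxy address
)

# "team1-gemini-pro" is the model_name registered in the config above
response = client.chat.completions.create(
    model="team1-gemini-pro",
    messages=[{"role": "user", "content": "write code for saying hi from LiteLLM"}],
)
print(response)
```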
## Set Vertex Project & Vertex Location
@ -46,16 +67,39 @@ os.environ["VERTEXAI_LOCATION"] = "us-central1 # Your Location
# set directly on module
litellm.vertex_location = "us-central1" # Your Location
```
## Model Garden
| Model Name | Function Call |
|------------------|--------------------------------------|
| llama2 | `completion('vertex_ai/<endpoint_id>', messages)` |
#### Using Model Garden
```python
from litellm import completion
import os
## set ENV variables
os.environ["VERTEXAI_PROJECT"] = "hardy-device-38811"
os.environ["VERTEXAI_LOCATION"] = "us-central1"
response = completion(
model="vertex_ai/<your-endpoint-id>",
messages=[{ "content": "Hello, how are you?","role": "user"}]
)
```
## Gemini Pro
| Model Name | Function Call |
|------------------|--------------------------------------|
| gemini-pro | `completion('gemini-pro', messages)` |
| gemini-pro | `completion('gemini-pro', messages)`, `completion('vertex_ai/gemini-pro', messages)` |
## Gemini Pro Vision
| Model Name | Function Call |
|------------------|--------------------------------------|
| gemini-pro-vision | `completion('gemini-pro-vision', messages)` |
| gemini-pro-vision | `completion('gemini-pro-vision', messages)`, `completion('vertex_ai/gemini-pro-vision', messages)`|
#### Using Gemini Pro Vision
@ -93,6 +137,7 @@ response = litellm.completion(
print(response)
```
## Chat Models
| Model Name | Function Call |
|------------------|--------------------------------------|

View file

@ -11,7 +11,7 @@ pip install litellm vllm
```python
import litellm
response = completion(
response = litellm.completion(
model="vllm/facebook/opt-125m", # add a vllm prefix so litellm knows the custom_llm_provider==vllm
messages=messages,
temperature=0.2,
@ -29,7 +29,7 @@ In order to use litellm to call a hosted vllm server add the following to your c
```python
import litellm
response = completion(
response = litellm.completion(
model="openai/facebook/opt-125m", # pass the vllm model name
messages=messages,
api_base="https://hosted-vllm-api.co",

View file

@ -1,6 +1,13 @@
# Slack Alerting
Get alerts for failed db read/writes, hanging api calls, failed api calls.
Get alerts for:
- hanging LLM api calls
- failed LLM api calls
- slow LLM api calls
- budget Tracking per key/user:
- When a User/Key crosses their Budget
- When a User/Key is 15% away from crossing their Budget
- failed db read/writes
## Quick Start

View file

@ -1,4 +1,4 @@
# Modify Incoming Data
# Modify / Reject Incoming Requests
Modify data just before the proxy makes the litellm completion call.
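A minimal sketch of such a hook (this assumes LiteLLM's `CustomLogger.async_pre_call_hook` interface, which this page documents; the file and handler names are illustrative):
```python
# custom_callbacks.py (illustrative filename)
from litellm.integrations.custom_logger import CustomLogger

class MyCustomHandler(CustomLogger):
    async def async_pre_call_hook(self, user_api_key_dict, cache, data, call_type):
        # runs just before the proxy forwards the request to litellm
        data["messages"].append(
            {"role": "user", "content": "Only answer questions about LiteLLM."}
        )
        return data  # return the modified request; raise an exception to reject it

proxy_handler_instance = MyCustomHandler()
```
The instance is then registered on the proxy, typically via `callbacks` under `litellm_settings` in the config.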

View file

@ -1,31 +1,46 @@
# CLI Arguments
CLI arguments: `--host`, `--port`, `--num_workers`
#### --host
## --host
- **Default:** `'0.0.0.0'`
- The host for the server to listen on.
- **Usage:**
```shell
litellm --host 127.0.0.1
```
- **Usage - set Environment Variable:** `HOST`
```shell
export HOST=127.0.0.1
litellm
```
#### --port
## --port
- **Default:** `8000`
- The port to bind the server to.
- **Usage:**
```shell
litellm --port 8080
```
- **Usage - set Environment Variable:** `PORT`
```shell
export PORT=8080
litellm
```
#### --num_workers
## --num_workers
- **Default:** `1`
- The number of uvicorn workers to spin up.
- **Usage:**
```shell
litellm --num_workers 4
```
- **Usage - set Environment Variable:** `NUM_WORKERS`
```shell
export NUM_WORKERS=4
litellm
```
#### --api_base
## --api_base
- **Default:** `None`
- The API base for the model litellm should call.
- **Usage:**
@ -33,7 +48,7 @@ Cli arguments, --host, --port, --num_workers
litellm --model huggingface/tinyllama --api_base https://k58ory32yinf1ly0.us-east-1.aws.endpoints.huggingface.cloud
```
#### --api_version
## --api_version
- **Default:** `None`
- For Azure services, specify the API version.
- **Usage:**
@ -41,7 +56,7 @@ Cli arguments, --host, --port, --num_workers
litellm --model azure/gpt-deployment --api_version 2023-08-01 --api_base https://<your api base>"
```
#### --model or -m
## --model or -m
- **Default:** `None`
- The model name to pass to Litellm.
- **Usage:**
@ -49,7 +64,7 @@ Cli arguments, --host, --port, --num_workers
litellm --model gpt-3.5-turbo
```
#### --test
## --test
- **Type:** `bool` (Flag)
- Proxy chat completions URL to make a test request.
- **Usage:**
@ -57,7 +72,7 @@ Cli arguments, --host, --port, --num_workers
litellm --test
```
#### --health
## --health
- **Type:** `bool` (Flag)
- Runs a health check on all models in config.yaml
- **Usage:**
@ -65,7 +80,7 @@ Cli arguments, --host, --port, --num_workers
litellm --health
```
#### --alias
## --alias
- **Default:** `None`
- An alias for the model, for user-friendly reference.
- **Usage:**
@ -73,7 +88,7 @@ Cli arguments, --host, --port, --num_workers
litellm --alias my-gpt-model
```
#### --debug
## --debug
- **Default:** `False`
- **Type:** `bool` (Flag)
- Enable debugging mode for the input.
@ -81,15 +96,25 @@ Cli arguments, --host, --port, --num_workers
```shell
litellm --debug
```
- **Usage - set Environment Variable:** `DEBUG`
```shell
export DEBUG=True
litellm
```
#### --detailed_debug
## --detailed_debug
- **Default:** `False`
- **Type:** `bool` (Flag)
- Enable debugging mode for the input.
- **Usage:**
```shell
litellm --detailed_debug
``
```
- **Usage - set Environment Variable:** `DETAILED_DEBUG`
```shell
export DETAILED_DEBUG=True
litellm
```
#### --temperature
- **Default:** `None`
@ -100,7 +125,7 @@ Cli arguments, --host, --port, --num_workers
litellm --temperature 0.7
```
#### --max_tokens
## --max_tokens
- **Default:** `None`
- **Type:** `int`
- Set the maximum number of tokens for the model output.
@ -109,7 +134,7 @@ Cli arguments, --host, --port, --num_workers
litellm --max_tokens 50
```
#### --request_timeout
## --request_timeout
- **Default:** `600`
- **Type:** `int`
- Set the timeout in seconds for completion calls.
@ -118,7 +143,7 @@ Cli arguments, --host, --port, --num_workers
litellm --request_timeout 300
```
#### --drop_params
## --drop_params
- **Type:** `bool` (Flag)
- Drop any unmapped params.
- **Usage:**
@ -126,7 +151,7 @@ Cli arguments, --host, --port, --num_workers
litellm --drop_params
```
#### --add_function_to_prompt
## --add_function_to_prompt
- **Type:** `bool` (Flag)
- If a function passed but unsupported, pass it as a part of the prompt.
- **Usage:**
@ -134,14 +159,14 @@ Cli arguments, --host, --port, --num_workers
litellm --add_function_to_prompt
```
#### --config
## --config
- Configure Litellm by providing a configuration file path.
- **Usage:**
```shell
litellm --config path/to/config.yaml
```
#### --telemetry
## --telemetry
- **Default:** `True`
- **Type:** `bool`
- Help track usage of this feature.

View file

@ -22,18 +22,22 @@ Set a model alias for your deployments.
In the `config.yaml` the model_name parameter is the user-facing name to use for your deployment.
In the config below requests with:
In the config below:
- `model_name`: the name to pass TO litellm from the external client
- `litellm_params.model`: the model string passed to the litellm.completion() function
E.g.:
- `model=vllm-models` will route to `openai/facebook/opt-125m`.
- `model=gpt-3.5-turbo` will load balance between `azure/gpt-turbo-small-eu` and `azure/gpt-turbo-small-ca`
```yaml
model_list:
- model_name: gpt-3.5-turbo # user-facing model alias
- model_name: gpt-3.5-turbo ### RECEIVED MODEL NAME ###
litellm_params: # all params accepted by litellm.completion() - https://docs.litellm.ai/docs/completion/input
model: azure/gpt-turbo-small-eu
model: azure/gpt-turbo-small-eu ### MODEL NAME sent to `litellm.completion()` ###
api_base: https://my-endpoint-europe-berri-992.openai.azure.com/
api_key: "os.environ/AZURE_API_KEY_EU" # does os.getenv("AZURE_API_KEY_EU")
rpm: 6 # Rate limit for this deployment: in requests per minute (rpm)
rpm: 6 # [OPTIONAL] Rate limit for this deployment: in requests per minute (rpm)
- model_name: bedrock-claude-v1
litellm_params:
model: bedrock/anthropic.claude-instant-v1
@ -43,6 +47,11 @@ model_list:
api_base: https://my-endpoint-canada-berri992.openai.azure.com/
api_key: "os.environ/AZURE_API_KEY_CA"
rpm: 6
- model_name: anthropic-claude
litellm_params:
model="bedrock/anthropic.claude-instant-v1"
### [OPTIONAL] SET AWS REGION ###
aws_region_name="us-east-1"
- model_name: vllm-models
litellm_params:
model: openai/facebook/opt-125m # the `openai/` prefix tells litellm it's openai compatible
@ -58,6 +67,11 @@ litellm_settings: # module level litellm settings - https://github.com/BerriAI/l
general_settings:
master_key: sk-1234 # [OPTIONAL] Only use this if you want to require all calls to contain this key (Authorization: Bearer sk-1234)
```
:::info
For more provider-specific info, [go here](../providers/)
:::
#### Step 2: Start Proxy with config
@ -188,7 +202,7 @@ print(response)
</Tabs>
## Save Model-specific params (API Base, API Keys, Temperature, Max Tokens, Seed, Headers etc.)
## Save Model-specific params (API Base, API Keys, Temperature, Max Tokens, Seed, Organization, Headers etc.)
You can use the config to save model-specific information like api_base, api_key, temperature, max_tokens, etc.
[**All input params**](https://docs.litellm.ai/docs/completion/input#input-params-1)
@ -210,6 +224,12 @@ model_list:
api_key: sk-123
api_base: https://openai-gpt-4-test-v-2.openai.azure.com/
temperature: 0.2
- model_name: openai-gpt-3.5
litellm_params:
model: openai/gpt-3.5-turbo
api_key: sk-123
organization: org-ikDc4ex8NB
temperature: 0.2
- model_name: mistral-7b
litellm_params:
model: ollama/mistral
@ -318,6 +338,26 @@ See supported Embedding Providers & Models [here](https://docs.litellm.ai/docs/e
#### Create Config.yaml
<Tabs>
<TabItem value="bedrock" label="Bedrock Completion/Chat">
```yaml
model_list:
- model_name: bedrock-cohere
litellm_params:
model: "bedrock/cohere.command-text-v14"
aws_region_name: "us-west-2"
- model_name: bedrock-cohere
litellm_params:
model: "bedrock/cohere.command-text-v14"
aws_region_name: "us-east-2"
- model_name: bedrock-cohere
litellm_params:
model: "bedrock/cohere.command-text-v14"
aws_region_name: "us-east-1"
```
</TabItem>
<TabItem value="sagemaker" label="Sagemaker, Bedrock Embeddings">
@ -430,20 +470,26 @@ model_list:
</Tabs>
#### Start Proxy
```shell
litellm --config config.yaml
```
#### Make Request
Sends Request to `deployed-codebert-base`
Sends Request to `bedrock-cohere`
```shell
curl --location 'http://0.0.0.0:8000/embeddings' \
curl --location 'http://0.0.0.0:8000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "deployed-codebert-base",
"input": ["write a litellm poem"]
}'
"model": "bedrock-cohere",
"messages": [
{
"role": "user",
"content": "gm"
}
]
}'
```
@ -483,3 +529,55 @@ general_settings:
max_parallel_requests: 100 # max parallel requests for a user = 100
```
## All settings
```python
{
"environment_variables": {},
"model_list": [
{
"model_name": "string",
"litellm_params": {},
"model_info": {
"id": "string",
"mode": "embedding",
"input_cost_per_token": 0,
"output_cost_per_token": 0,
"max_tokens": 2048,
"base_model": "gpt-4-1106-preview",
"additionalProp1": {}
}
}
],
"litellm_settings": {}, # ALL (https://github.com/BerriAI/litellm/blob/main/litellm/__init__.py)
"general_settings": {
"completion_model": "string",
"key_management_system": "google_kms", # either google_kms or azure_kms
"master_key": "string",
"database_url": "string",
"database_type": "dynamo_db",
"database_args": {
"billing_mode": "PROVISIONED_THROUGHPUT",
"read_capacity_units": 0,
"write_capacity_units": 0,
"ssl_verify": true,
"region_name": "string",
"user_table_name": "LiteLLM_UserTable",
"key_table_name": "LiteLLM_VerificationToken",
"config_table_name": "LiteLLM_Config",
"spend_table_name": "LiteLLM_SpendLogs"
},
"otel": true,
"custom_auth": "string",
"max_parallel_requests": 0,
"infer_model_from_keys": true,
"background_health_checks": true,
"health_check_interval": 300,
"alerting": [
"string"
],
"alerting_threshold": 0
}
}
```

View file

@ -0,0 +1,115 @@
import Image from '@theme/IdealImage';
# Custom Pricing - Sagemaker, etc.
Use this to register custom pricing for models.
There are two ways to track cost:
- cost per token
- cost per second
By default, the response cost is accessible in the logging object via `kwargs["response_cost"]` on success (sync + async). [**Learn More**](../observability/custom_callback.md)
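For example, a success callback can read it like this (a sketch assuming LiteLLM's standard custom-callback signature `callback(kwargs, completion_response, start_time, end_time)`):
```python
import litellm
from litellm import completion

def track_cost_callback(kwargs, completion_response, start_time, end_time):
    # response_cost is populated by litellm on success (sync + async)
    response_cost = kwargs.get("response_cost", 0)
    print(f"response_cost: {response_cost}")

litellm.success_callback = [track_cost_callback]

response = completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
```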
## Quick Start
Register custom pricing for sagemaker completion model.
For cost per second pricing, you **just** need to register `input_cost_per_second`.
```python
# !pip install boto3
import os
from litellm import completion, completion_cost
os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""
def test_completion_sagemaker():
try:
print("testing sagemaker")
response = completion(
model="sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
input_cost_per_second=0.000420,
)
# Add any assertions here to check the response
print(response)
cost = completion_cost(completion_response=response)
print(cost)
except Exception as e:
raise Exception(f"Error occurred: {e}")
```
### Usage with OpenAI Proxy Server
**Step 1: Add pricing to config.yaml**
```yaml
model_list:
- model_name: sagemaker-completion-model
litellm_params:
model: sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4
input_cost_per_second: 0.000420
- model_name: sagemaker-embedding-model
litellm_params:
model: sagemaker/berri-benchmarking-gpt-j-6b-fp16
input_cost_per_second: 0.000420
```
**Step 2: Start proxy**
```bash
litellm --config /path/to/config.yaml
```
**Step 3: View Spend Logs**
<Image img={require('../../img/spend_logs_table.png')} />
## Cost Per Token (e.g. Azure)
```python
import os
from litellm import completion, completion_cost
## set ENV variables
os.environ["AZURE_API_KEY"] = ""
os.environ["AZURE_API_BASE"] = ""
os.environ["AZURE_API_VERSION"] = ""
def test_completion_azure_model():
try:
print("testing azure custom pricing")
# azure call
response = completion(
model = "azure/<your_deployment_name>",
messages = [{ "content": "Hello, how are you?","role": "user"}]
input_cost_per_token=0.005,
output_cost_per_token=1,
)
# Add any assertions here to check the response
print(response)
cost = completion_cost(completion_response=response)
print(cost)
except Exception as e:
raise Exception(f"Error occurred: {e}")
test_completion_azure_model()
```
### Usage with OpenAI Proxy Server
```yaml
model_list:
- model_name: azure-model
litellm_params:
model: azure/<your_deployment_name>
api_key: os.environ/AZURE_API_KEY
api_base: os.environ/AZURE_API_BASE
api_version: os.environ/AZURE_API_VERSION
input_cost_per_token: 0.000421 # 👈 ONLY to track cost per token
output_cost_per_token: 0.000520 # 👈 ONLY to track cost per token
```

View file

@ -0,0 +1,34 @@
# Debugging
2 levels of debugging supported.
- debug (prints info logs)
- detailed debug (prints debug logs)
## `debug`
**via cli**
```bash
$ litellm --debug
```
**via env**
```python
os.environ["LITELLM_LOG"] = "INFO"
```
## `detailed debug`
**via cli**
```bash
$ litellm --detailed_debug
```
**via env**
```python
os.environ["LITELLM_LOG"] = "DEBUG"
```

View file

@ -5,8 +5,10 @@ Use this to health check all LLMs defined in your config.yaml
The proxy exposes:
* a /health endpoint which returns the health of the LLM APIs
* a /test endpoint which makes a ping to the litellm server
* a /health/readiness endpoint for returning if the proxy is ready to accept requests
* a /health/liveliness endpoint for returning if the proxy is alive
## `/health`
#### Request
Make a GET Request to `/health` on the proxy
```shell
@ -39,7 +41,7 @@ litellm --health
}
```
## Background Health Checks
### Background Health Checks
You can run model health checks in the background, to prevent each model from being queried too frequently via `/health`.
@ -61,7 +63,7 @@ $ litellm /path/to/config.yaml
curl --location 'http://0.0.0.0:8000/health'
```
## Embedding Models
### Embedding Models
The health check needs a way to know that a model is an embedding model. If you specify `mode: embedding` in your config, as shown below, an embedding health check is run for that model.
@ -77,3 +79,69 @@ model_list:
mode: embedding # 👈 ADD THIS
```
### Text Completion Models
The health check needs a way to know that a model is a text completion model. If you specify `mode: completion` in your config, as shown below, a text completion health check is run for that model.
```yaml
model_list:
- model_name: azure-text-completion
litellm_params:
model: azure/text-davinci-003
api_base: os.environ/AZURE_API_BASE
api_key: os.environ/AZURE_API_KEY
api_version: "2023-07-01-preview"
model_info:
mode: completion # 👈 ADD THIS
```
## `/health/readiness`
Unprotected endpoint for checking if proxy is ready to accept requests
Example Request:
```bash
curl --location 'http://0.0.0.0:8000/health/readiness'
```
Example Response:
*If proxy connected to a database*
```json
{
"status": "healthy",
"db": "connected",
"litellm_version":"1.19.2",
}
```
*If proxy not connected to a database*
```json
{
"status": "healthy",
"db": "Not connected",
"litellm_version":"1.19.2",
}
```
## `/health/liveliness`
Unprotected endpoint for checking if proxy is alive
Example Request:
```
curl -X 'GET' \
'http://0.0.0.0:8000/health/liveliness' \
-H 'accept: application/json'
```
Example Response:
```json
"I'm alive!"
```

View file

@ -1,5 +1,4 @@
# Load Balancing - Multiple Instances of 1 model
# Multiple Instances of 1 model
Load balance multiple instances of the same model
The proxy will handle routing requests (using LiteLLM's Router). **Set `rpm` in the config if you want to maximize throughput**

View file

@ -435,6 +435,7 @@ print(response)
</TabItem>
</Tabs>
## Logging Proxy Input/Output - s3 Buckets
We will use the `--config` to set
@ -490,6 +491,34 @@ curl --location 'http://0.0.0.0:8000/chat/completions' \
Your logs should be available on the specified s3 Bucket
## Team-based Logging
Set success callbacks (e.g. langfuse), for a specific team-id.
```yaml
litellm_settings:
default_team_settings:
- team_id: my-secret-project
success_callback: ["langfuse"]
langfuse_public_key: os.environ/LANGFUSE_PUB_KEY_2
langfuse_secret: os.environ/LANGFUSE_PRIVATE_KEY_2
- team_id: ishaans-secret-project
success_callback: ["langfuse"]
langfuse_public_key: os.environ/LANGFUSE_PUB_KEY_3
langfuse_secret: os.environ/LANGFUSE_SECRET_3
```
Now, when you [generate keys](./virtual_keys.md) for this team-id
```bash
curl -X POST 'http://0.0.0.0:8000/key/generate' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-D '{"team_id": "ishaans-secret-project"}'
```
All requests made with these keys will log data to their team-specific logging.
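For example, a request made with one of those team keys (placeholder key shown) would be logged to that team's Langfuse project:
```python
import openai

client = openai.OpenAI(
    api_key="sk-<generated-team-key>",  # key returned by /key/generate above
    base_url="http://0.0.0.0:8000"
)

response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hi, which team's logging am I under?"}],
)
print(response)
```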
## Logging Proxy Input/Output - DynamoDB
We will use the `--config` to set

View file

@ -40,115 +40,6 @@ litellm --test
This will now automatically route any requests for gpt-3.5-turbo to bigcode starcoder, hosted on huggingface inference endpoints.
### Using LiteLLM Proxy - Curl Request, OpenAI Package, Langchain
<Tabs>
<TabItem value="Curl" label="Curl Request">
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "gpt-3.5-turbo",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}
'
```
</TabItem>
<TabItem value="openai" label="OpenAI v1.0.0+">
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:8000"
)
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(model="gpt-3.5-turbo", messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
])
print(response)
```
</TabItem>
<TabItem value="langchain" label="Langchain">
```python
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
ChatPromptTemplate,
HumanMessagePromptTemplate,
SystemMessagePromptTemplate,
)
from langchain.schema import HumanMessage, SystemMessage
chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:8000", # set openai_api_base to the LiteLLM Proxy
model = "gpt-3.5-turbo",
temperature=0.1
)
messages = [
SystemMessage(
content="You are a helpful assistant that im using to make a test request to."
),
HumanMessage(
content="test from litellm. tell me why it's amazing in 1 sentence"
),
]
response = chat(messages)
print(response)
```
</TabItem>
<TabItem value="langchain-embedding" label="Langchain Embeddings">
```python
from langchain.embeddings import OpenAIEmbeddings
embeddings = OpenAIEmbeddings(model="sagemaker-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
text = "This is a test document."
query_result = embeddings.embed_query(text)
print(f"SAGEMAKER EMBEDDINGS")
print(query_result[:5])
embeddings = OpenAIEmbeddings(model="bedrock-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
text = "This is a test document."
query_result = embeddings.embed_query(text)
print(f"BEDROCK EMBEDDINGS")
print(query_result[:5])
embeddings = OpenAIEmbeddings(model="bedrock-titan-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
text = "This is a test document."
query_result = embeddings.embed_query(text)
print(f"TITAN EMBEDDINGS")
print(query_result[:5])
```
</TabItem>
</Tabs>
### Supported LLMs
All LiteLLM supported LLMs are supported on the Proxy. See all [supported llms](https://docs.litellm.ai/docs/providers)
<Tabs>
@ -331,6 +222,113 @@ $ litellm --model command-nightly
</Tabs>
### Using LiteLLM Proxy - Curl Request, OpenAI Package, Langchain
<Tabs>
<TabItem value="Curl" label="Curl Request">
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "gpt-3.5-turbo",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}
'
```
</TabItem>
<TabItem value="openai" label="OpenAI v1.0.0+">
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:8000"
)
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(model="gpt-3.5-turbo", messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
])
print(response)
```
</TabItem>
<TabItem value="langchain" label="Langchain">
```python
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
ChatPromptTemplate,
HumanMessagePromptTemplate,
SystemMessagePromptTemplate,
)
from langchain.schema import HumanMessage, SystemMessage
chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:8000", # set openai_api_base to the LiteLLM Proxy
model = "gpt-3.5-turbo",
temperature=0.1
)
messages = [
SystemMessage(
content="You are a helpful assistant that im using to make a test request to."
),
HumanMessage(
content="test from litellm. tell me why it's amazing in 1 sentence"
),
]
response = chat(messages)
print(response)
```
</TabItem>
<TabItem value="langchain-embedding" label="Langchain Embeddings">
```python
from langchain.embeddings import OpenAIEmbeddings
embeddings = OpenAIEmbeddings(model="sagemaker-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
text = "This is a test document."
query_result = embeddings.embed_query(text)
print(f"SAGEMAKER EMBEDDINGS")
print(query_result[:5])
embeddings = OpenAIEmbeddings(model="bedrock-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
text = "This is a test document."
query_result = embeddings.embed_query(text)
print(f"BEDROCK EMBEDDINGS")
print(query_result[:5])
embeddings = OpenAIEmbeddings(model="bedrock-titan-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
text = "This is a test document."
query_result = embeddings.embed_query(text)
print(f"TITAN EMBEDDINGS")
print(query_result[:5])
```
</TabItem>
</Tabs>
## Quick Start - LiteLLM Proxy + Config.yaml

View file

@ -8,9 +8,12 @@ Use this to fail a request based on the output of an llm api call.
```python
def my_custom_rule(input): # receives the model response
if len(input) < 5: # trigger fallback if the model response is too short
return False
return True
if len(input) < 5:
return {
"decision": False,
"message": "This violates LiteLLM Proxy Rules. Response too short"
}
return {"decision": True} # message not required since, request will pass
```
### Step 2. Point it to your proxy
@ -18,7 +21,6 @@ def my_custom_rule(input): # receives the model response
```yaml
litellm_settings:
post_call_rules: post_call_rules.my_custom_rule
num_retries: 3
```
### Step 3. Start + test your proxy
@ -32,7 +34,7 @@ curl --location 'http://0.0.0.0:8000/v1/chat/completions' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer sk-1234' \
--data '{
"model": "deepseek-coder",
"model": "gpt-3.5-turbo",
"messages": [{"role":"user","content":"What llm are you?"}],
"temperature": 0.7,
"max_tokens": 10,
@ -41,3 +43,19 @@ curl --location 'http://0.0.0.0:8000/v1/chat/completions' \
---
This will now check whether the response is longer than 5 characters; if the check fails, the call is retried 3 times before failing.
### Responses that fail the rule
This is the response from LiteLLM Proxy on failing a rule
```json
{
"error":
{
"message":"This violates LiteLLM Proxy Rules. Response too short",
"type":null,
"param":null,
"code":500
}
}
```

View file

@ -1,8 +1,8 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# [BETA] Self-serve UI
Allow your users to create their own keys through a UI
# [BETA] Admin UI
:::info
@ -10,40 +10,89 @@ This is in beta, so things may change. If you have feedback, [let us know](https
:::
Allow your users to create and view their own keys through a UI
<Image img={require('../../img/admin_ui_2.png')} />
## Quick Start
Requirements:
## 1. Setup SSO/Auth for UI
- You need an SMTP server connection to send emails (e.g. [Resend](https://resend.com/docs/send-with-smtp))
<Tabs>
[**See code**](https://github.com/BerriAI/litellm/blob/61cd800b9ffbb02c286481d2056b65c7fb5447bf/litellm/proxy/proxy_server.py#L1782)
<TabItem value="username" label="Quick Start - Username, Password">
### Step 1. Save SMTP server credentials
Set the following in your .env on the Proxy
```env
export SMTP_HOST="my-smtp-host"
export SMTP_USERNAME="my-smtp-username"
export SMTP_PASSWORD="my-smtp-password"
export SMTP_SENDER_EMAIL="krrish@berri.ai"
```shell
UI_USERNAME=ishaan-litellm
UI_PASSWORD=langchain
```
### Step 2. Enable user auth
On accessing the LiteLLM UI, you will be prompted to enter your username, password
In your config.yaml,
</TabItem>
```yaml
general_settings:
# other changes
allow_user_auth: true
<TabItem value="google" label="Google SSO">
- Create a new Oauth 2.0 Client on https://console.cloud.google.com/
**Required .env variables on your Proxy**
```shell
# for Google SSO Login
GOOGLE_CLIENT_ID=
GOOGLE_CLIENT_SECRET=
```
This will enable:
* Users to create keys via `/key/generate` (by default, only admin can create keys)
* The `/user/auth` endpoint to send user's emails with their login credentials (key + user id)
- Set Redirect URL on your Oauth 2.0 Client on https://console.cloud.google.com/
- Set a redirect url = `<your proxy base url>/sso/callback`
```shell
https://litellm-production-7002.up.railway.app/sso/callback
```
### Step 3. Connect to UI
</TabItem>
You can use our hosted UI (https://dashboard.litellm.ai/) or [self-host your own](https://github.com/BerriAI/litellm/tree/main/ui).
<TabItem value="msft" label="Microsoft SSO">
- Create a new App Registration on https://portal.azure.com/
- Create a client Secret for your App Registration
**Required .env variables on your Proxy**
```shell
MICROSOFT_CLIENT_ID="84583a4d-"
MICROSOFT_CLIENT_SECRET="nbk8Q~"
MICROSOFT_TENANT="5a39737"
```
- Set Redirect URI on your App Registration on https://portal.azure.com/
- Set a redirect url = `<your proxy base url>/sso/callback`
```shell
http://localhost:4000/sso/callback
```
</TabItem>
</Tabs>
## 2. Start Proxy Server
```shell
litellm --config proxy_config.yaml --port 4000
# start proxy on port 4000
```
## 3. Get the Admin UI Link on Swagger
Your Proxy Swagger is available on the root of the Proxy: `http://localhost:4000/`
<Image img={require('../../img/ui_link.png')} />
<!-- You can use our hosted UI (https://dashboard.litellm.ai/) or [self-host your own](https://github.com/BerriAI/litellm/tree/main/ui).
If you self-host, you need to save the UI url in your proxy environment as `LITELLM_HOSTED_UI`.
@ -63,3 +112,12 @@ Connect your proxy to your UI, by entering:
### Create Keys
<Image img={require('../../img/user_create_key_screen.png')} />
### Spend Per Key
<Image img={require('../../img/spend_per_api_key.png')} />
### Spend Per User
<Image img={require('../../img/spend_per_user.png')} /> -->

View file

@ -1,4 +1,7 @@
# 💰 Budgets, Rate Limits per user
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# 💰 Budgets, Rate Limits
Requirements:
@ -6,17 +9,74 @@ Requirements:
## Set Budgets
LiteLLM exposes a `/user/new` endpoint to create budgets for users, that persist across multiple keys.
This is documented in the swagger (live on your server root endpoint - e.g. `http://0.0.0.0:8000/`). Here's an example request.
You can set budgets at 3 levels:
- For the proxy
- For a user
- For a key
<Tabs>
<TabItem value="proxy" label="For Proxy">
Apply a budget across all calls on the proxy
**Step 1. Modify config.yaml**
```yaml
general_settings:
master_key: sk-1234
litellm_settings:
# other litellm settings
max_budget: 0 # (float) sets max budget as $0 USD
budget_duration: 30d # (str) frequency of reset - You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
```
**Step 2. Start proxy**
```bash
litellm --config /path/to/config.yaml
```
**Step 3. Send test call**
```bash
curl --location 'http://0.0.0.0:8000/chat/completions' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{
"model": "gpt-3.5-turbo",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}'
```
</TabItem>
<TabItem value="per-user" label="For User">
Apply a budget across multiple keys.
LiteLLM exposes a `/user/new` endpoint to create budgets for this.
You can:
- Add budgets to users [**Jump**](#add-budgets-to-users)
- Add budget durations, to reset spend [**Jump**](#add-budget-duration-to-users)
By default the `max_budget` is set to `null` and is not checked for keys
### **Add budgets to users**
```shell
curl --location 'http://localhost:8000/user/new' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{"models": ["azure-models"], "max_budget": 0, "user_id": "krrish3@berri.ai"}'
```
The request is a normal `/key/generate` request body + a `max_budget` field.
[**See Swagger**](https://litellm-api.up.railway.app/#/user%20management/new_user_user_new_post)
**Sample Response**
@ -29,14 +89,201 @@ The request is a normal `/key/generate` request body + a `max_budget` field.
}
```
### **Add budget duration to users**
`budget_duration`: Budget is reset at the end of specified duration. If not set, budget is never reset. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
```
curl 'http://0.0.0.0:8000/user/new' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{
"team_id": "core-infra", # [OPTIONAL]
"max_budget": 10,
"budget_duration": 10s,
}'
```
### Create new keys for existing user
Now you can just call `/key/generate` with that user_id (i.e. krrish3@berri.ai) and:
- **Budget Check**: krrish3@berri.ai's budget (i.e. $10) will be checked for this key
- **Spend Tracking**: spend for this key will update krrish3@berri.ai's spend as well
```bash
curl --location 'http://0.0.0.0:8000/key/generate' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data '{"models": ["azure-models"], "user_id": "krrish3@berri.ai"}'
```
</TabItem>
<TabItem value="per-key" label="For Key">
Apply a budget on a key.
You can:
- Add budgets to keys [**Jump**](#add-budgets-to-keys)
- Add budget durations, to reset spend [**Jump**](#add-budget-duration-to-keys)
**Expected Behaviour**
- Costs Per key get auto-populated in `LiteLLM_VerificationToken` Table
- After the key crosses its `max_budget`, requests fail
- If duration set, spend is reset at the end of the duration
By default the `max_budget` is set to `null` and is not checked for keys
### **Add budgets to keys**
```bash
curl 'http://0.0.0.0:8000/key/generate' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{
"team_id": "core-infra", # [OPTIONAL]
"max_budget": 10,
}'
```
Example Request to `/chat/completions` when key has crossed budget
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer <generated-key>' \
--data ' {
"model": "azure-gpt-3.5",
"user": "e09b4da8-ed80-4b05-ac93-e16d9eb56fca",
"messages": [
{
"role": "user",
"content": "respond in 50 lines"
}
    ]
}'
```
Expected Response from `/chat/completions` when key has crossed budget
```shell
{
"detail":"Authentication Error, ExceededTokenBudget: Current spend for token: 7.2e-05; Max Budget for Token: 2e-07"
}
```
### **Add budget duration to keys**
`budget_duration`: Budget is reset at the end of specified duration. If not set, budget is never reset. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
```bash
curl 'http://0.0.0.0:8000/key/generate' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{
  "team_id": "core-infra", # [OPTIONAL]
  "max_budget": 10,
  "budget_duration": "10s"
}'
```
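The same key-level budget, as a short Python sketch with the `requests` library (the team id, budget, and master key are illustrative placeholders):
```python
# Sketch: generate a key capped at $10 whose spend resets every 30 days.
import requests

resp = requests.post(
    "http://0.0.0.0:8000/key/generate",
    headers={
        "Authorization": "Bearer <your-master-key>",
        "Content-Type": "application/json",
    },
    json={"team_id": "core-infra", "max_budget": 10, "budget_duration": "30d"},
)
print(resp.json()["key"])
```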
</TabItem>
</Tabs>
## Set Rate Limits
Set rate limits when you create user keys - `/key/generate`.
You can set:
- max parallel requests
- tpm (tokens per minute) limits
- rpm (requests per minute) limits
<Tabs>
<TabItem value="per-user" label="Per User">
Use `/user/new` to persist rate limits across multiple keys.
```shell
curl --location 'http://0.0.0.0:8000/user/new' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{"user_id": "krrish@berri.ai", "max_parallel_requests": 10, "tpm_limit": 20, "rpm_limit": 4}'
```
[**See Swagger**](https://litellm-api.up.railway.app/#/user%20management/new_user_user_new_post)
**Expected Response**
```json
{
"key": "sk-sA7VDkyhlQ7m8Gt77Mbt3Q",
"expires": "2024-01-19T01:21:12.816168",
"user_id": "krrish@berri.ai",
}
```
</TabItem>
<TabItem value="per-key" label="Per Key">
Use `/key/generate` if you want rate limits to apply only to that key.
```shell
curl --location 'http://0.0.0.0:8000/key/generate' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{"duration": "20m", "max_parallel_requests": 1}' # 👈 max parallel requests = 1
--data '{"max_parallel_requests": 10, "tpm_limit": 20, "rpm_limit": 4}'
```
**Expected Response**
```json
{
"key": "sk-ulGNRXWtv7M0lFnnsQk0wQ",
"expires": "2024-01-18T20:48:44.297973",
"user_id": "78c2c8fc-c233-43b9-b0c3-eb931da27b84" // 👈 auto-generated
}
```
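As a rough Python sketch, here's the same per-key setup exercised end-to-end (the proxy URL, master key, and limits are illustrative; requests beyond the limits are expected to fail with HTTP 429):
```python
# Sketch: create a rate-limited key, then call the proxy with it.
import requests

PROXY = "http://0.0.0.0:8000"
admin_headers = {"Authorization": "Bearer sk-1234", "Content-Type": "application/json"}

# 1. generate a key with parallel / tpm / rpm limits
key = requests.post(
    f"{PROXY}/key/generate",
    headers=admin_headers,
    json={"max_parallel_requests": 10, "tpm_limit": 20, "rpm_limit": 4},
).json()["key"]

# 2. call the proxy with the new key - once a limit is hit, expect a 429
resp = requests.post(
    f"{PROXY}/chat/completions",
    headers={"Authorization": f"Bearer {key}", "Content-Type": "application/json"},
    json={"model": "gpt-3.5-turbo", "messages": [{"role": "user", "content": "hi"}]},
)
print(resp.status_code)
```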
</TabItem>
</Tabs>
## Grant Access to new model
Use model access groups to give users access to select models, and add new models to the group over time (e.g. mistral, llama-2, etc.).
What's the difference between doing this with `/key/generate` vs. `/user/new`? If you do it on `/user/new`, it persists across all keys generated for that user.
**Step 1. Assign model, access group in config.yaml**
```yaml
model_list:
- model_name: text-embedding-ada-002
litellm_params:
model: azure/azure-embedding-model
api_base: "os.environ/AZURE_API_BASE"
api_key: "os.environ/AZURE_API_KEY"
api_version: "2023-07-01-preview"
model_info:
access_groups: ["beta-models"] # 👈 Model Access Group
```
**Step 2. Create key with access group**
```bash
curl --location 'http://localhost:8000/user/new' \
-H 'Authorization: Bearer <your-master-key>' \
-H 'Content-Type: application/json' \
-d '{"models": ["beta-models"], # 👈 Model Access Group
"max_budget": 0}'
```
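A short Python sketch of Step 2 (the access-group name and master key are the illustrative values from above):
```python
# Sketch: create a user whose keys can call every model in the "beta-models" access group.
import requests

resp = requests.post(
    "http://localhost:8000/user/new",
    headers={
        "Authorization": "Bearer <your-master-key>",
        "Content-Type": "application/json",
    },
    json={"models": ["beta-models"], "max_budget": 0},
)
print(resp.json())
```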
## Create new keys for existing user
Just include user_id in the `/key/generate` request.
```bash
curl --location 'http://0.0.0.0:8000/key/generate' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data '{"models": ["azure-models"], "user_id": "krrish@berri.ai"}'
```

View file

@ -1,4 +1,4 @@
# Key Management
# Virtual Keys, Users
Track Spend, Set budgets and create virtual keys for the proxy
Grant others temporary access to your proxy, with keys that expire after a set duration.
@ -12,7 +12,7 @@ Grant other's temporary access to your proxy, with keys that expire after a set
:::
## Quick Start
## Setup
Requirements:
@ -56,38 +56,55 @@ litellm --config /path/to/config.yaml
```shell
curl 'http://0.0.0.0:8000/key/generate' \
--header 'Authorization: Bearer sk-1234' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{"models": ["gpt-3.5-turbo", "gpt-4", "claude-2"], "duration": "20m","metadata": {"user": "ishaan@berri.ai", "team": "core-infra"}}'
--data-raw '{"models": ["gpt-3.5-turbo", "gpt-4", "claude-2"], "duration": "20m","metadata": {"user": "ishaan@berri.ai"}}'
```
- `models`: *list or null (optional)* - Specify the models a token has access to. If null, then the token has access to all models on the server.
- `duration`: *str or null (optional)* Specify the length of time the token is valid for. If null, default is set to 1 hour. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
## /key/generate
- `metadata`: *dict or null (optional)* - Pass metadata for the created token. If null, defaults to `{}`
### Request
```shell
curl 'http://0.0.0.0:8000/key/generate' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{
"models": ["gpt-3.5-turbo", "gpt-4", "claude-2"],
"duration": "20m",
"metadata": {"user": "ishaan@berri.ai"},
"team_id": "core-infra",
"max_budget": 10,
}'
```
Expected response:
Request Params:
- `duration`: *Optional[str]* - Specify the length of time the token is valid for. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
- `key_alias`: *Optional[str]* - User defined key alias
- `team_id`: *Optional[str]* - The team id of the user
- `models`: *Optional[list]* - Model_name's a user is allowed to call. (if empty, key is allowed to call all models)
- `aliases`: *Optional[dict]* - Any alias mappings, on top of anything in the config.yaml model list. - https://docs.litellm.ai/docs/proxy/virtual_keys#managing-auth---upgradedowngrade-models
- `config`: *Optional[dict]* - any key-specific configs, overrides config in config.yaml
- `spend`: *Optional[int]* - Amount spent by key. Default is 0. Will be updated by proxy whenever key is used. https://docs.litellm.ai/docs/proxy/virtual_keys#managing-auth---tracking-spend
- `max_budget`: *Optional[float]* - Specify max budget for a given key.
- `max_parallel_requests`: *Optional[int]* - Rate limit a user based on the number of parallel requests. Raises 429 error, if user's parallel requests > x.
- `metadata`: *Optional[dict]* - Metadata for key, store information for key. Example metadata = {"team": "core-infra", "app": "app2", "email": "ishaan@berri.ai" }
### Response
```python
{
"key": "sk-kdEXbIqZRwEeEiHwdg7sFA", # Bearer token
"expires": "2023-11-19T01:38:25.838000+00:00" # datetime object
"key_name": "sk-...7sFA" # abbreviated key string, ONLY stored in db if `allow_user_auth: true` set - [see](./ui.md)
...
}
```
## Keys that don't expire
Just set `duration` to `null`.
```bash
curl --location 'http://0.0.0.0:8000/key/generate' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{"models": ["azure-models"], "aliases": {"mistral-7b": "gpt-3.5-turbo"}, "duration": null}'
```
## Upgrade/Downgrade Models
### Upgrade/Downgrade Models
If a user is expected to use a given model (e.g. gpt-3.5), and you want to:
@ -124,7 +141,7 @@ model_list:
```bash
curl -X POST "https://0.0.0.0:8000/key/generate" \
-H "Authorization: Bearer sk-1234" \
-H "Authorization: Bearer <your-master-key>" \
-H "Content-Type: application/json" \
-d '{
"models": ["my-free-tier"],
@ -136,6 +153,291 @@ curl -X POST "https://0.0.0.0:8000/key/generate" \
- **How to upgrade / downgrade request?** Change the alias mapping
- **How is routing between different keys/api bases done?** litellm handles this by shuffling between different models in the model list with the same model_name. [**See Code**](https://github.com/BerriAI/litellm/blob/main/litellm/router.py)
### Grant Access to new model
Use model access groups to give users access to select models, and add new ones to it over time (e.g. mistral, llama-2, etc.)
**Step 1. Assign model, access group in config.yaml**
```yaml
model_list:
- model_name: text-embedding-ada-002
litellm_params:
model: azure/azure-embedding-model
api_base: "os.environ/AZURE_API_BASE"
api_key: "os.environ/AZURE_API_KEY"
api_version: "2023-07-01-preview"
model_info:
access_groups: ["beta-models"] # 👈 Model Access Group
```
**Step 2. Create key with access group**
```bash
curl --location 'http://localhost:8000/key/generate' \
-H 'Authorization: Bearer <your-master-key>' \
-H 'Content-Type: application/json' \
-d '{"models": ["beta-models"], # 👈 Model Access Group
"max_budget": 0,}'
```
## /key/info
### Request
```shell
curl -X GET "http://0.0.0.0:8000/key/info?key=sk-02Wr4IAlN3NvPXvL5JVvDA" \
-H "Authorization: Bearer sk-1234"
```
Request Params:
- key: str - The key you want the info for
### Response
`token` is the hashed key (The DB stores the hashed key for security)
```json
{
"key": "sk-02Wr4IAlN3NvPXvL5JVvDA",
"info": {
"token": "80321a12d03412c527f2bd9db5fabd746abead2e1d50b435a534432fbaca9ef5",
"spend": 0.0,
"expires": "2024-01-18T23:52:09.125000+00:00",
"models": ["azure-gpt-3.5", "azure-embedding-model"],
"aliases": {},
"config": {},
"user_id": "ishaan2@berri.ai",
"team_id": "None",
"max_parallel_requests": null,
"metadata": {}
}
}
```
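For reference, a minimal Python version of the same lookup (the key values are the illustrative ones above):
```python
# Sketch: fetch spend and model access for a key via /key/info.
import requests

resp = requests.get(
    "http://0.0.0.0:8000/key/info",
    params={"key": "sk-02Wr4IAlN3NvPXvL5JVvDA"},
    headers={"Authorization": "Bearer sk-1234"},
)
info = resp.json()["info"]
print(info["spend"], info["models"])
```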
## /key/update
### Request
```shell
curl 'http://0.0.0.0:8000/key/update' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{
"key": "sk-kdEXbIqZRwEeEiHwdg7sFA",
"models": ["gpt-3.5-turbo", "gpt-4", "claude-2"],
"metadata": {"user": "ishaan@berri.ai"},
"team_id": "core-infra"
}'
```
Request Params:
- key: str - The key that needs to be updated.
- models: list or null (optional) - Specify the models a token has access to. If null, then the token has access to all models on the server.
- metadata: dict or null (optional) - Pass metadata for the updated token. If null, defaults to an empty dictionary.
- team_id: str or null (optional) - Specify the team_id for the associated key.
### Response
```json
{
"key": "sk-kdEXbIqZRwEeEiHwdg7sFA",
"models": ["gpt-3.5-turbo", "gpt-4", "claude-2"],
"metadata": {
"user": "ishaan@berri.ai"
}
}
```
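The same update as a minimal Python sketch (the key and field values are the illustrative ones from the request above):
```python
# Sketch: update the models, metadata, and team on an existing key.
import requests

resp = requests.post(
    "http://0.0.0.0:8000/key/update",
    headers={
        "Authorization": "Bearer <your-master-key>",
        "Content-Type": "application/json",
    },
    json={
        "key": "sk-kdEXbIqZRwEeEiHwdg7sFA",
        "models": ["gpt-3.5-turbo", "gpt-4", "claude-2"],
        "metadata": {"user": "ishaan@berri.ai"},
        "team_id": "core-infra",
    },
)
print(resp.json())
```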
## /key/delete
### Request
```shell
curl 'http://0.0.0.0:8000/key/delete' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{
"keys": ["sk-kdEXbIqZRwEeEiHwdg7sFA"]
}'
```
Request Params:
- keys: List[str] - List of keys to delete
### Response
```json
{
"deleted_keys": ["sk-kdEXbIqZRwEeEiHwdg7sFA"]
}
```
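And the equivalent delete call in Python (illustrative key value):
```python
# Sketch: delete one or more keys by listing them under "keys".
import requests

resp = requests.post(
    "http://0.0.0.0:8000/key/delete",
    headers={
        "Authorization": "Bearer <your-master-key>",
        "Content-Type": "application/json",
    },
    json={"keys": ["sk-kdEXbIqZRwEeEiHwdg7sFA"]},
)
print(resp.json())  # expected: {"deleted_keys": ["sk-kdEXbIqZRwEeEiHwdg7sFA"]}
```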
## /user/new
### Request
All [/key/generate params](#keygenerate) are supported for creating a user
```shell
curl 'http://0.0.0.0:4000/user/new' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data-raw '{
"user_id": "ishaan1",
"user_email": "ishaan@litellm.ai",
"user_role": "admin",
"team_id": "cto-team",
"max_budget": 20,
"budget_duration": "1h"
}'
```
Request Params:
- user_id: str (optional - defaults to uuid) - The unique identifier for the user.
- user_email: str (optional - defaults to "") - The email address associated with the user.
- user_role: str (optional - defaults to "app_user") - The role assigned to the user. Can be "admin", "app_owner", "app_user"
**Possible `user_role` values**
```
"admin" - Maintaining the proxy and owning the overall budget
"app_owner" - employees maintaining the apps, each owner may own more than one app
"app_user" - users who know nothing about the proxy. These users get created when you pass `user` to /chat/completions
```
- team_id: str (optional - defaults to "") - The identifier for the team to which the user belongs.
- max_budget: float (optional - defaults to `null`) - The maximum budget allocated for the user. No budget checks done if `max_budget==null`
- budget_duration: str (optional - defaults to `null`) - The duration for which the budget is valid, e.g., "1h", "1d"
### Response
A key will be generated for the new user created
```shell
{
"models": [],
"spend": 0.0,
"max_budget": null,
"user_id": "ishaan1",
"team_id": null,
"max_parallel_requests": null,
"metadata": {},
"tpm_limit": null,
"rpm_limit": null,
"budget_duration": null,
"allowed_cache_controls": [],
"key_alias": null,
"duration": null,
"aliases": {},
"config": {},
"key": "sk-JflB33ucTqc2NYvNAgiBCA",
"key_name": null,
"expires": null
}
```
## Default /key/generate params
Use this if you need to control the default `max_budget` or any other `/key/generate` param per key.
When a `/key/generate` request does not specify `max_budget`, it will use the `max_budget` specified in `default_key_generate_params`
Set `litellm_settings:default_key_generate_params`:
```yaml
litellm_settings:
default_key_generate_params:
max_budget: 1.5000
models: ["azure-gpt-3.5"]
duration: # blank means `null`
metadata: {"setting":"default"}
team_id: "core-infra"
```
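With those defaults in place, a bare `/key/generate` call should inherit them. A hedged sketch:
```python
# Sketch: with default_key_generate_params set in config.yaml, an empty body is
# expected to produce a key with max_budget=1.5, models=["azure-gpt-3.5"],
# team_id="core-infra", and the default metadata shown above.
import requests

resp = requests.post(
    "http://0.0.0.0:8000/key/generate",
    headers={"Authorization": "Bearer sk-1234", "Content-Type": "application/json"},
    json={},
)
print(resp.json())
```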
## Set Budgets - Per Key
Set the `max_budget` param (in USD) in the `/key/generate` request. By default `max_budget` is set to `null` and is not checked for keys
```shell
curl 'http://0.0.0.0:8000/key/generate' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{
"metadata": {"user": "ishaan@berri.ai"},
"team_id": "core-infra",
"max_budget": 10,
}'
```
#### Expected Behaviour
- Costs per key are auto-populated in the `LiteLLM_VerificationToken` table
- After the key crosses its `max_budget`, requests fail
Example Request to `/chat/completions` when key has crossed budget
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer sk-ULl_IKCVFy2EZRzQB16RUA' \
--data ' {
"model": "azure-gpt-3.5",
"user": "e09b4da8-ed80-4b05-ac93-e16d9eb56fca",
"messages": [
{
"role": "user",
"content": "respond in 50 lines"
}
    ]
}'
```
Expected Response from `/chat/completions` when key has crossed budget
```shell
{
"detail":"Authentication Error, ExceededTokenBudget: Current spend for token: 7.2e-05; Max Budget for Token: 2e-07"
}
```
## Set Budgets - Per User
LiteLLM exposes a `/user/new` endpoint to create budgets for users that persist across multiple keys.
This is documented in the swagger (live on your server root endpoint - e.g. `http://0.0.0.0:8000/`). Here's an example request.
```shell
curl --location 'http://localhost:8000/user/new' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{"models": ["azure-models"], "max_budget": 0, "user_id": "krrish3@berri.ai"}'
```
The request is a normal `/key/generate` request body + a `max_budget` field.
**Sample Response**
```shell
{
"key": "sk-YF2OxDbrgd1y2KgwxmEA2w",
"expires": "2023-12-22T09:53:13.861000Z",
"user_id": "krrish3@berri.ai",
"max_budget": 0.0
}
```
## Tracking Spend
You can get spend for a key by using the `/key/info` endpoint.
@ -171,32 +473,6 @@ This is automatically updated (in USD) when calls are made to /completions, /cha
```
## Set Budgets
LiteLLM exposes a `/user/new` endpoint to create budgets for users, that persist across multiple keys.
This is documented in the swagger (live on your server root endpoint - e.g. `http://0.0.0.0:8000/`). Here's an example request.
```curl
curl --location 'http://localhost:8000/user/new' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{"models": ["azure-models"], "max_budget": 0, "user_id": "krrish3@berri.ai"}'
```
The request is a normal `/key/generate` request body + a `max_budget` field.
**Sample Response**
```curl
{
"key": "sk-YF2OxDbrgd1y2KgwxmEA2w",
"expires": "2023-12-22T09:53:13.861000Z",
"user_id": "krrish3@berri.ai",
"max_budget": 0.0
}
```
## Custom Auth
You can now override the default api key auth.
@ -242,7 +518,133 @@ general_settings:
[**Implementation Code**](https://github.com/BerriAI/litellm/blob/caf2a6b279ddbe89ebd1d8f4499f65715d684851/litellm/proxy/utils.py#L122)
### 3. Start the proxy
```bash
```shell
$ litellm --config /path/to/config.yaml
```
## Custom /key/generate
If you need to add custom logic before generating a Proxy API Key (for example, validating `team_id`)
### 1. Write a custom `custom_generate_key_fn`
The input to the custom_generate_key_fn function is a single parameter: `data` [(Type: GenerateKeyRequest)](https://github.com/BerriAI/litellm/blob/main/litellm/proxy/_types.py#L125)
The output of your `custom_generate_key_fn` should be a dictionary with the following structure
```python
{
"decision": False,
"message": "This violates LiteLLM Proxy Rules. No team id provided.",
}
```
- decision (Type: bool): A boolean value indicating whether the key generation is allowed (True) or not (False).
- message (Type: str, Optional): An optional message providing additional information about the decision. This field is included when the decision is False.
```python
async def custom_generate_key_fn(data: GenerateKeyRequest)-> dict:
"""
Asynchronous function for generating a key based on the input data.
Args:
data (GenerateKeyRequest): The input data for key generation.
Returns:
dict: A dictionary containing the decision and an optional message.
{
"decision": False,
"message": "This violates LiteLLM Proxy Rules. No team id provided.",
}
"""
# decide if a key should be generated or not
print("using custom auth function!")
data_json = data.json() # type: ignore
# Unpacking variables
team_id = data_json.get("team_id")
duration = data_json.get("duration")
models = data_json.get("models")
aliases = data_json.get("aliases")
config = data_json.get("config")
spend = data_json.get("spend")
user_id = data_json.get("user_id")
max_parallel_requests = data_json.get("max_parallel_requests")
metadata = data_json.get("metadata")
tpm_limit = data_json.get("tpm_limit")
rpm_limit = data_json.get("rpm_limit")
if team_id is not None and team_id == "litellm-core-infra@gmail.com":
# only team_id="litellm-core-infra@gmail.com" can make keys
return {
"decision": True,
}
else:
print("Failed custom auth")
return {
"decision": False,
"message": "This violates LiteLLM Proxy Rules. No team id provided.",
}
```
### 2. Pass the filepath (relative to the config.yaml)
Pass the filepath to the config.yaml
e.g. if they're both in the same dir - `./config.yaml` and `./custom_auth.py`, this is what it looks like:
```yaml
model_list:
- model_name: "openai-model"
litellm_params:
model: "gpt-3.5-turbo"
litellm_settings:
drop_params: True
set_verbose: True
general_settings:
custom_key_generate: custom_auth.custom_generate_key_fn
```
## [BETA] Dynamo DB
Only live in `v1.16.21.dev1`.
### Step 1. Save keys to env
```shell
AWS_ACCESS_KEY_ID="your-aws-access-key-id"
AWS_SECRET_ACCESS_KEY="your-aws-secret-access-key"
```
### Step 2. Add details to config
```yaml
general_settings:
master_key: sk-1234
database_type: "dynamo_db"
database_args: { # 👈 all args - https://github.com/BerriAI/litellm/blob/befbcbb7ac8f59835ce47415c128decf37aac328/litellm/proxy/_types.py#L190
"billing_mode": "PAY_PER_REQUEST",
"region_name": "us-west-2"
"user_table_name": "your-user-table",
"key_table_name": "your-token-table",
"config_table_name": "your-config-table"
}
```
### Step 3. Generate Key
```bash
curl --location 'http://0.0.0.0:8000/key/generate' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{"models": ["azure-models"], "aliases": {"mistral-7b": "gpt-3.5-turbo"}, "duration": null}'
```

View file

@ -302,6 +302,7 @@ asyncio.run(router_acompletion())
The timeout set in router is for the entire length of the call, and is passed down to the completion() call level as well.
**Global Timeouts**
```python
from litellm import Router
@ -313,6 +314,36 @@ router = Router(model_list=model_list,
print(response)
```
**Timeouts per model**
```python
from litellm import Router
import asyncio
import os
model_list = [{
"model_name": "gpt-3.5-turbo",
"litellm_params": {
"model": "azure/chatgpt-v-2",
"api_key": os.getenv("AZURE_API_KEY"),
"api_version": os.getenv("AZURE_API_VERSION"),
"api_base": os.getenv("AZURE_API_BASE"),
"timeout": 300 # sets a 5 minute timeout
"stream_timeout": 30 # sets a 30s timeout for streaming calls
}
}]
# init router
router = Router(model_list=model_list, routing_strategy="least-busy")
async def router_acompletion():
response = await router.acompletion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Hey, how's it going?"}]
)
print(response)
return response
asyncio.run(router_acompletion())
```
### Cooldowns
Set the limit for how many calls a model is allowed to fail in a minute, before being cooled down for a minute.
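A minimal sketch, using the `allowed_fails` and `cooldown_time` parameters from the Router constructor shown further down (the values are illustrative):
```python
# Sketch: cool a deployment down for 60s after it fails once within a minute.
from litellm import Router

router = Router(
    model_list=model_list,  # same model_list as in the examples above
    allowed_fails=1,        # failures allowed per minute before cooldown
    cooldown_time=60,       # seconds to keep the deployment out of rotation
)
```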
@ -574,6 +605,49 @@ response = router.completion(model="gpt-3.5-turbo", messages=messages)
print(f"response: {response}")
```
## Custom Callbacks - Track API Key, API Endpoint, Model Used
If you need to track the api_key, api endpoint, model, custom_llm_provider used for each completion call, you can setup a [custom callback](https://docs.litellm.ai/docs/observability/custom_callback)
### Usage
```python
import litellm
from litellm.integrations.custom_logger import CustomLogger
from litellm import Router
class MyCustomHandler(CustomLogger):
def log_success_event(self, kwargs, response_obj, start_time, end_time):
print(f"On Success")
print("kwargs=", kwargs)
litellm_params= kwargs.get("litellm_params")
api_key = litellm_params.get("api_key")
api_base = litellm_params.get("api_base")
custom_llm_provider= litellm_params.get("custom_llm_provider")
response_cost = kwargs.get("response_cost")
# print the values
print("api_key=", api_key)
print("api_base=", api_base)
print("custom_llm_provider=", custom_llm_provider)
print("response_cost=", response_cost)
def log_failure_event(self, kwargs, response_obj, start_time, end_time):
print(f"On Failure")
print("kwargs=")
customHandler = MyCustomHandler()
litellm.callbacks = [customHandler]
# Init Router
router = Router(model_list=model_list, routing_strategy="simple-shuffle")
# router completion call
response = router.completion(
model="gpt-3.5-turbo",
messages=[{ "role": "user", "content": "Hi who are you"}]
)
```
## Deploy Router
@ -602,17 +676,63 @@ def __init__(
num_retries: int = 0,
timeout: Optional[float] = None,
default_litellm_params={}, # default params for Router.chat.completion.create
set_verbose: bool = False,
fallbacks: List = [],
allowed_fails: Optional[int] = None,
allowed_fails: Optional[int] = None, # Number of times a deployment can fail before being added to cooldown
cooldown_time: float = 1, # (seconds) time to cooldown a deployment after failure
context_window_fallbacks: List = [],
model_group_alias: Optional[dict] = {},
retry_after: int = 0, # min time to wait before retrying a failed request
retry_after: int = 0, # (min) time to wait before retrying a failed request
routing_strategy: Literal[
"simple-shuffle",
"least-busy",
"usage-based-routing",
"latency-based-routing",
] = "simple-shuffle",
## DEBUGGING ##
set_verbose: bool = False, # set this to True for seeing logs
debug_level: Literal["DEBUG", "INFO"] = "INFO", # set this to "DEBUG" for detailed debugging
):
```
## Debugging Router
### Basic Debugging
Set `Router(set_verbose=True)`
```python
from litellm import Router
router = Router(
model_list=model_list,
set_verbose=True
)
```
### Detailed Debugging
Set `Router(set_verbose=True, debug_level="DEBUG")`
```python
from litellm import Router
router = Router(
model_list=model_list,
set_verbose=True,
debug_level="DEBUG" # defaults to INFO
)
```
### Very Detailed Debugging
Set `litellm.set_verbose=True` and `Router(set_verbose=True, debug_level="DEBUG")`
```python
from litellm import Router
import litellm
litellm.set_verbose = True
router = Router(
model_list=model_list,
set_verbose=True,
debug_level="DEBUG" # defaults to INFO
)
```

Binary image files added (not shown): 7 images (159 KiB, 351 KiB, 297 KiB, 189 KiB, 468 KiB, 249 KiB, 69 KiB).

View file

@ -104,24 +104,49 @@ const sidebars = {
items: [
"proxy/quick_start",
"proxy/configs",
{
type: 'link',
label: '📖 All Endpoints',
href: 'https://litellm-api.up.railway.app/',
},
"proxy/user_keys",
"proxy/load_balancing",
"proxy/virtual_keys",
"proxy/ui",
"proxy/users",
"proxy/ui",
"proxy/model_management",
"proxy/reliability",
"proxy/caching",
"proxy/logging",
"proxy/health",
"proxy/call_hooks",
"proxy/rules",
"proxy/debugging",
{
"type": "category",
"label": "🔥 Load Balancing",
"items": [
"proxy/load_balancing",
"proxy/reliability",
]
},
{
"type": "category",
"label": "Logging, Alerting, Caching",
"items": [
"proxy/logging",
"proxy/alerting",
"proxy/streaming_logging",
"proxy/caching",
]
},
{
"type": "category",
"label": "Admin Controls",
"items": [
"proxy/call_hooks",
"proxy/rules",
]
},
"proxy/deploy",
"proxy/cli",
]
},
"proxy/custom_pricing",
"routing",
"rules",
"set_keys",

View file

@ -1,11 +1,17 @@
### INIT VARIABLES ###
import threading, requests
import threading, requests, os
from typing import Callable, List, Optional, Dict, Union, Any
from litellm.caching import Cache
from litellm._logging import set_verbose
from litellm._logging import set_verbose, _turn_on_debug, verbose_logger
from litellm.proxy._types import KeyManagementSystem
import httpx
import dotenv
dotenv.load_dotenv()
#############################################
if set_verbose == True:
_turn_on_debug()
#############################################
input_callback: List[Union[str, Callable]] = []
success_callback: List[Union[str, Callable]] = []
failure_callback: List[Union[str, Callable]] = []
@ -58,6 +64,9 @@ cache: Optional[
model_alias_map: Dict[str, str] = {}
model_group_alias_map: Dict[str, str] = {}
max_budget: float = 0.0 # set the max budget across all providers
budget_duration: Optional[
str
] = None # proxy only - resets budget after fixed duration. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
_openai_completion_params = [
"functions",
"function_call",
@ -136,6 +145,8 @@ model_cost_map_url: str = "https://raw.githubusercontent.com/BerriAI/litellm/mai
suppress_debug_info = False
dynamodb_table_name: Optional[str] = None
s3_callback_params: Optional[Dict] = None
default_key_generate_params: Optional[Dict] = None
default_team_settings: Optional[List] = None
#### RELIABILITY ####
request_timeout: Optional[float] = 6000
num_retries: Optional[int] = None # per model endpoint
@ -155,6 +166,19 @@ _key_management_system: Optional[KeyManagementSystem] = None
def get_model_cost_map(url: str):
if (
os.getenv("LITELLM_LOCAL_MODEL_COST_MAP", False) == True
or os.getenv("LITELLM_LOCAL_MODEL_COST_MAP", False) == "True"
):
import importlib.resources
import json
with importlib.resources.open_text(
"litellm", "model_prices_and_context_window_backup.json"
) as f:
content = json.load(f)
return content
try:
with requests.get(
url, timeout=5
@ -210,6 +234,7 @@ vertex_chat_models: List = []
vertex_code_chat_models: List = []
vertex_text_models: List = []
vertex_code_text_models: List = []
vertex_embedding_models: List = []
ai21_models: List = []
nlp_cloud_models: List = []
aleph_alpha_models: List = []
@ -239,6 +264,8 @@ for key, value in model_cost.items():
vertex_chat_models.append(key)
elif value.get("litellm_provider") == "vertex_ai-code-chat-models":
vertex_code_chat_models.append(key)
elif value.get("litellm_provider") == "vertex_ai-embedding-models":
vertex_embedding_models.append(key)
elif value.get("litellm_provider") == "ai21":
ai21_models.append(key)
elif value.get("litellm_provider") == "nlp_cloud":
@ -475,7 +502,10 @@ bedrock_embedding_models: List = [
]
all_embedding_models = (
open_ai_embedding_models + cohere_embedding_models + bedrock_embedding_models
open_ai_embedding_models
+ cohere_embedding_models
+ bedrock_embedding_models
+ vertex_embedding_models
)
####### IMAGE GENERATION MODELS ###################
@ -530,6 +560,7 @@ from .llms.bedrock import (
AmazonAnthropicConfig,
AmazonCohereConfig,
AmazonLlamaConfig,
AmazonStabilityConfig,
)
from .llms.openai import OpenAIConfig, OpenAITextCompletionConfig
from .llms.azure import AzureOpenAIConfig, AzureOpenAIError

View file

@ -7,20 +7,14 @@ handler = logging.StreamHandler()
handler.setLevel(logging.DEBUG)
# Create a formatter and set it for the handler
formatter = logging.Formatter(
"\033[92m%(asctime)s - %(name)s:%(levelname)s\033[0m: %(message)s",
datefmt="%H:%M:%S",
)
formatter = logging.Formatter("\033[92m%(name)s - %(levelname)s\033[0m: %(message)s")
handler.setFormatter(formatter)
def print_verbose(print_statement):
try:
if set_verbose:
print(print_statement) # noqa
except:
pass
verbose_proxy_logger = logging.getLogger("LiteLLM Proxy")
verbose_router_logger = logging.getLogger("LiteLLM Router")
verbose_logger = logging.getLogger("LiteLLM")
@ -28,3 +22,18 @@ verbose_logger = logging.getLogger("LiteLLM")
# Add the handler to the logger
verbose_router_logger.addHandler(handler)
verbose_proxy_logger.addHandler(handler)
verbose_logger.addHandler(handler)
def _turn_on_debug():
verbose_logger.setLevel(level=logging.DEBUG) # set package log to debug
verbose_router_logger.setLevel(level=logging.DEBUG) # set router logs to debug
verbose_proxy_logger.setLevel(level=logging.DEBUG) # set proxy logs to debug
def print_verbose(print_statement):
try:
if set_verbose:
print(print_statement) # noqa
except:
pass

View file

@ -1,3 +1,12 @@
# +-----------------------------------------------+
# | |
# | NOT PROXY BUDGET MANAGER |
# | proxy budget manager is in proxy_server.py |
# | |
# +-----------------------------------------------+
#
# Thank you users! We ❤️ you! - Krrish & Ishaan
import os, json, time
import litellm
from litellm.utils import ModelResponse
@ -11,10 +20,12 @@ class BudgetManager:
project_name: str,
client_type: str = "local",
api_base: Optional[str] = None,
headers: Optional[dict] = None,
):
self.client_type = client_type
self.project_name = project_name
self.api_base = api_base or "https://api.litellm.ai"
self.headers = headers or {"Content-Type": "application/json"}
## load the data or init the initial dictionaries
self.load_data()
@ -43,7 +54,7 @@ class BudgetManager:
url = self.api_base + "/get_budget"
headers = {"Content-Type": "application/json"}
data = {"project_name": self.project_name}
response = requests.post(url, headers=headers, json=data)
response = requests.post(url, headers=self.headers, json=data)
response = response.json()
if response["status"] == "error":
self.user_dict = (
@ -201,6 +212,6 @@ class BudgetManager:
url = self.api_base + "/set_budget"
headers = {"Content-Type": "application/json"}
data = {"project_name": self.project_name, "user_dict": self.user_dict}
response = requests.post(url, headers=headers, json=data)
response = requests.post(url, headers=self.headers, json=data)
response = response.json()
return response

View file

@ -12,10 +12,12 @@ import time, logging, asyncio
import json, traceback, ast, hashlib
from typing import Optional, Literal, List, Union, Any
from openai._models import BaseModel as OpenAIObject
from litellm._logging import verbose_logger
def print_verbose(print_statement):
try:
verbose_logger.debug(print_statement)
if litellm.set_verbose:
print(print_statement) # noqa
except:
@ -81,9 +83,14 @@ class InMemoryCache(BaseCache):
self.cache_dict.clear()
self.ttl_dict.clear()
async def disconnect(self):
pass
def delete_cache(self, key):
self.cache_dict.pop(key, None)
self.ttl_dict.pop(key, None)
class RedisCache(BaseCache):
# if users don't provider one, use the default litellm cache
@ -210,9 +217,14 @@ class RedisCache(BaseCache):
def flush_cache(self):
self.redis_client.flushall()
async def disconnect(self):
pass
def delete_cache(self, key):
self.redis_client.delete(key)
class S3Cache(BaseCache):
def __init__(
@ -227,11 +239,13 @@ class S3Cache(BaseCache):
s3_aws_secret_access_key=None,
s3_aws_session_token=None,
s3_config=None,
s3_path=None,
**kwargs,
):
import boto3
self.bucket_name = s3_bucket_name
self.key_prefix = s3_path.rstrip("/") + "/" if s3_path else ""
# Create an S3 client with custom endpoint URL
self.s3_client = boto3.client(
"s3",
@ -253,6 +267,8 @@ class S3Cache(BaseCache):
ttl = kwargs.get("ttl", None)
# Convert value to JSON before storing in S3
serialized_value = json.dumps(value)
key = self.key_prefix + key
if ttl is not None:
cache_control = f"immutable, max-age={ttl}, s-maxage={ttl}"
import datetime
@ -294,6 +310,8 @@ class S3Cache(BaseCache):
import boto3, botocore
try:
key = self.key_prefix + key
print_verbose(f"Get S3 Cache: key: {key}")
# Download the data from S3
cached_response = self.s3_client.get_object(
@ -400,6 +418,12 @@ class DualCache(BaseCache):
if self.redis_cache is not None:
self.redis_cache.flush_cache()
def delete_cache(self, key):
if self.in_memory_cache is not None:
self.in_memory_cache.delete_cache(key)
if self.redis_cache is not None:
self.redis_cache.delete_cache(key)
#### LiteLLM.Completion / Embedding Cache ####
class Cache:

View file

@ -21,6 +21,7 @@ from openai import (
APIConnectionError,
APIResponseValidationError,
UnprocessableEntityError,
PermissionDeniedError,
)
import httpx
@ -82,6 +83,17 @@ class Timeout(APITimeoutError): # type: ignore
) # Call the base class constructor with the parameters it needs
class PermissionDeniedError(PermissionDeniedError): # type:ignore
def __init__(self, message, llm_provider, model, response: httpx.Response):
self.status_code = 403
self.message = message
self.llm_provider = llm_provider
self.model = model
super().__init__(
self.message, response=response, body=None
) # Call the base class constructor with the parameters it needs
class RateLimitError(RateLimitError): # type: ignore
def __init__(self, message, llm_provider, model, response: httpx.Response):
self.status_code = 429

View file

@ -2,6 +2,7 @@
# On success, logs events to Helicone
import dotenv, os
import requests
import litellm
dotenv.load_dotenv() # Loading env variables using dotenv
import traceback
@ -56,6 +57,10 @@ class HeliconeLogger:
else "gpt-3.5-turbo"
)
provider_request = {"model": model, "messages": messages}
if isinstance(response_obj, litellm.EmbeddingResponse) or isinstance(
response_obj, litellm.ModelResponse
):
response_obj = response_obj.json()
if "claude" in model:
provider_request, response_obj = self.claude_mapping(

View file

@ -8,11 +8,13 @@ from datetime import datetime
dotenv.load_dotenv() # Loading env variables using dotenv
import traceback
from packaging.version import Version
from litellm._logging import verbose_logger
import litellm
class LangFuseLogger:
# Class variables or attributes
def __init__(self):
def __init__(self, langfuse_public_key=None, langfuse_secret=None):
try:
from langfuse import Langfuse
except Exception as e:
@ -20,8 +22,8 @@ class LangFuseLogger:
f"\033[91mLangfuse not installed, try running 'pip install langfuse' to fix this error: {e}\033[0m"
)
# Instance variables
self.secret_key = os.getenv("LANGFUSE_SECRET_KEY")
self.public_key = os.getenv("LANGFUSE_PUBLIC_KEY")
self.secret_key = langfuse_secret or os.getenv("LANGFUSE_SECRET_KEY")
self.public_key = langfuse_public_key or os.getenv("LANGFUSE_PUBLIC_KEY")
self.langfuse_host = os.getenv("LANGFUSE_HOST", "https://cloud.langfuse.com")
self.langfuse_release = os.getenv("LANGFUSE_RELEASE")
self.langfuse_debug = os.getenv("LANGFUSE_DEBUG")
@ -33,6 +35,26 @@ class LangFuseLogger:
debug=self.langfuse_debug,
)
if os.getenv("UPSTREAM_LANGFUSE_SECRET_KEY") is not None:
self.upstream_langfuse_secret_key = os.getenv(
"UPSTREAM_LANGFUSE_SECRET_KEY"
)
self.upstream_langfuse_public_key = os.getenv(
"UPSTREAM_LANGFUSE_PUBLIC_KEY"
)
self.upstream_langfuse_host = os.getenv("UPSTREAM_LANGFUSE_HOST")
self.upstream_langfuse_release = os.getenv("UPSTREAM_LANGFUSE_RELEASE")
self.upstream_langfuse_debug = os.getenv("UPSTREAM_LANGFUSE_DEBUG")
self.upstream_langfuse = Langfuse(
public_key=self.upstream_langfuse_public_key,
secret_key=self.upstream_langfuse_secret_key,
host=self.upstream_langfuse_host,
release=self.upstream_langfuse_release,
debug=self.upstream_langfuse_debug,
)
else:
self.upstream_langfuse = None
def log_event(
self, kwargs, response_obj, start_time, end_time, user_id, print_verbose
):
@ -62,11 +84,15 @@ class LangFuseLogger:
pass
# end of processing langfuse ########################
if kwargs.get("call_type", None) == "embedding" or isinstance(
response_obj, litellm.EmbeddingResponse
):
input = prompt
output = response_obj["data"]
else:
input = prompt
output = response_obj["choices"][0]["message"].json()
print_verbose(
f"OUTPUT IN LANGFUSE: {output}; original: {response_obj['choices'][0]['message']}"
)
print_verbose(f"OUTPUT IN LANGFUSE: {output}; original: {response_obj}")
self._log_langfuse_v2(
user_id,
metadata,
@ -77,6 +103,7 @@ class LangFuseLogger:
optional_params,
input,
response_obj,
print_verbose,
) if self._is_langfuse_v2() else self._log_langfuse_v1(
user_id,
metadata,
@ -93,6 +120,7 @@ class LangFuseLogger:
print_verbose(
f"Langfuse Layer Logging - final response object: {response_obj}"
)
verbose_logger.info(f"Langfuse Layer Logging - logging success")
except:
traceback.print_exc()
print_verbose(f"Langfuse Layer Error - {traceback.format_exc()}")
@ -165,18 +193,46 @@ class LangFuseLogger:
optional_params,
input,
response_obj,
print_verbose,
):
trace = self.Langfuse.trace(
name=metadata.get("generation_name", "litellm-completion"),
input=input,
output=output,
user_id=metadata.get("trace_user_id", user_id),
id=metadata.get("trace_id", None),
)
import langfuse
tags = []
supports_tags = Version(langfuse.version.__version__) >= Version("2.6.3")
supports_costs = Version(langfuse.version.__version__) >= Version("2.7.3")
print_verbose(f"Langfuse Layer Logging - logging to langfuse v2 ")
generation_name = metadata.get("generation_name", None)
if generation_name is None:
# just log `litellm-{call_type}` as the generation name
generation_name = f"litellm-{kwargs.get('call_type', 'completion')}"
trace_params = {
"name": generation_name,
"input": input,
"output": output,
"user_id": metadata.get("trace_user_id", user_id),
"id": metadata.get("trace_id", None),
}
cost = kwargs["response_cost"]
print_verbose(f"trace: {cost}")
if supports_tags:
for key, value in metadata.items():
tags.append(f"{key}:{value}")
if "cache_hit" in kwargs:
tags.append(f"cache_hit:{kwargs['cache_hit']}")
trace_params.update({"tags": tags})
trace = self.Langfuse.trace(**trace_params)
# get generation_id
generation_id = None
if response_obj.get("id", None) is not None:
generation_id = litellm.utils.get_logging_id(start_time, response_obj)
trace.generation(
name=metadata.get("generation_name", "litellm-completion"),
id=metadata.get("generation_id", None),
name=generation_name,
id=metadata.get("generation_id", generation_id),
startTime=start_time,
endTime=end_time,
model=kwargs["model"],
@ -186,6 +242,7 @@ class LangFuseLogger:
usage={
"prompt_tokens": response_obj["usage"]["prompt_tokens"],
"completion_tokens": response_obj["usage"]["completion_tokens"],
"total_cost": cost if supports_costs else None,
},
metadata=metadata,
)

View file

@ -13,19 +13,22 @@ class LangsmithLogger:
# Class variables or attributes
def __init__(self):
self.langsmith_api_key = os.getenv("LANGSMITH_API_KEY")
self.langsmith_project = os.getenv("LANGSMITH_PROJECT", "litellm-completion")
self.langsmith_default_run_name = os.getenv(
"LANGSMITH_DEFAULT_RUN_NAME", "LLMRun"
)
def log_event(self, kwargs, response_obj, start_time, end_time, print_verbose):
# Method definition
# inspired by Langsmith http api here: https://github.com/langchain-ai/langsmith-cookbook/blob/main/tracing-examples/rest/rest.ipynb
metadata = {}
if "litellm_params" in kwargs:
metadata = kwargs["litellm_params"].get("metadata", {})
metadata = kwargs.get('litellm_params', {}).get("metadata", {}) or {} # if metadata is None
# set project name and run_name for langsmith logging
# users can pass project_name and run name to litellm.completion()
# Example: litellm.completion(model, messages, metadata={"project_name": "my-litellm-project", "run_name": "my-langsmith-run"})
# if not set litellm will use default project_name = litellm-completion, run_name = LLMRun
project_name = metadata.get("project_name", "litellm-completion")
run_name = metadata.get("run_name", "LLMRun")
# if not set litellm will fallback to the environment variable LANGSMITH_PROJECT, then to the default project_name = litellm-completion, run_name = LLMRun
project_name = metadata.get("project_name", self.langsmith_project)
run_name = metadata.get("run_name", self.langsmith_default_run_name)
print_verbose(
f"Langsmith Logging - project_name: {project_name}, run_name {run_name}"
)

View file

@ -8,7 +8,7 @@ dotenv.load_dotenv() # Loading env variables using dotenv
import traceback
import datetime, subprocess, sys
import litellm, uuid
from litellm._logging import print_verbose
from litellm._logging import print_verbose, verbose_logger
class S3Logger:
@ -16,6 +16,7 @@ class S3Logger:
def __init__(
self,
s3_bucket_name=None,
s3_path=None,
s3_region_name=None,
s3_api_version=None,
s3_use_ssl=True,
@ -30,7 +31,9 @@ class S3Logger:
import boto3
try:
print_verbose("in init s3 logger")
verbose_logger.debug(
f"in init s3 logger - s3_callback_params {litellm.s3_callback_params}"
)
if litellm.s3_callback_params is not None:
# read in .env variables - example os.environ/AWS_BUCKET_NAME
@ -41,7 +44,7 @@ class S3Logger:
s3_bucket_name = litellm.s3_callback_params.get("s3_bucket_name")
s3_region_name = litellm.s3_callback_params.get("s3_region_name")
s3_api_version = litellm.s3_callback_params.get("s3_api_version")
s3_use_ssl = litellm.s3_callback_params.get("s3_use_ssl")
s3_use_ssl = litellm.s3_callback_params.get("s3_use_ssl", True)
s3_verify = litellm.s3_callback_params.get("s3_verify")
s3_endpoint_url = litellm.s3_callback_params.get("s3_endpoint_url")
s3_aws_access_key_id = litellm.s3_callback_params.get(
@ -57,6 +60,8 @@ class S3Logger:
# done reading litellm.s3_callback_params
self.bucket_name = s3_bucket_name
self.s3_path = s3_path
verbose_logger.debug(f"s3 logger using endpoint url {s3_endpoint_url}")
# Create an S3 client with custom endpoint URL
self.s3_client = boto3.client(
"s3",
@ -82,7 +87,9 @@ class S3Logger:
def log_event(self, kwargs, response_obj, start_time, end_time, print_verbose):
try:
print_verbose(f"s3 Logging - Enters logging function for model {kwargs}")
verbose_logger.debug(
f"s3 Logging - Enters logging function for model {kwargs}"
)
# construct payload to send to s3
# follows the same params as langfuse.py
@ -121,9 +128,22 @@ class S3Logger:
# non blocking if it can't cast to a str
pass
s3_file_name = litellm.utils.get_logging_id(start_time, payload) or ""
s3_object_key = (
payload["id"] + "-time=" + str(start_time)
(self.s3_path.rstrip("/") + "/" if self.s3_path else "")
+ start_time.strftime("%Y-%m-%d")
+ "/"
+ s3_file_name
) # we need the s3 key to include the time, so we log cache hits too
s3_object_key += ".json"
s3_object_download_filename = (
"time-"
+ start_time.strftime("%Y-%m-%dT%H-%M-%S-%f")
+ "_"
+ payload["id"]
+ ".json"
)
import json
@ -137,7 +157,8 @@ class S3Logger:
Body=payload,
ContentType="application/json",
ContentLanguage="en",
ContentDisposition=f'inline; filename="{key}.json"',
ContentDisposition=f'inline; filename="{s3_object_download_filename}"',
CacheControl="private, immutable, max-age=31536000, s-maxage=0",
)
print_verbose(f"Response from s3:{str(response)}")
@ -146,5 +167,5 @@ class S3Logger:
return response
except Exception as e:
traceback.print_exc()
print_verbose(f"s3 Layer Error - {str(e)}\n{traceback.format_exc()}")
verbose_logger.debug(f"s3 Layer Error - {str(e)}\n{traceback.format_exc()}")
pass

View file

@ -78,7 +78,7 @@ class AnthropicConfig:
# makes headers for API call
def validate_environment(api_key):
def validate_environment(api_key, user_headers):
if api_key is None:
raise ValueError(
"Missing Anthropic API Key - A call is being made to anthropic but no key is set either in the environment variables or via params"
@ -89,6 +89,8 @@ def validate_environment(api_key):
"content-type": "application/json",
"x-api-key": api_key,
}
if user_headers is not None and isinstance(user_headers, dict):
headers = {**headers, **user_headers}
return headers
@ -105,8 +107,9 @@ def completion(
optional_params=None,
litellm_params=None,
logger_fn=None,
headers={},
):
headers = validate_environment(api_key)
headers = validate_environment(api_key, headers)
if model in custom_prompt_dict:
# check if the model has a registered custom prompt
model_prompt_details = custom_prompt_dict[model]
@ -139,7 +142,11 @@ def completion(
logging_obj.pre_call(
input=prompt,
api_key=api_key,
additional_args={"complete_input_dict": data, "api_base": api_base},
additional_args={
"complete_input_dict": data,
"api_base": api_base,
"headers": headers,
},
)
## COMPLETION CALL

View file

@ -95,6 +95,26 @@ class AzureOpenAIConfig(OpenAIConfig):
)
def select_azure_base_url_or_endpoint(azure_client_params: dict):
# azure_client_params = {
# "api_version": api_version,
# "azure_endpoint": api_base,
# "azure_deployment": model,
# "http_client": litellm.client_session,
# "max_retries": max_retries,
# "timeout": timeout,
# }
azure_endpoint = azure_client_params.get("azure_endpoint", None)
if azure_endpoint is not None:
# see : https://github.com/openai/openai-python/blob/3d61ed42aba652b547029095a7eb269ad4e1e957/src/openai/lib/azure.py#L192
if "/openai/deployments" in azure_endpoint:
# this is base_url, not an azure_endpoint
azure_client_params["base_url"] = azure_endpoint
azure_client_params.pop("azure_endpoint")
return azure_client_params
class AzureChatCompletion(BaseLLM):
def __init__(self) -> None:
super().__init__()
@ -239,6 +259,9 @@ class AzureChatCompletion(BaseLLM):
"max_retries": max_retries,
"timeout": timeout,
}
azure_client_params = select_azure_base_url_or_endpoint(
azure_client_params=azure_client_params
)
if api_key is not None:
azure_client_params["api_key"] = api_key
elif azure_ad_token is not None:
@ -303,6 +326,9 @@ class AzureChatCompletion(BaseLLM):
"max_retries": max_retries,
"timeout": timeout,
}
azure_client_params = select_azure_base_url_or_endpoint(
azure_client_params=azure_client_params
)
if api_key is not None:
azure_client_params["api_key"] = api_key
elif azure_ad_token is not None:
@ -364,6 +390,9 @@ class AzureChatCompletion(BaseLLM):
"max_retries": max_retries,
"timeout": timeout,
}
azure_client_params = select_azure_base_url_or_endpoint(
azure_client_params=azure_client_params
)
if api_key is not None:
azure_client_params["api_key"] = api_key
elif azure_ad_token is not None:
@ -414,6 +443,9 @@ class AzureChatCompletion(BaseLLM):
"max_retries": data.pop("max_retries", 2),
"timeout": timeout,
}
azure_client_params = select_azure_base_url_or_endpoint(
azure_client_params=azure_client_params
)
if api_key is not None:
azure_client_params["api_key"] = api_key
elif azure_ad_token is not None:
@ -527,6 +559,9 @@ class AzureChatCompletion(BaseLLM):
"max_retries": max_retries,
"timeout": timeout,
}
azure_client_params = select_azure_base_url_or_endpoint(
azure_client_params=azure_client_params
)
if api_key is not None:
azure_client_params["api_key"] = api_key
elif azure_ad_token is not None:
@ -594,12 +629,23 @@ class AzureChatCompletion(BaseLLM):
client_session = litellm.aclient_session or httpx.AsyncClient(
transport=AsyncCustomHTTPTransport(),
)
openai_aclient = AsyncAzureOpenAI(
azure_client = AsyncAzureOpenAI(
http_client=client_session, **azure_client_params
)
else:
openai_aclient = client
response = await openai_aclient.images.generate(**data, timeout=timeout)
azure_client = client
## LOGGING
logging_obj.pre_call(
input=data["prompt"],
api_key=azure_client.api_key,
additional_args={
"headers": {"api_key": azure_client.api_key},
"api_base": azure_client._base_url._uri_reference,
"acompletion": True,
"complete_input_dict": data,
},
)
response = await azure_client.images.generate(**data, timeout=timeout)
stringified_response = response.model_dump()
## LOGGING
logging_obj.post_call(
@ -659,6 +705,9 @@ class AzureChatCompletion(BaseLLM):
"max_retries": max_retries,
"timeout": timeout,
}
azure_client_params = select_azure_base_url_or_endpoint(
azure_client_params=azure_client_params
)
if api_key is not None:
azure_client_params["api_key"] = api_key
elif azure_ad_token is not None:
@ -681,7 +730,7 @@ class AzureChatCompletion(BaseLLM):
input=prompt,
api_key=azure_client.api_key,
additional_args={
"headers": {"Authorization": f"Bearer {azure_client.api_key}"},
"headers": {"api_key": azure_client.api_key},
"api_base": azure_client._base_url._uri_reference,
"acompletion": False,
"complete_input_dict": data,
@ -753,6 +802,11 @@ class AzureChatCompletion(BaseLLM):
completion = None
if mode == "completion":
completion = await client.completions.with_raw_response.create(
model=model, # type: ignore
prompt=prompt, # type: ignore
)
elif mode == "chat":
if messages is None:
raise Exception("messages is not set")
completion = await client.chat.completions.with_raw_response.create(

View file

@ -2,9 +2,9 @@ import json, copy, types
import os
from enum import Enum
import time
from typing import Callable, Optional, Any, Union
from typing import Callable, Optional, Any, Union, List
import litellm
from litellm.utils import ModelResponse, get_secret, Usage
from litellm.utils import ModelResponse, get_secret, Usage, ImageResponse
from .prompt_templates.factory import prompt_factory, custom_prompt
import httpx
@ -282,12 +282,82 @@ class AmazonLlamaConfig:
}
class AmazonStabilityConfig:
"""
Reference: https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=stability.stable-diffusion-xl-v0
Supported Params for the Amazon / Stable Diffusion models:
- `cfg_scale` (integer): Default `7`. Between [ 0 .. 35 ]. How strictly the diffusion process adheres to the prompt text (higher values keep your image closer to your prompt)
- `seed` (float): Default: `0`. Between [ 0 .. 4294967295 ]. Random noise seed (omit this option or use 0 for a random seed)
- `steps` (array of strings): Default `30`. Between [ 10 .. 50 ]. Number of diffusion steps to run.
- `width` (integer): Default: `512`. multiple of 64 >= 128. Width of the image to generate, in pixels, in an increment divisible by 64.
Engine-specific dimension validation:
- SDXL Beta: must be between 128x128 and 512x896 (or 896x512); only one dimension can be greater than 512.
- SDXL v0.9: must be one of 1024x1024, 1152x896, 1216x832, 1344x768, 1536x640, 640x1536, 768x1344, 832x1216, or 896x1152
- SDXL v1.0: same as SDXL v0.9
- SD v1.6: must be between 320x320 and 1536x1536
- `height` (integer): Default: `512`. multiple of 64 >= 128. Height of the image to generate, in pixels, in an increment divisible by 64.
Engine-specific dimension validation:
- SDXL Beta: must be between 128x128 and 512x896 (or 896x512); only one dimension can be greater than 512.
- SDXL v0.9: must be one of 1024x1024, 1152x896, 1216x832, 1344x768, 1536x640, 640x1536, 768x1344, 832x1216, or 896x1152
- SDXL v1.0: same as SDXL v0.9
- SD v1.6: must be between 320x320 and 1536x1536
"""
cfg_scale: Optional[int] = None
seed: Optional[float] = None
steps: Optional[List[str]] = None
width: Optional[int] = None
height: Optional[int] = None
def __init__(
self,
cfg_scale: Optional[int] = None,
seed: Optional[float] = None,
steps: Optional[List[str]] = None,
width: Optional[int] = None,
height: Optional[int] = None,
) -> None:
locals_ = locals()
for key, value in locals_.items():
if key != "self" and value is not None:
setattr(self.__class__, key, value)
@classmethod
def get_config(cls):
return {
k: v
for k, v in cls.__dict__.items()
if not k.startswith("__")
and not isinstance(
v,
(
types.FunctionType,
types.BuiltinFunctionType,
classmethod,
staticmethod,
),
)
and v is not None
}
def init_bedrock_client(
region_name=None,
aws_access_key_id: Optional[str] = None,
aws_secret_access_key: Optional[str] = None,
aws_region_name: Optional[str] = None,
aws_bedrock_runtime_endpoint: Optional[str] = None,
aws_session_name: Optional[str] = None,
aws_role_name: Optional[str] = None,
timeout: Optional[int] = None,
):
# check for custom AWS_REGION_NAME and use it if not passed to init_bedrock_client
litellm_aws_region_name = get_secret("AWS_REGION_NAME", None)
@ -300,6 +370,8 @@ def init_bedrock_client(
aws_secret_access_key,
aws_region_name,
aws_bedrock_runtime_endpoint,
aws_session_name,
aws_role_name,
]
# Iterate over parameters and update if needed
@ -312,7 +384,11 @@ def init_bedrock_client(
aws_secret_access_key,
aws_region_name,
aws_bedrock_runtime_endpoint,
aws_session_name,
aws_role_name,
) = params_to_check
### SET REGION NAME
if region_name:
pass
elif aws_region_name:
@ -338,7 +414,31 @@ def init_bedrock_client(
import boto3
if aws_access_key_id != None:
config = boto3.session.Config(connect_timeout=timeout, read_timeout=timeout)
### CHECK STS ###
if aws_role_name is not None and aws_session_name is not None:
# use sts if role name passed in
sts_client = boto3.client(
"sts",
aws_access_key_id=aws_access_key_id,
aws_secret_access_key=aws_secret_access_key,
)
sts_response = sts_client.assume_role(
RoleArn=aws_role_name, RoleSessionName=aws_session_name
)
client = boto3.client(
service_name="bedrock-runtime",
aws_access_key_id=sts_response["Credentials"]["AccessKeyId"],
aws_secret_access_key=sts_response["Credentials"]["SecretAccessKey"],
aws_session_token=sts_response["Credentials"]["SessionToken"],
region_name=region_name,
endpoint_url=endpoint_url,
config=config,
)
elif aws_access_key_id is not None:
# uses auth params passed to completion
# aws_access_key_id is not None, assume user is trying to auth using litellm.completion
@ -348,6 +448,7 @@ def init_bedrock_client(
aws_secret_access_key=aws_secret_access_key,
region_name=region_name,
endpoint_url=endpoint_url,
config=config,
)
else:
# aws_access_key_id is None, assume user is trying to auth using env variables
@ -357,6 +458,7 @@ def init_bedrock_client(
service_name="bedrock-runtime",
region_name=region_name,
endpoint_url=endpoint_url,
config=config,
)
return client
@ -419,6 +521,8 @@ def completion(
aws_secret_access_key = optional_params.pop("aws_secret_access_key", None)
aws_access_key_id = optional_params.pop("aws_access_key_id", None)
aws_region_name = optional_params.pop("aws_region_name", None)
aws_role_name = optional_params.pop("aws_role_name", None)
aws_session_name = optional_params.pop("aws_session_name", None)
aws_bedrock_runtime_endpoint = optional_params.pop(
"aws_bedrock_runtime_endpoint", None
)
@ -433,9 +537,14 @@ def completion(
aws_secret_access_key=aws_secret_access_key,
aws_region_name=aws_region_name,
aws_bedrock_runtime_endpoint=aws_bedrock_runtime_endpoint,
aws_role_name=aws_role_name,
aws_session_name=aws_session_name,
)
model = model
modelId = (
optional_params.pop("model_id", None) or model
) # default to model if not passed
provider = model.split(".")[0]
prompt = convert_messages_to_prompt(
model, messages, provider, custom_prompt_dict
@ -498,6 +607,8 @@ def completion(
"textGenerationConfig": inference_params,
}
)
else:
data = json.dumps({})
## COMPLETION CALL
accept = "application/json"
@ -508,7 +619,7 @@ def completion(
request_str = f"""
response = client.invoke_model(
body={data},
modelId={model},
modelId={modelId},
accept=accept,
contentType=contentType
)
@ -523,7 +634,7 @@ def completion(
)
response = client.invoke_model(
body=data, modelId=model, accept=accept, contentType=contentType
body=data, modelId=modelId, accept=accept, contentType=contentType
)
response = response.get("body").read()
@ -533,7 +644,7 @@ def completion(
request_str = f"""
response = client.invoke_model_with_response_stream(
body={data},
modelId={model},
modelId={modelId},
accept=accept,
contentType=contentType
)
@ -548,7 +659,7 @@ def completion(
)
response = client.invoke_model_with_response_stream(
body=data, modelId=model, accept=accept, contentType=contentType
body=data, modelId=modelId, accept=accept, contentType=contentType
)
response = response.get("body")
return response
@ -557,7 +668,7 @@ def completion(
request_str = f"""
response = client.invoke_model(
body={data},
modelId={model},
modelId={modelId},
accept=accept,
contentType=contentType
)
@ -571,8 +682,12 @@ def completion(
},
)
response = client.invoke_model(
body=data, modelId=model, accept=accept, contentType=contentType
body=data, modelId=modelId, accept=accept, contentType=contentType
)
except client.exceptions.ValidationException as e:
if "The provided model identifier is invalid" in str(e):
raise BedrockError(status_code=404, message=str(e))
raise BedrockError(status_code=400, message=str(e))
except Exception as e:
raise BedrockError(status_code=500, message=str(e))
@ -617,9 +732,16 @@ def completion(
)
## CALCULATING USAGE - baseten charges on time, not tokens - have some mapping of cost here.
prompt_tokens = len(encoding.encode(prompt))
completion_tokens = len(
encoding.encode(model_response["choices"][0]["message"].get("content", ""))
prompt_tokens = response_metadata.get(
"x-amzn-bedrock-input-token-count", len(encoding.encode(prompt))
)
completion_tokens = response_metadata.get(
"x-amzn-bedrock-output-token-count",
len(
encoding.encode(
model_response["choices"][0]["message"].get("content", "")
)
),
)
model_response["created"] = int(time.time())
@ -630,6 +752,8 @@ def completion(
total_tokens=prompt_tokens + completion_tokens,
)
model_response.usage = usage
model_response._hidden_params["region_name"] = client.meta.region_name
print_verbose(f"model_response._hidden_params: {model_response._hidden_params}")
return model_response
except BedrockError as e:
exception_mapping_worked = True
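The modelId override above decouples the litellm model name from the identifier actually sent to invoke_model, which is what provisioned-throughput ARNs need. A hedged sketch, assuming model_id is forwarded through optional_params as the diff expects (the ARN is a placeholder):

import litellm

response = litellm.completion(
    model="bedrock/anthropic.claude-v2",  # used for prompt formatting / provider routing
    model_id="arn:aws:bedrock:us-east-1:123456789012:provisioned-model/abc123",  # placeholder ARN sent as modelId
    messages=[{"role": "user", "content": "ping"}],
)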
@ -651,6 +775,11 @@ def _embedding_func_single(
encoding=None,
logging_obj=None,
):
if isinstance(input, str) is False:
raise BedrockError(
message="Bedrock Embedding API input must be type str | List[str]",
status_code=400,
)
# logic for parsing in - calling - parsing out model embedding calls
## FORMAT EMBEDDING INPUT ##
provider = model.split(".")[0]
@ -658,6 +787,9 @@ def _embedding_func_single(
inference_params.pop(
"user", None
) # make sure user is not passed in for bedrock call
modelId = (
optional_params.pop("model_id", None) or model
) # default to model if not passed
if provider == "amazon":
input = input.replace(os.linesep, " ")
data = {"inputText": input, **inference_params}
@ -672,7 +804,7 @@ def _embedding_func_single(
request_str = f"""
response = client.invoke_model(
body={body},
modelId={model},
modelId={modelId},
accept="*/*",
contentType="application/json",
)""" # type: ignore
@ -680,14 +812,14 @@ def _embedding_func_single(
input=input,
api_key="", # boto3 is used for init.
additional_args={
"complete_input_dict": {"model": model, "texts": input},
"complete_input_dict": {"model": modelId, "texts": input},
"request_str": request_str,
},
)
try:
response = client.invoke_model(
body=body,
modelId=model,
modelId=modelId,
accept="*/*",
contentType="application/json",
)
@ -726,6 +858,8 @@ def embedding(
aws_secret_access_key = optional_params.pop("aws_secret_access_key", None)
aws_access_key_id = optional_params.pop("aws_access_key_id", None)
aws_region_name = optional_params.pop("aws_region_name", None)
aws_role_name = optional_params.pop("aws_role_name", None)
aws_session_name = optional_params.pop("aws_session_name", None)
aws_bedrock_runtime_endpoint = optional_params.pop(
"aws_bedrock_runtime_endpoint", None
)
@ -736,8 +870,11 @@ def embedding(
aws_secret_access_key=aws_secret_access_key,
aws_region_name=aws_region_name,
aws_bedrock_runtime_endpoint=aws_bedrock_runtime_endpoint,
aws_role_name=aws_role_name,
aws_session_name=aws_session_name,
)
if type(input) == str:
if isinstance(input, str):
## Embedding Call
embeddings = [
_embedding_func_single(
model,
@ -747,8 +884,8 @@ def embedding(
logging_obj=logging_obj,
)
]
else:
## Embedding Call
elif isinstance(input, list):
## Embedding Call - assuming this is a List[str]
embeddings = [
_embedding_func_single(
model,
@ -759,6 +896,12 @@ def embedding(
)
for i in input
] # [TODO]: make these parallel calls
else:
# enters this branch if input = int, ex. input=2
raise BedrockError(
message="Bedrock Embedding API input must be type str | List[str]",
status_code=400,
)
## Populate OpenAI compliant dictionary
embedding_response = []
@ -785,3 +928,112 @@ def embedding(
model_response.usage = usage
return model_response
def image_generation(
model: str,
prompt: str,
timeout=None,
logging_obj=None,
model_response=None,
optional_params=None,
aimg_generation=False,
):
"""
Bedrock Image Gen endpoint support
"""
### BOTO3 INIT ###
# pop aws_secret_access_key, aws_access_key_id, aws_region_name from kwargs, since completion calls fail with them
aws_secret_access_key = optional_params.pop("aws_secret_access_key", None)
aws_access_key_id = optional_params.pop("aws_access_key_id", None)
aws_region_name = optional_params.pop("aws_region_name", None)
aws_role_name = optional_params.pop("aws_role_name", None)
aws_session_name = optional_params.pop("aws_session_name", None)
aws_bedrock_runtime_endpoint = optional_params.pop(
"aws_bedrock_runtime_endpoint", None
)
# use passed in BedrockRuntime.Client if provided, otherwise create a new one
client = init_bedrock_client(
aws_access_key_id=aws_access_key_id,
aws_secret_access_key=aws_secret_access_key,
aws_region_name=aws_region_name,
aws_bedrock_runtime_endpoint=aws_bedrock_runtime_endpoint,
aws_role_name=aws_role_name,
aws_session_name=aws_session_name,
timeout=timeout,
)
### FORMAT IMAGE GENERATION INPUT ###
modelId = model
provider = model.split(".")[0]
inference_params = copy.deepcopy(optional_params)
inference_params.pop(
"user", None
) # make sure user is not passed in for bedrock call
data = {}
if provider == "stability":
prompt = prompt.replace(os.linesep, " ")
## LOAD CONFIG
config = litellm.AmazonStabilityConfig.get_config()
for k, v in config.items():
if (
k not in inference_params
): # completion(top_k=3) > anthropic_config(top_k=3) <- allows for dynamic variables to be passed in
inference_params[k] = v
data = {"text_prompts": [{"text": prompt, "weight": 1}], **inference_params}
else:
raise BedrockError(
status_code=422, message=f"Unsupported model={model}, passed in"
)
body = json.dumps(data).encode("utf-8")
## LOGGING
request_str = f"""
response = client.invoke_model(
body={body},
modelId={modelId},
accept="application/json",
contentType="application/json",
)""" # type: ignore
logging_obj.pre_call(
input=prompt,
api_key="", # boto3 is used for init.
additional_args={
"complete_input_dict": {"model": modelId, "texts": prompt},
"request_str": request_str,
},
)
try:
response = client.invoke_model(
body=body,
modelId=modelId,
accept="application/json",
contentType="application/json",
)
response_body = json.loads(response.get("body").read())
## LOGGING
logging_obj.post_call(
input=prompt,
api_key="",
additional_args={"complete_input_dict": data},
original_response=json.dumps(response_body),
)
except Exception as e:
raise BedrockError(
message=f"Image Generation Error with model {model}: {e}", status_code=500
)
### FORMAT RESPONSE TO OPENAI FORMAT ###
if response_body is None:
raise Exception("Error in response object format")
if model_response is None:
model_response = ImageResponse()
image_list: List = []
for artifact in response_body["artifacts"]:
image_dict = {"url": artifact["base64"]}
model_response.data = image_dict
return model_response
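With this handler in place, Stability models on Bedrock can be reached through litellm's image API. A small usage sketch (model id and region are illustrative, not taken from the diff):

import litellm

image = litellm.image_generation(
    prompt="a watercolor painting of a lighthouse at dawn",
    model="bedrock/stability.stable-diffusion-xl-v0",  # illustrative Stability model id
    aws_region_name="us-west-2",
)
print(image.data)  # the handler stores the base64 artifact dict on `data`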

View file

@ -43,7 +43,7 @@ class AsyncCustomHTTPTransport(httpx.AsyncHTTPTransport):
request=request,
)
time.sleep(int(response.headers.get("retry-after")) or 10)
await asyncio.sleep(int(response.headers.get("retry-after") or 10))
response = await super().handle_async_request(request)
await response.aread()
@ -95,7 +95,6 @@ class CustomHTTPTransport(httpx.HTTPTransport):
request.method = "GET"
response = super().handle_request(request)
response.read()
timeout_secs: int = 120
start_time = time.time()
while response.json()["status"] not in ["succeeded", "failed"]:
@ -112,11 +111,9 @@ class CustomHTTPTransport(httpx.HTTPTransport):
content=json.dumps(timeout).encode("utf-8"),
request=request,
)
time.sleep(int(response.headers.get("retry-after")) or 10)
time.sleep(int(response.headers.get("retry-after", None) or 10))
response = super().handle_request(request)
response.read()
if response.json()["status"] == "failed":
error_data = response.json()
return httpx.Response(
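The retry-after change above is an operator-precedence fix: previously int() ran before the or fallback, so a missing header passed None to int() and raised TypeError; now the 10-second fallback applies before the conversion. A tiny illustration:

headers = {}  # no "retry-after" header returned by the server

# old form: int(headers.get("retry-after")) or 10  -> raises TypeError when the header is absent
wait = int(headers.get("retry-after") or 10)  # new form: falls back to 10 before converting
assert wait == 10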

View file

@ -120,9 +120,7 @@ def completion(
## Load Config
inference_params = copy.deepcopy(optional_params)
inference_params.pop(
"stream", None
) # palm does not support streaming, so we handle this by fake streaming in main.py
stream = inference_params.pop("stream", None)
config = litellm.GeminiConfig.get_config()
for k, v in config.items():
if (
@ -139,10 +137,18 @@ def completion(
## COMPLETION CALL
try:
_model = genai.GenerativeModel(f"models/{model}")
if stream != True:
response = _model.generate_content(
contents=prompt,
generation_config=genai.types.GenerationConfig(**inference_params),
)
else:
response = _model.generate_content(
contents=prompt,
generation_config=genai.types.GenerationConfig(**inference_params),
stream=True,
)
return response
except Exception as e:
raise GeminiError(
message=str(e),
@ -184,9 +190,13 @@ def completion(
if hasattr(response, "candidates"):
original_response = f"response: {response.candidates}"
if "SAFETY" in original_response:
original_response += "\nThe candidate content was flagged for safety reasons."
original_response += (
"\nThe candidate content was flagged for safety reasons."
)
elif "RECITATION" in original_response:
original_response += "\nThe candidate content was flagged for recitation reasons."
original_response += (
"\nThe candidate content was flagged for recitation reasons."
)
raise GeminiError(
status_code=400,
message=f"No response received. Original response - {original_response}",
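Because stream=True is now forwarded to generate_content, Gemini streaming can be requested straight through litellm. A minimal sketch, assuming a Gemini API key is configured (e.g. via GEMINI_API_KEY):

import litellm

stream = litellm.completion(
    model="gemini/gemini-pro",
    messages=[{"role": "user", "content": "Write a haiku about diffs"}],
    stream=True,
)
for chunk in stream:
    print(chunk)  # OpenAI-style streaming chunks wrapped by CustomStreamWrapper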

View file

@ -145,8 +145,8 @@ def get_ollama_response(
): # completion(top_k=3) > cohere_config(top_k=3) <- allows for dynamic variables to be passed in
optional_params[k] = v
optional_params["stream"] = optional_params.get("stream", False)
data = {"model": model, "prompt": prompt, **optional_params}
stream = optional_params.pop("stream", False)
data = {"model": model, "prompt": prompt, "options": optional_params}
## LOGGING
logging_obj.pre_call(
input=None,
@ -159,7 +159,7 @@ def get_ollama_response(
},
)
if acompletion is True:
if optional_params.get("stream", False) == True:
if stream == True:
response = ollama_async_streaming(
url=url,
data=data,
@ -176,7 +176,7 @@ def get_ollama_response(
logging_obj=logging_obj,
)
return response
elif optional_params.get("stream", False) == True:
elif stream == True:
return ollama_completion_stream(url=url, data=data, logging_obj=logging_obj)
response = requests.post(url=f"{url}", json=data, timeout=litellm.request_timeout)
@ -254,7 +254,7 @@ async def ollama_async_streaming(url, data, model_response, encoding, logging_ob
) as response:
if response.status_code != 200:
raise OllamaError(
status_code=response.status_code, message=response.text
status_code=response.status_code, message=await response.aread()
)
streamwrapper = litellm.CustomStreamWrapper(
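Nesting the sampling parameters under "options" matches Ollama's /api/generate schema, where model parameters live in a sub-object rather than at the top level, and stream is now tracked separately after being popped. Roughly, the payload sent to the local server (default http://localhost:11434/api/generate) now looks like this; the values are illustrative:

data = {
    "model": "llama2",
    "prompt": "Hello",
    "options": {"temperature": 0.2, "top_k": 40},  # sampling params nested per the Ollama API
}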

View file

@ -145,8 +145,8 @@ def get_ollama_response(
): # completion(top_k=3) > cohere_config(top_k=3) <- allows for dynamic variables to be passed in
optional_params[k] = v
optional_params["stream"] = optional_params.get("stream", False)
data = {"model": model, "messages": messages, **optional_params}
stream = optional_params.pop("stream", False)
data = {"model": model, "messages": messages, "options": optional_params}
## LOGGING
logging_obj.pre_call(
input=None,
@ -159,7 +159,7 @@ def get_ollama_response(
},
)
if acompletion is True:
if optional_params.get("stream", False) == True:
if stream == True:
response = ollama_async_streaming(
url=url,
data=data,
@ -176,7 +176,7 @@ def get_ollama_response(
logging_obj=logging_obj,
)
return response
elif optional_params.get("stream", False) == True:
elif stream == True:
return ollama_completion_stream(url=url, data=data, logging_obj=logging_obj)
response = requests.post(
@ -220,8 +220,10 @@ def get_ollama_response(
model_response["choices"][0]["message"] = response_json["message"]
model_response["created"] = int(time.time())
model_response["model"] = "ollama/" + model
prompt_tokens = response_json["prompt_eval_count"] # type: ignore
completion_tokens = response_json["eval_count"]
prompt_tokens = response_json.get("prompt_eval_count", litellm.token_counter(messages=messages)) # type: ignore
completion_tokens = response_json.get(
"eval_count", litellm.token_counter(text=response_json["message"])
)
model_response["usage"] = litellm.Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
@ -320,8 +322,10 @@ async def ollama_acompletion(url, data, model_response, encoding, logging_obj):
model_response["choices"][0]["message"] = response_json["message"]
model_response["created"] = int(time.time())
model_response["model"] = "ollama/" + data["model"]
prompt_tokens = response_json["prompt_eval_count"] # type: ignore
completion_tokens = response_json["eval_count"]
prompt_tokens = response_json.get("prompt_eval_count", litellm.token_counter(messages=data["messages"])) # type: ignore
completion_tokens = response_json.get(
"eval_count", litellm.token_counter(text=response_json["message"])
)
model_response["usage"] = litellm.Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
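The .get(...) fallbacks keep usage populated when Ollama omits prompt_eval_count / eval_count in its response; in that case litellm's own token counter is used. A hedged sketch of the same pattern (here counting the content string rather than the whole message object):

import litellm

messages = [{"role": "user", "content": "hello"}]
response_json = {"message": {"role": "assistant", "content": "hi there"}}  # no eval counts returned

prompt_tokens = response_json.get("prompt_eval_count", litellm.token_counter(messages=messages))
completion_tokens = response_json.get(
    "eval_count", litellm.token_counter(text=response_json["message"]["content"])
)
print(prompt_tokens, completion_tokens)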

View file

@ -1,5 +1,5 @@
from typing import Optional, Union, Any
import types, time, json
import types, time, json, traceback
import httpx
from .base import BaseLLM
from litellm.utils import (
@ -221,6 +221,7 @@ class OpenAIChatCompletion(BaseLLM):
headers: Optional[dict] = None,
custom_prompt_dict: dict = {},
client=None,
organization: Optional[str] = None,
):
super().completion()
exception_mapping_worked = False
@ -254,6 +255,7 @@ class OpenAIChatCompletion(BaseLLM):
timeout=timeout,
client=client,
max_retries=max_retries,
organization=organization,
)
else:
return self.acompletion(
@ -266,6 +268,7 @@ class OpenAIChatCompletion(BaseLLM):
timeout=timeout,
client=client,
max_retries=max_retries,
organization=organization,
)
elif optional_params.get("stream", False):
return self.streaming(
@ -278,6 +281,7 @@ class OpenAIChatCompletion(BaseLLM):
timeout=timeout,
client=client,
max_retries=max_retries,
organization=organization,
)
else:
if not isinstance(max_retries, int):
@ -291,6 +295,7 @@ class OpenAIChatCompletion(BaseLLM):
http_client=litellm.client_session,
timeout=timeout,
max_retries=max_retries,
organization=organization,
)
else:
openai_client = client
@ -349,7 +354,7 @@ class OpenAIChatCompletion(BaseLLM):
if hasattr(e, "status_code"):
raise OpenAIError(status_code=e.status_code, message=str(e))
else:
raise OpenAIError(status_code=500, message=str(e))
raise OpenAIError(status_code=500, message=traceback.format_exc())
async def acompletion(
self,
@ -358,6 +363,7 @@ class OpenAIChatCompletion(BaseLLM):
timeout: float,
api_key: Optional[str] = None,
api_base: Optional[str] = None,
organization: Optional[str] = None,
client=None,
max_retries=None,
logging_obj=None,
@ -372,6 +378,7 @@ class OpenAIChatCompletion(BaseLLM):
http_client=litellm.aclient_session,
timeout=timeout,
max_retries=max_retries,
organization=organization,
)
else:
openai_aclient = client
@ -412,6 +419,7 @@ class OpenAIChatCompletion(BaseLLM):
model: str,
api_key: Optional[str] = None,
api_base: Optional[str] = None,
organization: Optional[str] = None,
client=None,
max_retries=None,
headers=None,
@ -423,6 +431,7 @@ class OpenAIChatCompletion(BaseLLM):
http_client=litellm.client_session,
timeout=timeout,
max_retries=max_retries,
organization=organization,
)
else:
openai_client = client
@ -454,6 +463,7 @@ class OpenAIChatCompletion(BaseLLM):
model: str,
api_key: Optional[str] = None,
api_base: Optional[str] = None,
organization: Optional[str] = None,
client=None,
max_retries=None,
headers=None,
@ -467,6 +477,7 @@ class OpenAIChatCompletion(BaseLLM):
http_client=litellm.aclient_session,
timeout=timeout,
max_retries=max_retries,
organization=organization,
)
else:
openai_aclient = client
@ -706,19 +717,34 @@ class OpenAIChatCompletion(BaseLLM):
## COMPLETION CALL
response = openai_client.images.generate(**data, timeout=timeout) # type: ignore
response = response.model_dump() # type: ignore
## LOGGING
logging_obj.post_call(
input=input,
input=prompt,
api_key=api_key,
additional_args={"complete_input_dict": data},
original_response=response,
)
# return response
return convert_to_model_response_object(response_object=response.model_dump(), model_response_object=model_response, response_type="image_generation") # type: ignore
return convert_to_model_response_object(response_object=response, model_response_object=model_response, response_type="image_generation") # type: ignore
except OpenAIError as e:
exception_mapping_worked = True
## LOGGING
logging_obj.post_call(
input=prompt,
api_key=api_key,
additional_args={"complete_input_dict": data},
original_response=str(e),
)
raise e
except Exception as e:
## LOGGING
logging_obj.post_call(
input=prompt,
api_key=api_key,
additional_args={"complete_input_dict": data},
original_response=str(e),
)
if hasattr(e, "status_code"):
raise OpenAIError(status_code=e.status_code, message=str(e))
else:
@ -733,14 +759,22 @@ class OpenAIChatCompletion(BaseLLM):
messages: Optional[list] = None,
input: Optional[list] = None,
prompt: Optional[str] = None,
organization: Optional[str] = None,
):
client = AsyncOpenAI(api_key=api_key, timeout=timeout)
client = AsyncOpenAI(
api_key=api_key, timeout=timeout, organization=organization
)
if model is None and mode != "image_generation":
raise Exception("model is not set")
completion = None
if mode == "completion":
completion = await client.completions.with_raw_response.create(
model=model, # type: ignore
prompt=prompt, # type: ignore
)
elif mode == "chat":
if messages is None:
raise Exception("messages is not set")
completion = await client.chat.completions.with_raw_response.create(
@ -889,7 +923,7 @@ class OpenAITextCompletion(BaseLLM):
headers=headers,
model_response=model_response,
model=model,
timeout=timeout
timeout=timeout,
)
else:
return self.acompletion(api_base=api_base, data=data, headers=headers, model_response=model_response, prompt=prompt, api_key=api_key, logging_obj=logging_obj, model=model, timeout=timeout) # type: ignore
@ -901,14 +935,11 @@ class OpenAITextCompletion(BaseLLM):
headers=headers,
model_response=model_response,
model=model,
timeout=timeout
timeout=timeout,
)
else:
response = httpx.post(
url=f"{api_base}",
json=data,
headers=headers,
timeout=timeout
url=f"{api_base}", json=data, headers=headers, timeout=timeout
)
if response.status_code != 200:
raise OpenAIError(
@ -944,7 +975,7 @@ class OpenAITextCompletion(BaseLLM):
prompt: str,
api_key: str,
model: str,
timeout: float
timeout: float,
):
async with httpx.AsyncClient(timeout=timeout) as client:
try:
@ -986,7 +1017,7 @@ class OpenAITextCompletion(BaseLLM):
headers: dict,
model_response: ModelResponse,
model: str,
timeout: float
timeout: float,
):
with httpx.stream(
url=f"{api_base}",
@ -1017,7 +1048,7 @@ class OpenAITextCompletion(BaseLLM):
headers: dict,
model_response: ModelResponse,
model: str,
timeout: float
timeout: float,
):
client = httpx.AsyncClient()
async with client.stream(
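Threading organization through these constructors allows per-call organization selection instead of relying on the global litellm.organization. A minimal sketch (the org id is a placeholder):

import litellm

response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hello"}],
    organization="org-XXXXXXXXXXXX",  # placeholder OpenAI organization id
)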

View file

@ -99,12 +99,16 @@ def ollama_pt(
def mistral_instruct_pt(messages):
# Following the Mistral example's https://huggingface.co/docs/transformers/main/chat_templating
prompt = custom_prompt(
initial_prompt_value="<s>",
role_dict={
"system": {"pre_message": "[INST]", "post_message": "[/INST]"},
"user": {"pre_message": "[INST]", "post_message": "[/INST]"},
"assistant": {"pre_message": "[INST]", "post_message": "[/INST]"},
"system": {
"pre_message": "[INST] \n",
"post_message": " [/INST]\n",
},
"user": {"pre_message": "[INST] ", "post_message": " [/INST]\n"},
"assistant": {"pre_message": " ", "post_message": " "},
},
final_prompt_value="</s>",
messages=messages,
@ -372,6 +376,7 @@ def anthropic_pt(
You can "put words in Claude's mouth" by ending with an assistant message.
See: https://docs.anthropic.com/claude/docs/put-words-in-claudes-mouth
"""
class AnthropicConstants(Enum):
HUMAN_PROMPT = "\n\nHuman: "
AI_PROMPT = "\n\nAssistant: "
@ -399,27 +404,30 @@ def _load_image_from_url(image_url):
try:
from PIL import Image
except:
raise Exception("gemini image conversion failed please run `pip install Pillow`")
raise Exception(
"gemini image conversion failed please run `pip install Pillow`"
)
from io import BytesIO
try:
# Send a GET request to the image URL
response = requests.get(image_url)
response.raise_for_status() # Raise an exception for HTTP errors
# Check the response's content type to ensure it is an image
content_type = response.headers.get('content-type')
if not content_type or 'image' not in content_type:
raise ValueError(f"URL does not point to a valid image (content-type: {content_type})")
content_type = response.headers.get("content-type")
if not content_type or "image" not in content_type:
raise ValueError(
f"URL does not point to a valid image (content-type: {content_type})"
)
# Load the image from the response content
return Image.open(BytesIO(response.content))
except requests.RequestException as e:
print(f"Request failed: {e}")
except UnidentifiedImageError:
print("Cannot identify image file (it may not be a supported image format or might be corrupted).")
except ValueError as e:
print(e)
raise Exception(f"Request failed: {e}")
except Exception as e:
raise e
def _gemini_vision_convert_messages(messages: list):
@ -437,10 +445,11 @@ def _gemini_vision_convert_messages(messages: list):
try:
from PIL import Image
except:
raise Exception("gemini image conversion failed please run `pip install Pillow`")
raise Exception(
"gemini image conversion failed please run `pip install Pillow`"
)
try:
# given messages for gpt-4 vision, convert them for gemini
# https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/getting-started/intro_gemini_python.ipynb
prompt = ""
@ -589,7 +598,7 @@ def prompt_factory(
if custom_llm_provider == "ollama":
return ollama_pt(model=model, messages=messages)
elif custom_llm_provider == "anthropic":
if any(_ in model for _ in ["claude-2.1","claude-v2:1"]):
if any(_ in model for _ in ["claude-2.1", "claude-v2:1"]):
return claude_2_1_pt(messages=messages)
else:
return anthropic_pt(messages=messages)
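For reference, the reworked Mistral template wraps system and user turns in [INST] ... [/INST] and leaves assistant turns bare between <s> and </s>. Assuming custom_prompt concatenates pre_message + content + post_message per message between the initial and final values, a short exchange renders roughly as shown in the comment below:

from litellm.llms.prompt_templates.factory import mistral_instruct_pt

messages = [
    {"role": "user", "content": "Hi"},
    {"role": "assistant", "content": "Hello!"},
    {"role": "user", "content": "How are you?"},
]
print(mistral_instruct_pt(messages))
# approximate output: <s>[INST] Hi [/INST]\n Hello! [INST] How are you? [/INST]\n</s>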

View file

@ -25,6 +25,46 @@ class SagemakerError(Exception):
) # Call the base class constructor with the parameters it needs
import io
import json
class TokenIterator:
def __init__(self, stream):
self.byte_iterator = iter(stream)
self.buffer = io.BytesIO()
self.read_pos = 0
self.end_of_data = False
def __iter__(self):
return self
def __next__(self):
try:
while True:
self.buffer.seek(self.read_pos)
line = self.buffer.readline()
if line and line[-1] == ord("\n"):
response_obj = {"text": "", "is_finished": False}
self.read_pos += len(line) + 1
full_line = line[:-1].decode("utf-8")
line_data = json.loads(full_line.lstrip("data:").rstrip("/n"))
if line_data.get("generated_text", None) is not None:
self.end_of_data = True
response_obj["is_finished"] = True
response_obj["text"] = line_data["token"]["text"]
return response_obj
chunk = next(self.byte_iterator)
self.buffer.seek(0, io.SEEK_END)
self.buffer.write(chunk["PayloadPart"]["Bytes"])
except StopIteration as e:
if self.end_of_data == True:
raise e # Re-raise StopIteration
else:
self.end_of_data = True
return "data: [DONE]"
class SagemakerConfig:
"""
Reference: https://d-uuwbxj1u4cnu.studio.us-west-2.sagemaker.aws/jupyter/default/lab/workspaces/auto-q/tree/DemoNotebooks/meta-textgeneration-llama-2-7b-SDK_1.ipynb
@ -121,7 +161,6 @@ def completion(
# pop streaming if it's in the optional params as 'stream' raises an error with sagemaker
inference_params = deepcopy(optional_params)
inference_params.pop("stream", None)
## Load Config
config = litellm.SagemakerConfig.get_config()
@ -152,6 +191,28 @@ def completion(
hf_model_name or model
) # pass in hf model name for pulling it's prompt template - (e.g. `hf_model_name="meta-llama/Llama-2-7b-chat-hf` applies the llama2 chat template to the prompt)
prompt = prompt_factory(model=hf_model_name, messages=messages)
stream = inference_params.pop("stream", None)
if stream == True:
data = json.dumps(
{"inputs": prompt, "parameters": inference_params, "stream": True}
).encode("utf-8")
## LOGGING
request_str = f"""
response = client.invoke_endpoint_with_response_stream(
EndpointName={model},
ContentType="application/json",
Body={data},
CustomAttributes="accept_eula=true",
)
""" # type: ignore
response = client.invoke_endpoint_with_response_stream(
EndpointName=model,
ContentType="application/json",
Body=data,
CustomAttributes="accept_eula=true",
)
return response["Body"]
data = json.dumps({"inputs": prompt, "parameters": inference_params}).encode(
"utf-8"
@ -184,7 +245,15 @@ def completion(
CustomAttributes="accept_eula=true",
)
except Exception as e:
raise SagemakerError(status_code=500, message=f"{str(e)}")
status_code = (
getattr(e, "response", {})
.get("ResponseMetadata", {})
.get("HTTPStatusCode", 500)
)
error_message = (
getattr(e, "response", {}).get("Error", {}).get("Message", str(e))
)
raise SagemakerError(status_code=status_code, message=error_message)
response = response["Body"].read().decode("utf8")
## LOGGING
@ -358,7 +427,15 @@ def embedding(
CustomAttributes="accept_eula=true",
)
except Exception as e:
raise SagemakerError(status_code=500, message=f"{str(e)}")
status_code = (
getattr(e, "response", {})
.get("ResponseMetadata", {})
.get("HTTPStatusCode", 500)
)
error_message = (
getattr(e, "response", {}).get("Error", {}).get("Message", str(e))
)
raise SagemakerError(status_code=status_code, message=error_message)
response = json.loads(response["Body"].read().decode("utf8"))
## LOGGING
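With invoke_endpoint_with_response_stream plus the TokenIterator above, SageMaker responses can be streamed token-by-token rather than fake-streamed from a completed response. A hedged usage sketch (the endpoint name is a placeholder):

import litellm

stream = litellm.completion(
    model="sagemaker/my-llama2-endpoint",  # placeholder endpoint name
    messages=[{"role": "user", "content": "Tell me a joke"}],
    stream=True,
)
for chunk in stream:
    print(chunk)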

View file

View file

@ -3,7 +3,7 @@ import json
from enum import Enum
import requests
import time
from typing import Callable, Optional
from typing import Callable, Optional, Union
from litellm.utils import ModelResponse, Usage, CustomStreamWrapper
import litellm, uuid
import httpx
@ -75,6 +75,41 @@ class VertexAIConfig:
}
import asyncio
class TextStreamer:
"""
Fake streaming iterator for Vertex AI Model Garden calls
"""
def __init__(self, text):
self.text = text.split() # let's assume words as a streaming unit
self.index = 0
def __iter__(self):
return self
def __next__(self):
if self.index < len(self.text):
result = self.text[self.index]
self.index += 1
return result
else:
raise StopIteration
def __aiter__(self):
return self
async def __anext__(self):
if self.index < len(self.text):
result = self.text[self.index]
self.index += 1
return result
else:
raise StopAsyncIteration # once we run out of data to stream, we raise this error
def _get_image_bytes_from_url(image_url: str) -> bytes:
try:
response = requests.get(image_url)
@ -216,6 +251,14 @@ def completion(
status_code=400,
message="vertexai import failed please run `pip install google-cloud-aiplatform`",
)
if not (
hasattr(vertexai, "preview") or hasattr(vertexai.preview, "language_models")
):
raise VertexAIError(
status_code=400,
message="""Upgrade vertex ai. Run `pip install "google-cloud-aiplatform>=1.38"`""",
)
try:
from vertexai.preview.language_models import (
ChatModel,
@ -228,9 +271,17 @@ def completion(
Part,
GenerationConfig,
)
from google.cloud import aiplatform
from google.protobuf import json_format # type: ignore
from google.protobuf.struct_pb2 import Value # type: ignore
from google.cloud.aiplatform_v1beta1.types import content as gapic_content_types
import google.auth
vertexai.init(project=vertex_project, location=vertex_location)
## Load credentials with the correct quota project ref: https://github.com/googleapis/python-aiplatform/issues/2557#issuecomment-1709284744
creds, _ = google.auth.default(quota_project_id=vertex_project)
vertexai.init(
project=vertex_project, location=vertex_location, credentials=creds
)
## Load Config
config = litellm.VertexAIConfig.get_config()
@ -264,6 +315,11 @@ def completion(
request_str = ""
response_obj = None
async_client = None
instances = None
client_options = {
"api_endpoint": f"{vertex_location}-aiplatform.googleapis.com"
}
if (
model in litellm.vertex_language_models
or model in litellm.vertex_vision_models
@ -283,39 +339,51 @@ def completion(
llm_model = CodeGenerationModel.from_pretrained(model)
mode = "text"
request_str += f"llm_model = CodeGenerationModel.from_pretrained({model})\n"
else: # vertex_code_llm_models
elif model in litellm.vertex_code_chat_models: # vertex_code_llm_models
llm_model = CodeChatModel.from_pretrained(model)
mode = "chat"
request_str += f"llm_model = CodeChatModel.from_pretrained({model})\n"
else: # assume vertex model garden
client = aiplatform.gapic.PredictionServiceClient(
client_options=client_options
)
if acompletion == True: # [TODO] expand support to vertex ai chat + text models
instances = [optional_params]
instances[0]["prompt"] = prompt
instances = [
json_format.ParseDict(instance_dict, Value())
for instance_dict in instances
]
llm_model = client.endpoint_path(
project=vertex_project, location=vertex_location, endpoint=model
)
mode = "custom"
request_str += f"llm_model = client.endpoint_path(project={vertex_project}, location={vertex_location}, endpoint={model})\n"
if acompletion == True:
data = {
"llm_model": llm_model,
"mode": mode,
"prompt": prompt,
"logging_obj": logging_obj,
"request_str": request_str,
"model": model,
"model_response": model_response,
"encoding": encoding,
"messages": messages,
"print_verbose": print_verbose,
"client_options": client_options,
"instances": instances,
"vertex_location": vertex_location,
"vertex_project": vertex_project,
**optional_params,
}
if optional_params.get("stream", False) is True:
# async streaming
return async_streaming(
llm_model=llm_model,
mode=mode,
prompt=prompt,
logging_obj=logging_obj,
request_str=request_str,
model=model,
model_response=model_response,
messages=messages,
print_verbose=print_verbose,
**optional_params,
)
return async_completion(
llm_model=llm_model,
mode=mode,
prompt=prompt,
logging_obj=logging_obj,
request_str=request_str,
model=model,
model_response=model_response,
encoding=encoding,
messages=messages,
print_verbose=print_verbose,
**optional_params,
)
return async_streaming(**data)
return async_completion(**data)
if mode == "vision":
print_verbose("\nMaking VertexAI Gemini Pro Vision Call")
@ -460,7 +528,36 @@ def completion(
},
)
completion_response = llm_model.predict(prompt, **optional_params).text
elif mode == "custom":
"""
Vertex AI Model Garden
"""
request_str += (
f"client.predict(endpoint={llm_model}, instances={instances})\n"
)
## LOGGING
logging_obj.pre_call(
input=prompt,
api_key=None,
additional_args={
"complete_input_dict": optional_params,
"request_str": request_str,
},
)
response = client.predict(
endpoint=llm_model,
instances=instances,
).predictions
completion_response = response[0]
if (
isinstance(completion_response, str)
and "\nOutput:\n" in completion_response
):
completion_response = completion_response.split("\nOutput:\n", 1)[1]
if "stream" in optional_params and optional_params["stream"] == True:
response = TextStreamer(completion_response)
return response
## LOGGING
logging_obj.post_call(
input=prompt, api_key=None, original_response=completion_response
@ -528,6 +625,10 @@ async def async_completion(
encoding=None,
messages=None,
print_verbose=None,
client_options=None,
instances=None,
vertex_project=None,
vertex_location=None,
**optional_params,
):
"""
@ -616,7 +717,43 @@ async def async_completion(
)
response_obj = await llm_model.predict_async(prompt, **optional_params)
completion_response = response_obj.text
elif mode == "custom":
"""
Vertex AI Model Garden
"""
from google.cloud import aiplatform
async_client = aiplatform.gapic.PredictionServiceAsyncClient(
client_options=client_options
)
llm_model = async_client.endpoint_path(
project=vertex_project, location=vertex_location, endpoint=model
)
request_str += (
f"client.predict(endpoint={llm_model}, instances={instances})\n"
)
## LOGGING
logging_obj.pre_call(
input=prompt,
api_key=None,
additional_args={
"complete_input_dict": optional_params,
"request_str": request_str,
},
)
response_obj = await async_client.predict(
endpoint=llm_model,
instances=instances,
)
response = response_obj.predictions
completion_response = response[0]
if (
isinstance(completion_response, str)
and "\nOutput:\n" in completion_response
):
completion_response = completion_response.split("\nOutput:\n", 1)[1]
## LOGGING
logging_obj.post_call(
input=prompt, api_key=None, original_response=completion_response
@ -646,14 +783,12 @@ async def async_completion(
# init prompt tokens
# this block attempts to get usage from response_obj if it exists, if not it uses the litellm token counter
prompt_tokens, completion_tokens, total_tokens = 0, 0, 0
if response_obj is not None:
if hasattr(response_obj, "usage_metadata") and hasattr(
response_obj.usage_metadata, "prompt_token_count"
if response_obj is not None and (
hasattr(response_obj, "usage_metadata")
and hasattr(response_obj.usage_metadata, "prompt_token_count")
):
prompt_tokens = response_obj.usage_metadata.prompt_token_count
completion_tokens = (
response_obj.usage_metadata.candidates_token_count
)
completion_tokens = response_obj.usage_metadata.candidates_token_count
else:
prompt_tokens = len(encoding.encode(prompt))
completion_tokens = len(
@ -682,8 +817,13 @@ async def async_streaming(
model_response: ModelResponse,
logging_obj=None,
request_str=None,
encoding=None,
messages=None,
print_verbose=None,
client_options=None,
instances=None,
vertex_project=None,
vertex_location=None,
**optional_params,
):
"""
@ -752,17 +892,198 @@ async def async_streaming(
},
)
response = llm_model.predict_streaming_async(prompt, **optional_params)
elif mode == "custom":
from google.cloud import aiplatform
async_client = aiplatform.gapic.PredictionServiceAsyncClient(
client_options=client_options
)
llm_model = async_client.endpoint_path(
project=vertex_project, location=vertex_location, endpoint=model
)
request_str += f"client.predict(endpoint={llm_model}, instances={instances})\n"
## LOGGING
logging_obj.pre_call(
input=prompt,
api_key=None,
additional_args={
"complete_input_dict": optional_params,
"request_str": request_str,
},
)
response_obj = await async_client.predict(
endpoint=llm_model,
instances=instances,
)
response = response_obj.predictions
completion_response = response[0]
if (
isinstance(completion_response, str)
and "\nOutput:\n" in completion_response
):
completion_response = completion_response.split("\nOutput:\n", 1)[1]
if "stream" in optional_params and optional_params["stream"] == True:
response = TextStreamer(completion_response)
streamwrapper = CustomStreamWrapper(
completion_stream=response,
model=model,
custom_llm_provider="vertex_ai",
logging_obj=logging_obj,
)
async for transformed_chunk in streamwrapper:
yield transformed_chunk
return streamwrapper
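The mode == "custom" branches above add Vertex AI Model Garden support: a model that is not a first-party Vertex model is treated as an endpoint id and called via PredictionServiceClient.predict, with TextStreamer faking a stream from the single prediction. A hedged call sketch (project, location and endpoint id are placeholders):

import litellm

litellm.vertex_project = "my-gcp-project"   # placeholder GCP project
litellm.vertex_location = "us-central1"

response = litellm.completion(
    model="vertex_ai/1234567890123456789",  # placeholder Model Garden endpoint id
    messages=[{"role": "user", "content": "hello"}],
)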
def embedding():
def embedding(
model: str,
input: Union[list, str],
api_key: Optional[str] = None,
logging_obj=None,
model_response=None,
optional_params=None,
encoding=None,
vertex_project=None,
vertex_location=None,
aembedding=False,
):
# logic for parsing in - calling - parsing out model embedding calls
pass
try:
import vertexai
except:
raise VertexAIError(
status_code=400,
message="vertexai import failed please run `pip install google-cloud-aiplatform`",
)
from vertexai.language_models import TextEmbeddingModel
import google.auth
## Load credentials with the correct quota project ref: https://github.com/googleapis/python-aiplatform/issues/2557#issuecomment-1709284744
try:
creds, _ = google.auth.default(quota_project_id=vertex_project)
vertexai.init(
project=vertex_project, location=vertex_location, credentials=creds
)
except Exception as e:
raise VertexAIError(status_code=401, message=str(e))
if isinstance(input, str):
input = [input]
try:
llm_model = TextEmbeddingModel.from_pretrained(model)
except Exception as e:
raise VertexAIError(status_code=422, message=str(e))
if aembedding == True:
return async_embedding(
model=model,
client=llm_model,
input=input,
logging_obj=logging_obj,
model_response=model_response,
optional_params=optional_params,
encoding=encoding,
)
request_str = f"""embeddings = llm_model.get_embeddings({input})"""
## LOGGING PRE-CALL
logging_obj.pre_call(
input=input,
api_key=None,
additional_args={
"complete_input_dict": optional_params,
"request_str": request_str,
},
)
try:
embeddings = llm_model.get_embeddings(input)
except Exception as e:
raise VertexAIError(status_code=500, message=str(e))
## LOGGING POST-CALL
logging_obj.post_call(input=input, api_key=None, original_response=embeddings)
## Populate OpenAI compliant dictionary
embedding_response = []
for idx, embedding in enumerate(embeddings):
embedding_response.append(
{
"object": "embedding",
"index": idx,
"embedding": embedding.values,
}
)
model_response["object"] = "list"
model_response["data"] = embedding_response
model_response["model"] = model
input_tokens = 0
input_str = "".join(input)
input_tokens += len(encoding.encode(input_str))
usage = Usage(
prompt_tokens=input_tokens, completion_tokens=0, total_tokens=input_tokens
)
model_response.usage = usage
return model_response
async def async_embedding(
model: str,
input: Union[list, str],
logging_obj=None,
model_response=None,
optional_params=None,
encoding=None,
client=None,
):
"""
Async embedding implementation
"""
request_str = f"""embeddings = llm_model.get_embeddings({input})"""
## LOGGING PRE-CALL
logging_obj.pre_call(
input=input,
api_key=None,
additional_args={
"complete_input_dict": optional_params,
"request_str": request_str,
},
)
try:
embeddings = await client.get_embeddings_async(input)
except Exception as e:
raise VertexAIError(status_code=500, message=str(e))
## LOGGING POST-CALL
logging_obj.post_call(input=input, api_key=None, original_response=embeddings)
## Populate OpenAI compliant dictionary
embedding_response = []
for idx, embedding in enumerate(embeddings):
embedding_response.append(
{
"object": "embedding",
"index": idx,
"embedding": embedding.values,
}
)
model_response["object"] = "list"
model_response["data"] = embedding_response
model_response["model"] = model
input_tokens = 0
input_str = "".join(input)
input_tokens += len(encoding.encode(input_str))
usage = Usage(
prompt_tokens=input_tokens, completion_tokens=0, total_tokens=input_tokens
)
model_response.usage = usage
return model_response
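The new embedding path initializes vertexai with quota-project-aware credentials and calls TextEmbeddingModel.get_embeddings (or its async variant when aembedding is set). A minimal hedged sketch (project, location and model name are illustrative):

import litellm

litellm.vertex_project = "my-gcp-project"   # placeholder GCP project
litellm.vertex_location = "us-central1"

resp = litellm.embedding(
    model="vertex_ai/textembedding-gecko",
    input=["good morning", "good night"],
)
print(len(resp["data"]), len(resp["data"][0]["embedding"]))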

View file

@ -15,7 +15,7 @@ import dotenv, traceback, random, asyncio, time, contextvars
from copy import deepcopy
import httpx
import litellm
from ._logging import verbose_logger
from litellm import ( # type: ignore
client,
exception_type,
@ -83,6 +83,7 @@ from litellm.utils import (
TextCompletionResponse,
TextChoices,
EmbeddingResponse,
ImageResponse,
read_config_args,
Choices,
Message,
@ -275,14 +276,10 @@ async def acompletion(
else:
# Call the synchronous function using run_in_executor
response = await loop.run_in_executor(None, func_with_context) # type: ignore
# if kwargs.get("stream", False): # return an async generator
# return _async_streaming(
# response=response,
# model=model,
# custom_llm_provider=custom_llm_provider,
# args=args,
# )
# else:
if isinstance(response, CustomStreamWrapper):
response.set_logging_event_loop(
loop=loop
) # sets the logging event loop if the user does sync streaming (e.g. on proxy for sagemaker calls)
return response
except Exception as e:
custom_llm_provider = custom_llm_provider or "openai"
@ -345,6 +342,18 @@ def mock_completion(
model_response["choices"][0]["message"]["content"] = mock_response
model_response["created"] = int(time.time())
model_response["model"] = model
model_response.usage = Usage(
prompt_tokens=10, completion_tokens=20, total_tokens=30
)
try:
_, custom_llm_provider, _, _ = litellm.utils.get_llm_provider(model=model)
model_response._hidden_params["custom_llm_provider"] = custom_llm_provider
except:
# don't let setting a hidden param block a mock_response
pass
return model_response
except:
@ -444,9 +453,12 @@ def completion(
num_retries = kwargs.get("num_retries", None) ## deprecated
max_retries = kwargs.get("max_retries", None)
context_window_fallback_dict = kwargs.get("context_window_fallback_dict", None)
organization = kwargs.get("organization", None)
### CUSTOM MODEL COST ###
input_cost_per_token = kwargs.get("input_cost_per_token", None)
output_cost_per_token = kwargs.get("output_cost_per_token", None)
input_cost_per_second = kwargs.get("input_cost_per_second", None)
output_cost_per_second = kwargs.get("output_cost_per_second", None)
### CUSTOM PROMPT TEMPLATE ###
initial_prompt_value = kwargs.get("initial_prompt_value", None)
roles = kwargs.get("roles", None)
@ -524,6 +536,8 @@ def completion(
"tpm",
"input_cost_per_token",
"output_cost_per_token",
"input_cost_per_second",
"output_cost_per_second",
"hf_model_name",
"model_info",
"proxy_server_request",
@ -536,10 +550,6 @@ def completion(
non_default_params = {
k: v for k, v in kwargs.items() if k not in default_params
} # model-specific params - pass them straight to the model/provider
if mock_response:
return mock_completion(
model, messages, stream=stream, mock_response=mock_response
)
if timeout is None:
timeout = (
kwargs.get("request_timeout", None) or 600
@ -577,15 +587,45 @@ def completion(
api_base=api_base,
api_key=api_key,
)
if model_response is not None and hasattr(model_response, "_hidden_params"):
model_response._hidden_params["custom_llm_provider"] = custom_llm_provider
model_response._hidden_params["region_name"] = kwargs.get(
"aws_region_name", None
) # support region-based pricing for bedrock
### REGISTER CUSTOM MODEL PRICING -- IF GIVEN ###
if input_cost_per_token is not None and output_cost_per_token is not None:
print_verbose(f"Registering model={model} in model cost map")
litellm.register_model(
{
f"{custom_llm_provider}/{model}": {
"input_cost_per_token": input_cost_per_token,
"output_cost_per_token": output_cost_per_token,
"litellm_provider": custom_llm_provider,
},
model: {
"input_cost_per_token": input_cost_per_token,
"output_cost_per_token": output_cost_per_token,
"litellm_provider": custom_llm_provider,
},
}
)
elif (
input_cost_per_second is not None
): # time based pricing just needs cost in place
output_cost_per_second = output_cost_per_second or 0.0
litellm.register_model(
{
f"{custom_llm_provider}/{model}": {
"input_cost_per_second": input_cost_per_second,
"output_cost_per_second": output_cost_per_second,
"litellm_provider": custom_llm_provider,
},
model: {
"input_cost_per_second": input_cost_per_second,
"output_cost_per_second": output_cost_per_second,
"litellm_provider": custom_llm_provider,
},
}
)
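These blocks let a caller register ad-hoc pricing for a model missing from litellm's cost map, per token or per second, stored under both the provider-prefixed and bare model names so cost tracking can resolve either. A hedged sketch of the calling convention (prices are illustrative):

import litellm

response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hi"}],
    input_cost_per_token=0.000001,   # illustrative custom prices
    output_cost_per_token=0.000002,
)
# time-based variant: litellm.completion(..., input_cost_per_second=0.0004, output_cost_per_second=0.0004)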
### BUILD CUSTOM PROMPT TEMPLATE -- IF GIVEN ###
@ -674,6 +714,10 @@ def completion(
optional_params=optional_params,
litellm_params=litellm_params,
)
if mock_response:
return mock_completion(
model, messages, stream=stream, mock_response=mock_response
)
if custom_llm_provider == "azure":
# azure configs
api_type = get_secret("AZURE_API_TYPE") or "azure"
@ -692,9 +736,9 @@ def completion(
or get_secret("AZURE_API_KEY")
)
azure_ad_token = optional_params.pop("azure_ad_token", None) or get_secret(
"AZURE_AD_TOKEN"
)
azure_ad_token = optional_params.get("extra_body", {}).pop(
"azure_ad_token", None
) or get_secret("AZURE_AD_TOKEN")
headers = headers or litellm.headers
@ -758,7 +802,8 @@ def completion(
or "https://api.openai.com/v1"
)
openai.organization = (
litellm.organization
organization
or litellm.organization
or get_secret("OPENAI_ORGANIZATION")
or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
)
@ -798,6 +843,7 @@ def completion(
timeout=timeout,
custom_prompt_dict=custom_prompt_dict,
client=client, # pass AsyncOpenAI, OpenAI client
organization=organization,
)
except Exception as e:
## LOGGING - log the original exception returned
@ -967,6 +1013,7 @@ def completion(
encoding=encoding, # for calculating input/output tokens
api_key=api_key,
logging_obj=logging,
headers=headers,
)
if "stream" in optional_params and optional_params["stream"] == True:
# don't try to access stream object,
@ -1376,11 +1423,29 @@ def completion(
acompletion=acompletion,
custom_prompt_dict=custom_prompt_dict,
)
if (
"stream" in optional_params
and optional_params["stream"] == True
and acompletion == False
):
response = CustomStreamWrapper(
iter(model_response),
model,
custom_llm_provider="gemini",
logging_obj=logging,
)
return response
response = model_response
elif custom_llm_provider == "vertex_ai":
vertex_ai_project = litellm.vertex_project or get_secret("VERTEXAI_PROJECT")
vertex_ai_location = litellm.vertex_location or get_secret(
"VERTEXAI_LOCATION"
vertex_ai_project = (
optional_params.pop("vertex_ai_project", None)
or litellm.vertex_project
or get_secret("VERTEXAI_PROJECT")
)
vertex_ai_location = (
optional_params.pop("vertex_ai_location", None)
or litellm.vertex_location
or get_secret("VERTEXAI_LOCATION")
)
model_response = vertex_ai.completion(
@ -1471,19 +1536,22 @@ def completion(
if (
"stream" in optional_params and optional_params["stream"] == True
): ## [BETA]
# sagemaker does not support streaming as of now so we're faking streaming:
# https://discuss.huggingface.co/t/streaming-output-text-when-deploying-on-sagemaker/39611
# "SageMaker is currently not supporting streaming responses."
# fake streaming for sagemaker
print_verbose(f"ENTERS SAGEMAKER CUSTOMSTREAMWRAPPER")
resp_string = model_response["choices"][0]["message"]["content"]
from .llms.sagemaker import TokenIterator
tokenIterator = TokenIterator(model_response)
response = CustomStreamWrapper(
resp_string,
model,
completion_stream=tokenIterator,
model=model,
custom_llm_provider="sagemaker",
logging_obj=logging,
)
## LOGGING
logging.post_call(
input=messages,
api_key=None,
original_response=response,
)
return response
## RESPONSE OBJECT
@ -2146,6 +2214,7 @@ async def aembedding(*args, **kwargs):
or custom_llm_provider == "deepinfra"
or custom_llm_provider == "perplexity"
or custom_llm_provider == "ollama"
or custom_llm_provider == "vertex_ai"
): # currently implemented aiohttp calls for just azure and openai, soon all.
# Await normally
init_response = await loop.run_in_executor(None, func_with_context)
@ -2158,6 +2227,8 @@ async def aembedding(*args, **kwargs):
else:
# Call the synchronous function using run_in_executor
response = await loop.run_in_executor(None, func_with_context)
if response is not None and hasattr(response, "_hidden_params"):
response._hidden_params["custom_llm_provider"] = custom_llm_provider
return response
except Exception as e:
custom_llm_provider = custom_llm_provider or "openai"
@ -2174,6 +2245,7 @@ def embedding(
model,
input=[],
# Optional params
dimensions: Optional[int] = None,
timeout=600, # default to 10 minutes
# set api_base, api_version, api_key
api_base: Optional[str] = None,
@ -2194,6 +2266,7 @@ def embedding(
Parameters:
- model: The embedding model to use.
- input: The input for which embeddings are to be generated.
- dimensions: The number of dimensions the resulting output embeddings should have. Only supported in text-embedding-3 and later models.
- timeout: The timeout value for the API call, default 10 mins
- litellm_call_id: The call ID for litellm logging.
- litellm_logging_obj: The litellm logging object.
@ -2220,8 +2293,14 @@ def embedding(
encoding_format = kwargs.get("encoding_format", None)
proxy_server_request = kwargs.get("proxy_server_request", None)
aembedding = kwargs.get("aembedding", None)
### CUSTOM MODEL COST ###
input_cost_per_token = kwargs.get("input_cost_per_token", None)
output_cost_per_token = kwargs.get("output_cost_per_token", None)
input_cost_per_second = kwargs.get("input_cost_per_second", None)
output_cost_per_second = kwargs.get("output_cost_per_second", None)
openai_params = [
"user",
"dimensions",
"request_timeout",
"api_base",
"api_version",
@ -2268,6 +2347,8 @@ def embedding(
"tpm",
"input_cost_per_token",
"output_cost_per_token",
"input_cost_per_second",
"output_cost_per_second",
"hf_model_name",
"proxy_server_request",
"model_info",
@ -2288,11 +2369,35 @@ def embedding(
api_key=api_key,
)
optional_params = get_optional_params_embeddings(
model=model,
user=user,
dimensions=dimensions,
encoding_format=encoding_format,
custom_llm_provider=custom_llm_provider,
**non_default_params,
)
### REGISTER CUSTOM MODEL PRICING -- IF GIVEN ###
if input_cost_per_token is not None and output_cost_per_token is not None:
litellm.register_model(
{
model: {
"input_cost_per_token": input_cost_per_token,
"output_cost_per_token": output_cost_per_token,
"litellm_provider": custom_llm_provider,
}
}
)
if input_cost_per_second is not None: # time based pricing just needs cost in place
output_cost_per_second = output_cost_per_second or 0.0
litellm.register_model(
{
model: {
"input_cost_per_second": input_cost_per_second,
"output_cost_per_second": output_cost_per_second,
"litellm_provider": custom_llm_provider,
}
}
)
try:
response = None
logging = litellm_logging_obj
@ -2385,7 +2490,7 @@ def embedding(
client=client,
aembedding=aembedding,
)
elif model in litellm.cohere_embedding_models:
elif custom_llm_provider == "cohere":
cohere_key = (
api_key
or litellm.cohere_key
@ -2427,6 +2532,29 @@ def embedding(
optional_params=optional_params,
model_response=EmbeddingResponse(),
)
elif custom_llm_provider == "vertex_ai":
vertex_ai_project = (
optional_params.pop("vertex_ai_project", None)
or litellm.vertex_project
or get_secret("VERTEXAI_PROJECT")
)
vertex_ai_location = (
optional_params.pop("vertex_ai_location", None)
or litellm.vertex_location
or get_secret("VERTEXAI_LOCATION")
)
response = vertex_ai.embedding(
model=model,
input=input,
encoding=encoding,
logging_obj=logging,
optional_params=optional_params,
model_response=EmbeddingResponse(),
vertex_project=vertex_ai_project,
vertex_location=vertex_ai_location,
aembedding=aembedding,
)
elif custom_llm_provider == "oobabooga":
response = oobabooga.embedding(
model=model,
@ -2513,6 +2641,8 @@ def embedding(
else:
args = locals()
raise ValueError(f"No valid embedding model args passed in - {args}")
if response is not None and hasattr(response, "_hidden_params"):
response._hidden_params["custom_llm_provider"] = custom_llm_provider
return response
except Exception as e:
## LOGGING
@ -2523,9 +2653,7 @@ def embedding(
)
## Map to OpenAI Exception
raise exception_type(
model=model,
original_exception=e,
custom_llm_provider="azure" if azure == True else None,
model=model, original_exception=e, custom_llm_provider=custom_llm_provider
)
@ -2914,6 +3042,7 @@ def image_generation(
else:
model = "dall-e-2"
custom_llm_provider = "openai" # default to dall-e-2 on openai
model_response._hidden_params["model"] = model
openai_params = [
"user",
"request_timeout",
@ -2987,7 +3116,7 @@ def image_generation(
custom_llm_provider=custom_llm_provider,
**non_default_params,
)
logging = litellm_logging_obj
logging: Logging = litellm_logging_obj
logging.update_environment_variables(
model=model,
user=user,
@ -3051,7 +3180,18 @@ def image_generation(
model_response=model_response,
aimg_generation=aimg_generation,
)
elif custom_llm_provider == "bedrock":
if model is None:
raise Exception("Model needs to be set for bedrock")
model_response = bedrock.image_generation(
model=model,
prompt=prompt,
timeout=timeout,
logging_obj=litellm_logging_obj,
optional_params=optional_params,
model_response=model_response,
aimg_generation=aimg_generation,
)
return model_response
except Exception as e:
## Map to OpenAI Exception
@ -3068,7 +3208,9 @@ def image_generation(
async def ahealth_check(
model_params: dict,
mode: Optional[Literal["completion", "embedding", "image_generation"]] = None,
mode: Optional[
Literal["completion", "embedding", "image_generation", "chat"]
] = None,
prompt: Optional[str] = None,
input: Optional[List] = None,
default_timeout: float = 6000,
@ -3085,8 +3227,11 @@ async def ahealth_check(
if model is None:
raise Exception("model not set")
if model in litellm.model_cost and mode is None:
mode = litellm.model_cost[model]["mode"]
model, custom_llm_provider, _, _ = get_llm_provider(model=model)
mode = mode or "completion" # default to completion calls
mode = mode or "chat" # default to chat completion calls
if custom_llm_provider == "azure":
api_key = (
@ -3126,8 +3271,12 @@ async def ahealth_check(
prompt=prompt,
input=input,
)
elif custom_llm_provider == "openai":
elif (
custom_llm_provider == "openai"
or custom_llm_provider == "text-completion-openai"
):
api_key = model_params.get("api_key") or get_secret("OPENAI_API_KEY")
organization = model_params.get("organization")
timeout = (
model_params.get("timeout")
@ -3145,8 +3294,12 @@ async def ahealth_check(
mode=mode,
prompt=prompt,
input=input,
organization=organization,
)
else:
model_params["cache"] = {
"no-cache": True
} # don't use cached responses for making health check calls
if mode == "embedding":
model_params.pop("messages", None)
model_params["input"] = input
@ -3169,6 +3322,7 @@ async def ahealth_check(
## Set verbose to true -> ```litellm.set_verbose = True```
def print_verbose(print_statement):
try:
verbose_logger.debug(print_statement)
if litellm.set_verbose:
print(print_statement) # noqa
except:
@ -3256,7 +3410,23 @@ def stream_chunk_builder_text_completion(chunks: list, messages: Optional[List]
return response
def stream_chunk_builder(chunks: list, messages: Optional[list] = None):
def stream_chunk_builder(
chunks: list, messages: Optional[list] = None, start_time=None, end_time=None
):
model_response = litellm.ModelResponse()
### SORT CHUNKS BASED ON CREATED ORDER ##
print_verbose("Goes into checking if chunk has hidden created_at param")
if chunks[0]._hidden_params.get("created_at", None):
print_verbose("Chunks have a created at hidden param")
# Sort chunks based on created_at in ascending order
chunks = sorted(
chunks, key=lambda x: x._hidden_params.get("created_at", float("inf"))
)
print_verbose("Chunks sorted")
# set hidden params from chunk to model_response
if model_response is not None and hasattr(model_response, "_hidden_params"):
model_response._hidden_params = chunks[0].get("_hidden_params", {})
id = chunks[0]["id"]
object = chunks[0]["object"]
created = chunks[0]["created"]
@ -3427,5 +3597,8 @@ def stream_chunk_builder(chunks: list, messages: Optional[list] = None):
response["usage"]["prompt_tokens"] + response["usage"]["completion_tokens"]
)
return convert_to_model_response_object(
response_object=response, model_response_object=litellm.ModelResponse()
response_object=response,
model_response_object=model_response,
start_time=start_time,
end_time=end_time,
)
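stream_chunk_builder reassembles streamed chunks into a single ModelResponse; with the new created_at sorting and hidden-param propagation it also tolerates out-of-order chunks and keeps cost metadata. A hedged usage sketch:

import litellm

messages = [{"role": "user", "content": "Say hi"}]
chunks = list(litellm.completion(model="gpt-3.5-turbo", messages=messages, stream=True))
rebuilt = litellm.stream_chunk_builder(chunks, messages=messages)
print(rebuilt["choices"][0]["message"]["content"])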

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1 @@
self.__BUILD_MANIFEST={__rewrites:{afterFiles:[],beforeFiles:[],fallback:[]},"/_error":["static/chunks/pages/_error-9a890acb1e81c3fc.js"],sortedPages:["/_app","/_error"]},self.__BUILD_MANIFEST_CB&&self.__BUILD_MANIFEST_CB();

View file

@ -0,0 +1 @@
self.__SSG_MANIFEST=new Set([]);self.__SSG_MANIFEST_CB&&self.__SSG_MANIFEST_CB()

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1 @@
(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[165],{3155:function(e,t,n){(window.__NEXT_P=window.__NEXT_P||[]).push(["/_not-found",function(){return n(4032)}])},4032:function(e,t,n){"use strict";Object.defineProperty(t,"__esModule",{value:!0}),Object.defineProperty(t,"default",{enumerable:!0,get:function(){return i}}),n(6921);let o=n(3827);n(4090);let r={error:{fontFamily:'system-ui,"Segoe UI",Roboto,Helvetica,Arial,sans-serif,"Apple Color Emoji","Segoe UI Emoji"',height:"100vh",textAlign:"center",display:"flex",flexDirection:"column",alignItems:"center",justifyContent:"center"},desc:{display:"inline-block"},h1:{display:"inline-block",margin:"0 20px 0 0",padding:"0 23px 0 0",fontSize:24,fontWeight:500,verticalAlign:"top",lineHeight:"49px"},h2:{fontSize:14,fontWeight:400,lineHeight:"49px",margin:0}};function i(){return(0,o.jsxs)(o.Fragment,{children:[(0,o.jsx)("title",{children:"404: This page could not be found."}),(0,o.jsx)("div",{style:r.error,children:(0,o.jsxs)("div",{children:[(0,o.jsx)("style",{dangerouslySetInnerHTML:{__html:"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}),(0,o.jsx)("h1",{className:"next-error-h1",style:r.h1,children:"404"}),(0,o.jsx)("div",{style:r.desc,children:(0,o.jsx)("h2",{style:r.h2,children:"This page could not be found."})})]})})]})}("function"==typeof t.default||"object"==typeof t.default&&null!==t.default)&&void 0===t.default.__esModule&&(Object.defineProperty(t.default,"__esModule",{value:!0}),Object.assign(t.default,t),e.exports=t.default)}},function(e){e.O(0,[971,69,744],function(){return e(e.s=3155)}),_N_E=e.O()}]);

View file

@ -0,0 +1 @@
(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[185],{7421:function(n,e,t){Promise.resolve().then(t.t.bind(t,9646,23)),Promise.resolve().then(t.t.bind(t,3385,23))},3385:function(){},9646:function(n){n.exports={style:{fontFamily:"'__Inter_c23dc8', '__Inter_Fallback_c23dc8'",fontStyle:"normal"},className:"__className_c23dc8"}}},function(n){n.O(0,[971,69,744],function(){return n(n.s=7421)}),_N_E=n.O()}]);

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

@ -0,0 +1 @@
(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[744],{2028:function(e,n,t){Promise.resolve().then(t.t.bind(t,7690,23)),Promise.resolve().then(t.t.bind(t,8955,23)),Promise.resolve().then(t.t.bind(t,5613,23)),Promise.resolve().then(t.t.bind(t,1902,23)),Promise.resolve().then(t.t.bind(t,1778,23)),Promise.resolve().then(t.t.bind(t,7831,23))}},function(e){var n=function(n){return e(e.s=n)};e.O(0,[971,69],function(){return n(5317),n(2028)}),_N_E=e.O()}]);

File diff suppressed because one or more lines are too long

@ -0,0 +1 @@
(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[888],{1597:function(n,_,u){(window.__NEXT_P=window.__NEXT_P||[]).push(["/_app",function(){return u(7174)}])}},function(n){var _=function(_){return n(n.s=_)};n.O(0,[774,179],function(){return _(1597),_(4546)}),_N_E=n.O()}]);

@ -0,0 +1 @@
(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[820],{1981:function(n,_,u){(window.__NEXT_P=window.__NEXT_P||[]).push(["/_error",function(){return u(5103)}])}},function(n){n.O(0,[888,774,179],function(){return n(n.s=1981)}),_N_E=n.O()}]);

File diff suppressed because one or more lines are too long

@ -0,0 +1 @@
!function(){"use strict";var e,t,n,r,o,u,i,c,f,a={},l={};function s(e){var t=l[e];if(void 0!==t)return t.exports;var n=l[e]={exports:{}},r=!0;try{a[e](n,n.exports,s),r=!1}finally{r&&delete l[e]}return n.exports}s.m=a,e=[],s.O=function(t,n,r,o){if(n){o=o||0;for(var u=e.length;u>0&&e[u-1][2]>o;u--)e[u]=e[u-1];e[u]=[n,r,o];return}for(var i=1/0,u=0;u<e.length;u++){for(var n=e[u][0],r=e[u][1],o=e[u][2],c=!0,f=0;f<n.length;f++)i>=o&&Object.keys(s.O).every(function(e){return s.O[e](n[f])})?n.splice(f--,1):(c=!1,o<i&&(i=o));if(c){e.splice(u--,1);var a=r();void 0!==a&&(t=a)}}return t},s.n=function(e){var t=e&&e.__esModule?function(){return e.default}:function(){return e};return s.d(t,{a:t}),t},n=Object.getPrototypeOf?function(e){return Object.getPrototypeOf(e)}:function(e){return e.__proto__},s.t=function(e,r){if(1&r&&(e=this(e)),8&r||"object"==typeof e&&e&&(4&r&&e.__esModule||16&r&&"function"==typeof e.then))return e;var o=Object.create(null);s.r(o);var u={};t=t||[null,n({}),n([]),n(n)];for(var i=2&r&&e;"object"==typeof i&&!~t.indexOf(i);i=n(i))Object.getOwnPropertyNames(i).forEach(function(t){u[t]=function(){return e[t]}});return u.default=function(){return e},s.d(o,u),o},s.d=function(e,t){for(var n in t)s.o(t,n)&&!s.o(e,n)&&Object.defineProperty(e,n,{enumerable:!0,get:t[n]})},s.f={},s.e=function(e){return Promise.all(Object.keys(s.f).reduce(function(t,n){return s.f[n](e,t),t},[]))},s.u=function(e){},s.miniCssF=function(e){return"static/css/7384ba6288e79f81.css"},s.g=function(){if("object"==typeof globalThis)return globalThis;try{return this||Function("return this")()}catch(e){if("object"==typeof window)return window}}(),s.o=function(e,t){return Object.prototype.hasOwnProperty.call(e,t)},r={},o="_N_E:",s.l=function(e,t,n,u){if(r[e]){r[e].push(t);return}if(void 0!==n)for(var i,c,f=document.getElementsByTagName("script"),a=0;a<f.length;a++){var l=f[a];if(l.getAttribute("src")==e||l.getAttribute("data-webpack")==o+n){i=l;break}}i||(c=!0,(i=document.createElement("script")).charset="utf-8",i.timeout=120,s.nc&&i.setAttribute("nonce",s.nc),i.setAttribute("data-webpack",o+n),i.src=s.tu(e)),r[e]=[t];var d=function(t,n){i.onerror=i.onload=null,clearTimeout(p);var o=r[e];if(delete r[e],i.parentNode&&i.parentNode.removeChild(i),o&&o.forEach(function(e){return e(n)}),t)return t(n)},p=setTimeout(d.bind(null,void 0,{type:"timeout",target:i}),12e4);i.onerror=d.bind(null,i.onerror),i.onload=d.bind(null,i.onload),c&&document.head.appendChild(i)},s.r=function(e){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},s.tt=function(){return void 0===u&&(u={createScriptURL:function(e){return e}},"undefined"!=typeof trustedTypes&&trustedTypes.createPolicy&&(u=trustedTypes.createPolicy("nextjs#bundler",u))),u},s.tu=function(e){return s.tt().createScriptURL(e)},s.p="/ui/_next/",i={272:0},s.f.j=function(e,t){var n=s.o(i,e)?i[e]:void 0;if(0!==n){if(n)t.push(n[2]);else if(272!=e){var r=new Promise(function(t,r){n=i[e]=[t,r]});t.push(n[2]=r);var o=s.p+s.u(e),u=Error();s.l(o,function(t){if(s.o(i,e)&&(0!==(n=i[e])&&(i[e]=void 0),n)){var r=t&&("load"===t.type?"missing":t.type),o=t&&t.target&&t.target.src;u.message="Loading chunk "+e+" failed.\n("+r+": "+o+")",u.name="ChunkLoadError",u.type=r,u.request=o,n[1](u)}},"chunk-"+e,e)}else i[e]=0}},s.O.j=function(e){return 0===i[e]},c=function(e,t){var n,r,o=t[0],u=t[1],c=t[2],f=0;if(o.some(function(e){return 0!==i[e]})){for(n in u)s.o(u,n)&&(s.m[n]=u[n]);if(c)var 
a=c(s)}for(e&&e(t);f<o.length;f++)r=o[f],s.o(i,r)&&i[r]&&i[r][0](),i[r]=0;return s.O(a)},(f=self.webpackChunk_N_E=self.webpackChunk_N_E||[]).forEach(c.bind(null,0)),f.push=c.bind(null,f.push.bind(f))}();

File diff suppressed because one or more lines are too long

Binary file not shown (image, 25 KiB).

File diff suppressed because one or more lines are too long

@ -0,0 +1,8 @@
2:"$Sreact.suspense"
3:I[5250,["448","static/chunks/448-cd38799829cf7b57.js","931","static/chunks/app/page-e5227a95293777d5.js"],""]
4:I[3239,["448","static/chunks/448-cd38799829cf7b57.js","931","static/chunks/app/page-e5227a95293777d5.js"],""]
5:I[5613,[],""]
6:I[1778,[],""]
0:["B4oAVsVV35eL3Y1bPepKW",[[["",{"children":["__PAGE__",{}]},"$undefined","$undefined",true],["",{"children":["__PAGE__",{},["$L1",["$","$2",null,{"fallback":["$","div",null,{"children":"Loading..."}],"children":["$","div",null,{"className":"flex min-h-screen flex-col items-center","children":[["$","nav",null,{"className":"left-0 right-0 top-0 flex justify-between items-center h-12","children":[["$","div",null,{"className":"text-left mx-4 my-2 absolute top-0 left-0","children":["$","div",null,{"className":"flex flex-col items-center","children":["$","$L3",null,{"href":"/","children":["$","button",null,{"className":"text-gray-800 text-2xl px-4 py-1 rounded text-center","children":"🚅 LiteLLM"}]}]}]}],["$","div",null,{"className":"text-right mx-4 my-2 absolute top-0 right-0","children":[["$","a",null,{"href":"https://docs.litellm.ai/docs/","target":"_blank","rel":"noopener noreferrer","children":["$","button",null,{"className":"border border-gray-800 rounded-lg text-gray-800 text-xl px-4 py-1 rounded p-1 mr-2 text-center","children":"Docs"}]}],["$","a",null,{"href":"https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version","target":"_blank","rel":"noopener noreferrer","children":["$","button",null,{"className":"border border-gray-800 rounded-lg text-gray-800 text-xl px-4 py-1 rounded p-1 text-center","children":"Schedule Demo"}]}]]}]]}],["$","$L4",null,{}]]}]}],null]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_c23dc8","children":["$","$L5",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L6",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/7384ba6288e79f81.css","precedence":"next","crossOrigin":""}]],"$L7"]]]]
7:[["$","meta","0",{"name":"viewport","content":"width=device-width, initial-scale=1"}],["$","meta","1",{"charSet":"utf-8"}],["$","title","2",{"children":"Create Next App"}],["$","meta","3",{"name":"description","content":"Generated by create next app"}],["$","link","4",{"rel":"icon","href":"/ui/favicon.ico","type":"image/x-icon","sizes":"16x16"}],["$","meta","5",{"name":"next-size-adjust"}]]
1:null

Some files were not shown because too many files have changed in this diff.