Merge branch 'main' into litellm_dynamo_use_arn

This commit is contained in:
Ishaan Jaff 2024-02-13 21:27:38 -08:00 committed by GitHub
commit 003feda33f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
249 changed files with 24392 additions and 2775 deletions


@ -147,12 +147,18 @@ jobs:
-e AZURE_API_KEY=$AZURE_API_KEY \
-e AZURE_FRANCE_API_KEY=$AZURE_FRANCE_API_KEY \
-e AZURE_EUROPE_API_KEY=$AZURE_EUROPE_API_KEY \
-e AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
-e AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
-e AWS_REGION_NAME=$AWS_REGION_NAME \
-e OPENAI_API_KEY=$OPENAI_API_KEY \
--name my-app \
-v $(pwd)/proxy_server_config.yaml:/app/config.yaml \
my-app:latest \
--config /app/config.yaml \
--port 4000 \
--num_workers 8
--num_workers 8 \
--detailed_debug \
--run_gunicorn \
- run:
name: Install curl and dockerize
command: |


@ -10,4 +10,5 @@ anthropic
boto3
orjson
pydantic
google-cloud-aiplatform
google-cloud-aiplatform
redisvl==0.0.7 # semantic caching


@ -0,0 +1,77 @@
name: Helm OCI Chart Releaser
description: Push Helm charts to OCI-based (Docker) registries
author: sergeyshaykhullin
branding:
color: yellow
icon: upload-cloud
inputs:
name:
required: true
description: Chart name
repository:
required: true
description: Chart repository name
tag:
required: true
description: Chart version
app_version:
required: true
description: App version
path:
required: false
description: Chart path (Default 'charts/{name}')
registry:
required: true
description: OCI registry
registry_username:
required: true
description: OCI registry username
registry_password:
required: true
description: OCI registry password
update_dependencies:
required: false
default: 'false'
description: Update chart dependencies before packaging (Default 'false')
outputs:
image:
value: ${{ steps.output.outputs.image }}
description: Chart image (Default '{registry}/{repository}/{image}:{tag}')
runs:
using: composite
steps:
- name: Helm | Login
shell: bash
run: echo ${{ inputs.registry_password }} | helm registry login -u ${{ inputs.registry_username }} --password-stdin ${{ inputs.registry }}
env:
HELM_EXPERIMENTAL_OCI: '1'
- name: Helm | Dependency
if: inputs.update_dependencies == 'true'
shell: bash
run: helm dependency update ${{ inputs.path == null && format('{0}/{1}', 'charts', inputs.name) || inputs.path }}
env:
HELM_EXPERIMENTAL_OCI: '1'
- name: Helm | Package
shell: bash
run: helm package ${{ inputs.path == null && format('{0}/{1}', 'charts', inputs.name) || inputs.path }} --version ${{ inputs.tag }} --app-version ${{ inputs.app_version }}
env:
HELM_EXPERIMENTAL_OCI: '1'
- name: Helm | Push
shell: bash
run: helm push ${{ inputs.name }}-${{ inputs.tag }}.tgz oci://${{ inputs.registry }}/${{ inputs.repository }}
env:
HELM_EXPERIMENTAL_OCI: '1'
- name: Helm | Logout
shell: bash
run: helm registry logout ${{ inputs.registry }}
env:
HELM_EXPERIMENTAL_OCI: '1'
- name: Helm | Output
id: output
shell: bash
run: echo "image=${{ inputs.registry }}/${{ inputs.repository }}/${{ inputs.name }}:${{ inputs.tag }}" >> $GITHUB_OUTPUT


@ -34,13 +34,6 @@ jobs:
with:
push: true
tags: litellm/litellm:${{ github.event.inputs.tag || 'latest' }}
-
name: Build and push litellm-ui image
uses: docker/build-push-action@v5
with:
push: true
file: ui/Dockerfile
tags: litellm/litellm-ui:${{ github.event.inputs.tag || 'latest' }}
-
name: Build and push litellm-database image
uses: docker/build-push-action@v5
@ -82,36 +75,8 @@ jobs:
push: true
tags: ${{ steps.meta.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta.outputs.tags }}-latest # if a tag is provided, use that, otherwise use the release tag, and if neither is available, use 'latest'
labels: ${{ steps.meta.outputs.labels }}
build-and-push-image-ui:
runs-on: ubuntu-latest
permissions:
contents: read
packages: write
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Log in to the Container registry
uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1
with:
registry: ${{ env.REGISTRY }}
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Extract metadata (tags, labels) for UI Dockerfile
id: meta-ui
uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7
with:
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}-ui
- name: Build and push UI Docker image
uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4
with:
context: ui/
file: ui/Dockerfile
push: true
tags: ${{ steps.meta-ui.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta-ui.outputs.tags }}-latest
labels: ${{ steps.meta-ui.outputs.labels }}
platform: local, linux/amd64,linux/arm64,linux/arm64/v8
build-and-push-image-database:
runs-on: ubuntu-latest
permissions:
@ -176,3 +141,14 @@ jobs:
} catch (error) {
core.setFailed(error.message);
}
- name: Github Releases To Discord
uses: SethCohen/github-releases-to-discord@v1.13.1
with:
webhook_url: ${{ secrets.WEBHOOK_URL }}
color: "2105893"
username: "Release Changelog"
avatar_url: "https://cdn.discordapp.com/avatars/487431320314576937/bd64361e4ba6313d561d54e78c9e7171.png"
content: "||@everyone||"
footer_title: "Changelog"
footer_icon_url: "https://cdn.discordapp.com/avatars/487431320314576937/bd64361e4ba6313d561d54e78c9e7171.png"
footer_timestamp: true

.github/workflows/ghcr_helm_deploy.yml vendored Normal file

@ -0,0 +1,64 @@
# this workflow is triggered by an API call when there is a new PyPI release of LiteLLM
name: Build, Publish LiteLLM Helm Chart. New Release
on:
workflow_dispatch:
inputs:
chartVersion:
description: "Update the helm chart's version to this"
# Defines two custom environment variables for the workflow. Used for the Container registry domain, and a name for the Docker image that this workflow builds.
env:
REGISTRY: ghcr.io
IMAGE_NAME: ${{ github.repository }}
REPO_OWNER: ${{github.repository_owner}}
# There is a single job in this workflow. It's configured to run on the latest available version of Ubuntu.
jobs:
build-and-push-helm-chart:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Log in to the Container registry
uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1
with:
registry: ${{ env.REGISTRY }}
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: lowercase github.repository_owner
run: |
echo "REPO_OWNER=`echo ${{github.repository_owner}} | tr '[:upper:]' '[:lower:]'`" >>${GITHUB_ENV}
- name: Get LiteLLM Latest Tag
id: current_app_tag
uses: WyriHaximus/github-action-get-previous-tag@v1.3.0
- name: Get last published chart version
id: current_version
shell: bash
run: helm show chart oci://${{ env.REGISTRY }}/${{ env.REPO_OWNER }}/litellm-helm | grep '^version:' | awk 'BEGIN{FS=":"}{print "current-version="$2}' | tr -d " " | tee -a $GITHUB_OUTPUT
env:
HELM_EXPERIMENTAL_OCI: '1'
# Automatically update the helm chart version one "patch" level
- name: Bump release version
id: bump_version
uses: christian-draeger/increment-semantic-version@1.1.0
with:
current-version: ${{ steps.current_version.outputs.current-version || '0.1.0' }}
version-fragment: 'bug'
- uses: ./.github/actions/helm-oci-chart-releaser
with:
name: litellm-helm
repository: ${{ env.REPO_OWNER }}
tag: ${{ github.event.inputs.chartVersion || steps.bump_version.outputs.next-version || '0.1.0' }}
app_version: ${{ steps.current_app_tag.outputs.tag || 'latest' }}
path: deploy/charts/litellm-helm
registry: ${{ env.REGISTRY }}
registry_username: ${{ github.actor }}
registry_password: ${{ secrets.GITHUB_TOKEN }}
update_dependencies: true
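The workflow above is fired by an external API call (per the comment at the top of the file). As a rough illustration, a `workflow_dispatch` event for it could be sent through the GitHub REST API like this; the token, branch ref, and `chartVersion` value are placeholders, not values taken from this change:
```python
import os

import requests

# Hypothetical trigger for ghcr_helm_deploy.yml via GitHub's workflow_dispatch endpoint.
# The token needs "actions: write" permission on the repository.
resp = requests.post(
    "https://api.github.com/repos/BerriAI/litellm/actions/workflows/ghcr_helm_deploy.yml/dispatches",
    headers={
        "Accept": "application/vnd.github+json",
        "Authorization": f"Bearer {os.environ['GITHUB_TOKEN']}",
    },
    json={
        "ref": "main",
        # Matches the workflow_dispatch input defined above; omit "inputs" entirely to
        # let the workflow bump the previously published chart version by one patch level.
        "inputs": {"chartVersion": "0.2.0"},
    },
    timeout=30,
)
resp.raise_for_status()  # GitHub returns 204 No Content on success
```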

.gitignore vendored

@ -39,4 +39,8 @@ ui/litellm-dashboard/.next
ui/litellm-dashboard/node_modules
ui/litellm-dashboard/next-env.d.ts
ui/litellm-dashboard/package.json
ui/litellm-dashboard/package-lock.json
ui/litellm-dashboard/package-lock.json
deploy/charts/litellm-helm/*.tgz
deploy/charts/litellm-helm/charts/*
deploy/charts/*.tgz
litellm/proxy/vertex_key.json


@ -10,6 +10,12 @@ repos:
exclude: ^litellm/tests/|^litellm/proxy/proxy_cli.py|^litellm/integrations/|^litellm/proxy/tests/
additional_dependencies: [flake8-print]
files: litellm/.*\.py
- repo: local
hooks:
- id: check-files-match
name: Check if files match
entry: python3 ci_cd/check_files_match.py
language: system
- repo: local
hooks:
- id: mypy


@ -32,6 +32,9 @@ RUN pip install dist/*.whl
# install dependencies as wheels
RUN pip wheel --no-cache-dir --wheel-dir=/wheels/ -r requirements.txt
# install semantic-cache [Experimental] - we need this here and not in requirements.txt because redisvl pins pydantic to 1.0
RUN pip install redisvl==0.0.7 --no-deps
# Runtime stage
FROM $LITELLM_RUNTIME_IMAGE as runtime
@ -52,4 +55,4 @@ RUN chmod +x entrypoint.sh
EXPOSE 4000/tcp
ENTRYPOINT ["litellm"]
CMD ["--port", "4000", "--config", "./proxy_server_config.yaml", "--detailed_debug"]
CMD ["--port", "4000", "--config", "./proxy_server_config.yaml", "--detailed_debug", "--run_gunicorn"]


@ -47,6 +47,9 @@ COPY --from=builder /wheels/ /wheels/
# Install the built wheel using pip; again using a wildcard if it's the only file
RUN pip install *.whl /wheels/* --no-index --find-links=/wheels/ && rm -f *.whl && rm -rf /wheels
# install semantic-cache [Experimental] - we need this here and not in requirements.txt because redisvl pins pydantic to 1.0
RUN pip install redisvl==0.0.7 --no-deps
# Generate prisma client
RUN prisma generate
RUN chmod +x entrypoint.sh
@ -56,4 +59,4 @@ EXPOSE 4000/tcp
# # Set your entrypoint and command
ENTRYPOINT ["litellm"]
CMD ["--port", "4000"]
CMD ["--port", "4000", "--run_gunicorn"]


@ -5,7 +5,7 @@
<p align="center">Call all LLM APIs using the OpenAI format [Bedrock, Huggingface, VertexAI, TogetherAI, Azure, OpenAI, etc.]
<br>
</p>
<h4 align="center"><a href="https://docs.litellm.ai/docs/simple_proxy" target="_blank">OpenAI Proxy Server</a></h4>
<h4 align="center"><a href="https://docs.litellm.ai/docs/simple_proxy" target="_blank">OpenAI Proxy Server</a> | <a href="https://docs.litellm.ai/docs/enterprise"target="_blank">Enterprise Support</a></h4>
<h4 align="center">
<a href="https://pypi.org/project/litellm/" target="_blank">
<img src="https://img.shields.io/pypi/v/litellm.svg" alt="PyPI Version">
@ -28,10 +28,14 @@ LiteLLM manages:
- Translate inputs to provider's `completion`, `embedding`, and `image_generation` endpoints
- [Consistent output](https://docs.litellm.ai/docs/completion/output), text responses will always be available at `['choices'][0]['message']['content']`
- Retry/fallback logic across multiple deployments (e.g. Azure/OpenAI) - [Router](https://docs.litellm.ai/docs/routing)
- Track spend & set budgets per project [OpenAI Proxy Server](https://docs.litellm.ai/docs/simple_proxy)
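To make the consistent-output point above concrete, the same access path works no matter which provider served the request; a minimal sketch (API keys and model names are placeholders):
```python
import os

from litellm import completion

os.environ["OPENAI_API_KEY"] = "sk-..."   # placeholder
os.environ["COHERE_API_KEY"] = "co-..."   # placeholder

messages = [{"role": "user", "content": "Hello, how are you?"}]

# OpenAI
openai_response = completion(model="gpt-3.5-turbo", messages=messages)
# Cohere
cohere_response = completion(model="command-nightly", messages=messages)

# Regardless of provider, the text lives at the same path
print(openai_response["choices"][0]["message"]["content"])
print(cohere_response["choices"][0]["message"]["content"])
```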
[**Jump to OpenAI Proxy Docs**](https://github.com/BerriAI/litellm?tab=readme-ov-file#openai-proxy---docs) <br>
[**Jump to Supported LLM Providers**](https://github.com/BerriAI/litellm?tab=readme-ov-file#supported-provider-docs)
Support for more providers is continually being added. Missing a provider or LLM platform? Raise a [feature request](https://github.com/BerriAI/litellm/issues/new?assignees=&labels=enhancement&projects=&template=feature_request.yml&title=%5BFeature%5D%3A+).
# Usage ([**Docs**](https://docs.litellm.ai/docs/))
> [!IMPORTANT]
> LiteLLM v1.0.0 now requires `openai>=1.0.0`. Migration guide [here](https://docs.litellm.ai/docs/migration)
@ -155,6 +159,9 @@ print(response)
```
## Proxy Key Management ([Docs](https://docs.litellm.ai/docs/proxy/virtual_keys))
UI on `/ui` on your proxy server
![ui_3](https://github.com/BerriAI/litellm/assets/29436595/47c97d5e-b9be-4839-b28c-43d7f4f10033)
Track Spend, Set budgets and create virtual keys for the proxy
`POST /key/generate`
@ -174,17 +181,6 @@ curl 'http://0.0.0.0:8000/key/generate' \
}
```
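The same key-generation call as the curl command above, sketched with `requests`; the master key, proxy URL, and request-body fields are illustrative rather than copied from this diff:
```python
import requests

resp = requests.post(
    "http://0.0.0.0:8000/key/generate",
    headers={
        "Authorization": "Bearer sk-1234",  # proxy master key (placeholder)
        "Content-Type": "application/json",
    },
    json={
        "models": ["gpt-3.5-turbo", "gpt-4"],  # models this virtual key may call
        "duration": "20m",                     # key lifetime
        "metadata": {"team": "core-infra"},    # free-form metadata
    },
    timeout=30,
)
print(resp.json())  # response includes the newly generated virtual key
```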
### [Beta] Proxy UI
A simple UI to add new models and let your users create keys.
Live here: https://dashboard.litellm.ai/
Code: https://github.com/BerriAI/litellm/tree/main/ui
<img width="1672" alt="Screenshot 2023-12-26 at 8 33 53 AM" src="https://github.com/BerriAI/litellm/assets/17561003/274254d8-c5fe-4645-9123-100045a7fb21">
## Supported Providers ([Docs](https://docs.litellm.ai/docs/providers))
| Provider | [Completion](https://docs.litellm.ai/docs/#basic-usage) | [Streaming](https://docs.litellm.ai/docs/completion/stream#streaming-responses) | [Async Completion](https://docs.litellm.ai/docs/completion/stream#async-completion) | [Async Streaming](https://docs.litellm.ai/docs/completion/stream#async-streaming) | [Async Embedding](https://docs.litellm.ai/docs/embedding/supported_embedding) | [Async Image Generation](https://docs.litellm.ai/docs/image_generation) |
| ------------- | ------------- | ------------- | ------------- | ------------- | ------------- | ------------- |


@ -0,0 +1,32 @@
import sys
import filecmp
import shutil
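# Pre-commit helper (wired up via ci_cd/check_files_match.py in .pre-commit-config.yaml above):
# keeps litellm/model_prices_and_context_window_backup.json in sync with the top-level
# model_prices_and_context_window.json, copying the latter over the former on a mismatch.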
def main(argv=None):
print(
"Comparing model_prices_and_context_window and litellm/model_prices_and_context_window_backup.json files... checking if they match."
)
file1 = "model_prices_and_context_window.json"
file2 = "litellm/model_prices_and_context_window_backup.json"
cmp_result = filecmp.cmp(file1, file2, shallow=False)
if cmp_result:
print(f"Passed! Files {file1} and {file2} match.")
return 0
else:
print(
f"Failed! Files {file1} and {file2} do not match. Copying content from {file1} to {file2}."
)
copy_content(file1, file2)
return 1
def copy_content(source, destination):
shutil.copy2(source, destination)
if __name__ == "__main__":
sys.exit(main())


@ -0,0 +1,76 @@
import sys, os
import traceback
from dotenv import load_dotenv
import copy
load_dotenv()
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
import asyncio
from litellm import Router, Timeout
import time
from litellm.caching import Cache
import litellm
import openai
### Test just calling AsyncAzureOpenAI
openai_client = openai.AsyncAzureOpenAI(
azure_endpoint=os.getenv("AZURE_API_BASE"),
api_key=os.getenv("AZURE_API_KEY"),
)
async def call_acompletion(semaphore, input_data):
async with semaphore:
try:
# Use asyncio.wait_for to set a timeout for the task
response = await openai_client.chat.completions.create(**input_data)
# Handle the response as needed
print(response)
return response
except Timeout:
print(f"Task timed out: {input_data}")
return None # You may choose to return something else or raise an exception
async def main():
# No Router here - this test calls the AsyncAzureOpenAI client directly
# Create a semaphore with a capacity of 100
semaphore = asyncio.Semaphore(100)
# List to hold all task references
tasks = []
start_time_all_tasks = time.time()
# Launch 500 tasks
for _ in range(500):
task = asyncio.create_task(
call_acompletion(
semaphore,
{
"model": "chatgpt-v-2",
"messages": [{"role": "user", "content": "Hey, how's it going?"}],
},
)
)
tasks.append(task)
# Wait for all tasks to complete
responses = await asyncio.gather(*tasks)
# Process responses as needed
# Record the end time for all tasks
end_time_all_tasks = time.time()
# Calculate the total time for all tasks
total_time_all_tasks = end_time_all_tasks - start_time_all_tasks
print(f"Total time for all tasks: {total_time_all_tasks} seconds")
# Calculate the average time per response
average_time_per_response = total_time_all_tasks / len(responses)
print(f"Average time per response: {average_time_per_response} seconds")
print(f"NUMBER OF COMPLETED TASKS: {len(responses)}")
# Run the main function
asyncio.run(main())


@ -0,0 +1,88 @@
import sys, os
import traceback
from dotenv import load_dotenv
import copy
load_dotenv()
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
import asyncio
from litellm import Router, Timeout
import time
### Test calling router async
async def call_acompletion(semaphore, router: Router, input_data):
async with semaphore:
try:
# Use asyncio.wait_for to set a timeout for the task
response = await router.acompletion(**input_data)
# Handle the response as needed
print(response)
return response
except Timeout:
print(f"Task timed out: {input_data}")
return None # You may choose to return something else or raise an exception
async def main():
# Initialize the Router
model_list = [
{
"model_name": "gpt-3.5-turbo",
"litellm_params": {
"model": "gpt-3.5-turbo",
"api_key": os.getenv("OPENAI_API_KEY"),
},
},
{
"model_name": "gpt-3.5-turbo",
"litellm_params": {
"model": "azure/chatgpt-v-2",
"api_key": os.getenv("AZURE_API_KEY"),
"api_base": os.getenv("AZURE_API_BASE"),
"api_version": os.getenv("AZURE_API_VERSION"),
},
},
]
router = Router(model_list=model_list, num_retries=3, timeout=10)
# Create a semaphore with a capacity of 100
semaphore = asyncio.Semaphore(100)
# List to hold all task references
tasks = []
start_time_all_tasks = time.time()
# Launch 500 tasks
for _ in range(500):
task = asyncio.create_task(
call_acompletion(
semaphore,
router,
{
"model": "gpt-3.5-turbo",
"messages": [{"role": "user", "content": "Hey, how's it going?"}],
},
)
)
tasks.append(task)
# Wait for all tasks to complete
responses = await asyncio.gather(*tasks)
# Process responses as needed
# Record the end time for all tasks
end_time_all_tasks = time.time()
# Calculate the total time for all tasks
total_time_all_tasks = end_time_all_tasks - start_time_all_tasks
print(f"Total time for all tasks: {total_time_all_tasks} seconds")
# Calculate the average time per response
average_time_per_response = total_time_all_tasks / len(responses)
print(f"Average time per response: {average_time_per_response} seconds")
print(f"NUMBER OF COMPLETED TASKS: {len(responses)}")
# Run the main function
asyncio.run(main())


@ -0,0 +1,94 @@
import sys, os
import traceback
from dotenv import load_dotenv
import copy
load_dotenv()
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
import asyncio
from litellm import Router, Timeout
import time
from litellm.caching import Cache
import litellm
litellm.cache = Cache(
type="s3", s3_bucket_name="cache-bucket-litellm", s3_region_name="us-west-2"
)
### Test calling router with s3 Cache
async def call_acompletion(semaphore, router: Router, input_data):
async with semaphore:
try:
# Use asyncio.wait_for to set a timeout for the task
response = await router.acompletion(**input_data)
# Handle the response as needed
print(response)
return response
except Timeout:
print(f"Task timed out: {input_data}")
return None # You may choose to return something else or raise an exception
async def main():
# Initialize the Router
model_list = [
{
"model_name": "gpt-3.5-turbo",
"litellm_params": {
"model": "gpt-3.5-turbo",
"api_key": os.getenv("OPENAI_API_KEY"),
},
},
{
"model_name": "gpt-3.5-turbo",
"litellm_params": {
"model": "azure/chatgpt-v-2",
"api_key": os.getenv("AZURE_API_KEY"),
"api_base": os.getenv("AZURE_API_BASE"),
"api_version": os.getenv("AZURE_API_VERSION"),
},
},
]
router = Router(model_list=model_list, num_retries=3, timeout=10)
# Create a semaphore with a capacity of 100
semaphore = asyncio.Semaphore(100)
# List to hold all task references
tasks = []
start_time_all_tasks = time.time()
# Launch 500 tasks
for _ in range(500):
task = asyncio.create_task(
call_acompletion(
semaphore,
router,
{
"model": "gpt-3.5-turbo",
"messages": [{"role": "user", "content": "Hey, how's it going?"}],
},
)
)
tasks.append(task)
# Wait for all tasks to complete
responses = await asyncio.gather(*tasks)
# Process responses as needed
# Record the end time for all tasks
end_time_all_tasks = time.time()
# Calculate the total time for all tasks
total_time_all_tasks = end_time_all_tasks - start_time_all_tasks
print(f"Total time for all tasks: {total_time_all_tasks} seconds")
# Calculate the average time per response
average_time_per_response = total_time_all_tasks / len(responses)
print(f"Average time per response: {average_time_per_response} seconds")
print(f"NUMBER OF COMPLETED TASKS: {len(responses)}")
# Run the main function
asyncio.run(main())


@ -0,0 +1,2 @@
python3 -m build
twine upload --verbose dist/litellm-1.18.13.dev4.tar.gz -u __token__ -


@ -0,0 +1,23 @@
# Patterns to ignore when building packages.
# This supports shell glob matching, relative path matching, and
# negation (prefixed with !). Only one pattern per line.
.DS_Store
# Common VCS dirs
.git/
.gitignore
.bzr/
.bzrignore
.hg/
.hgignore
.svn/
# Common backup files
*.swp
*.bak
*.tmp
*.orig
*~
# Various IDEs
.project
.idea/
*.tmproj
.vscode/


@ -0,0 +1,6 @@
dependencies:
- name: postgresql
repository: oci://registry-1.docker.io/bitnamicharts
version: 13.3.1
digest: sha256:f5c129150f0d38dd06752ab37f3c8e143d7c14d30379af058767bcd9f4ba83dd
generated: "2024-01-19T11:32:56.694808861+11:00"


@ -0,0 +1,34 @@
apiVersion: v2
# We can't call ourselves just "litellm" because then we couldn't publish to the
# same OCI repository as the "litellm" OCI image
name: litellm-helm
description: Call all LLM APIs using the OpenAI format
# A chart can be either an 'application' or a 'library' chart.
#
# Application charts are a collection of templates that can be packaged into versioned archives
# to be deployed.
#
# Library charts provide useful utilities or functions for the chart developer. They're included as
# a dependency of application charts to inject those utilities and functions into the rendering
# pipeline. Library charts do not define any templates and therefore cannot be deployed.
type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.1.0
# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes.
appVersion: v1.18.9
dependencies:
- name: "postgresql"
version: ">=13.3.0"
repository: oci://registry-1.docker.io/bitnamicharts
condition: db.deployStandalone


@ -0,0 +1,107 @@
# Helm Chart for LiteLLM
## Prerequisites
- Kubernetes 1.23+
- Helm 3.8.0+
If `db.deployStandalone` is used:
- PV provisioner support in the underlying infrastructure
If `db.useStackgresOperator` is used (not yet implemented):
- The Stackgres Operator must already be installed in the Kubernetes Cluster. This chart will **not** install the operator if it is missing.
## Parameters
### LiteLLM Proxy Deployment Settings
| Name | Description | Value |
| ---------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----- |
| `replicaCount` | The number of LiteLLM Proxy pods to be deployed | `1` |
| `masterkey` | The Master API Key for LiteLLM. If not specified, a random key is generated. | N/A |
| `environmentSecrets` | An optional array of Secret object names. The keys and values in these secrets will be presented to the LiteLLM proxy pod as environment variables. See below for an example Secret object. | `[]` |
| `image.repository` | LiteLLM Proxy image repository | `ghcr.io/berriai/litellm` |
| `image.pullPolicy` | LiteLLM Proxy image pull policy | `IfNotPresent` |
| `image.tag` | Overrides the image tag; the default is the latest version of LiteLLM at the time this chart was published. | `""` |
| `image.dbReadyImage` | On Pod startup, an initContainer is used to make sure the Postgres database is available before attempting to start LiteLLM. This field specifies the image to use as that initContainer. | `docker.io/bitnami/postgresql` |
| `image.dbReadyTag` | Tag for the above image. If not specified, "latest" is used. | `""` |
| `imagePullSecrets` | Registry credentials for the LiteLLM and initContainer images. | `[]` |
| `serviceAccount.create` | Whether or not to create a Kubernetes Service Account for this deployment. The default is `false` because LiteLLM has no need to access the Kubernetes API. | `false` |
| `service.type` | Kubernetes Service type (e.g. `LoadBalancer`, `ClusterIP`, etc.) | `ClusterIP` |
| `service.port` | TCP port that the Kubernetes Service will listen on. Also the TCP port within the Pod that the proxy will listen on. | `8000` |
| `ingress.*` | See [values.yaml](./values.yaml) for example settings | N/A |
| `proxy_config.*` | See [values.yaml](./values.yaml) for default settings. See [example_config_yaml](../../../litellm/proxy/example_config_yaml/) for configuration examples. | N/A |
#### Example `environmentSecrets` Secret
```
apiVersion: v1
kind: Secret
metadata:
name: litellm-envsecrets
data:
AZURE_OPENAI_API_KEY: TXlTZWN1cmVLM3k=
type: Opaque
```
### LiteLLM Admin UI Settings
| Name | Description | Value |
| ---------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----- |
| `ui.enabled` | Should the LiteLLM Admin UI be deployed | `true` |
| `ui.replicaCount` | The number of LiteLLM Admin UI pods to be deployed | `1` |
| `ui.image.repository` | LiteLLM Admin UI image repository | `ghcr.io/berriai/litellm` |
| `ui.image.pullPolicy` | LiteLLM Admin UI image pull policy | `IfNotPresent` |
| `ui.image.tag` | Overrides the image tag; the default is the latest version of LiteLLM at the time this chart was published. | `""` |
| `ui.imagePullSecrets` | Registry credentials for the above images. | `[]` |
| `ui.service.type` | Kubernetes Service type (e.g. `LoadBalancer`, `ClusterIP`, etc.) | `ClusterIP` |
| `ui.service.port` | TCP port that the Kubernetes Service will listen on. Also the TCP port within the Pod that the web server will listen on. | `8000` |
| `ui.ingress.*` | See [values.yaml](./values.yaml) for example settings | N/A |
### Database Settings
| Name | Description | Value |
| ---------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----- |
| `db.useExisting` | Use an existing Postgres database. A Kubernetes Secret object must exist that contains credentials for connecting to the database. An example secret object definition is provided below. | `false` |
| `db.endpoint` | If `db.useExisting` is `true`, this is the IP, Hostname or Service Name of the Postgres server to connect to. | `localhost` |
| `db.database` | If `db.useExisting` is `true`, the name of the existing database to connect to. | `litellm` |
| `db.secret.name` | If `db.useExisting` is `true`, the name of the Kubernetes Secret that contains credentials. | `postgres` |
| `db.secret.usernameKey` | If `db.useExisting` is `true`, the name of the key within the Kubernetes Secret that holds the username for authenticating with the Postgres instance. | `username` |
| `db.secret.passwordKey` | If `db.useExisting` is `true`, the name of the key within the Kubernetes Secret that holds the password associated with the above user. | `password` |
| `db.useStackgresOperator` | Not yet implemented. | `false` |
| `db.deployStandalone` | Deploy a standalone, single instance deployment of Postgres, using the Bitnami postgresql chart. This is useful for getting started but doesn't provide HA or (by default) data backups. | `true` |
| `postgresql.*` | If `db.deployStandalone` is `true`, configuration passed to the Bitnami postgresql chart. See the [Bitnami Documentation](https://github.com/bitnami/charts/tree/main/bitnami/postgresql) for full configuration details. See [values.yaml](./values.yaml) for the default configuration. | See [values.yaml](./values.yaml) |
| `postgresql.auth.*` | If `db.deployStandalone` is `true`, care should be taken to ensure the default `password` and `postgres-password` values are **NOT** used. | `NoTaGrEaTpAsSwOrD` |
#### Example Postgres `db.useExisting` Secret
```yaml
apiVersion: v1
kind: Secret
metadata:
name: postgres
data:
# Password for the "postgres" user
postgres-password: <some secure password, base64 encoded>
username: litellm
password: <some secure password, base64 encoded>
type: Opaque
```
## Accessing the Admin UI
When browsing to the URL published per the settings in `ui.ingress.*`, you will
be prompted for **Admin Configuration**. The **Proxy Endpoint** is the internal
(from the `litellm-ui` pod's perspective) URL published by the `litellm-proxy`
Kubernetes Service. If the deployment uses the default settings for this
service, the **Proxy Endpoint** should be set to `http://litellm-proxy:8000`.
The **Proxy Key** is the value specified for `masterkey` or, if no `masterkey`
was provided on the helm command line, a randomly generated string stored in
the `litellm-masterkey` Kubernetes Secret.
```bash
kubectl -n litellm get secret litellm-masterkey -o jsonpath="{.data.masterkey}"
```
## Admin UI Limitations
At the time of writing, the Admin UI is unable to add models. This is because
it would need to update the `config.yaml` file, which is exposed as a ConfigMap and
is therefore read-only. This is a limitation of this Helm chart, not of the Admin UI
itself.


@ -0,0 +1,22 @@
1. Get the application URL by running these commands:
{{- if .Values.ingress.enabled }}
{{- range $host := .Values.ingress.hosts }}
{{- range .paths }}
http{{ if $.Values.ingress.tls }}s{{ end }}://{{ $host.host }}{{ .path }}
{{- end }}
{{- end }}
{{- else if contains "NodePort" .Values.service.type }}
export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ include "litellm.fullname" . }})
export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}")
echo http://$NODE_IP:$NODE_PORT
{{- else if contains "LoadBalancer" .Values.service.type }}
NOTE: It may take a few minutes for the LoadBalancer IP to be available.
You can watch its status by running 'kubectl get --namespace {{ .Release.Namespace }} svc -w {{ include "litellm.fullname" . }}'
export SERVICE_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ include "litellm.fullname" . }} --template "{{"{{ range (index .status.loadBalancer.ingress 0) }}{{.}}{{ end }}"}}")
echo http://$SERVICE_IP:{{ .Values.service.port }}
{{- else if contains "ClusterIP" .Values.service.type }}
export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app.kubernetes.io/name={{ include "litellm.name" . }},app.kubernetes.io/instance={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}")
export CONTAINER_PORT=$(kubectl get pod --namespace {{ .Release.Namespace }} $POD_NAME -o jsonpath="{.spec.containers[0].ports[0].containerPort}")
echo "Visit http://127.0.0.1:8080 to use your application"
kubectl --namespace {{ .Release.Namespace }} port-forward $POD_NAME 8080:$CONTAINER_PORT
{{- end }}


@ -0,0 +1,74 @@
{{/*
Expand the name of the chart.
*/}}
{{- define "litellm.name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
{{- end }}
{{/*
Create a default fully qualified app name.
We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
If release name contains chart name it will be used as a full name.
*/}}
{{- define "litellm.fullname" -}}
{{- if .Values.fullnameOverride }}
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- $name := default .Chart.Name .Values.nameOverride }}
{{- if contains $name .Release.Name }}
{{- .Release.Name | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
{{- end }}
{{- end }}
{{- end }}
{{/*
Create chart name and version as used by the chart label.
*/}}
{{- define "litellm.chart" -}}
{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
{{- end }}
{{/*
Common labels
*/}}
{{- define "litellm.labels" -}}
helm.sh/chart: {{ include "litellm.chart" . }}
{{ include "litellm.selectorLabels" . }}
{{- if .Chart.AppVersion }}
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
{{- end }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
{{- end }}
{{- define "litellm.ui.labels" -}}
helm.sh/chart: {{ include "litellm.chart" . }}
{{ include "litellm.ui.selectorLabels" . }}
{{- if .Chart.AppVersion }}
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
{{- end }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
{{- end }}
{{/*
Selector labels
*/}}
{{- define "litellm.selectorLabels" -}}
app.kubernetes.io/name: {{ include "litellm.name" . }}
app.kubernetes.io/instance: {{ .Release.Name }}
{{- end }}
{{- define "litellm.ui.selectorLabels" -}}
app.kubernetes.io/name: {{ include "litellm.name" . }}-ui
app.kubernetes.io/instance: {{ .Release.Name }}
{{- end }}
{{/*
Create the name of the service account to use
*/}}
{{- define "litellm.serviceAccountName" -}}
{{- if .Values.serviceAccount.create }}
{{- default (include "litellm.fullname" .) .Values.serviceAccount.name }}
{{- else }}
{{- default "default" .Values.serviceAccount.name }}
{{- end }}
{{- end }}


@ -0,0 +1,7 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: {{ include "litellm.fullname" . }}-config
data:
config.yaml: |
{{ .Values.proxy_config | toYaml | indent 6 }}


@ -0,0 +1,230 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: {{ include "litellm.fullname" . }}-proxy
labels:
{{- include "litellm.labels" . | nindent 4 }}
spec:
{{- if not .Values.autoscaling.enabled }}
replicas: {{ .Values.replicaCount }}
{{- end }}
selector:
matchLabels:
{{- include "litellm.selectorLabels" . | nindent 6 }}
template:
metadata:
{{- with .Values.podAnnotations }}
annotations:
{{- toYaml . | nindent 8 }}
{{- end }}
labels:
{{- include "litellm.labels" . | nindent 8 }}
{{- with .Values.podLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
spec:
{{- with .Values.imagePullSecrets }}
imagePullSecrets:
{{- toYaml . | nindent 8 }}
{{- end }}
serviceAccountName: {{ include "litellm.serviceAccountName" . }}
securityContext:
{{- toYaml .Values.podSecurityContext | nindent 8 }}
initContainers:
- name: db-ready
securityContext:
{{- toYaml .Values.securityContext | nindent 12 }}
image: "docker.io/bitnami/postgresql:16.1.0-debian-11-r20"
imagePullPolicy: {{ .Values.image.pullPolicy }}
env:
{{- if .Values.db.deployStandalone }}
- name: DATABASE_USERNAME
valueFrom:
secretKeyRef:
name: {{ include "litellm.name" . }}-dbcredentials
key: username
- name: PGPASSWORD
valueFrom:
secretKeyRef:
name: {{ include "litellm.name" . }}-dbcredentials
key: password
- name: DATABASE_HOST
value: {{ .Release.Name }}-postgresql
- name: DATABASE_NAME
value: litellm
{{- else if .Values.db.useExisting }}
- name: DATABASE_USERNAME
valueFrom:
secretKeyRef:
name: {{ .Values.db.secret.name }}
key: {{ .Values.db.secret.usernameKey }}
- name: PGPASSWORD
valueFrom:
secretKeyRef:
name: {{ .Values.db.secret.name }}
key: {{ .Values.db.secret.passwordKey }}
- name: DATABASE_HOST
value: {{ .Values.db.endpoint }}
- name: DATABASE_NAME
value: litellm
{{- end }}
command:
- sh
- -c
- |
# Maximum wait time will be (limit * 2) seconds.
limit=60
current=0
ret=1
while [ $current -lt $limit ] && [ $ret -ne 0 ]; do
echo "Waiting for database to be ready $current"
psql -U $(DATABASE_USERNAME) -h $(DATABASE_HOST) -l
ret=$?
current=$(( $current + 1 ))
sleep 2
done
if [ $ret -eq 0 ]; then
echo "Database is ready"
else
echo "Database failed to become ready before we gave up waiting."
fi
{{ if .Values.securityContext.readOnlyRootFilesystem }}
volumeMounts:
- name: tmp
mountPath: /tmp
{{ end }}
containers:
- name: {{ include "litellm.name" . }}
securityContext:
{{- toYaml .Values.securityContext | nindent 12 }}
image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default (printf "main-%s" .Chart.AppVersion) }}"
imagePullPolicy: {{ .Values.image.pullPolicy }}
env:
- name: HOST
value: "0.0.0.0"
- name: PORT
value: {{ .Values.service.port | quote}}
{{- if .Values.db.deployStandalone }}
- name: DATABASE_USERNAME
valueFrom:
secretKeyRef:
name: {{ include "litellm.name" . }}-dbcredentials
key: username
- name: DATABASE_PASSWORD
valueFrom:
secretKeyRef:
name: {{ include "litellm.name" . }}-dbcredentials
key: password
- name: DATABASE_HOST
value: {{ .Release.Name }}-postgresql
- name: DATABASE_NAME
value: litellm
{{- else if .Values.db.useExisting }}
- name: DATABASE_USERNAME
valueFrom:
secretKeyRef:
name: {{ .Values.db.secret.name }}
key: {{ .Values.db.secret.usernameKey }}
- name: DATABASE_PASSWORD
valueFrom:
secretKeyRef:
name: {{ .Values.db.secret.name }}
key: {{ .Values.db.secret.passwordKey }}
- name: DATABASE_HOST
value: {{ .Values.db.endpoint }}
- name: DATABASE_NAME
value: {{ .Values.db.database }}
{{- end }}
- name: DATABASE_URL
value: "postgresql://$(DATABASE_USERNAME):$(DATABASE_PASSWORD)@$(DATABASE_HOST)/$(DATABASE_NAME)"
- name: PROXY_MASTER_KEY
valueFrom:
secretKeyRef:
name: {{ include "litellm.name" . }}-masterkey
key: masterkey
envFrom:
{{- range .Values.environmentSecrets }}
- secretRef:
name: {{ . }}
{{- end }}
args:
- --config
- /etc/litellm/config.yaml
# command:
# - bash
# - -c
# - |
# ls -la /etc/litellm/; cat /etc/litellm/config.yaml; export
# find / 2>/dev/null | grep -v -e '^/proc' -e '^/sys' -e '^/dev' >/tmp/before.list
# prisma generate
# find / 2>/dev/null | grep -v -e '^/proc' -e '^/sys' -e '^/dev' >/tmp/after.list
# diff -ruN /tmp/before.list /tmp/after.list
# sleep 3600
ports:
- name: http
containerPort: {{ .Values.service.port }}
protocol: TCP
livenessProbe:
httpGet:
path: /health/liveliness
port: http
readinessProbe:
httpGet:
path: /health/readiness
port: http
# Give the container time to start up. Up to 5 minutes (10 * 30 seconds)
startupProbe:
httpGet:
path: /health/readiness
port: http
failureThreshold: 30
periodSeconds: 10
resources:
{{- toYaml .Values.resources | nindent 12 }}
volumeMounts:
- name: litellm-config
mountPath: /etc/litellm/
{{ if .Values.securityContext.readOnlyRootFilesystem }}
- name: tmp
mountPath: /tmp
- name: cache
mountPath: /.cache
- name: npm
mountPath: /.npm
{{- end }}
{{- with .Values.volumeMounts }}
{{- toYaml . | nindent 12 }}
{{- end }}
volumes:
{{ if .Values.securityContext.readOnlyRootFilesystem }}
- name: tmp
emptyDir:
sizeLimit: 500Mi
- name: cache
emptyDir:
sizeLimit: 500Mi
- name: npm
emptyDir:
sizeLimit: 500Mi
{{- end }}
- name: litellm-config
configMap:
name: {{ include "litellm.fullname" . }}-config
items:
- key: "config.yaml"
path: "config.yaml"
{{- with .Values.volumes }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.nodeSelector }}
nodeSelector:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.affinity }}
affinity:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.tolerations }}
tolerations:
{{- toYaml . | nindent 8 }}
{{- end }}


@ -0,0 +1,89 @@
{{- if .Values.ui.enabled -}}
apiVersion: apps/v1
kind: Deployment
metadata:
name: {{ include "litellm.fullname" . }}-ui
labels:
{{- include "litellm.labels" . | nindent 4 }}
spec:
{{- if not .Values.ui.autoscaling.enabled }}
replicas: {{ .Values.ui.replicaCount }}
{{- end }}
selector:
matchLabels:
{{- include "litellm.ui.selectorLabels" . | nindent 6 }}
template:
metadata:
{{- with .Values.podAnnotations }}
annotations:
{{- toYaml . | nindent 8 }}
{{- end }}
labels:
{{- include "litellm.ui.labels" . | nindent 8 }}
{{- with .Values.ui.podLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
spec:
{{- with .Values.imagePullSecrets }}
imagePullSecrets:
{{- toYaml . | nindent 8 }}
{{- end }}
serviceAccountName: {{ include "litellm.serviceAccountName" . }}
securityContext:
{{- toYaml .Values.ui.podSecurityContext | nindent 8 }}
containers:
- name: {{ include "litellm.name" . }}-ui
securityContext:
{{- toYaml .Values.ui.securityContext | nindent 12 }}
image: "{{ .Values.ui.image.repository }}:{{ .Values.ui.image.tag | default (printf "main-%s" .Chart.AppVersion) }}"
imagePullPolicy: {{ .Values.ui.image.pullPolicy }}
env:
- name: BASE_URL
value: {{ (index .Values.ui.ingress.hosts 0).host | default "example.com" }}
ports:
- name: http
containerPort: {{ .Values.ui.service.port }}
protocol: TCP
livenessProbe:
httpGet:
path: /
port: http
readinessProbe:
httpGet:
path: /
port: http
# Give the container time to start up. Up to 5 minutes (10 * 30 seconds)
startupProbe:
httpGet:
path: /
port: http
failureThreshold: 30
periodSeconds: 10
resources:
{{- toYaml .Values.ui.resources | nindent 12 }}
volumeMounts:
- name: tmp
mountPath: /tmp
{{- with .Values.ui.volumeMounts }}
{{- toYaml . | nindent 12 }}
{{- end }}
volumes:
- name: tmp
emptyDir:
sizeLimit: 500Mi
{{- with .Values.ui.volumes }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.ui.nodeSelector }}
nodeSelector:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.ui.affinity }}
affinity:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.ui.tolerations }}
tolerations:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end -}}


@ -0,0 +1,32 @@
{{- if .Values.autoscaling.enabled }}
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
name: {{ include "litellm.fullname" . }}
labels:
{{- include "litellm.labels" . | nindent 4 }}
spec:
scaleTargetRef:
apiVersion: apps/v1
kind: Deployment
name: {{ include "litellm.fullname" . }}
minReplicas: {{ .Values.autoscaling.minReplicas }}
maxReplicas: {{ .Values.autoscaling.maxReplicas }}
metrics:
{{- if .Values.autoscaling.targetCPUUtilizationPercentage }}
- type: Resource
resource:
name: cpu
target:
type: Utilization
averageUtilization: {{ .Values.autoscaling.targetCPUUtilizationPercentage }}
{{- end }}
{{- if .Values.autoscaling.targetMemoryUtilizationPercentage }}
- type: Resource
resource:
name: memory
target:
type: Utilization
averageUtilization: {{ .Values.autoscaling.targetMemoryUtilizationPercentage }}
{{- end }}
{{- end }}


@ -0,0 +1,61 @@
{{- if .Values.ingress.enabled -}}
{{- $fullName := (printf "%s%s" (include "litellm.fullname" .) "-proxy") -}}
{{- $svcPort := .Values.service.port -}}
{{- if and .Values.ingress.className (not (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion)) }}
{{- if not (hasKey .Values.ingress.annotations "kubernetes.io/ingress.class") }}
{{- $_ := set .Values.ingress.annotations "kubernetes.io/ingress.class" .Values.ingress.className}}
{{- end }}
{{- end }}
{{- if semverCompare ">=1.19-0" .Capabilities.KubeVersion.GitVersion -}}
apiVersion: networking.k8s.io/v1
{{- else if semverCompare ">=1.14-0" .Capabilities.KubeVersion.GitVersion -}}
apiVersion: networking.k8s.io/v1beta1
{{- else -}}
apiVersion: extensions/v1beta1
{{- end }}
kind: Ingress
metadata:
name: {{ $fullName }}
labels:
{{- include "litellm.labels" . | nindent 4 }}
{{- with .Values.ingress.annotations }}
annotations:
{{- toYaml . | nindent 4 }}
{{- end }}
spec:
{{- if and .Values.ingress.className (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion) }}
ingressClassName: {{ .Values.ingress.className }}
{{- end }}
{{- if .Values.ingress.tls }}
tls:
{{- range .Values.ingress.tls }}
- hosts:
{{- range .hosts }}
- {{ . | quote }}
{{- end }}
secretName: {{ .secretName }}
{{- end }}
{{- end }}
rules:
{{- range .Values.ingress.hosts }}
- host: {{ .host | quote }}
http:
paths:
{{- range .paths }}
- path: {{ .path }}
{{- if and .pathType (semverCompare ">=1.18-0" $.Capabilities.KubeVersion.GitVersion) }}
pathType: {{ .pathType }}
{{- end }}
backend:
{{- if semverCompare ">=1.19-0" $.Capabilities.KubeVersion.GitVersion }}
service:
name: {{ $fullName }}
port:
number: {{ $svcPort }}
{{- else }}
serviceName: {{ $fullName }}
servicePort: {{ $svcPort }}
{{- end }}
{{- end }}
{{- end }}
{{- end }}


@ -0,0 +1,61 @@
{{- if .Values.ui.ingress.enabled -}}
{{- $fullName := (printf "%s%s" (include "litellm.fullname" .) "-ui") -}}
{{- $svcPort := .Values.ui.service.port -}}
{{- if and .Values.ui.ingress.className (not (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion)) }}
{{- if not (hasKey .Values.ui.ingress.annotations "kubernetes.io/ingress.class") }}
{{- $_ := set .Values.ui.ingress.annotations "kubernetes.io/ingress.class" .Values.ui.ingress.className}}
{{- end }}
{{- end }}
{{- if semverCompare ">=1.19-0" .Capabilities.KubeVersion.GitVersion -}}
apiVersion: networking.k8s.io/v1
{{- else if semverCompare ">=1.14-0" .Capabilities.KubeVersion.GitVersion -}}
apiVersion: networking.k8s.io/v1beta1
{{- else -}}
apiVersion: extensions/v1beta1
{{- end }}
kind: Ingress
metadata:
name: {{ $fullName }}
labels:
{{- include "litellm.ui.labels" . | nindent 4 }}
{{- with .Values.ui.ingress.annotations }}
annotations:
{{- toYaml . | nindent 4 }}
{{- end }}
spec:
{{- if and .Values.ui.ingress.className (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion) }}
ingressClassName: {{ .Values.ui.ingress.className }}
{{- end }}
{{- if .Values.ui.ingress.tls }}
tls:
{{- range .Values.ui.ingress.tls }}
- hosts:
{{- range .hosts }}
- {{ . | quote }}
{{- end }}
secretName: {{ .secretName }}
{{- end }}
{{- end }}
rules:
{{- range .Values.ui.ingress.hosts }}
- host: {{ .host | quote }}
http:
paths:
{{- range .paths }}
- path: {{ .path }}
{{- if and .pathType (semverCompare ">=1.18-0" $.Capabilities.KubeVersion.GitVersion) }}
pathType: {{ .pathType }}
{{- end }}
backend:
{{- if semverCompare ">=1.19-0" $.Capabilities.KubeVersion.GitVersion }}
service:
name: {{ $fullName }}
port:
number: {{ $svcPort }}
{{- else }}
serviceName: {{ $fullName }}
servicePort: {{ $svcPort }}
{{- end }}
{{- end }}
{{- end }}
{{- end }}


@ -0,0 +1,12 @@
{{- if .Values.db.deployStandalone -}}
apiVersion: v1
kind: Secret
metadata:
name: {{ include "litellm.name" . }}-dbcredentials
data:
# Password for the "postgres" user
postgres-password: {{ ( index .Values.postgresql.auth "postgres-password") | default "litellm" | b64enc }}
username: {{ .Values.postgresql.auth.username | default "litellm" | b64enc }}
password: {{ .Values.postgresql.auth.password | default "litellm" | b64enc }}
type: Opaque
{{- end -}}


@ -0,0 +1,8 @@
{{ $masterkey := (.Values.masterkey | default (randAlphaNum 17)) }}
apiVersion: v1
kind: Secret
metadata:
name: {{ include "litellm.name" . }}-masterkey
data:
masterkey: {{ $masterkey | b64enc }}
type: Opaque


@ -0,0 +1,15 @@
apiVersion: v1
kind: Service
metadata:
name: {{ include "litellm.fullname" . }}-proxy
labels:
{{- include "litellm.labels" . | nindent 4 }}
spec:
type: {{ .Values.service.type }}
ports:
- port: {{ .Values.service.port }}
targetPort: http
protocol: TCP
name: http
selector:
{{- include "litellm.selectorLabels" . | nindent 4 }}


@ -0,0 +1,17 @@
{{- if .Values.ui.enabled -}}
apiVersion: v1
kind: Service
metadata:
name: {{ include "litellm.fullname" . }}-ui
labels:
{{- include "litellm.labels" . | nindent 4 }}
spec:
type: {{ .Values.ui.service.type }}
ports:
- port: {{ .Values.ui.service.port }}
targetPort: http
protocol: TCP
name: http
selector:
{{- include "litellm.ui.selectorLabels" . | nindent 4 }}
{{ end -}}


@ -0,0 +1,13 @@
{{- if .Values.serviceAccount.create -}}
apiVersion: v1
kind: ServiceAccount
metadata:
name: {{ include "litellm.serviceAccountName" . }}
labels:
{{- include "litellm.labels" . | nindent 4 }}
{{- with .Values.serviceAccount.annotations }}
annotations:
{{- toYaml . | nindent 4 }}
{{- end }}
automountServiceAccountToken: {{ .Values.serviceAccount.automount }}
{{- end }}


@ -0,0 +1,15 @@
apiVersion: v1
kind: Pod
metadata:
name: "{{ include "litellm.fullname" . }}-test-connection"
labels:
{{- include "litellm.labels" . | nindent 4 }}
annotations:
"helm.sh/hook": test
spec:
containers:
- name: wget
image: busybox
command: ['wget']
args: ['{{ include "litellm.fullname" . }}:{{ .Values.service.port }}']
restartPolicy: Never


@ -0,0 +1,219 @@
# Default values for litellm.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.
replicaCount: 1
image:
repository: ghcr.io/berriai/litellm
pullPolicy: IfNotPresent
# Overrides the image tag whose default is the chart appVersion.
# tag: "main-latest"
tag: ""
# Image and tag used for the init container to check and wait for the
# readiness of the postgres database.
dbReadyImage: docker.io/bitnami/postgresql
dbReadyTag: ""
imagePullSecrets: []
nameOverride: "litellm"
fullnameOverride: ""
serviceAccount:
# Specifies whether a service account should be created
create: false
# Automatically mount a ServiceAccount's API credentials?
automount: true
# Annotations to add to the service account
annotations: {}
# The name of the service account to use.
# If not set and create is true, a name is generated using the fullname template
name: ""
podAnnotations: {}
podLabels: {}
# At the time of writing, the litellm docker image requires write access to the
# filesystem on startup so that prisma can install some dependencies.
podSecurityContext: {}
securityContext: {}
# capabilities:
# drop:
# - ALL
# readOnlyRootFilesystem: false
# runAsNonRoot: true
# runAsUser: 1000
# A list of Kubernetes Secret objects that will be exported to the LiteLLM proxy
# pod as environment variables. These secrets can then be referenced in the
# configuration file (or "litellm" ConfigMap) with `os.environ/<Env Var Name>`
environmentSecrets: []
# - litellm-envsecrets
service:
type: ClusterIP
port: 8000
ingress:
enabled: true
className: "nginx"
annotations: {}
# kubernetes.io/ingress.class: nginx
# kubernetes.io/tls-acme: "true"
hosts:
- host: api.example.local
paths:
- path: /
pathType: ImplementationSpecific
tls: []
# - secretName: chart-example-tls
# hosts:
# - chart-example.local
# The elements within proxy_config are rendered as config.yaml for the proxy
# Examples: https://github.com/BerriAI/litellm/tree/main/litellm/proxy/example_config_yaml
# Reference: https://docs.litellm.ai/docs/proxy/configs
proxy_config:
model_list:
# At least one model must exist for the proxy to start.
- model_name: gpt-3.5-turbo
litellm_params:
model: gpt-3.5-turbo
api_key: eXaMpLeOnLy
general_settings:
master_key: os.environ/PROXY_MASTER_KEY
resources: {}
# We usually recommend not to specify default resources and to leave this as a conscious
# choice for the user. This also increases chances charts run on environments with little
# resources, such as Minikube. If you do want to specify resources, uncomment the following
# lines, adjust them as necessary, and remove the curly braces after 'resources:'.
# limits:
# cpu: 100m
# memory: 128Mi
# requests:
# cpu: 100m
# memory: 128Mi
autoscaling:
enabled: false
minReplicas: 1
maxReplicas: 100
targetCPUUtilizationPercentage: 80
# targetMemoryUtilizationPercentage: 80
# Additional volumes on the output Deployment definition.
volumes: []
# - name: foo
# secret:
# secretName: mysecret
# optional: false
# Additional volumeMounts on the output Deployment definition.
volumeMounts: []
# - name: foo
# mountPath: "/etc/foo"
# readOnly: true
nodeSelector: {}
tolerations: []
affinity: {}
db:
# Use an existing postgres server/cluster
useExisting: false
# How to connect to the existing postgres server/cluster
endpoint: localhost
database: litellm
secret:
name: postgres
usernameKey: username
passwordKey: password
# Use the Stackgres Helm chart to deploy an instance of a Stackgres cluster.
# The Stackgres Operator must already be installed within the target
# Kubernetes cluster.
# TODO: Stackgres deployment currently unsupported
useStackgresOperator: false
# Use the Postgres Helm chart to create a single node, stand alone postgres
# instance. See the "postgresql" top level key for additional configuration.
deployStandalone: true
# Settings for Bitnami postgresql chart (if db.deployStandalone is true, ignored
# otherwise)
postgresql:
architecture: standalone
auth:
username: litellm
database: litellm
# You should override these on the helm command line with
# `--set postgresql.auth.postgres-password=<some good password>,postgresql.auth.password=<some good password>`
password: NoTaGrEaTpAsSwOrD
postgres-password: NoTaGrEaTpAsSwOrD
# A secret is created by this chart (litellm-helm) with the credentials that
# the new Postgres instance should use.
existingSecret: litellm-dbcredentials
secretKeys:
userPasswordKey: password
ui:
enabled: true
replicaCount: 1
autoscaling:
enabled: false
image:
repository: ghcr.io/berriai/litellm-ui
pullPolicy: IfNotPresent
# Overrides the image tag whose default is the chart appVersion.
# tag: "main-latest"
# TODO: Switch to BerriAI repo and tags if/when they provide a ui image
# https://github.com/BerriAI/litellm/pull/1505
tag: ""
service:
type: ClusterIP
port: 8501
ingress:
enabled: true
className: "nginx"
annotations: {}
hosts:
- host: ui.example.local
paths:
- path: /
pathType: ImplementationSpecific
tls: []
podAnnotations: {}
podLabels: {}
podSecurityContext:
fsGroup: 1000
securityContext:
capabilities:
drop:
- ALL
readOnlyRootFilesystem: true
runAsNonRoot: true
runAsUser: 1000
resources: {}
volumes: []
volumeMounts: []
nodeSelector: {}
tolerations: []
affinity: {}


@ -1,12 +0,0 @@
version: "3.9"
services:
litellm:
image: ghcr.io/berriai/litellm:main
ports:
- "8000:8000" # Map the container port to the host, change the host port if necessary
volumes:
- ./litellm-config.yaml:/app/config.yaml # Mount the local configuration file
# You can change the port or number of workers as per your requirements, or pass any other supported CLI argument. Make sure the port passed here matches the container port defined above under `ports`
command: [ "--config", "/app/config.yaml", "--port", "8000", "--num_workers", "8" ]
# ...rest of your docker-compose config if any

docker-compose.yml Normal file

@ -0,0 +1,15 @@
version: "3.9"
services:
litellm:
image: ghcr.io/berriai/litellm:main-latest
volumes:
- ./proxy_server_config.yaml:/app/proxy_server_config.yaml # mount your litellm config.yaml
ports:
- "4000:4000"
environment:
- AZURE_API_KEY=sk-123
litellm-ui:
image: ghcr.io/berriai/litellm-ui:main-latest
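Once this compose file is up, the `litellm` service listens on port 4000 and speaks the OpenAI API, so it can be exercised with the standard `openai` client. A minimal sketch, assuming a `gpt-3.5-turbo` entry exists in the mounted `proxy_server_config.yaml` and `sk-1234` is accepted as a key by the proxy:
```python
import openai

client = openai.OpenAI(
    base_url="http://localhost:4000",  # port published by the litellm service above
    api_key="sk-1234",                 # placeholder proxy key
)

response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hello from docker-compose!"}],
)
print(response.choices[0].message.content)
```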


@ -1,11 +1,17 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Caching - In-Memory, Redis, s3
# Caching - In-Memory, Redis, s3, Redis Semantic Cache
[**See Code**](https://github.com/BerriAI/litellm/blob/main/litellm/caching.py)
## Initialize Cache - In Memory, Redis, s3 Bucket
:::info
Need to use Caching on LiteLLM Proxy Server? Doc here: [Caching Proxy Server](https://docs.litellm.ai/docs/proxy/caching)
:::
## Initialize Cache - In Memory, Redis, s3 Bucket, Redis Semantic Cache
<Tabs>
@ -18,7 +24,7 @@ pip install redis
```
For the hosted version you can set up your own Redis DB here: https://app.redislabs.com/
### Quick Start
```python
import litellm
from litellm import completion
@ -55,7 +61,7 @@ Set AWS environment variables
AWS_ACCESS_KEY_ID = "AKI*******"
AWS_SECRET_ACCESS_KEY = "WOl*****"
```
### Quick Start
```python
import litellm
from litellm import completion
@ -80,6 +86,66 @@ response2 = completion(
</TabItem>
<TabItem value="redis-sem" label="redis-semantic cache">
Install redisvl
```shell
pip install redisvl==0.0.7
```
For the hosted version you can set up your own Redis DB here: https://app.redislabs.com/
```python
import os
import random
import litellm
from litellm import completion
from litellm.caching import Cache
random_number = random.randint(
1, 100000
) # add a random number to ensure it's always adding / reading from cache
print("testing semantic caching")
litellm.cache = Cache(
type="redis-semantic",
host=os.environ["REDIS_HOST"],
port=os.environ["REDIS_PORT"],
password=os.environ["REDIS_PASSWORD"],
similarity_threshold=0.8, # similarity threshold for cache hits, 0 == no similarity, 1 = exact matches, 0.5 == 50% similarity
redis_semantic_cache_embedding_model="text-embedding-ada-002", # this model is passed to litellm.embedding(), any litellm.embedding() model is supported here
)
response1 = completion(
model="gpt-3.5-turbo",
messages=[
{
"role": "user",
"content": f"write a one sentence poem about: {random_number}",
}
],
max_tokens=20,
)
print(f"response1: {response1}")
random_number = random.randint(1, 100000)
response2 = completion(
model="gpt-3.5-turbo",
messages=[
{
"role": "user",
"content": f"write a one sentence poem about: {random_number}",
}
],
max_tokens=20,
)
print(f"response2: {response1}")
assert response1.id == response2.id
# response1 == response2, response 1 is cached
```
</TabItem>
<TabItem value="in-mem" label="in memory cache">
### Quick Start


@ -150,5 +150,12 @@ litellm.register_model(model_cost=
"https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json")
```
**Don't pull hosted model_cost_map**
If you have firewalls and want to just use the local copy of the model cost map, you can do so like this:
```bash
export LITELLM_LOCAL_MODEL_COST_MAP="True"
```
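For example, a minimal sketch of using the local map (assuming the env var needs to be set before `litellm` is imported, and that `litellm.model_cost` exposes the loaded pricing):
```python
import os

# read pricing from the bundled local copy instead of the hosted map
os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"

import litellm

# pricing now comes from the local model cost map shipped with the package
print(litellm.model_cost["gpt-3.5-turbo"])
```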
Note: this means you will need to upgrade to get updated pricing and newer models.

View file

@ -13,8 +13,8 @@ response = embedding(model='text-embedding-ada-002', input=["good morning from l
- `model`: *string* - ID of the model to use. `model='text-embedding-ada-002'`
- `input`: *array* - Input text to embed, encoded as a string or array of tokens. To embed multiple inputs in a single request, pass an array of strings or array of token arrays. The input must not exceed the max input tokens for the model (8192 tokens for text-embedding-ada-002), cannot be an empty string, and any array must be 2048 dimensions or less.
```
- `input`: *string or array* - Input text to embed, encoded as a string or array of tokens. To embed multiple inputs in a single request, pass an array of strings or array of token arrays. The input must not exceed the max input tokens for the model (8192 tokens for text-embedding-ada-002), cannot be an empty string, and any array must be 2048 dimensions or less.
```python
input=["good morning from litellm"]
```
@ -22,7 +22,11 @@ input=["good morning from litellm"]
- `user`: *string (optional)* A unique identifier representing your end-user,
- `timeout`: *integer* - The maximum time, in seconds, to wait for the API to respond. Defaults to 600 seconds (10 minutes).
- `dimensions`: *integer (Optional)* The number of dimensions the resulting output embeddings should have. Only supported in OpenAI/Azure text-embedding-3 and later models.
- `encoding_format`: *string (Optional)* The format to return the embeddings in. Can be either `"float"` or `"base64"`. Defaults to `encoding_format="float"`
- `timeout`: *integer (Optional)* - The maximum time, in seconds, to wait for the API to respond. Defaults to 600 seconds (10 minutes).
- `api_base`: *string (optional)* - The api endpoint you want to call the model with
@ -66,11 +70,18 @@ input=["good morning from litellm"]
from litellm import embedding
import os
os.environ['OPENAI_API_KEY'] = ""
response = embedding('text-embedding-ada-002', input=["good morning from litellm"])
response = embedding(
model="text-embedding-3-small",
input=["good morning from litellm", "this is another item"],
metadata={"anything": "good day"},
dimensions=5 # Only supported in text-embedding-3 and later models.
)
```
| Model Name | Function Call | Required OS Variables |
|----------------------|---------------------------------------------|--------------------------------------|
| text-embedding-3-small | `embedding('text-embedding-3-small', input)` | `os.environ['OPENAI_API_KEY']` |
| text-embedding-3-large | `embedding('text-embedding-3-large', input)` | `os.environ['OPENAI_API_KEY']` |
| text-embedding-ada-002 | `embedding('text-embedding-ada-002', input)` | `os.environ['OPENAI_API_KEY']` |
## Azure OpenAI Embedding Models

View file

@ -0,0 +1,15 @@
# Enterprise
LiteLLM offers dedicated enterprise support.
This covers:
- **Feature Prioritization**
- **Custom Integrations**
- **Professional Support - Dedicated discord + slack**
- **Custom SLAs**
:::info
[Talk to founders](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat)
:::

View file

@ -131,3 +131,23 @@ response = image_generation(
prompt="cute baby otter"
)
```
## Bedrock - Stable Diffusion
Use this for Stable Diffusion on Bedrock
### Usage
```python
import os
from litellm import image_generation
os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""
response = image_generation(
prompt="A cute baby sea otter",
model="bedrock/stability.stable-diffusion-xl-v0",
)
print(f"response: {response}")
```

View file

@ -5,10 +5,14 @@ import TabItem from '@theme/TabItem';
https://github.com/BerriAI/litellm
import QuickStart from '../src/components/QuickStart.js'
## **Call 100+ LLMs using the same Input/Output Format**
- Translate inputs to provider's `completion`, `embedding`, and `image_generation` endpoints
- [Consistent output](https://docs.litellm.ai/docs/completion/output), text responses will always be available at `['choices'][0]['message']['content']`
- Retry/fallback logic across multiple deployments (e.g. Azure/OpenAI) - [Router](https://docs.litellm.ai/docs/routing)
- Track spend & set budgets per project [OpenAI Proxy Server](https://docs.litellm.ai/docs/simple_proxy)
## Basic usage
<a target="_blank" href="https://colab.research.google.com/github/BerriAI/litellm/blob/main/cookbook/liteLLM_Getting_Started.ipynb">
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
@ -157,9 +161,6 @@ response = completion(
messages=[{ "content": "Hello, how are you?","role": "user"}],
stream=True,
)
for chunk in response:
print(chunk)
```
</TabItem>
@ -177,9 +178,6 @@ response = completion(
messages=[{ "content": "Hello, how are you?","role": "user"}],
stream=True,
)
for chunk in response:
print(chunk)
```
</TabItem>
@ -199,9 +197,6 @@ response = completion(
messages=[{ "content": "Hello, how are you?","role": "user"}],
stream=True,
)
for chunk in response:
print(chunk)
```
</TabItem>
@ -222,9 +217,7 @@ response = completion(
stream=True,
)
for chunk in response:
print(chunk)
print(response)
```
</TabItem>
@ -246,9 +239,6 @@ response = completion(
messages = [{ "content": "Hello, how are you?","role": "user"}],
stream=True,
)
for chunk in response:
print(chunk)
```
</TabItem>
@ -265,9 +255,6 @@ response = completion(
api_base="http://localhost:11434",
stream=True,
)
for chunk in response:
print(chunk)
```
</TabItem>
<TabItem value="or" label="Openrouter">
@ -284,9 +271,6 @@ response = completion(
messages = [{ "content": "Hello, how are you?","role": "user"}],
stream=True,
)
for chunk in response:
print(chunk)
```
</TabItem>
@ -327,34 +311,8 @@ litellm.success_callback = ["langfuse", "llmonitor"] # log input/output to langf
response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}])
```
## Calculate Costs, Usage, Latency
Pass the completion response to `litellm.completion_cost(completion_response=response)` and get the cost
```python
from litellm import completion, completion_cost
import os
os.environ["OPENAI_API_KEY"] = "your-api-key"
response = completion(
model="gpt-3.5-turbo",
messages=[{ "content": "Hello, how are you?","role": "user"}]
)
cost = completion_cost(completion_response=response)
print("Cost for completion call with gpt-3.5-turbo: ", f"${float(cost):.10f}")
```
**Output**
```shell
Cost for completion call with gpt-3.5-turbo: $0.0000775000
```
### Track Costs, Usage, Latency for streaming
We use a custom callback function for this - more info on custom callbacks: https://docs.litellm.ai/docs/observability/custom_callback
- We define a callback function to calculate cost `def track_cost_callback()`
- In `def track_cost_callback()` we check if the stream is complete - `if "complete_streaming_response" in kwargs`
- Use `litellm.completion_cost()` to calculate cost, once the stream is complete
## Track Costs, Usage, Latency for streaming
Use a callback function for this - more info on custom callbacks: https://docs.litellm.ai/docs/observability/custom_callback
```python
import litellm
@ -366,18 +324,8 @@ def track_cost_callback(
start_time, end_time # start/end time
):
try:
# check if it has collected an entire stream response
if "complete_streaming_response" in kwargs:
# for tracking streaming cost we pass the "messages" and the output_text to litellm.completion_cost
completion_response=kwargs["complete_streaming_response"]
input_text = kwargs["messages"]
output_text = completion_response["choices"][0]["message"]["content"]
response_cost = litellm.completion_cost(
model = kwargs["model"],
messages = input_text,
completion=output_text
)
print("streaming response_cost", response_cost)
response_cost = kwargs.get("response_cost", 0)
print("streaming response_cost", response_cost)
except:
pass
# set callback
@ -400,6 +348,8 @@ response = completion(
Track spend across multiple projects/people
![ui_3](https://github.com/BerriAI/litellm/assets/29436595/47c97d5e-b9be-4839-b28c-43d7f4f10033)
The proxy provides:
1. [Hooks for auth](https://docs.litellm.ai/docs/proxy/virtual_keys#custom-auth)
2. [Hooks for logging](https://docs.litellm.ai/docs/proxy/logging#step-1---create-your-custom-litellm-callback-class)
@ -436,8 +386,7 @@ response = client.chat.completions.create(model="gpt-3.5-turbo", messages = [
print(response)
```
## More details
* [exception mapping](./exception_mapping.md)
* [retries + model fallbacks for completion()](./completion/reliable_completions.md)
* [tutorial for model fallbacks with completion()](./tutorials/fallbacks.md)
* [proxy virtual keys & spend management](./tutorials/fallbacks.md)

View file

@ -27,6 +27,7 @@ Use just 2 lines of code, to instantly log your responses **across all providers
Get your Langfuse API Keys from https://cloud.langfuse.com/
```python
litellm.success_callback = ["langfuse"]
litellm.failure_callback = ["langfuse"] # logs errors to langfuse
```
```python
# pip install langfuse
@ -93,7 +94,7 @@ print(response)
```
### Set Custom Trace ID, Trace User ID
### Set Custom Trace ID, Trace User ID and Tags
Pass `trace_id`, `trace_user_id` in `metadata`
@ -122,6 +123,8 @@ response = completion(
"generation_id": "gen-id22", # set langfuse Generation ID
"trace_id": "trace-id22", # set langfuse Trace ID
"trace_user_id": "user-id2", # set langfuse Trace User ID
"session_id": "session-1", # set langfuse Session ID
"tags": ["tag1", "tag2"] # set langfuse Tags
},
)

View file

@ -74,6 +74,8 @@ response = litellm.completion(
| gpt-4-32k | `completion('azure/<your deployment name>', messages)` |
| gpt-4-32k-0314 | `completion('azure/<your deployment name>', messages)` |
| gpt-4-32k-0613 | `completion('azure/<your deployment name>', messages)` |
| gpt-4-1106-preview | `completion('azure/<your deployment name>', messages)` |
| gpt-4-0125-preview | `completion('azure/<your deployment name>', messages)` |
| gpt-3.5-turbo | `completion('azure/<your deployment name>', messages)` |
| gpt-3.5-turbo-0301 | `completion('azure/<your deployment name>', messages)` |
| gpt-3.5-turbo-0613 | `completion('azure/<your deployment name>', messages)` |

View file

@ -197,7 +197,7 @@ response = completion(
### SSO Login (AWS Profile)
- Set `AWS_PROFILE` environment variable
- Make bedrock completion call
- Make bedrock completion call
```python
import os
from litellm import completion
@ -208,11 +208,24 @@ response = completion(
)
```
### STS based Auth
or pass `aws_profile_name`:
```python
import os
from litellm import completion
response = completion(
model="bedrock/anthropic.claude-instant-v1",
messages=[{ "content": "Hello, how are you?","role": "user"}],
aws_profile_name="dev-profile",
)
```
### STS based Auth
- Set `aws_role_name` and `aws_session_name` in completion() / embedding() function
Make the bedrock completion call
Make the bedrock completion call
```python
from litellm import completion
@ -315,3 +328,50 @@ print(response)
| Titan Embeddings - G1 | `embedding(model="bedrock/amazon.titan-embed-text-v1", input=input)` |
| Cohere Embeddings - English | `embedding(model="bedrock/cohere.embed-english-v3", input=input)` |
| Cohere Embeddings - Multilingual | `embedding(model="bedrock/cohere.embed-multilingual-v3", input=input)` |
## Image Generation
Use this for Stable Diffusion on Bedrock
### Usage
```python
import os
from litellm import image_generation
os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""
response = image_generation(
prompt="A cute baby sea otter",
model="bedrock/stability.stable-diffusion-xl-v0",
)
print(f"response: {response}")
```
**Set optional params**
```python
import os
from litellm import image_generation
os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""
response = image_generation(
prompt="A cute baby sea otter",
model="bedrock/stability.stable-diffusion-xl-v0",
### OPENAI-COMPATIBLE ###
size="128x512", # width=128, height=512
### PROVIDER-SPECIFIC ### see `AmazonStabilityConfig` in bedrock.py for all params
seed=30
)
print(f"response: {response}")
```
## Supported AWS Bedrock Image Generation Models
| Model Name | Function Call |
|----------------------|---------------------------------------------|
| Stable Diffusion - v0 | `image_generation(model="bedrock/stability.stable-diffusion-xl-v0", prompt=prompt)` |
| Stable Diffusion - v1 | `image_generation(model="bedrock/stability.stable-diffusion-xl-v1", prompt=prompt)` |

View file

@ -34,6 +34,7 @@ os.environ["OPENAI_API_BASE"] = "openai-api-base" # OPTIONAL
| Model Name | Function Call |
|-----------------------|-----------------------------------------------------------------|
| gpt-4-0125-preview | `response = completion(model="gpt-4-0125-preview", messages=messages)` |
| gpt-4-1106-preview | `response = completion(model="gpt-4-1106-preview", messages=messages)` |
| gpt-3.5-turbo-1106 | `response = completion(model="gpt-3.5-turbo-1106", messages=messages)` |
| gpt-3.5-turbo | `response = completion(model="gpt-3.5-turbo", messages=messages)` |
@ -173,6 +174,31 @@ response = completion(
messages=[{ "content": "Hello, how are you?","role": "user"}]
)
```
### Set `ssl_verify=False`
This is done by setting your own `httpx.Client`
- For `litellm.completion` set `litellm.client_session=httpx.Client(verify=False)`
- For `litellm.acompletion` set `litellm.aclient_session=httpx.AsyncClient(verify=False)`
```python
import litellm, httpx

# requires OPENAI_API_KEY in your env
messages = [{"role": "user", "content": "Hey, how's it going?"}]

# for completion
litellm.client_session = httpx.Client(verify=False)
response = litellm.completion(
model="gpt-3.5-turbo",
messages=messages,
)
# for acompletion - call this inside an async function and await it
litellm.aclient_session = httpx.AsyncClient(verify=False)
response = await litellm.acompletion(
model="gpt-3.5-turbo",
messages=messages,
)
```
### Using Helicone Proxy with LiteLLM
```python
import os

View file

@ -1,4 +1,4 @@
# VertexAI - Google [Gemini]
# VertexAI - Google [Gemini, Model Garden]
<a target="_blank" href="https://colab.research.google.com/github/BerriAI/litellm/blob/main/cookbook/liteLLM_VertextAI_Example.ipynb">
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
@ -20,6 +20,27 @@ litellm.vertex_location = "us-central1" # proj location
response = litellm.completion(model="gemini-pro", messages=[{"role": "user", "content": "write code for saying hi from LiteLLM"}])
```
## OpenAI Proxy Usage
1. Modify the config.yaml
```yaml
litellm_settings:
vertex_project: "hardy-device-38811" # Your Project ID
vertex_location: "us-central1" # proj location
model_list:
- model_name: team1-gemini-pro
litellm_params:
model: gemini-pro
```
2. Start the proxy
```bash
$ litellm --config /path/to/config.yaml
```
## Set Vertex Project & Vertex Location
All calls using Vertex AI require the following parameters:
* Your Project ID
@ -46,16 +67,39 @@ os.environ["VERTEXAI_LOCATION"] = "us-central1" # Your Location
# set directly on module
litellm.vertex_location = "us-central1" # Your Location
```
## Model Garden
| Model Name | Function Call |
|------------------|--------------------------------------|
| llama2 | `completion('vertex_ai/<endpoint_id>', messages)` |
#### Using Model Garden
```python
from litellm import completion
import os
## set ENV variables
os.environ["VERTEXAI_PROJECT"] = "hardy-device-38811"
os.environ["VERTEXAI_LOCATION"] = "us-central1"
response = completion(
model="vertex_ai/<your-endpoint-id>",
messages=[{ "content": "Hello, how are you?","role": "user"}]
)
```
## Gemini Pro
| Model Name | Function Call |
|------------------|--------------------------------------|
| gemini-pro | `completion('gemini-pro', messages)` |
| gemini-pro | `completion('gemini-pro', messages)`, `completion('vertex_ai/gemini-pro', messages)` |
## Gemini Pro Vision
| Model Name | Function Call |
|------------------|--------------------------------------|
| gemini-pro-vision | `completion('gemini-pro-vision', messages)` |
| gemini-pro-vision | `completion('gemini-pro-vision', messages)`, `completion('vertex_ai/gemini-pro-vision', messages)`|
#### Using Gemini Pro Vision
@ -93,6 +137,7 @@ response = litellm.completion(
print(response)
```
## Chat Models
| Model Name | Function Call |
|------------------|--------------------------------------|

View file

@ -1,6 +1,13 @@
# Slack Alerting
Get alerts for failed db read/writes, hanging api calls, failed api calls.
Get alerts for:
- hanging LLM api calls
- failed LLM api calls
- slow LLM api calls
- budget tracking per key/user:
- When a User/Key crosses their Budget
- When a User/Key is 15% away from crossing their Budget
- failed db read/writes
## Quick Start

View file

@ -7,16 +7,17 @@ Cache LLM Responses
LiteLLM supports:
- In Memory Cache
- Redis Cache
- Redis Semantic Cache
- s3 Bucket Cache
## Quick Start - Redis, s3 Cache
## Quick Start - Redis, s3 Cache, Semantic Cache
<Tabs>
<TabItem value="redis" label="redis cache">
Caching can be enabled by adding the `cache` key in the `config.yaml`
### Step 1: Add `cache` to the config.yaml
#### Step 1: Add `cache` to the config.yaml
```yaml
model_list:
- model_name: gpt-3.5-turbo
@ -31,7 +32,7 @@ litellm_settings:
cache: True # set cache responses to True, litellm defaults to using a redis cache
```
### Step 2: Add Redis Credentials to .env
#### Step 2: Add Redis Credentials to .env
Set either `REDIS_URL` or the `REDIS_HOST` in your os environment, to enable caching.
```shell
@ -49,7 +50,7 @@ REDIS_<redis-kwarg-name> = ""
```
[**See how it's read from the environment**](https://github.com/BerriAI/litellm/blob/4d7ff1b33b9991dcf38d821266290631d9bcd2dd/litellm/_redis.py#L40)
### Step 3: Run proxy with config
#### Step 3: Run proxy with config
```shell
$ litellm --config /path/to/config.yaml
```
@ -57,7 +58,7 @@ $ litellm --config /path/to/config.yaml
<TabItem value="s3" label="s3 cache">
### Step 1: Add `cache` to the config.yaml
#### Step 1: Add `cache` to the config.yaml
```yaml
model_list:
- model_name: gpt-3.5-turbo
@ -79,7 +80,57 @@ litellm_settings:
s3_endpoint_url: https://s3.amazonaws.com # [OPTIONAL] S3 endpoint URL, if you want to use Backblaze/cloudflare s3 buckets
```
### Step 2: Run proxy with config
#### Step 2: Run proxy with config
```shell
$ litellm --config /path/to/config.yaml
```
</TabItem>
<TabItem value="redis-sem" label="redis semantic cache">
Caching can be enabled by adding the `cache` key in the `config.yaml`
#### Step 1: Add `cache` to the config.yaml
```yaml
model_list:
- model_name: gpt-3.5-turbo
litellm_params:
model: gpt-3.5-turbo
- model_name: azure-embedding-model
litellm_params:
model: azure/azure-embedding-model
api_base: os.environ/AZURE_API_BASE
api_key: os.environ/AZURE_API_KEY
api_version: "2023-07-01-preview"
litellm_settings:
set_verbose: True
cache: True # set cache responses to True, litellm defaults to using a redis cache
cache_params:
type: "redis-semantic"
similarity_threshold: 0.8 # similarity threshold for semantic cache
redis_semantic_cache_embedding_model: azure-embedding-model # set this to a model_name set in model_list
```
#### Step 2: Add Redis Credentials to .env
Set either `REDIS_URL` or the `REDIS_HOST` in your os environment, to enable caching.
```shell
REDIS_URL = "" # REDIS_URL='redis://username:password@hostname:port/database'
## OR ##
REDIS_HOST = "" # REDIS_HOST='redis-18841.c274.us-east-1-3.ec2.cloud.redislabs.com'
REDIS_PORT = "" # REDIS_PORT='18841'
REDIS_PASSWORD = "" # REDIS_PASSWORD='liteLlmIsAmazing'
```
**Additional kwargs**
You can pass in any additional redis.Redis arg, by storing the variable + value in your os environment, like this:
```shell
REDIS_<redis-kwarg-name> = ""
```
#### Step 3: Run proxy with config
```shell
$ litellm --config /path/to/config.yaml
```
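To sanity-check the semantic cache, a sketch like the one below can be used (the proxy URL and key are placeholders; the second, similarly-worded request should be answered from the cache if it clears the `similarity_threshold`):
```python
import openai

# placeholders - point this at your running proxy
client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:8000")

# two similar prompts - with redis-semantic caching enabled, the second
# response should be served from the cache
for prompt in ["write a one sentence poem about litellm", "write a 1 sentence poem about litellm"]:
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
    )
    print(response.id)
```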
@ -160,9 +211,10 @@ litellm_settings:
The proxy supports the following cache-controls:
- `ttl`: Will cache the response for the user-defined amount of time (in seconds).
- `s-maxage`: Will only accept cached responses that are within user-defined range (in seconds).
- `no-cache`: Will not return a cached response, but instead call the actual endpoint.
- `ttl`: *Optional(int)* - Will cache the response for the user-defined amount of time (in seconds).
- `s-maxage`: *Optional(int)* Will only accept cached responses that are within user-defined range (in seconds).
- `no-cache`: *Optional(bool)* Will not return a cached response, but instead call the actual endpoint.
- `no-store`: *Optional(bool)* Will not cache the response.
[Let us know if you need more](https://github.com/BerriAI/litellm/issues/1218)

View file

@ -22,18 +22,22 @@ Set a model alias for your deployments.
In the `config.yaml` the model_name parameter is the user-facing name to use for your deployment.
In the config below requests with:
In the config below:
- `model_name`: the name to pass TO litellm from the external client
- `litellm_params.model`: the model string passed to the litellm.completion() function
E.g.:
- `model=vllm-models` will route to `openai/facebook/opt-125m`.
- `model=gpt-3.5-turbo` will load balance between `azure/gpt-turbo-small-eu` and `azure/gpt-turbo-small-ca`
```yaml
model_list:
- model_name: gpt-3.5-turbo # user-facing model alias
- model_name: gpt-3.5-turbo ### RECEIVED MODEL NAME ###
litellm_params: # all params accepted by litellm.completion() - https://docs.litellm.ai/docs/completion/input
model: azure/gpt-turbo-small-eu
model: azure/gpt-turbo-small-eu ### MODEL NAME sent to `litellm.completion()` ###
api_base: https://my-endpoint-europe-berri-992.openai.azure.com/
api_key: "os.environ/AZURE_API_KEY_EU" # does os.getenv("AZURE_API_KEY_EU")
rpm: 6 # Rate limit for this deployment: in requests per minute (rpm)
rpm: 6 # [OPTIONAL] Rate limit for this deployment: in requests per minute (rpm)
- model_name: bedrock-claude-v1
litellm_params:
model: bedrock/anthropic.claude-instant-v1
@ -43,6 +47,11 @@ model_list:
api_base: https://my-endpoint-canada-berri992.openai.azure.com/
api_key: "os.environ/AZURE_API_KEY_CA"
rpm: 6
- model_name: anthropic-claude
litellm_params:
model="bedrock/anthropic.claude-instant-v1"
### [OPTIONAL] SET AWS REGION ###
aws_region_name="us-east-1"
- model_name: vllm-models
litellm_params:
model: openai/facebook/opt-125m # the `openai/` prefix tells litellm it's openai compatible
@ -58,6 +67,11 @@ litellm_settings: # module level litellm settings - https://github.com/BerriAI/l
general_settings:
master_key: sk-1234 # [OPTIONAL] Only use this if you to require all calls to contain this key (Authorization: Bearer sk-1234)
```
:::info
For more provider-specific info, [go here](../providers/)
:::
#### Step 2: Start Proxy with config
@ -188,7 +202,7 @@ print(response)
</Tabs>
## Save Model-specific params (API Base, API Keys, Temperature, Max Tokens, Seed, Headers etc.)
## Save Model-specific params (API Base, API Keys, Temperature, Max Tokens, Seed, Organization, Headers etc.)
You can use the config to save model-specific information like api_base, api_key, temperature, max_tokens, etc.
[**All input params**](https://docs.litellm.ai/docs/completion/input#input-params-1)
@ -210,6 +224,12 @@ model_list:
api_key: sk-123
api_base: https://openai-gpt-4-test-v-2.openai.azure.com/
temperature: 0.2
- model_name: openai-gpt-3.5
litellm_params:
model: openai/gpt-3.5-turbo
api_key: sk-123
organization: org-ikDc4ex8NB
temperature: 0.2
- model_name: mistral-7b
litellm_params:
model: ollama/mistral
@ -226,6 +246,28 @@ model_list:
$ litellm --config /path/to/config.yaml
```
## Set Azure `base_model` for cost tracking
**Problem**: Azure returns `gpt-4` in the response when `azure/gpt-4-1106-preview` is used. This leads to inaccurate cost tracking.
**Solution** ✅ : Set `base_model` on your config so litellm uses the correct model for calculating Azure cost.
Example config with `base_model`
```yaml
model_list:
- model_name: azure-gpt-3.5
litellm_params:
model: azure/chatgpt-v-2
api_base: os.environ/AZURE_API_BASE
api_key: os.environ/AZURE_API_KEY
api_version: "2023-07-01-preview"
model_info:
base_model: azure/gpt-4-1106-preview
```
You can view your cost once you set up [Virtual keys](https://docs.litellm.ai/docs/proxy/virtual_keys) or [custom_callbacks](https://docs.litellm.ai/docs/proxy/logging)
## Load API Keys
### Load API Keys from Environment
@ -318,6 +360,26 @@ See supported Embedding Providers & Models [here](https://docs.litellm.ai/docs/e
#### Create Config.yaml
<Tabs>
<TabItem value="bedrock" label="Bedrock Completion/Chat">
```yaml
model_list:
- model_name: bedrock-cohere
litellm_params:
model: "bedrock/cohere.command-text-v14"
aws_region_name: "us-west-2"
- model_name: bedrock-cohere
litellm_params:
model: "bedrock/cohere.command-text-v14"
aws_region_name: "us-east-2"
- model_name: bedrock-cohere
litellm_params:
model: "bedrock/cohere.command-text-v14"
aws_region_name: "us-east-1"
```
</TabItem>
<TabItem value="sagemaker" label="Sagemaker, Bedrock Embeddings">
@ -430,20 +492,26 @@ model_list:
</Tabs>
#### Start Proxy
```shell
litellm --config config.yaml
```
#### Make Request
Sends Request to `deployed-codebert-base`
Sends Request to `bedrock-cohere`
```shell
curl --location 'http://0.0.0.0:8000/embeddings' \
curl --location 'http://0.0.0.0:8000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "deployed-codebert-base",
"input": ["write a litellm poem"]
}'
"model": "bedrock-cohere",
"messages": [
{
"role": "user",
"content": "gm"
}
]
}'
```
@ -483,3 +551,55 @@ general_settings:
max_parallel_requests: 100 # max parallel requests for a user = 100
```
## All settings
```python
{
"environment_variables": {},
"model_list": [
{
"model_name": "string",
"litellm_params": {},
"model_info": {
"id": "string",
"mode": "embedding",
"input_cost_per_token": 0,
"output_cost_per_token": 0,
"max_tokens": 2048,
"base_model": "gpt-4-1106-preview",
"additionalProp1": {}
}
}
],
"litellm_settings": {}, # ALL (https://github.com/BerriAI/litellm/blob/main/litellm/__init__.py)
"general_settings": {
"completion_model": "string",
"key_management_system": "google_kms", # either google_kms or azure_kms
"master_key": "string",
"database_url": "string",
"database_type": "dynamo_db",
"database_args": {
"billing_mode": "PROVISIONED_THROUGHPUT",
"read_capacity_units": 0,
"write_capacity_units": 0,
"ssl_verify": true,
"region_name": "string",
"user_table_name": "LiteLLM_UserTable",
"key_table_name": "LiteLLM_VerificationToken",
"config_table_name": "LiteLLM_Config",
"spend_table_name": "LiteLLM_SpendLogs"
},
"otel": true,
"custom_auth": "string",
"max_parallel_requests": 0,
"infer_model_from_keys": true,
"background_health_checks": true,
"health_check_interval": 300,
"alerting": [
"string"
],
"alerting_threshold": 0
}
}
```

View file

@ -10,6 +10,12 @@ There's 2 ways to track cost:
By default, the response cost is accessible in the logging object via `kwargs["response_cost"]` on success (sync + async). [**Learn More**](../observability/custom_callback.md)
:::info
LiteLLM already has pricing for any model in our [model cost map](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json).
:::
## Quick Start
Register custom pricing for sagemaker completion model.
@ -54,7 +60,7 @@ model_list:
- model_name: sagemaker-embedding-model
litellm_params:
model: sagemaker/berri-benchmarking-gpt-j-6b-fp16
input_cost_per_second: 0.000420
input_cost_per_second: 0.000420
```
**Step 2: Start proxy**
@ -67,25 +73,28 @@ litellm /path/to/config.yaml
<Image img={require('../../img/spend_logs_table.png')} />
## Cost Per Token
## Cost Per Token (e.g. Azure)
```python
# !pip install boto3
from litellm import completion, completion_cost
os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""
## set ENV variables
os.environ["AZURE_API_KEY"] = ""
os.environ["AZURE_API_BASE"] = ""
os.environ["AZURE_API_VERSION"] = ""
def test_completion_sagemaker():
def test_completion_azure_model():
try:
print("testing sagemaker")
print("testing azure custom pricing")
# azure call
response = completion(
model="sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
input_cost_per_token=0.005,
output_cost_per_token=1,
model = "azure/<your_deployment_name>",
messages = [{ "content": "Hello, how are you?","role": "user"}]
input_cost_per_token=0.005,
output_cost_per_token=1,
)
# Add any assertions here to check the response
print(response)
@ -94,15 +103,19 @@ def test_completion_sagemaker():
except Exception as e:
raise Exception(f"Error occurred: {e}")
test_completion_azure_model()
```
### Usage with OpenAI Proxy Server
```yaml
model_list:
- model_name: sagemaker-completion-model
- model_name: azure-model
litellm_params:
model: sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4
input_cost_per_token: 0.000420 # 👈 key change
output_cost_per_token: 0.000420 # 👈 key change
model: azure/<your_deployment_name>
api_key: os.environ/AZURE_API_KEY
api_base: os.environ/AZURE_API_BASE
api_version: os.environ/AZURE_API_VERSION
input_cost_per_token: 0.000421 # 👈 ONLY to track cost per token
output_cost_per_token: 0.000520 # 👈 ONLY to track cost per token
```

View file

@ -0,0 +1,34 @@
# Debugging
2 levels of debugging are supported.
- debug (prints info logs)
- detailed debug (prints debug logs)
## `debug`
**via cli**
```bash
$ litellm --debug
```
**via env**
```python
os.environ["LITELLM_LOG"] = "INFO"
```
## `detailed debug`
**via cli**
```bash
$ litellm --detailed_debug
```
**via env**
```python
os.environ["LITELLM_LOG"] = "DEBUG"
```
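For example, a minimal SDK sketch that combines the env-var approach above with a completion call (the key and model below are placeholders):
```python
import os

# enable detailed debug logs, equivalent to `--detailed_debug` on the cli
os.environ["LITELLM_LOG"] = "DEBUG"

import litellm

os.environ["OPENAI_API_KEY"] = "sk-..."  # placeholder
response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hi"}],
)
```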

View file

@ -116,6 +116,20 @@ Your OpenAI proxy server is now running on `http://0.0.0.0:4000`.
</TabItem>
</Tabs>
## Setting SSL Certification
Use this if you need to set SSL certificates for your on-prem litellm proxy.
Pass `ssl_keyfile_path` (path to the SSL keyfile) and `ssl_certfile_path` (path to the SSL certfile) when starting litellm proxy.
```shell
docker run ghcr.io/berriai/litellm:main-latest \
--ssl_keyfile_path ssl_test/keyfile.key \
--ssl_certfile_path ssl_test/certfile.crt
```
This starts the litellm proxy server with an SSL certificate.
## Platform-specific Guide

View file

@ -112,7 +112,8 @@ Example Response:
```json
{
"status": "healthy",
"db": "connected"
"db": "connected",
"litellm_version":"1.19.2",
}
```
@ -121,7 +122,8 @@ Example Response:
```json
{
"status": "healthy",
"db": "Not connected"
"db": "Not connected",
"litellm_version":"1.19.2",
}
```

View file

@ -435,6 +435,7 @@ print(response)
</TabItem>
</Tabs>
## Logging Proxy Input/Output - s3 Buckets
We will use the `--config` to set
@ -490,6 +491,34 @@ curl --location 'http://0.0.0.0:8000/chat/completions' \
Your logs should be available on the specified s3 Bucket
## Team-based Logging
Set success callbacks (e.g. langfuse), for a specific team-id.
```yaml
litellm_settings:
default_team_settings:
- team_id: my-secret-project
success_callback: ["langfuse"]
langfuse_public_key: os.environ/LANGFUSE_PUB_KEY_2
langfuse_secret: os.environ/LANGFUSE_PRIVATE_KEY_2
- team_id: ishaans-secret-project
success_callback: ["langfuse"]
langfuse_public_key: os.environ/LANGFUSE_PUB_KEY_3
langfuse_secret: os.environ/LANGFUSE_SECRET_3
```
Now, when you [generate keys](./virtual_keys.md) for this team-id
```bash
curl -X POST 'http://0.0.0.0:8000/key/generate' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-d '{"team_id": "ishaans-secret-project"}'
```
All requests made with these keys will log data to their team-specific logging.
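As a quick check, a request made with the team-scoped key (placeholder values below) should then show up in that team's Langfuse project:
```python
import openai

# placeholders - use the key returned by the /key/generate call above
client = openai.OpenAI(api_key="sk-<team-key>", base_url="http://0.0.0.0:8000")

response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hello from ishaans-secret-project"}],
)
print(response.choices[0].message.content)
```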
## Logging Proxy Input/Output - DynamoDB
We will use the `--config` to set

View file

@ -0,0 +1,30 @@
import Image from '@theme/IdealImage';
# PII Masking
LiteLLM supports [Microsoft Presidio](https://github.com/microsoft/presidio/) for PII masking.
## Step 1. Add env
```bash
export PRESIDIO_ANALYZER_API_BASE="http://localhost:5002"
export PRESIDIO_ANONYMIZER_API_BASE="http://localhost:5001"
```
## Step 2. Set it as a callback in config.yaml
```yaml
litellm_settings:
callbacks: ["presidio", ...] # e.g. ["presidio", custom_callbacks.proxy_handler_instance]
```
## Start proxy
```
litellm --config /path/to/config.yaml
```
This will mask the input going to the llm provider
<Image img={require('../../img/presidio_screenshot.png')} />
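For illustration, a request like the sketch below (proxy URL and key are placeholders) would have the email address masked by Presidio before the prompt reaches the provider:
```python
import openai

# placeholders - point this at your proxy running with the presidio callback
client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")

response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "My email is jane.doe@example.com - write me a haiku"}],
)
print(response)
```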

View file

@ -8,16 +8,8 @@ Quick start CLI, Config, Docker
LiteLLM Server manages:
* **Unified Interface**: Calling 100+ LLMs [Huggingface/Bedrock/TogetherAI/etc.](#other-supported-models) in the OpenAI `ChatCompletions` & `Completions` format
* **Cost tracking**: Authentication, Spend Tracking & Budgets [Virtual Keys](https://docs.litellm.ai/docs/proxy/virtual_keys)
* **Load Balancing**: between [Multiple Models](#multiple-models---quick-start) + [Deployments of the same model](#multiple-instances-of-1-model) - LiteLLM proxy can handle 1.5k+ requests/second during load tests.
* **Cost tracking**: Authentication & Spend Tracking [Virtual Keys](#managing-auth---virtual-keys)
[**See LiteLLM Proxy code**](https://github.com/BerriAI/litellm/tree/main/litellm/proxy)
#### 📖 Proxy Endpoints - [Swagger Docs](https://litellm-api.up.railway.app/)
View all the supported args for the Proxy CLI [here](https://docs.litellm.ai/docs/simple_proxy#proxy-cli-arguments)
```shell
$ pip install 'litellm[proxy]'
@ -40,115 +32,6 @@ litellm --test
This will now automatically route any requests for gpt-3.5-turbo to bigcode starcoder, hosted on huggingface inference endpoints.
### Using LiteLLM Proxy - Curl Request, OpenAI Package, Langchain
<Tabs>
<TabItem value="Curl" label="Curl Request">
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "gpt-3.5-turbo",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}
'
```
</TabItem>
<TabItem value="openai" label="OpenAI v1.0.0+">
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:8000"
)
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(model="gpt-3.5-turbo", messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
])
print(response)
```
</TabItem>
<TabItem value="langchain" label="Langchain">
```python
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
ChatPromptTemplate,
HumanMessagePromptTemplate,
SystemMessagePromptTemplate,
)
from langchain.schema import HumanMessage, SystemMessage
chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:8000", # set openai_api_base to the LiteLLM Proxy
model = "gpt-3.5-turbo",
temperature=0.1
)
messages = [
SystemMessage(
content="You are a helpful assistant that im using to make a test request to."
),
HumanMessage(
content="test from litellm. tell me why it's amazing in 1 sentence"
),
]
response = chat(messages)
print(response)
```
</TabItem>
<TabItem value="langchain-embedding" label="Langchain Embeddings">
```python
from langchain.embeddings import OpenAIEmbeddings
embeddings = OpenAIEmbeddings(model="sagemaker-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
text = "This is a test document."
query_result = embeddings.embed_query(text)
print(f"SAGEMAKER EMBEDDINGS")
print(query_result[:5])
embeddings = OpenAIEmbeddings(model="bedrock-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
text = "This is a test document."
query_result = embeddings.embed_query(text)
print(f"BEDROCK EMBEDDINGS")
print(query_result[:5])
embeddings = OpenAIEmbeddings(model="bedrock-titan-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
text = "This is a test document."
query_result = embeddings.embed_query(text)
print(f"TITAN EMBEDDINGS")
print(query_result[:5])
```
</TabItem>
</Tabs>
### Supported LLMs
All LiteLLM supported LLMs are supported on the Proxy. See all [supported LLMs](https://docs.litellm.ai/docs/providers)
<Tabs>
@ -330,9 +213,6 @@ $ litellm --model command-nightly
</Tabs>
## Quick Start - LiteLLM Proxy + Config.yaml
The config allows you to create a model list and set `api_base`, `max_tokens` (all litellm params). See more details about the config [here](https://docs.litellm.ai/docs/proxy/configs)
@ -363,6 +243,115 @@ model_list:
litellm --config your_config.yaml
```
## Using LiteLLM Proxy - Curl Request, OpenAI Package, Langchain
<Tabs>
<TabItem value="Curl" label="Curl Request">
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "gpt-3.5-turbo",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}
'
```
</TabItem>
<TabItem value="openai" label="OpenAI v1.0.0+">
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:8000"
)
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(model="gpt-3.5-turbo", messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
])
print(response)
```
</TabItem>
<TabItem value="langchain" label="Langchain">
```python
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
ChatPromptTemplate,
HumanMessagePromptTemplate,
SystemMessagePromptTemplate,
)
from langchain.schema import HumanMessage, SystemMessage
chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:8000", # set openai_api_base to the LiteLLM Proxy
model = "gpt-3.5-turbo",
temperature=0.1
)
messages = [
SystemMessage(
content="You are a helpful assistant that im using to make a test request to."
),
HumanMessage(
content="test from litellm. tell me why it's amazing in 1 sentence"
),
]
response = chat(messages)
print(response)
```
</TabItem>
<TabItem value="langchain-embedding" label="Langchain Embeddings">
```python
from langchain.embeddings import OpenAIEmbeddings
embeddings = OpenAIEmbeddings(model="sagemaker-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
text = "This is a test document."
query_result = embeddings.embed_query(text)
print(f"SAGEMAKER EMBEDDINGS")
print(query_result[:5])
embeddings = OpenAIEmbeddings(model="bedrock-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
text = "This is a test document."
query_result = embeddings.embed_query(text)
print(f"BEDROCK EMBEDDINGS")
print(query_result[:5])
embeddings = OpenAIEmbeddings(model="bedrock-titan-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
text = "This is a test document."
query_result = embeddings.embed_query(text)
print(f"TITAN EMBEDDINGS")
print(query_result[:5])
```
</TabItem>
</Tabs>
[**More Info**](./configs.md)

View file

@ -1,9 +1,11 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# [BETA] Admin UI
# 🔑 [BETA] Proxy UI
### **Create + delete keys through a UI**
- Track Spend Per API Key, User
- Allow your users to create their own keys through a UI
[Let users create their own keys](#setup-ssoauth-for-ui)
:::info
@ -11,61 +13,129 @@ This is in beta, so things may change. If you have feedback, [let us know](https
:::
<Image img={require('../../img/litellm_ui_create_key.png')} />
## Quick Start
Requirements:
- Requires proxy master key to be set
- Requires db connected
- Need an SMTP server connection to send emails (e.g. [Resend](https://resend.com/docs/send-with-smtp))
Follow [setup](./virtual_keys.md#setup)
[**See code**](https://github.com/BerriAI/litellm/blob/61cd800b9ffbb02c286481d2056b65c7fb5447bf/litellm/proxy/proxy_server.py#L1782)
### 1. Start the proxy
```bash
litellm --config /path/to/config.yaml
### Step 1. Save SMTP server credentials
```env
export SMTP_HOST="my-smtp-host"
export SMTP_USERNAME="my-smtp-password"
export SMTP_PASSWORD="my-smtp-password"
export SMTP_SENDER_EMAIL="krrish@berri.ai"
#INFO: Proxy running on http://0.0.0.0:8000
```
### Step 2. Enable user auth
### 2. Go to UI
```bash
http://0.0.0.0:8000/ui # <proxy_base_url>/ui
```
In your config.yaml,
## Get Admin UI Link on Swagger
Your Proxy Swagger is available on the root of the Proxy: e.g.: `http://localhost:4000/`
<Image img={require('../../img/ui_link.png')} />
## Change default username + password
Set the following in your .env on the Proxy
```shell
UI_USERNAME=ishaan-litellm
UI_PASSWORD=langchain
```
On accessing the LiteLLM UI, you will be prompted to enter your username, password
## Setup SSO/Auth for UI
### Step 1: Set upperbounds for keys
Control the upperbound that users can use for `max_budget`, `budget_duration` or any `key/generate` param per key.
```yaml
general_settings:
# other changes
allow_user_auth: true
litellm_settings:
upperbound_key_generate_params:
max_budget: 100 # upperbound of $100, for all /key/generate requests
duration: "30d" # upperbound of 30 days for all /key/generate requests
```
This will enable:
* Users to create keys via `/key/generate` (by default, only admin can create keys)
* The `/user/auth` endpoint to send user's emails with their login credentials (key + user id)
**Expected Behavior**
### Step 3. Connect to UI
- Send a `/key/generate` request with `max_budget=200`
- Key will be created with `max_budget=100` since 100 is the upper bound
You can use our hosted UI (https://dashboard.litellm.ai/) or [self-host your own](https://github.com/BerriAI/litellm/tree/main/ui).
### Step 2: Setup Oauth Client
<Tabs>
<TabItem value="google" label="Google SSO">
If you self-host, you need to save the UI url in your proxy environment as `LITELLM_HOSTED_UI`.
- Create a new Oauth 2.0 Client on https://console.cloud.google.com/
Connect your proxy to your UI, by entering:
1. The hosted proxy URL
2. Accepted email subdomains
3. [OPTIONAL] Allowed admin emails
**Required .env variables on your Proxy**
```shell
# for Google SSO Login
GOOGLE_CLIENT_ID=
GOOGLE_CLIENT_SECRET=
```
<Image img={require('../../img/admin_dashboard.png')} />
- Set Redirect URL on your Oauth 2.0 Client on https://console.cloud.google.com/
- Set a redirect url = `<your proxy base url>/sso/callback`
```shell
https://litellm-production-7002.up.railway.app/sso/callback
```
## What users will see?
</TabItem>
### Auth
<TabItem value="msft" label="Microsoft SSO">
<Image img={require('../../img/user_auth_screen.png')} />
- Create a new App Registration on https://portal.azure.com/
- Create a client Secret for your App Registration
### Create Keys
**Required .env variables on your Proxy**
```shell
MICROSOFT_CLIENT_ID="84583a4d-"
MICROSOFT_CLIENT_SECRET="nbk8Q~"
MICROSOFT_TENANT="5a39737
```
- Set Redirect URI on your App Registration on https://portal.azure.com/
- Set a redirect url = `<your proxy base url>/sso/callback`
```shell
http://localhost:4000/sso/callback
```
<Image img={require('../../img/user_create_key_screen.png')} />
</TabItem>
### Spend Per Key
</Tabs>
<Image img={require('../../img/spend_per_api_key.png')} />
### Step 3. Test flow
<Image img={require('../../img/litellm_ui_3.gif')} />
## Set Admin view w/ SSO
You just need to set Proxy Admin ID
### Step 1: Copy your ID from the UI
<Image img={require('../../img/litellm_ui_copy_id.png')} />
### Step 2: Set it in your .env as the PROXY_ADMIN_ID
```env
export PROXY_ADMIN_ID="116544810872468347480"
```
### Step 3: See all proxy keys
<Image img={require('../../img/litellm_ui_admin.png')} />
:::info
If you don't see all your keys, this could be due to a cached token. Just re-login and it should work.
:::

View file

@ -1,7 +1,7 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Use with Langchain, OpenAI SDK, Curl
# Use with Langchain, OpenAI SDK, LlamaIndex, Curl
:::info
@ -51,6 +51,42 @@ response = client.chat.completions.create(
print(response)
```
</TabItem>
<TabItem value="LlamaIndex" label="LlamaIndex">
```python
import os, dotenv
from llama_index.llms import AzureOpenAI
from llama_index.embeddings import AzureOpenAIEmbedding
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
llm = AzureOpenAI(
engine="azure-gpt-3.5", # model_name on litellm proxy
temperature=0.0,
azure_endpoint="http://0.0.0.0:4000", # litellm proxy endpoint
api_key="sk-1234", # litellm proxy API Key
api_version="2023-07-01-preview",
)
embed_model = AzureOpenAIEmbedding(
deployment_name="azure-embedding-model",
azure_endpoint="http://0.0.0.0:4000",
api_key="sk-1234",
api_version="2023-07-01-preview",
)
documents = SimpleDirectoryReader("llama_index_data").load_data()
service_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model)
index = VectorStoreIndex.from_documents(documents, service_context=service_context)
query_engine = index.as_query_engine()
response = query_engine.query("What did the author do growing up?")
print(response)
```
</TabItem>
<TabItem value="Curl" label="Curl Request">
Pass `metadata` as part of the request body

View file

@ -1,7 +1,7 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# 💰 Budgets, Rate Limits per user
# 💰 Budgets, Rate Limits
Requirements:
@ -10,22 +10,72 @@ Requirements:
## Set Budgets
You can set budgets at 3 levels:
- For the proxy
- For a user
- For a 'user' passed to `/chat/completions`, `/embeddings` etc
- For a key
Set the `max_budget` param (in USD $) in the `/user/new` or `/key/generate` request. By default `max_budget` is set to `null` and is not checked for keys
<Tabs>
<TabItem value="per-user" label="Per User">
<TabItem value="proxy" label="For Proxy">
LiteLLM exposes a `/user/new` endpoint to create budgets for users, that persist across multiple keys.
Apply a budget across all calls on the proxy
**Step 1. Modify config.yaml**
```yaml
general_settings:
master_key: sk-1234
litellm_settings:
# other litellm settings
max_budget: 0 # (float) sets max budget as $0 USD
budget_duration: 30d # (str) frequency of reset - You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
```
**Step 2. Start proxy**
```bash
litellm /path/to/config.yaml
```
**Step 3. Send test call**
```bash
curl --location 'http://0.0.0.0:8000/chat/completions' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{
"model": "gpt-3.5-turbo",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}'
```
</TabItem>
<TabItem value="per-user" label="For User">
Apply a budget across multiple keys.
LiteLLM exposes a `/user/new` endpoint to create budgets for this.
You can:
- Add budgets to users [**Jump**](#add-budgets-to-users)
- Add budget durations, to reset spend [**Jump**](#add-budget-duration-to-users)
By default the `max_budget` is set to `null` and is not checked for keys
### **Add budgets to users**
```shell
curl --location 'http://localhost:8000/user/new' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{"models": ["azure-models"], "max_budget": 0, "user_id": "krrish3@berri.ai"}'
```
The request is a normal `/key/generate` request body + a `max_budget` field.
[**See Swagger**](https://litellm-api.up.railway.app/#/user%20management/new_user_user_new_post)
@ -40,9 +90,93 @@ The request is a normal `/key/generate` request body + a `max_budget` field.
}
```
### **Add budget duration to users**
`budget_duration`: Budget is reset at the end of specified duration. If not set, budget is never reset. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
```
curl 'http://0.0.0.0:8000/user/new' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{
"team_id": "core-infra", # [OPTIONAL]
"max_budget": 10,
"budget_duration": 10s,
}'
```
### Create new keys for existing user
Now you can just call `/key/generate` with that user_id (i.e. krrish3@berri.ai) and:
- **Budget Check**: krrish3@berri.ai's budget (i.e. $10) will be checked for this key
- **Spend Tracking**: spend for this key will update krrish3@berri.ai's spend as well
```bash
curl --location 'http://0.0.0.0:8000/key/generate' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data '{"models": ["azure-models"], "user_id": "krrish3@berri.ai"}'
```
</TabItem>
<TabItem value="per-key" label="Per Key">
<TabItem value="per-user-chat" label="For 'user' passed to /chat/completions">
Use this to budget `user` passed to `/chat/completions`, **without needing to create a key for every user**
**Step 1. Modify config.yaml**
Define `litellm.max_user_budget`
```yaml
general_settings:
master_key: sk-1234
litellm_settings:
max_budget: 10 # global budget for proxy
max_user_budget: 0.0001 # budget for 'user' passed to /chat/completions
```
2. Make a /chat/completions call, pass 'user' - First call works
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer sk-zi5onDRdHGD24v0Zdn7VBA' \
--data ' {
"model": "azure-gpt-3.5",
"user": "ishaan3",
"messages": [
{
"role": "user",
"content": "what time is it"
}
]
}'
```
3. Make a /chat/completions call, pass 'user' - Call fails, since 'ishaan3' is over budget
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer sk-zi5onDRdHGD24v0Zdn7VBA' \
--data ' {
"model": "azure-gpt-3.5",
"user": "ishaan3",
"messages": [
{
"role": "user",
"content": "what time is it"
}
]
}'
```
Error
```shell
{"error":{"message":"Authentication Error, ExceededBudget: User ishaan3 has exceeded their budget. Current spend: 0.0008869999999999999; Max Budget: 0.0001","type":"auth_error","param":"None","code":401}}%
```
</TabItem>
<TabItem value="per-key" label="For Key">
Apply a budget on a key.
You can:
- Add budgets to keys [**Jump**](#add-budgets-to-keys)
@ -53,6 +187,8 @@ You can:
- After the key crosses its `max_budget`, requests fail
- If duration set, spend is reset at the end of the duration
By default the `max_budget` is set to `null` and is not checked for keys
### **Add budgets to keys**
```bash

View file

@ -1,4 +1,4 @@
# Virtual Keys
# Virtual Keys, Users
Track Spend, Set budgets and create virtual keys for the proxy
Grant others temporary access to your proxy, with keys that expire after a set duration.
@ -6,6 +6,7 @@ Grant other's temporary access to your proxy, with keys that expire after a set
:::info
- 🔑 [UI to Generate, Edit, Delete Keys (with SSO)](https://docs.litellm.ai/docs/proxy/ui)
- [Deploy LiteLLM Proxy with Key Management](https://docs.litellm.ai/docs/proxy/deploy#deploy-with-database)
- Dockerfile.database for LiteLLM Proxy + Key Management [here](https://github.com/BerriAI/litellm/blob/main/Dockerfile.database)
@ -16,8 +17,11 @@ Grant other's temporary access to your proxy, with keys that expire after a set
Requirements:
- Need to a postgres database (e.g. [Supabase](https://supabase.com/), [Neon](https://neon.tech/), etc)
- Need a postgres database (e.g. [Supabase](https://supabase.com/), [Neon](https://neon.tech/), etc)
- Set `DATABASE_URL=postgresql://<user>:<password>@<host>:<port>/<dbname>` in your env
- Set a `master key`, this is your Proxy Admin key - you can use this to create other keys
- **Set on config.yaml**: set your master key under `general_settings:master_key`, example below
- **Set env variable**: set `LITELLM_MASTER_KEY` (**Note:** either set this on the config.yaml or in your env, whichever is more convenient for you)
(the proxy Dockerfile checks if the `DATABASE_URL` is set and then initializes the DB connection)
@ -81,15 +85,17 @@ curl 'http://0.0.0.0:8000/key/generate' \
Request Params:
- `models`: *list or null (optional)* - Specify the models a token has access too. If null, then token has access to all models on server.
- `duration`: *Optional[str]* - Specify the length of time the token is valid for. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
- `key_alias`: *Optional[str]* - User defined key alias
- `team_id`: *Optional[str]* - The team id of the user
- `models`: *Optional[list]* - Model_name's a user is allowed to call. (if empty, key is allowed to call all models)
- `aliases`: *Optional[dict]* - Any alias mappings, on top of anything in the config.yaml model list. - https://docs.litellm.ai/docs/proxy/virtual_keys#managing-auth---upgradedowngrade-models
- `config`: *Optional[dict]* - any key-specific configs, overrides config in config.yaml
- `spend`: *Optional[int]* - Amount spent by key. Default is 0. Will be updated by proxy whenever key is used. https://docs.litellm.ai/docs/proxy/virtual_keys#managing-auth---tracking-spend
- `max_budget`: *Optional[float]* - Specify max budget for a given key.
- `max_parallel_requests`: *Optional[int]* - Rate limit a user based on the number of parallel requests. Raises 429 error, if user's parallel requests > x.
- `metadata`: *Optional[dict]* - Metadata for key, store information for key. Example metadata = {"team": "core-infra", "app": "app2", "email": "ishaan@berri.ai" }
- `duration`: *str or null (optional)* Specify the length of time the token is valid for. If null, default is set to 1 hour. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
- `metadata`: *dict or null (optional)* Pass metadata for the created token. If null defaults to {}
- `team_id`: *str or null (optional)* Specify team_id for the associated key
- `max_budget`: *float or null (optional)* Specify max budget (in Dollars $) for a given key. If no value is set, the key has no budget
### Response
@ -97,20 +103,11 @@ Request Params:
{
"key": "sk-kdEXbIqZRwEeEiHwdg7sFA", # Bearer token
"expires": "2023-11-19T01:38:25.838000+00:00" # datetime object
"key_name": "sk-...7sFA" # abbreviated key string, ONLY stored in db if `allow_user_auth: true` set - [see](./ui.md)
...
}
```
### Keys that don't expire
Just set duration to None.
```bash
curl --location 'http://0.0.0.0:8000/key/generate' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data '{"models": ["azure-models"], "aliases": {"mistral-7b": "gpt-3.5-turbo"}, "duration": null}'
```
### Upgrade/Downgrade Models
If a user is expected to use a given model (i.e. gpt3-5), and you want to:
@ -285,7 +282,152 @@ Request Params:
}
```
## Set Budgets - Per Key
## /user/new
### Request
All [key/generate params supported](#keygenerate) for creating a user
```shell
curl 'http://0.0.0.0:4000/user/new' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data-raw '{
"user_id": "ishaan1",
"user_email": "ishaan@litellm.ai",
"user_role": "admin",
"team_id": "cto-team",
"max_budget": 20,
"budget_duration": "1h"
}'
```
Request Params:
- user_id: str (optional - defaults to uuid) - The unique identifier for the user.
- user_email: str (optional - defaults to "") - The email address associated with the user.
- user_role: str (optional - defaults to "app_user") - The role assigned to the user. Can be "admin", "app_owner", "app_user"
**Possible `user_role` values**
```
"admin" - Maintaining the proxy and owning the overall budget
"app_owner" - employees maintaining the apps, each owner may own more than one app
"app_user" - users who know nothing about the proxy. These users get created when you pass `user` to /chat/completions
```
- team_id: str (optional - defaults to "") - The identifier for the team to which the user belongs.
- max_budget: float (optional - defaults to `null`) - The maximum budget allocated for the user. No budget checks done if `max_budget==null`
- budget_duration: str (optional - defaults to `null`) - The duration for which the budget is valid, e.g., "1h", "1d"
### Response
A key will be generated for the new user created
```shell
{
"models": [],
"spend": 0.0,
"max_budget": null,
"user_id": "ishaan1",
"team_id": null,
"max_parallel_requests": null,
"metadata": {},
"tpm_limit": null,
"rpm_limit": null,
"budget_duration": null,
"allowed_cache_controls": [],
"key_alias": null,
"duration": null,
"aliases": {},
"config": {},
"key": "sk-JflB33ucTqc2NYvNAgiBCA",
"key_name": null,
"expires": null
}
```
Request Params:
- keys: List[str] - List of keys to delete
### Response
```json
{
"deleted_keys": ["sk-kdEXbIqZRwEeEiHwdg7sFA"]
}
```
## Advanced
### Upperbound /key/generate params
Use this if you need to control the upper bound that users can set for `max_budget`, `budget_duration` or any `key/generate` param per key.
Set `litellm_settings:upperbound_key_generate_params`:
```yaml
litellm_settings:
upperbound_key_generate_params:
max_budget: 100 # upperbound of $100, for all /key/generate requests
duration: "30d" # upperbound of 30 days for all /key/generate requests
```
**Expected Behavior**
- Send a `/key/generate` request with `max_budget=200`
- Key will be created with `max_budget=100` since 100 is the upper bound
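A minimal client-side sketch of this behavior (the proxy URL `http://0.0.0.0:4000`, the master key `sk-1234`, and the use of `requests` are assumptions for illustration):
```python
import requests

# Ask for a $200 budget - above the $100 upperbound configured above
resp = requests.post(
    "http://0.0.0.0:4000/key/generate",
    headers={"Authorization": "Bearer sk-1234", "Content-Type": "application/json"},
    json={"max_budget": 200},
)
key_info = resp.json()
# The proxy caps the key at the upperbound, so the stored budget should be 100
print(key_info.get("key"), key_info.get("max_budget"))
```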
### Default /key/generate params
Use this if you need to control the default `max_budget` or any other `/key/generate` param per key.
When a `/key/generate` request does not specify `max_budget`, it will use the `max_budget` specified in `default_key_generate_params`.
Set `litellm_settings:default_key_generate_params`:
```yaml
litellm_settings:
  default_key_generate_params:
    max_budget: 1.5000
    models: ["azure-gpt-3.5"]
    duration: # blank means `null`
    metadata: {"setting":"default"}
    team_id: "core-infra"
```
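A rough sketch, under the same local-proxy assumptions as the snippet above, of a request that omits these fields and so should pick up the configured defaults:
```python
import requests

# Empty body - max_budget, models, metadata and team_id are not specified,
# so the proxy fills them in from default_key_generate_params
resp = requests.post(
    "http://0.0.0.0:4000/key/generate",
    headers={"Authorization": "Bearer sk-1234", "Content-Type": "application/json"},
    json={},
)
print(resp.json())  # expect max_budget=1.5, models=["azure-gpt-3.5"], team_id="core-infra"
```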
### Restrict models by `team_id`
`litellm-dev` can only access `azure-gpt-3.5`
```yaml
litellm_settings:
  default_team_settings:
    - team_id: litellm-dev
      models: ["azure-gpt-3.5"]
```
#### Create key with team_id="litellm-dev"
```shell
curl --location 'http://localhost:4000/key/generate' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data-raw '{"team_id": "litellm-dev"}'
```
#### Use Key to call invalid model - Fails
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer sk-qo992IjKOC2CHKZGRoJIGA' \
--data '{
"model": "BEDROCK_GROUP",
"messages": [
{
"role": "user",
"content": "hi"
}
]
}'
```
```shell
{"error":{"message":"Invalid model for team litellm-dev: BEDROCK_GROUP. Valid models for team are: ['azure-gpt-3.5']\n\n\nTraceback (most recent call last):\n File \"/Users/ishaanjaffer/Github/litellm/litellm/proxy/proxy_server.py\", line 2298, in chat_completion\n _is_valid_team_configs(\n File \"/Users/ishaanjaffer/Github/litellm/litellm/proxy/utils.py\", line 1296, in _is_valid_team_configs\n raise Exception(\nException: Invalid model for team litellm-dev: BEDROCK_GROUP. Valid models for team are: ['azure-gpt-3.5']\n\n","type":"None","param":"None","code":500}}%
```
### Set Budgets - Per Key
Set the `max_budget` param (in USD) in the `/key/generate` request. By default `max_budget` is `null`, and no budget checks are done for the key.
@ -331,7 +473,7 @@ Expected Response from `/chat/completions` when key has crossed budget
```
## Set Budgets - Per User
### Set Budgets - Per User
LiteLLM exposes a `/user/new` endpoint to create budgets for users that persist across multiple keys.
@ -356,7 +498,7 @@ The request is a normal `/key/generate` request body + a `max_budget` field.
}
```
## Tracking Spend
### Tracking Spend
You can get spend for a key by using the `/key/info` endpoint.
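For example, a quick sketch of checking spend from Python (the proxy URL, master key, and example key string are assumptions; adjust to your deployment):
```python
import requests

generated_key = "sk-kdEXbIqZRwEeEiHwdg7sFA"  # a key returned by /key/generate
resp = requests.get(
    "http://0.0.0.0:4000/key/info",
    params={"key": generated_key},
    headers={"Authorization": "Bearer sk-1234"},
)
# the response is expected to include the key's accumulated spend (in USD)
print(resp.json())
```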
@ -391,13 +533,13 @@ This is automatically updated (in USD) when calls are made to /completions, /cha
```
## Custom Auth
### Custom Auth
You can now override the default api key auth.
Here's how:
### 1. Create a custom auth file.
#### 1. Create a custom auth file.
Make sure the response type follows the `UserAPIKeyAuth` pydantic object. This is used for logging usage specific to that user key.
@ -414,7 +556,7 @@ async def user_api_key_auth(request: Request, api_key: str) -> UserAPIKeyAuth:
raise Exception
```
### 2. Pass the filepath (relative to the config.yaml)
#### 2. Pass the filepath (relative to the config.yaml)
Pass the filepath to the config.yaml
@ -435,16 +577,16 @@ general_settings:
[**Implementation Code**](https://github.com/BerriAI/litellm/blob/caf2a6b279ddbe89ebd1d8f4499f65715d684851/litellm/proxy/utils.py#L122)
### 3. Start the proxy
#### 3. Start the proxy
```shell
$ litellm --config /path/to/config.yaml
```
## Custom /key/generate
### Custom /key/generate
If you need to add custom logic before generating a Proxy API Key (e.g. validating `team_id`)
### 1. Write a custom `custom_generate_key_fn`
#### 1. Write a custom `custom_generate_key_fn`
The input to the custom_generate_key_fn function is a single parameter: `data` [(Type: GenerateKeyRequest)](https://github.com/BerriAI/litellm/blob/main/litellm/proxy/_types.py#L125)
@ -510,7 +652,7 @@ async def custom_generate_key_fn(data: GenerateKeyRequest)-> dict:
```
### 2. Pass the filepath (relative to the config.yaml)
#### 2. Pass the filepath (relative to the config.yaml)
Pass the filepath to the config.yaml
@ -532,18 +674,18 @@ general_settings:
## [BETA] Dynamo DB
### [BETA] Dynamo DB
Only live in `v1.16.21.dev1`.
### Step 1. Save keys to env
#### Step 1. Save keys to env
```shell
AWS_ACCESS_KEY_ID = "your-aws-access-key-id"
AWS_SECRET_ACCESS_KEY = "your-aws-secret-access-key"
```
### Step 2. Add details to config
#### Step 2. Add details to config
```yaml
general_settings:
@ -560,7 +702,7 @@ general_settings:
}
```
### Step 3. Generate Key
#### Step 3. Generate Key
```bash
curl --location 'http://0.0.0.0:8000/key/generate' \

View file

@ -605,6 +605,49 @@ response = router.completion(model="gpt-3.5-turbo", messages=messages)
print(f"response: {response}")
```
## Custom Callbacks - Track API Key, API Endpoint, Model Used
If you need to track the `api_key`, API endpoint, model, or `custom_llm_provider` used for each completion call, you can set up a [custom callback](https://docs.litellm.ai/docs/observability/custom_callback)
### Usage
```python
import litellm
from litellm import Router
from litellm.integrations.custom_logger import CustomLogger

class MyCustomHandler(CustomLogger):
    def log_success_event(self, kwargs, response_obj, start_time, end_time):
        print("On Success")
        print("kwargs=", kwargs)
        litellm_params = kwargs.get("litellm_params")
        api_key = litellm_params.get("api_key")
        api_base = litellm_params.get("api_base")
        custom_llm_provider = litellm_params.get("custom_llm_provider")
        response_cost = kwargs.get("response_cost")
        # print the values
        print("api_key=", api_key)
        print("api_base=", api_base)
        print("custom_llm_provider=", custom_llm_provider)
        print("response_cost=", response_cost)

    def log_failure_event(self, kwargs, response_obj, start_time, end_time):
        print("On Failure")
        print("kwargs=", kwargs)

customHandler = MyCustomHandler()
litellm.callbacks = [customHandler]

# Init Router (model_list is the deployment list defined earlier on this page)
router = Router(model_list=model_list, routing_strategy="simple-shuffle")

# router completion call
response = router.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hi who are you"}]
)
```
## Deploy Router

View file

@ -99,6 +99,12 @@ const config = {
position: 'left',
label: 'Docs',
},
{
sidebarId: 'tutorialSidebar',
position: 'left',
label: 'Enterprise',
to: "docs/enterprise"
},
{
href: 'https://github.com/BerriAI/litellm',
label: 'GitHub',

Binary files not shown: 12 new image assets added (13 KiB to 9.9 MiB).

View file

@ -98,7 +98,7 @@ const sidebars = {
link: {
type: 'generated-index',
title: '💥 OpenAI Proxy Server',
description: `Proxy Server to call 100+ LLMs in a unified interface, load balance deployments, track costs per user`,
description: `Proxy Server to call 100+ LLMs in a unified interface & track spend, set budgets per virtual key/user`,
slug: '/simple_proxy',
},
items: [
@ -115,6 +115,8 @@ const sidebars = {
"proxy/ui",
"proxy/model_management",
"proxy/health",
"proxy/debugging",
"proxy/pii_masking",
{
"type": "category",
"label": "🔥 Load Balancing",
@ -123,6 +125,7 @@ const sidebars = {
"proxy/reliability",
]
},
"proxy/caching",
{
"type": "category",
"label": "Logging, Alerting, Caching",
@ -130,7 +133,6 @@ const sidebars = {
"proxy/logging",
"proxy/alerting",
"proxy/streaming_logging",
"proxy/caching",
]
},
{

View file

@ -8,6 +8,11 @@ https://github.com/BerriAI/litellm
## **Call 100+ LLMs using the same Input/Output Format**
- Translate inputs to provider's `completion`, `embedding`, and `image_generation` endpoints
- [Consistent output](https://docs.litellm.ai/docs/completion/output), text responses will always be available at `['choices'][0]['message']['content']`
- Retry/fallback logic across multiple deployments (e.g. Azure/OpenAI) - [Router](https://docs.litellm.ai/docs/routing)
- Track spend & set budgets per project with the [OpenAI Proxy Server](https://docs.litellm.ai/docs/simple_proxy)
## Basic usage
<a target="_blank" href="https://colab.research.google.com/github/BerriAI/litellm/blob/main/cookbook/liteLLM_Getting_Started.ipynb">
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
@ -306,30 +311,7 @@ litellm.success_callback = ["langfuse", "llmonitor"] # log input/output to langf
response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}])
```
## Calculate Costs, Usage, Latency
Pass the completion response to `litellm.completion_cost(completion_response=response)` and get the cost
```python
from litellm import completion, completion_cost
import os
os.environ["OPENAI_API_KEY"] = "your-api-key"
response = completion(
model="gpt-3.5-turbo",
messages=[{ "content": "Hello, how are you?","role": "user"}]
)
cost = completion_cost(completion_response=response)
print("Cost for completion call with gpt-3.5-turbo: ", f"${float(cost):.10f}")
```
**Output**
```shell
Cost for completion call with gpt-3.5-turbo: $0.0000775000
```
### Track Costs, Usage, Latency for streaming
## Track Costs, Usage, Latency for streaming
Use a callback function for this - more info on custom callbacks: https://docs.litellm.ai/docs/observability/custom_callback
```python
@ -342,18 +324,8 @@ def track_cost_callback(
start_time, end_time # start/end time
):
try:
# check if it has collected an entire stream response
if "complete_streaming_response" in kwargs:
# for tracking streaming cost we pass the "messages" and the output_text to litellm.completion_cost
completion_response=kwargs["complete_streaming_response"]
input_text = kwargs["messages"]
output_text = completion_response["choices"][0]["message"]["content"]
response_cost = litellm.completion_cost(
model = kwargs["model"],
messages = input_text,
completion=output_text
)
print("streaming response_cost", response_cost)
response_cost = kwargs.get("response_cost", 0)
print("streaming response_cost", response_cost)
except:
pass
# set callback
@ -372,13 +344,12 @@ response = completion(
)
```
Need a dedicated key? Email us @ krrish@berri.ai
## OpenAI Proxy
Track spend across multiple projects/people
![ui_3](https://github.com/BerriAI/litellm/assets/29436595/47c97d5e-b9be-4839-b28c-43d7f4f10033)
The proxy provides:
1. [Hooks for auth](https://docs.litellm.ai/docs/proxy/virtual_keys#custom-auth)
2. [Hooks for logging](https://docs.litellm.ai/docs/proxy/logging#step-1---create-your-custom-litellm-callback-class)
@ -418,4 +389,4 @@ print(response)
## More details
* [exception mapping](./exception_mapping.md)
* [retries + model fallbacks for completion()](./completion/reliable_completions.md)
* [tutorial for model fallbacks with completion()](./tutorials/fallbacks.md)
* [proxy virtual keys & spend management](./proxy/virtual_keys.md)

View file

@ -1,11 +1,13 @@
### INIT VARIABLES ###
import threading, requests
import threading, requests, os
from typing import Callable, List, Optional, Dict, Union, Any
from litellm.caching import Cache
from litellm._logging import set_verbose, _turn_on_debug
from litellm._logging import set_verbose, _turn_on_debug, verbose_logger
from litellm.proxy._types import KeyManagementSystem
import httpx
import dotenv
dotenv.load_dotenv()
#############################################
if set_verbose == True:
_turn_on_debug()
@ -62,6 +64,9 @@ cache: Optional[
model_alias_map: Dict[str, str] = {}
model_group_alias_map: Dict[str, str] = {}
max_budget: float = 0.0 # set the max budget across all providers
budget_duration: Optional[
str
] = None # proxy only - resets budget after fixed duration. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
_openai_completion_params = [
"functions",
"function_call",
@ -140,6 +145,10 @@ model_cost_map_url: str = "https://raw.githubusercontent.com/BerriAI/litellm/mai
suppress_debug_info = False
dynamodb_table_name: Optional[str] = None
s3_callback_params: Optional[Dict] = None
default_key_generate_params: Optional[Dict] = None
upperbound_key_generate_params: Optional[Dict] = None
default_team_settings: Optional[List] = None
max_user_budget: Optional[float] = None
#### RELIABILITY ####
request_timeout: Optional[float] = 6000
num_retries: Optional[int] = None # per model endpoint
@ -159,6 +168,19 @@ _key_management_system: Optional[KeyManagementSystem] = None
def get_model_cost_map(url: str):
if (
os.getenv("LITELLM_LOCAL_MODEL_COST_MAP", False) == True
or os.getenv("LITELLM_LOCAL_MODEL_COST_MAP", False) == "True"
):
import importlib.resources
import json
with importlib.resources.open_text(
"litellm", "model_prices_and_context_window_backup.json"
) as f:
content = json.load(f)
return content
try:
with requests.get(
url, timeout=5
@ -214,6 +236,7 @@ vertex_chat_models: List = []
vertex_code_chat_models: List = []
vertex_text_models: List = []
vertex_code_text_models: List = []
vertex_embedding_models: List = []
ai21_models: List = []
nlp_cloud_models: List = []
aleph_alpha_models: List = []
@ -243,6 +266,8 @@ for key, value in model_cost.items():
vertex_chat_models.append(key)
elif value.get("litellm_provider") == "vertex_ai-code-chat-models":
vertex_code_chat_models.append(key)
elif value.get("litellm_provider") == "vertex_ai-embedding-models":
vertex_embedding_models.append(key)
elif value.get("litellm_provider") == "ai21":
ai21_models.append(key)
elif value.get("litellm_provider") == "nlp_cloud":
@ -262,6 +287,7 @@ openai_compatible_endpoints: List = [
"api.endpoints.anyscale.com/v1",
"api.deepinfra.com/v1/openai",
"api.mistral.ai/v1",
"api.together.xyz/v1",
]
# this is maintained for Exception Mapping
@ -271,6 +297,7 @@ openai_compatible_providers: List = [
"deepinfra",
"perplexity",
"xinference",
"together_ai",
]
@ -479,7 +506,10 @@ bedrock_embedding_models: List = [
]
all_embedding_models = (
open_ai_embedding_models + cohere_embedding_models + bedrock_embedding_models
open_ai_embedding_models
+ cohere_embedding_models
+ bedrock_embedding_models
+ vertex_embedding_models
)
####### IMAGE GENERATION MODELS ###################
@ -534,6 +564,7 @@ from .llms.bedrock import (
AmazonAnthropicConfig,
AmazonCohereConfig,
AmazonLlamaConfig,
AmazonStabilityConfig,
)
from .llms.openai import OpenAIConfig, OpenAITextCompletionConfig
from .llms.azure import AzureOpenAIConfig, AzureOpenAIError

View file

@ -7,8 +7,11 @@ handler = logging.StreamHandler()
handler.setLevel(logging.DEBUG)
# Create a formatter and set it for the handler
formatter = logging.Formatter(
"\033[92m%(asctime)s - %(name)s:%(levelname)s\033[0m: %(message)s",
datefmt="%H:%M:%S",
)
formatter = logging.Formatter("\033[92m%(name)s - %(levelname)s\033[0m: %(message)s")
handler.setFormatter(formatter)

View file

@ -11,6 +11,7 @@
import os
import inspect
import redis, litellm
import redis.asyncio as async_redis
from typing import List, Optional
@ -67,7 +68,10 @@ def get_redis_url_from_environment():
)
def get_redis_client(**env_overrides):
def _get_redis_client_logic(**env_overrides):
"""
Common functionality across sync + async redis client implementations
"""
### check if "os.environ/<key-name>" passed in
for k, v in env_overrides.items():
if isinstance(v, str) and v.startswith("os.environ/"):
@ -85,9 +89,33 @@ def get_redis_client(**env_overrides):
redis_kwargs.pop("port", None)
redis_kwargs.pop("db", None)
redis_kwargs.pop("password", None)
return redis.Redis.from_url(**redis_kwargs)
elif "host" not in redis_kwargs or redis_kwargs["host"] is None:
raise ValueError("Either 'host' or 'url' must be specified for redis.")
litellm.print_verbose(f"redis_kwargs: {redis_kwargs}")
return redis_kwargs
def get_redis_client(**env_overrides):
redis_kwargs = _get_redis_client_logic(**env_overrides)
if "url" in redis_kwargs and redis_kwargs["url"] is not None:
return redis.Redis.from_url(**redis_kwargs)
return redis.Redis(**redis_kwargs)
def get_redis_async_client(**env_overrides):
redis_kwargs = _get_redis_client_logic(**env_overrides)
if "url" in redis_kwargs and redis_kwargs["url"] is not None:
return async_redis.Redis.from_url(**redis_kwargs)
return async_redis.Redis(
socket_timeout=5,
**redis_kwargs,
)
def get_redis_connection_pool(**env_overrides):
redis_kwargs = _get_redis_client_logic(**env_overrides)
if "url" in redis_kwargs and redis_kwargs["url"] is not None:
return async_redis.BlockingConnectionPool.from_url(
timeout=5, url=redis_kwargs["url"]
)
return async_redis.BlockingConnectionPool(timeout=5, **redis_kwargs)
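# Minimal usage sketch (illustrative only; host/port/password are placeholder
# values - this mirrors how RedisCache in caching.py wires these helpers together):
#
#   redis_kwargs = {"host": "localhost", "port": 6379, "password": "hunter2"}
#   sync_client = get_redis_client(**redis_kwargs)              # redis.Redis
#   pool = get_redis_connection_pool(**redis_kwargs)            # async blocking pool
#   async_client = get_redis_async_client(connection_pool=pool, **redis_kwargs)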

View file

@ -1,3 +1,12 @@
# +-----------------------------------------------+
# | |
# | NOT PROXY BUDGET MANAGER |
# | proxy budget manager is in proxy_server.py |
# | |
# +-----------------------------------------------+
#
# Thank you users! We ❤️ you! - Krrish & Ishaan
import os, json, time
import litellm
from litellm.utils import ModelResponse
@ -16,7 +25,7 @@ class BudgetManager:
self.client_type = client_type
self.project_name = project_name
self.api_base = api_base or "https://api.litellm.ai"
self.headers = headers or {'Content-Type': 'application/json'}
self.headers = headers or {"Content-Type": "application/json"}
## load the data or init the initial dictionaries
self.load_data()

View file

@ -8,7 +8,7 @@
# Thank you users! We ❤️ you! - Krrish & Ishaan
import litellm
import time, logging
import time, logging, asyncio
import json, traceback, ast, hashlib
from typing import Optional, Literal, List, Union, Any
from openai._models import BaseModel as OpenAIObject
@ -28,9 +28,18 @@ class BaseCache:
def set_cache(self, key, value, **kwargs):
raise NotImplementedError
async def async_set_cache(self, key, value, **kwargs):
raise NotImplementedError
def get_cache(self, key, **kwargs):
raise NotImplementedError
async def async_get_cache(self, key, **kwargs):
raise NotImplementedError
async def disconnect(self):
raise NotImplementedError
class InMemoryCache(BaseCache):
def __init__(self):
@ -43,6 +52,16 @@ class InMemoryCache(BaseCache):
if "ttl" in kwargs:
self.ttl_dict[key] = time.time() + kwargs["ttl"]
async def async_set_cache(self, key, value, **kwargs):
self.set_cache(key=key, value=value, **kwargs)
async def async_set_cache_pipeline(self, cache_list, ttl=None):
for cache_key, cache_value in cache_list:
if ttl is not None:
self.set_cache(key=cache_key, value=cache_value, ttl=ttl)
else:
self.set_cache(key=cache_key, value=cache_value)
def get_cache(self, key, **kwargs):
if key in self.cache_dict:
if key in self.ttl_dict:
@ -57,17 +76,26 @@ class InMemoryCache(BaseCache):
return cached_response
return None
async def async_get_cache(self, key, **kwargs):
return self.get_cache(key=key, **kwargs)
def flush_cache(self):
self.cache_dict.clear()
self.ttl_dict.clear()
async def disconnect(self):
pass
def delete_cache(self, key):
self.cache_dict.pop(key, None)
self.ttl_dict.pop(key, None)
class RedisCache(BaseCache):
def __init__(self, host=None, port=None, password=None, **kwargs):
import redis
# if users don't provide one, use the default litellm cache
# if users don't provide one, use the default litellm cache
from ._redis import get_redis_client
def __init__(self, host=None, port=None, password=None, **kwargs):
from ._redis import get_redis_client, get_redis_connection_pool
redis_kwargs = {}
if host is not None:
@ -78,18 +106,84 @@ class RedisCache(BaseCache):
redis_kwargs["password"] = password
redis_kwargs.update(kwargs)
self.redis_client = get_redis_client(**redis_kwargs)
self.redis_kwargs = redis_kwargs
self.async_redis_conn_pool = get_redis_connection_pool()
def init_async_client(self):
from ._redis import get_redis_async_client
return get_redis_async_client(
connection_pool=self.async_redis_conn_pool, **self.redis_kwargs
)
def set_cache(self, key, value, **kwargs):
ttl = kwargs.get("ttl", None)
print_verbose(f"Set Redis Cache: key: {key}\nValue {value}")
print_verbose(f"Set Redis Cache: key: {key}\nValue {value}\nttl={ttl}")
try:
self.redis_client.set(name=key, value=str(value), ex=ttl)
except Exception as e:
# NON blocking - notify users Redis is throwing an exception
logging.debug("LiteLLM Caching: set() - Got exception from REDIS : ", e)
async def async_set_cache(self, key, value, **kwargs):
_redis_client = self.init_async_client()
async with _redis_client as redis_client:
ttl = kwargs.get("ttl", None)
print_verbose(
f"Set ASYNC Redis Cache: key: {key}\nValue {value}\nttl={ttl}"
)
try:
await redis_client.set(name=key, value=json.dumps(value), ex=ttl)
except Exception as e:
# NON blocking - notify users Redis is throwing an exception
logging.debug("LiteLLM Caching: set() - Got exception from REDIS : ", e)
async def async_set_cache_pipeline(self, cache_list, ttl=None):
"""
Use Redis Pipelines for bulk write operations
"""
_redis_client = self.init_async_client()
try:
async with _redis_client as redis_client:
async with redis_client.pipeline(transaction=True) as pipe:
# Iterate through each key-value pair in the cache_list and set them in the pipeline.
for cache_key, cache_value in cache_list:
print_verbose(
f"Set ASYNC Redis Cache PIPELINE: key: {cache_key}\nValue {cache_value}\nttl={ttl}"
)
# Set the value with a TTL if it's provided.
if ttl is not None:
pipe.setex(cache_key, ttl, json.dumps(cache_value))
else:
pipe.set(cache_key, json.dumps(cache_value))
# Execute the pipeline and return the results.
results = await pipe.execute()
print_verbose(f"pipeline results: {results}")
# Optionally, you could process 'results' to make sure that all set operations were successful.
return results
except Exception as e:
print_verbose(f"Error occurred in pipeline write - {str(e)}")
# NON blocking - notify users Redis is throwing an exception
logging.debug("LiteLLM Caching: set() - Got exception from REDIS : ", e)
def _get_cache_logic(self, cached_response: Any):
"""
Common 'get_cache_logic' across sync + async redis client implementations
"""
if cached_response is None:
return cached_response
# cached_response is bytes, e.g. b'{...}' - decode and convert it to a dict / ModelResponse
cached_response = cached_response.decode("utf-8") # Convert bytes to string
try:
cached_response = json.loads(
cached_response
) # Convert string to dictionary
except:
cached_response = ast.literal_eval(cached_response)
return cached_response
def get_cache(self, key, **kwargs):
try:
print_verbose(f"Get Redis Cache: key: {key}")
@ -97,26 +191,361 @@ class RedisCache(BaseCache):
print_verbose(
f"Got Redis Cache: key: {key}, cached_response {cached_response}"
)
if cached_response != None:
# cached_response is in `b{} convert it to ModelResponse
cached_response = cached_response.decode(
"utf-8"
) # Convert bytes to string
try:
cached_response = json.loads(
cached_response
) # Convert string to dictionary
except:
cached_response = ast.literal_eval(cached_response)
return cached_response
return self._get_cache_logic(cached_response=cached_response)
except Exception as e:
# NON blocking - notify users Redis is throwing an exception
traceback.print_exc()
logging.debug("LiteLLM Caching: get() - Got exception from REDIS: ", e)
async def async_get_cache(self, key, **kwargs):
_redis_client = self.init_async_client()
async with _redis_client as redis_client:
try:
print_verbose(f"Get Redis Cache: key: {key}")
cached_response = await redis_client.get(key)
print_verbose(
f"Got Async Redis Cache: key: {key}, cached_response {cached_response}"
)
response = self._get_cache_logic(cached_response=cached_response)
return response
except Exception as e:
# NON blocking - notify users Redis is throwing an exception
traceback.print_exc()
logging.debug("LiteLLM Caching: get() - Got exception from REDIS: ", e)
def flush_cache(self):
self.redis_client.flushall()
async def disconnect(self):
pass
def delete_cache(self, key):
self.redis_client.delete(key)
class RedisSemanticCache(BaseCache):
def __init__(
self,
host=None,
port=None,
password=None,
redis_url=None,
similarity_threshold=None,
use_async=False,
embedding_model="text-embedding-ada-002",
**kwargs,
):
from redisvl.index import SearchIndex
from redisvl.query import VectorQuery
print_verbose(
"redis semantic-cache initializing INDEX - litellm_semantic_cache_index"
)
if similarity_threshold is None:
raise Exception("similarity_threshold must be provided, passed None")
self.similarity_threshold = similarity_threshold
self.embedding_model = embedding_model
schema = {
"index": {
"name": "litellm_semantic_cache_index",
"prefix": "litellm",
"storage_type": "hash",
},
"fields": {
"text": [{"name": "response"}],
"text": [{"name": "prompt"}],
"vector": [
{
"name": "litellm_embedding",
"dims": 1536,
"distance_metric": "cosine",
"algorithm": "flat",
"datatype": "float32",
}
],
},
}
if redis_url is None:
# if no url passed, check if host, port and password are passed, if not raise an Exception
if host is None or port is None or password is None:
# try checking env for host, port and password
import os
host = os.getenv("REDIS_HOST")
port = os.getenv("REDIS_PORT")
password = os.getenv("REDIS_PASSWORD")
if host is None or port is None or password is None:
raise Exception("Redis host, port, and password must be provided")
redis_url = "redis://:" + password + "@" + host + ":" + port
print_verbose(f"redis semantic-cache redis_url: {redis_url}")
if use_async == False:
self.index = SearchIndex.from_dict(schema)
self.index.connect(redis_url=redis_url)
try:
self.index.create(overwrite=False) # don't overwrite existing index
except Exception as e:
print_verbose(f"Got exception creating semantic cache index: {str(e)}")
elif use_async == True:
schema["index"]["name"] = "litellm_semantic_cache_index_async"
self.index = SearchIndex.from_dict(schema)
self.index.connect(redis_url=redis_url, use_async=True)
#
def _get_cache_logic(self, cached_response: Any):
"""
Common 'get_cache_logic' across sync + async redis client implementations
"""
if cached_response is None:
return cached_response
# check if cached_response is bytes
if isinstance(cached_response, bytes):
cached_response = cached_response.decode("utf-8")
try:
cached_response = json.loads(
cached_response
) # Convert string to dictionary
except:
cached_response = ast.literal_eval(cached_response)
return cached_response
def set_cache(self, key, value, **kwargs):
import numpy as np
print_verbose(f"redis semantic-cache set_cache, kwargs: {kwargs}")
# get the prompt
messages = kwargs["messages"]
prompt = ""
for message in messages:
prompt += message["content"]
# create an embedding for prompt
embedding_response = litellm.embedding(
model=self.embedding_model,
input=prompt,
cache={"no-store": True, "no-cache": True},
)
# get the embedding
embedding = embedding_response["data"][0]["embedding"]
# make the embedding a numpy array, convert to bytes
embedding_bytes = np.array(embedding, dtype=np.float32).tobytes()
value = str(value)
assert isinstance(value, str)
new_data = [
{"response": value, "prompt": prompt, "litellm_embedding": embedding_bytes}
]
# Add more data
keys = self.index.load(new_data)
return
def get_cache(self, key, **kwargs):
print_verbose(f"sync redis semantic-cache get_cache, kwargs: {kwargs}")
from redisvl.query import VectorQuery
import numpy as np
# query
# get the messages
messages = kwargs["messages"]
prompt = ""
for message in messages:
prompt += message["content"]
# convert to embedding
embedding_response = litellm.embedding(
model=self.embedding_model,
input=prompt,
cache={"no-store": True, "no-cache": True},
)
# get the embedding
embedding = embedding_response["data"][0]["embedding"]
query = VectorQuery(
vector=embedding,
vector_field_name="litellm_embedding",
return_fields=["response", "prompt", "vector_distance"],
num_results=1,
)
results = self.index.query(query)
if results == None:
return None
if isinstance(results, list):
if len(results) == 0:
return None
vector_distance = results[0]["vector_distance"]
vector_distance = float(vector_distance)
similarity = 1 - vector_distance
cached_prompt = results[0]["prompt"]
# check similarity, if more than self.similarity_threshold, return results
print_verbose(
f"semantic cache: similarity threshold: {self.similarity_threshold}, similarity: {similarity}, prompt: {prompt}, closest_cached_prompt: {cached_prompt}"
)
if similarity > self.similarity_threshold:
# cache hit !
cached_value = results[0]["response"]
print_verbose(
f"got a cache hit, similarity: {similarity}, Current prompt: {prompt}, cached_prompt: {cached_prompt}"
)
return self._get_cache_logic(cached_response=cached_value)
else:
# cache miss !
return None
pass
async def async_set_cache(self, key, value, **kwargs):
import numpy as np
from litellm.proxy.proxy_server import llm_router, llm_model_list
try:
await self.index.acreate(overwrite=False) # don't overwrite existing index
except Exception as e:
print_verbose(f"Got exception creating semantic cache index: {str(e)}")
print_verbose(f"async redis semantic-cache set_cache, kwargs: {kwargs}")
# get the prompt
messages = kwargs["messages"]
prompt = ""
for message in messages:
prompt += message["content"]
# create an embedding for prompt
router_model_names = (
[m["model_name"] for m in llm_model_list]
if llm_model_list is not None
else []
)
if llm_router is not None and self.embedding_model in router_model_names:
user_api_key = kwargs.get("metadata", {}).get("user_api_key", "")
embedding_response = await llm_router.aembedding(
model=self.embedding_model,
input=prompt,
cache={"no-store": True, "no-cache": True},
metadata={
"user_api_key": user_api_key,
"semantic-cache-embedding": True,
"trace_id": kwargs.get("metadata", {}).get("trace_id", None),
},
)
else:
# convert to embedding
embedding_response = await litellm.aembedding(
model=self.embedding_model,
input=prompt,
cache={"no-store": True, "no-cache": True},
)
# get the embedding
embedding = embedding_response["data"][0]["embedding"]
# make the embedding a numpy array, convert to bytes
embedding_bytes = np.array(embedding, dtype=np.float32).tobytes()
value = str(value)
assert isinstance(value, str)
new_data = [
{"response": value, "prompt": prompt, "litellm_embedding": embedding_bytes}
]
# Add more data
keys = await self.index.aload(new_data)
return
async def async_get_cache(self, key, **kwargs):
print_verbose(f"async redis semantic-cache get_cache, kwargs: {kwargs}")
from redisvl.query import VectorQuery
import numpy as np
from litellm.proxy.proxy_server import llm_router, llm_model_list
# query
# get the messages
messages = kwargs["messages"]
prompt = ""
for message in messages:
prompt += message["content"]
router_model_names = (
[m["model_name"] for m in llm_model_list]
if llm_model_list is not None
else []
)
if llm_router is not None and self.embedding_model in router_model_names:
user_api_key = kwargs.get("metadata", {}).get("user_api_key", "")
embedding_response = await llm_router.aembedding(
model=self.embedding_model,
input=prompt,
cache={"no-store": True, "no-cache": True},
metadata={
"user_api_key": user_api_key,
"semantic-cache-embedding": True,
"trace_id": kwargs.get("metadata", {}).get("trace_id", None),
},
)
else:
# convert to embedding
embedding_response = await litellm.aembedding(
model=self.embedding_model,
input=prompt,
cache={"no-store": True, "no-cache": True},
)
# get the embedding
embedding = embedding_response["data"][0]["embedding"]
query = VectorQuery(
vector=embedding,
vector_field_name="litellm_embedding",
return_fields=["response", "prompt", "vector_distance"],
)
results = await self.index.aquery(query)
if results == None:
kwargs.setdefault("metadata", {})["semantic-similarity"] = 0.0
return None
if isinstance(results, list):
if len(results) == 0:
kwargs.setdefault("metadata", {})["semantic-similarity"] = 0.0
return None
vector_distance = results[0]["vector_distance"]
vector_distance = float(vector_distance)
similarity = 1 - vector_distance
cached_prompt = results[0]["prompt"]
# check similarity, if more than self.similarity_threshold, return results
print_verbose(
f"semantic cache: similarity threshold: {self.similarity_threshold}, similarity: {similarity}, prompt: {prompt}, closest_cached_prompt: {cached_prompt}"
)
# update kwargs["metadata"] with similarity, don't rewrite the original metadata
kwargs.setdefault("metadata", {})["semantic-similarity"] = similarity
if similarity > self.similarity_threshold:
# cache hit !
cached_value = results[0]["response"]
print_verbose(
f"got a cache hit, similarity: {similarity}, Current prompt: {prompt}, cached_prompt: {cached_prompt}"
)
return self._get_cache_logic(cached_response=cached_value)
else:
# cache miss !
return None
pass
async def _index_info(self):
return await self.index.ainfo()
class S3Cache(BaseCache):
def __init__(
@ -195,6 +624,9 @@ class S3Cache(BaseCache):
# NON blocking - notify users S3 is throwing an exception
print_verbose(f"S3 Caching: set_cache() - Got exception from S3: {e}")
async def async_set_cache(self, key, value, **kwargs):
self.set_cache(key=key, value=value, **kwargs)
def get_cache(self, key, **kwargs):
import boto3, botocore
@ -237,9 +669,15 @@ class S3Cache(BaseCache):
traceback.print_exc()
print_verbose(f"S3 Caching: get_cache() - Got exception from S3: {e}")
async def async_get_cache(self, key, **kwargs):
return self.get_cache(key=key, **kwargs)
def flush_cache(self):
pass
async def disconnect(self):
pass
class DualCache(BaseCache):
"""
@ -304,15 +742,22 @@ class DualCache(BaseCache):
if self.redis_cache is not None:
self.redis_cache.flush_cache()
def delete_cache(self, key):
if self.in_memory_cache is not None:
self.in_memory_cache.delete_cache(key)
if self.redis_cache is not None:
self.redis_cache.delete_cache(key)
#### LiteLLM.Completion / Embedding Cache ####
class Cache:
def __init__(
self,
type: Optional[Literal["local", "redis", "s3"]] = "local",
type: Optional[Literal["local", "redis", "redis-semantic", "s3"]] = "local",
host: Optional[str] = None,
port: Optional[str] = None,
password: Optional[str] = None,
similarity_threshold: Optional[float] = None,
supported_call_types: Optional[
List[Literal["completion", "acompletion", "embedding", "aembedding"]]
] = ["completion", "acompletion", "embedding", "aembedding"],
@ -327,16 +772,20 @@ class Cache:
s3_aws_secret_access_key: Optional[str] = None,
s3_aws_session_token: Optional[str] = None,
s3_config: Optional[Any] = None,
redis_semantic_cache_use_async=False,
redis_semantic_cache_embedding_model="text-embedding-ada-002",
**kwargs,
):
"""
Initializes the cache based on the given type.
Args:
type (str, optional): The type of cache to initialize. Can be "local" or "redis". Defaults to "local".
type (str, optional): The type of cache to initialize. Can be "local", "redis", "redis-semantic", or "s3". Defaults to "local".
host (str, optional): The host address for the Redis cache. Required if type is "redis".
port (int, optional): The port number for the Redis cache. Required if type is "redis".
password (str, optional): The password for the Redis cache. Required if type is "redis".
similarity_threshold (float, optional): The similarity threshold for semantic-caching, Required if type is "redis-semantic"
supported_call_types (list, optional): List of call types to cache for. Defaults to cache == on for all call types.
**kwargs: Additional keyword arguments for redis.Redis() cache
@ -348,9 +797,19 @@ class Cache:
"""
if type == "redis":
self.cache: BaseCache = RedisCache(host, port, password, **kwargs)
if type == "local":
elif type == "redis-semantic":
self.cache = RedisSemanticCache(
host,
port,
password,
similarity_threshold=similarity_threshold,
use_async=redis_semantic_cache_use_async,
embedding_model=redis_semantic_cache_embedding_model,
**kwargs,
)
elif type == "local":
self.cache = InMemoryCache()
if type == "s3":
elif type == "s3":
self.cache = S3Cache(
s3_bucket_name=s3_bucket_name,
s3_region_name=s3_region_name,
@ -476,6 +935,45 @@ class Cache:
}
time.sleep(0.02)
def _get_cache_logic(
self,
cached_result: Optional[Any],
max_age: Optional[float],
):
"""
Common get cache logic across sync + async implementations
"""
# Check if a timestamp was stored with the cached response
if (
cached_result is not None
and isinstance(cached_result, dict)
and "timestamp" in cached_result
):
timestamp = cached_result["timestamp"]
current_time = time.time()
# Calculate age of the cached response
response_age = current_time - timestamp
# Check if the cached response is older than the max-age
if max_age is not None and response_age > max_age:
return None # Cached response is too old
# If the response is fresh, or there's no max-age requirement, return the cached response
# cached_response is in `b{} convert it to ModelResponse
cached_response = cached_result.get("response")
try:
if isinstance(cached_response, dict):
pass
else:
cached_response = json.loads(
cached_response # type: ignore
) # Convert string to dictionary
except:
cached_response = ast.literal_eval(cached_response) # type: ignore
return cached_response
return cached_result
def get_cache(self, *args, **kwargs):
"""
Retrieves the cached result for the given arguments.
@ -488,6 +986,7 @@ class Cache:
The cached result if it exists, otherwise None.
"""
try: # never block execution
messages = kwargs.get("messages", [])
if "cache_key" in kwargs:
cache_key = kwargs["cache_key"]
else:
@ -497,55 +996,44 @@ class Cache:
max_age = cache_control_args.get(
"s-max-age", cache_control_args.get("s-maxage", float("inf"))
)
cached_result = self.cache.get_cache(cache_key)
# Check if a timestamp was stored with the cached response
if (
cached_result is not None
and isinstance(cached_result, dict)
and "timestamp" in cached_result
and max_age is not None
):
timestamp = cached_result["timestamp"]
current_time = time.time()
# Calculate age of the cached response
response_age = current_time - timestamp
# Check if the cached response is older than the max-age
if response_age > max_age:
print_verbose(
f"Cached response for key {cache_key} is too old. Max-age: {max_age}s, Age: {response_age}s"
)
return None # Cached response is too old
# If the response is fresh, or there's no max-age requirement, return the cached response
# cached_response is in `b{} convert it to ModelResponse
cached_response = cached_result.get("response")
try:
if isinstance(cached_response, dict):
pass
else:
cached_response = json.loads(
cached_response
) # Convert string to dictionary
except:
cached_response = ast.literal_eval(cached_response)
return cached_response
return cached_result
cached_result = self.cache.get_cache(cache_key, messages=messages)
return self._get_cache_logic(
cached_result=cached_result, max_age=max_age
)
except Exception as e:
print_verbose(f"An exception occurred: {traceback.format_exc()}")
return None
def add_cache(self, result, *args, **kwargs):
async def async_get_cache(self, *args, **kwargs):
"""
Adds a result to the cache.
Async get cache implementation.
Args:
*args: args to litellm.completion() or embedding()
**kwargs: kwargs to litellm.completion() or embedding()
Used for embedding calls in async wrapper
"""
try: # never block execution
messages = kwargs.get("messages", [])
if "cache_key" in kwargs:
cache_key = kwargs["cache_key"]
else:
cache_key = self.get_cache_key(*args, **kwargs)
if cache_key is not None:
cache_control_args = kwargs.get("cache", {})
max_age = cache_control_args.get(
"s-max-age", cache_control_args.get("s-maxage", float("inf"))
)
cached_result = await self.cache.async_get_cache(
cache_key, *args, **kwargs
)
return self._get_cache_logic(
cached_result=cached_result, max_age=max_age
)
except Exception as e:
print_verbose(f"An exception occurred: {traceback.format_exc()}")
return None
Returns:
None
def _add_cache_logic(self, result, *args, **kwargs):
"""
Common implementation across sync + async add_cache functions
"""
try:
if "cache_key" in kwargs:
@ -564,14 +1052,82 @@ class Cache:
if k == "ttl":
kwargs["ttl"] = v
cached_data = {"timestamp": time.time(), "response": result}
self.cache.set_cache(cache_key, cached_data, **kwargs)
return cache_key, cached_data, kwargs
else:
raise Exception("cache key is None")
except Exception as e:
raise e
def add_cache(self, result, *args, **kwargs):
"""
Adds a result to the cache.
Args:
*args: args to litellm.completion() or embedding()
**kwargs: kwargs to litellm.completion() or embedding()
Returns:
None
"""
try:
cache_key, cached_data, kwargs = self._add_cache_logic(
result=result, *args, **kwargs
)
self.cache.set_cache(cache_key, cached_data, **kwargs)
except Exception as e:
print_verbose(f"LiteLLM Cache: Excepton add_cache: {str(e)}")
traceback.print_exc()
pass
async def _async_add_cache(self, result, *args, **kwargs):
self.add_cache(result, *args, **kwargs)
async def async_add_cache(self, result, *args, **kwargs):
"""
Async implementation of add_cache
"""
try:
cache_key, cached_data, kwargs = self._add_cache_logic(
result=result, *args, **kwargs
)
await self.cache.async_set_cache(cache_key, cached_data, **kwargs)
except Exception as e:
print_verbose(f"LiteLLM Cache: Excepton add_cache: {str(e)}")
traceback.print_exc()
async def async_add_cache_pipeline(self, result, *args, **kwargs):
"""
Async implementation of add_cache for Embedding calls
Does a bulk write, to prevent using too many clients
"""
try:
cache_list = []
for idx, i in enumerate(kwargs["input"]):
preset_cache_key = litellm.cache.get_cache_key(
*args, **{**kwargs, "input": i}
)
kwargs["cache_key"] = preset_cache_key
embedding_response = result.data[idx]
cache_key, cached_data, kwargs = self._add_cache_logic(
result=embedding_response,
*args,
**kwargs,
)
cache_list.append((cache_key, cached_data))
if hasattr(self.cache, "async_set_cache_pipeline"):
await self.cache.async_set_cache_pipeline(cache_list=cache_list)
else:
tasks = []
for val in cache_list:
tasks.append(
self.cache.async_set_cache(cache_key, cached_data, **kwargs)
)
await asyncio.gather(*tasks)
except Exception as e:
print_verbose(f"LiteLLM Cache: Excepton add_cache: {str(e)}")
traceback.print_exc()
async def disconnect(self):
if hasattr(self.cache, "disconnect"):
await self.cache.disconnect()
def enable_cache(

View file

@ -63,6 +63,22 @@ class CustomLogger: # https://docs.litellm.ai/docs/observability/custom_callbac
):
pass
async def async_post_call_streaming_hook(
self, original_exception: Exception, user_api_key_dict: UserAPIKeyAuth
):
"""
Returns the streaming chunk before it's returned to the user
"""
pass
async def async_post_call_success_hook(
self, original_exception: Exception, user_api_key_dict: UserAPIKeyAuth
):
"""
Returns the LLM response before it's returned to the user
"""
pass
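# Illustrative sketch (assumption, not part of this class): a proxy-side handler
# can subclass CustomLogger and override the hooks above, e.g.
#
#   class MyProxyHooks(CustomLogger):
#       async def async_post_call_success_hook(self, original_exception, user_api_key_dict):
#           # inspect/log the response before it is returned to the user
#           pass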
#### SINGLE-USE #### - https://docs.litellm.ai/docs/observability/custom_callback#using-your-custom-callback-function
def log_input_event(self, model, messages, kwargs, print_verbose, callback_func):

View file

@ -2,6 +2,7 @@
# On success, logs events to Helicone
import dotenv, os
import requests
import litellm
dotenv.load_dotenv() # Loading env variables using dotenv
import traceback
@ -56,6 +57,10 @@ class HeliconeLogger:
else "gpt-3.5-turbo"
)
provider_request = {"model": model, "messages": messages}
if isinstance(response_obj, litellm.EmbeddingResponse) or isinstance(
response_obj, litellm.ModelResponse
):
response_obj = response_obj.json()
if "claude" in model:
provider_request, response_obj = self.claude_mapping(

View file

@ -9,11 +9,12 @@ dotenv.load_dotenv() # Loading env variables using dotenv
import traceback
from packaging.version import Version
from litellm._logging import verbose_logger
import litellm
class LangFuseLogger:
# Class variables or attributes
def __init__(self):
def __init__(self, langfuse_public_key=None, langfuse_secret=None):
try:
from langfuse import Langfuse
except Exception as e:
@ -21,8 +22,8 @@ class LangFuseLogger:
f"\033[91mLangfuse not installed, try running 'pip install langfuse' to fix this error: {e}\033[0m"
)
# Instance variables
self.secret_key = os.getenv("LANGFUSE_SECRET_KEY")
self.public_key = os.getenv("LANGFUSE_PUBLIC_KEY")
self.secret_key = langfuse_secret or os.getenv("LANGFUSE_SECRET_KEY")
self.public_key = langfuse_public_key or os.getenv("LANGFUSE_PUBLIC_KEY")
self.langfuse_host = os.getenv("LANGFUSE_HOST", "https://cloud.langfuse.com")
self.langfuse_release = os.getenv("LANGFUSE_RELEASE")
self.langfuse_debug = os.getenv("LANGFUSE_DEBUG")
@ -34,8 +35,41 @@ class LangFuseLogger:
debug=self.langfuse_debug,
)
if os.getenv("UPSTREAM_LANGFUSE_SECRET_KEY") is not None:
self.upstream_langfuse_secret_key = os.getenv(
"UPSTREAM_LANGFUSE_SECRET_KEY"
)
self.upstream_langfuse_public_key = os.getenv(
"UPSTREAM_LANGFUSE_PUBLIC_KEY"
)
self.upstream_langfuse_host = os.getenv("UPSTREAM_LANGFUSE_HOST")
self.upstream_langfuse_release = os.getenv("UPSTREAM_LANGFUSE_RELEASE")
self.upstream_langfuse_debug = os.getenv("UPSTREAM_LANGFUSE_DEBUG")
self.upstream_langfuse = Langfuse(
public_key=self.upstream_langfuse_public_key,
secret_key=self.upstream_langfuse_secret_key,
host=self.upstream_langfuse_host,
release=self.upstream_langfuse_release,
debug=self.upstream_langfuse_debug,
)
else:
self.upstream_langfuse = None
# def log_error(kwargs, response_obj, start_time, end_time):
# generation = trace.generation(
# level ="ERROR" # can be any of DEBUG, DEFAULT, WARNING or ERROR
# status_message='error' # can be any string (e.g. stringified stack trace or error body)
# )
def log_event(
self, kwargs, response_obj, start_time, end_time, user_id, print_verbose
self,
kwargs,
response_obj,
start_time,
end_time,
user_id,
print_verbose,
level="DEFAULT",
status_message=None,
):
# Method definition
@ -63,32 +97,49 @@ class LangFuseLogger:
pass
# end of processing langfuse ########################
input = prompt
output = response_obj["choices"][0]["message"].json()
print_verbose(
f"OUTPUT IN LANGFUSE: {output}; original: {response_obj['choices'][0]['message']}"
)
self._log_langfuse_v2(
user_id,
metadata,
output,
start_time,
end_time,
kwargs,
optional_params,
input,
response_obj,
) if self._is_langfuse_v2() else self._log_langfuse_v1(
user_id,
metadata,
output,
start_time,
end_time,
kwargs,
optional_params,
input,
response_obj,
)
if (
level == "ERROR"
and status_message is not None
and isinstance(status_message, str)
):
input = prompt
output = status_message
elif response_obj is not None and (
kwargs.get("call_type", None) == "embedding"
or isinstance(response_obj, litellm.EmbeddingResponse)
):
input = prompt
output = response_obj["data"]
elif response_obj is not None:
input = prompt
output = response_obj["choices"][0]["message"].json()
print_verbose(f"OUTPUT IN LANGFUSE: {output}; original: {response_obj}")
if self._is_langfuse_v2():
self._log_langfuse_v2(
user_id,
metadata,
output,
start_time,
end_time,
kwargs,
optional_params,
input,
response_obj,
level,
print_verbose,
)
elif response_obj is not None:
self._log_langfuse_v1(
user_id,
metadata,
output,
start_time,
end_time,
kwargs,
optional_params,
input,
response_obj,
)
self.Langfuse.flush()
print_verbose(
@ -97,15 +148,15 @@ class LangFuseLogger:
verbose_logger.info(f"Langfuse Layer Logging - logging success")
except:
traceback.print_exc()
print_verbose(f"Langfuse Layer Error - {traceback.format_exc()}")
print(f"Langfuse Layer Error - {traceback.format_exc()}")
pass
async def _async_log_event(
self, kwargs, response_obj, start_time, end_time, user_id, print_verbose
):
self.log_event(
kwargs, response_obj, start_time, end_time, user_id, print_verbose
)
"""
TODO: support async calls when langfuse is truly async
"""
def _is_langfuse_v2(self):
import langfuse
@ -167,40 +218,84 @@ class LangFuseLogger:
optional_params,
input,
response_obj,
level,
print_verbose,
):
import langfuse
tags = []
supports_tags = Version(langfuse.version.__version__) >= Version("2.6.3")
try:
tags = []
supports_tags = Version(langfuse.version.__version__) >= Version("2.6.3")
supports_costs = Version(langfuse.version.__version__) >= Version("2.7.3")
trace_params = {
"name": metadata.get("generation_name", "litellm-completion"),
"input": input,
"output": output,
"user_id": metadata.get("trace_user_id", user_id),
"id": metadata.get("trace_id", None),
}
if supports_tags:
for key, value in metadata.items():
tags.append(f"{key}:{value}")
if "cache_hit" in kwargs:
tags.append(f"cache_hit:{kwargs['cache_hit']}")
trace_params.update({"tags": tags})
print_verbose(f"Langfuse Layer Logging - logging to langfuse v2 ")
trace = self.Langfuse.trace(**trace_params)
if supports_tags:
metadata_tags = metadata.get("tags", [])
tags = metadata_tags
trace.generation(
name=metadata.get("generation_name", "litellm-completion"),
id=metadata.get("generation_id", None),
startTime=start_time,
endTime=end_time,
model=kwargs["model"],
modelParameters=optional_params,
input=input,
output=output,
usage={
"prompt_tokens": response_obj["usage"]["prompt_tokens"],
"completion_tokens": response_obj["usage"]["completion_tokens"],
},
metadata=metadata,
)
generation_name = metadata.get("generation_name", None)
if generation_name is None:
# just log `litellm-{call_type}` as the generation name
generation_name = f"litellm-{kwargs.get('call_type', 'completion')}"
trace_params = {
"name": generation_name,
"input": input,
"user_id": metadata.get("trace_user_id", user_id),
"id": metadata.get("trace_id", None),
"session_id": metadata.get("session_id", None),
}
if level == "ERROR":
trace_params["status_message"] = output
else:
trace_params["output"] = output
cost = kwargs.get("response_cost", None)
print_verbose(f"trace: {cost}")
if supports_tags:
for key, value in metadata.items():
if key in [
"user_api_key",
"user_api_key_user_id",
"semantic-similarity",
]:
tags.append(f"{key}:{value}")
if "cache_hit" in kwargs:
if kwargs["cache_hit"] is None:
kwargs["cache_hit"] = False
tags.append(f"cache_hit:{kwargs['cache_hit']}")
trace_params.update({"tags": tags})
trace = self.Langfuse.trace(**trace_params)
generation_id = None
usage = None
if response_obj is not None and response_obj.get("id", None) is not None:
generation_id = litellm.utils.get_logging_id(start_time, response_obj)
usage = {
"prompt_tokens": response_obj["usage"]["prompt_tokens"],
"completion_tokens": response_obj["usage"]["completion_tokens"],
"total_cost": cost if supports_costs else None,
}
generation_params = {
"name": generation_name,
"id": metadata.get("generation_id", generation_id),
"startTime": start_time,
"endTime": end_time,
"model": kwargs["model"],
"modelParameters": optional_params,
"input": input,
"output": output,
"usage": usage,
"metadata": metadata,
"level": level,
}
if output is not None and isinstance(output, str) and level == "ERROR":
generation_params["statusMessage"] = output
trace.generation(**generation_params)
except Exception as e:
print(f"Langfuse Layer Error - {traceback.format_exc()}")

View file

@ -8,7 +8,7 @@ dotenv.load_dotenv() # Loading env variables using dotenv
import traceback
import datetime, subprocess, sys
import litellm, uuid
from litellm._logging import print_verbose
from litellm._logging import print_verbose, verbose_logger
class S3Logger:
@ -31,7 +31,9 @@ class S3Logger:
import boto3
try:
print_verbose("in init s3 logger")
verbose_logger.debug(
f"in init s3 logger - s3_callback_params {litellm.s3_callback_params}"
)
if litellm.s3_callback_params is not None:
# read in .env variables - example os.environ/AWS_BUCKET_NAME
@ -42,7 +44,7 @@ class S3Logger:
s3_bucket_name = litellm.s3_callback_params.get("s3_bucket_name")
s3_region_name = litellm.s3_callback_params.get("s3_region_name")
s3_api_version = litellm.s3_callback_params.get("s3_api_version")
s3_use_ssl = litellm.s3_callback_params.get("s3_use_ssl")
s3_use_ssl = litellm.s3_callback_params.get("s3_use_ssl", True)
s3_verify = litellm.s3_callback_params.get("s3_verify")
s3_endpoint_url = litellm.s3_callback_params.get("s3_endpoint_url")
s3_aws_access_key_id = litellm.s3_callback_params.get(
@ -59,6 +61,7 @@ class S3Logger:
self.bucket_name = s3_bucket_name
self.s3_path = s3_path
verbose_logger.debug(f"s3 logger using endpoint url {s3_endpoint_url}")
# Create an S3 client with custom endpoint URL
self.s3_client = boto3.client(
"s3",
@ -84,7 +87,9 @@ class S3Logger:
def log_event(self, kwargs, response_obj, start_time, end_time, print_verbose):
try:
print_verbose(f"s3 Logging - Enters logging function for model {kwargs}")
verbose_logger.debug(
f"s3 Logging - Enters logging function for model {kwargs}"
)
# construct payload to send to s3
# follows the same params as langfuse.py
@ -123,12 +128,22 @@ class S3Logger:
# non blocking if it can't cast to a str
pass
s3_file_name = litellm.utils.get_logging_id(start_time, payload) or ""
s3_object_key = (
(self.s3_path.rstrip("/") + "/" if self.s3_path else "")
+ payload["id"]
+ "-time="
+ str(start_time)
+ start_time.strftime("%Y-%m-%d")
+ "/"
+ s3_file_name
) # we need the s3 key to include the time, so we log cache hits too
s3_object_key += ".json"
s3_object_download_filename = (
"time-"
+ start_time.strftime("%Y-%m-%dT%H-%M-%S-%f")
+ "_"
+ payload["id"]
+ ".json"
)
import json
@ -142,7 +157,8 @@ class S3Logger:
Body=payload,
ContentType="application/json",
ContentLanguage="en",
ContentDisposition=f'inline; filename="{key}.json"',
ContentDisposition=f'inline; filename="{s3_object_download_filename}"',
CacheControl="private, immutable, max-age=31536000, s-maxage=0",
)
print_verbose(f"Response from s3:{str(response)}")
@ -151,5 +167,5 @@ class S3Logger:
return response
except Exception as e:
traceback.print_exc()
print_verbose(f"s3 Layer Error - {str(e)}\n{traceback.format_exc()}")
verbose_logger.debug(f"s3 Layer Error - {str(e)}\n{traceback.format_exc()}")
pass

View file

@ -2,9 +2,9 @@ import json, copy, types
import os
from enum import Enum
import time
from typing import Callable, Optional, Any, Union
from typing import Callable, Optional, Any, Union, List
import litellm
from litellm.utils import ModelResponse, get_secret, Usage
from litellm.utils import ModelResponse, get_secret, Usage, ImageResponse
from .prompt_templates.factory import prompt_factory, custom_prompt
import httpx
@ -282,6 +282,73 @@ class AmazonLlamaConfig:
}
class AmazonStabilityConfig:
"""
Reference: https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=stability.stable-diffusion-xl-v0
Supported Params for the Amazon / Stable Diffusion models:
- `cfg_scale` (integer): Default `7`. Between [ 0 .. 35 ]. How strictly the diffusion process adheres to the prompt text (higher values keep your image closer to your prompt)
- `seed` (float): Default: `0`. Between [ 0 .. 4294967295 ]. Random noise seed (omit this option or use 0 for a random seed)
- `steps` (array of strings): Default `30`. Between [ 10 .. 50 ]. Number of diffusion steps to run.
- `width` (integer): Default: `512`. multiple of 64 >= 128. Width of the image to generate, in pixels, in an increment divisible by 64.
Engine-specific dimension validation:
- SDXL Beta: must be between 128x128 and 512x896 (or 896x512); only one dimension can be greater than 512.
- SDXL v0.9: must be one of 1024x1024, 1152x896, 1216x832, 1344x768, 1536x640, 640x1536, 768x1344, 832x1216, or 896x1152
- SDXL v1.0: same as SDXL v0.9
- SD v1.6: must be between 320x320 and 1536x1536
- `height` (integer): Default: `512`. multiple of 64 >= 128. Height of the image to generate, in pixels, in an increment divisible by 64.
Engine-specific dimension validation:
- SDXL Beta: must be between 128x128 and 512x896 (or 896x512); only one dimension can be greater than 512.
- SDXL v0.9: must be one of 1024x1024, 1152x896, 1216x832, 1344x768, 1536x640, 640x1536, 768x1344, 832x1216, or 896x1152
- SDXL v1.0: same as SDXL v0.9
- SD v1.6: must be between 320x320 and 1536x1536
"""
cfg_scale: Optional[int] = None
seed: Optional[float] = None
steps: Optional[List[str]] = None
width: Optional[int] = None
height: Optional[int] = None
def __init__(
self,
cfg_scale: Optional[int] = None,
seed: Optional[float] = None,
steps: Optional[List[str]] = None,
width: Optional[int] = None,
height: Optional[int] = None,
) -> None:
locals_ = locals()
for key, value in locals_.items():
if key != "self" and value is not None:
setattr(self.__class__, key, value)
@classmethod
def get_config(cls):
return {
k: v
for k, v in cls.__dict__.items()
if not k.startswith("__")
and not isinstance(
v,
(
types.FunctionType,
types.BuiltinFunctionType,
classmethod,
staticmethod,
),
)
and v is not None
}
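# Illustrative sketch (assumption, not part of this module): values passed to the
# constructor are set as class-level defaults and surface via get_config(), e.g.
#
#   AmazonStabilityConfig(cfg_scale=10, width=512, height=512)
#   AmazonStabilityConfig.get_config()  # -> {"cfg_scale": 10, "width": 512, "height": 512}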
def init_bedrock_client(
region_name=None,
aws_access_key_id: Optional[str] = None,
@ -289,7 +356,9 @@ def init_bedrock_client(
aws_region_name: Optional[str] = None,
aws_bedrock_runtime_endpoint: Optional[str] = None,
aws_session_name: Optional[str] = None,
aws_profile_name: Optional[str] = None,
aws_role_name: Optional[str] = None,
timeout: Optional[int] = None,
):
# check for custom AWS_REGION_NAME and use it if not passed to init_bedrock_client
litellm_aws_region_name = get_secret("AWS_REGION_NAME", None)
@ -303,6 +372,7 @@ def init_bedrock_client(
aws_region_name,
aws_bedrock_runtime_endpoint,
aws_session_name,
aws_profile_name,
aws_role_name,
]
@ -317,6 +387,7 @@ def init_bedrock_client(
aws_region_name,
aws_bedrock_runtime_endpoint,
aws_session_name,
aws_profile_name,
aws_role_name,
) = params_to_check
@ -346,6 +417,8 @@ def init_bedrock_client(
import boto3
config = boto3.session.Config(connect_timeout=timeout, read_timeout=timeout)
### CHECK STS ###
if aws_role_name is not None and aws_session_name is not None:
# use sts if role name passed in
@ -366,6 +439,7 @@ def init_bedrock_client(
aws_session_token=sts_response["Credentials"]["SessionToken"],
region_name=region_name,
endpoint_url=endpoint_url,
config=config,
)
elif aws_access_key_id is not None:
# uses auth params passed to completion
@ -377,6 +451,16 @@ def init_bedrock_client(
aws_secret_access_key=aws_secret_access_key,
region_name=region_name,
endpoint_url=endpoint_url,
config=config,
)
elif aws_profile_name is not None:
# uses auth values from AWS profile usually stored in ~/.aws/credentials
client = boto3.Session(profile_name=aws_profile_name).client(
service_name="bedrock-runtime",
region_name=region_name,
endpoint_url=endpoint_url,
config=config,
)
else:
# aws_access_key_id is None, assume user is trying to auth using env variables
@ -386,6 +470,7 @@ def init_bedrock_client(
service_name="bedrock-runtime",
region_name=region_name,
endpoint_url=endpoint_url,
config=config,
)
return client
@ -441,6 +526,7 @@ def completion(
optional_params=None,
litellm_params=None,
logger_fn=None,
timeout=None,
):
exception_mapping_worked = False
try:
@ -450,6 +536,7 @@ def completion(
aws_region_name = optional_params.pop("aws_region_name", None)
aws_role_name = optional_params.pop("aws_role_name", None)
aws_session_name = optional_params.pop("aws_session_name", None)
aws_profile_name = optional_params.pop("aws_profile_name", None)
aws_bedrock_runtime_endpoint = optional_params.pop(
"aws_bedrock_runtime_endpoint", None
)
@ -466,6 +553,8 @@ def completion(
aws_bedrock_runtime_endpoint=aws_bedrock_runtime_endpoint,
aws_role_name=aws_role_name,
aws_session_name=aws_session_name,
aws_profile_name=aws_profile_name,
timeout=timeout,
)
model = model
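Editor's illustrative sketch (not part of this diff): the new aws_profile_name and timeout kwargs are popped from optional_params above, so a caller would pass them straight to litellm.completion. Model string, profile name, and region below are placeholders.
import litellm

response = litellm.completion(
    model="bedrock/anthropic.claude-v2",   # placeholder Bedrock model
    messages=[{"role": "user", "content": "Hello"}],
    aws_profile_name="my-profile",         # profile stored in ~/.aws/credentials
    aws_region_name="us-west-2",
    timeout=60,
)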
@ -652,6 +741,8 @@ def completion(
try:
if len(outputText) > 0:
model_response["choices"][0]["message"]["content"] = outputText
else:
raise Exception()
except:
raise BedrockError(
message=json.dumps(outputText),
@ -659,9 +750,16 @@ def completion(
)
## CALCULATING USAGE - baseten charges on time, not tokens - have some mapping of cost here.
prompt_tokens = len(encoding.encode(prompt))
completion_tokens = len(
encoding.encode(model_response["choices"][0]["message"].get("content", ""))
prompt_tokens = response_metadata.get(
"x-amzn-bedrock-input-token-count", len(encoding.encode(prompt))
)
completion_tokens = response_metadata.get(
"x-amzn-bedrock-output-token-count",
len(
encoding.encode(
model_response["choices"][0]["message"].get("content", "")
)
),
)
model_response["created"] = int(time.time())
@ -672,6 +770,8 @@ def completion(
total_tokens=prompt_tokens + completion_tokens,
)
model_response.usage = usage
model_response._hidden_params["region_name"] = client.meta.region_name
print_verbose(f"model_response._hidden_params: {model_response._hidden_params}")
return model_response
except BedrockError as e:
exception_mapping_worked = True
@ -693,6 +793,11 @@ def _embedding_func_single(
encoding=None,
logging_obj=None,
):
if isinstance(input, str) is False:
raise BedrockError(
message="Bedrock Embedding API input must be type str | List[str]",
status_code=400,
)
# logic for parsing in - calling - parsing out model embedding calls
## FORMAT EMBEDDING INPUT ##
provider = model.split(".")[0]
@ -786,7 +891,8 @@ def embedding(
aws_role_name=aws_role_name,
aws_session_name=aws_session_name,
)
if type(input) == str:
if isinstance(input, str):
## Embedding Call
embeddings = [
_embedding_func_single(
model,
@ -796,8 +902,8 @@ def embedding(
logging_obj=logging_obj,
)
]
else:
## Embedding Call
elif isinstance(input, list):
## Embedding Call - assuming this is a List[str]
embeddings = [
_embedding_func_single(
model,
@ -808,6 +914,12 @@ def embedding(
)
for i in input
] # [TODO]: make these parallel calls
else:
# enters this branch if input = int, ex. input=2
raise BedrockError(
message="Bedrock Embedding API input must be type str | List[str]",
status_code=400,
)
## Populate OpenAI compliant dictionary
embedding_response = []
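Editor's illustrative sketch (not part of this diff): the input validation above accepts str or List[str] and rejects anything else with a 400. The model string is a placeholder.
import litellm

# single string -> one embedding
litellm.embedding(model="bedrock/amazon.titan-embed-text-v1", input="hello world")

# list of strings -> one embedding per item
litellm.embedding(model="bedrock/amazon.titan-embed-text-v1", input=["hello", "world"])

# anything else (e.g. input=2) raises BedrockError(status_code=400)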
@ -834,3 +946,112 @@ def embedding(
model_response.usage = usage
return model_response
def image_generation(
model: str,
prompt: str,
timeout=None,
logging_obj=None,
model_response=None,
optional_params=None,
aimg_generation=False,
):
"""
Bedrock Image Gen endpoint support
"""
### BOTO3 INIT ###
# pop aws_secret_access_key, aws_access_key_id, aws_region_name from kwargs, since completion calls fail with them
aws_secret_access_key = optional_params.pop("aws_secret_access_key", None)
aws_access_key_id = optional_params.pop("aws_access_key_id", None)
aws_region_name = optional_params.pop("aws_region_name", None)
aws_role_name = optional_params.pop("aws_role_name", None)
aws_session_name = optional_params.pop("aws_session_name", None)
aws_bedrock_runtime_endpoint = optional_params.pop(
"aws_bedrock_runtime_endpoint", None
)
# use passed in BedrockRuntime.Client if provided, otherwise create a new one
client = init_bedrock_client(
aws_access_key_id=aws_access_key_id,
aws_secret_access_key=aws_secret_access_key,
aws_region_name=aws_region_name,
aws_bedrock_runtime_endpoint=aws_bedrock_runtime_endpoint,
aws_role_name=aws_role_name,
aws_session_name=aws_session_name,
timeout=timeout,
)
### FORMAT IMAGE GENERATION INPUT ###
modelId = model
provider = model.split(".")[0]
inference_params = copy.deepcopy(optional_params)
inference_params.pop(
"user", None
) # make sure user is not passed in for bedrock call
data = {}
if provider == "stability":
prompt = prompt.replace(os.linesep, " ")
## LOAD CONFIG
config = litellm.AmazonStabilityConfig.get_config()
for k, v in config.items():
if (
k not in inference_params
): # completion(top_k=3) > anthropic_config(top_k=3) <- allows for dynamic variables to be passed in
inference_params[k] = v
data = {"text_prompts": [{"text": prompt, "weight": 1}], **inference_params}
else:
raise BedrockError(
status_code=422, message=f"Unsupported model={model}, passed in"
)
body = json.dumps(data).encode("utf-8")
## LOGGING
request_str = f"""
response = client.invoke_model(
body={body},
modelId={modelId},
accept="application/json",
contentType="application/json",
)""" # type: ignore
logging_obj.pre_call(
input=prompt,
api_key="", # boto3 is used for init.
additional_args={
"complete_input_dict": {"model": modelId, "texts": prompt},
"request_str": request_str,
},
)
try:
response = client.invoke_model(
body=body,
modelId=modelId,
accept="application/json",
contentType="application/json",
)
response_body = json.loads(response.get("body").read())
## LOGGING
logging_obj.post_call(
input=prompt,
api_key="",
additional_args={"complete_input_dict": data},
original_response=json.dumps(response_body),
)
except Exception as e:
raise BedrockError(
message=f"Embedding Error with model {model}: {e}", status_code=500
)
### FORMAT RESPONSE TO OPENAI FORMAT ###
if response_body is None:
raise Exception("Error in response object format")
if model_response is None:
model_response = ImageResponse()
image_list: List = []
for artifact in response_body["artifacts"]:
image_dict = {"url": artifact["base64"]}
model_response.data = image_dict
return model_response

View file

@ -145,8 +145,17 @@ def get_ollama_response(
): # completion(top_k=3) > cohere_config(top_k=3) <- allows for dynamic variables to be passed in
optional_params[k] = v
optional_params["stream"] = optional_params.get("stream", False)
data = {"model": model, "prompt": prompt, **optional_params}
stream = optional_params.pop("stream", False)
format = optional_params.pop("format", None)
data = {
"model": model,
"prompt": prompt,
"options": optional_params,
"stream": stream,
}
if format is not None:
data["format"] = format
## LOGGING
logging_obj.pre_call(
input=None,
@ -159,7 +168,7 @@ def get_ollama_response(
},
)
if acompletion is True:
if optional_params.get("stream", False) == True:
if stream == True:
response = ollama_async_streaming(
url=url,
data=data,
@ -176,10 +185,12 @@ def get_ollama_response(
logging_obj=logging_obj,
)
return response
elif optional_params.get("stream", False) == True:
elif stream == True:
return ollama_completion_stream(url=url, data=data, logging_obj=logging_obj)
response = requests.post(url=f"{url}", json=data, timeout=litellm.request_timeout)
response = requests.post(
url=f"{url}", json={**data, "stream": stream}, timeout=litellm.request_timeout
)
if response.status_code != 200:
raise OllamaError(status_code=response.status_code, message=response.text)
@ -254,7 +265,7 @@ async def ollama_async_streaming(url, data, model_response, encoding, logging_ob
) as response:
if response.status_code != 200:
raise OllamaError(
status_code=response.status_code, message=response.text
status_code=response.status_code, message=await response.aread()
)
streamwrapper = litellm.CustomStreamWrapper(
@ -267,6 +278,7 @@ async def ollama_async_streaming(url, data, model_response, encoding, logging_ob
yield transformed_chunk
except Exception as e:
traceback.print_exc()
raise e
async def ollama_acompletion(url, data, model_response, encoding, logging_obj):

View file

@ -145,8 +145,16 @@ def get_ollama_response(
): # completion(top_k=3) > cohere_config(top_k=3) <- allows for dynamic variables to be passed in
optional_params[k] = v
optional_params["stream"] = optional_params.get("stream", False)
data = {"model": model, "messages": messages, **optional_params}
stream = optional_params.pop("stream", False)
format = optional_params.pop("format", None)
data = {
"model": model,
"messages": messages,
"options": optional_params,
"stream": stream,
}
if format is not None:
data["format"] = format
## LOGGING
logging_obj.pre_call(
input=None,
@ -159,7 +167,7 @@ def get_ollama_response(
},
)
if acompletion is True:
if optional_params.get("stream", False) == True:
if stream == True:
response = ollama_async_streaming(
url=url,
data=data,
@ -176,7 +184,7 @@ def get_ollama_response(
logging_obj=logging_obj,
)
return response
elif optional_params.get("stream", False) == True:
elif stream == True:
return ollama_completion_stream(url=url, data=data, logging_obj=logging_obj)
response = requests.post(
@ -220,8 +228,10 @@ def get_ollama_response(
model_response["choices"][0]["message"] = response_json["message"]
model_response["created"] = int(time.time())
model_response["model"] = "ollama/" + model
prompt_tokens = response_json.get("prompt_eval_count", len(encoding.encode(prompt))) # type: ignore
completion_tokens = response_json["eval_count"]
prompt_tokens = response_json.get("prompt_eval_count", litellm.token_counter(messages=messages)) # type: ignore
completion_tokens = response_json.get(
"eval_count", litellm.token_counter(text=response_json["message"]["content"])
)
model_response["usage"] = litellm.Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
@ -318,10 +328,16 @@ async def ollama_acompletion(url, data, model_response, encoding, logging_obj):
model_response["choices"][0]["message"] = message
else:
model_response["choices"][0]["message"] = response_json["message"]
model_response["created"] = int(time.time())
model_response["model"] = "ollama/" + data["model"]
prompt_tokens = response_json.get("prompt_eval_count", len(encoding.encode(prompt))) # type: ignore
completion_tokens = response_json["eval_count"]
model_response["model"] = "ollama_chat/" + data["model"]
prompt_tokens = response_json.get("prompt_eval_count", litellm.token_counter(messages=data["messages"])) # type: ignore
completion_tokens = response_json.get(
"eval_count",
litellm.token_counter(
text=response_json["message"]["content"], count_response_tokens=True
),
)
model_response["usage"] = litellm.Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,

View file

@ -221,6 +221,8 @@ class OpenAIChatCompletion(BaseLLM):
headers: Optional[dict] = None,
custom_prompt_dict: dict = {},
client=None,
organization: Optional[str] = None,
custom_llm_provider: Optional[str] = None,
):
super().completion()
exception_mapping_worked = False
@ -235,6 +237,14 @@ class OpenAIChatCompletion(BaseLLM):
status_code=422, message=f"Timeout needs to be a float"
)
if custom_llm_provider == "mistral":
# check if message content passed in as list, and not string
messages = prompt_factory(
model=model,
messages=messages,
custom_llm_provider=custom_llm_provider,
)
for _ in range(
2
): # if call fails due to alternating messages, retry with reformatted message
@ -254,6 +264,7 @@ class OpenAIChatCompletion(BaseLLM):
timeout=timeout,
client=client,
max_retries=max_retries,
organization=organization,
)
else:
return self.acompletion(
@ -266,6 +277,7 @@ class OpenAIChatCompletion(BaseLLM):
timeout=timeout,
client=client,
max_retries=max_retries,
organization=organization,
)
elif optional_params.get("stream", False):
return self.streaming(
@ -278,6 +290,7 @@ class OpenAIChatCompletion(BaseLLM):
timeout=timeout,
client=client,
max_retries=max_retries,
organization=organization,
)
else:
if not isinstance(max_retries, int):
@ -291,6 +304,7 @@ class OpenAIChatCompletion(BaseLLM):
http_client=litellm.client_session,
timeout=timeout,
max_retries=max_retries,
organization=organization,
)
else:
openai_client = client
@ -320,12 +334,13 @@ class OpenAIChatCompletion(BaseLLM):
model_response_object=model_response,
)
except Exception as e:
if "Conversation roles must alternate user/assistant" in str(
e
) or "user and assistant roles should be alternating" in str(e):
if (
"Conversation roles must alternate user/assistant" in str(e)
or "user and assistant roles should be alternating" in str(e)
) and messages is not None:
# reformat messages to ensure user/assistant are alternating; if there are 2 consecutive 'user' messages or 2 consecutive 'assistant' messages, add a blank 'user' or 'assistant' message to ensure compatibility
new_messages = []
for i in range(len(messages) - 1):
for i in range(len(messages) - 1): # type: ignore
new_messages.append(messages[i])
if messages[i]["role"] == messages[i + 1]["role"]:
if messages[i]["role"] == "user":
@ -336,7 +351,9 @@ class OpenAIChatCompletion(BaseLLM):
new_messages.append({"role": "user", "content": ""})
new_messages.append(messages[-1])
messages = new_messages
elif "Last message must have role `user`" in str(e):
elif (
"Last message must have role `user`" in str(e)
) and messages is not None:
new_messages = messages
new_messages.append({"role": "user", "content": ""})
messages = new_messages
@ -358,6 +375,7 @@ class OpenAIChatCompletion(BaseLLM):
timeout: float,
api_key: Optional[str] = None,
api_base: Optional[str] = None,
organization: Optional[str] = None,
client=None,
max_retries=None,
logging_obj=None,
@ -372,6 +390,7 @@ class OpenAIChatCompletion(BaseLLM):
http_client=litellm.aclient_session,
timeout=timeout,
max_retries=max_retries,
organization=organization,
)
else:
openai_aclient = client
@ -412,6 +431,7 @@ class OpenAIChatCompletion(BaseLLM):
model: str,
api_key: Optional[str] = None,
api_base: Optional[str] = None,
organization: Optional[str] = None,
client=None,
max_retries=None,
headers=None,
@ -423,6 +443,7 @@ class OpenAIChatCompletion(BaseLLM):
http_client=litellm.client_session,
timeout=timeout,
max_retries=max_retries,
organization=organization,
)
else:
openai_client = client
@ -431,8 +452,8 @@ class OpenAIChatCompletion(BaseLLM):
input=data["messages"],
api_key=api_key,
additional_args={
"headers": headers,
"api_base": api_base,
"headers": {"Authorization": f"Bearer {openai_client.api_key}"},
"api_base": openai_client._base_url._uri_reference,
"acompletion": False,
"complete_input_dict": data,
},
@ -454,6 +475,7 @@ class OpenAIChatCompletion(BaseLLM):
model: str,
api_key: Optional[str] = None,
api_base: Optional[str] = None,
organization: Optional[str] = None,
client=None,
max_retries=None,
headers=None,
@ -467,6 +489,7 @@ class OpenAIChatCompletion(BaseLLM):
http_client=litellm.aclient_session,
timeout=timeout,
max_retries=max_retries,
organization=organization,
)
else:
openai_aclient = client
@ -718,8 +741,22 @@ class OpenAIChatCompletion(BaseLLM):
return convert_to_model_response_object(response_object=response, model_response_object=model_response, response_type="image_generation") # type: ignore
except OpenAIError as e:
exception_mapping_worked = True
## LOGGING
logging_obj.post_call(
input=prompt,
api_key=api_key,
additional_args={"complete_input_dict": data},
original_response=str(e),
)
raise e
except Exception as e:
## LOGGING
logging_obj.post_call(
input=prompt,
api_key=api_key,
additional_args={"complete_input_dict": data},
original_response=str(e),
)
if hasattr(e, "status_code"):
raise OpenAIError(status_code=e.status_code, message=str(e))
else:
@ -734,8 +771,11 @@ class OpenAIChatCompletion(BaseLLM):
messages: Optional[list] = None,
input: Optional[list] = None,
prompt: Optional[str] = None,
organization: Optional[str] = None,
):
client = AsyncOpenAI(api_key=api_key, timeout=timeout)
client = AsyncOpenAI(
api_key=api_key, timeout=timeout, organization=organization
)
if model is None and mode != "image_generation":
raise Exception("model is not set")

View file

@ -99,12 +99,16 @@ def ollama_pt(
def mistral_instruct_pt(messages):
# Following the Mistral example's https://huggingface.co/docs/transformers/main/chat_templating
prompt = custom_prompt(
initial_prompt_value="<s>",
role_dict={
"system": {"pre_message": "[INST]", "post_message": "[/INST]"},
"user": {"pre_message": "[INST]", "post_message": "[/INST]"},
"assistant": {"pre_message": "[INST]", "post_message": "[/INST]"},
"system": {
"pre_message": "[INST] \n",
"post_message": " [/INST]\n",
},
"user": {"pre_message": "[INST] ", "post_message": " [/INST]\n"},
"assistant": {"pre_message": " ", "post_message": " "},
},
final_prompt_value="</s>",
messages=messages,
@ -112,6 +116,28 @@ def mistral_instruct_pt(messages):
return prompt
def mistral_api_pt(messages):
"""
- handles scenario where content is list and not string
- content list is just text, and no images
- if image passed in, then just return as is (user-intended)
Motivation: mistral api doesn't support content as a list
"""
new_messages = []
for m in messages:
texts = ""
if isinstance(m["content"], list):
for c in m["content"]:
if c["type"] == "image_url":
return messages
elif c["type"] == "text" and isinstance(c["text"], str):
texts += c["text"]
new_m = {"role": m["role"], "content": texts}
new_messages.append(new_m)
return new_messages
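Editor's illustrative sketch (not part of this diff) of the transformation mistral_api_pt performs:
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "What is "},
            {"type": "text", "text": "2 + 2?"},
        ],
    }
]
mistral_api_pt(messages)
# -> [{"role": "user", "content": "What is 2 + 2?"}]
# If any content item has type "image_url", the original messages are returned unchanged.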
# Falcon prompt template - from https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py#L110
def falcon_instruct_pt(messages):
prompt = ""
@ -372,6 +398,7 @@ def anthropic_pt(
You can "put words in Claude's mouth" by ending with an assistant message.
See: https://docs.anthropic.com/claude/docs/put-words-in-claudes-mouth
"""
class AnthropicConstants(Enum):
HUMAN_PROMPT = "\n\nHuman: "
AI_PROMPT = "\n\nAssistant: "
@ -394,32 +421,35 @@ def anthropic_pt(
prompt += f"{AnthropicConstants.AI_PROMPT.value}"
return prompt
def _load_image_from_url(image_url):
try:
from PIL import Image
except:
raise Exception("gemini image conversion failed please run `pip install Pillow`")
raise Exception(
"gemini image conversion failed please run `pip install Pillow`"
)
from io import BytesIO
try:
# Send a GET request to the image URL
response = requests.get(image_url)
response.raise_for_status() # Raise an exception for HTTP errors
# Check the response's content type to ensure it is an image
content_type = response.headers.get('content-type')
if not content_type or 'image' not in content_type:
raise ValueError(f"URL does not point to a valid image (content-type: {content_type})")
content_type = response.headers.get("content-type")
if not content_type or "image" not in content_type:
raise ValueError(
f"URL does not point to a valid image (content-type: {content_type})"
)
# Load the image from the response content
return Image.open(BytesIO(response.content))
except requests.RequestException as e:
print(f"Request failed: {e}")
except UnidentifiedImageError:
print("Cannot identify image file (it may not be a supported image format or might be corrupted).")
except ValueError as e:
print(e)
raise Exception(f"Request failed: {e}")
except Exception as e:
raise e
def _gemini_vision_convert_messages(messages: list):
@ -437,10 +467,11 @@ def _gemini_vision_convert_messages(messages: list):
try:
from PIL import Image
except:
raise Exception("gemini image conversion failed please run `pip install Pillow`")
raise Exception(
"gemini image conversion failed please run `pip install Pillow`"
)
try:
# given messages for gpt-4 vision, convert them for gemini
# https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/getting-started/intro_gemini_python.ipynb
prompt = ""
@ -589,7 +620,7 @@ def prompt_factory(
if custom_llm_provider == "ollama":
return ollama_pt(model=model, messages=messages)
elif custom_llm_provider == "anthropic":
if any(_ in model for _ in ["claude-2.1","claude-v2:1"]):
if any(_ in model for _ in ["claude-2.1", "claude-v2:1"]):
return claude_2_1_pt(messages=messages)
else:
return anthropic_pt(messages=messages)
@ -603,6 +634,8 @@ def prompt_factory(
return _gemini_vision_convert_messages(messages=messages)
else:
return gemini_text_image_pt(messages=messages)
elif custom_llm_provider == "mistral":
return mistral_api_pt(messages=messages)
try:
if "meta-llama/llama-2" in model and "chat" in model:
return llama_2_chat_pt(messages=messages)

View file

@ -34,22 +34,35 @@ class TokenIterator:
self.byte_iterator = iter(stream)
self.buffer = io.BytesIO()
self.read_pos = 0
self.end_of_data = False
def __iter__(self):
return self
def __next__(self):
while True:
self.buffer.seek(self.read_pos)
line = self.buffer.readline()
if line and line[-1] == ord("\n"):
self.read_pos += len(line) + 1
full_line = line[:-1].decode("utf-8")
line_data = json.loads(full_line.lstrip("data:").rstrip("/n"))
return line_data["token"]["text"]
chunk = next(self.byte_iterator)
self.buffer.seek(0, io.SEEK_END)
self.buffer.write(chunk["PayloadPart"]["Bytes"])
try:
while True:
self.buffer.seek(self.read_pos)
line = self.buffer.readline()
if line and line[-1] == ord("\n"):
response_obj = {"text": "", "is_finished": False}
self.read_pos += len(line) + 1
full_line = line[:-1].decode("utf-8")
line_data = json.loads(full_line.lstrip("data:").rstrip("/n"))
if line_data.get("generated_text", None) is not None:
self.end_of_data = True
response_obj["is_finished"] = True
response_obj["text"] = line_data["token"]["text"]
return response_obj
chunk = next(self.byte_iterator)
self.buffer.seek(0, io.SEEK_END)
self.buffer.write(chunk["PayloadPart"]["Bytes"])
except StopIteration as e:
if self.end_of_data == True:
raise e # Re-raise StopIteration
else:
self.end_of_data = True
return "data: [DONE]"
class SagemakerConfig:
@ -353,7 +366,7 @@ def embedding(
aws_access_key_id = optional_params.pop("aws_access_key_id", None)
aws_region_name = optional_params.pop("aws_region_name", None)
if aws_access_key_id != None:
if aws_access_key_id is not None:
# uses auth params passed to completion
# aws_access_key_id is not None, assume user is trying to auth using litellm.completion
client = boto3.client(

View file

@ -1,3 +1,7 @@
"""
Deprecated. We now do together ai calls via the openai client.
Reference: https://docs.together.ai/docs/openai-api-compatibility
"""
import os, types
import json
from enum import Enum

View file

@ -3,7 +3,7 @@ import json
from enum import Enum
import requests
import time
from typing import Callable, Optional
from typing import Callable, Optional, Union
from litellm.utils import ModelResponse, Usage, CustomStreamWrapper
import litellm, uuid
import httpx
@ -75,6 +75,41 @@ class VertexAIConfig:
}
import asyncio
class TextStreamer:
"""
Fake streaming iterator for Vertex AI Model Garden calls
"""
def __init__(self, text):
self.text = text.split() # let's assume words as a streaming unit
self.index = 0
def __iter__(self):
return self
def __next__(self):
if self.index < len(self.text):
result = self.text[self.index]
self.index += 1
return result
else:
raise StopIteration
def __aiter__(self):
return self
async def __anext__(self):
if self.index < len(self.text):
result = self.text[self.index]
self.index += 1
return result
else:
raise StopAsyncIteration # once we run out of data to stream, we raise this error
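Editor's illustrative sketch (not part of this diff): TextStreamer splits a completed Model Garden response into whitespace-separated chunks so it can be consumed either synchronously or asynchronously as a fake stream.
streamer = TextStreamer("hello from the model garden")
print(list(streamer))  # ['hello', 'from', 'the', 'model', 'garden']

import asyncio

async def consume():
    return [chunk async for chunk in TextStreamer("hello again")]

print(asyncio.run(consume()))  # ['hello', 'again']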
def _get_image_bytes_from_url(image_url: str) -> bytes:
try:
response = requests.get(image_url)
@ -236,9 +271,17 @@ def completion(
Part,
GenerationConfig,
)
from google.cloud import aiplatform
from google.protobuf import json_format # type: ignore
from google.protobuf.struct_pb2 import Value # type: ignore
from google.cloud.aiplatform_v1beta1.types import content as gapic_content_types
import google.auth
vertexai.init(project=vertex_project, location=vertex_location)
## Load credentials with the correct quota project ref: https://github.com/googleapis/python-aiplatform/issues/2557#issuecomment-1709284744
creds, _ = google.auth.default(quota_project_id=vertex_project)
vertexai.init(
project=vertex_project, location=vertex_location, credentials=creds
)
## Load Config
config = litellm.VertexAIConfig.get_config()
@ -272,6 +315,11 @@ def completion(
request_str = ""
response_obj = None
async_client = None
instances = None
client_options = {
"api_endpoint": f"{vertex_location}-aiplatform.googleapis.com"
}
if (
model in litellm.vertex_language_models
or model in litellm.vertex_vision_models
@ -291,39 +339,51 @@ def completion(
llm_model = CodeGenerationModel.from_pretrained(model)
mode = "text"
request_str += f"llm_model = CodeGenerationModel.from_pretrained({model})\n"
else: # vertex_code_llm_models
elif model in litellm.vertex_code_chat_models: # vertex_code_llm_models
llm_model = CodeChatModel.from_pretrained(model)
mode = "chat"
request_str += f"llm_model = CodeChatModel.from_pretrained({model})\n"
else: # assume vertex model garden
client = aiplatform.gapic.PredictionServiceClient(
client_options=client_options
)
if acompletion == True: # [TODO] expand support to vertex ai chat + text models
instances = [optional_params]
instances[0]["prompt"] = prompt
instances = [
json_format.ParseDict(instance_dict, Value())
for instance_dict in instances
]
llm_model = client.endpoint_path(
project=vertex_project, location=vertex_location, endpoint=model
)
mode = "custom"
request_str += f"llm_model = client.endpoint_path(project={vertex_project}, location={vertex_location}, endpoint={model})\n"
if acompletion == True:
data = {
"llm_model": llm_model,
"mode": mode,
"prompt": prompt,
"logging_obj": logging_obj,
"request_str": request_str,
"model": model,
"model_response": model_response,
"encoding": encoding,
"messages": messages,
"print_verbose": print_verbose,
"client_options": client_options,
"instances": instances,
"vertex_location": vertex_location,
"vertex_project": vertex_project,
**optional_params,
}
if optional_params.get("stream", False) is True:
# async streaming
return async_streaming(
llm_model=llm_model,
mode=mode,
prompt=prompt,
logging_obj=logging_obj,
request_str=request_str,
model=model,
model_response=model_response,
messages=messages,
print_verbose=print_verbose,
**optional_params,
)
return async_completion(
llm_model=llm_model,
mode=mode,
prompt=prompt,
logging_obj=logging_obj,
request_str=request_str,
model=model,
model_response=model_response,
encoding=encoding,
messages=messages,
print_verbose=print_verbose,
**optional_params,
)
return async_streaming(**data)
return async_completion(**data)
if mode == "vision":
print_verbose("\nMaking VertexAI Gemini Pro Vision Call")
@ -468,7 +528,36 @@ def completion(
},
)
completion_response = llm_model.predict(prompt, **optional_params).text
elif mode == "custom":
"""
Vertex AI Model Garden
"""
request_str += (
f"client.predict(endpoint={llm_model}, instances={instances})\n"
)
## LOGGING
logging_obj.pre_call(
input=prompt,
api_key=None,
additional_args={
"complete_input_dict": optional_params,
"request_str": request_str,
},
)
response = client.predict(
endpoint=llm_model,
instances=instances,
).predictions
completion_response = response[0]
if (
isinstance(completion_response, str)
and "\nOutput:\n" in completion_response
):
completion_response = completion_response.split("\nOutput:\n", 1)[1]
if "stream" in optional_params and optional_params["stream"] == True:
response = TextStreamer(completion_response)
return response
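Editor's illustrative sketch (not part of this diff): with the Model Garden branch above, the model string is treated as an endpoint ID, and project/location can be supplied per call via the vertex_ai_project / vertex_ai_location handling added in main.py later in this diff. All identifiers below are placeholders.
import litellm

response = litellm.completion(
    model="vertex_ai/1234567890123456789",   # placeholder Model Garden endpoint ID
    messages=[{"role": "user", "content": "Hello"}],
    vertex_ai_project="my-gcp-project",
    vertex_ai_location="us-central1",
)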
## LOGGING
logging_obj.post_call(
input=prompt, api_key=None, original_response=completion_response
@ -536,6 +625,10 @@ async def async_completion(
encoding=None,
messages=None,
print_verbose=None,
client_options=None,
instances=None,
vertex_project=None,
vertex_location=None,
**optional_params,
):
"""
@ -624,7 +717,43 @@ async def async_completion(
)
response_obj = await llm_model.predict_async(prompt, **optional_params)
completion_response = response_obj.text
elif mode == "custom":
"""
Vertex AI Model Garden
"""
from google.cloud import aiplatform
async_client = aiplatform.gapic.PredictionServiceAsyncClient(
client_options=client_options
)
llm_model = async_client.endpoint_path(
project=vertex_project, location=vertex_location, endpoint=model
)
request_str += (
f"client.predict(endpoint={llm_model}, instances={instances})\n"
)
## LOGGING
logging_obj.pre_call(
input=prompt,
api_key=None,
additional_args={
"complete_input_dict": optional_params,
"request_str": request_str,
},
)
response_obj = await async_client.predict(
endpoint=llm_model,
instances=instances,
)
response = response_obj.predictions
completion_response = response[0]
if (
isinstance(completion_response, str)
and "\nOutput:\n" in completion_response
):
completion_response = completion_response.split("\nOutput:\n", 1)[1]
## LOGGING
logging_obj.post_call(
input=prompt, api_key=None, original_response=completion_response
@ -654,14 +783,12 @@ async def async_completion(
# init prompt tokens
# this block attempts to get usage from response_obj if it exists, if not it uses the litellm token counter
prompt_tokens, completion_tokens, total_tokens = 0, 0, 0
if response_obj is not None:
if hasattr(response_obj, "usage_metadata") and hasattr(
response_obj.usage_metadata, "prompt_token_count"
):
prompt_tokens = response_obj.usage_metadata.prompt_token_count
completion_tokens = (
response_obj.usage_metadata.candidates_token_count
)
if response_obj is not None and (
hasattr(response_obj, "usage_metadata")
and hasattr(response_obj.usage_metadata, "prompt_token_count")
):
prompt_tokens = response_obj.usage_metadata.prompt_token_count
completion_tokens = response_obj.usage_metadata.candidates_token_count
else:
prompt_tokens = len(encoding.encode(prompt))
completion_tokens = len(
@ -690,8 +817,13 @@ async def async_streaming(
model_response: ModelResponse,
logging_obj=None,
request_str=None,
encoding=None,
messages=None,
print_verbose=None,
client_options=None,
instances=None,
vertex_project=None,
vertex_location=None,
**optional_params,
):
"""
@ -760,17 +892,198 @@ async def async_streaming(
},
)
response = llm_model.predict_streaming_async(prompt, **optional_params)
elif mode == "custom":
from google.cloud import aiplatform
async_client = aiplatform.gapic.PredictionServiceAsyncClient(
client_options=client_options
)
llm_model = async_client.endpoint_path(
project=vertex_project, location=vertex_location, endpoint=model
)
request_str += f"client.predict(endpoint={llm_model}, instances={instances})\n"
## LOGGING
logging_obj.pre_call(
input=prompt,
api_key=None,
additional_args={
"complete_input_dict": optional_params,
"request_str": request_str,
},
)
response_obj = await async_client.predict(
endpoint=llm_model,
instances=instances,
)
response = response_obj.predictions
completion_response = response[0]
if (
isinstance(completion_response, str)
and "\nOutput:\n" in completion_response
):
completion_response = completion_response.split("\nOutput:\n", 1)[1]
if "stream" in optional_params and optional_params["stream"] == True:
response = TextStreamer(completion_response)
streamwrapper = CustomStreamWrapper(
completion_stream=response,
model=model,
custom_llm_provider="vertex_ai",
logging_obj=logging_obj,
)
async for transformed_chunk in streamwrapper:
yield transformed_chunk
return streamwrapper
def embedding():
def embedding(
model: str,
input: Union[list, str],
api_key: Optional[str] = None,
logging_obj=None,
model_response=None,
optional_params=None,
encoding=None,
vertex_project=None,
vertex_location=None,
aembedding=False,
):
# logic for parsing in - calling - parsing out model embedding calls
pass
try:
import vertexai
except:
raise VertexAIError(
status_code=400,
message="vertexai import failed please run `pip install google-cloud-aiplatform`",
)
from vertexai.language_models import TextEmbeddingModel
import google.auth
## Load credentials with the correct quota project ref: https://github.com/googleapis/python-aiplatform/issues/2557#issuecomment-1709284744
try:
creds, _ = google.auth.default(quota_project_id=vertex_project)
vertexai.init(
project=vertex_project, location=vertex_location, credentials=creds
)
except Exception as e:
raise VertexAIError(status_code=401, message=str(e))
if isinstance(input, str):
input = [input]
try:
llm_model = TextEmbeddingModel.from_pretrained(model)
except Exception as e:
raise VertexAIError(status_code=422, message=str(e))
if aembedding == True:
return async_embedding(
model=model,
client=llm_model,
input=input,
logging_obj=logging_obj,
model_response=model_response,
optional_params=optional_params,
encoding=encoding,
)
request_str = f"""embeddings = llm_model.get_embeddings({input})"""
## LOGGING PRE-CALL
logging_obj.pre_call(
input=input,
api_key=None,
additional_args={
"complete_input_dict": optional_params,
"request_str": request_str,
},
)
try:
embeddings = llm_model.get_embeddings(input)
except Exception as e:
raise VertexAIError(status_code=500, message=str(e))
## LOGGING POST-CALL
logging_obj.post_call(input=input, api_key=None, original_response=embeddings)
## Populate OpenAI compliant dictionary
embedding_response = []
for idx, embedding in enumerate(embeddings):
embedding_response.append(
{
"object": "embedding",
"index": idx,
"embedding": embedding.values,
}
)
model_response["object"] = "list"
model_response["data"] = embedding_response
model_response["model"] = model
input_tokens = 0
input_str = "".join(input)
input_tokens += len(encoding.encode(input_str))
usage = Usage(
prompt_tokens=input_tokens, completion_tokens=0, total_tokens=input_tokens
)
model_response.usage = usage
return model_response
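Editor's illustrative sketch (not part of this diff): calling the new Vertex AI embedding path through litellm; the model name, project, and location are placeholders.
import litellm

response = litellm.embedding(
    model="vertex_ai/textembedding-gecko",   # placeholder Vertex text embedding model
    input=["hello world"],
    vertex_ai_project="my-gcp-project",
    vertex_ai_location="us-central1",
)
print(response.usage)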
async def async_embedding(
model: str,
input: Union[list, str],
logging_obj=None,
model_response=None,
optional_params=None,
encoding=None,
client=None,
):
"""
Async embedding implementation
"""
request_str = f"""embeddings = llm_model.get_embeddings({input})"""
## LOGGING PRE-CALL
logging_obj.pre_call(
input=input,
api_key=None,
additional_args={
"complete_input_dict": optional_params,
"request_str": request_str,
},
)
try:
embeddings = await client.get_embeddings_async(input)
except Exception as e:
raise VertexAIError(status_code=500, message=str(e))
## LOGGING POST-CALL
logging_obj.post_call(input=input, api_key=None, original_response=embeddings)
## Populate OpenAI compliant dictionary
embedding_response = []
for idx, embedding in enumerate(embeddings):
embedding_response.append(
{
"object": "embedding",
"index": idx,
"embedding": embedding.values,
}
)
model_response["object"] = "list"
model_response["data"] = embedding_response
model_response["model"] = model
input_tokens = 0
input_str = "".join(input)
input_tokens += len(encoding.encode(input_str))
usage = Usage(
prompt_tokens=input_tokens, completion_tokens=0, total_tokens=input_tokens
)
model_response.usage = usage
return model_response

View file

@ -15,7 +15,7 @@ import dotenv, traceback, random, asyncio, time, contextvars
from copy import deepcopy
import httpx
import litellm
from ._logging import verbose_logger
from litellm import ( # type: ignore
client,
exception_type,
@ -31,6 +31,7 @@ from litellm.utils import (
get_llm_provider,
get_api_key,
mock_completion_streaming_obj,
async_mock_completion_streaming_obj,
convert_to_model_response_object,
token_counter,
Usage,
@ -235,6 +236,9 @@ async def acompletion(
"model_list": model_list,
"acompletion": True, # assuming this is a required parameter
}
_, custom_llm_provider, _, _ = get_llm_provider(
model=model, api_base=completion_kwargs.get("base_url", None)
)
try:
# Use a partial function to pass your keyword arguments
func = partial(completion, **completion_kwargs, **kwargs)
@ -246,7 +250,6 @@ async def acompletion(
_, custom_llm_provider, _, _ = get_llm_provider(
model=model, api_base=kwargs.get("api_base", None)
)
if (
custom_llm_provider == "openai"
or custom_llm_provider == "azure"
@ -261,6 +264,7 @@ async def acompletion(
or custom_llm_provider == "ollama"
or custom_llm_provider == "ollama_chat"
or custom_llm_provider == "vertex_ai"
or custom_llm_provider in litellm.openai_compatible_providers
): # currently implemented aiohttp calls for just azure, openai, hf, ollama, vertex ai soon all.
init_response = await loop.run_in_executor(None, func_with_context)
if isinstance(init_response, dict) or isinstance(
@ -274,14 +278,10 @@ async def acompletion(
else:
# Call the synchronous function using run_in_executor
response = await loop.run_in_executor(None, func_with_context) # type: ignore
# if kwargs.get("stream", False): # return an async generator
# return _async_streaming(
# response=response,
# model=model,
# custom_llm_provider=custom_llm_provider,
# args=args,
# )
# else:
if isinstance(response, CustomStreamWrapper):
response.set_logging_event_loop(
loop=loop
) # sets the logging event loop if the user does sync streaming (e.g. on proxy for sagemaker calls)
return response
except Exception as e:
custom_llm_provider = custom_llm_provider or "openai"
@ -308,6 +308,7 @@ def mock_completion(
messages: List,
stream: Optional[bool] = False,
mock_response: str = "This is a mock request",
logging=None,
**kwargs,
):
"""
@ -336,6 +337,15 @@ def mock_completion(
model_response = ModelResponse(stream=stream)
if stream is True:
# don't try to access stream object,
if kwargs.get("acompletion", False) == True:
return CustomStreamWrapper(
completion_stream=async_mock_completion_streaming_obj(
model_response, mock_response=mock_response, model=model
),
model=model,
custom_llm_provider="openai",
logging_obj=logging,
)
response = mock_completion_streaming_obj(
model_response, mock_response=mock_response, model=model
)
@ -455,6 +465,7 @@ def completion(
num_retries = kwargs.get("num_retries", None) ## deprecated
max_retries = kwargs.get("max_retries", None)
context_window_fallback_dict = kwargs.get("context_window_fallback_dict", None)
organization = kwargs.get("organization", None)
### CUSTOM MODEL COST ###
input_cost_per_token = kwargs.get("input_cost_per_token", None)
output_cost_per_token = kwargs.get("output_cost_per_token", None)
@ -590,28 +601,43 @@ def completion(
)
if model_response is not None and hasattr(model_response, "_hidden_params"):
model_response._hidden_params["custom_llm_provider"] = custom_llm_provider
model_response._hidden_params["region_name"] = kwargs.get(
"aws_region_name", None
) # support region-based pricing for bedrock
### REGISTER CUSTOM MODEL PRICING -- IF GIVEN ###
if input_cost_per_token is not None and output_cost_per_token is not None:
print_verbose(f"Registering model={model} in model cost map")
litellm.register_model(
{
f"{custom_llm_provider}/{model}": {
"input_cost_per_token": input_cost_per_token,
"output_cost_per_token": output_cost_per_token,
"litellm_provider": custom_llm_provider,
},
model: {
"input_cost_per_token": input_cost_per_token,
"output_cost_per_token": output_cost_per_token,
"litellm_provider": custom_llm_provider,
}
},
}
)
if (
elif (
input_cost_per_second is not None
): # time based pricing just needs cost in place
output_cost_per_second = output_cost_per_second or 0.0
litellm.register_model(
{
f"{custom_llm_provider}/{model}": {
"input_cost_per_second": input_cost_per_second,
"output_cost_per_second": output_cost_per_second,
"litellm_provider": custom_llm_provider,
},
model: {
"input_cost_per_second": input_cost_per_second,
"output_cost_per_second": output_cost_per_second,
"litellm_provider": custom_llm_provider,
}
},
}
)
### BUILD CUSTOM PROMPT TEMPLATE -- IF GIVEN ###
@ -702,7 +728,12 @@ def completion(
)
if mock_response:
return mock_completion(
model, messages, stream=stream, mock_response=mock_response
model,
messages,
stream=stream,
mock_response=mock_response,
logging=logging,
acompletion=acompletion,
)
if custom_llm_provider == "azure":
# azure configs
@ -777,6 +808,7 @@ def completion(
or custom_llm_provider == "anyscale"
or custom_llm_provider == "mistral"
or custom_llm_provider == "openai"
or custom_llm_provider == "together_ai"
or "ft:gpt-3.5-turbo" in model # finetune gpt-3.5-turbo
): # allow user to make an openai call with a custom base
# note: if a user sets a custom base - we should ensure this works
@ -788,7 +820,8 @@ def completion(
or "https://api.openai.com/v1"
)
openai.organization = (
litellm.organization
organization
or litellm.organization
or get_secret("OPENAI_ORGANIZATION")
or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
)
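Editor's illustrative sketch (not part of this diff): the organization kwarg introduced here takes precedence over litellm.organization and the OPENAI_ORGANIZATION secret; the org ID below is a placeholder.
import litellm

response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hello"}],
    organization="org-XXXXXXXXXXXX",   # placeholder OpenAI organization ID
)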
@ -828,6 +861,7 @@ def completion(
timeout=timeout,
custom_prompt_dict=custom_prompt_dict,
client=client, # pass AsyncOpenAI, OpenAI client
organization=organization,
)
except Exception as e:
## LOGGING - log the original exception returned
@ -1314,6 +1348,9 @@ def completion(
or ("togethercomputer" in model)
or (model in litellm.together_ai_models)
):
"""
Deprecated. We now do together ai calls via the openai client - https://docs.together.ai/docs/openai-api-compatibility
"""
custom_llm_provider = "together_ai"
together_ai_key = (
api_key
@ -1421,9 +1458,15 @@ def completion(
return response
response = model_response
elif custom_llm_provider == "vertex_ai":
vertex_ai_project = litellm.vertex_project or get_secret("VERTEXAI_PROJECT")
vertex_ai_location = litellm.vertex_location or get_secret(
"VERTEXAI_LOCATION"
vertex_ai_project = (
optional_params.pop("vertex_ai_project", None)
or litellm.vertex_project
or get_secret("VERTEXAI_PROJECT")
)
vertex_ai_location = (
optional_params.pop("vertex_ai_location", None)
or litellm.vertex_location
or get_secret("VERTEXAI_LOCATION")
)
model_response = vertex_ai.completion(
@ -1514,11 +1557,6 @@ def completion(
if (
"stream" in optional_params and optional_params["stream"] == True
): ## [BETA]
# sagemaker does not support streaming as of now so we're faking streaming:
# https://discuss.huggingface.co/t/streaming-output-text-when-deploying-on-sagemaker/39611
# "SageMaker is currently not supporting streaming responses."
# fake streaming for sagemaker
print_verbose(f"ENTERS SAGEMAKER CUSTOMSTREAMWRAPPER")
from .llms.sagemaker import TokenIterator
@ -1529,6 +1567,12 @@ def completion(
custom_llm_provider="sagemaker",
logging_obj=logging,
)
## LOGGING
logging.post_call(
input=messages,
api_key=None,
original_response=response,
)
return response
## RESPONSE OBJECT
@ -1547,6 +1591,7 @@ def completion(
logger_fn=logger_fn,
encoding=encoding,
logging_obj=logging,
timeout=timeout,
)
if "stream" in optional_params and optional_params["stream"] == True:
@ -2191,6 +2236,7 @@ async def aembedding(*args, **kwargs):
or custom_llm_provider == "deepinfra"
or custom_llm_provider == "perplexity"
or custom_llm_provider == "ollama"
or custom_llm_provider == "vertex_ai"
): # currently implemented aiohttp calls for just azure and openai, soon all.
# Await normally
init_response = await loop.run_in_executor(None, func_with_context)
@ -2221,6 +2267,7 @@ def embedding(
model,
input=[],
# Optional params
dimensions: Optional[int] = None,
timeout=600, # default to 10 minutes
# set api_base, api_version, api_key
api_base: Optional[str] = None,
@ -2241,6 +2288,7 @@ def embedding(
Parameters:
- model: The embedding model to use.
- input: The input for which embeddings are to be generated.
- dimensions: The number of dimensions the resulting output embeddings should have. Only supported in text-embedding-3 and later models.
- timeout: The timeout value for the API call, default 10 mins
- litellm_call_id: The call ID for litellm logging.
- litellm_logging_obj: The litellm logging object.
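Editor's illustrative sketch (not part of this diff): the new dimensions parameter is only honored by text-embedding-3 and later models, per the docstring above; the model name is an assumption.
import litellm

response = litellm.embedding(
    model="text-embedding-3-small",   # assumed text-embedding-3 family model
    input=["hello world"],
    dimensions=256,                   # requested output dimensionality
)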
@ -2274,6 +2322,7 @@ def embedding(
output_cost_per_second = kwargs.get("output_cost_per_second", None)
openai_params = [
"user",
"dimensions",
"request_timeout",
"api_base",
"api_version",
@ -2342,7 +2391,9 @@ def embedding(
api_key=api_key,
)
optional_params = get_optional_params_embeddings(
model=model,
user=user,
dimensions=dimensions,
encoding_format=encoding_format,
custom_llm_provider=custom_llm_provider,
**non_default_params,
@ -2461,7 +2512,7 @@ def embedding(
client=client,
aembedding=aembedding,
)
elif model in litellm.cohere_embedding_models:
elif custom_llm_provider == "cohere":
cohere_key = (
api_key
or litellm.cohere_key
@ -2503,6 +2554,29 @@ def embedding(
optional_params=optional_params,
model_response=EmbeddingResponse(),
)
elif custom_llm_provider == "vertex_ai":
vertex_ai_project = (
optional_params.pop("vertex_ai_project", None)
or litellm.vertex_project
or get_secret("VERTEXAI_PROJECT")
)
vertex_ai_location = (
optional_params.pop("vertex_ai_location", None)
or litellm.vertex_location
or get_secret("VERTEXAI_LOCATION")
)
response = vertex_ai.embedding(
model=model,
input=input,
encoding=encoding,
logging_obj=logging,
optional_params=optional_params,
model_response=EmbeddingResponse(),
vertex_project=vertex_ai_project,
vertex_location=vertex_ai_location,
aembedding=aembedding,
)
elif custom_llm_provider == "oobabooga":
response = oobabooga.embedding(
model=model,
@ -3064,7 +3138,7 @@ def image_generation(
custom_llm_provider=custom_llm_provider,
**non_default_params,
)
logging = litellm_logging_obj
logging: Logging = litellm_logging_obj
logging.update_environment_variables(
model=model,
user=user,
@ -3128,7 +3202,18 @@ def image_generation(
model_response=model_response,
aimg_generation=aimg_generation,
)
elif custom_llm_provider == "bedrock":
if model is None:
raise Exception("Model needs to be set for bedrock")
model_response = bedrock.image_generation(
model=model,
prompt=prompt,
timeout=timeout,
logging_obj=litellm_logging_obj,
optional_params=optional_params,
model_response=model_response,
aimg_generation=aimg_generation,
)
return model_response
except Exception as e:
## Map to OpenAI Exception
@ -3164,6 +3249,9 @@ async def ahealth_check(
if model is None:
raise Exception("model not set")
if model in litellm.model_cost and mode is None:
mode = litellm.model_cost[model]["mode"]
model, custom_llm_provider, _, _ = get_llm_provider(model=model)
mode = mode or "chat" # default to chat completion calls
@ -3210,6 +3298,7 @@ async def ahealth_check(
or custom_llm_provider == "text-completion-openai"
):
api_key = model_params.get("api_key") or get_secret("OPENAI_API_KEY")
organization = model_params.get("organization")
timeout = (
model_params.get("timeout")
@ -3227,8 +3316,12 @@ async def ahealth_check(
mode=mode,
prompt=prompt,
input=input,
organization=organization,
)
else:
model_params["cache"] = {
"no-cache": True
} # don't used cached responses for making health check calls
if mode == "embedding":
model_params.pop("messages", None)
model_params["input"] = input
@ -3244,6 +3337,10 @@ async def ahealth_check(
response = {} # args like remaining ratelimit etc.
return response
except Exception as e:
if model not in litellm.model_cost and mode is None:
raise Exception(
"Missing `mode`. Set the `mode` for the model - https://docs.litellm.ai/docs/proxy/health#embedding-models"
)
return {"error": str(e)}
@ -3251,6 +3348,7 @@ async def ahealth_check(
## Set verbose to true -> ```litellm.set_verbose = True```
def print_verbose(print_statement):
try:
verbose_logger.debug(print_statement)
if litellm.set_verbose:
print(print_statement) # noqa
except:
@ -3342,6 +3440,16 @@ def stream_chunk_builder(
chunks: list, messages: Optional[list] = None, start_time=None, end_time=None
):
model_response = litellm.ModelResponse()
### SORT CHUNKS BASED ON CREATED ORDER ##
print_verbose("Goes into checking if chunk has hiddden created at param")
if chunks[0]._hidden_params.get("created_at", None):
print_verbose("Chunks have a created at hidden param")
# Sort chunks based on created_at in ascending order
chunks = sorted(
chunks, key=lambda x: x._hidden_params.get("created_at", float("inf"))
)
print_verbose("Chunks sorted")
# set hidden params from chunk to model_response
if model_response is not None and hasattr(model_response, "_hidden_params"):
model_response._hidden_params = chunks[0].get("_hidden_params", {})

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1 @@
self.__BUILD_MANIFEST={__rewrites:{afterFiles:[],beforeFiles:[],fallback:[]},"/_error":["static/chunks/pages/_error-d6107f1aac0c574c.js"],sortedPages:["/_app","/_error"]},self.__BUILD_MANIFEST_CB&&self.__BUILD_MANIFEST_CB();

View file

@ -0,0 +1 @@
self.__SSG_MANIFEST=new Set([]);self.__SSG_MANIFEST_CB&&self.__SSG_MANIFEST_CB()

Some files were not shown because too many files have changed in this diff