Merge branch 'main' into litellm_dynamo_use_arn

@@ -147,12 +147,18 @@ jobs:
            -e AZURE_API_KEY=$AZURE_API_KEY \
            -e AZURE_FRANCE_API_KEY=$AZURE_FRANCE_API_KEY \
            -e AZURE_EUROPE_API_KEY=$AZURE_EUROPE_API_KEY \
            -e AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
            -e AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
            -e AWS_REGION_NAME=$AWS_REGION_NAME \
            -e OPENAI_API_KEY=$OPENAI_API_KEY \
            --name my-app \
            -v $(pwd)/proxy_server_config.yaml:/app/config.yaml \
            my-app:latest \
            --config /app/config.yaml \
            --port 4000 \
            --num_workers 8
            --num_workers 8 \
            --detailed_debug \
            --run_gunicorn \
      - run:
          name: Install curl and dockerize
          command: |
@@ -10,4 +10,5 @@ anthropic
boto3
orjson
pydantic
google-cloud-aiplatform
redisvl==0.0.7 # semantic caching
.github/actions/helm-oci-chart-releaser/action.yml (new file)

@@ -0,0 +1,77 @@
name: Helm OCI Chart Releaser
description: Push Helm charts to OCI-based (Docker) registries
author: sergeyshaykhullin
branding:
  color: yellow
  icon: upload-cloud
inputs:
  name:
    required: true
    description: Chart name
  repository:
    required: true
    description: Chart repository name
  tag:
    required: true
    description: Chart version
  app_version:
    required: true
    description: App version
  path:
    required: false
    description: Chart path (Default 'charts/{name}')
  registry:
    required: true
    description: OCI registry
  registry_username:
    required: true
    description: OCI registry username
  registry_password:
    required: true
    description: OCI registry password
  update_dependencies:
    required: false
    default: 'false'
    description: Update chart dependencies before packaging (Default 'false')
outputs:
  image:
    value: ${{ steps.output.outputs.image }}
    description: Chart image (Default '{registry}/{repository}/{image}:{tag}')
runs:
  using: composite
  steps:
    - name: Helm | Login
      shell: bash
      run: echo ${{ inputs.registry_password }} | helm registry login -u ${{ inputs.registry_username }} --password-stdin ${{ inputs.registry }}
      env:
        HELM_EXPERIMENTAL_OCI: '1'

    - name: Helm | Dependency
      if: inputs.update_dependencies == 'true'
      shell: bash
      run: helm dependency update ${{ inputs.path == null && format('{0}/{1}', 'charts', inputs.name) || inputs.path }}
      env:
        HELM_EXPERIMENTAL_OCI: '1'

    - name: Helm | Package
      shell: bash
      run: helm package ${{ inputs.path == null && format('{0}/{1}', 'charts', inputs.name) || inputs.path }} --version ${{ inputs.tag }} --app-version ${{ inputs.app_version }}
      env:
        HELM_EXPERIMENTAL_OCI: '1'

    - name: Helm | Push
      shell: bash
      run: helm push ${{ inputs.name }}-${{ inputs.tag }}.tgz oci://${{ inputs.registry }}/${{ inputs.repository }}
      env:
        HELM_EXPERIMENTAL_OCI: '1'

    - name: Helm | Logout
      shell: bash
      run: helm registry logout ${{ inputs.registry }}
      env:
        HELM_EXPERIMENTAL_OCI: '1'

    - name: Helm | Output
      id: output
      shell: bash
      run: echo "image=${{ inputs.registry }}/${{ inputs.repository }}/${{ inputs.name }}:${{ inputs.tag }}" >> $GITHUB_OUTPUT
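For orientation, here is a minimal sketch of the helm commands this composite action chains together, with illustrative values substituted for the `${{ inputs.* }}` expressions (the registry, chart name, and versions below are assumptions, not taken from a real release):

```bash
# login -> (optional dependency update) -> package -> push -> logout
echo "$REGISTRY_PASSWORD" | helm registry login -u "$REGISTRY_USERNAME" --password-stdin ghcr.io
helm dependency update charts/litellm-helm   # only runs when update_dependencies == 'true'
helm package charts/litellm-helm --version 0.1.0 --app-version v1.18.9
helm push litellm-helm-0.1.0.tgz oci://ghcr.io/berriai
helm registry logout ghcr.io
```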
template.yaml → .github/template.yaml (renamed, 0 changes)

.github/workflows/ghcr_deploy.yml
@@ -34,13 +34,6 @@ jobs:
        with:
          push: true
          tags: litellm/litellm:${{ github.event.inputs.tag || 'latest' }}
      -
        name: Build and push litellm-ui image
        uses: docker/build-push-action@v5
        with:
          push: true
          file: ui/Dockerfile
          tags: litellm/litellm-ui:${{ github.event.inputs.tag || 'latest' }}
      -
        name: Build and push litellm-database image
        uses: docker/build-push-action@v5

@@ -82,36 +75,8 @@ jobs:
          push: true
          tags: ${{ steps.meta.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta.outputs.tags }}-latest # if a tag is provided, use that, otherwise use the release tag, and if neither is available, use 'latest'
          labels: ${{ steps.meta.outputs.labels }}
  build-and-push-image-ui:
    runs-on: ubuntu-latest
    permissions:
      contents: read
      packages: write
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Log in to the Container registry
        uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Extract metadata (tags, labels) for UI Dockerfile
        id: meta-ui
        uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7
        with:
          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}-ui

      - name: Build and push UI Docker image
        uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4
        with:
          context: ui/
          file: ui/Dockerfile
          push: true
          tags: ${{ steps.meta-ui.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta-ui.outputs.tags }}-latest
          labels: ${{ steps.meta-ui.outputs.labels }}
          platform: local, linux/amd64,linux/arm64,linux/arm64/v8

  build-and-push-image-database:
    runs-on: ubuntu-latest
    permissions:

@@ -176,3 +141,14 @@ jobs:
            } catch (error) {
              core.setFailed(error.message);
            }
      - name: Github Releases To Discord
        uses: SethCohen/github-releases-to-discord@v1.13.1
        with:
          webhook_url: ${{ secrets.WEBHOOK_URL }}
          color: "2105893"
          username: "Release Changelog"
          avatar_url: "https://cdn.discordapp.com/avatars/487431320314576937/bd64361e4ba6313d561d54e78c9e7171.png"
          content: "||@everyone||"
          footer_title: "Changelog"
          footer_icon_url: "https://cdn.discordapp.com/avatars/487431320314576937/bd64361e4ba6313d561d54e78c9e7171.png"
          footer_timestamp: true
.github/workflows/ghcr_helm_deploy.yml (new file)

@@ -0,0 +1,64 @@
# this workflow is triggered by an API call when there is a new PyPI release of LiteLLM
name: Build, Publish LiteLLM Helm Chart. New Release
on:
  workflow_dispatch:
    inputs:
      chartVersion:
        description: "Update the helm chart's version to this"

# Defines custom environment variables for the workflow. Used for the Container registry domain, and a name for the Docker image that this workflow builds.
env:
  REGISTRY: ghcr.io
  IMAGE_NAME: ${{ github.repository }}
  REPO_OWNER: ${{github.repository_owner}}

# There is a single job in this workflow. It's configured to run on the latest available version of Ubuntu.
jobs:
  build-and-push-helm-chart:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Log in to the Container registry
        uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: lowercase github.repository_owner
        run: |
          echo "REPO_OWNER=`echo ${{github.repository_owner}} | tr '[:upper:]' '[:lower:]'`" >>${GITHUB_ENV}

      - name: Get LiteLLM Latest Tag
        id: current_app_tag
        uses: WyriHaximus/github-action-get-previous-tag@v1.3.0

      - name: Get last published chart version
        id: current_version
        shell: bash
        run: helm show chart oci://${{ env.REGISTRY }}/${{ env.REPO_OWNER }}/litellm-helm | grep '^version:' | awk 'BEGIN{FS=":"}{print "current-version="$2}' | tr -d " " | tee -a $GITHUB_OUTPUT
        env:
          HELM_EXPERIMENTAL_OCI: '1'

      # Automatically update the helm chart version one "patch" level
      - name: Bump release version
        id: bump_version
        uses: christian-draeger/increment-semantic-version@1.1.0
        with:
          current-version: ${{ steps.current_version.outputs.current-version || '0.1.0' }}
          version-fragment: 'bug'

      - uses: ./.github/actions/helm-oci-chart-releaser
        with:
          name: litellm-helm
          repository: ${{ env.REPO_OWNER }}
          tag: ${{ github.event.inputs.chartVersion || steps.bump_version.outputs.next-version || '0.1.0' }}
          app_version: ${{ steps.current_app_tag.outputs.tag || 'latest' }}
          path: deploy/charts/litellm-helm
          registry: ${{ env.REGISTRY }}
          registry_username: ${{ github.actor }}
          registry_password: ${{ secrets.GITHUB_TOKEN }}
          update_dependencies: true
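Since this is a `workflow_dispatch` workflow, it can also be started by hand; a hedged sketch using the GitHub CLI (the `chartVersion` value here is illustrative):

```bash
# trigger the chart release manually; omitting -f chartVersion falls back to the auto-bumped patch version
gh workflow run ghcr_helm_deploy.yml -f chartVersion=0.1.1
```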
.gitignore

@@ -39,4 +39,8 @@ ui/litellm-dashboard/.next
ui/litellm-dashboard/node_modules
ui/litellm-dashboard/next-env.d.ts
ui/litellm-dashboard/package.json
ui/litellm-dashboard/package-lock.json
deploy/charts/litellm-helm/*.tgz
deploy/charts/litellm-helm/charts/*
deploy/charts/*.tgz
litellm/proxy/vertex_key.json
@@ -10,6 +10,12 @@ repos:
        exclude: ^litellm/tests/|^litellm/proxy/proxy_cli.py|^litellm/integrations/|^litellm/proxy/tests/
        additional_dependencies: [flake8-print]
        files: litellm/.*\.py
  - repo: local
    hooks:
      - id: check-files-match
        name: Check if files match
        entry: python3 ci_cd/check_files_match.py
        language: system
  - repo: local
    hooks:
      - id: mypy
@@ -32,6 +32,9 @@ RUN pip install dist/*.whl
# install dependencies as wheels
RUN pip wheel --no-cache-dir --wheel-dir=/wheels/ -r requirements.txt

# install semantic-cache [Experimental]- we need this here and not in requirements.txt because redisvl pins to pydantic 1.0
RUN pip install redisvl==0.0.7 --no-deps

# Runtime stage
FROM $LITELLM_RUNTIME_IMAGE as runtime

@@ -52,4 +55,4 @@ RUN chmod +x entrypoint.sh
EXPOSE 4000/tcp

ENTRYPOINT ["litellm"]
CMD ["--port", "4000", "--config", "./proxy_server_config.yaml", "--detailed_debug"]
CMD ["--port", "4000", "--config", "./proxy_server_config.yaml", "--detailed_debug", "--run_gunicorn"]
@@ -47,6 +47,9 @@ COPY --from=builder /wheels/ /wheels/
# Install the built wheel using pip; again using a wildcard if it's the only file
RUN pip install *.whl /wheels/* --no-index --find-links=/wheels/ && rm -f *.whl && rm -rf /wheels

# install semantic-cache [Experimental]- we need this here and not in requirements.txt because redisvl pins to pydantic 1.0
RUN pip install redisvl==0.0.7 --no-deps

# Generate prisma client
RUN prisma generate
RUN chmod +x entrypoint.sh

@@ -56,4 +59,4 @@ EXPOSE 4000/tcp
# # Set your entrypoint and command

ENTRYPOINT ["litellm"]
CMD ["--port", "4000"]
CMD ["--port", "4000", "--run_gunicorn"]
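Because `ENTRYPOINT` is fixed to `litellm`, the `CMD` arrays above are only default arguments and are replaced wholesale by anything passed after the image name. A small sketch (the image name and tag are assumptions, not confirmed by this diff):

```bash
# defaults from CMD apply: --port 4000 --run_gunicorn
docker run ghcr.io/berriai/litellm-database:main-latest

# overriding CMD entirely at run time
docker run -v $(pwd)/proxy_server_config.yaml:/app/config.yaml \
  ghcr.io/berriai/litellm-database:main-latest \
  --port 4000 --config /app/config.yaml --detailed_debug
```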
README.md
@@ -5,7 +5,7 @@
<p align="center">Call all LLM APIs using the OpenAI format [Bedrock, Huggingface, VertexAI, TogetherAI, Azure, OpenAI, etc.]
<br>
</p>
<h4 align="center"><a href="https://docs.litellm.ai/docs/simple_proxy" target="_blank">OpenAI Proxy Server</a></h4>
<h4 align="center"><a href="https://docs.litellm.ai/docs/simple_proxy" target="_blank">OpenAI Proxy Server</a> | <a href="https://docs.litellm.ai/docs/enterprise" target="_blank">Enterprise Support</a></h4>
<h4 align="center">
    <a href="https://pypi.org/project/litellm/" target="_blank">
        <img src="https://img.shields.io/pypi/v/litellm.svg" alt="PyPI Version">

@@ -28,10 +28,14 @@ LiteLLM manages:
- Translate inputs to provider's `completion`, `embedding`, and `image_generation` endpoints
- [Consistent output](https://docs.litellm.ai/docs/completion/output), text responses will always be available at `['choices'][0]['message']['content']`
- Retry/fallback logic across multiple deployments (e.g. Azure/OpenAI) - [Router](https://docs.litellm.ai/docs/routing)
- Track spend & set budgets per project [OpenAI Proxy Server](https://docs.litellm.ai/docs/simple_proxy)

[**Jump to OpenAI Proxy Docs**](https://github.com/BerriAI/litellm?tab=readme-ov-file#openai-proxy---docs) <br>
[**Jump to Supported LLM Providers**](https://github.com/BerriAI/litellm?tab=readme-ov-file#supported-provider-docs)

Support for more providers. Missing a provider or LLM Platform, raise a [feature request](https://github.com/BerriAI/litellm/issues/new?assignees=&labels=enhancement&projects=&template=feature_request.yml&title=%5BFeature%5D%3A+).

# Usage ([**Docs**](https://docs.litellm.ai/docs/))
> [!IMPORTANT]
> LiteLLM v1.0.0 now requires `openai>=1.0.0`. Migration guide [here](https://docs.litellm.ai/docs/migration)

@@ -155,6 +159,9 @@ print(response)
```

## Proxy Key Management ([Docs](https://docs.litellm.ai/docs/proxy/virtual_keys))
UI on `/ui` on your proxy server
![ui_3](https://github.com/BerriAI/litellm/assets/29436595/47c97d5e-b9be-4839-b28c-43d7f4f10033)

Track Spend, Set budgets and create virtual keys for the proxy
`POST /key/generate`

@@ -174,17 +181,6 @@ curl 'http://0.0.0.0:8000/key/generate' \
}
```

### [Beta] Proxy UI

A simple UI to add new models and let your users create keys.

Live here: https://dashboard.litellm.ai/

Code: https://github.com/BerriAI/litellm/tree/main/ui


<img width="1672" alt="Screenshot 2023-12-26 at 8 33 53 AM" src="https://github.com/BerriAI/litellm/assets/17561003/274254d8-c5fe-4645-9123-100045a7fb21">

## Supported Providers ([Docs](https://docs.litellm.ai/docs/providers))
| Provider | [Completion](https://docs.litellm.ai/docs/#basic-usage) | [Streaming](https://docs.litellm.ai/docs/completion/stream#streaming-responses) | [Async Completion](https://docs.litellm.ai/docs/completion/stream#async-completion) | [Async Streaming](https://docs.litellm.ai/docs/completion/stream#async-streaming) | [Async Embedding](https://docs.litellm.ai/docs/embedding/supported_embedding) | [Async Image Generation](https://docs.litellm.ai/docs/image_generation) |
| ------------- | ------------- | ------------- | ------------- | ------------- | ------------- | ------------- |
ci_cd/check_files_match.py (new file)

@@ -0,0 +1,32 @@
import sys
import filecmp
import shutil


def main(argv=None):
    print(
        "Comparing model_prices_and_context_window and litellm/model_prices_and_context_window_backup.json files... checking if they match."
    )

    file1 = "model_prices_and_context_window.json"
    file2 = "litellm/model_prices_and_context_window_backup.json"

    cmp_result = filecmp.cmp(file1, file2, shallow=False)

    if cmp_result:
        print(f"Passed! Files {file1} and {file2} match.")
        return 0
    else:
        print(
            f"Failed! Files {file1} and {file2} do not match. Copying content from {file1} to {file2}."
        )
        copy_content(file1, file2)
        return 1


def copy_content(source, destination):
    shutil.copy2(source, destination)


if __name__ == "__main__":
    sys.exit(main())
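This script is wired in as the `check-files-match` hook in the .pre-commit hunk above, and it can also be run directly; a quick sketch of both paths (assuming pre-commit is installed):

```bash
# run directly from the repo root
python3 ci_cd/check_files_match.py

# or through the hook definition
pre-commit run check-files-match --all-files
```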
@@ -0,0 +1,76 @@
import sys, os
import traceback
from dotenv import load_dotenv
import copy

load_dotenv()
sys.path.insert(
    0, os.path.abspath("../..")
)  # Adds the parent directory to the system path
import asyncio
from litellm import Router, Timeout
import time
from litellm.caching import Cache
import litellm
import openai

### Test just calling AsyncAzureOpenAI

openai_client = openai.AsyncAzureOpenAI(
    azure_endpoint=os.getenv("AZURE_API_BASE"),
    api_key=os.getenv("AZURE_API_KEY"),
)


async def call_acompletion(semaphore, input_data):
    async with semaphore:
        try:
            # Use asyncio.wait_for to set a timeout for the task
            response = await openai_client.chat.completions.create(**input_data)
            # Handle the response as needed
            print(response)
            return response
        except Timeout:
            print(f"Task timed out: {input_data}")
            return None  # You may choose to return something else or raise an exception


async def main():
    # (no Router here -- this test calls AsyncAzureOpenAI directly)

    # Create a semaphore with a capacity of 100
    semaphore = asyncio.Semaphore(100)

    # List to hold all task references
    tasks = []
    start_time_all_tasks = time.time()
    # Launch 500 tasks
    for _ in range(500):
        task = asyncio.create_task(
            call_acompletion(
                semaphore,
                {
                    "model": "chatgpt-v-2",
                    "messages": [{"role": "user", "content": "Hey, how's it going?"}],
                },
            )
        )
        tasks.append(task)

    # Wait for all tasks to complete
    responses = await asyncio.gather(*tasks)
    # Process responses as needed
    # Record the end time for all tasks
    end_time_all_tasks = time.time()
    # Calculate the total time for all tasks
    total_time_all_tasks = end_time_all_tasks - start_time_all_tasks
    print(f"Total time for all tasks: {total_time_all_tasks} seconds")

    # Calculate the average time per response
    average_time_per_response = total_time_all_tasks / len(responses)
    print(f"Average time per response: {average_time_per_response} seconds")
    print(f"NUMBER OF COMPLETED TASKS: {len(responses)}")


# Run the main function
asyncio.run(main())
cookbook/litellm_router_load_test/test_loadtest_router.py (new file)

@@ -0,0 +1,88 @@
import sys, os
import traceback
from dotenv import load_dotenv
import copy

load_dotenv()
sys.path.insert(
    0, os.path.abspath("../..")
)  # Adds the parent directory to the system path
import asyncio
from litellm import Router, Timeout
import time

### Test calling router async


async def call_acompletion(semaphore, router: Router, input_data):
    async with semaphore:
        try:
            # Use asyncio.wait_for to set a timeout for the task
            response = await router.acompletion(**input_data)
            # Handle the response as needed
            print(response)
            return response
        except Timeout:
            print(f"Task timed out: {input_data}")
            return None  # You may choose to return something else or raise an exception


async def main():
    # Initialize the Router
    model_list = [
        {
            "model_name": "gpt-3.5-turbo",
            "litellm_params": {
                "model": "gpt-3.5-turbo",
                "api_key": os.getenv("OPENAI_API_KEY"),
            },
        },
        {
            "model_name": "gpt-3.5-turbo",
            "litellm_params": {
                "model": "azure/chatgpt-v-2",
                "api_key": os.getenv("AZURE_API_KEY"),
                "api_base": os.getenv("AZURE_API_BASE"),
                "api_version": os.getenv("AZURE_API_VERSION"),
            },
        },
    ]
    router = Router(model_list=model_list, num_retries=3, timeout=10)

    # Create a semaphore with a capacity of 100
    semaphore = asyncio.Semaphore(100)

    # List to hold all task references
    tasks = []
    start_time_all_tasks = time.time()
    # Launch 500 tasks
    for _ in range(500):
        task = asyncio.create_task(
            call_acompletion(
                semaphore,
                router,
                {
                    "model": "gpt-3.5-turbo",
                    "messages": [{"role": "user", "content": "Hey, how's it going?"}],
                },
            )
        )
        tasks.append(task)

    # Wait for all tasks to complete
    responses = await asyncio.gather(*tasks)
    # Process responses as needed
    # Record the end time for all tasks
    end_time_all_tasks = time.time()
    # Calculate the total time for all tasks
    total_time_all_tasks = end_time_all_tasks - start_time_all_tasks
    print(f"Total time for all tasks: {total_time_all_tasks} seconds")

    # Calculate the average time per response
    average_time_per_response = total_time_all_tasks / len(responses)
    print(f"Average time per response: {average_time_per_response} seconds")
    print(f"NUMBER OF COMPLETED TASKS: {len(responses)}")


# Run the main function
asyncio.run(main())
@@ -0,0 +1,94 @@
import sys, os
import traceback
from dotenv import load_dotenv
import copy

load_dotenv()
sys.path.insert(
    0, os.path.abspath("../..")
)  # Adds the parent directory to the system path
import asyncio
from litellm import Router, Timeout
import time
from litellm.caching import Cache
import litellm

litellm.cache = Cache(
    type="s3", s3_bucket_name="cache-bucket-litellm", s3_region_name="us-west-2"
)

### Test calling router with s3 Cache


async def call_acompletion(semaphore, router: Router, input_data):
    async with semaphore:
        try:
            # Use asyncio.wait_for to set a timeout for the task
            response = await router.acompletion(**input_data)
            # Handle the response as needed
            print(response)
            return response
        except Timeout:
            print(f"Task timed out: {input_data}")
            return None  # You may choose to return something else or raise an exception


async def main():
    # Initialize the Router
    model_list = [
        {
            "model_name": "gpt-3.5-turbo",
            "litellm_params": {
                "model": "gpt-3.5-turbo",
                "api_key": os.getenv("OPENAI_API_KEY"),
            },
        },
        {
            "model_name": "gpt-3.5-turbo",
            "litellm_params": {
                "model": "azure/chatgpt-v-2",
                "api_key": os.getenv("AZURE_API_KEY"),
                "api_base": os.getenv("AZURE_API_BASE"),
                "api_version": os.getenv("AZURE_API_VERSION"),
            },
        },
    ]
    router = Router(model_list=model_list, num_retries=3, timeout=10)

    # Create a semaphore with a capacity of 100
    semaphore = asyncio.Semaphore(100)

    # List to hold all task references
    tasks = []
    start_time_all_tasks = time.time()
    # Launch 500 tasks
    for _ in range(500):
        task = asyncio.create_task(
            call_acompletion(
                semaphore,
                router,
                {
                    "model": "gpt-3.5-turbo",
                    "messages": [{"role": "user", "content": "Hey, how's it going?"}],
                },
            )
        )
        tasks.append(task)

    # Wait for all tasks to complete
    responses = await asyncio.gather(*tasks)
    # Process responses as needed
    # Record the end time for all tasks
    end_time_all_tasks = time.time()
    # Calculate the total time for all tasks
    total_time_all_tasks = end_time_all_tasks - start_time_all_tasks
    print(f"Total time for all tasks: {total_time_all_tasks} seconds")

    # Calculate the average time per response
    average_time_per_response = total_time_all_tasks / len(responses)
    print(f"Average time per response: {average_time_per_response} seconds")
    print(f"NUMBER OF COMPLETED TASKS: {len(responses)}")


# Run the main function
asyncio.run(main())
cookbook/misc/dev_release.txt (new file)

@@ -0,0 +1,2 @@
python3 -m build
twine upload --verbose dist/litellm-1.18.13.dev4.tar.gz -u __token__ -
deploy/charts/litellm-helm/.helmignore (new file)

@@ -0,0 +1,23 @@
# Patterns to ignore when building packages.
# This supports shell glob matching, relative path matching, and
# negation (prefixed with !). Only one pattern per line.
.DS_Store
# Common VCS dirs
.git/
.gitignore
.bzr/
.bzrignore
.hg/
.hgignore
.svn/
# Common backup files
*.swp
*.bak
*.tmp
*.orig
*~
# Various IDEs
.project
.idea/
*.tmproj
.vscode/
deploy/charts/litellm-helm/Chart.lock (new file)

@@ -0,0 +1,6 @@
dependencies:
- name: postgresql
  repository: oci://registry-1.docker.io/bitnamicharts
  version: 13.3.1
digest: sha256:f5c129150f0d38dd06752ab37f3c8e143d7c14d30379af058767bcd9f4ba83dd
generated: "2024-01-19T11:32:56.694808861+11:00"
deploy/charts/litellm-helm/Chart.yaml (new file)

@@ -0,0 +1,34 @@
apiVersion: v2

# We can't call ourselves just "litellm" because then we couldn't publish to the
# same OCI repository as the "litellm" OCI image
name: litellm-helm
description: Call all LLM APIs using the OpenAI format

# A chart can be either an 'application' or a 'library' chart.
#
# Application charts are a collection of templates that can be packaged into versioned archives
# to be deployed.
#
# Library charts provide useful utilities or functions for the chart developer. They're included as
# a dependency of application charts to inject those utilities and functions into the rendering
# pipeline. Library charts do not define any templates and therefore cannot be deployed.
type: application

# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.1.0

# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes.
appVersion: v1.18.9

dependencies:
  - name: "postgresql"
    version: ">=13.3.0"
    repository: oci://registry-1.docker.io/bitnamicharts
    condition: db.deployStandalone
deploy/charts/litellm-helm/README.md (new file)

@@ -0,0 +1,107 @@
# Helm Chart for LiteLLM

## Prerequisites

- Kubernetes 1.23+
- Helm 3.8.0+

If `db.deployStandalone` is used:
- PV provisioner support in the underlying infrastructure

If `db.useStackgresOperator` is used (not yet implemented):
- The Stackgres Operator must already be installed in the Kubernetes Cluster. This chart will **not** install the operator if it is missing.

## Parameters

### LiteLLM Proxy Deployment Settings

| Name | Description | Value |
| ---- | ----------- | ----- |
| `replicaCount` | The number of LiteLLM Proxy pods to be deployed | `1` |
| `masterkey` | The Master API Key for LiteLLM. If not specified, a random key is generated. | N/A |
| `environmentSecrets` | An optional array of Secret object names. The keys and values in these secrets will be presented to the LiteLLM proxy pod as environment variables. See below for an example Secret object. | `[]` |
| `image.repository` | LiteLLM Proxy image repository | `ghcr.io/berriai/litellm` |
| `image.pullPolicy` | LiteLLM Proxy image pull policy | `IfNotPresent` |
| `image.tag` | Overrides the image tag; the default is the latest version of LiteLLM at the time this chart was published. | `""` |
| `image.dbReadyImage` | On Pod startup, an initContainer is used to make sure the Postgres database is available before attempting to start LiteLLM. This field specifies the image to use as that initContainer. | `docker.io/bitnami/postgresql` |
| `image.dbReadyTag` | Tag for the above image. If not specified, "latest" is used. | `""` |
| `imagePullSecrets` | Registry credentials for the LiteLLM and initContainer images. | `[]` |
| `serviceAccount.create` | Whether or not to create a Kubernetes Service Account for this deployment. The default is `false` because LiteLLM has no need to access the Kubernetes API. | `false` |
| `service.type` | Kubernetes Service type (e.g. `LoadBalancer`, `ClusterIP`, etc.) | `ClusterIP` |
| `service.port` | TCP port that the Kubernetes Service will listen on. Also the TCP port within the Pod that the proxy will listen on. | `8000` |
| `ingress.*` | See [values.yaml](./values.yaml) for example settings | N/A |
| `proxy_config.*` | See [values.yaml](./values.yaml) for default settings. See [example_config_yaml](../../../litellm/proxy/example_config_yaml/) for configuration examples. | N/A |

#### Example `environmentSecrets` Secret
```yaml
apiVersion: v1
kind: Secret
metadata:
  name: litellm-envsecrets
data:
  AZURE_OPENAI_API_KEY: TXlTZWN1cmVLM3k=
type: Opaque
```

### LiteLLM Admin UI Settings

| Name | Description | Value |
| ---- | ----------- | ----- |
| `ui.enabled` | Should the LiteLLM Admin UI be deployed | `true` |
| `ui.replicaCount` | The number of LiteLLM Admin UI pods to be deployed | `1` |
| `ui.image.repository` | LiteLLM Admin UI image repository | `ghcr.io/berriai/litellm` |
| `ui.image.pullPolicy` | LiteLLM Admin UI image pull policy | `IfNotPresent` |
| `ui.image.tag` | Overrides the image tag; the default is the latest version of LiteLLM at the time this chart was published. | `""` |
| `ui.imagePullSecrets` | Registry credentials for the above images. | `[]` |
| `ui.service.type` | Kubernetes Service type (e.g. `LoadBalancer`, `ClusterIP`, etc.) | `ClusterIP` |
| `ui.service.port` | TCP port that the Kubernetes Service will listen on. Also the TCP port within the Pod that the web server will listen on. | `8000` |
| `ui.ingress.*` | See [values.yaml](./values.yaml) for example settings | N/A |

### Database Settings

| Name | Description | Value |
| ---- | ----------- | ----- |
| `db.useExisting` | Use an existing Postgres database. A Kubernetes Secret object must exist that contains credentials for connecting to the database. An example secret object definition is provided below. | `false` |
| `db.endpoint` | If `db.useExisting` is `true`, this is the IP, Hostname or Service Name of the Postgres server to connect to. | `localhost` |
| `db.database` | If `db.useExisting` is `true`, the name of the existing database to connect to. | `litellm` |
| `db.secret.name` | If `db.useExisting` is `true`, the name of the Kubernetes Secret that contains credentials. | `postgres` |
| `db.secret.usernameKey` | If `db.useExisting` is `true`, the name of the key within the Kubernetes Secret that holds the username for authenticating with the Postgres instance. | `username` |
| `db.secret.passwordKey` | If `db.useExisting` is `true`, the name of the key within the Kubernetes Secret that holds the password associated with the above user. | `password` |
| `db.useStackgresOperator` | Not yet implemented. | `false` |
| `db.deployStandalone` | Deploy a standalone, single instance deployment of Postgres, using the Bitnami postgresql chart. This is useful for getting started but doesn't provide HA or (by default) data backups. | `true` |
| `postgresql.*` | If `db.deployStandalone` is `true`, configuration passed to the Bitnami postgresql chart. See the [Bitnami Documentation](https://github.com/bitnami/charts/tree/main/bitnami/postgresql) for full configuration details. See [values.yaml](./values.yaml) for the default configuration. | See [values.yaml](./values.yaml) |
| `postgresql.auth.*` | If `db.deployStandalone` is `true`, care should be taken to ensure the default `password` and `postgres-password` values are **NOT** used. | `NoTaGrEaTpAsSwOrD` |

#### Example Postgres `db.useExisting` Secret
```yaml
apiVersion: v1
kind: Secret
metadata:
  name: postgres
data:
  # Password for the "postgres" user
  postgres-password: <some secure password, base64 encoded>
  username: litellm
  password: <some secure password, base64 encoded>
type: Opaque
```

## Accessing the Admin UI
When browsing to the URL published per the settings in `ui.ingress.*`, you will
be prompted for **Admin Configuration**. The **Proxy Endpoint** is the internal
(from the `litellm-ui` pod's perspective) URL published by the `litellm-proxy`
Kubernetes Service. If the deployment uses the default settings for this
service, the **Proxy Endpoint** should be set to `http://litellm-proxy:8000`.

The **Proxy Key** is the value specified for `masterkey` or, if no `masterkey`
was provided on the helm command line, a randomly generated string stored in
the `litellm-masterkey` Kubernetes Secret.

```bash
kubectl -n litellm get secret litellm-masterkey -o jsonpath="{.data.masterkey}"
```

## Admin UI Limitations
At the time of writing, the Admin UI is unable to add models. This is because
it would need to update the `config.yaml` file, which is an exposed ConfigMap
and therefore read-only. This is a limitation of this helm chart, not the Admin
UI itself.
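Putting the pieces together, a hedged sketch of installing the chart from the OCI registry that the `ghcr_helm_deploy.yml` workflow publishes to (the namespace, key, and registry path below are illustrative assumptions):

```bash
# install the published chart; masterkey is optional -- omit it to get a random key
helm install litellm oci://ghcr.io/berriai/litellm-helm \
  --namespace litellm --create-namespace \
  --set masterkey=sk-XXXXXXXX
```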
deploy/charts/litellm-helm/templates/NOTES.txt (new file)

@@ -0,0 +1,22 @@
1. Get the application URL by running these commands:
{{- if .Values.ingress.enabled }}
{{- range $host := .Values.ingress.hosts }}
  {{- range .paths }}
  http{{ if $.Values.ingress.tls }}s{{ end }}://{{ $host.host }}{{ .path }}
  {{- end }}
{{- end }}
{{- else if contains "NodePort" .Values.service.type }}
  export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ include "litellm.fullname" . }})
  export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}")
  echo http://$NODE_IP:$NODE_PORT
{{- else if contains "LoadBalancer" .Values.service.type }}
     NOTE: It may take a few minutes for the LoadBalancer IP to be available.
           You can watch its status by running 'kubectl get --namespace {{ .Release.Namespace }} svc -w {{ include "litellm.fullname" . }}'
  export SERVICE_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ include "litellm.fullname" . }} --template "{{"{{ range (index .status.loadBalancer.ingress 0) }}{{.}}{{ end }}"}}")
  echo http://$SERVICE_IP:{{ .Values.service.port }}
{{- else if contains "ClusterIP" .Values.service.type }}
  export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app.kubernetes.io/name={{ include "litellm.name" . }},app.kubernetes.io/instance={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}")
  export CONTAINER_PORT=$(kubectl get pod --namespace {{ .Release.Namespace }} $POD_NAME -o jsonpath="{.spec.containers[0].ports[0].containerPort}")
  echo "Visit http://127.0.0.1:8080 to use your application"
  kubectl --namespace {{ .Release.Namespace }} port-forward $POD_NAME 8080:$CONTAINER_PORT
{{- end }}
deploy/charts/litellm-helm/templates/_helpers.tpl (new file)

@@ -0,0 +1,74 @@
{{/*
Expand the name of the chart.
*/}}
{{- define "litellm.name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
{{- end }}

{{/*
Create a default fully qualified app name.
We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
If release name contains chart name it will be used as a full name.
*/}}
{{- define "litellm.fullname" -}}
{{- if .Values.fullnameOverride }}
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- $name := default .Chart.Name .Values.nameOverride }}
{{- if contains $name .Release.Name }}
{{- .Release.Name | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
{{- end }}
{{- end }}
{{- end }}

{{/*
Create chart name and version as used by the chart label.
*/}}
{{- define "litellm.chart" -}}
{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
{{- end }}

{{/*
Common labels
*/}}
{{- define "litellm.labels" -}}
helm.sh/chart: {{ include "litellm.chart" . }}
{{ include "litellm.selectorLabels" . }}
{{- if .Chart.AppVersion }}
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
{{- end }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
{{- end }}
{{- define "litellm.ui.labels" -}}
helm.sh/chart: {{ include "litellm.chart" . }}
{{ include "litellm.ui.selectorLabels" . }}
{{- if .Chart.AppVersion }}
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
{{- end }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
{{- end }}

{{/*
Selector labels
*/}}
{{- define "litellm.selectorLabels" -}}
app.kubernetes.io/name: {{ include "litellm.name" . }}
app.kubernetes.io/instance: {{ .Release.Name }}
{{- end }}
{{- define "litellm.ui.selectorLabels" -}}
app.kubernetes.io/name: {{ include "litellm.name" . }}-ui
app.kubernetes.io/instance: {{ .Release.Name }}
{{- end }}

{{/*
Create the name of the service account to use
*/}}
{{- define "litellm.serviceAccountName" -}}
{{- if .Values.serviceAccount.create }}
{{- default (include "litellm.fullname" .) .Values.serviceAccount.name }}
{{- else }}
{{- default "default" .Values.serviceAccount.name }}
{{- end }}
{{- end }}
@@ -0,0 +1,7 @@
apiVersion: v1
kind: ConfigMap
metadata:
  name: {{ include "litellm.fullname" . }}-config
data:
  config.yaml: |
{{ .Values.proxy_config | toYaml | indent 6 }}
deploy/charts/litellm-helm/templates/deployment-proxy.yaml (new file)

@@ -0,0 +1,230 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ include "litellm.fullname" . }}-proxy
  labels:
    {{- include "litellm.labels" . | nindent 4 }}
spec:
  {{- if not .Values.autoscaling.enabled }}
  replicas: {{ .Values.replicaCount }}
  {{- end }}
  selector:
    matchLabels:
      {{- include "litellm.selectorLabels" . | nindent 6 }}
  template:
    metadata:
      {{- with .Values.podAnnotations }}
      annotations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      labels:
        {{- include "litellm.labels" . | nindent 8 }}
        {{- with .Values.podLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
    spec:
      {{- with .Values.imagePullSecrets }}
      imagePullSecrets:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      serviceAccountName: {{ include "litellm.serviceAccountName" . }}
      securityContext:
        {{- toYaml .Values.podSecurityContext | nindent 8 }}
      initContainers:
        - name: db-ready
          securityContext:
            {{- toYaml .Values.securityContext | nindent 12 }}
          image: "docker.io/bitnami/postgresql:16.1.0-debian-11-r20"
          imagePullPolicy: {{ .Values.image.pullPolicy }}
          env:
            {{- if .Values.db.deployStandalone }}
            - name: DATABASE_USERNAME
              valueFrom:
                secretKeyRef:
                  name: {{ include "litellm.name" . }}-dbcredentials
                  key: username
            - name: PGPASSWORD
              valueFrom:
                secretKeyRef:
                  name: {{ include "litellm.name" . }}-dbcredentials
                  key: password
            - name: DATABASE_HOST
              value: {{ .Release.Name }}-postgresql
            - name: DATABASE_NAME
              value: litellm
            {{- else if .Values.db.useExisting }}
            - name: DATABASE_USERNAME
              valueFrom:
                secretKeyRef:
                  name: {{ .Values.db.secret.name }}
                  key: {{ .Values.db.secret.usernameKey }}
            - name: PGPASSWORD
              valueFrom:
                secretKeyRef:
                  name: {{ .Values.db.secret.name }}
                  key: {{ .Values.db.secret.passwordKey }}
            - name: DATABASE_HOST
              value: {{ .Values.db.endpoint }}
            - name: DATABASE_NAME
              value: litellm
            {{- end }}
          command:
            - sh
            - -c
            - |
              # Maximum wait time will be (limit * 2) seconds.
              limit=60
              current=0
              ret=1
              while [ $current -lt $limit ] && [ $ret -ne 0 ]; do
                echo "Waiting for database to be ready $current"
                psql -U $(DATABASE_USERNAME) -h $(DATABASE_HOST) -l
                ret=$?
                current=$(( $current + 1 ))
                sleep 2
              done
              if [ $ret -eq 0 ]; then
                echo "Database is ready"
              else
                echo "Database failed to become ready before we gave up waiting."
              fi
          {{ if .Values.securityContext.readOnlyRootFilesystem }}
          volumeMounts:
            - name: tmp
              mountPath: /tmp
          {{ end }}
      containers:
        - name: {{ include "litellm.name" . }}
          securityContext:
            {{- toYaml .Values.securityContext | nindent 12 }}
          image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default (printf "main-%s" .Chart.AppVersion) }}"
          imagePullPolicy: {{ .Values.image.pullPolicy }}
          env:
            - name: HOST
              value: "0.0.0.0"
            - name: PORT
              value: {{ .Values.service.port | quote}}
            {{- if .Values.db.deployStandalone }}
            - name: DATABASE_USERNAME
              valueFrom:
                secretKeyRef:
                  name: {{ include "litellm.name" . }}-dbcredentials
                  key: username
            - name: DATABASE_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: {{ include "litellm.name" . }}-dbcredentials
                  key: password
            - name: DATABASE_HOST
              value: {{ .Release.Name }}-postgresql
            - name: DATABASE_NAME
              value: litellm
            {{- else if .Values.db.useExisting }}
            - name: DATABASE_USERNAME
              valueFrom:
                secretKeyRef:
                  name: {{ .Values.db.secret.name }}
                  key: {{ .Values.db.secret.usernameKey }}
            - name: DATABASE_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: {{ .Values.db.secret.name }}
                  key: {{ .Values.db.secret.passwordKey }}
            - name: DATABASE_HOST
              value: {{ .Values.db.endpoint }}
            - name: DATABASE_NAME
              value: {{ .Values.db.database }}
            {{- end }}
            - name: DATABASE_URL
              value: "postgresql://$(DATABASE_USERNAME):$(DATABASE_PASSWORD)@$(DATABASE_HOST)/$(DATABASE_NAME)"
            - name: PROXY_MASTER_KEY
              valueFrom:
                secretKeyRef:
                  name: {{ include "litellm.name" . }}-masterkey
                  key: masterkey
          envFrom:
            {{- range .Values.environmentSecrets }}
            - secretRef:
                name: {{ . }}
            {{- end }}
          args:
            - --config
            - /etc/litellm/config.yaml
          # command:
          #   - bash
          #   - -c
          #   - |
          #     ls -la /etc/litellm/; cat /etc/litellm/config.yaml; export
          #     find / 2>/dev/null | grep -v -e '^/proc' -e '^/sys' -e '^/dev' >/tmp/before.list
          #     prisma generate
          #     find / 2>/dev/null | grep -v -e '^/proc' -e '^/sys' -e '^/dev' >/tmp/after.list
          #     diff -ruN /tmp/before.list /tmp/after.list
          #     sleep 3600
          ports:
            - name: http
              containerPort: {{ .Values.service.port }}
              protocol: TCP
          livenessProbe:
            httpGet:
              path: /health/liveliness
              port: http
          readinessProbe:
            httpGet:
              path: /health/readiness
              port: http
          # Give the container time to start up. Up to 5 minutes (10 * 30 seconds)
          startupProbe:
            httpGet:
              path: /health/readiness
              port: http
            failureThreshold: 30
            periodSeconds: 10
          resources:
            {{- toYaml .Values.resources | nindent 12 }}
          volumeMounts:
            - name: litellm-config
              mountPath: /etc/litellm/
            {{ if .Values.securityContext.readOnlyRootFilesystem }}
            - name: tmp
              mountPath: /tmp
            - name: cache
              mountPath: /.cache
            - name: npm
              mountPath: /.npm
            {{- end }}
            {{- with .Values.volumeMounts }}
            {{- toYaml . | nindent 12 }}
            {{- end }}
      volumes:
        {{ if .Values.securityContext.readOnlyRootFilesystem }}
        - name: tmp
          emptyDir:
            sizeLimit: 500Mi
        - name: cache
          emptyDir:
            sizeLimit: 500Mi
        - name: npm
          emptyDir:
            sizeLimit: 500Mi
        {{- end }}
        - name: litellm-config
          configMap:
            name: {{ include "litellm.fullname" . }}-config
            items:
              - key: "config.yaml"
                path: "config.yaml"
        {{- with .Values.volumes }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- with .Values.nodeSelector }}
      nodeSelector:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.affinity }}
      affinity:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.tolerations }}
      tolerations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
deploy/charts/litellm-helm/templates/deployment-ui.yaml (new file)

@@ -0,0 +1,89 @@
{{- if .Values.ui.enabled -}}
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ include "litellm.fullname" . }}-ui
  labels:
    {{- include "litellm.labels" . | nindent 4 }}
spec:
  {{- if not .Values.ui.autoscaling.enabled }}
  replicas: {{ .Values.ui.replicaCount }}
  {{- end }}
  selector:
    matchLabels:
      {{- include "litellm.ui.selectorLabels" . | nindent 6 }}
  template:
    metadata:
      {{- with .Values.podAnnotations }}
      annotations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      labels:
        {{- include "litellm.ui.labels" . | nindent 8 }}
        {{- with .Values.ui.podLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
    spec:
      {{- with .Values.imagePullSecrets }}
      imagePullSecrets:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      serviceAccountName: {{ include "litellm.serviceAccountName" . }}
      securityContext:
        {{- toYaml .Values.ui.podSecurityContext | nindent 8 }}
      containers:
        - name: {{ include "litellm.name" . }}-ui
          securityContext:
            {{- toYaml .Values.ui.securityContext | nindent 12 }}
          image: "{{ .Values.ui.image.repository }}:{{ .Values.ui.image.tag | default (printf "main-%s" .Chart.AppVersion) }}"
          imagePullPolicy: {{ .Values.ui.image.pullPolicy }}
          env:
            - name: BASE_URL
              value: {{ (index .Values.ui.ingress.hosts 0).host | default "example.com" }}
          ports:
            - name: http
              containerPort: {{ .Values.ui.service.port }}
              protocol: TCP
          livenessProbe:
            httpGet:
              path: /
              port: http
          readinessProbe:
            httpGet:
              path: /
              port: http
          # Give the container time to start up. Up to 5 minutes (10 * 30 seconds)
          startupProbe:
            httpGet:
              path: /
              port: http
            failureThreshold: 30
            periodSeconds: 10
          resources:
            {{- toYaml .Values.ui.resources | nindent 12 }}
          volumeMounts:
            - name: tmp
              mountPath: /tmp
            {{- with .Values.ui.volumeMounts }}
            {{- toYaml . | nindent 12 }}
            {{- end }}
      volumes:
        - name: tmp
          emptyDir:
            sizeLimit: 500Mi
        {{- with .Values.ui.volumes }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- with .Values.ui.nodeSelector }}
      nodeSelector:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.ui.affinity }}
      affinity:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.ui.tolerations }}
      tolerations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
{{- end -}}
deploy/charts/litellm-helm/templates/hpa.yaml (new file)

@@ -0,0 +1,32 @@
{{- if .Values.autoscaling.enabled }}
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: {{ include "litellm.fullname" . }}
  labels:
    {{- include "litellm.labels" . | nindent 4 }}
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: {{ include "litellm.fullname" . }}
  minReplicas: {{ .Values.autoscaling.minReplicas }}
  maxReplicas: {{ .Values.autoscaling.maxReplicas }}
  metrics:
    {{- if .Values.autoscaling.targetCPUUtilizationPercentage }}
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: {{ .Values.autoscaling.targetCPUUtilizationPercentage }}
    {{- end }}
    {{- if .Values.autoscaling.targetMemoryUtilizationPercentage }}
    - type: Resource
      resource:
        name: memory
        target:
          type: Utilization
          averageUtilization: {{ .Values.autoscaling.targetMemoryUtilizationPercentage }}
    {{- end }}
{{- end }}
deploy/charts/litellm-helm/templates/ingress-proxy.yaml (new file)

@@ -0,0 +1,61 @@
{{- if .Values.ingress.enabled -}}
{{- $fullName := (printf "%s%s" (include "litellm.fullname" .) "-proxy") -}}
{{- $svcPort := .Values.service.port -}}
{{- if and .Values.ingress.className (not (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion)) }}
  {{- if not (hasKey .Values.ingress.annotations "kubernetes.io/ingress.class") }}
  {{- $_ := set .Values.ingress.annotations "kubernetes.io/ingress.class" .Values.ingress.className}}
  {{- end }}
{{- end }}
{{- if semverCompare ">=1.19-0" .Capabilities.KubeVersion.GitVersion -}}
apiVersion: networking.k8s.io/v1
{{- else if semverCompare ">=1.14-0" .Capabilities.KubeVersion.GitVersion -}}
apiVersion: networking.k8s.io/v1beta1
{{- else -}}
apiVersion: extensions/v1beta1
{{- end }}
kind: Ingress
metadata:
  name: {{ $fullName }}
  labels:
    {{- include "litellm.labels" . | nindent 4 }}
  {{- with .Values.ingress.annotations }}
  annotations:
    {{- toYaml . | nindent 4 }}
  {{- end }}
spec:
  {{- if and .Values.ingress.className (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion) }}
  ingressClassName: {{ .Values.ingress.className }}
  {{- end }}
  {{- if .Values.ingress.tls }}
  tls:
    {{- range .Values.ingress.tls }}
    - hosts:
        {{- range .hosts }}
        - {{ . | quote }}
        {{- end }}
      secretName: {{ .secretName }}
    {{- end }}
  {{- end }}
  rules:
    {{- range .Values.ingress.hosts }}
    - host: {{ .host | quote }}
      http:
        paths:
          {{- range .paths }}
          - path: {{ .path }}
            {{- if and .pathType (semverCompare ">=1.18-0" $.Capabilities.KubeVersion.GitVersion) }}
            pathType: {{ .pathType }}
            {{- end }}
            backend:
              {{- if semverCompare ">=1.19-0" $.Capabilities.KubeVersion.GitVersion }}
              service:
                name: {{ $fullName }}
                port:
                  number: {{ $svcPort }}
              {{- else }}
              serviceName: {{ $fullName }}
              servicePort: {{ $svcPort }}
              {{- end }}
          {{- end }}
    {{- end }}
{{- end }}
deploy/charts/litellm-helm/templates/ingress-ui.yaml (new file)

@@ -0,0 +1,61 @@
{{- if .Values.ui.ingress.enabled -}}
{{- $fullName := (printf "%s%s" (include "litellm.fullname" .) "-ui") -}}
{{- $svcPort := .Values.ui.service.port -}}
{{- if and .Values.ui.ingress.className (not (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion)) }}
  {{- if not (hasKey .Values.ui.ingress.annotations "kubernetes.io/ingress.class") }}
  {{- $_ := set .Values.ui.ingress.annotations "kubernetes.io/ingress.class" .Values.ui.ingress.className}}
  {{- end }}
{{- end }}
{{- if semverCompare ">=1.19-0" .Capabilities.KubeVersion.GitVersion -}}
apiVersion: networking.k8s.io/v1
{{- else if semverCompare ">=1.14-0" .Capabilities.KubeVersion.GitVersion -}}
apiVersion: networking.k8s.io/v1beta1
{{- else -}}
apiVersion: extensions/v1beta1
{{- end }}
kind: Ingress
metadata:
  name: {{ $fullName }}
  labels:
    {{- include "litellm.ui.labels" . | nindent 4 }}
  {{- with .Values.ui.ingress.annotations }}
  annotations:
    {{- toYaml . | nindent 4 }}
  {{- end }}
spec:
  {{- if and .Values.ui.ingress.className (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion) }}
  ingressClassName: {{ .Values.ui.ingress.className }}
  {{- end }}
  {{- if .Values.ui.ingress.tls }}
  tls:
    {{- range .Values.ui.ingress.tls }}
    - hosts:
        {{- range .hosts }}
        - {{ . | quote }}
        {{- end }}
      secretName: {{ .secretName }}
    {{- end }}
  {{- end }}
  rules:
    {{- range .Values.ui.ingress.hosts }}
    - host: {{ .host | quote }}
      http:
        paths:
          {{- range .paths }}
          - path: {{ .path }}
            {{- if and .pathType (semverCompare ">=1.18-0" $.Capabilities.KubeVersion.GitVersion) }}
            pathType: {{ .pathType }}
            {{- end }}
            backend:
              {{- if semverCompare ">=1.19-0" $.Capabilities.KubeVersion.GitVersion }}
              service:
                name: {{ $fullName }}
                port:
                  number: {{ $svcPort }}
              {{- else }}
              serviceName: {{ $fullName }}
              servicePort: {{ $svcPort }}
              {{- end }}
          {{- end }}
    {{- end }}
{{- end }}
@@ -0,0 +1,12 @@
{{- if .Values.db.deployStandalone -}}
apiVersion: v1
kind: Secret
metadata:
  name: {{ include "litellm.name" . }}-dbcredentials
data:
  # Password for the "postgres" user
  postgres-password: {{ ( index .Values.postgresql.auth "postgres-password") | default "litellm" | b64enc }}
  username: {{ .Values.postgresql.auth.username | default "litellm" | b64enc }}
  password: {{ .Values.postgresql.auth.password | default "litellm" | b64enc }}
type: Opaque
{{- end -}}
@@ -0,0 +1,8 @@
{{ $masterkey := (.Values.masterkey | default (randAlphaNum 17)) }}
apiVersion: v1
kind: Secret
metadata:
  name: {{ include "litellm.name" . }}-masterkey
data:
  masterkey: {{ $masterkey | b64enc }}
type: Opaque
deploy/charts/litellm-helm/templates/service-proxy.yaml
Normal file
|
@ -0,0 +1,15 @@
|
|||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: {{ include "litellm.fullname" . }}-proxy
|
||||
labels:
|
||||
{{- include "litellm.labels" . | nindent 4 }}
|
||||
spec:
|
||||
type: {{ .Values.service.type }}
|
||||
ports:
|
||||
- port: {{ .Values.service.port }}
|
||||
targetPort: http
|
||||
protocol: TCP
|
||||
name: http
|
||||
selector:
|
||||
{{- include "litellm.selectorLabels" . | nindent 4 }}
|
deploy/charts/litellm-helm/templates/service-ui.yaml (new file, 17 lines)
@@ -0,0 +1,17 @@
{{- if .Values.ui.enabled -}}
apiVersion: v1
kind: Service
metadata:
  name: {{ include "litellm.fullname" . }}-ui
  labels:
    {{- include "litellm.labels" . | nindent 4 }}
spec:
  type: {{ .Values.ui.service.type }}
  ports:
    - port: {{ .Values.ui.service.port }}
      targetPort: http
      protocol: TCP
      name: http
  selector:
    {{- include "litellm.ui.selectorLabels" . | nindent 4 }}
{{ end -}}
deploy/charts/litellm-helm/templates/serviceaccount.yaml (new file, 13 lines)
@@ -0,0 +1,13 @@
{{- if .Values.serviceAccount.create -}}
apiVersion: v1
kind: ServiceAccount
metadata:
  name: {{ include "litellm.serviceAccountName" . }}
  labels:
    {{- include "litellm.labels" . | nindent 4 }}
  {{- with .Values.serviceAccount.annotations }}
  annotations:
    {{- toYaml . | nindent 4 }}
  {{- end }}
automountServiceAccountToken: {{ .Values.serviceAccount.automount }}
{{- end }}
@@ -0,0 +1,15 @@
apiVersion: v1
kind: Pod
metadata:
  name: "{{ include "litellm.fullname" . }}-test-connection"
  labels:
    {{- include "litellm.labels" . | nindent 4 }}
  annotations:
    "helm.sh/hook": test
spec:
  containers:
    - name: wget
      image: busybox
      command: ['wget']
      args: ['{{ include "litellm.fullname" . }}:{{ .Values.service.port }}']
  restartPolicy: Never
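Because of the `helm.sh/hook: test` annotation, this pod only runs on demand. A minimal sketch, assuming the chart was installed under the release name `litellm`:

```shell
helm test litellm
```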
deploy/charts/litellm-helm/values.yaml (new file, 219 lines)
@@ -0,0 +1,219 @@
# Default values for litellm.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.

replicaCount: 1

image:
  repository: ghcr.io/berriai/litellm
  pullPolicy: IfNotPresent
  # Overrides the image tag whose default is the chart appVersion.
  # tag: "main-latest"
  tag: ""

# Image and tag used for the init container that checks and waits for the
# readiness of the postgres database.
dbReadyImage: docker.io/bitnami/postgresql
dbReadyTag: ""

imagePullSecrets: []
nameOverride: "litellm"
fullnameOverride: ""

serviceAccount:
  # Specifies whether a service account should be created
  create: false
  # Automatically mount a ServiceAccount's API credentials?
  automount: true
  # Annotations to add to the service account
  annotations: {}
  # The name of the service account to use.
  # If not set and create is true, a name is generated using the fullname template
  name: ""

podAnnotations: {}
podLabels: {}

# At the time of writing, the litellm docker image requires write access to the
# filesystem on startup so that prisma can install some dependencies.
podSecurityContext: {}
securityContext: {}
  # capabilities:
  #   drop:
  #     - ALL
  # readOnlyRootFilesystem: false
  # runAsNonRoot: true
  # runAsUser: 1000

# A list of Kubernetes Secret objects that will be exported to the LiteLLM proxy
# pod as environment variables. These secrets can then be referenced in the
# configuration file (or "litellm" ConfigMap) with `os.environ/<Env Var Name>`
environmentSecrets: []
#   - litellm-envsecrets

service:
  type: ClusterIP
  port: 8000

ingress:
  enabled: true
  className: "nginx"
  annotations: {}
    # kubernetes.io/ingress.class: nginx
    # kubernetes.io/tls-acme: "true"
  hosts:
    - host: api.example.local
      paths:
        - path: /
          pathType: ImplementationSpecific
  tls: []
  #  - secretName: chart-example-tls
  #    hosts:
  #      - chart-example.local

# The elements within proxy_config are rendered as config.yaml for the proxy
# Examples: https://github.com/BerriAI/litellm/tree/main/litellm/proxy/example_config_yaml
# Reference: https://docs.litellm.ai/docs/proxy/configs
proxy_config:
  model_list:
    # At least one model must exist for the proxy to start.
    - model_name: gpt-3.5-turbo
      litellm_params:
        model: gpt-3.5-turbo
        api_key: eXaMpLeOnLy
  general_settings:
    master_key: os.environ/PROXY_MASTER_KEY

resources: {}
  # We usually recommend not to specify default resources and to leave this as a conscious
  # choice for the user. This also increases chances charts run on environments with little
  # resources, such as Minikube. If you do want to specify resources, uncomment the following
  # lines, adjust them as necessary, and remove the curly braces after 'resources:'.
  # limits:
  #   cpu: 100m
  #   memory: 128Mi
  # requests:
  #   cpu: 100m
  #   memory: 128Mi

autoscaling:
  enabled: false
  minReplicas: 1
  maxReplicas: 100
  targetCPUUtilizationPercentage: 80
  # targetMemoryUtilizationPercentage: 80

# Additional volumes on the output Deployment definition.
volumes: []
# - name: foo
#   secret:
#     secretName: mysecret
#     optional: false

# Additional volumeMounts on the output Deployment definition.
volumeMounts: []
# - name: foo
#   mountPath: "/etc/foo"
#   readOnly: true

nodeSelector: {}

tolerations: []

affinity: {}

db:
  # Use an existing postgres server/cluster
  useExisting: false

  # How to connect to the existing postgres server/cluster
  endpoint: localhost
  database: litellm
  secret:
    name: postgres
    usernameKey: username
    passwordKey: password

  # Use the Stackgres Helm chart to deploy an instance of a Stackgres cluster.
  # The Stackgres Operator must already be installed within the target
  # Kubernetes cluster.
  # TODO: Stackgres deployment currently unsupported
  useStackgresOperator: false

  # Use the Postgres Helm chart to create a single-node, standalone postgres
  # instance. See the "postgresql" top-level key for additional configuration.
  deployStandalone: true

# Settings for the Bitnami postgresql chart (used if db.deployStandalone is true,
# ignored otherwise)
postgresql:
  architecture: standalone
  auth:
    username: litellm
    database: litellm

    # You should override these on the helm command line with
    # `--set postgresql.auth.postgres-password=<some good password>,postgresql.auth.password=<some good password>`
    password: NoTaGrEaTpAsSwOrD
    postgres-password: NoTaGrEaTpAsSwOrD

    # A secret is created by this chart (litellm-helm) with the credentials that
    # the new Postgres instance should use.
    existingSecret: litellm-dbcredentials
    secretKeys:
      userPasswordKey: password

ui:
  enabled: true
  replicaCount: 1
  autoscaling:
    enabled: false
  image:
    repository: ghcr.io/berriai/litellm-ui
    pullPolicy: IfNotPresent
    # Overrides the image tag whose default is the chart appVersion.
    # tag: "main-latest"
    # TODO: Switch to BerriAI repo and tags if/when they provide a ui image
    # https://github.com/BerriAI/litellm/pull/1505
    tag: ""

  service:
    type: ClusterIP
    port: 8501

  ingress:
    enabled: true
    className: "nginx"
    annotations: {}
    hosts:
      - host: ui.example.local
        paths:
          - path: /
            pathType: ImplementationSpecific
    tls: []

  podAnnotations: {}
  podLabels: {}

  podSecurityContext:
    fsGroup: 1000

  securityContext:
    capabilities:
      drop:
        - ALL
    readOnlyRootFilesystem: true
    runAsNonRoot: true
    runAsUser: 1000

  resources: {}

  volumes: []

  volumeMounts: []

  nodeSelector: {}

  tolerations: []

  affinity: {}
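Putting the values above together, an install might look like the following sketch (chart path and release name are illustrative, not fixed by the chart):

```shell
helm install litellm ./deploy/charts/litellm-helm \
  --set masterkey="sk-my-master-key" \
  --set postgresql.auth.password="a-better-password" \
  --set postgresql.auth.postgres-password="a-better-password"
```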
@@ -1,12 +0,0 @@
version: "3.9"
services:
  litellm:
    image: ghcr.io/berriai/litellm:main
    ports:
      - "8000:8000" # Map the container port to the host; change the host port if necessary
    volumes:
      - ./litellm-config.yaml:/app/config.yaml # Mount the local configuration file
    # You can change the port or number of workers as per your requirements, or pass any other supported CLI argument. Make sure the port passed here matches the container port defined above under `ports`.
    command: [ "--config", "/app/config.yaml", "--port", "8000", "--num_workers", "8" ]

# ...rest of your docker-compose config if any
docker-compose.yml (new file, 15 lines)
@@ -0,0 +1,15 @@
version: "3.9"
services:
  litellm:
    image: ghcr.io/berriai/litellm:main-latest
    volumes:
      - ./proxy_server_config.yaml:/app/proxy_server_config.yaml # mount your litellm config.yaml
    ports:
      - "4000:4000"
    environment:
      - AZURE_API_KEY=sk-123
  litellm-ui:
    image: ghcr.io/berriai/litellm-ui:main-latest
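To bring both services up from this file, something like the following should work (assuming Docker Compose v2). Note the `litellm-ui` service as written doesn't publish a port, so add a `ports:` mapping to it if you need to reach the UI from the host:

```shell
docker compose up -d   # starts the proxy on localhost:4000 and the ui container
```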
@@ -1,11 +1,17 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';

# Caching - In-Memory, Redis, s3, Redis Semantic Cache

[**See Code**](https://github.com/BerriAI/litellm/blob/main/litellm/caching.py)

:::info

Need to use Caching on LiteLLM Proxy Server? Doc here: [Caching Proxy Server](https://docs.litellm.ai/docs/proxy/caching)

:::

## Initialize Cache - In Memory, Redis, s3 Bucket, Redis Semantic Cache

<Tabs>
@@ -18,7 +24,7 @@ pip install redis
```

For the hosted version you can set up your own Redis DB here: https://app.redislabs.com/

### Quick Start

```python
import litellm
from litellm import completion
@@ -55,7 +61,7 @@ Set AWS environment variables
AWS_ACCESS_KEY_ID = "AKI*******"
AWS_SECRET_ACCESS_KEY = "WOl*****"
```
### Quick Start

```python
import litellm
from litellm import completion
@@ -80,6 +86,66 @@ response2 = completion(
</TabItem>

<TabItem value="redis-sem" label="redis-semantic cache">

Install redisvl
```shell
pip install redisvl==0.0.7
```

For the hosted version you can set up your own Redis DB here: https://app.redislabs.com/

```python
import os
import random

import litellm
from litellm import completion
from litellm.caching import Cache

random_number = random.randint(
    1, 100000
)  # add a random number to ensure it's always adding / reading from cache

print("testing semantic caching")
litellm.cache = Cache(
    type="redis-semantic",
    host=os.environ["REDIS_HOST"],
    port=os.environ["REDIS_PORT"],
    password=os.environ["REDIS_PASSWORD"],
    similarity_threshold=0.8,  # similarity threshold for cache hits, 0 == no similarity, 1 = exact matches, 0.5 == 50% similarity
    redis_semantic_cache_embedding_model="text-embedding-ada-002",  # this model is passed to litellm.embedding(); any litellm.embedding() model is supported here
)
response1 = completion(
    model="gpt-3.5-turbo",
    messages=[
        {
            "role": "user",
            "content": f"write a one sentence poem about: {random_number}",
        }
    ],
    max_tokens=20,
)
print(f"response1: {response1}")

random_number = random.randint(1, 100000)

response2 = completion(
    model="gpt-3.5-turbo",
    messages=[
        {
            "role": "user",
            "content": f"write a one sentence poem about: {random_number}",
        }
    ],
    max_tokens=20,
)
print(f"response2: {response2}")
assert response1.id == response2.id
# response1 == response2, response 1 is cached
```

</TabItem>

<TabItem value="in-mem" label="in memory cache">

### Quick Start
@@ -150,5 +150,12 @@ litellm.register_model(model_cost=
"https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json")
```

**Don't pull the hosted model_cost_map**

If you have firewalls and want to use only the local copy of the model cost map, you can do so like this:
```bash
export LITELLM_LOCAL_MODEL_COST_MAP="True"
```

Note: this means you will need to upgrade to get updated pricing and newer models.
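For example, a quick check that the flag is respected — a sketch; the env var must be set before `litellm` is imported:

```python
import os
os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"

import litellm  # uses the packaged copy of the cost map; no network call to GitHub
print(litellm.model_cost["gpt-3.5-turbo"])  # pricing still resolves, from the local copy
```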
@@ -13,8 +13,8 @@ response = embedding(model='text-embedding-ada-002', input=["good morning from l

- `model`: *string* - ID of the model to use. `model='text-embedding-ada-002'`

- `input`: *string or array* - Input text to embed, encoded as a string or array of tokens. To embed multiple inputs in a single request, pass an array of strings or array of token arrays. The input must not exceed the max input tokens for the model (8192 tokens for text-embedding-ada-002), cannot be an empty string, and any array must be 2048 dimensions or less.
```python
input=["good morning from litellm"]
```
|
|||
|
||||
- `user`: *string (optional)* A unique identifier representing your end-user,
|
||||
|
||||
- `timeout`: *integer* - The maximum time, in seconds, to wait for the API to respond. Defaults to 600 seconds (10 minutes).
|
||||
- `dimensions`: *integer (Optional)* The number of dimensions the resulting output embeddings should have. Only supported in OpenAI/Azure text-embedding-3 and later models.
|
||||
|
||||
- `encoding_format`: *string (Optional)* The format to return the embeddings in. Can be either `"float"` or `"base64"`. Defaults to `encoding_format="float"`
|
||||
|
||||
- `timeout`: *integer (Optional)* - The maximum time, in seconds, to wait for the API to respond. Defaults to 600 seconds (10 minutes).
|
||||
|
||||
- `api_base`: *string (optional)* - The api endpoint you want to call the model with
|
||||
|
||||
|
@ -66,11 +70,18 @@ input=["good morning from litellm"]
|
|||
from litellm import embedding
|
||||
import os
|
||||
os.environ['OPENAI_API_KEY'] = ""
|
||||
response = embedding('text-embedding-ada-002', input=["good morning from litellm"])
|
||||
response = embedding(
|
||||
model="text-embedding-3-small",
|
||||
input=["good morning from litellm", "this is another item"],
|
||||
metadata={"anything": "good day"},
|
||||
dimensions=5 # Only supported in text-embedding-3 and later models.
|
||||
)
|
||||
```
|
||||
|
||||
| Model Name | Function Call | Required OS Variables |
|
||||
|----------------------|---------------------------------------------|--------------------------------------|
|
||||
| text-embedding-3-small | `embedding('text-embedding-3-small', input)` | `os.environ['OPENAI_API_KEY']` |
|
||||
| text-embedding-3-large | `embedding('text-embedding-3-large', input)` | `os.environ['OPENAI_API_KEY']` |
|
||||
| text-embedding-ada-002 | `embedding('text-embedding-ada-002', input)` | `os.environ['OPENAI_API_KEY']` |
|
||||
|
||||
## Azure OpenAI Embedding Models
|
||||
|
|
docs/my-website/docs/enterprise.md (new file, 15 lines)
@@ -0,0 +1,15 @@
# Enterprise

LiteLLM offers dedicated enterprise support.

This covers:
- **Feature Prioritization**
- **Custom Integrations**
- **Professional Support - Dedicated Discord + Slack**
- **Custom SLAs**

:::info

[Talk to founders](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat)

:::
@@ -131,3 +131,23 @@ response = image_generation(
    prompt="cute baby otter"
)
```

## Bedrock - Stable Diffusion
Use this for Stable Diffusion on Bedrock.

### Usage
```python
import os
from litellm import image_generation

os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""

response = image_generation(
    prompt="A cute baby sea otter",
    model="bedrock/stability.stable-diffusion-xl-v0",
)
print(f"response: {response}")
```
@@ -5,10 +5,14 @@ import TabItem from '@theme/TabItem';

https://github.com/BerriAI/litellm

import QuickStart from '../src/components/QuickStart.js'

## **Call 100+ LLMs using the same Input/Output Format**

- Translate inputs to the provider's `completion`, `embedding`, and `image_generation` endpoints
- [Consistent output](https://docs.litellm.ai/docs/completion/output): text responses will always be available at `['choices'][0]['message']['content']`
- Retry/fallback logic across multiple deployments (e.g. Azure/OpenAI) - [Router](https://docs.litellm.ai/docs/routing)
- Track spend & set budgets per project - [OpenAI Proxy Server](https://docs.litellm.ai/docs/simple_proxy)

## Basic usage
<a target="_blank" href="https://colab.research.google.com/github/BerriAI/litellm/blob/main/cookbook/liteLLM_Getting_Started.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
|
|||
messages=[{ "content": "Hello, how are you?","role": "user"}],
|
||||
stream=True,
|
||||
)
|
||||
|
||||
for chunk in response:
|
||||
print(chunk)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
|
@@ -177,9 +178,6 @@ response = completion(
    messages=[{ "content": "Hello, how are you?","role": "user"}],
    stream=True,
)

for chunk in response:
    print(chunk)
```

</TabItem>
@@ -199,9 +197,6 @@ response = completion(
    messages=[{ "content": "Hello, how are you?","role": "user"}],
    stream=True,
)

for chunk in response:
    print(chunk)
```

</TabItem>
@@ -222,9 +217,7 @@ response = completion(
    stream=True,
)

for chunk in response:
    print(chunk)
print(response)
```

</TabItem>
@@ -246,9 +239,6 @@ response = completion(
    messages = [{ "content": "Hello, how are you?","role": "user"}],
    stream=True,
)

for chunk in response:
    print(chunk)
```

</TabItem>
@@ -265,9 +255,6 @@ response = completion(
    api_base="http://localhost:11434",
    stream=True,
)

for chunk in response:
    print(chunk)
```
</TabItem>
<TabItem value="or" label="Openrouter">
@@ -284,9 +271,6 @@ response = completion(
    messages = [{ "content": "Hello, how are you?","role": "user"}],
    stream=True,
)

for chunk in response:
    print(chunk)
```
</TabItem>
|
|||
response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}])
|
||||
```
|
||||
|
||||
## Calculate Costs, Usage, Latency
|
||||
|
||||
Pass the completion response to `litellm.completion_cost(completion_response=response)` and get the cost
|
||||
|
||||
```python
|
||||
from litellm import completion, completion_cost
|
||||
import os
|
||||
os.environ["OPENAI_API_KEY"] = "your-api-key"
|
||||
|
||||
response = completion(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=[{ "content": "Hello, how are you?","role": "user"}]
|
||||
)
|
||||
|
||||
cost = completion_cost(completion_response=response)
|
||||
print("Cost for completion call with gpt-3.5-turbo: ", f"${float(cost):.10f}")
|
||||
```
|
||||
|
||||
**Output**
|
||||
```shell
|
||||
Cost for completion call with gpt-3.5-turbo: $0.0000775000
|
||||
```
|
||||
|
||||
### Track Costs, Usage, Latency for streaming
|
||||
We use a custom callback function for this - more info on custom callbacks: https://docs.litellm.ai/docs/observability/custom_callback
|
||||
- We define a callback function to calculate cost `def track_cost_callback()`
|
||||
- In `def track_cost_callback()` we check if the stream is complete - `if "complete_streaming_response" in kwargs`
|
||||
- Use `litellm.completion_cost()` to calculate cost, once the stream is complete
|
||||
## Track Costs, Usage, Latency for streaming
|
||||
Use a callback function for this - more info on custom callbacks: https://docs.litellm.ai/docs/observability/custom_callback
|
||||
|
||||
```python
|
||||
import litellm
|
||||
|
@ -366,18 +324,8 @@ def track_cost_callback(
|
|||
start_time, end_time # start/end time
|
||||
):
|
||||
try:
|
||||
# check if it has collected an entire stream response
|
||||
if "complete_streaming_response" in kwargs:
|
||||
# for tracking streaming cost we pass the "messages" and the output_text to litellm.completion_cost
|
||||
completion_response=kwargs["complete_streaming_response"]
|
||||
input_text = kwargs["messages"]
|
||||
output_text = completion_response["choices"][0]["message"]["content"]
|
||||
response_cost = litellm.completion_cost(
|
||||
model = kwargs["model"],
|
||||
messages = input_text,
|
||||
completion=output_text
|
||||
)
|
||||
print("streaming response_cost", response_cost)
|
||||
response_cost = kwargs.get("response_cost", 0)
|
||||
print("streaming response_cost", response_cost)
|
||||
except:
|
||||
pass
|
||||
# set callback
|
||||
|
@@ -400,6 +348,8 @@ response = completion(

Track spend across multiple projects/people



The proxy provides:
1. [Hooks for auth](https://docs.litellm.ai/docs/proxy/virtual_keys#custom-auth)
2. [Hooks for logging](https://docs.litellm.ai/docs/proxy/logging#step-1---create-your-custom-litellm-callback-class)
@@ -436,8 +386,7 @@ response = client.chat.completions.create(model="gpt-3.5-turbo", messages = [
print(response)
```

## More details
* [exception mapping](./exception_mapping.md)
* [retries + model fallbacks for completion()](./completion/reliable_completions.md)
* [tutorial for model fallbacks with completion()](./tutorials/fallbacks.md)
* [proxy virtual keys & spend management](./proxy/virtual_keys.md)
@@ -27,6 +27,7 @@ Use just 2 lines of code, to instantly log your responses **across all providers
Get your Langfuse API Keys from https://cloud.langfuse.com/
```python
litellm.success_callback = ["langfuse"]
litellm.failure_callback = ["langfuse"] # logs errors to langfuse
```
```python
# pip install langfuse
@@ -93,7 +94,7 @@ print(response)

```

### Set Custom Trace ID, Trace User ID and Tags

Pass `trace_id`, `trace_user_id` in `metadata`
@@ -122,6 +123,8 @@ response = completion(
        "generation_id": "gen-id22",   # set langfuse Generation ID
        "trace_id": "trace-id22",      # set langfuse Trace ID
        "trace_user_id": "user-id2",   # set langfuse Trace User ID
        "session_id": "session-1",     # set langfuse Session ID
        "tags": ["tag1", "tag2"]       # set langfuse Tags
    },
)
@@ -74,6 +74,8 @@ response = litellm.completion(
| gpt-4-32k | `completion('azure/<your deployment name>', messages)` |
| gpt-4-32k-0314 | `completion('azure/<your deployment name>', messages)` |
| gpt-4-32k-0613 | `completion('azure/<your deployment name>', messages)` |
| gpt-4-1106-preview | `completion('azure/<your deployment name>', messages)` |
| gpt-4-0125-preview | `completion('azure/<your deployment name>', messages)` |
| gpt-3.5-turbo | `completion('azure/<your deployment name>', messages)` |
| gpt-3.5-turbo-0301 | `completion('azure/<your deployment name>', messages)` |
| gpt-3.5-turbo-0613 | `completion('azure/<your deployment name>', messages)` |
@@ -197,7 +197,7 @@ response = completion(

### SSO Login (AWS Profile)
- Set the `AWS_PROFILE` environment variable
- Make the bedrock completion call
```python
import os
from litellm import completion
@@ -208,11 +208,24 @@ response = completion(
)
```

or pass `aws_profile_name`:

```python
import os
from litellm import completion

response = completion(
    model="bedrock/anthropic.claude-instant-v1",
    messages=[{ "content": "Hello, how are you?","role": "user"}],
    aws_profile_name="dev-profile",
)
```

### STS based Auth

- Set `aws_role_name` and `aws_session_name` in the completion() / embedding() function

Make the bedrock completion call
```python
from litellm import completion
@@ -315,3 +328,50 @@ print(response)
| Titan Embeddings - G1 | `embedding(model="bedrock/amazon.titan-embed-text-v1", input=input)` |
| Cohere Embeddings - English | `embedding(model="bedrock/cohere.embed-english-v3", input=input)` |
| Cohere Embeddings - Multilingual | `embedding(model="bedrock/cohere.embed-multilingual-v3", input=input)` |

## Image Generation
Use this for Stable Diffusion on Bedrock.

### Usage
```python
import os
from litellm import image_generation

os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""

response = image_generation(
    prompt="A cute baby sea otter",
    model="bedrock/stability.stable-diffusion-xl-v0",
)
print(f"response: {response}")
```

**Set optional params**
```python
import os
from litellm import image_generation

os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""

response = image_generation(
    prompt="A cute baby sea otter",
    model="bedrock/stability.stable-diffusion-xl-v0",
    ### OPENAI-COMPATIBLE ###
    size="128x512", # width=128, height=512
    ### PROVIDER-SPECIFIC ### see `AmazonStabilityConfig` in bedrock.py for all params
    seed=30,
)
print(f"response: {response}")
```

## Supported AWS Bedrock Image Generation Models

| Model Name | Function Call |
|-----------------------|---------------------------------------------|
| Stable Diffusion - v0 | `image_generation(model="bedrock/stability.stable-diffusion-xl-v0", prompt=prompt)` |
| Stable Diffusion - v1 | `image_generation(model="bedrock/stability.stable-diffusion-xl-v1", prompt=prompt)` |
@@ -34,6 +34,7 @@ os.environ["OPENAI_API_BASE"] = "openai-api-base" # OPTIONAL

| Model Name | Function Call |
|-----------------------|-----------------------------------------------------------------|
| gpt-4-0125-preview | `response = completion(model="gpt-4-0125-preview", messages=messages)` |
| gpt-4-1106-preview | `response = completion(model="gpt-4-1106-preview", messages=messages)` |
| gpt-3.5-turbo-1106 | `response = completion(model="gpt-3.5-turbo-1106", messages=messages)` |
| gpt-3.5-turbo | `response = completion(model="gpt-3.5-turbo", messages=messages)` |
@@ -173,6 +174,31 @@ response = completion(
    messages=[{ "content": "Hello, how are you?","role": "user"}]
)
```

### Set `ssl_verify=False`

This is done by setting your own `httpx.Client`:

- For `litellm.completion` set `litellm.client_session=httpx.Client(verify=False)`
- For `litellm.acompletion` set `litellm.aclient_session=httpx.AsyncClient(verify=False)`

```python
import litellm, httpx

# for completion
litellm.client_session = httpx.Client(verify=False)
response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=messages,
)

# for acompletion
litellm.aclient_session = httpx.AsyncClient(verify=False)
response = litellm.acompletion(
    model="gpt-3.5-turbo",
    messages=messages,
)
```

### Using Helicone Proxy with LiteLLM
```python
import os
@@ -1,4 +1,4 @@
# VertexAI - Google [Gemini, Model Garden]

<a target="_blank" href="https://colab.research.google.com/github/BerriAI/litellm/blob/main/cookbook/liteLLM_VertextAI_Example.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
@@ -20,6 +20,27 @@ litellm.vertex_location = "us-central1" # proj location
response = litellm.completion(model="gemini-pro", messages=[{"role": "user", "content": "write code for saying hi from LiteLLM"}])
```

## OpenAI Proxy Usage

1. Modify the config.yaml

```yaml
litellm_settings:
  vertex_project: "hardy-device-38811" # Your Project ID
  vertex_location: "us-central1" # proj location

model_list:
  - model_name: team1-gemini-pro
    litellm_params:
      model: gemini-pro
```

2. Start the proxy

```bash
$ litellm --config /path/to/config.yaml
```
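3. Test it - a sketch of a request against the alias configured above (host and port assume the default proxy settings used elsewhere in these docs):

```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
--header 'Content-Type: application/json' \
--data '{
    "model": "team1-gemini-pro",
    "messages": [{"role": "user", "content": "hi from litellm"}]
}'
```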

## Set Vertex Project & Vertex Location
All calls using Vertex AI require the following parameters:
* Your Project ID
@@ -46,16 +67,39 @@ os.environ["VERTEXAI_LOCATION"] = "us-central1" # Your Location
# set directly on module
litellm.vertex_location = "us-central1" # Your Location
```
## Model Garden
| Model Name | Function Call |
|------------------|--------------------------------------|
| llama2 | `completion('vertex_ai/<endpoint_id>', messages)` |

#### Using Model Garden

```python
from litellm import completion
import os

## set ENV variables
os.environ["VERTEXAI_PROJECT"] = "hardy-device-38811"
os.environ["VERTEXAI_LOCATION"] = "us-central1"

response = completion(
    model="vertex_ai/<your-endpoint-id>",
    messages=[{ "content": "Hello, how are you?","role": "user"}]
)
```

## Gemini Pro
| Model Name | Function Call |
|------------------|--------------------------------------|
| gemini-pro | `completion('gemini-pro', messages)`, `completion('vertex_ai/gemini-pro', messages)` |

## Gemini Pro Vision
| Model Name | Function Call |
|------------------|--------------------------------------|
| gemini-pro-vision | `completion('gemini-pro-vision', messages)`, `completion('vertex_ai/gemini-pro-vision', messages)` |

#### Using Gemini Pro Vision
@@ -93,6 +137,7 @@ response = litellm.completion(
print(response)
```

## Chat Models
| Model Name | Function Call |
|------------------|--------------------------------------|
@@ -1,6 +1,13 @@
# Slack Alerting

Get alerts for:
- hanging LLM api calls
- failed LLM api calls
- slow LLM api calls
- budget tracking per key/user:
  - when a user/key crosses their budget
  - when a user/key is 15% away from crossing their budget
- failed db read/writes

## Quick Start
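A minimal config sketch - `alerting` and `alerting_threshold` are `general_settings` keys (they appear in the proxy's "All settings" reference); the webhook URL is a placeholder:

```yaml
general_settings:
  alerting: ["slack"]
  alerting_threshold: 300 # seconds; requests slower than this trigger a hanging-request alert

environment_variables:
  SLACK_WEBHOOK_URL: "https://hooks.slack.com/services/<your-webhook>"
```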
@@ -7,16 +7,17 @@ Cache LLM Responses
LiteLLM supports:
- In Memory Cache
- Redis Cache
- Redis Semantic Cache
- s3 Bucket Cache

## Quick Start - Redis, s3 Cache, Semantic Cache
<Tabs>

<TabItem value="redis" label="redis cache">

Caching can be enabled by adding the `cache` key in the `config.yaml`

#### Step 1: Add `cache` to the config.yaml
```yaml
model_list:
  - model_name: gpt-3.5-turbo
@@ -31,7 +32,7 @@ litellm_settings:
  cache: True # set cache responses to True, litellm defaults to using a redis cache
```

#### Step 2: Add Redis Credentials to .env
Set either `REDIS_URL` or the `REDIS_HOST` in your os environment, to enable caching.

```shell
@@ -49,7 +50,7 @@ REDIS_<redis-kwarg-name> = ""
```

[**See how it's read from the environment**](https://github.com/BerriAI/litellm/blob/4d7ff1b33b9991dcf38d821266290631d9bcd2dd/litellm/_redis.py#L40)
#### Step 3: Run proxy with config
```shell
$ litellm --config /path/to/config.yaml
```
@@ -57,7 +58,7 @@ $ litellm --config /path/to/config.yaml

<TabItem value="s3" label="s3 cache">

#### Step 1: Add `cache` to the config.yaml
```yaml
model_list:
  - model_name: gpt-3.5-turbo
@@ -79,7 +80,57 @@ litellm_settings:
    s3_endpoint_url: https://s3.amazonaws.com # [OPTIONAL] S3 endpoint URL, if you want to use Backblaze/cloudflare s3 buckets
```

#### Step 2: Run proxy with config
```shell
$ litellm --config /path/to/config.yaml
```
</TabItem>

<TabItem value="redis-sem" label="redis semantic cache">

Caching can be enabled by adding the `cache` key in the `config.yaml`

#### Step 1: Add `cache` to the config.yaml
```yaml
model_list:
  - model_name: gpt-3.5-turbo
    litellm_params:
      model: gpt-3.5-turbo
  - model_name: azure-embedding-model
    litellm_params:
      model: azure/azure-embedding-model
      api_base: os.environ/AZURE_API_BASE
      api_key: os.environ/AZURE_API_KEY
      api_version: "2023-07-01-preview"

litellm_settings:
  set_verbose: True
  cache: True # set cache responses to True, litellm defaults to using a redis cache
  cache_params:
    type: "redis-semantic"
    similarity_threshold: 0.8 # similarity threshold for semantic cache
    redis_semantic_cache_embedding_model: azure-embedding-model # set this to a model_name set in model_list
```

#### Step 2: Add Redis Credentials to .env
Set either `REDIS_URL` or the `REDIS_HOST` in your os environment, to enable caching.

```shell
REDIS_URL = "" # REDIS_URL='redis://username:password@hostname:port/database'
## OR ##
REDIS_HOST = "" # REDIS_HOST='redis-18841.c274.us-east-1-3.ec2.cloud.redislabs.com'
REDIS_PORT = "" # REDIS_PORT='18841'
REDIS_PASSWORD = "" # REDIS_PASSWORD='liteLlmIsAmazing'
```

**Additional kwargs**
You can pass in any additional redis.Redis arg, by storing the variable + value in your os environment, like this:
```shell
REDIS_<redis-kwarg-name> = ""
```

#### Step 3: Run proxy with config
```shell
$ litellm --config /path/to/config.yaml
```
@@ -160,9 +211,10 @@ litellm_settings:

The proxy supports 4 cache-controls:

- `ttl`: *Optional(int)* - Will cache the response for the user-defined amount of time (in seconds).
- `s-maxage`: *Optional(int)* - Will only accept cached responses that are within the user-defined range (in seconds).
- `no-cache`: *Optional(bool)* - Will not return a cached response, but instead call the actual endpoint.
- `no-store`: *Optional(bool)* - Will not cache the response.

[Let us know if you need more](https://github.com/BerriAI/litellm/issues/1218)
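These controls can be sent per request. A sketch using the OpenAI SDK's `extra_body` passthrough - the `cache` request field is assumed from the proxy's caching behaviour; key and URL are placeholders:

```python
import openai

client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:8000")

response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hello"}],
    extra_body={"cache": {"no-cache": True}},  # bypass the cache for this call only
)
```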
@@ -22,18 +22,22 @@ Set a model alias for your deployments.

In the `config.yaml` the model_name parameter is the user-facing name to use for your deployment.

In the config below:
- `model_name`: the name to pass TO litellm from the external client
- `litellm_params.model`: the model string passed to the litellm.completion() function

E.g.:
- `model=vllm-models` will route to `openai/facebook/opt-125m`.
- `model=gpt-3.5-turbo` will load balance between `azure/gpt-turbo-small-eu` and `azure/gpt-turbo-small-ca`

```yaml
model_list:
  - model_name: gpt-3.5-turbo ### RECEIVED MODEL NAME ###
    litellm_params: # all params accepted by litellm.completion() - https://docs.litellm.ai/docs/completion/input
      model: azure/gpt-turbo-small-eu ### MODEL NAME sent to `litellm.completion()` ###
      api_base: https://my-endpoint-europe-berri-992.openai.azure.com/
      api_key: "os.environ/AZURE_API_KEY_EU" # does os.getenv("AZURE_API_KEY_EU")
      rpm: 6 # [OPTIONAL] Rate limit for this deployment: in requests per minute (rpm)
  - model_name: bedrock-claude-v1
    litellm_params:
      model: bedrock/anthropic.claude-instant-v1
@@ -43,6 +47,11 @@ model_list:
      api_base: https://my-endpoint-canada-berri992.openai.azure.com/
      api_key: "os.environ/AZURE_API_KEY_CA"
      rpm: 6
  - model_name: anthropic-claude
    litellm_params:
      model: bedrock/anthropic.claude-instant-v1
      ### [OPTIONAL] SET AWS REGION ###
      aws_region_name: us-east-1
  - model_name: vllm-models
    litellm_params:
      model: openai/facebook/opt-125m # the `openai/` prefix tells litellm it's openai compatible
@@ -58,6 +67,11 @@ litellm_settings: # module level litellm settings - https://github.com/BerriAI/l
general_settings:
  master_key: sk-1234 # [OPTIONAL] Only use this if you require all calls to contain this key (Authorization: Bearer sk-1234)
```
:::info

For more provider-specific info, [go here](../providers/)

:::

#### Step 2: Start Proxy with config
@@ -188,7 +202,7 @@ print(response)
</Tabs>

## Save Model-specific params (API Base, API Keys, Temperature, Max Tokens, Seed, Organization, Headers etc.)
You can use the config to save model-specific information like api_base, api_key, temperature, max_tokens, etc.

[**All input params**](https://docs.litellm.ai/docs/completion/input#input-params-1)
@@ -210,6 +224,12 @@ model_list:
      api_key: sk-123
      api_base: https://openai-gpt-4-test-v-2.openai.azure.com/
      temperature: 0.2
  - model_name: openai-gpt-3.5
    litellm_params:
      model: openai/gpt-3.5-turbo
      api_key: sk-123
      organization: org-ikDc4ex8NB
      temperature: 0.2
  - model_name: mistral-7b
    litellm_params:
      model: ollama/mistral
@@ -226,6 +246,28 @@ model_list:
$ litellm --config /path/to/config.yaml
```

## Set Azure `base_model` for cost tracking

**Problem**: Azure returns `gpt-4` in the response when `azure/gpt-4-1106-preview` is used. This leads to inaccurate cost tracking.

**Solution** ✅: Set `base_model` on your config so litellm uses the correct model for calculating azure cost

Example config with `base_model`
```yaml
model_list:
  - model_name: azure-gpt-3.5
    litellm_params:
      model: azure/chatgpt-v-2
      api_base: os.environ/AZURE_API_BASE
      api_key: os.environ/AZURE_API_KEY
      api_version: "2023-07-01-preview"
    model_info:
      base_model: azure/gpt-4-1106-preview
```

You can view your cost once you set up [Virtual keys](https://docs.litellm.ai/docs/proxy/virtual_keys) or [custom_callbacks](https://docs.litellm.ai/docs/proxy/logging)

## Load API Keys

### Load API Keys from Environment
@@ -318,6 +360,26 @@ See supported Embedding Providers & Models [here](https://docs.litellm.ai/docs/e
#### Create Config.yaml

<Tabs>
<TabItem value="bedrock" label="Bedrock Completion/Chat">

```yaml
model_list:
  - model_name: bedrock-cohere
    litellm_params:
      model: "bedrock/cohere.command-text-v14"
      aws_region_name: "us-west-2"
  - model_name: bedrock-cohere
    litellm_params:
      model: "bedrock/cohere.command-text-v14"
      aws_region_name: "us-east-2"
  - model_name: bedrock-cohere
    litellm_params:
      model: "bedrock/cohere.command-text-v14"
      aws_region_name: "us-east-1"
```

</TabItem>

<TabItem value="sagemaker" label="Sagemaker, Bedrock Embeddings">
@@ -430,20 +492,26 @@ model_list:
</Tabs>

#### Start Proxy

```shell
litellm --config config.yaml
```

#### Make Request
Sends request to `bedrock-cohere`

```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
  "model": "bedrock-cohere",
  "messages": [
      {
      "role": "user",
      "content": "gm"
      }
  ]
}'
```
@@ -483,3 +551,55 @@ general_settings:
  max_parallel_requests: 100 # max parallel requests for a user = 100
```

## All settings

```python
{
    "environment_variables": {},
    "model_list": [
        {
            "model_name": "string",
            "litellm_params": {},
            "model_info": {
                "id": "string",
                "mode": "embedding",
                "input_cost_per_token": 0,
                "output_cost_per_token": 0,
                "max_tokens": 2048,
                "base_model": "gpt-4-1106-preview",
                "additionalProp1": {}
            }
        }
    ],
    "litellm_settings": {},  # ALL (https://github.com/BerriAI/litellm/blob/main/litellm/__init__.py)
    "general_settings": {
        "completion_model": "string",
        "key_management_system": "google_kms",  # either google_kms or azure_kms
        "master_key": "string",
        "database_url": "string",
        "database_type": "dynamo_db",
        "database_args": {
            "billing_mode": "PROVISIONED_THROUGHPUT",
            "read_capacity_units": 0,
            "write_capacity_units": 0,
            "ssl_verify": True,
            "region_name": "string",
            "user_table_name": "LiteLLM_UserTable",
            "key_table_name": "LiteLLM_VerificationToken",
            "config_table_name": "LiteLLM_Config",
            "spend_table_name": "LiteLLM_SpendLogs"
        },
        "otel": True,
        "custom_auth": "string",
        "max_parallel_requests": 0,
        "infer_model_from_keys": True,
        "background_health_checks": True,
        "health_check_interval": 300,
        "alerting": ["string"],
        "alerting_threshold": 0
    }
}
```
@@ -10,6 +10,12 @@ There's 2 ways to track cost:

By default, the response cost is accessible in the logging object via `kwargs["response_cost"]` on success (sync + async). [**Learn More**](../observability/custom_callback.md)

:::info

LiteLLM already has pricing for any model in our [model cost map](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json).

:::

## Quick Start

Register custom pricing for a sagemaker completion model.
@@ -54,7 +60,7 @@ model_list:
  - model_name: sagemaker-embedding-model
    litellm_params:
      model: sagemaker/berri-benchmarking-gpt-j-6b-fp16
      input_cost_per_second: 0.000420
```

**Step 2: Start proxy**
@@ -67,25 +73,28 @@ litellm /path/to/config.yaml

<Image img={require('../../img/spend_logs_table.png')} />

## Cost Per Token (e.g. Azure)

```python
from litellm import completion, completion_cost
import os

## set ENV variables
os.environ["AZURE_API_KEY"] = ""
os.environ["AZURE_API_BASE"] = ""
os.environ["AZURE_API_VERSION"] = ""


def test_completion_azure_model():
    try:
        print("testing azure custom pricing")
        # azure call
        response = completion(
            model = "azure/<your_deployment_name>",
            messages = [{ "content": "Hello, how are you?","role": "user"}],
            input_cost_per_token=0.005,
            output_cost_per_token=1,
        )
        # Add any assertions here to check the response
        print(response)
@@ -94,15 +103,19 @@ def test_completion_sagemaker():
    except Exception as e:
        raise Exception(f"Error occurred: {e}")

test_completion_azure_model()
```

### Usage with OpenAI Proxy Server

```yaml
model_list:
  - model_name: azure-model
    litellm_params:
      model: azure/<your_deployment_name>
      api_key: os.environ/AZURE_API_KEY
      api_base: os.environ/AZURE_API_BASE
      api_version: os.environ/AZURE_API_VERSION
      input_cost_per_token: 0.000421 # 👈 ONLY to track cost per token
      output_cost_per_token: 0.000520 # 👈 ONLY to track cost per token
```
docs/my-website/docs/proxy/debugging.md (new file, 34 lines)
@@ -0,0 +1,34 @@
# Debugging

Two levels of debugging are supported:

- debug (prints info logs)
- detailed debug (prints debug logs)

## `debug`

**via cli**

```bash
$ litellm --debug
```

**via env**

```python
os.environ["LITELLM_LOG"] = "INFO"
```

## `detailed debug`

**via cli**

```bash
$ litellm --detailed_debug
```

**via env**

```python
os.environ["LITELLM_LOG"] = "DEBUG"
```
@@ -116,6 +116,20 @@ Your OpenAI proxy server is now running on `http://0.0.0.0:4000`.
</TabItem>
</Tabs>

## Setting SSL Certification

Use this if you need to set SSL certificates for your on-prem litellm proxy.

Pass `ssl_keyfile_path` (path to the SSL keyfile) and `ssl_certfile_path` (path to the SSL certfile) when starting litellm proxy:

```shell
docker run ghcr.io/berriai/litellm:main-latest \
    --ssl_keyfile_path ssl_test/keyfile.key \
    --ssl_certfile_path ssl_test/certfile.crt
```

## Platform-specific Guide
@@ -112,7 +112,8 @@ Example Response:
```json
{
    "status": "healthy",
    "db": "connected",
    "litellm_version": "1.19.2"
}
```
@@ -121,7 +122,8 @@ Example Response:
```json
{
    "status": "healthy",
    "db": "Not connected",
    "litellm_version": "1.19.2"
}
```
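A quick way to see responses like the ones above - a sketch assuming the proxy runs locally on port 8000 and this section documents the readiness check (adjust the path if your deployment uses a different health endpoint):

```shell
curl --location 'http://0.0.0.0:8000/health/readiness'
```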
@@ -435,6 +435,7 @@ print(response)
</TabItem>
</Tabs>

## Logging Proxy Input/Output - s3 Buckets

We will use the `--config` to set
@@ -490,6 +491,34 @@ curl --location 'http://0.0.0.0:8000/chat/completions' \

Your logs should be available on the specified s3 Bucket

## Team-based Logging

Set success callbacks (e.g. langfuse) for a specific team-id.

```yaml
litellm_settings:
  default_team_settings:
    - team_id: my-secret-project
      success_callback: ["langfuse"]
      langfuse_public_key: os.environ/LANGFUSE_PUB_KEY_2
      langfuse_secret: os.environ/LANGFUSE_PRIVATE_KEY_2
    - team_id: ishaans-secret-project
      success_callback: ["langfuse"]
      langfuse_public_key: os.environ/LANGFUSE_PUB_KEY_3
      langfuse_secret: os.environ/LANGFUSE_SECRET_3
```

Now, when you [generate keys](./virtual_keys.md) for this team-id

```bash
curl -X POST 'http://0.0.0.0:8000/key/generate' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-d '{"team_id": "ishaans-secret-project"}'
```

All requests made with these keys will log data to their team-specific logging.

## Logging Proxy Input/Output - DynamoDB

We will use the `--config` to set
docs/my-website/docs/proxy/pii_masking.md (new file, 30 lines)
@@ -0,0 +1,30 @@
import Image from '@theme/IdealImage';

# PII Masking

LiteLLM supports [Microsoft Presidio](https://github.com/microsoft/presidio/) for PII masking.

## Step 1. Add env

```bash
export PRESIDIO_ANALYZER_API_BASE="http://localhost:5002"
export PRESIDIO_ANONYMIZER_API_BASE="http://localhost:5001"
```

## Step 2. Set it as a callback in config.yaml

```yaml
litellm_settings:
  callbacks: ["presidio", ...] # e.g. ["presidio", custom_callbacks.proxy_handler_instance]
```

## Start proxy

```
litellm --config /path/to/config.yaml
```

This will mask the input going to the llm provider

<Image img={require('../../img/presidio_screenshot.png')} />
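Once the proxy is running with the presidio callback, an ordinary chat request is masked before it reaches the provider - a sketch with placeholder PII:

```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
--header 'Content-Type: application/json' \
--data '{
    "model": "gpt-3.5-turbo",
    "messages": [{"role": "user", "content": "My email is jane.doe@example.com, write me a haiku"}]
}'
```

The email address is replaced by Presidio's anonymizer before the prompt is forwarded upstream; exactly which entities are detected depends on your Presidio configuration.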
@@ -8,16 +8,8 @@ Quick start CLI, Config, Docker
LiteLLM Server manages:

* **Unified Interface**: Calling 100+ LLMs [Huggingface/Bedrock/TogetherAI/etc.](#other-supported-models) in the OpenAI `ChatCompletions` & `Completions` format
* **Cost tracking**: Authentication, Spend Tracking & Budgets [Virtual Keys](https://docs.litellm.ai/docs/proxy/virtual_keys)
* **Load Balancing**: between [Multiple Models](#multiple-models---quick-start) + [Deployments of the same model](#multiple-instances-of-1-model) - LiteLLM proxy can handle 1.5k+ requests/second during load tests.

[**See LiteLLM Proxy code**](https://github.com/BerriAI/litellm/tree/main/litellm/proxy)

#### 📖 Proxy Endpoints - [Swagger Docs](https://litellm-api.up.railway.app/)

View all the supported args for the Proxy CLI [here](https://docs.litellm.ai/docs/simple_proxy#proxy-cli-arguments)

```shell
$ pip install 'litellm[proxy]'
@ -40,115 +32,6 @@ litellm --test
|
|||
|
||||
This will now automatically route any requests for gpt-3.5-turbo to bigcode starcoder, hosted on huggingface inference endpoints.
|
||||
|
||||
### Using LiteLLM Proxy - Curl Request, OpenAI Package, Langchain
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="Curl" label="Curl Request">
|
||||
|
||||
```shell
|
||||
curl --location 'http://0.0.0.0:8000/chat/completions' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data ' {
|
||||
"model": "gpt-3.5-turbo",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "what llm are you"
|
||||
}
|
||||
]
|
||||
}
|
||||
'
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="openai" label="OpenAI v1.0.0+">
|
||||
|
||||
```python
|
||||
import openai
|
||||
client = openai.OpenAI(
|
||||
api_key="anything",
|
||||
base_url="http://0.0.0.0:8000"
|
||||
)
|
||||
|
||||
# request sent to model set on litellm proxy, `litellm --model`
|
||||
response = client.chat.completions.create(model="gpt-3.5-turbo", messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "this is a test request, write a short poem"
|
||||
}
|
||||
])
|
||||
|
||||
print(response)
|
||||
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="langchain" label="Langchain">
|
||||
|
||||
```python
|
||||
from langchain.chat_models import ChatOpenAI
|
||||
from langchain.prompts.chat import (
|
||||
ChatPromptTemplate,
|
||||
HumanMessagePromptTemplate,
|
||||
SystemMessagePromptTemplate,
|
||||
)
|
||||
from langchain.schema import HumanMessage, SystemMessage
|
||||
|
||||
chat = ChatOpenAI(
|
||||
openai_api_base="http://0.0.0.0:8000", # set openai_api_base to the LiteLLM Proxy
|
||||
model = "gpt-3.5-turbo",
|
||||
temperature=0.1
|
||||
)
|
||||
|
||||
messages = [
|
||||
SystemMessage(
|
||||
content="You are a helpful assistant that im using to make a test request to."
|
||||
),
|
||||
HumanMessage(
|
||||
content="test from litellm. tell me why it's amazing in 1 sentence"
|
||||
),
|
||||
]
|
||||
response = chat(messages)
|
||||
|
||||
print(response)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="langchain-embedding" label="Langchain Embeddings">
|
||||
|
||||
```python
|
||||
from langchain.embeddings import OpenAIEmbeddings
|
||||
|
||||
embeddings = OpenAIEmbeddings(model="sagemaker-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
|
||||
|
||||
|
||||
text = "This is a test document."
|
||||
|
||||
query_result = embeddings.embed_query(text)
|
||||
|
||||
print(f"SAGEMAKER EMBEDDINGS")
|
||||
print(query_result[:5])
|
||||
|
||||
embeddings = OpenAIEmbeddings(model="bedrock-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
|
||||
|
||||
text = "This is a test document."
|
||||
|
||||
query_result = embeddings.embed_query(text)
|
||||
|
||||
print(f"BEDROCK EMBEDDINGS")
|
||||
print(query_result[:5])
|
||||
|
||||
embeddings = OpenAIEmbeddings(model="bedrock-titan-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
|
||||
|
||||
text = "This is a test document."
|
||||
|
||||
query_result = embeddings.embed_query(text)
|
||||
|
||||
print(f"TITAN EMBEDDINGS")
|
||||
print(query_result[:5])
|
||||
```
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
|
||||
### Supported LLMs
|
||||
All LiteLLM-supported LLMs are supported on the Proxy. See all [supported llms](https://docs.litellm.ai/docs/providers)
|
||||
<Tabs>
|
||||
|
@ -330,9 +213,6 @@ $ litellm --model command-nightly
|
|||
|
||||
</Tabs>
|
||||
|
||||
|
||||
|
||||
|
||||
## Quick Start - LiteLLM Proxy + Config.yaml
|
||||
The config allows you to create a model list and set `api_base`, `max_tokens` (all litellm params). See more details about the config [here](https://docs.litellm.ai/docs/proxy/configs)
|
||||
|
||||
|
@ -363,6 +243,115 @@ model_list:
|
|||
litellm --config your_config.yaml
|
||||
```
|
||||
|
||||
|
||||
## Using LiteLLM Proxy - Curl Request, OpenAI Package, Langchain
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="Curl" label="Curl Request">
|
||||
|
||||
```shell
|
||||
curl --location 'http://0.0.0.0:8000/chat/completions' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data ' {
|
||||
"model": "gpt-3.5-turbo",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "what llm are you"
|
||||
}
|
||||
]
|
||||
}
|
||||
'
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="openai" label="OpenAI v1.0.0+">
|
||||
|
||||
```python
|
||||
import openai
|
||||
client = openai.OpenAI(
|
||||
api_key="anything",
|
||||
base_url="http://0.0.0.0:8000"
|
||||
)
|
||||
|
||||
# request sent to model set on litellm proxy, `litellm --model`
|
||||
response = client.chat.completions.create(model="gpt-3.5-turbo", messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "this is a test request, write a short poem"
|
||||
}
|
||||
])
|
||||
|
||||
print(response)
|
||||
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="langchain" label="Langchain">
|
||||
|
||||
```python
|
||||
from langchain.chat_models import ChatOpenAI
|
||||
from langchain.prompts.chat import (
|
||||
ChatPromptTemplate,
|
||||
HumanMessagePromptTemplate,
|
||||
SystemMessagePromptTemplate,
|
||||
)
|
||||
from langchain.schema import HumanMessage, SystemMessage
|
||||
|
||||
chat = ChatOpenAI(
|
||||
openai_api_base="http://0.0.0.0:8000", # set openai_api_base to the LiteLLM Proxy
|
||||
model = "gpt-3.5-turbo",
|
||||
temperature=0.1
|
||||
)
|
||||
|
||||
messages = [
|
||||
SystemMessage(
|
||||
content="You are a helpful assistant that im using to make a test request to."
|
||||
),
|
||||
HumanMessage(
|
||||
content="test from litellm. tell me why it's amazing in 1 sentence"
|
||||
),
|
||||
]
|
||||
response = chat(messages)
|
||||
|
||||
print(response)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="langchain-embedding" label="Langchain Embeddings">
|
||||
|
||||
```python
|
||||
from langchain.embeddings import OpenAIEmbeddings
|
||||
|
||||
embeddings = OpenAIEmbeddings(model="sagemaker-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
|
||||
|
||||
|
||||
text = "This is a test document."
|
||||
|
||||
query_result = embeddings.embed_query(text)
|
||||
|
||||
print(f"SAGEMAKER EMBEDDINGS")
|
||||
print(query_result[:5])
|
||||
|
||||
embeddings = OpenAIEmbeddings(model="bedrock-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
|
||||
|
||||
text = "This is a test document."
|
||||
|
||||
query_result = embeddings.embed_query(text)
|
||||
|
||||
print(f"BEDROCK EMBEDDINGS")
|
||||
print(query_result[:5])
|
||||
|
||||
embeddings = OpenAIEmbeddings(model="bedrock-titan-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
|
||||
|
||||
text = "This is a test document."
|
||||
|
||||
query_result = embeddings.embed_query(text)
|
||||
|
||||
print(f"TITAN EMBEDDINGS")
|
||||
print(query_result[:5])
|
||||
```
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
[**More Info**](./configs.md)
|
||||
|
||||
|
||||
|
|
|
@ -1,9 +1,11 @@
|
|||
import Image from '@theme/IdealImage';
|
||||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
# [BETA] Admin UI
|
||||
# 🔑 [BETA] Proxy UI
|
||||
### **Create + delete keys through a UI**
|
||||
|
||||
- Track Spend Per API Key, User
|
||||
- Allow your users to create their own keys through a UI
|
||||
[Let users create their own keys](#setup-ssoauth-for-ui)
|
||||
|
||||
:::info
|
||||
|
||||
|
@ -11,61 +13,129 @@ This is in beta, so things may change. If you have feedback, [let us know](https
|
|||
|
||||
:::
|
||||
|
||||
<Image img={require('../../img/litellm_ui_create_key.png')} />
|
||||
|
||||
|
||||
|
||||
## Quick Start
|
||||
|
||||
Requirements:
|
||||
- Requires proxy master key to be set
|
||||
- Requires db connected
|
||||
|
||||
- Need an SMTP server connection to send emails (e.g. [Resend](https://resend.com/docs/send-with-smtp))
|
||||
Follow [setup](./virtual_keys.md#setup)
|
||||
|
||||
[**See code**](https://github.com/BerriAI/litellm/blob/61cd800b9ffbb02c286481d2056b65c7fb5447bf/litellm/proxy/proxy_server.py#L1782)
|
||||
### 1. Start the proxy
|
||||
```bash
|
||||
litellm --config /path/to/config.yaml
|
||||
|
||||
### Step 1. Save SMTP server credentials
|
||||
|
||||
```env
|
||||
export SMTP_HOST="my-smtp-host"
|
||||
export SMTP_USERNAME="my-smtp-username"
|
||||
export SMTP_PASSWORD="my-smtp-password"
|
||||
export SMTP_SENDER_EMAIL="krrish@berri.ai"
|
||||
#INFO: Proxy running on http://0.0.0.0:8000
|
||||
```
|
||||
|
||||
### Step 2. Enable user auth
|
||||
### 2. Go to UI
|
||||
```bash
|
||||
http://0.0.0.0:8000/ui # <proxy_base_url>/ui
|
||||
```
|
||||
|
||||
In your config.yaml,
|
||||
|
||||
## Get Admin UI Link on Swagger
|
||||
Your Proxy Swagger is available at the root of the Proxy, e.g. `http://localhost:4000/`
|
||||
|
||||
<Image img={require('../../img/ui_link.png')} />
|
||||
|
||||
## Change default username + password
|
||||
|
||||
Set the following in your .env on the Proxy
|
||||
|
||||
```shell
|
||||
UI_USERNAME=ishaan-litellm
|
||||
UI_PASSWORD=langchain
|
||||
```
|
||||
|
||||
On accessing the LiteLLM UI, you will be prompted to enter your username and password
|
||||
|
||||
|
||||
## Setup SSO/Auth for UI
|
||||
|
||||
### Step 1: Set upperbounds for keys
|
||||
Control the upper bound that users can set for `max_budget`, `budget_duration`, or any `key/generate` param per key.
|
||||
|
||||
```yaml
|
||||
general_settings:
|
||||
# other changes
|
||||
allow_user_auth: true
|
||||
litellm_settings:
|
||||
upperbound_key_generate_params:
|
||||
max_budget: 100 # upperbound of $100, for all /key/generate requests
|
||||
duration: "30d" # upperbound of 30 days for all /key/generate requests
|
||||
```
|
||||
|
||||
This will enable:
|
||||
* Users to create keys via `/key/generate` (by default, only admin can create keys)
|
||||
* The `/user/auth` endpoint to email users their login credentials (key + user id) - see the sketch below
|
||||
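For illustration, a minimal client-side sketch of requesting login credentials via `/user/auth` (the request shape here is an assumption - check your proxy's Swagger docs for the exact schema):

```python
import requests  # hypothetical client-side sketch

# Ask the proxy to email this user their login credentials (key + user id).
# Assumes a local proxy with `allow_user_auth: true` and SMTP configured.
response = requests.post(
    "http://0.0.0.0:8000/user/auth",
    json={"user_email": "user@example.com"},  # assumed request body
)
print(response.status_code, response.text)
```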
**Expected Behavior**
|
||||
|
||||
### Step 3. Connect to UI
|
||||
- Send a `/key/generate` request with `max_budget=200`
|
||||
- Key will be created with `max_budget=100` since 100 is the upper bound
|
||||
|
||||
You can use our hosted UI (https://dashboard.litellm.ai/) or [self-host your own](https://github.com/BerriAI/litellm/tree/main/ui).
|
||||
### Step 2: Set up OAuth Client
|
||||
<Tabs>
|
||||
<TabItem value="google" label="Google SSO">
|
||||
|
||||
If you self-host, you need to save the UI url in your proxy environment as `LITELLM_HOSTED_UI`.
|
||||
- Create a new Oauth 2.0 Client on https://console.cloud.google.com/
|
||||
|
||||
Connect your proxy to your UI, by entering:
|
||||
1. The hosted proxy URL
|
||||
2. Accepted email subdomains
|
||||
3. [OPTIONAL] Allowed admin emails
|
||||
**Required .env variables on your Proxy**
|
||||
```shell
|
||||
# for Google SSO Login
|
||||
GOOGLE_CLIENT_ID=
|
||||
GOOGLE_CLIENT_SECRET=
|
||||
```
|
||||
|
||||
<Image img={require('../../img/admin_dashboard.png')} />
|
||||
- Set Redirect URL on your Oauth 2.0 Client on https://console.cloud.google.com/
|
||||
- Set a redirect url = `<your proxy base url>/sso/callback`
|
||||
```shell
|
||||
https://litellm-production-7002.up.railway.app/sso/callback
|
||||
```
|
||||
|
||||
## What will users see?
|
||||
</TabItem>
|
||||
|
||||
### Auth
|
||||
<TabItem value="msft" label="Microsoft SSO">
|
||||
|
||||
<Image img={require('../../img/user_auth_screen.png')} />
|
||||
- Create a new App Registration on https://portal.azure.com/
|
||||
- Create a client Secret for your App Registration
|
||||
|
||||
### Create Keys
|
||||
**Required .env variables on your Proxy**
|
||||
```shell
|
||||
MICROSOFT_CLIENT_ID="84583a4d-"
|
||||
MICROSOFT_CLIENT_SECRET="nbk8Q~"
|
||||
MICROSOFT_TENANT="5a39737"
|
||||
```
|
||||
- Set Redirect URI on your App Registration on https://portal.azure.com/
|
||||
- Set a redirect url = `<your proxy base url>/sso/callback`
|
||||
```shell
|
||||
http://localhost:4000/sso/callback
|
||||
```
|
||||
|
||||
<Image img={require('../../img/user_create_key_screen.png')} />
|
||||
</TabItem>
|
||||
|
||||
### Spend Per Key
|
||||
</Tabs>
|
||||
|
||||
<Image img={require('../../img/spend_per_api_key.png')} />
|
||||
### Step 3. Test flow
|
||||
<Image img={require('../../img/litellm_ui_3.gif')} />
|
||||
|
||||
## Set Admin view w/ SSO
|
||||
|
||||
You just need to set the Proxy Admin ID
|
||||
|
||||
### Step 1: Copy your ID from the UI
|
||||
|
||||
<Image img={require('../../img/litellm_ui_copy_id.png')} />
|
||||
|
||||
### Step 2: Set it in your .env as the PROXY_ADMIN_ID
|
||||
|
||||
```env
|
||||
export PROXY_ADMIN_ID="116544810872468347480"
|
||||
```
|
||||
|
||||
### Step 3: See all proxy keys
|
||||
|
||||
<Image img={require('../../img/litellm_ui_admin.png')} />
|
||||
|
||||
:::info
|
||||
|
||||
If you don't see all your keys, it could be due to a cached token. Re-login and it should work.
|
||||
|
||||
:::
|
|
@ -1,7 +1,7 @@
|
|||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
# Use with Langchain, OpenAI SDK, Curl
|
||||
# Use with Langchain, OpenAI SDK, LlamaIndex, Curl
|
||||
|
||||
:::info
|
||||
|
||||
|
@ -51,6 +51,42 @@ response = client.chat.completions.create(
|
|||
print(response)
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="LlamaIndex" label="LlamaIndex">
|
||||
|
||||
```python
|
||||
import os, dotenv
|
||||
|
||||
from llama_index.llms import AzureOpenAI
|
||||
from llama_index.embeddings import AzureOpenAIEmbedding
|
||||
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
|
||||
|
||||
llm = AzureOpenAI(
|
||||
engine="azure-gpt-3.5", # model_name on litellm proxy
|
||||
temperature=0.0,
|
||||
azure_endpoint="http://0.0.0.0:4000", # litellm proxy endpoint
|
||||
api_key="sk-1234", # litellm proxy API Key
|
||||
api_version="2023-07-01-preview",
|
||||
)
|
||||
|
||||
embed_model = AzureOpenAIEmbedding(
|
||||
deployment_name="azure-embedding-model",
|
||||
azure_endpoint="http://0.0.0.0:4000",
|
||||
api_key="sk-1234",
|
||||
api_version="2023-07-01-preview",
|
||||
)
|
||||
|
||||
|
||||
documents = SimpleDirectoryReader("llama_index_data").load_data()
|
||||
service_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model)
|
||||
index = VectorStoreIndex.from_documents(documents, service_context=service_context)
|
||||
|
||||
query_engine = index.as_query_engine()
|
||||
response = query_engine.query("What did the author do growing up?")
|
||||
print(response)
|
||||
|
||||
```
|
||||
</TabItem>
|
||||
|
||||
<TabItem value="Curl" label="Curl Request">
|
||||
|
||||
Pass `metadata` as part of the request body
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import Tabs from '@theme/Tabs';
|
||||
import TabItem from '@theme/TabItem';
|
||||
|
||||
# 💰 Budgets, Rate Limits per user
|
||||
# 💰 Budgets, Rate Limits
|
||||
|
||||
Requirements:
|
||||
|
||||
|
@ -10,22 +10,72 @@ Requirements:
|
|||
|
||||
## Set Budgets
|
||||
|
||||
You can set budgets at 3 levels:
|
||||
- For the proxy
|
||||
- For a user
|
||||
- For a 'user' passed to `/chat/completions`, `/embeddings` etc
|
||||
- For a key
|
||||
|
||||
Set the `max_budget` param (in USD $) in the `/user/new` or `/key/generate` request. By default, `max_budget` is set to `null` and is not checked for keys
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="per-user" label="Per User">
|
||||
<TabItem value="proxy" label="For Proxy">
|
||||
|
||||
LiteLLM exposes a `/user/new` endpoint to create budgets for users that persist across multiple keys.
|
||||
Apply a budget across all calls on the proxy
|
||||
|
||||
**Step 1. Modify config.yaml**
|
||||
|
||||
```yaml
|
||||
general_settings:
|
||||
master_key: sk-1234
|
||||
|
||||
litellm_settings:
|
||||
# other litellm settings
|
||||
max_budget: 0 # (float) sets max budget as $0 USD
|
||||
budget_duration: 30d # (str) frequency of reset - You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
|
||||
```
|
||||
|
||||
**Step 2. Start proxy**
|
||||
|
||||
```bash
|
||||
litellm --config /path/to/config.yaml
|
||||
```
|
||||
|
||||
**Step 3. Send test call**
|
||||
|
||||
```bash
|
||||
curl --location 'http://0.0.0.0:8000/chat/completions' \
|
||||
--header 'Authorization: Bearer sk-1234' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data '{
|
||||
"model": "gpt-3.5-turbo",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "what llm are you"
|
||||
}
|
||||
]
|
||||
}'
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="per-user" label="For User">
|
||||
|
||||
Apply a budget across multiple keys.
|
||||
|
||||
LiteLLM exposes a `/user/new` endpoint to create budgets for this.
|
||||
|
||||
You can:
|
||||
- Add budgets to users [**Jump**](#add-budgets-to-users)
|
||||
- Add budget durations, to reset spend [**Jump**](#add-budget-duration-to-users)
|
||||
|
||||
By default the `max_budget` is set to `null` and is not checked for keys
|
||||
|
||||
### **Add budgets to users**
|
||||
```shell
|
||||
curl --location 'http://localhost:8000/user/new' \
|
||||
--header 'Authorization: Bearer <your-master-key>' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data-raw '{"models": ["azure-models"], "max_budget": 0, "user_id": "krrish3@berri.ai"}'
|
||||
```
|
||||
The request is a normal `/key/generate` request body + a `max_budget` field.
|
||||
|
||||
[**See Swagger**](https://litellm-api.up.railway.app/#/user%20management/new_user_user_new_post)
|
||||
|
||||
|
@ -40,9 +90,93 @@ The request is a normal `/key/generate` request body + a `max_budget` field.
|
|||
}
|
||||
```
|
||||
|
||||
### **Add budget duration to users**
|
||||
|
||||
`budget_duration`: Budget is reset at the end of the specified duration. If not set, the budget is never reset. You can set the duration as seconds ("30s"), minutes ("30m"), hours ("30h"), or days ("30d") - see the parsing sketch after the example below.
|
||||
|
||||
```
|
||||
curl 'http://0.0.0.0:8000/user/new' \
|
||||
--header 'Authorization: Bearer <your-master-key>' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data-raw '{
|
||||
"team_id": "core-infra", # [OPTIONAL]
|
||||
"max_budget": 10,
|
||||
"budget_duration": 10s,
|
||||
}'
|
||||
```
|
||||
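A minimal sketch of how such duration strings can be interpreted (assumed logic, mirroring the documented "30s"/"30m"/"30h"/"30d" format; not the proxy's actual parser):

```python
from datetime import timedelta

def parse_budget_duration(duration: str) -> timedelta:
    """Parse duration strings like '30s', '30m', '30h', '30d' into a timedelta."""
    units = {"s": "seconds", "m": "minutes", "h": "hours", "d": "days"}
    value, unit = int(duration[:-1]), duration[-1]
    if unit not in units:
        raise ValueError(f"Unsupported duration unit: {unit}")
    return timedelta(**{units[unit]: value})

print(parse_budget_duration("10s"))  # 0:00:10 - budget resets every 10 seconds
print(parse_budget_duration("30d"))  # 30 days, 0:00:00
```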
|
||||
### Create new keys for existing user
|
||||
|
||||
Now you can just call `/key/generate` with that user_id (i.e. krrish3@berri.ai) and:
|
||||
- **Budget Check**: krrish3@berri.ai's budget (i.e. $10) will be checked for this key
|
||||
- **Spend Tracking**: spend for this key will update krrish3@berri.ai's spend as well
|
||||
|
||||
```bash
|
||||
curl --location 'http://0.0.0.0:8000/key/generate' \
|
||||
--header 'Authorization: Bearer <your-master-key>' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data '{"models": ["azure-models"], "user_id": "krrish3@berri.ai"}'
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="per-key" label="Per Key">
|
||||
<TabItem value="per-user-chat" label="For 'user' passed to /chat/completions">
|
||||
|
||||
Use this to budget the `user` passed to `/chat/completions`, **without needing to create a key for every user**
|
||||
|
||||
**Step 1. Modify config.yaml**
|
||||
Define `litellm.max_user_budget`
|
||||
```yaml
|
||||
general_settings:
|
||||
master_key: sk-1234
|
||||
|
||||
litellm_settings:
|
||||
max_budget: 10 # global budget for proxy
|
||||
max_user_budget: 0.0001 # budget for 'user' passed to /chat/completions
|
||||
```
|
||||
|
||||
**Step 2. Make a `/chat/completions` call, pass 'user'** - first call works
|
||||
```shell
|
||||
curl --location 'http://0.0.0.0:4000/chat/completions' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--header 'Authorization: Bearer sk-zi5onDRdHGD24v0Zdn7VBA' \
|
||||
--data ' {
|
||||
"model": "azure-gpt-3.5",
|
||||
"user": "ishaan3",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "what time is it"
|
||||
}
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
**Step 3. Make a `/chat/completions` call, pass 'user'** - call fails, since 'ishaan3' is over budget
|
||||
```shell
|
||||
curl --location 'http://0.0.0.0:4000/chat/completions' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--header 'Authorization: Bearer sk-zi5onDRdHGD24v0Zdn7VBA' \
|
||||
--data ' {
|
||||
"model": "azure-gpt-3.5",
|
||||
"user": "ishaan3",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "what time is it"
|
||||
}
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
Error
|
||||
```shell
|
||||
{"error":{"message":"Authentication Error, ExceededBudget: User ishaan3 has exceeded their budget. Current spend: 0.0008869999999999999; Max Budget: 0.0001","type":"auth_error","param":"None","code":401}}%
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="per-key" label="For Key">
|
||||
|
||||
Apply a budget on a key.
|
||||
|
||||
You can:
|
||||
- Add budgets to keys [**Jump**](#add-budgets-to-keys)
|
||||
|
@ -53,6 +187,8 @@ You can:
|
|||
- After the key crosses its `max_budget`, requests fail
|
||||
- If duration set, spend is reset at the end of the duration
|
||||
|
||||
By default the `max_budget` is set to `null` and is not checked for keys
|
||||
|
||||
### **Add budgets to keys**
|
||||
|
||||
```bash
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Virtual Keys
|
||||
# Virtual Keys, Users
|
||||
Track Spend, Set budgets and create virtual keys for the proxy
|
||||
|
||||
Grant others temporary access to your proxy, with keys that expire after a set duration.
|
||||
|
@ -6,6 +6,7 @@ Grant other's temporary access to your proxy, with keys that expire after a set
|
|||
|
||||
:::info
|
||||
|
||||
- 🔑 [UI to Generate, Edit, Delete Keys (with SSO)](https://docs.litellm.ai/docs/proxy/ui)
|
||||
- [Deploy LiteLLM Proxy with Key Management](https://docs.litellm.ai/docs/proxy/deploy#deploy-with-database)
|
||||
- Dockerfile.database for LiteLLM Proxy + Key Management [here](https://github.com/BerriAI/litellm/blob/main/Dockerfile.database)
|
||||
|
||||
|
@ -16,8 +17,11 @@ Grant other's temporary access to your proxy, with keys that expire after a set
|
|||
|
||||
Requirements:
|
||||
|
||||
- Need to a postgres database (e.g. [Supabase](https://supabase.com/), [Neon](https://neon.tech/), etc)
|
||||
- Need a postgres database (e.g. [Supabase](https://supabase.com/), [Neon](https://neon.tech/), etc)
|
||||
- Set `DATABASE_URL=postgresql://<user>:<password>@<host>:<port>/<dbname>` in your env
|
||||
- Set a `master key`, this is your Proxy Admin key - you can use this to create other keys
|
||||
- **Set on config.yaml**: set your master key under `general_settings:master_key`, example below
|
||||
- **Set env variable**: set `LITELLM_MASTER_KEY` (**Note:** set this either on the config.yaml or in your env, whichever is more convenient for you)
|
||||
|
||||
(the proxy Dockerfile checks if the `DATABASE_URL` is set and then initializes the DB connection)
|
||||
|
||||
|
@ -81,15 +85,17 @@ curl 'http://0.0.0.0:8000/key/generate' \
|
|||
|
||||
Request Params:
|
||||
|
||||
- `models`: *list or null (optional)* - Specify the models a token has access to. If null, the token has access to all models on the server.
|
||||
- `duration`: *Optional[str]* - Specify the length of time the token is valid for. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
|
||||
- `key_alias`: *Optional[str]* - User defined key alias
|
||||
- `team_id`: *Optional[str]* - The team id of the user
|
||||
- `models`: *Optional[list]* - Model_name's a user is allowed to call. (if empty, key is allowed to call all models)
|
||||
- `aliases`: *Optional[dict]* - Any alias mappings, on top of anything in the config.yaml model list. - https://docs.litellm.ai/docs/proxy/virtual_keys#managing-auth---upgradedowngrade-models
|
||||
- `config`: *Optional[dict]* - any key-specific configs, overrides config in config.yaml
|
||||
- `spend`: *Optional[int]* - Amount spent by key. Default is 0. Will be updated by proxy whenever key is used. https://docs.litellm.ai/docs/proxy/virtual_keys#managing-auth---tracking-spend
|
||||
- `max_budget`: *Optional[float]* - Specify max budget for a given key.
|
||||
- `max_parallel_requests`: *Optional[int]* - Rate limit a user based on the number of parallel requests. Raises 429 error, if user's parallel requests > x.
|
||||
- `metadata`: *Optional[dict]* - Metadata for key, store information for key. Example metadata = {"team": "core-infra", "app": "app2", "email": "ishaan@berri.ai" }
|
||||
|
||||
- `duration`: *str or null (optional)* Specify the length of time the token is valid for. If null, default is set to 1 hour. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
|
||||
|
||||
- `metadata`: *dict or null (optional)* Pass metadata for the created token. If null defaults to {}
|
||||
|
||||
- `team_id`: *str or null (optional)* Specify team_id for the associated key
|
||||
|
||||
- `max_budget`: *float or null (optional)* Specify max budget (in Dollars $) for a given key. If no value is set, the key has no budget
|
||||
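For example, one way to send this request from Python (a sketch; assumes a local proxy with master key `sk-1234`):

```python
import requests

# Generate a key scoped by duration, team, metadata, and budget.
resp = requests.post(
    "http://0.0.0.0:8000/key/generate",
    headers={"Authorization": "Bearer sk-1234"},
    json={
        "duration": "30d",                       # key valid for 30 days
        "team_id": "core-infra",
        "metadata": {"app": "billing-service"},  # hypothetical metadata
        "max_budget": 25.0,                      # USD
    },
)
print(resp.json()["key"])
```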
|
||||
### Response
|
||||
|
||||
|
@ -97,20 +103,11 @@ Request Params:
|
|||
{
|
||||
"key": "sk-kdEXbIqZRwEeEiHwdg7sFA", # Bearer token
|
||||
"expires": "2023-11-19T01:38:25.838000+00:00" # datetime object
|
||||
"key_name": "sk-...7sFA" # abbreviated key string, ONLY stored in db if `allow_user_auth: true` set - [see](./ui.md)
|
||||
...
|
||||
}
|
||||
```
|
||||
|
||||
### Keys that don't expire
|
||||
|
||||
Just set duration to None.
|
||||
|
||||
```bash
|
||||
curl --location 'http://0.0.0.0:8000/key/generate' \
|
||||
--header 'Authorization: Bearer <your-master-key>' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data '{"models": ["azure-models"], "aliases": {"mistral-7b": "gpt-3.5-turbo"}, "duration": null}'
|
||||
```
|
||||
|
||||
### Upgrade/Downgrade Models
|
||||
|
||||
If a user is expected to use a given model (i.e. gpt3-5), and you want to:
|
||||
|
@ -285,7 +282,152 @@ Request Params:
|
|||
}
|
||||
```
|
||||
|
||||
## Set Budgets - Per Key
|
||||
## /user/new
|
||||
|
||||
### Request
|
||||
|
||||
All [/key/generate params are supported](#keygenerate) when creating a user
|
||||
```shell
|
||||
curl 'http://0.0.0.0:4000/user/new' \
|
||||
--header 'Authorization: Bearer sk-1234' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data-raw '{
|
||||
"user_id": "ishaan1",
|
||||
"user_email": "ishaan@litellm.ai",
|
||||
"user_role": "admin",
|
||||
"team_id": "cto-team",
|
||||
"max_budget": 20,
|
||||
"budget_duration": "1h"
|
||||
|
||||
}'
|
||||
```
|
||||
|
||||
Request Params:
|
||||
|
||||
- user_id: str (optional - defaults to uuid) - The unique identifier for the user.
|
||||
- user_email: str (optional - defaults to "") - The email address associated with the user.
|
||||
- user_role: str (optional - defaults to "app_user") - The role assigned to the user. Can be "admin", "app_owner", "app_user"
|
||||
|
||||
**Possible `user_role` values**
|
||||
```
|
||||
"admin" - Maintaining the proxy and owning the overall budget
|
||||
"app_owner" - employees maintaining the apps, each owner may own more than one app
|
||||
"app_user" - users who know nothing about the proxy. These users get created when you pass `user` to /chat/completions
|
||||
```
|
||||
- team_id: str (optional - defaults to "") - The identifier for the team to which the user belongs.
|
||||
- max_budget: float (optional - defaults to `null`) - The maximum budget allocated for the user. No budget checks done if `max_budget==null`
|
||||
- budget_duration: str (optional - defaults to `null`) - The duration for which the budget is valid, e.g., "1h", "1d"
|
||||
|
||||
### Response
|
||||
A key will be generated for the new user created
|
||||
|
||||
```shell
|
||||
{
|
||||
"models": [],
|
||||
"spend": 0.0,
|
||||
"max_budget": null,
|
||||
"user_id": "ishaan1",
|
||||
"team_id": null,
|
||||
"max_parallel_requests": null,
|
||||
"metadata": {},
|
||||
"tpm_limit": null,
|
||||
"rpm_limit": null,
|
||||
"budget_duration": null,
|
||||
"allowed_cache_controls": [],
|
||||
"key_alias": null,
|
||||
"duration": null,
|
||||
"aliases": {},
|
||||
"config": {},
|
||||
"key": "sk-JflB33ucTqc2NYvNAgiBCA",
|
||||
"key_name": null,
|
||||
"expires": null
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
Request Params:
|
||||
- keys: List[str] - List of keys to delete
|
||||
|
||||
### Response
|
||||
|
||||
```json
|
||||
{
|
||||
"deleted_keys": ["sk-kdEXbIqZRwEeEiHwdg7sFA"]
|
||||
}
|
||||
```
|
||||
|
||||
## Advanced
|
||||
### Upperbound /key/generate params
|
||||
Use this if you need to control the upper bound that users can set for `max_budget`, `budget_duration`, or any `/key/generate` param per key.
|
||||
|
||||
Set `litellm_settings:upperbound_key_generate_params`:
|
||||
```yaml
|
||||
litellm_settings:
|
||||
upperbound_key_generate_params:
|
||||
max_budget: 100 # upperbound of $100, for all /key/generate requests
|
||||
duration: "30d" # upperbound of 30 days for all /key/generate requests
|
||||
```
|
||||
|
||||
**Expected Behavior**
|
||||
|
||||
- Send a `/key/generate` request with `max_budget=200`
|
||||
- Key will be created with `max_budget=100` since 100 is the upper bound
|
||||
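Conceptually, the proxy clamps any requested param to its configured upper bound. A simplified sketch of that behavior (illustrative only, not the proxy's actual implementation):

```python
def apply_upperbounds(requested: dict, upperbounds: dict) -> dict:
    """Clamp numeric /key/generate params to their configured upper bounds."""
    clamped = dict(requested)
    for param, bound in upperbounds.items():
        if isinstance(bound, (int, float)) and clamped.get(param, 0) > bound:
            clamped[param] = bound  # e.g. max_budget=200 -> 100
        # string bounds like duration="30d" would be compared after parsing
    return clamped

print(apply_upperbounds({"max_budget": 200}, {"max_budget": 100}))
# {'max_budget': 100}
```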
|
||||
### Default /key/generate params
|
||||
Use this if you need to control the default `max_budget` or any `/key/generate` param per key.
|
||||
|
||||
When a `/key/generate` request does not specify `max_budget`, it will use the `max_budget` specified in `default_key_generate_params`
|
||||
|
||||
Set `litellm_settings:default_key_generate_params`:
|
||||
```yaml
|
||||
litellm_settings:
|
||||
default_key_generate_params:
|
||||
max_budget: 1.5000
|
||||
models: ["azure-gpt-3.5"]
|
||||
duration: # blank means `null`
|
||||
metadata: {"setting":"default"}
|
||||
team_id: "core-infra"
|
||||
```
|
||||
|
||||
### Restrict models by `team_id`
|
||||
`litellm-dev` can only access `azure-gpt-3.5`
|
||||
|
||||
```yaml
|
||||
litellm_settings:
|
||||
default_team_settings:
|
||||
- team_id: litellm-dev
|
||||
models: ["azure-gpt-3.5"]
|
||||
```
|
||||
|
||||
#### Create key with team_id="litellm-dev"
|
||||
```shell
|
||||
curl --location 'http://localhost:4000/key/generate' \
|
||||
--header 'Authorization: Bearer sk-1234' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--data-raw '{"team_id": "litellm-dev"}'
|
||||
```
|
||||
|
||||
#### Use Key to call invalid model - Fails
|
||||
```shell
|
||||
curl --location 'http://0.0.0.0:4000/chat/completions' \
|
||||
--header 'Content-Type: application/json' \
|
||||
--header 'Authorization: Bearer sk-qo992IjKOC2CHKZGRoJIGA' \
|
||||
--data '{
|
||||
"model": "BEDROCK_GROUP",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "hi"
|
||||
}
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
```shell
|
||||
{"error":{"message":"Invalid model for team litellm-dev: BEDROCK_GROUP. Valid models for team are: ['azure-gpt-3.5']\n\n\nTraceback (most recent call last):\n File \"/Users/ishaanjaffer/Github/litellm/litellm/proxy/proxy_server.py\", line 2298, in chat_completion\n _is_valid_team_configs(\n File \"/Users/ishaanjaffer/Github/litellm/litellm/proxy/utils.py\", line 1296, in _is_valid_team_configs\n raise Exception(\nException: Invalid model for team litellm-dev: BEDROCK_GROUP. Valid models for team are: ['azure-gpt-3.5']\n\n","type":"None","param":"None","code":500}}%
|
||||
```
|
||||
|
||||
### Set Budgets - Per Key
|
||||
|
||||
Set the `max_budget` param (in USD $) in the `/key/generate` request. By default, `max_budget` is set to `null` and is not checked for keys
|
||||
|
||||
|
@ -331,7 +473,7 @@ Expected Response from `/chat/completions` when key has crossed budget
|
|||
```
|
||||
|
||||
|
||||
## Set Budgets - Per User
|
||||
### Set Budgets - Per User
|
||||
|
||||
LiteLLM exposes a `/user/new` endpoint to create budgets for users, that persist across multiple keys.
|
||||
|
||||
|
@ -356,7 +498,7 @@ The request is a normal `/key/generate` request body + a `max_budget` field.
|
|||
}
|
||||
```
|
||||
|
||||
## Tracking Spend
|
||||
### Tracking Spend
|
||||
|
||||
You can get spend for a key by using the `/key/info` endpoint.
|
||||
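For example, fetching spend from Python (a sketch against the documented endpoint; the key and master key values are placeholders):

```python
import requests

# Look up info (including spend) for a specific key; auth with the master key.
resp = requests.get(
    "http://0.0.0.0:8000/key/info",
    params={"key": "sk-kdEXbIqZRwEeEiHwdg7sFA"},
    headers={"Authorization": "Bearer sk-1234"},
)
print(resp.json()["info"]["spend"])  # assumes the documented response shape
```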
|
||||
|
@ -391,13 +533,13 @@ This is automatically updated (in USD) when calls are made to /completions, /cha
|
|||
```
|
||||
|
||||
|
||||
## Custom Auth
|
||||
### Custom Auth
|
||||
|
||||
You can now override the default api key auth.
|
||||
|
||||
Here's how:
|
||||
|
||||
### 1. Create a custom auth file.
|
||||
#### 1. Create a custom auth file.
|
||||
|
||||
Make sure the response type follows the `UserAPIKeyAuth` pydantic object. This is used for logging usage specific to that user key.
|
||||
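A minimal sketch of such a file (the key check below is a placeholder - substitute your own validation logic):

```python
from fastapi import Request
from litellm.proxy._types import UserAPIKeyAuth

async def user_api_key_auth(request: Request, api_key: str) -> UserAPIKeyAuth:
    try:
        # placeholder check - replace with a DB lookup, JWT validation, etc.
        if api_key == "sk-my-custom-key":
            return UserAPIKeyAuth(api_key=api_key)
        raise Exception("invalid api key")
    except Exception:
        raise Exception("Authentication failed")
```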
|
||||
|
@ -414,7 +556,7 @@ async def user_api_key_auth(request: Request, api_key: str) -> UserAPIKeyAuth:
|
|||
raise Exception
|
||||
```
|
||||
|
||||
### 2. Pass the filepath (relative to the config.yaml)
|
||||
#### 2. Pass the filepath (relative to the config.yaml)
|
||||
|
||||
Pass the filepath to the config.yaml
|
||||
|
||||
|
@ -435,16 +577,16 @@ general_settings:
|
|||
|
||||
[**Implementation Code**](https://github.com/BerriAI/litellm/blob/caf2a6b279ddbe89ebd1d8f4499f65715d684851/litellm/proxy/utils.py#L122)
|
||||
|
||||
### 3. Start the proxy
|
||||
#### 3. Start the proxy
|
||||
```shell
|
||||
$ litellm --config /path/to/config.yaml
|
||||
```
|
||||
|
||||
## Custom /key/generate
|
||||
### Custom /key/generate
|
||||
|
||||
If you need to add custom logic before generating a Proxy API Key (Example Validating `team_id`)
|
||||
|
||||
### 1. Write a custom `custom_generate_key_fn`
|
||||
#### 1. Write a custom `custom_generate_key_fn`
|
||||
|
||||
|
||||
The input to the custom_generate_key_fn function is a single parameter: `data` [(Type: GenerateKeyRequest)](https://github.com/BerriAI/litellm/blob/main/litellm/proxy/_types.py#L125)
|
||||
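A minimal sketch of such a function (the team check is illustrative; the `decision`/`message` return shape follows the convention in these docs):

```python
from litellm.proxy._types import GenerateKeyRequest

async def custom_generate_key_fn(data: GenerateKeyRequest) -> dict:
    """Reject key generation for unknown teams; allow everything else."""
    team_id = data.team_id
    if team_id is not None and team_id != "core-infra":  # placeholder allow-list
        return {"decision": False, "message": f"unknown team_id: {team_id}"}
    return {"decision": True}
```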
|
@ -510,7 +652,7 @@ async def custom_generate_key_fn(data: GenerateKeyRequest)-> dict:
|
|||
```
|
||||
|
||||
|
||||
### 2. Pass the filepath (relative to the config.yaml)
|
||||
#### 2. Pass the filepath (relative to the config.yaml)
|
||||
|
||||
Pass the filepath to the config.yaml
|
||||
|
||||
|
@ -532,18 +674,18 @@ general_settings:
|
|||
|
||||
|
||||
|
||||
## [BETA] Dynamo DB
|
||||
### [BETA] Dynamo DB
|
||||
|
||||
Only live in `v1.16.21.dev1`.
|
||||
|
||||
### Step 1. Save keys to env
|
||||
#### Step 1. Save keys to env
|
||||
|
||||
```shell
|
||||
AWS_ACCESS_KEY_ID = "your-aws-access-key-id"
|
||||
AWS_SECRET_ACCESS_KEY = "your-aws-secret-access-key"
|
||||
```
|
||||
|
||||
### Step 2. Add details to config
|
||||
#### Step 2. Add details to config
|
||||
|
||||
```yaml
|
||||
general_settings:
|
||||
|
@ -560,7 +702,7 @@ general_settings:
|
|||
}
|
||||
```
|
||||
|
||||
### Step 3. Generate Key
|
||||
#### Step 3. Generate Key
|
||||
|
||||
```bash
|
||||
curl --location 'http://0.0.0.0:8000/key/generate' \
|
||||
|
|
|
@ -605,6 +605,49 @@ response = router.completion(model="gpt-3.5-turbo", messages=messages)
|
|||
print(f"response: {response}")
|
||||
```
|
||||
|
||||
## Custom Callbacks - Track API Key, API Endpoint, Model Used
|
||||
|
||||
If you need to track the api_key, api endpoint, model, custom_llm_provider used for each completion call, you can set up a [custom callback](https://docs.litellm.ai/docs/observability/custom_callback)
|
||||
|
||||
### Usage
|
||||
|
||||
```python
|
||||
import litellm
|
||||
from litellm.integrations.custom_logger import CustomLogger
|
||||
|
||||
class MyCustomHandler(CustomLogger):
|
||||
def log_success_event(self, kwargs, response_obj, start_time, end_time):
|
||||
print(f"On Success")
|
||||
print("kwargs=", kwargs)
|
||||
litellm_params= kwargs.get("litellm_params")
|
||||
api_key = litellm_params.get("api_key")
|
||||
api_base = litellm_params.get("api_base")
|
||||
custom_llm_provider= litellm_params.get("custom_llm_provider")
|
||||
response_cost = kwargs.get("response_cost")
|
||||
|
||||
# print the values
|
||||
print("api_key=", api_key)
|
||||
print("api_base=", api_base)
|
||||
print("custom_llm_provider=", custom_llm_provider)
|
||||
print("response_cost=", response_cost)
|
||||
|
||||
def log_failure_event(self, kwargs, response_obj, start_time, end_time):
|
||||
print(f"On Failure")
|
||||
print("kwargs=")
|
||||
|
||||
customHandler = MyCustomHandler()
|
||||
|
||||
litellm.callbacks = [customHandler]
|
||||
|
||||
# Init Router
|
||||
router = Router(model_list=model_list, routing_strategy="simple-shuffle")
|
||||
|
||||
# router completion call
|
||||
response = router.completion(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=[{ "role": "user", "content": "Hi who are you"}]
|
||||
)
|
||||
```
|
||||
|
||||
## Deploy Router
|
||||
|
||||
|
|
|
@ -99,6 +99,12 @@ const config = {
|
|||
position: 'left',
|
||||
label: 'Docs',
|
||||
},
|
||||
{
|
||||
sidebarId: 'tutorialSidebar',
|
||||
position: 'left',
|
||||
label: 'Enterprise',
|
||||
to: "docs/enterprise"
|
||||
},
|
||||
{
|
||||
href: 'https://github.com/BerriAI/litellm',
|
||||
label: 'GitHub',
|
||||
|
|
BIN
docs/my-website/img/admin_ui_2.png
Normal file
After Width: | Height: | Size: 159 KiB |
BIN
docs/my-website/img/google_oauth2.png
Normal file
After Width: | Height: | Size: 351 KiB |
BIN
docs/my-website/img/google_redirect.png
Normal file
After Width: | Height: | Size: 297 KiB |
BIN
docs/my-website/img/litellm_ui_3.gif
Normal file
After Width: | Height: | Size: 7.5 MiB |
BIN
docs/my-website/img/litellm_ui_admin.png
Normal file
After Width: | Height: | Size: 97 KiB |
BIN
docs/my-website/img/litellm_ui_copy_id.png
Normal file
After Width: | Height: | Size: 13 KiB |
BIN
docs/my-website/img/litellm_ui_create_key.png
Normal file
After Width: | Height: | Size: 243 KiB |
BIN
docs/my-website/img/litellm_ui_login.png
Normal file
After Width: | Height: | Size: 120 KiB |
BIN
docs/my-website/img/presidio_screenshot.png
Normal file
After Width: | Height: | Size: 205 KiB |
BIN
docs/my-website/img/spend_per_user.png
Normal file
After Width: | Height: | Size: 249 KiB |
BIN
docs/my-website/img/ui_3.gif
Normal file
After Width: | Height: | Size: 9.9 MiB |
BIN
docs/my-website/img/ui_link.png
Normal file
After Width: | Height: | Size: 69 KiB |
|
@ -98,7 +98,7 @@ const sidebars = {
|
|||
link: {
|
||||
type: 'generated-index',
|
||||
title: '💥 OpenAI Proxy Server',
|
||||
description: `Proxy Server to call 100+ LLMs in a unified interface, load balance deployments, track costs per user`,
|
||||
description: `Proxy Server to call 100+ LLMs in a unified interface & track spend, set budgets per virtual key/user`,
|
||||
slug: '/simple_proxy',
|
||||
},
|
||||
items: [
|
||||
|
@ -115,6 +115,8 @@ const sidebars = {
|
|||
"proxy/ui",
|
||||
"proxy/model_management",
|
||||
"proxy/health",
|
||||
"proxy/debugging",
|
||||
"proxy/pii_masking",
|
||||
{
|
||||
"type": "category",
|
||||
"label": "🔥 Load Balancing",
|
||||
|
@ -123,6 +125,7 @@ const sidebars = {
|
|||
"proxy/reliability",
|
||||
]
|
||||
},
|
||||
"proxy/caching",
|
||||
{
|
||||
"type": "category",
|
||||
"label": "Logging, Alerting, Caching",
|
||||
|
@ -130,7 +133,6 @@ const sidebars = {
|
|||
"proxy/logging",
|
||||
"proxy/alerting",
|
||||
"proxy/streaming_logging",
|
||||
"proxy/caching",
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
|
@ -8,6 +8,11 @@ https://github.com/BerriAI/litellm
|
|||
|
||||
## **Call 100+ LLMs using the same Input/Output Format**
|
||||
|
||||
- Translate inputs to provider's `completion`, `embedding`, and `image_generation` endpoints
|
||||
- [Consistent output](https://docs.litellm.ai/docs/completion/output), text responses will always be available at `['choices'][0]['message']['content']`
|
||||
- Retry/fallback logic across multiple deployments (e.g. Azure/OpenAI) - [Router](https://docs.litellm.ai/docs/routing)
|
||||
- Track spend & set budgets per project [OpenAI Proxy Server](https://docs.litellm.ai/docs/simple_proxy)
|
||||
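For instance, a minimal call (a sketch; assumes `OPENAI_API_KEY` is set in your env - the Basic usage section below has the full quick start):

```python
from litellm import completion

response = completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hello, how are you?"}],
)
# consistent output location, regardless of provider
print(response["choices"][0]["message"]["content"])
```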
|
||||
## Basic usage
|
||||
<a target="_blank" href="https://colab.research.google.com/github/BerriAI/litellm/blob/main/cookbook/liteLLM_Getting_Started.ipynb">
|
||||
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
|
||||
|
@ -306,30 +311,7 @@ litellm.success_callback = ["langfuse", "llmonitor"] # log input/output to langf
|
|||
response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}])
|
||||
```
|
||||
|
||||
## Calculate Costs, Usage, Latency
|
||||
|
||||
Pass the completion response to `litellm.completion_cost(completion_response=response)` and get the cost
|
||||
|
||||
```python
|
||||
from litellm import completion, completion_cost
|
||||
import os
|
||||
os.environ["OPENAI_API_KEY"] = "your-api-key"
|
||||
|
||||
response = completion(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=[{ "content": "Hello, how are you?","role": "user"}]
|
||||
)
|
||||
|
||||
cost = completion_cost(completion_response=response)
|
||||
print("Cost for completion call with gpt-3.5-turbo: ", f"${float(cost):.10f}")
|
||||
```
|
||||
|
||||
**Output**
|
||||
```shell
|
||||
Cost for completion call with gpt-3.5-turbo: $0.0000775000
|
||||
```
|
||||
|
||||
### Track Costs, Usage, Latency for streaming
|
||||
## Track Costs, Usage, Latency for streaming
|
||||
Use a callback function for this - more info on custom callbacks: https://docs.litellm.ai/docs/observability/custom_callback
|
||||
|
||||
```python
|
||||
|
@ -342,18 +324,8 @@ def track_cost_callback(
|
|||
start_time, end_time # start/end time
|
||||
):
|
||||
try:
|
||||
# check if it has collected an entire stream response
|
||||
if "complete_streaming_response" in kwargs:
|
||||
# for tracking streaming cost we pass the "messages" and the output_text to litellm.completion_cost
|
||||
completion_response=kwargs["complete_streaming_response"]
|
||||
input_text = kwargs["messages"]
|
||||
output_text = completion_response["choices"][0]["message"]["content"]
|
||||
response_cost = litellm.completion_cost(
|
||||
model = kwargs["model"],
|
||||
messages = input_text,
|
||||
completion=output_text
|
||||
)
|
||||
print("streaming response_cost", response_cost)
|
||||
response_cost = kwargs.get("response_cost", 0)
|
||||
print("streaming response_cost", response_cost)
|
||||
except:
|
||||
pass
|
||||
# set callback
|
||||
|
@ -372,13 +344,12 @@ response = completion(
|
|||
)
|
||||
```
|
||||
|
||||
|
||||
Need a dedicated key? Email us @ krrish@berri.ai
|
||||
|
||||
## OpenAI Proxy
|
||||
|
||||
Track spend across multiple projects/people
|
||||
|
||||

|
||||
|
||||
The proxy provides:
|
||||
1. [Hooks for auth](https://docs.litellm.ai/docs/proxy/virtual_keys#custom-auth)
|
||||
2. [Hooks for logging](https://docs.litellm.ai/docs/proxy/logging#step-1---create-your-custom-litellm-callback-class)
|
||||
|
@ -418,4 +389,4 @@ print(response)
|
|||
## More details
|
||||
* [exception mapping](./exception_mapping.md)
|
||||
* [retries + model fallbacks for completion()](./completion/reliable_completions.md)
|
||||
* [tutorial for model fallbacks with completion()](./tutorials/fallbacks.md)
|
||||
* [proxy virtual keys & spend management](./proxy/virtual_keys.md)
|
|
@ -1,11 +1,13 @@
|
|||
### INIT VARIABLES ###
|
||||
import threading, requests
|
||||
import threading, requests, os
|
||||
from typing import Callable, List, Optional, Dict, Union, Any
|
||||
from litellm.caching import Cache
|
||||
from litellm._logging import set_verbose, _turn_on_debug
|
||||
from litellm._logging import set_verbose, _turn_on_debug, verbose_logger
|
||||
from litellm.proxy._types import KeyManagementSystem
|
||||
import httpx
|
||||
import dotenv
|
||||
|
||||
dotenv.load_dotenv()
|
||||
#############################################
|
||||
if set_verbose == True:
|
||||
_turn_on_debug()
|
||||
|
@ -62,6 +64,9 @@ cache: Optional[
|
|||
model_alias_map: Dict[str, str] = {}
|
||||
model_group_alias_map: Dict[str, str] = {}
|
||||
max_budget: float = 0.0 # set the max budget across all providers
|
||||
budget_duration: Optional[
|
||||
str
|
||||
] = None # proxy only - resets budget after fixed duration. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
|
||||
_openai_completion_params = [
|
||||
"functions",
|
||||
"function_call",
|
||||
|
@ -140,6 +145,10 @@ model_cost_map_url: str = "https://raw.githubusercontent.com/BerriAI/litellm/mai
|
|||
suppress_debug_info = False
|
||||
dynamodb_table_name: Optional[str] = None
|
||||
s3_callback_params: Optional[Dict] = None
|
||||
default_key_generate_params: Optional[Dict] = None
|
||||
upperbound_key_generate_params: Optional[Dict] = None
|
||||
default_team_settings: Optional[List] = None
|
||||
max_user_budget: Optional[float] = None
|
||||
#### RELIABILITY ####
|
||||
request_timeout: Optional[float] = 6000
|
||||
num_retries: Optional[int] = None # per model endpoint
|
||||
|
@ -159,6 +168,19 @@ _key_management_system: Optional[KeyManagementSystem] = None
|
|||
|
||||
|
||||
def get_model_cost_map(url: str):
|
||||
if (
|
||||
os.getenv("LITELLM_LOCAL_MODEL_COST_MAP", False) == True
|
||||
or os.getenv("LITELLM_LOCAL_MODEL_COST_MAP", False) == "True"
|
||||
):
|
||||
import importlib.resources
|
||||
import json
|
||||
|
||||
with importlib.resources.open_text(
|
||||
"litellm", "model_prices_and_context_window_backup.json"
|
||||
) as f:
|
||||
content = json.load(f)
|
||||
return content
|
||||
|
||||
try:
|
||||
with requests.get(
|
||||
url, timeout=5
|
||||
|
@ -214,6 +236,7 @@ vertex_chat_models: List = []
|
|||
vertex_code_chat_models: List = []
|
||||
vertex_text_models: List = []
|
||||
vertex_code_text_models: List = []
|
||||
vertex_embedding_models: List = []
|
||||
ai21_models: List = []
|
||||
nlp_cloud_models: List = []
|
||||
aleph_alpha_models: List = []
|
||||
|
@ -243,6 +266,8 @@ for key, value in model_cost.items():
|
|||
vertex_chat_models.append(key)
|
||||
elif value.get("litellm_provider") == "vertex_ai-code-chat-models":
|
||||
vertex_code_chat_models.append(key)
|
||||
elif value.get("litellm_provider") == "vertex_ai-embedding-models":
|
||||
vertex_embedding_models.append(key)
|
||||
elif value.get("litellm_provider") == "ai21":
|
||||
ai21_models.append(key)
|
||||
elif value.get("litellm_provider") == "nlp_cloud":
|
||||
|
@ -262,6 +287,7 @@ openai_compatible_endpoints: List = [
|
|||
"api.endpoints.anyscale.com/v1",
|
||||
"api.deepinfra.com/v1/openai",
|
||||
"api.mistral.ai/v1",
|
||||
"api.together.xyz/v1",
|
||||
]
|
||||
|
||||
# this is maintained for Exception Mapping
|
||||
|
@ -271,6 +297,7 @@ openai_compatible_providers: List = [
|
|||
"deepinfra",
|
||||
"perplexity",
|
||||
"xinference",
|
||||
"together_ai",
|
||||
]
|
||||
|
||||
|
||||
|
@ -479,7 +506,10 @@ bedrock_embedding_models: List = [
|
|||
]
|
||||
|
||||
all_embedding_models = (
|
||||
open_ai_embedding_models + cohere_embedding_models + bedrock_embedding_models
|
||||
open_ai_embedding_models
|
||||
+ cohere_embedding_models
|
||||
+ bedrock_embedding_models
|
||||
+ vertex_embedding_models
|
||||
)
|
||||
|
||||
####### IMAGE GENERATION MODELS ###################
|
||||
|
@ -534,6 +564,7 @@ from .llms.bedrock import (
|
|||
AmazonAnthropicConfig,
|
||||
AmazonCohereConfig,
|
||||
AmazonLlamaConfig,
|
||||
AmazonStabilityConfig,
|
||||
)
|
||||
from .llms.openai import OpenAIConfig, OpenAITextCompletionConfig
|
||||
from .llms.azure import AzureOpenAIConfig, AzureOpenAIError
|
||||
|
|
|
@ -7,8 +7,11 @@ handler = logging.StreamHandler()
|
|||
handler.setLevel(logging.DEBUG)
|
||||
|
||||
# Create a formatter and set it for the handler
|
||||
formatter = logging.Formatter(
|
||||
"\033[92m%(asctime)s - %(name)s:%(levelname)s\033[0m: %(message)s",
|
||||
datefmt="%H:%M:%S",
|
||||
)
|
||||
|
||||
formatter = logging.Formatter("\033[92m%(name)s - %(levelname)s\033[0m: %(message)s")
|
||||
|
||||
handler.setFormatter(formatter)
|
||||
|
||||
|
|
|
@ -11,6 +11,7 @@
|
|||
import os
|
||||
import inspect
|
||||
import redis, litellm
|
||||
import redis.asyncio as async_redis
|
||||
from typing import List, Optional
|
||||
|
||||
|
||||
|
@ -67,7 +68,10 @@ def get_redis_url_from_environment():
|
|||
)
|
||||
|
||||
|
||||
def get_redis_client(**env_overrides):
|
||||
def _get_redis_client_logic(**env_overrides):
|
||||
"""
|
||||
Common functionality across sync + async redis client implementations
|
||||
"""
|
||||
### check if "os.environ/<key-name>" passed in
|
||||
for k, v in env_overrides.items():
|
||||
if isinstance(v, str) and v.startswith("os.environ/"):
|
||||
|
@ -85,9 +89,33 @@ def get_redis_client(**env_overrides):
|
|||
redis_kwargs.pop("port", None)
|
||||
redis_kwargs.pop("db", None)
|
||||
redis_kwargs.pop("password", None)
|
||||
|
||||
return redis.Redis.from_url(**redis_kwargs)
|
||||
elif "host" not in redis_kwargs or redis_kwargs["host"] is None:
|
||||
raise ValueError("Either 'host' or 'url' must be specified for redis.")
|
||||
litellm.print_verbose(f"redis_kwargs: {redis_kwargs}")
|
||||
return redis_kwargs
|
||||
|
||||
|
||||
def get_redis_client(**env_overrides):
|
||||
redis_kwargs = _get_redis_client_logic(**env_overrides)
|
||||
if "url" in redis_kwargs and redis_kwargs["url"] is not None:
|
||||
return redis.Redis.from_url(**redis_kwargs)
|
||||
return redis.Redis(**redis_kwargs)
|
||||
|
||||
|
||||
def get_redis_async_client(**env_overrides):
|
||||
redis_kwargs = _get_redis_client_logic(**env_overrides)
|
||||
if "url" in redis_kwargs and redis_kwargs["url"] is not None:
|
||||
return async_redis.Redis.from_url(**redis_kwargs)
|
||||
return async_redis.Redis(
|
||||
socket_timeout=5,
|
||||
**redis_kwargs,
|
||||
)
|
||||
|
||||
|
||||
def get_redis_connection_pool(**env_overrides):
|
||||
redis_kwargs = _get_redis_client_logic(**env_overrides)
|
||||
if "url" in redis_kwargs and redis_kwargs["url"] is not None:
|
||||
return async_redis.BlockingConnectionPool.from_url(
|
||||
timeout=5, url=redis_kwargs["url"]
|
||||
)
|
||||
return async_redis.BlockingConnectionPool(timeout=5, **redis_kwargs)
|
||||
|
|
|
@ -1,3 +1,12 @@
|
|||
# +-----------------------------------------------+
|
||||
# | |
|
||||
# | NOT PROXY BUDGET MANAGER |
|
||||
# | proxy budget manager is in proxy_server.py |
|
||||
# | |
|
||||
# +-----------------------------------------------+
|
||||
#
|
||||
# Thank you users! We ❤️ you! - Krrish & Ishaan
|
||||
|
||||
import os, json, time
|
||||
import litellm
|
||||
from litellm.utils import ModelResponse
|
||||
|
@ -16,7 +25,7 @@ class BudgetManager:
|
|||
self.client_type = client_type
|
||||
self.project_name = project_name
|
||||
self.api_base = api_base or "https://api.litellm.ai"
|
||||
self.headers = headers or {'Content-Type': 'application/json'}
|
||||
self.headers = headers or {"Content-Type": "application/json"}
|
||||
## load the data or init the initial dictionaries
|
||||
self.load_data()
|
||||
|
||||
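For context, a minimal usage sketch of this class (assumes the public `BudgetManager` API from the LiteLLM budget manager docs):

```python
from litellm import BudgetManager, completion

budget_manager = BudgetManager(project_name="test_project")

user = "user-1234"
if not budget_manager.is_valid_user(user):
    budget_manager.create_budget(total_budget=10, user=user)  # $10 budget

# make the call, then record its cost against the user's budget
response = completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
budget_manager.update_cost(completion_obj=response, user=user)

print(budget_manager.get_current_cost(user=user), "/", budget_manager.get_total_budget(user))
```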
|
|
|
@ -8,7 +8,7 @@
|
|||
# Thank you users! We ❤️ you! - Krrish & Ishaan
|
||||
|
||||
import litellm
|
||||
import time, logging
|
||||
import time, logging, asyncio
|
||||
import json, traceback, ast, hashlib
|
||||
from typing import Optional, Literal, List, Union, Any
|
||||
from openai._models import BaseModel as OpenAIObject
|
||||
|
@ -28,9 +28,18 @@ class BaseCache:
|
|||
def set_cache(self, key, value, **kwargs):
|
||||
raise NotImplementedError
|
||||
|
||||
async def async_set_cache(self, key, value, **kwargs):
|
||||
raise NotImplementedError
|
||||
|
||||
def get_cache(self, key, **kwargs):
|
||||
raise NotImplementedError
|
||||
|
||||
async def async_get_cache(self, key, **kwargs):
|
||||
raise NotImplementedError
|
||||
|
||||
async def disconnect(self):
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
class InMemoryCache(BaseCache):
|
||||
def __init__(self):
|
||||
|
@ -43,6 +52,16 @@ class InMemoryCache(BaseCache):
|
|||
if "ttl" in kwargs:
|
||||
self.ttl_dict[key] = time.time() + kwargs["ttl"]
|
||||
|
||||
async def async_set_cache(self, key, value, **kwargs):
|
||||
self.set_cache(key=key, value=value, **kwargs)
|
||||
|
||||
async def async_set_cache_pipeline(self, cache_list, ttl=None):
|
||||
for cache_key, cache_value in cache_list:
|
||||
if ttl is not None:
|
||||
self.set_cache(key=cache_key, value=cache_value, ttl=ttl)
|
||||
else:
|
||||
self.set_cache(key=cache_key, value=cache_value)
|
||||
|
||||
def get_cache(self, key, **kwargs):
|
||||
if key in self.cache_dict:
|
||||
if key in self.ttl_dict:
|
||||
|
@ -57,17 +76,26 @@ class InMemoryCache(BaseCache):
|
|||
return cached_response
|
||||
return None
|
||||
|
||||
async def async_get_cache(self, key, **kwargs):
|
||||
return self.get_cache(key=key, **kwargs)
|
||||
|
||||
def flush_cache(self):
|
||||
self.cache_dict.clear()
|
||||
self.ttl_dict.clear()
|
||||
|
||||
async def disconnect(self):
|
||||
pass
|
||||
|
||||
def delete_cache(self, key):
|
||||
self.cache_dict.pop(key, None)
|
||||
self.ttl_dict.pop(key, None)
|
||||
|
||||
|
||||
class RedisCache(BaseCache):
|
||||
def __init__(self, host=None, port=None, password=None, **kwargs):
|
||||
import redis
|
||||
# if users don't provide one, use the default litellm cache
|
||||
|
||||
# if users don't provide one, use the default litellm cache
|
||||
from ._redis import get_redis_client
|
||||
def __init__(self, host=None, port=None, password=None, **kwargs):
|
||||
from ._redis import get_redis_client, get_redis_connection_pool
|
||||
|
||||
redis_kwargs = {}
|
||||
if host is not None:
|
||||
|
@ -78,18 +106,84 @@ class RedisCache(BaseCache):
|
|||
redis_kwargs["password"] = password
|
||||
|
||||
redis_kwargs.update(kwargs)
|
||||
|
||||
self.redis_client = get_redis_client(**redis_kwargs)
|
||||
self.redis_kwargs = redis_kwargs
|
||||
self.async_redis_conn_pool = get_redis_connection_pool()
|
||||
|
||||
def init_async_client(self):
|
||||
from ._redis import get_redis_async_client
|
||||
|
||||
return get_redis_async_client(
|
||||
connection_pool=self.async_redis_conn_pool, **self.redis_kwargs
|
||||
)
|
||||
|
||||
def set_cache(self, key, value, **kwargs):
|
||||
ttl = kwargs.get("ttl", None)
|
||||
print_verbose(f"Set Redis Cache: key: {key}\nValue {value}")
|
||||
print_verbose(f"Set Redis Cache: key: {key}\nValue {value}\nttl={ttl}")
|
||||
try:
|
||||
self.redis_client.set(name=key, value=str(value), ex=ttl)
|
||||
except Exception as e:
|
||||
# NON blocking - notify users Redis is throwing an exception
|
||||
logging.debug("LiteLLM Caching: set() - Got exception from REDIS : ", e)
|
||||
|
||||
async def async_set_cache(self, key, value, **kwargs):
|
||||
_redis_client = self.init_async_client()
|
||||
async with _redis_client as redis_client:
|
||||
ttl = kwargs.get("ttl", None)
|
||||
print_verbose(
|
||||
f"Set ASYNC Redis Cache: key: {key}\nValue {value}\nttl={ttl}"
|
||||
)
|
||||
try:
|
||||
await redis_client.set(name=key, value=json.dumps(value), ex=ttl)
|
||||
except Exception as e:
|
||||
# NON blocking - notify users Redis is throwing an exception
|
||||
logging.debug("LiteLLM Caching: set() - Got exception from REDIS : ", e)
|
||||
|
||||
async def async_set_cache_pipeline(self, cache_list, ttl=None):
|
||||
"""
|
||||
Use Redis Pipelines for bulk write operations
|
||||
"""
|
||||
_redis_client = self.init_async_client()
|
||||
try:
|
||||
async with _redis_client as redis_client:
|
||||
async with redis_client.pipeline(transaction=True) as pipe:
|
||||
# Iterate through each key-value pair in the cache_list and set them in the pipeline.
|
||||
for cache_key, cache_value in cache_list:
|
||||
print_verbose(
|
||||
f"Set ASYNC Redis Cache PIPELINE: key: {cache_key}\nValue {cache_value}\nttl={ttl}"
|
||||
)
|
||||
# Set the value with a TTL if it's provided.
|
||||
if ttl is not None:
|
||||
pipe.setex(cache_key, ttl, json.dumps(cache_value))
|
||||
else:
|
||||
pipe.set(cache_key, json.dumps(cache_value))
|
||||
# Execute the pipeline and return the results.
|
||||
results = await pipe.execute()
|
||||
|
||||
print_verbose(f"pipeline results: {results}")
|
||||
# Optionally, you could process 'results' to make sure that all set operations were successful.
|
||||
return results
|
||||
except Exception as e:
|
||||
print_verbose(f"Error occurred in pipeline write - {str(e)}")
|
||||
# NON blocking - notify users Redis is throwing an exception
|
||||
logging.debug("LiteLLM Caching: set() - Got exception from REDIS : ", e)
|
||||
|
||||
def _get_cache_logic(self, cached_response: Any):
|
||||
"""
|
||||
Common 'get_cache_logic' across sync + async redis client implementations
|
||||
"""
|
||||
if cached_response is None:
|
||||
return cached_response
|
||||
# cached_response is bytes - decode it, then convert to a dict
|
||||
cached_response = cached_response.decode("utf-8") # Convert bytes to string
|
||||
try:
|
||||
cached_response = json.loads(
|
||||
cached_response
|
||||
) # Convert string to dictionary
|
||||
except:
|
||||
cached_response = ast.literal_eval(cached_response)
|
||||
return cached_response
|
||||
|
||||
def get_cache(self, key, **kwargs):
|
||||
try:
|
||||
print_verbose(f"Get Redis Cache: key: {key}")
|
||||
|
@ -97,26 +191,361 @@ class RedisCache(BaseCache):
|
|||
print_verbose(
|
||||
f"Got Redis Cache: key: {key}, cached_response {cached_response}"
|
||||
)
|
||||
if cached_response != None:
|
||||
# cached_response is bytes - decode it, then convert to a dict
|
||||
cached_response = cached_response.decode(
|
||||
"utf-8"
|
||||
) # Convert bytes to string
|
||||
try:
|
||||
cached_response = json.loads(
|
||||
cached_response
|
||||
) # Convert string to dictionary
|
||||
except:
|
||||
cached_response = ast.literal_eval(cached_response)
|
||||
return cached_response
|
||||
return self._get_cache_logic(cached_response=cached_response)
|
||||
except Exception as e:
|
||||
# NON blocking - notify users Redis is throwing an exception
|
||||
traceback.print_exc()
|
||||
logging.debug("LiteLLM Caching: get() - Got exception from REDIS: ", e)
|
||||
|
||||
async def async_get_cache(self, key, **kwargs):
|
||||
_redis_client = self.init_async_client()
|
||||
async with _redis_client as redis_client:
|
||||
try:
|
||||
print_verbose(f"Get Redis Cache: key: {key}")
|
||||
cached_response = await redis_client.get(key)
|
||||
print_verbose(
|
||||
f"Got Async Redis Cache: key: {key}, cached_response {cached_response}"
|
||||
)
|
||||
response = self._get_cache_logic(cached_response=cached_response)
|
||||
return response
|
||||
except Exception as e:
|
||||
# NON blocking - notify users Redis is throwing an exception
|
||||
traceback.print_exc()
|
||||
logging.debug("LiteLLM Caching: get() - Got exception from REDIS: ", e)
|
||||
|
||||
def flush_cache(self):
|
||||
self.redis_client.flushall()
|
||||
|
||||
async def disconnect(self):
|
||||
pass
|
||||
|
||||
def delete_cache(self, key):
|
||||
self.redis_client.delete(key)
|
||||
|
||||
|
||||
class RedisSemanticCache(BaseCache):
    def __init__(
        self,
        host=None,
        port=None,
        password=None,
        redis_url=None,
        similarity_threshold=None,
        use_async=False,
        embedding_model="text-embedding-ada-002",
        **kwargs,
    ):
        from redisvl.index import SearchIndex
        from redisvl.query import VectorQuery

        print_verbose(
            "redis semantic-cache initializing INDEX - litellm_semantic_cache_index"
        )
        if similarity_threshold is None:
            raise Exception("similarity_threshold must be provided, passed None")
        self.similarity_threshold = similarity_threshold
        self.embedding_model = embedding_model
        schema = {
            "index": {
                "name": "litellm_semantic_cache_index",
                "prefix": "litellm",
                "storage_type": "hash",
            },
            "fields": {
                # both the cached response and the prompt are text fields; they
                # must share one "text" entry (duplicate dict keys would silently
                # drop the first field)
                "text": [{"name": "response"}, {"name": "prompt"}],
                "vector": [
                    {
                        "name": "litellm_embedding",
                        "dims": 1536,
                        "distance_metric": "cosine",
                        "algorithm": "flat",
                        "datatype": "float32",
                    }
                ],
            },
        }
        if redis_url is None:
            # if no url passed, check if host, port and password are passed, if not raise an Exception
            if host is None or port is None or password is None:
                # try checking env for host, port and password
                import os

                host = os.getenv("REDIS_HOST")
                port = os.getenv("REDIS_PORT")
                password = os.getenv("REDIS_PASSWORD")
                if host is None or port is None or password is None:
                    raise Exception("Redis host, port, and password must be provided")

            redis_url = "redis://:" + password + "@" + host + ":" + port
        print_verbose(f"redis semantic-cache redis_url: {redis_url}")
        if use_async == False:
            self.index = SearchIndex.from_dict(schema)
            self.index.connect(redis_url=redis_url)
            try:
                self.index.create(overwrite=False)  # don't overwrite existing index
            except Exception as e:
                print_verbose(f"Got exception creating semantic cache index: {str(e)}")
        elif use_async == True:
            schema["index"]["name"] = "litellm_semantic_cache_index_async"
            self.index = SearchIndex.from_dict(schema)
            self.index.connect(redis_url=redis_url, use_async=True)

    #
    def _get_cache_logic(self, cached_response: Any):
        """
        Common 'get_cache_logic' across sync + async redis client implementations
        """
        if cached_response is None:
            return cached_response

        # check if cached_response is bytes
        if isinstance(cached_response, bytes):
            cached_response = cached_response.decode("utf-8")

        try:
            cached_response = json.loads(
                cached_response
            )  # Convert string to dictionary
        except:
            cached_response = ast.literal_eval(cached_response)
        return cached_response

    def set_cache(self, key, value, **kwargs):
        import numpy as np

        print_verbose(f"redis semantic-cache set_cache, kwargs: {kwargs}")

        # get the prompt
        messages = kwargs["messages"]
        prompt = ""
        for message in messages:
            prompt += message["content"]

        # create an embedding for prompt
        embedding_response = litellm.embedding(
            model=self.embedding_model,
            input=prompt,
            cache={"no-store": True, "no-cache": True},
        )

        # get the embedding
        embedding = embedding_response["data"][0]["embedding"]

        # make the embedding a numpy array, convert to bytes
        embedding_bytes = np.array(embedding, dtype=np.float32).tobytes()
        value = str(value)
        assert isinstance(value, str)

        new_data = [
            {"response": value, "prompt": prompt, "litellm_embedding": embedding_bytes}
        ]

        # Add more data
        keys = self.index.load(new_data)

        return

    def get_cache(self, key, **kwargs):
        print_verbose(f"sync redis semantic-cache get_cache, kwargs: {kwargs}")
        from redisvl.query import VectorQuery
        import numpy as np

        # query

        # get the messages
        messages = kwargs["messages"]
        prompt = ""
        for message in messages:
            prompt += message["content"]

        # convert to embedding
        embedding_response = litellm.embedding(
            model=self.embedding_model,
            input=prompt,
            cache={"no-store": True, "no-cache": True},
        )

        # get the embedding
        embedding = embedding_response["data"][0]["embedding"]

        query = VectorQuery(
            vector=embedding,
            vector_field_name="litellm_embedding",
            return_fields=["response", "prompt", "vector_distance"],
            num_results=1,
        )

        results = self.index.query(query)
        if results == None:
            return None
        if isinstance(results, list):
            if len(results) == 0:
                return None

        vector_distance = results[0]["vector_distance"]
        vector_distance = float(vector_distance)
        similarity = 1 - vector_distance
        cached_prompt = results[0]["prompt"]

        # check similarity, if more than self.similarity_threshold, return results
        print_verbose(
            f"semantic cache: similarity threshold: {self.similarity_threshold}, similarity: {similarity}, prompt: {prompt}, closest_cached_prompt: {cached_prompt}"
        )
        if similarity > self.similarity_threshold:
            # cache hit !
            cached_value = results[0]["response"]
            print_verbose(
                f"got a cache hit, similarity: {similarity}, Current prompt: {prompt}, cached_prompt: {cached_prompt}"
            )
            return self._get_cache_logic(cached_response=cached_value)
        else:
            # cache miss !
            return None

        pass

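The hit test above works on the cosine distance redisvl returns: similarity = 1 - vector_distance, so a threshold of 0.8 accepts any cached prompt within 0.2 cosine distance of the new one. A tiny illustrative check (numbers are made up):

    # Illustrative numbers only: distance 0.15 passes a 0.8 threshold, 0.35 fails.
    similarity_threshold = 0.8
    for vector_distance in (0.15, 0.35):
        similarity = 1 - vector_distance
        print(vector_distance, similarity > similarity_threshold)  # True, then False
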
    async def async_set_cache(self, key, value, **kwargs):
        import numpy as np
        from litellm.proxy.proxy_server import llm_router, llm_model_list

        try:
            await self.index.acreate(overwrite=False)  # don't overwrite existing index
        except Exception as e:
            print_verbose(f"Got exception creating semantic cache index: {str(e)}")
        print_verbose(f"async redis semantic-cache set_cache, kwargs: {kwargs}")

        # get the prompt
        messages = kwargs["messages"]
        prompt = ""
        for message in messages:
            prompt += message["content"]
        # create an embedding for prompt
        router_model_names = (
            [m["model_name"] for m in llm_model_list]
            if llm_model_list is not None
            else []
        )
        if llm_router is not None and self.embedding_model in router_model_names:
            user_api_key = kwargs.get("metadata", {}).get("user_api_key", "")
            embedding_response = await llm_router.aembedding(
                model=self.embedding_model,
                input=prompt,
                cache={"no-store": True, "no-cache": True},
                metadata={
                    "user_api_key": user_api_key,
                    "semantic-cache-embedding": True,
                    "trace_id": kwargs.get("metadata", {}).get("trace_id", None),
                },
            )
        else:
            # convert to embedding
            embedding_response = await litellm.aembedding(
                model=self.embedding_model,
                input=prompt,
                cache={"no-store": True, "no-cache": True},
            )

        # get the embedding
        embedding = embedding_response["data"][0]["embedding"]

        # make the embedding a numpy array, convert to bytes
        embedding_bytes = np.array(embedding, dtype=np.float32).tobytes()
        value = str(value)
        assert isinstance(value, str)

        new_data = [
            {"response": value, "prompt": prompt, "litellm_embedding": embedding_bytes}
        ]

        # Add more data
        keys = await self.index.aload(new_data)
        return

    async def async_get_cache(self, key, **kwargs):
        print_verbose(f"async redis semantic-cache get_cache, kwargs: {kwargs}")
        from redisvl.query import VectorQuery
        import numpy as np
        from litellm.proxy.proxy_server import llm_router, llm_model_list

        # query

        # get the messages
        messages = kwargs["messages"]
        prompt = ""
        for message in messages:
            prompt += message["content"]

        router_model_names = (
            [m["model_name"] for m in llm_model_list]
            if llm_model_list is not None
            else []
        )
        if llm_router is not None and self.embedding_model in router_model_names:
            user_api_key = kwargs.get("metadata", {}).get("user_api_key", "")
            embedding_response = await llm_router.aembedding(
                model=self.embedding_model,
                input=prompt,
                cache={"no-store": True, "no-cache": True},
                metadata={
                    "user_api_key": user_api_key,
                    "semantic-cache-embedding": True,
                    "trace_id": kwargs.get("metadata", {}).get("trace_id", None),
                },
            )
        else:
            # convert to embedding
            embedding_response = await litellm.aembedding(
                model=self.embedding_model,
                input=prompt,
                cache={"no-store": True, "no-cache": True},
            )

        # get the embedding
        embedding = embedding_response["data"][0]["embedding"]

        query = VectorQuery(
            vector=embedding,
            vector_field_name="litellm_embedding",
            return_fields=["response", "prompt", "vector_distance"],
        )
        results = await self.index.aquery(query)
        if results == None:
            kwargs.setdefault("metadata", {})["semantic-similarity"] = 0.0
            return None
        if isinstance(results, list):
            if len(results) == 0:
                kwargs.setdefault("metadata", {})["semantic-similarity"] = 0.0
                return None

        vector_distance = results[0]["vector_distance"]
        vector_distance = float(vector_distance)
        similarity = 1 - vector_distance
        cached_prompt = results[0]["prompt"]

        # check similarity, if more than self.similarity_threshold, return results
        print_verbose(
            f"semantic cache: similarity threshold: {self.similarity_threshold}, similarity: {similarity}, prompt: {prompt}, closest_cached_prompt: {cached_prompt}"
        )

        # update kwargs["metadata"] with similarity, don't rewrite the original metadata
        kwargs.setdefault("metadata", {})["semantic-similarity"] = similarity

        if similarity > self.similarity_threshold:
            # cache hit !
            cached_value = results[0]["response"]
            print_verbose(
                f"got a cache hit, similarity: {similarity}, Current prompt: {prompt}, cached_prompt: {cached_prompt}"
            )
            return self._get_cache_logic(cached_response=cached_value)
        else:
            # cache miss !
            return None
        pass

    async def _index_info(self):
        return await self.index.ainfo()


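For context on how this class gets wired in, a minimal, hedged sketch of enabling the semantic cache from application code follows; the connection values and the 0.8 threshold are illustrative placeholders, not values from this diff:

    import litellm

    # Sketch: route completion calls through the redis-semantic cache.
    litellm.cache = litellm.Cache(
        type="redis-semantic",
        host="localhost",          # placeholder
        port="6379",               # placeholder
        password="my-password",    # placeholder
        similarity_threshold=0.8,  # a hit needs 1 - cosine distance > 0.8
        redis_semantic_cache_embedding_model="text-embedding-ada-002",
    )

    response = litellm.completion(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "What is the capital of France?"}],
    )
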
class S3Cache(BaseCache):
    def __init__(
@ -195,6 +624,9 @@ class S3Cache(BaseCache):
            # NON blocking - notify users S3 is throwing an exception
            print_verbose(f"S3 Caching: set_cache() - Got exception from S3: {e}")

    async def async_set_cache(self, key, value, **kwargs):
        self.set_cache(key=key, value=value, **kwargs)

    def get_cache(self, key, **kwargs):
        import boto3, botocore

@ -237,9 +669,15 @@ class S3Cache(BaseCache):
            traceback.print_exc()
            print_verbose(f"S3 Caching: get_cache() - Got exception from S3: {e}")

    async def async_get_cache(self, key, **kwargs):
        return self.get_cache(key=key, **kwargs)

    def flush_cache(self):
        pass

    async def disconnect(self):
        pass


class DualCache(BaseCache):
    """
@ -304,15 +742,22 @@ class DualCache(BaseCache):
        if self.redis_cache is not None:
            self.redis_cache.flush_cache()

    def delete_cache(self, key):
        if self.in_memory_cache is not None:
            self.in_memory_cache.delete_cache(key)
        if self.redis_cache is not None:
            self.redis_cache.delete_cache(key)


#### LiteLLM.Completion / Embedding Cache ####
class Cache:
    def __init__(
        self,
        type: Optional[Literal["local", "redis", "s3"]] = "local",
        type: Optional[Literal["local", "redis", "redis-semantic", "s3"]] = "local",
        host: Optional[str] = None,
        port: Optional[str] = None,
        password: Optional[str] = None,
        similarity_threshold: Optional[float] = None,
        supported_call_types: Optional[
            List[Literal["completion", "acompletion", "embedding", "aembedding"]]
        ] = ["completion", "acompletion", "embedding", "aembedding"],
@ -327,16 +772,20 @@ class Cache:
        s3_aws_secret_access_key: Optional[str] = None,
        s3_aws_session_token: Optional[str] = None,
        s3_config: Optional[Any] = None,
        redis_semantic_cache_use_async=False,
        redis_semantic_cache_embedding_model="text-embedding-ada-002",
        **kwargs,
    ):
        """
        Initializes the cache based on the given type.

        Args:
            type (str, optional): The type of cache to initialize. Can be "local" or "redis". Defaults to "local".
            type (str, optional): The type of cache to initialize. Can be "local", "redis", "redis-semantic", or "s3". Defaults to "local".
            host (str, optional): The host address for the Redis cache. Required if type is "redis".
            port (int, optional): The port number for the Redis cache. Required if type is "redis".
            password (str, optional): The password for the Redis cache. Required if type is "redis".
            similarity_threshold (float, optional): The similarity threshold for semantic caching. Required if type is "redis-semantic".

            supported_call_types (list, optional): List of call types to cache for. Defaults to cache == on for all call types.
            **kwargs: Additional keyword arguments for redis.Redis() cache

@ -348,9 +797,19 @@ class Cache:
        """
        if type == "redis":
            self.cache: BaseCache = RedisCache(host, port, password, **kwargs)
        if type == "local":
        elif type == "redis-semantic":
            self.cache = RedisSemanticCache(
                host,
                port,
                password,
                similarity_threshold=similarity_threshold,
                use_async=redis_semantic_cache_use_async,
                embedding_model=redis_semantic_cache_embedding_model,
                **kwargs,
            )
        elif type == "local":
            self.cache = InMemoryCache()
        if type == "s3":
        elif type == "s3":
            self.cache = S3Cache(
                s3_bucket_name=s3_bucket_name,
                s3_region_name=s3_region_name,
@ -476,6 +935,45 @@ class Cache:
        }
        time.sleep(0.02)

    def _get_cache_logic(
        self,
        cached_result: Optional[Any],
        max_age: Optional[float],
    ):
        """
        Common get cache logic across sync + async implementations
        """
        # Check if a timestamp was stored with the cached response
        if (
            cached_result is not None
            and isinstance(cached_result, dict)
            and "timestamp" in cached_result
        ):
            timestamp = cached_result["timestamp"]
            current_time = time.time()

            # Calculate age of the cached response
            response_age = current_time - timestamp

            # Check if the cached response is older than the max-age
            if max_age is not None and response_age > max_age:
                return None  # Cached response is too old

            # If the response is fresh, or there's no max-age requirement, return the cached response
            # cached_response is in `b"{}"` form - convert it back to a ModelResponse dict
            cached_response = cached_result.get("response")
            try:
                if isinstance(cached_response, dict):
                    pass
                else:
                    cached_response = json.loads(
                        cached_response  # type: ignore
                    )  # Convert string to dictionary
            except:
                cached_response = ast.literal_eval(cached_response)  # type: ignore
            return cached_response
        return cached_result

    def get_cache(self, *args, **kwargs):
        """
        Retrieves the cached result for the given arguments.
@ -488,6 +986,7 @@ class Cache:
            The cached result if it exists, otherwise None.
        """
        try:  # never block execution
            messages = kwargs.get("messages", [])
            if "cache_key" in kwargs:
                cache_key = kwargs["cache_key"]
            else:
@ -497,55 +996,44 @@ class Cache:
                max_age = cache_control_args.get(
                    "s-max-age", cache_control_args.get("s-maxage", float("inf"))
                )
                cached_result = self.cache.get_cache(cache_key)
                # Check if a timestamp was stored with the cached response
                if (
                    cached_result is not None
                    and isinstance(cached_result, dict)
                    and "timestamp" in cached_result
                    and max_age is not None
                ):
                    timestamp = cached_result["timestamp"]
                    current_time = time.time()

                    # Calculate age of the cached response
                    response_age = current_time - timestamp

                    # Check if the cached response is older than the max-age
                    if response_age > max_age:
                        print_verbose(
                            f"Cached response for key {cache_key} is too old. Max-age: {max_age}s, Age: {response_age}s"
                        )
                        return None  # Cached response is too old

                    # If the response is fresh, or there's no max-age requirement, return the cached response
                    # cached_response is in `b"{}"` form - convert it back to a ModelResponse dict
                    cached_response = cached_result.get("response")
                    try:
                        if isinstance(cached_response, dict):
                            pass
                        else:
                            cached_response = json.loads(
                                cached_response
                            )  # Convert string to dictionary
                    except:
                        cached_response = ast.literal_eval(cached_response)
                    return cached_response
                return cached_result
                cached_result = self.cache.get_cache(cache_key, messages=messages)
                return self._get_cache_logic(
                    cached_result=cached_result, max_age=max_age
                )
        except Exception as e:
            print_verbose(f"An exception occurred: {traceback.format_exc()}")
            return None

    def add_cache(self, result, *args, **kwargs):
    async def async_get_cache(self, *args, **kwargs):
        """
        Adds a result to the cache.
        Async get cache implementation.

        Args:
            *args: args to litellm.completion() or embedding()
            **kwargs: kwargs to litellm.completion() or embedding()
        Used for embedding calls in async wrapper
        """
        try:  # never block execution
            messages = kwargs.get("messages", [])
            if "cache_key" in kwargs:
                cache_key = kwargs["cache_key"]
            else:
                cache_key = self.get_cache_key(*args, **kwargs)
            if cache_key is not None:
                cache_control_args = kwargs.get("cache", {})
                max_age = cache_control_args.get(
                    "s-max-age", cache_control_args.get("s-maxage", float("inf"))
                )
                cached_result = await self.cache.async_get_cache(
                    cache_key, *args, **kwargs
                )
                return self._get_cache_logic(
                    cached_result=cached_result, max_age=max_age
                )
        except Exception as e:
            print_verbose(f"An exception occurred: {traceback.format_exc()}")
            return None

        Returns:
            None
    def _add_cache_logic(self, result, *args, **kwargs):
        """
        Common implementation across sync + async add_cache functions
        """
        try:
            if "cache_key" in kwargs:
@ -564,14 +1052,82 @@ class Cache:
                        if k == "ttl":
                            kwargs["ttl"] = v
                cached_data = {"timestamp": time.time(), "response": result}
                self.cache.set_cache(cache_key, cached_data, **kwargs)
                return cache_key, cached_data, kwargs
            else:
                raise Exception("cache key is None")
        except Exception as e:
            raise e

    def add_cache(self, result, *args, **kwargs):
        """
        Adds a result to the cache.

        Args:
            *args: args to litellm.completion() or embedding()
            **kwargs: kwargs to litellm.completion() or embedding()

        Returns:
            None
        """
        try:
            cache_key, cached_data, kwargs = self._add_cache_logic(
                result=result, *args, **kwargs
            )
            self.cache.set_cache(cache_key, cached_data, **kwargs)
        except Exception as e:
            print_verbose(f"LiteLLM Cache: Exception add_cache: {str(e)}")
            traceback.print_exc()
            pass

    async def _async_add_cache(self, result, *args, **kwargs):
        self.add_cache(result, *args, **kwargs)
    async def async_add_cache(self, result, *args, **kwargs):
        """
        Async implementation of add_cache
        """
        try:
            cache_key, cached_data, kwargs = self._add_cache_logic(
                result=result, *args, **kwargs
            )
            await self.cache.async_set_cache(cache_key, cached_data, **kwargs)
        except Exception as e:
            print_verbose(f"LiteLLM Cache: Exception add_cache: {str(e)}")
            traceback.print_exc()

    async def async_add_cache_pipeline(self, result, *args, **kwargs):
        """
        Async implementation of add_cache for Embedding calls

        Does a bulk write, to prevent using too many clients
        """
        try:
            cache_list = []
            for idx, i in enumerate(kwargs["input"]):
                preset_cache_key = litellm.cache.get_cache_key(
                    *args, **{**kwargs, "input": i}
                )
                kwargs["cache_key"] = preset_cache_key
                embedding_response = result.data[idx]
                cache_key, cached_data, kwargs = self._add_cache_logic(
                    result=embedding_response,
                    *args,
                    **kwargs,
                )
                cache_list.append((cache_key, cached_data))
            if hasattr(self.cache, "async_set_cache_pipeline"):
                await self.cache.async_set_cache_pipeline(cache_list=cache_list)
            else:
                tasks = []
                for val in cache_list:
                    # write each (key, data) pair from the list, not the
                    # leaked loop variables from the enumerate loop above
                    tasks.append(
                        self.cache.async_set_cache(val[0], val[1], **kwargs)
                    )
                await asyncio.gather(*tasks)
        except Exception as e:
            print_verbose(f"LiteLLM Cache: Exception add_cache: {str(e)}")
            traceback.print_exc()

    async def disconnect(self):
        if hasattr(self.cache, "disconnect"):
            await self.cache.disconnect()


def enable_cache(

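To tie together the cache-control handling in `get_cache`/`async_get_cache` above, a hedged usage sketch; the model name and 300s budget are placeholders, and `litellm.cache` is assumed to be configured already. Entries older than `s-maxage` are treated as misses:

    import litellm

    # Sketch: per-request freshness control via the cache dict.
    response = litellm.completion(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "hello"}],
        cache={"s-maxage": 300},  # reject cached responses older than 5 minutes
    )
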
@ -63,6 +63,22 @@ class CustomLogger: # https://docs.litellm.ai/docs/observability/custom_callbac
    ):
        pass

    async def async_post_call_streaming_hook(
        self, original_exception: Exception, user_api_key_dict: UserAPIKeyAuth
    ):
        """
        Returns the streaming chunk before it's returned to the user
        """
        pass

    async def async_post_call_success_hook(
        self, original_exception: Exception, user_api_key_dict: UserAPIKeyAuth
    ):
        """
        Returns the llm response before it's returned to the user
        """
        pass

    #### SINGLE-USE #### - https://docs.litellm.ai/docs/observability/custom_callback#using-your-custom-callback-function

    def log_input_event(self, model, messages, kwargs, print_verbose, callback_func):

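To make the new hook surface concrete, a minimal sketch of a proxy callback overriding the success hook; the class name is illustrative and the signature follows the stub above:

    from litellm.integrations.custom_logger import CustomLogger

    class MyProxyHandler(CustomLogger):
        # Sketch only: inspect the response before the proxy returns it.
        async def async_post_call_success_hook(
            self, original_exception, user_api_key_dict
        ):
            print(f"call completed for key: {user_api_key_dict}")

    proxy_handler_instance = MyProxyHandler()
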
@ -2,6 +2,7 @@
# On success, logs events to Helicone
import dotenv, os
import requests
import litellm

dotenv.load_dotenv()  # Loading env variables using dotenv
import traceback
@ -56,6 +57,10 @@ class HeliconeLogger:
            else "gpt-3.5-turbo"
        )
        provider_request = {"model": model, "messages": messages}
        if isinstance(response_obj, litellm.EmbeddingResponse) or isinstance(
            response_obj, litellm.ModelResponse
        ):
            response_obj = response_obj.json()

        if "claude" in model:
            provider_request, response_obj = self.claude_mapping(

@ -9,11 +9,12 @@ dotenv.load_dotenv() # Loading env variables using dotenv
import traceback
from packaging.version import Version
from litellm._logging import verbose_logger
import litellm


class LangFuseLogger:
    # Class variables or attributes
    def __init__(self):
    def __init__(self, langfuse_public_key=None, langfuse_secret=None):
        try:
            from langfuse import Langfuse
        except Exception as e:
@ -21,8 +22,8 @@ class LangFuseLogger:
                f"\033[91mLangfuse not installed, try running 'pip install langfuse' to fix this error: {e}\033[0m"
            )
        # Instance variables
        self.secret_key = os.getenv("LANGFUSE_SECRET_KEY")
        self.public_key = os.getenv("LANGFUSE_PUBLIC_KEY")
        self.secret_key = langfuse_secret or os.getenv("LANGFUSE_SECRET_KEY")
        self.public_key = langfuse_public_key or os.getenv("LANGFUSE_PUBLIC_KEY")
        self.langfuse_host = os.getenv("LANGFUSE_HOST", "https://cloud.langfuse.com")
        self.langfuse_release = os.getenv("LANGFUSE_RELEASE")
        self.langfuse_debug = os.getenv("LANGFUSE_DEBUG")
@ -34,8 +35,41 @@ class LangFuseLogger:
            debug=self.langfuse_debug,
        )

        if os.getenv("UPSTREAM_LANGFUSE_SECRET_KEY") is not None:
            self.upstream_langfuse_secret_key = os.getenv(
                "UPSTREAM_LANGFUSE_SECRET_KEY"
            )
            self.upstream_langfuse_public_key = os.getenv(
                "UPSTREAM_LANGFUSE_PUBLIC_KEY"
            )
            self.upstream_langfuse_host = os.getenv("UPSTREAM_LANGFUSE_HOST")
            self.upstream_langfuse_release = os.getenv("UPSTREAM_LANGFUSE_RELEASE")
            self.upstream_langfuse_debug = os.getenv("UPSTREAM_LANGFUSE_DEBUG")
            self.upstream_langfuse = Langfuse(
                public_key=self.upstream_langfuse_public_key,
                secret_key=self.upstream_langfuse_secret_key,
                host=self.upstream_langfuse_host,
                release=self.upstream_langfuse_release,
                debug=self.upstream_langfuse_debug,
            )
        else:
            self.upstream_langfuse = None

    # def log_error(kwargs, response_obj, start_time, end_time):
    #     generation = trace.generation(
    #         level ="ERROR" # can be any of DEBUG, DEFAULT, WARNING or ERROR
    #         status_message='error' # can be any string (e.g. stringified stack trace or error body)
    #     )
    def log_event(
        self, kwargs, response_obj, start_time, end_time, user_id, print_verbose
        self,
        kwargs,
        response_obj,
        start_time,
        end_time,
        user_id,
        print_verbose,
        level="DEFAULT",
        status_message=None,
    ):
        # Method definition

@ -63,32 +97,49 @@ class LangFuseLogger:
            pass

        # end of processing langfuse ########################
        input = prompt
        output = response_obj["choices"][0]["message"].json()
        print_verbose(
            f"OUTPUT IN LANGFUSE: {output}; original: {response_obj['choices'][0]['message']}"
        )
        self._log_langfuse_v2(
            user_id,
            metadata,
            output,
            start_time,
            end_time,
            kwargs,
            optional_params,
            input,
            response_obj,
        ) if self._is_langfuse_v2() else self._log_langfuse_v1(
            user_id,
            metadata,
            output,
            start_time,
            end_time,
            kwargs,
            optional_params,
            input,
            response_obj,
        )
        if (
            level == "ERROR"
            and status_message is not None
            and isinstance(status_message, str)
        ):
            input = prompt
            output = status_message
        elif response_obj is not None and (
            kwargs.get("call_type", None) == "embedding"
            or isinstance(response_obj, litellm.EmbeddingResponse)
        ):
            input = prompt
            output = response_obj["data"]
        elif response_obj is not None:
            input = prompt
            output = response_obj["choices"][0]["message"].json()
        print_verbose(f"OUTPUT IN LANGFUSE: {output}; original: {response_obj}")
        if self._is_langfuse_v2():
            self._log_langfuse_v2(
                user_id,
                metadata,
                output,
                start_time,
                end_time,
                kwargs,
                optional_params,
                input,
                response_obj,
                level,
                print_verbose,
            )
        elif response_obj is not None:
            self._log_langfuse_v1(
                user_id,
                metadata,
                output,
                start_time,
                end_time,
                kwargs,
                optional_params,
                input,
                response_obj,
            )

        self.Langfuse.flush()
        print_verbose(
@ -97,15 +148,15 @@ class LangFuseLogger:
            verbose_logger.info(f"Langfuse Layer Logging - logging success")
        except:
            traceback.print_exc()
            print_verbose(f"Langfuse Layer Error - {traceback.format_exc()}")
            print(f"Langfuse Layer Error - {traceback.format_exc()}")
            pass

    async def _async_log_event(
        self, kwargs, response_obj, start_time, end_time, user_id, print_verbose
    ):
        self.log_event(
            kwargs, response_obj, start_time, end_time, user_id, print_verbose
        )
        """
        TODO: support async calls when langfuse is truly async
        """

    def _is_langfuse_v2(self):
        import langfuse
@ -167,40 +218,84 @@ class LangFuseLogger:
        optional_params,
        input,
        response_obj,
        level,
        print_verbose,
    ):
        import langfuse

        tags = []
        supports_tags = Version(langfuse.version.__version__) >= Version("2.6.3")
        try:
            tags = []
            supports_tags = Version(langfuse.version.__version__) >= Version("2.6.3")
            supports_costs = Version(langfuse.version.__version__) >= Version("2.7.3")

        trace_params = {
            "name": metadata.get("generation_name", "litellm-completion"),
            "input": input,
            "output": output,
            "user_id": metadata.get("trace_user_id", user_id),
            "id": metadata.get("trace_id", None),
        }
        if supports_tags:
            for key, value in metadata.items():
                tags.append(f"{key}:{value}")
            if "cache_hit" in kwargs:
                tags.append(f"cache_hit:{kwargs['cache_hit']}")
            trace_params.update({"tags": tags})
        print_verbose(f"Langfuse Layer Logging - logging to langfuse v2 ")

        trace = self.Langfuse.trace(**trace_params)
            if supports_tags:
                metadata_tags = metadata.get("tags", [])
                tags = metadata_tags

        trace.generation(
            name=metadata.get("generation_name", "litellm-completion"),
            id=metadata.get("generation_id", None),
            startTime=start_time,
            endTime=end_time,
            model=kwargs["model"],
            modelParameters=optional_params,
            input=input,
            output=output,
            usage={
                "prompt_tokens": response_obj["usage"]["prompt_tokens"],
                "completion_tokens": response_obj["usage"]["completion_tokens"],
            },
            metadata=metadata,
        )
            generation_name = metadata.get("generation_name", None)
            if generation_name is None:
                # just log `litellm-{call_type}` as the generation name
                generation_name = f"litellm-{kwargs.get('call_type', 'completion')}"

            trace_params = {
                "name": generation_name,
                "input": input,
                "user_id": metadata.get("trace_user_id", user_id),
                "id": metadata.get("trace_id", None),
                "session_id": metadata.get("session_id", None),
            }

            if level == "ERROR":
                trace_params["status_message"] = output
            else:
                trace_params["output"] = output

            cost = kwargs.get("response_cost", None)
            print_verbose(f"trace: {cost}")
            if supports_tags:
                for key, value in metadata.items():
                    if key in [
                        "user_api_key",
                        "user_api_key_user_id",
                        "semantic-similarity",
                    ]:
                        tags.append(f"{key}:{value}")
                if "cache_hit" in kwargs:
                    if kwargs["cache_hit"] is None:
                        kwargs["cache_hit"] = False
                    tags.append(f"cache_hit:{kwargs['cache_hit']}")
                trace_params.update({"tags": tags})

            trace = self.Langfuse.trace(**trace_params)

            generation_id = None
            usage = None
            if response_obj is not None and response_obj.get("id", None) is not None:
                generation_id = litellm.utils.get_logging_id(start_time, response_obj)
                usage = {
                    "prompt_tokens": response_obj["usage"]["prompt_tokens"],
                    "completion_tokens": response_obj["usage"]["completion_tokens"],
                    "total_cost": cost if supports_costs else None,
                }
            generation_params = {
                "name": generation_name,
                "id": metadata.get("generation_id", generation_id),
                "startTime": start_time,
                "endTime": end_time,
                "model": kwargs["model"],
                "modelParameters": optional_params,
                "input": input,
                "output": output,
                "usage": usage,
                "metadata": metadata,
                "level": level,
            }

            if output is not None and isinstance(output, str) and level == "ERROR":
                generation_params["statusMessage"] = output

            trace.generation(**generation_params)
        except Exception as e:
            print(f"Langfuse Layer Error - {traceback.format_exc()}")

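For orientation, the per-call Langfuse fields read above come from the request metadata; a hedged sketch with placeholder values (the keys mirror the `metadata.get(...)` lookups in `_log_langfuse_v2`):

    import litellm

    litellm.success_callback = ["langfuse"]

    response = litellm.completion(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "hi"}],
        metadata={
            "generation_name": "my-generation",  # placeholder
            "trace_id": "trace-123",             # placeholder
            "session_id": "session-abc",         # placeholder
            "trace_user_id": "user-1",           # placeholder
        },
    )
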
@ -8,7 +8,7 @@ dotenv.load_dotenv() # Loading env variables using dotenv
import traceback
import datetime, subprocess, sys
import litellm, uuid
from litellm._logging import print_verbose
from litellm._logging import print_verbose, verbose_logger


class S3Logger:
@ -31,7 +31,9 @@ class S3Logger:
        import boto3

        try:
            print_verbose("in init s3 logger")
            verbose_logger.debug(
                f"in init s3 logger - s3_callback_params {litellm.s3_callback_params}"
            )

            if litellm.s3_callback_params is not None:
                # read in .env variables - example os.environ/AWS_BUCKET_NAME
@ -42,7 +44,7 @@ class S3Logger:
                s3_bucket_name = litellm.s3_callback_params.get("s3_bucket_name")
                s3_region_name = litellm.s3_callback_params.get("s3_region_name")
                s3_api_version = litellm.s3_callback_params.get("s3_api_version")
                s3_use_ssl = litellm.s3_callback_params.get("s3_use_ssl")
                s3_use_ssl = litellm.s3_callback_params.get("s3_use_ssl", True)
                s3_verify = litellm.s3_callback_params.get("s3_verify")
                s3_endpoint_url = litellm.s3_callback_params.get("s3_endpoint_url")
                s3_aws_access_key_id = litellm.s3_callback_params.get(
@ -59,6 +61,7 @@ class S3Logger:

        self.bucket_name = s3_bucket_name
        self.s3_path = s3_path
        verbose_logger.debug(f"s3 logger using endpoint url {s3_endpoint_url}")
        # Create an S3 client with custom endpoint URL
        self.s3_client = boto3.client(
            "s3",
@ -84,7 +87,9 @@ class S3Logger:

    def log_event(self, kwargs, response_obj, start_time, end_time, print_verbose):
        try:
            print_verbose(f"s3 Logging - Enters logging function for model {kwargs}")
            verbose_logger.debug(
                f"s3 Logging - Enters logging function for model {kwargs}"
            )

            # construct payload to send to s3
            # follows the same params as langfuse.py
@ -123,12 +128,22 @@ class S3Logger:
                # non blocking if it can't cast to a str
                pass

            s3_file_name = litellm.utils.get_logging_id(start_time, payload) or ""
            s3_object_key = (
                (self.s3_path.rstrip("/") + "/" if self.s3_path else "")
                + payload["id"]
                + "-time="
                + str(start_time)
                + start_time.strftime("%Y-%m-%d")
                + "/"
                + s3_file_name
            )  # we need the s3 key to include the time, so we log cache hits too
            s3_object_key += ".json"

            s3_object_download_filename = (
                "time-"
                + start_time.strftime("%Y-%m-%dT%H-%M-%S-%f")
                + "_"
                + payload["id"]
                + ".json"
            )

            import json

@ -142,7 +157,8 @@ class S3Logger:
                Body=payload,
                ContentType="application/json",
                ContentLanguage="en",
                ContentDisposition=f'inline; filename="{key}.json"',
                ContentDisposition=f'inline; filename="{s3_object_download_filename}"',
                CacheControl="private, immutable, max-age=31536000, s-maxage=0",
            )

            print_verbose(f"Response from s3:{str(response)}")
@ -151,5 +167,5 @@ class S3Logger:
            return response
        except Exception as e:
            traceback.print_exc()
            print_verbose(f"s3 Layer Error - {str(e)}\n{traceback.format_exc()}")
            verbose_logger.debug(f"s3 Layer Error - {str(e)}\n{traceback.format_exc()}")
            pass

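For reference, the `s3_callback_params` consumed in `__init__` above are usually set once on the module; a minimal sketch with placeholder bucket/region values:

    import litellm

    # Sketch: configure the S3 logger the way __init__ above reads it.
    litellm.s3_callback_params = {
        "s3_bucket_name": "my-log-bucket",  # placeholder
        "s3_region_name": "us-west-2",      # placeholder
        "s3_use_ssl": True,
    }
    litellm.success_callback = ["s3"]
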
@ -2,9 +2,9 @@ import json, copy, types
import os
from enum import Enum
import time
from typing import Callable, Optional, Any, Union
from typing import Callable, Optional, Any, Union, List
import litellm
from litellm.utils import ModelResponse, get_secret, Usage
from litellm.utils import ModelResponse, get_secret, Usage, ImageResponse
from .prompt_templates.factory import prompt_factory, custom_prompt
import httpx

@ -282,6 +282,73 @@ class AmazonLlamaConfig:
    }


class AmazonStabilityConfig:
    """
    Reference: https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=stability.stable-diffusion-xl-v0

    Supported Params for the Amazon / Stable Diffusion models:

    - `cfg_scale` (integer): Default `7`. Between [ 0 .. 35 ]. How strictly the diffusion process adheres to the prompt text (higher values keep your image closer to your prompt)

    - `seed` (float): Default: `0`. Between [ 0 .. 4294967295 ]. Random noise seed (omit this option or use 0 for a random seed)

    - `steps` (array of strings): Default `30`. Between [ 10 .. 50 ]. Number of diffusion steps to run.

    - `width` (integer): Default: `512`. multiple of 64 >= 128. Width of the image to generate, in pixels, in an increment divisible by 64.
        Engine-specific dimension validation:

        - SDXL Beta: must be between 128x128 and 512x896 (or 896x512); only one dimension can be greater than 512.
        - SDXL v0.9: must be one of 1024x1024, 1152x896, 1216x832, 1344x768, 1536x640, 640x1536, 768x1344, 832x1216, or 896x1152
        - SDXL v1.0: same as SDXL v0.9
        - SD v1.6: must be between 320x320 and 1536x1536

    - `height` (integer): Default: `512`. multiple of 64 >= 128. Height of the image to generate, in pixels, in an increment divisible by 64.
        Engine-specific dimension validation:

        - SDXL Beta: must be between 128x128 and 512x896 (or 896x512); only one dimension can be greater than 512.
        - SDXL v0.9: must be one of 1024x1024, 1152x896, 1216x832, 1344x768, 1536x640, 640x1536, 768x1344, 832x1216, or 896x1152
        - SDXL v1.0: same as SDXL v0.9
        - SD v1.6: must be between 320x320 and 1536x1536
    """

    cfg_scale: Optional[int] = None
    seed: Optional[float] = None
    steps: Optional[List[str]] = None
    width: Optional[int] = None
    height: Optional[int] = None

    def __init__(
        self,
        cfg_scale: Optional[int] = None,
        seed: Optional[float] = None,
        steps: Optional[List[str]] = None,
        width: Optional[int] = None,
        height: Optional[int] = None,
    ) -> None:
        locals_ = locals()
        for key, value in locals_.items():
            if key != "self" and value is not None:
                setattr(self.__class__, key, value)

    @classmethod
    def get_config(cls):
        return {
            k: v
            for k, v in cls.__dict__.items()
            if not k.startswith("__")
            and not isinstance(
                v,
                (
                    types.FunctionType,
                    types.BuiltinFunctionType,
                    classmethod,
                    staticmethod,
                ),
            )
            and v is not None
        }


def init_bedrock_client(
    region_name=None,
    aws_access_key_id: Optional[str] = None,
@ -289,7 +356,9 @@ def init_bedrock_client(
    aws_region_name: Optional[str] = None,
    aws_bedrock_runtime_endpoint: Optional[str] = None,
    aws_session_name: Optional[str] = None,
    aws_profile_name: Optional[str] = None,
    aws_role_name: Optional[str] = None,
    timeout: Optional[int] = None,
):
    # check for custom AWS_REGION_NAME and use it if not passed to init_bedrock_client
    litellm_aws_region_name = get_secret("AWS_REGION_NAME", None)
@ -303,6 +372,7 @@ def init_bedrock_client(
        aws_region_name,
        aws_bedrock_runtime_endpoint,
        aws_session_name,
        aws_profile_name,
        aws_role_name,
    ]

@ -317,6 +387,7 @@ def init_bedrock_client(
        aws_region_name,
        aws_bedrock_runtime_endpoint,
        aws_session_name,
        aws_profile_name,
        aws_role_name,
    ) = params_to_check

@ -346,6 +417,8 @@ def init_bedrock_client(

    import boto3

    config = boto3.session.Config(connect_timeout=timeout, read_timeout=timeout)

    ### CHECK STS ###
    if aws_role_name is not None and aws_session_name is not None:
        # use sts if role name passed in
@ -366,6 +439,7 @@ def init_bedrock_client(
            aws_session_token=sts_response["Credentials"]["SessionToken"],
            region_name=region_name,
            endpoint_url=endpoint_url,
            config=config,
        )
    elif aws_access_key_id is not None:
        # uses auth params passed to completion
@ -377,6 +451,16 @@ def init_bedrock_client(
            aws_secret_access_key=aws_secret_access_key,
            region_name=region_name,
            endpoint_url=endpoint_url,
            config=config,
        )
    elif aws_profile_name is not None:
        # uses auth values from AWS profile usually stored in ~/.aws/credentials

        client = boto3.Session(profile_name=aws_profile_name).client(
            service_name="bedrock-runtime",
            region_name=region_name,
            endpoint_url=endpoint_url,
            config=config,
        )
    else:
        # aws_access_key_id is None, assume user is trying to auth using env variables
@ -386,6 +470,7 @@ def init_bedrock_client(
        service_name="bedrock-runtime",
        region_name=region_name,
        endpoint_url=endpoint_url,
        config=config,
    )

    return client

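The new `aws_profile_name` path means callers can authenticate from a local AWS profile; a hedged usage sketch (the profile name is a placeholder):

    import litellm

    # Sketch: Bedrock completion authenticating via a named AWS profile,
    # exercising the boto3.Session(profile_name=...) branch above.
    response = litellm.completion(
        model="bedrock/anthropic.claude-instant-v1",
        messages=[{"role": "user", "content": "hello"}],
        aws_profile_name="my-dev-profile",  # placeholder profile name
    )
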
@ -441,6 +526,7 @@ def completion(
    optional_params=None,
    litellm_params=None,
    logger_fn=None,
    timeout=None,
):
    exception_mapping_worked = False
    try:
@ -450,6 +536,7 @@ def completion(
        aws_region_name = optional_params.pop("aws_region_name", None)
        aws_role_name = optional_params.pop("aws_role_name", None)
        aws_session_name = optional_params.pop("aws_session_name", None)
        aws_profile_name = optional_params.pop("aws_profile_name", None)
        aws_bedrock_runtime_endpoint = optional_params.pop(
            "aws_bedrock_runtime_endpoint", None
        )
@ -466,6 +553,8 @@ def completion(
            aws_bedrock_runtime_endpoint=aws_bedrock_runtime_endpoint,
            aws_role_name=aws_role_name,
            aws_session_name=aws_session_name,
            aws_profile_name=aws_profile_name,
            timeout=timeout,
        )

        model = model
@ -652,6 +741,8 @@ def completion(
            try:
                if len(outputText) > 0:
                    model_response["choices"][0]["message"]["content"] = outputText
                else:
                    raise Exception()
            except:
                raise BedrockError(
                    message=json.dumps(outputText),
@ -659,9 +750,16 @@ def completion(
                )

        ## CALCULATING USAGE - baseten charges on time, not tokens - have some mapping of cost here.
        prompt_tokens = len(encoding.encode(prompt))
        completion_tokens = len(
            encoding.encode(model_response["choices"][0]["message"].get("content", ""))
        prompt_tokens = response_metadata.get(
            "x-amzn-bedrock-input-token-count", len(encoding.encode(prompt))
        )
        completion_tokens = response_metadata.get(
            "x-amzn-bedrock-output-token-count",
            len(
                encoding.encode(
                    model_response["choices"][0]["message"].get("content", "")
                )
            ),
        )

        model_response["created"] = int(time.time())
@ -672,6 +770,8 @@ def completion(
            total_tokens=prompt_tokens + completion_tokens,
        )
        model_response.usage = usage
        model_response._hidden_params["region_name"] = client.meta.region_name
        print_verbose(f"model_response._hidden_params: {model_response._hidden_params}")
        return model_response
    except BedrockError as e:
        exception_mapping_worked = True
@ -693,6 +793,11 @@ def _embedding_func_single(
    encoding=None,
    logging_obj=None,
):
    if isinstance(input, str) is False:
        raise BedrockError(
            message="Bedrock Embedding API input must be type str | List[str]",
            status_code=400,
        )
    # logic for parsing in - calling - parsing out model embedding calls
    ## FORMAT EMBEDDING INPUT ##
    provider = model.split(".")[0]
@ -786,7 +891,8 @@ def embedding(
        aws_role_name=aws_role_name,
        aws_session_name=aws_session_name,
    )
    if type(input) == str:
    if isinstance(input, str):
        ## Embedding Call
        embeddings = [
            _embedding_func_single(
                model,
@ -796,8 +902,8 @@ def embedding(
                logging_obj=logging_obj,
            )
        ]
    else:
        ## Embedding Call
    elif isinstance(input, list):
        ## Embedding Call - assuming this is a List[str]
        embeddings = [
            _embedding_func_single(
                model,
@ -808,6 +914,12 @@ def embedding(
            )
            for i in input
        ]  # [TODO]: make these parallel calls
    else:
        # enters this branch if input = int, ex. input=2
        raise BedrockError(
            message="Bedrock Embedding API input must be type str | List[str]",
            status_code=400,
        )

    ## Populate OpenAI compliant dictionary
    embedding_response = []
@ -834,3 +946,112 @@ def embedding(
    model_response.usage = usage

    return model_response


def image_generation(
    model: str,
    prompt: str,
    timeout=None,
    logging_obj=None,
    model_response=None,
    optional_params=None,
    aimg_generation=False,
):
    """
    Bedrock Image Gen endpoint support
    """
    ### BOTO3 INIT ###
    # pop aws_secret_access_key, aws_access_key_id, aws_region_name from kwargs, since completion calls fail with them
    aws_secret_access_key = optional_params.pop("aws_secret_access_key", None)
    aws_access_key_id = optional_params.pop("aws_access_key_id", None)
    aws_region_name = optional_params.pop("aws_region_name", None)
    aws_role_name = optional_params.pop("aws_role_name", None)
    aws_session_name = optional_params.pop("aws_session_name", None)
    aws_bedrock_runtime_endpoint = optional_params.pop(
        "aws_bedrock_runtime_endpoint", None
    )

    # use passed in BedrockRuntime.Client if provided, otherwise create a new one
    client = init_bedrock_client(
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
        aws_region_name=aws_region_name,
        aws_bedrock_runtime_endpoint=aws_bedrock_runtime_endpoint,
        aws_role_name=aws_role_name,
        aws_session_name=aws_session_name,
        timeout=timeout,
    )

    ### FORMAT IMAGE GENERATION INPUT ###
    modelId = model
    provider = model.split(".")[0]
    inference_params = copy.deepcopy(optional_params)
    inference_params.pop(
        "user", None
    )  # make sure user is not passed in for bedrock call
    data = {}
    if provider == "stability":
        prompt = prompt.replace(os.linesep, " ")
        ## LOAD CONFIG
        config = litellm.AmazonStabilityConfig.get_config()
        for k, v in config.items():
            if (
                k not in inference_params
            ):  # completion(top_k=3) > anthropic_config(top_k=3) <- allows for dynamic variables to be passed in
                inference_params[k] = v
        data = {"text_prompts": [{"text": prompt, "weight": 1}], **inference_params}
    else:
        raise BedrockError(
            status_code=422, message=f"Unsupported model={model}, passed in"
        )

    body = json.dumps(data).encode("utf-8")
    ## LOGGING
    request_str = f"""
    response = client.invoke_model(
        body={body},
        modelId={modelId},
        accept="application/json",
        contentType="application/json",
    )"""  # type: ignore
    logging_obj.pre_call(
        input=prompt,
        api_key="",  # boto3 is used for init.
        additional_args={
            "complete_input_dict": {"model": modelId, "texts": prompt},
            "request_str": request_str,
        },
    )
    try:
        response = client.invoke_model(
            body=body,
            modelId=modelId,
            accept="application/json",
            contentType="application/json",
        )
        response_body = json.loads(response.get("body").read())
        ## LOGGING
        logging_obj.post_call(
            input=prompt,
            api_key="",
            additional_args={"complete_input_dict": data},
            original_response=json.dumps(response_body),
        )
    except Exception as e:
        raise BedrockError(
            message=f"Image Generation Error with model {model}: {e}", status_code=500
        )

    ### FORMAT RESPONSE TO OPENAI FORMAT ###
    if response_body is None:
        raise Exception("Error in response object format")

    if model_response is None:
        model_response = ImageResponse()

    image_list: List = []
    for artifact in response_body["artifacts"]:
        image_dict = {"url": artifact["base64"]}
        # collect every returned artifact, not just the last one
        image_list.append(image_dict)

    model_response.data = image_list
    return model_response

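A hedged usage sketch for the new image endpoint; the prompt is illustrative and the stability model id follows the provider branch above:

    import litellm

    # Sketch: Bedrock image generation through the "stability" branch above.
    response = litellm.image_generation(
        prompt="A sunset over the mountains",
        model="bedrock/stability.stable-diffusion-xl-v0",
    )
    print(response.data)
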
|
@ -145,8 +145,17 @@ def get_ollama_response(
|
|||
): # completion(top_k=3) > cohere_config(top_k=3) <- allows for dynamic variables to be passed in
|
||||
optional_params[k] = v
|
||||
|
||||
optional_params["stream"] = optional_params.get("stream", False)
|
||||
data = {"model": model, "prompt": prompt, **optional_params}
|
||||
stream = optional_params.pop("stream", False)
|
||||
format = optional_params.pop("format", None)
|
||||
data = {
|
||||
"model": model,
|
||||
"prompt": prompt,
|
||||
"options": optional_params,
|
||||
"stream": stream,
|
||||
}
|
||||
if format is not None:
|
||||
data["format"] = format
|
||||
|
||||
## LOGGING
|
||||
logging_obj.pre_call(
|
||||
input=None,
|
||||
|
@ -159,7 +168,7 @@ def get_ollama_response(
        },
    )
    if acompletion is True:
        if optional_params.get("stream", False) == True:
        if stream == True:
            response = ollama_async_streaming(
                url=url,
                data=data,
@ -176,10 +185,12 @@ def get_ollama_response(
                logging_obj=logging_obj,
            )
        return response
    elif optional_params.get("stream", False) == True:
    elif stream == True:
        return ollama_completion_stream(url=url, data=data, logging_obj=logging_obj)

    response = requests.post(url=f"{url}", json=data, timeout=litellm.request_timeout)
    response = requests.post(
        url=f"{url}", json={**data, "stream": stream}, timeout=litellm.request_timeout
    )
    if response.status_code != 200:
        raise OllamaError(status_code=response.status_code, message=response.text)

@ -254,7 +265,7 @@ async def ollama_async_streaming(url, data, model_response, encoding, logging_ob
        ) as response:
            if response.status_code != 200:
                raise OllamaError(
                    status_code=response.status_code, message=response.text
                    status_code=response.status_code, message=await response.aread()
                )

            streamwrapper = litellm.CustomStreamWrapper(
@ -267,6 +278,7 @@ async def ollama_async_streaming(url, data, model_response, encoding, logging_ob
            yield transformed_chunk
    except Exception as e:
        traceback.print_exc()
        raise e


async def ollama_acompletion(url, data, model_response, encoding, logging_obj):

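The net effect of this refactor is that sampling parameters move under "options" in the Ollama request body; a sketch of the payload the code above now posts (values illustrative):

    # Sketch of the JSON body for a non-streaming /api/generate call.
    data = {
        "model": "llama2",
        "prompt": "Why is the sky blue?",
        "options": {"temperature": 0.7, "top_k": 3},  # formerly top-level params
        "stream": False,
    }
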
@ -145,8 +145,16 @@ def get_ollama_response(
    ):  # completion(top_k=3) > cohere_config(top_k=3) <- allows for dynamic variables to be passed in
        optional_params[k] = v

    optional_params["stream"] = optional_params.get("stream", False)
    data = {"model": model, "messages": messages, **optional_params}
    stream = optional_params.pop("stream", False)
    format = optional_params.pop("format", None)
    data = {
        "model": model,
        "messages": messages,
        "options": optional_params,
        "stream": stream,
    }
    if format is not None:
        data["format"] = format
    ## LOGGING
    logging_obj.pre_call(
        input=None,
@ -159,7 +167,7 @@ def get_ollama_response(
        },
    )
    if acompletion is True:
        if optional_params.get("stream", False) == True:
        if stream == True:
            response = ollama_async_streaming(
                url=url,
                data=data,
@ -176,7 +184,7 @@ def get_ollama_response(
                logging_obj=logging_obj,
            )
        return response
    elif optional_params.get("stream", False) == True:
    elif stream == True:
        return ollama_completion_stream(url=url, data=data, logging_obj=logging_obj)

    response = requests.post(
@ -220,8 +228,10 @@ def get_ollama_response(
    model_response["choices"][0]["message"] = response_json["message"]
    model_response["created"] = int(time.time())
    model_response["model"] = "ollama/" + model
    prompt_tokens = response_json.get("prompt_eval_count", len(encoding.encode(prompt)))  # type: ignore
    completion_tokens = response_json["eval_count"]
    prompt_tokens = response_json.get("prompt_eval_count", litellm.token_counter(messages=messages))  # type: ignore
    completion_tokens = response_json.get(
        "eval_count", litellm.token_counter(text=response_json["message"]["content"])
    )
    model_response["usage"] = litellm.Usage(
        prompt_tokens=prompt_tokens,
        completion_tokens=completion_tokens,
@ -318,10 +328,16 @@ async def ollama_acompletion(url, data, model_response, encoding, logging_obj):
        model_response["choices"][0]["message"] = message
    else:
        model_response["choices"][0]["message"] = response_json["message"]

    model_response["created"] = int(time.time())
    model_response["model"] = "ollama/" + data["model"]
    prompt_tokens = response_json.get("prompt_eval_count", len(encoding.encode(prompt)))  # type: ignore
    completion_tokens = response_json["eval_count"]
    model_response["model"] = "ollama_chat/" + data["model"]
    prompt_tokens = response_json.get("prompt_eval_count", litellm.token_counter(messages=data["messages"]))  # type: ignore
    completion_tokens = response_json.get(
        "eval_count",
        litellm.token_counter(
            text=response_json["message"]["content"], count_response_tokens=True
        ),
    )
    model_response["usage"] = litellm.Usage(
        prompt_tokens=prompt_tokens,
        completion_tokens=completion_tokens,

@ -221,6 +221,8 @@ class OpenAIChatCompletion(BaseLLM):
        headers: Optional[dict] = None,
        custom_prompt_dict: dict = {},
        client=None,
        organization: Optional[str] = None,
        custom_llm_provider: Optional[str] = None,
    ):
        super().completion()
        exception_mapping_worked = False
@ -235,6 +237,14 @@ class OpenAIChatCompletion(BaseLLM):
                    status_code=422, message=f"Timeout needs to be a float"
                )

            if custom_llm_provider == "mistral":
                # check if message content passed in as list, and not string
                messages = prompt_factory(
                    model=model,
                    messages=messages,
                    custom_llm_provider=custom_llm_provider,
                )

            for _ in range(
                2
            ):  # if call fails due to alternating messages, retry with reformatted message
@ -254,6 +264,7 @@ class OpenAIChatCompletion(BaseLLM):
                        timeout=timeout,
                        client=client,
                        max_retries=max_retries,
                        organization=organization,
                    )
                else:
                    return self.acompletion(
@ -266,6 +277,7 @@ class OpenAIChatCompletion(BaseLLM):
                        timeout=timeout,
                        client=client,
                        max_retries=max_retries,
                        organization=organization,
                    )
            elif optional_params.get("stream", False):
                return self.streaming(
@ -278,6 +290,7 @@ class OpenAIChatCompletion(BaseLLM):
|
|||
timeout=timeout,
|
||||
client=client,
|
||||
max_retries=max_retries,
|
||||
organization=organization,
|
||||
)
|
||||
else:
|
||||
if not isinstance(max_retries, int):
|
||||
|
@ -291,6 +304,7 @@ class OpenAIChatCompletion(BaseLLM):
|
|||
http_client=litellm.client_session,
|
||||
timeout=timeout,
|
||||
max_retries=max_retries,
|
||||
organization=organization,
|
||||
)
|
||||
else:
|
||||
openai_client = client
|
||||
|
@ -320,12 +334,13 @@ class OpenAIChatCompletion(BaseLLM):
|
|||
model_response_object=model_response,
|
||||
)
|
||||
except Exception as e:
|
||||
if "Conversation roles must alternate user/assistant" in str(
|
||||
e
|
||||
) or "user and assistant roles should be alternating" in str(e):
|
||||
if (
|
||||
"Conversation roles must alternate user/assistant" in str(e)
|
||||
or "user and assistant roles should be alternating" in str(e)
|
||||
) and messages is not None:
|
||||
# reformat messages to ensure user/assistant are alternating, if there's either 2 consecutive 'user' messages or 2 consecutive 'assistant' message, add a blank 'user' or 'assistant' message to ensure compatibility
|
||||
new_messages = []
|
||||
for i in range(len(messages) - 1):
|
||||
for i in range(len(messages) - 1): # type: ignore
|
||||
new_messages.append(messages[i])
|
||||
if messages[i]["role"] == messages[i + 1]["role"]:
|
||||
if messages[i]["role"] == "user":
|
||||
|
@ -336,7 +351,9 @@ class OpenAIChatCompletion(BaseLLM):
|
|||
new_messages.append({"role": "user", "content": ""})
|
||||
new_messages.append(messages[-1])
|
||||
messages = new_messages
|
||||
elif "Last message must have role `user`" in str(e):
|
||||
elif (
|
||||
"Last message must have role `user`" in str(e)
|
||||
) and messages is not None:
|
||||
new_messages = messages
|
||||
new_messages.append({"role": "user", "content": ""})
|
||||
messages = new_messages
|
||||
|
@ -358,6 +375,7 @@ class OpenAIChatCompletion(BaseLLM):
|
|||
timeout: float,
|
||||
api_key: Optional[str] = None,
|
||||
api_base: Optional[str] = None,
|
||||
organization: Optional[str] = None,
|
||||
client=None,
|
||||
max_retries=None,
|
||||
logging_obj=None,
|
||||
|
@ -372,6 +390,7 @@ class OpenAIChatCompletion(BaseLLM):
|
|||
http_client=litellm.aclient_session,
|
||||
timeout=timeout,
|
||||
max_retries=max_retries,
|
||||
organization=organization,
|
||||
)
|
||||
else:
|
||||
openai_aclient = client
|
||||
|
@ -412,6 +431,7 @@ class OpenAIChatCompletion(BaseLLM):
|
|||
model: str,
|
||||
api_key: Optional[str] = None,
|
||||
api_base: Optional[str] = None,
|
||||
organization: Optional[str] = None,
|
||||
client=None,
|
||||
max_retries=None,
|
||||
headers=None,
|
||||
|
@ -423,6 +443,7 @@ class OpenAIChatCompletion(BaseLLM):
|
|||
http_client=litellm.client_session,
|
||||
timeout=timeout,
|
||||
max_retries=max_retries,
|
||||
organization=organization,
|
||||
)
|
||||
else:
|
||||
openai_client = client
|
||||
|
@ -431,8 +452,8 @@ class OpenAIChatCompletion(BaseLLM):
|
|||
input=data["messages"],
|
||||
api_key=api_key,
|
||||
additional_args={
|
||||
"headers": headers,
|
||||
"api_base": api_base,
|
||||
"headers": {"Authorization": f"Bearer {openai_client.api_key}"},
|
||||
"api_base": openai_client._base_url._uri_reference,
|
||||
"acompletion": False,
|
||||
"complete_input_dict": data,
|
||||
},
|
||||
|
@ -454,6 +475,7 @@ class OpenAIChatCompletion(BaseLLM):
|
|||
model: str,
|
||||
api_key: Optional[str] = None,
|
||||
api_base: Optional[str] = None,
|
||||
organization: Optional[str] = None,
|
||||
client=None,
|
||||
max_retries=None,
|
||||
headers=None,
|
||||
|
@ -467,6 +489,7 @@ class OpenAIChatCompletion(BaseLLM):
|
|||
http_client=litellm.aclient_session,
|
||||
timeout=timeout,
|
||||
max_retries=max_retries,
|
||||
organization=organization,
|
||||
)
|
||||
else:
|
||||
openai_aclient = client
|
||||
|
@ -718,8 +741,22 @@ class OpenAIChatCompletion(BaseLLM):
|
|||
return convert_to_model_response_object(response_object=response, model_response_object=model_response, response_type="image_generation") # type: ignore
|
||||
except OpenAIError as e:
|
||||
exception_mapping_worked = True
|
||||
## LOGGING
|
||||
logging_obj.post_call(
|
||||
input=prompt,
|
||||
api_key=api_key,
|
||||
additional_args={"complete_input_dict": data},
|
||||
original_response=str(e),
|
||||
)
|
||||
raise e
|
||||
except Exception as e:
|
||||
## LOGGING
|
||||
logging_obj.post_call(
|
||||
input=prompt,
|
||||
api_key=api_key,
|
||||
additional_args={"complete_input_dict": data},
|
||||
original_response=str(e),
|
||||
)
|
||||
if hasattr(e, "status_code"):
|
||||
raise OpenAIError(status_code=e.status_code, message=str(e))
|
||||
else:
|
||||
|
@ -734,8 +771,11 @@ class OpenAIChatCompletion(BaseLLM):
|
|||
messages: Optional[list] = None,
|
||||
input: Optional[list] = None,
|
||||
prompt: Optional[str] = None,
|
||||
organization: Optional[str] = None,
|
||||
):
|
||||
client = AsyncOpenAI(api_key=api_key, timeout=timeout)
|
||||
client = AsyncOpenAI(
|
||||
api_key=api_key, timeout=timeout, organization=organization
|
||||
)
|
||||
if model is None and mode != "image_generation":
|
||||
raise Exception("model is not set")
|
||||
|
||||
|
|
|
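
Reviewer note: the retry path above now guards the message-repair branches with `messages is not None`. The repair inserts a blank turn of the opposite role wherever two consecutive turns share a role, so providers that insist on strict user/assistant alternation (e.g. Mistral) accept the conversation. A standalone sketch of that logic (hypothetical helper name, same algorithm as the hunk):

```python
# Hypothetical standalone version of the role-alternation repair shown above.
def ensure_alternating_roles(messages: list) -> list:
    if not messages:
        return messages
    new_messages = []
    for i in range(len(messages) - 1):
        new_messages.append(messages[i])
        # Two consecutive turns with the same role: insert a blank turn of
        # the opposite role so the provider accepts the conversation.
        if messages[i]["role"] == messages[i + 1]["role"]:
            filler = "assistant" if messages[i]["role"] == "user" else "user"
            new_messages.append({"role": filler, "content": ""})
    new_messages.append(messages[-1])
    return new_messages

print(ensure_alternating_roles(
    [{"role": "user", "content": "hi"}, {"role": "user", "content": "again"}]
))
# -> user turn, blank assistant turn, user turn
```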
@@ -99,12 +99,16 @@ def ollama_pt(


 def mistral_instruct_pt(messages):
     # Following the Mistral example's https://huggingface.co/docs/transformers/main/chat_templating
     prompt = custom_prompt(
         initial_prompt_value="<s>",
         role_dict={
-            "system": {"pre_message": "[INST]", "post_message": "[/INST]"},
-            "user": {"pre_message": "[INST]", "post_message": "[/INST]"},
-            "assistant": {"pre_message": "[INST]", "post_message": "[/INST]"},
+            "system": {
+                "pre_message": "[INST] \n",
+                "post_message": " [/INST]\n",
+            },
+            "user": {"pre_message": "[INST] ", "post_message": " [/INST]\n"},
+            "assistant": {"pre_message": " ", "post_message": " "},
         },
         final_prompt_value="</s>",
         messages=messages,

@@ -112,6 +116,28 @@ def mistral_instruct_pt(messages):
     return prompt


+def mistral_api_pt(messages):
+    """
+    - handles scenario where content is list and not string
+    - content list is just text, and no images
+    - if image passed in, then just return as is (user-intended)
+
+    Motivation: mistral api doesn't support content as a list
+    """
+    new_messages = []
+    for m in messages:
+        texts = ""
+        if isinstance(m["content"], list):
+            for c in m["content"]:
+                if c["type"] == "image_url":
+                    return messages
+                elif c["type"] == "text" and isinstance(c["text"], str):
+                    texts += c["text"]
+        new_m = {"role": m["role"], "content": texts}
+        new_messages.append(new_m)
+    return new_messages
+
+
 # Falcon prompt template - from https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py#L110
 def falcon_instruct_pt(messages):
     prompt = ""

@@ -372,6 +398,7 @@ def anthropic_pt(
     You can "put words in Claude's mouth" by ending with an assistant message.
     See: https://docs.anthropic.com/claude/docs/put-words-in-claudes-mouth
     """
+
     class AnthropicConstants(Enum):
         HUMAN_PROMPT = "\n\nHuman: "
         AI_PROMPT = "\n\nAssistant: "

@@ -394,32 +421,35 @@ def anthropic_pt(
         prompt += f"{AnthropicConstants.AI_PROMPT.value}"
     return prompt


 def _load_image_from_url(image_url):
     try:
         from PIL import Image
     except:
-        raise Exception("gemini image conversion failed please run `pip install Pillow`")
+        raise Exception(
+            "gemini image conversion failed please run `pip install Pillow`"
+        )
     from io import BytesIO

     try:
         # Send a GET request to the image URL
         response = requests.get(image_url)
         response.raise_for_status()  # Raise an exception for HTTP errors

         # Check the response's content type to ensure it is an image
-        content_type = response.headers.get('content-type')
-        if not content_type or 'image' not in content_type:
-            raise ValueError(f"URL does not point to a valid image (content-type: {content_type})")
+        content_type = response.headers.get("content-type")
+        if not content_type or "image" not in content_type:
+            raise ValueError(
+                f"URL does not point to a valid image (content-type: {content_type})"
+            )

         # Load the image from the response content
         return Image.open(BytesIO(response.content))

     except requests.RequestException as e:
-        print(f"Request failed: {e}")
-    except UnidentifiedImageError:
-        print("Cannot identify image file (it may not be a supported image format or might be corrupted).")
-    except ValueError as e:
-        print(e)
+        raise Exception(f"Request failed: {e}")
+    except Exception as e:
+        raise e


 def _gemini_vision_convert_messages(messages: list):

@@ -437,10 +467,11 @@ def _gemini_vision_convert_messages(messages: list):
     try:
         from PIL import Image
     except:
-        raise Exception("gemini image conversion failed please run `pip install Pillow`")
+        raise Exception(
+            "gemini image conversion failed please run `pip install Pillow`"
+        )

     try:
         # given messages for gpt-4 vision, convert them for gemini
         # https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/getting-started/intro_gemini_python.ipynb
         prompt = ""

@@ -589,7 +620,7 @@ def prompt_factory(
     if custom_llm_provider == "ollama":
         return ollama_pt(model=model, messages=messages)
     elif custom_llm_provider == "anthropic":
-        if any(_ in model for _ in ["claude-2.1","claude-v2:1"]):
+        if any(_ in model for _ in ["claude-2.1", "claude-v2:1"]):
             return claude_2_1_pt(messages=messages)
         else:
             return anthropic_pt(messages=messages)

@@ -603,6 +634,8 @@ def prompt_factory(
             return _gemini_vision_convert_messages(messages=messages)
         else:
             return gemini_text_image_pt(messages=messages)
+    elif custom_llm_provider == "mistral":
+        return mistral_api_pt(messages=messages)
     try:
         if "meta-llama/llama-2" in model and "chat" in model:
             return llama_2_chat_pt(messages=messages)
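
Reviewer note: `mistral_api_pt` above flattens list-form `content` into a single string and returns the messages untouched as soon as an `image_url` part is found. One caveat worth flagging: a message whose `content` is already a plain string falls through the `isinstance` check and comes out with `content == ""` in this version. A usage sketch (import path assumed from litellm's prompt-template factory module):

```python
# Usage sketch; the import path is assumed, not confirmed by the diff.
from litellm.llms.prompt_templates.factory import mistral_api_pt

messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe "},
            {"type": "text", "text": "this repo."},
        ],
    }
]
print(mistral_api_pt(messages))
# -> [{'role': 'user', 'content': 'Describe this repo.'}]
# With any {"type": "image_url", ...} part, the original list is returned as-is.
```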
@@ -34,22 +34,35 @@ class TokenIterator:
         self.byte_iterator = iter(stream)
         self.buffer = io.BytesIO()
         self.read_pos = 0
+        self.end_of_data = False

     def __iter__(self):
         return self

     def __next__(self):
-        while True:
-            self.buffer.seek(self.read_pos)
-            line = self.buffer.readline()
-            if line and line[-1] == ord("\n"):
-                self.read_pos += len(line) + 1
-                full_line = line[:-1].decode("utf-8")
-                line_data = json.loads(full_line.lstrip("data:").rstrip("/n"))
-                return line_data["token"]["text"]
-            chunk = next(self.byte_iterator)
-            self.buffer.seek(0, io.SEEK_END)
-            self.buffer.write(chunk["PayloadPart"]["Bytes"])
+        try:
+            while True:
+                self.buffer.seek(self.read_pos)
+                line = self.buffer.readline()
+                if line and line[-1] == ord("\n"):
+                    response_obj = {"text": "", "is_finished": False}
+                    self.read_pos += len(line) + 1
+                    full_line = line[:-1].decode("utf-8")
+                    line_data = json.loads(full_line.lstrip("data:").rstrip("/n"))
+                    if line_data.get("generated_text", None) is not None:
+                        self.end_of_data = True
+                        response_obj["is_finished"] = True
+                    response_obj["text"] = line_data["token"]["text"]
+                    return response_obj
+                chunk = next(self.byte_iterator)
+                self.buffer.seek(0, io.SEEK_END)
+                self.buffer.write(chunk["PayloadPart"]["Bytes"])
+        except StopIteration as e:
+            if self.end_of_data == True:
+                raise e  # Re-raise StopIteration
+            else:
+                self.end_of_data = True
+                return "data: [DONE]"


 class SagemakerConfig:

@@ -353,7 +366,7 @@ def embedding(
     aws_access_key_id = optional_params.pop("aws_access_key_id", None)
     aws_region_name = optional_params.pop("aws_region_name", None)

-    if aws_access_key_id != None:
+    if aws_access_key_id is not None:
         # uses auth params passed to completion
         # aws_access_key_id is not None, assume user is trying to auth using litellm.completion
         client = boto3.client(
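
Reviewer note: the `TokenIterator` change above wraps each parsed SSE line in a `{"text": ..., "is_finished": ...}` dict and uses the `end_of_data` flag so that the first exhaustion of the byte stream emits one trailing `"data: [DONE]"` sentinel before `StopIteration` is re-raised. A self-contained reproduction of that sentinel pattern over a plain list:

```python
# Minimal reproduction of the end-of-stream sentinel pattern from the hunk,
# iterating a list instead of a SageMaker byte stream.
class SentinelIterator:
    def __init__(self, items):
        self.inner = iter(items)
        self.end_of_data = False

    def __iter__(self):
        return self

    def __next__(self):
        try:
            return {"text": next(self.inner), "is_finished": False}
        except StopIteration:
            if self.end_of_data:
                raise  # second exhaustion: really stop
            self.end_of_data = True
            return "data: [DONE]"  # one trailing sentinel chunk

for chunk in SentinelIterator(["Hello", "world"]):
    print(chunk)
# {'text': 'Hello', ...}, {'text': 'world', ...}, then 'data: [DONE]'
```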
@@ -1,3 +1,7 @@
+"""
+Deprecated. We now do together ai calls via the openai client.
+Reference: https://docs.together.ai/docs/openai-api-compatibility
+"""
 import os, types
 import json
 from enum import Enum
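
Reviewer note: per the deprecation docstring above, Together AI traffic now goes through the OpenAI-compatible endpoint. Following the linked reference, a direct call looks roughly like this (model name illustrative):

```python
# Together AI via its OpenAI-compatible API, per the linked reference.
import os
from openai import OpenAI

client = OpenAI(
    api_key=os.environ["TOGETHER_API_KEY"],
    base_url="https://api.together.xyz/v1",
)
resp = client.chat.completions.create(
    model="mistralai/Mixtral-8x7B-Instruct-v0.1",  # illustrative
    messages=[{"role": "user", "content": "Say hi"}],
)
print(resp.choices[0].message.content)
```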
@@ -3,7 +3,7 @@ import json
 from enum import Enum
 import requests
 import time
-from typing import Callable, Optional
+from typing import Callable, Optional, Union
 from litellm.utils import ModelResponse, Usage, CustomStreamWrapper
 import litellm, uuid
 import httpx

@@ -75,6 +75,41 @@ class VertexAIConfig:
         }


+import asyncio
+
+
+class TextStreamer:
+    """
+    Fake streaming iterator for Vertex AI Model Garden calls
+    """
+
+    def __init__(self, text):
+        self.text = text.split()  # let's assume words as a streaming unit
+        self.index = 0
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        if self.index < len(self.text):
+            result = self.text[self.index]
+            self.index += 1
+            return result
+        else:
+            raise StopIteration
+
+    def __aiter__(self):
+        return self
+
+    async def __anext__(self):
+        if self.index < len(self.text):
+            result = self.text[self.index]
+            self.index += 1
+            return result
+        else:
+            raise StopAsyncIteration  # once we run out of data to stream, we raise this error
+
+
 def _get_image_bytes_from_url(image_url: str) -> bytes:
     try:
         response = requests.get(image_url)
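
Reviewer note: `TextStreamer` above fakes streaming by re-chunking an already complete Model Garden response into words, and supports both sync and async iteration. A quick check (import path assumed):

```python
# Exercises both iteration protocols of TextStreamer; import path assumed.
import asyncio
from litellm.llms.vertex_ai import TextStreamer

print(list(TextStreamer("vertex model garden response")))
# -> ['vertex', 'model', 'garden', 'response']

async def consume():
    async for word in TextStreamer("one two three"):
        print(word)

asyncio.run(consume())
```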
@@ -236,9 +271,17 @@ def completion(
             Part,
             GenerationConfig,
         )
+        from google.cloud import aiplatform
+        from google.protobuf import json_format  # type: ignore
+        from google.protobuf.struct_pb2 import Value  # type: ignore
         from google.cloud.aiplatform_v1beta1.types import content as gapic_content_types
+        import google.auth

-        vertexai.init(project=vertex_project, location=vertex_location)
+        ## Load credentials with the correct quota project ref: https://github.com/googleapis/python-aiplatform/issues/2557#issuecomment-1709284744
+        creds, _ = google.auth.default(quota_project_id=vertex_project)
+        vertexai.init(
+            project=vertex_project, location=vertex_location, credentials=creds
+        )

         ## Load Config
         config = litellm.VertexAIConfig.get_config()

@@ -272,6 +315,11 @@ def completion(

         request_str = ""
         response_obj = None
+        async_client = None
+        instances = None
+        client_options = {
+            "api_endpoint": f"{vertex_location}-aiplatform.googleapis.com"
+        }
         if (
             model in litellm.vertex_language_models
             or model in litellm.vertex_vision_models

@@ -291,39 +339,51 @@ def completion(
             llm_model = CodeGenerationModel.from_pretrained(model)
             mode = "text"
             request_str += f"llm_model = CodeGenerationModel.from_pretrained({model})\n"
-        else:  # vertex_code_llm_models
+        elif model in litellm.vertex_code_chat_models:  # vertex_code_llm_models
             llm_model = CodeChatModel.from_pretrained(model)
             mode = "chat"
             request_str += f"llm_model = CodeChatModel.from_pretrained({model})\n"
+        else:  # assume vertex model garden
+            client = aiplatform.gapic.PredictionServiceClient(
+                client_options=client_options
+            )
+
+            instances = [optional_params]
+            instances[0]["prompt"] = prompt
+            instances = [
+                json_format.ParseDict(instance_dict, Value())
+                for instance_dict in instances
+            ]
+            llm_model = client.endpoint_path(
+                project=vertex_project, location=vertex_location, endpoint=model
+            )
+
+            mode = "custom"
+            request_str += f"llm_model = client.endpoint_path(project={vertex_project}, location={vertex_location}, endpoint={model})\n"

-        if acompletion == True:  # [TODO] expand support to vertex ai chat + text models
+        if acompletion == True:
+            data = {
+                "llm_model": llm_model,
+                "mode": mode,
+                "prompt": prompt,
+                "logging_obj": logging_obj,
+                "request_str": request_str,
+                "model": model,
+                "model_response": model_response,
+                "encoding": encoding,
+                "messages": messages,
+                "print_verbose": print_verbose,
+                "client_options": client_options,
+                "instances": instances,
+                "vertex_location": vertex_location,
+                "vertex_project": vertex_project,
+                **optional_params,
+            }
             if optional_params.get("stream", False) is True:
                 # async streaming
-                return async_streaming(
-                    llm_model=llm_model,
-                    mode=mode,
-                    prompt=prompt,
-                    logging_obj=logging_obj,
-                    request_str=request_str,
-                    model=model,
-                    model_response=model_response,
-                    messages=messages,
-                    print_verbose=print_verbose,
-                    **optional_params,
-                )
-            return async_completion(
-                llm_model=llm_model,
-                mode=mode,
-                prompt=prompt,
-                logging_obj=logging_obj,
-                request_str=request_str,
-                model=model,
-                model_response=model_response,
-                encoding=encoding,
-                messages=messages,
-                print_verbose=print_verbose,
-                **optional_params,
-            )
+                return async_streaming(**data)
+
+            return async_completion(**data)

         if mode == "vision":
             print_verbose("\nMaking VertexAI Gemini Pro Vision Call")
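
Reviewer note: the model-garden branch above folds `optional_params` plus the prompt into protobuf `Value`s via `json_format.ParseDict` before calling `PredictionServiceClient.predict`. That serialization step in isolation (parameter values illustrative):

```python
# How the predict() instances above are built, in isolation.
from google.protobuf import json_format  # type: ignore
from google.protobuf.struct_pb2 import Value  # type: ignore

optional_params = {"max_tokens": 256}  # illustrative
instances = [dict(optional_params)]
instances[0]["prompt"] = "Why is the sky blue?"
instances = [json_format.ParseDict(d, Value()) for d in instances]
print(instances[0])  # a protobuf Value wrapping the struct
```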
@@ -468,7 +528,36 @@ def completion(
                 },
             )
             completion_response = llm_model.predict(prompt, **optional_params).text
+        elif mode == "custom":
+            """
+            Vertex AI Model Garden
+            """
+            request_str += (
+                f"client.predict(endpoint={llm_model}, instances={instances})\n"
+            )
+            ## LOGGING
+            logging_obj.pre_call(
+                input=prompt,
+                api_key=None,
+                additional_args={
+                    "complete_input_dict": optional_params,
+                    "request_str": request_str,
+                },
+            )
+            response = client.predict(
+                endpoint=llm_model,
+                instances=instances,
+            ).predictions
+            completion_response = response[0]
+            if (
+                isinstance(completion_response, str)
+                and "\nOutput:\n" in completion_response
+            ):
+                completion_response = completion_response.split("\nOutput:\n", 1)[1]
+            if "stream" in optional_params and optional_params["stream"] == True:
+                response = TextStreamer(completion_response)
+                return response

         ## LOGGING
         logging_obj.post_call(
             input=prompt, api_key=None, original_response=completion_response
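
Reviewer note: model-garden predictions come back as raw strings that may echo the input, so the hunk above strips everything up to the first `"\nOutput:\n"` marker. On a fabricated prediction string:

```python
# The response trimming applied above, shown on a fabricated prediction.
completion_response = "prompt echo...\nOutput:\nThe sky is blue because..."
if (
    isinstance(completion_response, str)
    and "\nOutput:\n" in completion_response
):
    completion_response = completion_response.split("\nOutput:\n", 1)[1]
print(completion_response)  # "The sky is blue because..."
```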
@@ -536,6 +625,10 @@ async def async_completion(
     encoding=None,
     messages=None,
     print_verbose=None,
+    client_options=None,
+    instances=None,
+    vertex_project=None,
+    vertex_location=None,
     **optional_params,
 ):
     """

@@ -624,7 +717,43 @@ async def async_completion(
             )
             response_obj = await llm_model.predict_async(prompt, **optional_params)
             completion_response = response_obj.text
+        elif mode == "custom":
+            """
+            Vertex AI Model Garden
+            """
+            from google.cloud import aiplatform
+
+            async_client = aiplatform.gapic.PredictionServiceAsyncClient(
+                client_options=client_options
+            )
+            llm_model = async_client.endpoint_path(
+                project=vertex_project, location=vertex_location, endpoint=model
+            )
+
+            request_str += (
+                f"client.predict(endpoint={llm_model}, instances={instances})\n"
+            )
+            ## LOGGING
+            logging_obj.pre_call(
+                input=prompt,
+                api_key=None,
+                additional_args={
+                    "complete_input_dict": optional_params,
+                    "request_str": request_str,
+                },
+            )
+
+            response_obj = await async_client.predict(
+                endpoint=llm_model,
+                instances=instances,
+            )
+            response = response_obj.predictions
+            completion_response = response[0]
+            if (
+                isinstance(completion_response, str)
+                and "\nOutput:\n" in completion_response
+            ):
+                completion_response = completion_response.split("\nOutput:\n", 1)[1]

         ## LOGGING
         logging_obj.post_call(
             input=prompt, api_key=None, original_response=completion_response
@@ -654,14 +783,12 @@ async def async_completion(
         # init prompt tokens
         # this block attempts to get usage from response_obj if it exists, if not it uses the litellm token counter
         prompt_tokens, completion_tokens, total_tokens = 0, 0, 0
-        if response_obj is not None:
-            if hasattr(response_obj, "usage_metadata") and hasattr(
-                response_obj.usage_metadata, "prompt_token_count"
-            ):
-                prompt_tokens = response_obj.usage_metadata.prompt_token_count
-                completion_tokens = (
-                    response_obj.usage_metadata.candidates_token_count
-                )
+        if response_obj is not None and (
+            hasattr(response_obj, "usage_metadata")
+            and hasattr(response_obj.usage_metadata, "prompt_token_count")
+        ):
+            prompt_tokens = response_obj.usage_metadata.prompt_token_count
+            completion_tokens = response_obj.usage_metadata.candidates_token_count
         else:
             prompt_tokens = len(encoding.encode(prompt))
             completion_tokens = len(

@@ -690,8 +817,13 @@ async def async_streaming(
     model_response: ModelResponse,
     logging_obj=None,
     request_str=None,
     encoding=None,
     messages=None,
+    print_verbose=None,
+    client_options=None,
+    instances=None,
+    vertex_project=None,
+    vertex_location=None,
     **optional_params,
 ):
     """
@@ -760,17 +892,198 @@ async def async_streaming(
             },
         )
         response = llm_model.predict_streaming_async(prompt, **optional_params)
+    elif mode == "custom":
+        from google.cloud import aiplatform
+
+        async_client = aiplatform.gapic.PredictionServiceAsyncClient(
+            client_options=client_options
+        )
+        llm_model = async_client.endpoint_path(
+            project=vertex_project, location=vertex_location, endpoint=model
+        )
+
+        request_str += f"client.predict(endpoint={llm_model}, instances={instances})\n"
+        ## LOGGING
+        logging_obj.pre_call(
+            input=prompt,
+            api_key=None,
+            additional_args={
+                "complete_input_dict": optional_params,
+                "request_str": request_str,
+            },
+        )
+
+        response_obj = await async_client.predict(
+            endpoint=llm_model,
+            instances=instances,
+        )
+        response = response_obj.predictions
+        completion_response = response[0]
+        if (
+            isinstance(completion_response, str)
+            and "\nOutput:\n" in completion_response
+        ):
+            completion_response = completion_response.split("\nOutput:\n", 1)[1]
+        if "stream" in optional_params and optional_params["stream"] == True:
+            response = TextStreamer(completion_response)

     streamwrapper = CustomStreamWrapper(
         completion_stream=response,
         model=model,
         custom_llm_provider="vertex_ai",
         logging_obj=logging_obj,
     )
-    async for transformed_chunk in streamwrapper:
-        yield transformed_chunk
+    return streamwrapper


-def embedding():
-    # logic for parsing in - calling - parsing out model embedding calls
-    pass
+def embedding(
+    model: str,
+    input: Union[list, str],
+    api_key: Optional[str] = None,
+    logging_obj=None,
+    model_response=None,
+    optional_params=None,
+    encoding=None,
+    vertex_project=None,
+    vertex_location=None,
+    aembedding=False,
+):
+    try:
+        import vertexai
+    except:
+        raise VertexAIError(
+            status_code=400,
+            message="vertexai import failed please run `pip install google-cloud-aiplatform`",
+        )
+
+    from vertexai.language_models import TextEmbeddingModel
+    import google.auth
+
+    ## Load credentials with the correct quota project ref: https://github.com/googleapis/python-aiplatform/issues/2557#issuecomment-1709284744
+    try:
+        creds, _ = google.auth.default(quota_project_id=vertex_project)
+        vertexai.init(
+            project=vertex_project, location=vertex_location, credentials=creds
+        )
+    except Exception as e:
+        raise VertexAIError(status_code=401, message=str(e))
+
+    if isinstance(input, str):
+        input = [input]
+
+    try:
+        llm_model = TextEmbeddingModel.from_pretrained(model)
+    except Exception as e:
+        raise VertexAIError(status_code=422, message=str(e))
+
+    if aembedding == True:
+        return async_embedding(
+            model=model,
+            client=llm_model,
+            input=input,
+            logging_obj=logging_obj,
+            model_response=model_response,
+            optional_params=optional_params,
+            encoding=encoding,
+        )
+
+    request_str = f"""embeddings = llm_model.get_embeddings({input})"""
+    ## LOGGING PRE-CALL
+    logging_obj.pre_call(
+        input=input,
+        api_key=None,
+        additional_args={
+            "complete_input_dict": optional_params,
+            "request_str": request_str,
+        },
+    )
+
+    try:
+        embeddings = llm_model.get_embeddings(input)
+    except Exception as e:
+        raise VertexAIError(status_code=500, message=str(e))
+
+    ## LOGGING POST-CALL
+    logging_obj.post_call(input=input, api_key=None, original_response=embeddings)
+    ## Populate OpenAI compliant dictionary
+    embedding_response = []
+    for idx, embedding in enumerate(embeddings):
+        embedding_response.append(
+            {
+                "object": "embedding",
+                "index": idx,
+                "embedding": embedding.values,
+            }
+        )
+    model_response["object"] = "list"
+    model_response["data"] = embedding_response
+    model_response["model"] = model
+    input_tokens = 0
+
+    input_str = "".join(input)
+
+    input_tokens += len(encoding.encode(input_str))
+
+    usage = Usage(
+        prompt_tokens=input_tokens, completion_tokens=0, total_tokens=input_tokens
+    )
+    model_response.usage = usage
+
+    return model_response
+
+
+async def async_embedding(
+    model: str,
+    input: Union[list, str],
+    logging_obj=None,
+    model_response=None,
+    optional_params=None,
+    encoding=None,
+    client=None,
+):
+    """
+    Async embedding implementation
+    """
+    request_str = f"""embeddings = llm_model.get_embeddings({input})"""
+    ## LOGGING PRE-CALL
+    logging_obj.pre_call(
+        input=input,
+        api_key=None,
+        additional_args={
+            "complete_input_dict": optional_params,
+            "request_str": request_str,
+        },
+    )
+
+    try:
+        embeddings = await client.get_embeddings_async(input)
+    except Exception as e:
+        raise VertexAIError(status_code=500, message=str(e))
+
+    ## LOGGING POST-CALL
+    logging_obj.post_call(input=input, api_key=None, original_response=embeddings)
+    ## Populate OpenAI compliant dictionary
+    embedding_response = []
+    for idx, embedding in enumerate(embeddings):
+        embedding_response.append(
+            {
+                "object": "embedding",
+                "index": idx,
+                "embedding": embedding.values,
+            }
+        )
+    model_response["object"] = "list"
+    model_response["data"] = embedding_response
+    model_response["model"] = model
+    input_tokens = 0
+
+    input_str = "".join(input)
+
+    input_tokens += len(encoding.encode(input_str))
+
+    usage = Usage(
+        prompt_tokens=input_tokens, completion_tokens=0, total_tokens=input_tokens
+    )
+    model_response.usage = usage
+
+    return model_response
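
Reviewer note: the new `embedding` / `async_embedding` pair above reshapes Vertex `TextEmbeddingModel` results into the OpenAI embedding schema. The shaping step on its own, with `SimpleNamespace` standing in for vertexai's embedding objects (illustrative):

```python
# OpenAI-style shaping of Vertex embeddings, as in the functions above.
from types import SimpleNamespace

embeddings = [SimpleNamespace(values=[0.1, 0.2]), SimpleNamespace(values=[0.3, 0.4])]
embedding_response = [
    {"object": "embedding", "index": idx, "embedding": e.values}
    for idx, e in enumerate(embeddings)
]
print(embedding_response)
```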
litellm/main.py (160 changed lines)

@@ -15,7 +15,7 @@ import dotenv, traceback, random, asyncio, time, contextvars
 from copy import deepcopy
 import httpx
 import litellm
-
+from ._logging import verbose_logger
 from litellm import (  # type: ignore
     client,
     exception_type,

@@ -31,6 +31,7 @@ from litellm.utils import (
     get_llm_provider,
     get_api_key,
     mock_completion_streaming_obj,
+    async_mock_completion_streaming_obj,
     convert_to_model_response_object,
     token_counter,
     Usage,

@@ -235,6 +236,9 @@ async def acompletion(
         "model_list": model_list,
         "acompletion": True,  # assuming this is a required parameter
     }
+    _, custom_llm_provider, _, _ = get_llm_provider(
+        model=model, api_base=completion_kwargs.get("base_url", None)
+    )
     try:
         # Use a partial function to pass your keyword arguments
        func = partial(completion, **completion_kwargs, **kwargs)

@@ -246,7 +250,6 @@ async def acompletion(
-        _, custom_llm_provider, _, _ = get_llm_provider(
-            model=model, api_base=kwargs.get("api_base", None)
-        )

         if (
             custom_llm_provider == "openai"
             or custom_llm_provider == "azure"

@@ -261,6 +264,7 @@ async def acompletion(
             or custom_llm_provider == "ollama"
             or custom_llm_provider == "ollama_chat"
             or custom_llm_provider == "vertex_ai"
+            or custom_llm_provider in litellm.openai_compatible_providers
         ):  # currently implemented aiohttp calls for just azure, openai, hf, ollama, vertex ai soon all.
             init_response = await loop.run_in_executor(None, func_with_context)
             if isinstance(init_response, dict) or isinstance(

@@ -274,14 +278,10 @@ async def acompletion(
         else:
             # Call the synchronous function using run_in_executor
             response = await loop.run_in_executor(None, func_with_context)  # type: ignore
-            # if kwargs.get("stream", False): # return an async generator
-            #     return _async_streaming(
-            #         response=response,
-            #         model=model,
-            #         custom_llm_provider=custom_llm_provider,
-            #         args=args,
-            #     )
-            # else:
+        if isinstance(response, CustomStreamWrapper):
+            response.set_logging_event_loop(
+                loop=loop
+            )  # sets the logging event loop if the user does sync streaming (e.g. on proxy for sagemaker calls)
         return response
     except Exception as e:
         custom_llm_provider = custom_llm_provider or "openai"
@@ -308,6 +308,7 @@ def mock_completion(
     messages: List,
     stream: Optional[bool] = False,
     mock_response: str = "This is a mock request",
+    logging=None,
     **kwargs,
 ):
     """

@@ -336,6 +337,15 @@ def mock_completion(
         model_response = ModelResponse(stream=stream)
         if stream is True:
             # don't try to access stream object,
+            if kwargs.get("acompletion", False) == True:
+                return CustomStreamWrapper(
+                    completion_stream=async_mock_completion_streaming_obj(
+                        model_response, mock_response=mock_response, model=model
+                    ),
+                    model=model,
+                    custom_llm_provider="openai",
+                    logging_obj=logging,
+                )
             response = mock_completion_streaming_obj(
                 model_response, mock_response=mock_response, model=model
             )
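
Reviewer note: the hunk above makes `mock_completion` return a `CustomStreamWrapper` over an async generator when the call came in through `acompletion`, so mocked streaming works for async callers too. A sketch of how that branch is exercised (assuming litellm's documented `mock_response` kwarg):

```python
# Exercising the async mock-streaming branch added above.
import asyncio
import litellm

async def main():
    resp = await litellm.acompletion(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "hi"}],
        mock_response="This is a mock request",
        stream=True,
    )
    async for chunk in resp:
        print(chunk)

asyncio.run(main())
```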
@@ -455,6 +465,7 @@ def completion(
     num_retries = kwargs.get("num_retries", None)  ## deprecated
     max_retries = kwargs.get("max_retries", None)
     context_window_fallback_dict = kwargs.get("context_window_fallback_dict", None)
+    organization = kwargs.get("organization", None)
     ### CUSTOM MODEL COST ###
     input_cost_per_token = kwargs.get("input_cost_per_token", None)
     output_cost_per_token = kwargs.get("output_cost_per_token", None)

@@ -590,28 +601,43 @@ def completion(
         )
         if model_response is not None and hasattr(model_response, "_hidden_params"):
             model_response._hidden_params["custom_llm_provider"] = custom_llm_provider
+            model_response._hidden_params["region_name"] = kwargs.get(
+                "aws_region_name", None
+            )  # support region-based pricing for bedrock

         ### REGISTER CUSTOM MODEL PRICING -- IF GIVEN ###
         if input_cost_per_token is not None and output_cost_per_token is not None:
+            print_verbose(f"Registering model={model} in model cost map")
             litellm.register_model(
                 {
+                    f"{custom_llm_provider}/{model}": {
+                        "input_cost_per_token": input_cost_per_token,
+                        "output_cost_per_token": output_cost_per_token,
+                        "litellm_provider": custom_llm_provider,
+                    },
                     model: {
                         "input_cost_per_token": input_cost_per_token,
                         "output_cost_per_token": output_cost_per_token,
                         "litellm_provider": custom_llm_provider,
-                    }
+                    },
                 }
             )
-        if (
+        elif (
             input_cost_per_second is not None
         ):  # time based pricing just needs cost in place
             output_cost_per_second = output_cost_per_second or 0.0
             litellm.register_model(
                 {
+                    f"{custom_llm_provider}/{model}": {
+                        "input_cost_per_second": input_cost_per_second,
+                        "output_cost_per_second": output_cost_per_second,
+                        "litellm_provider": custom_llm_provider,
+                    },
                     model: {
                         "input_cost_per_second": input_cost_per_second,
                         "output_cost_per_second": output_cost_per_second,
                         "litellm_provider": custom_llm_provider,
-                    }
+                    },
                 }
             )
         ### BUILD CUSTOM PROMPT TEMPLATE -- IF GIVEN ###
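
Reviewer note: the register_model hunk above now writes custom pricing under both the bare model name and the provider-scoped key. Calling the public API directly (costs illustrative):

```python
# Custom pricing registration matching the structure above; costs illustrative.
import litellm

litellm.register_model(
    {
        "openai/my-custom-model": {
            "input_cost_per_token": 8e-07,
            "output_cost_per_token": 1.6e-06,
            "litellm_provider": "openai",
        },
        "my-custom-model": {
            "input_cost_per_token": 8e-07,
            "output_cost_per_token": 1.6e-06,
            "litellm_provider": "openai",
        },
    }
)
```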
@@ -702,7 +728,12 @@ def completion(
         )
         if mock_response:
             return mock_completion(
-                model, messages, stream=stream, mock_response=mock_response
+                model,
+                messages,
+                stream=stream,
+                mock_response=mock_response,
+                logging=logging,
+                acompletion=acompletion,
             )
         if custom_llm_provider == "azure":
             # azure configs

@@ -777,6 +808,7 @@ def completion(
             or custom_llm_provider == "anyscale"
             or custom_llm_provider == "mistral"
             or custom_llm_provider == "openai"
+            or custom_llm_provider == "together_ai"
             or "ft:gpt-3.5-turbo" in model  # finetune gpt-3.5-turbo
         ):  # allow user to make an openai call with a custom base
             # note: if a user sets a custom base - we should ensure this works

@@ -788,7 +820,8 @@ def completion(
                 or "https://api.openai.com/v1"
             )
             openai.organization = (
-                litellm.organization
+                organization
+                or litellm.organization
                 or get_secret("OPENAI_ORGANIZATION")
                 or None  # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
             )

@@ -828,6 +861,7 @@ def completion(
                 timeout=timeout,
                 custom_prompt_dict=custom_prompt_dict,
                 client=client,  # pass AsyncOpenAI, OpenAI client
+                organization=organization,
             )
         except Exception as e:
             ## LOGGING - log the original exception returned
@@ -1314,6 +1348,9 @@ def completion(
             or ("togethercomputer" in model)
             or (model in litellm.together_ai_models)
         ):
+            """
+            Deprecated. We now do together ai calls via the openai client - https://docs.together.ai/docs/openai-api-compatibility
+            """
             custom_llm_provider = "together_ai"
             together_ai_key = (
                 api_key

@@ -1421,9 +1458,15 @@ def completion(
                 return response
             response = model_response
         elif custom_llm_provider == "vertex_ai":
-            vertex_ai_project = litellm.vertex_project or get_secret("VERTEXAI_PROJECT")
-            vertex_ai_location = litellm.vertex_location or get_secret(
-                "VERTEXAI_LOCATION"
+            vertex_ai_project = (
+                optional_params.pop("vertex_ai_project", None)
+                or litellm.vertex_project
+                or get_secret("VERTEXAI_PROJECT")
+            )
+            vertex_ai_location = (
+                optional_params.pop("vertex_ai_location", None)
+                or litellm.vertex_location
+                or get_secret("VERTEXAI_LOCATION")
             )

             model_response = vertex_ai.completion(
@@ -1514,11 +1557,6 @@ def completion(
             if (
                 "stream" in optional_params and optional_params["stream"] == True
             ):  ## [BETA]
-                # sagemaker does not support streaming as of now so we're faking streaming:
-                # https://discuss.huggingface.co/t/streaming-output-text-when-deploying-on-sagemaker/39611
-                # "SageMaker is currently not supporting streaming responses."
-
-                # fake streaming for sagemaker
                 print_verbose(f"ENTERS SAGEMAKER CUSTOMSTREAMWRAPPER")
                 from .llms.sagemaker import TokenIterator

@@ -1529,6 +1567,12 @@ def completion(
                     custom_llm_provider="sagemaker",
                     logging_obj=logging,
                 )
+                ## LOGGING
+                logging.post_call(
+                    input=messages,
+                    api_key=None,
+                    original_response=response,
+                )
                 return response

         ## RESPONSE OBJECT

@@ -1547,6 +1591,7 @@ def completion(
             logger_fn=logger_fn,
             encoding=encoding,
             logging_obj=logging,
+            timeout=timeout,
         )

         if "stream" in optional_params and optional_params["stream"] == True:
@@ -2191,6 +2236,7 @@ async def aembedding(*args, **kwargs):
         or custom_llm_provider == "deepinfra"
         or custom_llm_provider == "perplexity"
         or custom_llm_provider == "ollama"
+        or custom_llm_provider == "vertex_ai"
     ):  # currently implemented aiohttp calls for just azure and openai, soon all.
         # Await normally
         init_response = await loop.run_in_executor(None, func_with_context)

@@ -2221,6 +2267,7 @@ def embedding(
     model,
     input=[],
     # Optional params
+    dimensions: Optional[int] = None,
     timeout=600,  # default to 10 minutes
     # set api_base, api_version, api_key
     api_base: Optional[str] = None,

@@ -2241,6 +2288,7 @@ def embedding(
     Parameters:
     - model: The embedding model to use.
     - input: The input for which embeddings are to be generated.
+    - dimensions: The number of dimensions the resulting output embeddings should have. Only supported in text-embedding-3 and later models.
     - timeout: The timeout value for the API call, default 10 mins
     - litellm_call_id: The call ID for litellm logging.
     - litellm_logging_obj: The litellm logging object.

@@ -2274,6 +2322,7 @@ def embedding(
     output_cost_per_second = kwargs.get("output_cost_per_second", None)
     openai_params = [
         "user",
+        "dimensions",
         "request_timeout",
         "api_base",
         "api_version",

@@ -2342,7 +2391,9 @@ def embedding(
             api_key=api_key,
         )
     optional_params = get_optional_params_embeddings(
+        model=model,
         user=user,
+        dimensions=dimensions,
         encoding_format=encoding_format,
         custom_llm_provider=custom_llm_provider,
         **non_default_params,
@@ -2461,7 +2512,7 @@ def embedding(
                 client=client,
                 aembedding=aembedding,
             )
-        elif model in litellm.cohere_embedding_models:
+        elif custom_llm_provider == "cohere":
             cohere_key = (
                 api_key
                 or litellm.cohere_key

@@ -2503,6 +2554,29 @@ def embedding(
                 optional_params=optional_params,
                 model_response=EmbeddingResponse(),
             )
+        elif custom_llm_provider == "vertex_ai":
+            vertex_ai_project = (
+                optional_params.pop("vertex_ai_project", None)
+                or litellm.vertex_project
+                or get_secret("VERTEXAI_PROJECT")
+            )
+            vertex_ai_location = (
+                optional_params.pop("vertex_ai_location", None)
+                or litellm.vertex_location
+                or get_secret("VERTEXAI_LOCATION")
+            )
+
+            response = vertex_ai.embedding(
+                model=model,
+                input=input,
+                encoding=encoding,
+                logging_obj=logging,
+                optional_params=optional_params,
+                model_response=EmbeddingResponse(),
+                vertex_project=vertex_ai_project,
+                vertex_location=vertex_ai_location,
+                aembedding=aembedding,
+            )
         elif custom_llm_provider == "oobabooga":
             response = oobabooga.embedding(
                 model=model,
@@ -3064,7 +3138,7 @@ def image_generation(
         custom_llm_provider=custom_llm_provider,
         **non_default_params,
     )
-    logging = litellm_logging_obj
+    logging: Logging = litellm_logging_obj
     logging.update_environment_variables(
         model=model,
         user=user,

@@ -3128,7 +3202,18 @@ def image_generation(
                 model_response=model_response,
                 aimg_generation=aimg_generation,
             )
+        elif custom_llm_provider == "bedrock":
+            if model is None:
+                raise Exception("Model needs to be set for bedrock")
+            model_response = bedrock.image_generation(
+                model=model,
+                prompt=prompt,
+                timeout=timeout,
+                logging_obj=litellm_logging_obj,
+                optional_params=optional_params,
+                model_response=model_response,
+                aimg_generation=aimg_generation,
+            )
         return model_response
     except Exception as e:
         ## Map to OpenAI Exception
@@ -3164,6 +3249,9 @@ async def ahealth_check(
         if model is None:
             raise Exception("model not set")

+        if model in litellm.model_cost and mode is None:
+            mode = litellm.model_cost[model]["mode"]
+
         model, custom_llm_provider, _, _ = get_llm_provider(model=model)
         mode = mode or "chat"  # default to chat completion calls

@@ -3210,6 +3298,7 @@ async def ahealth_check(
             or custom_llm_provider == "text-completion-openai"
         ):
             api_key = model_params.get("api_key") or get_secret("OPENAI_API_KEY")
+            organization = model_params.get("organization")

             timeout = (
                 model_params.get("timeout")

@@ -3227,8 +3316,12 @@ async def ahealth_check(
                 mode=mode,
                 prompt=prompt,
                 input=input,
+                organization=organization,
             )
         else:
+            model_params["cache"] = {
+                "no-cache": True
+            }  # don't used cached responses for making health check calls
             if mode == "embedding":
                 model_params.pop("messages", None)
                 model_params["input"] = input

@@ -3244,6 +3337,10 @@ async def ahealth_check(
         response = {}  # args like remaining ratelimit etc.
         return response
     except Exception as e:
+        if model not in litellm.model_cost and mode is None:
+            raise Exception(
+                "Missing `mode`. Set the `mode` for the model - https://docs.litellm.ai/docs/proxy/health#embedding-models"
+            )
         return {"error": str(e)}


@@ -3251,6 +3348,7 @@ async def ahealth_check(
 ## Set verbose to true -> ```litellm.set_verbose = True```
 def print_verbose(print_statement):
     try:
+        verbose_logger.debug(print_statement)
         if litellm.set_verbose:
             print(print_statement)  # noqa
     except:
@@ -3342,6 +3440,16 @@ def stream_chunk_builder(
     chunks: list, messages: Optional[list] = None, start_time=None, end_time=None
 ):
     model_response = litellm.ModelResponse()
+    ### SORT CHUNKS BASED ON CREATED ORDER ##
+    print_verbose("Goes into checking if chunk has hiddden created at param")
+    if chunks[0]._hidden_params.get("created_at", None):
+        print_verbose("Chunks have a created at hidden param")
+        # Sort chunks based on created_at in ascending order
+        chunks = sorted(
+            chunks, key=lambda x: x._hidden_params.get("created_at", float("inf"))
+        )
+        print_verbose("Chunks sorted")
+
     # set hidden params from chunk to model_response
     if model_response is not None and hasattr(model_response, "_hidden_params"):
         model_response._hidden_params = chunks[0].get("_hidden_params", {})
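
Reviewer note: `stream_chunk_builder` above now re-sorts chunks by the `created_at` hidden param before stitching them into a single response, so out-of-order delivery doesn't scramble the rebuilt message. The ordering rule in isolation, with `SimpleNamespace` standing in for chunk objects:

```python
# The chunk-ordering rule added above; chunks without a timestamp sort last.
from types import SimpleNamespace

chunks = [
    SimpleNamespace(_hidden_params={"created_at": 3}),
    SimpleNamespace(_hidden_params={"created_at": 1}),
    SimpleNamespace(_hidden_params={}),
]
chunks = sorted(
    chunks, key=lambda x: x._hidden_params.get("created_at", float("inf"))
)
print([c._hidden_params.get("created_at") for c in chunks])  # [1, 3, None]
```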
litellm/proxy/_experimental/out/404.html (new file, 1 line)

@@ -0,0 +1 @@
+self.__BUILD_MANIFEST={__rewrites:{afterFiles:[],beforeFiles:[],fallback:[]},"/_error":["static/chunks/pages/_error-d6107f1aac0c574c.js"],sortedPages:["/_app","/_error"]},self.__BUILD_MANIFEST_CB&&self.__BUILD_MANIFEST_CB();

@@ -0,0 +1 @@
+self.__SSG_MANIFEST=new Set([]);self.__SSG_MANIFEST_CB&&self.__SSG_MANIFEST_CB()