Merge branch 'BerriAI:main' into main

Author: Mikio Stewart, 2024-08-20 11:52:26 -07:00 (committed by GitHub)
Commit: d71d19be1e
547 changed files with 47310 additions and 24984 deletions


@@ -47,8 +47,8 @@ jobs:
 pip install opentelemetry-api==1.25.0
 pip install opentelemetry-sdk==1.25.0
 pip install opentelemetry-exporter-otlp==1.25.0
-pip install openai
+pip install openai==1.40.0
-pip install prisma
+pip install prisma==0.11.0
 pip install "detect_secrets==1.5.0"
 pip install "httpx==0.24.1"
 pip install fastapi
@@ -125,6 +125,7 @@ jobs:
 pip install tiktoken
 pip install aiohttp
 pip install click
+pip install "boto3==1.34.34"
 pip install jinja2
 pip install tokenizers
 pip install openai
@@ -165,7 +166,6 @@ jobs:
 pip install "pytest==7.3.1"
 pip install "pytest-asyncio==0.21.1"
 pip install aiohttp
-pip install openai
 python -m pip install --upgrade pip
 python -m pip install -r .circleci/requirements.txt
 pip install "pytest==7.3.1"
@@ -190,6 +190,7 @@ jobs:
 pip install "aiodynamo==23.10.1"
 pip install "asyncio==3.4.3"
 pip install "PyGithub==1.59.1"
+pip install "openai==1.40.0"
 # Run pytest and generate JUnit XML report
 - run:
 name: Build Docker image
@@ -208,6 +209,9 @@ jobs:
 -e AZURE_EUROPE_API_KEY=$AZURE_EUROPE_API_KEY \
 -e MISTRAL_API_KEY=$MISTRAL_API_KEY \
 -e AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
+-e GROQ_API_KEY=$GROQ_API_KEY \
+-e ANTHROPIC_API_KEY=$ANTHROPIC_API_KEY \
+-e COHERE_API_KEY=$COHERE_API_KEY \
 -e AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
 -e AWS_REGION_NAME=$AWS_REGION_NAME \
 -e AUTO_INFER_REGION=True \
@@ -280,10 +284,11 @@ jobs:
 pip install aiohttp
 pip install openai
 python -m pip install --upgrade pip
-python -m pip install -r .circleci/requirements.txt
+pip install "pydantic==2.7.1"
 pip install "pytest==7.3.1"
 pip install "pytest-mock==3.12.0"
 pip install "pytest-asyncio==0.21.1"
+pip install "boto3==1.34.34"
 pip install mypy
 pip install pyarrow
 pip install numpydoc
@@ -312,6 +317,10 @@ jobs:
 -e OPENAI_API_KEY=$OPENAI_API_KEY \
 -e LITELLM_LICENSE=$LITELLM_LICENSE \
 -e OTEL_EXPORTER="in_memory" \
+-e APORIA_API_BASE_2=$APORIA_API_BASE_2 \
+-e APORIA_API_KEY_2=$APORIA_API_KEY_2 \
+-e APORIA_API_BASE_1=$APORIA_API_BASE_1 \
+-e APORIA_API_KEY_1=$APORIA_API_KEY_1 \
 --name my-app \
 -v $(pwd)/litellm/proxy/example_config_yaml/otel_test_config.yaml:/app/config.yaml \
 my-app:latest \
@@ -404,7 +413,7 @@ jobs:
 circleci step halt
 fi
 - run:
-name: Trigger Github Action for new Docker Container
+name: Trigger Github Action for new Docker Container + Trigger Stable Release Testing
 command: |
 echo "Install TOML package."
 python3 -m pip install toml
@@ -415,7 +424,8 @@ jobs:
 -H "Authorization: Bearer $GITHUB_TOKEN" \
 "https://api.github.com/repos/BerriAI/litellm/actions/workflows/ghcr_deploy.yml/dispatches" \
 -d "{\"ref\":\"main\", \"inputs\":{\"tag\":\"v${VERSION}\", \"commit_hash\":\"$CIRCLE_SHA1\"}}"
+echo "triggering stable release server for version ${VERSION} and commit ${CIRCLE_SHA1}"
+curl -X POST "https://proxyloadtester-production.up.railway.app/start/load/test?version=${VERSION}&commit_hash=${CIRCLE_SHA1}"
 workflows:
 version: 2
 build_and_test:
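
The new lines at the end of this hunk trigger stable-release load testing over plain HTTP once the workflow dispatch succeeds. A minimal Python sketch of the same call, assuming the `requests` package is available; the endpoint and query parameters are taken directly from the added curl line:

```python
# Sketch: trigger the stable-release load test the same way the added curl step does.
# Assumes `requests` is installed; VERSION and CIRCLE_SHA1 come from the CI environment.
import os
import requests

version = os.environ.get("VERSION", "0.0.0")            # set earlier in the real job
commit_hash = os.environ.get("CIRCLE_SHA1", "unknown")

resp = requests.post(
    "https://proxyloadtester-production.up.railway.app/start/load/test",
    params={"version": version, "commit_hash": commit_hash},
    timeout=30,
)
print(resp.status_code, resp.text)
```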


@@ -1,11 +1,11 @@
 # used by CI/CD testing
-openai
+openai==1.34.0
 python-dotenv
 tiktoken
 importlib_metadata
 cohere
 redis
 anthropic
-orjson
+orjson==3.9.15
 pydantic==2.7.1
 google-cloud-aiplatform==1.43.0
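
Pinned versions like these can silently drift from what a job actually installs. A small stdlib-only sketch that checks installed versions against the pins; the pin list here is copied from the diff and is purely illustrative:

```python
# Sketch: verify that installed package versions match the CI pins above.
from importlib.metadata import version, PackageNotFoundError

PINS = {"openai": "1.34.0", "orjson": "3.9.15", "pydantic": "2.7.1"}

for package, expected in PINS.items():
    try:
        installed = version(package)
    except PackageNotFoundError:
        print(f"{package}: not installed (expected {expected})")
        continue
    status = "ok" if installed == expected else f"MISMATCH (expected {expected})"
    print(f"{package}: {installed} {status}")
```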


@@ -21,6 +21,14 @@ env:
 # There is a single job in this workflow. It's configured to run on the latest available version of Ubuntu.
 jobs:
+# print commit hash, tag, and release type
+print:
+runs-on: ubuntu-latest
+steps:
+- run: |
+echo "Commit hash: ${{ github.event.inputs.commit_hash }}"
+echo "Tag: ${{ github.event.inputs.tag }}"
+echo "Release type: ${{ github.event.inputs.release_type }}"
 docker-hub-deploy:
 if: github.repository == 'BerriAI/litellm'
 runs-on: ubuntu-latest
@@ -186,6 +194,8 @@ jobs:
 platforms: local,linux/amd64,linux/arm64,linux/arm64/v8
 build-and-push-helm-chart:
+if: github.event.inputs.release_type != 'dev'
+needs: [docker-hub-deploy, build-and-push-image, build-and-push-image-database]
 runs-on: ubuntu-latest
 steps:
 - name: Checkout repository
@@ -203,9 +213,17 @@ jobs:
 - name: lowercase github.repository_owner
 run: |
 echo "REPO_OWNER=`echo ${{github.repository_owner}} | tr '[:upper:]' '[:lower:]'`" >>${GITHUB_ENV}
 - name: Get LiteLLM Latest Tag
 id: current_app_tag
-uses: WyriHaximus/github-action-get-previous-tag@v1.3.0
+shell: bash
+run: |
+LATEST_TAG=$(git describe --tags --exclude "*dev*" --abbrev=0)
+if [ -z "${LATEST_TAG}" ]; then
+echo "latest_tag=latest" | tee -a $GITHUB_OUTPUT
+else
+echo "latest_tag=${LATEST_TAG}" | tee -a $GITHUB_OUTPUT
+fi
 - name: Get last published chart version
 id: current_version
@@ -233,7 +251,7 @@ jobs:
 name: ${{ env.CHART_NAME }}
 repository: ${{ env.REPO_OWNER }}
 tag: ${{ github.event.inputs.chartVersion || steps.bump_version.outputs.next-version || '0.1.0' }}
-app_version: ${{ steps.current_app_tag.outputs.tag || 'latest' }}
+app_version: ${{ steps.current_app_tag.outputs.latest_tag }}
 path: deploy/charts/${{ env.CHART_NAME }}
 registry: ${{ env.REGISTRY }}
 registry_username: ${{ github.actor }}
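
The replacement step resolves the chart's `app_version` from the latest non-dev git tag and falls back to `latest` when no such tag exists. A rough Python equivalent of that shell logic, assuming it runs inside a git checkout with tags fetched (as on the Actions runner):

```python
# Sketch: mirror the workflow's tag-resolution logic in Python.
import subprocess

def latest_non_dev_tag() -> str:
    try:
        tag = subprocess.check_output(
            ["git", "describe", "--tags", "--exclude", "*dev*", "--abbrev=0"],
            text=True,
        ).strip()
    except subprocess.CalledProcessError:
        tag = ""
    # Same fallback as the workflow step: use "latest" when no suitable tag is found.
    return tag or "latest"

print(f"latest_tag={latest_non_dev_tag()}")
```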

.gitignore

@@ -1,5 +1,7 @@
 .venv
 .env
+.newenv
+newenv/*
 litellm/proxy/myenv/*
 litellm_uuid.txt
 __pycache__/


@@ -62,6 +62,11 @@ COPY --from=builder /wheels/ /wheels/
 RUN pip install *.whl /wheels/* --no-index --find-links=/wheels/ && rm -f *.whl && rm -rf /wheels
 # Generate prisma client
+ENV PRISMA_BINARY_CACHE_DIR=/app/prisma
+RUN mkdir -p /.cache
+RUN chmod -R 777 /.cache
+RUN pip install nodejs-bin
+RUN pip install prisma
 RUN prisma generate
 RUN chmod +x entrypoint.sh
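
The added lines point Prisma at a writable binary cache and pre-create `/.cache` with permissive modes before `prisma generate` runs. A small, illustrative startup check (stdlib only; the paths are taken from the diff above and this script is not part of the image) that the prepared directories are actually writable at runtime:

```python
# Sketch: sanity-check the directories the Dockerfile prepares for Prisma.
import os

def check_writable(path: str) -> None:
    exists = os.path.isdir(path)
    writable = exists and os.access(path, os.W_OK)
    print(f"{path}: exists={exists} writable={writable}")

check_writable(os.environ.get("PRISMA_BINARY_CACHE_DIR", "/app/prisma"))
check_writable("/.cache")
```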

Dockerfile.custom_ui (new file)

@@ -0,0 +1,41 @@
# Use the provided base image
FROM ghcr.io/berriai/litellm:litellm_fwd_server_root_path-dev
# Set the working directory to /app
WORKDIR /app
# Install Node.js and npm (adjust version as needed)
RUN apt-get update && apt-get install -y nodejs npm
# Copy the UI source into the container
COPY ./ui/litellm-dashboard /app/ui/litellm-dashboard
# Set an environment variable for UI_BASE_PATH
# This can be overridden at build time
# set UI_BASE_PATH to "<your server root path>/ui"
ENV UI_BASE_PATH="/prod/ui"
# Build the UI with the specified UI_BASE_PATH
WORKDIR /app/ui/litellm-dashboard
RUN npm install
RUN UI_BASE_PATH=$UI_BASE_PATH npm run build
# Create the destination directory
RUN mkdir -p /app/litellm/proxy/_experimental/out
# Move the built files to the appropriate location
# Assuming the build output is in ./out directory
RUN rm -rf /app/litellm/proxy/_experimental/out/* && \
mv ./out/* /app/litellm/proxy/_experimental/out/
# Switch back to the main app directory
WORKDIR /app
# Make sure your entrypoint.sh is executable
RUN chmod +x entrypoint.sh
# Expose the necessary port
EXPOSE 4000/tcp
# Override the CMD instruction with your desired command and arguments
CMD ["--port", "4000", "--config", "config.yaml", "--detailed_debug"]


@@ -62,6 +62,11 @@ RUN pip install PyJWT --no-cache-dir
 RUN chmod +x build_admin_ui.sh && ./build_admin_ui.sh
 # Generate prisma client
+ENV PRISMA_BINARY_CACHE_DIR=/app/prisma
+RUN mkdir -p /.cache
+RUN chmod -R 777 /.cache
+RUN pip install nodejs-bin
+RUN pip install prisma
 RUN prisma generate
 RUN chmod +x entrypoint.sh


@@ -8,10 +8,10 @@
 <img src="https://railway.app/button.svg" alt="Deploy on Railway">
 </a>
 </p>
-<p align="center">Call all LLM APIs using the OpenAI format [Bedrock, Huggingface, VertexAI, TogetherAI, Azure, OpenAI, etc.]
+<p align="center">Call all LLM APIs using the OpenAI format [Bedrock, Huggingface, VertexAI, TogetherAI, Azure, OpenAI, Groq etc.]
 <br>
 </p>
-<h4 align="center"><a href="https://docs.litellm.ai/docs/simple_proxy" target="_blank">OpenAI Proxy Server</a> | <a href="https://docs.litellm.ai/docs/hosted" target="_blank"> Hosted Proxy (Preview)</a> | <a href="https://docs.litellm.ai/docs/enterprise"target="_blank">Enterprise Tier</a></h4>
+<h4 align="center"><a href="https://docs.litellm.ai/docs/simple_proxy" target="_blank">LiteLLM Proxy Server (LLM Gateway)</a> | <a href="https://docs.litellm.ai/docs/hosted" target="_blank"> Hosted Proxy (Preview)</a> | <a href="https://docs.litellm.ai/docs/enterprise"target="_blank">Enterprise Tier</a></h4>
 <h4 align="center">
 <a href="https://pypi.org/project/litellm/" target="_blank">
 <img src="https://img.shields.io/pypi/v/litellm.svg" alt="PyPI Version">
@@ -35,9 +35,9 @@ LiteLLM manages:
 - Translate inputs to provider's `completion`, `embedding`, and `image_generation` endpoints
 - [Consistent output](https://docs.litellm.ai/docs/completion/output), text responses will always be available at `['choices'][0]['message']['content']`
 - Retry/fallback logic across multiple deployments (e.g. Azure/OpenAI) - [Router](https://docs.litellm.ai/docs/routing)
-- Set Budgets & Rate limits per project, api key, model [OpenAI Proxy Server](https://docs.litellm.ai/docs/simple_proxy)
+- Set Budgets & Rate limits per project, api key, model [LiteLLM Proxy Server (LLM Gateway)](https://docs.litellm.ai/docs/simple_proxy)
-[**Jump to OpenAI Proxy Docs**](https://github.com/BerriAI/litellm?tab=readme-ov-file#openai-proxy---docs) <br>
+[**Jump to LiteLLM Proxy (LLM Gateway) Docs**](https://github.com/BerriAI/litellm?tab=readme-ov-file#openai-proxy---docs)
 [**Jump to Supported LLM Providers**](https://github.com/BerriAI/litellm?tab=readme-ov-file#supported-providers-docs)
 🚨 **Stable Release:** Use docker images with the `-stable` tag. These have undergone 12 hour load tests, before being published.
@@ -120,6 +120,7 @@ from litellm import completion
 ## set env variables for logging tools
 os.environ["LUNARY_PUBLIC_KEY"] = "your-lunary-public-key"
+os.environ["HELICONE_API_KEY"] = "your-helicone-auth-key"
 os.environ["LANGFUSE_PUBLIC_KEY"] = ""
 os.environ["LANGFUSE_SECRET_KEY"] = ""
 os.environ["ATHINA_API_KEY"] = "your-athina-api-key"
@@ -127,13 +128,13 @@ os.environ["ATHINA_API_KEY"] = "your-athina-api-key"
 os.environ["OPENAI_API_KEY"]
 # set callbacks
-litellm.success_callback = ["lunary", "langfuse", "athina"] # log input/output to lunary, langfuse, supabase, athina etc
+litellm.success_callback = ["lunary", "langfuse", "athina", "helicone"] # log input/output to lunary, langfuse, supabase, athina, helicone etc
 #openai call
 response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}])
 ```
-# OpenAI Proxy - ([Docs](https://docs.litellm.ai/docs/simple_proxy))
+# LiteLLM Proxy Server (LLM Gateway) - ([Docs](https://docs.litellm.ai/docs/simple_proxy))
 Track spend + Load Balance across multiple projects
@@ -165,6 +166,10 @@ $ litellm --model huggingface/bigcode/starcoder
 ### Step 2: Make ChatCompletions Request to Proxy
+> [!IMPORTANT]
+> 💡 [Use LiteLLM Proxy with Langchain (Python, JS), OpenAI SDK (Python, JS) Anthropic SDK, Mistral SDK, LlamaIndex, Instructor, Curl](https://docs.litellm.ai/docs/proxy/user_keys)
 ```python
 import openai # openai v1.0.0+
 client = openai.OpenAI(api_key="anything",base_url="http://0.0.0.0:4000") # set proxy to base_url
@@ -190,8 +195,15 @@ git clone https://github.com/BerriAI/litellm
 # Go to folder
 cd litellm
-# Add the master key
+# Add the master key - you can change this after setup
 echo 'LITELLM_MASTER_KEY="sk-1234"' > .env
+# Add the litellm salt key - you cannot change this after adding a model
+# It is used to encrypt / decrypt your LLM API Key credentials
+# We recommned - https://1password.com/password-generator/
+# password generator to get a random hash for litellm salt key
+echo 'LITELLM_SALT_KEY="sk-1234"' > .env
 source .env
 # Start
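
The new README lines introduce `LITELLM_SALT_KEY`, which should be a strong random value because it encrypts the stored LLM API key credentials and cannot change after models are added. A small sketch for generating one and appending it to `.env`; it uses only the stdlib, and the `sk-` prefix and file path are illustrative:

```python
# Sketch: generate a random LITELLM_SALT_KEY and append it to .env.
# Run once, before adding any model, since the salt key must not change afterwards.
import secrets
from pathlib import Path

salt_key = "sk-" + secrets.token_urlsafe(32)   # "sk-" prefix mirrors the README example
env_file = Path(".env")

with env_file.open("a") as f:
    f.write(f'LITELLM_SALT_KEY="{salt_key}"\n')

print(f"wrote LITELLM_SALT_KEY to {env_file.resolve()}")
```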


@@ -0,0 +1,565 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"source": [
"# Migrating to LiteLLM Proxy from OpenAI/Azure OpenAI\n",
"\n",
"Covers:\n",
"\n",
"* /chat/completion\n",
"* /embedding\n",
"\n",
"\n",
"These are **selected examples**. LiteLLM Proxy is **OpenAI-Compatible**, it works with any project that calls OpenAI. Just change the `base_url`, `api_key` and `model`.\n",
"\n",
"For more examples, [go here](https://docs.litellm.ai/docs/proxy/user_keys)\n",
"\n",
"To pass provider-specific args, [go here](https://docs.litellm.ai/docs/completion/provider_specific_params#proxy-usage)\n",
"\n",
"To drop unsupported params (E.g. frequency_penalty for bedrock with librechat), [go here](https://docs.litellm.ai/docs/completion/drop_params#openai-proxy-usage)\n"
],
"metadata": {
"id": "kccfk0mHZ4Ad"
}
},
{
"cell_type": "markdown",
"source": [
"## /chat/completion\n",
"\n"
],
"metadata": {
"id": "nmSClzCPaGH6"
}
},
{
"cell_type": "markdown",
"source": [
"### OpenAI Python SDK"
],
"metadata": {
"id": "_vqcjwOVaKpO"
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "x1e_Ok3KZzeP"
},
"outputs": [],
"source": [
"import openai\n",
"client = openai.OpenAI(\n",
" api_key=\"anything\",\n",
" base_url=\"http://0.0.0.0:4000\"\n",
")\n",
"\n",
"# request sent to model set on litellm proxy, `litellm --model`\n",
"response = client.chat.completions.create(\n",
" model=\"gpt-3.5-turbo\",\n",
" messages = [\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": \"this is a test request, write a short poem\"\n",
" }\n",
" ],\n",
" extra_body={ # pass in any provider-specific param, if not supported by openai, https://docs.litellm.ai/docs/completion/input#provider-specific-params\n",
" \"metadata\": { # 👈 use for logging additional params (e.g. to langfuse)\n",
" \"generation_name\": \"ishaan-generation-openai-client\",\n",
" \"generation_id\": \"openai-client-gen-id22\",\n",
" \"trace_id\": \"openai-client-trace-id22\",\n",
" \"trace_user_id\": \"openai-client-user-id2\"\n",
" }\n",
" }\n",
")\n",
"\n",
"print(response)"
]
},
{
"cell_type": "markdown",
"source": [
"## Function Calling"
],
"metadata": {
"id": "AqkyKk9Scxgj"
}
},
{
"cell_type": "code",
"source": [
"from openai import OpenAI\n",
"client = OpenAI(\n",
" api_key=\"sk-1234\", # [OPTIONAL] set if you set one on proxy, else set \"\"\n",
" base_url=\"http://0.0.0.0:4000\",\n",
")\n",
"\n",
"tools = [\n",
" {\n",
" \"type\": \"function\",\n",
" \"function\": {\n",
" \"name\": \"get_current_weather\",\n",
" \"description\": \"Get the current weather in a given location\",\n",
" \"parameters\": {\n",
" \"type\": \"object\",\n",
" \"properties\": {\n",
" \"location\": {\n",
" \"type\": \"string\",\n",
" \"description\": \"The city and state, e.g. San Francisco, CA\",\n",
" },\n",
" \"unit\": {\"type\": \"string\", \"enum\": [\"celsius\", \"fahrenheit\"]},\n",
" },\n",
" \"required\": [\"location\"],\n",
" },\n",
" }\n",
" }\n",
"]\n",
"messages = [{\"role\": \"user\", \"content\": \"What's the weather like in Boston today?\"}]\n",
"completion = client.chat.completions.create(\n",
" model=\"gpt-4o\", # use 'model_name' from config.yaml\n",
" messages=messages,\n",
" tools=tools,\n",
" tool_choice=\"auto\"\n",
")\n",
"\n",
"print(completion)\n"
],
"metadata": {
"id": "wDg10VqLczE1"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"### Azure OpenAI Python SDK"
],
"metadata": {
"id": "YYoxLloSaNWW"
}
},
{
"cell_type": "code",
"source": [
"import openai\n",
"client = openai.AzureOpenAI(\n",
" api_key=\"anything\",\n",
" base_url=\"http://0.0.0.0:4000\"\n",
")\n",
"\n",
"# request sent to model set on litellm proxy, `litellm --model`\n",
"response = client.chat.completions.create(\n",
" model=\"gpt-3.5-turbo\",\n",
" messages = [\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": \"this is a test request, write a short poem\"\n",
" }\n",
" ],\n",
" extra_body={ # pass in any provider-specific param, if not supported by openai, https://docs.litellm.ai/docs/completion/input#provider-specific-params\n",
" \"metadata\": { # 👈 use for logging additional params (e.g. to langfuse)\n",
" \"generation_name\": \"ishaan-generation-openai-client\",\n",
" \"generation_id\": \"openai-client-gen-id22\",\n",
" \"trace_id\": \"openai-client-trace-id22\",\n",
" \"trace_user_id\": \"openai-client-user-id2\"\n",
" }\n",
" }\n",
")\n",
"\n",
"print(response)"
],
"metadata": {
"id": "yA1XcgowaSRy"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"### Langchain Python"
],
"metadata": {
"id": "yl9qhDvnaTpL"
}
},
{
"cell_type": "code",
"source": [
"from langchain.chat_models import ChatOpenAI\n",
"from langchain.prompts.chat import (\n",
" ChatPromptTemplate,\n",
" HumanMessagePromptTemplate,\n",
" SystemMessagePromptTemplate,\n",
")\n",
"from langchain.schema import HumanMessage, SystemMessage\n",
"import os\n",
"\n",
"os.environ[\"OPENAI_API_KEY\"] = \"anything\"\n",
"\n",
"chat = ChatOpenAI(\n",
" openai_api_base=\"http://0.0.0.0:4000\",\n",
" model = \"gpt-3.5-turbo\",\n",
" temperature=0.1,\n",
" extra_body={\n",
" \"metadata\": {\n",
" \"generation_name\": \"ishaan-generation-langchain-client\",\n",
" \"generation_id\": \"langchain-client-gen-id22\",\n",
" \"trace_id\": \"langchain-client-trace-id22\",\n",
" \"trace_user_id\": \"langchain-client-user-id2\"\n",
" }\n",
" }\n",
")\n",
"\n",
"messages = [\n",
" SystemMessage(\n",
" content=\"You are a helpful assistant that im using to make a test request to.\"\n",
" ),\n",
" HumanMessage(\n",
" content=\"test from litellm. tell me why it's amazing in 1 sentence\"\n",
" ),\n",
"]\n",
"response = chat(messages)\n",
"\n",
"print(response)"
],
"metadata": {
"id": "5MUZgSquaW5t"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"### Curl"
],
"metadata": {
"id": "B9eMgnULbRaz"
}
},
{
"cell_type": "markdown",
"source": [
"\n",
"\n",
"```\n",
"curl -X POST 'http://0.0.0.0:4000/chat/completions' \\\n",
" -H 'Content-Type: application/json' \\\n",
" -d '{\n",
" \"model\": \"gpt-3.5-turbo\",\n",
" \"messages\": [\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": \"what llm are you\"\n",
" }\n",
" ],\n",
" \"metadata\": {\n",
" \"generation_name\": \"ishaan-test-generation\",\n",
" \"generation_id\": \"gen-id22\",\n",
" \"trace_id\": \"trace-id22\",\n",
" \"trace_user_id\": \"user-id2\"\n",
" }\n",
"}'\n",
"```\n",
"\n"
],
"metadata": {
"id": "VWCCk5PFcmhS"
}
},
{
"cell_type": "markdown",
"source": [
"### LlamaIndex"
],
"metadata": {
"id": "drBAm2e1b6xe"
}
},
{
"cell_type": "code",
"source": [
"import os, dotenv\n",
"\n",
"from llama_index.llms import AzureOpenAI\n",
"from llama_index.embeddings import AzureOpenAIEmbedding\n",
"from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext\n",
"\n",
"llm = AzureOpenAI(\n",
" engine=\"azure-gpt-3.5\", # model_name on litellm proxy\n",
" temperature=0.0,\n",
" azure_endpoint=\"http://0.0.0.0:4000\", # litellm proxy endpoint\n",
" api_key=\"sk-1234\", # litellm proxy API Key\n",
" api_version=\"2023-07-01-preview\",\n",
")\n",
"\n",
"embed_model = AzureOpenAIEmbedding(\n",
" deployment_name=\"azure-embedding-model\",\n",
" azure_endpoint=\"http://0.0.0.0:4000\",\n",
" api_key=\"sk-1234\",\n",
" api_version=\"2023-07-01-preview\",\n",
")\n",
"\n",
"\n",
"documents = SimpleDirectoryReader(\"llama_index_data\").load_data()\n",
"service_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model)\n",
"index = VectorStoreIndex.from_documents(documents, service_context=service_context)\n",
"\n",
"query_engine = index.as_query_engine()\n",
"response = query_engine.query(\"What did the author do growing up?\")\n",
"print(response)\n"
],
"metadata": {
"id": "d0bZcv8fb9mL"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"### Langchain JS"
],
"metadata": {
"id": "xypvNdHnb-Yy"
}
},
{
"cell_type": "code",
"source": [
"import { ChatOpenAI } from \"@langchain/openai\";\n",
"\n",
"\n",
"const model = new ChatOpenAI({\n",
" modelName: \"gpt-4\",\n",
" openAIApiKey: \"sk-1234\",\n",
" modelKwargs: {\"metadata\": \"hello world\"} // 👈 PASS Additional params here\n",
"}, {\n",
" basePath: \"http://0.0.0.0:4000\",\n",
"});\n",
"\n",
"const message = await model.invoke(\"Hi there!\");\n",
"\n",
"console.log(message);\n"
],
"metadata": {
"id": "R55mK2vCcBN2"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"### OpenAI JS"
],
"metadata": {
"id": "nC4bLifCcCiW"
}
},
{
"cell_type": "code",
"source": [
"const { OpenAI } = require('openai');\n",
"\n",
"const openai = new OpenAI({\n",
" apiKey: \"sk-1234\", // This is the default and can be omitted\n",
" baseURL: \"http://0.0.0.0:4000\"\n",
"});\n",
"\n",
"async function main() {\n",
" const chatCompletion = await openai.chat.completions.create({\n",
" messages: [{ role: 'user', content: 'Say this is a test' }],\n",
" model: 'gpt-3.5-turbo',\n",
" }, {\"metadata\": {\n",
" \"generation_name\": \"ishaan-generation-openaijs-client\",\n",
" \"generation_id\": \"openaijs-client-gen-id22\",\n",
" \"trace_id\": \"openaijs-client-trace-id22\",\n",
" \"trace_user_id\": \"openaijs-client-user-id2\"\n",
" }});\n",
"}\n",
"\n",
"main();\n"
],
"metadata": {
"id": "MICH8kIMcFpg"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"### Anthropic SDK"
],
"metadata": {
"id": "D1Q07pEAcGTb"
}
},
{
"cell_type": "code",
"source": [
"import os\n",
"\n",
"from anthropic import Anthropic\n",
"\n",
"client = Anthropic(\n",
" base_url=\"http://localhost:4000\", # proxy endpoint\n",
" api_key=\"sk-s4xN1IiLTCytwtZFJaYQrA\", # litellm proxy virtual key\n",
")\n",
"\n",
"message = client.messages.create(\n",
" max_tokens=1024,\n",
" messages=[\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": \"Hello, Claude\",\n",
" }\n",
" ],\n",
" model=\"claude-3-opus-20240229\",\n",
")\n",
"print(message.content)"
],
"metadata": {
"id": "qBjFcAvgcI3t"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"## /embeddings"
],
"metadata": {
"id": "dFAR4AJGcONI"
}
},
{
"cell_type": "markdown",
"source": [
"### OpenAI Python SDK"
],
"metadata": {
"id": "lgNoM281cRzR"
}
},
{
"cell_type": "code",
"source": [
"import openai\n",
"from openai import OpenAI\n",
"\n",
"# set base_url to your proxy server\n",
"# set api_key to send to proxy server\n",
"client = OpenAI(api_key=\"<proxy-api-key>\", base_url=\"http://0.0.0.0:4000\")\n",
"\n",
"response = client.embeddings.create(\n",
" input=[\"hello from litellm\"],\n",
" model=\"text-embedding-ada-002\"\n",
")\n",
"\n",
"print(response)\n"
],
"metadata": {
"id": "NY3DJhPfcQhA"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"### Langchain Embeddings"
],
"metadata": {
"id": "hmbg-DW6cUZs"
}
},
{
"cell_type": "code",
"source": [
"from langchain.embeddings import OpenAIEmbeddings\n",
"\n",
"embeddings = OpenAIEmbeddings(model=\"sagemaker-embeddings\", openai_api_base=\"http://0.0.0.0:4000\", openai_api_key=\"temp-key\")\n",
"\n",
"\n",
"text = \"This is a test document.\"\n",
"\n",
"query_result = embeddings.embed_query(text)\n",
"\n",
"print(f\"SAGEMAKER EMBEDDINGS\")\n",
"print(query_result[:5])\n",
"\n",
"embeddings = OpenAIEmbeddings(model=\"bedrock-embeddings\", openai_api_base=\"http://0.0.0.0:4000\", openai_api_key=\"temp-key\")\n",
"\n",
"text = \"This is a test document.\"\n",
"\n",
"query_result = embeddings.embed_query(text)\n",
"\n",
"print(f\"BEDROCK EMBEDDINGS\")\n",
"print(query_result[:5])\n",
"\n",
"embeddings = OpenAIEmbeddings(model=\"bedrock-titan-embeddings\", openai_api_base=\"http://0.0.0.0:4000\", openai_api_key=\"temp-key\")\n",
"\n",
"text = \"This is a test document.\"\n",
"\n",
"query_result = embeddings.embed_query(text)\n",
"\n",
"print(f\"TITAN EMBEDDINGS\")\n",
"print(query_result[:5])"
],
"metadata": {
"id": "lX2S8Nl1cWVP"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"### Curl Request"
],
"metadata": {
"id": "oqGbWBCQcYfd"
}
},
{
"cell_type": "markdown",
"source": [
"\n",
"\n",
"```curl\n",
"curl -X POST 'http://0.0.0.0:4000/embeddings' \\\n",
" -H 'Content-Type: application/json' \\\n",
" -d ' {\n",
" \"model\": \"text-embedding-ada-002\",\n",
" \"input\": [\"write a litellm poem\"]\n",
" }'\n",
"```\n",
"\n"
],
"metadata": {
"id": "7rkIMV9LcdwQ"
}
}
]
}
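
The notebook's curl cells already show the raw endpoints, so the same requests work without any SDK at all. A minimal sketch using plain HTTP, assuming a LiteLLM proxy on http://0.0.0.0:4000, the example virtual key `sk-1234`, and the `requests` package:

```python
# Sketch: call the proxy's /chat/completions endpoint with plain HTTP (no SDK).
import requests

resp = requests.post(
    "http://0.0.0.0:4000/chat/completions",
    headers={"Authorization": "Bearer sk-1234", "Content-Type": "application/json"},
    json={
        "model": "gpt-3.5-turbo",
        "messages": [{"role": "user", "content": "what llm are you"}],
    },
    timeout=60,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```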


@ -1,10 +1,10 @@
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -21,13 +21,13 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -49,7 +49,7 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -61,7 +61,7 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -70,7 +70,7 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -79,7 +79,7 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -109,7 +109,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format. Call all LLM APIs using the OpenAI format.
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -128,7 +128,7 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -148,7 +148,7 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -162,7 +162,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format. Call all LLM APIs using the OpenAI format.
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -174,7 +174,7 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -184,7 +184,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format. Call all LLM APIs using the OpenAI format.
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -193,19 +193,19 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -214,7 +214,7 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -234,7 +234,7 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -244,7 +244,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format. Call all LLM APIs using the OpenAI format.
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -253,7 +253,7 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -267,31 +267,31 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format. Call all LLM APIs using the OpenAI format.
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -305,7 +305,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format. Call all LLM APIs using the OpenAI format.
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -330,7 +330,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format. Call all LLM APIs using the OpenAI format.
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -339,7 +339,7 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -360,7 +360,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format. Call all LLM APIs using the OpenAI format.
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -369,7 +369,7 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -378,7 +378,7 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -388,7 +388,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format. Call all LLM APIs using the OpenAI format.
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -409,7 +409,7 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -422,13 +422,13 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -438,7 +438,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format. Call all LLM APIs using the OpenAI format.
Exception: Expecting value: line 1 column 1 (char 0) Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -462,7 +462,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format. Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -482,7 +482,7 @@ Exception: 'Response' object has no attribute 'get'
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -492,7 +492,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format. Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -516,7 +516,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format. Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -529,7 +529,7 @@ Exception: 'Response' object has no attribute 'get'
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -546,13 +546,13 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format. Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -580,13 +580,13 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format. Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -624,7 +624,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format. Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -638,13 +638,13 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format. Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -660,7 +660,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format. Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -681,7 +681,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format. Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -691,31 +691,31 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format. Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -771,7 +771,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format. Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -780,7 +780,7 @@ Exception: 'Response' object has no attribute 'get'
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -800,7 +800,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format. Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -820,7 +820,7 @@ Exception: 'Response' object has no attribute 'get'
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -830,7 +830,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format. Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -840,7 +840,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format. Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -850,7 +850,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format. Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -862,13 +862,13 @@ Exception: 'Response' object has no attribute 'get'
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -877,7 +877,7 @@ Exception: 'Response' object has no attribute 'get'
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -898,7 +898,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format. Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -919,7 +919,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format. Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -936,19 +936,19 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format. Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -961,25 +961,25 @@ Exception: 'Response' object has no attribute 'get'
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -993,7 +993,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format. Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get' Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
View file
@ -20,7 +20,7 @@ Call all LLM APIs using the OpenAI format.
Response ID: 52dbbd49-eedb-4c11-8382-3ca7deb1af35 Url: /queue/response/52dbbd49-eedb-4c11-8382-3ca7deb1af35 Response ID: 52dbbd49-eedb-4c11-8382-3ca7deb1af35 Url: /queue/response/52dbbd49-eedb-4c11-8382-3ca7deb1af35
Time: 3.50 seconds Time: 3.50 seconds
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
@ -35,7 +35,7 @@ Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. C
Response ID: ae1e2b71-d711-456d-8df0-13ce0709eb04 Url: /queue/response/ae1e2b71-d711-456d-8df0-13ce0709eb04 Response ID: ae1e2b71-d711-456d-8df0-13ce0709eb04 Url: /queue/response/ae1e2b71-d711-456d-8df0-13ce0709eb04
Time: 5.60 seconds Time: 5.60 seconds
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 10 Calling 10
View file
@ -1,4 +1,4 @@
What endpoints does the litellm proxy have 💥 OpenAI Proxy Server What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages: LiteLLM Server manages:
Calling 100+ LLMs Huggingface/Bedrock/TogetherAI/etc. in the OpenAI ChatCompletions & Completions format Calling 100+ LLMs Huggingface/Bedrock/TogetherAI/etc. in the OpenAI ChatCompletions & Completions format
View file
@ -18,13 +18,13 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes # This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version. # to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/) # Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.2.1 version: 0.2.3
# This is the version number of the application being deployed. This version number should be # This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to # incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using. # follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes. # It is recommended to use it with quotes.
appVersion: v1.41.8 appVersion: v1.43.18
dependencies: dependencies:
- name: "postgresql" - name: "postgresql"
View file
@ -1,5 +1,9 @@
# Helm Chart for LiteLLM # Helm Chart for LiteLLM
> [!IMPORTANT]
> This is community maintained. Please make an issue if you run into a bug
> We recommend using [Docker or Kubernetes for production deployments](https://docs.litellm.ai/docs/proxy/prod)
## Prerequisites ## Prerequisites
- Kubernetes 1.21+ - Kubernetes 1.21+
View file
@ -9,13 +9,11 @@ services:
######################################### #########################################
## Uncomment these lines to start proxy with a config.yaml file ## ## Uncomment these lines to start proxy with a config.yaml file ##
# volumes: # volumes:
# - ./proxy_server_config.yaml:/app/config.yaml
# command: [ "--config", "./config.yaml", "--port", "4000"]
############################################### ###############################################
ports: ports:
- "4000:4000" # Map the container port to the host, change the host port if necessary - "4000:4000" # Map the container port to the host, change the host port if necessary
environment: environment:
DATABASE_URL: "postgresql://postgres:example@db:5432/postgres" DATABASE_URL: "postgresql://llmproxy:dbpassword9090@db:5432/litellm"
STORE_MODEL_IN_DB: "True" # allows adding models to proxy via UI STORE_MODEL_IN_DB: "True" # allows adding models to proxy via UI
env_file: env_file:
- .env # Load local .env file - .env # Load local .env file
@ -25,11 +23,31 @@ services:
image: postgres image: postgres
restart: always restart: always
environment: environment:
POSTGRES_PASSWORD: example POSTGRES_DB: litellm
POSTGRES_USER: llmproxy
POSTGRES_PASSWORD: dbpassword9090
healthcheck: healthcheck:
test: ["CMD-SHELL", "pg_isready"] test: ["CMD-SHELL", "pg_isready -d litellm -U llmproxy"]
interval: 1s interval: 1s
timeout: 5s timeout: 5s
retries: 10 retries: 10
prometheus:
image: prom/prometheus
volumes:
- prometheus_data:/prometheus
- ./prometheus.yml:/etc/prometheus/prometheus.yml
ports:
- "9090:9090"
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--storage.tsdb.retention.time=15d'
restart: always
volumes:
prometheus_data:
driver: local
# ...rest of your docker-compose config if any # ...rest of your docker-compose config if any
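The compose service above mounts `./prometheus.yml`, which is not shown in this diff. Here is a minimal sketch of what that file could contain (the job name, scrape interval, and `litellm:4000` target are assumptions based on the port mapping above):
```yaml
global:
  scrape_interval: 15s   # assumed default; tune to your needs

scrape_configs:
  - job_name: litellm    # hypothetical job name
    metrics_path: /metrics
    static_configs:
      - targets: ["litellm:4000"]  # assumes the proxy service is named `litellm` and listens on port 4000
```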
View file
@ -1,23 +1,73 @@
import Tabs from '@theme/Tabs'; import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem'; import TabItem from '@theme/TabItem';
# Batches API # [BETA] Batches API
Covers Batches, Files Covers Batches, Files
## Quick Start ## Quick Start
Call an existing Assistant.
- Create File for Batch Completion - Create File for Batch Completion
- Create Batch Request - Create Batch Request
- List Batches
- Retrieve the Specific Batch and File Content - Retrieve the Specific Batch and File Content
<Tabs> <Tabs>
<TabItem value="proxy" label="LiteLLM PROXY Server">
```bash
$ export OPENAI_API_KEY="sk-..."
$ litellm
# RUNNING on http://0.0.0.0:4000
```
**Create File for Batch Completion**
```shell
curl http://localhost:4000/v1/files \
-H "Authorization: Bearer sk-1234" \
-F purpose="batch" \
-F file="@mydata.jsonl"
```
**Create Batch Request**
```bash
curl http://localhost:4000/v1/batches \
-H "Authorization: Bearer sk-1234" \
-H "Content-Type: application/json" \
-d '{
"input_file_id": "file-abc123",
"endpoint": "/v1/chat/completions",
"completion_window": "24h"
}'
```
**Retrieve the Specific Batch**
```bash
curl http://localhost:4000/v1/batches/batch_abc123 \
-H "Authorization: Bearer sk-1234" \
-H "Content-Type: application/json" \
```
**List Batches**
```bash
curl http://localhost:4000/v1/batches \
-H "Authorization: Bearer sk-1234" \
-H "Content-Type: application/json" \
```
</TabItem>
<TabItem value="sdk" label="SDK"> <TabItem value="sdk" label="SDK">
**Create File for Batch Completion** **Create File for Batch Completion**
@ -77,48 +127,15 @@ file_content = await litellm.afile_content(
print("file content = ", file_content) print("file content = ", file_content)
``` ```
</TabItem> **List Batches**
<TabItem value="proxy" label="PROXY">
```bash ```python
$ export OPENAI_API_KEY="sk-..." list_batches_response = litellm.list_batches(custom_llm_provider="openai", limit=2)
print("list_batches_response=", list_batches_response)
$ litellm
# RUNNING on http://0.0.0.0:4000
```
**Create File for Batch Completion**
```shell
curl https://api.openai.com/v1/files \
-H "Authorization: Bearer sk-1234" \
-F purpose="batch" \
-F file="@mydata.jsonl"
```
**Create Batch Request**
```bash
curl http://localhost:4000/v1/batches \
-H "Authorization: Bearer sk-1234" \
-H "Content-Type: application/json" \
-d '{
"input_file_id": "file-abc123",
"endpoint": "/v1/chat/completions",
"completion_window": "24h"
}'
```
**Retrieve the Specific Batch**
```bash
curl http://localhost:4000/v1/batches/batch_abc123 \
-H "Authorization: Bearer sk-1234" \
-H "Content-Type: application/json" \
``` ```
</TabItem> </TabItem>
</Tabs> </Tabs>
## [👉 Proxy API Reference](https://litellm-api.up.railway.app/#/batch) ## [👉 Proxy API Reference](https://litellm-api.up.railway.app/#/batch)
View file
@ -7,14 +7,14 @@ Don't want to get crazy bills because either while you're calling LLM APIs **or*
:::info :::info
If you want a server to manage user keys, budgets, etc. use our [OpenAI Proxy Server](./proxy/virtual_keys.md) If you want a server to manage user keys, budgets, etc. use our [LiteLLM Proxy Server](./proxy/virtual_keys.md)
::: :::
LiteLLM exposes: LiteLLM exposes:
* `litellm.max_budget`: a global variable you can use to set the max budget (in USD) across all your litellm calls. If this budget is exceeded, it will raise a BudgetExceededError * `litellm.max_budget`: a global variable you can use to set the max budget (in USD) across all your litellm calls. If this budget is exceeded, it will raise a BudgetExceededError
* `BudgetManager`: A class to help set budgets per user. BudgetManager creates a dictionary to manage the user budgets, where the key is user and the object is their current cost + model-specific costs. * `BudgetManager`: A class to help set budgets per user. BudgetManager creates a dictionary to manage the user budgets, where the key is user and the object is their current cost + model-specific costs.
* `OpenAI Proxy Server`: A server to call 100+ LLMs with an openai-compatible endpoint. Manages user budgets, spend tracking, load balancing etc. * `LiteLLM Proxy Server`: A server to call 100+ LLMs with an openai-compatible endpoint. Manages user budgets, spend tracking, load balancing etc.
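A minimal sketch of the `litellm.max_budget` flow described above (the budget value and model are illustrative, and `OPENAI_API_KEY` is assumed to be set in the environment):
```python
import litellm
from litellm import completion

litellm.max_budget = 0.0001  # global max budget in USD, intentionally tiny for this sketch

try:
    completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hey!"}])
    completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hey again!"}])
except litellm.BudgetExceededError as e:
    # raised once total spend across calls exceeds litellm.max_budget
    print("budget exceeded:", e)
```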
## quick start ## quick start
View file
@ -48,19 +48,20 @@ Use `litellm.get_supported_openai_params()` for an updated list of params for ea
|Anyscale | ✅ | ✅ | ✅ | ✅ | ✅ | |Anyscale | ✅ | ✅ | ✅ | ✅ | ✅ |
|Cohere| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | |Cohere| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | |
|Huggingface| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | | |Huggingface| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | |
|Openrouter| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | ✅ | | | | | |Openrouter| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | ✅ | | | | |
|AI21| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | |AI21| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | |
|VertexAI| ✅ | ✅ | | ✅ | ✅ | | | | | | | | | ✅ | ✅ | | | |VertexAI| ✅ | ✅ | | ✅ | ✅ | | | | | | | | | ✅ | ✅ | | |
|Bedrock| ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | ✅ (for anthropic) | | |Bedrock| ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | ✅ (model dependent) | |
|Sagemaker| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | | |Sagemaker| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | |
|TogetherAI| ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | ✅ | |TogetherAI| ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | ✅ |
|AlephAlpha| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | | |AlephAlpha| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | |
|Palm| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | | |Palm| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | |
|NLP Cloud| ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | |NLP Cloud| ✅ | ✅ | ✅ | ✅ | ✅ | | | | | |
|Petals| ✅ | ✅ | | ✅ | ✅ | | | | | | |Petals| ✅ | ✅ | | ✅ | ✅ | | | | | |
|Ollama| ✅ | ✅ | ✅ | ✅ | ✅ | | | ✅ | | | | | ✅ | | | |Ollama| ✅ | ✅ | ✅ | ✅ | ✅ | | | ✅ | | | | | ✅ | | |✅| | | | | | |
|Databricks| ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | | |Databricks| ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | |
|ClarifAI| ✅ | ✅ | |✅ | ✅ | | | | | | | | | | | |ClarifAI| ✅ | ✅ | |✅ | ✅ | | | | | | | | | | |
|Github| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | ✅ |✅ (model dependent)|✅ (model dependent)| | |
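A quick programmatic check of the table above, using the `litellm.get_supported_openai_params()` helper mentioned at the top of this section (the model/provider pair is illustrative):
```python
from litellm import get_supported_openai_params

# returns the list of OpenAI-style params litellm will map for this model + provider
params = get_supported_openai_params(model="command-nightly", custom_llm_provider="cohere")
print(params)
```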
:::note :::note
By default, LiteLLM raises an exception if the openai param being passed in isn't supported. By default, LiteLLM raises an exception if the openai param being passed in isn't supported.
View file
@ -0,0 +1,321 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Structured Outputs (JSON Mode)
## Quick Start
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
import os
os.environ["OPENAI_API_KEY"] = ""
response = completion(
model="gpt-4o-mini",
response_format={ "type": "json_object" },
messages=[
{"role": "system", "content": "You are a helpful assistant designed to output JSON."},
{"role": "user", "content": "Who won the world series in 2020?"}
]
)
print(response.choices[0].message.content)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```bash
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer $LITELLM_KEY" \
-d '{
"model": "gpt-4o-mini",
"response_format": { "type": "json_object" },
"messages": [
{
"role": "system",
"content": "You are a helpful assistant designed to output JSON."
},
{
"role": "user",
"content": "Who won the world series in 2020?"
}
]
}'
```
</TabItem>
</Tabs>
## Check Model Support
Call `litellm.get_supported_openai_params` to check if a model/provider supports `response_format`.
```python
from litellm import get_supported_openai_params
params = get_supported_openai_params(model="anthropic.claude-3", custom_llm_provider="bedrock")
assert "response_format" in params
```
## Pass in 'json_schema'
To use Structured Outputs, simply specify
```
response_format: { "type": "json_schema", "json_schema": … , "strict": true }
```
Works for:
- OpenAI models
- Azure OpenAI models
- Google AI Studio - Gemini models
- Vertex AI models (Gemini + Anthropic)
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import os
from litellm import completion
from pydantic import BaseModel
# add to env var
os.environ["OPENAI_API_KEY"] = ""
messages = [{"role": "user", "content": "List 5 important events in the XIX century"}]
class CalendarEvent(BaseModel):
name: str
date: str
participants: list[str]
class EventsList(BaseModel):
events: list[CalendarEvent]
resp = completion(
model="gpt-4o-2024-08-06",
messages=messages,
response_format=EventsList
)
print("Received={}".format(resp))
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Add openai model to config.yaml
```yaml
model_list:
- model_name: "gpt-4o"
litellm_params:
model: "gpt-4o-2024-08-06"
```
2. Start proxy with config.yaml
```bash
litellm --config /path/to/config.yaml
```
3. Call with OpenAI SDK / Curl!
Just replace the 'base_url' in the OpenAI SDK to call the proxy with 'json_schema' for OpenAI models
**OpenAI SDK**
```python
from pydantic import BaseModel
from openai import OpenAI
client = OpenAI(
api_key="anything", # 👈 PROXY KEY (can be anything, if master_key not set)
base_url="http://0.0.0.0:4000" # 👈 PROXY BASE URL
)
class Step(BaseModel):
explanation: str
output: str
class MathReasoning(BaseModel):
steps: list[Step]
final_answer: str
completion = client.beta.chat.completions.parse(
model="gpt-4o",
messages=[
{"role": "system", "content": "You are a helpful math tutor. Guide the user through the solution step by step."},
{"role": "user", "content": "how can I solve 8x + 7 = -23"}
],
response_format=MathReasoning,
)
math_reasoning = completion.choices[0].message.parsed
```
**Curl**
```bash
curl -X POST 'http://0.0.0.0:4000/v1/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "gpt-4o",
"messages": [
{
"role": "system",
"content": "You are a helpful math tutor. Guide the user through the solution step by step."
},
{
"role": "user",
"content": "how can I solve 8x + 7 = -23"
}
],
"response_format": {
"type": "json_schema",
"json_schema": {
"name": "math_reasoning",
"schema": {
"type": "object",
"properties": {
"steps": {
"type": "array",
"items": {
"type": "object",
"properties": {
"explanation": { "type": "string" },
"output": { "type": "string" }
},
"required": ["explanation", "output"],
"additionalProperties": false
}
},
"final_answer": { "type": "string" }
},
"required": ["steps", "final_answer"],
"additionalProperties": false
},
"strict": true
}
}
}'
```
</TabItem>
</Tabs>
## Validate JSON Schema
Not all vertex models support passing the json_schema to them (e.g. `gemini-1.5-flash`). To solve this, LiteLLM supports client-side validation of the json schema.
```
litellm.enable_json_schema_validation=True
```
If `litellm.enable_json_schema_validation=True` is set, LiteLLM will validate the json response using `jsonvalidator`.
[**See Code**](https://github.com/BerriAI/litellm/blob/671d8ac496b6229970c7f2a3bdedd6cb84f0746b/litellm/litellm_core_utils/json_validation_rule.py#L4)
<Tabs>
<TabItem value="sdk" label="SDK">
```python
# !gcloud auth application-default login - run this to add vertex credentials to your env
import litellm, os
from litellm import completion
from pydantic import BaseModel
messages=[
{"role": "system", "content": "Extract the event information."},
{"role": "user", "content": "Alice and Bob are going to a science fair on Friday."},
]
litellm.enable_json_schema_validation = True
litellm.set_verbose = True # see the raw request made by litellm
class CalendarEvent(BaseModel):
name: str
date: str
participants: list[str]
resp = completion(
model="gemini/gemini-1.5-pro",
messages=messages,
response_format=CalendarEvent,
)
print("Received={}".format(resp))
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Create config.yaml
```yaml
model_list:
- model_name: "gemini-1.5-flash"
litellm_params:
model: "gemini/gemini-1.5-flash"
api_key: os.environ/GEMINI_API_KEY
litellm_settings:
enable_json_schema_validation: True
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer $LITELLM_API_KEY" \
-d '{
"model": "gemini-1.5-flash",
"messages": [
{"role": "system", "content": "Extract the event information."},
{"role": "user", "content": "Alice and Bob are going to a science fair on Friday."},
],
"response_format": {
"type": "json_object",
"response_schema": {
"type": "json_schema",
"json_schema": {
"name": "math_reasoning",
"schema": {
"type": "object",
"properties": {
"steps": {
"type": "array",
"items": {
"type": "object",
"properties": {
"explanation": { "type": "string" },
"output": { "type": "string" }
},
"required": ["explanation", "output"],
"additionalProperties": false
}
},
"final_answer": { "type": "string" }
},
"required": ["steps", "final_answer"],
"additionalProperties": false
},
"strict": true
}
}
}
}'
```
</TabItem>
</Tabs>
View file
@ -0,0 +1,119 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Pre-fix Assistant Messages
Supported by:
- Deepseek
- Mistral
- Anthropic
```python
{
"role": "assistant",
"content": "..",
...
"prefix": true # 👈 KEY CHANGE
}
```
## Quick Start
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
import os
os.environ["DEEPSEEK_API_KEY"] = ""
response = completion(
model="deepseek/deepseek-chat",
messages=[
{"role": "user", "content": "Who won the world cup in 2022?"},
{"role": "assistant", "content": "Argentina", "prefix": True}
]
)
print(response.choices[0].message.content)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```bash
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer $LITELLM_KEY" \
-d '{
"model": "deepseek/deepseek-chat",
"messages": [
{
"role": "user",
"content": "Who won the world cup in 2022?"
},
{
"role": "assistant",
"content": "Argentina", "prefix": true
}
]
}'
```
</TabItem>
</Tabs>
**Expected Response**
```bash
{
"id": "3b66124d79a708e10c603496b363574c",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"message": {
"content": " won the FIFA World Cup in 2022.",
"role": "assistant",
"tool_calls": null,
"function_call": null
}
}
],
"created": 1723323084,
"model": "deepseek/deepseek-chat",
"object": "chat.completion",
"system_fingerprint": "fp_7e0991cad4",
"usage": {
"completion_tokens": 12,
"prompt_tokens": 16,
"total_tokens": 28,
},
"service_tier": null
}
```
## Check Model Support
Call `litellm.get_model_info` to check if a model/provider supports assistant prefill.
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import get_model_info
params = get_model_info(model="deepseek/deepseek-chat")
assert params["supports_assistant_prefill"] is True
```
</TabItem>
<TabItem value="proxy" label="PROXY">
Call the `/model/info` endpoint to get a list of models + their supported params.
```bash
curl -X GET 'http://0.0.0.0:4000/v1/model/info' \
-H 'Authorization: Bearer $LITELLM_KEY'
```
</TabItem>
</Tabs>
View file
@ -1,3 +1,6 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Streaming + Async # Streaming + Async
- [Streaming Responses](#streaming-responses) - [Streaming Responses](#streaming-responses)
@ -74,3 +77,72 @@ async def completion_call():
asyncio.run(completion_call()) asyncio.run(completion_call())
``` ```
## Error Handling - Infinite Loops
Sometimes a model might enter an infinite loop, and keep repeating the same chunks - [e.g. issue](https://github.com/BerriAI/litellm/issues/5158)
Break out of it with:
```python
litellm.REPEATED_STREAMING_CHUNK_LIMIT = 100 # catch if model starts looping the same chunk while streaming. Uses high default to prevent false positives.
```
LiteLLM provides error handling for this, by checking if a chunk is repeated 'n' times (Default is 100). If it exceeds that limit, it will raise a `litellm.InternalServerError`, to allow retry logic to happen.
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import litellm
import os
import time
litellm.set_verbose = False
loop_amount = litellm.REPEATED_STREAMING_CHUNK_LIMIT + 1
chunks = [
litellm.ModelResponse(**{
"id": "chatcmpl-123",
"object": "chat.completion.chunk",
"created": 1694268190,
"model": "gpt-3.5-turbo-0125",
"system_fingerprint": "fp_44709d6fcb",
"choices": [
{"index": 0, "delta": {"content": "How are you?"}, "finish_reason": "stop"}
],
}, stream=True)
] * loop_amount
completion_stream = litellm.ModelResponseListIterator(model_responses=chunks)
response = litellm.CustomStreamWrapper(
completion_stream=completion_stream,
model="gpt-3.5-turbo",
custom_llm_provider="cached_response",
logging_obj=litellm.Logging(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Hey"}],
stream=True,
call_type="completion",
start_time=time.time(),
litellm_call_id="12345",
function_id="1245",
),
)
for chunk in response:
continue # expect to raise InternalServerError
```
</TabItem>
<TabItem value="proxy" label="PROXY">
Define this on your config.yaml on the proxy.
```yaml
litellm_settings:
REPEATED_STREAMING_CHUNK_LIMIT: 100 # this overrides the litellm default
```
The proxy uses the litellm SDK. To validate this works, try the 'SDK' code snippet.
</TabItem>
</Tabs>
View file
@ -14,6 +14,14 @@
For security inquiries, please contact us at support@berri.ai For security inquiries, please contact us at support@berri.ai
## Self-hosted LiteLLM Instances
- **No data or telemetry is stored on LiteLLM servers when you self-host**
- For installation and configuration, see: [Self-hosting guide](../docs/proxy/deploy.md)
- **Telemetry** - We run no telemetry when you self-host LiteLLM
For security inquiries, please contact us at support@berri.ai
### Supported data regions for LiteLLM Cloud ### Supported data regions for LiteLLM Cloud
LiteLLM supports the following data regions: LiteLLM supports the following data regions:
View file
@ -270,7 +270,7 @@ response = embedding(
| embed-multilingual-v2.0 | `embedding(model="embed-multilingual-v2.0", input=["good morning from litellm", "this is another item"])` | | embed-multilingual-v2.0 | `embedding(model="embed-multilingual-v2.0", input=["good morning from litellm", "this is another item"])` |
## HuggingFace Embedding Models ## HuggingFace Embedding Models
LiteLLM supports all Feature-Extraction Embedding models: https://huggingface.co/models?pipeline_tag=feature-extraction LiteLLM supports all Feature-Extraction + Sentence Similarity Embedding models: https://huggingface.co/models?pipeline_tag=feature-extraction
### Usage ### Usage
```python ```python
@ -282,6 +282,25 @@ response = embedding(
input=["good morning from litellm"] input=["good morning from litellm"]
) )
``` ```
### Usage - Set input_type
LiteLLM infers input type (feature-extraction or sentence-similarity) by making a GET request to the api base.
Override this, by setting the `input_type` yourself.
```python
from litellm import embedding
import os
os.environ['HUGGINGFACE_API_KEY'] = ""
response = embedding(
model='huggingface/microsoft/codebert-base',
input=["good morning from litellm", "you are a good bot"],
api_base = "https://p69xlsj6rpno5drq.us-east-1.aws.endpoints.huggingface.cloud",
input_type="sentence-similarity"
)
```
### Usage - Custom API Base ### Usage - Custom API Base
```python ```python
from litellm import embedding from litellm import embedding
View file
@ -27,11 +27,17 @@ This covers:
- ✅ IP addressbased access control lists - ✅ IP addressbased access control lists
- ✅ Track Request IP Address - ✅ Track Request IP Address
- ✅ [Use LiteLLM keys/authentication on Pass Through Endpoints](./proxy/pass_through#✨-enterprise---use-litellm-keysauthentication-on-pass-through-endpoints) - ✅ [Use LiteLLM keys/authentication on Pass Through Endpoints](./proxy/pass_through#✨-enterprise---use-litellm-keysauthentication-on-pass-through-endpoints)
- ✅ Set Max Request / File Size on Requests
- ✅ [Enforce Required Params for LLM Requests (ex. Reject requests missing ["metadata"]["generation_name"])](./proxy/enterprise#enforce-required-params-for-llm-requests) - ✅ [Enforce Required Params for LLM Requests (ex. Reject requests missing ["metadata"]["generation_name"])](./proxy/enterprise#enforce-required-params-for-llm-requests)
- **Spend Tracking** - **Customize Logging, Guardrails, Caching per project**
- ✅ [Team Based Logging](./proxy/team_logging.md) - Allow each team to use their own Langfuse Project / custom callbacks
- ✅ [Disable Logging for a Team](./proxy/team_logging.md#disable-logging-for-a-team) - Switch off all logging for a team/project (GDPR Compliance)
- **Spend Tracking & Data Exports**
- ✅ [Tracking Spend for Custom Tags](./proxy/enterprise#tracking-spend-for-custom-tags) - ✅ [Tracking Spend for Custom Tags](./proxy/enterprise#tracking-spend-for-custom-tags)
- ✅ [Exporting LLM Logs to GCS Bucket](./proxy/bucket#🪣-logging-gcs-s3-buckets)
- ✅ [API Endpoints to get Spend Reports per Team, API Key, Customer](./proxy/cost_tracking.md#✨-enterprise-api-endpoints-to-get-spend) - ✅ [API Endpoints to get Spend Reports per Team, API Key, Customer](./proxy/cost_tracking.md#✨-enterprise-api-endpoints-to-get-spend)
- **Advanced Metrics** - **Prometheus Metrics**
- ✅ [Prometheus Metrics - Num Requests, failures, LLM Provider Outages](./proxy/prometheus)
- ✅ [`x-ratelimit-remaining-requests`, `x-ratelimit-remaining-tokens` for LLM APIs on Prometheus](./proxy/prometheus#✨-enterprise-llm-remaining-requests-and-remaining-tokens) - ✅ [`x-ratelimit-remaining-requests`, `x-ratelimit-remaining-tokens` for LLM APIs on Prometheus](./proxy/prometheus#✨-enterprise-llm-remaining-requests-and-remaining-tokens)
- **Guardrails, PII Masking, Content Moderation** - **Guardrails, PII Masking, Content Moderation**
- ✅ [Content Moderation with LLM Guard, LlamaGuard, Secret Detection, Google Text Moderations](./proxy/enterprise#content-moderation) - ✅ [Content Moderation with LLM Guard, LlamaGuard, Secret Detection, Google Text Moderations](./proxy/enterprise#content-moderation)
View file
@ -0,0 +1,313 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# [Beta] Fine-tuning API
:::info
This is an Enterprise only endpoint [Get Started with Enterprise here](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat)
:::
## Supported Providers
- Azure OpenAI
- OpenAI
- Vertex AI
Add `finetune_settings` and `files_settings` to your litellm config.yaml to use the fine-tuning endpoints.
## Example config.yaml for `finetune_settings` and `files_settings`
```yaml
model_list:
- model_name: gpt-4
litellm_params:
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
# For /fine_tuning/jobs endpoints
finetune_settings:
- custom_llm_provider: azure
api_base: https://exampleopenaiendpoint-production.up.railway.app
api_key: os.environ/AZURE_API_KEY
api_version: "2023-03-15-preview"
- custom_llm_provider: openai
api_key: os.environ/OPENAI_API_KEY
- custom_llm_provider: "vertex_ai"
vertex_project: "adroit-crow-413218"
vertex_location: "us-central1"
vertex_credentials: "/Users/ishaanjaffer/Downloads/adroit-crow-413218-a956eef1a2a8.json"
# for /files endpoints
files_settings:
- custom_llm_provider: azure
api_base: https://exampleopenaiendpoint-production.up.railway.app
api_key: fake-key
api_version: "2023-03-15-preview"
- custom_llm_provider: openai
api_key: os.environ/OPENAI_API_KEY
```
## Create File for fine-tuning
<Tabs>
<TabItem value="openai" label="OpenAI Python SDK">
```python
from openai import AsyncOpenAI

client = AsyncOpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000") # base_url is your litellm proxy url
file_name = "openai_batch_completions.jsonl"
response = await client.files.create(
extra_body={"custom_llm_provider": "azure"}, # tell litellm proxy which provider to use
file=open(file_name, "rb"),
purpose="fine-tune",
)
```
</TabItem>
<TabItem value="curl" label="curl">
```shell
curl http://localhost:4000/v1/files \
-H "Authorization: Bearer sk-1234" \
-F purpose="batch" \
-F custom_llm_provider="azure"\
-F file="@mydata.jsonl"
```
</TabItem>
</Tabs>
## Create fine-tuning job
<Tabs>
<TabItem value="azure" label="Azure OpenAI">
<Tabs>
<TabItem value="openai" label="OpenAI Python SDK">
```python
ft_job = await client.fine_tuning.jobs.create(
model="gpt-35-turbo-1106", # Azure OpenAI model you want to fine-tune
training_file="file-abc123", # file_id from create file response
extra_body={"custom_llm_provider": "azure"}, # tell litellm proxy which provider to use
)
```
</TabItem>
<TabItem value="curl" label="curl">
```shell
curl http://localhost:4000/v1/fine_tuning/jobs \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"custom_llm_provider": "azure",
"model": "gpt-35-turbo-1106",
"training_file": "file-abc123"
}'
```
</TabItem>
</Tabs>
</TabItem>
<TabItem value="Vertex" label="VertexAI">
<Tabs>
<TabItem value="openai" label="OpenAI Python SDK">
```python
ft_job = await client.fine_tuning.jobs.create(
model="gemini-1.0-pro-002", # Vertex model you want to fine-tune
training_file="gs://cloud-samples-data/ai-platform/generative_ai/sft_train_data.jsonl", # file_id from create file response
extra_body={"custom_llm_provider": "vertex_ai"}, # tell litellm proxy which provider to use
)
```
</TabItem>
<TabItem value="curl" label="curl (Unified API)">
```shell
curl http://localhost:4000/v1/fine_tuning/jobs \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"custom_llm_provider": "vertex_ai",
"model": "gemini-1.0-pro-002",
"training_file": "gs://cloud-samples-data/ai-platform/generative_ai/sft_train_data.jsonl"
}'
```
</TabItem>
<TabItem value="curl-vtx" label="curl (VertexAI API)">
:::info
Use this to create Fine tuning Jobs in [the Vertex AI API Format](https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/tuning#create-tuning)
:::
```shell
curl http://localhost:4000/v1/projects/tuningJobs \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"baseModel": "gemini-1.0-pro-002",
"supervisedTuningSpec" : {
"training_dataset_uri": "gs://cloud-samples-data/ai-platform/generative_ai/sft_train_data.jsonl"
}
}'
```
</TabItem>
</Tabs>
</TabItem>
</Tabs>
### Request Body
<Tabs>
<TabItem value="params" label="Supported Params">
* `model`
**Type:** string
**Required:** Yes
The name of the model to fine-tune
* `custom_llm_provider`
**Type:** `Literal["azure", "openai", "vertex_ai"]`
**Required:** Yes
The LLM provider to route the fine-tuning job to. You can select one of the [**supported providers**](#supported-providers)
* `training_file`
**Type:** string
**Required:** Yes
The ID of an uploaded file that contains training data.
- See **upload file** for how to upload a file.
- Your dataset must be formatted as a JSONL file.
* `hyperparameters`
**Type:** object
**Required:** No
The hyperparameters used for the fine-tuning job.
> #### Supported `hyperparameters`
> #### batch_size
**Type:** string or integer
**Required:** No
Number of examples in each batch. A larger batch size means that model parameters are updated less frequently, but with lower variance.
> #### learning_rate_multiplier
**Type:** string or number
**Required:** No
Scaling factor for the learning rate. A smaller learning rate may be useful to avoid overfitting.
> #### n_epochs
**Type:** string or integer
**Required:** No
The number of epochs to train the model for. An epoch refers to one full cycle through the training dataset.
* `suffix`
**Type:** string or null
**Required:** No
**Default:** null
A string of up to 18 characters that will be added to your fine-tuned model name.
Example: A `suffix` of "custom-model-name" would produce a model name like `ft:gpt-4o-mini:openai:custom-model-name:7p4lURel`.
* `validation_file`
**Type:** string or null
**Required:** No
The ID of an uploaded file that contains validation data.
- If provided, this data is used to generate validation metrics periodically during fine-tuning.
* `integrations`
**Type:** array or null
**Required:** No
A list of integrations to enable for your fine-tuning job.
* `seed`
**Type:** integer or null
**Required:** No
The seed controls the reproducibility of the job. Passing in the same seed and job parameters should produce the same results, but may differ in rare cases. If a seed is not specified, one will be generated for you.
</TabItem>
<TabItem value="example" label="Example Request Body">
```json
{
"model": "gpt-4o-mini",
"training_file": "file-abcde12345",
"hyperparameters": {
"batch_size": 4,
"learning_rate_multiplier": 0.1,
"n_epochs": 3
},
"suffix": "custom-model-v1",
"validation_file": "file-fghij67890",
"seed": 42
}
```
</TabItem>
</Tabs>
## Cancel fine-tuning job
<Tabs>
<TabItem value="openai" label="OpenAI Python SDK">
```python
# cancel specific fine tuning job
cancel_ft_job = await client.fine_tuning.jobs.cancel(
fine_tuning_job_id="123", # fine tuning job id
extra_body={"custom_llm_provider": "azure"}, # tell litellm proxy which provider to use
)
print("response from cancel ft job={}".format(cancel_ft_job))
```
</TabItem>
<TabItem value="curl" label="curl">
```shell
curl -X POST http://localhost:4000/v1/fine_tuning/jobs/ftjob-abc123/cancel \
-H "Authorization: Bearer sk-1234" \
-H "Content-Type: application/json" \
-d '{"custom_llm_provider": "azure"}'
```
</TabItem>
</Tabs>
## List fine-tuning jobs
<Tabs>
<TabItem value="openai" label="OpenAI Python SDK">
```python
list_ft_jobs = await client.fine_tuning.jobs.list(
extra_query={"custom_llm_provider": "azure"} # tell litellm proxy which provider to use
)
print("list of ft jobs={}".format(list_ft_jobs))
```
</TabItem>
<TabItem value="curl" label="curl">
```shell
curl -X GET 'http://localhost:4000/v1/fine_tuning/jobs?custom_llm_provider=azure' \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234"
```
</TabItem>
</Tabs>
## [👉 Proxy API Reference](https://litellm-api.up.railway.app/#/fine-tuning)
View file
@ -87,13 +87,14 @@ from litellm import completion
## set env variables for logging tools ## set env variables for logging tools
os.environ["LUNARY_PUBLIC_KEY"] = "your-lunary-public-key" os.environ["LUNARY_PUBLIC_KEY"] = "your-lunary-public-key"
os.environ["HELICONE_API_KEY"] = "your-helicone-key"
os.environ["LANGFUSE_PUBLIC_KEY"] = "" os.environ["LANGFUSE_PUBLIC_KEY"] = ""
os.environ["LANGFUSE_SECRET_KEY"] = "" os.environ["LANGFUSE_SECRET_KEY"] = ""
os.environ["OPENAI_API_KEY"] os.environ["OPENAI_API_KEY"]
# set callbacks # set callbacks
litellm.success_callback = ["lunary", "langfuse"] # log input/output to langfuse, lunary, supabase litellm.success_callback = ["lunary", "langfuse", "helicone"] # log input/output to langfuse, lunary, supabase, helicone
#openai call #openai call
response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}]) response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}])
View file
@ -10,14 +10,41 @@ https://github.com/BerriAI/litellm
- Translate inputs to provider's `completion`, `embedding`, and `image_generation` endpoints - Translate inputs to provider's `completion`, `embedding`, and `image_generation` endpoints
- [Consistent output](https://docs.litellm.ai/docs/completion/output), text responses will always be available at `['choices'][0]['message']['content']` - [Consistent output](https://docs.litellm.ai/docs/completion/output), text responses will always be available at `['choices'][0]['message']['content']`
- Retry/fallback logic across multiple deployments (e.g. Azure/OpenAI) - [Router](https://docs.litellm.ai/docs/routing) - Retry/fallback logic across multiple deployments (e.g. Azure/OpenAI) - [Router](https://docs.litellm.ai/docs/routing)
- Track spend & set budgets per project [OpenAI Proxy Server](https://docs.litellm.ai/docs/simple_proxy) - Track spend & set budgets per project [LiteLLM Proxy Server](https://docs.litellm.ai/docs/simple_proxy)
## How to use LiteLLM ## How to use LiteLLM
You can use litellm through either: You can use litellm through either:
1. [OpenAI proxy Server](#openai-proxy) - Server to call 100+ LLMs, load balance, cost tracking across projects 1. [LiteLLM Proxy Server](#openai-proxy) - Server (LLM Gateway) to call 100+ LLMs, load balance, cost tracking across projects
2. [LiteLLM python SDK](#basic-usage) - Python Client to call 100+ LLMs, load balance, cost tracking 2. [LiteLLM python SDK](#basic-usage) - Python Client to call 100+ LLMs, load balance, cost tracking
## LiteLLM Python SDK ### **When to use LiteLLM Proxy Server (LLM Gateway)**
:::tip
Use LiteLLM Proxy Server if you want a **central service (LLM Gateway) to access multiple LLMs**
Typically used by Gen AI Enablement / ML Platform Teams
:::
- LiteLLM Proxy gives you a unified interface to access multiple LLMs (100+ LLMs)
- Track LLM Usage and setup guardrails
- Customize Logging, Guardrails, Caching per project
### **When to use LiteLLM Python SDK**
:::tip
Use LiteLLM Python SDK if you want to use LiteLLM in your **python code**
Typically used by developers building llm projects
:::
- LiteLLM SDK gives you a unified interface to access multiple LLMs (100+ LLMs)
- Retry/fallback logic across multiple deployments (e.g. Azure/OpenAI) - [Router](https://docs.litellm.ai/docs/routing)
## **LiteLLM Python SDK**
### Basic usage ### Basic usage
@ -310,6 +337,7 @@ LiteLLM exposes pre defined callbacks to send data to Lunary, Langfuse, Helicone
from litellm import completion from litellm import completion
## set env variables for logging tools ## set env variables for logging tools
os.environ["HELICONE_API_KEY"] = "your-helicone-key"
os.environ["LANGFUSE_PUBLIC_KEY"] = "" os.environ["LANGFUSE_PUBLIC_KEY"] = ""
os.environ["LANGFUSE_SECRET_KEY"] = "" os.environ["LANGFUSE_SECRET_KEY"] = ""
os.environ["LUNARY_PUBLIC_KEY"] = "your-lunary-public-key" os.environ["LUNARY_PUBLIC_KEY"] = "your-lunary-public-key"
@ -317,7 +345,7 @@ os.environ["LUNARY_PUBLIC_KEY"] = "your-lunary-public-key"
os.environ["OPENAI_API_KEY"] os.environ["OPENAI_API_KEY"]
# set callbacks # set callbacks
litellm.success_callback = ["lunary", "langfuse"] # log input/output to lunary, langfuse, supabase litellm.success_callback = ["lunary", "langfuse", "helicone"] # log input/output to lunary, langfuse, supabase, helicone
#openai call #openai call
response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}]) response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}])
@ -356,7 +384,7 @@ response = completion(
) )
``` ```
## OpenAI Proxy ## **LiteLLM Proxy Server (LLM Gateway)**
Track spend across multiple projects/people Track spend across multiple projects/people
View file
@ -1,6 +1,6 @@
import Image from '@theme/IdealImage'; import Image from '@theme/IdealImage';
# 🔥 Load Test LiteLLM # Load Test LiteLLM
## How to run a locust load test on LiteLLM Proxy ## How to run a locust load test on LiteLLM Proxy
View file
@ -0,0 +1,20 @@
# Migration Policy
## New Beta Feature Introduction
- If we introduce a new feature that may move to the Enterprise Tier, it will be clearly labeled as **Beta**, with the following example disclaimer:
**Example Disclaimer**
:::info
Beta Feature - This feature might move to LiteLLM Enterprise
:::
## Policy if a Beta Feature moves to Enterprise
If we decide to move a beta feature to the paid Enterprise version we will:
- Provide **at least 30 days** notice to all users of the beta feature
- Provide **a free 3 month License to prevent any disruptions to production**
- Provide a **dedicated slack, discord, microsoft teams support channel** to help your team during this transition
View file
@ -0,0 +1,72 @@
import Image from '@theme/IdealImage';
# Arize AI
AI Observability and Evaluation Platform
:::tip
This is community maintained. Please make an issue if you run into a bug:
https://github.com/BerriAI/litellm
:::
## Pre-Requisites
Make an account on [Arize AI](https://app.arize.com/auth/login)
## Quick Start
Use just 2 lines of code to instantly log your responses **across all providers** with Arize:
```python
litellm.callbacks = ["arize"]
```
```python
import litellm
import os
os.environ["ARIZE_SPACE_KEY"] = ""
os.environ["ARIZE_API_KEY"] = "" # defaults to litellm-completion
# LLM API Keys
os.environ['OPENAI_API_KEY']=""
# set arize as a callback, litellm will send the data to arize
litellm.callbacks = ["arize"]
# openai call
response = litellm.completion(
model="gpt-3.5-turbo",
messages=[
{"role": "user", "content": "Hi 👋 - i'm openai"}
]
)
```
### Using with LiteLLM Proxy
```yaml
model_list:
- model_name: gpt-4
litellm_params:
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
litellm_settings:
callbacks: ["arize"]
environment_variables:
ARIZE_SPACE_KEY: "d0*****"
ARIZE_API_KEY: "141a****"
```
## Support & Talk to Founders
- [Schedule Demo 👋](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version)
- [Community Discord 💭](https://discord.gg/wuPM9dRgDw)
- Our numbers 📞 +1 (770) 8783-106 / +1 (412) 618-6238
- Our emails ✉️ ishaan@berri.ai / krrish@berri.ai
View file
@ -0,0 +1,147 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Braintrust - Evals + Logging
[Braintrust](https://www.braintrust.dev/) manages evaluations, logging, prompt playground, and data management for AI products.
## Quick Start
```python
# pip install litellm
import litellm
import os
# set env
os.environ["BRAINTRUST_API_KEY"] = ""
os.environ['OPENAI_API_KEY']=""
# set braintrust as a callback, litellm will send the data to braintrust
litellm.callbacks = ["braintrust"]
# openai call
response = litellm.completion(
model="gpt-3.5-turbo",
messages=[
{"role": "user", "content": "Hi 👋 - i'm openai"}
]
)
```
## OpenAI Proxy Usage
1. Add keys to env
```env
BRAINTRUST_API_KEY=""
```
2. Add braintrust to callbacks
```yaml
model_list:
- model_name: gpt-3.5-turbo
litellm_params:
model: gpt-3.5-turbo
api_key: os.environ/OPENAI_API_KEY
litellm_settings:
callbacks: ["braintrust"]
```
3. Test it!
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "groq-llama3",
"messages": [
{ "role": "system", "content": "Use your tools smartly"},
{ "role": "user", "content": "What time is it now? Use your tool"}
]
}'
```
## Advanced - pass Project ID
<Tabs>
<TabItem value="sdk" label="SDK">
```python
response = litellm.completion(
model="gpt-3.5-turbo",
messages=[
{"role": "user", "content": "Hi 👋 - i'm openai"}
],
metadata={
"project_id": "my-special-project"
}
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
**Curl**
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "groq-llama3",
"messages": [
{ "role": "system", "content": "Use your tools smartly"},
{ "role": "user", "content": "What time is it now? Use your tool"}
],
"metadata": {
"project_id": "my-special-project"
}
}'
```
**OpenAI SDK**
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(
model="gpt-3.5-turbo",
messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
],
extra_body={ # pass in any provider-specific param, if not supported by openai, https://docs.litellm.ai/docs/completion/input#provider-specific-params
"metadata": { # 👈 use for logging additional params (e.g. to langfuse)
"project_id": "my-special-project"
}
}
)
print(response)
```
For more examples, [**Click Here**](../proxy/user_keys.md#chatcompletions)
</TabItem>
</Tabs>
## Full API Spec
Here's everything you can pass in metadata for a braintrust request
`braintrust_*` - any metadata field starting with `braintrust_` will be passed as metadata to the logging request
`project_id` - set the project id for a braintrust call. Default is `litellm`.
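For example, a sketch of passing both fields via `metadata` (the `braintrust_*` key shown is hypothetical, used only to illustrate the pass-through):
```python
import litellm

response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}],
    metadata={
        "project_id": "my-special-project",       # overrides the default "litellm" project
        "braintrust_span_name": "prod-traffic",   # hypothetical braintrust_* field, forwarded as metadata
    },
)
print(response)
```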
View file
@ -0,0 +1,127 @@
import Image from '@theme/IdealImage';
# Google Cloud Storage Buckets
Log LLM Logs to [Google Cloud Storage Buckets](https://cloud.google.com/storage?hl=en)
:::info
✨ This is an Enterprise only feature [Get Started with Enterprise here](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat)
:::
### Usage
1. Add `gcs_bucket` to LiteLLM Config.yaml
```yaml
model_list:
- litellm_params:
api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
api_key: my-fake-key
model: openai/my-fake-model
model_name: fake-openai-endpoint
litellm_settings:
callbacks: ["gcs_bucket"] # 👈 KEY CHANGE # 👈 KEY CHANGE
```
2. Set required env variables
```shell
GCS_BUCKET_NAME="<your-gcs-bucket-name>"
GCS_PATH_SERVICE_ACCOUNT="/Users/ishaanjaffer/Downloads/adroit-crow-413218-a956eef1a2a8.json" # Add path to service account.json
```
3. Start Proxy
```
litellm --config /path/to/config.yaml
```
4. Test it!
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "fake-openai-endpoint",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}
'
```
## Expected Logs on GCS Buckets
<Image img={require('../../img/gcs_bucket.png')} />
### Fields Logged on GCS Buckets
Example payload of a `/chat/completion` request logged on GCS
```json
{
"request_kwargs": {
"model": "gpt-3.5-turbo",
"messages": [
{
"role": "user",
"content": "This is a test"
}
],
"optional_params": {
"temperature": 0.7,
"max_tokens": 10,
"user": "ishaan-2",
"extra_body": {}
}
},
"response_obj": {
"id": "chatcmpl-bd836a8c-89bc-4abd-bee5-e3f1ebfdb541",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"message": {
"content": "Hi!",
"role": "assistant",
"tool_calls": null,
"function_call": null
}
}
],
"created": 1722868456,
"model": "gpt-3.5-turbo",
"object": "chat.completion",
"system_fingerprint": null,
"usage": {
"prompt_tokens": 10,
"completion_tokens": 20,
"total_tokens": 30
}
},
"start_time": "2024-08-05 07:34:16",
"end_time": "2024-08-05 07:34:16"
}
```
## Getting `service_account.json` from Google Cloud Console
1. Go to [Google Cloud Console](https://console.cloud.google.com/)
2. Search for IAM & Admin
3. Click on Service Accounts
4. Select a Service Account
5. Click on 'Keys' -> Add Key -> Create New Key -> JSON
6. Save the JSON file and add the path to `GCS_PATH_SERVICE_ACCOUNT`
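For example (paths and bucket name are placeholders; these are the same env variables from step 2 above):
```shell
export GCS_BUCKET_NAME="my-litellm-logs"
export GCS_PATH_SERVICE_ACCOUNT="/path/to/service_account.json"  # the key file you just downloaded
```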
## Support & Talk to Founders
- [Schedule Demo 👋](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version)
- [Community Discord 💭](https://discord.gg/wuPM9dRgDw)
- Our numbers 📞 +1 (770) 8783-106 / +1 (412) 618-6238
- Our emails ✉️ ishaan@berri.ai / krrish@berri.ai
View file
@ -1,64 +1,170 @@
# Helicone Tutorial # Helicone - OSS LLM Observability Platform
:::tip :::tip
This is community maintained, Please make an issue if you run into a bug This is community maintained. Please make an issue if you run into a bug:
https://github.com/BerriAI/litellm https://github.com/BerriAI/litellm
::: :::
[Helicone](https://helicone.ai/) is an open source observability platform that proxies your LLM requests and provides key insights into your usage, spend, latency and more.
[Helicone](https://helicone.ai/) is an open source observability platform that proxies your OpenAI traffic and provides you key insights into your spend, latency and usage. ## Using Helicone with LiteLLM
## Use Helicone to log requests across all LLM Providers (OpenAI, Azure, Anthropic, Cohere, Replicate, PaLM) LiteLLM provides `success_callbacks` and `failure_callbacks`, allowing you to easily log data to Helicone based on the status of your responses.
liteLLM provides `success_callbacks` and `failure_callbacks`, making it easy for you to send data to a particular provider depending on the status of your responses.
In this case, we want to log requests to Helicone when a request succeeds. ### Supported LLM Providers
Helicone can log requests across [various LLM providers](https://docs.helicone.ai/getting-started/quick-start), including:
- OpenAI
- Azure
- Anthropic
- Gemini
- Groq
- Cohere
- Replicate
- And more
### Integration Methods
There are two main approaches to integrate Helicone with LiteLLM:
1. Using callbacks
2. Using Helicone as a proxy
Let's explore each method in detail.
### Approach 1: Use Callbacks ### Approach 1: Use Callbacks
Use just 1 line of code, to instantly log your responses **across all providers** with helicone:
Use just 1 line of code to instantly log your responses **across all providers** with Helicone:
```python ```python
litellm.success_callback=["helicone"] litellm.success_callback = ["helicone"]
``` ```
Complete code
```python
from litellm import completion
## set env variables
os.environ["HELICONE_API_KEY"] = "your-helicone-key"
os.environ["OPENAI_API_KEY"], os.environ["COHERE_API_KEY"] = "", ""
# set callbacks
litellm.success_callback=["helicone"]
#openai call
response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}])
#cohere call
response = completion(model="command-nightly", messages=[{"role": "user", "content": "Hi 👋 - i'm cohere"}])
```
### Approach 2: [OpenAI + Azure only] Use Helicone as a proxy
Helicone provides advanced functionality like caching, etc. Helicone currently supports this for Azure and OpenAI.
If you want to use Helicone to proxy your OpenAI/Azure requests, then you can -
- Set helicone as your base url via: `litellm.api_url`
- Pass in helicone request headers via: `litellm.headers`
Complete Code Complete Code
```python ```python
import litellm import os
from litellm import completion from litellm import completion
litellm.api_base = "https://oai.hconeai.com/v1" ## Set env variables
litellm.headers = {"Helicone-Auth": f"Bearer {os.getenv('HELICONE_API_KEY')}"} os.environ["HELICONE_API_KEY"] = "your-helicone-key"
os.environ["OPENAI_API_KEY"] = "your-openai-key"
response = litellm.completion( # Set callbacks
model="gpt-3.5-turbo", litellm.success_callback = ["helicone"]
messages=[{"role": "user", "content": "how does a court case get to the Supreme Court?"}]
# OpenAI call
response = completion(
model="gpt-4o",
messages=[{"role": "user", "content": "Hi 👋 - I'm OpenAI"}],
) )
print(response) print(response)
``` ```
### Approach 2: Use Helicone as a proxy
Helicone's proxy provides [advanced functionality](https://docs.helicone.ai/getting-started/proxy-vs-async) like caching, rate limiting, LLM security through [PromptArmor](https://promptarmor.com/) and more.
To use Helicone as a proxy for your LLM requests:
1. Set Helicone as your base URL via: `litellm.api_base`
2. Pass in Helicone request headers via: `litellm.metadata`
Complete Code:
```python
import os
import litellm
from litellm import completion
litellm.api_base = "https://oai.hconeai.com/v1"
litellm.headers = {
"Helicone-Auth": f"Bearer {os.getenv('HELICONE_API_KEY')}", # Authenticate to send requests to Helicone API
}
response = litellm.completion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "How does a court case get to the Supreme Court?"}]
)
print(response)
```
### Advanced Usage
You can add custom metadata and properties to your requests using Helicone headers. Here are some examples:
```python
litellm.metadata = {
"Helicone-Auth": f"Bearer {os.getenv('HELICONE_API_KEY')}", # Authenticate to send requests to Helicone API
"Helicone-User-Id": "user-abc", # Specify the user making the request
"Helicone-Property-App": "web", # Custom property to add additional information
"Helicone-Property-Custom": "any-value", # Add any custom property
"Helicone-Prompt-Id": "prompt-supreme-court", # Assign an ID to associate this prompt with future versions
"Helicone-Cache-Enabled": "true", # Enable caching of responses
"Cache-Control": "max-age=3600", # Set cache limit to 1 hour
"Helicone-RateLimit-Policy": "10;w=60;s=user", # Set rate limit policy
"Helicone-Retry-Enabled": "true", # Enable retry mechanism
"helicone-retry-num": "3", # Set number of retries
"helicone-retry-factor": "2", # Set exponential backoff factor
"Helicone-Model-Override": "gpt-3.5-turbo-0613", # Override the model used for cost calculation
"Helicone-Session-Id": "session-abc-123", # Set session ID for tracking
"Helicone-Session-Path": "parent-trace/child-trace", # Set session path for hierarchical tracking
"Helicone-Omit-Response": "false", # Include response in logging (default behavior)
"Helicone-Omit-Request": "false", # Include request in logging (default behavior)
"Helicone-LLM-Security-Enabled": "true", # Enable LLM security features
"Helicone-Moderations-Enabled": "true", # Enable content moderation
"Helicone-Fallbacks": '["gpt-3.5-turbo", "gpt-4"]', # Set fallback models
}
```
### Caching and Rate Limiting
Enable caching and set up rate limiting policies:
```python
litellm.metadata = {
"Helicone-Auth": f"Bearer {os.getenv('HELICONE_API_KEY')}", # Authenticate to send requests to Helicone API
"Helicone-Cache-Enabled": "true", # Enable caching of responses
"Cache-Control": "max-age=3600", # Set cache limit to 1 hour
"Helicone-RateLimit-Policy": "100;w=3600;s=user", # Set rate limit policy
}
```
### Session Tracking and Tracing
Track multi-step and agentic LLM interactions using session IDs and paths:
```python
litellm.metadata = {
"Helicone-Auth": f"Bearer {os.getenv('HELICONE_API_KEY')}", # Authenticate to send requests to Helicone API
"Helicone-Session-Id": "session-abc-123", # The session ID you want to track
"Helicone-Session-Path": "parent-trace/child-trace", # The path of the session
}
```
- `Helicone-Session-Id`: Use this to specify the unique identifier for the session you want to track. This allows you to group related requests together.
- `Helicone-Session-Path`: This header defines the path of the session, allowing you to represent parent and child traces. For example, "parent/child" represents a child trace of a parent trace.
By using these two headers, you can effectively group and visualize multi-step LLM interactions, gaining insights into complex AI workflows.
### Retry and Fallback Mechanisms
Set up retry mechanisms and fallback options:
```python
litellm.metadata = {
"Helicone-Auth": f"Bearer {os.getenv('HELICONE_API_KEY')}", # Authenticate to send requests to Helicone API
"Helicone-Retry-Enabled": "true", # Enable retry mechanism
"helicone-retry-num": "3", # Set number of retries
"helicone-retry-factor": "2", # Set exponential backoff factor
"Helicone-Fallbacks": '["gpt-3.5-turbo", "gpt-4"]', # Set fallback models
}
```
> **Supported Headers** - For a full list of supported Helicone headers and their descriptions, please refer to the [Helicone documentation](https://docs.helicone.ai/getting-started/quick-start).
> By utilizing these headers and metadata options, you can gain deeper insights into your LLM usage, optimize performance, and better manage your AI workflows with Helicone and LiteLLM.

View file

@ -1,6 +1,6 @@
import Image from '@theme/IdealImage'; import Image from '@theme/IdealImage';
# 🔥 Langfuse - Logging LLM Input/Output # 🪢 Langfuse - Logging LLM Input/Output
LangFuse is open Source Observability & Analytics for LLM Apps LangFuse is open Source Observability & Analytics for LLM Apps
Detailed production traces and a granular view on quality, cost and latency Detailed production traces and a granular view on quality, cost and latency
@ -200,6 +200,13 @@ The following parameters can be updated on a continuation of a trace by passing
Any other key value pairs passed into the metadata not listed in the above spec for a `litellm` completion will be added as a metadata key value pair for the generation. Any other key value pairs passed into the metadata not listed in the above spec for a `litellm` completion will be added as a metadata key value pair for the generation.
#### Disable Logging - Specific Calls
To disable logging for specific calls use the `no-log` flag.
`completion(messages = ..., model = ..., **{"no-log": True})`
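A minimal sketch of what this looks like in practice (assuming Langfuse is already configured as a success callback):
```python
import litellm

litellm.success_callback = ["langfuse"]

# logged to Langfuse as usual
litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "log me"}],
)

# skipped by the logging callbacks
litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "don't log me"}],
    **{"no-log": True},
)
```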
### Use LangChain ChatLiteLLM + Langfuse ### Use LangChain ChatLiteLLM + Langfuse
Pass `trace_user_id`, `session_id` in model_kwargs Pass `trace_user_id`, `session_id` in model_kwargs
```python ```python

View file

@ -14,7 +14,7 @@ https://github.com/BerriAI/litellm
An all-in-one developer platform for every step of the application lifecycle An all-in-one developer platform for every step of the application lifecycle
https://smith.langchain.com/ https://smith.langchain.com/
<Image img={require('../../img/langsmith.png')} /> <Image img={require('../../img/langsmith_new.png')} />
:::info :::info
We want to learn how we can make the callbacks better! Meet the LiteLLM [founders](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version) or We want to learn how we can make the callbacks better! Meet the LiteLLM [founders](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version) or
@ -56,7 +56,7 @@ response = litellm.completion(
``` ```
## Advanced ## Advanced
### Set Custom Project & Run names ### Set Langsmith fields - Custom Project, Run names, tags
```python ```python
import litellm import litellm
@ -77,6 +77,7 @@ response = litellm.completion(
metadata={ metadata={
"run_name": "litellmRUN", # langsmith run name "run_name": "litellmRUN", # langsmith run name
"project_name": "litellm-completion", # langsmith project name "project_name": "litellm-completion", # langsmith project name
"tags": ["model1", "prod-2"] # tags to log on langsmith
} }
) )
print(response) print(response)

View file

@ -1,6 +1,6 @@
import Image from '@theme/IdealImage'; import Image from '@theme/IdealImage';
# 🔥 Logfire - Logging LLM Input/Output # Logfire
Logfire is open Source Observability & Analytics for LLM Apps Logfire is open Source Observability & Analytics for LLM Apps
Detailed production traces and a granular view on quality, cost and latency Detailed production traces and a granular view on quality, cost and latency

View file

@ -1,10 +1,16 @@
import Image from '@theme/IdealImage'; import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Raw Request/Response Logging # Raw Request/Response Logging
## Logging
See the raw request/response sent by LiteLLM in your logging provider (OTEL/Langfuse/etc.). See the raw request/response sent by LiteLLM in your logging provider (OTEL/Langfuse/etc.).
**on SDK** <Tabs>
<TabItem value="sdk" label="SDK">
```python ```python
# pip install langfuse # pip install langfuse
import litellm import litellm
@ -34,13 +40,85 @@ response = litellm.completion(
) )
``` ```
**on Proxy**
</TabItem>
<TabItem value="proxy" label="PROXY">
```yaml ```yaml
litellm_settings: litellm_settings:
log_raw_request_response: True log_raw_request_response: True
``` ```
</TabItem>
</Tabs>
**Expected Log** **Expected Log**
<Image img={require('../../img/raw_request_log.png')}/> <Image img={require('../../img/raw_request_log.png')}/>
## Return Raw Response Headers
Return raw response headers from the LLM provider. Currently only supported for OpenAI.
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import litellm
import os
litellm.return_response_headers = True
## set ENV variables
os.environ["OPENAI_API_KEY"] = "your-api-key"
response = litellm.completion(
model="gpt-3.5-turbo",
messages=[{ "content": "Hello, how are you?","role": "user"}]
)
print(response._hidden_params)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: gpt-3.5-turbo
litellm_params:
model: gpt-3.5-turbo
api_key: os.environ/GROQ_API_KEY
litellm_settings:
return_response_headers: true
```
2. Test it!
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "gpt-3.5-turbo",
"messages": [
{ "role": "system", "content": "Use your tools smartly"},
{ "role": "user", "content": "What time is it now? Use your tool"}
]
}'
```
</TabItem>
</Tabs>
**Expected Response**
<Image img={require('../../img/raw_response_headers.png')}/>

View file

@ -0,0 +1,97 @@
# Scrub Logged Data
Redact messages / mask PII before sending data to logging integrations (langfuse/etc.).
See our [**Presidio PII Masking**](https://github.com/BerriAI/litellm/blob/a176feeacc5fdf504747978d82056eb84679c4be/litellm/proxy/hooks/presidio_pii_masking.py#L286) for reference.
1. Setup a custom callback
```python
from typing import Any, List, Optional, Tuple

from litellm.integrations.custom_logger import CustomLogger


class MyCustomHandler(CustomLogger):
    async def async_logging_hook(
        self, kwargs: dict, result: Any, call_type: str
    ) -> Tuple[dict, Any]:
        """
        For masking logged request/response. Return a modified version of the request/result.
        Called before `async_log_success_event`.
        """
        if (
            call_type == "completion" or call_type == "acompletion"
        ):  # /chat/completions requests
            messages: Optional[List] = kwargs.get("messages", None)
            kwargs["messages"] = [{"role": "user", "content": "MASK_THIS_ASYNC_VALUE"}]

        return kwargs, result

    def logging_hook(
        self, kwargs: dict, result: Any, call_type: str
    ) -> Tuple[dict, Any]:
        """
        For masking logged request/response. Return a modified version of the request/result.
        Called before `log_success_event`.
        """
        if (
            call_type == "completion" or call_type == "acompletion"
        ):  # /chat/completions requests
            messages: Optional[List] = kwargs.get("messages", None)
            kwargs["messages"] = [{"role": "user", "content": "MASK_THIS_SYNC_VALUE"}]

        return kwargs, result


customHandler = MyCustomHandler()
```
2. Connect custom handler to LiteLLM
```python
import litellm
litellm.callbacks = [customHandler]
```
3. Test it!
```python
# pip install langfuse
import os
import asyncio
import litellm
from litellm import completion, acompletion

os.environ["LANGFUSE_PUBLIC_KEY"] = ""
os.environ["LANGFUSE_SECRET_KEY"] = ""
# Optional, defaults to https://cloud.langfuse.com
# os.environ["LANGFUSE_HOST"] = ""

# LLM API Keys
os.environ['OPENAI_API_KEY']=""

litellm.callbacks = [customHandler]
litellm.success_callback = ["langfuse"]

## sync
response = completion(model="gpt-3.5-turbo", messages=[{ "role": "user", "content": "Hi 👋 - i'm openai"}],
                      stream=True)
for chunk in response:
    continue

## async
async def run_acompletion():
    response = await acompletion(model="gpt-3.5-turbo", messages=[{ "role": "user", "content": "Hi 👋 - i'm openai"}],
                                 stream=True)
    async for chunk in response:
        continue

asyncio.run(run_acompletion())
```

View file

@ -1,3 +1,4 @@
# Sentry - Log LLM Exceptions
import Image from '@theme/IdealImage'; import Image from '@theme/IdealImage';
@ -9,7 +10,6 @@ https://github.com/BerriAI/litellm
::: :::
# Sentry - Log LLM Exceptions
[Sentry](https://sentry.io/) provides error monitoring for production. LiteLLM can add breadcrumbs and send exceptions to Sentry with this integration [Sentry](https://sentry.io/) provides error monitoring for production. LiteLLM can add breadcrumbs and send exceptions to Sentry with this integration
Track exceptions for: Track exceptions for:

View file

@ -0,0 +1,263 @@
# [BETA] OpenID Connect (OIDC)
LiteLLM supports using OpenID Connect (OIDC) for authentication to upstream services. This allows you to avoid storing sensitive credentials in your configuration files.
:::info
This feature is in Beta
:::
## OIDC Identity Provider (IdP)
LiteLLM supports the following OIDC identity providers:
| Provider | Config Name | Custom Audiences |
| -------------------------| ------------ | ---------------- |
| Google Cloud Run | `google` | Yes |
| CircleCI v1 | `circleci` | No |
| CircleCI v2 | `circleci_v2`| No |
| GitHub Actions | `github` | Yes |
| Azure Kubernetes Service | `azure` | No |
| File | `file` | No |
| Environment Variable | `env` | No |
| Environment Path | `env_path` | No |
If you would like to use a different OIDC provider, please open an issue on GitHub.
:::tip
Do not use the `file`, `env`, or `env_path` providers unless you know what you're doing, and you are sure none of the other providers will work for your use-case. Hint: they probably will.
:::
## OIDC Connect Relying Party (RP)
LiteLLM supports the following OIDC relying parties / clients:
- Amazon Bedrock
- Azure OpenAI
- _(Coming soon) Google Cloud Vertex AI_
### Configuring OIDC
Wherever a secret key can be used, OIDC can be used in-place. The general format is:
```
oidc/config_name_here/audience_here
```
For providers that do not use the `audience` parameter, you can (and should) omit it:
```
oidc/config_name_here/
```
#### Unofficial Providers (not recommended)
For the unofficial `file` provider, you can use the following format:
```
oidc/file/home/user/dave/this_is_a_file_with_a_token.txt
```
For the unofficial `env`, use the following format, where `SECRET_TOKEN` is the name of the environment variable that contains the token:
```
oidc/env/SECRET_TOKEN
```
For the unofficial `env_path`, use the following format, where `SECRET_TOKEN` is the name of the environment variable that contains the path to the file with the token:
```
oidc/env_path/SECRET_TOKEN
```
:::tip
If you are tempted to use `oidc/env_path/AZURE_FEDERATED_TOKEN_FILE`, don't do that. Instead, use `oidc/azure/`, as this will ensure continued support from LiteLLM if Azure changes their OIDC configuration and/or adds new features.
:::
## Examples
### Google Cloud Run -> Amazon Bedrock
```yaml
model_list:
- model_name: claude-3-haiku-20240307
litellm_params:
model: bedrock/anthropic.claude-3-haiku-20240307-v1:0
aws_region_name: us-west-2
aws_session_name: "litellm"
aws_role_name: "arn:aws:iam::YOUR_THING_HERE:role/litellm-google-demo"
aws_web_identity_token: "oidc/google/https://example.com"
```
### CircleCI v2 -> Amazon Bedrock
```yaml
model_list:
- model_name: command-r
litellm_params:
model: bedrock/cohere.command-r-v1:0
aws_region_name: us-west-2
aws_session_name: "my-test-session"
aws_role_name: "arn:aws:iam::335785316107:role/litellm-github-unit-tests-circleci"
aws_web_identity_token: "oidc/circleci_v2/"
```
#### Amazon IAM Role Configuration for CircleCI v2 -> Bedrock
The configuration below is only an example. You should adjust the permissions and trust relationship to match your specific use case.
Permissions:
```json
{
"Version": "2012-10-17",
"Statement": [
{
"Sid": "VisualEditor0",
"Effect": "Allow",
"Action": [
"bedrock:InvokeModel",
"bedrock:InvokeModelWithResponseStream"
],
"Resource": [
"arn:aws:bedrock:*::foundation-model/anthropic.claude-3-haiku-20240307-v1:0",
"arn:aws:bedrock:*::foundation-model/cohere.command-r-v1:0"
]
}
]
}
```
See https://docs.aws.amazon.com/bedrock/latest/userguide/security_iam_id-based-policy-examples.html for more examples.
Trust Relationship:
```json
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Principal": {
"Federated": "arn:aws:iam::335785316107:oidc-provider/oidc.circleci.com/org/c5a99188-154f-4f69-8da2-b442b1bf78dd"
},
"Action": "sts:AssumeRoleWithWebIdentity",
"Condition": {
"StringEquals": {
"oidc.circleci.com/org/c5a99188-154f-4f69-8da2-b442b1bf78dd:aud": "c5a99188-154f-4f69-8da2-b442b1bf78dd"
},
"ForAnyValue:StringLike": {
"oidc.circleci.com/org/c5a99188-154f-4f69-8da2-b442b1bf78dd:sub": [
"org/c5a99188-154f-4f69-8da2-b442b1bf78dd/project/*/user/*/vcs-origin/github.com/BerriAI/litellm/vcs-ref/refs/heads/main",
"org/c5a99188-154f-4f69-8da2-b442b1bf78dd/project/*/user/*/vcs-origin/github.com/BerriAI/litellm/vcs-ref/refs/heads/litellm_*"
]
}
}
}
]
}
```
This trust relationship restricts CircleCI to only assume the role on the main branch and branches that start with `litellm_`.
For CircleCI (v1 and v2), you also need to add your organization's OIDC provider in your AWS IAM settings. See https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_create_for-idp_oidc.html for more information.
:::tip
You should _never_ need to create an IAM user. If you did, you're not using OIDC correctly. You should only be creating a role with permissions and a trust relationship to your OIDC provider.
:::
### Google Cloud Run -> Azure OpenAI
```yaml
model_list:
- model_name: gpt-4o-2024-05-13
litellm_params:
model: azure/gpt-4o-2024-05-13
azure_ad_token: "oidc/google/https://example.com"
api_version: "2024-06-01"
api_base: "https://demo-here.openai.azure.com"
model_info:
base_model: azure/gpt-4o-2024-05-13
```
For Azure OpenAI, you need to define `AZURE_CLIENT_ID`, `AZURE_TENANT_ID`, and optionally `AZURE_AUTHORITY_HOST` in your environment.
```bash
export AZURE_CLIENT_ID="91a43c21-cf21-4f34-9085-331015ea4f91" # Azure AD Application (Client) ID
export AZURE_TENANT_ID="f3b1cf79-eba8-40c3-8120-cb26aca169c2" # Will be the same across all of your Azure AD applications
export AZURE_AUTHORITY_HOST="https://login.microsoftonline.com" # 👈 Optional, defaults to "https://login.microsoftonline.com"
```
:::tip
You can find `AZURE_CLIENT_ID` by visiting `https://login.microsoftonline.com/YOUR_DOMAIN_HERE/v2.0/.well-known/openid-configuration` and looking for the UUID in the `issuer` field.
:::
:::tip
Don't set `AZURE_AUTHORITY_HOST` in your environment unless you need to override the default value. This way, if the default value changes in the future, you won't need to update your environment.
:::
:::tip
By default, Azure AD applications use the audience `api://AzureADTokenExchange`. We recommend setting the audience to something more specific to your application.
:::
#### Azure AD Application Configuration
Unfortunately, Azure is a bit more complicated to set up than other OIDC relying parties like AWS. Basically, you have to:
1. Create an Azure application.
2. Add a federated credential for the OIDC IdP you're using (e.g. Google Cloud Run).
3. Add the Azure application to the resource group that contains the Azure OpenAI resource(s).
4. Give the Azure application the necessary role to access the Azure OpenAI resource(s).
The custom role below is the recommended minimum permissions for the Azure application to access Azure OpenAI resources. You should adjust the permissions to match your specific use case.
```json
{
"id": "/subscriptions/24ebb700-ec2f-417f-afad-78fe15dcc91f/providers/Microsoft.Authorization/roleDefinitions/baf42808-99ff-466d-b9da-f95bb0422c5f",
"properties": {
"roleName": "invoke-only",
"description": "",
"assignableScopes": [
"/subscriptions/24ebb700-ec2f-417f-afad-78fe15dcc91f/resourceGroups/your-openai-group-name"
],
"permissions": [
{
"actions": [],
"notActions": [],
"dataActions": [
"Microsoft.CognitiveServices/accounts/OpenAI/deployments/audio/action",
"Microsoft.CognitiveServices/accounts/OpenAI/deployments/search/action",
"Microsoft.CognitiveServices/accounts/OpenAI/deployments/completions/action",
"Microsoft.CognitiveServices/accounts/OpenAI/deployments/chat/completions/action",
"Microsoft.CognitiveServices/accounts/OpenAI/deployments/extensions/chat/completions/action",
"Microsoft.CognitiveServices/accounts/OpenAI/deployments/embeddings/action",
"Microsoft.CognitiveServices/accounts/OpenAI/images/generations/action"
],
"notDataActions": []
}
]
}
}
```
_Note: Your UUIDs will be different._
Please contact us for paid enterprise support if you need help setting up Azure AD applications.

View file

@ -0,0 +1,236 @@
# Bedrock (Pass-Through)
Pass-through endpoints for Bedrock - call provider-specific endpoint, in native format (no translation).
Just replace `https://bedrock-runtime.{aws_region_name}.amazonaws.com` with `LITELLM_PROXY_BASE_URL/bedrock` 🚀
#### **Example Usage**
```bash
curl -X POST 'http://0.0.0.0:4000/bedrock/model/cohere.command-r-v1:0/converse' \
-H 'Authorization: Bearer anything' \
-H 'Content-Type: application/json' \
-d '{
"messages": [
{"role": "user",
"content": [{"text": "Hello"}]
}
]
}'
```
Supports **ALL** Bedrock Endpoints (including streaming).
[**See All Bedrock Endpoints**](https://docs.aws.amazon.com/bedrock/latest/APIReference/API_runtime_Converse.html)
## Quick Start
Let's call the Bedrock [`/converse` endpoint](https://docs.aws.amazon.com/bedrock/latest/APIReference/API_runtime_Converse.html)
1. Add AWS Keys to your environment
```bash
export AWS_ACCESS_KEY_ID="" # Access key
export AWS_SECRET_ACCESS_KEY="" # Secret access key
export AWS_REGION_NAME="" # us-east-1, us-east-2, us-west-1, us-west-2
```
2. Start LiteLLM Proxy
```bash
litellm
# RUNNING on http://0.0.0.0:4000
```
3. Test it!
Let's call the Bedrock converse endpoint
```bash
curl -X POST 'http://0.0.0.0:4000/bedrock/model/cohere.command-r-v1:0/converse' \
-H 'Authorization: Bearer anything' \
-H 'Content-Type: application/json' \
-d '{
"messages": [
{"role": "user",
"content": [{"text": "Hello"}]
}
]
}'
```
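The same call from Python, using `requests` against the proxy (a sketch; assumes the proxy is running locally and no virtual keys are enforced):
```python
import requests

resp = requests.post(
    "http://0.0.0.0:4000/bedrock/model/cohere.command-r-v1:0/converse",
    headers={
        "Authorization": "Bearer anything",  # use a virtual key here if keys are enforced
        "Content-Type": "application/json",
    },
    json={"messages": [{"role": "user", "content": [{"text": "Hello"}]}]},
)
print(resp.json())
```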
## Examples
Anything after `http://0.0.0.0:4000/bedrock` is treated as a provider-specific route, and handled accordingly.
Key Changes:
| **Original Endpoint** | **Replace With** |
|------------------------------------------------------|-----------------------------------|
| `https://bedrock-runtime.{aws_region_name}.amazonaws.com` | `http://0.0.0.0:4000/bedrock` (LITELLM_PROXY_BASE_URL="http://0.0.0.0:4000") |
| `AWS4-HMAC-SHA256..` | `Bearer anything` (use `Bearer LITELLM_VIRTUAL_KEY` if Virtual Keys are setup on proxy) |
### **Example 1: Converse API**
#### LiteLLM Proxy Call
```bash
curl -X POST 'http://0.0.0.0:4000/bedrock/model/cohere.command-r-v1:0/converse' \
-H 'Authorization: Bearer sk-anything' \
-H 'Content-Type: application/json' \
-d '{
"messages": [
{"role": "user",
"content": [{"text": "Hello"}]
}
]
}'
```
#### Direct Bedrock API Call
```bash
curl -X POST 'https://bedrock-runtime.us-west-2.amazonaws.com/model/cohere.command-r-v1:0/converse' \
-H 'Authorization: AWS4-HMAC-SHA256..' \
-H 'Content-Type: application/json' \
-d '{
"messages": [
{"role": "user",
"content": [{"text": "Hello"}]
}
]
}'
```
### **Example 2: Apply Guardrail**
#### LiteLLM Proxy Call
```bash
curl "http://0.0.0.0:4000/bedrock/guardrail/guardrailIdentifier/version/guardrailVersion/apply" \
-H 'Authorization: Bearer sk-anything' \
-H 'Content-Type: application/json' \
-X POST \
-d '{
"contents": [{"text": {"text": "Hello world"}}],
"source": "INPUT"
}'
```
#### Direct Bedrock API Call
```bash
curl "https://bedrock-runtime.us-west-2.amazonaws.com/guardrail/guardrailIdentifier/version/guardrailVersion/apply" \
-H 'Authorization: AWS4-HMAC-SHA256..' \
-H 'Content-Type: application/json' \
-X POST \
-d '{
"contents": [{"text": {"text": "Hello world"}}],
"source": "INPUT"
}'
```
### **Example 3: Query Knowledge Base**

#### LiteLLM Proxy Call
```bash
curl -X POST "http://0.0.0.0:4000/bedrock/knowledgebases/{knowledgeBaseId}/retrieve" \
-H 'Authorization: Bearer sk-anything' \
-H 'Content-Type: application/json' \
-d '{
"nextToken": "string",
"retrievalConfiguration": {
"vectorSearchConfiguration": {
"filter": { ... },
"numberOfResults": number,
"overrideSearchType": "string"
}
},
"retrievalQuery": {
"text": "string"
}
}'
```
#### Direct Bedrock API Call
```bash
curl -X POST "https://bedrock-runtime.us-west-2.amazonaws.com/knowledgebases/{knowledgeBaseId}/retrieve" \
-H 'Authorization: AWS4-HMAC-SHA256..' \
-H 'Content-Type: application/json' \
-d '{
"nextToken": "string",
"retrievalConfiguration": {
"vectorSearchConfiguration": {
"filter": { ... },
"numberOfResults": number,
"overrideSearchType": "string"
}
},
"retrievalQuery": {
"text": "string"
}
}'
```
## Advanced - Use with Virtual Keys
Pre-requisites
- [Setup proxy with DB](../proxy/virtual_keys.md#setup)
Use this, to avoid giving developers the raw AWS Keys, but still letting them use AWS Bedrock endpoints.
### Usage
1. Setup environment
```bash
export DATABASE_URL=""
export LITELLM_MASTER_KEY=""
export AWS_ACCESS_KEY_ID="" # Access key
export AWS_SECRET_ACCESS_KEY="" # Secret access key
export AWS_REGION_NAME="" # us-east-1, us-east-2, us-west-1, us-west-2
```
```bash
litellm
# RUNNING on http://0.0.0.0:4000
```
2. Generate virtual key
```bash
curl -X POST 'http://0.0.0.0:4000/key/generate' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-d '{}'
```
Expected Response
```bash
{
...
"key": "sk-1234ewknldferwedojwojw"
}
```
3. Test it!
```bash
curl -X POST 'http://0.0.0.0:4000/bedrock/model/cohere.command-r-v1:0/converse' \
-H 'Authorization: Bearer sk-1234ewknldferwedojwojw' \
-H 'Content-Type: application/json' \
-d '{
"messages": [
{"role": "user",
"content": [{"text": "Hello"}]
}
]
}'
```

View file

@ -0,0 +1,253 @@
# Cohere API (Pass-Through)
Pass-through endpoints for Cohere - call provider-specific endpoint, in native format (no translation).
Just replace `https://api.cohere.com` with `LITELLM_PROXY_BASE_URL/cohere` 🚀
#### **Example Usage**
```bash
curl --request POST \
--url http://0.0.0.0:4000/cohere/v1/chat \
--header 'accept: application/json' \
--header 'content-type: application/json' \
--header "Authorization: bearer sk-anything" \
--data '{
"chat_history": [
{"role": "USER", "message": "Who discovered gravity?"},
{"role": "CHATBOT", "message": "The man who is widely credited with discovering gravity is Sir Isaac Newton"}
],
"message": "What year was he born?",
"connectors": [{"id": "web-search"}]
}'
```
Supports **ALL** Cohere Endpoints (including streaming).
[**See All Cohere Endpoints**](https://docs.cohere.com/reference/chat)
## Quick Start
Let's call the Cohere [`/rerank` endpoint](https://docs.cohere.com/reference/rerank)
1. Add Cohere API Key to your environment
```bash
export COHERE_API_KEY=""
```
2. Start LiteLLM Proxy
```bash
litellm
# RUNNING on http://0.0.0.0:4000
```
3. Test it!
Let's call the Cohere /rerank endpoint
```bash
curl --request POST \
--url http://0.0.0.0:4000/cohere/v1/rerank \
--header 'accept: application/json' \
--header 'content-type: application/json' \
--header "Authorization: bearer sk-anything" \
--data '{
"model": "rerank-english-v3.0",
"query": "What is the capital of the United States?",
"top_n": 3,
"documents": ["Carson City is the capital city of the American state of Nevada.",
"The Commonwealth of the Northern Mariana Islands is a group of islands in the Pacific Ocean. Its capital is Saipan.",
"Washington, D.C. (also known as simply Washington or D.C., and officially as the District of Columbia) is the capital of the United States. It is a federal district.",
"Capitalization or capitalisation in English grammar is the use of a capital letter at the start of a word. English usage varies from capitalization in other languages.",
"Capital punishment (the death penalty) has existed in the United States since beforethe United States was a country. As of 2017, capital punishment is legal in 30 of the 50 states."]
}'
```
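Equivalently, from Python with `requests` (a sketch against a locally running proxy; the document list is shortened for brevity):
```python
import requests

resp = requests.post(
    "http://0.0.0.0:4000/cohere/v1/rerank",
    headers={
        "Authorization": "bearer sk-anything",  # swap in a virtual key if keys are enforced
        "Content-Type": "application/json",
    },
    json={
        "model": "rerank-english-v3.0",
        "query": "What is the capital of the United States?",
        "top_n": 3,
        "documents": [
            "Carson City is the capital city of the American state of Nevada.",
            "Washington, D.C. is the capital of the United States. It is a federal district.",
        ],
    },
)
print(resp.json())
```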
## Examples
Anything after `http://0.0.0.0:4000/cohere` is treated as a provider-specific route, and handled accordingly.
Key Changes:
| **Original Endpoint** | **Replace With** |
|------------------------------------------------------|-----------------------------------|
| `https://api.cohere.com` | `http://0.0.0.0:4000/cohere` (LITELLM_PROXY_BASE_URL="http://0.0.0.0:4000") |
| `bearer $CO_API_KEY` | `bearer anything` (use `bearer LITELLM_VIRTUAL_KEY` if Virtual Keys are setup on proxy) |
### **Example 1: Rerank endpoint**
#### LiteLLM Proxy Call
```bash
curl --request POST \
--url http://0.0.0.0:4000/cohere/v1/rerank \
--header 'accept: application/json' \
--header 'content-type: application/json' \
--header "Authorization: bearer sk-anything" \
--data '{
"model": "rerank-english-v3.0",
"query": "What is the capital of the United States?",
"top_n": 3,
"documents": ["Carson City is the capital city of the American state of Nevada.",
"The Commonwealth of the Northern Mariana Islands is a group of islands in the Pacific Ocean. Its capital is Saipan.",
"Washington, D.C. (also known as simply Washington or D.C., and officially as the District of Columbia) is the capital of the United States. It is a federal district.",
"Capitalization or capitalisation in English grammar is the use of a capital letter at the start of a word. English usage varies from capitalization in other languages.",
"Capital punishment (the death penalty) has existed in the United States since beforethe United States was a country. As of 2017, capital punishment is legal in 30 of the 50 states."]
}'
```
#### Direct Cohere API Call
```bash
curl --request POST \
--url https://api.cohere.com/v1/rerank \
--header 'accept: application/json' \
--header 'content-type: application/json' \
--header "Authorization: bearer $CO_API_KEY" \
--data '{
"model": "rerank-english-v3.0",
"query": "What is the capital of the United States?",
"top_n": 3,
"documents": ["Carson City is the capital city of the American state of Nevada.",
"The Commonwealth of the Northern Mariana Islands is a group of islands in the Pacific Ocean. Its capital is Saipan.",
"Washington, D.C. (also known as simply Washington or D.C., and officially as the District of Columbia) is the capital of the United States. It is a federal district.",
"Capitalization or capitalisation in English grammar is the use of a capital letter at the start of a word. English usage varies from capitalization in other languages.",
"Capital punishment (the death penalty) has existed in the United States since beforethe United States was a country. As of 2017, capital punishment is legal in 30 of the 50 states."]
}'
```
### **Example 2: Chat API**
#### LiteLLM Proxy Call
```bash
curl --request POST \
--url http://0.0.0.0:4000/cohere/v1/chat \
--header 'accept: application/json' \
--header 'content-type: application/json' \
--header "Authorization: bearer sk-anything" \
--data '{
"chat_history": [
{"role": "USER", "message": "Who discovered gravity?"},
{"role": "CHATBOT", "message": "The man who is widely credited with discovering gravity is Sir Isaac Newton"}
],
"message": "What year was he born?",
"connectors": [{"id": "web-search"}]
}'
```
#### Direct Cohere API Call
```bash
curl --request POST \
--url https://api.cohere.com/v1/chat \
--header 'accept: application/json' \
--header 'content-type: application/json' \
--header "Authorization: bearer $CO_API_KEY" \
--data '{
"chat_history": [
{"role": "USER", "message": "Who discovered gravity?"},
{"role": "CHATBOT", "message": "The man who is widely credited with discovering gravity is Sir Isaac Newton"}
],
"message": "What year was he born?",
"connectors": [{"id": "web-search"}]
}'
```
### **Example 3: Embedding**

#### LiteLLM Proxy Call
```bash
curl --request POST \
--url http://0.0.0.0:4000/cohere/v1/embed \
--header 'accept: application/json' \
--header 'content-type: application/json' \
--header "Authorization: bearer sk-anything" \
--data '{
"model": "embed-english-v3.0",
"texts": ["hello", "goodbye"],
"input_type": "classification"
}'
```
#### Direct Cohere API Call
```bash
curl --request POST \
--url https://api.cohere.com/v1/embed \
--header 'accept: application/json' \
--header 'content-type: application/json' \
--header "Authorization: bearer $CO_API_KEY" \
--data '{
"model": "embed-english-v3.0",
"texts": ["hello", "goodbye"],
"input_type": "classification"
}'
```
## Advanced - Use with Virtual Keys
Pre-requisites
- [Setup proxy with DB](../proxy/virtual_keys.md#setup)
Use this, to avoid giving developers the raw Cohere API key, but still letting them use Cohere endpoints.
### Usage
1. Setup environment
```bash
export DATABASE_URL=""
export LITELLM_MASTER_KEY=""
export COHERE_API_KEY=""
```
```bash
litellm
# RUNNING on http://0.0.0.0:4000
```
2. Generate virtual key
```bash
curl -X POST 'http://0.0.0.0:4000/key/generate' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-d '{}'
```
Expected Response
```bash
{
...
"key": "sk-1234ewknldferwedojwojw"
}
```
3. Test it!
```bash
curl --request POST \
--url http://0.0.0.0:4000/cohere/v1/rerank \
--header 'accept: application/json' \
--header 'content-type: application/json' \
--header "Authorization: bearer sk-1234ewknldferwedojwojw" \
--data '{
"model": "rerank-english-v3.0",
"query": "What is the capital of the United States?",
"top_n": 3,
"documents": ["Carson City is the capital city of the American state of Nevada.",
"The Commonwealth of the Northern Mariana Islands is a group of islands in the Pacific Ocean. Its capital is Saipan.",
"Washington, D.C. (also known as simply Washington or D.C., and officially as the District of Columbia) is the capital of the United States. It is a federal district.",
"Capitalization or capitalisation in English grammar is the use of a capital letter at the start of a word. English usage varies from capitalization in other languages.",
"Capital punishment (the death penalty) has existed in the United States since beforethe United States was a country. As of 2017, capital punishment is legal in 30 of the 50 states."]
}'
```

View file

@ -0,0 +1,223 @@
# Google AI Studio (Pass-Through)
Pass-through endpoints for Google AI Studio - call provider-specific endpoint, in native format (no translation).
Just replace `https://generativelanguage.googleapis.com` with `LITELLM_PROXY_BASE_URL/gemini` 🚀
#### **Example Usage**
```bash
curl 'http://0.0.0.0:4000/gemini/v1beta/models/gemini-1.5-flash:countTokens?key=sk-anything' \
-H 'Content-Type: application/json' \
-d '{
"contents": [{
"parts":[{
"text": "The quick brown fox jumps over the lazy dog."
}]
}]
}'
```
Supports **ALL** Google AI Studio Endpoints (including streaming).
[**See All Google AI Studio Endpoints**](https://ai.google.dev/api)
## Quick Start
Let's call the Gemini [`/countTokens` endpoint](https://ai.google.dev/api/tokens#method:-models.counttokens)
1. Add Gemini API Key to your environment
```bash
export GEMINI_API_KEY=""
```
2. Start LiteLLM Proxy
```bash
litellm
# RUNNING on http://0.0.0.0:4000
```
3. Test it!
Let's call the Google AI Studio token counting endpoint
```bash
curl 'http://0.0.0.0:4000/gemini/v1beta/models/gemini-1.5-flash:countTokens?key=anything' \
-H 'Content-Type: application/json' \
-d '{
"contents": [{
"parts":[{
"text": "The quick brown fox jumps over the lazy dog."
}]
}]
}'
```
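The same token-counting call from Python with `requests` (a sketch against a locally running proxy):
```python
import requests

resp = requests.post(
    "http://0.0.0.0:4000/gemini/v1beta/models/gemini-1.5-flash:countTokens",
    params={"key": "anything"},  # use a virtual key here if keys are enforced
    headers={"Content-Type": "application/json"},
    json={
        "contents": [
            {"parts": [{"text": "The quick brown fox jumps over the lazy dog."}]}
        ]
    },
)
print(resp.json())
```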
## Examples
Anything after `http://0.0.0.0:4000/gemini` is treated as a provider-specific route, and handled accordingly.
Key Changes:
| **Original Endpoint** | **Replace With** |
|------------------------------------------------------|-----------------------------------|
| `https://generativelanguage.googleapis.com` | `http://0.0.0.0:4000/gemini` (LITELLM_PROXY_BASE_URL="http://0.0.0.0:4000") |
| `key=$GOOGLE_API_KEY` | `key=anything` (use `key=LITELLM_VIRTUAL_KEY` if Virtual Keys are setup on proxy) |
### **Example 1: Counting tokens**
#### LiteLLM Proxy Call
```bash
curl http://0.0.0.0:4000/gemini/v1beta/models/gemini-1.5-flash:countTokens?key=anything \
-H 'Content-Type: application/json' \
-X POST \
-d '{
"contents": [{
"parts":[{
"text": "The quick brown fox jumps over the lazy dog."
}],
}],
}'
```
#### Direct Google AI Studio Call
```bash
curl https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:countTokens?key=$GOOGLE_API_KEY \
-H 'Content-Type: application/json' \
-X POST \
-d '{
"contents": [{
"parts":[{
"text": "The quick brown fox jumps over the lazy dog."
}],
}],
}'
```
### **Example 2: Generate content**
#### LiteLLM Proxy Call
```bash
curl "http://0.0.0.0:4000/gemini/v1beta/models/gemini-1.5-flash:generateContent?key=anything" \
-H 'Content-Type: application/json' \
-X POST \
-d '{
"contents": [{
"parts":[{"text": "Write a story about a magic backpack."}]
}]
}' 2> /dev/null
```
#### Direct Google AI Studio Call
```bash
curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent?key=$GOOGLE_API_KEY" \
-H 'Content-Type: application/json' \
-X POST \
-d '{
"contents": [{
"parts":[{"text": "Write a story about a magic backpack."}]
}]
}' 2> /dev/null
```
### **Example 3: Caching**

#### LiteLLM Proxy Call
```bash
curl -X POST "http://0.0.0.0:4000/gemini/v1beta/models/gemini-1.5-flash-001:generateContent?key=anything" \
-H 'Content-Type: application/json' \
-d '{
"contents": [
{
"parts":[{
"text": "Please summarize this transcript"
}],
"role": "user"
},
],
"cachedContent": "'$CACHE_NAME'"
}'
```
#### Direct Google AI Studio Call
```bash
curl -X POST "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash-001:generateContent?key=$GOOGLE_API_KEY" \
-H 'Content-Type: application/json' \
-d '{
"contents": [
{
"parts":[{
"text": "Please summarize this transcript"
}],
"role": "user"
},
],
"cachedContent": "'$CACHE_NAME'"
}'
```
## Advanced - Use with Virtual Keys
Pre-requisites
- [Setup proxy with DB](../proxy/virtual_keys.md#setup)
Use this, to avoid giving developers the raw Google AI Studio key, but still letting them use Google AI Studio endpoints.
### Usage
1. Setup environment
```bash
export DATABASE_URL=""
export LITELLM_MASTER_KEY=""
export GEMINI_API_KEY=""
```
```bash
litellm
# RUNNING on http://0.0.0.0:4000
```
2. Generate virtual key
```bash
curl -X POST 'http://0.0.0.0:4000/key/generate' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-d '{}'
```
Expected Response
```bash
{
...
"key": "sk-1234ewknldferwedojwojw"
}
```
3. Test it!
```bash
curl 'http://0.0.0.0:4000/gemini/v1beta/models/gemini-1.5-flash:countTokens?key=sk-1234ewknldferwedojwojw' \
-H 'Content-Type: application/json' \
-d '{
"contents": [{
"parts":[{
"text": "The quick brown fox jumps over the lazy dog."
}]
}]
}'
```

View file

@ -0,0 +1,132 @@
# Langfuse Endpoints (Pass-Through)
Pass-through endpoints for Langfuse - call langfuse endpoints with LiteLLM Virtual Key.
Just replace `https://us.cloud.langfuse.com` with `LITELLM_PROXY_BASE_URL/langfuse` 🚀
#### **Example Usage**
```python
from langfuse import Langfuse
langfuse = Langfuse(
host="http://localhost:4000/langfuse", # your litellm proxy endpoint
public_key="anything", # no key required since this is a pass through
secret_key="LITELLM_VIRTUAL_KEY", # no key required since this is a pass through
)
print("sending langfuse trace request")
trace = langfuse.trace(name="test-trace-litellm-proxy-passthrough")
print("flushing langfuse request")
langfuse.flush()
print("flushed langfuse request")
```
Supports **ALL** Langfuse Endpoints.
[**See All Langfuse Endpoints**](https://api.reference.langfuse.com/)
## Quick Start
Let's log a trace to Langfuse.
1. Add Langfuse Public/Private keys to environment
```bash
export LANGFUSE_PUBLIC_KEY=""
export LANGFUSE_PRIVATE_KEY=""
```
2. Start LiteLLM Proxy
```bash
litellm
# RUNNING on http://0.0.0.0:4000
```
3. Test it!
Let's log a trace to Langfuse!
```python
from langfuse import Langfuse
langfuse = Langfuse(
host="http://localhost:4000/langfuse", # your litellm proxy endpoint
public_key="anything", # no key required since this is a pass through
secret_key="anything", # no key required since this is a pass through
)
print("sending langfuse trace request")
trace = langfuse.trace(name="test-trace-litellm-proxy-passthrough")
print("flushing langfuse request")
langfuse.flush()
print("flushed langfuse request")
```
## Advanced - Use with Virtual Keys
Pre-requisites
- [Setup proxy with DB](../proxy/virtual_keys.md#setup)
Use this, to avoid giving developers the raw Langfuse keys, but still letting them use Langfuse endpoints.
### Usage
1. Setup environment
```bash
export DATABASE_URL=""
export LITELLM_MASTER_KEY=""
export LANGFUSE_PUBLIC_KEY=""
export LANGFUSE_PRIVATE_KEY=""
```
```bash
litellm
# RUNNING on http://0.0.0.0:4000
```
2. Generate virtual key
```bash
curl -X POST 'http://0.0.0.0:4000/key/generate' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-d '{}'
```
Expected Response
```bash
{
...
"key": "sk-1234ewknldferwedojwojw"
}
```
3. Test it!
```python
from langfuse import Langfuse
langfuse = Langfuse(
host="http://localhost:4000/langfuse", # your litellm proxy endpoint
public_key="anything", # no key required since this is a pass through
secret_key="sk-1234ewknldferwedojwojw", # no key required since this is a pass through
)
print("sending langfuse trace request")
trace = langfuse.trace(name="test-trace-litellm-proxy-passthrough")
print("flushing langfuse request")
langfuse.flush()
print("flushed langfuse request")
```
## [Advanced - Log to separate langfuse projects (by key/team)](../proxy/team_logging.md)

View file

@ -0,0 +1,101 @@
# [BETA] Vertex AI Endpoints (Pass-Through)
Pass-through endpoints for Vertex AI - call provider-specific endpoint, in native format (no translation).
:::tip
Looking for the Unified API (OpenAI format) for VertexAI? [Go here - using vertexAI with LiteLLM SDK or LiteLLM Proxy Server](../docs/providers/vertex.md)
:::
## Supported API Endpoints
- Gemini API
- Embeddings API
- Imagen API
- Code Completion API
- Batch prediction API
- Tuning API
- CountTokens API
## Quick Start Usage
#### 1. Set `default_vertex_config` on your `config.yaml`
Add the following credentials to your litellm config.yaml to use the Vertex AI endpoints.
```yaml
default_vertex_config:
vertex_project: "adroit-crow-413218"
vertex_location: "us-central1"
vertex_credentials: "/Users/ishaanjaffer/Downloads/adroit-crow-413218-a956eef1a2a8.json" # Add path to service account.json
```
#### 2. Start litellm proxy
```shell
litellm --config /path/to/config.yaml
```
#### 3. Test it
```shell
curl http://localhost:4000/vertex-ai/publishers/google/models/textembedding-gecko@001:countTokens \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{"instances":[{"content": "gm"}]}'
```
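Or the same call from Python with `requests` (a sketch; assumes the proxy runs locally with the `default_vertex_config` above and `sk-1234` is your proxy key):
```python
import requests

resp = requests.post(
    "http://localhost:4000/vertex-ai/publishers/google/models/textembedding-gecko@001:countTokens",
    headers={
        "Content-Type": "application/json",
        "Authorization": "Bearer sk-1234",  # your litellm proxy key
    },
    json={"instances": [{"content": "gm"}]},
)
print(resp.json())
```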
## Usage Examples
### Gemini API (Generate Content)
```shell
curl http://localhost:4000/vertex-ai/publishers/google/models/gemini-1.5-flash-001:generateContent \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{"contents":[{"role": "user", "parts":[{"text": "hi"}]}]}'
```
### Embeddings API
```shell
curl http://localhost:4000/vertex-ai/publishers/google/models/textembedding-gecko@001:predict \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{"instances":[{"content": "gm"}]}'
```
### Imagen API
```shell
curl http://localhost:4000/vertex-ai/publishers/google/models/imagen-3.0-generate-001:predict \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{"instances":[{"prompt": "make an otter"}], "parameters": {"sampleCount": 1}}'
```
### Count Tokens API
```shell
curl http://localhost:4000/vertex-ai/publishers/google/models/gemini-1.5-flash-001:countTokens \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{"contents":[{"role": "user", "parts":[{"text": "hi"}]}]}'
```
### Tuning API
Create Fine Tuning Job
```shell
curl http://localhost:4000/vertex-ai/tuningJobs \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"baseModel": "gemini-1.0-pro-002",
"supervisedTuningSpec" : {
"training_dataset_uri": "gs://cloud-samples-data/ai-platform/generative_ai/sft_train_data.jsonl"
}
}'
```

View file

@ -22,6 +22,7 @@ Anthropic API fails requests when `max_tokens` are not passed. Due to this litel
import os import os
os.environ["ANTHROPIC_API_KEY"] = "your-api-key" os.environ["ANTHROPIC_API_KEY"] = "your-api-key"
# os.environ["ANTHROPIC_API_BASE"] = "" # [OPTIONAL] or 'ANTHROPIC_BASE_URL'
``` ```
## Usage ## Usage
@ -55,7 +56,7 @@ for chunk in response:
print(chunk["choices"][0]["delta"]["content"]) # same as openai format print(chunk["choices"][0]["delta"]["content"]) # same as openai format
``` ```
## OpenAI Proxy Usage ## Usage with LiteLLM Proxy
Here's how to call Anthropic with the LiteLLM Proxy Server Here's how to call Anthropic with the LiteLLM Proxy Server
@ -68,14 +69,6 @@ export ANTHROPIC_API_KEY="your-api-key"
### 2. Start the proxy ### 2. Start the proxy
<Tabs> <Tabs>
<TabItem value="cli" label="cli">
```bash
$ litellm --model claude-3-opus-20240229
# Server running on http://0.0.0.0:4000
```
</TabItem>
<TabItem value="config" label="config.yaml"> <TabItem value="config" label="config.yaml">
```yaml ```yaml
@ -90,6 +83,55 @@ model_list:
litellm --config /path/to/config.yaml litellm --config /path/to/config.yaml
``` ```
</TabItem> </TabItem>
<TabItem value="config-all" label="config - default all Anthropic Model">
Use this if you want to make requests to `claude-3-haiku-20240307`, `claude-3-opus-20240229`, `claude-2.1` without defining them on the config.yaml
#### Required env variables
```
ANTHROPIC_API_KEY=sk-ant****
```
```yaml
model_list:
- model_name: "*"
litellm_params:
model: "*"
```
```bash
litellm --config /path/to/config.yaml
```
Example Request for this config.yaml
**Ensure you use `anthropic/` prefix to route the request to Anthropic API**
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "anthropic/claude-3-haiku-20240307",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}
'
```
</TabItem>
<TabItem value="cli" label="cli">
```bash
$ litellm --model claude-3-opus-20240229
# Server running on http://0.0.0.0:4000
```
</TabItem>
</Tabs> </Tabs>
### 3. Test it ### 3. Test it
@ -183,9 +225,336 @@ print(response)
| claude-instant-1.2 | `completion('claude-instant-1.2', messages)` | `os.environ['ANTHROPIC_API_KEY']` | | claude-instant-1.2 | `completion('claude-instant-1.2', messages)` | `os.environ['ANTHROPIC_API_KEY']` |
| claude-instant-1 | `completion('claude-instant-1', messages)` | `os.environ['ANTHROPIC_API_KEY']` | | claude-instant-1 | `completion('claude-instant-1', messages)` | `os.environ['ANTHROPIC_API_KEY']` |
## Advanced ## **Prompt Caching**
## Usage - Function Calling Use Anthropic Prompt Caching
[Relevant Anthropic API Docs](https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching)
### Caching - Large Context Caching
This example demonstrates basic Prompt Caching usage, caching the full text of the legal agreement as a prefix while keeping the user instruction uncached.
<Tabs>
<TabItem value="sdk" label="LiteLLM SDK">
```python
response = await litellm.acompletion(
model="anthropic/claude-3-5-sonnet-20240620",
messages=[
{
"role": "system",
"content": [
{
"type": "text",
"text": "You are an AI assistant tasked with analyzing legal documents.",
},
{
"type": "text",
"text": "Here is the full text of a complex legal agreement",
"cache_control": {"type": "ephemeral"},
},
],
},
{
"role": "user",
"content": "what are the key terms and conditions in this agreement?",
},
],
extra_headers={
"anthropic-version": "2023-06-01",
"anthropic-beta": "prompt-caching-2024-07-31",
},
)
```
</TabItem>
<TabItem value="proxy" label="LiteLLM Proxy">
:::info
LiteLLM Proxy is OpenAI compatible
This is an example using the OpenAI Python SDK sending a request to LiteLLM Proxy
Assuming you have a model=`anthropic/claude-3-5-sonnet-20240620` on the [litellm proxy config.yaml](#usage-with-litellm-proxy)
:::
```python
import openai
client = openai.AsyncOpenAI(
api_key="anything", # litellm proxy api key
base_url="http://0.0.0.0:4000" # litellm proxy base url
)
response = await client.chat.completions.create(
model="anthropic/claude-3-5-sonnet-20240620",
messages=[
{
"role": "system",
"content": [
{
"type": "text",
"text": "You are an AI assistant tasked with analyzing legal documents.",
},
{
"type": "text",
"text": "Here is the full text of a complex legal agreement",
"cache_control": {"type": "ephemeral"},
},
],
},
{
"role": "user",
"content": "what are the key terms and conditions in this agreement?",
},
],
extra_headers={
"anthropic-version": "2023-06-01",
"anthropic-beta": "prompt-caching-2024-07-31",
},
)
```
</TabItem>
</Tabs>
### Caching - Tools definitions
In this example, we demonstrate caching tool definitions.
The cache_control parameter is placed on the final tool
<Tabs>
<TabItem value="sdk" label="LiteLLM SDK">
```python
import litellm
response = await litellm.acompletion(
model="anthropic/claude-3-5-sonnet-20240620",
messages = [{"role": "user", "content": "What's the weather like in Boston today?"}],
tools = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
},
"required": ["location"],
},
"cache_control": {"type": "ephemeral"}
},
}
],
extra_headers={
"anthropic-version": "2023-06-01",
"anthropic-beta": "prompt-caching-2024-07-31",
},
)
```
</TabItem>
<TabItem value="proxy" label="LiteLLM Proxy">
:::info
LiteLLM Proxy is OpenAI compatible
This is an example using the OpenAI Python SDK sending a request to LiteLLM Proxy
Assuming you have a model=`anthropic/claude-3-5-sonnet-20240620` on the [litellm proxy config.yaml](#usage-with-litellm-proxy)
:::
```python
import openai
client = openai.AsyncOpenAI(
api_key="anything", # litellm proxy api key
base_url="http://0.0.0.0:4000" # litellm proxy base url
)
response = await client.chat.completions.create(
model="anthropic/claude-3-5-sonnet-20240620",
messages = [{"role": "user", "content": "What's the weather like in Boston today?"}],
tools = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
},
"required": ["location"],
},
"cache_control": {"type": "ephemeral"}
},
}
],
extra_headers={
"anthropic-version": "2023-06-01",
"anthropic-beta": "prompt-caching-2024-07-31",
},
)
```
</TabItem>
</Tabs>
### Caching - Continuing Multi-Turn Convo
In this example, we demonstrate how to use Prompt Caching in a multi-turn conversation.
The cache_control parameter is placed on the system message to designate it as part of the static prefix.
The conversation history (previous messages) is included in the messages array. The final turn is marked with cache-control, for continuing in followups. The second-to-last user message is marked for caching with the cache_control parameter, so that this checkpoint can read from the previous cache.
<Tabs>
<TabItem value="sdk" label="LiteLLM SDK">
```python
import litellm
response = await litellm.acompletion(
model="anthropic/claude-3-5-sonnet-20240620",
messages=[
# System Message
{
"role": "system",
"content": [
{
"type": "text",
"text": "Here is the full text of a complex legal agreement"
* 400,
"cache_control": {"type": "ephemeral"},
}
],
},
# marked for caching with the cache_control parameter, so that this checkpoint can read from the previous cache.
{
"role": "user",
"content": [
{
"type": "text",
"text": "What are the key terms and conditions in this agreement?",
"cache_control": {"type": "ephemeral"},
}
],
},
{
"role": "assistant",
"content": "Certainly! the key terms and conditions are the following: the contract is 1 year long for $10/mo",
},
# The final turn is marked with cache-control, for continuing in followups.
{
"role": "user",
"content": [
{
"type": "text",
"text": "What are the key terms and conditions in this agreement?",
"cache_control": {"type": "ephemeral"},
}
],
},
],
extra_headers={
"anthropic-version": "2023-06-01",
"anthropic-beta": "prompt-caching-2024-07-31",
},
)
```
</TabItem>
<TabItem value="proxy" label="LiteLLM Proxy">
:::info
LiteLLM Proxy is OpenAI compatible
This is an example using the OpenAI Python SDK sending a request to LiteLLM Proxy
Assuming you have a model=`anthropic/claude-3-5-sonnet-20240620` on the [litellm proxy config.yaml](#usage-with-litellm-proxy)
:::
```python
import openai
client = openai.AsyncOpenAI(
api_key="anything", # litellm proxy api key
base_url="http://0.0.0.0:4000" # litellm proxy base url
)
response = await client.chat.completions.create(
model="anthropic/claude-3-5-sonnet-20240620",
messages=[
# System Message
{
"role": "system",
"content": [
{
"type": "text",
"text": "Here is the full text of a complex legal agreement"
* 400,
"cache_control": {"type": "ephemeral"},
}
],
},
# marked for caching with the cache_control parameter, so that this checkpoint can read from the previous cache.
{
"role": "user",
"content": [
{
"type": "text",
"text": "What are the key terms and conditions in this agreement?",
"cache_control": {"type": "ephemeral"},
}
],
},
{
"role": "assistant",
"content": "Certainly! the key terms and conditions are the following: the contract is 1 year long for $10/mo",
},
# The final turn is marked with cache-control, for continuing in followups.
{
"role": "user",
"content": [
{
"type": "text",
"text": "What are the key terms and conditions in this agreement?",
"cache_control": {"type": "ephemeral"},
}
],
},
],
extra_headers={
"anthropic-version": "2023-06-01",
"anthropic-beta": "prompt-caching-2024-07-31",
},
)
```
</TabItem>
</Tabs>
## **Function/Tool Calling**
:::info :::info
@ -374,6 +743,20 @@ resp = litellm.completion(
print(f"\nResponse: {resp}") print(f"\nResponse: {resp}")
``` ```
## **Passing Extra Headers to Anthropic API**
Pass `extra_headers: dict` to `litellm.completion`
```python
from litellm import completion
messages = [{"role": "user", "content": "What is Anthropic?"}]
response = completion(
model="claude-3-5-sonnet-20240620",
messages=messages,
extra_headers={"anthropic-beta": "max-tokens-3-5-sonnet-2024-07-15"}
)
```
## Usage - "Assistant Pre-fill" ## Usage - "Assistant Pre-fill"
You can "put words in Claude's mouth" by including an `assistant` role message as the last item in the `messages` array. You can "put words in Claude's mouth" by including an `assistant` role message as the last item in the `messages` array.

View file

@ -1,10 +1,18 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem'
# AWS Sagemaker # AWS Sagemaker
LiteLLM supports All Sagemaker Huggingface Jumpstart Models LiteLLM supports All Sagemaker Huggingface Jumpstart Models
:::tip
**We support ALL Sagemaker models, just set `model=sagemaker/<any-model-on-sagemaker>` as a prefix when sending litellm requests**
:::
### API KEYS ### API KEYS
```python ```python
!pip install boto3
os.environ["AWS_ACCESS_KEY_ID"] = "" os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = "" os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = "" os.environ["AWS_REGION_NAME"] = ""
@ -27,6 +35,327 @@ response = completion(
) )
``` ```
### Usage - Streaming
Sagemaker currently does not support streaming - LiteLLM fakes streaming by returning chunks of the response string
```python
import os
from litellm import completion
os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""
response = completion(
model="sagemaker/jumpstart-dft-meta-textgeneration-llama-2-7b",
messages=[{ "content": "Hello, how are you?","role": "user"}],
temperature=0.2,
max_tokens=80,
stream=True,
)
for chunk in response:
print(chunk)
```
## **LiteLLM Proxy Usage**
Here's how to call Sagemaker with the LiteLLM Proxy Server
### 1. Setup config.yaml
```yaml
model_list:
- model_name: jumpstart-model
litellm_params:
model: sagemaker/jumpstart-dft-hf-textgeneration1-mp-20240815-185614
aws_access_key_id: os.environ/CUSTOM_AWS_ACCESS_KEY_ID
aws_secret_access_key: os.environ/CUSTOM_AWS_SECRET_ACCESS_KEY
aws_region_name: os.environ/CUSTOM_AWS_REGION_NAME
```
All possible auth params:
```
aws_access_key_id: Optional[str],
aws_secret_access_key: Optional[str],
aws_session_token: Optional[str],
aws_region_name: Optional[str],
aws_session_name: Optional[str],
aws_profile_name: Optional[str],
aws_role_name: Optional[str],
aws_web_identity_token: Optional[str],
```
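These auth params can also be passed per-request via the SDK instead of the proxy config - a minimal sketch with placeholder values (any of the params above can be supplied this way):
```python
from litellm import completion
# Sketch: pass AWS auth params directly on the request (values are placeholders)
response = completion(
    model="sagemaker/jumpstart-dft-hf-textgeneration1-mp-20240815-185614",
    messages=[{"content": "Hello, how are you?", "role": "user"}],
    aws_access_key_id="my-access-key-id",
    aws_secret_access_key="my-secret-access-key",
    aws_region_name="us-west-2",
)
print(response)
```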
### 2. Start the proxy
```bash
litellm --config /path/to/config.yaml
```
### 3. Test it
<Tabs>
<TabItem value="Curl" label="Curl Request">
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "jumpstart-model",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}
'
```
</TabItem>
<TabItem value="openai" label="OpenAI v1.0.0+">
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
response = client.chat.completions.create(model="jumpstart-model", messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
])
print(response)
```
</TabItem>
<TabItem value="langchain" label="Langchain">
```python
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
ChatPromptTemplate,
HumanMessagePromptTemplate,
SystemMessagePromptTemplate,
)
from langchain.schema import HumanMessage, SystemMessage
chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:4000", # set openai_api_base to the LiteLLM Proxy
model = "jumpstart-model",
temperature=0.1
)
messages = [
SystemMessage(
content="You are a helpful assistant that im using to make a test request to."
),
HumanMessage(
content="test from litellm. tell me why it's amazing in 1 sentence"
),
]
response = chat(messages)
print(response)
```
</TabItem>
</Tabs>
## Set temperature, top p, etc.
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import os
from litellm import completion
os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""
response = completion(
model="sagemaker/jumpstart-dft-hf-textgeneration1-mp-20240815-185614",
messages=[{ "content": "Hello, how are you?","role": "user"}],
temperature=0.7,
top_p=1
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
**Set on yaml**
```yaml
model_list:
- model_name: jumpstart-model
litellm_params:
model: sagemaker/jumpstart-dft-hf-textgeneration1-mp-20240815-185614
temperature: <your-temp>
top_p: <your-top-p>
```
**Set on request**
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(model="jumpstart-model", messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
],
temperature=0.7,
top_p=1
)
print(response)
```
</TabItem>
</Tabs>
## **Allow setting temperature=0** for Sagemaker
By default, when `temperature=0` is sent in requests to LiteLLM, LiteLLM rounds it up to `temperature=0.1`, since Sagemaker rejects most requests with `temperature=0`.
If you want to send `temperature=0` for your model, here's how to set it up (since Sagemaker can host any kind of model, some models do allow zero temperature):
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import os
from litellm import completion
os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""
response = completion(
model="sagemaker/jumpstart-dft-hf-textgeneration1-mp-20240815-185614",
messages=[{ "content": "Hello, how are you?","role": "user"}],
temperature=0,
aws_sagemaker_allow_zero_temp=True,
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
**Set `aws_sagemaker_allow_zero_temp` on yaml**
```yaml
model_list:
- model_name: jumpstart-model
litellm_params:
model: sagemaker/jumpstart-dft-hf-textgeneration1-mp-20240815-185614
aws_sagemaker_allow_zero_temp: true
```
**Set `temperature=0` on request**
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(model="jumpstart-model", messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
],
temperature=0,
)
print(response)
```
</TabItem>
</Tabs>
## Pass provider-specific params
If you pass a non-openai param to litellm, we'll assume it's provider-specific and send it as a kwarg in the request body. [See more](../completion/input.md#provider-specific-params)
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import os
from litellm import completion
os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""
response = completion(
model="sagemaker/jumpstart-dft-hf-textgeneration1-mp-20240815-185614",
messages=[{ "content": "Hello, how are you?","role": "user"}],
top_k=1 # 👈 PROVIDER-SPECIFIC PARAM
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
**Set on yaml**
```yaml
model_list:
- model_name: jumpstart-model
litellm_params:
model: sagemaker/jumpstart-dft-hf-textgeneration1-mp-20240815-185614
top_k: 1 # 👈 PROVIDER-SPECIFIC PARAM
```
**Set on request**
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(model="jumpstart-model", messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
],
temperature=0.7,
extra_body={
"top_k": 1 # 👈 PROVIDER-SPECIFIC PARAM
}
)
print(response)
```
</TabItem>
</Tabs>
### Passing Inference Component Name ### Passing Inference Component Name
If you have multiple models on an endpoint, you'll need to specify the individual model names, do this via `model_id`. If you have multiple models on an endpoint, you'll need to specify the individual model names, do this via `model_id`.
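As a rough sketch of what this looks like with the SDK (the endpoint and component names below are placeholders):
```python
from litellm import completion
# Sketch: one endpoint hosting multiple models - select the component via model_id
response = completion(
    model="sagemaker/my-multi-model-endpoint",   # placeholder endpoint name
    model_id="my-inference-component-name",      # placeholder inference component name
    messages=[{"content": "Hello, how are you?", "role": "user"}],
)
print(response)
```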
@ -85,29 +414,16 @@ response = completion(
You can also pass in your own [custom prompt template](../completion/prompt_formatting.md#format-prompt-yourself) You can also pass in your own [custom prompt template](../completion/prompt_formatting.md#format-prompt-yourself)
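For illustration, a sketch of registering a template for your endpoint via `litellm.register_prompt_template` (the role markers below are placeholders - use whatever format your model expects):
```python
import litellm
# Sketch: register a custom prompt format for a specific sagemaker endpoint
litellm.register_prompt_template(
    model="sagemaker/jumpstart-dft-hf-textgeneration1-mp-20240815-185614",
    initial_prompt_value="<s>",  # placeholder prompt prefix
    roles={
        "system": {"pre_message": "[SYS] ", "post_message": " [/SYS]\n"},
        "user": {"pre_message": "[INST] ", "post_message": " [/INST]\n"},
        "assistant": {"pre_message": "", "post_message": "\n"},
    },
    final_prompt_value="\n",  # placeholder prompt suffix
)
```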
### Usage - Streaming
Sagemaker currently does not support streaming - LiteLLM fakes streaming by returning chunks of the response string
```python
import os
from litellm import completion
os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""
response = completion(
model="sagemaker/jumpstart-dft-meta-textgeneration-llama-2-7b",
messages=[{ "content": "Hello, how are you?","role": "user"}],
temperature=0.2,
max_tokens=80,
stream=True,
)
for chunk in response:
print(chunk)
```
### Completion Models ### Completion Models
:::tip
**We support ALL Sagemaker models, just set `model=sagemaker/<any-model-on-sagemaker>` as a prefix when sending litellm requests**
:::
Here's an example of using a sagemaker model with LiteLLM Here's an example of using a sagemaker model with LiteLLM
| Model Name | Function Call | | Model Name | Function Call |
@ -120,7 +436,7 @@ Here's an example of using a sagemaker model with LiteLLM
| Meta Llama 2 70B | `completion(model='sagemaker/jumpstart-dft-meta-textgeneration-llama-2-70b', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`, `os.environ['AWS_REGION_NAME']` | | Meta Llama 2 70B | `completion(model='sagemaker/jumpstart-dft-meta-textgeneration-llama-2-70b', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`, `os.environ['AWS_REGION_NAME']` |
| Meta Llama 2 70B (Chat/Fine-tuned) | `completion(model='sagemaker/jumpstart-dft-meta-textgeneration-llama-2-70b-b-f', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`, `os.environ['AWS_REGION_NAME']` | | Meta Llama 2 70B (Chat/Fine-tuned) | `completion(model='sagemaker/jumpstart-dft-meta-textgeneration-llama-2-70b-b-f', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`, `os.environ['AWS_REGION_NAME']` |
### Embedding Models ## Embedding Models
LiteLLM supports all Sagemaker Jumpstart Huggingface Embedding models. Here's how to call it: LiteLLM supports all Sagemaker Jumpstart Huggingface Embedding models. Here's how to call it:

View file

@ -66,8 +66,15 @@ response = litellm.completion(
## Azure OpenAI Chat Completion Models ## Azure OpenAI Chat Completion Models
:::tip
**We support ALL Azure models, just set `model=azure/<your deployment name>` as a prefix when sending litellm requests**
:::
| Model Name | Function Call | | Model Name | Function Call |
|------------------|----------------------------------------| |------------------|----------------------------------------|
| gpt-4o-mini | `completion('azure/<your deployment name>', messages)` |
| gpt-4o | `completion('azure/<your deployment name>', messages)` | | gpt-4o | `completion('azure/<your deployment name>', messages)` |
| gpt-4 | `completion('azure/<your deployment name>', messages)` | | gpt-4 | `completion('azure/<your deployment name>', messages)` |
| gpt-4-0314 | `completion('azure/<your deployment name>', messages)` | | gpt-4-0314 | `completion('azure/<your deployment name>', messages)` |

View file

@ -36,40 +36,40 @@ response = completion(
) )
``` ```
## OpenAI Proxy Usage ## LiteLLM Proxy Usage
Here's how to call Anthropic with the LiteLLM Proxy Server Here's how to call Anthropic with the LiteLLM Proxy Server
### 1. Save key in your environment ### 1. Setup config.yaml
```bash
export AWS_ACCESS_KEY_ID=""
export AWS_SECRET_ACCESS_KEY=""
export AWS_REGION_NAME=""
```
### 2. Start the proxy
<Tabs>
<TabItem value="cli" label="CLI">
```bash
$ litellm --model anthropic.claude-3-sonnet-20240229-v1:0
# Server running on http://0.0.0.0:4000
```
</TabItem>
<TabItem value="config" label="config.yaml">
```yaml ```yaml
model_list: model_list:
- model_name: bedrock-claude-v1 - model_name: bedrock-claude-v1
litellm_params: litellm_params:
model: bedrock/anthropic.claude-instant-v1 model: bedrock/anthropic.claude-instant-v1
aws_access_key_id: os.environ/CUSTOM_AWS_ACCESS_KEY_ID
aws_secret_access_key: os.environ/CUSTOM_AWS_SECRET_ACCESS_KEY
aws_region_name: os.environ/CUSTOM_AWS_REGION_NAME
``` ```
</TabItem>
</Tabs>
All possible auth params:
```
aws_access_key_id: Optional[str],
aws_secret_access_key: Optional[str],
aws_session_token: Optional[str],
aws_region_name: Optional[str],
aws_session_name: Optional[str],
aws_profile_name: Optional[str],
aws_role_name: Optional[str],
aws_web_identity_token: Optional[str],
```
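These can also be passed per-request via the SDK - for example, a sketch of STS role-based auth (the role ARN and session name are placeholders):
```python
from litellm import completion
# Sketch: role-based auth on a single request (values are placeholders)
response = completion(
    model="bedrock/anthropic.claude-instant-v1",
    messages=[{"role": "user", "content": "Hello, how are you?"}],
    aws_region_name="us-east-1",
    aws_role_name="arn:aws:iam::123456789012:role/my-bedrock-role",  # placeholder ARN
    aws_session_name="my-litellm-session",
)
print(response)
```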
### 2. Start the proxy
```bash
litellm --config /path/to/config.yaml
```
### 3. Test it ### 3. Test it
@ -360,6 +360,120 @@ resp = litellm.completion(
print(f"\nResponse: {resp}") print(f"\nResponse: {resp}")
``` ```
## Usage - Bedrock Guardrails
Example of using [Bedrock Guardrails with LiteLLM](https://docs.aws.amazon.com/bedrock/latest/userguide/guardrails-use-converse-api.html)
<Tabs>
<TabItem value="sdk" label="LiteLLM SDK">
```python
from litellm import completion
# set env
os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""
response = completion(
model="anthropic.claude-v2",
messages=[
{
"content": "where do i buy coffee from? ",
"role": "user",
}
],
max_tokens=10,
guardrailConfig={
"guardrailIdentifier": "ff6ujrregl1q", # The identifier (ID) for the guardrail.
"guardrailVersion": "DRAFT", # The version of the guardrail.
"trace": "disabled", # The trace behavior for the guardrail. Can either be "disabled" or "enabled"
},
)
```
</TabItem>
<TabItem value="proxy" label="Proxy on request">
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(model="anthropic.claude-v2", messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
],
temperature=0.7,
extra_body={
"guardrailConfig": {
"guardrailIdentifier": "ff6ujrregl1q", # The identifier (ID) for the guardrail.
"guardrailVersion": "DRAFT", # The version of the guardrail.
"trace": "disabled", # The trace behavior for the guardrail. Can either be "disabled" or "enabled"
},
}
)
print(response)
```
</TabItem>
<TabItem value="proxy-config" label="Proxy on config.yaml">
1. Update config.yaml
```yaml
model_list:
- model_name: bedrock-claude-v1
litellm_params:
model: bedrock/anthropic.claude-instant-v1
aws_access_key_id: os.environ/CUSTOM_AWS_ACCESS_KEY_ID
aws_secret_access_key: os.environ/CUSTOM_AWS_SECRET_ACCESS_KEY
aws_region_name: os.environ/CUSTOM_AWS_REGION_NAME
guardrailConfig: {
"guardrailIdentifier": "ff6ujrregl1q", # The identifier (ID) for the guardrail.
"guardrailVersion": "DRAFT", # The version of the guardrail.
"trace": "disabled", # The trace behavior for the guardrail. Can either be "disabled" or "enabled"
}
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(model="bedrock-claude-v1", messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
],
temperature=0.7
)
print(response)
```
</TabItem>
</Tabs>
## Usage - "Assistant Pre-fill" ## Usage - "Assistant Pre-fill"
If you're using Anthropic's Claude with Bedrock, you can "put words in Claude's mouth" by including an `assistant` role message as the last item in the `messages` array. If you're using Anthropic's Claude with Bedrock, you can "put words in Claude's mouth" by including an `assistant` role message as the last item in the `messages` array.
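A minimal sketch (the model, prompt, and pre-fill text are illustrative):
```python
import os
from litellm import completion
os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""
# The trailing assistant message is the pre-fill - Claude continues from "["
response = completion(
    model="bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
    messages=[
        {"role": "user", "content": "List three primary colors as a JSON array."},
        {"role": "assistant", "content": "["},
    ],
)
print(response.choices[0].message.content)
```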
@ -623,7 +737,7 @@ response = litellm.embedding(
## Supported AWS Bedrock Models ## Supported AWS Bedrock Models
Here's an example of using a bedrock model with LiteLLM Here's an example of using a bedrock model with LiteLLM. For a complete list, refer to the [model cost map](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json)
| Model Name | Command | | Model Name | Command |
|----------------------------|------------------------------------------------------------------| |----------------------------|------------------------------------------------------------------|
@ -641,6 +755,7 @@ Here's an example of using a bedrock model with LiteLLM
| Cohere Command | `completion(model='bedrock/cohere.command-text-v14', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`, `os.environ['AWS_REGION_NAME']` | | Cohere Command | `completion(model='bedrock/cohere.command-text-v14', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`, `os.environ['AWS_REGION_NAME']` |
| AI21 J2-Mid | `completion(model='bedrock/ai21.j2-mid-v1', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`, `os.environ['AWS_REGION_NAME']` | | AI21 J2-Mid | `completion(model='bedrock/ai21.j2-mid-v1', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`, `os.environ['AWS_REGION_NAME']` |
| AI21 J2-Ultra | `completion(model='bedrock/ai21.j2-ultra-v1', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`, `os.environ['AWS_REGION_NAME']` | | AI21 J2-Ultra | `completion(model='bedrock/ai21.j2-ultra-v1', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`, `os.environ['AWS_REGION_NAME']` |
| AI21 Jamba-Instruct | `completion(model='bedrock/ai21.jamba-instruct-v1:0', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`, `os.environ['AWS_REGION_NAME']` |
| Meta Llama 2 Chat 13b | `completion(model='bedrock/meta.llama2-13b-chat-v1', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`, `os.environ['AWS_REGION_NAME']` | | Meta Llama 2 Chat 13b | `completion(model='bedrock/meta.llama2-13b-chat-v1', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`, `os.environ['AWS_REGION_NAME']` |
| Meta Llama 2 Chat 70b | `completion(model='bedrock/meta.llama2-70b-chat-v1', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`, `os.environ['AWS_REGION_NAME']` | | Meta Llama 2 Chat 70b | `completion(model='bedrock/meta.llama2-70b-chat-v1', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`, `os.environ['AWS_REGION_NAME']` |
| Mistral 7B Instruct | `completion(model='bedrock/mistral.mistral-7b-instruct-v0:2', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`, `os.environ['AWS_REGION_NAME']` | | Mistral 7B Instruct | `completion(model='bedrock/mistral.mistral-7b-instruct-v0:2', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`, `os.environ['AWS_REGION_NAME']` |

View file

@ -0,0 +1,168 @@
# Custom API Server (Custom Format)
Call your custom torch-serve / internal LLM APIs via LiteLLM
:::info
- For calling an openai-compatible endpoint, [go here](./openai_compatible.md)
- For modifying incoming/outgoing calls on proxy, [go here](../proxy/call_hooks.md)
:::
## Quick Start
```python
import litellm
from litellm import CustomLLM, completion, get_llm_provider
class MyCustomLLM(CustomLLM):
def completion(self, *args, **kwargs) -> litellm.ModelResponse:
return litellm.completion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Hello world"}],
mock_response="Hi!",
) # type: ignore
my_custom_llm = MyCustomLLM()
litellm.custom_provider_map = [ # 👈 KEY STEP - REGISTER HANDLER
{"provider": "my-custom-llm", "custom_handler": my_custom_llm}
]
resp = completion(
model="my-custom-llm/my-fake-model",
messages=[{"role": "user", "content": "Hello world!"}],
)
assert resp.choices[0].message.content == "Hi!"
```
## OpenAI Proxy Usage
1. Setup your `custom_handler.py` file
```python
import litellm
from litellm import CustomLLM, completion, get_llm_provider
class MyCustomLLM(CustomLLM):
def completion(self, *args, **kwargs) -> litellm.ModelResponse:
return litellm.completion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Hello world"}],
mock_response="Hi!",
) # type: ignore
async def acompletion(self, *args, **kwargs) -> litellm.ModelResponse:
return litellm.completion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Hello world"}],
mock_response="Hi!",
) # type: ignore
my_custom_llm = MyCustomLLM()
```
2. Add to `config.yaml`
In the config below, we pass
python_filename: `custom_handler.py`
custom_handler_instance_name: `my_custom_llm`. This is defined in Step 1
custom_handler: `custom_handler.my_custom_llm`
```yaml
model_list:
- model_name: "test-model"
litellm_params:
model: "openai/text-embedding-ada-002"
- model_name: "my-custom-model"
litellm_params:
model: "my-custom-llm/my-model"
litellm_settings:
custom_provider_map:
- {"provider": "my-custom-llm", "custom_handler": custom_handler.my_custom_llm}
```
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "my-custom-model",
"messages": [{"role": "user", "content": "Say \"this is a test\" in JSON!"}],
}'
```
Expected Response
```
{
"id": "chatcmpl-06f1b9cd-08bc-43f7-9814-a69173921216",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"message": {
"content": "Hi!",
"role": "assistant",
"tool_calls": null,
"function_call": null
}
}
],
"created": 1721955063,
"model": "gpt-3.5-turbo",
"object": "chat.completion",
"system_fingerprint": null,
"usage": {
"prompt_tokens": 10,
"completion_tokens": 20,
"total_tokens": 30
}
}
```
## Custom Handler Spec
```python
from litellm.types.utils import GenericStreamingChunk, ModelResponse
from typing import Iterator, AsyncIterator
from litellm.llms.base import BaseLLM
class CustomLLMError(Exception): # use this for all your exceptions
def __init__(
self,
status_code,
message,
):
self.status_code = status_code
self.message = message
super().__init__(
self.message
) # Call the base class constructor with the parameters it needs
class CustomLLM(BaseLLM):
def __init__(self) -> None:
super().__init__()
def completion(self, *args, **kwargs) -> ModelResponse:
raise CustomLLMError(status_code=500, message="Not implemented yet!")
def streaming(self, *args, **kwargs) -> Iterator[GenericStreamingChunk]:
raise CustomLLMError(status_code=500, message="Not implemented yet!")
async def acompletion(self, *args, **kwargs) -> ModelResponse:
raise CustomLLMError(status_code=500, message="Not implemented yet!")
async def astreaming(self, *args, **kwargs) -> AsyncIterator[GenericStreamingChunk]:
raise CustomLLMError(status_code=500, message="Not implemented yet!")
```
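To make the spec concrete, here's a rough sketch of a handler that implements `streaming()` by yielding `GenericStreamingChunk` dicts (the chunk field values and provider name are illustrative, not a definitive implementation):
```python
from typing import Iterator
import litellm
from litellm import CustomLLM
from litellm.types.utils import GenericStreamingChunk
class MyStreamingLLM(CustomLLM):
    def streaming(self, *args, **kwargs) -> Iterator[GenericStreamingChunk]:
        # Sketch: emit a hard-coded reply one word at a time, then a final "stop" chunk
        for word in ["Hello", "from", "my", "custom", "LLM"]:
            yield {
                "index": 0,
                "text": word + " ",
                "is_finished": False,
                "finish_reason": "",
                "tool_use": None,
                "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0},
            }
        yield {
            "index": 0,
            "text": "",
            "is_finished": True,
            "finish_reason": "stop",
            "tool_use": None,
            "usage": {"prompt_tokens": 1, "completion_tokens": 5, "total_tokens": 6},
        }
litellm.custom_provider_map = [
    {"provider": "my-streaming-llm", "custom_handler": MyStreamingLLM()}
]
response = litellm.completion(
    model="my-streaming-llm/my-model",
    messages=[{"role": "user", "content": "Hello"}],
    stream=True,
)
for chunk in response:
    print(chunk.choices[0].delta.content or "", end="")
```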

View file

@ -1,129 +0,0 @@
# Custom API Server (OpenAI Format)
LiteLLM allows you to call your custom endpoint in the OpenAI ChatCompletion format
## API KEYS
No api keys required
## Set up your Custom API Server
Your server should have the following Endpoints:
Here's an example OpenAI proxy server with routes: https://replit.com/@BerriAI/openai-proxy#main.py
### Required Endpoints
- POST `/chat/completions` - chat completions endpoint
### Optional Endpoints
- POST `/completions` - completions endpoint
- Get `/models` - available models on server
- POST `/embeddings` - creates an embedding vector representing the input text.
## Example Usage
### Call `/chat/completions`
In order to use your custom OpenAI Chat Completion proxy with LiteLLM, ensure you set
* `api_base` to your proxy url, example "https://openai-proxy.berriai.repl.co"
* `custom_llm_provider` to `openai` this ensures litellm uses the `openai.ChatCompletion` to your api_base
```python
import os
from litellm import completion
## set ENV variables
os.environ["OPENAI_API_KEY"] = "anything" #key is not used for proxy
messages = [{ "content": "Hello, how are you?","role": "user"}]
response = completion(
model="command-nightly",
messages=[{ "content": "Hello, how are you?","role": "user"}],
api_base="https://openai-proxy.berriai.repl.co",
custom_llm_provider="openai" # litellm will use the openai.ChatCompletion to make the request
)
print(response)
```
#### Response
```json
{
"object":
"chat.completion",
"choices": [{
"finish_reason": "stop",
"index": 0,
"message": {
"content":
"The sky, a canvas of blue,\nA work of art, pure and true,\nA",
"role": "assistant"
}
}],
"id":
"chatcmpl-7fbd6077-de10-4cb4-a8a4-3ef11a98b7c8",
"created":
1699290237.408061,
"model":
"togethercomputer/llama-2-70b-chat",
"usage": {
"completion_tokens": 18,
"prompt_tokens": 14,
"total_tokens": 32
}
}
```
### Call `/completions`
In order to use your custom OpenAI Completion proxy with LiteLLM, ensure you set
* `api_base` to your proxy url, example "https://openai-proxy.berriai.repl.co"
* `custom_llm_provider` to `text-completion-openai` this ensures litellm uses the `openai.Completion` to your api_base
```python
import os
from litellm import completion
## set ENV variables
os.environ["OPENAI_API_KEY"] = "anything" #key is not used for proxy
messages = [{ "content": "Hello, how are you?","role": "user"}]
response = completion(
model="command-nightly",
messages=[{ "content": "Hello, how are you?","role": "user"}],
api_base="https://openai-proxy.berriai.repl.co",
custom_llm_provider="text-completion-openai" # litellm will use the openai.Completion to make the request
)
print(response)
```
#### Response
```json
{
"warning":
"This model version is deprecated. Migrate before January 4, 2024 to avoid disruption of service. Learn more https://platform.openai.com/docs/deprecations",
"id":
"cmpl-8HxHqF5dymQdALmLplS0dWKZVFe3r",
"object":
"text_completion",
"created":
1699290166,
"model":
"text-davinci-003",
"choices": [{
"text":
"\n\nThe weather in San Francisco varies depending on what time of year and time",
"index": 0,
"logprobs": None,
"finish_reason": "length"
}],
"usage": {
"prompt_tokens": 7,
"completion_tokens": 16,
"total_tokens": 23
}
}
```

View file

@ -5,6 +5,11 @@ import TabItem from '@theme/TabItem';
LiteLLM supports all models on Databricks LiteLLM supports all models on Databricks
:::tip
**We support ALL Databricks models, just set `model=databricks/<any-model-on-databricks>` as a prefix when sending litellm requests**
:::
## Usage ## Usage
@ -185,8 +190,17 @@ response = litellm.embedding(
## Supported Databricks Chat Completion Models ## Supported Databricks Chat Completion Models
:::tip
**We support ALL Databricks models, just set `model=databricks/<any-model-on-databricks>` as a prefix when sending litellm requests**
:::
| Model Name | Command | | Model Name | Command |
|----------------------------|------------------------------------------------------------------| |----------------------------|------------------------------------------------------------------|
| databricks-meta-llama-3-1-70b-instruct | `completion(model='databricks/databricks-meta-llama-3-1-70b-instruct', messages=messages)` |
| databricks-meta-llama-3-1-405b-instruct | `completion(model='databricks/databricks-meta-llama-3-1-405b-instruct', messages=messages)` |
| databricks-dbrx-instruct | `completion(model='databricks/databricks-dbrx-instruct', messages=messages)` | | databricks-dbrx-instruct | `completion(model='databricks/databricks-dbrx-instruct', messages=messages)` |
| databricks-meta-llama-3-70b-instruct | `completion(model='databricks/databricks-meta-llama-3-70b-instruct', messages=messages)` | | databricks-meta-llama-3-70b-instruct | `completion(model='databricks/databricks-meta-llama-3-70b-instruct', messages=messages)` |
| databricks-llama-2-70b-chat | `completion(model='databricks/databricks-llama-2-70b-chat', messages=messages)` | | databricks-llama-2-70b-chat | `completion(model='databricks/databricks-llama-2-70b-chat', messages=messages)` |
@ -196,6 +210,13 @@ response = litellm.embedding(
## Supported Databricks Embedding Models ## Supported Databricks Embedding Models
:::tip
**We support ALL Databricks models, just set `model=databricks/<any-model-on-databricks>` as a prefix when sending litellm requests**
:::
| Model Name | Command | | Model Name | Command |
|----------------------------|------------------------------------------------------------------| |----------------------------|------------------------------------------------------------------|
| databricks-bge-large-en | `embedding(model='databricks/databricks-bge-large-en', messages=messages)` | | databricks-bge-large-en | `embedding(model='databricks/databricks-bge-large-en', messages=messages)` |

View file

@ -1,7 +1,12 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Fireworks AI # Fireworks AI
https://fireworks.ai/ https://fireworks.ai/
:::info
**We support ALL Fireworks AI models, just set `fireworks_ai/` as a prefix when sending completion requests** **We support ALL Fireworks AI models, just set `fireworks_ai/` as a prefix when sending completion requests**
:::
## API Key ## API Key
```python ```python
@ -16,7 +21,7 @@ import os
os.environ['FIREWORKS_AI_API_KEY'] = "" os.environ['FIREWORKS_AI_API_KEY'] = ""
response = completion( response = completion(
model="fireworks_ai/mixtral-8x7b-instruct", model="fireworks_ai/accounts/fireworks/models/llama-v3-70b-instruct",
messages=[ messages=[
{"role": "user", "content": "hello from litellm"} {"role": "user", "content": "hello from litellm"}
], ],
@ -31,7 +36,7 @@ import os
os.environ['FIREWORKS_AI_API_KEY'] = "" os.environ['FIREWORKS_AI_API_KEY'] = ""
response = completion( response = completion(
model="fireworks_ai/mixtral-8x7b-instruct", model="fireworks_ai/accounts/fireworks/models/llama-v3-70b-instruct",
messages=[ messages=[
{"role": "user", "content": "hello from litellm"} {"role": "user", "content": "hello from litellm"}
], ],
@ -43,8 +48,103 @@ for chunk in response:
``` ```
## Usage with LiteLLM Proxy
### 1. Set Fireworks AI Models on config.yaml
```yaml
model_list:
- model_name: fireworks-llama-v3-70b-instruct
litellm_params:
model: fireworks_ai/accounts/fireworks/models/llama-v3-70b-instruct
api_key: "os.environ/FIREWORKS_AI_API_KEY"
```
### 2. Start Proxy
```
litellm --config config.yaml
```
### 3. Test it
<Tabs>
<TabItem value="Curl" label="Curl Request">
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "fireworks-llama-v3-70b-instruct",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}
'
```
</TabItem>
<TabItem value="openai" label="OpenAI v1.0.0+">
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(model="fireworks-llama-v3-70b-instruct", messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
])
print(response)
```
</TabItem>
<TabItem value="langchain" label="Langchain">
```python
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
ChatPromptTemplate,
HumanMessagePromptTemplate,
SystemMessagePromptTemplate,
)
from langchain.schema import HumanMessage, SystemMessage
chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:4000", # set openai_api_base to the LiteLLM Proxy
model = "fireworks-llama-v3-70b-instruct",
temperature=0.1
)
messages = [
SystemMessage(
content="You are a helpful assistant that im using to make a test request to."
),
HumanMessage(
content="test from litellm. tell me why it's amazing in 1 sentence"
),
]
response = chat(messages)
print(response)
```
</TabItem>
</Tabs>
## Supported Models - ALL Fireworks AI Models Supported! ## Supported Models - ALL Fireworks AI Models Supported!
:::info
We support ALL Fireworks AI models, just set `fireworks_ai/` as a prefix when sending completion requests We support ALL Fireworks AI models, just set `fireworks_ai/` as a prefix when sending completion requests
:::
| Model Name | Function Call | | Model Name | Function Call |
|--------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------| |--------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|

View file

@ -0,0 +1,60 @@
# FriendliAI
https://suite.friendli.ai/
**We support ALL FriendliAI models, just set `friendliai/` as a prefix when sending completion requests**
## API Key
```python
# env variable
os.environ['FRIENDLI_TOKEN']
os.environ['FRIENDLI_API_BASE'] # Optional. Set this when using dedicated endpoint.
```
## Sample Usage
```python
from litellm import completion
import os
os.environ['FRIENDLI_TOKEN'] = ""
response = completion(
model="friendliai/mixtral-8x7b-instruct-v0-1",
messages=[
{"role": "user", "content": "hello from litellm"}
],
)
print(response)
```
## Sample Usage - Streaming
```python
from litellm import completion
import os
os.environ['FRIENDLI_TOKEN'] = ""
response = completion(
model="friendliai/mixtral-8x7b-instruct-v0-1",
messages=[
{"role": "user", "content": "hello from litellm"}
],
stream=True
)
for chunk in response:
print(chunk)
```
## Supported Models
### Serverless Endpoints
We support ALL FriendliAI models, just set `friendliai/` as a prefix when sending completion requests
| Model Name | Function Call |
|--------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| mixtral-8x7b-instruct | `completion(model="friendliai/mixtral-8x7b-instruct-v0-1", messages)` |
| meta-llama-3-8b-instruct | `completion(model="friendliai/meta-llama-3-8b-instruct", messages)` |
| meta-llama-3-70b-instruct | `completion(model="friendliai/meta-llama-3-70b-instruct", messages)` |
### Dedicated Endpoints
```
model="friendliai/$ENDPOINT_ID:$ADAPTER_ROUTE"
```
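For example, a sketch of calling a dedicated endpoint (the endpoint ID and adapter route are placeholders; set `FRIENDLI_API_BASE` as noted above):
```python
from litellm import completion
import os
os.environ['FRIENDLI_TOKEN'] = ""
os.environ['FRIENDLI_API_BASE'] = ""  # your dedicated endpoint's base URL
response = completion(
    model="friendliai/my-endpoint-id:my-adapter-route",  # placeholder $ENDPOINT_ID:$ADAPTER_ROUTE
    messages=[
        {"role": "user", "content": "hello from litellm"}
    ],
)
print(response)
```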

View file

@ -1,3 +1,7 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Gemini - Google AI Studio # Gemini - Google AI Studio
## Pre-requisites ## Pre-requisites
@ -17,6 +21,335 @@ response = completion(
) )
``` ```
## Supported OpenAI Params
- temperature
- top_p
- max_tokens
- stream
- tools
- tool_choice
- response_format
- n
- stop
[**See Updated List**](https://github.com/BerriAI/litellm/blob/1c747f3ad372399c5b95cc5696b06a5fbe53186b/litellm/llms/vertex_httpx.py#L122)
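As a quick illustration, here's a sketch combining several of the params above in one call (the prompt and values are arbitrary):
```python
from litellm import completion
import os
os.environ['GEMINI_API_KEY'] = ""
response = completion(
    model="gemini/gemini-1.5-pro",
    messages=[{"role": "user", "content": "Write a haiku about the ocean."}],
    temperature=0.2,
    top_p=0.9,
    max_tokens=100,
    n=1,
    stop=["---"],
)
print(response.choices[0].message.content)
```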
## Passing Gemini Specific Params
### Response schema
LiteLLM supports sending `response_schema` as a param for Gemini-1.5-Pro on Google AI Studio.
**Response Schema**
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
import json
import os
os.environ['GEMINI_API_KEY'] = ""
messages = [
{
"role": "user",
"content": "List 5 popular cookie recipes."
}
]
response_schema = {
"type": "array",
"items": {
"type": "object",
"properties": {
"recipe_name": {
"type": "string",
},
},
"required": ["recipe_name"],
},
}
response = completion(
    model="gemini/gemini-1.5-pro",
    messages=messages,
    response_format={"type": "json_object", "response_schema": response_schema} # 👈 KEY CHANGE
)
print(json.loads(response.choices[0].message.content))
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Add model to config.yaml
```yaml
model_list:
- model_name: gemini-pro
litellm_params:
model: gemini/gemini-1.5-pro
api_key: os.environ/GEMINI_API_KEY
```
2. Start Proxy
```
$ litellm --config /path/to/config.yaml
```
3. Make Request!
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "gemini-pro",
"messages": [
{"role": "user", "content": "List 5 popular cookie recipes."}
],
"response_format": {"type": "json_object", "response_schema": {
"type": "array",
"items": {
"type": "object",
"properties": {
"recipe_name": {
"type": "string",
},
},
"required": ["recipe_name"],
},
}}
}
'
```
</TabItem>
</Tabs>
**Validate Schema**
To validate the response_schema, set `enforce_validation: true`.
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion, JSONSchemaValidationError
try:
completion(
model="gemini/gemini-1.5-pro",
messages=messages,
response_format={
"type": "json_object",
"response_schema": response_schema,
"enforce_validation": true # 👈 KEY CHANGE
}
)
except JSONSchemaValidationError as e:
print("Raw Response: {}".format(e.raw_response))
raise e
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Add model to config.yaml
```yaml
model_list:
- model_name: gemini-pro
litellm_params:
model: gemini/gemini-1.5-pro
api_key: os.environ/GEMINI_API_KEY
```
2. Start Proxy
```
$ litellm --config /path/to/config.yaml
```
3. Make Request!
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "gemini-pro",
"messages": [
{"role": "user", "content": "List 5 popular cookie recipes."}
],
"response_format": {"type": "json_object", "response_schema": {
"type": "array",
"items": {
"type": "object",
"properties": {
"recipe_name": {
"type": "string",
},
},
"required": ["recipe_name"],
},
},
"enforce_validation": true
}
}
'
```
</TabItem>
</Tabs>
LiteLLM will validate the response against the schema, and raise a `JSONSchemaValidationError` if the response does not match the schema.
JSONSchemaValidationError inherits from `openai.APIError`
Access the raw response with `e.raw_response`
### GenerationConfig Params
To pass additional GenerationConfig params - e.g. `topK`, just pass it in the request body of the call, and LiteLLM will pass it straight through as a key-value pair in the request body.
[**See Gemini GenerationConfigParams**](https://ai.google.dev/api/generate-content#v1beta.GenerationConfig)
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
import json
import os
os.environ['GEMINI_API_KEY'] = ""
messages = [
{
"role": "user",
"content": "List 5 popular cookie recipes."
}
]
response = completion(
    model="gemini/gemini-1.5-pro",
    messages=messages,
    topK=1 # 👈 KEY CHANGE
)
print(response.choices[0].message.content)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Add model to config.yaml
```yaml
model_list:
- model_name: gemini-pro
litellm_params:
model: gemini/gemini-1.5-pro
api_key: os.environ/GEMINI_API_KEY
```
2. Start Proxy
```
$ litellm --config /path/to/config.yaml
```
3. Make Request!
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "gemini-pro",
"messages": [
{"role": "user", "content": "List 5 popular cookie recipes."}
],
"topK": 1 # 👈 KEY CHANGE
}
'
```
</TabItem>
</Tabs>
## Specifying Safety Settings ## Specifying Safety Settings
In certain use-cases you may need to make calls to the models and pass [safety settings](https://ai.google.dev/docs/safety_setting_gemini) different from the defaults. To do so, simply pass the `safety_settings` argument to `completion` or `acompletion`. For example:
@ -91,6 +424,72 @@ assert isinstance(
``` ```
## JSON Mode
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
import json
import os
os.environ['GEMINI_API_KEY'] = ""
messages = [
{
"role": "user",
"content": "List 5 popular cookie recipes."
}
]
response = completion(
    model="gemini/gemini-1.5-pro",
    messages=messages,
    response_format={"type": "json_object"} # 👈 KEY CHANGE
)
print(json.loads(response.choices[0].message.content))
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Add model to config.yaml
```yaml
model_list:
- model_name: gemini-pro
litellm_params:
model: gemini/gemini-1.5-pro
api_key: os.environ/GEMINI_API_KEY
```
2. Start Proxy
```
$ litellm --config /path/to/config.yaml
```
3. Make Request!
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "gemini-pro",
"messages": [
{"role": "user", "content": "List 5 popular cookie recipes."}
],
"response_format": {"type": "json_object"}
}
'
```
</TabItem>
</Tabs>
# Gemini-Pro-Vision # Gemini-Pro-Vision
LiteLLM Supports the following image types passed in `url` LiteLLM Supports the following image types passed in `url`
- Images with direct links - https://storage.googleapis.com/github-repo/img/gemini/intro/landmark3.jpg - Images with direct links - https://storage.googleapis.com/github-repo/img/gemini/intro/landmark3.jpg
@ -141,8 +540,13 @@ print(content)
``` ```
## Chat Models ## Chat Models
:::tip
**We support ALL Gemini models, just set `model=gemini/<any-model-on-gemini>` as a prefix when sending litellm requests**
:::
| Model Name | Function Call | Required OS Variables | | Model Name | Function Call | Required OS Variables |
|-----------------------|--------------------------------------------------------|--------------------------------| |-----------------------|--------------------------------------------------------|--------------------------------|
| gemini-pro | `completion('gemini/gemini-pro', messages)` | `os.environ['GEMINI_API_KEY']` | | gemini-pro | `completion(model='gemini/gemini-pro', messages)` | `os.environ['GEMINI_API_KEY']` |
| gemini-1.5-pro-latest | `completion('gemini/gemini-1.5-pro-latest', messages)` | `os.environ['GEMINI_API_KEY']` | | gemini-1.5-pro-latest | `completion(model='gemini/gemini-1.5-pro-latest', messages)` | `os.environ['GEMINI_API_KEY']` |
| gemini-pro-vision | `completion('gemini/gemini-pro-vision', messages)` | `os.environ['GEMINI_API_KEY']` | | gemini-pro-vision | `completion(model='gemini/gemini-pro-vision', messages)` | `os.environ['GEMINI_API_KEY']` |

View file

@ -0,0 +1,261 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# 🆕 Github
https://github.com/marketplace/models
:::tip
**We support ALL Github models, just set `model=github/<any-model-on-github>` as a prefix when sending litellm requests**
:::
## API Key
```python
# env variable
os.environ['GITHUB_API_KEY']
```
## Sample Usage
```python
from litellm import completion
import os
os.environ['GITHUB_API_KEY'] = ""
response = completion(
model="github/llama3-8b-8192",
messages=[
{"role": "user", "content": "hello from litellm"}
],
)
print(response)
```
## Sample Usage - Streaming
```python
from litellm import completion
import os
os.environ['GITHUB_API_KEY'] = ""
response = completion(
model="github/llama3-8b-8192",
messages=[
{"role": "user", "content": "hello from litellm"}
],
stream=True
)
for chunk in response:
print(chunk)
```
## Usage with LiteLLM Proxy
### 1. Set Github Models on config.yaml
```yaml
model_list:
- model_name: github-llama3-8b-8192 # Model Alias to use for requests
litellm_params:
model: github/llama3-8b-8192
api_key: "os.environ/GITHUB_API_KEY" # ensure you have `GITHUB_API_KEY` in your .env
```
### 2. Start Proxy
```
litellm --config config.yaml
```
### 3. Test it
Make request to litellm proxy
<Tabs>
<TabItem value="Curl" label="Curl Request">
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "github-llama3-8b-8192",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}
'
```
</TabItem>
<TabItem value="openai" label="OpenAI v1.0.0+">
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
response = client.chat.completions.create(model="github-llama3-8b-8192", messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
])
print(response)
```
</TabItem>
<TabItem value="langchain" label="Langchain">
```python
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
ChatPromptTemplate,
HumanMessagePromptTemplate,
SystemMessagePromptTemplate,
)
from langchain.schema import HumanMessage, SystemMessage
chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:4000", # set openai_api_base to the LiteLLM Proxy
model = "github-llama3-8b-8192",
temperature=0.1
)
messages = [
SystemMessage(
content="You are a helpful assistant that im using to make a test request to."
),
HumanMessage(
content="test from litellm. tell me why it's amazing in 1 sentence"
),
]
response = chat(messages)
print(response)
```
</TabItem>
</Tabs>
## Supported Models - ALL Github Models Supported!
We support ALL Github models, just set `github/` as a prefix when sending completion requests
| Model Name | Usage |
|--------------------|---------------------------------------------------------|
| llama-3.1-8b-instant | `completion(model="github/llama-3.1-8b-instant", messages)` |
| llama-3.1-70b-versatile | `completion(model="github/llama-3.1-70b-versatile", messages)` |
| llama-3.1-405b-reasoning | `completion(model="github/llama-3.1-405b-reasoning", messages)` |
| llama3-8b-8192 | `completion(model="github/llama3-8b-8192", messages)` |
| llama3-70b-8192 | `completion(model="github/llama3-70b-8192", messages)` |
| llama2-70b-4096 | `completion(model="github/llama2-70b-4096", messages)` |
| mixtral-8x7b-32768 | `completion(model="github/mixtral-8x7b-32768", messages)` |
| gemma-7b-it | `completion(model="github/gemma-7b-it", messages)` |
## Github - Tool / Function Calling Example
```python
# Example dummy function hard coded to return the current weather
import json
import litellm
def get_current_weather(location, unit="fahrenheit"):
"""Get the current weather in a given location"""
if "tokyo" in location.lower():
return json.dumps({"location": "Tokyo", "temperature": "10", "unit": "celsius"})
elif "san francisco" in location.lower():
return json.dumps(
{"location": "San Francisco", "temperature": "72", "unit": "fahrenheit"}
)
elif "paris" in location.lower():
return json.dumps({"location": "Paris", "temperature": "22", "unit": "celsius"})
else:
return json.dumps({"location": location, "temperature": "unknown"})
# Step 1: send the conversation and available functions to the model
messages = [
{
"role": "system",
"content": "You are a function calling LLM that uses the data extracted from get_current_weather to answer questions about the weather in San Francisco.",
},
{
"role": "user",
"content": "What's the weather like in San Francisco?",
},
]
tools = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"unit": {
"type": "string",
"enum": ["celsius", "fahrenheit"],
},
},
"required": ["location"],
},
},
}
]
response = litellm.completion(
model="github/llama3-8b-8192",
messages=messages,
tools=tools,
tool_choice="auto", # auto is default, but we'll be explicit
)
print("Response\n", response)
response_message = response.choices[0].message
tool_calls = response_message.tool_calls
# Step 2: check if the model wanted to call a function
if tool_calls:
# Step 3: call the function
# Note: the JSON response may not always be valid; be sure to handle errors
available_functions = {
"get_current_weather": get_current_weather,
}
messages.append(
response_message
) # extend conversation with assistant's reply
print("Response message\n", response_message)
# Step 4: send the info for each function call and function response to the model
for tool_call in tool_calls:
function_name = tool_call.function.name
function_to_call = available_functions[function_name]
function_args = json.loads(tool_call.function.arguments)
function_response = function_to_call(
location=function_args.get("location"),
unit=function_args.get("unit"),
)
messages.append(
{
"tool_call_id": tool_call.id,
"role": "tool",
"name": function_name,
"content": function_response,
}
) # extend conversation with function response
print(f"messages: {messages}")
second_response = litellm.completion(
model="github/llama3-8b-8192", messages=messages
) # get a new response from the model where it can see the function response
print("second response\n", second_response)
```

View file

@ -1,3 +1,6 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Groq # Groq
https://groq.com/ https://groq.com/
@ -20,7 +23,7 @@ import os
os.environ['GROQ_API_KEY'] = "" os.environ['GROQ_API_KEY'] = ""
response = completion( response = completion(
model="groq/llama2-70b-4096", model="groq/llama3-8b-8192",
messages=[ messages=[
{"role": "user", "content": "hello from litellm"} {"role": "user", "content": "hello from litellm"}
], ],
@ -35,7 +38,7 @@ import os
os.environ['GROQ_API_KEY'] = "" os.environ['GROQ_API_KEY'] = ""
response = completion( response = completion(
model="groq/llama2-70b-4096", model="groq/llama3-8b-8192",
messages=[ messages=[
{"role": "user", "content": "hello from litellm"} {"role": "user", "content": "hello from litellm"}
], ],
@ -47,11 +50,109 @@ for chunk in response:
``` ```
## Usage with LiteLLM Proxy
### 1. Set Groq Models on config.yaml
```yaml
model_list:
- model_name: groq-llama3-8b-8192 # Model Alias to use for requests
litellm_params:
model: groq/llama3-8b-8192
api_key: "os.environ/GROQ_API_KEY" # ensure you have `GROQ_API_KEY` in your .env
```
### 2. Start Proxy
```
litellm --config config.yaml
```
### 3. Test it
Make request to litellm proxy
<Tabs>
<TabItem value="Curl" label="Curl Request">
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "groq-llama3-8b-8192",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}
'
```
</TabItem>
<TabItem value="openai" label="OpenAI v1.0.0+">
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
response = client.chat.completions.create(model="groq-llama3-8b-8192", messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
])
print(response)
```
</TabItem>
<TabItem value="langchain" label="Langchain">
```python
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
ChatPromptTemplate,
HumanMessagePromptTemplate,
SystemMessagePromptTemplate,
)
from langchain.schema import HumanMessage, SystemMessage
chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:4000", # set openai_api_base to the LiteLLM Proxy
model = "groq-llama3-8b-8192",
temperature=0.1
)
messages = [
SystemMessage(
content="You are a helpful assistant that im using to make a test request to."
),
HumanMessage(
content="test from litellm. tell me why it's amazing in 1 sentence"
),
]
response = chat(messages)
print(response)
```
</TabItem>
</Tabs>
## Supported Models - ALL Groq Models Supported! ## Supported Models - ALL Groq Models Supported!
We support ALL Groq models, just set `groq/` as a prefix when sending completion requests We support ALL Groq models, just set `groq/` as a prefix when sending completion requests
| Model Name | Function Call | | Model Name | Usage |
|--------------------|---------------------------------------------------------| |--------------------|---------------------------------------------------------|
| llama-3.1-8b-instant | `completion(model="groq/llama-3.1-8b-instant", messages)` |
| llama-3.1-70b-versatile | `completion(model="groq/llama-3.1-70b-versatile", messages)` |
| llama-3.1-405b-reasoning | `completion(model="groq/llama-3.1-405b-reasoning", messages)` |
| llama3-8b-8192 | `completion(model="groq/llama3-8b-8192", messages)` | | llama3-8b-8192 | `completion(model="groq/llama3-8b-8192", messages)` |
| llama3-70b-8192 | `completion(model="groq/llama3-70b-8192", messages)` | | llama3-70b-8192 | `completion(model="groq/llama3-70b-8192", messages)` |
| llama2-70b-4096 | `completion(model="groq/llama2-70b-4096", messages)` | | llama2-70b-4096 | `completion(model="groq/llama2-70b-4096", messages)` |
@ -114,7 +215,7 @@ tools = [
} }
] ]
response = litellm.completion( response = litellm.completion(
model="groq/llama2-70b-4096", model="groq/llama3-8b-8192",
messages=messages, messages=messages,
tools=tools, tools=tools,
tool_choice="auto", # auto is default, but we'll be explicit tool_choice="auto", # auto is default, but we'll be explicit
@ -154,7 +255,7 @@ if tool_calls:
) # extend conversation with function response ) # extend conversation with function response
print(f"messages: {messages}") print(f"messages: {messages}")
second_response = litellm.completion( second_response = litellm.completion(
model="groq/llama2-70b-4096", messages=messages model="groq/llama3-8b-8192", messages=messages
) # get a new response from the model where it can see the function response ) # get a new response from the model where it can see the function response
print("second response\n", second_response) print("second response\n", second_response)
``` ```

View file

@ -1,3 +1,6 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Mistral AI API # Mistral AI API
https://docs.mistral.ai/api/ https://docs.mistral.ai/api/
@ -41,18 +44,120 @@ for chunk in response:
``` ```
## Usage with LiteLLM Proxy
### 1. Set Mistral Models on config.yaml
```yaml
model_list:
- model_name: mistral-small-latest
litellm_params:
model: mistral/mistral-small-latest
api_key: "os.environ/MISTRAL_API_KEY" # ensure you have `MISTRAL_API_KEY` in your .env
```
### 2. Start Proxy
```
litellm --config config.yaml
```
### 3. Test it
<Tabs>
<TabItem value="Curl" label="Curl Request">
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "mistral-small-latest",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}
'
```
</TabItem>
<TabItem value="openai" label="OpenAI v1.0.0+">
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
response = client.chat.completions.create(model="mistral-small-latest", messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
])
print(response)
```
</TabItem>
<TabItem value="langchain" label="Langchain">
```python
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
ChatPromptTemplate,
HumanMessagePromptTemplate,
SystemMessagePromptTemplate,
)
from langchain.schema import HumanMessage, SystemMessage
chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:4000", # set openai_api_base to the LiteLLM Proxy
model = "mistral-small-latest",
temperature=0.1
)
messages = [
SystemMessage(
content="You are a helpful assistant that im using to make a test request to."
),
HumanMessage(
content="test from litellm. tell me why it's amazing in 1 sentence"
),
]
response = chat(messages)
print(response)
```
</TabItem>
</Tabs>
## Supported Models

:::info

All models listed here https://docs.mistral.ai/platform/endpoints are supported. We actively maintain the list of models, pricing, token window, etc. [here](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json).

:::
| Model Name | Function Call |
|----------------------|----------------------------------------------------------------|
| Mistral Small | `completion(model="mistral/mistral-small-latest", messages)` |
| Mistral Medium | `completion(model="mistral/mistral-medium-latest", messages)` |
| Mistral Large 2 | `completion(model="mistral/mistral-large-2407", messages)` |
| Mistral Large Latest | `completion(model="mistral/mistral-large-latest", messages)` |
| Mistral 7B | `completion(model="mistral/open-mistral-7b", messages)` |
| Mixtral 8x7B | `completion(model="mistral/open-mixtral-8x7b", messages)` |
| Mixtral 8x22B | `completion(model="mistral/open-mixtral-8x22b", messages)` |
| Codestral | `completion(model="mistral/codestral-latest", messages)` |
| Mistral NeMo | `completion(model="mistral/open-mistral-nemo", messages)` |
| Mistral NeMo 2407 | `completion(model="mistral/open-mistral-nemo-2407", messages)` |
| Codestral Mamba | `completion(model="mistral/open-codestral-mamba", messages)` |
| Codestral Mamba | `completion(model="mistral/codestral-mamba-latest", messages)` |
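Any of these can also be called directly with the SDK - a minimal sketch (the key value is a placeholder):

```python
import os
from litellm import completion

os.environ["MISTRAL_API_KEY"] = ""  # placeholder - your Mistral API key

response = completion(
    model="mistral/mistral-small-latest",
    messages=[{"role": "user", "content": "hello from litellm"}],
)
print(response)
```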
## Function Calling


@ -1,3 +1,6 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Ollama

LiteLLM supports all models from [Ollama](https://github.com/ollama/ollama)
@ -84,6 +87,120 @@ response = completion(
) )
``` ```
## Example Usage - Tool Calling
To use ollama tool calling, pass `tools=[{..}]` to `litellm.completion()`
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
import litellm
## [OPTIONAL] REGISTER MODEL - not all ollama models support function calling, litellm defaults to json mode tool calls if native tool calling not supported.
# litellm.register_model(model_cost={
# "ollama_chat/llama3.1": {
# "supports_function_calling": true
# },
# })
tools = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
},
"required": ["location"],
},
}
}
]
messages = [{"role": "user", "content": "What's the weather like in Boston today?"}]
response = completion(
model="ollama_chat/llama3.1",
messages=messages,
tools=tools
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: "llama3.1"
litellm_params:
model: "ollama_chat/llama3.1"
model_info:
supports_function_calling: true
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "llama3.1",
"messages": [
{
"role": "user",
"content": "What'\''s the weather like in Boston today?"
}
],
"tools": [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA"
},
"unit": {
"type": "string",
"enum": ["celsius", "fahrenheit"]
}
},
"required": ["location"]
}
}
}
],
"tool_choice": "auto",
"stream": true
}'
```
</TabItem>
</Tabs>
## Using ollama `api/chat`

In order to send ollama requests to `POST /api/chat` on your ollama server, set the model prefix to `ollama_chat`
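For example, a minimal sketch (the `api_base` value assumes a local Ollama server on its default port):

```python
from litellm import completion

response = completion(
    model="ollama_chat/llama3.1",       # `ollama_chat/` routes to POST /api/chat
    messages=[{"role": "user", "content": "hello from litellm"}],
    api_base="http://localhost:11434",  # assumed local Ollama server
)
print(response)
```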


@ -163,7 +163,10 @@ os.environ["OPENAI_API_BASE"] = "openaiai-api-base" # OPTIONAL
| Model Name | Function Call |
|-----------------------|-----------------------------------------------------------------|
| gpt-4o-mini | `response = completion(model="gpt-4o-mini", messages=messages)` |
| gpt-4o-mini-2024-07-18 | `response = completion(model="gpt-4o-mini-2024-07-18", messages=messages)` |
| gpt-4o | `response = completion(model="gpt-4o", messages=messages)` |
| gpt-4o-2024-08-06 | `response = completion(model="gpt-4o-2024-08-06", messages=messages)` |
| gpt-4o-2024-05-13 | `response = completion(model="gpt-4o-2024-05-13", messages=messages)` |
| gpt-4-turbo | `response = completion(model="gpt-4-turbo", messages=messages)` |
| gpt-4-turbo-preview | `response = completion(model="gpt-4-0125-preview", messages=messages)` |
@ -236,6 +239,104 @@ response = completion(
## Advanced
### Getting OpenAI API Response Headers
Set `litellm.return_response_headers = True` to get raw response headers from OpenAI
You can expect to always get the `_response_headers` field from `litellm.completion()`, `litellm.embedding()` functions
<Tabs>
<TabItem value="litellm.completion" label="litellm.completion">
```python
litellm.return_response_headers = True
# /chat/completion
response = completion(
model="gpt-4o-mini",
messages=[
{
"role": "user",
"content": "hi",
}
],
)
print(f"response: {response}")
print("_response_headers=", response._response_headers)
```
</TabItem>
<TabItem value="litellm.completion - streaming" label="litellm.completion + stream">
```python
litellm.return_response_headers = True
# /chat/completion
response = completion(
model="gpt-4o-mini",
stream=True,
messages=[
{
"role": "user",
"content": "hi",
}
],
)
print(f"response: {response}")
print("response_headers=", response._response_headers)
for chunk in response:
print(chunk)
```
</TabItem>
<TabItem value="litellm.embedding" label="litellm.embedding">
```python
litellm.return_response_headers = True
# embedding
embedding_response = litellm.embedding(
model="text-embedding-ada-002",
input="hello",
)
embedding_response_headers = embedding_response._response_headers
print("embedding_response_headers=", embedding_response_headers)
```
</TabItem>
</Tabs>
Expected Response Headers from OpenAI
```json
{
"date": "Sat, 20 Jul 2024 22:05:23 GMT",
"content-type": "application/json",
"transfer-encoding": "chunked",
"connection": "keep-alive",
"access-control-allow-origin": "*",
"openai-model": "text-embedding-ada-002",
"openai-organization": "*****",
"openai-processing-ms": "20",
"openai-version": "2020-10-01",
"strict-transport-security": "max-age=15552000; includeSubDomains; preload",
"x-ratelimit-limit-requests": "5000",
"x-ratelimit-limit-tokens": "5000000",
"x-ratelimit-remaining-requests": "4999",
"x-ratelimit-remaining-tokens": "4999999",
"x-ratelimit-reset-requests": "12ms",
"x-ratelimit-reset-tokens": "0s",
"x-request-id": "req_cc37487bfd336358231a17034bcfb4d9",
"cf-cache-status": "DYNAMIC",
"set-cookie": "__cf_bm=E_FJY8fdAIMBzBE2RZI2.OkMIO3lf8Hz.ydBQJ9m3q8-1721513123-1.0.1.1-6OK0zXvtd5s9Jgqfz66cU9gzQYpcuh_RLaUZ9dOgxR9Qeq4oJlu.04C09hOTCFn7Hg.k.2tiKLOX24szUE2shw; path=/; expires=Sat, 20-Jul-24 22:35:23 GMT; domain=.api.openai.com; HttpOnly; Secure; SameSite=None, *cfuvid=SDndIImxiO3U0aBcVtoy1TBQqYeQtVDo1L6*Nlpp7EU-1721513123215-0.0.1.1-604800000; path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None",
"x-content-type-options": "nosniff",
"server": "cloudflare",
"cf-ray": "8a66409b4f8acee9-SJC",
"content-encoding": "br",
"alt-svc": "h3=\":443\"; ma=86400"
}
```
### Parallel Function calling

See a detailed walkthrough of parallel function calling with litellm [here](https://docs.litellm.ai/docs/completion/function_call)

```python


@ -1,3 +1,6 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Perplexity AI (pplx-api)

https://www.perplexity.ai
@ -38,7 +41,7 @@ for chunk in response:
## Supported Models

All models listed here https://docs.perplexity.ai/docs/model-cards are supported. Just do `model=perplexity/<model-name>`.
| Model Name | Function Call |
|--------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|
@ -60,3 +63,72 @@ All models listed here https://docs.perplexity.ai/docs/model-cards are supported
## Return citations
Perplexity supports returning citations via `return_citations=True`. [Perplexity Docs](https://docs.perplexity.ai/reference/post_chat_completions). Note: Perplexity has this feature in **closed beta**, so you need them to grant you access to get citations from their API.
If perplexity returns citations, LiteLLM will pass it straight through.
:::info
For passing more provider-specific params, [go here](../completion/provider_specific_params.md)
:::
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
import os
os.environ['PERPLEXITYAI_API_KEY'] = ""

messages = [{"role": "user", "content": "Who won the world cup in 2022?"}]

response = completion(
model="perplexity/mistral-7b-instruct",
messages=messages,
return_citations=True
)
print(response)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Add perplexity to config.yaml
```yaml
model_list:
- model_name: "perplexity-model"
litellm_params:
model: "llama-3.1-sonar-small-128k-online"
api_key: os.environ/PERPLEXITY_API_KEY
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl -L -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "perplexity-model",
"messages": [
{
"role": "user",
"content": "Who won the world cup in 2022?"
}
],
"return_citations": true
}'
```
[**Call w/ OpenAI SDK, Langchain, Instructor, etc.**](../proxy/user_keys.md#chatcompletions)
</TabItem>
</Tabs>


@ -10,7 +10,7 @@ import TabItem from '@theme/TabItem';
## 🆕 `vertex_ai_beta/` route

New `vertex_ai_beta/` route. Adds support for system messages, tool_choice params, etc. by moving to httpx client (instead of vertex sdk). This implementation uses [VertexAI's REST API](https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/inference#syntax).
```python
from litellm import completion
@ -334,6 +334,10 @@ completion(model="vertex_ai_beta/gemini-1.5-flash-preview-0514", messages=messag
Add Google Search Result grounding to vertex ai calls.
[**Relevant VertexAI Docs**](https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/grounding#examples)
See the grounding metadata with `response_obj._hidden_params["vertex_ai_grounding_metadata"]`
<Tabs>
<TabItem value="sdk" label="SDK">
@ -357,15 +361,17 @@ print(resp)
<TabItem value="proxy" label="PROXY"> <TabItem value="proxy" label="PROXY">
```bash ```bash
curl http://0.0.0.0:4000/v1/chat/completions \ curl http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \ -H "Content-Type: application/json" \
-H "Authorization: Bearer $OPENAI_API_KEY" \ -H "Authorization: Bearer sk-1234" \
-d '{ -d '{
"model": "gpt-4o", "model": "gemini-pro",
"messages": [{"role": "user", "content": "Who won the world cup?"}], "messages": [
"tools": [ {"role": "user", "content": "Hello, Claude!"}
],
"tools": [
{ {
"googleSearchResults": {} "googleSearchRetrieval": {}
} }
] ]
}' }'
@ -375,6 +381,161 @@ curl http://0.0.0.0:4000/v1/chat/completions \
</TabItem>
</Tabs>
#### **Moving from Vertex AI SDK to LiteLLM (GROUNDING)**
If this was your initial VertexAI Grounding code,
```python
import vertexai
from vertexai.generative_models import GenerationConfig, GenerativeModel, Tool, grounding

vertexai.init(project=project_id, location="us-central1")

model = GenerativeModel("gemini-1.5-flash-001")
# Use Google Search for grounding
tool = Tool.from_google_search_retrieval(grounding.GoogleSearchRetrieval(disable_attribution=False))
prompt = "When is the next total solar eclipse in US?"
response = model.generate_content(
prompt,
tools=[tool],
generation_config=GenerationConfig(
temperature=0.0,
),
)
print(response)
```
then, this is what it looks like now
```python
from litellm import completion
# !gcloud auth application-default login - run this to add vertex credentials to your env
tools = [{"googleSearchRetrieval": {"disable_attributon": False}}] # 👈 ADD GOOGLE SEARCH
resp = litellm.completion(
model="vertex_ai_beta/gemini-1.0-pro-001",
messages=[{"role": "user", "content": "Who won the world cup?"}],
tools=tools,
vertex_project="project-id"
)
print(resp)
```
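The grounding metadata mentioned above can then be read off the response object - a small sketch, reusing `resp` from the snippet above:

```python
# key name taken from the note earlier in this section
print(resp._hidden_params["vertex_ai_grounding_metadata"])
```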
### **Context Caching**
Use Vertex AI Context Caching
[**Relevant VertexAI Docs**](https://cloud.google.com/vertex-ai/generative-ai/docs/context-cache/context-cache-overview)
<Tabs>
<TabItem value="proxy" label="LiteLLM PROXY">
1. Add model to config.yaml
```yaml
model_list:
# used for /chat/completions, /completions, /embeddings endpoints
- model_name: gemini-1.5-pro-001
litellm_params:
model: vertex_ai_beta/gemini-1.5-pro-001
vertex_project: "project-id"
vertex_location: "us-central1"
vertex_credentials: "adroit-crow-413218-a956eef1a2a8.json" # Add path to service account.json
# used for the /cachedContent and vertexAI native endpoints
default_vertex_config:
vertex_project: "adroit-crow-413218"
vertex_location: "us-central1"
vertex_credentials: "adroit-crow-413218-a956eef1a2a8.json" # Add path to service account.json
```
2. Start Proxy
```
$ litellm --config /path/to/config.yaml
```
3. Make Request!
We make the request in two steps:
- Create a cachedContents object
- Use the cachedContents object in your /chat/completions
**Create a cachedContents object**
First, create a cachedContents object by calling the Vertex `cachedContents` endpoint. The LiteLLM proxy forwards the `/cachedContents` request to the VertexAI API.
```python
import httpx
# Set Litellm proxy variables
LITELLM_BASE_URL = "http://0.0.0.0:4000"
LITELLM_PROXY_API_KEY = "sk-1234"
httpx_client = httpx.Client(timeout=30)
print("Creating cached content")
create_cache = httpx_client.post(
url=f"{LITELLM_BASE_URL}/vertex-ai/cachedContents",
headers={"Authorization": f"Bearer {LITELLM_PROXY_API_KEY}"},
json={
"model": "gemini-1.5-pro-001",
"contents": [
{
"role": "user",
"parts": [{
"text": "This is sample text to demonstrate explicit caching." * 4000
}]
}
],
}
)
print("Response from create_cache:", create_cache)
create_cache_response = create_cache.json()
print("JSON from create_cache:", create_cache_response)
cached_content_name = create_cache_response["name"]
```
**Use the cachedContents object in your /chat/completions request to VertexAI**
```python
import openai
# Set Litellm proxy variables
LITELLM_BASE_URL = "http://0.0.0.0:4000"
LITELLM_PROXY_API_KEY = "sk-1234"
client = openai.OpenAI(api_key=LITELLM_PROXY_API_KEY, base_url=LITELLM_BASE_URL)
response = client.chat.completions.create(
model="gemini-1.5-pro-001",
max_tokens=8192,
messages=[
{
"role": "user",
"content": "What is the sample text about?",
},
],
temperature=0.7,
extra_body={"cached_content": cached_content_name}, # Use the cached content
)
print("Response from proxy:", response)
```
</TabItem>
</Tabs>
## Pre-requisites
* `pip install google-cloud-aiplatform` (pre-installed on proxy docker image)
* Authentication:
@ -697,6 +858,256 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
</TabItem> </TabItem>
</Tabs> </Tabs>
## Llama 3 API
| Model Name | Function Call |
|------------------|--------------------------------------|
| meta/llama3-405b-instruct-maas | `completion('vertex_ai/meta/llama3-405b-instruct-maas', messages)` |
### Usage
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = ""
model = "meta/llama3-405b-instruct-maas"
vertex_ai_project = "your-vertex-project" # can also set this as os.environ["VERTEXAI_PROJECT"]
vertex_ai_location = "your-vertex-location" # can also set this as os.environ["VERTEXAI_LOCATION"]
response = completion(
model="vertex_ai/" + model,
messages=[{"role": "user", "content": "hi"}],
vertex_ai_project=vertex_ai_project,
vertex_ai_location=vertex_ai_location,
)
print("\nModel Response", response)
```
</TabItem>
<TabItem value="proxy" label="Proxy">
**1. Add to config**
```yaml
model_list:
- model_name: anthropic-llama
litellm_params:
model: vertex_ai/meta/llama3-405b-instruct-maas
vertex_ai_project: "my-test-project"
vertex_ai_location: "us-east-1"
- model_name: anthropic-llama
litellm_params:
model: vertex_ai/meta/llama3-405b-instruct-maas
vertex_ai_project: "my-test-project"
vertex_ai_location: "us-west-1"
```
**2. Start proxy**
```bash
litellm --config /path/to/config.yaml
# RUNNING at http://0.0.0.0:4000
```
**3. Test it!**
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{
"model": "anthropic-llama", # 👈 the 'model_name' in config
"messages": [
{
"role": "user",
"content": "what llm are you"
}
],
}'
```
</TabItem>
</Tabs>
## Mistral API
[**Supported OpenAI Params**](https://github.com/BerriAI/litellm/blob/e0f3cd580cb85066f7d36241a03c30aa50a8a31d/litellm/llms/openai.py#L137)
| Model Name | Function Call |
|------------------|--------------------------------------|
| mistral-large@latest | `completion('vertex_ai/mistral-large@latest', messages)` |
| mistral-large@2407 | `completion('vertex_ai/mistral-large@2407', messages)` |
| mistral-nemo@latest | `completion('vertex_ai/mistral-nemo@latest', messages)` |
| codestral@latest | `completion('vertex_ai/codestral@latest', messages)` |
| codestral@2405 | `completion('vertex_ai/codestral@2405', messages)` |
### Usage
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = ""
model = "mistral-large@2407"
vertex_ai_project = "your-vertex-project" # can also set this as os.environ["VERTEXAI_PROJECT"]
vertex_ai_location = "your-vertex-location" # can also set this as os.environ["VERTEXAI_LOCATION"]
response = completion(
model="vertex_ai/" + model,
messages=[{"role": "user", "content": "hi"}],
vertex_ai_project=vertex_ai_project,
vertex_ai_location=vertex_ai_location,
)
print("\nModel Response", response)
```
</TabItem>
<TabItem value="proxy" label="Proxy">
**1. Add to config**
```yaml
model_list:
- model_name: vertex-mistral
litellm_params:
model: vertex_ai/mistral-large@2407
vertex_ai_project: "my-test-project"
vertex_ai_location: "us-east-1"
- model_name: vertex-mistral
litellm_params:
model: vertex_ai/mistral-large@2407
vertex_ai_project: "my-test-project"
vertex_ai_location: "us-west-1"
```
**2. Start proxy**
```bash
litellm --config /path/to/config.yaml
# RUNNING at http://0.0.0.0:4000
```
**3. Test it!**
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{
"model": "vertex-mistral", # 👈 the 'model_name' in config
"messages": [
{
"role": "user",
"content": "what llm are you"
}
],
}'
```
</TabItem>
</Tabs>
### Usage - Codestral FIM
Call Codestral on VertexAI via the OpenAI [`/v1/completions`](https://platform.openai.com/docs/api-reference/completions/create) endpoint for FIM tasks.

Note: You can also call Codestral via `/chat/completions`.
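For example, a minimal chat-style call (a sketch; the project/location values are placeholders and Vertex credentials are assumed to be configured):

```python
from litellm import completion

response = completion(
    model="vertex_ai/codestral@2405",
    messages=[{"role": "user", "content": "write a python function to check if a number is odd"}],
    vertex_ai_project="your-vertex-project",    # placeholder
    vertex_ai_location="your-vertex-location",  # placeholder
)
print(response)
```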
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import text_completion
import os
# os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = ""
# OR run `!gcloud auth print-access-token` in your terminal
model = "codestral@2405"
vertex_ai_project = "your-vertex-project" # can also set this as os.environ["VERTEXAI_PROJECT"]
vertex_ai_location = "your-vertex-location" # can also set this as os.environ["VERTEXAI_LOCATION"]
response = text_completion(
model="vertex_ai/" + model,
vertex_ai_project=vertex_ai_project,
vertex_ai_location=vertex_ai_location,
prompt="def is_odd(n): \n return n % 2 == 1 \ndef test_is_odd():",
suffix="return True", # optional
temperature=0, # optional
top_p=1, # optional
max_tokens=10, # optional
min_tokens=10, # optional
seed=10, # optional
stop=["return"], # optional
)
print("\nModel Response", response)
```
</TabItem>
<TabItem value="proxy" label="Proxy">
**1. Add to config**
```yaml
model_list:
- model_name: vertex-codestral
litellm_params:
model: vertex_ai/codestral@2405
vertex_ai_project: "my-test-project"
vertex_ai_location: "us-east-1"
- model_name: vertex-codestral
litellm_params:
model: vertex_ai/codestral@2405
vertex_ai_project: "my-test-project"
vertex_ai_location: "us-west-1"
```
**2. Start proxy**
```bash
litellm --config /path/to/config.yaml
# RUNNING at http://0.0.0.0:4000
```
**3. Test it!**
```bash
curl -X POST 'http://0.0.0.0:4000/completions' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-d '{
"model": "vertex-codestral", # 👈 the 'model_name' in config
"prompt": "def is_odd(n): \n return n % 2 == 1 \ndef test_is_odd():",
"suffix":"return True", # optional
"temperature":0, # optional
"top_p":1, # optional
"max_tokens":10, # optional
"min_tokens":10, # optional
"seed":10, # optional
"stop":["return"], # optional
}'
```
</TabItem>
</Tabs>
## Model Garden

| Model Name | Function Call |
|------------------|--------------------------------------|


@ -119,13 +119,14 @@ All Possible Alert Types
```python
AlertType = Literal[
    "llm_exceptions",       # LLM API Exceptions
    "llm_too_slow",         # LLM Responses slower than alerting_threshold
    "llm_requests_hanging",
    "budget_alerts",
    "db_exceptions",
    "daily_reports",
    "spend_reports",
    "fallback_reports",
    "cooldown_deployment",
    "new_model_added",
    "outage_alerts",
@ -133,6 +134,61 @@ AlertType = Literal[
```
## Advanced - set specific slack channels per alert type
Use this if you want to set specific channels per alert type
**This allows you to do the following**
```
llm_exceptions -> go to slack channel #llm-exceptions
spend_reports -> go to slack channel #llm-spend-reports
```
Set `alert_to_webhook_url` on your config.yaml
```yaml
model_list:
- model_name: gpt-4
litellm_params:
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
general_settings:
master_key: sk-1234
alerting: ["slack"]
  alerting_threshold: 0.0001 # (Seconds) set an artificially low threshold for testing alerting
alert_to_webhook_url: {
"llm_exceptions": "https://hooks.slack.com/services/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",
"llm_too_slow": "https://hooks.slack.com/services/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",
"llm_requests_hanging": "https://hooks.slack.com/services/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",
"budget_alerts": "https://hooks.slack.com/services/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",
"db_exceptions": "https://hooks.slack.com/services/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",
"daily_reports": "https://hooks.slack.com/services/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",
"spend_reports": "https://hooks.slack.com/services/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",
"cooldown_deployment": "https://hooks.slack.com/services/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",
"new_model_added": "https://hooks.slack.com/services/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",
"outage_alerts": "https://hooks.slack.com/services/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",
}
litellm_settings:
success_callback: ["langfuse"]
```
Test it - send a valid llm request - expect to see a `llm_too_slow` alert in its own slack channel
```shell
curl -i http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"model": "gpt-4",
"messages": [
{"role": "user", "content": "Hello, Claude gm!"}
]
}'
```
## Advanced - Using MS Teams Webhooks


@ -2,7 +2,7 @@ import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';

# Billing

Bill internal teams, external customers for their usage


@ -0,0 +1,191 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Logging GCS, s3 Buckets
LiteLLM Supports Logging to the following Cloud Buckets
- (Enterprise) ✨ [Google Cloud Storage Buckets](#logging-proxy-inputoutput-to-google-cloud-storage-buckets)
- (Free OSS) [Amazon s3 Buckets](#logging-proxy-inputoutput---s3-buckets)
## Logging Proxy Input/Output to Google Cloud Storage Buckets
Log LLM Logs to [Google Cloud Storage Buckets](https://cloud.google.com/storage?hl=en)
:::info
✨ This is an Enterprise only feature [Get Started with Enterprise here](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat)
:::
### Usage
1. Add `gcs_bucket` to LiteLLM Config.yaml
```yaml
model_list:
- litellm_params:
api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
api_key: my-fake-key
model: openai/my-fake-model
model_name: fake-openai-endpoint
litellm_settings:
  callbacks: ["gcs_bucket"] # 👈 KEY CHANGE
```
2. Set required env variables
```shell
GCS_BUCKET_NAME="<your-gcs-bucket-name>"
GCS_PATH_SERVICE_ACCOUNT="/Users/ishaanjaffer/Downloads/adroit-crow-413218-a956eef1a2a8.json" # Add path to service account.json
```
3. Start Proxy
```
litellm --config /path/to/config.yaml
```
4. Test it!
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "fake-openai-endpoint",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
],
}
'
```
### Expected Logs on GCS Buckets
<Image img={require('../../img/gcs_bucket.png')} />
### Fields Logged on GCS Buckets
Example payload of a `/chat/completion` request logged on GCS
```json
{
"request_kwargs": {
"model": "gpt-3.5-turbo",
"messages": [
{
"role": "user",
"content": "This is a test"
}
],
"optional_params": {
"temperature": 0.7,
"max_tokens": 10,
"user": "ishaan-2",
"extra_body": {}
}
},
"response_obj": {
"id": "chatcmpl-bd836a8c-89bc-4abd-bee5-e3f1ebfdb541",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"message": {
"content": "Hi!",
"role": "assistant",
"tool_calls": null,
"function_call": null
}
}
],
"created": 1722868456,
"model": "gpt-3.5-turbo",
"object": "chat.completion",
"system_fingerprint": null,
"usage": {
"prompt_tokens": 10,
"completion_tokens": 20,
"total_tokens": 30
}
},
"start_time": "2024-08-05 07:34:16",
"end_time": "2024-08-05 07:34:16"
}
```
### Getting `service_account.json` from Google Cloud Console
1. Go to [Google Cloud Console](https://console.cloud.google.com/)
2. Search for IAM & Admin
3. Click on Service Accounts
4. Select a Service Account
5. Click on 'Keys' -> Add Key -> Create New Key -> JSON
6. Save the JSON file and add the path to `GCS_PATH_SERVICE_ACCOUNT`
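After downloading the key, point LiteLLM at it via the env vars from step 2 above (a sketch; both values are placeholders):

```shell
export GCS_BUCKET_NAME="my-gcs-bucket"                           # placeholder
export GCS_PATH_SERVICE_ACCOUNT="/path/to/service_account.json"  # placeholder
```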
## Logging Proxy Input/Output - s3 Buckets
We will use the `--config` to set
- `litellm.success_callback = ["s3"]`
This will log all successful LLM calls to your s3 Bucket
**Step 1** Set AWS Credentials in .env
```shell
AWS_ACCESS_KEY_ID = ""
AWS_SECRET_ACCESS_KEY = ""
AWS_REGION_NAME = ""
```
**Step 2**: Create a `config.yaml` file and set `litellm_settings`: `success_callback`
```yaml
model_list:
- model_name: gpt-3.5-turbo
litellm_params:
model: gpt-3.5-turbo
litellm_settings:
success_callback: ["s3"]
s3_callback_params:
s3_bucket_name: logs-bucket-litellm # AWS Bucket Name for S3
s3_region_name: us-west-2 # AWS Region Name for S3
    s3_aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID # use os.environ/<variable name> to pass environment variables. This is AWS Access Key ID for S3
s3_aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY # AWS Secret Access Key for S3
s3_path: my-test-path # [OPTIONAL] set path in bucket you want to write logs to
s3_endpoint_url: https://s3.amazonaws.com # [OPTIONAL] S3 endpoint URL, if you want to use Backblaze/cloudflare s3 buckets
```
**Step 3**: Start the proxy, make a test request
Start proxy
```shell
litellm --config config.yaml --debug
```
Test Request
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "Azure OpenAI GPT-4 East",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}'
```
Your logs should be available on the specified s3 Bucket

View file

@ -59,6 +59,8 @@ litellm_settings:
  cache_params: # set cache params for redis
    type: redis
    ttl: 600 # will be cached on redis for 600s
# default_in_memory_ttl: Optional[float], default is None. time in seconds.
# default_in_redis_ttl: Optional[float], default is None. time in seconds.
``` ```
@ -258,6 +260,21 @@ curl --location 'http://0.0.0.0:4000/cache/ping' -H "Authorization: Bearer sk-1
```

## Advanced
### Control Call Types Caching is on for (`/chat/completions`, `/embeddings`, etc.)
By default, caching is on for all call types. You can control which call types caching is on for by setting `supported_call_types` in `cache_params`
**Cache will only be on for the call types specified in `supported_call_types`**
```yaml
litellm_settings:
cache: True
cache_params:
type: redis
supported_call_types: ["acompletion", "atext_completion", "aembedding", "atranscription"]
# /chat/completions, /completions, /embeddings, /audio/transcriptions
```
### Set Cache Params on config.yaml

```yaml
model_list:
@ -278,7 +295,8 @@ litellm_settings:
password: "your_password" # The password for the Redis cache. Required if type is "redis". password: "your_password" # The password for the Redis cache. Required if type is "redis".
# Optional configurations # Optional configurations
supported_call_types: ["acompletion", "completion", "embedding", "aembedding"] # defaults to all litellm call types supported_call_types: ["acompletion", "atext_completion", "aembedding", "atranscription"]
# /chat/completions, /completions, /embeddings, /audio/transcriptions
``` ```
### Turn on / off caching per request.
@ -294,6 +312,11 @@ The proxy support 4 cache-controls:
**Turn off caching**
Set `no-cache=True`, this will not return a cached response
<Tabs>
<TabItem value="openai" label="OpenAI Python SDK">
```python ```python
import os import os
from openai import OpenAI from openai import OpenAI
@ -319,9 +342,81 @@ chat_completion = client.chat.completions.create(
} }
) )
``` ```
</TabItem>
<TabItem value="curl" label="curl">
```shell
curl http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"model": "gpt-3.5-turbo",
"cache": {"no-cache": True},
"messages": [
{"role": "user", "content": "Say this is a test"}
]
}'
```
</TabItem>
</Tabs>
**Turn on caching**
By default cache is always on
<Tabs>
<TabItem value="openai" label="OpenAI Python SDK">
```python
import os
from openai import OpenAI
client = OpenAI(
# This is the default and can be omitted
api_key=os.environ.get("OPENAI_API_KEY"),
base_url="http://0.0.0.0:4000"
)
chat_completion = client.chat.completions.create(
messages=[
{
"role": "user",
"content": "Say this is a test",
}
],
model="gpt-3.5-turbo"
)
```
</TabItem>
<TabItem value="curl on" label="curl">
```shell
curl http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"model": "gpt-3.5-turbo",
"messages": [
{"role": "user", "content": "Say this is a test"}
]
}'
```
</TabItem>
</Tabs>
**Set `ttl`**
Set `ttl=600`, this will cache responses for 10 minutes (600 seconds)
<Tabs>
<TabItem value="openai" label="OpenAI Python SDK">
```python ```python
import os import os
from openai import OpenAI from openai import OpenAI
@ -347,6 +442,35 @@ chat_completion = client.chat.completions.create(
} }
) )
``` ```
</TabItem>
<TabItem value="curl on" label="curl">
```shell
curl http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"model": "gpt-3.5-turbo",
"cache": {"ttl": 600},
"messages": [
{"role": "user", "content": "Say this is a test"}
]
}'
```
</TabItem>
</Tabs>
**Set `s-maxage`**
Set `s-maxage`, this will only return responses cached within the last 10 minutes
<Tabs>
<TabItem value="openai" label="OpenAI Python SDK">
```python ```python
import os import os
@ -373,6 +497,27 @@ chat_completion = client.chat.completions.create(
} }
) )
``` ```
</TabItem>
<TabItem value="curl on" label="curl">
```shell
curl http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"model": "gpt-3.5-turbo",
"cache": {"s-maxage": 600},
"messages": [
{"role": "user", "content": "Say this is a test"}
]
}'
```
</TabItem>
</Tabs>
### Turn on / off caching per Key.
@ -486,21 +631,25 @@ litellm_settings:
```yaml
cache_params:
  # ttl
  ttl: Optional[float]
  default_in_memory_ttl: Optional[float]
  default_in_redis_ttl: Optional[float]

  # Type of cache (options: "local", "redis", "s3")
  type: s3

  # List of litellm call types to cache for
  # Options: "completion", "acompletion", "embedding", "aembedding"
  supported_call_types: ["acompletion", "atext_completion", "aembedding", "atranscription"]
  # /chat/completions, /completions, /embeddings, /audio/transcriptions

  # Redis cache parameters
  host: localhost # Redis server hostname or IP address
  port: "6379" # Redis server port (as a string)
  password: secret_password # Redis server password
  namespace: Optional[str] = None,

  # S3 cache parameters
  s3_bucket_name: your_s3_bucket_name # Name of the S3 bucket


@ -47,6 +47,7 @@ class MyCustomHandler(CustomLogger): # https://docs.litellm.ai/docs/observabilit
    async def async_post_call_success_hook(
        self,
        data: dict,
        user_api_key_dict: UserAPIKeyAuth,
        response,
    ):


@ -55,11 +55,19 @@ model_list:
  - model_name: vllm-models
    litellm_params:
      model: openai/facebook/opt-125m # the `openai/` prefix tells litellm it's openai compatible
      api_base: http://0.0.0.0:4000/v1
      api_key: none
      rpm: 1440
    model_info:
      version: 2
# Use this if you want to make requests to `claude-3-haiku-20240307`,`claude-3-opus-20240229`,`claude-2.1` without defining them on the config.yaml
# Default models
# Works for ALL Providers and needs the default provider credentials in .env
- model_name: "*"
litellm_params:
model: "*"
litellm_settings: # module level litellm settings - https://github.com/BerriAI/litellm/blob/main/litellm/__init__.py
  drop_params: True
  success_callback: ["langfuse"] # OPTIONAL - if you want to start sending LLM Logs to Langfuse. Make sure to set `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY` in your env
@ -277,52 +285,58 @@ curl --location 'http://0.0.0.0:4000/v1/model/info' \
--data '' --data ''
``` ```
## Provider specific wildcard routing

**Proxy all models from a provider**

Use this if you want to **proxy all models from a specific provider without defining them on the config.yaml**

**Step 1** - define provider specific routing on config.yaml

```yaml
model_list:
  # provider specific wildcard routing
  - model_name: "anthropic/*"
    litellm_params:
      model: "anthropic/*"
      api_key: os.environ/ANTHROPIC_API_KEY
  - model_name: "groq/*"
    litellm_params:
      model: "groq/*"
      api_key: os.environ/GROQ_API_KEY
```

Step 2 - Run litellm proxy

```shell
$ litellm --config /path/to/config.yaml
```

Step 3 Test it

Test with `anthropic/` - all models with `anthropic/` prefix will get routed to `anthropic/*`

```shell
curl http://localhost:4000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer sk-1234" \
  -d '{
    "model": "anthropic/claude-3-sonnet-20240229",
    "messages": [
      {"role": "user", "content": "Hello, Claude!"}
    ]
  }'
```

Test with `groq/` - all models with `groq/` prefix will get routed to `groq/*`

```shell
curl http://localhost:4000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer sk-1234" \
  -d '{
    "model": "groq/llama3-8b-8192",
    "messages": [
      {"role": "user", "content": "Hello, Claude!"}
    ]
  }'
```

## Load Balancing


@ -1,6 +1,6 @@
import Image from '@theme/IdealImage';

# Custom LLM Pricing - Sagemaker, Azure, etc

Use this to register custom pricing for models.
@ -16,39 +16,9 @@ LiteLLM already has pricing for any model in our [model cost map](https://github
::: :::
## Cost Per Second (e.g. Sagemaker)

For cost per second pricing, you **just** need to register `input_cost_per_second`.

### Usage with LiteLLM Proxy Server
```python
# !pip install boto3
from litellm import completion, completion_cost
os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""
def test_completion_sagemaker():
try:
print("testing sagemaker")
response = completion(
model="sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
input_cost_per_second=0.000420,
)
# Add any assertions here to check the response
print(response)
cost = completion_cost(completion_response=response)
print(cost)
except Exception as e:
raise Exception(f"Error occurred: {e}")
```
### Usage with OpenAI Proxy Server
**Step 1: Add pricing to config.yaml**

```yaml
@ -75,38 +45,7 @@ litellm /path/to/config.yaml
## Cost Per Token (e.g. Azure)
### Usage with LiteLLM Proxy Server
```python
# !pip install boto3
from litellm import completion, completion_cost
## set ENV variables
os.environ["AZURE_API_KEY"] = ""
os.environ["AZURE_API_BASE"] = ""
os.environ["AZURE_API_VERSION"] = ""
def test_completion_azure_model():
try:
print("testing azure custom pricing")
# azure call
response = completion(
model = "azure/<your_deployment_name>",
messages = [{ "content": "Hello, how are you?","role": "user"}]
input_cost_per_token=0.005,
output_cost_per_token=1,
)
# Add any assertions here to check the response
print(response)
cost = completion_cost(completion_response=response)
print(cost)
except Exception as e:
raise Exception(f"Error occurred: {e}")
test_completion_azure_model()
```
### Usage with OpenAI Proxy Server
```yaml ```yaml
model_list: model_list:


@ -231,7 +231,7 @@ curl -X POST 'http://localhost:4000/customer/new' \
```python
from openai import OpenAI
client = OpenAI(
    base_url="<your_proxy_base_url>",
    api_key="<your_proxy_key>"
)


@ -35,6 +35,22 @@ $ litellm --detailed_debug
os.environ["LITELLM_LOG"] = "DEBUG" os.environ["LITELLM_LOG"] = "DEBUG"
``` ```
### Debug Logs
Run the proxy with `--detailed_debug` to view detailed debug logs
```shell
litellm --config /path/to/config.yaml --detailed_debug
```
When making requests, you should see the POST request sent by LiteLLM to the LLM in the terminal output
```shell
POST Request Sent from LiteLLM:
curl -X POST \
https://api.openai.com/v1/chat/completions \
-H 'content-type: application/json' -H 'Authorization: Bearer sk-qnWGUIW9****************************************' \
-d '{"model": "gpt-3.5-turbo", "messages": [{"role": "user", "content": "this is a test request, write a short poem"}]}'
```
## JSON LOGS

Set `JSON_LOGS="True"` in your env:


@ -17,8 +17,15 @@ git clone https://github.com/BerriAI/litellm
# Go to folder
cd litellm

# Add the master key - you can change this after setup
echo 'LITELLM_MASTER_KEY="sk-1234"' > .env
# Add the litellm salt key - you cannot change this after adding a model
# It is used to encrypt / decrypt your LLM API Key credentials
# We recommend - https://1password.com/password-generator/
# password generator to get a random hash for litellm salt key
echo 'LITELLM_SALT_KEY="sk-1234"' >> .env

source .env
# Start # Start
@ -239,7 +246,7 @@ helm install lite-helm ./litellm-helm
kubectl --namespace default port-forward $POD_NAME 8080:$CONTAINER_PORT
```

Your LiteLLM Proxy Server is now running on `http://127.0.0.1:4000`.
</TabItem> </TabItem>
@ -247,6 +254,15 @@ Your OpenAI proxy server is now running on `http://127.0.0.1:4000`.
**That's it! That's the quick start to deploy litellm**
## Use with Langchain, OpenAI SDK, LlamaIndex, Instructor, Curl
:::info
💡 Go here 👉 [to make your first LLM API Request](user_keys)
LiteLLM is compatible with several SDKs - including OpenAI SDK, Anthropic SDK, Mistral SDK, LlamaIndex, Langchain (Js, Python)
:::
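For example, a minimal request with the OpenAI Python SDK pointed at the proxy (a sketch; the key and base URL are placeholders for your deployment):

```python
import openai

client = openai.OpenAI(
    api_key="sk-1234",               # placeholder - your LiteLLM proxy key
    base_url="http://0.0.0.0:4000",  # placeholder - your LiteLLM proxy URL
)

response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "this is a test request, write a short poem"}],
)
print(response)
```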
## Options to deploy LiteLLM

| Docs | When to Use |
@ -285,7 +301,7 @@ docker run \
    --config /app/config.yaml --detailed_debug
```

Your LiteLLM Proxy Server is now running on `http://0.0.0.0:4000`.

</TabItem>
<TabItem value="kubernetes-deploy" label="Kubernetes">
@ -383,7 +399,7 @@ kubectl apply -f /path/to/service.yaml
kubectl port-forward service/litellm-service 4000:4000
```

Your LiteLLM Proxy Server is now running on `http://0.0.0.0:4000`.

</TabItem>
@ -425,7 +441,7 @@ kubectl \
  4000:4000
```

Your LiteLLM Proxy Server is now running on `http://127.0.0.1:4000`.

If you need to set your litellm proxy config.yaml, you can find this in [values.yaml](https://github.com/BerriAI/litellm/blob/main/deploy/charts/litellm-helm/values.yaml)
@ -470,7 +486,7 @@ helm install lite-helm ./litellm-helm
kubectl --namespace default port-forward $POD_NAME 8080:$CONTAINER_PORT
```

Your LiteLLM Proxy Server is now running on `http://127.0.0.1:4000`.

</TabItem>
</Tabs>
@ -542,6 +558,39 @@ docker run --name litellm-proxy \
ghcr.io/berriai/litellm-database:main-latest --config your_config.yaml ghcr.io/berriai/litellm-database:main-latest --config your_config.yaml
``` ```
## LiteLLM without Internet Connection
By default `prisma generate` downloads [prisma's engine binaries](https://www.prisma.io/docs/orm/reference/environment-variables-reference#custom-engine-file-locations). This might cause errors when running without an internet connection.
Use this dockerfile to build an image which pre-generates the prisma binaries.
```Dockerfile
# Use the provided base image
FROM ghcr.io/berriai/litellm:main-latest
# Set the working directory to /app
WORKDIR /app
### [👇 KEY STEP] ###
# Install Prisma CLI and generate Prisma client
RUN pip install prisma
RUN prisma generate
### FIN ####
# Expose the necessary port
EXPOSE 4000
# Override the CMD instruction with your desired command and arguments
# WARNING: FOR PROD DO NOT USE `--detailed_debug` it slows down response times, instead use the following CMD
# CMD ["--port", "4000", "--config", "config.yaml"]
# Define the command to run your app
ENTRYPOINT ["litellm"]
CMD ["--port", "4000"]
```
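Build and run it like any other LiteLLM image (a sketch; the image tag and config path are placeholders):

```shell
docker build -t litellm-offline .   # placeholder tag
docker run -p 4000:4000 \
  -v $(pwd)/config.yaml:/app/config.yaml \
  litellm-offline --config /app/config.yaml
```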
## Advanced Deployment Settings

### 1. Customization of the server root path (custom Proxy base url)
@ -556,24 +605,87 @@ In a Kubernetes deployment, it's possible to utilize a shared DNS to host multip
Customize the root path to eliminate the need for employing multiple DNS configurations during deployment.
Step 1.
👉 Set `SERVER_ROOT_PATH` in your .env and this will be set as your server root path
```
export SERVER_ROOT_PATH="/api/v1"
```
**Step 2** (If you want the Proxy Admin UI to work with your root path you need to use this dockerfile)
- Use the dockerfile below (it uses litellm as a base image)
- 👉 Set `UI_BASE_PATH=$SERVER_ROOT_PATH/ui` in the Dockerfile, example `UI_BASE_PATH=/api/v1/ui`
Dockerfile
```shell
# Use the provided base image
FROM ghcr.io/berriai/litellm:main-latest

# Set the working directory to /app
WORKDIR /app
# Install Node.js and npm (adjust version as needed)
RUN apt-get update && apt-get install -y nodejs npm
# Copy the UI source into the container
COPY ./ui/litellm-dashboard /app/ui/litellm-dashboard
# Set an environment variable for UI_BASE_PATH
# This can be overridden at build time
# set UI_BASE_PATH to "<your server root path>/ui"
# 👇👇 Enter your UI_BASE_PATH here
ENV UI_BASE_PATH="/api/v1/ui"
# Build the UI with the specified UI_BASE_PATH
WORKDIR /app/ui/litellm-dashboard
RUN npm install
RUN UI_BASE_PATH=$UI_BASE_PATH npm run build
# Create the destination directory
RUN mkdir -p /app/litellm/proxy/_experimental/out
# Move the built files to the appropriate location
# Assuming the build output is in ./out directory
RUN rm -rf /app/litellm/proxy/_experimental/out/* && \
mv ./out/* /app/litellm/proxy/_experimental/out/
# Switch back to the main app directory
WORKDIR /app
# Make sure your entrypoint.sh is executable
RUN chmod +x entrypoint.sh
# Expose the necessary port
EXPOSE 4000/tcp
# Override the CMD instruction with your desired command and arguments
# only use --detailed_debug for debugging
CMD ["--port", "4000", "--config", "config.yaml"]
```
**Step 3** build this Dockerfile
```shell
docker build -f Dockerfile -t litellm-prod-build . --progress=plain
```
**Step 4. Run Proxy with `SERVER_ROOT_PATH` set in your env **
```shell
docker run \
-v $(pwd)/proxy_config.yaml:/app/config.yaml \
-p 4000:4000 \
-e LITELLM_LOG="DEBUG"\
-e SERVER_ROOT_PATH="/api/v1"\
-e DATABASE_URL=postgresql://<user>:<password>@<host>:<port>/<dbname> \
-e LITELLM_MASTER_KEY="sk-1234"\
litellm-prod-build \
--config /app/config.yaml
``` ```
After running the proxy you can access it on `http://0.0.0.0:4000/api/v1/` (since we set `SERVER_ROOT_PATH="/api/v1"`)

**Step 5. Verify Running on correct path**

<Image img={require('../../img/custom_root_path.png')} />
@ -593,6 +705,29 @@ docker run ghcr.io/berriai/litellm:main-latest \
Provide an ssl certificate when starting litellm proxy server
### 3. Providing LiteLLM config.yaml file as a s3 Object/url
Use this if you cannot mount a config file on your deployment service (example - AWS Fargate, Railway etc)
LiteLLM Proxy will read your config.yaml from an s3 Bucket
Set the following .env vars
```shell
LITELLM_CONFIG_BUCKET_NAME = "litellm-proxy" # your bucket name on s3
LITELLM_CONFIG_BUCKET_OBJECT_KEY = "litellm_proxy_config.yaml" # object key on s3
```
Start litellm proxy with these env vars - litellm will read your config from s3
```shell
docker run --name litellm-proxy \
-e DATABASE_URL=<database_url> \
-e LITELLM_CONFIG_BUCKET_NAME=<bucket_name> \
    -e LITELLM_CONFIG_BUCKET_OBJECT_KEY="<object_key>" \
-p 4000:4000 \
ghcr.io/berriai/litellm-database:main-latest
```
## Platform-specific Guide

<Tabs>
@ -778,3 +913,31 @@ Run the command `docker-compose up` or `docker compose up` as per your docker in
Your LiteLLM container should be running now on the defined port e.g. `4000`.
### IAM-based Auth for RDS DB
1. Set AWS env var
```bash
export AWS_WEB_IDENTITY_TOKEN='/path/to/token'
export AWS_ROLE_NAME='arn:aws:iam::123456789012:role/MyRole'
export AWS_SESSION_NAME='MySession'
```
[**See all Auth options**](https://github.com/BerriAI/litellm/blob/089a4f279ad61b7b3e213d8039fb9b75204a7abc/litellm/proxy/auth/rds_iam_token.py#L165)
2. Add RDS credentials to env
```bash
export DATABASE_USER="db-user"
export DATABASE_PORT="5432"
export DATABASE_HOST="database-1-instance-1.cs1ksmwz2xt3.us-west-2.rds.amazonaws.com"
export DATABASE_NAME="database-1-instance-1"
```
3. Run proxy with iam+rds
```bash
litellm --config /path/to/config.yaml --iam_token_db_auth
```


@ -1,6 +1,6 @@
import Image from '@theme/IdealImage';

# Email Notifications

Send an Email to your users when:
- A Proxy API Key is created for them


@ -21,15 +21,22 @@ Features:
- ✅ IP address-based access control lists
- ✅ Track Request IP Address
- ✅ [Use LiteLLM keys/authentication on Pass Through Endpoints](pass_through#✨-enterprise---use-litellm-keysauthentication-on-pass-through-endpoints)
- ✅ [Set Max Request Size / File Size on Requests](#set-max-request--response-size-on-litellm-proxy)
- ✅ [Enforce Required Params for LLM Requests (ex. Reject requests missing ["metadata"]["generation_name"])](#enforce-required-params-for-llm-requests)
- **Customize Logging, Guardrails, Caching per project**
- ✅ [Team Based Logging](./team_logging.md) - Allow each team to use their own Langfuse Project / custom callbacks
- ✅ [Disable Logging for a Team](./team_logging.md#disable-logging-for-a-team) - Switch off all logging for a team/project (GDPR Compliance)
- **Spend Tracking & Data Exports**
- ✅ [Tracking Spend for Custom Tags](#tracking-spend-for-custom-tags)
- ✅ [Exporting LLM Logs to GCS Bucket](./proxy/bucket#🪣-logging-gcs-s3-buckets)
- ✅ [`/spend/report` API endpoint](cost_tracking.md#✨-enterprise-api-endpoints-to-get-spend)
- **Prometheus Metrics**
- ✅ [Prometheus Metrics - Num Requests, failures, LLM Provider Outages](prometheus)
- ✅ [`x-ratelimit-remaining-requests`, `x-ratelimit-remaining-tokens` for LLM APIs on Prometheus](prometheus#✨-enterprise-llm-remaining-requests-and-remaining-tokens)
- **Guardrails, PII Masking, Content Moderation**
- ✅ [Content Moderation with LLM Guard, LlamaGuard, Secret Detection, Google Text Moderations](#content-moderation)
- ✅ [Prompt Injection Detection (with LakeraAI API)](#prompt-injection-detection---lakeraai)
- ✅ [Prompt Injection Detection (with Aporia API)](#prompt-injection-detection---aporia-ai)
- ✅ [Switch LakeraAI on / off per request](guardrails#control-guardrails-onoff-per-request)
- ✅ Reject calls from Blocked User list
- ✅ Reject calls (incoming / outgoing) with Banned Keywords (e.g. competitors)
@ -113,7 +120,7 @@ client = openai.OpenAI(
base_url="http://0.0.0.0:4000" base_url="http://0.0.0.0:4000"
) )
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create( response = client.chat.completions.create(
model="gpt-3.5-turbo", model="gpt-3.5-turbo",
messages = [ messages = [
@ -124,7 +131,7 @@ response = client.chat.completions.create(
], ],
extra_body={ extra_body={
"metadata": { "metadata": {
"tags": ["model-anthropic-claude-v2.1", "app-ishaan-prod"] "tags": ["model-anthropic-claude-v2.1", "app-ishaan-prod"] # 👈 Key Change
} }
} }
) )
@ -133,6 +140,43 @@ print(response)
``` ```
</TabItem> </TabItem>
<TabItem value="openai js" label="OpenAI JS">
```js
const openai = require('openai');
async function runOpenAI() {
const client = new openai.OpenAI({
apiKey: 'sk-1234',
baseURL: 'http://0.0.0.0:4000'
});
try {
const response = await client.chat.completions.create({
model: 'gpt-3.5-turbo',
messages: [
{
role: 'user',
content: "this is a test request, write a short poem"
},
],
metadata: {
tags: ["model-anthropic-claude-v2.1", "app-ishaan-prod"] // 👈 Key Change
}
});
console.log(response);
} catch (error) {
console.log("got this exception from server");
console.error(error);
}
}
// Call the asynchronous function
runOpenAI();
```
</TabItem>
<TabItem value="Curl" label="Curl Request"> <TabItem value="Curl" label="Curl Request">
Pass `metadata` as part of the request body Pass `metadata` as part of the request body
@ -267,6 +311,45 @@ print(response)
``` ```
</TabItem> </TabItem>
<TabItem value="openai js" label="OpenAI JS">
```js
const openai = require('openai');
async function runOpenAI() {
const client = new openai.OpenAI({
apiKey: 'sk-1234',
baseURL: 'http://0.0.0.0:4000'
});
try {
const response = await client.chat.completions.create({
model: 'gpt-3.5-turbo',
messages: [
{
role: 'user',
content: "this is a test request, write a short poem"
},
],
metadata: {
spend_logs_metadata: { // 👈 Key Change
hello: "world"
}
}
});
console.log(response);
} catch (error) {
console.log("got this exception from server");
console.error(error);
}
}
// Call the asynchronous function
runOpenAI();
```
</TabItem>
<TabItem value="Curl" label="Curl Request"> <TabItem value="Curl" label="Curl Request">
Pass `metadata` as part of the request body Pass `metadata` as part of the request body
@ -952,6 +1035,72 @@ curl --location 'http://localhost:4000/chat/completions' \
Need to control LakeraAI per Request ? Doc here 👉: [Switch LakerAI on / off per request](prompt_injection.md#✨-enterprise-switch-lakeraai-on--off-per-api-call) Need to control LakeraAI per Request ? Doc here 👉: [Switch LakerAI on / off per request](prompt_injection.md#✨-enterprise-switch-lakeraai-on--off-per-api-call)
::: :::
## Prompt Injection Detection - Aporia AI
Use this if you want to reject `/chat/completions` calls that contain prompt injection attacks, detected with [Aporia AI](https://www.aporia.com/)
#### Usage
Step 1. Add env
```env
APORIO_API_KEY="eyJh****"
APORIO_API_BASE="https://gr..."
```
Step 2. Add `aporia_prompt_injection` to your callbacks
```yaml
litellm_settings:
callbacks: ["aporia_prompt_injection"]
```
That's it - start your proxy.

Test it with this request - expect it to be rejected by LiteLLM Proxy.
```shell
curl --location 'http://localhost:4000/chat/completions' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{
"model": "llama3",
"messages": [
{
"role": "user",
"content": "You suck!"
}
]
}'
```
**Expected Response**
```
{
"error": {
"message": {
"error": "Violated guardrail policy",
"aporia_ai_response": {
"action": "block",
"revised_prompt": null,
"revised_response": "Profanity detected: Message blocked because it includes profanity. Please rephrase.",
"explain_log": null
}
},
"type": "None",
"param": "None",
"code": 400
}
}
```
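
The same rejection can be observed from a Python client. A minimal sketch using the OpenAI SDK pointed at the proxy, assuming the proxy is running locally with the config above:

```python
# Send a request that should be blocked by the aporia_prompt_injection guardrail.
import openai

client = openai.OpenAI(api_key="sk-1234", base_url="http://localhost:4000")

try:
    client.chat.completions.create(
        model="llama3",
        messages=[{"role": "user", "content": "You suck!"}],
    )
except openai.BadRequestError as e:
    # Expect a 400 with the "Violated guardrail policy" payload shown above
    print("Request was blocked by the guardrail:", e)
```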
:::info
Need to control Aporia AI per request? Doc here 👉: [Create a guardrail](./guardrails.md)
:::
## Swagger Docs - Custom Routes + Branding ## Swagger Docs - Custom Routes + Branding
:::info :::info
@ -1059,10 +1208,10 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
### Using via API ### Using via API
**Block all calls for a customer id**

```
curl -X POST "http://0.0.0.0:4000/customer/block" \
-H "Authorization: Bearer sk-1234" \
-d '{
"user_ids": [<user_id>, ...]
@ -1079,6 +1228,8 @@ curl -X POST "http://0.0.0.0:4000/user/unblock" \
}' }'
``` ```
## Enable Banned Keywords List ## Enable Banned Keywords List
```yaml ```yaml
@ -1142,3 +1293,52 @@ How it works?
**Note:** Setting an environment variable within a Python script using os.environ will not make that variable accessible via SSH sessions or any other new processes that are started independently of the Python script. Environment variables set this way only affect the current process and its child processes. **Note:** Setting an environment variable within a Python script using os.environ will not make that variable accessible via SSH sessions or any other new processes that are started independently of the Python script. Environment variables set this way only affect the current process and its child processes.
## Set Max Request / Response Size on LiteLLM Proxy
Use this if you want to set a maximum request / response size for your proxy server. If a request exceeds the size limit, it is rejected and a Slack alert is triggered.
#### Usage
**Step 1.** Set `max_request_size_mb` and `max_response_size_mb`
For this example we set a very low limit on `max_request_size_mb` and expect it to get rejected
:::info
In production we recommend setting a `max_request_size_mb` / `max_response_size_mb` around `32 MB`
:::
```yaml
model_list:
- model_name: fake-openai-endpoint
litellm_params:
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
general_settings:
master_key: sk-1234
# Security controls
max_request_size_mb: 0.000000001 # 👈 Key Change - Max Request Size in MB. Set this very low for testing
max_response_size_mb: 100 # 👈 Key Change - Max Response Size in MB
```
**Step 2.** Test it with `/chat/completions` request
```shell
curl http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"model": "fake-openai-endpoint",
"messages": [
{"role": "user", "content": "Hello, Claude!"}
]
}'
```
**Expected Response from request**
We expect this to fail since the request size is over `max_request_size_mb`
```shell
{"error":{"message":"Request size is too large. Request size is 0.0001125335693359375 MB. Max size is 1e-09 MB","type":"bad_request_error","param":"content-length","code":400}}
```
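
To reason about what counts against the limit, the proxy compares the request body size (in MB) to `max_request_size_mb`. A small illustrative calculation (not LiteLLM's internal code):

```python
# Rough illustration of the size check: payload bytes -> MB vs. max_request_size_mb.
import json

payload = {
    "model": "fake-openai-endpoint",
    "messages": [{"role": "user", "content": "Hello, Claude!"}],
}

body = json.dumps(payload).encode("utf-8")
request_size_mb = len(body) / (1024 * 1024)
max_request_size_mb = 0.000000001  # the deliberately tiny limit from the config above

print(f"Request size: {request_size_mb} MB, limit: {max_request_size_mb} MB")
print("Rejected" if request_size_mb > max_request_size_mb else "Accepted")
```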
@ -1,18 +1,10 @@
import Tabs from '@theme/Tabs'; import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem'; import TabItem from '@theme/TabItem';
# 🛡️ [Beta] Guardrails
Setup Prompt Injection Detection, Secret Detection on LiteLLM Proxy Setup Prompt Injection Detection, Secret Detection on LiteLLM Proxy
:::info
✨ Enterprise Only Feature
Schedule a meeting with us to get an Enterprise License 👉 Talk to founders [here](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat)
:::
## Quick Start ## Quick Start
### 1. Setup guardrails on litellm proxy config.yaml ### 1. Setup guardrails on litellm proxy config.yaml
@ -217,12 +209,12 @@ If you need to switch `pii_masking` off for an API Key set `"permissions": {"pii
<TabItem value="/key/generate" label="/key/generate"> <TabItem value="/key/generate" label="/key/generate">
```shell ```shell
curl -X POST 'http://0.0.0.0:4000/key/generate' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-d '{
    "permissions": {"pii_masking": true}
}'
``` ```
```shell ```shell
@ -266,6 +258,54 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
}' }'
``` ```
## Disable team from turning on/off guardrails
### 1. Disable team from modifying guardrails
```bash
curl -X POST 'http://0.0.0.0:4000/team/update' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-d '{
"team_id": "4198d93c-d375-4c83-8d5a-71e7c5473e50",
"metadata": {"guardrails": {"modify_guardrails": false}}
}'
```
### 2. Try to disable guardrails for a call
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer $LITELLM_VIRTUAL_KEY' \
--data '{
"model": "gpt-3.5-turbo",
"messages": [
{
"role": "user",
"content": "Think of 10 random colors."
}
],
"metadata": {"guardrails": {"hide_secrets": false}}
}'
```
### 3. Get 403 Error
```
{
"error": {
"message": {
"error": "Your team does not have permission to modify guardrails."
},
"type": "auth_error",
"param": "None",
"code": 403
}
}
```
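
From a Python client, the same attempt surfaces as a 403. A sketch using the OpenAI SDK, assuming a virtual key (placeholder value below) that belongs to the restricted team:

```python
# Try to switch a guardrail off via metadata; expect a 403 since the team is restricted.
import openai

client = openai.OpenAI(
    api_key="sk-litellm-virtual-key",   # placeholder: a key belonging to the restricted team
    base_url="http://0.0.0.0:4000",
)

try:
    client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "Think of 10 random colors."}],
        extra_body={"metadata": {"guardrails": {"hide_secrets": False}}},
    )
except openai.PermissionDeniedError as e:
    print("Team is not allowed to modify guardrails:", e)
```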
Expect to NOT see `+1 412-612-9992` in your server logs on your callback. Expect to NOT see `+1 412-612-9992` in your server logs on your callback.
:::info :::info
@ -277,28 +317,39 @@ The `pii_masking` guardrail ran on this request because api key=sk-jNm1Zar7XfNdZ
## Spec for `guardrails` on litellm config ## Spec for `guardrails` on litellm config
```yaml
litellm_settings:
guardrails:
- string: GuardrailItemSpec
```
- `string` - Your custom guardrail name
- `GuardrailItemSpec`:
- `callbacks`: List[str], list of supported guardrail callbacks.
- Full List: presidio, lakera_prompt_injection, hide_secrets, llmguard_moderations, llamaguard_moderations, google_text_moderation
- `default_on`: bool, will run on all llm requests when true
- `logging_only`: Optional[bool], if true, run guardrail only on logged output, not on the actual LLM API call. Currently only supported for presidio pii masking. Requires `default_on` to be True as well.
- `callback_args`: Optional[Dict[str, Dict]]: If set, pass in init args for that specific guardrail
Example:
```yaml ```yaml
litellm_settings: litellm_settings:
guardrails: guardrails:
- prompt_injection: # your custom name for guardrail - prompt_injection: # your custom name for guardrail
callbacks: [lakera_prompt_injection, hide_secrets, llmguard_moderations, llamaguard_moderations, google_text_moderation] # litellm callbacks to use callbacks: [lakera_prompt_injection, hide_secrets, llmguard_moderations, llamaguard_moderations, google_text_moderation] # litellm callbacks to use
default_on: true # will run on all llm requests when true default_on: true # will run on all llm requests when true
callback_args: {"lakera_prompt_injection": {"moderation_check": "pre_call"}}
- hide_secrets: - hide_secrets:
callbacks: [hide_secrets] callbacks: [hide_secrets]
default_on: true default_on: true
- pii_masking:
      callbacks: ["presidio"]
default_on: true
logging_only: true
- your-custom-guardrail - your-custom-guardrail
callbacks: [hide_secrets] callbacks: [hide_secrets]
default_on: false default_on: false
``` ```
### `guardrails`: List of guardrail configurations to be applied to LLM requests.
#### Guardrail: `prompt_injection`: Configuration for detecting and preventing prompt injection attacks.
- `callbacks`: List of LiteLLM callbacks used for this guardrail. [Can be one of `[lakera_prompt_injection, hide_secrets, presidio, llmguard_moderations, llamaguard_moderations, google_text_moderation]`](enterprise#content-moderation)
- `default_on`: Boolean flag determining if this guardrail runs on all LLM requests by default.
#### Guardrail: `your-custom-guardrail`: Configuration for a user-defined custom guardrail.
- `callbacks`: List of callbacks for this custom guardrail. Can be one of `[lakera_prompt_injection, hide_secrets, presidio, llmguard_moderations, llamaguard_moderations, google_text_moderation]`
- `default_on`: Boolean flag determining if this custom guardrail runs by default, set to false.
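
If it helps to see the spec as code, here is a rough Pydantic model of the `GuardrailItemSpec` fields described above - an illustration of the shape of each entry, not LiteLLM's internal class:

```python
# Illustrative model of one guardrail entry in litellm_settings.guardrails.
from typing import Dict, List, Optional
from pydantic import BaseModel

class GuardrailItemSpec(BaseModel):
    callbacks: List[str]                             # e.g. ["lakera_prompt_injection", "hide_secrets"]
    default_on: bool = False                         # run on every LLM request when True
    logging_only: Optional[bool] = None              # presidio only: mask logged output, not the LLM call
    callback_args: Optional[Dict[str, dict]] = None  # per-callback init args

# Example: the `prompt_injection` guardrail from the config above
prompt_injection = GuardrailItemSpec(
    callbacks=["lakera_prompt_injection", "hide_secrets"],
    default_on=True,
    callback_args={"lakera_prompt_injection": {"moderation_check": "pre_call"}},
)
print(prompt_injection.model_dump())
```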
@ -41,28 +41,6 @@ litellm --health
} }
``` ```
### Background Health Checks
You can enable model health checks being run in the background, to prevent each model from being queried too frequently via `/health`.
Here's how to use it:
1. in the config.yaml add:
```
general_settings:
background_health_checks: True # enable background health checks
health_check_interval: 300 # frequency of background health checks
```
2. Start server
```
$ litellm /path/to/config.yaml
```
3. Query health endpoint:
```
curl --location 'http://0.0.0.0:4000/health'
```
### Embedding Models ### Embedding Models
We need some way to know if the model is an embedding model when running checks, if you have this in your config, specifying mode it makes an embedding health check We need some way to know if the model is an embedding model when running checks, if you have this in your config, specifying mode it makes an embedding health check
@ -112,6 +90,66 @@ model_list:
mode: completion # 👈 ADD THIS mode: completion # 👈 ADD THIS
``` ```
### Speech to Text Models
```yaml
model_list:
- model_name: whisper
litellm_params:
model: whisper-1
api_key: os.environ/OPENAI_API_KEY
model_info:
mode: audio_transcription
```
### Text to Speech Models
```yaml
# OpenAI Text to Speech Models
- model_name: tts
litellm_params:
model: openai/tts-1
api_key: "os.environ/OPENAI_API_KEY"
model_info:
mode: audio_speech
```
## Background Health Checks
You can enable model health checks being run in the background, to prevent each model from being queried too frequently via `/health`.
Here's how to use it:
1. in the config.yaml add:
```
general_settings:
background_health_checks: True # enable background health checks
health_check_interval: 300 # frequency of background health checks
```
2. Start server
```
$ litellm /path/to/config.yaml
```
3. Query health endpoint:
```
curl --location 'http://0.0.0.0:4000/health'
```
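
With background health checks enabled, `/health` returns the cached results. A small sketch that polls the endpoint and prints unhealthy deployments - it assumes the proxy runs locally with master key `sk-1234`, and the exact response fields may vary by version:

```python
# Query the cached /health results and print any unhealthy deployments.
import requests

resp = requests.get(
    "http://0.0.0.0:4000/health",
    headers={"Authorization": "Bearer sk-1234"},
    timeout=30,
)
resp.raise_for_status()
report = resp.json()

print("healthy:", report.get("healthy_count"), "unhealthy:", report.get("unhealthy_count"))
for endpoint in report.get("unhealthy_endpoints", []):
    print("unhealthy deployment:", endpoint)
```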
### Hide details
The health check response contains details like endpoint URLs, error messages,
and other LiteLLM params. While this is useful for debugging, it can be
problematic when exposing the proxy server to a broad audience.
You can hide these details by setting the `health_check_details` setting to `False`.
```yaml
general_settings:
health_check_details: False
```
## `/health/readiness` ## `/health/readiness`
Unprotected endpoint for checking if proxy is ready to accept requests Unprotected endpoint for checking if proxy is ready to accept requests
@ -1,28 +1,68 @@
# Logging
Log Proxy input, output, and exceptions using:
- Langfuse
- OpenTelemetry
- Custom Callbacks
- Langsmith
- DataDog
- DynamoDB
- etc.
import Image from '@theme/IdealImage'; import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs'; import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem'; import TabItem from '@theme/TabItem';
## Getting the LiteLLM Call ID
LiteLLM generates a unique `call_id` for each request. This `call_id` can be used to track the request across the system. This can be very useful for finding the info for a particular request in a logging system like one of the systems mentioned in this page.

```shell
curl -i -sSL --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{
    "model": "gpt-3.5-turbo",
    "messages": [{"role": "user", "content": "what llm are you"}]
}' | grep 'x-litellm'
```

The output of this is:

```output
x-litellm-call-id: b980db26-9512-45cc-b1da-c511a363b83f
x-litellm-model-id: cb41bc03f4c33d310019bae8c5afdb1af0a8f97b36a234405a9807614988457c
x-litellm-model-api-base: https://x-example-1234.openai.azure.com
x-litellm-version: 1.40.21
x-litellm-response-cost: 2.85e-05
x-litellm-key-tpm-limit: None
x-litellm-key-rpm-limit: None
```

A number of these headers could be useful for troubleshooting, but the `x-litellm-call-id` is the one that is most useful for tracking a request across components in your system, including in logging tools.
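
You can also read these headers from Python by asking the OpenAI SDK for the raw response. A minimal sketch, assuming the proxy is running locally:

```python
# Capture the x-litellm-call-id header for a request made through the proxy.
import openai

client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")

raw = client.chat.completions.with_raw_response.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "what llm are you"}],
)

print("x-litellm-call-id:", raw.headers.get("x-litellm-call-id"))
completion = raw.parse()  # the usual ChatCompletion object
print(completion.choices[0].message.content)
```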
## Redacting UserAPIKeyInfo
Redact information about the user api key (hashed token, user_id, team id, etc.), from logs.
Currently supported for Langfuse, OpenTelemetry, Logfire, ArizeAI logging.
```yaml
litellm_settings:
callbacks: ["langfuse"]
redact_user_api_key_info: true
```
Removes any field with `user_api_key_*` from metadata.
## Logging Proxy Input/Output - Langfuse ## Logging Proxy Input/Output - Langfuse
We will use the `--config` to set `litellm.success_callback = ["langfuse"]` - this will log all successful LLM calls to Langfuse. Make sure to set `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY` in your environment
**Step 1** Install langfuse **Step 1** Install langfuse
@ -32,6 +72,7 @@ pip install langfuse>=2.0.0
``` ```
**Step 2**: Create a `config.yaml` file and set `litellm_settings`: `success_callback` **Step 2**: Create a `config.yaml` file and set `litellm_settings`: `success_callback`
```yaml ```yaml
model_list: model_list:
- model_name: gpt-3.5-turbo - model_name: gpt-3.5-turbo
@ -42,6 +83,7 @@ litellm_settings:
``` ```
**Step 3**: Set required env variables for logging to langfuse **Step 3**: Set required env variables for logging to langfuse
```shell ```shell
export LANGFUSE_PUBLIC_KEY="pk_kk" export LANGFUSE_PUBLIC_KEY="pk_kk"
export LANGFUSE_SECRET_KEY="sk_ss" export LANGFUSE_SECRET_KEY="sk_ss"
@ -52,11 +94,13 @@ export LANGFUSE_HOST="https://xxx.langfuse.com"
**Step 4**: Start the proxy, make a test request **Step 4**: Start the proxy, make a test request
Start proxy Start proxy
```shell ```shell
litellm --config config.yaml --debug litellm --config config.yaml --debug
``` ```
Test Request Test Request
``` ```
litellm --test litellm --test
``` ```
@ -67,7 +111,6 @@ Expected output on Langfuse
### Logging Metadata to Langfuse ### Logging Metadata to Langfuse
<Tabs> <Tabs>
<TabItem value="Curl" label="Curl Request"> <TabItem value="Curl" label="Curl Request">
@ -93,6 +136,7 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
} }
}' }'
``` ```
</TabItem> </TabItem>
<TabItem value="openai" label="OpenAI v1.0.0+"> <TabItem value="openai" label="OpenAI v1.0.0+">
@ -126,6 +170,7 @@ response = client.chat.completions.create(
print(response) print(response)
``` ```
</TabItem> </TabItem>
<TabItem value="langchain" label="Langchain"> <TabItem value="langchain" label="Langchain">
@ -168,9 +213,11 @@ print(response)
</TabItem> </TabItem>
</Tabs> </Tabs>
### Team based Logging to Langfuse ### Team based Logging to Langfuse
[👉 Tutorial - Allow each team to use their own Langfuse Project / custom callbacks](team_logging)
<!--
**Example:** **Example:**
This config would send langfuse logs to 2 different langfuse projects, based on the team id This config would send langfuse logs to 2 different langfuse projects, based on the team id
@ -197,7 +244,7 @@ curl -X POST 'http://0.0.0.0:4000/key/generate' \
-d '{"team_id": "ishaans-secret-project"}' -d '{"team_id": "ishaans-secret-project"}'
``` ```
All requests made with these keys will log data to their team-specific logging. All requests made with these keys will log data to their team-specific logging. -->
### Redacting Messages, Response Content from Langfuse Logging ### Redacting Messages, Response Content from Langfuse Logging
@ -231,6 +278,42 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
}' }'
``` ```
### LiteLLM-specific Tags on Langfuse - `cache_hit`, `cache_key`
Use this if you want to control which LiteLLM-specific fields are logged as tags by the LiteLLM proxy. By default, the LiteLLM Proxy logs no LiteLLM-specific fields.
| LiteLLM specific field | Description | Example Value |
|------------------------|-------------------------------------------------------|------------------------------------------------|
| `cache_hit` | Indicates whether a cache hit occurred (True) or not (False) | `true`, `false` |
| `cache_key` | The Cache key used for this request | `d2b758c****`|
| `proxy_base_url` | The base URL for the proxy server, the value of env var `PROXY_BASE_URL` on your server | `https://proxy.example.com`|
| `user_api_key_alias` | An alias for the LiteLLM Virtual Key.| `prod-app1` |
| `user_api_key_user_id` | The unique ID associated with a user's API key. | `user_123`, `user_456` |
| `user_api_key_user_email` | The email associated with a user's API key. | `user@example.com`, `admin@example.com` |
| `user_api_key_team_alias` | An alias for a team associated with an API key. | `team_alpha`, `dev_team` |
**Usage**
Specify `langfuse_default_tags` to control what litellm fields get logged on Langfuse
Example config.yaml
```yaml
model_list:
- model_name: gpt-4
litellm_params:
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
litellm_settings:
success_callback: ["langfuse"]
# 👇 Key Change
langfuse_default_tags: ["cache_hit", "cache_key", "proxy_base_url", "user_api_key_alias", "user_api_key_user_id", "user_api_key_user_email", "user_api_key_team_alias", "semantic-similarity", "proxy_base_url"]
```
### 🔧 Debugging - Viewing RAW CURL sent from LiteLLM to provider ### 🔧 Debugging - Viewing RAW CURL sent from LiteLLM to provider
Use this when you want to view the RAW curl request sent from LiteLLM to the LLM API Use this when you want to view the RAW curl request sent from LiteLLM to the LLM API
@ -257,6 +340,7 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
} }
}' }'
``` ```
</TabItem> </TabItem>
<TabItem value="openai" label="OpenAI v1.0.0+"> <TabItem value="openai" label="OpenAI v1.0.0+">
@ -287,6 +371,7 @@ response = client.chat.completions.create(
print(response) print(response)
``` ```
</TabItem> </TabItem>
<TabItem value="langchain" label="Langchain"> <TabItem value="langchain" label="Langchain">
@ -332,7 +417,6 @@ You will see `raw_request` in your Langfuse Metadata. This is the RAW CURL comma
<Image img={require('../../img/debug_langfuse.png')} /> <Image img={require('../../img/debug_langfuse.png')} />
## Logging Proxy Input/Output in OpenTelemetry format ## Logging Proxy Input/Output in OpenTelemetry format
:::info :::info
@ -348,10 +432,8 @@ OTEL_SERVICE_NAME=<your-service-name>` # default="litellm"
<Tabs> <Tabs>
<TabItem value="Console Exporter" label="Log to console"> <TabItem value="Console Exporter" label="Log to console">
**Step 1:** Set callbacks and env vars **Step 1:** Set callbacks and env vars
Add the following to your env Add the following to your env
@ -367,7 +449,6 @@ litellm_settings:
callbacks: ["otel"] callbacks: ["otel"]
``` ```
**Step 2**: Start the proxy, make a test request **Step 2**: Start the proxy, make a test request
Start proxy Start proxy
@ -427,7 +508,6 @@ This is the Span from OTEL Logging
</TabItem> </TabItem>
<TabItem value="Honeycomb" label="Log to Honeycomb"> <TabItem value="Honeycomb" label="Log to Honeycomb">
#### Quick Start - Log to Honeycomb #### Quick Start - Log to Honeycomb
@ -449,7 +529,6 @@ litellm_settings:
callbacks: ["otel"] callbacks: ["otel"]
``` ```
**Step 2**: Start the proxy, make a test request **Step 2**: Start the proxy, make a test request
Start proxy Start proxy
@ -474,10 +553,8 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
}' }'
``` ```
</TabItem> </TabItem>
<TabItem value="otel-col" label="Log to OTEL HTTP Collector"> <TabItem value="otel-col" label="Log to OTEL HTTP Collector">
#### Quick Start - Log to OTEL Collector #### Quick Start - Log to OTEL Collector
@ -499,7 +576,6 @@ litellm_settings:
callbacks: ["otel"] callbacks: ["otel"]
``` ```
**Step 2**: Start the proxy, make a test request **Step 2**: Start the proxy, make a test request
Start proxy Start proxy
@ -526,7 +602,6 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
</TabItem> </TabItem>
<TabItem value="otel-col-grpc" label="Log to OTEL GRPC Collector"> <TabItem value="otel-col-grpc" label="Log to OTEL GRPC Collector">
#### Quick Start - Log to OTEL GRPC Collector #### Quick Start - Log to OTEL GRPC Collector
@ -548,7 +623,6 @@ litellm_settings:
callbacks: ["otel"] callbacks: ["otel"]
``` ```
**Step 2**: Start the proxy, make a test request **Step 2**: Start the proxy, make a test request
Start proxy Start proxy
@ -573,7 +647,6 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
}' }'
``` ```
</TabItem> </TabItem>
<TabItem value="traceloop" label="Log to Traceloop Cloud"> <TabItem value="traceloop" label="Log to Traceloop Cloud">
@ -596,7 +669,6 @@ environment_variables:
TRACELOOP_API_KEY: "XXXXX" TRACELOOP_API_KEY: "XXXXX"
``` ```
**Step 3**: Start the proxy, make a test request **Step 3**: Start the proxy, make a test request
Start proxy Start proxy
@ -632,11 +704,15 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
❓ Use this when you want to **pass information about the incoming request in a distributed tracing system** ❓ Use this when you want to **pass information about the incoming request in a distributed tracing system**
✅ Key change: Pass the **`traceparent` header** in your requests. [Read more about traceparent headers here](https://uptrace.dev/opentelemetry/opentelemetry-traceparent.html#what-is-traceparent-header) ✅ Key change: Pass the **`traceparent` header** in your requests. [Read more about traceparent headers here](https://uptrace.dev/opentelemetry/opentelemetry-traceparent.html#what-is-traceparent-header)
```curl ```curl
traceparent: 00-80e1afed08e019fc1110464cfa66635c-7a085853722dc6d2-01 traceparent: 00-80e1afed08e019fc1110464cfa66635c-7a085853722dc6d2-01
``` ```
Example Usage Example Usage
1. Make Request to LiteLLM Proxy with `traceparent` header 1. Make Request to LiteLLM Proxy with `traceparent` header
```python ```python
import openai import openai
import uuid import uuid
@ -660,7 +736,6 @@ response = client.chat.completions.create(
) )
print(response) print(response)
``` ```
```shell ```shell
@ -674,12 +749,29 @@ Search for Trace=`80e1afed08e019fc1110464cfa66635c` on your OTEL Collector
<Image img={require('../../img/otel_parent.png')} /> <Image img={require('../../img/otel_parent.png')} />
### Forwarding `Traceparent HTTP Header` to LLM APIs
Use this if you want to forward the traceparent headers to your self hosted LLMs like vLLM
Set `forward_traceparent_to_llm_provider: True` in your `config.yaml`. This will forward the `traceparent` header to your LLM API
:::warning
Only use this for self hosted LLMs, this can cause Bedrock, VertexAI calls to fail
:::
```yaml
litellm_settings:
forward_traceparent_to_llm_provider: True
```
## Custom Callback Class [Async] ## Custom Callback Class [Async]
Use this when you want to run custom callbacks in `python` Use this when you want to run custom callbacks in `python`
#### Step 1 - Create your custom `litellm` callback class #### Step 1 - Create your custom `litellm` callback class
We use `litellm.integrations.custom_logger` for this, **more details about litellm custom callbacks [here](https://docs.litellm.ai/docs/observability/custom_callback)** We use `litellm.integrations.custom_logger` for this, **more details about litellm custom callbacks [here](https://docs.litellm.ai/docs/observability/custom_callback)**
Define your custom callback class in a python file. Define your custom callback class in a python file.
@ -782,16 +874,17 @@ proxy_handler_instance = MyCustomHandler()
``` ```
#### Step 2 - Pass your custom callback class in `config.yaml` #### Step 2 - Pass your custom callback class in `config.yaml`
We pass the custom callback class defined in **Step1** to the config.yaml. We pass the custom callback class defined in **Step1** to the config.yaml.
Set `callbacks` to `python_filename.logger_instance_name` Set `callbacks` to `python_filename.logger_instance_name`
In the config below, we pass In the config below, we pass
- python_filename: `custom_callbacks.py` - python_filename: `custom_callbacks.py`
- logger_instance_name: `proxy_handler_instance`. This is defined in Step 1 - logger_instance_name: `proxy_handler_instance`. This is defined in Step 1
`callbacks: custom_callbacks.proxy_handler_instance` `callbacks: custom_callbacks.proxy_handler_instance`
```yaml ```yaml
model_list: model_list:
- model_name: gpt-3.5-turbo - model_name: gpt-3.5-turbo
@ -804,6 +897,7 @@ litellm_settings:
``` ```
#### Step 3 - Start proxy + test request #### Step 3 - Start proxy + test request
```shell ```shell
litellm --config proxy_config.yaml litellm --config proxy_config.yaml
``` ```
@ -825,6 +919,7 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
``` ```
#### Resulting Log on Proxy #### Resulting Log on Proxy
```shell ```shell
On Success On Success
Model: gpt-3.5-turbo, Model: gpt-3.5-turbo,
@ -877,7 +972,6 @@ class MyCustomHandler(CustomLogger):
"max_tokens": 10 "max_tokens": 10
} }
} }
``` ```
#### Logging `model_info` set in config.yaml #### Logging `model_info` set in config.yaml
@ -895,11 +989,13 @@ class MyCustomHandler(CustomLogger):
``` ```
**Expected Output** **Expected Output**
```json ```json
{'mode': 'embedding', 'input_cost_per_token': 0.002} {'mode': 'embedding', 'input_cost_per_token': 0.002}
``` ```
### Logging responses from proxy ### Logging responses from proxy
Both `/chat/completions` and `/embeddings` responses are available as `response_obj` Both `/chat/completions` and `/embeddings` responses are available as `response_obj`
**Note: for `/chat/completions`, both `stream=True` and `non stream` responses are available as `response_obj`** **Note: for `/chat/completions`, both `stream=True` and `non stream` responses are available as `response_obj`**
@ -913,6 +1009,7 @@ class MyCustomHandler(CustomLogger):
``` ```
**Expected Output /chat/completion [for both `stream` and `non-stream` responses]** **Expected Output /chat/completion [for both `stream` and `non-stream` responses]**
```json ```json
ModelResponse( ModelResponse(
id='chatcmpl-8Tfu8GoMElwOZuj2JlHBhNHG01PPo', id='chatcmpl-8Tfu8GoMElwOZuj2JlHBhNHG01PPo',
@ -939,6 +1036,7 @@ ModelResponse(
``` ```
**Expected Output /embeddings** **Expected Output /embeddings**
```json ```json
{ {
'model': 'ada', 'model': 'ada',
@ -958,7 +1056,6 @@ ModelResponse(
} }
``` ```
## Custom Callback APIs [Async] ## Custom Callback APIs [Async]
:::info :::info
@ -968,10 +1065,12 @@ This is an Enterprise only feature [Get Started with Enterprise here](https://gi
::: :::
Use this if you: Use this if you:
- Want to use custom callbacks written in a non Python programming language - Want to use custom callbacks written in a non Python programming language
- Want your callbacks to run on a different microservice - Want your callbacks to run on a different microservice
#### Step 1. Create your generic logging API endpoint #### Step 1. Create your generic logging API endpoint
Set up a generic API endpoint that can receive data in JSON format. The data will be included within a "data" field. Set up a generic API endpoint that can receive data in JSON format. The data will be included within a "data" field.
Your server should support the following Request format: Your server should support the following Request format:
@ -1034,11 +1133,8 @@ async def log_event(request: Request):
if __name__ == "__main__": if __name__ == "__main__":
import uvicorn import uvicorn
uvicorn.run(app, host="127.0.0.1", port=4000) uvicorn.run(app, host="127.0.0.1", port=4000)
``` ```
#### Step 2. Set your `GENERIC_LOGGER_ENDPOINT` to the endpoint + route we should send callback logs to #### Step 2. Set your `GENERIC_LOGGER_ENDPOINT` to the endpoint + route we should send callback logs to
```shell ```shell
@ -1048,6 +1144,7 @@ os.environ["GENERIC_LOGGER_ENDPOINT"] = "http://localhost:4000/log-event"
#### Step 3. Create a `config.yaml` file and set `litellm_settings`: `success_callback` = ["generic"] #### Step 3. Create a `config.yaml` file and set `litellm_settings`: `success_callback` = ["generic"]
Example litellm proxy config.yaml Example litellm proxy config.yaml
```yaml ```yaml
model_list: model_list:
- model_name: gpt-3.5-turbo - model_name: gpt-3.5-turbo
@ -1059,8 +1156,98 @@ litellm_settings:
Start the LiteLLM Proxy and make a test request to verify the logs reached your callback API Start the LiteLLM Proxy and make a test request to verify the logs reached your callback API
## Logging LLM IO to Langsmith
1. Set `success_callback: ["langsmith"]` on litellm config.yaml
If you're using a custom LangSmith instance, you can set the
`LANGSMITH_BASE_URL` environment variable to point to your instance.
```yaml
litellm_settings:
success_callback: ["langsmith"]
environment_variables:
LANGSMITH_API_KEY: "lsv2_pt_xxxxxxxx"
LANGSMITH_PROJECT: "litellm-proxy"
LANGSMITH_BASE_URL: "https://api.smith.langchain.com" # (Optional - only needed if you have a custom Langsmith instance)
```
2. Start Proxy
```
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "fake-openai-endpoint",
"messages": [
{
"role": "user",
"content": "Hello, Claude gm!"
}
],
}
'
```
Expect to see your log on Langsmith
<Image img={require('../../img/langsmith_new.png')} />
## Logging LLM IO to Arize AI
1. Set `success_callback: ["arize"]` on litellm config.yaml
```yaml
model_list:
- model_name: gpt-4
litellm_params:
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
litellm_settings:
callbacks: ["arize"]
environment_variables:
ARIZE_SPACE_KEY: "d0*****"
ARIZE_API_KEY: "141a****"
```
2. Start Proxy
```
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "fake-openai-endpoint",
"messages": [
{
"role": "user",
"content": "Hello, Claude gm!"
}
],
}
'
```
Expect to see your log on Arize AI
<Image img={require('../../img/langsmith_new.png')} />
## Logging LLM IO to Galileo ## Logging LLM IO to Galileo
[BETA] [BETA]
Log LLM I/O on [www.rungalileo.io](https://www.rungalileo.io/) Log LLM I/O on [www.rungalileo.io](https://www.rungalileo.io/)
@ -1083,6 +1270,7 @@ export GALILEO_PASSWORD=""
### Quick Start ### Quick Start
1. Add to Config.yaml 1. Add to Config.yaml
```yaml ```yaml
model_list: model_list:
- litellm_params: - litellm_params:
@ -1118,7 +1306,6 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
' '
``` ```
🎉 That's it - Expect to see your Logs on your Galileo Dashboard 🎉 That's it - Expect to see your Logs on your Galileo Dashboard
## Logging Proxy Cost + Usage - OpenMeter ## Logging Proxy Cost + Usage - OpenMeter
@ -1136,6 +1323,7 @@ export OPENMETER_API_KEY=""
### Quick Start ### Quick Start
1. Add to Config.yaml 1. Add to Config.yaml
```yaml ```yaml
model_list: model_list:
- litellm_params: - litellm_params:
@ -1171,13 +1359,14 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
' '
``` ```
<Image img={require('../../img/openmeter_img_2.png')} /> <Image img={require('../../img/openmeter_img_2.png')} />
## Logging Proxy Input/Output - DataDog ## Logging Proxy Input/Output - DataDog
We will use the `--config` to set `litellm.success_callback = ["datadog"]` - this will log all successful LLM calls to DataDog
**Step 1**: Create a `config.yaml` file and set `litellm_settings`: `success_callback` **Step 1**: Create a `config.yaml` file and set `litellm_settings`: `success_callback`
```yaml ```yaml
model_list: model_list:
- model_name: gpt-3.5-turbo - model_name: gpt-3.5-turbo
@ -1197,6 +1386,7 @@ DD_SITE="us5.datadoghq.com" # your datadog base url
**Step 3**: Start the proxy, make a test request **Step 3**: Start the proxy, make a test request
Start proxy Start proxy
```shell ```shell
litellm --config config.yaml --debug litellm --config config.yaml --debug
``` ```
@ -1224,66 +1414,10 @@ Expected output on Datadog
<Image img={require('../../img/dd_small1.png')} /> <Image img={require('../../img/dd_small1.png')} />
## Logging Proxy Input/Output - s3 Buckets
We will use the `--config` to set
- `litellm.success_callback = ["s3"]`
This will log all successful LLM calls to the s3 Bucket
**Step 1** Set AWS Credentials in .env
```shell
AWS_ACCESS_KEY_ID = ""
AWS_SECRET_ACCESS_KEY = ""
AWS_REGION_NAME = ""
```
**Step 2**: Create a `config.yaml` file and set `litellm_settings`: `success_callback`
```yaml
model_list:
- model_name: gpt-3.5-turbo
litellm_params:
model: gpt-3.5-turbo
litellm_settings:
success_callback: ["s3"]
s3_callback_params:
s3_bucket_name: logs-bucket-litellm # AWS Bucket Name for S3
s3_region_name: us-west-2 # AWS Region Name for S3
      s3_aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID # use os.environ/<variable name> to pass environment variables. This is AWS Access Key ID for S3
s3_aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY # AWS Secret Access Key for S3
s3_path: my-test-path # [OPTIONAL] set path in bucket you want to write logs to
s3_endpoint_url: https://s3.amazonaws.com # [OPTIONAL] S3 endpoint URL, if you want to use Backblaze/cloudflare s3 buckets
```
**Step 3**: Start the proxy, make a test request
Start proxy
```shell
litellm --config config.yaml --debug
```
Test Request
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "Azure OpenAI GPT-4 East",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}'
```
Your logs should be available on the specified s3 Bucket
## Logging Proxy Input/Output - DynamoDB ## Logging Proxy Input/Output - DynamoDB
We will use the `--config` to set We will use the `--config` to set
- `litellm.success_callback = ["dynamodb"]` - `litellm.success_callback = ["dynamodb"]`
- `litellm.dynamodb_table_name = "your-table-name"` - `litellm.dynamodb_table_name = "your-table-name"`
@ -1298,6 +1432,7 @@ AWS_REGION_NAME = ""
``` ```
**Step 2**: Create a `config.yaml` file and set `litellm_settings`: `success_callback` **Step 2**: Create a `config.yaml` file and set `litellm_settings`: `success_callback`
```yaml ```yaml
model_list: model_list:
- model_name: gpt-3.5-turbo - model_name: gpt-3.5-turbo
@ -1311,11 +1446,13 @@ litellm_settings:
**Step 3**: Start the proxy, make a test request **Step 3**: Start the proxy, make a test request
Start proxy Start proxy
```shell ```shell
litellm --config config.yaml --debug litellm --config config.yaml --debug
``` ```
Test Request Test Request
```shell ```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \ curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \ --header 'Content-Type: application/json' \
@ -1403,19 +1540,18 @@ Your logs should be available on DynamoDB
} }
``` ```
## Logging Proxy Input/Output - Sentry ## Logging Proxy Input/Output - Sentry
If api calls fail (llm/database) you can log those to Sentry: If api calls fail (llm/database) you can log those to Sentry:
**Step 1** Install Sentry **Step 1** Install Sentry
```shell ```shell
pip install --upgrade sentry-sdk pip install --upgrade sentry-sdk
``` ```
**Step 2**: Save your Sentry_DSN and add `litellm_settings`: `failure_callback` **Step 2**: Save your Sentry_DSN and add `litellm_settings`: `failure_callback`
```shell ```shell
export SENTRY_DSN="your-sentry-dsn" export SENTRY_DSN="your-sentry-dsn"
``` ```
@ -1435,11 +1571,13 @@ general_settings:
**Step 3**: Start the proxy, make a test request **Step 3**: Start the proxy, make a test request
Start proxy Start proxy
```shell ```shell
litellm --config config.yaml --debug litellm --config config.yaml --debug
``` ```
Test Request Test Request
``` ```
litellm --test litellm --test
``` ```
@ -1457,6 +1595,7 @@ ATHINA_API_KEY = "your-athina-api-key"
``` ```
**Step 2**: Create a `config.yaml` file and set `litellm_settings`: `success_callback` **Step 2**: Create a `config.yaml` file and set `litellm_settings`: `success_callback`
```yaml ```yaml
model_list: model_list:
- model_name: gpt-3.5-turbo - model_name: gpt-3.5-turbo
@ -1469,11 +1608,13 @@ litellm_settings:
**Step 3**: Start the proxy, make a test request **Step 3**: Start the proxy, make a test request
Start proxy Start proxy
```shell ```shell
litellm --config config.yaml --debug litellm --config config.yaml --debug
``` ```
Test Request Test Request
``` ```
curl --location 'http://0.0.0.0:4000/chat/completions' \ curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \ --header 'Content-Type: application/json' \
@ -1505,6 +1646,7 @@ AZURE_CONTENT_SAFETY_KEY = "<your-azure-content-safety-key>"
``` ```
**Step 2**: Create a `config.yaml` file and set `litellm_settings`: `success_callback` **Step 2**: Create a `config.yaml` file and set `litellm_settings`: `success_callback`
```yaml ```yaml
model_list: model_list:
- model_name: gpt-3.5-turbo - model_name: gpt-3.5-turbo
@ -1520,11 +1662,13 @@ litellm_settings:
**Step 3**: Start the proxy, make a test request **Step 3**: Start the proxy, make a test request
Start proxy Start proxy
```shell ```shell
litellm --config config.yaml --debug litellm --config config.yaml --debug
``` ```
Test Request Test Request
``` ```
curl --location 'http://0.0.0.0:4000/chat/completions' \ curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \ --header 'Content-Type: application/json' \
@ -1540,7 +1684,8 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
``` ```
An HTTP 400 error will be returned if the content is detected with a value greater than the threshold set in the `config.yaml`. An HTTP 400 error will be returned if the content is detected with a value greater than the threshold set in the `config.yaml`.
The details of the response will describe:
- The `source` : input text or llm generated text - The `source` : input text or llm generated text
- The `category` : the category of the content that triggered the moderation - The `category` : the category of the content that triggered the moderation
- The `severity` : the severity from 0 to 10 - The `severity` : the severity from 0 to 10
@ -17,7 +17,7 @@ model_list:
## Get Model Information - `/model/info` ## Get Model Information - `/model/info`
Retrieve detailed information about each model listed in the `/model/info` endpoint, including descriptions from the `config.yaml` file, and additional model info (e.g. max tokens, cost per input token, etc.) pulled from the model_info you set and the [litellm model cost map](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json). Sensitive details like API keys are excluded for security purposes.
<Tabs <Tabs
defaultValue="curl" defaultValue="curl"
@ -35,22 +35,33 @@ curl -X GET "http://0.0.0.0:4000/model/info" \
## Add a New Model ## Add a New Model
Add a new model to the proxy via the `/model/new` API, to add models without restarting the proxy.

<Tabs>
<TabItem value="API">

```bash
curl -X POST "http://0.0.0.0:4000/model/new" \
-H "accept: application/json" \
-H "Content-Type: application/json" \
-d '{ "model_name": "azure-gpt-turbo", "litellm_params": {"model": "azure/gpt-3.5-turbo", "api_key": "os.environ/AZURE_API_KEY", "api_base": "my-azure-api-base"} }'
```

</TabItem>
<TabItem value="Yaml">
```yaml
model_list:
- model_name: gpt-3.5-turbo ### RECEIVED MODEL NAME ### `openai.chat.completions.create(model="gpt-3.5-turbo",...)`
litellm_params: # all params accepted by litellm.completion() - https://github.com/BerriAI/litellm/blob/9b46ec05b02d36d6e4fb5c32321e51e7f56e4a6e/litellm/types/router.py#L297
model: azure/gpt-turbo-small-eu ### MODEL NAME sent to `litellm.completion()` ###
api_base: https://my-endpoint-europe-berri-992.openai.azure.com/
api_key: "os.environ/AZURE_API_KEY_EU" # does os.getenv("AZURE_API_KEY_EU")
rpm: 6 # [OPTIONAL] Rate limit for this deployment: in requests per minute (rpm)
model_info:
my_custom_key: my_custom_value # additional model metadata
```
</TabItem>
</Tabs> </Tabs>
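
The same `/model/new` call from Python - a short sketch using `requests`, assuming your master key is `sk-1234` and with placeholder model parameters:

```python
# Add a model to the running proxy without a restart via /model/new.
import requests

resp = requests.post(
    "http://0.0.0.0:4000/model/new",
    headers={"Authorization": "Bearer sk-1234", "Content-Type": "application/json"},
    json={
        "model_name": "azure-gpt-turbo",
        "litellm_params": {
            "model": "azure/gpt-3.5-turbo",
            "api_key": "os.environ/AZURE_API_KEY",
            "api_base": "my-azure-api-base",
        },
    },
    timeout=30,
)
resp.raise_for_status()
print(resp.json())
```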
@ -86,3 +97,82 @@ Keep in mind that as both endpoints are in [BETA], you may need to visit the ass
- Add a New Model: [Issue #964](https://github.com/BerriAI/litellm/issues/964) - Add a New Model: [Issue #964](https://github.com/BerriAI/litellm/issues/964)
Feedback on the beta endpoints is valuable and helps improve the API for all users. Feedback on the beta endpoints is valuable and helps improve the API for all users.
## Add Additional Model Information
If you want the ability to add a display name, description, and labels for models, just use `model_info:`
```yaml
model_list:
- model_name: "gpt-4"
litellm_params:
model: "gpt-4"
api_key: "os.environ/OPENAI_API_KEY"
model_info: # 👈 KEY CHANGE
my_custom_key: "my_custom_value"
```
### Usage
1. Add additional information to model
```yaml
model_list:
- model_name: "gpt-4"
litellm_params:
model: "gpt-4"
api_key: "os.environ/OPENAI_API_KEY"
model_info: # 👈 KEY CHANGE
my_custom_key: "my_custom_value"
```
2. Call with `/model/info`
Use a key with access to the model `gpt-4`.
```bash
curl -L -X GET 'http://0.0.0.0:4000/v1/model/info' \
-H 'Authorization: Bearer LITELLM_KEY' \
```
3. **Expected Response**
Returned `model_info = Your custom model_info + (if exists) LITELLM MODEL INFO`
[**How LiteLLM Model Info is found**](https://github.com/BerriAI/litellm/blob/9b46ec05b02d36d6e4fb5c32321e51e7f56e4a6e/litellm/proxy/proxy_server.py#L7460)
[Tell us how this can be improved!](https://github.com/BerriAI/litellm/issues)
```bash
{
"data": [
{
"model_name": "gpt-4",
"litellm_params": {
"model": "gpt-4"
},
"model_info": {
"id": "e889baacd17f591cce4c63639275ba5e8dc60765d6c553e6ee5a504b19e50ddc",
"db_model": false,
"my_custom_key": "my_custom_value", # 👈 CUSTOM INFO
"key": "gpt-4", # 👈 KEY in LiteLLM MODEL INFO/COST MAP - https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json
"max_tokens": 4096,
"max_input_tokens": 8192,
"max_output_tokens": 4096,
"input_cost_per_token": 3e-05,
"input_cost_per_character": null,
"input_cost_per_token_above_128k_tokens": null,
"output_cost_per_token": 6e-05,
"output_cost_per_character": null,
"output_cost_per_token_above_128k_tokens": null,
"output_cost_per_character_above_128k_tokens": null,
"output_vector_size": null,
"litellm_provider": "openai",
"mode": "chat"
}
},
]
}
```
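
To pull the custom metadata back out programmatically, you can call `/model/info` and read each model's `my_custom_key`. A sketch assuming a key (placeholder below) with access to `gpt-4`:

```python
# Read custom model_info fields back from the proxy.
import requests

resp = requests.get(
    "http://0.0.0.0:4000/v1/model/info",
    headers={"Authorization": "Bearer sk-1234"},  # placeholder LiteLLM key
    timeout=30,
)
resp.raise_for_status()

for entry in resp.json()["data"]:
    info = entry.get("model_info", {})
    print(entry["model_name"], "->", info.get("my_custom_key"))
```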
@ -1,4 +1,4 @@
# Attribute Management changes to Users # Attribute Management changes to Users
Call management endpoints on behalf of a user. (Useful when connecting proxy to your development platform). Call management endpoints on behalf of a user. (Useful when connecting proxy to your development platform).
@ -0,0 +1,63 @@
# OAuth 2.0 Authentication

Use this if you want to use an OAuth 2.0 token to make `/chat`, `/embeddings` requests to the LiteLLM Proxy
:::info
This is an Enterprise Feature - [get in touch with us if you want a free trial to test if this feature meets your needs](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat)
:::
## Usage
1. Set env vars:
```bash
export OAUTH_TOKEN_INFO_ENDPOINT="https://your-provider.com/token/info"
export OAUTH_USER_ID_FIELD_NAME="sub"
export OAUTH_USER_ROLE_FIELD_NAME="role"
export OAUTH_USER_TEAM_ID_FIELD_NAME="team_id"
```
- `OAUTH_TOKEN_INFO_ENDPOINT`: URL to validate OAuth tokens
- `OAUTH_USER_ID_FIELD_NAME`: Field in token info response containing user ID
- `OAUTH_USER_ROLE_FIELD_NAME`: Field in token info for user's role
- `OAUTH_USER_TEAM_ID_FIELD_NAME`: Field in token info for user's team ID
2. Enable on litellm config.yaml
Set this on your config.yaml
```yaml
model_list:
- model_name: gpt-4
litellm_params:
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
general_settings:
master_key: sk-1234
enable_oauth2_auth: true
```
3. Use token in requests to LiteLLM
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data '{
"model": "gpt-3.5-turbo",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}'
```
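
From Python, the OAuth token simply takes the place of a LiteLLM key. A minimal sketch with the OpenAI SDK - the token value is a placeholder for an access token issued by your provider:

```python
# Use an OAuth 2.0 access token as the bearer credential for the proxy.
import openai

client = openai.OpenAI(
    api_key="eyJhbGciOi...your-oauth2-access-token",  # placeholder OAuth 2.0 token
    base_url="http://0.0.0.0:4000",
)

response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "what llm are you"}],
)
print(response.choices[0].message.content)
```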
## Debugging
Start the LiteLLM Proxy with [`--detailed_debug` mode and you should see more verbose logs](cli.md#detailed_debug)
@ -35,6 +35,7 @@ general_settings:
Authorization: "bearer os.environ/COHERE_API_KEY" # (Optional) Auth Header to forward to your Endpoint Authorization: "bearer os.environ/COHERE_API_KEY" # (Optional) Auth Header to forward to your Endpoint
content-type: application/json # (Optional) Extra Headers to pass to this endpoint content-type: application/json # (Optional) Extra Headers to pass to this endpoint
accept: application/json accept: application/json
forward_headers: True # (Optional) Forward all headers from the incoming request to the target endpoint
``` ```
**Step 2** Start Proxy Server in detailed_debug mode **Step 2** Start Proxy Server in detailed_debug mode
@ -156,6 +157,8 @@ POST /api/public/ingestion HTTP/1.1" 207 Multi-Status
Use this if you want the pass through endpoint to honour LiteLLM keys/authentication Use this if you want the pass through endpoint to honour LiteLLM keys/authentication
This also enforces the key's rpm limits on pass-through endpoints.
Usage - set `auth: true` on the config Usage - set `auth: true` on the config
```yaml ```yaml
general_settings: general_settings:
@ -190,6 +193,53 @@ curl --request POST \
}' }'
``` ```
### Use Langfuse client sdk w/ LiteLLM Key
**Usage**
1. Set-up yaml to pass-through langfuse /api/public/ingestion
```yaml
general_settings:
master_key: sk-1234
pass_through_endpoints:
- path: "/api/public/ingestion" # route you want to add to LiteLLM Proxy Server
target: "https://us.cloud.langfuse.com/api/public/ingestion" # URL this route should forward
auth: true # 👈 KEY CHANGE
custom_auth_parser: "langfuse" # 👈 KEY CHANGE
headers:
LANGFUSE_PUBLIC_KEY: "os.environ/LANGFUSE_DEV_PUBLIC_KEY" # your langfuse account public key
LANGFUSE_SECRET_KEY: "os.environ/LANGFUSE_DEV_SK_KEY" # your langfuse account secret key
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test with langfuse sdk
```python
from langfuse import Langfuse
langfuse = Langfuse(
host="http://localhost:4000", # your litellm proxy endpoint
public_key="sk-1234", # your litellm proxy api key
secret_key="anything", # no key required since this is a pass through
)
print("sending langfuse trace request")
trace = langfuse.trace(name="test-trace-litellm-proxy-passthrough")
print("flushing langfuse request")
langfuse.flush()
print("flushed langfuse request")
```
## `pass_through_endpoints` Spec on config.yaml ## `pass_through_endpoints` Spec on config.yaml
All possible values for `pass_through_endpoints` and what they mean All possible values for `pass_through_endpoints` and what they mean
@ -218,3 +268,149 @@ general_settings:
* `LANGFUSE_PUBLIC_KEY` *string*: Your Langfuse account public key - only set this when forwarding to Langfuse. * `LANGFUSE_PUBLIC_KEY` *string*: Your Langfuse account public key - only set this when forwarding to Langfuse.
* `LANGFUSE_SECRET_KEY` *string*: Your Langfuse account secret key - only set this when forwarding to Langfuse. * `LANGFUSE_SECRET_KEY` *string*: Your Langfuse account secret key - only set this when forwarding to Langfuse.
* `<your-custom-header>` *string*: Pass any custom header key/value pair * `<your-custom-header>` *string*: Pass any custom header key/value pair
* `forward_headers` *Optional(boolean)*: If true, all headers from the incoming request will be forwarded to the target endpoint. Default is `False`.
## Custom Chat Endpoints (Anthropic/Bedrock/Vertex)
Allow developers to call the proxy with Anthropic/boto3/etc. client SDKs.
Test our [Anthropic Adapter](../anthropic_completion.md) for reference [**Code**](https://github.com/BerriAI/litellm/blob/fd743aaefd23ae509d8ca64b0c232d25fe3e39ee/litellm/adapters/anthropic_adapter.py#L50)
### 1. Write an Adapter
Translate the request/response from your custom API schema to the OpenAI schema (used by litellm.completion()) and back.
For provider-specific params 👉 [**Provider-Specific Params**](../completion/provider_specific_params.md)
```python
from litellm import adapter_completion
import litellm
from litellm import ChatCompletionRequest, verbose_logger
from litellm.integrations.custom_logger import CustomLogger
from litellm.types.llms.anthropic import AnthropicMessagesRequest, AnthropicResponse
import os
# What is this?
## Translates OpenAI call to Anthropic `/v1/messages` format
import json
import os
import traceback
import uuid
from typing import Literal, Optional
import dotenv
import httpx
from pydantic import BaseModel
###################
# CUSTOM ADAPTER ##
###################
class AnthropicAdapter(CustomLogger):
def __init__(self) -> None:
super().__init__()
def translate_completion_input_params(
self, kwargs
) -> Optional[ChatCompletionRequest]:
"""
- translate params, where needed
- pass rest, as is
"""
request_body = AnthropicMessagesRequest(**kwargs) # type: ignore
translated_body = litellm.AnthropicConfig().translate_anthropic_to_openai(
anthropic_message_request=request_body
)
return translated_body
def translate_completion_output_params(
self, response: litellm.ModelResponse
) -> Optional[AnthropicResponse]:
return litellm.AnthropicConfig().translate_openai_response_to_anthropic(
response=response
)
def translate_completion_output_params_streaming(self) -> Optional[BaseModel]:
return super().translate_completion_output_params_streaming()
anthropic_adapter = AnthropicAdapter()
###########
# TEST IT #
###########
## register CUSTOM ADAPTER
litellm.adapters = [{"id": "anthropic", "adapter": anthropic_adapter}]
## set ENV variables
os.environ["OPENAI_API_KEY"] = "your-openai-key"
os.environ["COHERE_API_KEY"] = "your-cohere-key"
messages = [{ "content": "Hello, how are you?","role": "user"}]
# openai call
response = adapter_completion(model="gpt-3.5-turbo", messages=messages, adapter_id="anthropic")
# cohere call
response = adapter_completion(model="command-nightly", messages=messages, adapter_id="anthropic")
print(response)
```
### 2. Create new endpoint
We pass the custom adapter class defined in Step 1 to the config.yaml. Set `target` to `python_filename.logger_instance_name`.
In the config below, we pass
python_filename: `custom_callbacks.py`
logger_instance_name: `anthropic_adapter`. This is defined in Step 1
`target: custom_callbacks.anthropic_adapter`
```yaml
model_list:
- model_name: my-fake-claude-endpoint
litellm_params:
model: gpt-3.5-turbo
api_key: os.environ/OPENAI_API_KEY
general_settings:
master_key: sk-1234
pass_through_endpoints:
- path: "/v1/messages" # route you want to add to LiteLLM Proxy Server
target: custom_callbacks.anthropic_adapter # Adapter to use for this route
headers:
litellm_user_api_key: "x-api-key" # Field in headers, containing LiteLLM Key
```
### 3. Test it!
**Start proxy**
```bash
litellm --config /path/to/config.yaml
```
**Curl**
```bash
curl --location 'http://0.0.0.0:4000/v1/messages' \
-H 'x-api-key: sk-1234' \
-H 'anthropic-version: 2023-06-01' \
-H 'content-type: application/json' \
-d '{
"model": "my-fake-claude-endpoint",
"max_tokens": 1024,
"messages": [
{"role": "user", "content": "Hello, world"}
]
}'
```
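Since the adapter exposes an Anthropic-compatible `/v1/messages` route, you should also be able to call it with the Anthropic Python SDK - a minimal sketch, assuming the proxy runs on `localhost:4000` and `sk-1234` is a valid LiteLLM key:
```python
import anthropic

# point the Anthropic SDK at the LiteLLM proxy instead of api.anthropic.com
client = anthropic.Anthropic(
    base_url="http://localhost:4000",  # LiteLLM proxy
    api_key="sk-1234",                 # LiteLLM virtual key, sent as the x-api-key header
)

message = client.messages.create(
    model="my-fake-claude-endpoint",   # model_name from the config above
    max_tokens=1024,
    messages=[{"role": "user", "content": "Hello, world"}],
)
print(message.content)
```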

View file

@ -180,3 +180,59 @@ chat_completion = client.chat.completions.create(
"_response_ms": 1753.426 "_response_ms": 1753.426
} }
``` ```
## Turn on for logging only
Only apply PII masking before logging to Langfuse, etc. - not on the actual LLM API request / response.
:::note
This is currently only applied for
- `/chat/completion` requests
- on 'success' logging
:::
1. Setup config.yaml
```yaml
litellm_settings:
presidio_logging_only: true
model_list:
- model_name: gpt-3.5-turbo
litellm_params:
model: gpt-3.5-turbo
api_key: os.environ/OPENAI_API_KEY
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "gpt-3.5-turbo",
"messages": [
{
"role": "user",
"content": "Hi, my name is Jane!"
}
]
}'
```
**Expected Logged Response**
```
Hi, my name is <PERSON>!
```
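The same test via the OpenAI Python SDK - a minimal sketch, assuming the proxy runs on `0.0.0.0:4000` with master key `sk-1234`. Note the client still receives the unmasked text; only the logged copy has PII replaced:
```python
import openai

client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")

response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hi, my name is Jane!"}],
)

# unmasked in the API response - the masked version only shows up in your logging tool
print(response.choices[0].message.content)
```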

View file

@ -84,6 +84,20 @@ Set `export LITELLM_MODE="PRODUCTION"`
This disables the load_dotenv() functionality, which will automatically load your environment credentials from the local `.env`. This disables the load_dotenv() functionality, which will automatically load your environment credentials from the local `.env`.
## 5. Set LiteLLM Salt Key
If you plan on using the DB, set a salt key for encrypting/decrypting variables in the DB.
Do not change this after adding a model. It is used to encrypt / decrypt your LLM API Key credentials.
We recommend using a password generator such as https://1password.com/password-generator/ to get a random hash for your LiteLLM salt key.
```bash
export LITELLM_SALT_KEY="sk-1234"
```
[**See Code**](https://github.com/BerriAI/litellm/blob/036a6821d588bd36d170713dcf5a72791a694178/litellm/proxy/common_utils/encrypt_decrypt_utils.py#L15)
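Any sufficiently random string works. If you prefer generating one from the terminal, one option is `openssl` - store the value somewhere safe, since it must stay the same across restarts:
```bash
# generate a random 64-character hex string to use as the salt key
export LITELLM_SALT_KEY="$(openssl rand -hex 32)"
```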
## Extras ## Extras
### Expected Performance in Production ### Expected Performance in Production

View file

@ -1,7 +1,16 @@
import Tabs from '@theme/Tabs'; import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem'; import TabItem from '@theme/TabItem';
# 📈 Prometheus metrics [BETA] # 📈 [BETA] Prometheus metrics
:::info
🚨 Prometheus metrics will be out of Beta on September 15, 2024 - as part of this release, they will be part of LiteLLM Enterprise, starting at $250/mo
[Enterprise Pricing](https://www.litellm.ai/#pricing)
[Contact us here to get a free trial](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat)
:::
LiteLLM Exposes a `/metrics` endpoint for Prometheus to Poll LiteLLM Exposes a `/metrics` endpoint for Prometheus to Poll
@ -47,9 +56,11 @@ http://localhost:4000/metrics
# <proxy_base_url>/metrics # <proxy_base_url>/metrics
``` ```
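A minimal Prometheus scrape config for this endpoint might look like the following sketch (assumes the proxy is reachable at `localhost:4000`):
```yaml
# prometheus.yml (sketch)
scrape_configs:
  - job_name: "litellm-proxy"
    metrics_path: /metrics
    static_configs:
      - targets: ["localhost:4000"] # <proxy_host>:<proxy_port>
```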
## Metrics Tracked ## 📈 Metrics Tracked
### Proxy Requests / Spend Metrics
| Metric Name | Description | | Metric Name | Description |
|----------------------|--------------------------------------| |----------------------|--------------------------------------|
| `litellm_requests_metric` | Number of requests made, per `"user", "key", "model", "team", "end-user"` | | `litellm_requests_metric` | Number of requests made, per `"user", "key", "model", "team", "end-user"` |
@ -57,6 +68,23 @@ http://localhost:4000/metrics
| `litellm_total_tokens` | input + output tokens per `"user", "key", "model", "team", "end-user"` | | `litellm_total_tokens` | input + output tokens per `"user", "key", "model", "team", "end-user"` |
| `litellm_llm_api_failed_requests_metric` | Number of failed LLM API requests per `"user", "key", "model", "team", "end-user"` | | `litellm_llm_api_failed_requests_metric` | Number of failed LLM API requests per `"user", "key", "model", "team", "end-user"` |
### LLM API / Provider Metrics
| Metric Name | Description |
|----------------------|--------------------------------------|
| `litellm_deployment_state` | The state of the deployment: 0 = healthy, 1 = partial outage, 2 = complete outage. |
| `litellm_remaining_requests_metric` | Track `x-ratelimit-remaining-requests` returned from LLM API Deployment |
| `litellm_remaining_tokens` | Track `x-ratelimit-remaining-tokens` returned from LLM API Deployment |
| `litellm_deployment_success_responses` | Total number of successful LLM API calls for deployment |
| `litellm_deployment_failure_responses` | Total number of failed LLM API calls for deployment |
| `litellm_deployment_total_requests` | Total number of LLM API calls for deployment - success + failure |
| `litellm_deployment_latency_per_output_token` | Latency per output token for deployment |
| `litellm_deployment_successful_fallbacks` | Number of successful fallback requests from primary model -> fallback model |
| `litellm_deployment_failed_fallbacks` | Number of failed fallback requests from primary model -> fallback model |
### Budget Metrics ### Budget Metrics
| Metric Name | Description | | Metric Name | Description |
|----------------------|--------------------------------------| |----------------------|--------------------------------------|
@ -64,55 +92,6 @@ http://localhost:4000/metrics
| `litellm_remaining_api_key_budget_metric` | Remaining Budget for API Key (A key Created on LiteLLM)| | `litellm_remaining_api_key_budget_metric` | Remaining Budget for API Key (A key Created on LiteLLM)|
### ✨ (Enterprise) LLM Remaining Requests and Remaining Tokens
Set this on your config.yaml to allow you to track how close you are to hitting your TPM / RPM limits on each model group
```yaml
litellm_settings:
success_callback: ["prometheus"]
failure_callback: ["prometheus"]
return_response_headers: true # ensures the LLM API calls track the response headers
```
| Metric Name | Description |
|----------------------|--------------------------------------|
| `litellm_remaining_requests_metric` | Track `x-ratelimit-remaining-requests` returned from LLM API Deployment |
| `litellm_remaining_tokens` | Track `x-ratelimit-remaining-tokens` return from LLM API Deployment |
Example Metric
<Tabs>
<TabItem value="Remaining Requests" label="Remaining Requests">
```shell
litellm_remaining_requests
{
api_base="https://api.openai.com/v1",
api_provider="openai",
litellm_model_name="gpt-3.5-turbo",
model_group="gpt-3.5-turbo"
}
8998.0
```
</TabItem>
<TabItem value="Requests" label="Remaining Tokens">
```shell
litellm_remaining_tokens
{
api_base="https://api.openai.com/v1",
api_provider="openai",
litellm_model_name="gpt-3.5-turbo",
model_group="gpt-3.5-turbo"
}
999981.0
```
</TabItem>
</Tabs>
## Monitor System Health ## Monitor System Health

View file

@ -13,20 +13,23 @@ LiteLLM Supports the following methods for detecting prompt injection attacks
Use this if you want to reject /chat, /completions, /embeddings calls that have prompt injection attacks Use this if you want to reject /chat, /completions, /embeddings calls that have prompt injection attacks
LiteLLM uses [LakerAI API](https://platform.lakera.ai/) to detect if a request has a prompt injection attack LiteLLM uses [LakeraAI API](https://platform.lakera.ai/) to detect if a request has a prompt injection attack
#### Usage ### Usage
Step 1 Set a `LAKERA_API_KEY` in your env Step 1 Set a `LAKERA_API_KEY` in your env
``` ```
LAKERA_API_KEY="7a91a1a6059da*******" LAKERA_API_KEY="7a91a1a6059da*******"
``` ```
Step 2. Add `lakera_prompt_injection` to your calbacks Step 2. Add `lakera_prompt_injection` as a guardrail
```yaml ```yaml
litellm_settings: litellm_settings:
callbacks: ["lakera_prompt_injection"] guardrails:
- prompt_injection: # your custom name for guardrail
callbacks: ["lakera_prompt_injection"] # litellm callbacks to use
default_on: true # will run on all llm requests when true
``` ```
That's it, start your proxy That's it, start your proxy
@ -48,6 +51,48 @@ curl --location 'http://localhost:4000/chat/completions' \
}' }'
``` ```
### Advanced - set category-based thresholds.
Lakera has 2 categories for prompt_injection attacks:
- jailbreak
- prompt_injection
```yaml
litellm_settings:
guardrails:
- prompt_injection: # your custom name for guardrail
callbacks: ["lakera_prompt_injection"] # litellm callbacks to use
default_on: true # will run on all llm requests when true
callback_args:
lakera_prompt_injection:
category_thresholds: {
"prompt_injection": 0.1,
"jailbreak": 0.1,
}
```
### Advanced - Run before/in-parallel to request.
Control if the Lakera prompt_injection check runs before a request or in parallel to it (both requests need to be completed before a response is returned to the user).
```yaml
litellm_settings:
guardrails:
- prompt_injection: # your custom name for guardrail
callbacks: ["lakera_prompt_injection"] # litellm callbacks to use
default_on: true # will run on all llm requests when true
callback_args:
lakera_prompt_injection: {"moderation_check": "in_parallel"} # options: "pre_call", "in_parallel"
```
### Advanced - set custom API Base.
```bash
export LAKERA_API_BASE=""
```
[**Learn More**](./guardrails.md)
## Similarity Checking ## Similarity Checking
LiteLLM supports similarity checking against a pre-generated list of prompt injection attacks, to identify if a request contains an attack. LiteLLM supports similarity checking against a pre-generated list of prompt injection attacks, to identify if a request contains an attack.

View file

@ -5,7 +5,7 @@ import TabItem from '@theme/TabItem';
# Quick Start # Quick Start
Quick start CLI, Config, Docker Quick start CLI, Config, Docker
LiteLLM Server manages: LiteLLM Server (LLM Gateway) manages:
* **Unified Interface**: Calling 100+ LLMs [Huggingface/Bedrock/TogetherAI/etc.](#other-supported-models) in the OpenAI `ChatCompletions` & `Completions` format * **Unified Interface**: Calling 100+ LLMs [Huggingface/Bedrock/TogetherAI/etc.](#other-supported-models) in the OpenAI `ChatCompletions` & `Completions` format
* **Cost tracking**: Authentication, Spend Tracking & Budgets [Virtual Keys](https://docs.litellm.ai/docs/proxy/virtual_keys) * **Cost tracking**: Authentication, Spend Tracking & Budgets [Virtual Keys](https://docs.litellm.ai/docs/proxy/virtual_keys)
@ -243,7 +243,8 @@ model_list:
- model_name: vllm-model - model_name: vllm-model
litellm_params: litellm_params:
model: openai/<your-model-name> model: openai/<your-model-name>
api_base: <your-api-base> # e.g. http://0.0.0.0:3000 api_base: <your-vllm-api-base> # e.g. http://0.0.0.0:3000/v1
api_key: <your-vllm-api-key|none>
``` ```
### Run proxy with config ### Run proxy with config
@ -255,6 +256,12 @@ litellm --config your_config.yaml
## Using LiteLLM Proxy - Curl Request, OpenAI Package, Langchain ## Using LiteLLM Proxy - Curl Request, OpenAI Package, Langchain
:::info
LiteLLM is compatible with several SDKs - including OpenAI SDK, Anthropic SDK, Mistral SDK, LLamaIndex, Langchain (Js, Python)
[More examples here](user_keys)
:::
<Tabs> <Tabs>
<TabItem value="Curl" label="Curl Request"> <TabItem value="Curl" label="Curl Request">
@ -382,6 +389,34 @@ print(response)
``` ```
</TabItem> </TabItem>
<TabItem value="anthropic-py" label="Anthropic Python SDK">
```python
import os
from anthropic import Anthropic
client = Anthropic(
base_url="http://localhost:4000", # proxy endpoint
api_key="sk-s4xN1IiLTCytwtZFJaYQrA", # litellm proxy virtual key
)
message = client.messages.create(
max_tokens=1024,
messages=[
{
"role": "user",
"content": "Hello, Claude",
}
],
model="claude-3-opus-20240229",
)
print(message.content)
```
</TabItem>
</Tabs> </Tabs>
[**More Info**](./configs.md) [**More Info**](./configs.md)
@ -396,165 +431,6 @@ print(response)
- POST `/key/generate` - generate a key to access the proxy - POST `/key/generate` - generate a key to access the proxy
## Using with OpenAI compatible projects
Set `base_url` to the LiteLLM Proxy server
<Tabs>
<TabItem value="openai" label="OpenAI v1.0.0+">
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(model="gpt-3.5-turbo", messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
])
print(response)
```
</TabItem>
<TabItem value="librechat" label="LibreChat">
#### Start the LiteLLM proxy
```shell
litellm --model gpt-3.5-turbo
#INFO: Proxy running on http://0.0.0.0:4000
```
#### 1. Clone the repo
```shell
git clone https://github.com/danny-avila/LibreChat.git
```
#### 2. Modify Librechat's `docker-compose.yml`
LiteLLM Proxy is running on port `4000`, set `4000` as the proxy below
```yaml
OPENAI_REVERSE_PROXY=http://host.docker.internal:4000/v1/chat/completions
```
#### 3. Save fake OpenAI key in Librechat's `.env`
Copy Librechat's `.env.example` to `.env` and overwrite the default OPENAI_API_KEY (by default it requires the user to pass a key).
```env
OPENAI_API_KEY=sk-1234
```
#### 4. Run LibreChat:
```shell
docker compose up
```
</TabItem>
<TabItem value="continue-dev" label="ContinueDev">
Continue-Dev brings ChatGPT to VSCode. See how to [install it here](https://continue.dev/docs/quickstart).
In the [config.py](https://continue.dev/docs/reference/Models/openai) set this as your default model.
```python
default=OpenAI(
api_key="IGNORED",
model="fake-model-name",
context_length=2048, # customize if needed for your model
api_base="http://localhost:4000" # your proxy server url
),
```
Credits [@vividfog](https://github.com/ollama/ollama/issues/305#issuecomment-1751848077) for this tutorial.
</TabItem>
<TabItem value="aider" label="Aider">
```shell
$ pip install aider
$ aider --openai-api-base http://0.0.0.0:4000 --openai-api-key fake-key
```
</TabItem>
<TabItem value="autogen" label="AutoGen">
```python
pip install pyautogen
```
```python
from autogen import AssistantAgent, UserProxyAgent, oai
config_list=[
{
"model": "my-fake-model",
"api_base": "http://localhost:4000", #litellm compatible endpoint
"api_type": "open_ai",
"api_key": "NULL", # just a placeholder
}
]
response = oai.Completion.create(config_list=config_list, prompt="Hi")
print(response) # works fine
llm_config={
"config_list": config_list,
}
assistant = AssistantAgent("assistant", llm_config=llm_config)
user_proxy = UserProxyAgent("user_proxy")
user_proxy.initiate_chat(assistant, message="Plot a chart of META and TESLA stock price change YTD.", config_list=config_list)
```
Credits [@victordibia](https://github.com/microsoft/autogen/issues/45#issuecomment-1749921972) for this tutorial.
</TabItem>
<TabItem value="guidance" label="guidance">
A guidance language for controlling large language models.
https://github.com/guidance-ai/guidance
**NOTE:** Guidance sends additional params like `stop_sequences` which can cause some models to fail if they don't support it.
**Fix**: Start your proxy using the `--drop_params` flag
```shell
litellm --model ollama/codellama --temperature 0.3 --max_tokens 2048 --drop_params
```
```python
import guidance
# set api_base to your proxy
# set api_key to anything
gpt4 = guidance.llms.OpenAI("gpt-4", api_base="http://0.0.0.0:4000", api_key="anything")
experts = guidance('''
{{#system~}}
You are a helpful and terse assistant.
{{~/system}}
{{#user~}}
I want a response to the following question:
{{query}}
Name 3 world-class experts (past or present) who would be great at answering this?
Don't answer the question yet.
{{~/user}}
{{#assistant~}}
{{gen 'expert_names' temperature=0 max_tokens=300}}
{{~/assistant}}
''', llm=gpt4)
result = experts(query='How can I be more productive?')
print(result)
```
</TabItem>
</Tabs>
## Debugging Proxy ## Debugging Proxy
Events that occur during normal operation Events that occur during normal operation

View file

@ -31,15 +31,26 @@ model_list:
api_base: https://openai-france-1234.openai.azure.com/ api_base: https://openai-france-1234.openai.azure.com/
api_key: <your-azure-api-key> api_key: <your-azure-api-key>
rpm: 1440 rpm: 1440
routing_strategy: simple-shuffle # Literal["simple-shuffle", "least-busy", "usage-based-routing","latency-based-routing"], default="simple-shuffle"
model_group_alias: {"gpt-4": "gpt-3.5-turbo"} # all requests with `gpt-4` will be routed to models with `gpt-3.5-turbo`
num_retries: 2
timeout: 30 # 30 seconds
redis_host: <your redis host> # set this when using multiple litellm proxy deployments, load balancing state stored in redis
redis_password: <your redis password>
redis_port: 1992
``` ```
:::info
Detailed information about [routing strategies can be found here](../routing)
:::
#### Step 2: Start Proxy with config #### Step 2: Start Proxy with config
```shell ```shell
$ litellm --config /path/to/config.yaml $ litellm --config /path/to/config.yaml
``` ```
### Test - Load Balancing ### Test - Simple Call
Here requests with model=gpt-3.5-turbo will be routed across multiple instances of azure/gpt-3.5-turbo Here requests with model=gpt-3.5-turbo will be routed across multiple instances of azure/gpt-3.5-turbo
@ -127,6 +138,27 @@ print(response)
</Tabs> </Tabs>
### Test - Loadbalancing
In this request, the following will occur:
1. A rate limit exception will be raised
2. LiteLLM proxy will retry the request on the model group (the default is 3 retries).
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "gpt-3.5-turbo",
"messages": [
{"role": "user", "content": "Hi there!"}
],
"mock_testing_rate_limit_error": true
}'
```
[**See Code**](https://github.com/BerriAI/litellm/blob/6b8806b45f970cb2446654d2c379f8dcaa93ce3c/litellm/router.py#L2535)
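The same test via the OpenAI Python SDK, passing the mock flag through `extra_body` - a sketch, assuming the proxy runs on `0.0.0.0:4000` with master key `sk-1234`:
```python
import openai

client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")

# force a rate limit error on the first deployment; the proxy retries across the model group
response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hi there!"}],
    extra_body={"mock_testing_rate_limit_error": True},
)
print(response)
```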
### Test - Client Side Fallbacks ### Test - Client Side Fallbacks
In this request the following will occur: In this request the following will occur:
1. The request to `model="zephyr-beta"` will fail 1. The request to `model="zephyr-beta"` will fail
@ -434,6 +466,33 @@ litellm_settings:
### Default Fallbacks
You can also set default_fallbacks, in case a specific model group is misconfigured / bad.
```yaml
model_list:
- model_name: gpt-3.5-turbo-small
litellm_params:
model: azure/chatgpt-v-2
api_base: os.environ/AZURE_API_BASE
api_key: os.environ/AZURE_API_KEY
api_version: "2023-07-01-preview"
- model_name: claude-opus
litellm_params:
model: claude-3-opus-20240229
api_key: os.environ/ANTHROPIC_API_KEY
litellm_settings:
default_fallbacks: ["claude-opus"]
```
This will default to claude-opus in case any model fails.
Model-specific fallbacks (e.g. {"gpt-3.5-turbo-small": ["claude-opus"]}) override the default fallbacks.
### Test Fallbacks! ### Test Fallbacks!
Check if your fallbacks are working as expected. Check if your fallbacks are working as expected.

View file

@ -4,7 +4,7 @@ import TabItem from '@theme/TabItem';
# 🤗 UI - Self-Serve # 🤗 UI - Self-Serve
Allow users to create their own keys on [Proxy UI](./ui.md). ## Allow users to create their own keys on [Proxy UI](./ui.md).
1. Add user with permissions to a team on proxy 1. Add user with permissions to a team on proxy
@ -125,6 +125,41 @@ LiteLLM Enterprise: Enable [SSO login](./ui.md#setup-ssoauth-for-ui)
<Image img={require('../../img/ui_self_serve_create_key.png')} style={{ width: '800px', height: 'auto' }} /> <Image img={require('../../img/ui_self_serve_create_key.png')} style={{ width: '800px', height: 'auto' }} />
## Allow users to View Usage, Caching Analytics
1. Go to Internal Users -> +Invite User
Set their role to `Admin Viewer` - this means they can only view usage, caching analytics
<Image img={require('../../img/ui_invite_user.png')} style={{ width: '800px', height: 'auto' }} />
<br />
2. Share invitation link with user
<Image img={require('../../img/ui_invite_link.png')} style={{ width: '800px', height: 'auto' }} />
<br />
3. User logs in via email + password auth
<Image img={require('../../img/ui_clean_login.png')} style={{ width: '500px', height: 'auto' }} />
<br />
4. User can now view Usage, Caching Analytics
<Image img={require('../../img/ui_usage.png')} style={{ width: '800px', height: 'auto' }} />
## Available Roles
Here are the available UI roles for a LiteLLM Internal User:
**Admin Roles:**
- `proxy_admin`: admin over the platform
- `proxy_admin_viewer`: can login, view all keys, view all spend. **Cannot** create/delete keys, add new users.
**Internal User Roles:**
- `internal_user`: can login, view/create/delete their own keys, view their spend. **Cannot** add new users.
- `internal_user_viewer`: can login, view their own keys, view their own spend. **Cannot** create/delete keys, add new users.
## Advanced ## Advanced
### Setting custom logout URLs ### Setting custom logout URLs
@ -138,3 +173,24 @@ export PROXY_LOGOUT_URL="https://www.google.com"
<Image img={require('../../img/ui_logout.png')} style={{ width: '400px', height: 'auto' }} /> <Image img={require('../../img/ui_logout.png')} style={{ width: '400px', height: 'auto' }} />
### Set max budget for internal users
Automatically apply budget per internal user when they sign up
```yaml
litellm_settings:
max_internal_user_budget: 10
internal_user_budget_duration: "1mo" # reset every month
```
This sets a max budget of $10 USD for internal users when they sign up.
This budget only applies to personal keys created by that user - seen under `Default Team` on the UI.
<Image img={require('../../img/max_budget_for_internal_users.png')} style={{ width: '500px', height: 'auto' }} />
This budget does not apply to keys created under non-default teams.
### Set max budget for teams
[**Go Here**](./team_budgets.md)

View file

@ -8,6 +8,7 @@ Define your custom callback class in a python file.
```python ```python
from litellm.integrations.custom_logger import CustomLogger from litellm.integrations.custom_logger import CustomLogger
import litellm import litellm
import logging
# This file includes the custom callbacks for LiteLLM Proxy # This file includes the custom callbacks for LiteLLM Proxy
# Once defined, these can be passed in proxy_config.yaml # Once defined, these can be passed in proxy_config.yaml
@ -25,9 +26,9 @@ class MyCustomHandler(CustomLogger):
datefmt='%Y-%m-%d %H:%M:%S' datefmt='%Y-%m-%d %H:%M:%S'
) )
response_cost = litellm.completion_cost(completion_response=completion_response) response_cost: Optional[float] = kwargs.get("response_cost", None)
print("regular response_cost", response_cost) print("regular response_cost", response_cost)
logging.info(f"Model {completion_response.model} Cost: ${response_cost:.8f}") logging.info(f"Model {response_obj.model} Cost: ${response_cost:.8f}")
except: except:
pass pass

View file

@ -0,0 +1,133 @@
# Tag Based Routing
Route requests based on tags.
This is useful for implementing free / paid tiers for users
### 1. Define tags on config.yaml
- A request with `tags=["free"]` will get routed to `openai/fake`
- A request with `tags=["paid"]` will get routed to `openai/gpt-4o`
```yaml
model_list:
- model_name: gpt-4
litellm_params:
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
tags: ["free"] # 👈 Key Change
- model_name: gpt-4
litellm_params:
model: openai/gpt-4o
api_key: os.environ/OPENAI_API_KEY
tags: ["paid"] # 👈 Key Change
router_settings:
enable_tag_filtering: True # 👈 Key Change
general_settings:
master_key: sk-1234
```
### 2. Make Request with `tags=["free"]`
This request includes "tags": ["free"], which routes it to `openai/fake`
```shell
curl -i http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"model": "gpt-4",
"messages": [
{"role": "user", "content": "Hello, Claude gm!"}
],
"tags": ["free"]
}'
```
**Expected Response**
Expect to see the following response header when this works
```shell
x-litellm-model-api-base: https://exampleopenaiendpoint-production.up.railway.app/
```
Response
```shell
{
"id": "chatcmpl-33c534e3d70148218e2d62496b81270b",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"message": {
"content": "\n\nHello there, how may I assist you today?",
"role": "assistant",
"tool_calls": null,
"function_call": null
}
}
],
"created": 1677652288,
"model": "gpt-3.5-turbo-0125",
"object": "chat.completion",
"system_fingerprint": "fp_44709d6fcb",
"usage": {
"completion_tokens": 12,
"prompt_tokens": 9,
"total_tokens": 21
}
}
```
### 3. Make Request with `tags=["paid"]`
This request includes "tags": ["paid"], which routes it to `openai/gpt-4o`
```shell
curl -i http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"model": "gpt-4",
"messages": [
{"role": "user", "content": "Hello, Claude gm!"}
],
"tags": ["paid"]
}'
```
**Expected Response**
Expect to see the following response header when this works
```shell
x-litellm-model-api-base: https://api.openai.com
```
Response
```shell
{
"id": "chatcmpl-9maCcqQYTqdJrtvfakIawMOIUbEZx",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"message": {
"content": "Good morning! How can I assist you today?",
"role": "assistant",
"tool_calls": null,
"function_call": null
}
}
],
"created": 1721365934,
"model": "gpt-4o-2024-05-13",
"object": "chat.completion",
"system_fingerprint": "fp_c4e5b6fa31",
"usage": {
"completion_tokens": 10,
"prompt_tokens": 12,
"total_tokens": 22
}
}
```
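If you call the proxy through the OpenAI Python SDK, `tags` is a LiteLLM-specific field, so pass it via `extra_body` - a sketch:
```python
import openai

client = openai.OpenAI(api_key="sk-1234", base_url="http://localhost:4000")

response = client.chat.completions.create(
    model="gpt-4",
    messages=[{"role": "user", "content": "Hello, Claude gm!"}],
    extra_body={"tags": ["paid"]},  # routed to the deployment tagged "paid"
)
print(response.model)
```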

View file

@ -1,4 +1,4 @@
# 👥 Team-based Routing + Logging # 👥 Team-based Routing
## Routing ## Routing
Route calls to different model groups based on the team-id Route calls to different model groups based on the team-id
@ -71,35 +71,3 @@ curl --location 'http://0.0.0.0:4000/v1/chat/completions' \
}' }'
``` ```
## Logging / Caching
Turn on/off logging and caching for a specific team id.
**Example:**
This config would send langfuse logs to 2 different langfuse projects, based on the team id
```yaml
litellm_settings:
default_team_settings:
- team_id: my-secret-project
success_callback: ["langfuse"]
langfuse_public_key: os.environ/LANGFUSE_PUB_KEY_1 # Project 1
langfuse_secret: os.environ/LANGFUSE_PRIVATE_KEY_1 # Project 1
- team_id: ishaans-secret-project
success_callback: ["langfuse"]
langfuse_public_key: os.environ/LANGFUSE_PUB_KEY_2 # Project 2
langfuse_secret: os.environ/LANGFUSE_SECRET_2 # Project 2
```
Now, when you [generate keys](./virtual_keys.md) for this team-id
```bash
curl -X POST 'http://0.0.0.0:4000/key/generate' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-d '{"team_id": "ishaans-secret-project"}'
```
All requests made with these keys will log data to their team-specific logging.

View file

@ -0,0 +1,227 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# 👥📊 Team/Key Based Logging
Allow each key/team to use their own Langfuse Project / custom callbacks
**This allows you to do the following**
```
Team 1 -> Logs to Langfuse Project 1
Team 2 -> Logs to Langfuse Project 2
Team 3 -> Disabled Logging (for GDPR compliance)
```
## Team Based Logging
[👉 Tutorial - Allow each team to use their own Langfuse Project / custom callbacks](team_logging.md)
## Logging / Caching
Turn on/off logging and caching for a specific team id.
**Example:**
This config would send langfuse logs to 2 different langfuse projects, based on the team id
```yaml
litellm_settings:
default_team_settings:
- team_id: my-secret-project
success_callback: ["langfuse"]
langfuse_public_key: os.environ/LANGFUSE_PUB_KEY_1 # Project 1
langfuse_secret: os.environ/LANGFUSE_PRIVATE_KEY_1 # Project 1
- team_id: ishaans-secret-project
success_callback: ["langfuse"]
langfuse_public_key: os.environ/LANGFUSE_PUB_KEY_2 # Project 2
langfuse_secret: os.environ/LANGFUSE_SECRET_2 # Project 2
```
Now, when you [generate keys](./virtual_keys.md) for this team-id
```bash
curl -X POST 'http://0.0.0.0:4000/key/generate' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-d '{"team_id": "ishaans-secret-project"}'
```
All requests made with these keys will log data to their team-specific logging.
## [BETA] Team Logging via API
:::info
✨ This is an Enterprise only feature [Get Started with Enterprise here](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat)
:::
### Set Callbacks Per Team
#### 1. Set callback for team
We make a request to `POST /team/{team_id}/callback` to add a success/failure callback for a team.
```shell
curl -X POST 'http://localhost:4000/team/dbe2f686-a686-4896-864a-4c3924458709/callback' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"callback_name": "langfuse",
"callback_type": "success",
"callback_vars": {
"langfuse_public_key": "pk",
"langfuse_secret_key": "sk_",
"langfuse_host": "https://cloud.langfuse.com"
}
}'
```
##### Supported Values
| Field | Supported Values | Notes |
|-------|------------------|-------|
| `callback_name` | `"langfuse"` | Currently only supports "langfuse" |
| `callback_type` | `"success"`, `"failure"`, `"success_and_failure"` | |
| `callback_vars` | | dict of callback settings |
| &nbsp;&nbsp;&nbsp;&nbsp;`langfuse_public_key` | string | Required |
| &nbsp;&nbsp;&nbsp;&nbsp;`langfuse_secret_key` | string | Required |
| &nbsp;&nbsp;&nbsp;&nbsp;`langfuse_host` | string | Optional (defaults to https://cloud.langfuse.com) |
#### 2. Create key for team
All keys created for team `dbe2f686-a686-4896-864a-4c3924458709` will log to langfuse project specified on [Step 1. Set callback for team](#1-set-callback-for-team)
```shell
curl --location 'http://0.0.0.0:4000/key/generate' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{
"team_id": "dbe2f686-a686-4896-864a-4c3924458709"
}'
```
#### 3. Make `/chat/completion` request for team
```shell
curl -i http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-KbUuE0WNptC0jXapyMmLBA" \
-d '{
"model": "gpt-4",
"messages": [
{"role": "user", "content": "Hello, Claude gm!"}
]
}'
```
Expect this to be logged on the langfuse project specified on [Step 1. Set callback for team](#1-set-callback-for-team)
### Disable Logging for a Team
To disable logging for a specific team, you can use the following endpoint:
`POST /team/{team_id}/disable_logging`
This endpoint removes all success and failure callbacks for the specified team, effectively disabling logging.
#### Step 1. Disable logging for team
```shell
curl -X POST 'http://localhost:4000/team/YOUR_TEAM_ID/disable_logging' \
-H 'Authorization: Bearer YOUR_API_KEY'
```
Replace YOUR_TEAM_ID with the actual team ID
**Response**
A successful request will return a response similar to this:
```json
{
"status": "success",
"message": "Logging disabled for team YOUR_TEAM_ID",
"data": {
"team_id": "YOUR_TEAM_ID",
"success_callbacks": [],
"failure_callbacks": []
}
}
```
#### Step 2. Test it - `/chat/completions`
Use a key generated for team = `team_id` - you should see no logs on your configured success callback (eg. Langfuse)
```shell
curl -i http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-KbUuE0WNptC0jXapyMmLBA" \
-d '{
"model": "gpt-4",
"messages": [
{"role": "user", "content": "Hello, Claude gm!"}
]
}'
```
#### Debugging / Troubleshooting
- Check active callbacks for team using `GET /team/{team_id}/callback`
Use this to check what success/failure callbacks are active for team=`team_id`
```shell
curl -X GET 'http://localhost:4000/team/dbe2f686-a686-4896-864a-4c3924458709/callback' \
-H 'Authorization: Bearer sk-1234'
```
### Team Logging Endpoints
- [`POST /team/{team_id}/callback` Add a success/failure callback to a team](https://litellm-api.up.railway.app/#/team%20management/add_team_callbacks_team__team_id__callback_post)
- [`GET /team/{team_id}/callback` - Get the success/failure callbacks and variables for a team](https://litellm-api.up.railway.app/#/team%20management/get_team_callbacks_team__team_id__callback_get)
## [BETA] Key Based Logging
Use the `/key/generate` or `/key/update` endpoints to add logging callbacks to a specific key.
:::info
✨ This is an Enterprise only feature [Get Started with Enterprise here](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat)
:::
```bash
curl -X POST 'http://0.0.0.0:4000/key/generate' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-d '{
"metadata": {
"logging": [{
"callback_name": "langfuse", # 'otel', 'langfuse', 'lunary'
"callback_type": "success" # set, if required by integration - future improvement, have logging tools work for success + failure by default
"callback_vars": {
"langfuse_public_key": "os.environ/LANGFUSE_PUBLIC_KEY", # [RECOMMENDED] reference key in proxy environment
"langfuse_secret_key": "os.environ/LANGFUSE_SECRET_KEY", # [RECOMMENDED] reference key in proxy environment
"langfuse_host": "https://cloud.langfuse.com"
}
}]
}
}'
```
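The same key generation from Python with `requests` - a sketch, assuming the proxy is on `0.0.0.0:4000` and `sk-1234` is the master key:
```python
import requests

resp = requests.post(
    "http://0.0.0.0:4000/key/generate",
    headers={"Authorization": "Bearer sk-1234"},
    json={
        "metadata": {
            "logging": [
                {
                    "callback_name": "langfuse",
                    "callback_type": "success",
                    "callback_vars": {
                        # [RECOMMENDED] reference keys in the proxy environment
                        "langfuse_public_key": "os.environ/LANGFUSE_PUBLIC_KEY",
                        "langfuse_secret_key": "os.environ/LANGFUSE_SECRET_KEY",
                        "langfuse_host": "https://cloud.langfuse.com",
                    },
                }
            ]
        }
    },
)
print(resp.json()["key"])  # virtual key with its own logging callbacks
```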
---
Help us improve this feature, by filing a [ticket here](https://github.com/BerriAI/litellm/issues)

View file

@ -53,6 +53,12 @@ UI_PASSWORD=langchain # password to sign in on UI
On accessing the LiteLLM UI, you will be prompted to enter your username, password On accessing the LiteLLM UI, you will be prompted to enter your username, password
## Invite other users
Allow others to create/delete their own keys.
[**Go Here**](./self_serve.md)
## ✨ Enterprise Features ## ✨ Enterprise Features
Features here are behind a commercial license in our `/enterprise` folder. [**See Code**](https://github.com/BerriAI/litellm/tree/main/enterprise) Features here are behind a commercial license in our `/enterprise` folder. [**See Code**](https://github.com/BerriAI/litellm/tree/main/enterprise)
@ -76,6 +82,13 @@ litellm_settings:
- Key will be created with `max_budget=100` since 100 is the upper bound - Key will be created with `max_budget=100` since 100 is the upper bound
#### Step 2: Setup Oauth Client #### Step 2: Setup Oauth Client
:::tip
Looking for how to use Oauth 2.0 for /chat, /completions API requests to the proxy? [Follow this doc](oauth2)
:::
<Tabs> <Tabs>
<TabItem value="okta" label="Okta SSO"> <TabItem value="okta" label="Okta SSO">
@ -186,6 +199,16 @@ PROXY_BASE_URL=https://litellm-api.up.railway.app/
#### Step 4. Test flow #### Step 4. Test flow
<Image img={require('../../img/litellm_ui_3.gif')} /> <Image img={require('../../img/litellm_ui_3.gif')} />
### Restrict Email Subdomains w/ SSO
If you're using SSO and want to only allow users with a specific email domain - e.g. `@berri.ai` accounts - to access the UI, do this:
```bash
export ALLOWED_EMAIL_DOMAINS="berri.ai"
```
This will check if the user email we receive from SSO contains this domain, before allowing access.
### Set Admin view w/ SSO ### Set Admin view w/ SSO
You just need to set Proxy Admin ID You just need to set Proxy Admin ID

View file

@ -1,7 +1,43 @@
import Tabs from '@theme/Tabs'; import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem'; import TabItem from '@theme/TabItem';
# Use with Langchain, OpenAI SDK, LlamaIndex, Instructor, Curl # 💡 Migrating from OpenAI (Langchain, OpenAI SDK, LlamaIndex, Instructor, Curl)
LiteLLM Proxy is **OpenAI-Compatible**, and supports:
* /chat/completions
* /embeddings
* /completions
* /image/generations
* /moderations
* /audio/transcriptions
* /audio/speech
* [Assistants API endpoints](https://docs.litellm.ai/docs/assistants)
* [Batches API endpoints](https://docs.litellm.ai/docs/batches)
* [Fine-Tuning API endpoints](https://docs.litellm.ai/docs/fine_tuning)
LiteLLM Proxy is **Azure OpenAI-compatible**:
* /chat/completions
* /completions
* /embeddings
LiteLLM Proxy is **Anthropic-compatible**:
* /messages
LiteLLM Proxy is **Vertex AI compatible**:
- [Supports ALL Vertex Endpoints](../vertex_ai)
This doc covers:
* /chat/completion
* /embedding
These are **selected examples**. LiteLLM Proxy is **OpenAI-Compatible**; it works with any project that calls OpenAI. Just change the `base_url`, `api_key` and `model`.
To pass provider-specific args, [go here](https://docs.litellm.ai/docs/completion/provider_specific_params#proxy-usage)
To drop unsupported params (E.g. frequency_penalty for bedrock with librechat), [go here](https://docs.litellm.ai/docs/completion/drop_params#openai-proxy-usage)
:::info :::info
@ -207,6 +243,81 @@ console.log(message);
``` ```
</TabItem> </TabItem>
<TabItem value="openai JS" label="OpenAI JS">
```js
const { OpenAI } = require('openai');
const openai = new OpenAI({
apiKey: "sk-1234", // This is the default and can be omitted
baseURL: "http://0.0.0.0:4000"
});
async function main() {
const chatCompletion = await openai.chat.completions.create({
messages: [{ role: 'user', content: 'Say this is a test' }],
model: 'gpt-3.5-turbo',
}, {"metadata": {
"generation_name": "ishaan-generation-openaijs-client",
"generation_id": "openaijs-client-gen-id22",
"trace_id": "openaijs-client-trace-id22",
"trace_user_id": "openaijs-client-user-id2"
}});
}
main();
```
</TabItem>
<TabItem value="anthropic-py" label="Anthropic Python SDK">
```python
import os
from anthropic import Anthropic
client = Anthropic(
base_url="http://localhost:4000", # proxy endpoint
api_key="sk-s4xN1IiLTCytwtZFJaYQrA", # litellm proxy virtual key
)
message = client.messages.create(
max_tokens=1024,
messages=[
{
"role": "user",
"content": "Hello, Claude",
}
],
model="claude-3-opus-20240229",
)
print(message.content)
```
</TabItem>
<TabItem value="mistral-py" label="Mistral Python SDK">
```python
import os
from mistralai.client import MistralClient
from mistralai.models.chat_completion import ChatMessage
client = MistralClient(api_key="sk-1234", endpoint="http://0.0.0.0:4000")
chat_response = client.chat(
model="mistral-small-latest",
messages=[
{"role": "user", "content": "this is a test request, write a short poem"}
],
)
print(chat_response.choices[0].message.content)
```
</TabItem>
<TabItem value="instructor" label="Instructor"> <TabItem value="instructor" label="Instructor">
```python ```python
@ -214,11 +325,12 @@ from openai import OpenAI
import instructor import instructor
from pydantic import BaseModel from pydantic import BaseModel
my_proxy_api_key = "" # e.g. sk-1234 my_proxy_api_key = "" # e.g. sk-1234 - LITELLM KEY
my_proxy_base_url = "" # e.g. http://0.0.0.0:4000 my_proxy_base_url = "" # e.g. http://0.0.0.0:4000 - LITELLM PROXY BASE URL
# This enables response_model keyword # This enables response_model keyword
# from client.chat.completions.create # from client.chat.completions.create
## WORKS ACROSS OPENAI/ANTHROPIC/VERTEXAI/ETC. - all LITELLM SUPPORTED MODELS!
client = instructor.from_openai(OpenAI(api_key=my_proxy_api_key, base_url=my_proxy_base_url)) client = instructor.from_openai(OpenAI(api_key=my_proxy_api_key, base_url=my_proxy_base_url))
class UserDetail(BaseModel): class UserDetail(BaseModel):
@ -539,6 +651,166 @@ curl --location 'http://0.0.0.0:4000/moderations' \
``` ```
## Using with OpenAI compatible projects
Set `base_url` to the LiteLLM Proxy server
<Tabs>
<TabItem value="openai" label="OpenAI v1.0.0+">
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(model="gpt-3.5-turbo", messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
])
print(response)
```
</TabItem>
<TabItem value="librechat" label="LibreChat">
#### Start the LiteLLM proxy
```shell
litellm --model gpt-3.5-turbo
#INFO: Proxy running on http://0.0.0.0:4000
```
#### 1. Clone the repo
```shell
git clone https://github.com/danny-avila/LibreChat.git
```
#### 2. Modify Librechat's `docker-compose.yml`
LiteLLM Proxy is running on port `4000`, set `4000` as the proxy below
```yaml
OPENAI_REVERSE_PROXY=http://host.docker.internal:4000/v1/chat/completions
```
#### 3. Save fake OpenAI key in Librechat's `.env`
Copy Librechat's `.env.example` to `.env` and overwrite the default OPENAI_API_KEY (by default it requires the user to pass a key).
```env
OPENAI_API_KEY=sk-1234
```
#### 4. Run LibreChat:
```shell
docker compose up
```
</TabItem>
<TabItem value="continue-dev" label="ContinueDev">
Continue-Dev brings ChatGPT to VSCode. See how to [install it here](https://continue.dev/docs/quickstart).
In the [config.py](https://continue.dev/docs/reference/Models/openai) set this as your default model.
```python
default=OpenAI(
api_key="IGNORED",
model="fake-model-name",
context_length=2048, # customize if needed for your model
api_base="http://localhost:4000" # your proxy server url
),
```
Credits [@vividfog](https://github.com/ollama/ollama/issues/305#issuecomment-1751848077) for this tutorial.
</TabItem>
<TabItem value="aider" label="Aider">
```shell
$ pip install aider
$ aider --openai-api-base http://0.0.0.0:4000 --openai-api-key fake-key
```
</TabItem>
<TabItem value="autogen" label="AutoGen">
```python
pip install pyautogen
```
```python
from autogen import AssistantAgent, UserProxyAgent, oai
config_list=[
{
"model": "my-fake-model",
"api_base": "http://localhost:4000", #litellm compatible endpoint
"api_type": "open_ai",
"api_key": "NULL", # just a placeholder
}
]
response = oai.Completion.create(config_list=config_list, prompt="Hi")
print(response) # works fine
llm_config={
"config_list": config_list,
}
assistant = AssistantAgent("assistant", llm_config=llm_config)
user_proxy = UserProxyAgent("user_proxy")
user_proxy.initiate_chat(assistant, message="Plot a chart of META and TESLA stock price change YTD.", config_list=config_list)
```
Credits [@victordibia](https://github.com/microsoft/autogen/issues/45#issuecomment-1749921972) for this tutorial.
</TabItem>
<TabItem value="guidance" label="guidance">
A guidance language for controlling large language models.
https://github.com/guidance-ai/guidance
**NOTE:** Guidance sends additional params like `stop_sequences` which can cause some models to fail if they don't support it.
**Fix**: Start your proxy using the `--drop_params` flag
```shell
litellm --model ollama/codellama --temperature 0.3 --max_tokens 2048 --drop_params
```
```python
import guidance
# set api_base to your proxy
# set api_key to anything
gpt4 = guidance.llms.OpenAI("gpt-4", api_base="http://0.0.0.0:4000", api_key="anything")
experts = guidance('''
{{#system~}}
You are a helpful and terse assistant.
{{~/system}}
{{#user~}}
I want a response to the following question:
{{query}}
Name 3 world-class experts (past or present) who would be great at answering this?
Don't answer the question yet.
{{~/user}}
{{#assistant~}}
{{gen 'expert_names' temperature=0 max_tokens=300}}
{{~/assistant}}
''', llm=gpt4)
result = experts(query='How can I be more productive?')
print(result)
```
</TabItem>
</Tabs>
## Advanced ## Advanced
### (BETA) Batch Completions - pass multiple models ### (BETA) Batch Completions - pass multiple models

View file

@ -484,6 +484,8 @@ You can set:
- tpm limits (tokens per minute) - tpm limits (tokens per minute)
- rpm limits (requests per minute) - rpm limits (requests per minute)
- max parallel requests - max parallel requests
- rpm / tpm limits per model for a given key
<Tabs> <Tabs>
<TabItem value="per-user" label="Per Internal User"> <TabItem value="per-user" label="Per Internal User">
@ -532,6 +534,60 @@ curl --location 'http://0.0.0.0:4000/key/generate' \
} }
``` ```
</TabItem>
<TabItem value="per-key-model" label="Per API Key Per model">
**Set rate limits per model per api key**
Set `model_rpm_limit` and `model_tpm_limit` to set rate limits per model per api key
Here `gpt-4` is the `model_name` set on the [litellm config.yaml](configs.md)
```shell
curl --location 'http://0.0.0.0:4000/key/generate' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{"model_rpm_limit": {"gpt-4": 2}, "model_tpm_limit": {"gpt-4":}}'
```
**Expected Response**
```json
{
"key": "sk-ulGNRXWtv7M0lFnnsQk0wQ",
"expires": "2024-01-18T20:48:44.297973",
}
```
**Verify Model Rate Limits set correctly for this key**
**Make /chat/completions request check if `x-litellm-key-remaining-requests-gpt-4` returned**
```shell
curl -i http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-ulGNRXWtv7M0lFnnsQk0wQ" \
-d '{
"model": "gpt-4",
"messages": [
{"role": "user", "content": "Hello, Claude!ss eho ares"}
]
}'
```
**Expected headers**
```shell
x-litellm-key-remaining-requests-gpt-4: 1
x-litellm-key-remaining-tokens-gpt-4: 179
```
These headers indicate:
- 1 request remaining for the GPT-4 model for key=`sk-ulGNRXWtv7M0lFnnsQk0wQ`
- 179 tokens remaining for the GPT-4 model for key=`sk-ulGNRXWtv7M0lFnnsQk0wQ`
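To read these headers from Python, the OpenAI SDK exposes the raw response - a sketch, assuming the key generated above:
```python
import openai

client = openai.OpenAI(
    api_key="sk-ulGNRXWtv7M0lFnnsQk0wQ",  # key generated above
    base_url="http://localhost:4000",
)

raw = client.chat.completions.with_raw_response.create(
    model="gpt-4",
    messages=[{"role": "user", "content": "Hello, Claude!"}],
)
print(raw.headers.get("x-litellm-key-remaining-requests-gpt-4"))
print(raw.headers.get("x-litellm-key-remaining-tokens-gpt-4"))

completion = raw.parse()  # the usual ChatCompletion object
```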
</TabItem> </TabItem>
<TabItem value="per-end-user" label="For customers"> <TabItem value="per-end-user" label="For customers">

View file

@ -34,6 +34,7 @@ You can then generate keys by hitting the `/key/generate` endpoint.
[**See code**](https://github.com/BerriAI/litellm/blob/7a669a36d2689c7f7890bc9c93e04ff3c2641299/litellm/proxy/proxy_server.py#L672) [**See code**](https://github.com/BerriAI/litellm/blob/7a669a36d2689c7f7890bc9c93e04ff3c2641299/litellm/proxy/proxy_server.py#L672)
## **Quick Start - Generate a Key**
**Step 1: Save postgres db url** **Step 1: Save postgres db url**
```yaml ```yaml
@ -65,7 +66,7 @@ curl 'http://0.0.0.0:4000/key/generate' \
--data-raw '{"models": ["gpt-3.5-turbo", "gpt-4"], "metadata": {"user": "ishaan@berri.ai"}}' --data-raw '{"models": ["gpt-3.5-turbo", "gpt-4"], "metadata": {"user": "ishaan@berri.ai"}}'
``` ```
## Advanced - Spend Tracking ## Spend Tracking
Get spend per: Get spend per:
- key - via `/key/info` [Swagger](https://litellm-api.up.railway.app/#/key%20management/info_key_fn_key_info_get) - key - via `/key/info` [Swagger](https://litellm-api.up.railway.app/#/key%20management/info_key_fn_key_info_get)
@ -223,9 +224,70 @@ Expected Response
</TabItem> </TabItem>
</Tabs> </Tabs>
## Advanced - Model Access ## **Model Access**
### Restrict models by `team_id` ### **Restrict models by Virtual Key**
Set allowed models for a key using the `models` param
```shell
curl 'http://0.0.0.0:4000/key/generate' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{"models": ["gpt-3.5-turbo", "gpt-4"]}'
```
:::info
This key can only make requests to `models` that are `gpt-3.5-turbo` or `gpt-4`
:::
Verify this is set correctly by
<Tabs>
<TabItem label="Allowed Access" value = "allowed">
```shell
curl -i http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"model": "gpt-4",
"messages": [
{"role": "user", "content": "Hello"}
]
}'
```
</TabItem>
<TabItem label="Disallowed Access" value = "not-allowed">
:::info
Expect this to fail since gpt-4o is not in the `models` for the key generated
:::
```shell
curl -i http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"model": "gpt-4o",
"messages": [
{"role": "user", "content": "Hello"}
]
}'
```
</TabItem>
</Tabs>
### **Restrict models by `team_id`**
`litellm-dev` can only access `azure-gpt-3.5` `litellm-dev` can only access `azure-gpt-3.5`
**1. Create a team via `/team/new`** **1. Create a team via `/team/new`**
@ -269,6 +331,157 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
{"error":{"message":"Invalid model for team litellm-dev: BEDROCK_GROUP. Valid models for team are: ['azure-gpt-3.5']\n\n\nTraceback (most recent call last):\n File \"/Users/ishaanjaffer/Github/litellm/litellm/proxy/proxy_server.py\", line 2298, in chat_completion\n _is_valid_team_configs(\n File \"/Users/ishaanjaffer/Github/litellm/litellm/proxy/utils.py\", line 1296, in _is_valid_team_configs\n raise Exception(\nException: Invalid model for team litellm-dev: BEDROCK_GROUP. Valid models for team are: ['azure-gpt-3.5']\n\n","type":"None","param":"None","code":500}}% {"error":{"message":"Invalid model for team litellm-dev: BEDROCK_GROUP. Valid models for team are: ['azure-gpt-3.5']\n\n\nTraceback (most recent call last):\n File \"/Users/ishaanjaffer/Github/litellm/litellm/proxy/proxy_server.py\", line 2298, in chat_completion\n _is_valid_team_configs(\n File \"/Users/ishaanjaffer/Github/litellm/litellm/proxy/utils.py\", line 1296, in _is_valid_team_configs\n raise Exception(\nException: Invalid model for team litellm-dev: BEDROCK_GROUP. Valid models for team are: ['azure-gpt-3.5']\n\n","type":"None","param":"None","code":500}}%
``` ```
### **Grant Access to new model (Access Groups)**
Use model access groups to give users access to select models, and add new ones to it over time (e.g. mistral, llama-2, etc.)
**Step 1. Assign model, access group in config.yaml**
```yaml
model_list:
- model_name: gpt-4
litellm_params:
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
model_info:
access_groups: ["beta-models"] # 👈 Model Access Group
- model_name: fireworks-llama-v3-70b-instruct
litellm_params:
model: fireworks_ai/accounts/fireworks/models/llama-v3-70b-instruct
api_key: "os.environ/FIREWORKS"
model_info:
access_groups: ["beta-models"] # 👈 Model Access Group
```
<Tabs>
<TabItem value="key" label="Key Access Groups">
**Create key with access group**
```bash
curl --location 'http://localhost:4000/key/generate' \
-H 'Authorization: Bearer <your-master-key>' \
-H 'Content-Type: application/json' \
-d '{"models": ["beta-models"], # 👈 Model Access Group
"max_budget": 0,}'
```
Test Key
<Tabs>
<TabItem label="Allowed Access" value = "allowed">
```shell
curl -i http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-<key-from-previous-step>" \
-d '{
"model": "gpt-4",
"messages": [
{"role": "user", "content": "Hello"}
]
}'
```
</TabItem>
<TabItem label="Disallowed Access" value = "not-allowed">
:::info
Expect this to fail since gpt-4o is not in the `beta-models` access group
:::
```shell
curl -i http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-<key-from-previous-step>" \
-d '{
"model": "gpt-4o",
"messages": [
{"role": "user", "content": "Hello"}
]
}'
```
</TabItem>
</Tabs>
</TabItem>
<TabItem value="team" label="Team Access Groups">
Create Team
```shell
curl --location 'http://localhost:4000/team/new' \
-H 'Authorization: Bearer sk-<key-from-previous-step>' \
-H 'Content-Type: application/json' \
-d '{"models": ["beta-models"]}'
```
Create Key for Team
```shell
curl --location 'http://0.0.0.0:4000/key/generate' \
--header 'Authorization: Bearer sk-<key-from-previous-step>' \
--header 'Content-Type: application/json' \
--data '{"team_id": "0ac97648-c194-4c90-8cd6-40af7b0d2d2a"}
```
Test Key
<Tabs>
<TabItem label="Allowed Access" value = "allowed">
```shell
curl -i http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-<key-from-previous-step>" \
-d '{
"model": "gpt-4",
"messages": [
{"role": "user", "content": "Hello"}
]
}'
```
</TabItem>
<TabItem label="Disallowed Access" value = "not-allowed">
:::info
Expect this to fail since gpt-4o is not in the `beta-models` access group
:::
```shell
curl -i http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-<key-from-previous-step>" \
-d '{
"model": "gpt-4o",
"messages": [
{"role": "user", "content": "Hello"}
]
}'
```
</TabItem>
</Tabs>
</TabItem>
</Tabs>
### Model Aliases ### Model Aliases
If a user is expected to use a given model (i.e. gpt3-5), and you want to: If a user is expected to use a given model (i.e. gpt3-5), and you want to:
@ -319,35 +532,73 @@ curl -X POST "https://0.0.0.0:4000/key/generate" \
- **How are routing between diff keys/api bases done?** litellm handles this by shuffling between different models in the model list with the same model_name. [**See Code**](https://github.com/BerriAI/litellm/blob/main/litellm/router.py) - **How are routing between diff keys/api bases done?** litellm handles this by shuffling between different models in the model list with the same model_name. [**See Code**](https://github.com/BerriAI/litellm/blob/main/litellm/router.py)
### Grant Access to new model ## Advanced
Use model access groups to give users access to select models, and add new ones to it over time (e.g. mistral, llama-2, etc.) ### Pass LiteLLM Key in custom header
**Step 1. Assign model, access group in config.yaml** Use this to make LiteLLM proxy look for the virtual key in a custom header instead of the default `"Authorization"` header
**Step 1** Define `litellm_key_header_name` name on litellm config.yaml
```yaml ```yaml
model_list: model_list:
- model_name: text-embedding-ada-002 - model_name: fake-openai-endpoint
litellm_params: litellm_params:
model: azure/azure-embedding-model model: openai/fake
api_base: "os.environ/AZURE_API_BASE" api_key: fake-key
api_key: "os.environ/AZURE_API_KEY" api_base: https://exampleopenaiendpoint-production.up.railway.app/
api_version: "2023-07-01-preview"
model_info: general_settings:
access_groups: ["beta-models"] # 👈 Model Access Group master_key: sk-1234
litellm_key_header_name: "X-Litellm-Key" # 👈 Key Change
``` ```
**Step 2. Create key with access group** **Step 2** Test it
```bash In this request, litellm will use the Virtual key in the `X-Litellm-Key` header
curl --location 'http://localhost:4000/key/generate' \
-H 'Authorization: Bearer <your-master-key>' \ <Tabs>
-H 'Content-Type: application/json' \ <TabItem value="curl" label="curl">
-d '{"models": ["beta-models"], # 👈 Model Access Group
"max_budget": 0,}' ```shell
curl http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "X-Litellm-Key: Bearer sk-1234" \
-H "Authorization: Bearer bad-key" \
-d '{
"model": "fake-openai-endpoint",
"messages": [
{"role": "user", "content": "Hello, Claude gm!"}
]
}'
``` ```
## Advanced - Custom Auth **Expected Response**
Expect to see a successful response from the litellm proxy since the key passed in `X-Litellm-Key` is valid
```shell
{"id":"chatcmpl-f9b2b79a7c30477ab93cd0e717d1773e","choices":[{"finish_reason":"stop","index":0,"message":{"content":"\n\nHello there, how may I assist you today?","role":"assistant","tool_calls":null,"function_call":null}}],"created":1677652288,"model":"gpt-3.5-turbo-0125","object":"chat.completion","system_fingerprint":"fp_44709d6fcb","usage":{"completion_tokens":12,"prompt_tokens":9,"total_tokens":21}
```
</TabItem>
<TabItem value="python" label="OpenAI Python SDK">
```python
import openai

client = openai.OpenAI(
api_key="not-used",
base_url="https://api-gateway-url.com/llmservc/api/litellmp",
default_headers={
"Authorization": f"Bearer {API_GATEWAY_TOKEN}", # (optional) For your API Gateway
"X-Litellm-Key": f"Bearer sk-1234" # For LiteLLM Proxy
}
)
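
# then call the proxy as usual - LiteLLM reads the virtual key from the "X-Litellm-Key" header
response = client.chat.completions.create(
    model="fake-openai-endpoint",
    messages=[{"role": "user", "content": "Hello, Claude gm!"}],
)
print(response)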
```
</TabItem>
</Tabs>
### Custom Auth
You can now override the default api key auth. You can now override the default api key auth.
@ -486,7 +737,7 @@ general_settings:
``` ```
### Upperbound /key/generate params

Use this if you need to set upper bounds for `max_budget`, `budget_duration`, or any `/key/generate` param per key.

Set `litellm_settings:upperbound_key_generate_params`:
@ -502,7 +753,7 @@ litellm_settings:
- Send a `/key/generate` request with `max_budget=200`
- Key will be created with `max_budget=100`, since 100 is the upper bound (see the sketch below)
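As a minimal sketch of that flow, assuming the proxy runs on `http://localhost:4000`, the master key is `sk-1234`, and `upperbound_key_generate_params` caps `max_budget` at 100 as described above:

```python
import requests

# Ask for a budget above the configured upper bound
resp = requests.post(
    "http://localhost:4000/key/generate",
    headers={
        "Authorization": "Bearer sk-1234",  # master key (assumed for this sketch)
        "Content-Type": "application/json",
    },
    json={"max_budget": 200},
)
resp.raise_for_status()

# Per the behaviour described above, the created key is capped at max_budget=100
print(resp.json())
```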
### Default /key/generate params

Use this if you need to control the default `max_budget` or any `/key/generate` param per key.

When a `/key/generate` request does not specify `max_budget`, it will use the `max_budget` specified in `default_key_generate_params`.
@ -518,7 +769,11 @@ litellm_settings:
    team_id: "core-infra"
```
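A quick sketch of that behaviour, under the same local-proxy assumptions as above (hypothetical master key `sk-1234`); the request omits `max_budget`, so the proxy fills in the configured defaults:

```python
import requests

# No max_budget / team_id in the request body - the proxy applies
# litellm_settings.default_key_generate_params
resp = requests.post(
    "http://localhost:4000/key/generate",
    headers={"Authorization": "Bearer sk-1234"},  # master key (assumed)
    json={},
)
print(resp.json())  # inspect the returned key's budget / team settings
```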
## **Next Steps - Set Budgets, Rate Limits per Virtual Key**
[Follow this doc to set budgets, rate limiters per virtual key with LiteLLM](users)
## Endpoint Reference (Spec)
### Keys

@ -1,7 +1,7 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# [OLD PROXY 👉 [NEW proxy here](./simple_proxy)] Local LiteLLM Proxy Server

A fast and lightweight OpenAI-compatible server to call 100+ LLM APIs.

@ -14,7 +14,7 @@ In production, litellm supports using Redis as a way to track cooldown server an
:::info

If you want a server to load balance across different LLM APIs, use our [LiteLLM Proxy Server](./proxy/load_balancing.md)

:::
@ -88,8 +88,8 @@ print(response)
### Available Endpoints
- `router.completion()` - chat completions endpoint to call 100+ LLMs (see the sketch after this list)
- `router.acompletion()` - async chat completion calls
- `router.embedding()` - embedding endpoint for Azure, OpenAI, Huggingface endpoints
- `router.aembedding()` - async embeddings calls
- `router.text_completion()` - completion calls in the old OpenAI `/v1/completions` endpoint format
- `router.atext_completion()` - async text completion calls
- `router.image_generation()` - completion calls in OpenAI `/v1/images/generations` endpoint format
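To make the list concrete, here is a minimal sketch that wires two deployments behind one `model_name` and calls a couple of the endpoints above; the model names, keys, and the Azure deployment are placeholders:

```python
import asyncio
import os

from litellm import Router

model_list = [
    {
        "model_name": "gpt-3.5-turbo",  # alias used by callers
        "litellm_params": {
            "model": "gpt-3.5-turbo",
            "api_key": os.environ["OPENAI_API_KEY"],
        },
    },
    {
        "model_name": "gpt-3.5-turbo",  # second deployment behind the same alias
        "litellm_params": {
            "model": "azure/<your-deployment-name>",  # placeholder deployment
            "api_key": os.environ["AZURE_API_KEY"],
            "api_base": os.environ["AZURE_API_BASE"],
            "api_version": "2023-07-01-preview",
        },
    },
]

router = Router(model_list=model_list)

# sync chat completion - the router picks one of the two deployments
response = router.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
print(response)


# async chat completion
async def main():
    return await router.acompletion(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "Hey!"}],
    )


print(asyncio.run(main()))
```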
@ -1637,7 +1637,7 @@ response = router.completion(
## Deploy Router

If you want a server to load balance across different LLM APIs, use our [LiteLLM Proxy Server](./simple_proxy#load-balancing---multiple-instances-of-1-model)

## Init Params for the litellm.Router

@ -41,7 +41,7 @@ router = Router(
)

try:
    _response = await router.acompletion( # 👈 ADDS TO QUEUE + POLLS + MAKES CALL
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "Hey!"}],
        priority=0,  # 👈 LOWER IS BETTER
@ -52,13 +52,13 @@ except Exception as e:
## LiteLLM Proxy

To prioritize requests on LiteLLM Proxy, add `priority` to the request.

<Tabs>
<TabItem value="curl" label="curl">
```curl
curl -X POST 'http://localhost:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
@ -128,7 +128,7 @@ router = Router(
)

try:
    _response = await router.acompletion( # 👈 ADDS TO QUEUE + POLLS + MAKES CALL
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "Hey!"}],
        priority=0,  # 👈 LOWER IS BETTER
@ -147,6 +147,9 @@ model_list:
      mock_response: "hello world!"
      api_key: my-good-key

litellm_settings:
  request_timeout: 600 # 👈 Will keep retrying until timeout occurs

router_settings:
  redis_host: os.environ/REDIS_HOST
  redis_password: os.environ/REDIS_PASSWORD

@ -0,0 +1,65 @@
# Custom Pricing - SageMaker, Azure, etc.

Register custom pricing for a SageMaker completion model.

For cost-per-second pricing, you **just** need to register `input_cost_per_second`.
```python
# !pip install boto3
import os

from litellm import completion, completion_cost

os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""


def test_completion_sagemaker():
    try:
        print("testing sagemaker")
        response = completion(
            model="sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4",
            messages=[{"role": "user", "content": "Hey, how's it going?"}],
            input_cost_per_second=0.000420,
        )
        # Add any assertions here to check the response
        print(response)
        cost = completion_cost(completion_response=response)
        print(cost)
    except Exception as e:
        raise Exception(f"Error occurred: {e}")
```
## Cost Per Token (e.g. Azure)
```python
import os

from litellm import completion, completion_cost

## set ENV variables
os.environ["AZURE_API_KEY"] = ""
os.environ["AZURE_API_BASE"] = ""
os.environ["AZURE_API_VERSION"] = ""


def test_completion_azure_model():
    try:
        print("testing azure custom pricing")
        # azure call
        response = completion(
            model="azure/<your_deployment_name>",
            messages=[{"content": "Hello, how are you?", "role": "user"}],
            input_cost_per_token=0.005,
            output_cost_per_token=1,
        )
        # Add any assertions here to check the response
        print(response)
        cost = completion_cost(completion_response=response)
        print(cost)
    except Exception as e:
        raise Exception(f"Error occurred: {e}")


test_completion_azure_model()
```
