Merge branch 'main' into litellm_faster_api_key_checking

Commit c022568a3a by Krish Dholakia, 2024-03-09 18:45:03 -08:00 (committed by GitHub)
GPG key ID: B5690EEEBB952194 (no known key found for this signature in database)
69 changed files with 2330 additions and 553 deletions


@ -46,6 +46,7 @@ jobs:
pip install "apscheduler==3.10.4"
pip install "PyGithub==1.59.1"
pip install argon2-cffi
pip install python-multipart
- save_cache:
paths:
- ./venv


@ -146,9 +146,29 @@ jobs:
} catch (error) {
core.setFailed(error.message);
}
- name: Fetch Release Notes
id: release-notes
uses: actions/github-script@v6
with:
github-token: "${{ secrets.GITHUB_TOKEN }}"
script: |
try {
const response = await github.rest.repos.getRelease({
owner: context.repo.owner,
repo: context.repo.repo,
release_id: process.env.RELEASE_ID,
});
return response.data.body;
} catch (error) {
core.setFailed(error.message);
}
env:
RELEASE_ID: ${{ env.RELEASE_ID }}
- name: Github Releases To Discord
env:
WEBHOOK_URL: ${{ secrets.WEBHOOK_URL }}
RELEASE_TAG: ${{ env.RELEASE_TAG }}
RELEASE_NOTES: ${{ steps.release-notes.outputs.result }}
run: |
curl -H "Content-Type: application/json" -X POST -d '{
"content": "||@everyone||",
@ -156,8 +176,8 @@ jobs:
"avatar_url": "https://cdn.discordapp.com/avatars/487431320314576937/bd64361e4ba6313d561d54e78c9e7171.png",
"embeds": [
{
"title": "Changelog",
"description": "This is the changelog for the latest release.",
"title": "Changelog for ${RELEASE_TAG}",
"description": "${RELEASE_NOTES}",
"color": 2105893
}
]

.gitignore

@ -44,3 +44,4 @@ deploy/charts/litellm/*.tgz
deploy/charts/litellm/charts/*
deploy/charts/*.tgz
litellm/proxy/vertex_key.json
**/.vim/


@ -143,13 +143,13 @@ pip install 'litellm[proxy]'
```shell
$ litellm --model huggingface/bigcode/starcoder
#INFO: Proxy running on http://0.0.0.0:8000
#INFO: Proxy running on http://0.0.0.0:4000
```
### Step 2: Make ChatCompletions Request to Proxy
```python
import openai # openai v1.0.0+
client = openai.OpenAI(api_key="anything",base_url="http://0.0.0.0:8000") # set proxy to base_url
client = openai.OpenAI(api_key="anything",base_url="http://0.0.0.0:4000") # set proxy to base_url
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(model="gpt-3.5-turbo", messages = [
{
@ -170,7 +170,7 @@ Set budgets and rate limits across multiple projects
### Request
```shell
curl 'http://0.0.0.0:8000/key/generate' \
curl 'http://0.0.0.0:4000/key/generate' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data-raw '{"models": ["gpt-3.5-turbo", "gpt-4", "claude-2"], "duration": "20m","metadata": {"user": "ishaan@berri.ai", "team": "core-infra"}}'


@ -1,6 +1,9 @@
dependencies:
- name: postgresql
repository: oci://registry-1.docker.io/bitnamicharts
version: 13.3.1
digest: sha256:f5c129150f0d38dd06752ab37f3c8e143d7c14d30379af058767bcd9f4ba83dd
generated: "2024-01-19T11:32:56.694808861+11:00"
version: 14.3.1
- name: redis
repository: oci://registry-1.docker.io/bitnamicharts
version: 18.19.1
digest: sha256:8660fe6287f9941d08c0902f3f13731079b8cecd2a5da2fbc54e5b7aae4a6f62
generated: "2024-03-10T02:28:52.275022+05:30"


@ -31,3 +31,7 @@ dependencies:
version: ">=13.3.0"
repository: oci://registry-1.docker.io/bitnamicharts
condition: db.deployStandalone
- name: redis
version: ">=18.0.0"
repository: oci://registry-1.docker.io/bitnamicharts
condition: redis.enabled


@ -28,7 +28,7 @@ If `db.useStackgresOperator` is used (not yet implemented):
| `imagePullSecrets` | Registry credentials for the LiteLLM and initContainer images. | `[]` |
| `serviceAccount.create` | Whether or not to create a Kubernetes Service Account for this deployment. The default is `false` because LiteLLM has no need to access the Kubernetes API. | `false` |
| `service.type` | Kubernetes Service type (e.g. `LoadBalancer`, `ClusterIP`, etc.) | `ClusterIP` |
| `service.port` | TCP port that the Kubernetes Service will listen on. Also the TCP port within the Pod that the proxy will listen on. | `8000` |
| `service.port` | TCP port that the Kubernetes Service will listen on. Also the TCP port within the Pod that the proxy will listen on. | `4000` |
| `ingress.*` | See [values.yaml](./values.yaml) for example settings | N/A |
| `proxy_config.*` | See [values.yaml](./values.yaml) for default settings. See [example_config_yaml](../../../litellm/proxy/example_config_yaml/) for configuration examples. | N/A |
@ -76,7 +76,7 @@ When browsing to the URL published per the settings in `ingress.*`, you will
be prompted for **Admin Configuration**. The **Proxy Endpoint** is the internal
(from the `litellm` pod's perspective) URL published by the `<RELEASE>-litellm`
Kubernetes Service. If the deployment uses the default settings for this
service, the **Proxy Endpoint** should be set to `http://<RELEASE>-litellm:8000`.
service, the **Proxy Endpoint** should be set to `http://<RELEASE>-litellm:4000`.
The **Proxy Key** is the value specified for `masterkey` or, if a `masterkey`
was not provided to the helm command line, the `masterkey` is a randomly


@ -60,3 +60,25 @@ Create the name of the service account to use
{{- default "default" .Values.serviceAccount.name }}
{{- end }}
{{- end }}
{{/*
Get redis service name
*/}}
{{- define "litellm.redis.serviceName" -}}
{{- if and (eq .Values.redis.architecture "standalone") .Values.redis.sentinel.enabled -}}
{{- printf "%s-%s" .Release.Name (default "redis" .Values.redis.nameOverride | trunc 63 | trimSuffix "-") -}}
{{- else -}}
{{- printf "%s-%s-master" .Release.Name (default "redis" .Values.redis.nameOverride | trunc 63 | trimSuffix "-") -}}
{{- end -}}
{{- end -}}
{{/*
Get redis service port
*/}}
{{- define "litellm.redis.port" -}}
{{- if .Values.redis.sentinel.enabled -}}
{{ .Values.redis.sentinel.service.ports.sentinel }}
{{- else -}}
{{ .Values.redis.master.service.ports.redis }}
{{- end -}}
{{- end -}}


@ -142,6 +142,17 @@ spec:
secretKeyRef:
name: {{ include "litellm.fullname" . }}-masterkey
key: masterkey
{{- if .Values.redis.enabled }}
- name: REDIS_HOST
value: {{ include "litellm.redis.serviceName" . }}
- name: REDIS_PORT
value: {{ include "litellm.redis.port" . | quote }}
- name: REDIS_PASSWORD
valueFrom:
secretKeyRef:
name: {{ include "redis.secretName" .Subcharts.redis }}
key: {{include "redis.secretPasswordKey" .Subcharts.redis }}
{{- end }}
envFrom:
{{- range .Values.environmentSecrets }}
- secretRef:


@ -55,7 +55,7 @@ environmentSecrets: []
service:
type: ClusterIP
port: 8000
port: 4000
ingress:
enabled: false
@ -87,6 +87,8 @@ proxy_config:
api_key: eXaMpLeOnLy
general_settings:
master_key: os.environ/PROXY_MASTER_KEY
# litellm_settings:
# cache: true
resources: {}
# We usually recommend not to specify default resources and to leave this as a conscious
@ -166,3 +168,10 @@ postgresql:
# existingSecret: ""
# secretKeys:
# userPasswordKey: password
# requires cache: true in config file
# either enable this or pass a secret for REDIS_HOST, REDIS_PORT, REDIS_PASSWORD or REDIS_URL
# with cache: true to use existing redis instance
redis:
enabled: false
architecture: standalone


@ -0,0 +1,85 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Audio Transcription
Use this to load balance across Azure + OpenAI.
## Quick Start
```python
from litellm import transcription
import os
# set api keys
os.environ["OPENAI_API_KEY"] = ""
audio_file = open("/path/to/audio.mp3", "rb")
response = transcription(model="whisper", file=audio_file)
print(f"response: {response}")
```
## Proxy Usage
### Add model to config
<Tabs>
<TabItem value="openai" label="OpenAI">
```yaml
model_list:
- model_name: whisper
litellm_params:
model: whisper-1
api_key: os.environ/OPENAI_API_KEY
model_info:
mode: audio_transcription
general_settings:
master_key: sk-1234
```
</TabItem>
<TabItem value="openai+azure" label="OpenAI + Azure">
```yaml
model_list:
- model_name: whisper
litellm_params:
model: whisper-1
api_key: os.environ/OPENAI_API_KEY
model_info:
mode: audio_transcription
- model_name: whisper
litellm_params:
model: azure/azure-whisper
api_version: 2024-02-15-preview
api_base: os.environ/AZURE_EUROPE_API_BASE
api_key: os.environ/AZURE_EUROPE_API_KEY
model_info:
mode: audio_transcription
general_settings:
master_key: sk-1234
```
</TabItem>
</Tabs>
### Start proxy
```bash
litellm --config /path/to/config.yaml
# RUNNING on http://0.0.0.0:8000
```
### Test
```bash
curl --location 'http://0.0.0.0:4000/v1/audio/transcriptions' \
--header 'Authorization: Bearer sk-1234' \
--form 'file=@"/Users/krrishdholakia/Downloads/gettysburg.wav"' \
--form 'model="whisper"'
```
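For reference, here is a rough Python equivalent of the curl test above using the OpenAI SDK pointed at the proxy; the port, master key, and local file path are assumptions based on the config shown earlier.
```python
# Sketch: call the proxy's /v1/audio/transcriptions route with the OpenAI SDK.
from openai import OpenAI

client = OpenAI(
    api_key="sk-1234",               # proxy master key from the example config
    base_url="http://0.0.0.0:4000",  # litellm proxy
)

with open("gettysburg.wav", "rb") as audio_file:  # example audio file path
    transcript = client.audio.transcriptions.create(
        model="whisper",             # model_name defined in the proxy config
        file=audio_file,
    )

print(transcript.text)
```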


@ -24,6 +24,17 @@ print(response)
```
### Translated OpenAI params
Use this function to get an up-to-date list of supported openai params for any model + provider.
```python
from litellm import get_supported_openai_params
response = get_supported_openai_params(model="anthropic.claude-3", custom_llm_provider="bedrock")
print(response) # ["max_tokens", "tools", "tool_choice", "stream"]
```
This is a list of openai params we translate across providers.
This list is constantly being updated.
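As an illustrative (not official) pattern, the returned list can be used to filter request params before calling a provider; the param set below is just an example.
```python
from litellm import get_supported_openai_params

# Example request params - "logprobs" may not be supported by every provider
requested_params = {"max_tokens": 100, "stream": True, "logprobs": True}

supported = get_supported_openai_params(
    model="anthropic.claude-3", custom_llm_provider="bedrock"
)

# Keep only the params this model + provider combination understands
safe_params = {k: v for k, v in requested_params.items() if k in supported}
print(safe_params)  # {"max_tokens": 100, "stream": True}
```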


@ -35,7 +35,7 @@ general_settings:
```bash
litellm --config /path/to/config.yaml
# RUNNING on http://0.0.0.0:8000
# RUNNING on http://0.0.0.0:4000
```
### Test
@ -44,7 +44,7 @@ litellm --config /path/to/config.yaml
<TabItem value="curl" label="Curl">
```bash
curl --location 'http://0.0.0.0:8000/embeddings' \
curl --location 'http://0.0.0.0:4000/embeddings' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{"input": ["Academia.edu uses"], "model": "textembedding-gecko", "encoding_format": "base64"}'
@ -57,7 +57,7 @@ curl --location 'http://0.0.0.0:8000/embeddings' \
from openai import OpenAI
client = OpenAI(
api_key="sk-1234",
base_url="http://0.0.0.0:8000"
base_url="http://0.0.0.0:4000"
)
client.embeddings.create(
@ -72,7 +72,7 @@ client.embeddings.create(
```python
from langchain_openai import OpenAIEmbeddings
embeddings = OpenAIEmbeddings(model="textembedding-gecko", openai_api_base="http://0.0.0.0:8000", openai_api_key="sk-1234")
embeddings = OpenAIEmbeddings(model="textembedding-gecko", openai_api_base="http://0.0.0.0:4000", openai_api_key="sk-1234")
text = "This is a test document."
@ -200,7 +200,7 @@ Use this for calling `/embedding` endpoints on OpenAI Compatible Servers, exampl
from litellm import embedding
response = embedding(
model = "openai/<your-llm-name>", # add `openai/` prefix to model so litellm knows to route to OpenAI
api_base="http://0.0.0.0:8000/" # set API Base of your Custom OpenAI Endpoint
api_base="http://0.0.0.0:4000/" # set API Base of your Custom OpenAI Endpoint
input=["good morning from litellm"]
)
```


@ -13,7 +13,14 @@ https://github.com/BerriAI/litellm
- Retry/fallback logic across multiple deployments (e.g. Azure/OpenAI) - [Router](https://docs.litellm.ai/docs/routing)
- Track spend & set budgets per project [OpenAI Proxy Server](https://docs.litellm.ai/docs/simple_proxy)
## Basic usage
## How to use LiteLLM
You can use litellm through either:
1. [OpenAI proxy Server](#openai-proxy) - Server to call 100+ LLMs, load balance, cost tracking across projects
2. [LiteLLM python SDK](#basic-usage) - Python Client to call 100+ LLMs, load balance, cost tracking
## LiteLLM Python SDK
### Basic usage
<a target="_blank" href="https://colab.research.google.com/github/BerriAI/litellm/blob/main/cookbook/liteLLM_Getting_Started.ipynb">
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>
@ -144,7 +151,7 @@ response = completion(
</Tabs>
## Streaming
### Streaming
Set `stream=True` in the `completion` args.
<Tabs>
<TabItem value="openai" label="OpenAI">
@ -276,7 +283,7 @@ response = completion(
</Tabs>
## Exception handling
### Exception handling
LiteLLM maps exceptions across all supported providers to the OpenAI exceptions. All our exceptions inherit from OpenAI's exception types, so any error-handling you have for that, should work out of the box with LiteLLM.
@ -292,7 +299,7 @@ except OpenAIError as e:
print(e)
```
## Logging Observability - Log LLM Input/Output ([Docs](https://docs.litellm.ai/docs/observability/callbacks))
### Logging Observability - Log LLM Input/Output ([Docs](https://docs.litellm.ai/docs/observability/callbacks))
LiteLLM exposes pre-defined callbacks to send data to Langfuse, LLMonitor, Helicone, Promptlayer, Traceloop, Slack
```python
from litellm import completion
@ -311,7 +318,7 @@ litellm.success_callback = ["langfuse", "llmonitor"] # log input/output to langf
response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}])
```
## Track Costs, Usage, Latency for streaming
### Track Costs, Usage, Latency for streaming
Use a callback function for this - more info on custom callbacks: https://docs.litellm.ai/docs/observability/custom_callback
```python
@ -368,13 +375,13 @@ pip install 'litellm[proxy]'
```shell
$ litellm --model huggingface/bigcode/starcoder
#INFO: Proxy running on http://0.0.0.0:8000
#INFO: Proxy running on http://0.0.0.0:4000
```
#### Step 2: Make ChatCompletions Request to Proxy
```python
import openai # openai v1.0.0+
client = openai.OpenAI(api_key="anything",base_url="http://0.0.0.0:8000") # set proxy to base_url
client = openai.OpenAI(api_key="anything",base_url="http://0.0.0.0:4000") # set proxy to base_url
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(model="gpt-3.5-turbo", messages = [
{


@ -1,5 +1,84 @@
import Image from '@theme/IdealImage';
# 🔥 Load Test LiteLLM
## Load Test LiteLLM Proxy - 1500+ req/s
## 1500+ concurrent requests/s
LiteLLM proxy has been load tested to handle 1500+ concurrent req/s
```python
import time, asyncio
from openai import AsyncOpenAI, AsyncAzureOpenAI
import uuid
import traceback

# base_url - litellm proxy endpoint
# api_key - litellm proxy api key (created if the proxy is started with auth)
litellm_client = AsyncOpenAI(base_url="http://0.0.0.0:4000", api_key="sk-1234")


async def litellm_completion():
    # send a single chat completion request through the proxy
    try:
        response = await litellm_client.chat.completions.create(
            model="azure-gpt-3.5",
            messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
        )
        print(response)
        return response

    except Exception as e:
        # If there's an exception, log the error message
        with open("error_log.txt", "a") as error_log:
            error_log.write(f"Error during completion: {str(e)}\n")
        pass


async def main():
    for i in range(1):
        start = time.time()
        n = 1500  # Number of concurrent tasks
        tasks = [litellm_completion() for _ in range(n)]

        chat_completions = await asyncio.gather(*tasks)
        successful_completions = [c for c in chat_completions if c is not None]

        # Write errors to error_log.txt
        with open("error_log.txt", "a") as error_log:
            for completion in chat_completions:
                if isinstance(completion, str):
                    error_log.write(completion + "\n")

        print(n, time.time() - start, len(successful_completions))
        time.sleep(10)


if __name__ == "__main__":
    # Blank out contents of error_log.txt
    open("error_log.txt", "w").close()

    asyncio.run(main())
```
### Throughput - 30% Increase
LiteLLM proxy + Load Balancer gives **30% increase** in throughput compared to Raw OpenAI API
<Image img={require('../img/throughput.png')} />
### Latency Added - 0.00325 seconds
LiteLLM proxy adds **0.00325 seconds** latency as compared to using the Raw OpenAI API
<Image img={require('../img/latency.png')} />
### Testing LiteLLM Proxy with Locust
- 1 LiteLLM container can handle ~140 requests/second with 0.4 failures
<Image img={require('../img/locust.png')} />
## Load Test LiteLLM SDK vs OpenAI
Here is a script to load test LiteLLM vs OpenAI
```python
@ -11,7 +90,7 @@ import time, asyncio, litellm
#### LITELLM PROXY ####
litellm_client = AsyncOpenAI(
api_key="sk-1234", # [CHANGE THIS]
base_url="http://0.0.0.0:8000"
base_url="http://0.0.0.0:4000"
)
#### AZURE OPENAI CLIENT ####
@ -85,3 +164,4 @@ async def loadtest_fn():
asyncio.run(loadtest_fn())
```


@ -63,7 +63,7 @@ export ANTHROPIC_API_KEY="your-api-key"
```bash
$ litellm --model claude-3-opus-20240229
# Server running on http://0.0.0.0:8000
# Server running on http://0.0.0.0:4000
```
### 3. Test it
@ -73,7 +73,7 @@ $ litellm --model claude-3-opus-20240229
<TabItem value="Curl" label="Curl Request">
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "gpt-3.5-turbo",
@ -93,7 +93,7 @@ curl --location 'http://0.0.0.0:8000/chat/completions' \
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:8000"
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
@ -120,7 +120,7 @@ from langchain.prompts.chat import (
from langchain.schema import HumanMessage, SystemMessage
chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:8000", # set openai_api_base to the LiteLLM Proxy
openai_api_base="http://0.0.0.0:4000", # set openai_api_base to the LiteLLM Proxy
model = "gpt-3.5-turbo",
temperature=0.1
)


@ -54,7 +54,7 @@ export AWS_REGION_NAME=""
```bash
$ litellm --model anthropic.claude-3-sonnet-20240229-v1:0
# Server running on http://0.0.0.0:8000
# Server running on http://0.0.0.0:4000
```
### 3. Test it
@ -64,7 +64,7 @@ $ litellm --model anthropic.claude-3-sonnet-20240229-v1:0
<TabItem value="Curl" label="Curl Request">
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "gpt-3.5-turbo",
@ -84,7 +84,7 @@ curl --location 'http://0.0.0.0:8000/chat/completions' \
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:8000"
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
@ -111,7 +111,7 @@ from langchain.prompts.chat import (
from langchain.schema import HumanMessage, SystemMessage
chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:8000", # set openai_api_base to the LiteLLM Proxy
openai_api_base="http://0.0.0.0:4000", # set openai_api_base to the LiteLLM Proxy
model = "gpt-3.5-turbo",
temperature=0.1
)


@ -5,6 +5,12 @@ LiteLLM supports all models from [Ollama](https://github.com/jmorganca/ollama)
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>
:::info
We recommend using [ollama_chat](#using-ollama-apichat) for better responses.
:::
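As a quick reference, a minimal SDK sketch of the `ollama_chat` route (this assumes a local Ollama server on its default `http://localhost:11434` and that a `llama2` model has already been pulled):
```python
from litellm import completion

response = completion(
    model="ollama_chat/llama2",         # routes to Ollama's /api/chat endpoint
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
    api_base="http://localhost:11434",  # default Ollama server address
)
print(response.choices[0].message.content)
```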
## Pre-requisites
Ensure you have your ollama server running
@ -177,7 +183,7 @@ On the docker container run the `test.py` file using `python3 test.py`
```python
import openai
api_base = f"http://0.0.0.0:8000" # base url for server
api_base = f"http://0.0.0.0:4000" # base url for server
openai.api_base = api_base
openai.api_key = "temp-key"


@ -15,7 +15,7 @@ import os
response = litellm.completion(
model="openai/mistral, # add `openai/` prefix to model so litellm knows to route to OpenAI
api_key="sk-1234", # api key to your openai compatible endpoint
api_base="http://0.0.0.0:8000", # set API Base of your Custom OpenAI Endpoint
api_base="http://0.0.0.0:4000", # set API Base of your Custom OpenAI Endpoint
messages=[
{
"role": "user",
@ -35,7 +35,7 @@ import os
response = litellm.embedding(
model="openai/GPT-J", # add `openai/` prefix to model so litellm knows to route to OpenAI
api_key="sk-1234", # api key to your openai compatible endpoint
api_base="http://0.0.0.0:8000", # set API Base of your Custom OpenAI Endpoint
api_base="http://0.0.0.0:4000", # set API Base of your Custom OpenAI Endpoint
input=["good morning from litellm"]
)
print(response)


@ -145,7 +145,7 @@ $ litellm --config /path/to/config.yaml
Send the same request twice:
```shell
curl http://0.0.0.0:8000/v1/chat/completions \
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "gpt-3.5-turbo",
@ -153,7 +153,7 @@ curl http://0.0.0.0:8000/v1/chat/completions \
"temperature": 0.7
}'
curl http://0.0.0.0:8000/v1/chat/completions \
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "gpt-3.5-turbo",
@ -166,14 +166,14 @@ curl http://0.0.0.0:8000/v1/chat/completions \
Send the same request twice:
```shell
curl --location 'http://0.0.0.0:8000/embeddings' \
curl --location 'http://0.0.0.0:4000/embeddings' \
--header 'Content-Type: application/json' \
--data ' {
"model": "text-embedding-ada-002",
"input": ["write a litellm poem"]
}'
curl --location 'http://0.0.0.0:8000/embeddings' \
curl --location 'http://0.0.0.0:4000/embeddings' \
--header 'Content-Type: application/json' \
--data ' {
"model": "text-embedding-ada-002",
@ -227,7 +227,7 @@ from openai import OpenAI
client = OpenAI(
# This is the default and can be omitted
api_key=os.environ.get("OPENAI_API_KEY"),
base_url="http://0.0.0.0:8000"
base_url="http://0.0.0.0:4000"
)
chat_completion = client.chat.completions.create(
@ -255,7 +255,7 @@ from openai import OpenAI
client = OpenAI(
# This is the default and can be omitted
api_key=os.environ.get("OPENAI_API_KEY"),
base_url="http://0.0.0.0:8000"
base_url="http://0.0.0.0:4000"
)
chat_completion = client.chat.completions.create(
@ -281,7 +281,7 @@ from openai import OpenAI
client = OpenAI(
# This is the default and can be omitted
api_key=os.environ.get("OPENAI_API_KEY"),
base_url="http://0.0.0.0:8000"
base_url="http://0.0.0.0:4000"
)
chat_completion = client.chat.completions.create(


@ -63,7 +63,7 @@ litellm_settings:
$ litellm /path/to/config.yaml
```
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--data ' {
"model": "gpt-3.5-turbo",
"messages": [
@ -162,7 +162,7 @@ litellm_settings:
$ litellm /path/to/config.yaml
```
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--data ' {
"model": "gpt-3.5-turbo",
"messages": [


@ -15,7 +15,7 @@ Cli arguments, --host, --port, --num_workers
```
## --port
- **Default:** `8000`
- **Default:** `4000`
- The port to bind the server to.
- **Usage:**
```shell


@ -13,7 +13,7 @@ Set model list, `api_base`, `api_key`, `temperature` & proxy server settings (`m
| `general_settings` | Server settings, example setting `master_key: sk-my_special_key` |
| `environment_variables` | Environment Variables example, `REDIS_HOST`, `REDIS_PORT` |
**Complete List:** Check the Swagger UI docs on `<your-proxy-url>/#/config.yaml` (e.g. http://0.0.0.0:8000/#/config.yaml), for everything you can pass in the config.yaml.
**Complete List:** Check the Swagger UI docs on `<your-proxy-url>/#/config.yaml` (e.g. http://0.0.0.0:4000/#/config.yaml), for everything you can pass in the config.yaml.
## Quick Start
@ -49,13 +49,13 @@ model_list:
rpm: 6
- model_name: anthropic-claude
litellm_params:
model="bedrock/anthropic.claude-instant-v1"
model: bedrock/anthropic.claude-instant-v1
### [OPTIONAL] SET AWS REGION ###
aws_region_name="us-east-1"
aws_region_name: us-east-1
- model_name: vllm-models
litellm_params:
model: openai/facebook/opt-125m # the `openai/` prefix tells litellm it's openai compatible
api_base: http://0.0.0.0:8000
api_base: http://0.0.0.0:4000
rpm: 1440
model_info:
version: 2
@ -91,7 +91,7 @@ Sends request to model where `model_name=gpt-3.5-turbo` on config.yaml.
If multiple with `model_name=gpt-3.5-turbo` does [Load Balancing](https://docs.litellm.ai/docs/proxy/load_balancing)
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "gpt-3.5-turbo",
@ -111,7 +111,7 @@ curl --location 'http://0.0.0.0:8000/chat/completions' \
Sends this request to model where `model_name=bedrock-claude-v1` on config.yaml
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "bedrock-claude-v1",
@ -131,7 +131,7 @@ curl --location 'http://0.0.0.0:8000/chat/completions' \
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:8000"
base_url="http://0.0.0.0:4000"
)
# Sends request to model where `model_name=gpt-3.5-turbo` on config.yaml.
@ -179,7 +179,7 @@ messages = [
# Sends request to model where `model_name=gpt-3.5-turbo` on config.yaml.
chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:8000", # set openai base to the proxy
openai_api_base="http://0.0.0.0:4000", # set openai base to the proxy
model = "gpt-3.5-turbo",
temperature=0.1
)
@ -189,7 +189,7 @@ print(response)
# Sends request to model where `model_name=bedrock-claude-v1` on config.yaml.
claude_chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:8000", # set openai base to the proxy
openai_api_base="http://0.0.0.0:4000", # set openai base to the proxy
model = "bedrock-claude-v1",
temperature=0.1
)
@ -248,31 +248,46 @@ $ litellm --config /path/to/config.yaml
Use this to call multiple instances of the same model and configure things like [routing strategy](../routing.md#advanced).
```yaml
router_settings:
routing_strategy: "latency-based-routing" # routes to the fastest deployment in the group
For optimal performance:
- Set `tpm/rpm` per model deployment. Weighted picks are then based on the established tpm/rpm.
- Select your optimal routing strategy in `router_settings:routing_strategy`.
LiteLLM supports
```python
["simple-shuffle", "least-busy", "usage-based-routing","latency-based-routing"], default="simple-shuffle"`
```
When `tpm/rpm` is set + `routing_strategy==simple-shuffle` litellm will use a weighted pick based on set tpm/rpm. **In our load tests setting tpm/rpm for all deployments + `routing_strategy==simple-shuffle` maximized throughput**
- When using multiple LiteLLM Servers / Kubernetes set redis settings `router_settings:redis_host` etc
```yaml
model_list:
- model_name: zephyr-beta
litellm_params:
model: huggingface/HuggingFaceH4/zephyr-7b-beta
api_base: http://0.0.0.0:8001
rpm: 60 # Optional[int]: When rpm/tpm set - litellm uses weighted pick for load balancing. rpm = Rate limit for this deployment: in requests per minute (rpm).
tpm: 1000 # Optional[int]: tpm = Tokens Per Minute
- model_name: zephyr-beta
litellm_params:
model: huggingface/HuggingFaceH4/zephyr-7b-beta
api_base: http://0.0.0.0:8002
rpm: 600
- model_name: zephyr-beta
litellm_params:
model: huggingface/HuggingFaceH4/zephyr-7b-beta
api_base: http://0.0.0.0:8003
rpm: 60000
- model_name: gpt-3.5-turbo
litellm_params:
model: gpt-3.5-turbo
api_key: <my-openai-key>
rpm: 200
- model_name: gpt-3.5-turbo-16k
litellm_params:
model: gpt-3.5-turbo-16k
api_key: <my-openai-key>
rpm: 100
litellm_settings:
num_retries: 3 # retry call 3 times on each model_name (e.g. zephyr-beta)
@ -280,8 +295,16 @@ litellm_settings:
fallbacks: [{"zephyr-beta": ["gpt-3.5-turbo"]}] # fallback to gpt-3.5-turbo if call fails num_retries
context_window_fallbacks: [{"zephyr-beta": ["gpt-3.5-turbo-16k"]}, {"gpt-3.5-turbo": ["gpt-3.5-turbo-16k"]}] # fallback to gpt-3.5-turbo-16k if context window error
allowed_fails: 3 # cooldown model if it fails more than 3 calls in a minute.
```
router_settings: # router_settings are optional
routing_strategy: simple-shuffle # Literal["simple-shuffle", "least-busy", "usage-based-routing","latency-based-routing"], default="simple-shuffle"
model_group_alias: {"gpt-4": "gpt-3.5-turbo"} # all requests with `gpt-4` will be routed to models with `gpt-3.5-turbo`
num_retries: 2
timeout: 30 # 30 seconds
redis_host: <your redis host> # set this when using multiple litellm proxy deployments, load balancing state stored in redis
redis_password: <your redis password>
redis_port: 1992
```
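For anyone load balancing in code rather than through the proxy, roughly the same knobs map onto the Python SDK's `Router`; the sketch below is an assumption about equivalent settings, with placeholder model names and keys.
```python
from litellm import Router

model_list = [
    {
        "model_name": "zephyr-beta",
        "litellm_params": {
            "model": "huggingface/HuggingFaceH4/zephyr-7b-beta",
            "api_base": "http://0.0.0.0:8001",
            "rpm": 60,
        },
    },
    {
        "model_name": "gpt-3.5-turbo",
        "litellm_params": {"model": "gpt-3.5-turbo", "api_key": "<my-openai-key>", "rpm": 200},
    },
]

router = Router(
    model_list=model_list,
    routing_strategy="simple-shuffle",  # or least-busy / usage-based-routing / latency-based-routing
    num_retries=2,
    timeout=30,  # seconds
)

response = router.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hello"}],
)
print(response)
```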
## Set Azure `base_model` for cost tracking
@ -537,7 +560,7 @@ litellm --config config.yaml
Sends Request to `bedrock-cohere`
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "bedrock-cohere",


@ -28,7 +28,7 @@ docker run ghcr.io/berriai/litellm:main-latest
<TabItem value="cli" label="With CLI Args">
### Run with LiteLLM CLI args
#### Run with LiteLLM CLI args
See all supported CLI args [here](https://docs.litellm.ai/docs/proxy/cli):
@ -68,8 +68,87 @@ CMD ["--port", "4000", "--config", "config.yaml", "--detailed_debug", "--run_gun
</TabItem>
<TabItem value="kubernetes" label="Kubernetes">
Deploying a config-file-based litellm instance just requires a simple Deployment that loads the config.yaml via a ConfigMap. It is also good practice to declare API keys as env vars and attach the env vars with the API key values as an Opaque Secret.
```yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: litellm-config-file
data:
  config.yaml: |
      model_list:
        - model_name: gpt-3.5-turbo
          litellm_params:
            model: azure/gpt-turbo-small-ca
            api_base: https://my-endpoint-canada-berri992.openai.azure.com/
            api_key: os.environ/CA_AZURE_OPENAI_API_KEY
---
apiVersion: v1
kind: Secret
type: Opaque
metadata:
  name: litellm-secrets
data:
  CA_AZURE_OPENAI_API_KEY: bWVvd19pbV9hX2NhdA== # your api key in base64
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: litellm-deployment
  labels:
    app: litellm
spec:
  selector:
    matchLabels:
      app: litellm
  template:
    metadata:
      labels:
        app: litellm
    spec:
      containers:
        - name: litellm
          image: ghcr.io/berriai/litellm:main-latest # recommended: pin a specific version rather than main-latest
          ports:
            - containerPort: 4000
          volumeMounts:
            - name: config-volume
              mountPath: /app/proxy_server_config.yaml
              subPath: config.yaml
          envFrom:
            - secretRef:
                name: litellm-secrets
      volumes:
        - name: config-volume
          configMap:
            name: litellm-config-file
```
:::info
To avoid issues with predictability, difficulties in rollback, and inconsistent environments, use versioning or SHA digests (for example, `litellm:main-v1.30.3` or `litellm@sha256:12345abcdef...`) instead of `litellm:main-latest`.
:::
</TabItem>
</Tabs>
**That's it! That's the quick start to deploy litellm.**
## Options to deploy LiteLLM
| Docs | When to Use |
| --- | --- |
| [Quick Start](#quick-start) | call 100+ LLMs + Load Balancing |
| [Deploy with Database](#deploy-with-database) | + use Virtual Keys + Track Spend |
| [LiteLLM container + Redis](#litellm-container--redis) | + load balance across multiple litellm containers |
| [LiteLLM Database container + PostgresDB + Redis](#litellm-database-container--postgresdb--redis) | + use Virtual Keys + Track Spend + load balance across multiple litellm containers |
## Deploy with Database
We maintain a [separate Dockerfile](https://github.com/BerriAI/litellm/pkgs/container/litellm-database) for reducing build time when running LiteLLM proxy with a connected Postgres Database
@ -93,7 +172,7 @@ Your OpenAI proxy server is now running on `http://0.0.0.0:4000`.
</TabItem>
<TabItem value="kubernetes-deploy" label="Kubernetes">
### Step 1. Create deployment.yaml
#### Step 1. Create deployment.yaml
```yaml
apiVersion: apps/v1
@ -122,7 +201,7 @@ Your OpenAI proxy server is now running on `http://0.0.0.0:4000`.
kubectl apply -f /path/to/deployment.yaml
```
### Step 2. Create service.yaml
#### Step 2. Create service.yaml
```yaml
apiVersion: v1
@ -143,7 +222,7 @@ spec:
kubectl apply -f /path/to/service.yaml
```
### Step 3. Start server
#### Step 3. Start server
```
kubectl port-forward service/litellm-service 4000:4000
@ -154,13 +233,13 @@ Your OpenAI proxy server is now running on `http://0.0.0.0:4000`.
</TabItem>
<TabItem value="helm-deploy" label="Helm">
### Step 1. Clone the repository
#### Step 1. Clone the repository
```bash
git clone https://github.com/BerriAI/litellm.git
```
### Step 2. Deploy with Helm
#### Step 2. Deploy with Helm
```bash
helm install \
@ -169,20 +248,87 @@ helm install \
deploy/charts/litellm
```
### Step 3. Expose the service to localhost
#### Step 3. Expose the service to localhost
```bash
kubectl \
port-forward \
service/mydeploy-litellm \
8000:8000
4000:4000
```
Your OpenAI proxy server is now running on `http://127.0.0.1:8000`.
Your OpenAI proxy server is now running on `http://127.0.0.1:4000`.
</TabItem>
</Tabs>
## LiteLLM container + Redis
Use Redis when you need litellm to load balance across multiple litellm containers
The only change required is setting Redis on your `config.yaml`
LiteLLM Proxy supports sharing rpm/tpm limits across multiple litellm instances; pass `redis_host`, `redis_password` and `redis_port` to enable this. (LiteLLM will use Redis to track rpm/tpm usage.)
```yaml
model_list:
- model_name: gpt-3.5-turbo
litellm_params:
model: azure/<your-deployment-name>
api_base: <your-azure-endpoint>
api_key: <your-azure-api-key>
rpm: 6 # Rate limit for this deployment: in requests per minute (rpm)
- model_name: gpt-3.5-turbo
litellm_params:
model: azure/gpt-turbo-small-ca
api_base: https://my-endpoint-canada-berri992.openai.azure.com/
api_key: <your-azure-api-key>
rpm: 6
router_settings:
redis_host: <your redis host>
redis_password: <your redis password>
redis_port: 1992
```
Start docker container with config
```shell
docker run ghcr.io/berriai/litellm:main-latest --config your_config.yaml
```
## LiteLLM Database container + PostgresDB + Redis
The only change required is setting Redis on your `config.yaml`
LiteLLM Proxy supports sharing rpm/tpm limits across multiple litellm instances; pass `redis_host`, `redis_password` and `redis_port` to enable this. (LiteLLM will use Redis to track rpm/tpm usage.)
```yaml
model_list:
- model_name: gpt-3.5-turbo
litellm_params:
model: azure/<your-deployment-name>
api_base: <your-azure-endpoint>
api_key: <your-azure-api-key>
rpm: 6 # Rate limit for this deployment: in requests per minute (rpm)
- model_name: gpt-3.5-turbo
litellm_params:
model: azure/gpt-turbo-small-ca
api_base: https://my-endpoint-canada-berri992.openai.azure.com/
api_key: <your-azure-api-key>
rpm: 6
router_settings:
redis_host: <your redis host>
redis_password: <your redis password>
redis_port: 1992
```
Start the `litellm-database` docker container with config
```shell
docker run --name litellm-proxy \
-e DATABASE_URL=postgresql://<user>:<password>@<host>:<port>/<dbname> \
-p 4000:4000 \
ghcr.io/berriai/litellm-database:main-latest --config your_config.yaml
```
## Best Practices for Deploying to Production
### 1. Switch off debug logs in production
Don't use [`--detailed-debug`, `--debug`](https://docs.litellm.ai/docs/proxy/debugging#detailed-debug) or `litellm.set_verbose=True`. We found that debug logs can add 5-10% latency per LLM API call.
@ -218,8 +364,49 @@ Provide an ssl certificate when starting litellm proxy server
## Platform-specific Guide
<Tabs>
<TabItem value="aws-stack" label="AWS Cloud Formation Stack">
### AWS Cloud Formation Stack
LiteLLM AWS Cloudformation Stack - **Get the best LiteLLM AutoScaling Policy and Provision the DB for LiteLLM Proxy**
This will provision:
- LiteLLMServer - EC2 Instance
- LiteLLMServerAutoScalingGroup
- LiteLLMServerScalingPolicy (autoscaling policy)
- LiteLLMDB - RDS::DBInstance
#### Using AWS Cloud Formation Stack
**LiteLLM Cloudformation stack is located [here - litellm.yaml](https://github.com/BerriAI/litellm/blob/main/enterprise/cloudformation_stack/litellm.yaml)**
#### 1. Create the CloudFormation Stack:
In the AWS Management Console, navigate to the CloudFormation service, and click on "Create Stack."
On the "Create Stack" page, select "Upload a template file" and choose the litellm.yaml file
Monitor the stack to confirm it was created successfully.
#### 2. Get the Database URL:
Once the stack is created, get the DatabaseURL of the Database resource and copy this value.
#### 3. Connect to the EC2 Instance and deploy litellm on the EC2 container
From the EC2 console, connect to the instance created by the stack (e.g., using SSH).
Run the following command, replacing <database_url> with the value you copied in step 2
```shell
docker run --name litellm-proxy \
-e DATABASE_URL=<database_url> \
-p 4000:4000 \
ghcr.io/berriai/litellm-database:main-latest
```
#### 4. Access the Application:
Once the container is running, you can access the application by going to `http://<ec2-public-ip>:4000` in your browser.
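As an optional sanity check (not part of the CloudFormation template), you can hit the proxy from Python with the OpenAI SDK; the host and key below are placeholders you'd replace with your EC2 public IP and configured master key.
```python
# Sketch: verify the proxy on the EC2 instance is reachable and serving models.
import openai

client = openai.OpenAI(
    api_key="sk-1234",                       # placeholder proxy master key
    base_url="http://<ec2-public-ip>:4000",  # replace with your EC2 public IP
)

print(client.models.list())
```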
</TabItem>
<TabItem value="google-cloud-run" label="Google Cloud Run">
### Deploy on Google Cloud Run
@ -286,11 +473,11 @@ services:
target: runtime
image: ghcr.io/berriai/litellm:main-latest
ports:
- "8000:8000" # Map the container port to the host, change the host port if necessary
- "4000:4000" # Map the container port to the host, change the host port if necessary
volumes:
- ./litellm-config.yaml:/app/config.yaml # Mount the local configuration file
# You can change the port or number of workers as per your requirements or pass any new supported CLI argument. Make sure the port passed here matches the container port defined above in `ports` value
command: [ "--config", "/app/config.yaml", "--port", "8000", "--num_workers", "8" ]
command: [ "--config", "/app/config.yaml", "--port", "4000", "--num_workers", "8" ]
# ...rest of your docker-compose config if any
```
@ -308,18 +495,4 @@ Run the command `docker-compose up` or `docker compose up` as per your docker in
> Use `-d` flag to run the container in detached mode (background) e.g. `docker compose up -d`
Your LiteLLM container should be running now on the defined port e.g. `8000`.
## LiteLLM Proxy Performance
LiteLLM proxy has been load tested to handle 1500 req/s.
### Throughput - 30% Increase
LiteLLM proxy + Load Balancer gives **30% increase** in throughput compared to Raw OpenAI API
<Image img={require('../../img/throughput.png')} />
### Latency Added - 0.00325 seconds
LiteLLM proxy adds **0.00325 seconds** latency as compared to using the Raw OpenAI API
<Image img={require('../../img/latency.png')} />
Your LiteLLM container should be running now on the defined port e.g. `4000`.


@ -38,7 +38,7 @@ $ litellm --config /path/to/config.yaml
3. Test the embedding call
```shell
curl --location 'http://0.0.0.0:8000/v1/embeddings' \
curl --location 'http://0.0.0.0:4000/v1/embeddings' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{


@ -12,14 +12,16 @@ Features here are behind a commercial license in our `/enterprise` folder. [**Se
:::
Features:
- [ ] Content Moderation with LlamaGuard
- [ ] Content Moderation with Google Text Moderations
- [ ] Content Moderation with LLM Guard
- [ ] Reject calls from Blocked User list
- [ ] Reject calls (incoming / outgoing) with Banned Keywords (e.g. competitors)
- [ ] Tracking Spend for Custom Tags
- ✅ Content Moderation with LlamaGuard
- ✅ Content Moderation with Google Text Moderations
- ✅ Content Moderation with LLM Guard
- ✅ Reject calls from Blocked User list
- ✅ Reject calls (incoming / outgoing) with Banned Keywords (e.g. competitors)
- ✅ Don't log/store specific requests (eg confidential LLM requests)
- ✅ Tracking Spend for Custom Tags
## Content Moderation with LlamaGuard
## Content Moderation
### Content Moderation with LlamaGuard
Currently works with Sagemaker's LlamaGuard endpoint.
@ -39,7 +41,7 @@ os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""
```
### Customize LlamaGuard prompt
#### Customize LlamaGuard prompt
To modify the unsafe categories llama guard evaluates against, just create your own version of [this category list](https://github.com/BerriAI/litellm/blob/main/litellm/proxy/llamaguard_prompt.txt)
@ -51,12 +53,12 @@ callbacks: ["llamaguard_moderations"]
llamaguard_unsafe_content_categories: /path/to/llamaguard_prompt.txt
```
## Content Moderation with LLM Guard
### Content Moderation with LLM Guard
Set the LLM Guard API Base in your environment
```env
LLM_GUARD_API_BASE = "http://0.0.0.0:8000"
LLM_GUARD_API_BASE = "http://0.0.0.0:4000"
```
Add `llmguard_moderations` as a callback
@ -78,7 +80,7 @@ Expected results:
LLM Guard: Received response - {"sanitized_prompt": "hello world", "is_valid": true, "scanners": { "Regex": 0.0 }}
```
## Content Moderation with Google Text Moderation
### Content Moderation with Google Text Moderation
Requires your GOOGLE_APPLICATION_CREDENTIALS to be set in your .env (same as VertexAI).
@ -89,7 +91,7 @@ litellm_settings:
callbacks: ["google_text_moderation"]
```
### Set custom confidence thresholds
#### Set custom confidence thresholds
Google Moderations checks the test against several categories. [Source](https://cloud.google.com/natural-language/docs/moderating-text#safety_attribute_confidence_scores)
@ -133,6 +135,33 @@ Here are the category specific values:
| "legal" | legal_threshold: 0.1 |
## Incognito Requests - Don't log anything
When `no-log=True`, the request will **not be logged on any callbacks** and there will be **no server logs from litellm**.
```python
import openai
client = openai.OpenAI(
api_key="anything", # proxy api-key
base_url="http://0.0.0.0:4000" # litellm proxy
)
response = client.chat.completions.create(
model="gpt-3.5-turbo",
messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
],
extra_body={
"no-log": True
}
)
print(response)
```
## Enable Blocked User Lists
If any call is made to the proxy with this user id, it'll be rejected - use this if you want to let users opt out of AI features
@ -146,7 +175,7 @@ litellm_settings:
### How to test
```bash
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "gpt-3.5-turbo",
@ -173,7 +202,7 @@ curl --location 'http://0.0.0.0:8000/chat/completions' \
**Block all calls for a user id**
```
curl -X POST "http://0.0.0.0:8000/user/block" \
curl -X POST "http://0.0.0.0:4000/user/block" \
-H "Authorization: Bearer sk-1234" \
-D '{
"user_ids": [<user_id>, ...]
@ -183,7 +212,7 @@ curl -X POST "http://0.0.0.0:8000/user/block" \
**Unblock calls for a user id**
```
curl -X POST "http://0.0.0.0:8000/user/unblock" \
curl -X POST "http://0.0.0.0:4000/user/unblock" \
-H "Authorization: Bearer sk-1234" \
-D '{
"user_ids": [<user_id>, ...]
@ -201,7 +230,7 @@ litellm_settings:
### Test this
```bash
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "gpt-3.5-turbo",
@ -234,7 +263,7 @@ Set `extra_body={"metadata": { }}` to `metadata` you want to pass
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:8000"
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
@ -262,7 +291,7 @@ print(response)
Pass `metadata` as part of the request body
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data '{
"model": "gpt-3.5-turbo",
@ -288,7 +317,7 @@ from langchain.prompts.chat import (
from langchain.schema import HumanMessage, SystemMessage
chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:8000",
openai_api_base="http://0.0.0.0:4000",
model = "gpt-3.5-turbo",
temperature=0.1,
extra_body={


@ -12,10 +12,10 @@ The proxy exposes:
#### Request
Make a GET Request to `/health` on the proxy
```shell
curl --location 'http://0.0.0.0:8000/health' -H "Authorization: Bearer sk-1234"
curl --location 'http://0.0.0.0:4000/health' -H "Authorization: Bearer sk-1234"
```
You can also run `litellm --health`, which makes a `GET` request to `http://0.0.0.0:8000/health` for you
You can also run `litellm --health`, which makes a `GET` request to `http://0.0.0.0:4000/health` for you
```
litellm --health
```
@ -60,7 +60,7 @@ $ litellm /path/to/config.yaml
3. Query health endpoint:
```
curl --location 'http://0.0.0.0:8000/health'
curl --location 'http://0.0.0.0:4000/health'
```
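For completeness, a small Python sketch of the same health check using `requests` (assumes the proxy is running locally and `sk-1234` is your master key):
```python
import requests

# Query the proxy's /health endpoint, which checks every model in the config.
resp = requests.get(
    "http://0.0.0.0:4000/health",
    headers={"Authorization": "Bearer sk-1234"},
)
print(resp.status_code)
print(resp.json())  # healthy_endpoints / unhealthy_endpoints summary
```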
### Embedding Models
@ -119,7 +119,7 @@ Unprotected endpoint for checking if proxy is ready to accept requests
Example Request:
```bash
curl --location 'http://0.0.0.0:8000/health/readiness'
curl --location 'http://0.0.0.0:4000/health/readiness'
```
Example Response:
@ -153,7 +153,7 @@ Example Request:
```
curl -X 'GET' \
'http://0.0.0.0:8000/health/liveliness' \
'http://0.0.0.0:4000/health/liveliness' \
-H 'accept: application/json'
```


@ -45,7 +45,7 @@ $ litellm --config /path/to/config.yaml
### Step 3: Use proxy - Call a model group [Load Balancing]
Curl Command
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "gpt-3.5-turbo",
@ -65,7 +65,7 @@ If you want to call a specific model defined in the `config.yaml`, you can call
In this example it will call `azure/gpt-turbo-small-ca`. Defined in the config on Step 1
```bash
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "azure/gpt-turbo-small-ca",


@ -150,7 +150,7 @@ litellm --config proxy_config.yaml
```
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Authorization: Bearer sk-1234' \
--data ' {
"model": "gpt-3.5-turbo",
@ -174,7 +174,7 @@ On Success
Usage: {'completion_tokens': 10, 'prompt_tokens': 11, 'total_tokens': 21},
Cost: 3.65e-05,
Response: {'id': 'chatcmpl-8S8avKJ1aVBg941y5xzGMSKrYCMvN', 'choices': [{'finish_reason': 'stop', 'index': 0, 'message': {'content': 'Good morning! How can I assist you today?', 'role': 'assistant'}}], 'created': 1701716913, 'model': 'gpt-3.5-turbo-0613', 'object': 'chat.completion', 'system_fingerprint': None, 'usage': {'completion_tokens': 10, 'prompt_tokens': 11, 'total_tokens': 21}}
Proxy Metadata: {'user_api_key': None, 'headers': Headers({'host': '0.0.0.0:8000', 'user-agent': 'curl/7.88.1', 'accept': '*/*', 'authorization': 'Bearer sk-1234', 'content-length': '199', 'content-type': 'application/x-www-form-urlencoded'}), 'model_group': 'gpt-3.5-turbo', 'deployment': 'gpt-3.5-turbo-ModelID-gpt-3.5-turbo'}
Proxy Metadata: {'user_api_key': None, 'headers': Headers({'host': '0.0.0.0:4000', 'user-agent': 'curl/7.88.1', 'accept': '*/*', 'authorization': 'Bearer sk-1234', 'content-length': '199', 'content-type': 'application/x-www-form-urlencoded'}), 'model_group': 'gpt-3.5-turbo', 'deployment': 'gpt-3.5-turbo-ModelID-gpt-3.5-turbo'}
```
#### Logging Proxy Request Object, Header, Url
@ -374,7 +374,7 @@ async def log_event(request: Request):
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host="127.0.0.1", port=8000)
uvicorn.run(app, host="127.0.0.1", port=4000)
```
@ -383,7 +383,7 @@ if __name__ == "__main__":
#### Step 2. Set your `GENERIC_LOGGER_ENDPOINT` to the endpoint + route we should send callback logs to
```shell
os.environ["GENERIC_LOGGER_ENDPOINT"] = "http://localhost:8000/log-event"
os.environ["GENERIC_LOGGER_ENDPOINT"] = "http://localhost:4000/log-event"
```
#### Step 3. Create a `config.yaml` file and set `litellm_settings`: `success_callback` = ["generic"]
@ -445,7 +445,7 @@ Expected output on Langfuse
Pass `metadata` as part of the request body
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data '{
"model": "gpt-3.5-turbo",
@ -472,7 +472,7 @@ Set `extra_body={"metadata": { }}` to `metadata` you want to pass
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:8000"
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
@ -509,7 +509,7 @@ from langchain.prompts.chat import (
from langchain.schema import HumanMessage, SystemMessage
chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:8000",
openai_api_base="http://0.0.0.0:4000",
model = "gpt-3.5-turbo",
temperature=0.1,
extra_body={
@ -663,7 +663,7 @@ litellm --config config.yaml --debug
Test Request
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "Azure OpenAI GPT-4 East",
@ -698,7 +698,7 @@ litellm_settings:
Now, when you [generate keys](./virtual_keys.md) for this team-id
```bash
curl -X POST 'http://0.0.0.0:8000/key/generate' \
curl -X POST 'http://0.0.0.0:4000/key/generate' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-D '{"team_id": "ishaans-secret-project"}'
@ -742,7 +742,7 @@ litellm --config config.yaml --debug
Test Request
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "Azure OpenAI GPT-4 East",
@ -903,7 +903,7 @@ litellm --config config.yaml --debug
Test Request
```
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "gpt-3.5-turbo",
@ -947,7 +947,7 @@ litellm --config config.yaml --debug
Test Request
```
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "gpt-3.5-turbo",


@ -24,7 +24,7 @@ Retrieve detailed information about each model listed in the `/models` endpoint,
<TabItem value="curl">
```bash
curl -X GET "http://0.0.0.0:8000/model/info" \
curl -X GET "http://0.0.0.0:4000/model/info" \
-H "accept: application/json" \
```
</TabItem>
@ -42,7 +42,7 @@ Add a new model to the list in the `config.yaml` by providing the model paramete
<TabItem value="curl">
```bash
curl -X POST "http://0.0.0.0:8000/model/new" \
curl -X POST "http://0.0.0.0:4000/model/new" \
-H "accept: application/json" \
-H "Content-Type: application/json" \
-d '{ "model_name": "azure-gpt-turbo", "litellm_params": {"model": "azure/gpt-3.5-turbo", "api_key": "os.environ/AZURE_API_KEY", "api_base": "my-azure-api-base"} }'


@ -96,7 +96,7 @@ Turn off PII masking for a given key.
Do this by setting `permissions: {"pii": false}`, when generating a key.
```shell
curl --location 'http://0.0.0.0:8000/key/generate' \
curl --location 'http://0.0.0.0:4000/key/generate' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{
The proxy supports 2 request-level PII controls:
Set `allow_pii_controls` to true for a given key. This will allow the user to set request-level PII controls.
```bash
curl --location 'http://0.0.0.0:8000/key/generate' \
curl --location 'http://0.0.0.0:4000/key/generate' \
--header 'Authorization: Bearer my-master-key' \
--header 'Content-Type: application/json' \
--data '{
@ -136,7 +136,7 @@ from openai import OpenAI
client = OpenAI(
# This is the default and can be omitted
api_key=os.environ.get("OPENAI_API_KEY"),
base_url="http://0.0.0.0:8000"
base_url="http://0.0.0.0:4000"
)
chat_completion = client.chat.completions.create(


@ -21,7 +21,7 @@ Run the following command to start the litellm proxy
```shell
$ litellm --model huggingface/bigcode/starcoder
#INFO: Proxy running on http://0.0.0.0:8000
#INFO: Proxy running on http://0.0.0.0:4000
```
### Test
@ -250,7 +250,7 @@ litellm --config your_config.yaml
<TabItem value="Curl" label="Curl Request">
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "gpt-3.5-turbo",
@ -270,7 +270,7 @@ curl --location 'http://0.0.0.0:8000/chat/completions' \
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:8000"
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
@ -297,7 +297,7 @@ from langchain.prompts.chat import (
from langchain.schema import HumanMessage, SystemMessage
chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:8000", # set openai_api_base to the LiteLLM Proxy
openai_api_base="http://0.0.0.0:4000", # set openai_api_base to the LiteLLM Proxy
model = "gpt-3.5-turbo",
temperature=0.1
)
@ -321,7 +321,7 @@ print(response)
```python
from langchain.embeddings import OpenAIEmbeddings
embeddings = OpenAIEmbeddings(model="sagemaker-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
embeddings = OpenAIEmbeddings(model="sagemaker-embeddings", openai_api_base="http://0.0.0.0:4000", openai_api_key="temp-key")
text = "This is a test document."
@ -331,7 +331,7 @@ query_result = embeddings.embed_query(text)
print(f"SAGEMAKER EMBEDDINGS")
print(query_result[:5])
embeddings = OpenAIEmbeddings(model="bedrock-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
embeddings = OpenAIEmbeddings(model="bedrock-embeddings", openai_api_base="http://0.0.0.0:4000", openai_api_key="temp-key")
text = "This is a test document."
@ -340,7 +340,7 @@ query_result = embeddings.embed_query(text)
print(f"BEDROCK EMBEDDINGS")
print(query_result[:5])
embeddings = OpenAIEmbeddings(model="bedrock-titan-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
embeddings = OpenAIEmbeddings(model="bedrock-titan-embeddings", openai_api_base="http://0.0.0.0:4000", openai_api_key="temp-key")
text = "This is a test document."
@ -407,11 +407,11 @@ services:
litellm:
image: ghcr.io/berriai/litellm:main
ports:
- "8000:8000" # Map the container port to the host, change the host port if necessary
- "4000:4000" # Map the container port to the host, change the host port if necessary
volumes:
- ./litellm-config.yaml:/app/config.yaml # Mount the local configuration file
# You can change the port or number of workers as per your requirements or pass any new supported CLI argument. Make sure the port passed here matches the container port defined above in `ports` value
command: [ "--config", "/app/config.yaml", "--port", "8000", "--num_workers", "8" ]
command: [ "--config", "/app/config.yaml", "--port", "4000", "--num_workers", "8" ]
# ...rest of your docker-compose config if any
```
@ -429,7 +429,7 @@ Run the command `docker-compose up` or `docker compose up` as per your docker in
> Use `-d` flag to run the container in detached mode (background) e.g. `docker compose up -d`
Your LiteLLM container should be running now on the defined port e.g. `8000`.
Your LiteLLM container should be running now on the defined port e.g. `4000`.
## Using with OpenAI compatible projects
@ -442,7 +442,7 @@ Set `base_url` to the LiteLLM Proxy server
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:8000"
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
@ -463,7 +463,7 @@ print(response)
```shell
litellm --model gpt-3.5-turbo
#INFO: Proxy running on http://0.0.0.0:8000
#INFO: Proxy running on http://0.0.0.0:4000
```
#### 1. Clone the repo
@ -474,9 +474,9 @@ git clone https://github.com/danny-avila/LibreChat.git
#### 2. Modify Librechat's `docker-compose.yml`
LiteLLM Proxy is running on port `8000`, set `8000` as the proxy below
LiteLLM Proxy is running on port `4000`, set `4000` as the proxy below
```yaml
OPENAI_REVERSE_PROXY=http://host.docker.internal:8000/v1/chat/completions
OPENAI_REVERSE_PROXY=http://host.docker.internal:4000/v1/chat/completions
```
#### 3. Save fake OpenAI key in Librechat's `.env`
@ -502,7 +502,7 @@ In the [config.py](https://continue.dev/docs/reference/Models/openai) set this a
api_key="IGNORED",
model="fake-model-name",
context_length=2048, # customize if needed for your model
api_base="http://localhost:8000" # your proxy server url
api_base="http://localhost:4000" # your proxy server url
),
```
@ -514,7 +514,7 @@ Credits [@vividfog](https://github.com/jmorganca/ollama/issues/305#issuecomment-
```shell
$ pip install aider
$ aider --openai-api-base http://0.0.0.0:8000 --openai-api-key fake-key
$ aider --openai-api-base http://0.0.0.0:4000 --openai-api-key fake-key
```
</TabItem>
<TabItem value="autogen" label="AutoGen">
@ -528,7 +528,7 @@ from autogen import AssistantAgent, UserProxyAgent, oai
config_list=[
{
"model": "my-fake-model",
"api_base": "http://localhost:8000", #litellm compatible endpoint
"api_base": "http://localhost:4000", #litellm compatible endpoint
"api_type": "open_ai",
"api_key": "NULL", # just a placeholder
}
@ -566,7 +566,7 @@ import guidance
# set api_base to your proxy
# set api_key to anything
gpt4 = guidance.llms.OpenAI("gpt-4", api_base="http://0.0.0.0:8000", api_key="anything")
gpt4 = guidance.llms.OpenAI("gpt-4", api_base="http://0.0.0.0:4000", api_key="anything")
experts = guidance('''
{{#system~}}
View file
@ -45,7 +45,7 @@ litellm_settings:
**Set dynamically**
```bash
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "zephyr-beta",
@ -101,7 +101,7 @@ LiteLLM Proxy supports setting a `timeout` per request
<TabItem value="Curl" label="Curl Request">
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data-raw '{
"model": "gpt-3.5-turbo",
@ -121,7 +121,7 @@ import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:8000"
base_url="http://0.0.0.0:4000"
)
response = client.chat.completions.create(
View file
@ -30,7 +30,7 @@ $ litellm /path/to/config.yaml
```
```bash
curl --location 'http://0.0.0.0:8000/v1/chat/completions' \
curl --location 'http://0.0.0.0:4000/v1/chat/completions' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer sk-1234' \
--data '{
View file
@ -65,7 +65,7 @@ litellm --config proxy_config.yaml
```
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Authorization: Bearer sk-1234' \
--data ' {
"model": "gpt-3.5-turbo",
View file
@ -28,12 +28,12 @@ Follow [setup](./virtual_keys.md#setup)
```bash
litellm --config /path/to/config.yaml
#INFO: Proxy running on http://0.0.0.0:8000
#INFO: Proxy running on http://0.0.0.0:4000
```
### 2. Go to UI
```bash
http://0.0.0.0:8000/ui # <proxy_base_url>/ui
http://0.0.0.0:4000/ui # <proxy_base_url>/ui
```
View file
@ -26,7 +26,7 @@ Set `extra_body={"metadata": { }}` to `metadata` you want to pass
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:8000"
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
@ -92,7 +92,7 @@ print(response)
Pass `metadata` as part of the request body
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data '{
"model": "gpt-3.5-turbo",
@ -123,7 +123,7 @@ from langchain.prompts.chat import (
from langchain.schema import HumanMessage, SystemMessage
chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:8000",
openai_api_base="http://0.0.0.0:4000",
model = "gpt-3.5-turbo",
temperature=0.1,
extra_body={
@ -195,7 +195,7 @@ from openai import OpenAI
# set base_url to your proxy server
# set api_key to send to proxy server
client = OpenAI(api_key="<proxy-api-key>", base_url="http://0.0.0.0:8000")
client = OpenAI(api_key="<proxy-api-key>", base_url="http://0.0.0.0:4000")
response = client.embeddings.create(
input=["hello from litellm"],
@ -209,7 +209,7 @@ print(response)
<TabItem value="Curl" label="Curl Request">
```shell
curl --location 'http://0.0.0.0:8000/embeddings' \
curl --location 'http://0.0.0.0:4000/embeddings' \
--header 'Content-Type: application/json' \
--data ' {
"model": "text-embedding-ada-002",
@ -223,7 +223,7 @@ curl --location 'http://0.0.0.0:8000/embeddings' \
```python
from langchain.embeddings import OpenAIEmbeddings
embeddings = OpenAIEmbeddings(model="sagemaker-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
embeddings = OpenAIEmbeddings(model="sagemaker-embeddings", openai_api_base="http://0.0.0.0:4000", openai_api_key="temp-key")
text = "This is a test document."
@ -233,7 +233,7 @@ query_result = embeddings.embed_query(text)
print(f"SAGEMAKER EMBEDDINGS")
print(query_result[:5])
embeddings = OpenAIEmbeddings(model="bedrock-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
embeddings = OpenAIEmbeddings(model="bedrock-embeddings", openai_api_base="http://0.0.0.0:4000", openai_api_key="temp-key")
text = "This is a test document."
@ -242,7 +242,7 @@ query_result = embeddings.embed_query(text)
print(f"BEDROCK EMBEDDINGS")
print(query_result[:5])
embeddings = OpenAIEmbeddings(model="bedrock-titan-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
embeddings = OpenAIEmbeddings(model="bedrock-titan-embeddings", openai_api_base="http://0.0.0.0:4000", openai_api_key="temp-key")
text = "This is a test document."
@ -296,7 +296,7 @@ from openai import OpenAI
# set base_url to your proxy server
# set api_key to send to proxy server
client = OpenAI(api_key="<proxy-api-key>", base_url="http://0.0.0.0:8000")
client = OpenAI(api_key="<proxy-api-key>", base_url="http://0.0.0.0:4000")
response = client.moderations.create(
input="hello from litellm",
@ -310,7 +310,7 @@ print(response)
<TabItem value="Curl" label="Curl Request">
```shell
curl --location 'http://0.0.0.0:8000/moderations' \
curl --location 'http://0.0.0.0:4000/moderations' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer sk-1234' \
--data '{"input": "Sample text goes here", "model": "text-moderation-stable"}'
@ -421,7 +421,7 @@ user_config = {
import openai
client = openai.OpenAI(
api_key="sk-1234",
base_url="http://0.0.0.0:8000"
base_url="http://0.0.0.0:4000"
)
# send request to `user-azure-instance`
@ -489,7 +489,7 @@ const { OpenAI } = require('openai');
const openai = new OpenAI({
apiKey: "sk-1234",
baseURL: "http://0.0.0.0:8000"
baseURL: "http://0.0.0.0:4000"
});
async function main() {
@ -516,7 +516,7 @@ Here's how to do it:
import openai
client = openai.OpenAI(
api_key="sk-1234",
base_url="http://0.0.0.0:8000"
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
@ -541,7 +541,7 @@ Pass in the litellm_params (E.g. api_key, api_base, etc.) via the `extra_body` p
import openai
client = openai.OpenAI(
api_key="sk-1234",
base_url="http://0.0.0.0:8000"
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
@ -571,7 +571,7 @@ const { OpenAI } = require('openai');
const openai = new OpenAI({
apiKey: "sk-1234",
baseURL: "http://0.0.0.0:8000"
baseURL: "http://0.0.0.0:4000"
});
async function main() {
View file
@ -44,7 +44,7 @@ litellm /path/to/config.yaml
**Step 3. Send test call**
```bash
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{
@ -72,7 +72,7 @@ By default the `max_budget` is set to `null` and is not checked for keys
#### **Add budgets to users**
```shell
curl --location 'http://localhost:8000/user/new' \
curl --location 'http://localhost:4000/user/new' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{"models": ["azure-models"], "max_budget": 0, "user_id": "krrish3@berri.ai"}'
@ -96,7 +96,7 @@ curl --location 'http://localhost:8000/user/new' \
`budget_duration`: Budget is reset at the end of specified duration. If not set, budget is never reset. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
```
curl 'http://0.0.0.0:8000/user/new' \
curl 'http://0.0.0.0:4000/user/new' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{
@ -113,7 +113,7 @@ Now you can just call `/key/generate` with that user_id (i.e. krrish3@berri.ai)
- **Spend Tracking**: spend for this key will update krrish3@berri.ai's spend as well
```bash
curl --location 'http://0.0.0.0:8000/key/generate' \
curl --location 'http://0.0.0.0:4000/key/generate' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data '{"models": ["azure-models"], "user_id": "krrish3@berri.ai"}'
@ -127,7 +127,7 @@ You can:
#### **Add budgets to users**
```shell
curl --location 'http://localhost:8000/team/new' \
curl --location 'http://localhost:4000/team/new' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{
@ -238,7 +238,7 @@ By default the `max_budget` is set to `null` and is not checked for keys
#### **Add budgets to keys**
```bash
curl 'http://0.0.0.0:8000/key/generate' \
curl 'http://0.0.0.0:4000/key/generate' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{
@ -250,7 +250,7 @@ curl 'http://0.0.0.0:8000/key/generate' \
Example Request to `/chat/completions` when key has crossed budget
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer <generated-key>' \
--data ' {
@ -278,7 +278,7 @@ Expected Response from `/chat/completions` when key has crossed budget
`budget_duration`: Budget is reset at the end of specified duration. If not set, budget is never reset. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
```
curl 'http://0.0.0.0:8000/key/generate' \
curl 'http://0.0.0.0:4000/key/generate' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{
@ -310,7 +310,7 @@ By default the `model_max_budget` is set to `{}` and is not checked for keys
#### **Add model specific budgets to keys**
```bash
curl 'http://0.0.0.0:8000/key/generate' \
curl 'http://0.0.0.0:4000/key/generate' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{
@ -335,7 +335,7 @@ Use `/user/new`, to persist rate limits across multiple keys.
```shell
curl --location 'http://0.0.0.0:8000/user/new' \
curl --location 'http://0.0.0.0:4000/user/new' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{"user_id": "krrish@berri.ai", "max_parallel_requests": 10, "tpm_limit": 20, "rpm_limit": 4}'
@ -359,7 +359,7 @@ curl --location 'http://0.0.0.0:8000/user/new' \
Use `/key/generate`, if you want them for just that key.
```shell
curl --location 'http://0.0.0.0:8000/key/generate' \
curl --location 'http://0.0.0.0:4000/key/generate' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{"max_parallel_requests": 10, "tpm_limit": 20, "rpm_limit": 4}'
@ -401,7 +401,7 @@ model_list:
**Step 2. Create key with access group**
```bash
curl --location 'http://localhost:8000/user/new' \
curl --location 'http://localhost:4000/user/new' \
-H 'Authorization: Bearer <your-master-key>' \
-H 'Content-Type: application/json' \
-d '{"models": ["beta-models"], # 👈 Model Access Group
@ -414,7 +414,7 @@ curl --location 'http://localhost:8000/user/new' \
Just include user_id in the `/key/generate` request.
```bash
curl --location 'http://0.0.0.0:8000/key/generate' \
curl --location 'http://0.0.0.0:4000/key/generate' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data '{"models": ["azure-models"], "user_id": "krrish@berri.ai"}'
View file
@ -59,7 +59,7 @@ litellm --config /path/to/config.yaml
**Step 3: Generate temporary keys**
```shell
curl 'http://0.0.0.0:8000/key/generate' \
curl 'http://0.0.0.0:4000/key/generate' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{"models": ["gpt-3.5-turbo", "gpt-4", "claude-2"], "duration": "20m","metadata": {"user": "ishaan@berri.ai"}}'
@ -70,7 +70,7 @@ curl 'http://0.0.0.0:8000/key/generate' \
### Request
```shell
curl 'http://0.0.0.0:8000/key/generate' \
curl 'http://0.0.0.0:4000/key/generate' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{
@ -105,7 +105,7 @@ Request Params:
```python
{
"key": "sk-kdEXbIqZRwEeEiHwdg7sFA", # Bearer token
"expires": "2023-11-19T01:38:25.838000+00:00" # datetime object
"expires": "2023-11-19T01:38:25.834000+00:00" # datetime object
"key_name": "sk-...7sFA" # abbreviated key string, ONLY stored in db if `allow_user_auth: true` set - [see](./ui.md)
...
}
@ -147,7 +147,7 @@ model_list:
**Step 2: Generate a user key - enabling them access to specific models, custom model aliases, etc.**
```bash
curl -X POST "https://0.0.0.0:8000/key/generate" \
curl -X POST "https://0.0.0.0:4000/key/generate" \
-H "Authorization: Bearer <your-master-key>" \
-H "Content-Type: application/json" \
-d '{
@ -182,7 +182,7 @@ model_list:
**Step 2. Create key with access group**
```bash
curl --location 'http://localhost:8000/key/generate' \
curl --location 'http://localhost:4000/key/generate' \
-H 'Authorization: Bearer <your-master-key>' \
-H 'Content-Type: application/json' \
-d '{"models": ["beta-models"], # 👈 Model Access Group
@ -194,7 +194,7 @@ curl --location 'http://localhost:8000/key/generate' \
### Request
```shell
curl -X GET "http://0.0.0.0:8000/key/info?key=sk-02Wr4IAlN3NvPXvL5JVvDA" \
curl -X GET "http://0.0.0.0:4000/key/info?key=sk-02Wr4IAlN3NvPXvL5JVvDA" \
-H "Authorization: Bearer sk-1234"
```
@ -228,7 +228,7 @@ Request Params:
### Request
```shell
curl 'http://0.0.0.0:8000/key/update' \
curl 'http://0.0.0.0:4000/key/update' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{
@ -266,7 +266,7 @@ Request Params:
### Request
```shell
curl 'http://0.0.0.0:8000/key/delete' \
curl 'http://0.0.0.0:4000/key/delete' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{
@ -500,7 +500,7 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
Set the `max_budget` param (in USD) in the `key/generate` request. By default, `max_budget` is set to `null` and is not checked for keys
```shell
curl 'http://0.0.0.0:8000/key/generate' \
curl 'http://0.0.0.0:4000/key/generate' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{
@ -517,7 +517,7 @@ curl 'http://0.0.0.0:8000/key/generate' \
Example Request to `/chat/completions` when key has crossed budget
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer sk-ULl_IKCVFy2EZRzQB16RUA' \
--data ' {
@ -545,10 +545,10 @@ Expected Response from `/chat/completions` when key has crossed budget
LiteLLM exposes a `/user/new` endpoint to create budgets for users, that persist across multiple keys.
This is documented in the swagger (live on your server root endpoint - e.g. `http://0.0.0.0:8000/`). Here's an example request.
This is documented in the swagger (live on your server root endpoint - e.g. `http://0.0.0.0:4000/`). Here's an example request.
```shell
curl --location 'http://localhost:8000/user/new' \
curl --location 'http://localhost:4000/user/new' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{"models": ["azure-models"], "max_budget": 0, "user_id": "krrish3@berri.ai"}'
@ -571,7 +571,7 @@ The request is a normal `/key/generate` request body + a `max_budget` field.
You can get spend for a key by using the `/key/info` endpoint.
```bash
curl 'http://0.0.0.0:8000/key/info?key=<user-key>' \
curl 'http://0.0.0.0:4000/key/info?key=<user-key>' \
-X GET \
-H 'Authorization: Bearer <your-master-key>'
```
@ -771,7 +771,7 @@ general_settings:
#### Step 3. Generate Key
```bash
curl --location 'http://0.0.0.0:8000/key/generate' \
curl --location 'http://0.0.0.0:4000/key/generate' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{"models": ["azure-models"], "aliases": {"mistral-7b": "gpt-3.5-turbo"}, "duration": null}'
View file
@ -22,7 +22,7 @@ $ pip install 'litellm[proxy]'
```shell
$ litellm --model huggingface/bigcode/starcoder
#INFO: Proxy running on http://0.0.0.0:8000
#INFO: Proxy running on http://0.0.0.0:4000
```
### Test
@ -39,7 +39,7 @@ This will now automatically route any requests for gpt-3.5-turbo to bigcode star
<TabItem value="Curl" label="Curl Request">
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "gpt-3.5-turbo",
@ -59,7 +59,7 @@ curl --location 'http://0.0.0.0:8000/chat/completions' \
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:8000"
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
@ -246,7 +246,7 @@ Set `base_url` to the LiteLLM Proxy server
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:8000"
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
@ -267,7 +267,7 @@ print(response)
```shell
litellm --model gpt-3.5-turbo
#INFO: Proxy running on http://0.0.0.0:8000
#INFO: Proxy running on http://0.0.0.0:4000
```
#### 1. Clone the repo
@ -278,9 +278,9 @@ git clone https://github.com/danny-avila/LibreChat.git
#### 2. Modify Librechat's `docker-compose.yml`
LiteLLM Proxy is running on port `8000`, set `8000` as the proxy below
LiteLLM Proxy is running on port `4000`; set `4000` as the proxy below
```yaml
OPENAI_REVERSE_PROXY=http://host.docker.internal:8000/v1/chat/completions
OPENAI_REVERSE_PROXY=http://host.docker.internal:4000/v1/chat/completions
```
#### 3. Save fake OpenAI key in Librechat's `.env`
@ -306,7 +306,7 @@ In the [config.py](https://continue.dev/docs/reference/Models/openai) set this a
api_key="IGNORED",
model="fake-model-name",
context_length=2048, # customize if needed for your model
api_base="http://localhost:8000" # your proxy server url
api_base="http://localhost:4000" # your proxy server url
),
```
@ -318,7 +318,7 @@ Credits [@vividfog](https://github.com/jmorganca/ollama/issues/305#issuecomment-
```shell
$ pip install aider
$ aider --openai-api-base http://0.0.0.0:8000 --openai-api-key fake-key
$ aider --openai-api-base http://0.0.0.0:4000 --openai-api-key fake-key
```
</TabItem>
<TabItem value="autogen" label="AutoGen">
@ -332,7 +332,7 @@ from autogen import AssistantAgent, UserProxyAgent, oai
config_list=[
{
"model": "my-fake-model",
"api_base": "http://localhost:8000", #litellm compatible endpoint
"api_base": "http://localhost:4000", #litellm compatible endpoint
"api_type": "open_ai",
"api_key": "NULL", # just a placeholder
}
@ -370,7 +370,7 @@ import guidance
# set api_base to your proxy
# set api_key to anything
gpt4 = guidance.llms.OpenAI("gpt-4", api_base="http://0.0.0.0:8000", api_key="anything")
gpt4 = guidance.llms.OpenAI("gpt-4", api_base="http://0.0.0.0:4000", api_key="anything")
experts = guidance('''
{{#system~}}
@ -479,7 +479,7 @@ $ litellm --config /path/to/config.yaml
#### Step 3: Use proxy
Curl Command
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "zephyr-alpha",
@ -529,7 +529,7 @@ $ litellm --config /path/to/config.yaml
#### Step 3: Use proxy
Curl Command
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "gpt-3.5-turbo",
@ -586,7 +586,7 @@ litellm_settings:
**Set dynamically**
```bash
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "zephyr-beta",
@ -615,7 +615,7 @@ model_list:
- model_name: custom_embedding_model
litellm_params:
model: openai/custom_embedding # the `openai/` prefix tells litellm it's openai compatible
api_base: http://0.0.0.0:8000/
api_base: http://0.0.0.0:4000/
- model_name: custom_embedding_model
litellm_params:
model: openai/custom_embedding # the `openai/` prefix tells litellm it's openai compatible
@ -665,7 +665,7 @@ litellm --config /path/to/config.yaml
**Step 3: Generate temporary keys**
```shell
curl 'http://0.0.0.0:8000/key/generate' \
curl 'http://0.0.0.0:4000/key/generate' \
--header 'Authorization: Bearer sk-1234' \
--data '{"models": ["gpt-3.5-turbo", "gpt-4", "claude-2"], "duration": "20m"}'
```
@ -719,7 +719,7 @@ model_list:
**Step 2: Generate a user key - enabling them access to specific models, custom model aliases, etc.**
```bash
curl -X POST "https://0.0.0.0:8000/key/generate" \
curl -X POST "https://0.0.0.0:4000/key/generate" \
-H "Authorization: Bearer sk-1234" \
-H "Content-Type: application/json" \
-d '{
@ -737,7 +737,7 @@ curl -X POST "https://0.0.0.0:8000/key/generate" \
You can get spend for a key by using the `/key/info` endpoint.
```bash
curl 'http://0.0.0.0:8000/key/info?key=<user-key>' \
curl 'http://0.0.0.0:4000/key/info?key=<user-key>' \
-X GET \
-H 'Authorization: Bearer <your-master-key>'
```
@ -868,7 +868,7 @@ $ litellm --config /path/to/config.yaml
#### Using Caching
Send the same request twice:
```shell
curl http://0.0.0.0:8000/v1/chat/completions \
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "gpt-3.5-turbo",
@ -876,7 +876,7 @@ curl http://0.0.0.0:8000/v1/chat/completions \
"temperature": 0.7
}'
curl http://0.0.0.0:8000/v1/chat/completions \
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "gpt-3.5-turbo",
@ -889,7 +889,7 @@ curl http://0.0.0.0:8000/v1/chat/completions \
Caching can be switched on/off per `/chat/completions` request
- Caching **on** for completion - pass `caching=True`:
```shell
curl http://0.0.0.0:8000/v1/chat/completions \
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "gpt-3.5-turbo",
@ -900,7 +900,7 @@ Caching can be switched on/off per `/chat/completions` request
```
- Caching **off** for completion - pass `caching=False`:
```shell
curl http://0.0.0.0:8000/v1/chat/completions \
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "gpt-3.5-turbo",
@ -963,10 +963,10 @@ https://api.openai.com/v1/chat/completions \
Use this to health check all LLMs defined in your config.yaml
#### Request
```shell
curl --location 'http://0.0.0.0:8000/health'
curl --location 'http://0.0.0.0:4000/health'
```
You can also run `litellm -health` it makes a `get` request to `http://0.0.0.0:8000/health` for you
You can also run `litellm --health`; it makes a `GET` request to `http://0.0.0.0:4000/health` for you
```
litellm --health
```
@ -1087,7 +1087,7 @@ litellm -config config.yaml
#### Run a test request to Proxy
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Authorization: Bearer sk-1244' \
--data ' {
"model": "gpt-3.5-turbo",
@ -1213,7 +1213,7 @@ LiteLLM proxy adds **0.00325 seconds** latency as compared to using the Raw Open
```
#### --port
- **Default:** `8000`
- **Default:** `4000`
- The port to bind the server to.
- **Usage:**
```shell
Binary file not shown.

View file
@ -42,10 +42,6 @@ const sidebars = {
"proxy/team_based_routing",
"proxy/ui",
"proxy/budget_alerts",
"proxy/model_management",
"proxy/health",
"proxy/debugging",
"proxy/pii_masking",
{
"type": "category",
"label": "🔥 Load Balancing",
@ -54,6 +50,10 @@ const sidebars = {
"proxy/reliability",
]
},
"proxy/model_management",
"proxy/health",
"proxy/debugging",
"proxy/pii_masking",
"proxy/caching",
{
"type": "category",
@ -101,12 +101,13 @@ const sidebars = {
},
{
type: "category",
label: "Embedding(), Moderation(), Image Generation()",
label: "Embedding(), Moderation(), Image Generation(), Audio Transcriptions()",
items: [
"embedding/supported_embedding",
"embedding/async_embedding",
"embedding/moderation",
"image_generation"
"image_generation",
"audio_transcription"
],
},
{
View file
@ -0,0 +1,44 @@
Resources:
LiteLLMServer:
Type: AWS::EC2::Instance
Properties:
AvailabilityZone: us-east-1a
ImageId: ami-0f403e3180720dd7e
InstanceType: t2.micro
LiteLLMServerAutoScalingGroup:
Type: AWS::AutoScaling::AutoScalingGroup
Properties:
AvailabilityZones:
- us-east-1a
LaunchConfigurationName: !Ref LiteLLMServerLaunchConfig
MinSize: 1
MaxSize: 3
DesiredCapacity: 1
HealthCheckGracePeriod: 300
LiteLLMServerLaunchConfig:
Type: AWS::AutoScaling::LaunchConfiguration
Properties:
ImageId: ami-0f403e3180720dd7e # Replace with your desired AMI ID
InstanceType: t2.micro
LiteLLMServerScalingPolicy:
Type: AWS::AutoScaling::ScalingPolicy
Properties:
AutoScalingGroupName: !Ref LiteLLMServerAutoScalingGroup
PolicyType: TargetTrackingScaling
TargetTrackingConfiguration:
PredefinedMetricSpecification:
PredefinedMetricType: ASGAverageCPUUtilization
TargetValue: 60.0
LiteLLMDB:
Type: AWS::RDS::DBInstance
Properties:
AllocatedStorage: 20
Engine: postgres
MasterUsername: litellmAdmin
MasterUserPassword: litellmPassword
DBInstanceClass: db.t3.micro
AvailabilityZone: us-east-1a
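# Illustrative deploy command (the template file name and stack name are placeholders, not part of this template):
#   aws cloudformation deploy --template-file litellm.yaml --stack-name litellm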
View file
@ -570,7 +570,7 @@ from .utils import (
_calculate_retry_after,
_should_retry,
get_secret,
get_mapped_model_params,
get_supported_openai_params,
)
from .llms.huggingface_restapi import HuggingfaceConfig
from .llms.anthropic import AnthropicConfig
@ -588,6 +588,7 @@ from .llms.petals import PetalsConfig
from .llms.vertex_ai import VertexAIConfig
from .llms.sagemaker import SagemakerConfig
from .llms.ollama import OllamaConfig
from .llms.ollama_chat import OllamaChatConfig
from .llms.maritalk import MaritTalkConfig
from .llms.bedrock import (
AmazonTitanConfig,
View file
@ -31,6 +31,18 @@ def _turn_on_debug():
verbose_proxy_logger.setLevel(level=logging.DEBUG) # set proxy logs to debug
def _disable_debugging():
verbose_logger.disabled = True
verbose_router_logger.disabled = True
verbose_proxy_logger.disabled = True
def _enable_debugging():
verbose_logger.disabled = False
verbose_router_logger.disabled = False
verbose_proxy_logger.disabled = False
def print_verbose(print_statement):
try:
if set_verbose:
View file
@ -1,7 +1,7 @@
import os, types
import json
from enum import Enum
import requests
import requests, copy
import time, uuid
from typing import Callable, Optional
from litellm.utils import ModelResponse, Usage, map_finish_reason
@ -117,6 +117,7 @@ def completion(
):
headers = validate_environment(api_key, headers)
_is_function_call = False
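# copy the messages up-front so the prompt translation below doesn't mutate the caller's list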
messages = copy.deepcopy(messages)
if model in custom_prompt_dict:
# check if the model has a registered custom prompt
model_prompt_details = custom_prompt_dict[model]
View file
@ -7,13 +7,15 @@ from litellm.utils import (
Message,
CustomStreamWrapper,
convert_to_model_response_object,
TranscriptionResponse,
)
from typing import Callable, Optional
from typing import Callable, Optional, BinaryIO
from litellm import OpenAIConfig
import litellm, json
import httpx
from .custom_httpx.azure_dall_e_2 import CustomHTTPTransport, AsyncCustomHTTPTransport
from openai import AzureOpenAI, AsyncAzureOpenAI
import uuid
class AzureOpenAIError(Exception):
@ -270,6 +272,14 @@ class AzureChatCompletion(BaseLLM):
azure_client = AzureOpenAI(**azure_client_params)
else:
azure_client = client
if api_version is not None and isinstance(
azure_client._custom_query, dict
):
# set api_version to version passed by user
azure_client._custom_query.setdefault(
"api-version", api_version
)
response = azure_client.chat.completions.create(**data, timeout=timeout) # type: ignore
stringified_response = response.model_dump()
## LOGGING
@ -333,10 +343,17 @@ class AzureChatCompletion(BaseLLM):
azure_client_params["api_key"] = api_key
elif azure_ad_token is not None:
azure_client_params["azure_ad_token"] = azure_ad_token
# setting Azure client
if client is None:
azure_client = AsyncAzureOpenAI(**azure_client_params)
else:
azure_client = client
if api_version is not None and isinstance(
azure_client._custom_query, dict
):
# set api_version to version passed by user
azure_client._custom_query.setdefault("api-version", api_version)
## LOGGING
logging_obj.pre_call(
input=data["messages"],
@ -401,6 +418,9 @@ class AzureChatCompletion(BaseLLM):
azure_client = AzureOpenAI(**azure_client_params)
else:
azure_client = client
if api_version is not None and isinstance(azure_client._custom_query, dict):
# set api_version to version passed by user
azure_client._custom_query.setdefault("api-version", api_version)
## LOGGING
logging_obj.pre_call(
input=data["messages"],
@ -454,6 +474,11 @@ class AzureChatCompletion(BaseLLM):
azure_client = AsyncAzureOpenAI(**azure_client_params)
else:
azure_client = client
if api_version is not None and isinstance(
azure_client._custom_query, dict
):
# set api_version to version passed by user
azure_client._custom_query.setdefault("api-version", api_version)
## LOGGING
logging_obj.pre_call(
input=data["messages"],
@ -757,6 +782,156 @@ class AzureChatCompletion(BaseLLM):
else:
raise AzureOpenAIError(status_code=500, message=str(e))
def audio_transcriptions(
self,
model: str,
audio_file: BinaryIO,
optional_params: dict,
model_response: TranscriptionResponse,
timeout: float,
api_key: Optional[str] = None,
api_base: Optional[str] = None,
api_version: Optional[str] = None,
client=None,
azure_ad_token: Optional[str] = None,
logging_obj=None,
atranscription: bool = False,
):
data = {"model": model, "file": audio_file, **optional_params}
# init AzureOpenAI Client
azure_client_params = {
"api_version": api_version,
"azure_endpoint": api_base,
"azure_deployment": model,
"timeout": timeout,
}
max_retries = optional_params.pop("max_retries", None)
azure_client_params = select_azure_base_url_or_endpoint(
azure_client_params=azure_client_params
)
if api_key is not None:
azure_client_params["api_key"] = api_key
elif azure_ad_token is not None:
azure_client_params["azure_ad_token"] = azure_ad_token
if max_retries is not None:
azure_client_params["max_retries"] = max_retries
if atranscription == True:
return self.async_audio_transcriptions(
audio_file=audio_file,
data=data,
model_response=model_response,
timeout=timeout,
api_key=api_key,
api_base=api_base,
client=client,
azure_client_params=azure_client_params,
max_retries=max_retries,
logging_obj=logging_obj,
)
if client is None:
azure_client = AzureOpenAI(http_client=litellm.client_session, **azure_client_params) # type: ignore
else:
azure_client = client
## LOGGING
logging_obj.pre_call(
input=f"audio_file_{uuid.uuid4()}",
api_key=azure_client.api_key,
additional_args={
"headers": {"Authorization": f"Bearer {azure_client.api_key}"},
"api_base": azure_client._base_url._uri_reference,
"atranscription": True,
"complete_input_dict": data,
},
)
response = azure_client.audio.transcriptions.create(
**data, timeout=timeout # type: ignore
)
stringified_response = response.model_dump()
## LOGGING
logging_obj.post_call(
input=audio_file.name,
api_key=api_key,
additional_args={"complete_input_dict": data},
original_response=stringified_response,
)
final_response = convert_to_model_response_object(response_object=stringified_response, model_response_object=model_response, response_type="audio_transcription") # type: ignore
return final_response
async def async_audio_transcriptions(
self,
audio_file: BinaryIO,
data: dict,
model_response: TranscriptionResponse,
timeout: float,
api_key: Optional[str] = None,
api_base: Optional[str] = None,
client=None,
azure_client_params=None,
max_retries=None,
logging_obj=None,
):
response = None
try:
if client is None:
async_azure_client = AsyncAzureOpenAI(
**azure_client_params,
http_client=litellm.aclient_session,
)
else:
async_azure_client = client
## LOGGING
logging_obj.pre_call(
input=f"audio_file_{uuid.uuid4()}",
api_key=async_azure_client.api_key,
additional_args={
"headers": {
"Authorization": f"Bearer {async_azure_client.api_key}"
},
"api_base": async_azure_client._base_url._uri_reference,
"atranscription": True,
"complete_input_dict": data,
},
)
response = await async_azure_client.audio.transcriptions.create(
**data, timeout=timeout
) # type: ignore
stringified_response = response.model_dump()
## LOGGING
logging_obj.post_call(
input=audio_file.name,
api_key=api_key,
additional_args={
"headers": {
"Authorization": f"Bearer {async_azure_client.api_key}"
},
"api_base": async_azure_client._base_url._uri_reference,
"atranscription": True,
"complete_input_dict": data,
},
original_response=stringified_response,
)
response = convert_to_model_response_object(response_object=stringified_response, model_response_object=model_response, response_type="audio_transcription") # type: ignore
return response
except Exception as e:
## LOGGING
logging_obj.post_call(
input=input,
api_key=api_key,
original_response=str(e),
)
raise e
async def ahealth_check(
self,
model: Optional[str],
View file
@ -126,6 +126,8 @@ class AmazonAnthropicClaude3Config:
optional_params["max_tokens"] = value
if param == "tools":
optional_params["tools"] = value
if param == "stream":
optional_params["stream"] = value
return optional_params
View file
@ -18,7 +18,7 @@ class OllamaError(Exception):
) # Call the base class constructor with the parameters it needs
class OllamaConfig:
class OllamaChatConfig:
"""
Reference: https://github.com/jmorganca/ollama/blob/main/docs/api.md#parameters
@ -108,6 +108,7 @@ class OllamaConfig:
k: v
for k, v in cls.__dict__.items()
if not k.startswith("__")
and k != "function_name" # special param for function calling
and not isinstance(
v,
(
@ -120,6 +121,61 @@ class OllamaConfig:
and v is not None
}
def get_supported_openai_params(
self,
):
return [
"max_tokens",
"stream",
"top_p",
"temperature",
"frequency_penalty",
"stop",
"tools",
"tool_choice",
"functions",
]
def map_openai_params(self, non_default_params: dict, optional_params: dict):
for param, value in list(non_default_params.items()):
if param == "max_tokens":
optional_params["num_predict"] = value
if param == "stream":
optional_params["stream"] = value
if param == "temperature":
optional_params["temperature"] = value
if param == "top_p":
optional_params["top_p"] = value
if param == "frequency_penalty":
optional_params["repeat_penalty"] = param
if param == "stop":
optional_params["stop"] = value
### FUNCTION CALLING LOGIC ###
if param == "tools":
# ollama actually supports json output
optional_params["format"] = "json"
litellm.add_function_to_prompt = (
True # so that main.py adds the function call to the prompt
)
optional_params["functions_unsupported_model"] = value
if len(optional_params["functions_unsupported_model"]) == 1:
optional_params["function_name"] = optional_params[
"functions_unsupported_model"
][0]["function"]["name"]
if param == "functions":
# ollama actually supports json output
optional_params["format"] = "json"
litellm.add_function_to_prompt = (
True # so that main.py adds the function call to the prompt
)
optional_params["functions_unsupported_model"] = non_default_params.pop(
"functions"
)
non_default_params.pop("tool_choice", None) # causes ollama requests to hang
return optional_params
# ollama implementation
def get_ollama_response(
@ -138,7 +194,7 @@ def get_ollama_response(
url = f"{api_base}/api/chat"
## Load Config
config = litellm.OllamaConfig.get_config()
config = litellm.OllamaChatConfig.get_config()
for k, v in config.items():
if (
k not in optional_params
@ -147,6 +203,7 @@ def get_ollama_response(
stream = optional_params.pop("stream", False)
format = optional_params.pop("format", None)
function_name = optional_params.pop("function_name", None)
for m in messages:
if "role" in m and m["role"] == "tool":
@ -187,6 +244,7 @@ def get_ollama_response(
model_response=model_response,
encoding=encoding,
logging_obj=logging_obj,
function_name=function_name,
)
return response
elif stream == True:
@ -290,7 +348,9 @@ async def ollama_async_streaming(url, data, model_response, encoding, logging_ob
traceback.print_exc()
async def ollama_acompletion(url, data, model_response, encoding, logging_obj):
async def ollama_acompletion(
url, data, model_response, encoding, logging_obj, function_name
):
data["stream"] = False
try:
timeout = aiohttp.ClientTimeout(total=litellm.request_timeout) # 10 minutes
@ -324,7 +384,7 @@ async def ollama_acompletion(url, data, model_response, encoding, logging_obj):
"id": f"call_{str(uuid.uuid4())}",
"function": {
"arguments": response_json["message"]["content"],
"name": "",
"name": function_name or "",
},
"type": "function",
}
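A minimal usage sketch (not part of this diff) of the `OllamaChatConfig` param mapping added above; the values are illustrative, and it assumes a litellm install that includes this change:

```python
from litellm import OllamaChatConfig

# translate OpenAI-style params into Ollama chat options
mapped = OllamaChatConfig().map_openai_params(
    non_default_params={"max_tokens": 256, "temperature": 0.2, "stream": True},
    optional_params={},
)
print(mapped)  # -> {'num_predict': 256, 'temperature': 0.2, 'stream': True}
# passing `tools` / `functions` instead would set format="json" and record the
# function name that later appears in the tool-call response built above
```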
View file
@ -1,4 +1,4 @@
from typing import Optional, Union, Any
from typing import Optional, Union, Any, BinaryIO
import types, time, json, traceback
import httpx
from .base import BaseLLM
@ -9,6 +9,7 @@ from litellm.utils import (
CustomStreamWrapper,
convert_to_model_response_object,
Usage,
TranscriptionResponse,
)
from typing import Callable, Optional
import aiohttp, requests
@ -237,6 +238,8 @@ class OpenAIChatCompletion(BaseLLM):
status_code=422, message=f"Timeout needs to be a float"
)
if custom_llm_provider != "openai":
# process all OpenAI compatible provider logic here
if custom_llm_provider == "mistral":
# check if message content passed in as list, and not string
messages = prompt_factory(
@ -244,7 +247,13 @@ class OpenAIChatCompletion(BaseLLM):
messages=messages,
custom_llm_provider=custom_llm_provider,
)
if custom_llm_provider == "perplexity" and messages is not None:
# check if messages.name is passed + supported, if not supported remove
messages = prompt_factory(
model=model,
messages=messages,
custom_llm_provider=custom_llm_provider,
)
for _ in range(
2
): # if call fails due to alternating messages, retry with reformatted message
@ -766,6 +775,103 @@ class OpenAIChatCompletion(BaseLLM):
else:
raise OpenAIError(status_code=500, message=str(e))
def audio_transcriptions(
self,
model: str,
audio_file: BinaryIO,
optional_params: dict,
model_response: TranscriptionResponse,
timeout: float,
api_key: Optional[str] = None,
api_base: Optional[str] = None,
client=None,
max_retries=None,
logging_obj=None,
atranscription: bool = False,
):
data = {"model": model, "file": audio_file, **optional_params}
if atranscription == True:
return self.async_audio_transcriptions(
audio_file=audio_file,
data=data,
model_response=model_response,
timeout=timeout,
api_key=api_key,
api_base=api_base,
client=client,
max_retries=max_retries,
logging_obj=logging_obj,
)
if client is None:
openai_client = OpenAI(
api_key=api_key,
base_url=api_base,
http_client=litellm.client_session,
timeout=timeout,
max_retries=max_retries,
)
else:
openai_client = client
response = openai_client.audio.transcriptions.create(
**data, timeout=timeout # type: ignore
)
stringified_response = response.model_dump()
## LOGGING
logging_obj.post_call(
input=audio_file.name,
api_key=api_key,
additional_args={"complete_input_dict": data},
original_response=stringified_response,
)
final_response = convert_to_model_response_object(response_object=stringified_response, model_response_object=model_response, response_type="audio_transcription") # type: ignore
return final_response
async def async_audio_transcriptions(
self,
audio_file: BinaryIO,
data: dict,
model_response: TranscriptionResponse,
timeout: float,
api_key: Optional[str] = None,
api_base: Optional[str] = None,
client=None,
max_retries=None,
logging_obj=None,
):
response = None
try:
if client is None:
openai_aclient = AsyncOpenAI(
api_key=api_key,
base_url=api_base,
http_client=litellm.aclient_session,
timeout=timeout,
max_retries=max_retries,
)
else:
openai_aclient = client
response = await openai_aclient.audio.transcriptions.create(
**data, timeout=timeout
) # type: ignore
stringified_response = response.model_dump()
## LOGGING
logging_obj.post_call(
input=audio_file.name,
api_key=api_key,
additional_args={"complete_input_dict": data},
original_response=stringified_response,
)
return convert_to_model_response_object(response_object=stringified_response, model_response_object=model_response, response_type="audio_transcription") # type: ignore
except Exception as e:
## LOGGING
logging_obj.post_call(
input=input,
api_key=api_key,
original_response=str(e),
)
raise e
async def ahealth_check(
self,
model: Optional[str],
View file
@ -556,6 +556,7 @@ def anthropic_messages_pt(messages: list):
3. Each message must alternate between "user" and "assistant" (this is not currently addressed by litellm)
4. final assistant content cannot end with trailing whitespace (anthropic raises an error otherwise)
5. System messages are a separate param to the Messages API (used for tool calling)
6. Ensure we only accept role, content. (message.name is not supported)
"""
## Ensure final assistant message has no trailing whitespace
last_assistant_message_idx: Optional[int] = None
@ -583,7 +584,9 @@ def anthropic_messages_pt(messages: list):
new_content.append({"type": "text", "text": m["text"]})
new_messages.append({"role": messages[0]["role"], "content": new_content}) # type: ignore
else:
new_messages.append(messages[0])
new_messages.append(
{"role": messages[0]["role"], "content": messages[0]["content"]}
)
return new_messages
@ -606,7 +609,9 @@ def anthropic_messages_pt(messages: list):
new_content.append({"type": "text", "content": m["text"]})
new_messages.append({"role": messages[i]["role"], "content": new_content}) # type: ignore
else:
new_messages.append(messages[i])
new_messages.append(
{"role": messages[i]["role"], "content": messages[i]["content"]}
)
if messages[i]["role"] == messages[i + 1]["role"]:
if messages[i]["role"] == "user":
@ -897,6 +902,10 @@ def prompt_factory(
return anthropic_pt(messages=messages)
elif "mistral." in model:
return mistral_instruct_pt(messages=messages)
elif custom_llm_provider == "perplexity":
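# perplexity's chat endpoint doesn't accept the optional OpenAI `name` field, so drop it from each message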
for message in messages:
message.pop("name", None)
return messages
try:
if "meta-llama/llama-2" in model and "chat" in model:
return llama_2_chat_pt(messages=messages)
View file
@ -8,9 +8,8 @@
# Thank you ! We ❤️ you! - Krrish & Ishaan
import os, openai, sys, json, inspect, uuid, datetime, threading
from typing import Any, Literal, Union
from typing import Any, Literal, Union, BinaryIO
from functools import partial
import dotenv, traceback, random, asyncio, time, contextvars
from copy import deepcopy
import httpx
@ -89,6 +88,7 @@ from litellm.utils import (
read_config_args,
Choices,
Message,
TranscriptionResponse,
)
####### ENVIRONMENT VARIABLES ###################
@ -488,6 +488,8 @@ def completion(
### ASYNC CALLS ###
acompletion = kwargs.get("acompletion", False)
client = kwargs.get("client", None)
### Admin Controls ###
no_log = kwargs.get("no-log", False)
######## end of unpacking kwargs ###########
openai_params = [
"functions",
@ -564,6 +566,7 @@ def completion(
"caching_groups",
"ttl",
"cache",
"no-log",
]
default_params = openai_params + litellm_params
non_default_params = {
@ -727,6 +730,7 @@ def completion(
model_info=model_info,
proxy_server_request=proxy_server_request,
preset_cache_key=preset_cache_key,
no_log=no_log,
)
logging.update_environment_variables(
model=model,
@ -2418,6 +2422,7 @@ def embedding(
"caching_groups",
"ttl",
"cache",
"no-log",
]
default_params = openai_params + litellm_params
non_default_params = {
@ -3044,7 +3049,6 @@ def moderation(
return response
##### Moderation #######################
@client
async def amoderation(input: str, model: str, api_key: Optional[str] = None, **kwargs):
# only supports open ai for now
@ -3067,11 +3071,11 @@ async def aimage_generation(*args, **kwargs):
Asynchronously calls the `image_generation` function with the given arguments and keyword arguments.
Parameters:
- `args` (tuple): Positional arguments to be passed to the `embedding` function.
- `kwargs` (dict): Keyword arguments to be passed to the `embedding` function.
- `args` (tuple): Positional arguments to be passed to the `image_generation` function.
- `kwargs` (dict): Keyword arguments to be passed to the `image_generation` function.
Returns:
- `response` (Any): The response returned by the `embedding` function.
- `response` (Any): The response returned by the `image_generation` function.
"""
loop = asyncio.get_event_loop()
model = args[0] if len(args) > 0 else kwargs["model"]
@ -3093,7 +3097,7 @@ async def aimage_generation(*args, **kwargs):
# Await normally
init_response = await loop.run_in_executor(None, func_with_context)
if isinstance(init_response, dict) or isinstance(
init_response, ModelResponse
init_response, ImageResponse
): ## CACHING SCENARIO
response = init_response
elif asyncio.iscoroutine(init_response):
@ -3311,6 +3315,144 @@ def image_generation(
)
##### Transcription #######################
@client
async def atranscription(*args, **kwargs):
"""
Calls openai + azure whisper endpoints.
Allows router to load balance between them
"""
loop = asyncio.get_event_loop()
model = args[0] if len(args) > 0 else kwargs["model"]
### PASS ARGS TO Transcription ###
kwargs["atranscription"] = True
custom_llm_provider = None
try:
# Use a partial function to pass your keyword arguments
func = partial(transcription, *args, **kwargs)
# Add the context to the function
ctx = contextvars.copy_context()
func_with_context = partial(ctx.run, func)
_, custom_llm_provider, _, _ = get_llm_provider(
model=model, api_base=kwargs.get("api_base", None)
)
# Await normally
init_response = await loop.run_in_executor(None, func_with_context)
if isinstance(init_response, dict) or isinstance(
init_response, TranscriptionResponse
): ## CACHING SCENARIO
response = init_response
elif asyncio.iscoroutine(init_response):
response = await init_response
else:
# Call the synchronous function using run_in_executor
response = await loop.run_in_executor(None, func_with_context)
return response
except Exception as e:
custom_llm_provider = custom_llm_provider or "openai"
raise exception_type(
model=model,
custom_llm_provider=custom_llm_provider,
original_exception=e,
completion_kwargs=args,
)
@client
def transcription(
model: str,
file: BinaryIO,
## OPTIONAL OPENAI PARAMS ##
language: Optional[str] = None,
prompt: Optional[str] = None,
response_format: Optional[
Literal["json", "text", "srt", "verbose_json", "vtt"]
] = None,
temperature: Optional[int] = None, # openai defaults this to 0
## LITELLM PARAMS ##
user: Optional[str] = None,
timeout=600, # default to 10 minutes
api_key: Optional[str] = None,
api_base: Optional[str] = None,
api_version: Optional[str] = None,
litellm_logging_obj=None,
custom_llm_provider=None,
**kwargs,
):
"""
Calls openai + azure whisper endpoints.
Allows router to load balance between them
"""
atranscription = kwargs.get("atranscription", False)
litellm_call_id = kwargs.get("litellm_call_id", None)
logger_fn = kwargs.get("logger_fn", None)
proxy_server_request = kwargs.get("proxy_server_request", None)
model_info = kwargs.get("model_info", None)
metadata = kwargs.get("metadata", {})
model_response = litellm.utils.TranscriptionResponse()
model, custom_llm_provider, dynamic_api_key, api_base = get_llm_provider(model=model, custom_llm_provider=custom_llm_provider, api_base=api_base) # type: ignore
optional_params = {
"language": language,
"prompt": prompt,
"response_format": response_format,
"temperature": None, # openai defaults this to 0
}
if custom_llm_provider == "azure":
# azure configs
api_base = api_base or litellm.api_base or get_secret("AZURE_API_BASE")
api_version = (
api_version or litellm.api_version or get_secret("AZURE_API_VERSION")
)
azure_ad_token = kwargs.pop("azure_ad_token", None) or get_secret(
"AZURE_AD_TOKEN"
)
api_key = (
api_key
or litellm.api_key
or litellm.azure_key
or get_secret("AZURE_API_KEY")
)
response = azure_chat_completions.audio_transcriptions(
model=model,
audio_file=file,
optional_params=optional_params,
model_response=model_response,
atranscription=atranscription,
timeout=timeout,
logging_obj=litellm_logging_obj,
api_base=api_base,
api_key=api_key,
api_version=api_version,
azure_ad_token=azure_ad_token,
)
elif custom_llm_provider == "openai":
response = openai_chat_completions.audio_transcriptions(
model=model,
audio_file=file,
optional_params=optional_params,
model_response=model_response,
atranscription=atranscription,
timeout=timeout,
logging_obj=litellm_logging_obj,
)
return response
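A short, illustrative sketch (not part of this diff) of calling the new `transcription` / `atranscription` entrypoints; `speech.mp3`, the Azure endpoint, API key and deployment name are placeholders, and the OpenAI path assumes `OPENAI_API_KEY` is set:

```python
import asyncio
import litellm

# synchronous call, routed to OpenAI's /audio/transcriptions endpoint
transcript = litellm.transcription(
    model="whisper-1",
    file=open("speech.mp3", "rb"),  # placeholder audio file
)
print(transcript)

# async call against an Azure deployment named `whisper-1`; endpoint, key and version are placeholders
async def main():
    return await litellm.atranscription(
        model="azure/whisper-1",
        file=open("speech.mp3", "rb"),
        api_base="https://my-endpoint.openai.azure.com",
        api_version="2023-07-01-preview",
        api_key="my-azure-api-key",
    )

print(asyncio.run(main()))
```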
##### Health Endpoints #######################
View file
@ -108,7 +108,7 @@
},
"gpt-3.5-turbo": {
"max_tokens": 4097,
"max_input_tokens": 4097,
"max_input_tokens": 16385,
"max_output_tokens": 4096,
"input_cost_per_token": 0.0000015,
"output_cost_per_token": 0.000002,
@ -293,6 +293,18 @@
"output_cost_per_pixel": 0.0,
"litellm_provider": "openai"
},
"whisper-1": {
"mode": "audio_transcription",
"input_cost_per_second": 0,
"output_cost_per_second": 0.0001,
"litellm_provider": "openai"
},
"azure/whisper-1": {
"mode": "audio_transcription",
"input_cost_per_second": 0,
"output_cost_per_second": 0.0001,
"litellm_provider": "azure"
},
"azure/gpt-4-0125-preview": {
"max_tokens": 128000,
"max_input_tokens": 128000,
View file
@ -16,6 +16,13 @@ from importlib import resources
import shutil
telemetry = None
default_num_workers = 1
try:
default_num_workers = os.cpu_count() or 1
if default_num_workers is not None and default_num_workers > 1:
default_num_workers -= 1
except:
pass
def append_query_params(url, params):
@ -54,10 +61,10 @@ def is_port_in_use(port):
@click.option(
"--host", default="0.0.0.0", help="Host for the server to listen on.", envvar="HOST"
)
@click.option("--port", default=8000, help="Port to bind the server to.", envvar="PORT")
@click.option("--port", default=4000, help="Port to bind the server to.", envvar="PORT")
@click.option(
"--num_workers",
default=1,
default=default_num_workers,
help="Number of gunicorn workers to spin up",
envvar="NUM_WORKERS",
)
@ -266,7 +273,7 @@ def run_server(
],
}
response = requests.post("http://0.0.0.0:8000/queue/request", json=data)
response = requests.post("http://0.0.0.0:4000/queue/request", json=data)
response = response.json()
@ -500,7 +507,7 @@ def run_server(
print(
f"Unable to connect to DB. DATABASE_URL found in environment, but prisma package not found."
)
if port == 8000 and is_port_in_use(port):
if port == 4000 and is_port_in_use(port):
port = random.randint(1024, 49152)
from litellm.proxy.proxy_server import app
View file
@ -5,63 +5,9 @@ model_list:
api_base: os.environ/AZURE_API_BASE
api_key: os.environ/AZURE_API_KEY
api_version: "2023-07-01-preview"
model_info:
mode: chat
max_tokens: 4096
base_model: azure/gpt-4-1106-preview
access_groups: ["public"]
- model_name: openai-gpt-3.5
litellm_params:
model: gpt-3.5-turbo
api_key: os.environ/OPENAI_API_KEY
model_info:
access_groups: ["public"]
- model_name: anthropic-claude-v2.1
litellm_params:
model: bedrock/anthropic.claude-v2:1
timeout: 300 # sets a 5 minute timeout
model_info:
access_groups: ["private"]
- model_name: anthropic-claude-v2
litellm_params:
model: bedrock/anthropic.claude-v2
- model_name: bedrock-cohere
litellm_params:
model: bedrock/cohere.command-text-v14
timeout: 0.0001
- model_name: gpt-4
litellm_params:
model: azure/chatgpt-v-2
api_base: https://openai-gpt-4-test-v-1.openai.azure.com/
api_version: "2023-05-15"
api_key: os.environ/AZURE_API_KEY # The `os.environ/` prefix tells litellm to read this from the env. See https://docs.litellm.ai/docs/simple_proxy#load-api-keys-from-vault
model_info:
base_model: azure/gpt-4
- model_name: text-moderation-stable
litellm_params:
model: text-moderation-stable
api_key: os.environ/OPENAI_API_KEY
litellm_settings:
fallbacks: [{"openai-gpt-3.5": ["azure-gpt-3.5"]}]
success_callback: ['langfuse']
# setting callback class
callbacks: custom_callbacks.proxy_handler_instance # sets litellm.callbacks = [proxy_handler_instance]
general_settings:
master_key: sk-1234
alerting: ["slack"]
alerting_threshold: 10 # sends alerts if requests hang for 10 seconds
# database_type: "dynamo_db"
# database_args: { # 👈 all args - https://github.com/BerriAI/litellm/blob/befbcbb7ac8f59835ce47415c128decf37aac328/litellm/proxy/_types.py#L190
# "billing_mode": "PAY_PER_REQUEST",
# "region_name": "us-west-2",
# "ssl_verify": False
# }
environment_variables:
# otel: True # OpenTelemetry Logger
# master_key: sk-1234 # [OPTIONAL] Only use this if you to require all calls to contain this key (Authorization: Bearer sk-1234)
set_verbose: True
success_callback: ["langfuse"]
router_settings:
set_verbose: True
debug_level: "DEBUG"
View file
@ -0,0 +1,6 @@
model_list:
- model_name: gpt-3.5-turbo
litellm_params:
model: openai/my-fake-model
api_key: my-fake-key
api_base: http://0.0.0.0:8090
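# illustrative: start the proxy against this config with `litellm --config <path-to-this-file>`
# (the fake OpenAI-compatible server further below listens on http://0.0.0.0:8090)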
View file
@ -0,0 +1,27 @@
from locust import HttpUser, task, between
class MyUser(HttpUser):
wait_time = between(1, 5)
@task
def chat_completion(self):
headers = {
"Content-Type": "application/json",
# Include any additional headers you may need for authentication, etc.
}
# Customize the payload with "model" and "messages" keys
payload = {
"model": "gpt-3.5-turbo",
"messages": [
{"role": "system", "content": "You are a chat bot."},
{"role": "user", "content": "Hello, how are you?"},
],
# Add more data as necessary
}
# Make a POST request to the "chat/completions" endpoint
response = self.client.post("chat/completions", json=payload, headers=headers)
# Print or log the response if needed
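# Illustrative run command (assumes this file is saved as locustfile.py and the proxy listens on port 4000):
#   locust -f locustfile.py --host http://0.0.0.0:4000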
View file
@ -0,0 +1,50 @@
# import sys, os
# sys.path.insert(
# 0, os.path.abspath("../")
# ) # Adds the parent directory to the system path
from fastapi import FastAPI, Request, status, HTTPException, Depends
from fastapi.responses import StreamingResponse
from fastapi.security import OAuth2PasswordBearer
from fastapi.middleware.cors import CORSMiddleware
app = FastAPI()
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
# for completion
@app.post("/chat/completions")
@app.post("/v1/chat/completions")
async def completion(request: Request):
return {
"id": "chatcmpl-123",
"object": "chat.completion",
"created": 1677652288,
"model": "gpt-3.5-turbo-0125",
"system_fingerprint": "fp_44709d6fcb",
"choices": [
{
"index": 0,
"message": {
"role": "assistant",
"content": "\n\nHello there, how may I assist you today?",
},
"logprobs": None,
"finish_reason": "stop",
}
],
"usage": {"prompt_tokens": 9, "completion_tokens": 12, "total_tokens": 21},
}
if __name__ == "__main__":
import uvicorn
# run this on 8090, 8091, 8092 and 8093
uvicorn.run(app, host="0.0.0.0", port=8090)
View file
@ -123,6 +123,8 @@ from fastapi import (
Header,
Response,
Form,
UploadFile,
File,
)
from fastapi.routing import APIRouter
from fastapi.security import OAuth2PasswordBearer
@ -1684,9 +1686,9 @@ class ProxyConfig:
# these are litellm callbacks - "langfuse", "sentry", "wandb"
else:
litellm.success_callback.append(callback)
verbose_proxy_logger.debug(
print( # noqa
f"{blue_color_code} Initialized Success Callbacks - {litellm.success_callback} {reset_color_code}"
)
) # noqa
elif key == "failure_callback":
litellm.failure_callback = []
@ -2682,6 +2684,11 @@ async def chat_completion(
except:
data = json.loads(body_str)
# Azure OpenAI only: check if user passed api-version
query_params = dict(request.query_params)
if "api-version" in query_params:
data["api_version"] = query_params["api-version"]
# Include original request and headers in the data
data["proxy_server_request"] = {
"url": str(request.url),
@ -3079,13 +3086,13 @@ async def embeddings(
"/v1/images/generations",
dependencies=[Depends(user_api_key_auth)],
response_class=ORJSONResponse,
tags=["image generation"],
tags=["images"],
)
@router.post(
"/images/generations",
dependencies=[Depends(user_api_key_auth)],
response_class=ORJSONResponse,
tags=["image generation"],
tags=["images"],
)
async def image_generation(
request: Request,
@ -3226,6 +3233,168 @@ async def image_generation(
)
@router.post(
"/v1/audio/transcriptions",
dependencies=[Depends(user_api_key_auth)],
tags=["audio"],
)
@router.post(
"/audio/transcriptions",
dependencies=[Depends(user_api_key_auth)],
tags=["audio"],
)
async def audio_transcriptions(
request: Request,
file: UploadFile = File(...),
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
):
"""
Same params as:
https://platform.openai.com/docs/api-reference/audio/createTranscription?lang=curl
"""
global proxy_logging_obj
try:
# Use orjson to parse JSON data, orjson speeds up requests significantly
form_data = await request.form()
data: Dict = {key: value for key, value in form_data.items() if key != "file"}
# Include original request and headers in the data
data["proxy_server_request"] = { # type: ignore
"url": str(request.url),
"method": request.method,
"headers": dict(request.headers),
"body": copy.copy(data), # use copy instead of deepcopy
}
if data.get("user", None) is None and user_api_key_dict.user_id is not None:
data["user"] = user_api_key_dict.user_id
data["model"] = (
general_settings.get("moderation_model", None) # server default
or user_model # model name passed via cli args
or data["model"] # default passed in http request
)
if user_model:
data["model"] = user_model
if "metadata" not in data:
data["metadata"] = {}
data["metadata"]["user_api_key"] = user_api_key_dict.api_key
data["metadata"]["user_api_key_metadata"] = user_api_key_dict.metadata
_headers = dict(request.headers)
_headers.pop(
"authorization", None
) # do not store the original `sk-..` api key in the db
data["metadata"]["headers"] = _headers
data["metadata"]["user_api_key_alias"] = getattr(
user_api_key_dict, "key_alias", None
)
data["metadata"]["user_api_key_user_id"] = user_api_key_dict.user_id
data["metadata"]["user_api_key_team_id"] = getattr(
user_api_key_dict, "team_id", None
)
data["metadata"]["endpoint"] = str(request.url)
### TEAM-SPECIFIC PARAMS ###
if user_api_key_dict.team_id is not None:
team_config = await proxy_config.load_team_config(
team_id=user_api_key_dict.team_id
)
if len(team_config) == 0:
pass
else:
team_id = team_config.pop("team_id", None)
data["metadata"]["team_id"] = team_id
data = {
**team_config,
**data,
} # add the team-specific configs to the completion call
router_model_names = (
[m["model_name"] for m in llm_model_list]
if llm_model_list is not None
else []
)
assert (
file.filename is not None
) # make sure filename passed in (needed for type)
with open(file.filename, "wb+") as f:
f.write(await file.read())
try:
data["file"] = open(file.filename, "rb")
### CALL HOOKS ### - modify incoming data / reject request before calling the model
data = await proxy_logging_obj.pre_call_hook(
user_api_key_dict=user_api_key_dict,
data=data,
call_type="moderation",
)
## ROUTE TO CORRECT ENDPOINT ##
# skip router if user passed their key
if "api_key" in data:
response = await litellm.atranscription(**data)
elif (
llm_router is not None and data["model"] in router_model_names
): # model in router model list
response = await llm_router.atranscription(**data)
elif (
llm_router is not None
and data["model"] in llm_router.deployment_names
): # model in router deployments, calling a specific deployment on the router
response = await llm_router.atranscription(
**data, specific_deployment=True
)
elif (
llm_router is not None
and llm_router.model_group_alias is not None
and data["model"] in llm_router.model_group_alias
): # model set in model_group_alias
response = await llm_router.atranscription(
**data
) # ensure this goes through the llm_router; the router will do the correct alias mapping
elif user_model is not None: # `litellm --model <your-model-name>`
response = await litellm.atranscription(**data)
else:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail={"error": "Invalid model name passed in"},
)
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
finally:
os.remove(file.filename) # Delete the saved file
### ALERTING ###
data["litellm_status"] = "success" # used for alerting
return response
except Exception as e:
await proxy_logging_obj.post_call_failure_hook(
user_api_key_dict=user_api_key_dict, original_exception=e
)
traceback.print_exc()
if isinstance(e, HTTPException):
raise ProxyException(
message=getattr(e, "message", str(e.detail)),
type=getattr(e, "type", "None"),
param=getattr(e, "param", "None"),
code=getattr(e, "status_code", status.HTTP_400_BAD_REQUEST),
)
else:
error_traceback = traceback.format_exc()
error_msg = f"{str(e)}\n\n{error_traceback}"
raise ProxyException(
message=getattr(e, "message", error_msg),
type=getattr(e, "type", "None"),
param=getattr(e, "param", "None"),
code=getattr(e, "status_code", 500),
)
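Since the new `/v1/audio/transcriptions` route above mirrors OpenAI's multipart API, it can be exercised with the OpenAI SDK pointed at the proxy. A minimal sketch, assuming the proxy is listening on http://0.0.0.0:4000, a model group named "whisper" is configured, and "sk-1234" is a valid virtual key:
```python
# Sketch: call the proxy's new audio transcription route via the OpenAI SDK.
# Assumptions: proxy on 0.0.0.0:4000, a "whisper" model group in the proxy
# config, and "sk-1234" as a valid proxy key.
import openai

client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")

with open("gettysburg.wav", "rb") as audio_file:
    transcript = client.audio.transcriptions.create(
        model="whisper",
        file=audio_file,
    )

print(transcript.text)
```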
@router.post(
"/v1/moderations",
dependencies=[Depends(user_api_key_auth)],

View file

@ -5,20 +5,7 @@ import traceback
from large_text import text
from dotenv import load_dotenv
load_dotenv()
litellm_client = AsyncOpenAI(
base_url="http://0.0.0.0:4000",
api_key="sk-VEbqnb28-zDsFzQWTmiCsw",
# base_url="http://0.0.0.0:4000",
# api_key="sk-1234",
)
# litellm_client = AsyncAzureOpenAI(
# azure_endpoint="https://openai-gpt-4-test-v-1.openai.azure.com",
# api_key="d6f82361954b450188295b448e2091ca",
# api_version="2023-07-01-preview",
# )
litellm_client = AsyncOpenAI(base_url="http://0.0.0.0:4000", api_key="sk-1234")
async def litellm_completion():
# Your existing code for litellm_completion goes here
@ -44,7 +31,7 @@ async def litellm_completion():
async def main():
for i in range(6):
start = time.time()
n = 100 # Number of concurrent tasks
n = 20 # Number of concurrent tasks
tasks = [litellm_completion() for _ in range(n)]
chat_completions = await asyncio.gather(*tasks)
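The change above lowers the number of concurrent tasks from 100 to 20. An alternative way to keep the load on the proxy bounded without shrinking the batch is a semaphore; this is a sketch under the same assumptions as the script (an async `litellm_completion()` coroutine defined above):
```python
# Sketch: cap in-flight requests with a semaphore instead of lowering n.
# Assumes `litellm_completion()` is the coroutine defined in this script.
import asyncio

semaphore = asyncio.Semaphore(20)  # at most 20 requests in flight at once

async def bounded_completion():
    async with semaphore:
        return await litellm_completion()

async def run_batch(n: int = 100):
    tasks = [bounded_completion() for _ in range(n)]
    return await asyncio.gather(*tasks)

# asyncio.run(run_batch())
```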

View file

@ -693,6 +693,9 @@ class PrismaClient:
"""
Generic implementation of get data
"""
verbose_proxy_logger.debug(
f"PrismaClient: get_generic_data: {key}, table_name: {table_name}"
)
try:
if table_name == "users":
response = await self.db.litellm_usertable.find_first(
@ -758,6 +761,10 @@ class PrismaClient:
int
] = None, # pagination, number of rows to fetch when find_all==True
):
args_passed_in = locals()
verbose_proxy_logger.debug(
f"PrismaClient: get_data: token={token}, table_name: {table_name}, query_type: {query_type}, user_id: {user_id}, user_id_list: {user_id_list}, team_id: {team_id}, team_id_list: {team_id_list}, key_val: {key_val}"
)
try:
response: Any = None
if (token is not None and table_name is None) or (
@ -788,6 +795,12 @@ class PrismaClient:
response.expires, datetime
):
response.expires = response.expires.isoformat()
else:
# Token does not exist.
raise HTTPException(
status_code=status.HTTP_401_UNAUTHORIZED,
detail=f"Authentication Error: invalid user key - user key does not exist in db. User Key={token}",
)
elif query_type == "find_all" and user_id is not None:
response = await self.db.litellm_verificationtoken.find_many(
where={"user_id": user_id},
@ -991,9 +1004,11 @@ class PrismaClient:
except Exception as e:
import traceback
error_msg = f"LiteLLM Prisma Client Exception get_data: {str(e)}"
prisma_query_info = f"LiteLLM Prisma Client Exception: Error with `get_data`. Args passed in: {args_passed_in}"
error_msg = prisma_query_info + str(e)
print_verbose(error_msg)
error_traceback = error_msg + "\n" + traceback.format_exc()
verbose_proxy_logger.debug(error_traceback)
asyncio.create_task(
self.proxy_logging_obj.failure_handler(
original_exception=e, traceback_str=error_traceback
@ -1020,6 +1035,7 @@ class PrismaClient:
Add a key to the database. If it already exists, do nothing.
"""
try:
verbose_proxy_logger.debug(f"PrismaClient: insert_data: {data}")
if table_name == "key":
token = data["token"]
hashed_token = self.hash_token(token=token)
@ -1152,6 +1168,9 @@ class PrismaClient:
"""
Update existing data
"""
verbose_proxy_logger.debug(
f"PrismaClient: update_data, table_name: {table_name}"
)
try:
db_data = self.jsonify_object(data=data)
if update_key_values is not None:
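The `verbose_proxy_logger.debug(...)` calls added above only show up when debug logging is enabled. A minimal sketch of turning it on from Python via the standard logging module; the logger name "LiteLLM Proxy" is an assumption about how the proxy logger is registered, while configuring the root logger at DEBUG works regardless:
```python
# Sketch: surface the PrismaClient debug lines added above.
import logging

# Broad approach: shows DEBUG output from every library.
logging.basicConfig(level=logging.DEBUG)

# Narrower approach, assuming the proxy logger is registered as "LiteLLM Proxy".
logging.getLogger("LiteLLM Proxy").setLevel(logging.DEBUG)
```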

View file

@ -9,7 +9,7 @@
import copy, httpx
from datetime import datetime
from typing import Dict, List, Optional, Union, Literal, Any
from typing import Dict, List, Optional, Union, Literal, Any, BinaryIO
import random, threading, time, traceback, uuid
import litellm, openai
from litellm.caching import RedisCache, InMemoryCache, DualCache
@ -240,6 +240,21 @@ class Router:
{"caching_groups": caching_groups}
)
self.deployment_stats: dict = {} # used for debugging load balancing
"""
deployment_stats = {
"122999-2828282-277:
{
"model": "gpt-3",
"api_base": "http://localhost:4000",
"num_requests": 20,
"avg_latency": 0.001,
"num_failures": 0,
"num_successes": 20
}
}
"""
### ROUTING SETUP ###
if routing_strategy == "least-busy":
self.leastbusy_logger = LeastBusyLoggingHandler(
@ -390,6 +405,10 @@ class Router:
messages=messages,
specific_deployment=kwargs.pop("specific_deployment", None),
)
if self.set_verbose == True and self.debug_level == "DEBUG":
# debug how often this deployment is picked
self._print_deployment_metrics(deployment=deployment)
kwargs.setdefault("metadata", {}).update(
{
"deployment": deployment["litellm_params"]["model"],
@ -446,6 +465,9 @@ class Router:
verbose_router_logger.info(
f"litellm.acompletion(model={model_name})\033[32m 200 OK\033[0m"
)
if self.set_verbose == True and self.debug_level == "DEBUG":
# debug how often this deployment is picked
self._print_deployment_metrics(deployment=deployment, response=response)
return response
except Exception as e:
verbose_router_logger.info(
@ -611,6 +633,106 @@ class Router:
self.fail_calls[model_name] += 1
raise e
async def atranscription(self, file: BinaryIO, model: str, **kwargs):
"""
Example Usage:
```
from litellm import Router
client = Router(model_list = [
{
"model_name": "whisper",
"litellm_params": {
"model": "whisper-1",
},
},
])
audio_file = open("speech.mp3", "rb")
transcript = await client.atranscription(
model="whisper",
file=audio_file
)
```
"""
try:
kwargs["model"] = model
kwargs["file"] = file
kwargs["original_function"] = self._atranscription
kwargs["num_retries"] = kwargs.get("num_retries", self.num_retries)
timeout = kwargs.get("request_timeout", self.timeout)
kwargs.setdefault("metadata", {}).update({"model_group": model})
response = await self.async_function_with_fallbacks(**kwargs)
return response
except Exception as e:
raise e
async def _atranscription(self, file: BinaryIO, model: str, **kwargs):
try:
verbose_router_logger.debug(
f"Inside _atranscription()- model: {model}; kwargs: {kwargs}"
)
deployment = self.get_available_deployment(
model=model,
messages=[{"role": "user", "content": "prompt"}],
specific_deployment=kwargs.pop("specific_deployment", None),
)
kwargs.setdefault("metadata", {}).update(
{
"deployment": deployment["litellm_params"]["model"],
"model_info": deployment.get("model_info", {}),
}
)
kwargs["model_info"] = deployment.get("model_info", {})
data = deployment["litellm_params"].copy()
model_name = data["model"]
for k, v in self.default_litellm_params.items():
if (
k not in kwargs
): # prioritize model-specific params > default router params
kwargs[k] = v
elif k == "metadata":
kwargs[k].update(v)
potential_model_client = self._get_client(
deployment=deployment, kwargs=kwargs, client_type="async"
)
# check if provided keys == client keys #
dynamic_api_key = kwargs.get("api_key", None)
if (
dynamic_api_key is not None
and potential_model_client is not None
and dynamic_api_key != potential_model_client.api_key
):
model_client = None
else:
model_client = potential_model_client
self.total_calls[model_name] += 1
response = await litellm.atranscription(
**{
**data,
"file": file,
"caching": self.cache_responses,
"client": model_client,
**kwargs,
}
)
self.success_calls[model_name] += 1
verbose_router_logger.info(
f"litellm.atranscription(model={model_name})\033[32m 200 OK\033[0m"
)
return response
except Exception as e:
verbose_router_logger.info(
f"litellm.atranscription(model={model_name})\033[31m Exception {str(e)}\033[0m"
)
if model_name is not None:
self.fail_calls[model_name] += 1
raise e
async def amoderation(self, model: str, input: str, **kwargs):
try:
kwargs["model"] = model
@ -2124,6 +2246,63 @@ class Router:
)
return deployment
def _print_deployment_metrics(self, deployment, response=None):
try:
litellm_params = deployment["litellm_params"]
api_base = litellm_params.get("api_base", "")
model = litellm_params.get("model", "")
model_id = deployment.get("model_info", {}).get("id", None)
if response is None:
# update self.deployment_stats
if model_id is not None:
if model_id in self.deployment_stats:
# only update num_requests
self.deployment_stats[model_id]["num_requests"] += 1
else:
self.deployment_stats[model_id] = {
"api_base": api_base,
"model": model,
"num_requests": 1,
}
else:
# check response_ms and update num_successes
response_ms = response.get("_response_ms", 0)
if model_id is not None:
if model_id in self.deployment_stats:
# check if avg_latency exists
if "avg_latency" in self.deployment_stats[model_id]:
# update avg_latency
self.deployment_stats[model_id]["avg_latency"] = (
self.deployment_stats[model_id]["avg_latency"]
+ response_ms
) / self.deployment_stats[model_id]["num_successes"]
else:
self.deployment_stats[model_id]["avg_latency"] = response_ms
# check if num_successes exists
if "num_successes" in self.deployment_stats[model_id]:
self.deployment_stats[model_id]["num_successes"] += 1
else:
self.deployment_stats[model_id]["num_successes"] = 1
else:
self.deployment_stats[model_id] = {
"api_base": api_base,
"model": model,
"num_successes": 1,
"avg_latency": response_ms,
}
from pprint import pformat
# pretty-print the accumulated per-deployment stats
formatted_stats = pformat(self.deployment_stats)
# log at INFO so the stats are visible when debugging load balancing
verbose_router_logger.info("self.deployment_stats: \n%s", formatted_stats)
except Exception as e:
verbose_router_logger.error(f"Error in _print_deployment_metrics: {str(e)}")
def flush_cache(self):
litellm.cache = None
self.cache.flush_cache()
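Note that the `avg_latency` update in `_print_deployment_metrics` above divides by `num_successes` before the counter is incremented, so it is not a textbook running mean. For comparison, a minimal sketch of the standard incremental-mean update (not the shipped implementation):
```python
# Sketch: standard incremental (running) mean for per-deployment latency.
def update_avg_latency(stats: dict, response_ms: float) -> None:
    n = stats.get("num_successes", 0) + 1          # count including this response
    prev_avg = stats.get("avg_latency", 0.0)
    stats["avg_latency"] = prev_avg + (response_ms - prev_avg) / n
    stats["num_successes"] = n

stats = {}
for latency in (120.0, 80.0, 100.0):
    update_avg_latency(stats, latency)
print(stats)  # {'avg_latency': 100.0, 'num_successes': 3}
```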

View file

@ -83,12 +83,13 @@ def test_completion_claude():
def test_completion_claude_3_empty_response():
litellm.set_verbose = True
messages = [
{
"role": "system",
"content": "You are 2twNLGfqk4GMOn3ffp4p.",
},
{"role": "user", "content": "Hi gm!"},
{"role": "user", "content": "Hi gm!", "name": "ishaan"},
{"role": "assistant", "content": "Good morning! How are you doing today?"},
{
"role": "user",

View file

@ -511,7 +511,7 @@ def test_completion_mistral_api_stream():
def test_completion_deep_infra_stream():
# deep infra currently includes role in the 2nd chunk
# deep infra, currently includes role in the 2nd chunk
# waiting for them to make a fix on this
litellm.set_verbose = True
try:
@ -727,6 +727,31 @@ def test_completion_claude_stream_bad_key():
# pytest.fail(f"Error occurred: {e}")
def test_bedrock_claude_3_streaming():
try:
litellm.set_verbose = True
response: ModelResponse = completion(
model="bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
messages=messages,
max_tokens=10,
stream=True,
)
complete_response = ""
# Add any assertions here to check the response
for idx, chunk in enumerate(response):
chunk, finished = streaming_format_tests(idx, chunk)
if finished:
break
complete_response += chunk
if complete_response.strip() == "":
raise Exception("Empty response received")
print(f"completion_response: {complete_response}")
except RateLimitError:
pass
except Exception as e:
pytest.fail(f"Error occurred: {e}")
@pytest.mark.skip(reason="Replicate changed exceptions")
def test_completion_replicate_stream_bad_key():
try:

View file

@ -10,7 +10,6 @@
import sys, re, binascii, struct
import litellm
import dotenv, json, traceback, threading, base64, ast
import subprocess, os
from os.path import abspath, join, dirname
import litellm, openai
@ -98,7 +97,7 @@ try:
except Exception as e:
verbose_logger.debug(f"Exception import enterprise features {str(e)}")
from typing import cast, List, Dict, Union, Optional, Literal, Any
from typing import cast, List, Dict, Union, Optional, Literal, Any, BinaryIO
from .caching import Cache
from concurrent.futures import ThreadPoolExecutor
@ -790,6 +789,38 @@ class ImageResponse(OpenAIObject):
return self.dict()
class TranscriptionResponse(OpenAIObject):
text: Optional[str] = None
_hidden_params: dict = {}
def __init__(self, text=None):
super().__init__(text=text)
def __contains__(self, key):
# Define custom behavior for the 'in' operator
return hasattr(self, key)
def get(self, key, default=None):
# Custom .get() method to access attributes with a default value if the attribute doesn't exist
return getattr(self, key, default)
def __getitem__(self, key):
# Allow dictionary-style access to attributes
return getattr(self, key)
def __setitem__(self, key, value):
# Allow dictionary-style assignment of attributes
setattr(self, key, value)
def json(self, **kwargs):
try:
return self.model_dump() # noqa
except:
# if using pydantic v1
return self.dict()
############################################################
def print_verbose(print_statement, logger_only: bool = False):
try:
@ -815,6 +846,8 @@ class CallTypes(Enum):
aimage_generation = "aimage_generation"
moderation = "moderation"
amoderation = "amoderation"
atranscription = "atranscription"
transcription = "transcription"
# Logging function -> log the exact model details + what's being sent | Non-Blocking
@ -948,6 +981,7 @@ class Logging:
curl_command = self.model_call_details
# only print verbose if verbose logger is not set
if verbose_logger.level == 0:
# this means the verbose logger was not switched on - user is using litellm.set_verbose=True
print_verbose(f"\033[92m{curl_command}\033[0m\n")
@ -1279,6 +1313,15 @@ class Logging:
for callback in callbacks:
try:
litellm_params = self.model_call_details.get("litellm_params", {})
if litellm_params.get("no-log", False) == True:
# proxy cost tracking callbacks should still run
if not (
isinstance(callback, CustomLogger)
and "_PROXY_" in callback.__class__.__name__
):
print_verbose("no-log request, skipping logging")
continue
if callback == "lite_debugger":
print_verbose("reaches lite_debugger for logging!")
print_verbose(f"liteDebuggerClient: {liteDebuggerClient}")
@ -1707,7 +1750,20 @@ class Logging:
callbacks = litellm._async_success_callback
verbose_logger.debug(f"Async success callbacks: {callbacks}")
for callback in callbacks:
# check if callback can run for this request
litellm_params = self.model_call_details.get("litellm_params", {})
if litellm_params.get("no-log", False) == True:
# proxy cost tracking callbacks should still run
if not (
isinstance(callback, CustomLogger)
and "_PROXY_" in callback.__class__.__name__
):
print_verbose("no-log request, skipping logging")
continue
try:
if kwargs.get("no-log", False) == True:
print_verbose("no-log request, skipping logging")
continue
if callback == "cache" and litellm.cache is not None:
# set_cache once complete streaming response is built
print_verbose("async success_callback: reaches cache for logging!")
@ -2271,6 +2327,12 @@ def client(original_function):
or call_type == CallTypes.text_completion.value
):
messages = args[0] if len(args) > 0 else kwargs["prompt"]
elif (
call_type == CallTypes.atranscription.value
or call_type == CallTypes.transcription.value
):
_file_name: BinaryIO = args[1] if len(args) > 1 else kwargs["file"]
messages = "audio_file"
stream = True if "stream" in kwargs and kwargs["stream"] == True else False
logging_obj = Logging(
model=model,
@ -2568,6 +2630,8 @@ def client(original_function):
return result
elif "aimg_generation" in kwargs and kwargs["aimg_generation"] == True:
return result
elif "atranscription" in kwargs and kwargs["atranscription"] == True:
return result
### POST-CALL RULES ###
post_call_processing(original_response=result, model=model or None)
@ -2985,11 +3049,13 @@ def client(original_function):
print_verbose(
f"Async Wrapper: Completed Call, calling async_success_handler: {logging_obj.async_success_handler}"
)
# check if user does not want this to be logged
asyncio.create_task(
logging_obj.async_success_handler(result, start_time, end_time)
)
threading.Thread(
target=logging_obj.success_handler, args=(result, start_time, end_time)
target=logging_obj.success_handler,
args=(result, start_time, end_time),
).start()
# RETURN RESULT
@ -3892,6 +3958,7 @@ def get_litellm_params(
proxy_server_request=None,
acompletion=None,
preset_cache_key=None,
no_log=None,
):
litellm_params = {
"acompletion": acompletion,
@ -3908,6 +3975,7 @@ def get_litellm_params(
"model_info": model_info,
"proxy_server_request": proxy_server_request,
"preset_cache_key": preset_cache_key,
"no-log": no_log,
"stream_response": {}, # litellm_call_id: ModelResponse Dict
}
@ -4147,8 +4215,9 @@ def get_optional_params(
and custom_llm_provider != "mistral"
and custom_llm_provider != "anthropic"
and custom_llm_provider != "bedrock"
and custom_llm_provider != "ollama_chat"
):
if custom_llm_provider == "ollama" or custom_llm_provider == "ollama_chat":
if custom_llm_provider == "ollama":
# ollama actually supports json output
optional_params["format"] = "json"
litellm.add_function_to_prompt = (
@ -4174,7 +4243,7 @@ def get_optional_params(
else:
raise UnsupportedParamsError(
status_code=500,
message=f"Function calling is not supported by {custom_llm_provider}. To add it to the prompt, set `litellm.add_function_to_prompt = True`.",
message=f"Function calling is not supported by {custom_llm_provider}.",
)
def _check_valid_arg(supported_params):
@ -4227,15 +4296,9 @@ def get_optional_params(
## raise exception if provider doesn't support passed in param
if custom_llm_provider == "anthropic":
## check if unsupported param passed in
supported_params = [
"stream",
"stop",
"temperature",
"top_p",
"max_tokens",
"tools",
"tool_choice",
]
supported_params = get_supported_openai_params(
model=model, custom_llm_provider=custom_llm_provider
)
_check_valid_arg(supported_params=supported_params)
# handle anthropic params
if stream:
@ -4259,17 +4322,9 @@ def get_optional_params(
optional_params["tools"] = tools
elif custom_llm_provider == "cohere":
## check if unsupported param passed in
supported_params = [
"stream",
"temperature",
"max_tokens",
"logit_bias",
"top_p",
"frequency_penalty",
"presence_penalty",
"stop",
"n",
]
supported_params = get_supported_openai_params(
model=model, custom_llm_provider=custom_llm_provider
)
_check_valid_arg(supported_params=supported_params)
# handle cohere params
if stream:
@ -4292,14 +4347,9 @@ def get_optional_params(
optional_params["stop_sequences"] = stop
elif custom_llm_provider == "maritalk":
## check if unsupported param passed in
supported_params = [
"stream",
"temperature",
"max_tokens",
"top_p",
"presence_penalty",
"stop",
]
supported_params = get_supported_openai_params(
model=model, custom_llm_provider=custom_llm_provider
)
_check_valid_arg(supported_params=supported_params)
# handle maritalk params
if stream:
@ -4318,14 +4368,9 @@ def get_optional_params(
optional_params["stopping_tokens"] = stop
elif custom_llm_provider == "replicate":
## check if unsupported param passed in
supported_params = [
"stream",
"temperature",
"max_tokens",
"top_p",
"stop",
"seed",
]
supported_params = get_supported_openai_params(
model=model, custom_llm_provider=custom_llm_provider
)
_check_valid_arg(supported_params=supported_params)
if stream:
@ -4346,7 +4391,9 @@ def get_optional_params(
optional_params["stop_sequences"] = stop
elif custom_llm_provider == "huggingface":
## check if unsupported param passed in
supported_params = ["stream", "temperature", "max_tokens", "top_p", "stop", "n"]
supported_params = get_supported_openai_params(
model=model, custom_llm_provider=custom_llm_provider
)
_check_valid_arg(supported_params=supported_params)
# temperature, top_p, n, stream, stop, max_tokens, n, presence_penalty default to None
if temperature is not None:
@ -4385,16 +4432,9 @@ def get_optional_params(
) # since we handle translating echo, we should not send it to TGI request
elif custom_llm_provider == "together_ai":
## check if unsupported param passed in
supported_params = [
"stream",
"temperature",
"max_tokens",
"top_p",
"stop",
"frequency_penalty",
"tools",
"tool_choice",
]
supported_params = get_supported_openai_params(
model=model, custom_llm_provider=custom_llm_provider
)
_check_valid_arg(supported_params=supported_params)
if stream:
@ -4415,16 +4455,9 @@ def get_optional_params(
optional_params["tool_choice"] = tool_choice
elif custom_llm_provider == "ai21":
## check if unsupported param passed in
supported_params = [
"stream",
"n",
"temperature",
"max_tokens",
"top_p",
"stop",
"frequency_penalty",
"presence_penalty",
]
supported_params = get_supported_openai_params(
model=model, custom_llm_provider=custom_llm_provider
)
_check_valid_arg(supported_params=supported_params)
if stream:
@ -4447,7 +4480,9 @@ def get_optional_params(
custom_llm_provider == "palm" or custom_llm_provider == "gemini"
): # https://developers.generativeai.google/tutorials/curl_quickstart
## check if unsupported param passed in
supported_params = ["temperature", "top_p", "stream", "n", "stop", "max_tokens"]
supported_params = get_supported_openai_params(
model=model, custom_llm_provider=custom_llm_provider
)
_check_valid_arg(supported_params=supported_params)
if temperature is not None:
@ -4476,14 +4511,9 @@ def get_optional_params(
):
print_verbose(f"(start) INSIDE THE VERTEX AI OPTIONAL PARAM BLOCK")
## check if unsupported param passed in
supported_params = [
"temperature",
"top_p",
"max_tokens",
"stream",
"tools",
"tool_choice",
]
supported_params = get_supported_openai_params(
model=model, custom_llm_provider=custom_llm_provider
)
_check_valid_arg(supported_params=supported_params)
if temperature is not None:
@ -4513,7 +4543,9 @@ def get_optional_params(
)
elif custom_llm_provider == "sagemaker":
## check if unsupported param passed in
supported_params = ["stream", "temperature", "max_tokens", "top_p", "stop", "n"]
supported_params = get_supported_openai_params(
model=model, custom_llm_provider=custom_llm_provider
)
_check_valid_arg(supported_params=supported_params)
# temperature, top_p, n, stream, stop, max_tokens, n, presence_penalty default to None
if temperature is not None:
@ -4540,8 +4572,10 @@ def get_optional_params(
max_tokens = 1
optional_params["max_new_tokens"] = max_tokens
elif custom_llm_provider == "bedrock":
supported_params = get_supported_openai_params(
model=model, custom_llm_provider=custom_llm_provider
)
if "ai21" in model:
supported_params = ["max_tokens", "temperature", "top_p", "stream"]
_check_valid_arg(supported_params=supported_params)
# params "maxTokens":200,"temperature":0,"topP":250,"stop_sequences":[],
# https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=j2-ultra
@ -4554,9 +4588,6 @@ def get_optional_params(
if stream:
optional_params["stream"] = stream
elif "anthropic" in model:
supported_params = get_mapped_model_params(
model=model, custom_llm_provider=custom_llm_provider
)
_check_valid_arg(supported_params=supported_params)
# anthropic params on bedrock
# \"max_tokens_to_sample\":300,\"temperature\":0.5,\"top_p\":1,\"stop_sequences\":[\"\\\\n\\\\nHuman:\"]}"
@ -4573,7 +4604,6 @@ def get_optional_params(
optional_params=optional_params,
)
elif "amazon" in model: # amazon titan llms
supported_params = ["max_tokens", "temperature", "stop", "top_p", "stream"]
_check_valid_arg(supported_params=supported_params)
# see https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=titan-large
if max_tokens is not None:
@ -4590,7 +4620,6 @@ def get_optional_params(
if stream:
optional_params["stream"] = stream
elif "meta" in model: # amazon / meta llms
supported_params = ["max_tokens", "temperature", "top_p", "stream"]
_check_valid_arg(supported_params=supported_params)
# see https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=titan-large
if max_tokens is not None:
@ -4602,7 +4631,6 @@ def get_optional_params(
if stream:
optional_params["stream"] = stream
elif "cohere" in model: # cohere models on bedrock
supported_params = ["stream", "temperature", "max_tokens"]
_check_valid_arg(supported_params=supported_params)
# handle cohere params
if stream:
@ -4612,7 +4640,6 @@ def get_optional_params(
if max_tokens is not None:
optional_params["max_tokens"] = max_tokens
elif "mistral" in model:
supported_params = ["max_tokens", "temperature", "stop", "top_p", "stream"]
_check_valid_arg(supported_params=supported_params)
# mistral params on bedrock
# \"max_tokens\":400,\"temperature\":0.7,\"top_p\":0.7,\"stop\":[\"\\\\n\\\\nHuman:\"]}"
@ -4656,7 +4683,9 @@ def get_optional_params(
optional_params["stop_sequences"] = stop
elif custom_llm_provider == "cloudflare":
# https://developers.cloudflare.com/workers-ai/models/text-generation/#input
supported_params = ["max_tokens", "stream"]
supported_params = get_supported_openai_params(
model=model, custom_llm_provider=custom_llm_provider
)
_check_valid_arg(supported_params=supported_params)
if max_tokens is not None:
@ -4664,14 +4693,9 @@ def get_optional_params(
if stream is not None:
optional_params["stream"] = stream
elif custom_llm_provider == "ollama":
supported_params = [
"max_tokens",
"stream",
"top_p",
"temperature",
"frequency_penalty",
"stop",
]
supported_params = get_supported_openai_params(
model=model, custom_llm_provider=custom_llm_provider
)
_check_valid_arg(supported_params=supported_params)
if max_tokens is not None:
@ -4687,39 +4711,17 @@ def get_optional_params(
if stop is not None:
optional_params["stop"] = stop
elif custom_llm_provider == "ollama_chat":
supported_params = [
"max_tokens",
"stream",
"top_p",
"temperature",
"frequency_penalty",
"stop",
]
supported_params = litellm.OllamaChatConfig().get_supported_openai_params()
_check_valid_arg(supported_params=supported_params)
if max_tokens is not None:
optional_params["num_predict"] = max_tokens
if stream:
optional_params["stream"] = stream
if temperature is not None:
optional_params["temperature"] = temperature
if top_p is not None:
optional_params["top_p"] = top_p
if frequency_penalty is not None:
optional_params["repeat_penalty"] = frequency_penalty
if stop is not None:
optional_params["stop"] = stop
optional_params = litellm.OllamaChatConfig().map_openai_params(
non_default_params=non_default_params, optional_params=optional_params
)
elif custom_llm_provider == "nlp_cloud":
supported_params = [
"max_tokens",
"stream",
"temperature",
"top_p",
"presence_penalty",
"frequency_penalty",
"n",
"stop",
]
supported_params = get_supported_openai_params(
model=model, custom_llm_provider=custom_llm_provider
)
_check_valid_arg(supported_params=supported_params)
if max_tokens is not None:
@ -4739,7 +4741,9 @@ def get_optional_params(
if stop is not None:
optional_params["stop_sequences"] = stop
elif custom_llm_provider == "petals":
supported_params = ["max_tokens", "temperature", "top_p", "stream"]
supported_params = get_supported_openai_params(
model=model, custom_llm_provider=custom_llm_provider
)
_check_valid_arg(supported_params=supported_params)
# max_new_tokens=1,temperature=0.9, top_p=0.6
if max_tokens is not None:
@ -4751,18 +4755,9 @@ def get_optional_params(
if stream:
optional_params["stream"] = stream
elif custom_llm_provider == "deepinfra":
supported_params = [
"temperature",
"top_p",
"n",
"stream",
"stop",
"max_tokens",
"presence_penalty",
"frequency_penalty",
"logit_bias",
"user",
]
supported_params = get_supported_openai_params(
model=model, custom_llm_provider=custom_llm_provider
)
_check_valid_arg(supported_params=supported_params)
if temperature is not None:
if (
@ -4789,14 +4784,9 @@ def get_optional_params(
if user:
optional_params["user"] = user
elif custom_llm_provider == "perplexity":
supported_params = [
"temperature",
"top_p",
"stream",
"max_tokens",
"presence_penalty",
"frequency_penalty",
]
supported_params = get_supported_openai_params(
model=model, custom_llm_provider=custom_llm_provider
)
_check_valid_arg(supported_params=supported_params)
if temperature is not None:
if (
@ -4815,15 +4805,9 @@ def get_optional_params(
if frequency_penalty:
optional_params["frequency_penalty"] = frequency_penalty
elif custom_llm_provider == "anyscale":
supported_params = [
"temperature",
"top_p",
"stream",
"max_tokens",
"stop",
"frequency_penalty",
"presence_penalty",
]
supported_params = get_supported_openai_params(
model=model, custom_llm_provider=custom_llm_provider
)
if model in [
"mistralai/Mistral-7B-Instruct-v0.1",
"mistralai/Mixtral-8x7B-Instruct-v0.1",
@ -4851,14 +4835,9 @@ def get_optional_params(
if max_tokens:
optional_params["max_tokens"] = max_tokens
elif custom_llm_provider == "mistral":
supported_params = [
"temperature",
"top_p",
"stream",
"max_tokens",
"tools",
"tool_choice",
]
supported_params = get_supported_openai_params(
model=model, custom_llm_provider=custom_llm_provider
)
_check_valid_arg(supported_params=supported_params)
if temperature is not None:
optional_params["temperature"] = temperature
@ -4885,25 +4864,9 @@ def get_optional_params(
extra_body # openai client supports `extra_body` param
)
elif custom_llm_provider == "openrouter":
supported_params = [
"functions",
"function_call",
"temperature",
"top_p",
"n",
"stream",
"stop",
"max_tokens",
"presence_penalty",
"frequency_penalty",
"logit_bias",
"user",
"response_format",
"seed",
"tools",
"tool_choice",
"max_retries",
]
supported_params = get_supported_openai_params(
model=model, custom_llm_provider=custom_llm_provider
)
_check_valid_arg(supported_params=supported_params)
if functions is not None:
@ -4957,28 +4920,9 @@ def get_optional_params(
)
else: # assume passing in params for openai/azure openai
print_verbose(f"UNMAPPED PROVIDER, ASSUMING IT'S OPENAI/AZURE")
supported_params = [
"functions",
"function_call",
"temperature",
"top_p",
"n",
"stream",
"stop",
"max_tokens",
"presence_penalty",
"frequency_penalty",
"logit_bias",
"user",
"response_format",
"seed",
"tools",
"tool_choice",
"max_retries",
"logprobs",
"top_logprobs",
"extra_headers",
]
supported_params = get_supported_openai_params(
model=model, custom_llm_provider="openai"
)
_check_valid_arg(supported_params=supported_params)
if functions is not None:
optional_params["functions"] = functions
@ -5036,15 +4980,228 @@ def get_optional_params(
return optional_params
def get_mapped_model_params(model: str, custom_llm_provider: str):
def get_supported_openai_params(model: str, custom_llm_provider: str):
"""
Returns the supported openai params for a given model + provider
Example:
```
get_supported_openai_params(model="anthropic.claude-3", custom_llm_provider="bedrock")
```
"""
if custom_llm_provider == "bedrock":
if model.startswith("anthropic.claude-3"):
return litellm.AmazonAnthropicClaude3Config().get_supported_openai_params()
else:
elif model.startswith("anthropic"):
return litellm.AmazonAnthropicConfig().get_supported_openai_params()
elif model.startswith("ai21"):
return ["max_tokens", "temperature", "top_p", "stream"]
elif model.startswith("amazon"):
return ["max_tokens", "temperature", "stop", "top_p", "stream"]
elif model.startswith("meta"):
return ["max_tokens", "temperature", "top_p", "stream"]
elif model.startswith("cohere"):
return ["stream", "temperature", "max_tokens"]
elif model.startswith("mistral"):
return ["max_tokens", "temperature", "stop", "top_p", "stream"]
elif custom_llm_provider == "ollama_chat":
return litellm.OllamaChatConfig().get_supported_openai_params()
elif custom_llm_provider == "anthropic":
return [
"stream",
"stop",
"temperature",
"top_p",
"max_tokens",
"tools",
"tool_choice",
]
elif custom_llm_provider == "cohere":
return [
"stream",
"temperature",
"max_tokens",
"logit_bias",
"top_p",
"frequency_penalty",
"presence_penalty",
"stop",
"n",
]
elif custom_llm_provider == "maritalk":
return [
"stream",
"temperature",
"max_tokens",
"top_p",
"presence_penalty",
"stop",
]
elif custom_llm_provider == "openai" or custom_llm_provider == "azure":
return [
"functions",
"function_call",
"temperature",
"top_p",
"n",
"stream",
"stop",
"max_tokens",
"presence_penalty",
"frequency_penalty",
"logit_bias",
"user",
"response_format",
"seed",
"tools",
"tool_choice",
"max_retries",
"logprobs",
"top_logprobs",
"extra_headers",
]
elif custom_llm_provider == "openrouter":
return [
"functions",
"function_call",
"temperature",
"top_p",
"n",
"stream",
"stop",
"max_tokens",
"presence_penalty",
"frequency_penalty",
"logit_bias",
"user",
"response_format",
"seed",
"tools",
"tool_choice",
"max_retries",
]
elif custom_llm_provider == "mistral":
return [
"temperature",
"top_p",
"stream",
"max_tokens",
"tools",
"tool_choice",
]
elif custom_llm_provider == "replicate":
return [
"stream",
"temperature",
"max_tokens",
"top_p",
"stop",
"seed",
]
elif custom_llm_provider == "huggingface":
return ["stream", "temperature", "max_tokens", "top_p", "stop", "n"]
elif custom_llm_provider == "together_ai":
return [
"stream",
"temperature",
"max_tokens",
"top_p",
"stop",
"frequency_penalty",
"tools",
"tool_choice",
]
elif custom_llm_provider == "ai21":
return [
"stream",
"n",
"temperature",
"max_tokens",
"top_p",
"stop",
"frequency_penalty",
"presence_penalty",
]
elif custom_llm_provider == "palm" or custom_llm_provider == "gemini":
return ["temperature", "top_p", "stream", "n", "stop", "max_tokens"]
elif custom_llm_provider == "vertex_ai":
return [
"temperature",
"top_p",
"max_tokens",
"stream",
"tools",
"tool_choice",
]
elif custom_llm_provider == "sagemaker":
return ["stream", "temperature", "max_tokens", "top_p", "stop", "n"]
elif custom_llm_provider == "aleph_alpha":
return [
"max_tokens",
"stream",
"top_p",
"temperature",
"presence_penalty",
"frequency_penalty",
"n",
"stop",
]
elif custom_llm_provider == "cloudflare":
return ["max_tokens", "stream"]
elif custom_llm_provider == "ollama":
return [
"max_tokens",
"stream",
"top_p",
"temperature",
"frequency_penalty",
"stop",
]
elif custom_llm_provider == "nlp_cloud":
return [
"max_tokens",
"stream",
"temperature",
"top_p",
"presence_penalty",
"frequency_penalty",
"n",
"stop",
]
elif custom_llm_provider == "petals":
return ["max_tokens", "temperature", "top_p", "stream"]
elif custom_llm_provider == "deepinfra":
return [
"temperature",
"top_p",
"n",
"stream",
"stop",
"max_tokens",
"presence_penalty",
"frequency_penalty",
"logit_bias",
"user",
]
elif custom_llm_provider == "perplexity":
return [
"temperature",
"top_p",
"stream",
"max_tokens",
"presence_penalty",
"frequency_penalty",
]
elif custom_llm_provider == "anyscale":
return [
"temperature",
"top_p",
"stream",
"max_tokens",
"stop",
"frequency_penalty",
"presence_penalty",
]
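A short usage sketch for the new helper: the list it returns can be used to pre-filter OpenAI-style kwargs before calling a provider. The import path `litellm.utils` matches where the function is defined above; the expected result follows from the anthropic branch shown in this diff.
```python
# Sketch: filter OpenAI-style params down to what a provider supports.
from litellm.utils import get_supported_openai_params

supported = get_supported_openai_params(
    model="claude-3-opus-20240229",
    custom_llm_provider="anthropic",
)

requested = {"temperature": 0.2, "max_tokens": 256, "logit_bias": {"50256": -100}}
filtered = {k: v for k, v in requested.items() if k in supported}
# logit_bias is dropped for anthropic; temperature and max_tokens pass through
```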
def get_llm_provider(
@ -6149,10 +6306,10 @@ def convert_to_streaming_response(response_object: Optional[dict] = None):
def convert_to_model_response_object(
response_object: Optional[dict] = None,
model_response_object: Optional[
Union[ModelResponse, EmbeddingResponse, ImageResponse]
Union[ModelResponse, EmbeddingResponse, ImageResponse, TranscriptionResponse]
] = None,
response_type: Literal[
"completion", "embedding", "image_generation"
"completion", "embedding", "image_generation", "audio_transcription"
] = "completion",
stream=False,
start_time=None,
@ -6263,6 +6420,19 @@ def convert_to_model_response_object(
model_response_object.data = response_object["data"]
return model_response_object
elif response_type == "audio_transcription" and (
model_response_object is None
or isinstance(model_response_object, TranscriptionResponse)
):
if response_object is None:
raise Exception("Error in response object format")
if model_response_object is None:
model_response_object = TranscriptionResponse()
if "text" in response_object:
model_response_object.text = response_object["text"]
return model_response_object
except Exception as e:
raise Exception(f"Invalid response object {traceback.format_exc()}")
@ -7796,7 +7966,9 @@ def exception_type(
message=f"AzureException - {original_exception.message}",
llm_provider="azure",
model=model,
request=original_exception.request,
request=httpx.Request(
method="POST", url="https://openai.com/"
),
)
else:
# if no status code then it is an APIConnectionError: https://github.com/openai/openai-python#handling-errors
@ -7804,7 +7976,11 @@ def exception_type(
__cause__=original_exception.__cause__,
llm_provider="azure",
model=model,
request=original_exception.request,
request=getattr(
original_exception,
"request",
httpx.Request(method="POST", url="https://openai.com/"),
),
)
if (
"BadRequestError.__init__() missing 1 required positional argument: 'param'"
@ -8602,13 +8778,20 @@ class CustomStreamWrapper:
text = chunk_data.get("completions")[0].get("data").get("text")
is_finished = True
finish_reason = "stop"
# anthropic mapping
elif "completion" in chunk_data:
######## bedrock.anthropic mappings ###############
elif "completion" in chunk_data: # not claude-3
text = chunk_data["completion"] # bedrock.anthropic
stop_reason = chunk_data.get("stop_reason", None)
if stop_reason != None:
is_finished = True
finish_reason = stop_reason
elif "delta" in chunk_data:
if chunk_data["delta"].get("text", None) is not None:
text = chunk_data["delta"]["text"]
stop_reason = chunk_data["delta"].get("stop_reason", None)
if stop_reason != None:
is_finished = True
finish_reason = stop_reason
######## bedrock.cohere mappings ###############
# meta mapping
elif "generation" in chunk_data:

View file

@ -108,7 +108,7 @@
},
"gpt-3.5-turbo": {
"max_tokens": 4097,
"max_input_tokens": 4097,
"max_input_tokens": 16385,
"max_output_tokens": 4096,
"input_cost_per_token": 0.0000015,
"output_cost_per_token": 0.000002,
@ -293,6 +293,18 @@
"output_cost_per_pixel": 0.0,
"litellm_provider": "openai"
},
"whisper-1": {
"mode": "audio_transcription",
"input_cost_per_second": 0,
"output_cost_per_second": 0.0001,
"litellm_provider": "openai"
},
"azure/whisper-1": {
"mode": "audio_transcription",
"input_cost_per_second": 0,
"output_cost_per_second": 0.0001,
"litellm_provider": "azure"
},
"azure/gpt-4-0125-preview": {
"max_tokens": 128000,
"max_input_tokens": 128000,

View file

@ -1,6 +1,6 @@
[tool.poetry]
name = "litellm"
version = "1.30.1"
version = "1.30.6"
description = "Library to easily interface with LLM API providers"
authors = ["BerriAI"]
license = "MIT"
@ -75,7 +75,7 @@ requires = ["poetry-core", "wheel"]
build-backend = "poetry.core.masonry.api"
[tool.commitizen]
version = "1.30.1"
version = "1.30.6"
version_files = [
"pyproject.toml:^version"
]

BIN
tests/gettysburg.wav Normal file

Binary file not shown.

116
tests/test_whisper.py Normal file
View file

@ -0,0 +1,116 @@
# What is this?
## Tests `litellm.transcription` endpoint. Outside litellm module b/c of audio file used in testing (it's ~700kb).
import pytest
import asyncio, time
import aiohttp, traceback
from openai import AsyncOpenAI
import sys, os, dotenv
from typing import Optional
from dotenv import load_dotenv
# Get the current directory of the file being run
pwd = os.path.dirname(os.path.realpath(__file__))
print(pwd)
file_path = os.path.join(pwd, "gettysburg.wav")
audio_file = open(file_path, "rb")
load_dotenv()
sys.path.insert(
0, os.path.abspath("../")
) # Adds the parent directory to the system path
import litellm
from litellm import Router
def test_transcription():
transcript = litellm.transcription(
model="whisper-1",
file=audio_file,
)
print(f"transcript: {transcript}")
# test_transcription()
def test_transcription_azure():
litellm.set_verbose = True
transcript = litellm.transcription(
model="azure/azure-whisper",
file=audio_file,
api_base="https://my-endpoint-europe-berri-992.openai.azure.com/",
api_key=os.getenv("AZURE_EUROPE_API_KEY"),
api_version="2024-02-15-preview",
)
assert transcript.text is not None
assert isinstance(transcript.text, str)
# test_transcription_azure()
@pytest.mark.asyncio
async def test_transcription_async_azure():
transcript = await litellm.atranscription(
model="azure/azure-whisper",
file=audio_file,
api_base="https://my-endpoint-europe-berri-992.openai.azure.com/",
api_key=os.getenv("AZURE_EUROPE_API_KEY"),
api_version="2024-02-15-preview",
)
assert transcript.text is not None
assert isinstance(transcript.text, str)
# asyncio.run(test_transcription_async_azure())
@pytest.mark.asyncio
async def test_transcription_async_openai():
transcript = await litellm.atranscription(
model="whisper-1",
file=audio_file,
)
assert transcript.text is not None
assert isinstance(transcript.text, str)
@pytest.mark.asyncio
async def test_transcription_on_router():
litellm.set_verbose = True
print("\n Testing async transcription on router\n")
try:
model_list = [
{
"model_name": "whisper",
"litellm_params": {
"model": "whisper-1",
},
},
{
"model_name": "whisper",
"litellm_params": {
"model": "azure/azure-whisper",
"api_base": "https://my-endpoint-europe-berri-992.openai.azure.com/",
"api_key": os.getenv("AZURE_EUROPE_API_KEY"),
"api_version": "2024-02-15-preview",
},
},
]
router = Router(model_list=model_list)
response = await router.atranscription(
model="whisper",
file=audio_file,
)
print(response)
except Exception as e:
traceback.print_exc()
pytest.fail(f"Error occurred: {e}")