(docs) use port 4000

ishaan-jaff 2024-03-08 21:59:00 -08:00
parent 22e9d1073f
commit ea6f42216c
33 changed files with 179 additions and 179 deletions


@ -143,13 +143,13 @@ pip install 'litellm[proxy]'
```shell
$ litellm --model huggingface/bigcode/starcoder
#INFO: Proxy running on http://0.0.0.0:8000
#INFO: Proxy running on http://0.0.0.0:4000
```
### Step 2: Make ChatCompletions Request to Proxy
```python
import openai # openai v1.0.0+
client = openai.OpenAI(api_key="anything",base_url="http://0.0.0.0:8000") # set proxy to base_url
client = openai.OpenAI(api_key="anything",base_url="http://0.0.0.0:4000") # set proxy to base_url
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(model="gpt-3.5-turbo", messages = [
{
@ -170,7 +170,7 @@ Set budgets and rate limits across multiple projects
### Request
```shell
curl 'http://0.0.0.0:8000/key/generate' \
curl 'http://0.0.0.0:4000/key/generate' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data-raw '{"models": ["gpt-3.5-turbo", "gpt-4", "claude-2"], "duration": "20m","metadata": {"user": "ishaan@berri.ai", "team": "core-infra"}}'


@ -28,7 +28,7 @@ If `db.useStackgresOperator` is used (not yet implemented):
| `imagePullSecrets` | Registry credentials for the LiteLLM and initContainer images. | `[]` |
| `serviceAccount.create` | Whether or not to create a Kubernetes Service Account for this deployment. The default is `false` because LiteLLM has no need to access the Kubernetes API. | `false` |
| `service.type` | Kubernetes Service type (e.g. `LoadBalancer`, `ClusterIP`, etc.) | `ClusterIP` |
| `service.port` | TCP port that the Kubernetes Service will listen on. Also the TCP port within the Pod that the proxy will listen on. | `8000` |
| `service.port` | TCP port that the Kubernetes Service will listen on. Also the TCP port within the Pod that the proxy will listen on. | `4000` |
| `ingress.*` | See [values.yaml](./values.yaml) for example settings | N/A |
| `proxy_config.*` | See [values.yaml](./values.yaml) for default settings. See [example_config_yaml](../../../litellm/proxy/example_config_yaml/) for configuration examples. | N/A |
@ -76,7 +76,7 @@ When browsing to the URL published per the settings in `ingress.*`, you will
be prompted for **Admin Configuration**. The **Proxy Endpoint** is the internal
(from the `litellm` pod's perspective) URL published by the `<RELEASE>-litellm`
Kubernetes Service. If the deployment uses the default settings for this
service, the **Proxy Endpoint** should be set to `http://<RELEASE>-litellm:8000`.
service, the **Proxy Endpoint** should be set to `http://<RELEASE>-litellm:4000`.
The **Proxy Key** is the value specified for `masterkey` or, if a `masterkey`
was not provided to the helm command line, the `masterkey` is a randomly


@ -55,7 +55,7 @@ environmentSecrets: []
service:
type: ClusterIP
port: 8000
port: 4000
ingress:
enabled: false


@ -35,7 +35,7 @@ general_settings:
```bash
litellm --config /path/to/config.yaml
# RUNNING on http://0.0.0.0:8000
# RUNNING on http://0.0.0.0:4000
```
### Test
@ -44,7 +44,7 @@ litellm --config /path/to/config.yaml
<TabItem value="curl" label="Curl">
```bash
curl --location 'http://0.0.0.0:8000/embeddings' \
curl --location 'http://0.0.0.0:4000/embeddings' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{"input": ["Academia.edu uses"], "model": "textembedding-gecko", "encoding_format": "base64"}'
@ -57,7 +57,7 @@ curl --location 'http://0.0.0.0:8000/embeddings' \
from openai import OpenAI
client = OpenAI(
api_key="sk-1234",
base_url="http://0.0.0.0:8000"
base_url="http://0.0.0.0:4000"
)
client.embeddings.create(
@ -72,7 +72,7 @@ client.embeddings.create(
```python
from langchain_openai import OpenAIEmbeddings
embeddings = OpenAIEmbeddings(model="textembedding-gecko", openai_api_base="http://0.0.0.0:8000", openai_api_key="sk-1234")
embeddings = OpenAIEmbeddings(model="textembedding-gecko", openai_api_base="http://0.0.0.0:4000", openai_api_key="sk-1234")
text = "This is a test document."
@ -200,7 +200,7 @@ Use this for calling `/embedding` endpoints on OpenAI Compatible Servers, exampl
from litellm import embedding
response = embedding(
model = "openai/<your-llm-name>", # add `openai/` prefix to model so litellm knows to route to OpenAI
api_base="http://0.0.0.0:8000/" # set API Base of your Custom OpenAI Endpoint
api_base="http://0.0.0.0:4000/" # set API Base of your Custom OpenAI Endpoint
input=["good morning from litellm"]
)
```


@ -368,13 +368,13 @@ pip install 'litellm[proxy]'
```shell
$ litellm --model huggingface/bigcode/starcoder
#INFO: Proxy running on http://0.0.0.0:8000
#INFO: Proxy running on http://0.0.0.0:4000
```
#### Step 2: Make ChatCompletions Request to Proxy
```python
import openai # openai v1.0.0+
client = openai.OpenAI(api_key="anything",base_url="http://0.0.0.0:8000") # set proxy to base_url
client = openai.OpenAI(api_key="anything",base_url="http://0.0.0.0:4000") # set proxy to base_url
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(model="gpt-3.5-turbo", messages = [
{


@ -90,7 +90,7 @@ import time, asyncio, litellm
#### LITELLM PROXY ####
litellm_client = AsyncOpenAI(
api_key="sk-1234", # [CHANGE THIS]
base_url="http://0.0.0.0:8000"
base_url="http://0.0.0.0:4000"
)
#### AZURE OPENAI CLIENT ####


@ -63,7 +63,7 @@ export ANTHROPIC_API_KEY="your-api-key"
```bash
$ litellm --model claude-3-opus-20240229
# Server running on http://0.0.0.0:8000
# Server running on http://0.0.0.0:4000
```
### 3. Test it
@ -73,7 +73,7 @@ $ litellm --model claude-3-opus-20240229
<TabItem value="Curl" label="Curl Request">
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "gpt-3.5-turbo",
@ -93,7 +93,7 @@ curl --location 'http://0.0.0.0:8000/chat/completions' \
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:8000"
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
@ -120,7 +120,7 @@ from langchain.prompts.chat import (
from langchain.schema import HumanMessage, SystemMessage
chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:8000", # set openai_api_base to the LiteLLM Proxy
openai_api_base="http://0.0.0.0:4000", # set openai_api_base to the LiteLLM Proxy
model = "gpt-3.5-turbo",
temperature=0.1
)


@ -54,7 +54,7 @@ export AWS_REGION_NAME=""
```bash
$ litellm --model anthropic.claude-3-sonnet-20240229-v1:0
# Server running on http://0.0.0.0:8000
# Server running on http://0.0.0.0:4000
```
### 3. Test it
@ -64,7 +64,7 @@ $ litellm --model anthropic.claude-3-sonnet-20240229-v1:0
<TabItem value="Curl" label="Curl Request">
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "gpt-3.5-turbo",
@ -84,7 +84,7 @@ curl --location 'http://0.0.0.0:8000/chat/completions' \
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:8000"
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
@ -111,7 +111,7 @@ from langchain.prompts.chat import (
from langchain.schema import HumanMessage, SystemMessage
chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:8000", # set openai_api_base to the LiteLLM Proxy
openai_api_base="http://0.0.0.0:4000", # set openai_api_base to the LiteLLM Proxy
model = "gpt-3.5-turbo",
temperature=0.1
)


@ -183,7 +183,7 @@ On the docker container run the `test.py` file using `python3 test.py`
```python
import openai
api_base = f"http://0.0.0.0:8000" # base url for server
api_base = f"http://0.0.0.0:4000" # base url for server
openai.api_base = api_base
openai.api_key = "temp-key"


@ -15,7 +15,7 @@ import os
response = litellm.completion(
model="openai/mistral, # add `openai/` prefix to model so litellm knows to route to OpenAI
api_key="sk-1234", # api key to your openai compatible endpoint
api_base="http://0.0.0.0:8000", # set API Base of your Custom OpenAI Endpoint
api_base="http://0.0.0.0:4000", # set API Base of your Custom OpenAI Endpoint
messages=[
{
"role": "user",
@ -35,7 +35,7 @@ import os
response = litellm.embedding(
model="openai/GPT-J", # add `openai/` prefix to model so litellm knows to route to OpenAI
api_key="sk-1234", # api key to your openai compatible endpoint
api_base="http://0.0.0.0:8000", # set API Base of your Custom OpenAI Endpoint
api_base="http://0.0.0.0:4000", # set API Base of your Custom OpenAI Endpoint
input=["good morning from litellm"]
)
print(response)


@ -145,7 +145,7 @@ $ litellm --config /path/to/config.yaml
Send the same request twice:
```shell
curl http://0.0.0.0:8000/v1/chat/completions \
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "gpt-3.5-turbo",
@ -153,7 +153,7 @@ curl http://0.0.0.0:8000/v1/chat/completions \
"temperature": 0.7
}'
curl http://0.0.0.0:8000/v1/chat/completions \
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "gpt-3.5-turbo",
@ -166,14 +166,14 @@ curl http://0.0.0.0:8000/v1/chat/completions \
Send the same request twice:
```shell
curl --location 'http://0.0.0.0:8000/embeddings' \
curl --location 'http://0.0.0.0:4000/embeddings' \
--header 'Content-Type: application/json' \
--data ' {
"model": "text-embedding-ada-002",
"input": ["write a litellm poem"]
}'
curl --location 'http://0.0.0.0:8000/embeddings' \
curl --location 'http://0.0.0.0:4000/embeddings' \
--header 'Content-Type: application/json' \
--data ' {
"model": "text-embedding-ada-002",
@ -227,7 +227,7 @@ from openai import OpenAI
client = OpenAI(
# This is the default and can be omitted
api_key=os.environ.get("OPENAI_API_KEY"),
base_url="http://0.0.0.0:8000"
base_url="http://0.0.0.0:4000"
)
chat_completion = client.chat.completions.create(
@ -255,7 +255,7 @@ from openai import OpenAI
client = OpenAI(
# This is the default and can be omitted
api_key=os.environ.get("OPENAI_API_KEY"),
base_url="http://0.0.0.0:8000"
base_url="http://0.0.0.0:4000"
)
chat_completion = client.chat.completions.create(
@ -281,7 +281,7 @@ from openai import OpenAI
client = OpenAI(
# This is the default and can be omitted
api_key=os.environ.get("OPENAI_API_KEY"),
base_url="http://0.0.0.0:8000"
base_url="http://0.0.0.0:4000"
)
chat_completion = client.chat.completions.create(


@ -63,7 +63,7 @@ litellm_settings:
$ litellm /path/to/config.yaml
```
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--data ' {
"model": "gpt-3.5-turbo",
"messages": [
@ -162,7 +162,7 @@ litellm_settings:
$ litellm /path/to/config.yaml
```
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--data ' {
"model": "gpt-3.5-turbo",
"messages": [


@ -15,7 +15,7 @@ Cli arguments, --host, --port, --num_workers
```
## --port
- **Default:** `8000`
- **Default:** `4000`
- The port to bind the server to.
- **Usage:**
```shell


@ -13,7 +13,7 @@ Set model list, `api_base`, `api_key`, `temperature` & proxy server settings (`m
| `general_settings` | Server settings, example setting `master_key: sk-my_special_key` |
| `environment_variables` | Environment Variables example, `REDIS_HOST`, `REDIS_PORT` |
**Complete List:** Check the Swagger UI docs on `<your-proxy-url>/#/config.yaml` (e.g. http://0.0.0.0:8000/#/config.yaml) for everything you can pass in the config.yaml.
**Complete List:** Check the Swagger UI docs on `<your-proxy-url>/#/config.yaml` (e.g. http://0.0.0.0:4000/#/config.yaml) for everything you can pass in the config.yaml.
## Quick Start
@ -55,7 +55,7 @@ model_list:
- model_name: vllm-models
litellm_params:
model: openai/facebook/opt-125m # the `openai/` prefix tells litellm it's openai compatible
api_base: http://0.0.0.0:8000
api_base: http://0.0.0.0:4000
rpm: 1440
model_info:
version: 2
@ -91,7 +91,7 @@ Sends request to model where `model_name=gpt-3.5-turbo` on config.yaml.
If multiple deployments share `model_name=gpt-3.5-turbo`, this does [Load Balancing](https://docs.litellm.ai/docs/proxy/load_balancing)
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "gpt-3.5-turbo",
@ -111,7 +111,7 @@ curl --location 'http://0.0.0.0:8000/chat/completions' \
Sends this request to model where `model_name=bedrock-claude-v1` on config.yaml
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "bedrock-claude-v1",
@ -131,7 +131,7 @@ curl --location 'http://0.0.0.0:8000/chat/completions' \
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:8000"
base_url="http://0.0.0.0:4000"
)
# Sends request to model where `model_name=gpt-3.5-turbo` on config.yaml.
@ -179,7 +179,7 @@ messages = [
# Sends request to model where `model_name=gpt-3.5-turbo` on config.yaml.
chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:8000", # set openai base to the proxy
openai_api_base="http://0.0.0.0:4000", # set openai base to the proxy
model = "gpt-3.5-turbo",
temperature=0.1
)
@ -189,7 +189,7 @@ print(response)
# Sends request to model where `model_name=bedrock-claude-v1` on config.yaml.
claude_chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:8000", # set openai base to the proxy
openai_api_base="http://0.0.0.0:4000", # set openai base to the proxy
model = "bedrock-claude-v1",
temperature=0.1
)
@ -560,7 +560,7 @@ litellm --config config.yaml
Sends Request to `bedrock-cohere`
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "bedrock-cohere",


@ -241,10 +241,10 @@ helm install \
kubectl \
port-forward \
service/mydeploy-litellm \
8000:8000
4000:4000
```
Your OpenAI proxy server is now running on `http://127.0.0.1:8000`.
Your OpenAI proxy server is now running on `http://127.0.0.1:4000`.
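To confirm the port-forward works end to end, you can send a test request through it (a minimal sketch, assuming the deployment's `masterkey` is `sk-1234` and a `gpt-3.5-turbo` model is configured in the proxy config):

```shell
curl http://127.0.0.1:4000/chat/completions \
  --header 'Authorization: Bearer sk-1234' \
  --header 'Content-Type: application/json' \
  --data '{
    "model": "gpt-3.5-turbo",
    "messages": [{"role": "user", "content": "hello, which llm are you"}]
  }'
```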
</TabItem>
</Tabs>
@ -393,11 +393,11 @@ services:
target: runtime
image: ghcr.io/berriai/litellm:main-latest
ports:
- "8000:8000" # Map the container port to the host, change the host port if necessary
- "4000:4000" # Map the container port to the host, change the host port if necessary
volumes:
- ./litellm-config.yaml:/app/config.yaml # Mount the local configuration file
# You can change the port or number of workers as per your requirements or pass any other supported CLI argument. Make sure the port passed here matches the container port defined above in the `ports` value
command: [ "--config", "/app/config.yaml", "--port", "8000", "--num_workers", "8" ]
command: [ "--config", "/app/config.yaml", "--port", "4000", "--num_workers", "8" ]
# ...rest of your docker-compose config if any
```
@ -415,4 +415,4 @@ Run the command `docker-compose up` or `docker compose up` as per your docker in
> Use `-d` flag to run the container in detached mode (background) e.g. `docker compose up -d`
Your LiteLLM container should now be running on the defined port, e.g. `8000`.
Your LiteLLM container should now be running on the defined port, e.g. `4000`.
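As a quick sanity check, you can hit the proxy's unprotected readiness endpoint from the host (a minimal sketch, assuming the default `4000:4000` port mapping shown above):

```shell
# Returns a success response once the proxy is ready to accept requests
curl http://localhost:4000/health/readiness
```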


@ -38,7 +38,7 @@ $ litellm --config /path/to/config.yaml
3. Test the embedding call
```shell
curl --location 'http://0.0.0.0:8000/v1/embeddings' \
curl --location 'http://0.0.0.0:4000/v1/embeddings' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{


@ -58,7 +58,7 @@ callbacks: ["llamaguard_moderations"]
Set the LLM Guard API Base in your environment
```env
LLM_GUARD_API_BASE = "http://0.0.0.0:8000"
LLM_GUARD_API_BASE = "http://0.0.0.0:4000"
```
Add `llmguard_moderations` as a callback
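For example, the callback can be enabled in the proxy `config.yaml` (a sketch, assuming the same `litellm_settings` block used for callbacks elsewhere in these docs):

```yaml
litellm_settings:
  callbacks: ["llmguard_moderations"]
```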
@ -143,7 +143,7 @@ When `no-log=True`, the request will **not be logged on any callbacks** and ther
import openai
client = openai.OpenAI(
api_key="anything", # proxy api-key
base_url="http://0.0.0.0:8000" # litellm proxy
base_url="http://0.0.0.0:4000" # litellm proxy
)
response = client.chat.completions.create(
@ -175,7 +175,7 @@ litellm_settings:
### How to test
```bash
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "gpt-3.5-turbo",
@ -202,7 +202,7 @@ curl --location 'http://0.0.0.0:8000/chat/completions' \
**Block all calls for a user id**
```
curl -X POST "http://0.0.0.0:8000/user/block" \
curl -X POST "http://0.0.0.0:4000/user/block" \
-H "Authorization: Bearer sk-1234" \
-d '{
"user_ids": [<user_id>, ...]
@ -212,7 +212,7 @@ curl -X POST "http://0.0.0.0:8000/user/block" \
**Unblock calls for a user id**
```
curl -X POST "http://0.0.0.0:8000/user/unblock" \
curl -X POST "http://0.0.0.0:4000/user/unblock" \
-H "Authorization: Bearer sk-1234" \
-d '{
"user_ids": [<user_id>, ...]
@ -230,7 +230,7 @@ litellm_settings:
### Test this
```bash
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "gpt-3.5-turbo",
@ -263,7 +263,7 @@ Set `extra_body={"metadata": { }}` to `metadata` you want to pass
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:8000"
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
@ -291,7 +291,7 @@ print(response)
Pass `metadata` as part of the request body
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data '{
"model": "gpt-3.5-turbo",
@ -317,7 +317,7 @@ from langchain.prompts.chat import (
from langchain.schema import HumanMessage, SystemMessage
chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:8000",
openai_api_base="http://0.0.0.0:4000",
model = "gpt-3.5-turbo",
temperature=0.1,
extra_body={

View file

@ -12,10 +12,10 @@ The proxy exposes:
#### Request
Make a GET Request to `/health` on the proxy
```shell
curl --location 'http://0.0.0.0:8000/health' -H "Authorization: Bearer sk-1234"
curl --location 'http://0.0.0.0:4000/health' -H "Authorization: Bearer sk-1234"
```
You can also run `litellm --health`; it makes a `GET` request to `http://0.0.0.0:8000/health` for you
You can also run `litellm --health`; it makes a `GET` request to `http://0.0.0.0:4000/health` for you
```
litellm --health
```
@ -60,7 +60,7 @@ $ litellm /path/to/config.yaml
3. Query health endpoint:
```
curl --location 'http://0.0.0.0:8000/health'
curl --location 'http://0.0.0.0:4000/health'
```
### Embedding Models
@ -119,7 +119,7 @@ Unprotected endpoint for checking if proxy is ready to accept requests
Example Request:
```bash
curl --location 'http://0.0.0.0:8000/health/readiness'
curl --location 'http://0.0.0.0:4000/health/readiness'
```
Example Response:
@ -153,7 +153,7 @@ Example Request:
```
curl -X 'GET' \
'http://0.0.0.0:8000/health/liveliness' \
'http://0.0.0.0:4000/health/liveliness' \
-H 'accept: application/json'
```


@ -45,7 +45,7 @@ $ litellm --config /path/to/config.yaml
### Step 3: Use proxy - Call a model group [Load Balancing]
Curl Command
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "gpt-3.5-turbo",
@ -65,7 +65,7 @@ If you want to call a specific model defined in the `config.yaml`, you can call
In this example it will call `azure/gpt-turbo-small-ca`, as defined in the config in Step 1.
```bash
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "azure/gpt-turbo-small-ca",


@ -150,7 +150,7 @@ litellm --config proxy_config.yaml
```
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Authorization: Bearer sk-1234' \
--data ' {
"model": "gpt-3.5-turbo",
@ -174,7 +174,7 @@ On Success
Usage: {'completion_tokens': 10, 'prompt_tokens': 11, 'total_tokens': 21},
Cost: 3.65e-05,
Response: {'id': 'chatcmpl-8S8avKJ1aVBg941y5xzGMSKrYCMvN', 'choices': [{'finish_reason': 'stop', 'index': 0, 'message': {'content': 'Good morning! How can I assist you today?', 'role': 'assistant'}}], 'created': 1701716913, 'model': 'gpt-3.5-turbo-0613', 'object': 'chat.completion', 'system_fingerprint': None, 'usage': {'completion_tokens': 10, 'prompt_tokens': 11, 'total_tokens': 21}}
Proxy Metadata: {'user_api_key': None, 'headers': Headers({'host': '0.0.0.0:8000', 'user-agent': 'curl/7.88.1', 'accept': '*/*', 'authorization': 'Bearer sk-1234', 'content-length': '199', 'content-type': 'application/x-www-form-urlencoded'}), 'model_group': 'gpt-3.5-turbo', 'deployment': 'gpt-3.5-turbo-ModelID-gpt-3.5-turbo'}
Proxy Metadata: {'user_api_key': None, 'headers': Headers({'host': '0.0.0.0:4000', 'user-agent': 'curl/7.88.1', 'accept': '*/*', 'authorization': 'Bearer sk-1234', 'content-length': '199', 'content-type': 'application/x-www-form-urlencoded'}), 'model_group': 'gpt-3.5-turbo', 'deployment': 'gpt-3.5-turbo-ModelID-gpt-3.5-turbo'}
```
#### Logging Proxy Request Object, Header, Url
@ -374,7 +374,7 @@ async def log_event(request: Request):
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host="127.0.0.1", port=8000)
uvicorn.run(app, host="127.0.0.1", port=4000)
```
@ -383,7 +383,7 @@ if __name__ == "__main__":
#### Step 2. Set your `GENERIC_LOGGER_ENDPOINT` to the endpoint + route we should send callback logs to
```shell
os.environ["GENERIC_LOGGER_ENDPOINT"] = "http://localhost:8000/log-event"
os.environ["GENERIC_LOGGER_ENDPOINT"] = "http://localhost:4000/log-event"
```
#### Step 3. Create a `config.yaml` file and set `litellm_settings`: `success_callback` = ["generic"]
@ -445,7 +445,7 @@ Expected output on Langfuse
Pass `metadata` as part of the request body
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data '{
"model": "gpt-3.5-turbo",
@ -472,7 +472,7 @@ Set `extra_body={"metadata": { }}` to `metadata` you want to pass
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:8000"
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
@ -509,7 +509,7 @@ from langchain.prompts.chat import (
from langchain.schema import HumanMessage, SystemMessage
chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:8000",
openai_api_base="http://0.0.0.0:4000",
model = "gpt-3.5-turbo",
temperature=0.1,
extra_body={
@ -663,7 +663,7 @@ litellm --config config.yaml --debug
Test Request
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "Azure OpenAI GPT-4 East",
@ -698,7 +698,7 @@ litellm_settings:
Now, when you [generate keys](./virtual_keys.md) for this team-id
```bash
curl -X POST 'http://0.0.0.0:8000/key/generate' \
curl -X POST 'http://0.0.0.0:4000/key/generate' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-D '{"team_id": "ishaans-secret-project"}'
@ -742,7 +742,7 @@ litellm --config config.yaml --debug
Test Request
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "Azure OpenAI GPT-4 East",
@ -903,7 +903,7 @@ litellm --config config.yaml --debug
Test Request
```
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "gpt-3.5-turbo",
@ -947,7 +947,7 @@ litellm --config config.yaml --debug
Test Request
```
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "gpt-3.5-turbo",


@ -24,7 +24,7 @@ Retrieve detailed information about each model listed in the `/models` endpoint,
<TabItem value="curl">
```bash
curl -X GET "http://0.0.0.0:8000/model/info" \
curl -X GET "http://0.0.0.0:4000/model/info" \
-H "accept: application/json" \
```
</TabItem>
@ -42,7 +42,7 @@ Add a new model to the list in the `config.yaml` by providing the model paramete
<TabItem value="curl">
```bash
curl -X POST "http://0.0.0.0:8000/model/new" \
curl -X POST "http://0.0.0.0:4000/model/new" \
-H "accept: application/json" \
-H "Content-Type: application/json" \
-d '{ "model_name": "azure-gpt-turbo", "litellm_params": {"model": "azure/gpt-3.5-turbo", "api_key": "os.environ/AZURE_API_KEY", "api_base": "my-azure-api-base"} }'


@ -96,7 +96,7 @@ Turn off PII masking for a given key.
Do this by setting `permissions: {"pii": false}`, when generating a key.
```shell
curl --location 'http://0.0.0.0:8000/key/generate' \
curl --location 'http://0.0.0.0:4000/key/generate' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{
@ -119,7 +119,7 @@ The proxy support 2 request-level PII controls:
Set `allow_pii_controls` to true for a given key. This will allow the user to set request-level PII controls.
```bash
curl --location 'http://0.0.0.0:8000/key/generate' \
curl --location 'http://0.0.0.0:4000/key/generate' \
--header 'Authorization: Bearer my-master-key' \
--header 'Content-Type: application/json' \
--data '{
@ -136,7 +136,7 @@ from openai import OpenAI
client = OpenAI(
# This is the default and can be omitted
api_key=os.environ.get("OPENAI_API_KEY"),
base_url="http://0.0.0.0:8000"
base_url="http://0.0.0.0:4000"
)
chat_completion = client.chat.completions.create(


@ -21,7 +21,7 @@ Run the following command to start the litellm proxy
```shell
$ litellm --model huggingface/bigcode/starcoder
#INFO: Proxy running on http://0.0.0.0:8000
#INFO: Proxy running on http://0.0.0.0:4000
```
### Test
@ -250,7 +250,7 @@ litellm --config your_config.yaml
<TabItem value="Curl" label="Curl Request">
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "gpt-3.5-turbo",
@ -270,7 +270,7 @@ curl --location 'http://0.0.0.0:8000/chat/completions' \
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:8000"
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
@ -297,7 +297,7 @@ from langchain.prompts.chat import (
from langchain.schema import HumanMessage, SystemMessage
chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:8000", # set openai_api_base to the LiteLLM Proxy
openai_api_base="http://0.0.0.0:4000", # set openai_api_base to the LiteLLM Proxy
model = "gpt-3.5-turbo",
temperature=0.1
)
@ -321,7 +321,7 @@ print(response)
```python
from langchain.embeddings import OpenAIEmbeddings
embeddings = OpenAIEmbeddings(model="sagemaker-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
embeddings = OpenAIEmbeddings(model="sagemaker-embeddings", openai_api_base="http://0.0.0.0:4000", openai_api_key="temp-key")
text = "This is a test document."
@ -331,7 +331,7 @@ query_result = embeddings.embed_query(text)
print(f"SAGEMAKER EMBEDDINGS")
print(query_result[:5])
embeddings = OpenAIEmbeddings(model="bedrock-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
embeddings = OpenAIEmbeddings(model="bedrock-embeddings", openai_api_base="http://0.0.0.0:4000", openai_api_key="temp-key")
text = "This is a test document."
@ -340,7 +340,7 @@ query_result = embeddings.embed_query(text)
print(f"BEDROCK EMBEDDINGS")
print(query_result[:5])
embeddings = OpenAIEmbeddings(model="bedrock-titan-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
embeddings = OpenAIEmbeddings(model="bedrock-titan-embeddings", openai_api_base="http://0.0.0.0:4000", openai_api_key="temp-key")
text = "This is a test document."
@ -407,11 +407,11 @@ services:
litellm:
image: ghcr.io/berriai/litellm:main
ports:
- "8000:8000" # Map the container port to the host, change the host port if necessary
- "4000:4000" # Map the container port to the host, change the host port if necessary
volumes:
- ./litellm-config.yaml:/app/config.yaml # Mount the local configuration file
# You can change the port or number of workers as per your requirements or pass any other supported CLI argument. Make sure the port passed here matches the container port defined above in the `ports` value
command: [ "--config", "/app/config.yaml", "--port", "8000", "--num_workers", "8" ]
command: [ "--config", "/app/config.yaml", "--port", "4000", "--num_workers", "8" ]
# ...rest of your docker-compose config if any
```
@ -429,7 +429,7 @@ Run the command `docker-compose up` or `docker compose up` as per your docker in
> Use `-d` flag to run the container in detached mode (background) e.g. `docker compose up -d`
Your LiteLLM container should now be running on the defined port, e.g. `8000`.
Your LiteLLM container should now be running on the defined port, e.g. `4000`.
## Using with OpenAI compatible projects
@ -442,7 +442,7 @@ Set `base_url` to the LiteLLM Proxy server
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:8000"
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
@ -463,7 +463,7 @@ print(response)
```shell
litellm --model gpt-3.5-turbo
#INFO: Proxy running on http://0.0.0.0:8000
#INFO: Proxy running on http://0.0.0.0:4000
```
#### 1. Clone the repo
@ -474,9 +474,9 @@ git clone https://github.com/danny-avila/LibreChat.git
#### 2. Modify Librechat's `docker-compose.yml`
LiteLLM Proxy is running on port `8000`, so set `8000` as the proxy below
LiteLLM Proxy is running on port `4000`, so set `4000` as the proxy below
```yaml
OPENAI_REVERSE_PROXY=http://host.docker.internal:8000/v1/chat/completions
OPENAI_REVERSE_PROXY=http://host.docker.internal:4000/v1/chat/completions
```
#### 3. Save fake OpenAI key in Librechat's `.env`
@ -502,7 +502,7 @@ In the [config.py](https://continue.dev/docs/reference/Models/openai) set this a
api_key="IGNORED",
model="fake-model-name",
context_length=2048, # customize if needed for your model
api_base="http://localhost:8000" # your proxy server url
api_base="http://localhost:4000" # your proxy server url
),
```
@ -514,7 +514,7 @@ Credits [@vividfog](https://github.com/jmorganca/ollama/issues/305#issuecomment-
```shell
$ pip install aider
$ aider --openai-api-base http://0.0.0.0:8000 --openai-api-key fake-key
$ aider --openai-api-base http://0.0.0.0:4000 --openai-api-key fake-key
```
</TabItem>
<TabItem value="autogen" label="AutoGen">
@ -528,7 +528,7 @@ from autogen import AssistantAgent, UserProxyAgent, oai
config_list=[
{
"model": "my-fake-model",
"api_base": "http://localhost:8000", #litellm compatible endpoint
"api_base": "http://localhost:4000", #litellm compatible endpoint
"api_type": "open_ai",
"api_key": "NULL", # just a placeholder
}
@ -566,7 +566,7 @@ import guidance
# set api_base to your proxy
# set api_key to anything
gpt4 = guidance.llms.OpenAI("gpt-4", api_base="http://0.0.0.0:8000", api_key="anything")
gpt4 = guidance.llms.OpenAI("gpt-4", api_base="http://0.0.0.0:4000", api_key="anything")
experts = guidance('''
{{#system~}}


@ -45,7 +45,7 @@ litellm_settings:
**Set dynamically**
```bash
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "zephyr-beta",
@ -101,7 +101,7 @@ LiteLLM Proxy supports setting a `timeout` per request
<TabItem value="Curl" label="Curl Request">
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data-raw '{
"model": "gpt-3.5-turbo",
@ -121,7 +121,7 @@ import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:8000"
base_url="http://0.0.0.0:4000"
)
response = client.chat.completions.create(


@ -30,7 +30,7 @@ $ litellm /path/to/config.yaml
```
```bash
curl --location 'http://0.0.0.0:8000/v1/chat/completions' \
curl --location 'http://0.0.0.0:4000/v1/chat/completions' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer sk-1234' \
--data '{


@ -65,7 +65,7 @@ litellm --config proxy_config.yaml
```
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Authorization: Bearer sk-1234' \
--data ' {
"model": "gpt-3.5-turbo",


@ -28,12 +28,12 @@ Follow [setup](./virtual_keys.md#setup)
```bash
litellm --config /path/to/config.yaml
#INFO: Proxy running on http://0.0.0.0:8000
#INFO: Proxy running on http://0.0.0.0:4000
```
### 2. Go to UI
```bash
http://0.0.0.0:8000/ui # <proxy_base_url>/ui
http://0.0.0.0:4000/ui # <proxy_base_url>/ui
```


@ -26,7 +26,7 @@ Set `extra_body={"metadata": { }}` to `metadata` you want to pass
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:8000"
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
@ -92,7 +92,7 @@ print(response)
Pass `metadata` as part of the request body
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data '{
"model": "gpt-3.5-turbo",
@ -123,7 +123,7 @@ from langchain.prompts.chat import (
from langchain.schema import HumanMessage, SystemMessage
chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:8000",
openai_api_base="http://0.0.0.0:4000",
model = "gpt-3.5-turbo",
temperature=0.1,
extra_body={
@ -195,7 +195,7 @@ from openai import OpenAI
# set base_url to your proxy server
# set api_key to send to proxy server
client = OpenAI(api_key="<proxy-api-key>", base_url="http://0.0.0.0:8000")
client = OpenAI(api_key="<proxy-api-key>", base_url="http://0.0.0.0:4000")
response = client.embeddings.create(
input=["hello from litellm"],
@ -209,7 +209,7 @@ print(response)
<TabItem value="Curl" label="Curl Request">
```shell
curl --location 'http://0.0.0.0:8000/embeddings' \
curl --location 'http://0.0.0.0:4000/embeddings' \
--header 'Content-Type: application/json' \
--data ' {
"model": "text-embedding-ada-002",
@ -223,7 +223,7 @@ curl --location 'http://0.0.0.0:8000/embeddings' \
```python
from langchain.embeddings import OpenAIEmbeddings
embeddings = OpenAIEmbeddings(model="sagemaker-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
embeddings = OpenAIEmbeddings(model="sagemaker-embeddings", openai_api_base="http://0.0.0.0:4000", openai_api_key="temp-key")
text = "This is a test document."
@ -233,7 +233,7 @@ query_result = embeddings.embed_query(text)
print(f"SAGEMAKER EMBEDDINGS")
print(query_result[:5])
embeddings = OpenAIEmbeddings(model="bedrock-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
embeddings = OpenAIEmbeddings(model="bedrock-embeddings", openai_api_base="http://0.0.0.0:4000", openai_api_key="temp-key")
text = "This is a test document."
@ -242,7 +242,7 @@ query_result = embeddings.embed_query(text)
print(f"BEDROCK EMBEDDINGS")
print(query_result[:5])
embeddings = OpenAIEmbeddings(model="bedrock-titan-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
embeddings = OpenAIEmbeddings(model="bedrock-titan-embeddings", openai_api_base="http://0.0.0.0:4000", openai_api_key="temp-key")
text = "This is a test document."
@ -296,7 +296,7 @@ from openai import OpenAI
# set base_url to your proxy server
# set api_key to send to proxy server
client = OpenAI(api_key="<proxy-api-key>", base_url="http://0.0.0.0:8000")
client = OpenAI(api_key="<proxy-api-key>", base_url="http://0.0.0.0:4000")
response = client.moderations.create(
input="hello from litellm",
@ -310,7 +310,7 @@ print(response)
<TabItem value="Curl" label="Curl Request">
```shell
curl --location 'http://0.0.0.0:8000/moderations' \
curl --location 'http://0.0.0.0:4000/moderations' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer sk-1234' \
--data '{"input": "Sample text goes here", "model": "text-moderation-stable"}'
@ -421,7 +421,7 @@ user_config = {
import openai
client = openai.OpenAI(
api_key="sk-1234",
base_url="http://0.0.0.0:8000"
base_url="http://0.0.0.0:4000"
)
# send request to `user-azure-instance`
@ -489,7 +489,7 @@ const { OpenAI } = require('openai');
const openai = new OpenAI({
apiKey: "sk-1234",
baseURL: "http://0.0.0.0:8000"
baseURL: "http://0.0.0.0:4000"
});
async function main() {
@ -516,7 +516,7 @@ Here's how to do it:
import openai
client = openai.OpenAI(
api_key="sk-1234",
base_url="http://0.0.0.0:8000"
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
@ -541,7 +541,7 @@ Pass in the litellm_params (E.g. api_key, api_base, etc.) via the `extra_body` p
import openai
client = openai.OpenAI(
api_key="sk-1234",
base_url="http://0.0.0.0:8000"
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
@ -571,7 +571,7 @@ const { OpenAI } = require('openai');
const openai = new OpenAI({
apiKey: "sk-1234",
baseURL: "http://0.0.0.0:8000"
baseURL: "http://0.0.0.0:4000"
});
async function main() {


@ -44,7 +44,7 @@ litellm /path/to/config.yaml
**Step 3. Send test call**
```bash
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{
@ -72,7 +72,7 @@ By default the `max_budget` is set to `null` and is not checked for keys
#### **Add budgets to users**
```shell
curl --location 'http://localhost:8000/user/new' \
curl --location 'http://localhost:4000/user/new' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{"models": ["azure-models"], "max_budget": 0, "user_id": "krrish3@berri.ai"}'
@ -96,7 +96,7 @@ curl --location 'http://localhost:8000/user/new' \
`budget_duration`: Budget is reset at the end of the specified duration. If not set, the budget is never reset. You can set the duration in seconds ("30s"), minutes ("30m"), hours ("30h"), or days ("30d").
```
curl 'http://0.0.0.0:8000/user/new' \
curl 'http://0.0.0.0:4000/user/new' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{
@ -113,7 +113,7 @@ Now you can just call `/key/generate` with that user_id (i.e. krrish3@berri.ai)
- **Spend Tracking**: spend for this key will update krrish3@berri.ai's spend as well
```bash
curl --location 'http://0.0.0.0:8000/key/generate' \
curl --location 'http://0.0.0.0:4000/key/generate' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data '{"models": ["azure-models"], "user_id": "krrish3@berri.ai"}'
@ -127,7 +127,7 @@ You can:
#### **Add budgets to users**
```shell
curl --location 'http://localhost:8000/team/new' \
curl --location 'http://localhost:4000/team/new' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{
@ -238,7 +238,7 @@ By default the `max_budget` is set to `null` and is not checked for keys
#### **Add budgets to keys**
```bash
curl 'http://0.0.0.0:8000/key/generate' \
curl 'http://0.0.0.0:4000/key/generate' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{
@ -250,7 +250,7 @@ curl 'http://0.0.0.0:8000/key/generate' \
Example Request to `/chat/completions` when key has crossed budget
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer <generated-key>' \
--data ' {
@ -278,7 +278,7 @@ Expected Response from `/chat/completions` when key has crossed budget
`budget_duration`: Budget is reset at the end of the specified duration. If not set, the budget is never reset. You can set the duration in seconds ("30s"), minutes ("30m"), hours ("30h"), or days ("30d").
```
curl 'http://0.0.0.0:8000/key/generate' \
curl 'http://0.0.0.0:4000/key/generate' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{
@ -310,7 +310,7 @@ By default the `model_max_budget` is set to `{}` and is not checked for keys
#### **Add model specific budgets to keys**
```bash
curl 'http://0.0.0.0:8000/key/generate' \
curl 'http://0.0.0.0:4000/key/generate' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{
@ -335,7 +335,7 @@ Use `/user/new`, to persist rate limits across multiple keys.
```shell
curl --location 'http://0.0.0.0:8000/user/new' \
curl --location 'http://0.0.0.0:4000/user/new' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{"user_id": "krrish@berri.ai", "max_parallel_requests": 10, "tpm_limit": 20, "rpm_limit": 4}'
@ -359,7 +359,7 @@ curl --location 'http://0.0.0.0:8000/user/new' \
Use `/key/generate`, if you want them for just that key.
```shell
curl --location 'http://0.0.0.0:8000/key/generate' \
curl --location 'http://0.0.0.0:4000/key/generate' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{"max_parallel_requests": 10, "tpm_limit": 20, "rpm_limit": 4}'
@ -401,7 +401,7 @@ model_list:
**Step 2. Create key with access group**
```bash
curl --location 'http://localhost:8000/user/new' \
curl --location 'http://localhost:4000/user/new' \
-H 'Authorization: Bearer <your-master-key>' \
-H 'Content-Type: application/json' \
-d '{"models": ["beta-models"], # 👈 Model Access Group
@ -414,7 +414,7 @@ curl --location 'http://localhost:8000/user/new' \
Just include user_id in the `/key/generate` request.
```bash
curl --location 'http://0.0.0.0:8000/key/generate' \
curl --location 'http://0.0.0.0:4000/key/generate' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data '{"models": ["azure-models"], "user_id": "krrish@berri.ai"}'


@ -59,7 +59,7 @@ litellm --config /path/to/config.yaml
**Step 3: Generate temporary keys**
```shell
curl 'http://0.0.0.0:8000/key/generate' \
curl 'http://0.0.0.0:4000/key/generate' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{"models": ["gpt-3.5-turbo", "gpt-4", "claude-2"], "duration": "20m","metadata": {"user": "ishaan@berri.ai"}}'
@ -70,7 +70,7 @@ curl 'http://0.0.0.0:8000/key/generate' \
### Request
```shell
curl 'http://0.0.0.0:8000/key/generate' \
curl 'http://0.0.0.0:4000/key/generate' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{
@ -105,7 +105,7 @@ Request Params:
```python
{
"key": "sk-kdEXbIqZRwEeEiHwdg7sFA", # Bearer token
"expires": "2023-11-19T01:38:25.838000+00:00" # datetime object
"expires": "2023-11-19T01:38:25.834000+00:00" # datetime object
"key_name": "sk-...7sFA" # abbreviated key string, ONLY stored in db if `allow_user_auth: true` set - [see](./ui.md)
...
}
@ -147,7 +147,7 @@ model_list:
**Step 2: Generate a user key - granting them access to specific models, custom model aliases, etc.**
```bash
curl -X POST "https://0.0.0.0:8000/key/generate" \
curl -X POST "https://0.0.0.0:4000/key/generate" \
-H "Authorization: Bearer <your-master-key>" \
-H "Content-Type: application/json" \
-d '{
@ -182,7 +182,7 @@ model_list:
**Step 2. Create key with access group**
```bash
curl --location 'http://localhost:8000/key/generate' \
curl --location 'http://localhost:4000/key/generate' \
-H 'Authorization: Bearer <your-master-key>' \
-H 'Content-Type: application/json' \
-d '{"models": ["beta-models"], # 👈 Model Access Group
@ -194,7 +194,7 @@ curl --location 'http://localhost:8000/key/generate' \
### Request
```shell
curl -X GET "http://0.0.0.0:8000/key/info?key=sk-02Wr4IAlN3NvPXvL5JVvDA" \
curl -X GET "http://0.0.0.0:4000/key/info?key=sk-02Wr4IAlN3NvPXvL5JVvDA" \
-H "Authorization: Bearer sk-1234"
```
@ -228,7 +228,7 @@ Request Params:
### Request
```shell
curl 'http://0.0.0.0:8000/key/update' \
curl 'http://0.0.0.0:4000/key/update' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{
@ -266,7 +266,7 @@ Request Params:
### Request
```shell
curl 'http://0.0.0.0:8000/key/delete' \
curl 'http://0.0.0.0:4000/key/delete' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{
@ -500,7 +500,7 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
Set the `max_budget` param (in USD $) in the `key/generate` request. By default the `max_budget` is set to `null` and is not checked for keys
```shell
curl 'http://0.0.0.0:8000/key/generate' \
curl 'http://0.0.0.0:4000/key/generate' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{
@ -517,7 +517,7 @@ curl 'http://0.0.0.0:8000/key/generate' \
Example Request to `/chat/completions` when key has crossed budget
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer sk-ULl_IKCVFy2EZRzQB16RUA' \
--data ' {
@ -545,10 +545,10 @@ Expected Response from `/chat/completions` when key has crossed budget
LiteLLM exposes a `/user/new` endpoint to create budgets for users that persist across multiple keys.
This is documented in the swagger (live on your server root endpoint - e.g. `http://0.0.0.0:8000/`). Here's an example request.
This is documented in the swagger (live on your server root endpoint - e.g. `http://0.0.0.0:4000/`). Here's an example request.
```shell
curl --location 'http://localhost:8000/user/new' \
curl --location 'http://localhost:4000/user/new' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{"models": ["azure-models"], "max_budget": 0, "user_id": "krrish3@berri.ai"}'
@ -571,7 +571,7 @@ The request is a normal `/key/generate` request body + a `max_budget` field.
You can get spend for a key by using the `/key/info` endpoint.
```bash
curl 'http://0.0.0.0:8000/key/info?key=<user-key>' \
curl 'http://0.0.0.0:4000/key/info?key=<user-key>' \
-X GET \
-H 'Authorization: Bearer <your-master-key>'
```
@ -771,7 +771,7 @@ general_settings:
#### Step 3. Generate Key
```bash
curl --location 'http://0.0.0.0:8000/key/generate' \
curl --location 'http://0.0.0.0:4000/key/generate' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{"models": ["azure-models"], "aliases": {"mistral-7b": "gpt-3.5-turbo"}, "duration": null}'


@ -22,7 +22,7 @@ $ pip install 'litellm[proxy]'
```shell
$ litellm --model huggingface/bigcode/starcoder
#INFO: Proxy running on http://0.0.0.0:8000
#INFO: Proxy running on http://0.0.0.0:4000
```
### Test
@ -39,7 +39,7 @@ This will now automatically route any requests for gpt-3.5-turbo to bigcode star
<TabItem value="Curl" label="Curl Request">
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "gpt-3.5-turbo",
@ -59,7 +59,7 @@ curl --location 'http://0.0.0.0:8000/chat/completions' \
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:8000"
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
@ -246,7 +246,7 @@ Set `base_url` to the LiteLLM Proxy server
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:8000"
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
@ -267,7 +267,7 @@ print(response)
```shell
litellm --model gpt-3.5-turbo
#INFO: Proxy running on http://0.0.0.0:8000
#INFO: Proxy running on http://0.0.0.0:4000
```
#### 1. Clone the repo
@ -278,9 +278,9 @@ git clone https://github.com/danny-avila/LibreChat.git
#### 2. Modify Librechat's `docker-compose.yml`
LiteLLM Proxy is running on port `8000`, so set `8000` as the proxy below
LiteLLM Proxy is running on port `4000`, so set `4000` as the proxy below
```yaml
OPENAI_REVERSE_PROXY=http://host.docker.internal:8000/v1/chat/completions
OPENAI_REVERSE_PROXY=http://host.docker.internal:4000/v1/chat/completions
```
#### 3. Save fake OpenAI key in Librechat's `.env`
@ -306,7 +306,7 @@ In the [config.py](https://continue.dev/docs/reference/Models/openai) set this a
api_key="IGNORED",
model="fake-model-name",
context_length=2048, # customize if needed for your model
api_base="http://localhost:8000" # your proxy server url
api_base="http://localhost:4000" # your proxy server url
),
```
@ -318,7 +318,7 @@ Credits [@vividfog](https://github.com/jmorganca/ollama/issues/305#issuecomment-
```shell
$ pip install aider
$ aider --openai-api-base http://0.0.0.0:8000 --openai-api-key fake-key
$ aider --openai-api-base http://0.0.0.0:4000 --openai-api-key fake-key
```
</TabItem>
<TabItem value="autogen" label="AutoGen">
@ -332,7 +332,7 @@ from autogen import AssistantAgent, UserProxyAgent, oai
config_list=[
{
"model": "my-fake-model",
"api_base": "http://localhost:8000", #litellm compatible endpoint
"api_base": "http://localhost:4000", #litellm compatible endpoint
"api_type": "open_ai",
"api_key": "NULL", # just a placeholder
}
@ -370,7 +370,7 @@ import guidance
# set api_base to your proxy
# set api_key to anything
gpt4 = guidance.llms.OpenAI("gpt-4", api_base="http://0.0.0.0:8000", api_key="anything")
gpt4 = guidance.llms.OpenAI("gpt-4", api_base="http://0.0.0.0:4000", api_key="anything")
experts = guidance('''
{{#system~}}
@ -479,7 +479,7 @@ $ litellm --config /path/to/config.yaml
#### Step 3: Use proxy
Curl Command
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "zephyr-alpha",
@ -529,7 +529,7 @@ $ litellm --config /path/to/config.yaml
#### Step 3: Use proxy
Curl Command
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "gpt-3.5-turbo",
@ -586,7 +586,7 @@ litellm_settings:
**Set dynamically**
```bash
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "zephyr-beta",
@ -615,7 +615,7 @@ model_list:
- model_name: custom_embedding_model
litellm_params:
model: openai/custom_embedding # the `openai/` prefix tells litellm it's openai compatible
api_base: http://0.0.0.0:8000/
api_base: http://0.0.0.0:4000/
- model_name: custom_embedding_model
litellm_params:
model: openai/custom_embedding # the `openai/` prefix tells litellm it's openai compatible
@ -665,7 +665,7 @@ litellm --config /path/to/config.yaml
**Step 3: Generate temporary keys**
```shell
curl 'http://0.0.0.0:8000/key/generate' \
curl 'http://0.0.0.0:4000/key/generate' \
--header 'Authorization: Bearer sk-1234' \
--data '{"models": ["gpt-3.5-turbo", "gpt-4", "claude-2"], "duration": "20m"}'
```
@ -719,7 +719,7 @@ model_list:
**Step 2: Generate a user key - granting them access to specific models, custom model aliases, etc.**
```bash
curl -X POST "https://0.0.0.0:8000/key/generate" \
curl -X POST "https://0.0.0.0:4000/key/generate" \
-H "Authorization: Bearer sk-1234" \
-H "Content-Type: application/json" \
-d '{
@ -737,7 +737,7 @@ curl -X POST "https://0.0.0.0:8000/key/generate" \
You can get spend for a key by using the `/key/info` endpoint.
```bash
curl 'http://0.0.0.0:8000/key/info?key=<user-key>' \
curl 'http://0.0.0.0:4000/key/info?key=<user-key>' \
-X GET \
-H 'Authorization: Bearer <your-master-key>'
```
@ -868,7 +868,7 @@ $ litellm --config /path/to/config.yaml
#### Using Caching
Send the same request twice:
```shell
curl http://0.0.0.0:8000/v1/chat/completions \
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "gpt-3.5-turbo",
@ -876,7 +876,7 @@ curl http://0.0.0.0:8000/v1/chat/completions \
"temperature": 0.7
}'
curl http://0.0.0.0:8000/v1/chat/completions \
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "gpt-3.5-turbo",
@ -889,7 +889,7 @@ curl http://0.0.0.0:8000/v1/chat/completions \
Caching can be switched on/off per `/chat/completions` request
- Caching **on** for completion - pass `caching=True`:
```shell
curl http://0.0.0.0:8000/v1/chat/completions \
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "gpt-3.5-turbo",
@ -900,7 +900,7 @@ Caching can be switched on/off per `/chat/completions` request
```
- Caching **off** for completion - pass `caching=False`:
```shell
curl http://0.0.0.0:8000/v1/chat/completions \
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "gpt-3.5-turbo",
@ -963,10 +963,10 @@ https://api.openai.com/v1/chat/completions \
Use this to health check all LLMs defined in your config.yaml
#### Request
```shell
curl --location 'http://0.0.0.0:8000/health'
curl --location 'http://0.0.0.0:4000/health'
```
You can also run `litellm --health`; it makes a `GET` request to `http://0.0.0.0:8000/health` for you
You can also run `litellm --health`; it makes a `GET` request to `http://0.0.0.0:4000/health` for you
```
litellm --health
```
@ -1087,7 +1087,7 @@ litellm -config config.yaml
#### Run a test request to Proxy
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Authorization: Bearer sk-1244' \
--data ' {
"model": "gpt-3.5-turbo",
@ -1213,7 +1213,7 @@ LiteLLM proxy adds **0.00325 seconds** latency as compared to using the Raw Open
```
#### --port
- **Default:** `8000`
- **Default:** `4000`
- The port to bind the server to.
- **Usage:**
```shell


@ -61,7 +61,7 @@ def is_port_in_use(port):
@click.option(
"--host", default="0.0.0.0", help="Host for the server to listen on.", envvar="HOST"
)
@click.option("--port", default=8000, help="Port to bind the server to.", envvar="PORT")
@click.option("--port", default=4000, help="Port to bind the server to.", envvar="PORT")
@click.option(
"--num_workers",
default=default_num_workers,
@ -273,7 +273,7 @@ def run_server(
],
}
response = requests.post("http://0.0.0.0:8000/queue/request", json=data)
response = requests.post("http://0.0.0.0:4000/queue/request", json=data)
response = response.json()
@ -507,7 +507,7 @@ def run_server(
print(
f"Unable to connect to DB. DATABASE_URL found in environment, but prisma package not found."
)
if port == 8000 and is_port_in_use(port):
if port == 4000 and is_port_in_use(port):
port = random.randint(1024, 49152)
from litellm.proxy.proxy_server import app


@ -246,7 +246,7 @@ class Router:
"122999-2828282-277:
{
"model": "gpt-3",
"api_base": "http://localhost:8000",
"api_base": "http://localhost:4000",
"num_requests": 20,
"avg_latency": 0.001,
"num_failures": 0,