From ea6f42216c313d9b91794eb27a6b0699a1683164 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Fri, 8 Mar 2024 21:59:00 -0800 Subject: [PATCH] (docs) use port 4000 --- README.md | 6 +-- deploy/charts/litellm/README.md | 4 +- deploy/charts/litellm/values.yaml | 2 +- .../docs/embedding/supported_embedding.md | 10 ++-- docs/my-website/docs/index.md | 4 +- docs/my-website/docs/load_test.md | 2 +- docs/my-website/docs/providers/anthropic.md | 8 +-- docs/my-website/docs/providers/bedrock.md | 8 +-- docs/my-website/docs/providers/ollama.md | 2 +- .../docs/providers/openai_compatible.md | 4 +- docs/my-website/docs/proxy/caching.md | 14 ++--- docs/my-website/docs/proxy/call_hooks.md | 4 +- docs/my-website/docs/proxy/cli.md | 2 +- docs/my-website/docs/proxy/configs.md | 16 +++--- docs/my-website/docs/proxy/deploy.md | 10 ++-- docs/my-website/docs/proxy/embedding.md | 2 +- docs/my-website/docs/proxy/enterprise.md | 18 +++---- docs/my-website/docs/proxy/health.md | 10 ++-- docs/my-website/docs/proxy/load_balancing.md | 4 +- docs/my-website/docs/proxy/logging.md | 24 ++++----- .../my-website/docs/proxy/model_management.md | 4 +- docs/my-website/docs/proxy/pii_masking.md | 6 +-- docs/my-website/docs/proxy/quick_start.md | 36 ++++++------- docs/my-website/docs/proxy/reliability.md | 6 +-- docs/my-website/docs/proxy/rules.md | 2 +- .../docs/proxy/streaming_logging.md | 2 +- docs/my-website/docs/proxy/ui.md | 4 +- docs/my-website/docs/proxy/user_keys.md | 30 +++++------ docs/my-website/docs/proxy/users.md | 26 +++++----- docs/my-website/docs/proxy/virtual_keys.md | 28 +++++----- docs/my-website/docs/simple_proxy_old_doc.md | 52 +++++++++---------- litellm/proxy/proxy_cli.py | 6 +-- litellm/router.py | 2 +- 33 files changed, 179 insertions(+), 179 deletions(-) diff --git a/README.md b/README.md index bc8c1bae2..d32372b6c 100644 --- a/README.md +++ b/README.md @@ -143,13 +143,13 @@ pip install 'litellm[proxy]' ```shell $ litellm --model huggingface/bigcode/starcoder -#INFO: Proxy running on http://0.0.0.0:8000 +#INFO: Proxy running on http://0.0.0.0:4000 ``` ### Step 2: Make ChatCompletions Request to Proxy ```python import openai # openai v1.0.0+ -client = openai.OpenAI(api_key="anything",base_url="http://0.0.0.0:8000") # set proxy to base_url +client = openai.OpenAI(api_key="anything",base_url="http://0.0.0.0:4000") # set proxy to base_url # request sent to model set on litellm proxy, `litellm --model` response = client.chat.completions.create(model="gpt-3.5-turbo", messages = [ { @@ -170,7 +170,7 @@ Set budgets and rate limits across multiple projects ### Request ```shell -curl 'http://0.0.0.0:8000/key/generate' \ +curl 'http://0.0.0.0:4000/key/generate' \ --header 'Authorization: Bearer sk-1234' \ --header 'Content-Type: application/json' \ --data-raw '{"models": ["gpt-3.5-turbo", "gpt-4", "claude-2"], "duration": "20m","metadata": {"user": "ishaan@berri.ai", "team": "core-infra"}}' diff --git a/deploy/charts/litellm/README.md b/deploy/charts/litellm/README.md index daba8aa68..817781ed0 100644 --- a/deploy/charts/litellm/README.md +++ b/deploy/charts/litellm/README.md @@ -28,7 +28,7 @@ If `db.useStackgresOperator` is used (not yet implemented): | `imagePullSecrets` | Registry credentials for the LiteLLM and initContainer images. | `[]` | | `serviceAccount.create` | Whether or not to create a Kubernetes Service Account for this deployment. The default is `false` because LiteLLM has no need to access the Kubernetes API. | `false` | | `service.type` | Kubernetes Service type (e.g. 
`LoadBalancer`, `ClusterIP`, etc.) | `ClusterIP` | -| `service.port` | TCP port that the Kubernetes Service will listen on. Also the TCP port within the Pod that the proxy will listen on. | `8000` | +| `service.port` | TCP port that the Kubernetes Service will listen on. Also the TCP port within the Pod that the proxy will listen on. | `4000` | | `ingress.*` | See [values.yaml](./values.yaml) for example settings | N/A | | `proxy_config.*` | See [values.yaml](./values.yaml) for default settings. See [example_config_yaml](../../../litellm/proxy/example_config_yaml/) for configuration examples. | N/A | @@ -76,7 +76,7 @@ When browsing to the URL published per the settings in `ingress.*`, you will be prompted for **Admin Configuration**. The **Proxy Endpoint** is the internal (from the `litellm` pod's perspective) URL published by the `-litellm` Kubernetes Service. If the deployment uses the default settings for this -service, the **Proxy Endpoint** should be set to `http://-litellm:8000`. +service, the **Proxy Endpoint** should be set to `http://-litellm:4000`. The **Proxy Key** is the value specified for `masterkey` or, if a `masterkey` was not provided to the helm command line, the `masterkey` is a randomly diff --git a/deploy/charts/litellm/values.yaml b/deploy/charts/litellm/values.yaml index 1b83fe801..642a50b70 100644 --- a/deploy/charts/litellm/values.yaml +++ b/deploy/charts/litellm/values.yaml @@ -55,7 +55,7 @@ environmentSecrets: [] service: type: ClusterIP - port: 8000 + port: 4000 ingress: enabled: false diff --git a/docs/my-website/docs/embedding/supported_embedding.md b/docs/my-website/docs/embedding/supported_embedding.md index 62a10b44d..7e2374d16 100644 --- a/docs/my-website/docs/embedding/supported_embedding.md +++ b/docs/my-website/docs/embedding/supported_embedding.md @@ -35,7 +35,7 @@ general_settings: ```bash litellm --config /path/to/config.yaml -# RUNNING on http://0.0.0.0:8000 +# RUNNING on http://0.0.0.0:4000 ``` ### Test @@ -44,7 +44,7 @@ litellm --config /path/to/config.yaml ```bash -curl --location 'http://0.0.0.0:8000/embeddings' \ +curl --location 'http://0.0.0.0:4000/embeddings' \ --header 'Authorization: Bearer sk-1234' \ --header 'Content-Type: application/json' \ --data '{"input": ["Academia.edu uses"], "model": "textembedding-gecko", "encoding_format": "base64"}' @@ -57,7 +57,7 @@ curl --location 'http://0.0.0.0:8000/embeddings' \ from openai import OpenAI client = OpenAI( api_key="sk-1234", - base_url="http://0.0.0.0:8000" + base_url="http://0.0.0.0:4000" ) client.embeddings.create( @@ -72,7 +72,7 @@ client.embeddings.create( ```python from langchain_openai import OpenAIEmbeddings -embeddings = OpenAIEmbeddings(model="textembedding-gecko", openai_api_base="http://0.0.0.0:8000", openai_api_key="sk-1234") +embeddings = OpenAIEmbeddings(model="textembedding-gecko", openai_api_base="http://0.0.0.0:4000", openai_api_key="sk-1234") text = "This is a test document." 
@@ -200,7 +200,7 @@ Use this for calling `/embedding` endpoints on OpenAI Compatible Servers, exampl from litellm import embedding response = embedding( model = "openai/", # add `openai/` prefix to model so litellm knows to route to OpenAI - api_base="http://0.0.0.0:8000/" # set API Base of your Custom OpenAI Endpoint + api_base="http://0.0.0.0:4000/" # set API Base of your Custom OpenAI Endpoint input=["good morning from litellm"] ) ``` diff --git a/docs/my-website/docs/index.md b/docs/my-website/docs/index.md index d7ed14019..66ec6573c 100644 --- a/docs/my-website/docs/index.md +++ b/docs/my-website/docs/index.md @@ -368,13 +368,13 @@ pip install 'litellm[proxy]' ```shell $ litellm --model huggingface/bigcode/starcoder -#INFO: Proxy running on http://0.0.0.0:8000 +#INFO: Proxy running on http://0.0.0.0:4000 ``` #### Step 2: Make ChatCompletions Request to Proxy ```python import openai # openai v1.0.0+ -client = openai.OpenAI(api_key="anything",base_url="http://0.0.0.0:8000") # set proxy to base_url +client = openai.OpenAI(api_key="anything",base_url="http://0.0.0.0:4000") # set proxy to base_url # request sent to model set on litellm proxy, `litellm --model` response = client.chat.completions.create(model="gpt-3.5-turbo", messages = [ { diff --git a/docs/my-website/docs/load_test.md b/docs/my-website/docs/load_test.md index 94165fb7b..f85ff9122 100644 --- a/docs/my-website/docs/load_test.md +++ b/docs/my-website/docs/load_test.md @@ -90,7 +90,7 @@ import time, asyncio, litellm #### LITELLM PROXY #### litellm_client = AsyncOpenAI( api_key="sk-1234", # [CHANGE THIS] - base_url="http://0.0.0.0:8000" + base_url="http://0.0.0.0:4000" ) #### AZURE OPENAI CLIENT #### diff --git a/docs/my-website/docs/providers/anthropic.md b/docs/my-website/docs/providers/anthropic.md index 6aa4b1979..1a7a5fa41 100644 --- a/docs/my-website/docs/providers/anthropic.md +++ b/docs/my-website/docs/providers/anthropic.md @@ -63,7 +63,7 @@ export ANTHROPIC_API_KEY="your-api-key" ```bash $ litellm --model claude-3-opus-20240229 -# Server running on http://0.0.0.0:8000 +# Server running on http://0.0.0.0:4000 ``` ### 3. Test it @@ -73,7 +73,7 @@ $ litellm --model claude-3-opus-20240229 ```shell -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Content-Type: application/json' \ --data ' { "model": "gpt-3.5-turbo", @@ -93,7 +93,7 @@ curl --location 'http://0.0.0.0:8000/chat/completions' \ import openai client = openai.OpenAI( api_key="anything", - base_url="http://0.0.0.0:8000" + base_url="http://0.0.0.0:4000" ) # request sent to model set on litellm proxy, `litellm --model` @@ -120,7 +120,7 @@ from langchain.prompts.chat import ( from langchain.schema import HumanMessage, SystemMessage chat = ChatOpenAI( - openai_api_base="http://0.0.0.0:8000", # set openai_api_base to the LiteLLM Proxy + openai_api_base="http://0.0.0.0:4000", # set openai_api_base to the LiteLLM Proxy model = "gpt-3.5-turbo", temperature=0.1 ) diff --git a/docs/my-website/docs/providers/bedrock.md b/docs/my-website/docs/providers/bedrock.md index c5b12d4c4..8c6926885 100644 --- a/docs/my-website/docs/providers/bedrock.md +++ b/docs/my-website/docs/providers/bedrock.md @@ -54,7 +54,7 @@ export AWS_REGION_NAME="" ```bash $ litellm --model anthropic.claude-3-sonnet-20240229-v1:0 -# Server running on http://0.0.0.0:8000 +# Server running on http://0.0.0.0:4000 ``` ### 3. 
Test it @@ -64,7 +64,7 @@ $ litellm --model anthropic.claude-3-sonnet-20240229-v1:0 ```shell -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Content-Type: application/json' \ --data ' { "model": "gpt-3.5-turbo", @@ -84,7 +84,7 @@ curl --location 'http://0.0.0.0:8000/chat/completions' \ import openai client = openai.OpenAI( api_key="anything", - base_url="http://0.0.0.0:8000" + base_url="http://0.0.0.0:4000" ) # request sent to model set on litellm proxy, `litellm --model` @@ -111,7 +111,7 @@ from langchain.prompts.chat import ( from langchain.schema import HumanMessage, SystemMessage chat = ChatOpenAI( - openai_api_base="http://0.0.0.0:8000", # set openai_api_base to the LiteLLM Proxy + openai_api_base="http://0.0.0.0:4000", # set openai_api_base to the LiteLLM Proxy model = "gpt-3.5-turbo", temperature=0.1 ) diff --git a/docs/my-website/docs/providers/ollama.md b/docs/my-website/docs/providers/ollama.md index 78c91bb63..ec2a231e1 100644 --- a/docs/my-website/docs/providers/ollama.md +++ b/docs/my-website/docs/providers/ollama.md @@ -183,7 +183,7 @@ On the docker container run the `test.py` file using `python3 test.py` ```python import openai -api_base = f"http://0.0.0.0:8000" # base url for server +api_base = f"http://0.0.0.0:4000" # base url for server openai.api_base = api_base openai.api_key = "temp-key" diff --git a/docs/my-website/docs/providers/openai_compatible.md b/docs/my-website/docs/providers/openai_compatible.md index beaf38cfa..f86544c28 100644 --- a/docs/my-website/docs/providers/openai_compatible.md +++ b/docs/my-website/docs/providers/openai_compatible.md @@ -15,7 +15,7 @@ import os response = litellm.completion( model="openai/mistral, # add `openai/` prefix to model so litellm knows to route to OpenAI api_key="sk-1234", # api key to your openai compatible endpoint - api_base="http://0.0.0.0:8000", # set API Base of your Custom OpenAI Endpoint + api_base="http://0.0.0.0:4000", # set API Base of your Custom OpenAI Endpoint messages=[ { "role": "user", @@ -35,7 +35,7 @@ import os response = litellm.embedding( model="openai/GPT-J", # add `openai/` prefix to model so litellm knows to route to OpenAI api_key="sk-1234", # api key to your openai compatible endpoint - api_base="http://0.0.0.0:8000", # set API Base of your Custom OpenAI Endpoint + api_base="http://0.0.0.0:4000", # set API Base of your Custom OpenAI Endpoint input=["good morning from litellm"] ) print(response) diff --git a/docs/my-website/docs/proxy/caching.md b/docs/my-website/docs/proxy/caching.md index ee4874caf..4f1ce18f3 100644 --- a/docs/my-website/docs/proxy/caching.md +++ b/docs/my-website/docs/proxy/caching.md @@ -145,7 +145,7 @@ $ litellm --config /path/to/config.yaml Send the same request twice: ```shell -curl http://0.0.0.0:8000/v1/chat/completions \ +curl http://0.0.0.0:4000/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ "model": "gpt-3.5-turbo", @@ -153,7 +153,7 @@ curl http://0.0.0.0:8000/v1/chat/completions \ "temperature": 0.7 }' -curl http://0.0.0.0:8000/v1/chat/completions \ +curl http://0.0.0.0:4000/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ "model": "gpt-3.5-turbo", @@ -166,14 +166,14 @@ curl http://0.0.0.0:8000/v1/chat/completions \ Send the same request twice: ```shell -curl --location 'http://0.0.0.0:8000/embeddings' \ +curl --location 'http://0.0.0.0:4000/embeddings' \ --header 'Content-Type: application/json' \ --data ' { "model": "text-embedding-ada-002", "input": 
["write a litellm poem"] }' -curl --location 'http://0.0.0.0:8000/embeddings' \ +curl --location 'http://0.0.0.0:4000/embeddings' \ --header 'Content-Type: application/json' \ --data ' { "model": "text-embedding-ada-002", @@ -227,7 +227,7 @@ from openai import OpenAI client = OpenAI( # This is the default and can be omitted api_key=os.environ.get("OPENAI_API_KEY"), - base_url="http://0.0.0.0:8000" + base_url="http://0.0.0.0:4000" ) chat_completion = client.chat.completions.create( @@ -255,7 +255,7 @@ from openai import OpenAI client = OpenAI( # This is the default and can be omitted api_key=os.environ.get("OPENAI_API_KEY"), - base_url="http://0.0.0.0:8000" + base_url="http://0.0.0.0:4000" ) chat_completion = client.chat.completions.create( @@ -281,7 +281,7 @@ from openai import OpenAI client = OpenAI( # This is the default and can be omitted api_key=os.environ.get("OPENAI_API_KEY"), - base_url="http://0.0.0.0:8000" + base_url="http://0.0.0.0:4000" ) chat_completion = client.chat.completions.create( diff --git a/docs/my-website/docs/proxy/call_hooks.md b/docs/my-website/docs/proxy/call_hooks.md index b00f4e301..9d4d1112e 100644 --- a/docs/my-website/docs/proxy/call_hooks.md +++ b/docs/my-website/docs/proxy/call_hooks.md @@ -63,7 +63,7 @@ litellm_settings: $ litellm /path/to/config.yaml ``` ```shell -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --data ' { "model": "gpt-3.5-turbo", "messages": [ @@ -162,7 +162,7 @@ litellm_settings: $ litellm /path/to/config.yaml ``` ```shell -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --data ' { "model": "gpt-3.5-turbo", "messages": [ diff --git a/docs/my-website/docs/proxy/cli.md b/docs/my-website/docs/proxy/cli.md index d366f1f6b..28b210b16 100644 --- a/docs/my-website/docs/proxy/cli.md +++ b/docs/my-website/docs/proxy/cli.md @@ -15,7 +15,7 @@ Cli arguments, --host, --port, --num_workers ``` ## --port - - **Default:** `8000` + - **Default:** `4000` - The port to bind the server to. - **Usage:** ```shell diff --git a/docs/my-website/docs/proxy/configs.md b/docs/my-website/docs/proxy/configs.md index a863ec2ca..68b49502d 100644 --- a/docs/my-website/docs/proxy/configs.md +++ b/docs/my-website/docs/proxy/configs.md @@ -13,7 +13,7 @@ Set model list, `api_base`, `api_key`, `temperature` & proxy server settings (`m | `general_settings` | Server settings, example setting `master_key: sk-my_special_key` | | `environment_variables` | Environment Variables example, `REDIS_HOST`, `REDIS_PORT` | -**Complete List:** Check the Swagger UI docs on `/#/config.yaml` (e.g. http://0.0.0.0:8000/#/config.yaml), for everything you can pass in the config.yaml. +**Complete List:** Check the Swagger UI docs on `/#/config.yaml` (e.g. http://0.0.0.0:4000/#/config.yaml), for everything you can pass in the config.yaml. ## Quick Start @@ -55,7 +55,7 @@ model_list: - model_name: vllm-models litellm_params: model: openai/facebook/opt-125m # the `openai/` prefix tells litellm it's openai compatible - api_base: http://0.0.0.0:8000 + api_base: http://0.0.0.0:4000 rpm: 1440 model_info: version: 2 @@ -91,7 +91,7 @@ Sends request to model where `model_name=gpt-3.5-turbo` on config.yaml. 
If multiple with `model_name=gpt-3.5-turbo` does [Load Balancing](https://docs.litellm.ai/docs/proxy/load_balancing) ```shell -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Content-Type: application/json' \ --data ' { "model": "gpt-3.5-turbo", @@ -111,7 +111,7 @@ curl --location 'http://0.0.0.0:8000/chat/completions' \ Sends this request to model where `model_name=bedrock-claude-v1` on config.yaml ```shell -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Content-Type: application/json' \ --data ' { "model": "bedrock-claude-v1", @@ -131,7 +131,7 @@ curl --location 'http://0.0.0.0:8000/chat/completions' \ import openai client = openai.OpenAI( api_key="anything", - base_url="http://0.0.0.0:8000" + base_url="http://0.0.0.0:4000" ) # Sends request to model where `model_name=gpt-3.5-turbo` on config.yaml. @@ -179,7 +179,7 @@ messages = [ # Sends request to model where `model_name=gpt-3.5-turbo` on config.yaml. chat = ChatOpenAI( - openai_api_base="http://0.0.0.0:8000", # set openai base to the proxy + openai_api_base="http://0.0.0.0:4000", # set openai base to the proxy model = "gpt-3.5-turbo", temperature=0.1 ) @@ -189,7 +189,7 @@ print(response) # Sends request to model where `model_name=bedrock-claude-v1` on config.yaml. claude_chat = ChatOpenAI( - openai_api_base="http://0.0.0.0:8000", # set openai base to the proxy + openai_api_base="http://0.0.0.0:4000", # set openai base to the proxy model = "bedrock-claude-v1", temperature=0.1 ) @@ -560,7 +560,7 @@ litellm --config config.yaml Sends Request to `bedrock-cohere` ```shell -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Content-Type: application/json' \ --data ' { "model": "bedrock-cohere", diff --git a/docs/my-website/docs/proxy/deploy.md b/docs/my-website/docs/proxy/deploy.md index 496bde05f..1a0d3abad 100644 --- a/docs/my-website/docs/proxy/deploy.md +++ b/docs/my-website/docs/proxy/deploy.md @@ -241,10 +241,10 @@ helm install \ kubectl \ port-forward \ service/mydeploy-litellm \ - 8000:8000 + 4000:4000 ``` -Your OpenAI proxy server is now running on `http://127.0.0.1:8000`. +Your OpenAI proxy server is now running on `http://127.0.0.1:4000`. @@ -393,11 +393,11 @@ services: target: runtime image: ghcr.io/berriai/litellm:main-latest ports: - - "8000:8000" # Map the container port to the host, change the host port if necessary + - "4000:4000" # Map the container port to the host, change the host port if necessary volumes: - ./litellm-config.yaml:/app/config.yaml # Mount the local configuration file # You can change the port or number of workers as per your requirements or pass any new supported CLI augument. Make sure the port passed here matches with the container port defined above in `ports` value - command: [ "--config", "/app/config.yaml", "--port", "8000", "--num_workers", "8" ] + command: [ "--config", "/app/config.yaml", "--port", "4000", "--num_workers", "8" ] # ...rest of your docker-compose config if any ``` @@ -415,4 +415,4 @@ Run the command `docker-compose up` or `docker compose up` as per your docker in > Use `-d` flag to run the container in detached mode (background) e.g. `docker compose up -d` -Your LiteLLM container should be running now on the defined port e.g. `8000`. +Your LiteLLM container should be running now on the defined port e.g. `4000`. 
diff --git a/docs/my-website/docs/proxy/embedding.md b/docs/my-website/docs/proxy/embedding.md index 0f3a01a90..2adaaa247 100644 --- a/docs/my-website/docs/proxy/embedding.md +++ b/docs/my-website/docs/proxy/embedding.md @@ -38,7 +38,7 @@ $ litellm --config /path/to/config.yaml 3. Test the embedding call ```shell -curl --location 'http://0.0.0.0:8000/v1/embeddings' \ +curl --location 'http://0.0.0.0:4000/v1/embeddings' \ --header 'Authorization: Bearer sk-1234' \ --header 'Content-Type: application/json' \ --data '{ diff --git a/docs/my-website/docs/proxy/enterprise.md b/docs/my-website/docs/proxy/enterprise.md index e0c5374f0..93786eff4 100644 --- a/docs/my-website/docs/proxy/enterprise.md +++ b/docs/my-website/docs/proxy/enterprise.md @@ -58,7 +58,7 @@ callbacks: ["llamaguard_moderations"] Set the LLM Guard API Base in your environment ```env -LLM_GUARD_API_BASE = "http://0.0.0.0:8000" +LLM_GUARD_API_BASE = "http://0.0.0.0:4000" ``` Add `llmguard_moderations` as a callback @@ -143,7 +143,7 @@ When `no-log=True`, the request will **not be logged on any callbacks** and ther import openai client = openai.OpenAI( api_key="anything", # proxy api-key - base_url="http://0.0.0.0:8000" # litellm proxy + base_url="http://0.0.0.0:4000" # litellm proxy ) response = client.chat.completions.create( @@ -175,7 +175,7 @@ litellm_settings: ### How to test ```bash -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Content-Type: application/json' \ --data ' { "model": "gpt-3.5-turbo", @@ -202,7 +202,7 @@ curl --location 'http://0.0.0.0:8000/chat/completions' \ **Block all calls for a user id** ``` -curl -X POST "http://0.0.0.0:8000/user/block" \ +curl -X POST "http://0.0.0.0:4000/user/block" \ -H "Authorization: Bearer sk-1234" \ -D '{ "user_ids": [, ...] @@ -212,7 +212,7 @@ curl -X POST "http://0.0.0.0:8000/user/block" \ **Unblock calls for a user id** ``` -curl -X POST "http://0.0.0.0:8000/user/unblock" \ +curl -X POST "http://0.0.0.0:4000/user/unblock" \ -H "Authorization: Bearer sk-1234" \ -D '{ "user_ids": [, ...] 
@@ -230,7 +230,7 @@ litellm_settings: ### Test this ```bash -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Content-Type: application/json' \ --data ' { "model": "gpt-3.5-turbo", @@ -263,7 +263,7 @@ Set `extra_body={"metadata": { }}` to `metadata` you want to pass import openai client = openai.OpenAI( api_key="anything", - base_url="http://0.0.0.0:8000" + base_url="http://0.0.0.0:4000" ) # request sent to model set on litellm proxy, `litellm --model` @@ -291,7 +291,7 @@ print(response) Pass `metadata` as part of the request body ```shell -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Content-Type: application/json' \ --data '{ "model": "gpt-3.5-turbo", @@ -317,7 +317,7 @@ from langchain.prompts.chat import ( from langchain.schema import HumanMessage, SystemMessage chat = ChatOpenAI( - openai_api_base="http://0.0.0.0:8000", + openai_api_base="http://0.0.0.0:4000", model = "gpt-3.5-turbo", temperature=0.1, extra_body={ diff --git a/docs/my-website/docs/proxy/health.md b/docs/my-website/docs/proxy/health.md index f0b797329..03dd91731 100644 --- a/docs/my-website/docs/proxy/health.md +++ b/docs/my-website/docs/proxy/health.md @@ -12,10 +12,10 @@ The proxy exposes: #### Request Make a GET Request to `/health` on the proxy ```shell -curl --location 'http://0.0.0.0:8000/health' -H "Authorization: Bearer sk-1234" +curl --location 'http://0.0.0.0:4000/health' -H "Authorization: Bearer sk-1234" ``` -You can also run `litellm -health` it makes a `get` request to `http://0.0.0.0:8000/health` for you +You can also run `litellm -health` it makes a `get` request to `http://0.0.0.0:4000/health` for you ``` litellm --health ``` @@ -60,7 +60,7 @@ $ litellm /path/to/config.yaml 3. Query health endpoint: ``` -curl --location 'http://0.0.0.0:8000/health' +curl --location 'http://0.0.0.0:4000/health' ``` ### Embedding Models @@ -119,7 +119,7 @@ Unprotected endpoint for checking if proxy is ready to accept requests Example Request: ```bash -curl --location 'http://0.0.0.0:8000/health/readiness' +curl --location 'http://0.0.0.0:4000/health/readiness' ``` Example Response: @@ -153,7 +153,7 @@ Example Request: ``` curl -X 'GET' \ - 'http://0.0.0.0:8000/health/liveliness' \ + 'http://0.0.0.0:4000/health/liveliness' \ -H 'accept: application/json' ``` diff --git a/docs/my-website/docs/proxy/load_balancing.md b/docs/my-website/docs/proxy/load_balancing.md index ad5e91203..691592cb6 100644 --- a/docs/my-website/docs/proxy/load_balancing.md +++ b/docs/my-website/docs/proxy/load_balancing.md @@ -45,7 +45,7 @@ $ litellm --config /path/to/config.yaml ### Step 3: Use proxy - Call a model group [Load Balancing] Curl Command ```shell -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Content-Type: application/json' \ --data ' { "model": "gpt-3.5-turbo", @@ -65,7 +65,7 @@ If you want to call a specific model defined in the `config.yaml`, you can call In this example it will call `azure/gpt-turbo-small-ca`. 
Defined in the config on Step 1 ```bash -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Content-Type: application/json' \ --data ' { "model": "azure/gpt-turbo-small-ca", diff --git a/docs/my-website/docs/proxy/logging.md b/docs/my-website/docs/proxy/logging.md index bf4216c0e..589199a07 100644 --- a/docs/my-website/docs/proxy/logging.md +++ b/docs/my-website/docs/proxy/logging.md @@ -150,7 +150,7 @@ litellm --config proxy_config.yaml ``` ```shell -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Authorization: Bearer sk-1234' \ --data ' { "model": "gpt-3.5-turbo", @@ -174,7 +174,7 @@ On Success Usage: {'completion_tokens': 10, 'prompt_tokens': 11, 'total_tokens': 21}, Cost: 3.65e-05, Response: {'id': 'chatcmpl-8S8avKJ1aVBg941y5xzGMSKrYCMvN', 'choices': [{'finish_reason': 'stop', 'index': 0, 'message': {'content': 'Good morning! How can I assist you today?', 'role': 'assistant'}}], 'created': 1701716913, 'model': 'gpt-3.5-turbo-0613', 'object': 'chat.completion', 'system_fingerprint': None, 'usage': {'completion_tokens': 10, 'prompt_tokens': 11, 'total_tokens': 21}} - Proxy Metadata: {'user_api_key': None, 'headers': Headers({'host': '0.0.0.0:8000', 'user-agent': 'curl/7.88.1', 'accept': '*/*', 'authorization': 'Bearer sk-1234', 'content-length': '199', 'content-type': 'application/x-www-form-urlencoded'}), 'model_group': 'gpt-3.5-turbo', 'deployment': 'gpt-3.5-turbo-ModelID-gpt-3.5-turbo'} + Proxy Metadata: {'user_api_key': None, 'headers': Headers({'host': '0.0.0.0:4000', 'user-agent': 'curl/7.88.1', 'accept': '*/*', 'authorization': 'Bearer sk-1234', 'content-length': '199', 'content-type': 'application/x-www-form-urlencoded'}), 'model_group': 'gpt-3.5-turbo', 'deployment': 'gpt-3.5-turbo-ModelID-gpt-3.5-turbo'} ``` #### Logging Proxy Request Object, Header, Url @@ -374,7 +374,7 @@ async def log_event(request: Request): if __name__ == "__main__": import uvicorn - uvicorn.run(app, host="127.0.0.1", port=8000) + uvicorn.run(app, host="127.0.0.1", port=4000) ``` @@ -383,7 +383,7 @@ if __name__ == "__main__": #### Step 2. Set your `GENERIC_LOGGER_ENDPOINT` to the endpoint + route we should send callback logs to ```shell -os.environ["GENERIC_LOGGER_ENDPOINT"] = "http://localhost:8000/log-event" +os.environ["GENERIC_LOGGER_ENDPOINT"] = "http://localhost:4000/log-event" ``` #### Step 3. 
Create a `config.yaml` file and set `litellm_settings`: `success_callback` = ["generic"] @@ -445,7 +445,7 @@ Expected output on Langfuse Pass `metadata` as part of the request body ```shell -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Content-Type: application/json' \ --data '{ "model": "gpt-3.5-turbo", @@ -472,7 +472,7 @@ Set `extra_body={"metadata": { }}` to `metadata` you want to pass import openai client = openai.OpenAI( api_key="anything", - base_url="http://0.0.0.0:8000" + base_url="http://0.0.0.0:4000" ) # request sent to model set on litellm proxy, `litellm --model` @@ -509,7 +509,7 @@ from langchain.prompts.chat import ( from langchain.schema import HumanMessage, SystemMessage chat = ChatOpenAI( - openai_api_base="http://0.0.0.0:8000", + openai_api_base="http://0.0.0.0:4000", model = "gpt-3.5-turbo", temperature=0.1, extra_body={ @@ -663,7 +663,7 @@ litellm --config config.yaml --debug Test Request ```shell -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Content-Type: application/json' \ --data ' { "model": "Azure OpenAI GPT-4 East", @@ -698,7 +698,7 @@ litellm_settings: Now, when you [generate keys](./virtual_keys.md) for this team-id ```bash -curl -X POST 'http://0.0.0.0:8000/key/generate' \ +curl -X POST 'http://0.0.0.0:4000/key/generate' \ -H 'Authorization: Bearer sk-1234' \ -H 'Content-Type: application/json' \ -D '{"team_id": "ishaans-secret-project"}' @@ -742,7 +742,7 @@ litellm --config config.yaml --debug Test Request ```shell -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Content-Type: application/json' \ --data ' { "model": "Azure OpenAI GPT-4 East", @@ -903,7 +903,7 @@ litellm --config config.yaml --debug Test Request ``` -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Content-Type: application/json' \ --data ' { "model": "gpt-3.5-turbo", @@ -947,7 +947,7 @@ litellm --config config.yaml --debug Test Request ``` -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Content-Type: application/json' \ --data ' { "model": "gpt-3.5-turbo", diff --git a/docs/my-website/docs/proxy/model_management.md b/docs/my-website/docs/proxy/model_management.md index 8160e2aa7..0a236185f 100644 --- a/docs/my-website/docs/proxy/model_management.md +++ b/docs/my-website/docs/proxy/model_management.md @@ -24,7 +24,7 @@ Retrieve detailed information about each model listed in the `/models` endpoint, ```bash -curl -X GET "http://0.0.0.0:8000/model/info" \ +curl -X GET "http://0.0.0.0:4000/model/info" \ -H "accept: application/json" \ ``` @@ -42,7 +42,7 @@ Add a new model to the list in the `config.yaml` by providing the model paramete ```bash -curl -X POST "http://0.0.0.0:8000/model/new" \ +curl -X POST "http://0.0.0.0:4000/model/new" \ -H "accept: application/json" \ -H "Content-Type: application/json" \ -d '{ "model_name": "azure-gpt-turbo", "litellm_params": {"model": "azure/gpt-3.5-turbo", "api_key": "os.environ/AZURE_API_KEY", "api_base": "my-azure-api-base"} }' diff --git a/docs/my-website/docs/proxy/pii_masking.md b/docs/my-website/docs/proxy/pii_masking.md index 0d559d910..a95a6d771 100644 --- a/docs/my-website/docs/proxy/pii_masking.md +++ b/docs/my-website/docs/proxy/pii_masking.md @@ -96,7 +96,7 @@ 
Turn off PII masking for a given key. Do this by setting `permissions: {"pii": false}`, when generating a key. ```shell -curl --location 'http://0.0.0.0:8000/key/generate' \ +curl --location 'http://0.0.0.0:4000/key/generate' \ --header 'Authorization: Bearer sk-1234' \ --header 'Content-Type: application/json' \ --data '{ @@ -119,7 +119,7 @@ The proxy support 2 request-level PII controls: Set `allow_pii_controls` to true for a given key. This will allow the user to set request-level PII controls. ```bash -curl --location 'http://0.0.0.0:8000/key/generate' \ +curl --location 'http://0.0.0.0:4000/key/generate' \ --header 'Authorization: Bearer my-master-key' \ --header 'Content-Type: application/json' \ --data '{ @@ -136,7 +136,7 @@ from openai import OpenAI client = OpenAI( # This is the default and can be omitted api_key=os.environ.get("OPENAI_API_KEY"), - base_url="http://0.0.0.0:8000" + base_url="http://0.0.0.0:4000" ) chat_completion = client.chat.completions.create( diff --git a/docs/my-website/docs/proxy/quick_start.md b/docs/my-website/docs/proxy/quick_start.md index 4f508ee59..d44970348 100644 --- a/docs/my-website/docs/proxy/quick_start.md +++ b/docs/my-website/docs/proxy/quick_start.md @@ -21,7 +21,7 @@ Run the following command to start the litellm proxy ```shell $ litellm --model huggingface/bigcode/starcoder -#INFO: Proxy running on http://0.0.0.0:8000 +#INFO: Proxy running on http://0.0.0.0:4000 ``` ### Test @@ -250,7 +250,7 @@ litellm --config your_config.yaml ```shell -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Content-Type: application/json' \ --data ' { "model": "gpt-3.5-turbo", @@ -270,7 +270,7 @@ curl --location 'http://0.0.0.0:8000/chat/completions' \ import openai client = openai.OpenAI( api_key="anything", - base_url="http://0.0.0.0:8000" + base_url="http://0.0.0.0:4000" ) # request sent to model set on litellm proxy, `litellm --model` @@ -297,7 +297,7 @@ from langchain.prompts.chat import ( from langchain.schema import HumanMessage, SystemMessage chat = ChatOpenAI( - openai_api_base="http://0.0.0.0:8000", # set openai_api_base to the LiteLLM Proxy + openai_api_base="http://0.0.0.0:4000", # set openai_api_base to the LiteLLM Proxy model = "gpt-3.5-turbo", temperature=0.1 ) @@ -321,7 +321,7 @@ print(response) ```python from langchain.embeddings import OpenAIEmbeddings -embeddings = OpenAIEmbeddings(model="sagemaker-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key") +embeddings = OpenAIEmbeddings(model="sagemaker-embeddings", openai_api_base="http://0.0.0.0:4000", openai_api_key="temp-key") text = "This is a test document." @@ -331,7 +331,7 @@ query_result = embeddings.embed_query(text) print(f"SAGEMAKER EMBEDDINGS") print(query_result[:5]) -embeddings = OpenAIEmbeddings(model="bedrock-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key") +embeddings = OpenAIEmbeddings(model="bedrock-embeddings", openai_api_base="http://0.0.0.0:4000", openai_api_key="temp-key") text = "This is a test document." @@ -340,7 +340,7 @@ query_result = embeddings.embed_query(text) print(f"BEDROCK EMBEDDINGS") print(query_result[:5]) -embeddings = OpenAIEmbeddings(model="bedrock-titan-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key") +embeddings = OpenAIEmbeddings(model="bedrock-titan-embeddings", openai_api_base="http://0.0.0.0:4000", openai_api_key="temp-key") text = "This is a test document." 
@@ -407,11 +407,11 @@ services: litellm: image: ghcr.io/berriai/litellm:main ports: - - "8000:8000" # Map the container port to the host, change the host port if necessary + - "4000:4000" # Map the container port to the host, change the host port if necessary volumes: - ./litellm-config.yaml:/app/config.yaml # Mount the local configuration file # You can change the port or number of workers as per your requirements or pass any new supported CLI augument. Make sure the port passed here matches with the container port defined above in `ports` value - command: [ "--config", "/app/config.yaml", "--port", "8000", "--num_workers", "8" ] + command: [ "--config", "/app/config.yaml", "--port", "4000", "--num_workers", "8" ] # ...rest of your docker-compose config if any ``` @@ -429,7 +429,7 @@ Run the command `docker-compose up` or `docker compose up` as per your docker in > Use `-d` flag to run the container in detached mode (background) e.g. `docker compose up -d` -Your LiteLLM container should be running now on the defined port e.g. `8000`. +Your LiteLLM container should be running now on the defined port e.g. `4000`. ## Using with OpenAI compatible projects @@ -442,7 +442,7 @@ Set `base_url` to the LiteLLM Proxy server import openai client = openai.OpenAI( api_key="anything", - base_url="http://0.0.0.0:8000" + base_url="http://0.0.0.0:4000" ) # request sent to model set on litellm proxy, `litellm --model` @@ -463,7 +463,7 @@ print(response) ```shell litellm --model gpt-3.5-turbo -#INFO: Proxy running on http://0.0.0.0:8000 +#INFO: Proxy running on http://0.0.0.0:4000 ``` #### 1. Clone the repo @@ -474,9 +474,9 @@ git clone https://github.com/danny-avila/LibreChat.git #### 2. Modify Librechat's `docker-compose.yml` -LiteLLM Proxy is running on port `8000`, set `8000` as the proxy below +LiteLLM Proxy is running on port `4000`, set `4000` as the proxy below ```yaml -OPENAI_REVERSE_PROXY=http://host.docker.internal:8000/v1/chat/completions +OPENAI_REVERSE_PROXY=http://host.docker.internal:4000/v1/chat/completions ``` #### 3. 
Save fake OpenAI key in Librechat's `.env` @@ -502,7 +502,7 @@ In the [config.py](https://continue.dev/docs/reference/Models/openai) set this a api_key="IGNORED", model="fake-model-name", context_length=2048, # customize if needed for your model - api_base="http://localhost:8000" # your proxy server url + api_base="http://localhost:4000" # your proxy server url ), ``` @@ -514,7 +514,7 @@ Credits [@vividfog](https://github.com/jmorganca/ollama/issues/305#issuecomment- ```shell $ pip install aider -$ aider --openai-api-base http://0.0.0.0:8000 --openai-api-key fake-key +$ aider --openai-api-base http://0.0.0.0:4000 --openai-api-key fake-key ``` @@ -528,7 +528,7 @@ from autogen import AssistantAgent, UserProxyAgent, oai config_list=[ { "model": "my-fake-model", - "api_base": "http://localhost:8000", #litellm compatible endpoint + "api_base": "http://localhost:4000", #litellm compatible endpoint "api_type": "open_ai", "api_key": "NULL", # just a placeholder } @@ -566,7 +566,7 @@ import guidance # set api_base to your proxy # set api_key to anything -gpt4 = guidance.llms.OpenAI("gpt-4", api_base="http://0.0.0.0:8000", api_key="anything") +gpt4 = guidance.llms.OpenAI("gpt-4", api_base="http://0.0.0.0:4000", api_key="anything") experts = guidance(''' {{#system~}} diff --git a/docs/my-website/docs/proxy/reliability.md b/docs/my-website/docs/proxy/reliability.md index f241e4ec0..7527a3d5b 100644 --- a/docs/my-website/docs/proxy/reliability.md +++ b/docs/my-website/docs/proxy/reliability.md @@ -45,7 +45,7 @@ litellm_settings: **Set dynamically** ```bash -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Content-Type: application/json' \ --data ' { "model": "zephyr-beta", @@ -101,7 +101,7 @@ LiteLLM Proxy supports setting a `timeout` per request ```shell -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Content-Type: application/json' \ --data-raw '{ "model": "gpt-3.5-turbo", @@ -121,7 +121,7 @@ import openai client = openai.OpenAI( api_key="anything", - base_url="http://0.0.0.0:8000" + base_url="http://0.0.0.0:4000" ) response = client.chat.completions.create( diff --git a/docs/my-website/docs/proxy/rules.md b/docs/my-website/docs/proxy/rules.md index 415607b61..60e990d91 100644 --- a/docs/my-website/docs/proxy/rules.md +++ b/docs/my-website/docs/proxy/rules.md @@ -30,7 +30,7 @@ $ litellm /path/to/config.yaml ``` ```bash -curl --location 'http://0.0.0.0:8000/v1/chat/completions' \ +curl --location 'http://0.0.0.0:4000/v1/chat/completions' \ --header 'Content-Type: application/json' \ --header 'Authorization: Bearer sk-1234' \ --data '{ diff --git a/docs/my-website/docs/proxy/streaming_logging.md b/docs/my-website/docs/proxy/streaming_logging.md index 6bc5882d1..3fa896467 100644 --- a/docs/my-website/docs/proxy/streaming_logging.md +++ b/docs/my-website/docs/proxy/streaming_logging.md @@ -65,7 +65,7 @@ litellm --config proxy_config.yaml ``` ```shell -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Authorization: Bearer sk-1234' \ --data ' { "model": "gpt-3.5-turbo", diff --git a/docs/my-website/docs/proxy/ui.md b/docs/my-website/docs/proxy/ui.md index 188a2a2eb..cca9d4434 100644 --- a/docs/my-website/docs/proxy/ui.md +++ b/docs/my-website/docs/proxy/ui.md @@ -28,12 +28,12 @@ Follow [setup](./virtual_keys.md#setup) ```bash litellm --config /path/to/config.yaml -#INFO: Proxy 
running on http://0.0.0.0:8000 +#INFO: Proxy running on http://0.0.0.0:4000 ``` ### 2. Go to UI ```bash -http://0.0.0.0:8000/ui # /ui +http://0.0.0.0:4000/ui # /ui ``` diff --git a/docs/my-website/docs/proxy/user_keys.md b/docs/my-website/docs/proxy/user_keys.md index fcccffaa0..d86d3ae09 100644 --- a/docs/my-website/docs/proxy/user_keys.md +++ b/docs/my-website/docs/proxy/user_keys.md @@ -26,7 +26,7 @@ Set `extra_body={"metadata": { }}` to `metadata` you want to pass import openai client = openai.OpenAI( api_key="anything", - base_url="http://0.0.0.0:8000" + base_url="http://0.0.0.0:4000" ) # request sent to model set on litellm proxy, `litellm --model` @@ -92,7 +92,7 @@ print(response) Pass `metadata` as part of the request body ```shell -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Content-Type: application/json' \ --data '{ "model": "gpt-3.5-turbo", @@ -123,7 +123,7 @@ from langchain.prompts.chat import ( from langchain.schema import HumanMessage, SystemMessage chat = ChatOpenAI( - openai_api_base="http://0.0.0.0:8000", + openai_api_base="http://0.0.0.0:4000", model = "gpt-3.5-turbo", temperature=0.1, extra_body={ @@ -195,7 +195,7 @@ from openai import OpenAI # set base_url to your proxy server # set api_key to send to proxy server -client = OpenAI(api_key="", base_url="http://0.0.0.0:8000") +client = OpenAI(api_key="", base_url="http://0.0.0.0:4000") response = client.embeddings.create( input=["hello from litellm"], @@ -209,7 +209,7 @@ print(response) ```shell -curl --location 'http://0.0.0.0:8000/embeddings' \ +curl --location 'http://0.0.0.0:4000/embeddings' \ --header 'Content-Type: application/json' \ --data ' { "model": "text-embedding-ada-002", @@ -223,7 +223,7 @@ curl --location 'http://0.0.0.0:8000/embeddings' \ ```python from langchain.embeddings import OpenAIEmbeddings -embeddings = OpenAIEmbeddings(model="sagemaker-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key") +embeddings = OpenAIEmbeddings(model="sagemaker-embeddings", openai_api_base="http://0.0.0.0:4000", openai_api_key="temp-key") text = "This is a test document." @@ -233,7 +233,7 @@ query_result = embeddings.embed_query(text) print(f"SAGEMAKER EMBEDDINGS") print(query_result[:5]) -embeddings = OpenAIEmbeddings(model="bedrock-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key") +embeddings = OpenAIEmbeddings(model="bedrock-embeddings", openai_api_base="http://0.0.0.0:4000", openai_api_key="temp-key") text = "This is a test document." @@ -242,7 +242,7 @@ query_result = embeddings.embed_query(text) print(f"BEDROCK EMBEDDINGS") print(query_result[:5]) -embeddings = OpenAIEmbeddings(model="bedrock-titan-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key") +embeddings = OpenAIEmbeddings(model="bedrock-titan-embeddings", openai_api_base="http://0.0.0.0:4000", openai_api_key="temp-key") text = "This is a test document." 
@@ -296,7 +296,7 @@ from openai import OpenAI # set base_url to your proxy server # set api_key to send to proxy server -client = OpenAI(api_key="", base_url="http://0.0.0.0:8000") +client = OpenAI(api_key="", base_url="http://0.0.0.0:4000") response = client.moderations.create( input="hello from litellm", @@ -310,7 +310,7 @@ print(response) ```shell -curl --location 'http://0.0.0.0:8000/moderations' \ +curl --location 'http://0.0.0.0:4000/moderations' \ --header 'Content-Type: application/json' \ --header 'Authorization: Bearer sk-1234' \ --data '{"input": "Sample text goes here", "model": "text-moderation-stable"}' @@ -421,7 +421,7 @@ user_config = { import openai client = openai.OpenAI( api_key="sk-1234", - base_url="http://0.0.0.0:8000" + base_url="http://0.0.0.0:4000" ) # send request to `user-azure-instance` @@ -489,7 +489,7 @@ const { OpenAI } = require('openai'); const openai = new OpenAI({ apiKey: "sk-1234", - baseURL: "http://0.0.0.0:8000" + baseURL: "http://0.0.0.0:4000" }); async function main() { @@ -516,7 +516,7 @@ Here's how to do it: import openai client = openai.OpenAI( api_key="sk-1234", - base_url="http://0.0.0.0:8000" + base_url="http://0.0.0.0:4000" ) # request sent to model set on litellm proxy, `litellm --model` @@ -541,7 +541,7 @@ Pass in the litellm_params (E.g. api_key, api_base, etc.) via the `extra_body` p import openai client = openai.OpenAI( api_key="sk-1234", - base_url="http://0.0.0.0:8000" + base_url="http://0.0.0.0:4000" ) # request sent to model set on litellm proxy, `litellm --model` @@ -571,7 +571,7 @@ const { OpenAI } = require('openai'); const openai = new OpenAI({ apiKey: "sk-1234", - baseURL: "http://0.0.0.0:8000" + baseURL: "http://0.0.0.0:4000" }); async function main() { diff --git a/docs/my-website/docs/proxy/users.md b/docs/my-website/docs/proxy/users.md index 9c8927caf..12cbda9d0 100644 --- a/docs/my-website/docs/proxy/users.md +++ b/docs/my-website/docs/proxy/users.md @@ -44,7 +44,7 @@ litellm /path/to/config.yaml **Step 3. Send test call** ```bash -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Autherization: Bearer sk-1234' \ --header 'Content-Type: application/json' \ --data '{ @@ -72,7 +72,7 @@ By default the `max_budget` is set to `null` and is not checked for keys #### **Add budgets to users** ```shell -curl --location 'http://localhost:8000/user/new' \ +curl --location 'http://localhost:4000/user/new' \ --header 'Authorization: Bearer ' \ --header 'Content-Type: application/json' \ --data-raw '{"models": ["azure-models"], "max_budget": 0, "user_id": "krrish3@berri.ai"}' @@ -96,7 +96,7 @@ curl --location 'http://localhost:8000/user/new' \ `budget_duration`: Budget is reset at the end of specified duration. If not set, budget is never reset. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d"). ``` -curl 'http://0.0.0.0:8000/user/new' \ +curl 'http://0.0.0.0:4000/user/new' \ --header 'Authorization: Bearer ' \ --header 'Content-Type: application/json' \ --data-raw '{ @@ -113,7 +113,7 @@ Now you can just call `/key/generate` with that user_id (i.e. 
krrish3@berri.ai) - **Spend Tracking**: spend for this key will update krrish3@berri.ai's spend as well ```bash -curl --location 'http://0.0.0.0:8000/key/generate' \ +curl --location 'http://0.0.0.0:4000/key/generate' \ --header 'Authorization: Bearer ' \ --header 'Content-Type: application/json' \ --data '{"models": ["azure-models"], "user_id": "krrish3@berri.ai"}' @@ -127,7 +127,7 @@ You can: #### **Add budgets to users** ```shell -curl --location 'http://localhost:8000/team/new' \ +curl --location 'http://localhost:4000/team/new' \ --header 'Authorization: Bearer ' \ --header 'Content-Type: application/json' \ --data-raw '{ @@ -238,7 +238,7 @@ By default the `max_budget` is set to `null` and is not checked for keys #### **Add budgets to keys** ```bash -curl 'http://0.0.0.0:8000/key/generate' \ +curl 'http://0.0.0.0:4000/key/generate' \ --header 'Authorization: Bearer ' \ --header 'Content-Type: application/json' \ --data-raw '{ @@ -250,7 +250,7 @@ curl 'http://0.0.0.0:8000/key/generate' \ Example Request to `/chat/completions` when key has crossed budget ```shell -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Content-Type: application/json' \ --header 'Authorization: Bearer ' \ --data ' { @@ -278,7 +278,7 @@ Expected Response from `/chat/completions` when key has crossed budget `budget_duration`: Budget is reset at the end of specified duration. If not set, budget is never reset. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d"). ``` -curl 'http://0.0.0.0:8000/key/generate' \ +curl 'http://0.0.0.0:4000/key/generate' \ --header 'Authorization: Bearer ' \ --header 'Content-Type: application/json' \ --data-raw '{ @@ -310,7 +310,7 @@ By default the `model_max_budget` is set to `{}` and is not checked for keys #### **Add model specific budgets to keys** ```bash -curl 'http://0.0.0.0:8000/key/generate' \ +curl 'http://0.0.0.0:4000/key/generate' \ --header 'Authorization: Bearer ' \ --header 'Content-Type: application/json' \ --data-raw '{ @@ -335,7 +335,7 @@ Use `/user/new`, to persist rate limits across multiple keys. ```shell -curl --location 'http://0.0.0.0:8000/user/new' \ +curl --location 'http://0.0.0.0:4000/user/new' \ --header 'Authorization: Bearer sk-1234' \ --header 'Content-Type: application/json' \ --data '{"user_id": "krrish@berri.ai", "max_parallel_requests": 10, "tpm_limit": 20, "rpm_limit": 4}' @@ -359,7 +359,7 @@ curl --location 'http://0.0.0.0:8000/user/new' \ Use `/key/generate`, if you want them for just that key. ```shell -curl --location 'http://0.0.0.0:8000/key/generate' \ +curl --location 'http://0.0.0.0:4000/key/generate' \ --header 'Authorization: Bearer sk-1234' \ --header 'Content-Type: application/json' \ --data '{"max_parallel_requests": 10, "tpm_limit": 20, "rpm_limit": 4}' @@ -401,7 +401,7 @@ model_list: **Step 2. Create key with access group** ```bash -curl --location 'http://localhost:8000/user/new' \ +curl --location 'http://localhost:4000/user/new' \ -H 'Authorization: Bearer ' \ -H 'Content-Type: application/json' \ -d '{"models": ["beta-models"], # 👈 Model Access Group @@ -414,7 +414,7 @@ curl --location 'http://localhost:8000/user/new' \ Just include user_id in the `/key/generate` request. 
```bash -curl --location 'http://0.0.0.0:8000/key/generate' \ +curl --location 'http://0.0.0.0:4000/key/generate' \ --header 'Authorization: Bearer ' \ --header 'Content-Type: application/json' \ --data '{"models": ["azure-models"], "user_id": "krrish@berri.ai"}' diff --git a/docs/my-website/docs/proxy/virtual_keys.md b/docs/my-website/docs/proxy/virtual_keys.md index 70fd6e6a8..e84b3c16f 100644 --- a/docs/my-website/docs/proxy/virtual_keys.md +++ b/docs/my-website/docs/proxy/virtual_keys.md @@ -59,7 +59,7 @@ litellm --config /path/to/config.yaml **Step 3: Generate temporary keys** ```shell -curl 'http://0.0.0.0:8000/key/generate' \ +curl 'http://0.0.0.0:4000/key/generate' \ --header 'Authorization: Bearer ' \ --header 'Content-Type: application/json' \ --data-raw '{"models": ["gpt-3.5-turbo", "gpt-4", "claude-2"], "duration": "20m","metadata": {"user": "ishaan@berri.ai"}}' @@ -70,7 +70,7 @@ curl 'http://0.0.0.0:8000/key/generate' \ ### Request ```shell -curl 'http://0.0.0.0:8000/key/generate' \ +curl 'http://0.0.0.0:4000/key/generate' \ --header 'Authorization: Bearer ' \ --header 'Content-Type: application/json' \ --data-raw '{ @@ -105,7 +105,7 @@ Request Params: ```python { "key": "sk-kdEXbIqZRwEeEiHwdg7sFA", # Bearer token - "expires": "2023-11-19T01:38:25.838000+00:00" # datetime object + "expires": "2023-11-19T01:38:25.834000+00:00" # datetime object "key_name": "sk-...7sFA" # abbreviated key string, ONLY stored in db if `allow_user_auth: true` set - [see](./ui.md) ... } @@ -147,7 +147,7 @@ model_list: **Step 2: Generate a user key - enabling them access to specific models, custom model aliases, etc.** ```bash -curl -X POST "https://0.0.0.0:8000/key/generate" \ +curl -X POST "https://0.0.0.0:4000/key/generate" \ -H "Authorization: Bearer " \ -H "Content-Type: application/json" \ -d '{ @@ -182,7 +182,7 @@ model_list: **Step 2. Create key with access group** ```bash -curl --location 'http://localhost:8000/key/generate' \ +curl --location 'http://localhost:4000/key/generate' \ -H 'Authorization: Bearer ' \ -H 'Content-Type: application/json' \ -d '{"models": ["beta-models"], # 👈 Model Access Group @@ -194,7 +194,7 @@ curl --location 'http://localhost:8000/key/generate' \ ### Request ```shell -curl -X GET "http://0.0.0.0:8000/key/info?key=sk-02Wr4IAlN3NvPXvL5JVvDA" \ +curl -X GET "http://0.0.0.0:4000/key/info?key=sk-02Wr4IAlN3NvPXvL5JVvDA" \ -H "Authorization: Bearer sk-1234" ``` @@ -228,7 +228,7 @@ Request Params: ### Request ```shell -curl 'http://0.0.0.0:8000/key/update' \ +curl 'http://0.0.0.0:4000/key/update' \ --header 'Authorization: Bearer ' \ --header 'Content-Type: application/json' \ --data-raw '{ @@ -266,7 +266,7 @@ Request Params: ### Request ```shell -curl 'http://0.0.0.0:8000/key/delete' \ +curl 'http://0.0.0.0:4000/key/delete' \ --header 'Authorization: Bearer ' \ --header 'Content-Type: application/json' \ --data-raw '{ @@ -500,7 +500,7 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \ Set `max_budget` in (USD $) param in the `key/generate` request. 
By default the `max_budget` is set to `null` and is not checked for keys ```shell -curl 'http://0.0.0.0:8000/key/generate' \ +curl 'http://0.0.0.0:4000/key/generate' \ --header 'Authorization: Bearer ' \ --header 'Content-Type: application/json' \ --data-raw '{ @@ -517,7 +517,7 @@ curl 'http://0.0.0.0:8000/key/generate' \ Example Request to `/chat/completions` when key has crossed budget ```shell -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Content-Type: application/json' \ --header 'Authorization: Bearer sk-ULl_IKCVFy2EZRzQB16RUA' \ --data ' { @@ -545,10 +545,10 @@ Expected Response from `/chat/completions` when key has crossed budget LiteLLM exposes a `/user/new` endpoint to create budgets for users, that persist across multiple keys. -This is documented in the swagger (live on your server root endpoint - e.g. `http://0.0.0.0:8000/`). Here's an example request. +This is documented in the swagger (live on your server root endpoint - e.g. `http://0.0.0.0:4000/`). Here's an example request. ```shell -curl --location 'http://localhost:8000/user/new' \ +curl --location 'http://localhost:4000/user/new' \ --header 'Authorization: Bearer ' \ --header 'Content-Type: application/json' \ --data-raw '{"models": ["azure-models"], "max_budget": 0, "user_id": "krrish3@berri.ai"}' @@ -571,7 +571,7 @@ The request is a normal `/key/generate` request body + a `max_budget` field. You can get spend for a key by using the `/key/info` endpoint. ```bash -curl 'http://0.0.0.0:8000/key/info?key=' \ +curl 'http://0.0.0.0:4000/key/info?key=' \ -X GET \ -H 'Authorization: Bearer ' ``` @@ -771,7 +771,7 @@ general_settings: #### Step 3. Generate Key ```bash -curl --location 'http://0.0.0.0:8000/key/generate' \ +curl --location 'http://0.0.0.0:4000/key/generate' \ --header 'Authorization: Bearer sk-1234' \ --header 'Content-Type: application/json' \ --data '{"models": ["azure-models"], "aliases": {"mistral-7b": "gpt-3.5-turbo"}, "duration": null}' diff --git a/docs/my-website/docs/simple_proxy_old_doc.md b/docs/my-website/docs/simple_proxy_old_doc.md index b48e345e1..9dcb27797 100644 --- a/docs/my-website/docs/simple_proxy_old_doc.md +++ b/docs/my-website/docs/simple_proxy_old_doc.md @@ -22,7 +22,7 @@ $ pip install 'litellm[proxy]' ```shell $ litellm --model huggingface/bigcode/starcoder -#INFO: Proxy running on http://0.0.0.0:8000 +#INFO: Proxy running on http://0.0.0.0:4000 ``` ### Test @@ -39,7 +39,7 @@ This will now automatically route any requests for gpt-3.5-turbo to bigcode star ```shell -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Content-Type: application/json' \ --data ' { "model": "gpt-3.5-turbo", @@ -59,7 +59,7 @@ curl --location 'http://0.0.0.0:8000/chat/completions' \ import openai client = openai.OpenAI( api_key="anything", - base_url="http://0.0.0.0:8000" + base_url="http://0.0.0.0:4000" ) # request sent to model set on litellm proxy, `litellm --model` @@ -246,7 +246,7 @@ Set `base_url` to the LiteLLM Proxy server import openai client = openai.OpenAI( api_key="anything", - base_url="http://0.0.0.0:8000" + base_url="http://0.0.0.0:4000" ) # request sent to model set on litellm proxy, `litellm --model` @@ -267,7 +267,7 @@ print(response) ```shell litellm --model gpt-3.5-turbo -#INFO: Proxy running on http://0.0.0.0:8000 +#INFO: Proxy running on http://0.0.0.0:4000 ``` #### 1. 
Clone the repo @@ -278,9 +278,9 @@ git clone https://github.com/danny-avila/LibreChat.git #### 2. Modify Librechat's `docker-compose.yml` -LiteLLM Proxy is running on port `8000`, set `8000` as the proxy below +LiteLLM Proxy is running on port `4000`, set `4000` as the proxy below ```yaml -OPENAI_REVERSE_PROXY=http://host.docker.internal:8000/v1/chat/completions +OPENAI_REVERSE_PROXY=http://host.docker.internal:4000/v1/chat/completions ``` #### 3. Save fake OpenAI key in Librechat's `.env` @@ -306,7 +306,7 @@ In the [config.py](https://continue.dev/docs/reference/Models/openai) set this a api_key="IGNORED", model="fake-model-name", context_length=2048, # customize if needed for your model - api_base="http://localhost:8000" # your proxy server url + api_base="http://localhost:4000" # your proxy server url ), ``` @@ -318,7 +318,7 @@ Credits [@vividfog](https://github.com/jmorganca/ollama/issues/305#issuecomment- ```shell $ pip install aider -$ aider --openai-api-base http://0.0.0.0:8000 --openai-api-key fake-key +$ aider --openai-api-base http://0.0.0.0:4000 --openai-api-key fake-key ``` @@ -332,7 +332,7 @@ from autogen import AssistantAgent, UserProxyAgent, oai config_list=[ { "model": "my-fake-model", - "api_base": "http://localhost:8000", #litellm compatible endpoint + "api_base": "http://localhost:4000", #litellm compatible endpoint "api_type": "open_ai", "api_key": "NULL", # just a placeholder } @@ -370,7 +370,7 @@ import guidance # set api_base to your proxy # set api_key to anything -gpt4 = guidance.llms.OpenAI("gpt-4", api_base="http://0.0.0.0:8000", api_key="anything") +gpt4 = guidance.llms.OpenAI("gpt-4", api_base="http://0.0.0.0:4000", api_key="anything") experts = guidance(''' {{#system~}} @@ -479,7 +479,7 @@ $ litellm --config /path/to/config.yaml #### Step 3: Use proxy Curl Command ```shell -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Content-Type: application/json' \ --data ' { "model": "zephyr-alpha", @@ -529,7 +529,7 @@ $ litellm --config /path/to/config.yaml #### Step 3: Use proxy Curl Command ```shell -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Content-Type: application/json' \ --data ' { "model": "gpt-3.5-turbo", @@ -586,7 +586,7 @@ litellm_settings: **Set dynamically** ```bash -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Content-Type: application/json' \ --data ' { "model": "zephyr-beta", @@ -615,7 +615,7 @@ model_list: - model_name: custom_embedding_model litellm_params: model: openai/custom_embedding # the `openai/` prefix tells litellm it's openai compatible - api_base: http://0.0.0.0:8000/ + api_base: http://0.0.0.0:4000/ - model_name: custom_embedding_model litellm_params: model: openai/custom_embedding # the `openai/` prefix tells litellm it's openai compatible @@ -665,7 +665,7 @@ litellm --config /path/to/config.yaml **Step 3: Generate temporary keys** ```shell -curl 'http://0.0.0.0:8000/key/generate' \ +curl 'http://0.0.0.0:4000/key/generate' \ --h 'Authorization: Bearer sk-1234' \ --d '{"models": ["gpt-3.5-turbo", "gpt-4", "claude-2"], "duration": "20m"}' ``` @@ -719,7 +719,7 @@ model_list: **Step 2: Generate a user key - enabling them access to specific models, custom model aliases, etc.** ```bash -curl -X POST "https://0.0.0.0:8000/key/generate" \ +curl -X POST "https://0.0.0.0:4000/key/generate" \ -H "Authorization: 
Bearer sk-1234" \ -H "Content-Type: application/json" \ -d '{ @@ -737,7 +737,7 @@ curl -X POST "https://0.0.0.0:8000/key/generate" \ You can get spend for a key by using the `/key/info` endpoint. ```bash -curl 'http://0.0.0.0:8000/key/info?key=' \ +curl 'http://0.0.0.0:4000/key/info?key=' \ -X GET \ -H 'Authorization: Bearer ' ``` @@ -868,7 +868,7 @@ $ litellm --config /path/to/config.yaml #### Using Caching Send the same request twice: ```shell -curl http://0.0.0.0:8000/v1/chat/completions \ +curl http://0.0.0.0:4000/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ "model": "gpt-3.5-turbo", @@ -876,7 +876,7 @@ curl http://0.0.0.0:8000/v1/chat/completions \ "temperature": 0.7 }' -curl http://0.0.0.0:8000/v1/chat/completions \ +curl http://0.0.0.0:4000/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ "model": "gpt-3.5-turbo", @@ -889,7 +889,7 @@ curl http://0.0.0.0:8000/v1/chat/completions \ Caching can be switched on/off per `/chat/completions` request - Caching **on** for completion - pass `caching=True`: ```shell - curl http://0.0.0.0:8000/v1/chat/completions \ + curl http://0.0.0.0:4000/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ "model": "gpt-3.5-turbo", @@ -900,7 +900,7 @@ Caching can be switched on/off per `/chat/completions` request ``` - Caching **off** for completion - pass `caching=False`: ```shell - curl http://0.0.0.0:8000/v1/chat/completions \ + curl http://0.0.0.0:4000/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ "model": "gpt-3.5-turbo", @@ -963,10 +963,10 @@ https://api.openai.com/v1/chat/completions \ Use this to health check all LLMs defined in your config.yaml #### Request ```shell -curl --location 'http://0.0.0.0:8000/health' +curl --location 'http://0.0.0.0:4000/health' ``` -You can also run `litellm -health` it makes a `get` request to `http://0.0.0.0:8000/health` for you +You can also run `litellm -health` it makes a `get` request to `http://0.0.0.0:4000/health` for you ``` litellm --health ``` @@ -1087,7 +1087,7 @@ litellm -config config.yaml #### Run a test request to Proxy ```shell -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Authorization: Bearer sk-1244' \ --data ' { "model": "gpt-3.5-turbo", @@ -1213,7 +1213,7 @@ LiteLLM proxy adds **0.00325 seconds** latency as compared to using the Raw Open ``` #### --port - - **Default:** `8000` + - **Default:** `4000` - The port to bind the server to. - **Usage:** ```shell diff --git a/litellm/proxy/proxy_cli.py b/litellm/proxy/proxy_cli.py index 367bbbb70..e5bcff646 100644 --- a/litellm/proxy/proxy_cli.py +++ b/litellm/proxy/proxy_cli.py @@ -61,7 +61,7 @@ def is_port_in_use(port): @click.option( "--host", default="0.0.0.0", help="Host for the server to listen on.", envvar="HOST" ) -@click.option("--port", default=8000, help="Port to bind the server to.", envvar="PORT") +@click.option("--port", default=4000, help="Port to bind the server to.", envvar="PORT") @click.option( "--num_workers", default=default_num_workers, @@ -273,7 +273,7 @@ def run_server( ], } - response = requests.post("http://0.0.0.0:8000/queue/request", json=data) + response = requests.post("http://0.0.0.0:4000/queue/request", json=data) response = response.json() @@ -507,7 +507,7 @@ def run_server( print( f"Unable to connect to DB. DATABASE_URL found in environment, but prisma package not found." 
) - if port == 8000 and is_port_in_use(port): + if port == 4000 and is_port_in_use(port): port = random.randint(1024, 49152) from litellm.proxy.proxy_server import app diff --git a/litellm/router.py b/litellm/router.py index d4c0be862..2827a307c 100644 --- a/litellm/router.py +++ b/litellm/router.py @@ -246,7 +246,7 @@ class Router: "122999-2828282-277: { "model": "gpt-3", - "api_base": "http://localhost:8000", + "api_base": "http://localhost:4000", "num_requests": 20, "avg_latency": 0.001, "num_failures": 0,
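# A minimal usage sketch, mirroring the curl/openai examples above: point an
# OpenAI v1 Python SDK client at a LiteLLM proxy running on the new default
# port 4000 (previously 8000). Assumes the proxy was started locally, e.g.
# with `litellm --model gpt-3.5-turbo`.
import openai

client = openai.OpenAI(
    api_key="anything",              # the proxy holds the real provider credentials
    base_url="http://0.0.0.0:4000",  # new default proxy port
)

response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "this is a test request, write a short poem"}],
)
print(response)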