diff --git a/.circleci/config.yml b/.circleci/config.yml index b3703df70..c94da063c 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -46,6 +46,7 @@ jobs: pip install "apscheduler==3.10.4" pip install "PyGithub==1.59.1" pip install argon2-cffi + pip install python-multipart - save_cache: paths: - ./venv diff --git a/.github/workflows/ghcr_deploy.yml b/.github/workflows/ghcr_deploy.yml index a367ae2b8..d7cf4271c 100644 --- a/.github/workflows/ghcr_deploy.yml +++ b/.github/workflows/ghcr_deploy.yml @@ -146,9 +146,29 @@ jobs: } catch (error) { core.setFailed(error.message); } + - name: Fetch Release Notes + id: release-notes + uses: actions/github-script@v6 + with: + github-token: "${{ secrets.GITHUB_TOKEN }}" + script: | + try { + const response = await github.rest.repos.getRelease({ + owner: context.repo.owner, + repo: context.repo.repo, + release_id: process.env.RELEASE_ID, + }); + return response.data.body; + } catch (error) { + core.setFailed(error.message); + } + env: + RELEASE_ID: ${{ env.RELEASE_ID }} - name: Github Releases To Discord env: WEBHOOK_URL: ${{ secrets.WEBHOOK_URL }} + REALEASE_TAG: ${{ env.RELEASE_TAG }} + RELEASE_NOTES: ${{ steps.release-notes.outputs.result }} run: | curl -H "Content-Type: application/json" -X POST -d '{ "content": "||@everyone||", @@ -156,8 +176,8 @@ jobs: "avatar_url": "https://cdn.discordapp.com/avatars/487431320314576937/bd64361e4ba6313d561d54e78c9e7171.png", "embeds": [ { - "title": "Changelog", - "description": "This is the changelog for the latest release.", + "title": "Changelog for ${RELEASE_TAG}", + "description": "${RELEASE_NOTES}", "color": 2105893 } ] diff --git a/.gitignore b/.gitignore index de1c7598f..b03bc895b 100644 --- a/.gitignore +++ b/.gitignore @@ -44,3 +44,4 @@ deploy/charts/litellm/*.tgz deploy/charts/litellm/charts/* deploy/charts/*.tgz litellm/proxy/vertex_key.json +**/.vim/ diff --git a/README.md b/README.md index bc8c1bae2..d32372b6c 100644 --- a/README.md +++ b/README.md @@ -143,13 +143,13 @@ pip install 'litellm[proxy]' ```shell $ litellm --model huggingface/bigcode/starcoder -#INFO: Proxy running on http://0.0.0.0:8000 +#INFO: Proxy running on http://0.0.0.0:4000 ``` ### Step 2: Make ChatCompletions Request to Proxy ```python import openai # openai v1.0.0+ -client = openai.OpenAI(api_key="anything",base_url="http://0.0.0.0:8000") # set proxy to base_url +client = openai.OpenAI(api_key="anything",base_url="http://0.0.0.0:4000") # set proxy to base_url # request sent to model set on litellm proxy, `litellm --model` response = client.chat.completions.create(model="gpt-3.5-turbo", messages = [ { @@ -170,7 +170,7 @@ Set budgets and rate limits across multiple projects ### Request ```shell -curl 'http://0.0.0.0:8000/key/generate' \ +curl 'http://0.0.0.0:4000/key/generate' \ --header 'Authorization: Bearer sk-1234' \ --header 'Content-Type: application/json' \ --data-raw '{"models": ["gpt-3.5-turbo", "gpt-4", "claude-2"], "duration": "20m","metadata": {"user": "ishaan@berri.ai", "team": "core-infra"}}' diff --git a/deploy/charts/litellm/Chart.lock b/deploy/charts/litellm/Chart.lock index 7b6ed69d9..f13578d8d 100644 --- a/deploy/charts/litellm/Chart.lock +++ b/deploy/charts/litellm/Chart.lock @@ -1,6 +1,9 @@ dependencies: - name: postgresql repository: oci://registry-1.docker.io/bitnamicharts - version: 13.3.1 -digest: sha256:f5c129150f0d38dd06752ab37f3c8e143d7c14d30379af058767bcd9f4ba83dd -generated: "2024-01-19T11:32:56.694808861+11:00" + version: 14.3.1 +- name: redis + repository: 
oci://registry-1.docker.io/bitnamicharts + version: 18.19.1 +digest: sha256:8660fe6287f9941d08c0902f3f13731079b8cecd2a5da2fbc54e5b7aae4a6f62 +generated: "2024-03-10T02:28:52.275022+05:30" diff --git a/deploy/charts/litellm/Chart.yaml b/deploy/charts/litellm/Chart.yaml index 6ecdebb50..cc08a9921 100644 --- a/deploy/charts/litellm/Chart.yaml +++ b/deploy/charts/litellm/Chart.yaml @@ -31,3 +31,7 @@ dependencies: version: ">=13.3.0" repository: oci://registry-1.docker.io/bitnamicharts condition: db.deployStandalone + - name: redis + version: ">=18.0.0" + repository: oci://registry-1.docker.io/bitnamicharts + condition: redis.enabled diff --git a/deploy/charts/litellm/README.md b/deploy/charts/litellm/README.md index daba8aa68..817781ed0 100644 --- a/deploy/charts/litellm/README.md +++ b/deploy/charts/litellm/README.md @@ -28,7 +28,7 @@ If `db.useStackgresOperator` is used (not yet implemented): | `imagePullSecrets` | Registry credentials for the LiteLLM and initContainer images. | `[]` | | `serviceAccount.create` | Whether or not to create a Kubernetes Service Account for this deployment. The default is `false` because LiteLLM has no need to access the Kubernetes API. | `false` | | `service.type` | Kubernetes Service type (e.g. `LoadBalancer`, `ClusterIP`, etc.) | `ClusterIP` | -| `service.port` | TCP port that the Kubernetes Service will listen on. Also the TCP port within the Pod that the proxy will listen on. | `8000` | +| `service.port` | TCP port that the Kubernetes Service will listen on. Also the TCP port within the Pod that the proxy will listen on. | `4000` | | `ingress.*` | See [values.yaml](./values.yaml) for example settings | N/A | | `proxy_config.*` | See [values.yaml](./values.yaml) for default settings. See [example_config_yaml](../../../litellm/proxy/example_config_yaml/) for configuration examples. | N/A | @@ -76,7 +76,7 @@ When browsing to the URL published per the settings in `ingress.*`, you will be prompted for **Admin Configuration**. The **Proxy Endpoint** is the internal (from the `litellm` pod's perspective) URL published by the `-litellm` Kubernetes Service. If the deployment uses the default settings for this -service, the **Proxy Endpoint** should be set to `http://-litellm:8000`. +service, the **Proxy Endpoint** should be set to `http://-litellm:4000`. 
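Any other workload in the cluster can reach the proxy through that same Service; a minimal sketch, assuming a Helm release named `mydeploy`, the chart's default port, and `sk-1234` as the `masterkey`:

```python
import openai

# `mydeploy-litellm` is the Kubernetes Service created by a release named `mydeploy`
client = openai.OpenAI(
    api_key="sk-1234",  # the masterkey, or a virtual key generated from it
    base_url="http://mydeploy-litellm:4000",
)

response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hello from inside the cluster"}],
)
print(response)
```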
The **Proxy Key** is the value specified for `masterkey` or, if a `masterkey` was not provided to the helm command line, the `masterkey` is a randomly diff --git a/deploy/charts/litellm/templates/_helpers.tpl b/deploy/charts/litellm/templates/_helpers.tpl index b8893d07c..a1eda28c6 100644 --- a/deploy/charts/litellm/templates/_helpers.tpl +++ b/deploy/charts/litellm/templates/_helpers.tpl @@ -60,3 +60,25 @@ Create the name of the service account to use {{- default "default" .Values.serviceAccount.name }} {{- end }} {{- end }} + +{{/* +Get redis service name +*/}} +{{- define "litellm.redis.serviceName" -}} +{{- if and (eq .Values.redis.architecture "standalone") .Values.redis.sentinel.enabled -}} +{{- printf "%s-%s" .Release.Name (default "redis" .Values.redis.nameOverride | trunc 63 | trimSuffix "-") -}} +{{- else -}} +{{- printf "%s-%s-master" .Release.Name (default "redis" .Values.redis.nameOverride | trunc 63 | trimSuffix "-") -}} +{{- end -}} +{{- end -}} + +{{/* +Get redis service port +*/}} +{{- define "litellm.redis.port" -}} +{{- if .Values.redis.sentinel.enabled -}} +{{ .Values.redis.sentinel.service.ports.sentinel }} +{{- else -}} +{{ .Values.redis.master.service.ports.redis }} +{{- end -}} +{{- end -}} diff --git a/deploy/charts/litellm/templates/deployment.yaml b/deploy/charts/litellm/templates/deployment.yaml index 6ed112dac..736f35680 100644 --- a/deploy/charts/litellm/templates/deployment.yaml +++ b/deploy/charts/litellm/templates/deployment.yaml @@ -142,6 +142,17 @@ spec: secretKeyRef: name: {{ include "litellm.fullname" . }}-masterkey key: masterkey + {{- if .Values.redis.enabled }} + - name: REDIS_HOST + value: {{ include "litellm.redis.serviceName" . }} + - name: REDIS_PORT + value: {{ include "litellm.redis.port" . | quote }} + - name: REDIS_PASSWORD + valueFrom: + secretKeyRef: + name: {{ include "redis.secretName" .Subcharts.redis }} + key: {{include "redis.secretPasswordKey" .Subcharts.redis }} + {{- end }} envFrom: {{- range .Values.environmentSecrets }} - secretRef: diff --git a/deploy/charts/litellm/values.yaml b/deploy/charts/litellm/values.yaml index 1b83fe801..cc53fc59c 100644 --- a/deploy/charts/litellm/values.yaml +++ b/deploy/charts/litellm/values.yaml @@ -55,7 +55,7 @@ environmentSecrets: [] service: type: ClusterIP - port: 8000 + port: 4000 ingress: enabled: false @@ -87,6 +87,8 @@ proxy_config: api_key: eXaMpLeOnLy general_settings: master_key: os.environ/PROXY_MASTER_KEY +# litellm_settings: +# cache: true resources: {} # We usually recommend not to specify default resources and to leave this as a conscious @@ -166,3 +168,10 @@ postgresql: # existingSecret: "" # secretKeys: # userPasswordKey: password + +# requires cache: true in config file +# either enable this or pass a secret for REDIS_HOST, REDIS_PORT, REDIS_PASSWORD or REDIS_URL +# with cache: true to use existing redis instance +redis: + enabled: false + architecture: standalone diff --git a/docs/my-website/docs/audio_transcription.md b/docs/my-website/docs/audio_transcription.md new file mode 100644 index 000000000..09fa1a1b9 --- /dev/null +++ b/docs/my-website/docs/audio_transcription.md @@ -0,0 +1,85 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Audio Transcription + +Use this to loadbalance across Azure + OpenAI. 
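The same `transcription()` call works for both providers, which is what the Azure + OpenAI load balancing above relies on. A rough sketch of the Azure-side call; the deployment name, API version, and environment variable names are illustrative and mirror the proxy config further down:

```python
from litellm import transcription
import os

# assumes an Azure OpenAI whisper deployment - values mirror the proxy config below
os.environ["AZURE_EUROPE_API_KEY"] = ""
os.environ["AZURE_EUROPE_API_BASE"] = ""
audio_file = open("/path/to/audio.mp3", "rb")

response = transcription(
    model="azure/azure-whisper",  # azure/<your-deployment-name>
    file=audio_file,
    api_base=os.environ["AZURE_EUROPE_API_BASE"],
    api_key=os.environ["AZURE_EUROPE_API_KEY"],
    api_version="2024-02-15-preview",
)
print(f"response: {response}")
```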
+ +## Quick Start + +```python +from litellm import transcription +import os + +# set api keys +os.environ["OPENAI_API_KEY"] = "" +audio_file = open("/path/to/audio.mp3", "rb") + +response = transcription(model="whisper", file=audio_file) + +print(f"response: {response}") +``` + +## Proxy Usage + +### Add model to config + + + + + +```yaml +model_list: +- model_name: whisper + litellm_params: + model: whisper-1 + api_key: os.environ/OPENAI_API_KEY + model_info: + mode: audio_transcription + +general_settings: + master_key: sk-1234 +``` + + + +```yaml +model_list: +- model_name: whisper + litellm_params: + model: whisper-1 + api_key: os.environ/OPENAI_API_KEY + model_info: + mode: audio_transcription +- model_name: whisper + litellm_params: + model: azure/azure-whisper + api_version: 2024-02-15-preview + api_base: os.environ/AZURE_EUROPE_API_BASE + api_key: os.environ/AZURE_EUROPE_API_KEY + model_info: + mode: audio_transcription + +general_settings: + master_key: sk-1234 +``` + + + + +### Start proxy + +```bash +litellm --config /path/to/config.yaml + +# RUNNING on http://0.0.0.0:8000 +``` + +### Test + +```bash +curl --location 'http://0.0.0.0:4000/v1/audio/transcriptions' \ +--header 'Authorization: Bearer sk-1234' \ +--form 'file=@"/Users/krrishdholakia/Downloads/gettysburg.wav"' \ +--form 'model="whisper"' +``` diff --git a/docs/my-website/docs/completion/input.md b/docs/my-website/docs/completion/input.md index e3ad9245d..fd5594610 100644 --- a/docs/my-website/docs/completion/input.md +++ b/docs/my-website/docs/completion/input.md @@ -24,6 +24,17 @@ print(response) ``` ### Translated OpenAI params + +Use this function to get an up-to-date list of supported openai params for any model + provider. + +```python +from litellm import get_supported_openai_params + +response = get_supported_openai_params(model="anthropic.claude-3", custom_llm_provider="bedrock") + +print(response) # ["max_tokens", "tools", "tool_choice", "stream"] +``` + This is a list of openai params we translate across providers. This list is constantly being updated. diff --git a/docs/my-website/docs/embedding/supported_embedding.md b/docs/my-website/docs/embedding/supported_embedding.md index 62a10b44d..7e2374d16 100644 --- a/docs/my-website/docs/embedding/supported_embedding.md +++ b/docs/my-website/docs/embedding/supported_embedding.md @@ -35,7 +35,7 @@ general_settings: ```bash litellm --config /path/to/config.yaml -# RUNNING on http://0.0.0.0:8000 +# RUNNING on http://0.0.0.0:4000 ``` ### Test @@ -44,7 +44,7 @@ litellm --config /path/to/config.yaml ```bash -curl --location 'http://0.0.0.0:8000/embeddings' \ +curl --location 'http://0.0.0.0:4000/embeddings' \ --header 'Authorization: Bearer sk-1234' \ --header 'Content-Type: application/json' \ --data '{"input": ["Academia.edu uses"], "model": "textembedding-gecko", "encoding_format": "base64"}' @@ -57,7 +57,7 @@ curl --location 'http://0.0.0.0:8000/embeddings' \ from openai import OpenAI client = OpenAI( api_key="sk-1234", - base_url="http://0.0.0.0:8000" + base_url="http://0.0.0.0:4000" ) client.embeddings.create( @@ -72,7 +72,7 @@ client.embeddings.create( ```python from langchain_openai import OpenAIEmbeddings -embeddings = OpenAIEmbeddings(model="textembedding-gecko", openai_api_base="http://0.0.0.0:8000", openai_api_key="sk-1234") +embeddings = OpenAIEmbeddings(model="textembedding-gecko", openai_api_base="http://0.0.0.0:4000", openai_api_key="sk-1234") text = "This is a test document." 
@@ -200,7 +200,7 @@ Use this for calling `/embedding` endpoints on OpenAI Compatible Servers, exampl from litellm import embedding response = embedding( model = "openai/", # add `openai/` prefix to model so litellm knows to route to OpenAI - api_base="http://0.0.0.0:8000/" # set API Base of your Custom OpenAI Endpoint + api_base="http://0.0.0.0:4000/" # set API Base of your Custom OpenAI Endpoint input=["good morning from litellm"] ) ``` diff --git a/docs/my-website/docs/index.md b/docs/my-website/docs/index.md index d7ed14019..18331ba3b 100644 --- a/docs/my-website/docs/index.md +++ b/docs/my-website/docs/index.md @@ -13,7 +13,14 @@ https://github.com/BerriAI/litellm - Retry/fallback logic across multiple deployments (e.g. Azure/OpenAI) - [Router](https://docs.litellm.ai/docs/routing) - Track spend & set budgets per project [OpenAI Proxy Server](https://docs.litellm.ai/docs/simple_proxy) -## Basic usage +## How to use LiteLLM +You can use litellm through either: +1. [OpenAI proxy Server](#openai-proxy) - Server to call 100+ LLMs, load balance, cost tracking across projects +2. [LiteLLM python SDK](#basic-usage) - Python Client to call 100+ LLMs, load balance, cost tracking + +## LiteLLM Python SDK + +### Basic usage Open In Colab @@ -144,7 +151,7 @@ response = completion( -## Streaming +### Streaming Set `stream=True` in the `completion` args. @@ -276,7 +283,7 @@ response = completion( -## Exception handling +### Exception handling LiteLLM maps exceptions across all supported providers to the OpenAI exceptions. All our exceptions inherit from OpenAI's exception types, so any error-handling you have for that, should work out of the box with LiteLLM. @@ -292,7 +299,7 @@ except OpenAIError as e: print(e) ``` -## Logging Observability - Log LLM Input/Output ([Docs](https://docs.litellm.ai/docs/observability/callbacks)) +### Logging Observability - Log LLM Input/Output ([Docs](https://docs.litellm.ai/docs/observability/callbacks)) LiteLLM exposes pre defined callbacks to send data to Langfuse, LLMonitor, Helicone, Promptlayer, Traceloop, Slack ```python from litellm import completion @@ -311,7 +318,7 @@ litellm.success_callback = ["langfuse", "llmonitor"] # log input/output to langf response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}]) ``` -## Track Costs, Usage, Latency for streaming +### Track Costs, Usage, Latency for streaming Use a callback function for this - more info on custom callbacks: https://docs.litellm.ai/docs/observability/custom_callback ```python @@ -368,13 +375,13 @@ pip install 'litellm[proxy]' ```shell $ litellm --model huggingface/bigcode/starcoder -#INFO: Proxy running on http://0.0.0.0:8000 +#INFO: Proxy running on http://0.0.0.0:4000 ``` #### Step 2: Make ChatCompletions Request to Proxy ```python import openai # openai v1.0.0+ -client = openai.OpenAI(api_key="anything",base_url="http://0.0.0.0:8000") # set proxy to base_url +client = openai.OpenAI(api_key="anything",base_url="http://0.0.0.0:4000") # set proxy to base_url # request sent to model set on litellm proxy, `litellm --model` response = client.chat.completions.create(model="gpt-3.5-turbo", messages = [ { diff --git a/docs/my-website/docs/load_test.md b/docs/my-website/docs/load_test.md index f568b5696..f85ff9122 100644 --- a/docs/my-website/docs/load_test.md +++ b/docs/my-website/docs/load_test.md @@ -1,5 +1,84 @@ +import Image from '@theme/IdealImage'; + # 🔥 Load Test LiteLLM +## Load Test LiteLLM Proxy - 1500+ req/s + +## 1500+ concurrent requests/s + 
+LiteLLM proxy has been load tested to handle 1500+ concurrent req/s + +```python +import time, asyncio +from openai import AsyncOpenAI, AsyncAzureOpenAI +import uuid +import traceback + +# base_url - litellm proxy endpoint +# api_key - litellm proxy api-key, is created proxy with auth +litellm_client = AsyncOpenAI(base_url="http://0.0.0.0:4000", api_key="sk-1234") + + +async def litellm_completion(): + # Your existing code for litellm_completion goes here + try: + response = await litellm_client.chat.completions.create( + model="azure-gpt-3.5", + messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}], + ) + print(response) + return response + + except Exception as e: + # If there's an exception, log the error message + with open("error_log.txt", "a") as error_log: + error_log.write(f"Error during completion: {str(e)}\n") + pass + + +async def main(): + for i in range(1): + start = time.time() + n = 1500 # Number of concurrent tasks + tasks = [litellm_completion() for _ in range(n)] + + chat_completions = await asyncio.gather(*tasks) + + successful_completions = [c for c in chat_completions if c is not None] + + # Write errors to error_log.txt + with open("error_log.txt", "a") as error_log: + for completion in chat_completions: + if isinstance(completion, str): + error_log.write(completion + "\n") + + print(n, time.time() - start, len(successful_completions)) + time.sleep(10) + + +if __name__ == "__main__": + # Blank out contents of error_log.txt + open("error_log.txt", "w").close() + + asyncio.run(main()) + +``` + +### Throughput - 30% Increase +LiteLLM proxy + Load Balancer gives **30% increase** in throughput compared to Raw OpenAI API + + +### Latency Added - 0.00325 seconds +LiteLLM proxy adds **0.00325 seconds** latency as compared to using the Raw OpenAI API + + + +### Testing LiteLLM Proxy with Locust +- 1 LiteLLM container can handle ~140 requests/second with 0.4 failures + + + +## Load Test LiteLLM SDK vs OpenAI Here is a script to load test LiteLLM vs OpenAI ```python @@ -11,7 +90,7 @@ import time, asyncio, litellm #### LITELLM PROXY #### litellm_client = AsyncOpenAI( api_key="sk-1234", # [CHANGE THIS] - base_url="http://0.0.0.0:8000" + base_url="http://0.0.0.0:4000" ) #### AZURE OPENAI CLIENT #### @@ -84,4 +163,5 @@ async def loadtest_fn(): # Run the event loop to execute the async function asyncio.run(loadtest_fn()) -``` \ No newline at end of file +``` + diff --git a/docs/my-website/docs/providers/anthropic.md b/docs/my-website/docs/providers/anthropic.md index 6aa4b1979..1a7a5fa41 100644 --- a/docs/my-website/docs/providers/anthropic.md +++ b/docs/my-website/docs/providers/anthropic.md @@ -63,7 +63,7 @@ export ANTHROPIC_API_KEY="your-api-key" ```bash $ litellm --model claude-3-opus-20240229 -# Server running on http://0.0.0.0:8000 +# Server running on http://0.0.0.0:4000 ``` ### 3. 
Test it @@ -73,7 +73,7 @@ $ litellm --model claude-3-opus-20240229 ```shell -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Content-Type: application/json' \ --data ' { "model": "gpt-3.5-turbo", @@ -93,7 +93,7 @@ curl --location 'http://0.0.0.0:8000/chat/completions' \ import openai client = openai.OpenAI( api_key="anything", - base_url="http://0.0.0.0:8000" + base_url="http://0.0.0.0:4000" ) # request sent to model set on litellm proxy, `litellm --model` @@ -120,7 +120,7 @@ from langchain.prompts.chat import ( from langchain.schema import HumanMessage, SystemMessage chat = ChatOpenAI( - openai_api_base="http://0.0.0.0:8000", # set openai_api_base to the LiteLLM Proxy + openai_api_base="http://0.0.0.0:4000", # set openai_api_base to the LiteLLM Proxy model = "gpt-3.5-turbo", temperature=0.1 ) diff --git a/docs/my-website/docs/providers/bedrock.md b/docs/my-website/docs/providers/bedrock.md index c5b12d4c4..8c6926885 100644 --- a/docs/my-website/docs/providers/bedrock.md +++ b/docs/my-website/docs/providers/bedrock.md @@ -54,7 +54,7 @@ export AWS_REGION_NAME="" ```bash $ litellm --model anthropic.claude-3-sonnet-20240229-v1:0 -# Server running on http://0.0.0.0:8000 +# Server running on http://0.0.0.0:4000 ``` ### 3. Test it @@ -64,7 +64,7 @@ $ litellm --model anthropic.claude-3-sonnet-20240229-v1:0 ```shell -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Content-Type: application/json' \ --data ' { "model": "gpt-3.5-turbo", @@ -84,7 +84,7 @@ curl --location 'http://0.0.0.0:8000/chat/completions' \ import openai client = openai.OpenAI( api_key="anything", - base_url="http://0.0.0.0:8000" + base_url="http://0.0.0.0:4000" ) # request sent to model set on litellm proxy, `litellm --model` @@ -111,7 +111,7 @@ from langchain.prompts.chat import ( from langchain.schema import HumanMessage, SystemMessage chat = ChatOpenAI( - openai_api_base="http://0.0.0.0:8000", # set openai_api_base to the LiteLLM Proxy + openai_api_base="http://0.0.0.0:4000", # set openai_api_base to the LiteLLM Proxy model = "gpt-3.5-turbo", temperature=0.1 ) diff --git a/docs/my-website/docs/providers/ollama.md b/docs/my-website/docs/providers/ollama.md index 51d91ccb6..ec2a231e1 100644 --- a/docs/my-website/docs/providers/ollama.md +++ b/docs/my-website/docs/providers/ollama.md @@ -5,6 +5,12 @@ LiteLLM supports all models from [Ollama](https://github.com/jmorganca/ollama) Open In Colab +:::info + +We recommend using [ollama_chat](#using-ollama-apichat) for better responses. 
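Concretely, the difference is just the model prefix; a minimal sketch, assuming a local Ollama server on the default `http://localhost:11434` with a pulled `llama2` model:

```python
from litellm import completion

messages = [{"role": "user", "content": "Hey, how's it going?"}]

# routes through Ollama's /api/generate endpoint
response = completion(model="ollama/llama2", messages=messages, api_base="http://localhost:11434")

# routes through Ollama's /api/chat endpoint - generally gives better responses
response = completion(model="ollama_chat/llama2", messages=messages, api_base="http://localhost:11434")

print(response)
```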
+ +::: + ## Pre-requisites Ensure you have your ollama server running @@ -177,7 +183,7 @@ On the docker container run the `test.py` file using `python3 test.py` ```python import openai -api_base = f"http://0.0.0.0:8000" # base url for server +api_base = f"http://0.0.0.0:4000" # base url for server openai.api_base = api_base openai.api_key = "temp-key" diff --git a/docs/my-website/docs/providers/openai_compatible.md b/docs/my-website/docs/providers/openai_compatible.md index beaf38cfa..f86544c28 100644 --- a/docs/my-website/docs/providers/openai_compatible.md +++ b/docs/my-website/docs/providers/openai_compatible.md @@ -15,7 +15,7 @@ import os response = litellm.completion( model="openai/mistral, # add `openai/` prefix to model so litellm knows to route to OpenAI api_key="sk-1234", # api key to your openai compatible endpoint - api_base="http://0.0.0.0:8000", # set API Base of your Custom OpenAI Endpoint + api_base="http://0.0.0.0:4000", # set API Base of your Custom OpenAI Endpoint messages=[ { "role": "user", @@ -35,7 +35,7 @@ import os response = litellm.embedding( model="openai/GPT-J", # add `openai/` prefix to model so litellm knows to route to OpenAI api_key="sk-1234", # api key to your openai compatible endpoint - api_base="http://0.0.0.0:8000", # set API Base of your Custom OpenAI Endpoint + api_base="http://0.0.0.0:4000", # set API Base of your Custom OpenAI Endpoint input=["good morning from litellm"] ) print(response) diff --git a/docs/my-website/docs/proxy/caching.md b/docs/my-website/docs/proxy/caching.md index ee4874caf..4f1ce18f3 100644 --- a/docs/my-website/docs/proxy/caching.md +++ b/docs/my-website/docs/proxy/caching.md @@ -145,7 +145,7 @@ $ litellm --config /path/to/config.yaml Send the same request twice: ```shell -curl http://0.0.0.0:8000/v1/chat/completions \ +curl http://0.0.0.0:4000/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ "model": "gpt-3.5-turbo", @@ -153,7 +153,7 @@ curl http://0.0.0.0:8000/v1/chat/completions \ "temperature": 0.7 }' -curl http://0.0.0.0:8000/v1/chat/completions \ +curl http://0.0.0.0:4000/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ "model": "gpt-3.5-turbo", @@ -166,14 +166,14 @@ curl http://0.0.0.0:8000/v1/chat/completions \ Send the same request twice: ```shell -curl --location 'http://0.0.0.0:8000/embeddings' \ +curl --location 'http://0.0.0.0:4000/embeddings' \ --header 'Content-Type: application/json' \ --data ' { "model": "text-embedding-ada-002", "input": ["write a litellm poem"] }' -curl --location 'http://0.0.0.0:8000/embeddings' \ +curl --location 'http://0.0.0.0:4000/embeddings' \ --header 'Content-Type: application/json' \ --data ' { "model": "text-embedding-ada-002", @@ -227,7 +227,7 @@ from openai import OpenAI client = OpenAI( # This is the default and can be omitted api_key=os.environ.get("OPENAI_API_KEY"), - base_url="http://0.0.0.0:8000" + base_url="http://0.0.0.0:4000" ) chat_completion = client.chat.completions.create( @@ -255,7 +255,7 @@ from openai import OpenAI client = OpenAI( # This is the default and can be omitted api_key=os.environ.get("OPENAI_API_KEY"), - base_url="http://0.0.0.0:8000" + base_url="http://0.0.0.0:4000" ) chat_completion = client.chat.completions.create( @@ -281,7 +281,7 @@ from openai import OpenAI client = OpenAI( # This is the default and can be omitted api_key=os.environ.get("OPENAI_API_KEY"), - base_url="http://0.0.0.0:8000" + base_url="http://0.0.0.0:4000" ) chat_completion = client.chat.completions.create( diff --git 
a/docs/my-website/docs/proxy/call_hooks.md b/docs/my-website/docs/proxy/call_hooks.md index b00f4e301..9d4d1112e 100644 --- a/docs/my-website/docs/proxy/call_hooks.md +++ b/docs/my-website/docs/proxy/call_hooks.md @@ -63,7 +63,7 @@ litellm_settings: $ litellm /path/to/config.yaml ``` ```shell -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --data ' { "model": "gpt-3.5-turbo", "messages": [ @@ -162,7 +162,7 @@ litellm_settings: $ litellm /path/to/config.yaml ``` ```shell -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --data ' { "model": "gpt-3.5-turbo", "messages": [ diff --git a/docs/my-website/docs/proxy/cli.md b/docs/my-website/docs/proxy/cli.md index d366f1f6b..28b210b16 100644 --- a/docs/my-website/docs/proxy/cli.md +++ b/docs/my-website/docs/proxy/cli.md @@ -15,7 +15,7 @@ Cli arguments, --host, --port, --num_workers ``` ## --port - - **Default:** `8000` + - **Default:** `4000` - The port to bind the server to. - **Usage:** ```shell diff --git a/docs/my-website/docs/proxy/configs.md b/docs/my-website/docs/proxy/configs.md index 2b3edfadb..68b49502d 100644 --- a/docs/my-website/docs/proxy/configs.md +++ b/docs/my-website/docs/proxy/configs.md @@ -13,7 +13,7 @@ Set model list, `api_base`, `api_key`, `temperature` & proxy server settings (`m | `general_settings` | Server settings, example setting `master_key: sk-my_special_key` | | `environment_variables` | Environment Variables example, `REDIS_HOST`, `REDIS_PORT` | -**Complete List:** Check the Swagger UI docs on `/#/config.yaml` (e.g. http://0.0.0.0:8000/#/config.yaml), for everything you can pass in the config.yaml. +**Complete List:** Check the Swagger UI docs on `/#/config.yaml` (e.g. http://0.0.0.0:4000/#/config.yaml), for everything you can pass in the config.yaml. ## Quick Start @@ -49,13 +49,13 @@ model_list: rpm: 6 - model_name: anthropic-claude litellm_params: - model="bedrock/anthropic.claude-instant-v1" + model: bedrock/anthropic.claude-instant-v1 ### [OPTIONAL] SET AWS REGION ### - aws_region_name="us-east-1" + aws_region_name: us-east-1 - model_name: vllm-models litellm_params: model: openai/facebook/opt-125m # the `openai/` prefix tells litellm it's openai compatible - api_base: http://0.0.0.0:8000 + api_base: http://0.0.0.0:4000 rpm: 1440 model_info: version: 2 @@ -91,7 +91,7 @@ Sends request to model where `model_name=gpt-3.5-turbo` on config.yaml. If multiple with `model_name=gpt-3.5-turbo` does [Load Balancing](https://docs.litellm.ai/docs/proxy/load_balancing) ```shell -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Content-Type: application/json' \ --data ' { "model": "gpt-3.5-turbo", @@ -111,7 +111,7 @@ curl --location 'http://0.0.0.0:8000/chat/completions' \ Sends this request to model where `model_name=bedrock-claude-v1` on config.yaml ```shell -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Content-Type: application/json' \ --data ' { "model": "bedrock-claude-v1", @@ -131,7 +131,7 @@ curl --location 'http://0.0.0.0:8000/chat/completions' \ import openai client = openai.OpenAI( api_key="anything", - base_url="http://0.0.0.0:8000" + base_url="http://0.0.0.0:4000" ) # Sends request to model where `model_name=gpt-3.5-turbo` on config.yaml. 
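A quick way to confirm which `model_name` groups the proxy actually loaded from `config.yaml` is to list them through the same client; a small convenience sketch, assuming the proxy from this quick start is running on port 4000 without a master key:

```python
import openai

client = openai.OpenAI(api_key="anything", base_url="http://0.0.0.0:4000")

# the proxy exposes the model_name values from config.yaml on /models
for model in client.models.list():
    print(model.id)  # e.g. gpt-3.5-turbo, bedrock-claude-v1, vllm-models
```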
@@ -179,7 +179,7 @@ messages = [ # Sends request to model where `model_name=gpt-3.5-turbo` on config.yaml. chat = ChatOpenAI( - openai_api_base="http://0.0.0.0:8000", # set openai base to the proxy + openai_api_base="http://0.0.0.0:4000", # set openai base to the proxy model = "gpt-3.5-turbo", temperature=0.1 ) @@ -189,7 +189,7 @@ print(response) # Sends request to model where `model_name=bedrock-claude-v1` on config.yaml. claude_chat = ChatOpenAI( - openai_api_base="http://0.0.0.0:8000", # set openai base to the proxy + openai_api_base="http://0.0.0.0:4000", # set openai base to the proxy model = "bedrock-claude-v1", temperature=0.1 ) @@ -248,31 +248,46 @@ $ litellm --config /path/to/config.yaml Use this to call multiple instances of the same model and configure things like [routing strategy](../routing.md#advanced). -```yaml -router_settings: - routing_strategy: "latency-based-routing" # routes to the fastest deployment in the group +For optimal performance: +- Set `tpm/rpm` per model deployment. Weighted picks are then based on the established tpm/rpm. +- Select your optimal routing strategy in `router_settings:routing_strategy`. +LiteLLM supports +```python +["simple-shuffle", "least-busy", "usage-based-routing","latency-based-routing"], default="simple-shuffle"` +``` + +When `tpm/rpm` is set + `routing_strategy==simple-shuffle` litellm will use a weighted pick based on set tpm/rpm. **In our load tests setting tpm/rpm for all deployments + `routing_strategy==simple-shuffle` maximized throughput** +- When using multiple LiteLLM Servers / Kubernetes set redis settings `router_settings:redis_host` etc + +```yaml model_list: - model_name: zephyr-beta litellm_params: model: huggingface/HuggingFaceH4/zephyr-7b-beta api_base: http://0.0.0.0:8001 + rpm: 60 # Optional[int]: When rpm/tpm set - litellm uses weighted pick for load balancing. rpm = Rate limit for this deployment: in requests per minute (rpm). + tpm: 1000 # Optional[int]: tpm = Tokens Per Minute - model_name: zephyr-beta litellm_params: model: huggingface/HuggingFaceH4/zephyr-7b-beta api_base: http://0.0.0.0:8002 + rpm: 600 - model_name: zephyr-beta litellm_params: model: huggingface/HuggingFaceH4/zephyr-7b-beta api_base: http://0.0.0.0:8003 + rpm: 60000 - model_name: gpt-3.5-turbo litellm_params: model: gpt-3.5-turbo api_key: + rpm: 200 - model_name: gpt-3.5-turbo-16k litellm_params: model: gpt-3.5-turbo-16k api_key: + rpm: 100 litellm_settings: num_retries: 3 # retry call 3 times on each model_name (e.g. zephyr-beta) @@ -280,8 +295,16 @@ litellm_settings: fallbacks: [{"zephyr-beta": ["gpt-3.5-turbo"]}] # fallback to gpt-3.5-turbo if call fails num_retries context_window_fallbacks: [{"zephyr-beta": ["gpt-3.5-turbo-16k"]}, {"gpt-3.5-turbo": ["gpt-3.5-turbo-16k"]}] # fallback to gpt-3.5-turbo-16k if context window error allowed_fails: 3 # cooldown model if it fails > 1 call in a minute. 
-``` +router_settings: # router_settings are optional + routing_strategy: simple-shuffle # Literal["simple-shuffle", "least-busy", "usage-based-routing","latency-based-routing"], default="simple-shuffle" + model_group_alias: {"gpt-4": "gpt-3.5-turbo"} # all requests with `gpt-4` will be routed to models with `gpt-3.5-turbo` + num_retries: 2 + timeout: 30 # 30 seconds + redis_host: # set this when using multiple litellm proxy deployments, load balancing state stored in redis + redis_password: + redis_port: 1992 +``` ## Set Azure `base_model` for cost tracking @@ -537,7 +560,7 @@ litellm --config config.yaml Sends Request to `bedrock-cohere` ```shell -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Content-Type: application/json' \ --data ' { "model": "bedrock-cohere", diff --git a/docs/my-website/docs/proxy/deploy.md b/docs/my-website/docs/proxy/deploy.md index 8ffc2adf5..175806d27 100644 --- a/docs/my-website/docs/proxy/deploy.md +++ b/docs/my-website/docs/proxy/deploy.md @@ -28,7 +28,7 @@ docker run ghcr.io/berriai/litellm:main-latest -### Run with LiteLLM CLI args +#### Run with LiteLLM CLI args See all supported CLI args [here](https://docs.litellm.ai/docs/proxy/cli): @@ -68,8 +68,87 @@ CMD ["--port", "4000", "--config", "config.yaml", "--detailed_debug", "--run_gun + + +Deploying a config file based litellm instance just requires a simple deployment that loads +the config.yaml file via a config map. Also it would be a good practice to use the env var +declaration for api keys, and attach the env vars with the api key values as an opaque secret. + +```yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: litellm-config-file +data: + config.yaml: | + model_list: + - model_name: gpt-3.5-turbo + litellm_params: + model: azure/gpt-turbo-small-ca + api_base: https://my-endpoint-canada-berri992.openai.azure.com/ + api_key: os.environ/CA_AZURE_OPENAI_API_KEY +--- +apiVersion: v1 +kind: Secret +type: Opaque +metadata: + name: litellm-secrets +data: + CA_AZURE_OPENAI_API_KEY: bWVvd19pbV9hX2NhdA== # your api key in base64 +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: litellm-deployment + labels: + app: litellm +spec: + selector: + matchLabels: + app: litellm + template: + metadata: + labels: + app: litellm + spec: + containers: + - name: litellm + image: ghcr.io/berriai/litellm:main-latest # it is recommended to fix a version generally + ports: + - containerPort: 4000 + volumeMounts: + - name: config-volume + mountPath: /app/proxy_server_config.yaml + subPath: config.yaml + envFrom: + - secretRef: + name: litellm-secrets + volumes: + - name: config-volume + configMap: + name: litellm-config-file +``` + +:::info +To avoid issues with predictability, difficulties in rollback, and inconsistent environments, use versioning or SHA digests (for example, `litellm:main-v1.30.3` or `litellm@sha256:12345abcdef...`) instead of `litellm:main-latest`. +::: + + + +**That's it ! 
That's the quick start to deploy litellm** + +## Options to deploy LiteLLM + +| Docs | When to Use | +| --- | --- | +| [Quick Start](#quick-start) | call 100+ LLMs + Load Balancing | +| [Deploy with Database](#deploy-with-database) | + use Virtual Keys + Track Spend | +| [LiteLLM container + Redis](#litellm-container--redis) | + load balance across multiple litellm containers | +| [LiteLLM Database container + PostgresDB + Redis](#litellm-database-container--postgresdb--redis) | + use Virtual Keys + Track Spend + load balance across multiple litellm containers | + + ## Deploy with Database We maintain a [seperate Dockerfile](https://github.com/BerriAI/litellm/pkgs/container/litellm-database) for reducing build time when running LiteLLM proxy with a connected Postgres Database @@ -93,7 +172,7 @@ Your OpenAI proxy server is now running on `http://0.0.0.0:4000`. -### Step 1. Create deployment.yaml +#### Step 1. Create deployment.yaml ```yaml apiVersion: apps/v1 @@ -122,7 +201,7 @@ Your OpenAI proxy server is now running on `http://0.0.0.0:4000`. kubectl apply -f /path/to/deployment.yaml ``` -### Step 2. Create service.yaml +#### Step 2. Create service.yaml ```yaml apiVersion: v1 @@ -143,7 +222,7 @@ spec: kubectl apply -f /path/to/service.yaml ``` -### Step 3. Start server +#### Step 3. Start server ``` kubectl port-forward service/litellm-service 4000:4000 @@ -154,13 +233,13 @@ Your OpenAI proxy server is now running on `http://0.0.0.0:4000`. -### Step 1. Clone the repository +#### Step 1. Clone the repository ```bash git clone https://github.com/BerriAI/litellm.git ``` -### Step 2. Deploy with Helm +#### Step 2. Deploy with Helm ```bash helm install \ @@ -169,20 +248,87 @@ helm install \ deploy/charts/litellm ``` -### Step 3. Expose the service to localhost +#### Step 3. Expose the service to localhost ```bash kubectl \ port-forward \ service/mydeploy-litellm \ - 8000:8000 + 4000:4000 ``` -Your OpenAI proxy server is now running on `http://127.0.0.1:8000`. +Your OpenAI proxy server is now running on `http://127.0.0.1:4000`. +## LiteLLM container + Redis +Use Redis when you need litellm to load balance across multiple litellm containers + +The only change required is setting Redis on your `config.yaml` +LiteLLM Proxy supports sharing rpm/tpm shared across multiple litellm instances, pass `redis_host`, `redis_password` and `redis_port` to enable this. (LiteLLM will use Redis to track rpm/tpm usage ) + +```yaml +model_list: + - model_name: gpt-3.5-turbo + litellm_params: + model: azure/ + api_base: + api_key: + rpm: 6 # Rate limit for this deployment: in requests per minute (rpm) + - model_name: gpt-3.5-turbo + litellm_params: + model: azure/gpt-turbo-small-ca + api_base: https://my-endpoint-canada-berri992.openai.azure.com/ + api_key: + rpm: 6 +router_settings: + redis_host: + redis_password: + redis_port: 1992 +``` + +Start docker container with config + +```shell +docker run ghcr.io/berriai/litellm:main-latest --config your_config.yaml +``` + +## LiteLLM Database container + PostgresDB + Redis + +The only change required is setting Redis on your `config.yaml` +LiteLLM Proxy supports sharing rpm/tpm shared across multiple litellm instances, pass `redis_host`, `redis_password` and `redis_port` to enable this. 
(LiteLLM will use Redis to track rpm/tpm usage ) + + +```yaml +model_list: + - model_name: gpt-3.5-turbo + litellm_params: + model: azure/ + api_base: + api_key: + rpm: 6 # Rate limit for this deployment: in requests per minute (rpm) + - model_name: gpt-3.5-turbo + litellm_params: + model: azure/gpt-turbo-small-ca + api_base: https://my-endpoint-canada-berri992.openai.azure.com/ + api_key: + rpm: 6 +router_settings: + redis_host: + redis_password: + redis_port: 1992 +``` + +Start `litellm-database`docker container with config + +```shell +docker run --name litellm-proxy \ +-e DATABASE_URL=postgresql://:@:/ \ +-p 4000:4000 \ +ghcr.io/berriai/litellm-database:main-latest --config your_config.yaml +``` + ## Best Practices for Deploying to Production ### 1. Switch of debug logs in production don't use [`--detailed-debug`, `--debug`](https://docs.litellm.ai/docs/proxy/debugging#detailed-debug) or `litellm.set_verbose=True`. We found using debug logs can add 5-10% latency per LLM API call @@ -218,8 +364,49 @@ Provide an ssl certificate when starting litellm proxy server ## Platform-specific Guide - + + + +### AWS Cloud Formation Stack +LiteLLM AWS Cloudformation Stack - **Get the best LiteLLM AutoScaling Policy and Provision the DB for LiteLLM Proxy** + +This will provision: +- LiteLLMServer - EC2 Instance +- LiteLLMServerAutoScalingGroup +- LiteLLMServerScalingPolicy (autoscaling policy) +- LiteLLMDB - RDS::DBInstance + +#### Using AWS Cloud Formation Stack +**LiteLLM Cloudformation stack is located [here - litellm.yaml](https://github.com/BerriAI/litellm/blob/main/enterprise/cloudformation_stack/litellm.yaml)** + +#### 1. Create the CloudFormation Stack: +In the AWS Management Console, navigate to the CloudFormation service, and click on "Create Stack." + +On the "Create Stack" page, select "Upload a template file" and choose the litellm.yaml file + +Now monitor the stack was created successfully. + +#### 2. Get the Database URL: +Once the stack is created, get the DatabaseURL of the Database resource, copy this value + +#### 3. Connect to the EC2 Instance and deploy litellm on the EC2 container +From the EC2 console, connect to the instance created by the stack (e.g., using SSH). + +Run the following command, replacing with the value you copied in step 2 + +```shell +docker run --name litellm-proxy \ + -e DATABASE_URL= \ + -p 4000:4000 \ + ghcr.io/berriai/litellm-database:main-latest +``` + +#### 4. Access the Application: + +Once the container is running, you can access the application by going to `http://:4000` in your browser. + + ### Deploy on Google Cloud Run @@ -286,11 +473,11 @@ services: target: runtime image: ghcr.io/berriai/litellm:main-latest ports: - - "8000:8000" # Map the container port to the host, change the host port if necessary + - "4000:4000" # Map the container port to the host, change the host port if necessary volumes: - ./litellm-config.yaml:/app/config.yaml # Mount the local configuration file # You can change the port or number of workers as per your requirements or pass any new supported CLI augument. 
Make sure the port passed here matches with the container port defined above in `ports` value - command: [ "--config", "/app/config.yaml", "--port", "8000", "--num_workers", "8" ] + command: [ "--config", "/app/config.yaml", "--port", "4000", "--num_workers", "8" ] # ...rest of your docker-compose config if any ``` @@ -308,18 +495,4 @@ Run the command `docker-compose up` or `docker compose up` as per your docker in > Use `-d` flag to run the container in detached mode (background) e.g. `docker compose up -d` -Your LiteLLM container should be running now on the defined port e.g. `8000`. - - - -## LiteLLM Proxy Performance - -LiteLLM proxy has been load tested to handle 1500 req/s. - -### Throughput - 30% Increase -LiteLLM proxy + Load Balancer gives **30% increase** in throughput compared to Raw OpenAI API - - -### Latency Added - 0.00325 seconds -LiteLLM proxy adds **0.00325 seconds** latency as compared to using the Raw OpenAI API - +Your LiteLLM container should be running now on the defined port e.g. `4000`. diff --git a/docs/my-website/docs/proxy/embedding.md b/docs/my-website/docs/proxy/embedding.md index 0f3a01a90..2adaaa247 100644 --- a/docs/my-website/docs/proxy/embedding.md +++ b/docs/my-website/docs/proxy/embedding.md @@ -38,7 +38,7 @@ $ litellm --config /path/to/config.yaml 3. Test the embedding call ```shell -curl --location 'http://0.0.0.0:8000/v1/embeddings' \ +curl --location 'http://0.0.0.0:4000/v1/embeddings' \ --header 'Authorization: Bearer sk-1234' \ --header 'Content-Type: application/json' \ --data '{ diff --git a/docs/my-website/docs/proxy/enterprise.md b/docs/my-website/docs/proxy/enterprise.md index a4f3ea7b1..93786eff4 100644 --- a/docs/my-website/docs/proxy/enterprise.md +++ b/docs/my-website/docs/proxy/enterprise.md @@ -12,14 +12,16 @@ Features here are behind a commercial license in our `/enterprise` folder. [**Se ::: Features: -- [ ] Content Moderation with LlamaGuard -- [ ] Content Moderation with Google Text Moderations -- [ ] Content Moderation with LLM Guard -- [ ] Reject calls from Blocked User list -- [ ] Reject calls (incoming / outgoing) with Banned Keywords (e.g. competitors) -- [ ] Tracking Spend for Custom Tags +- ✅ Content Moderation with LlamaGuard +- ✅ Content Moderation with Google Text Moderations +- ✅ Content Moderation with LLM Guard +- ✅ Reject calls from Blocked User list +- ✅ Reject calls (incoming / outgoing) with Banned Keywords (e.g. competitors) +- ✅ Don't log/store specific requests (eg confidential LLM requests) +- ✅ Tracking Spend for Custom Tags -## Content Moderation with LlamaGuard +## Content Moderation +### Content Moderation with LlamaGuard Currently works with Sagemaker's LlamaGuard endpoint. 
@@ -39,7 +41,7 @@ os.environ["AWS_SECRET_ACCESS_KEY"] = "" os.environ["AWS_REGION_NAME"] = "" ``` -### Customize LlamaGuard prompt +#### Customize LlamaGuard prompt To modify the unsafe categories llama guard evaluates against, just create your own version of [this category list](https://github.com/BerriAI/litellm/blob/main/litellm/proxy/llamaguard_prompt.txt) @@ -51,12 +53,12 @@ callbacks: ["llamaguard_moderations"] llamaguard_unsafe_content_categories: /path/to/llamaguard_prompt.txt ``` -## Content Moderation with LLM Guard +### Content Moderation with LLM Guard Set the LLM Guard API Base in your environment ```env -LLM_GUARD_API_BASE = "http://0.0.0.0:8000" +LLM_GUARD_API_BASE = "http://0.0.0.0:4000" ``` Add `llmguard_moderations` as a callback @@ -78,7 +80,7 @@ Expected results: LLM Guard: Received response - {"sanitized_prompt": "hello world", "is_valid": true, "scanners": { "Regex": 0.0 }} ``` -## Content Moderation with Google Text Moderation +### Content Moderation with Google Text Moderation Requires your GOOGLE_APPLICATION_CREDENTIALS to be set in your .env (same as VertexAI). @@ -89,7 +91,7 @@ litellm_settings: callbacks: ["google_text_moderation"] ``` -### Set custom confidence thresholds +#### Set custom confidence thresholds Google Moderations checks the test against several categories. [Source](https://cloud.google.com/natural-language/docs/moderating-text#safety_attribute_confidence_scores) @@ -133,6 +135,33 @@ Here are the category specific values: | "legal" | legal_threshold: 0.1 | +## Incognito Requests - Don't log anything + +When `no-log=True`, the request will **not be logged on any callbacks** and there will be **no server logs on litellm** + +```python +import openai +client = openai.OpenAI( + api_key="anything", # proxy api-key + base_url="http://0.0.0.0:4000" # litellm proxy +) + +response = client.chat.completions.create( + model="gpt-3.5-turbo", + messages = [ + { + "role": "user", + "content": "this is a test request, write a short poem" + } + ], + extra_body={ + "no-log": True + } +) + +print(response) +``` + ## Enable Blocked User Lists If any call is made to proxy with this user id, it'll be rejected - use this if you want to let users opt-out of ai features @@ -146,7 +175,7 @@ litellm_settings: ### How to test ```bash -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Content-Type: application/json' \ --data ' { "model": "gpt-3.5-turbo", @@ -173,7 +202,7 @@ curl --location 'http://0.0.0.0:8000/chat/completions' \ **Block all calls for a user id** ``` -curl -X POST "http://0.0.0.0:8000/user/block" \ +curl -X POST "http://0.0.0.0:4000/user/block" \ -H "Authorization: Bearer sk-1234" \ -D '{ "user_ids": [, ...] @@ -183,7 +212,7 @@ curl -X POST "http://0.0.0.0:8000/user/block" \ **Unblock calls for a user id** ``` -curl -X POST "http://0.0.0.0:8000/user/unblock" \ +curl -X POST "http://0.0.0.0:4000/user/unblock" \ -H "Authorization: Bearer sk-1234" \ -D '{ "user_ids": [, ...] 
@@ -201,7 +230,7 @@ litellm_settings: ### Test this ```bash -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Content-Type: application/json' \ --data ' { "model": "gpt-3.5-turbo", @@ -234,7 +263,7 @@ Set `extra_body={"metadata": { }}` to `metadata` you want to pass import openai client = openai.OpenAI( api_key="anything", - base_url="http://0.0.0.0:8000" + base_url="http://0.0.0.0:4000" ) # request sent to model set on litellm proxy, `litellm --model` @@ -262,7 +291,7 @@ print(response) Pass `metadata` as part of the request body ```shell -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Content-Type: application/json' \ --data '{ "model": "gpt-3.5-turbo", @@ -288,7 +317,7 @@ from langchain.prompts.chat import ( from langchain.schema import HumanMessage, SystemMessage chat = ChatOpenAI( - openai_api_base="http://0.0.0.0:8000", + openai_api_base="http://0.0.0.0:4000", model = "gpt-3.5-turbo", temperature=0.1, extra_body={ diff --git a/docs/my-website/docs/proxy/health.md b/docs/my-website/docs/proxy/health.md index f0b797329..03dd91731 100644 --- a/docs/my-website/docs/proxy/health.md +++ b/docs/my-website/docs/proxy/health.md @@ -12,10 +12,10 @@ The proxy exposes: #### Request Make a GET Request to `/health` on the proxy ```shell -curl --location 'http://0.0.0.0:8000/health' -H "Authorization: Bearer sk-1234" +curl --location 'http://0.0.0.0:4000/health' -H "Authorization: Bearer sk-1234" ``` -You can also run `litellm -health` it makes a `get` request to `http://0.0.0.0:8000/health` for you +You can also run `litellm -health` it makes a `get` request to `http://0.0.0.0:4000/health` for you ``` litellm --health ``` @@ -60,7 +60,7 @@ $ litellm /path/to/config.yaml 3. Query health endpoint: ``` -curl --location 'http://0.0.0.0:8000/health' +curl --location 'http://0.0.0.0:4000/health' ``` ### Embedding Models @@ -119,7 +119,7 @@ Unprotected endpoint for checking if proxy is ready to accept requests Example Request: ```bash -curl --location 'http://0.0.0.0:8000/health/readiness' +curl --location 'http://0.0.0.0:4000/health/readiness' ``` Example Response: @@ -153,7 +153,7 @@ Example Request: ``` curl -X 'GET' \ - 'http://0.0.0.0:8000/health/liveliness' \ + 'http://0.0.0.0:4000/health/liveliness' \ -H 'accept: application/json' ``` diff --git a/docs/my-website/docs/proxy/load_balancing.md b/docs/my-website/docs/proxy/load_balancing.md index ad5e91203..691592cb6 100644 --- a/docs/my-website/docs/proxy/load_balancing.md +++ b/docs/my-website/docs/proxy/load_balancing.md @@ -45,7 +45,7 @@ $ litellm --config /path/to/config.yaml ### Step 3: Use proxy - Call a model group [Load Balancing] Curl Command ```shell -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Content-Type: application/json' \ --data ' { "model": "gpt-3.5-turbo", @@ -65,7 +65,7 @@ If you want to call a specific model defined in the `config.yaml`, you can call In this example it will call `azure/gpt-turbo-small-ca`. 
Defined in the config on Step 1 ```bash -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Content-Type: application/json' \ --data ' { "model": "azure/gpt-turbo-small-ca", diff --git a/docs/my-website/docs/proxy/logging.md b/docs/my-website/docs/proxy/logging.md index bf4216c0e..589199a07 100644 --- a/docs/my-website/docs/proxy/logging.md +++ b/docs/my-website/docs/proxy/logging.md @@ -150,7 +150,7 @@ litellm --config proxy_config.yaml ``` ```shell -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Authorization: Bearer sk-1234' \ --data ' { "model": "gpt-3.5-turbo", @@ -174,7 +174,7 @@ On Success Usage: {'completion_tokens': 10, 'prompt_tokens': 11, 'total_tokens': 21}, Cost: 3.65e-05, Response: {'id': 'chatcmpl-8S8avKJ1aVBg941y5xzGMSKrYCMvN', 'choices': [{'finish_reason': 'stop', 'index': 0, 'message': {'content': 'Good morning! How can I assist you today?', 'role': 'assistant'}}], 'created': 1701716913, 'model': 'gpt-3.5-turbo-0613', 'object': 'chat.completion', 'system_fingerprint': None, 'usage': {'completion_tokens': 10, 'prompt_tokens': 11, 'total_tokens': 21}} - Proxy Metadata: {'user_api_key': None, 'headers': Headers({'host': '0.0.0.0:8000', 'user-agent': 'curl/7.88.1', 'accept': '*/*', 'authorization': 'Bearer sk-1234', 'content-length': '199', 'content-type': 'application/x-www-form-urlencoded'}), 'model_group': 'gpt-3.5-turbo', 'deployment': 'gpt-3.5-turbo-ModelID-gpt-3.5-turbo'} + Proxy Metadata: {'user_api_key': None, 'headers': Headers({'host': '0.0.0.0:4000', 'user-agent': 'curl/7.88.1', 'accept': '*/*', 'authorization': 'Bearer sk-1234', 'content-length': '199', 'content-type': 'application/x-www-form-urlencoded'}), 'model_group': 'gpt-3.5-turbo', 'deployment': 'gpt-3.5-turbo-ModelID-gpt-3.5-turbo'} ``` #### Logging Proxy Request Object, Header, Url @@ -374,7 +374,7 @@ async def log_event(request: Request): if __name__ == "__main__": import uvicorn - uvicorn.run(app, host="127.0.0.1", port=8000) + uvicorn.run(app, host="127.0.0.1", port=4000) ``` @@ -383,7 +383,7 @@ if __name__ == "__main__": #### Step 2. Set your `GENERIC_LOGGER_ENDPOINT` to the endpoint + route we should send callback logs to ```shell -os.environ["GENERIC_LOGGER_ENDPOINT"] = "http://localhost:8000/log-event" +os.environ["GENERIC_LOGGER_ENDPOINT"] = "http://localhost:4000/log-event" ``` #### Step 3. 
Create a `config.yaml` file and set `litellm_settings`: `success_callback` = ["generic"] @@ -445,7 +445,7 @@ Expected output on Langfuse Pass `metadata` as part of the request body ```shell -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Content-Type: application/json' \ --data '{ "model": "gpt-3.5-turbo", @@ -472,7 +472,7 @@ Set `extra_body={"metadata": { }}` to `metadata` you want to pass import openai client = openai.OpenAI( api_key="anything", - base_url="http://0.0.0.0:8000" + base_url="http://0.0.0.0:4000" ) # request sent to model set on litellm proxy, `litellm --model` @@ -509,7 +509,7 @@ from langchain.prompts.chat import ( from langchain.schema import HumanMessage, SystemMessage chat = ChatOpenAI( - openai_api_base="http://0.0.0.0:8000", + openai_api_base="http://0.0.0.0:4000", model = "gpt-3.5-turbo", temperature=0.1, extra_body={ @@ -663,7 +663,7 @@ litellm --config config.yaml --debug Test Request ```shell -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Content-Type: application/json' \ --data ' { "model": "Azure OpenAI GPT-4 East", @@ -698,7 +698,7 @@ litellm_settings: Now, when you [generate keys](./virtual_keys.md) for this team-id ```bash -curl -X POST 'http://0.0.0.0:8000/key/generate' \ +curl -X POST 'http://0.0.0.0:4000/key/generate' \ -H 'Authorization: Bearer sk-1234' \ -H 'Content-Type: application/json' \ -D '{"team_id": "ishaans-secret-project"}' @@ -742,7 +742,7 @@ litellm --config config.yaml --debug Test Request ```shell -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Content-Type: application/json' \ --data ' { "model": "Azure OpenAI GPT-4 East", @@ -903,7 +903,7 @@ litellm --config config.yaml --debug Test Request ``` -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Content-Type: application/json' \ --data ' { "model": "gpt-3.5-turbo", @@ -947,7 +947,7 @@ litellm --config config.yaml --debug Test Request ``` -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Content-Type: application/json' \ --data ' { "model": "gpt-3.5-turbo", diff --git a/docs/my-website/docs/proxy/model_management.md b/docs/my-website/docs/proxy/model_management.md index 8160e2aa7..0a236185f 100644 --- a/docs/my-website/docs/proxy/model_management.md +++ b/docs/my-website/docs/proxy/model_management.md @@ -24,7 +24,7 @@ Retrieve detailed information about each model listed in the `/models` endpoint, ```bash -curl -X GET "http://0.0.0.0:8000/model/info" \ +curl -X GET "http://0.0.0.0:4000/model/info" \ -H "accept: application/json" \ ``` @@ -42,7 +42,7 @@ Add a new model to the list in the `config.yaml` by providing the model paramete ```bash -curl -X POST "http://0.0.0.0:8000/model/new" \ +curl -X POST "http://0.0.0.0:4000/model/new" \ -H "accept: application/json" \ -H "Content-Type: application/json" \ -d '{ "model_name": "azure-gpt-turbo", "litellm_params": {"model": "azure/gpt-3.5-turbo", "api_key": "os.environ/AZURE_API_KEY", "api_base": "my-azure-api-base"} }' diff --git a/docs/my-website/docs/proxy/pii_masking.md b/docs/my-website/docs/proxy/pii_masking.md index 0d559d910..a95a6d771 100644 --- a/docs/my-website/docs/proxy/pii_masking.md +++ b/docs/my-website/docs/proxy/pii_masking.md @@ -96,7 +96,7 @@ 
Turn off PII masking for a given key. Do this by setting `permissions: {"pii": false}`, when generating a key. ```shell -curl --location 'http://0.0.0.0:8000/key/generate' \ +curl --location 'http://0.0.0.0:4000/key/generate' \ --header 'Authorization: Bearer sk-1234' \ --header 'Content-Type: application/json' \ --data '{ @@ -119,7 +119,7 @@ The proxy support 2 request-level PII controls: Set `allow_pii_controls` to true for a given key. This will allow the user to set request-level PII controls. ```bash -curl --location 'http://0.0.0.0:8000/key/generate' \ +curl --location 'http://0.0.0.0:4000/key/generate' \ --header 'Authorization: Bearer my-master-key' \ --header 'Content-Type: application/json' \ --data '{ @@ -136,7 +136,7 @@ from openai import OpenAI client = OpenAI( # This is the default and can be omitted api_key=os.environ.get("OPENAI_API_KEY"), - base_url="http://0.0.0.0:8000" + base_url="http://0.0.0.0:4000" ) chat_completion = client.chat.completions.create( diff --git a/docs/my-website/docs/proxy/quick_start.md b/docs/my-website/docs/proxy/quick_start.md index 4f508ee59..d44970348 100644 --- a/docs/my-website/docs/proxy/quick_start.md +++ b/docs/my-website/docs/proxy/quick_start.md @@ -21,7 +21,7 @@ Run the following command to start the litellm proxy ```shell $ litellm --model huggingface/bigcode/starcoder -#INFO: Proxy running on http://0.0.0.0:8000 +#INFO: Proxy running on http://0.0.0.0:4000 ``` ### Test @@ -250,7 +250,7 @@ litellm --config your_config.yaml ```shell -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Content-Type: application/json' \ --data ' { "model": "gpt-3.5-turbo", @@ -270,7 +270,7 @@ curl --location 'http://0.0.0.0:8000/chat/completions' \ import openai client = openai.OpenAI( api_key="anything", - base_url="http://0.0.0.0:8000" + base_url="http://0.0.0.0:4000" ) # request sent to model set on litellm proxy, `litellm --model` @@ -297,7 +297,7 @@ from langchain.prompts.chat import ( from langchain.schema import HumanMessage, SystemMessage chat = ChatOpenAI( - openai_api_base="http://0.0.0.0:8000", # set openai_api_base to the LiteLLM Proxy + openai_api_base="http://0.0.0.0:4000", # set openai_api_base to the LiteLLM Proxy model = "gpt-3.5-turbo", temperature=0.1 ) @@ -321,7 +321,7 @@ print(response) ```python from langchain.embeddings import OpenAIEmbeddings -embeddings = OpenAIEmbeddings(model="sagemaker-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key") +embeddings = OpenAIEmbeddings(model="sagemaker-embeddings", openai_api_base="http://0.0.0.0:4000", openai_api_key="temp-key") text = "This is a test document." @@ -331,7 +331,7 @@ query_result = embeddings.embed_query(text) print(f"SAGEMAKER EMBEDDINGS") print(query_result[:5]) -embeddings = OpenAIEmbeddings(model="bedrock-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key") +embeddings = OpenAIEmbeddings(model="bedrock-embeddings", openai_api_base="http://0.0.0.0:4000", openai_api_key="temp-key") text = "This is a test document." @@ -340,7 +340,7 @@ query_result = embeddings.embed_query(text) print(f"BEDROCK EMBEDDINGS") print(query_result[:5]) -embeddings = OpenAIEmbeddings(model="bedrock-titan-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key") +embeddings = OpenAIEmbeddings(model="bedrock-titan-embeddings", openai_api_base="http://0.0.0.0:4000", openai_api_key="temp-key") text = "This is a test document." 
@@ -407,11 +407,11 @@ services: litellm: image: ghcr.io/berriai/litellm:main ports: - - "8000:8000" # Map the container port to the host, change the host port if necessary + - "4000:4000" # Map the container port to the host, change the host port if necessary volumes: - ./litellm-config.yaml:/app/config.yaml # Mount the local configuration file # You can change the port or number of workers as per your requirements or pass any new supported CLI augument. Make sure the port passed here matches with the container port defined above in `ports` value - command: [ "--config", "/app/config.yaml", "--port", "8000", "--num_workers", "8" ] + command: [ "--config", "/app/config.yaml", "--port", "4000", "--num_workers", "8" ] # ...rest of your docker-compose config if any ``` @@ -429,7 +429,7 @@ Run the command `docker-compose up` or `docker compose up` as per your docker in > Use `-d` flag to run the container in detached mode (background) e.g. `docker compose up -d` -Your LiteLLM container should be running now on the defined port e.g. `8000`. +Your LiteLLM container should be running now on the defined port e.g. `4000`. ## Using with OpenAI compatible projects @@ -442,7 +442,7 @@ Set `base_url` to the LiteLLM Proxy server import openai client = openai.OpenAI( api_key="anything", - base_url="http://0.0.0.0:8000" + base_url="http://0.0.0.0:4000" ) # request sent to model set on litellm proxy, `litellm --model` @@ -463,7 +463,7 @@ print(response) ```shell litellm --model gpt-3.5-turbo -#INFO: Proxy running on http://0.0.0.0:8000 +#INFO: Proxy running on http://0.0.0.0:4000 ``` #### 1. Clone the repo @@ -474,9 +474,9 @@ git clone https://github.com/danny-avila/LibreChat.git #### 2. Modify Librechat's `docker-compose.yml` -LiteLLM Proxy is running on port `8000`, set `8000` as the proxy below +LiteLLM Proxy is running on port `4000`, set `4000` as the proxy below ```yaml -OPENAI_REVERSE_PROXY=http://host.docker.internal:8000/v1/chat/completions +OPENAI_REVERSE_PROXY=http://host.docker.internal:4000/v1/chat/completions ``` #### 3. 
Save fake OpenAI key in Librechat's `.env` @@ -502,7 +502,7 @@ In the [config.py](https://continue.dev/docs/reference/Models/openai) set this a api_key="IGNORED", model="fake-model-name", context_length=2048, # customize if needed for your model - api_base="http://localhost:8000" # your proxy server url + api_base="http://localhost:4000" # your proxy server url ), ``` @@ -514,7 +514,7 @@ Credits [@vividfog](https://github.com/jmorganca/ollama/issues/305#issuecomment- ```shell $ pip install aider -$ aider --openai-api-base http://0.0.0.0:8000 --openai-api-key fake-key +$ aider --openai-api-base http://0.0.0.0:4000 --openai-api-key fake-key ``` @@ -528,7 +528,7 @@ from autogen import AssistantAgent, UserProxyAgent, oai config_list=[ { "model": "my-fake-model", - "api_base": "http://localhost:8000", #litellm compatible endpoint + "api_base": "http://localhost:4000", #litellm compatible endpoint "api_type": "open_ai", "api_key": "NULL", # just a placeholder } @@ -566,7 +566,7 @@ import guidance # set api_base to your proxy # set api_key to anything -gpt4 = guidance.llms.OpenAI("gpt-4", api_base="http://0.0.0.0:8000", api_key="anything") +gpt4 = guidance.llms.OpenAI("gpt-4", api_base="http://0.0.0.0:4000", api_key="anything") experts = guidance(''' {{#system~}} diff --git a/docs/my-website/docs/proxy/reliability.md b/docs/my-website/docs/proxy/reliability.md index f241e4ec0..7527a3d5b 100644 --- a/docs/my-website/docs/proxy/reliability.md +++ b/docs/my-website/docs/proxy/reliability.md @@ -45,7 +45,7 @@ litellm_settings: **Set dynamically** ```bash -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Content-Type: application/json' \ --data ' { "model": "zephyr-beta", @@ -101,7 +101,7 @@ LiteLLM Proxy supports setting a `timeout` per request ```shell -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Content-Type: application/json' \ --data-raw '{ "model": "gpt-3.5-turbo", @@ -121,7 +121,7 @@ import openai client = openai.OpenAI( api_key="anything", - base_url="http://0.0.0.0:8000" + base_url="http://0.0.0.0:4000" ) response = client.chat.completions.create( diff --git a/docs/my-website/docs/proxy/rules.md b/docs/my-website/docs/proxy/rules.md index 415607b61..60e990d91 100644 --- a/docs/my-website/docs/proxy/rules.md +++ b/docs/my-website/docs/proxy/rules.md @@ -30,7 +30,7 @@ $ litellm /path/to/config.yaml ``` ```bash -curl --location 'http://0.0.0.0:8000/v1/chat/completions' \ +curl --location 'http://0.0.0.0:4000/v1/chat/completions' \ --header 'Content-Type: application/json' \ --header 'Authorization: Bearer sk-1234' \ --data '{ diff --git a/docs/my-website/docs/proxy/streaming_logging.md b/docs/my-website/docs/proxy/streaming_logging.md index 6bc5882d1..3fa896467 100644 --- a/docs/my-website/docs/proxy/streaming_logging.md +++ b/docs/my-website/docs/proxy/streaming_logging.md @@ -65,7 +65,7 @@ litellm --config proxy_config.yaml ``` ```shell -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Authorization: Bearer sk-1234' \ --data ' { "model": "gpt-3.5-turbo", diff --git a/docs/my-website/docs/proxy/ui.md b/docs/my-website/docs/proxy/ui.md index 188a2a2eb..cca9d4434 100644 --- a/docs/my-website/docs/proxy/ui.md +++ b/docs/my-website/docs/proxy/ui.md @@ -28,12 +28,12 @@ Follow [setup](./virtual_keys.md#setup) ```bash litellm --config /path/to/config.yaml -#INFO: Proxy 
running on http://0.0.0.0:8000 +#INFO: Proxy running on http://0.0.0.0:4000 ``` ### 2. Go to UI ```bash -http://0.0.0.0:8000/ui # /ui +http://0.0.0.0:4000/ui # /ui ``` diff --git a/docs/my-website/docs/proxy/user_keys.md b/docs/my-website/docs/proxy/user_keys.md index fcccffaa0..d86d3ae09 100644 --- a/docs/my-website/docs/proxy/user_keys.md +++ b/docs/my-website/docs/proxy/user_keys.md @@ -26,7 +26,7 @@ Set `extra_body={"metadata": { }}` to `metadata` you want to pass import openai client = openai.OpenAI( api_key="anything", - base_url="http://0.0.0.0:8000" + base_url="http://0.0.0.0:4000" ) # request sent to model set on litellm proxy, `litellm --model` @@ -92,7 +92,7 @@ print(response) Pass `metadata` as part of the request body ```shell -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Content-Type: application/json' \ --data '{ "model": "gpt-3.5-turbo", @@ -123,7 +123,7 @@ from langchain.prompts.chat import ( from langchain.schema import HumanMessage, SystemMessage chat = ChatOpenAI( - openai_api_base="http://0.0.0.0:8000", + openai_api_base="http://0.0.0.0:4000", model = "gpt-3.5-turbo", temperature=0.1, extra_body={ @@ -195,7 +195,7 @@ from openai import OpenAI # set base_url to your proxy server # set api_key to send to proxy server -client = OpenAI(api_key="", base_url="http://0.0.0.0:8000") +client = OpenAI(api_key="", base_url="http://0.0.0.0:4000") response = client.embeddings.create( input=["hello from litellm"], @@ -209,7 +209,7 @@ print(response) ```shell -curl --location 'http://0.0.0.0:8000/embeddings' \ +curl --location 'http://0.0.0.0:4000/embeddings' \ --header 'Content-Type: application/json' \ --data ' { "model": "text-embedding-ada-002", @@ -223,7 +223,7 @@ curl --location 'http://0.0.0.0:8000/embeddings' \ ```python from langchain.embeddings import OpenAIEmbeddings -embeddings = OpenAIEmbeddings(model="sagemaker-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key") +embeddings = OpenAIEmbeddings(model="sagemaker-embeddings", openai_api_base="http://0.0.0.0:4000", openai_api_key="temp-key") text = "This is a test document." @@ -233,7 +233,7 @@ query_result = embeddings.embed_query(text) print(f"SAGEMAKER EMBEDDINGS") print(query_result[:5]) -embeddings = OpenAIEmbeddings(model="bedrock-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key") +embeddings = OpenAIEmbeddings(model="bedrock-embeddings", openai_api_base="http://0.0.0.0:4000", openai_api_key="temp-key") text = "This is a test document." @@ -242,7 +242,7 @@ query_result = embeddings.embed_query(text) print(f"BEDROCK EMBEDDINGS") print(query_result[:5]) -embeddings = OpenAIEmbeddings(model="bedrock-titan-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key") +embeddings = OpenAIEmbeddings(model="bedrock-titan-embeddings", openai_api_base="http://0.0.0.0:4000", openai_api_key="temp-key") text = "This is a test document." 
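The `/key/generate` and `/key/info` curl examples in the following hunks translate directly to Python. Below is a hedged sketch of that flow, assuming a proxy on `http://0.0.0.0:4000` with master key `sk-1234`; the payload fields mirror the curl bodies shown in this diff, and the budget and metadata values are illustrative only.

```python
import requests

PROXY = "http://0.0.0.0:4000"  # new default port introduced in this PR
HEADERS = {
    "Authorization": "Bearer sk-1234",  # assumed master key
    "Content-Type": "application/json",
}

# Generate a scoped, budgeted key (same payload shape as the curl examples).
key_resp = requests.post(
    f"{PROXY}/key/generate",
    headers=HEADERS,
    json={
        "models": ["gpt-3.5-turbo", "gpt-4", "claude-2"],
        "duration": "20m",
        "max_budget": 10,  # USD
        "metadata": {"user": "ishaan@berri.ai"},
    },
).json()
new_key = key_resp["key"]

# Inspect spend and limits for that key via /key/info.
info = requests.get(f"{PROXY}/key/info", params={"key": new_key}, headers=HEADERS).json()
print(info)
```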
@@ -296,7 +296,7 @@ from openai import OpenAI # set base_url to your proxy server # set api_key to send to proxy server -client = OpenAI(api_key="", base_url="http://0.0.0.0:8000") +client = OpenAI(api_key="", base_url="http://0.0.0.0:4000") response = client.moderations.create( input="hello from litellm", @@ -310,7 +310,7 @@ print(response) ```shell -curl --location 'http://0.0.0.0:8000/moderations' \ +curl --location 'http://0.0.0.0:4000/moderations' \ --header 'Content-Type: application/json' \ --header 'Authorization: Bearer sk-1234' \ --data '{"input": "Sample text goes here", "model": "text-moderation-stable"}' @@ -421,7 +421,7 @@ user_config = { import openai client = openai.OpenAI( api_key="sk-1234", - base_url="http://0.0.0.0:8000" + base_url="http://0.0.0.0:4000" ) # send request to `user-azure-instance` @@ -489,7 +489,7 @@ const { OpenAI } = require('openai'); const openai = new OpenAI({ apiKey: "sk-1234", - baseURL: "http://0.0.0.0:8000" + baseURL: "http://0.0.0.0:4000" }); async function main() { @@ -516,7 +516,7 @@ Here's how to do it: import openai client = openai.OpenAI( api_key="sk-1234", - base_url="http://0.0.0.0:8000" + base_url="http://0.0.0.0:4000" ) # request sent to model set on litellm proxy, `litellm --model` @@ -541,7 +541,7 @@ Pass in the litellm_params (E.g. api_key, api_base, etc.) via the `extra_body` p import openai client = openai.OpenAI( api_key="sk-1234", - base_url="http://0.0.0.0:8000" + base_url="http://0.0.0.0:4000" ) # request sent to model set on litellm proxy, `litellm --model` @@ -571,7 +571,7 @@ const { OpenAI } = require('openai'); const openai = new OpenAI({ apiKey: "sk-1234", - baseURL: "http://0.0.0.0:8000" + baseURL: "http://0.0.0.0:4000" }); async function main() { diff --git a/docs/my-website/docs/proxy/users.md b/docs/my-website/docs/proxy/users.md index 9c8927caf..12cbda9d0 100644 --- a/docs/my-website/docs/proxy/users.md +++ b/docs/my-website/docs/proxy/users.md @@ -44,7 +44,7 @@ litellm /path/to/config.yaml **Step 3. Send test call** ```bash -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Autherization: Bearer sk-1234' \ --header 'Content-Type: application/json' \ --data '{ @@ -72,7 +72,7 @@ By default the `max_budget` is set to `null` and is not checked for keys #### **Add budgets to users** ```shell -curl --location 'http://localhost:8000/user/new' \ +curl --location 'http://localhost:4000/user/new' \ --header 'Authorization: Bearer ' \ --header 'Content-Type: application/json' \ --data-raw '{"models": ["azure-models"], "max_budget": 0, "user_id": "krrish3@berri.ai"}' @@ -96,7 +96,7 @@ curl --location 'http://localhost:8000/user/new' \ `budget_duration`: Budget is reset at the end of specified duration. If not set, budget is never reset. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d"). ``` -curl 'http://0.0.0.0:8000/user/new' \ +curl 'http://0.0.0.0:4000/user/new' \ --header 'Authorization: Bearer ' \ --header 'Content-Type: application/json' \ --data-raw '{ @@ -113,7 +113,7 @@ Now you can just call `/key/generate` with that user_id (i.e. 
krrish3@berri.ai) - **Spend Tracking**: spend for this key will update krrish3@berri.ai's spend as well ```bash -curl --location 'http://0.0.0.0:8000/key/generate' \ +curl --location 'http://0.0.0.0:4000/key/generate' \ --header 'Authorization: Bearer ' \ --header 'Content-Type: application/json' \ --data '{"models": ["azure-models"], "user_id": "krrish3@berri.ai"}' @@ -127,7 +127,7 @@ You can: #### **Add budgets to users** ```shell -curl --location 'http://localhost:8000/team/new' \ +curl --location 'http://localhost:4000/team/new' \ --header 'Authorization: Bearer ' \ --header 'Content-Type: application/json' \ --data-raw '{ @@ -238,7 +238,7 @@ By default the `max_budget` is set to `null` and is not checked for keys #### **Add budgets to keys** ```bash -curl 'http://0.0.0.0:8000/key/generate' \ +curl 'http://0.0.0.0:4000/key/generate' \ --header 'Authorization: Bearer ' \ --header 'Content-Type: application/json' \ --data-raw '{ @@ -250,7 +250,7 @@ curl 'http://0.0.0.0:8000/key/generate' \ Example Request to `/chat/completions` when key has crossed budget ```shell -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Content-Type: application/json' \ --header 'Authorization: Bearer ' \ --data ' { @@ -278,7 +278,7 @@ Expected Response from `/chat/completions` when key has crossed budget `budget_duration`: Budget is reset at the end of specified duration. If not set, budget is never reset. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d"). ``` -curl 'http://0.0.0.0:8000/key/generate' \ +curl 'http://0.0.0.0:4000/key/generate' \ --header 'Authorization: Bearer ' \ --header 'Content-Type: application/json' \ --data-raw '{ @@ -310,7 +310,7 @@ By default the `model_max_budget` is set to `{}` and is not checked for keys #### **Add model specific budgets to keys** ```bash -curl 'http://0.0.0.0:8000/key/generate' \ +curl 'http://0.0.0.0:4000/key/generate' \ --header 'Authorization: Bearer ' \ --header 'Content-Type: application/json' \ --data-raw '{ @@ -335,7 +335,7 @@ Use `/user/new`, to persist rate limits across multiple keys. ```shell -curl --location 'http://0.0.0.0:8000/user/new' \ +curl --location 'http://0.0.0.0:4000/user/new' \ --header 'Authorization: Bearer sk-1234' \ --header 'Content-Type: application/json' \ --data '{"user_id": "krrish@berri.ai", "max_parallel_requests": 10, "tpm_limit": 20, "rpm_limit": 4}' @@ -359,7 +359,7 @@ curl --location 'http://0.0.0.0:8000/user/new' \ Use `/key/generate`, if you want them for just that key. ```shell -curl --location 'http://0.0.0.0:8000/key/generate' \ +curl --location 'http://0.0.0.0:4000/key/generate' \ --header 'Authorization: Bearer sk-1234' \ --header 'Content-Type: application/json' \ --data '{"max_parallel_requests": 10, "tpm_limit": 20, "rpm_limit": 4}' @@ -401,7 +401,7 @@ model_list: **Step 2. Create key with access group** ```bash -curl --location 'http://localhost:8000/user/new' \ +curl --location 'http://localhost:4000/user/new' \ -H 'Authorization: Bearer ' \ -H 'Content-Type: application/json' \ -d '{"models": ["beta-models"], # 👈 Model Access Group @@ -414,7 +414,7 @@ curl --location 'http://localhost:8000/user/new' \ Just include user_id in the `/key/generate` request. 
```bash -curl --location 'http://0.0.0.0:8000/key/generate' \ +curl --location 'http://0.0.0.0:4000/key/generate' \ --header 'Authorization: Bearer ' \ --header 'Content-Type: application/json' \ --data '{"models": ["azure-models"], "user_id": "krrish@berri.ai"}' diff --git a/docs/my-website/docs/proxy/virtual_keys.md b/docs/my-website/docs/proxy/virtual_keys.md index 70fd6e6a8..e84b3c16f 100644 --- a/docs/my-website/docs/proxy/virtual_keys.md +++ b/docs/my-website/docs/proxy/virtual_keys.md @@ -59,7 +59,7 @@ litellm --config /path/to/config.yaml **Step 3: Generate temporary keys** ```shell -curl 'http://0.0.0.0:8000/key/generate' \ +curl 'http://0.0.0.0:4000/key/generate' \ --header 'Authorization: Bearer ' \ --header 'Content-Type: application/json' \ --data-raw '{"models": ["gpt-3.5-turbo", "gpt-4", "claude-2"], "duration": "20m","metadata": {"user": "ishaan@berri.ai"}}' @@ -70,7 +70,7 @@ curl 'http://0.0.0.0:8000/key/generate' \ ### Request ```shell -curl 'http://0.0.0.0:8000/key/generate' \ +curl 'http://0.0.0.0:4000/key/generate' \ --header 'Authorization: Bearer ' \ --header 'Content-Type: application/json' \ --data-raw '{ @@ -105,7 +105,7 @@ Request Params: ```python { "key": "sk-kdEXbIqZRwEeEiHwdg7sFA", # Bearer token - "expires": "2023-11-19T01:38:25.838000+00:00" # datetime object + "expires": "2023-11-19T01:38:25.834000+00:00" # datetime object "key_name": "sk-...7sFA" # abbreviated key string, ONLY stored in db if `allow_user_auth: true` set - [see](./ui.md) ... } @@ -147,7 +147,7 @@ model_list: **Step 2: Generate a user key - enabling them access to specific models, custom model aliases, etc.** ```bash -curl -X POST "https://0.0.0.0:8000/key/generate" \ +curl -X POST "https://0.0.0.0:4000/key/generate" \ -H "Authorization: Bearer " \ -H "Content-Type: application/json" \ -d '{ @@ -182,7 +182,7 @@ model_list: **Step 2. Create key with access group** ```bash -curl --location 'http://localhost:8000/key/generate' \ +curl --location 'http://localhost:4000/key/generate' \ -H 'Authorization: Bearer ' \ -H 'Content-Type: application/json' \ -d '{"models": ["beta-models"], # 👈 Model Access Group @@ -194,7 +194,7 @@ curl --location 'http://localhost:8000/key/generate' \ ### Request ```shell -curl -X GET "http://0.0.0.0:8000/key/info?key=sk-02Wr4IAlN3NvPXvL5JVvDA" \ +curl -X GET "http://0.0.0.0:4000/key/info?key=sk-02Wr4IAlN3NvPXvL5JVvDA" \ -H "Authorization: Bearer sk-1234" ``` @@ -228,7 +228,7 @@ Request Params: ### Request ```shell -curl 'http://0.0.0.0:8000/key/update' \ +curl 'http://0.0.0.0:4000/key/update' \ --header 'Authorization: Bearer ' \ --header 'Content-Type: application/json' \ --data-raw '{ @@ -266,7 +266,7 @@ Request Params: ### Request ```shell -curl 'http://0.0.0.0:8000/key/delete' \ +curl 'http://0.0.0.0:4000/key/delete' \ --header 'Authorization: Bearer ' \ --header 'Content-Type: application/json' \ --data-raw '{ @@ -500,7 +500,7 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \ Set `max_budget` in (USD $) param in the `key/generate` request. 
By default the `max_budget` is set to `null` and is not checked for keys ```shell -curl 'http://0.0.0.0:8000/key/generate' \ +curl 'http://0.0.0.0:4000/key/generate' \ --header 'Authorization: Bearer ' \ --header 'Content-Type: application/json' \ --data-raw '{ @@ -517,7 +517,7 @@ curl 'http://0.0.0.0:8000/key/generate' \ Example Request to `/chat/completions` when key has crossed budget ```shell -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Content-Type: application/json' \ --header 'Authorization: Bearer sk-ULl_IKCVFy2EZRzQB16RUA' \ --data ' { @@ -545,10 +545,10 @@ Expected Response from `/chat/completions` when key has crossed budget LiteLLM exposes a `/user/new` endpoint to create budgets for users, that persist across multiple keys. -This is documented in the swagger (live on your server root endpoint - e.g. `http://0.0.0.0:8000/`). Here's an example request. +This is documented in the swagger (live on your server root endpoint - e.g. `http://0.0.0.0:4000/`). Here's an example request. ```shell -curl --location 'http://localhost:8000/user/new' \ +curl --location 'http://localhost:4000/user/new' \ --header 'Authorization: Bearer ' \ --header 'Content-Type: application/json' \ --data-raw '{"models": ["azure-models"], "max_budget": 0, "user_id": "krrish3@berri.ai"}' @@ -571,7 +571,7 @@ The request is a normal `/key/generate` request body + a `max_budget` field. You can get spend for a key by using the `/key/info` endpoint. ```bash -curl 'http://0.0.0.0:8000/key/info?key=' \ +curl 'http://0.0.0.0:4000/key/info?key=' \ -X GET \ -H 'Authorization: Bearer ' ``` @@ -771,7 +771,7 @@ general_settings: #### Step 3. Generate Key ```bash -curl --location 'http://0.0.0.0:8000/key/generate' \ +curl --location 'http://0.0.0.0:4000/key/generate' \ --header 'Authorization: Bearer sk-1234' \ --header 'Content-Type: application/json' \ --data '{"models": ["azure-models"], "aliases": {"mistral-7b": "gpt-3.5-turbo"}, "duration": null}' diff --git a/docs/my-website/docs/simple_proxy_old_doc.md b/docs/my-website/docs/simple_proxy_old_doc.md index b48e345e1..9dcb27797 100644 --- a/docs/my-website/docs/simple_proxy_old_doc.md +++ b/docs/my-website/docs/simple_proxy_old_doc.md @@ -22,7 +22,7 @@ $ pip install 'litellm[proxy]' ```shell $ litellm --model huggingface/bigcode/starcoder -#INFO: Proxy running on http://0.0.0.0:8000 +#INFO: Proxy running on http://0.0.0.0:4000 ``` ### Test @@ -39,7 +39,7 @@ This will now automatically route any requests for gpt-3.5-turbo to bigcode star ```shell -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Content-Type: application/json' \ --data ' { "model": "gpt-3.5-turbo", @@ -59,7 +59,7 @@ curl --location 'http://0.0.0.0:8000/chat/completions' \ import openai client = openai.OpenAI( api_key="anything", - base_url="http://0.0.0.0:8000" + base_url="http://0.0.0.0:4000" ) # request sent to model set on litellm proxy, `litellm --model` @@ -246,7 +246,7 @@ Set `base_url` to the LiteLLM Proxy server import openai client = openai.OpenAI( api_key="anything", - base_url="http://0.0.0.0:8000" + base_url="http://0.0.0.0:4000" ) # request sent to model set on litellm proxy, `litellm --model` @@ -267,7 +267,7 @@ print(response) ```shell litellm --model gpt-3.5-turbo -#INFO: Proxy running on http://0.0.0.0:8000 +#INFO: Proxy running on http://0.0.0.0:4000 ``` #### 1. 
Clone the repo @@ -278,9 +278,9 @@ git clone https://github.com/danny-avila/LibreChat.git #### 2. Modify Librechat's `docker-compose.yml` -LiteLLM Proxy is running on port `8000`, set `8000` as the proxy below +LiteLLM Proxy is running on port `4000`, set `4000` as the proxy below ```yaml -OPENAI_REVERSE_PROXY=http://host.docker.internal:8000/v1/chat/completions +OPENAI_REVERSE_PROXY=http://host.docker.internal:4000/v1/chat/completions ``` #### 3. Save fake OpenAI key in Librechat's `.env` @@ -306,7 +306,7 @@ In the [config.py](https://continue.dev/docs/reference/Models/openai) set this a api_key="IGNORED", model="fake-model-name", context_length=2048, # customize if needed for your model - api_base="http://localhost:8000" # your proxy server url + api_base="http://localhost:4000" # your proxy server url ), ``` @@ -318,7 +318,7 @@ Credits [@vividfog](https://github.com/jmorganca/ollama/issues/305#issuecomment- ```shell $ pip install aider -$ aider --openai-api-base http://0.0.0.0:8000 --openai-api-key fake-key +$ aider --openai-api-base http://0.0.0.0:4000 --openai-api-key fake-key ``` @@ -332,7 +332,7 @@ from autogen import AssistantAgent, UserProxyAgent, oai config_list=[ { "model": "my-fake-model", - "api_base": "http://localhost:8000", #litellm compatible endpoint + "api_base": "http://localhost:4000", #litellm compatible endpoint "api_type": "open_ai", "api_key": "NULL", # just a placeholder } @@ -370,7 +370,7 @@ import guidance # set api_base to your proxy # set api_key to anything -gpt4 = guidance.llms.OpenAI("gpt-4", api_base="http://0.0.0.0:8000", api_key="anything") +gpt4 = guidance.llms.OpenAI("gpt-4", api_base="http://0.0.0.0:4000", api_key="anything") experts = guidance(''' {{#system~}} @@ -479,7 +479,7 @@ $ litellm --config /path/to/config.yaml #### Step 3: Use proxy Curl Command ```shell -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Content-Type: application/json' \ --data ' { "model": "zephyr-alpha", @@ -529,7 +529,7 @@ $ litellm --config /path/to/config.yaml #### Step 3: Use proxy Curl Command ```shell -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Content-Type: application/json' \ --data ' { "model": "gpt-3.5-turbo", @@ -586,7 +586,7 @@ litellm_settings: **Set dynamically** ```bash -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Content-Type: application/json' \ --data ' { "model": "zephyr-beta", @@ -615,7 +615,7 @@ model_list: - model_name: custom_embedding_model litellm_params: model: openai/custom_embedding # the `openai/` prefix tells litellm it's openai compatible - api_base: http://0.0.0.0:8000/ + api_base: http://0.0.0.0:4000/ - model_name: custom_embedding_model litellm_params: model: openai/custom_embedding # the `openai/` prefix tells litellm it's openai compatible @@ -665,7 +665,7 @@ litellm --config /path/to/config.yaml **Step 3: Generate temporary keys** ```shell -curl 'http://0.0.0.0:8000/key/generate' \ +curl 'http://0.0.0.0:4000/key/generate' \ --h 'Authorization: Bearer sk-1234' \ --d '{"models": ["gpt-3.5-turbo", "gpt-4", "claude-2"], "duration": "20m"}' ``` @@ -719,7 +719,7 @@ model_list: **Step 2: Generate a user key - enabling them access to specific models, custom model aliases, etc.** ```bash -curl -X POST "https://0.0.0.0:8000/key/generate" \ +curl -X POST "https://0.0.0.0:4000/key/generate" \ -H "Authorization: 
Bearer sk-1234" \ -H "Content-Type: application/json" \ -d '{ @@ -737,7 +737,7 @@ curl -X POST "https://0.0.0.0:8000/key/generate" \ You can get spend for a key by using the `/key/info` endpoint. ```bash -curl 'http://0.0.0.0:8000/key/info?key=' \ +curl 'http://0.0.0.0:4000/key/info?key=' \ -X GET \ -H 'Authorization: Bearer ' ``` @@ -868,7 +868,7 @@ $ litellm --config /path/to/config.yaml #### Using Caching Send the same request twice: ```shell -curl http://0.0.0.0:8000/v1/chat/completions \ +curl http://0.0.0.0:4000/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ "model": "gpt-3.5-turbo", @@ -876,7 +876,7 @@ curl http://0.0.0.0:8000/v1/chat/completions \ "temperature": 0.7 }' -curl http://0.0.0.0:8000/v1/chat/completions \ +curl http://0.0.0.0:4000/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ "model": "gpt-3.5-turbo", @@ -889,7 +889,7 @@ curl http://0.0.0.0:8000/v1/chat/completions \ Caching can be switched on/off per `/chat/completions` request - Caching **on** for completion - pass `caching=True`: ```shell - curl http://0.0.0.0:8000/v1/chat/completions \ + curl http://0.0.0.0:4000/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ "model": "gpt-3.5-turbo", @@ -900,7 +900,7 @@ Caching can be switched on/off per `/chat/completions` request ``` - Caching **off** for completion - pass `caching=False`: ```shell - curl http://0.0.0.0:8000/v1/chat/completions \ + curl http://0.0.0.0:4000/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ "model": "gpt-3.5-turbo", @@ -963,10 +963,10 @@ https://api.openai.com/v1/chat/completions \ Use this to health check all LLMs defined in your config.yaml #### Request ```shell -curl --location 'http://0.0.0.0:8000/health' +curl --location 'http://0.0.0.0:4000/health' ``` -You can also run `litellm -health` it makes a `get` request to `http://0.0.0.0:8000/health` for you +You can also run `litellm -health` it makes a `get` request to `http://0.0.0.0:4000/health` for you ``` litellm --health ``` @@ -1087,7 +1087,7 @@ litellm -config config.yaml #### Run a test request to Proxy ```shell -curl --location 'http://0.0.0.0:8000/chat/completions' \ +curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Authorization: Bearer sk-1244' \ --data ' { "model": "gpt-3.5-turbo", @@ -1213,7 +1213,7 @@ LiteLLM proxy adds **0.00325 seconds** latency as compared to using the Raw Open ``` #### --port - - **Default:** `8000` + - **Default:** `4000` - The port to bind the server to. 
- **Usage:** ```shell diff --git a/docs/my-website/img/locust.png b/docs/my-website/img/locust.png new file mode 100644 index 000000000..1bcedf1d0 Binary files /dev/null and b/docs/my-website/img/locust.png differ diff --git a/docs/my-website/sidebars.js b/docs/my-website/sidebars.js index 7aaf2e114..62a2d3842 100644 --- a/docs/my-website/sidebars.js +++ b/docs/my-website/sidebars.js @@ -42,10 +42,6 @@ const sidebars = { "proxy/team_based_routing", "proxy/ui", "proxy/budget_alerts", - "proxy/model_management", - "proxy/health", - "proxy/debugging", - "proxy/pii_masking", { "type": "category", "label": "🔥 Load Balancing", @@ -54,6 +50,10 @@ const sidebars = { "proxy/reliability", ] }, + "proxy/model_management", + "proxy/health", + "proxy/debugging", + "proxy/pii_masking", "proxy/caching", { "type": "category", @@ -101,12 +101,13 @@ const sidebars = { }, { type: "category", - label: "Embedding(), Moderation(), Image Generation()", + label: "Embedding(), Moderation(), Image Generation(), Audio Transcriptions()", items: [ "embedding/supported_embedding", "embedding/async_embedding", "embedding/moderation", - "image_generation" + "image_generation", + "audio_transcription" ], }, { diff --git a/enterprise/cloudformation_stack/litellm.yaml b/enterprise/cloudformation_stack/litellm.yaml new file mode 100644 index 000000000..c30956b94 --- /dev/null +++ b/enterprise/cloudformation_stack/litellm.yaml @@ -0,0 +1,44 @@ +Resources: + LiteLLMServer: + Type: AWS::EC2::Instance + Properties: + AvailabilityZone: us-east-1a + ImageId: ami-0f403e3180720dd7e + InstanceType: t2.micro + + LiteLLMServerAutoScalingGroup: + Type: AWS::AutoScaling::AutoScalingGroup + Properties: + AvailabilityZones: + - us-east-1a + LaunchConfigurationName: !Ref LiteLLMServerLaunchConfig + MinSize: 1 + MaxSize: 3 + DesiredCapacity: 1 + HealthCheckGracePeriod: 300 + + LiteLLMServerLaunchConfig: + Type: AWS::AutoScaling::LaunchConfiguration + Properties: + ImageId: ami-0f403e3180720dd7e # Replace with your desired AMI ID + InstanceType: t2.micro + + LiteLLMServerScalingPolicy: + Type: AWS::AutoScaling::ScalingPolicy + Properties: + AutoScalingGroupName: !Ref LiteLLMServerAutoScalingGroup + PolicyType: TargetTrackingScaling + TargetTrackingConfiguration: + PredefinedMetricSpecification: + PredefinedMetricType: ASGAverageCPUUtilization + TargetValue: 60.0 + + LiteLLMDB: + Type: AWS::RDS::DBInstance + Properties: + AllocatedStorage: 20 + Engine: postgres + MasterUsername: litellmAdmin + MasterUserPassword: litellmPassword + DBInstanceClass: db.t3.micro + AvailabilityZone: us-east-1a \ No newline at end of file diff --git a/litellm/__init__.py b/litellm/__init__.py index 017bd46ac..04c2d23c7 100644 --- a/litellm/__init__.py +++ b/litellm/__init__.py @@ -570,7 +570,7 @@ from .utils import ( _calculate_retry_after, _should_retry, get_secret, - get_mapped_model_params, + get_supported_openai_params, ) from .llms.huggingface_restapi import HuggingfaceConfig from .llms.anthropic import AnthropicConfig @@ -588,6 +588,7 @@ from .llms.petals import PetalsConfig from .llms.vertex_ai import VertexAIConfig from .llms.sagemaker import SagemakerConfig from .llms.ollama import OllamaConfig +from .llms.ollama_chat import OllamaChatConfig from .llms.maritalk import MaritTalkConfig from .llms.bedrock import ( AmazonTitanConfig, diff --git a/litellm/_logging.py b/litellm/_logging.py index 438fa9743..26693c15e 100644 --- a/litellm/_logging.py +++ b/litellm/_logging.py @@ -31,6 +31,18 @@ def _turn_on_debug(): 
verbose_proxy_logger.setLevel(level=logging.DEBUG) # set proxy logs to debug +def _disable_debugging(): + verbose_logger.disabled = True + verbose_router_logger.disabled = True + verbose_proxy_logger.disabled = True + + +def _enable_debugging(): + verbose_logger.disabled = False + verbose_router_logger.disabled = False + verbose_proxy_logger.disabled = False + + def print_verbose(print_statement): try: if set_verbose: diff --git a/litellm/llms/anthropic.py b/litellm/llms/anthropic.py index 5e0887901..becbcc328 100644 --- a/litellm/llms/anthropic.py +++ b/litellm/llms/anthropic.py @@ -1,7 +1,7 @@ import os, types import json from enum import Enum -import requests +import requests, copy import time, uuid from typing import Callable, Optional from litellm.utils import ModelResponse, Usage, map_finish_reason @@ -117,6 +117,7 @@ def completion( ): headers = validate_environment(api_key, headers) _is_function_call = False + messages = copy.deepcopy(messages) if model in custom_prompt_dict: # check if the model has a registered custom prompt model_prompt_details = custom_prompt_dict[model] diff --git a/litellm/llms/azure.py b/litellm/llms/azure.py index 01b54987b..5fc0939bb 100644 --- a/litellm/llms/azure.py +++ b/litellm/llms/azure.py @@ -7,13 +7,15 @@ from litellm.utils import ( Message, CustomStreamWrapper, convert_to_model_response_object, + TranscriptionResponse, ) -from typing import Callable, Optional +from typing import Callable, Optional, BinaryIO from litellm import OpenAIConfig import litellm, json import httpx from .custom_httpx.azure_dall_e_2 import CustomHTTPTransport, AsyncCustomHTTPTransport from openai import AzureOpenAI, AsyncAzureOpenAI +import uuid class AzureOpenAIError(Exception): @@ -270,6 +272,14 @@ class AzureChatCompletion(BaseLLM): azure_client = AzureOpenAI(**azure_client_params) else: azure_client = client + if api_version is not None and isinstance( + azure_client._custom_query, dict + ): + # set api_version to version passed by user + azure_client._custom_query.setdefault( + "api-version", api_version + ) + response = azure_client.chat.completions.create(**data, timeout=timeout) # type: ignore stringified_response = response.model_dump() ## LOGGING @@ -333,10 +343,17 @@ class AzureChatCompletion(BaseLLM): azure_client_params["api_key"] = api_key elif azure_ad_token is not None: azure_client_params["azure_ad_token"] = azure_ad_token + + # setting Azure client if client is None: azure_client = AsyncAzureOpenAI(**azure_client_params) else: azure_client = client + if api_version is not None and isinstance( + azure_client._custom_query, dict + ): + # set api_version to version passed by user + azure_client._custom_query.setdefault("api-version", api_version) ## LOGGING logging_obj.pre_call( input=data["messages"], @@ -401,6 +418,9 @@ class AzureChatCompletion(BaseLLM): azure_client = AzureOpenAI(**azure_client_params) else: azure_client = client + if api_version is not None and isinstance(azure_client._custom_query, dict): + # set api_version to version passed by user + azure_client._custom_query.setdefault("api-version", api_version) ## LOGGING logging_obj.pre_call( input=data["messages"], @@ -454,6 +474,11 @@ class AzureChatCompletion(BaseLLM): azure_client = AsyncAzureOpenAI(**azure_client_params) else: azure_client = client + if api_version is not None and isinstance( + azure_client._custom_query, dict + ): + # set api_version to version passed by user + azure_client._custom_query.setdefault("api-version", api_version) ## LOGGING logging_obj.pre_call( 
input=data["messages"], @@ -757,6 +782,156 @@ class AzureChatCompletion(BaseLLM): else: raise AzureOpenAIError(status_code=500, message=str(e)) + def audio_transcriptions( + self, + model: str, + audio_file: BinaryIO, + optional_params: dict, + model_response: TranscriptionResponse, + timeout: float, + api_key: Optional[str] = None, + api_base: Optional[str] = None, + api_version: Optional[str] = None, + client=None, + azure_ad_token: Optional[str] = None, + logging_obj=None, + atranscription: bool = False, + ): + data = {"model": model, "file": audio_file, **optional_params} + + # init AzureOpenAI Client + azure_client_params = { + "api_version": api_version, + "azure_endpoint": api_base, + "azure_deployment": model, + "timeout": timeout, + } + + max_retries = optional_params.pop("max_retries", None) + + azure_client_params = select_azure_base_url_or_endpoint( + azure_client_params=azure_client_params + ) + if api_key is not None: + azure_client_params["api_key"] = api_key + elif azure_ad_token is not None: + azure_client_params["azure_ad_token"] = azure_ad_token + + if max_retries is not None: + azure_client_params["max_retries"] = max_retries + + if atranscription == True: + return self.async_audio_transcriptions( + audio_file=audio_file, + data=data, + model_response=model_response, + timeout=timeout, + api_key=api_key, + api_base=api_base, + client=client, + azure_client_params=azure_client_params, + max_retries=max_retries, + logging_obj=logging_obj, + ) + if client is None: + azure_client = AzureOpenAI(http_client=litellm.client_session, **azure_client_params) # type: ignore + else: + azure_client = client + + ## LOGGING + logging_obj.pre_call( + input=f"audio_file_{uuid.uuid4()}", + api_key=azure_client.api_key, + additional_args={ + "headers": {"Authorization": f"Bearer {azure_client.api_key}"}, + "api_base": azure_client._base_url._uri_reference, + "atranscription": True, + "complete_input_dict": data, + }, + ) + + response = azure_client.audio.transcriptions.create( + **data, timeout=timeout # type: ignore + ) + stringified_response = response.model_dump() + ## LOGGING + logging_obj.post_call( + input=audio_file.name, + api_key=api_key, + additional_args={"complete_input_dict": data}, + original_response=stringified_response, + ) + final_response = convert_to_model_response_object(response_object=stringified_response, model_response_object=model_response, response_type="audio_transcription") # type: ignore + return final_response + + async def async_audio_transcriptions( + self, + audio_file: BinaryIO, + data: dict, + model_response: TranscriptionResponse, + timeout: float, + api_key: Optional[str] = None, + api_base: Optional[str] = None, + client=None, + azure_client_params=None, + max_retries=None, + logging_obj=None, + ): + response = None + try: + if client is None: + async_azure_client = AsyncAzureOpenAI( + **azure_client_params, + http_client=litellm.aclient_session, + ) + else: + async_azure_client = client + + ## LOGGING + logging_obj.pre_call( + input=f"audio_file_{uuid.uuid4()}", + api_key=async_azure_client.api_key, + additional_args={ + "headers": { + "Authorization": f"Bearer {async_azure_client.api_key}" + }, + "api_base": async_azure_client._base_url._uri_reference, + "atranscription": True, + "complete_input_dict": data, + }, + ) + + response = await async_azure_client.audio.transcriptions.create( + **data, timeout=timeout + ) # type: ignore + + stringified_response = response.model_dump() + + ## LOGGING + logging_obj.post_call( + input=audio_file.name, + 
api_key=api_key, + additional_args={ + "headers": { + "Authorization": f"Bearer {async_azure_client.api_key}" + }, + "api_base": async_azure_client._base_url._uri_reference, + "atranscription": True, + "complete_input_dict": data, + }, + original_response=stringified_response, + ) + response = convert_to_model_response_object(response_object=stringified_response, model_response_object=model_response, response_type="audio_transcription") # type: ignore + return response + except Exception as e: + ## LOGGING + logging_obj.post_call( + input=input, + api_key=api_key, + original_response=str(e), + ) + raise e + async def ahealth_check( self, model: Optional[str], diff --git a/litellm/llms/bedrock.py b/litellm/llms/bedrock.py index 89d1bf16f..4aa27b3c9 100644 --- a/litellm/llms/bedrock.py +++ b/litellm/llms/bedrock.py @@ -126,6 +126,8 @@ class AmazonAnthropicClaude3Config: optional_params["max_tokens"] = value if param == "tools": optional_params["tools"] = value + if param == "stream": + optional_params["stream"] = value return optional_params diff --git a/litellm/llms/ollama_chat.py b/litellm/llms/ollama_chat.py index dec74fa92..8378a95ff 100644 --- a/litellm/llms/ollama_chat.py +++ b/litellm/llms/ollama_chat.py @@ -18,7 +18,7 @@ class OllamaError(Exception): ) # Call the base class constructor with the parameters it needs -class OllamaConfig: +class OllamaChatConfig: """ Reference: https://github.com/jmorganca/ollama/blob/main/docs/api.md#parameters @@ -108,6 +108,7 @@ class OllamaConfig: k: v for k, v in cls.__dict__.items() if not k.startswith("__") + and k != "function_name" # special param for function calling and not isinstance( v, ( @@ -120,6 +121,61 @@ class OllamaConfig: and v is not None } + def get_supported_openai_params( + self, + ): + return [ + "max_tokens", + "stream", + "top_p", + "temperature", + "frequency_penalty", + "stop", + "tools", + "tool_choice", + "functions", + ] + + def map_openai_params(self, non_default_params: dict, optional_params: dict): + for param, value in non_default_params.items(): + if param == "max_tokens": + optional_params["num_predict"] = value + if param == "stream": + optional_params["stream"] = value + if param == "temperature": + optional_params["temperature"] = value + if param == "top_p": + optional_params["top_p"] = value + if param == "frequency_penalty": + optional_params["repeat_penalty"] = param + if param == "stop": + optional_params["stop"] = value + ### FUNCTION CALLING LOGIC ### + if param == "tools": + # ollama actually supports json output + optional_params["format"] = "json" + litellm.add_function_to_prompt = ( + True # so that main.py adds the function call to the prompt + ) + optional_params["functions_unsupported_model"] = value + + if len(optional_params["functions_unsupported_model"]) == 1: + optional_params["function_name"] = optional_params[ + "functions_unsupported_model" + ][0]["function"]["name"] + + if param == "functions": + # ollama actually supports json output + optional_params["format"] = "json" + litellm.add_function_to_prompt = ( + True # so that main.py adds the function call to the prompt + ) + optional_params["functions_unsupported_model"] = non_default_params.pop( + "functions" + ) + non_default_params.pop("tool_choice", None) # causes ollama requests to hang + return optional_params + # ollama implementation def get_ollama_response( @@ -138,7 +194,7 @@ def get_ollama_response( url = f"{api_base}/api/chat" ## Load Config - config = litellm.OllamaConfig.get_config() + config = 
litellm.OllamaChatConfig.get_config() for k, v in config.items(): if ( k not in optional_params @@ -147,6 +203,7 @@ def get_ollama_response( stream = optional_params.pop("stream", False) format = optional_params.pop("format", None) + function_name = optional_params.pop("function_name", None) for m in messages: if "role" in m and m["role"] == "tool": @@ -187,6 +244,7 @@ def get_ollama_response( model_response=model_response, encoding=encoding, logging_obj=logging_obj, + function_name=function_name, ) return response elif stream == True: @@ -290,7 +348,9 @@ async def ollama_async_streaming(url, data, model_response, encoding, logging_ob traceback.print_exc() -async def ollama_acompletion(url, data, model_response, encoding, logging_obj): +async def ollama_acompletion( + url, data, model_response, encoding, logging_obj, function_name +): data["stream"] = False try: timeout = aiohttp.ClientTimeout(total=litellm.request_timeout) # 10 minutes @@ -324,7 +384,7 @@ async def ollama_acompletion(url, data, model_response, encoding, logging_obj): "id": f"call_{str(uuid.uuid4())}", "function": { "arguments": response_json["message"]["content"], - "name": "", + "name": function_name or "", }, "type": "function", } diff --git a/litellm/llms/openai.py b/litellm/llms/openai.py index 90846b627..9850cd61e 100644 --- a/litellm/llms/openai.py +++ b/litellm/llms/openai.py @@ -1,4 +1,4 @@ -from typing import Optional, Union, Any +from typing import Optional, Union, Any, BinaryIO import types, time, json, traceback import httpx from .base import BaseLLM @@ -9,6 +9,7 @@ from litellm.utils import ( CustomStreamWrapper, convert_to_model_response_object, Usage, + TranscriptionResponse, ) from typing import Callable, Optional import aiohttp, requests @@ -237,14 +238,22 @@ class OpenAIChatCompletion(BaseLLM): status_code=422, message=f"Timeout needs to be a float" ) - if custom_llm_provider == "mistral": - # check if message content passed in as list, and not string - messages = prompt_factory( - model=model, - messages=messages, - custom_llm_provider=custom_llm_provider, - ) - + if custom_llm_provider != "openai": + # process all OpenAI compatible provider logic here + if custom_llm_provider == "mistral": + # check if message content passed in as list, and not string + messages = prompt_factory( + model=model, + messages=messages, + custom_llm_provider=custom_llm_provider, + ) + if custom_llm_provider == "perplexity" and messages is not None: + # check if messages.name is passed + supported, if not supported remove + messages = prompt_factory( + model=model, + messages=messages, + custom_llm_provider=custom_llm_provider, + ) for _ in range( 2 ): # if call fails due to alternating messages, retry with reformatted message @@ -766,6 +775,103 @@ class OpenAIChatCompletion(BaseLLM): else: raise OpenAIError(status_code=500, message=str(e)) + def audio_transcriptions( + self, + model: str, + audio_file: BinaryIO, + optional_params: dict, + model_response: TranscriptionResponse, + timeout: float, + api_key: Optional[str] = None, + api_base: Optional[str] = None, + client=None, + max_retries=None, + logging_obj=None, + atranscription: bool = False, + ): + data = {"model": model, "file": audio_file, **optional_params} + if atranscription == True: + return self.async_audio_transcriptions( + audio_file=audio_file, + data=data, + model_response=model_response, + timeout=timeout, + api_key=api_key, + api_base=api_base, + client=client, + max_retries=max_retries, + logging_obj=logging_obj, + ) + if client is None: + openai_client = 
OpenAI( + api_key=api_key, + base_url=api_base, + http_client=litellm.client_session, + timeout=timeout, + max_retries=max_retries, + ) + else: + openai_client = client + response = openai_client.audio.transcriptions.create( + **data, timeout=timeout # type: ignore + ) + + stringified_response = response.model_dump() + ## LOGGING + logging_obj.post_call( + input=audio_file.name, + api_key=api_key, + additional_args={"complete_input_dict": data}, + original_response=stringified_response, + ) + final_response = convert_to_model_response_object(response_object=stringified_response, model_response_object=model_response, response_type="audio_transcription") # type: ignore + return final_response + + async def async_audio_transcriptions( + self, + audio_file: BinaryIO, + data: dict, + model_response: TranscriptionResponse, + timeout: float, + api_key: Optional[str] = None, + api_base: Optional[str] = None, + client=None, + max_retries=None, + logging_obj=None, + ): + response = None + try: + if client is None: + openai_aclient = AsyncOpenAI( + api_key=api_key, + base_url=api_base, + http_client=litellm.aclient_session, + timeout=timeout, + max_retries=max_retries, + ) + else: + openai_aclient = client + response = await openai_aclient.audio.transcriptions.create( + **data, timeout=timeout + ) # type: ignore + stringified_response = response.model_dump() + ## LOGGING + logging_obj.post_call( + input=audio_file.name, + api_key=api_key, + additional_args={"complete_input_dict": data}, + original_response=stringified_response, + ) + return convert_to_model_response_object(response_object=stringified_response, model_response_object=model_response, response_type="audio_transcription") # type: ignore + except Exception as e: + ## LOGGING + logging_obj.post_call( + input=input, + api_key=api_key, + original_response=str(e), + ) + raise e + async def ahealth_check( self, model: Optional[str], diff --git a/litellm/llms/prompt_templates/factory.py b/litellm/llms/prompt_templates/factory.py index 616833a2e..a13130c62 100644 --- a/litellm/llms/prompt_templates/factory.py +++ b/litellm/llms/prompt_templates/factory.py @@ -556,6 +556,7 @@ def anthropic_messages_pt(messages: list): 3. Each message must alternate between "user" and "assistant" (this is not addressed as now by litellm) 4. final assistant content cannot end with trailing whitespace (anthropic raises an error otherwise) 5. System messages are a separate param to the Messages API (used for tool calling) + 6. Ensure we only accept role, content. 
(message.name is not supported) """ ## Ensure final assistant message has no trailing whitespace last_assistant_message_idx: Optional[int] = None @@ -583,7 +584,9 @@ def anthropic_messages_pt(messages: list): new_content.append({"type": "text", "text": m["text"]}) new_messages.append({"role": messages[0]["role"], "content": new_content}) # type: ignore else: - new_messages.append(messages[0]) + new_messages.append( + {"role": messages[0]["role"], "content": messages[0]["content"]} + ) return new_messages @@ -606,7 +609,9 @@ def anthropic_messages_pt(messages: list): new_content.append({"type": "text", "content": m["text"]}) new_messages.append({"role": messages[i]["role"], "content": new_content}) # type: ignore else: - new_messages.append(messages[i]) + new_messages.append( + {"role": messages[i]["role"], "content": messages[i]["content"]} + ) if messages[i]["role"] == messages[i + 1]["role"]: if messages[i]["role"] == "user": @@ -897,6 +902,10 @@ def prompt_factory( return anthropic_pt(messages=messages) elif "mistral." in model: return mistral_instruct_pt(messages=messages) + elif custom_llm_provider == "perplexity": + for message in messages: + message.pop("name", None) + return messages try: if "meta-llama/llama-2" in model and "chat" in model: return llama_2_chat_pt(messages=messages) diff --git a/litellm/main.py b/litellm/main.py index dfe6c5f31..114b46948 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -8,9 +8,8 @@ # Thank you ! We ❤️ you! - Krrish & Ishaan import os, openai, sys, json, inspect, uuid, datetime, threading -from typing import Any, Literal, Union +from typing import Any, Literal, Union, BinaryIO from functools import partial - import dotenv, traceback, random, asyncio, time, contextvars from copy import deepcopy import httpx @@ -89,6 +88,7 @@ from litellm.utils import ( read_config_args, Choices, Message, + TranscriptionResponse, ) ####### ENVIRONMENT VARIABLES ################### @@ -488,6 +488,8 @@ def completion( ### ASYNC CALLS ### acompletion = kwargs.get("acompletion", False) client = kwargs.get("client", None) + ### Admin Controls ### + no_log = kwargs.get("no-log", False) ######## end of unpacking kwargs ########### openai_params = [ "functions", @@ -564,6 +566,7 @@ def completion( "caching_groups", "ttl", "cache", + "no-log", ] default_params = openai_params + litellm_params non_default_params = { @@ -727,6 +730,7 @@ def completion( model_info=model_info, proxy_server_request=proxy_server_request, preset_cache_key=preset_cache_key, + no_log=no_log, ) logging.update_environment_variables( model=model, @@ -2418,6 +2422,7 @@ def embedding( "caching_groups", "ttl", "cache", + "no-log", ] default_params = openai_params + litellm_params non_default_params = { @@ -3044,7 +3049,6 @@ def moderation( return response -##### Moderation ####################### @client async def amoderation(input: str, model: str, api_key: Optional[str] = None, **kwargs): # only supports open ai for now @@ -3067,11 +3071,11 @@ async def aimage_generation(*args, **kwargs): Asynchronously calls the `image_generation` function with the given arguments and keyword arguments. Parameters: - - `args` (tuple): Positional arguments to be passed to the `embedding` function. - - `kwargs` (dict): Keyword arguments to be passed to the `embedding` function. + - `args` (tuple): Positional arguments to be passed to the `image_generation` function. + - `kwargs` (dict): Keyword arguments to be passed to the `image_generation` function. 
Returns: - - `response` (Any): The response returned by the `embedding` function. + - `response` (Any): The response returned by the `image_generation` function. """ loop = asyncio.get_event_loop() model = args[0] if len(args) > 0 else kwargs["model"] @@ -3093,7 +3097,7 @@ async def aimage_generation(*args, **kwargs): # Await normally init_response = await loop.run_in_executor(None, func_with_context) if isinstance(init_response, dict) or isinstance( - init_response, ModelResponse + init_response, ImageResponse ): ## CACHING SCENARIO response = init_response elif asyncio.iscoroutine(init_response): @@ -3311,6 +3315,144 @@ def image_generation( ) +##### Transcription ####################### + + +@client +async def atranscription(*args, **kwargs): + """ + Calls openai + azure whisper endpoints. + + Allows router to load balance between them + """ + loop = asyncio.get_event_loop() + model = args[0] if len(args) > 0 else kwargs["model"] + ### PASS ARGS TO Image Generation ### + kwargs["atranscription"] = True + custom_llm_provider = None + try: + # Use a partial function to pass your keyword arguments + func = partial(transcription, *args, **kwargs) + + # Add the context to the function + ctx = contextvars.copy_context() + func_with_context = partial(ctx.run, func) + + _, custom_llm_provider, _, _ = get_llm_provider( + model=model, api_base=kwargs.get("api_base", None) + ) + + # Await normally + init_response = await loop.run_in_executor(None, func_with_context) + if isinstance(init_response, dict) or isinstance( + init_response, TranscriptionResponse + ): ## CACHING SCENARIO + response = init_response + elif asyncio.iscoroutine(init_response): + response = await init_response + else: + # Call the synchronous function using run_in_executor + response = await loop.run_in_executor(None, func_with_context) + return response + except Exception as e: + custom_llm_provider = custom_llm_provider or "openai" + raise exception_type( + model=model, + custom_llm_provider=custom_llm_provider, + original_exception=e, + completion_kwargs=args, + ) + + +@client +def transcription( + model: str, + file: BinaryIO, + ## OPTIONAL OPENAI PARAMS ## + language: Optional[str] = None, + prompt: Optional[str] = None, + response_format: Optional[ + Literal["json", "text", "srt", "verbose_json", "vtt"] + ] = None, + temperature: Optional[int] = None, # openai defaults this to 0 + ## LITELLM PARAMS ## + user: Optional[str] = None, + timeout=600, # default to 10 minutes + api_key: Optional[str] = None, + api_base: Optional[str] = None, + api_version: Optional[str] = None, + litellm_logging_obj=None, + custom_llm_provider=None, + **kwargs, +): + """ + Calls openai + azure whisper endpoints. 
+ + Allows router to load balance between them + """ + atranscription = kwargs.get("atranscription", False) + litellm_call_id = kwargs.get("litellm_call_id", None) + logger_fn = kwargs.get("logger_fn", None) + proxy_server_request = kwargs.get("proxy_server_request", None) + model_info = kwargs.get("model_info", None) + metadata = kwargs.get("metadata", {}) + + model_response = litellm.utils.TranscriptionResponse() + + model, custom_llm_provider, dynamic_api_key, api_base = get_llm_provider(model=model, custom_llm_provider=custom_llm_provider, api_base=api_base) # type: ignore + + optional_params = { + "language": language, + "prompt": prompt, + "response_format": response_format, + "temperature": None, # openai defaults this to 0 + } + + if custom_llm_provider == "azure": + # azure configs + api_base = api_base or litellm.api_base or get_secret("AZURE_API_BASE") + + api_version = ( + api_version or litellm.api_version or get_secret("AZURE_API_VERSION") + ) + + azure_ad_token = kwargs.pop("azure_ad_token", None) or get_secret( + "AZURE_AD_TOKEN" + ) + + api_key = ( + api_key + or litellm.api_key + or litellm.azure_key + or get_secret("AZURE_API_KEY") + ) + + response = azure_chat_completions.audio_transcriptions( + model=model, + audio_file=file, + optional_params=optional_params, + model_response=model_response, + atranscription=atranscription, + timeout=timeout, + logging_obj=litellm_logging_obj, + api_base=api_base, + api_key=api_key, + api_version=api_version, + azure_ad_token=azure_ad_token, + ) + elif custom_llm_provider == "openai": + response = openai_chat_completions.audio_transcriptions( + model=model, + audio_file=file, + optional_params=optional_params, + model_response=model_response, + atranscription=atranscription, + timeout=timeout, + logging_obj=litellm_logging_obj, + ) + return response + + ##### Health Endpoints ####################### diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json index 111b9f8c3..18c4b0d9a 100644 --- a/litellm/model_prices_and_context_window_backup.json +++ b/litellm/model_prices_and_context_window_backup.json @@ -108,7 +108,7 @@ }, "gpt-3.5-turbo": { "max_tokens": 4097, - "max_input_tokens": 4097, + "max_input_tokens": 16385, "max_output_tokens": 4096, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002, @@ -293,6 +293,18 @@ "output_cost_per_pixel": 0.0, "litellm_provider": "openai" }, + "whisper-1": { + "mode": "audio_transcription", + "input_cost_per_second": 0, + "output_cost_per_second": 0.0001, + "litellm_provider": "openai" + }, + "azure/whisper-1": { + "mode": "audio_transcription", + "input_cost_per_second": 0, + "output_cost_per_second": 0.0001, + "litellm_provider": "azure" + }, "azure/gpt-4-0125-preview": { "max_tokens": 128000, "max_input_tokens": 128000, @@ -2259,4 +2271,4 @@ "mode": "embedding" } -} \ No newline at end of file +} diff --git a/litellm/proxy/proxy_cli.py b/litellm/proxy/proxy_cli.py index f7eba02ec..e5bcff646 100644 --- a/litellm/proxy/proxy_cli.py +++ b/litellm/proxy/proxy_cli.py @@ -16,6 +16,13 @@ from importlib import resources import shutil telemetry = None +default_num_workers = 1 +try: + default_num_workers = os.cpu_count() or 1 + if default_num_workers is not None and default_num_workers > 0: + default_num_workers -= 1 +except: + pass def append_query_params(url, params): @@ -54,10 +61,10 @@ def is_port_in_use(port): @click.option( "--host", default="0.0.0.0", help="Host for the server to listen on.", envvar="HOST" ) 
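The `main.py` hunks above add `transcription()` and `atranscription()`. A usage sketch, based only on the signatures introduced in this diff, is below; it assumes the functions are re-exported at the package root like the other `main.py` entry points, that `OPENAI_API_KEY` is set, and that `speech.mp3` is a placeholder audio file.

```python
import asyncio
import litellm

# Synchronous call, mirroring the transcription() signature added in this PR.
with open("speech.mp3", "rb") as audio_file:
    result = litellm.transcription(
        model="whisper-1",        # "azure/whisper-1" also gets pricing in this diff
        file=audio_file,
        response_format="json",
    )
    print(result)


# Async variant via the atranscription() wrapper, which the router can load balance.
async def main():
    with open("speech.mp3", "rb") as audio_file:
        print(await litellm.atranscription(model="whisper-1", file=audio_file))


asyncio.run(main())
```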
-@click.option("--port", default=8000, help="Port to bind the server to.", envvar="PORT") +@click.option("--port", default=4000, help="Port to bind the server to.", envvar="PORT") @click.option( "--num_workers", - default=1, + default=default_num_workers, help="Number of gunicorn workers to spin up", envvar="NUM_WORKERS", ) @@ -266,7 +273,7 @@ def run_server( ], } - response = requests.post("http://0.0.0.0:8000/queue/request", json=data) + response = requests.post("http://0.0.0.0:4000/queue/request", json=data) response = response.json() @@ -500,7 +507,7 @@ def run_server( print( f"Unable to connect to DB. DATABASE_URL found in environment, but prisma package not found." ) - if port == 8000 and is_port_in_use(port): + if port == 4000 and is_port_in_use(port): port = random.randint(1024, 49152) from litellm.proxy.proxy_server import app diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml index 6b4b7a8f6..76c9ed04c 100644 --- a/litellm/proxy/proxy_config.yaml +++ b/litellm/proxy/proxy_config.yaml @@ -5,63 +5,9 @@ model_list: api_base: os.environ/AZURE_API_BASE api_key: os.environ/AZURE_API_KEY api_version: "2023-07-01-preview" - model_info: - mode: chat - max_tokens: 4096 - base_model: azure/gpt-4-1106-preview - access_groups: ["public"] - - model_name: openai-gpt-3.5 - litellm_params: - model: gpt-3.5-turbo - api_key: os.environ/OPENAI_API_KEY - model_info: - access_groups: ["public"] - - model_name: anthropic-claude-v2.1 - litellm_params: - model: bedrock/anthropic.claude-v2:1 - timeout: 300 # sets a 5 minute timeout - model_info: - access_groups: ["private"] - - model_name: anthropic-claude-v2 - litellm_params: - model: bedrock/anthropic.claude-v2 - - model_name: bedrock-cohere - litellm_params: - model: bedrock/cohere.command-text-v14 - timeout: 0.0001 - - model_name: gpt-4 - litellm_params: - model: azure/chatgpt-v-2 - api_base: https://openai-gpt-4-test-v-1.openai.azure.com/ - api_version: "2023-05-15" - api_key: os.environ/AZURE_API_KEY # The `os.environ/` prefix tells litellm to read this from the env. 
See https://docs.litellm.ai/docs/simple_proxy#load-api-keys-from-vault - model_info: - base_model: azure/gpt-4 - - model_name: text-moderation-stable - litellm_params: - model: text-moderation-stable - api_key: os.environ/OPENAI_API_KEY litellm_settings: - fallbacks: [{"openai-gpt-3.5": ["azure-gpt-3.5"]}] - success_callback: ['langfuse'] - # setting callback class - callbacks: custom_callbacks.proxy_handler_instance # sets litellm.callbacks = [proxy_handler_instance] - -general_settings: - master_key: sk-1234 - alerting: ["slack"] - alerting_threshold: 10 # sends alerts if requests hang for 2 seconds - # database_type: "dynamo_db" - # database_args: { # 👈 all args - https://github.com/BerriAI/litellm/blob/befbcbb7ac8f59835ce47415c128decf37aac328/litellm/proxy/_types.py#L190 - # "billing_mode": "PAY_PER_REQUEST", - # "region_name": "us-west-2", - # "ssl_verify": False - # } - - - - - -environment_variables: - # otel: True # OpenTelemetry Logger - # master_key: sk-1234 # [OPTIONAL] Only use this if you to require all calls to contain this key (Authorization: Bearer sk-1234) + set_verbose: True + success_callback: ["langfuse"] +router_settings: + set_verbose: True + debug_level: "DEBUG" \ No newline at end of file diff --git a/litellm/proxy/proxy_load_test/litellm_proxy_config.yaml b/litellm/proxy/proxy_load_test/litellm_proxy_config.yaml new file mode 100644 index 000000000..2e107d366 --- /dev/null +++ b/litellm/proxy/proxy_load_test/litellm_proxy_config.yaml @@ -0,0 +1,6 @@ +model_list: + - model_name: gpt-3.5-turbo + litellm_params: + model: openai/my-fake-model + api_key: my-fake-key + api_base: http://0.0.0.0:8090 \ No newline at end of file diff --git a/litellm/proxy/proxy_load_test/locustfile.py b/litellm/proxy/proxy_load_test/locustfile.py new file mode 100644 index 000000000..2cd2e2fcc --- /dev/null +++ b/litellm/proxy/proxy_load_test/locustfile.py @@ -0,0 +1,27 @@ +from locust import HttpUser, task, between + + +class MyUser(HttpUser): + wait_time = between(1, 5) + + @task + def chat_completion(self): + headers = { + "Content-Type": "application/json", + # Include any additional headers you may need for authentication, etc. 
+ } + + # Customize the payload with "model" and "messages" keys + payload = { + "model": "gpt-3.5-turbo", + "messages": [ + {"role": "system", "content": "You are a chat bot."}, + {"role": "user", "content": "Hello, how are you?"}, + ], + # Add more data as necessary + } + + # Make a POST request to the "chat/completions" endpoint + response = self.client.post("chat/completions", json=payload, headers=headers) + + # Print or log the response if needed diff --git a/litellm/proxy/proxy_load_test/openai_endpoint.py b/litellm/proxy/proxy_load_test/openai_endpoint.py new file mode 100644 index 000000000..b3291ce70 --- /dev/null +++ b/litellm/proxy/proxy_load_test/openai_endpoint.py @@ -0,0 +1,50 @@ +# import sys, os +# sys.path.insert( +# 0, os.path.abspath("../") +# ) # Adds the parent directory to the system path +from fastapi import FastAPI, Request, status, HTTPException, Depends +from fastapi.responses import StreamingResponse +from fastapi.security import OAuth2PasswordBearer +from fastapi.middleware.cors import CORSMiddleware + +app = FastAPI() + +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + + +# for completion +@app.post("/chat/completions") +@app.post("/v1/chat/completions") +async def completion(request: Request): + return { + "id": "chatcmpl-123", + "object": "chat.completion", + "created": 1677652288, + "model": "gpt-3.5-turbo-0125", + "system_fingerprint": "fp_44709d6fcb", + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": "\n\nHello there, how may I assist you today?", + }, + "logprobs": None, + "finish_reason": "stop", + } + ], + "usage": {"prompt_tokens": 9, "completion_tokens": 12, "total_tokens": 21}, + } + + +if __name__ == "__main__": + import uvicorn + + # run this on 8090, 8091, 8092 and 8093 + uvicorn.run(app, host="0.0.0.0", port=8090) diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index 5a1b3c6e7..6cc47935a 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -123,6 +123,8 @@ from fastapi import ( Header, Response, Form, + UploadFile, + File, ) from fastapi.routing import APIRouter from fastapi.security import OAuth2PasswordBearer @@ -1684,9 +1686,9 @@ class ProxyConfig: # these are litellm callbacks - "langfuse", "sentry", "wandb" else: litellm.success_callback.append(callback) - verbose_proxy_logger.debug( + print( # noqa f"{blue_color_code} Initialized Success Callbacks - {litellm.success_callback} {reset_color_code}" - ) + ) # noqa elif key == "failure_callback": litellm.failure_callback = [] @@ -2682,6 +2684,11 @@ async def chat_completion( except: data = json.loads(body_str) + # Azure OpenAI only: check if user passed api-version + query_params = dict(request.query_params) + if "api-version" in query_params: + data["api_version"] = query_params["api-version"] + # Include original request and headers in the data data["proxy_server_request"] = { "url": str(request.url), @@ -3079,13 +3086,13 @@ async def embeddings( "/v1/images/generations", dependencies=[Depends(user_api_key_auth)], response_class=ORJSONResponse, - tags=["image generation"], + tags=["images"], ) @router.post( "/images/generations", dependencies=[Depends(user_api_key_auth)], response_class=ORJSONResponse, - tags=["image generation"], + tags=["images"], ) async def image_generation( request: Request, @@ -3226,6 +3233,168 @@ async def image_generation( ) +@router.post( + "/v1/audio/transcriptions", + 
dependencies=[Depends(user_api_key_auth)], + tags=["audio"], +) +@router.post( + "/audio/transcriptions", + dependencies=[Depends(user_api_key_auth)], + tags=["audio"], +) +async def audio_transcriptions( + request: Request, + file: UploadFile = File(...), + user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth), +): + """ + Same params as: + + https://platform.openai.com/docs/api-reference/audio/createTranscription?lang=curl + """ + global proxy_logging_obj + try: + # Use orjson to parse JSON data, orjson speeds up requests significantly + form_data = await request.form() + data: Dict = {key: value for key, value in form_data.items() if key != "file"} + + # Include original request and headers in the data + data["proxy_server_request"] = { # type: ignore + "url": str(request.url), + "method": request.method, + "headers": dict(request.headers), + "body": copy.copy(data), # use copy instead of deepcopy + } + + if data.get("user", None) is None and user_api_key_dict.user_id is not None: + data["user"] = user_api_key_dict.user_id + + data["model"] = ( + general_settings.get("moderation_model", None) # server default + or user_model # model name passed via cli args + or data["model"] # default passed in http request + ) + if user_model: + data["model"] = user_model + + if "metadata" not in data: + data["metadata"] = {} + data["metadata"]["user_api_key"] = user_api_key_dict.api_key + data["metadata"]["user_api_key_metadata"] = user_api_key_dict.metadata + _headers = dict(request.headers) + _headers.pop( + "authorization", None + ) # do not store the original `sk-..` api key in the db + data["metadata"]["headers"] = _headers + data["metadata"]["user_api_key_alias"] = getattr( + user_api_key_dict, "key_alias", None + ) + data["metadata"]["user_api_key_user_id"] = user_api_key_dict.user_id + data["metadata"]["user_api_key_team_id"] = getattr( + user_api_key_dict, "team_id", None + ) + data["metadata"]["endpoint"] = str(request.url) + + ### TEAM-SPECIFIC PARAMS ### + if user_api_key_dict.team_id is not None: + team_config = await proxy_config.load_team_config( + team_id=user_api_key_dict.team_id + ) + if len(team_config) == 0: + pass + else: + team_id = team_config.pop("team_id", None) + data["metadata"]["team_id"] = team_id + data = { + **team_config, + **data, + } # add the team-specific configs to the completion call + + router_model_names = ( + [m["model_name"] for m in llm_model_list] + if llm_model_list is not None + else [] + ) + + assert ( + file.filename is not None + ) # make sure filename passed in (needed for type) + + with open(file.filename, "wb+") as f: + f.write(await file.read()) + try: + data["file"] = open(file.filename, "rb") + ### CALL HOOKS ### - modify incoming data / reject request before calling the model + data = await proxy_logging_obj.pre_call_hook( + user_api_key_dict=user_api_key_dict, + data=data, + call_type="moderation", + ) + + ## ROUTE TO CORRECT ENDPOINT ## + # skip router if user passed their key + if "api_key" in data: + response = await litellm.atranscription(**data) + elif ( + llm_router is not None and data["model"] in router_model_names + ): # model in router model list + response = await llm_router.atranscription(**data) + + elif ( + llm_router is not None + and data["model"] in llm_router.deployment_names + ): # model in router deployments, calling a specific deployment on the router + response = await llm_router.atranscription( + **data, specific_deployment=True + ) + elif ( + llm_router is not None + and llm_router.model_group_alias is not None 
+ and data["model"] in llm_router.model_group_alias + ): # model set in model_group_alias + response = await llm_router.atranscription( + **data + ) # ensure this goes the llm_router, router will do the correct alias mapping + elif user_model is not None: # `litellm --model ` + response = await litellm.atranscription(**data) + else: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail={"error": "Invalid model name passed in"}, + ) + + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) + finally: + os.remove(file.filename) # Delete the saved file + + ### ALERTING ### + data["litellm_status"] = "success" # used for alerting + return response + except Exception as e: + await proxy_logging_obj.post_call_failure_hook( + user_api_key_dict=user_api_key_dict, original_exception=e + ) + traceback.print_exc() + if isinstance(e, HTTPException): + raise ProxyException( + message=getattr(e, "message", str(e.detail)), + type=getattr(e, "type", "None"), + param=getattr(e, "param", "None"), + code=getattr(e, "status_code", status.HTTP_400_BAD_REQUEST), + ) + else: + error_traceback = traceback.format_exc() + error_msg = f"{str(e)}\n\n{error_traceback}" + raise ProxyException( + message=getattr(e, "message", error_msg), + type=getattr(e, "type", "None"), + param=getattr(e, "param", "None"), + code=getattr(e, "status_code", 500), + ) + + @router.post( "/v1/moderations", dependencies=[Depends(user_api_key_auth)], diff --git a/litellm/proxy/tests/load_test_completion.py b/litellm/proxy/tests/load_test_completion.py index b85ef2d0f..3f0da2e94 100644 --- a/litellm/proxy/tests/load_test_completion.py +++ b/litellm/proxy/tests/load_test_completion.py @@ -5,20 +5,7 @@ import traceback from large_text import text from dotenv import load_dotenv -load_dotenv() -litellm_client = AsyncOpenAI( - base_url="http://0.0.0.0:4000", - api_key="sk-VEbqnb28-zDsFzQWTmiCsw", - # base_url="http://0.0.0.0:4000", - # api_key="sk-1234", -) - -# litellm_client = AsyncAzureOpenAI( -# azure_endpoint="https://openai-gpt-4-test-v-1.openai.azure.com", -# api_key="d6f82361954b450188295b448e2091ca", -# api_version="2023-07-01-preview", -# ) - +litellm_client = AsyncOpenAI(base_url="http://0.0.0.0:4000", api_key="sk-1234") async def litellm_completion(): # Your existing code for litellm_completion goes here @@ -44,7 +31,7 @@ async def litellm_completion(): async def main(): for i in range(6): start = time.time() - n = 100 # Number of concurrent tasks + n = 20 # Number of concurrent tasks tasks = [litellm_completion() for _ in range(n)] chat_completions = await asyncio.gather(*tasks) diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py index 81130787d..89976ff0d 100644 --- a/litellm/proxy/utils.py +++ b/litellm/proxy/utils.py @@ -693,6 +693,9 @@ class PrismaClient: """ Generic implementation of get data """ + verbose_proxy_logger.debug( + f"PrismaClient: get_generic_data: {key}, table_name: {table_name}" + ) try: if table_name == "users": response = await self.db.litellm_usertable.find_first( @@ -758,6 +761,10 @@ class PrismaClient: int ] = None, # pagination, number of rows to getch when find_all==True ): + args_passed_in = locals() + verbose_proxy_logger.debug( + f"PrismaClient: get_data: token={token}, table_name: {table_name}, query_type: {query_type}, user_id: {user_id}, user_id_list: {user_id_list}, team_id: {team_id}, team_id_list: {team_id_list}, key_val: {key_val}" + ) try: response: Any = None if (token is not None and table_name is None) or ( @@ -788,6 +795,12 @@ class 
PrismaClient: response.expires, datetime ): response.expires = response.expires.isoformat() + else: + # Token does not exist. + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail=f"Authentication Error: invalid user key - user key does not exist in db. User Key={token}", + ) elif query_type == "find_all" and user_id is not None: response = await self.db.litellm_verificationtoken.find_many( where={"user_id": user_id}, @@ -991,9 +1004,11 @@ class PrismaClient: except Exception as e: import traceback - error_msg = f"LiteLLM Prisma Client Exception get_data: {str(e)}" + prisma_query_info = f"LiteLLM Prisma Client Exception: Error with `get_data`. Args passed in: {args_passed_in}" + error_msg = prisma_query_info + str(e) print_verbose(error_msg) error_traceback = error_msg + "\n" + traceback.format_exc() + verbose_proxy_logger.debug(error_traceback) asyncio.create_task( self.proxy_logging_obj.failure_handler( original_exception=e, traceback_str=error_traceback @@ -1020,6 +1035,7 @@ class PrismaClient: Add a key to the database. If it already exists, do nothing. """ try: + verbose_proxy_logger.debug(f"PrismaClient: insert_data: {data}") if table_name == "key": token = data["token"] hashed_token = self.hash_token(token=token) @@ -1152,6 +1168,9 @@ class PrismaClient: """ Update existing data """ + verbose_proxy_logger.debug( + f"PrismaClient: update_data, table_name: {table_name}" + ) try: db_data = self.jsonify_object(data=data) if update_key_values is not None: diff --git a/litellm/router.py b/litellm/router.py index 6f33d0b0d..e9bbd1ffc 100644 --- a/litellm/router.py +++ b/litellm/router.py @@ -9,7 +9,7 @@ import copy, httpx from datetime import datetime -from typing import Dict, List, Optional, Union, Literal, Any +from typing import Dict, List, Optional, Union, Literal, Any, BinaryIO import random, threading, time, traceback, uuid import litellm, openai from litellm.caching import RedisCache, InMemoryCache, DualCache @@ -240,6 +240,21 @@ class Router: {"caching_groups": caching_groups} ) + self.deployment_stats: dict = {} # used for debugging load balancing + """ + deployment_stats = { + "122999-2828282-277: + { + "model": "gpt-3", + "api_base": "http://localhost:4000", + "num_requests": 20, + "avg_latency": 0.001, + "num_failures": 0, + "num_successes": 20 + } + } + """ + ### ROUTING SETUP ### if routing_strategy == "least-busy": self.leastbusy_logger = LeastBusyLoggingHandler( @@ -390,6 +405,10 @@ class Router: messages=messages, specific_deployment=kwargs.pop("specific_deployment", None), ) + if self.set_verbose == True and self.debug_level == "DEBUG": + # debug how often this deployment picked + self._print_deployment_metrics(deployment=deployment) + kwargs.setdefault("metadata", {}).update( { "deployment": deployment["litellm_params"]["model"], @@ -446,6 +465,9 @@ class Router: verbose_router_logger.info( f"litellm.acompletion(model={model_name})\033[32m 200 OK\033[0m" ) + if self.set_verbose == True and self.debug_level == "DEBUG": + # debug how often this deployment picked + self._print_deployment_metrics(deployment=deployment, response=response) return response except Exception as e: verbose_router_logger.info( @@ -611,6 +633,106 @@ class Router: self.fail_calls[model_name] += 1 raise e + async def atranscription(self, file: BinaryIO, model: str, **kwargs): + """ + Example Usage: + + ``` + from litellm import Router + client = Router(model_list = [ + { + "model_name": "whisper", + "litellm_params": { + "model": "whisper-1", + }, + }, + ]) + + audio_file = 
open("speech.mp3", "rb") + transcript = await client.atranscription( + model="whisper", + file=audio_file + ) + + ``` + """ + try: + kwargs["model"] = model + kwargs["file"] = file + kwargs["original_function"] = self._atranscription + kwargs["num_retries"] = kwargs.get("num_retries", self.num_retries) + timeout = kwargs.get("request_timeout", self.timeout) + kwargs.setdefault("metadata", {}).update({"model_group": model}) + response = await self.async_function_with_fallbacks(**kwargs) + + return response + except Exception as e: + raise e + + async def _atranscription(self, file: BinaryIO, model: str, **kwargs): + try: + verbose_router_logger.debug( + f"Inside _atranscription()- model: {model}; kwargs: {kwargs}" + ) + deployment = self.get_available_deployment( + model=model, + messages=[{"role": "user", "content": "prompt"}], + specific_deployment=kwargs.pop("specific_deployment", None), + ) + kwargs.setdefault("metadata", {}).update( + { + "deployment": deployment["litellm_params"]["model"], + "model_info": deployment.get("model_info", {}), + } + ) + kwargs["model_info"] = deployment.get("model_info", {}) + data = deployment["litellm_params"].copy() + model_name = data["model"] + for k, v in self.default_litellm_params.items(): + if ( + k not in kwargs + ): # prioritize model-specific params > default router params + kwargs[k] = v + elif k == "metadata": + kwargs[k].update(v) + + potential_model_client = self._get_client( + deployment=deployment, kwargs=kwargs, client_type="async" + ) + # check if provided keys == client keys # + dynamic_api_key = kwargs.get("api_key", None) + if ( + dynamic_api_key is not None + and potential_model_client is not None + and dynamic_api_key != potential_model_client.api_key + ): + model_client = None + else: + model_client = potential_model_client + + self.total_calls[model_name] += 1 + response = await litellm.atranscription( + **{ + **data, + "file": file, + "caching": self.cache_responses, + "client": model_client, + **kwargs, + } + ) + self.success_calls[model_name] += 1 + verbose_router_logger.info( + f"litellm.atranscription(model={model_name})\033[32m 200 OK\033[0m" + ) + return response + except Exception as e: + verbose_router_logger.info( + f"litellm.atranscription(model={model_name})\033[31m Exception {str(e)}\033[0m" + ) + if model_name is not None: + self.fail_calls[model_name] += 1 + raise e + async def amoderation(self, model: str, input: str, **kwargs): try: kwargs["model"] = model @@ -2124,6 +2246,63 @@ class Router: ) return deployment + def _print_deployment_metrics(self, deployment, response=None): + try: + litellm_params = deployment["litellm_params"] + api_base = litellm_params.get("api_base", "") + model = litellm_params.get("model", "") + + model_id = deployment.get("model_info", {}).get("id", None) + if response is None: + + # update self.deployment_stats + if model_id is not None: + if model_id in self.deployment_stats: + # only update num_requests + self.deployment_stats[model_id]["num_requests"] += 1 + else: + self.deployment_stats[model_id] = { + "api_base": api_base, + "model": model, + "num_requests": 1, + } + else: + # check response_ms and update num_successes + response_ms = response.get("_response_ms", 0) + if model_id is not None: + if model_id in self.deployment_stats: + # check if avg_latency exists + if "avg_latency" in self.deployment_stats[model_id]: + # update avg_latency + self.deployment_stats[model_id]["avg_latency"] = ( + self.deployment_stats[model_id]["avg_latency"] + + response_ms + ) / 
self.deployment_stats[model_id]["num_successes"] + else: + self.deployment_stats[model_id]["avg_latency"] = response_ms + + # check if num_successes exists + if "num_successes" in self.deployment_stats[model_id]: + self.deployment_stats[model_id]["num_successes"] += 1 + else: + self.deployment_stats[model_id]["num_successes"] = 1 + else: + self.deployment_stats[model_id] = { + "api_base": api_base, + "model": model, + "num_successes": 1, + "avg_latency": response_ms, + } + from pprint import pformat + + # Assuming self.deployment_stats is your dictionary + formatted_stats = pformat(self.deployment_stats) + + # Assuming verbose_router_logger is your logger + verbose_router_logger.info("self.deployment_stats: \n%s", formatted_stats) + except Exception as e: + verbose_router_logger.error(f"Error in _print_deployment_metrics: {str(e)}") + def flush_cache(self): litellm.cache = None self.cache.flush_cache() diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index 4db664dde..e54617bd9 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -83,12 +83,13 @@ def test_completion_claude(): def test_completion_claude_3_empty_response(): + litellm.set_verbose = True messages = [ { "role": "system", "content": "You are 2twNLGfqk4GMOn3ffp4p.", }, - {"role": "user", "content": "Hi gm!"}, + {"role": "user", "content": "Hi gm!", "name": "ishaan"}, {"role": "assistant", "content": "Good morning! How are you doing today?"}, { "role": "user", diff --git a/litellm/tests/test_streaming.py b/litellm/tests/test_streaming.py index c513447b0..6f8d73c31 100644 --- a/litellm/tests/test_streaming.py +++ b/litellm/tests/test_streaming.py @@ -511,7 +511,7 @@ def test_completion_mistral_api_stream(): def test_completion_deep_infra_stream(): - # deep infra currently includes role in the 2nd chunk + # deep infra,currently includes role in the 2nd chunk # waiting for them to make a fix on this litellm.set_verbose = True try: @@ -727,6 +727,31 @@ def test_completion_claude_stream_bad_key(): # pytest.fail(f"Error occurred: {e}") +def test_bedrock_claude_3_streaming(): + try: + litellm.set_verbose = True + response: ModelResponse = completion( + model="bedrock/anthropic.claude-3-sonnet-20240229-v1:0", + messages=messages, + max_tokens=10, + stream=True, + ) + complete_response = "" + # Add any assertions here to check the response + for idx, chunk in enumerate(response): + chunk, finished = streaming_format_tests(idx, chunk) + if finished: + break + complete_response += chunk + if complete_response.strip() == "": + raise Exception("Empty response received") + print(f"completion_response: {complete_response}") + except RateLimitError: + pass + except Exception as e: + pytest.fail(f"Error occurred: {e}") + + @pytest.mark.skip(reason="Replicate changed exceptions") def test_completion_replicate_stream_bad_key(): try: diff --git a/litellm/utils.py b/litellm/utils.py index 4c48c5516..6d42ec2d3 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -10,7 +10,6 @@ import sys, re, binascii, struct import litellm import dotenv, json, traceback, threading, base64, ast - import subprocess, os from os.path import abspath, join, dirname import litellm, openai @@ -98,7 +97,7 @@ try: except Exception as e: verbose_logger.debug(f"Exception import enterprise features {str(e)}") -from typing import cast, List, Dict, Union, Optional, Literal, Any +from typing import cast, List, Dict, Union, Optional, Literal, Any, BinaryIO from .caching import Cache from concurrent.futures import 
ThreadPoolExecutor @@ -790,6 +789,38 @@ class ImageResponse(OpenAIObject): return self.dict() +class TranscriptionResponse(OpenAIObject): + text: Optional[str] = None + + _hidden_params: dict = {} + + def __init__(self, text=None): + super().__init__(text=text) + + def __contains__(self, key): + # Define custom behavior for the 'in' operator + return hasattr(self, key) + + def get(self, key, default=None): + # Custom .get() method to access attributes with a default value if the attribute doesn't exist + return getattr(self, key, default) + + def __getitem__(self, key): + # Allow dictionary-style access to attributes + return getattr(self, key) + + def __setitem__(self, key, value): + # Allow dictionary-style assignment of attributes + setattr(self, key, value) + + def json(self, **kwargs): + try: + return self.model_dump() # noqa + except: + # if using pydantic v1 + return self.dict() + + ############################################################ def print_verbose(print_statement, logger_only: bool = False): try: @@ -815,6 +846,8 @@ class CallTypes(Enum): aimage_generation = "aimage_generation" moderation = "moderation" amoderation = "amoderation" + atranscription = "atranscription" + transcription = "transcription" # Logging function -> log the exact model details + what's being sent | Non-BlockingP @@ -948,6 +981,7 @@ class Logging: curl_command = self.model_call_details # only print verbose if verbose logger is not set + if verbose_logger.level == 0: # this means verbose logger was not switched on - user is in litellm.set_verbose=True print_verbose(f"\033[92m{curl_command}\033[0m\n") @@ -1279,6 +1313,15 @@ class Logging: for callback in callbacks: try: + litellm_params = self.model_call_details.get("litellm_params", {}) + if litellm_params.get("no-log", False) == True: + # proxy cost tracking cal backs should run + if not ( + isinstance(callback, CustomLogger) + and "_PROXY_" in callback.__class__.__name__ + ): + print_verbose("no-log request, skipping logging") + continue if callback == "lite_debugger": print_verbose("reaches lite_debugger for logging!") print_verbose(f"liteDebuggerClient: {liteDebuggerClient}") @@ -1707,7 +1750,20 @@ class Logging: callbacks = litellm._async_success_callback verbose_logger.debug(f"Async success callbacks: {callbacks}") for callback in callbacks: + # check if callback can run for this request + litellm_params = self.model_call_details.get("litellm_params", {}) + if litellm_params.get("no-log", False) == True: + # proxy cost tracking cal backs should run + if not ( + isinstance(callback, CustomLogger) + and "_PROXY_" in callback.__class__.__name__ + ): + print_verbose("no-log request, skipping logging") + continue try: + if kwargs.get("no-log", False) == True: + print_verbose("no-log request, skipping logging") + continue if callback == "cache" and litellm.cache is not None: # set_cache once complete streaming response is built print_verbose("async success_callback: reaches cache for logging!") @@ -2271,6 +2327,12 @@ def client(original_function): or call_type == CallTypes.text_completion.value ): messages = args[0] if len(args) > 0 else kwargs["prompt"] + elif ( + call_type == CallTypes.atranscription.value + or call_type == CallTypes.transcription.value + ): + _file_name: BinaryIO = args[1] if len(args) > 1 else kwargs["file"] + messages = "audio_file" stream = True if "stream" in kwargs and kwargs["stream"] == True else False logging_obj = Logging( model=model, @@ -2568,6 +2630,8 @@ def client(original_function): return result elif 
"aimg_generation" in kwargs and kwargs["aimg_generation"] == True: return result + elif "atranscription" in kwargs and kwargs["atranscription"] == True: + return result ### POST-CALL RULES ### post_call_processing(original_response=result, model=model or None) @@ -2985,11 +3049,13 @@ def client(original_function): print_verbose( f"Async Wrapper: Completed Call, calling async_success_handler: {logging_obj.async_success_handler}" ) + # check if user does not want this to be logged asyncio.create_task( logging_obj.async_success_handler(result, start_time, end_time) ) threading.Thread( - target=logging_obj.success_handler, args=(result, start_time, end_time) + target=logging_obj.success_handler, + args=(result, start_time, end_time), ).start() # RETURN RESULT @@ -3892,6 +3958,7 @@ def get_litellm_params( proxy_server_request=None, acompletion=None, preset_cache_key=None, + no_log=None, ): litellm_params = { "acompletion": acompletion, @@ -3908,6 +3975,7 @@ def get_litellm_params( "model_info": model_info, "proxy_server_request": proxy_server_request, "preset_cache_key": preset_cache_key, + "no-log": no_log, "stream_response": {}, # litellm_call_id: ModelResponse Dict } @@ -4147,8 +4215,9 @@ def get_optional_params( and custom_llm_provider != "mistral" and custom_llm_provider != "anthropic" and custom_llm_provider != "bedrock" + and custom_llm_provider != "ollama_chat" ): - if custom_llm_provider == "ollama" or custom_llm_provider == "ollama_chat": + if custom_llm_provider == "ollama": # ollama actually supports json output optional_params["format"] = "json" litellm.add_function_to_prompt = ( @@ -4174,7 +4243,7 @@ def get_optional_params( else: raise UnsupportedParamsError( status_code=500, - message=f"Function calling is not supported by {custom_llm_provider}. 
To add it to the prompt, set `litellm.add_function_to_prompt = True`.", + message=f"Function calling is not supported by {custom_llm_provider}.", ) def _check_valid_arg(supported_params): @@ -4227,15 +4296,9 @@ def get_optional_params( ## raise exception if provider doesn't support passed in param if custom_llm_provider == "anthropic": ## check if unsupported param passed in - supported_params = [ - "stream", - "stop", - "temperature", - "top_p", - "max_tokens", - "tools", - "tool_choice", - ] + supported_params = get_supported_openai_params( + model=model, custom_llm_provider=custom_llm_provider + ) _check_valid_arg(supported_params=supported_params) # handle anthropic params if stream: @@ -4259,17 +4322,9 @@ def get_optional_params( optional_params["tools"] = tools elif custom_llm_provider == "cohere": ## check if unsupported param passed in - supported_params = [ - "stream", - "temperature", - "max_tokens", - "logit_bias", - "top_p", - "frequency_penalty", - "presence_penalty", - "stop", - "n", - ] + supported_params = get_supported_openai_params( + model=model, custom_llm_provider=custom_llm_provider + ) _check_valid_arg(supported_params=supported_params) # handle cohere params if stream: @@ -4292,14 +4347,9 @@ def get_optional_params( optional_params["stop_sequences"] = stop elif custom_llm_provider == "maritalk": ## check if unsupported param passed in - supported_params = [ - "stream", - "temperature", - "max_tokens", - "top_p", - "presence_penalty", - "stop", - ] + supported_params = get_supported_openai_params( + model=model, custom_llm_provider=custom_llm_provider + ) _check_valid_arg(supported_params=supported_params) # handle cohere params if stream: @@ -4318,14 +4368,9 @@ def get_optional_params( optional_params["stopping_tokens"] = stop elif custom_llm_provider == "replicate": ## check if unsupported param passed in - supported_params = [ - "stream", - "temperature", - "max_tokens", - "top_p", - "stop", - "seed", - ] + supported_params = get_supported_openai_params( + model=model, custom_llm_provider=custom_llm_provider + ) _check_valid_arg(supported_params=supported_params) if stream: @@ -4346,7 +4391,9 @@ def get_optional_params( optional_params["stop_sequences"] = stop elif custom_llm_provider == "huggingface": ## check if unsupported param passed in - supported_params = ["stream", "temperature", "max_tokens", "top_p", "stop", "n"] + supported_params = get_supported_openai_params( + model=model, custom_llm_provider=custom_llm_provider + ) _check_valid_arg(supported_params=supported_params) # temperature, top_p, n, stream, stop, max_tokens, n, presence_penalty default to None if temperature is not None: @@ -4385,16 +4432,9 @@ def get_optional_params( ) # since we handle translating echo, we should not send it to TGI request elif custom_llm_provider == "together_ai": ## check if unsupported param passed in - supported_params = [ - "stream", - "temperature", - "max_tokens", - "top_p", - "stop", - "frequency_penalty", - "tools", - "tool_choice", - ] + supported_params = get_supported_openai_params( + model=model, custom_llm_provider=custom_llm_provider + ) _check_valid_arg(supported_params=supported_params) if stream: @@ -4415,16 +4455,9 @@ def get_optional_params( optional_params["tool_choice"] = tool_choice elif custom_llm_provider == "ai21": ## check if unsupported param passed in - supported_params = [ - "stream", - "n", - "temperature", - "max_tokens", - "top_p", - "stop", - "frequency_penalty", - "presence_penalty", - ] + supported_params = get_supported_openai_params( + 
model=model, custom_llm_provider=custom_llm_provider + ) _check_valid_arg(supported_params=supported_params) if stream: @@ -4447,7 +4480,9 @@ def get_optional_params( custom_llm_provider == "palm" or custom_llm_provider == "gemini" ): # https://developers.generativeai.google/tutorials/curl_quickstart ## check if unsupported param passed in - supported_params = ["temperature", "top_p", "stream", "n", "stop", "max_tokens"] + supported_params = get_supported_openai_params( + model=model, custom_llm_provider=custom_llm_provider + ) _check_valid_arg(supported_params=supported_params) if temperature is not None: @@ -4476,14 +4511,9 @@ def get_optional_params( ): print_verbose(f"(start) INSIDE THE VERTEX AI OPTIONAL PARAM BLOCK") ## check if unsupported param passed in - supported_params = [ - "temperature", - "top_p", - "max_tokens", - "stream", - "tools", - "tool_choice", - ] + supported_params = get_supported_openai_params( + model=model, custom_llm_provider=custom_llm_provider + ) _check_valid_arg(supported_params=supported_params) if temperature is not None: @@ -4513,7 +4543,9 @@ def get_optional_params( ) elif custom_llm_provider == "sagemaker": ## check if unsupported param passed in - supported_params = ["stream", "temperature", "max_tokens", "top_p", "stop", "n"] + supported_params = get_supported_openai_params( + model=model, custom_llm_provider=custom_llm_provider + ) _check_valid_arg(supported_params=supported_params) # temperature, top_p, n, stream, stop, max_tokens, n, presence_penalty default to None if temperature is not None: @@ -4540,8 +4572,10 @@ def get_optional_params( max_tokens = 1 optional_params["max_new_tokens"] = max_tokens elif custom_llm_provider == "bedrock": + supported_params = get_supported_openai_params( + model=model, custom_llm_provider=custom_llm_provider + ) if "ai21" in model: - supported_params = ["max_tokens", "temperature", "top_p", "stream"] _check_valid_arg(supported_params=supported_params) # params "maxTokens":200,"temperature":0,"topP":250,"stop_sequences":[], # https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=j2-ultra @@ -4554,9 +4588,6 @@ def get_optional_params( if stream: optional_params["stream"] = stream elif "anthropic" in model: - supported_params = get_mapped_model_params( - model=model, custom_llm_provider=custom_llm_provider - ) _check_valid_arg(supported_params=supported_params) # anthropic params on bedrock # \"max_tokens_to_sample\":300,\"temperature\":0.5,\"top_p\":1,\"stop_sequences\":[\"\\\\n\\\\nHuman:\"]}" @@ -4573,7 +4604,6 @@ def get_optional_params( optional_params=optional_params, ) elif "amazon" in model: # amazon titan llms - supported_params = ["max_tokens", "temperature", "stop", "top_p", "stream"] _check_valid_arg(supported_params=supported_params) # see https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=titan-large if max_tokens is not None: @@ -4590,7 +4620,6 @@ def get_optional_params( if stream: optional_params["stream"] = stream elif "meta" in model: # amazon / meta llms - supported_params = ["max_tokens", "temperature", "top_p", "stream"] _check_valid_arg(supported_params=supported_params) # see https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=titan-large if max_tokens is not None: @@ -4602,7 +4631,6 @@ def get_optional_params( if stream: optional_params["stream"] = stream elif "cohere" in model: # cohere models on bedrock - supported_params = ["stream", "temperature", "max_tokens"] 
_check_valid_arg(supported_params=supported_params) # handle cohere params if stream: @@ -4612,7 +4640,6 @@ def get_optional_params( if max_tokens is not None: optional_params["max_tokens"] = max_tokens elif "mistral" in model: - supported_params = ["max_tokens", "temperature", "stop", "top_p", "stream"] _check_valid_arg(supported_params=supported_params) # mistral params on bedrock # \"max_tokens\":400,\"temperature\":0.7,\"top_p\":0.7,\"stop\":[\"\\\\n\\\\nHuman:\"]}" @@ -4656,7 +4683,9 @@ def get_optional_params( optional_params["stop_sequences"] = stop elif custom_llm_provider == "cloudflare": # https://developers.cloudflare.com/workers-ai/models/text-generation/#input - supported_params = ["max_tokens", "stream"] + supported_params = get_supported_openai_params( + model=model, custom_llm_provider=custom_llm_provider + ) _check_valid_arg(supported_params=supported_params) if max_tokens is not None: @@ -4664,14 +4693,9 @@ def get_optional_params( if stream is not None: optional_params["stream"] = stream elif custom_llm_provider == "ollama": - supported_params = [ - "max_tokens", - "stream", - "top_p", - "temperature", - "frequency_penalty", - "stop", - ] + supported_params = get_supported_openai_params( + model=model, custom_llm_provider=custom_llm_provider + ) _check_valid_arg(supported_params=supported_params) if max_tokens is not None: @@ -4687,39 +4711,17 @@ def get_optional_params( if stop is not None: optional_params["stop"] = stop elif custom_llm_provider == "ollama_chat": - supported_params = [ - "max_tokens", - "stream", - "top_p", - "temperature", - "frequency_penalty", - "stop", - ] + supported_params = litellm.OllamaChatConfig().get_supported_openai_params() + _check_valid_arg(supported_params=supported_params) - if max_tokens is not None: - optional_params["num_predict"] = max_tokens - if stream: - optional_params["stream"] = stream - if temperature is not None: - optional_params["temperature"] = temperature - if top_p is not None: - optional_params["top_p"] = top_p - if frequency_penalty is not None: - optional_params["repeat_penalty"] = frequency_penalty - if stop is not None: - optional_params["stop"] = stop + optional_params = litellm.OllamaChatConfig().map_openai_params( + non_default_params=non_default_params, optional_params=optional_params + ) elif custom_llm_provider == "nlp_cloud": - supported_params = [ - "max_tokens", - "stream", - "temperature", - "top_p", - "presence_penalty", - "frequency_penalty", - "n", - "stop", - ] + supported_params = get_supported_openai_params( + model=model, custom_llm_provider=custom_llm_provider + ) _check_valid_arg(supported_params=supported_params) if max_tokens is not None: @@ -4739,7 +4741,9 @@ def get_optional_params( if stop is not None: optional_params["stop_sequences"] = stop elif custom_llm_provider == "petals": - supported_params = ["max_tokens", "temperature", "top_p", "stream"] + supported_params = get_supported_openai_params( + model=model, custom_llm_provider=custom_llm_provider + ) _check_valid_arg(supported_params=supported_params) # max_new_tokens=1,temperature=0.9, top_p=0.6 if max_tokens is not None: @@ -4751,18 +4755,9 @@ def get_optional_params( if stream: optional_params["stream"] = stream elif custom_llm_provider == "deepinfra": - supported_params = [ - "temperature", - "top_p", - "n", - "stream", - "stop", - "max_tokens", - "presence_penalty", - "frequency_penalty", - "logit_bias", - "user", - ] + supported_params = get_supported_openai_params( + model=model, custom_llm_provider=custom_llm_provider + ) 
_check_valid_arg(supported_params=supported_params) if temperature is not None: if ( @@ -4789,14 +4784,9 @@ def get_optional_params( if user: optional_params["user"] = user elif custom_llm_provider == "perplexity": - supported_params = [ - "temperature", - "top_p", - "stream", - "max_tokens", - "presence_penalty", - "frequency_penalty", - ] + supported_params = get_supported_openai_params( + model=model, custom_llm_provider=custom_llm_provider + ) _check_valid_arg(supported_params=supported_params) if temperature is not None: if ( @@ -4815,15 +4805,9 @@ def get_optional_params( if frequency_penalty: optional_params["frequency_penalty"] = frequency_penalty elif custom_llm_provider == "anyscale": - supported_params = [ - "temperature", - "top_p", - "stream", - "max_tokens", - "stop", - "frequency_penalty", - "presence_penalty", - ] + supported_params = get_supported_openai_params( + model=model, custom_llm_provider=custom_llm_provider + ) if model in [ "mistralai/Mistral-7B-Instruct-v0.1", "mistralai/Mixtral-8x7B-Instruct-v0.1", @@ -4851,14 +4835,9 @@ def get_optional_params( if max_tokens: optional_params["max_tokens"] = max_tokens elif custom_llm_provider == "mistral": - supported_params = [ - "temperature", - "top_p", - "stream", - "max_tokens", - "tools", - "tool_choice", - ] + supported_params = get_supported_openai_params( + model=model, custom_llm_provider=custom_llm_provider + ) _check_valid_arg(supported_params=supported_params) if temperature is not None: optional_params["temperature"] = temperature @@ -4885,25 +4864,9 @@ def get_optional_params( extra_body # openai client supports `extra_body` param ) elif custom_llm_provider == "openrouter": - supported_params = [ - "functions", - "function_call", - "temperature", - "top_p", - "n", - "stream", - "stop", - "max_tokens", - "presence_penalty", - "frequency_penalty", - "logit_bias", - "user", - "response_format", - "seed", - "tools", - "tool_choice", - "max_retries", - ] + supported_params = get_supported_openai_params( + model=model, custom_llm_provider=custom_llm_provider + ) _check_valid_arg(supported_params=supported_params) if functions is not None: @@ -4957,28 +4920,9 @@ def get_optional_params( ) else: # assume passing in params for openai/azure openai print_verbose(f"UNMAPPED PROVIDER, ASSUMING IT'S OPENAI/AZURE") - supported_params = [ - "functions", - "function_call", - "temperature", - "top_p", - "n", - "stream", - "stop", - "max_tokens", - "presence_penalty", - "frequency_penalty", - "logit_bias", - "user", - "response_format", - "seed", - "tools", - "tool_choice", - "max_retries", - "logprobs", - "top_logprobs", - "extra_headers", - ] + supported_params = get_supported_openai_params( + model=model, custom_llm_provider="openai" + ) _check_valid_arg(supported_params=supported_params) if functions is not None: optional_params["functions"] = functions @@ -5036,15 +4980,228 @@ def get_optional_params( return optional_params -def get_mapped_model_params(model: str, custom_llm_provider: str): +def get_supported_openai_params(model: str, custom_llm_provider: str): """ Returns the supported openai params for a given model + provider + + Example: + ``` + get_supported_openai_params(model="anthropic.claude-3", custom_llm_provider="bedrock") + ``` """ if custom_llm_provider == "bedrock": if model.startswith("anthropic.claude-3"): return litellm.AmazonAnthropicClaude3Config().get_supported_openai_params() - else: + elif model.startswith("anthropic"): return litellm.AmazonAnthropicConfig().get_supported_openai_params() + elif 
model.startswith("ai21"): + return ["max_tokens", "temperature", "top_p", "stream"] + elif model.startswith("amazon"): + return ["max_tokens", "temperature", "stop", "top_p", "stream"] + elif model.startswith("meta"): + return ["max_tokens", "temperature", "top_p", "stream"] + elif model.startswith("cohere"): + return ["stream", "temperature", "max_tokens"] + elif model.startswith("mistral"): + return ["max_tokens", "temperature", "stop", "top_p", "stream"] + elif custom_llm_provider == "ollama_chat": + return litellm.OllamaChatConfig().get_supported_openai_params() + elif custom_llm_provider == "anthropic": + return [ + "stream", + "stop", + "temperature", + "top_p", + "max_tokens", + "tools", + "tool_choice", + ] + elif custom_llm_provider == "cohere": + return [ + "stream", + "temperature", + "max_tokens", + "logit_bias", + "top_p", + "frequency_penalty", + "presence_penalty", + "stop", + "n", + ] + elif custom_llm_provider == "maritalk": + return [ + "stream", + "temperature", + "max_tokens", + "top_p", + "presence_penalty", + "stop", + ] + elif custom_llm_provider == "openai" or custom_llm_provider == "azure": + return [ + "functions", + "function_call", + "temperature", + "top_p", + "n", + "stream", + "stop", + "max_tokens", + "presence_penalty", + "frequency_penalty", + "logit_bias", + "user", + "response_format", + "seed", + "tools", + "tool_choice", + "max_retries", + "logprobs", + "top_logprobs", + "extra_headers", + ] + elif custom_llm_provider == "openrouter": + return [ + "functions", + "function_call", + "temperature", + "top_p", + "n", + "stream", + "stop", + "max_tokens", + "presence_penalty", + "frequency_penalty", + "logit_bias", + "user", + "response_format", + "seed", + "tools", + "tool_choice", + "max_retries", + ] + elif custom_llm_provider == "mistral": + return [ + "temperature", + "top_p", + "stream", + "max_tokens", + "tools", + "tool_choice", + ] + elif custom_llm_provider == "replicate": + return [ + "stream", + "temperature", + "max_tokens", + "top_p", + "stop", + "seed", + ] + elif custom_llm_provider == "huggingface": + return ["stream", "temperature", "max_tokens", "top_p", "stop", "n"] + elif custom_llm_provider == "together_ai": + return [ + "stream", + "temperature", + "max_tokens", + "top_p", + "stop", + "frequency_penalty", + "tools", + "tool_choice", + ] + elif custom_llm_provider == "ai21": + return [ + "stream", + "n", + "temperature", + "max_tokens", + "top_p", + "stop", + "frequency_penalty", + "presence_penalty", + ] + elif custom_llm_provider == "palm" or custom_llm_provider == "gemini": + return ["temperature", "top_p", "stream", "n", "stop", "max_tokens"] + elif custom_llm_provider == "vertex_ai": + return [ + "temperature", + "top_p", + "max_tokens", + "stream", + "tools", + "tool_choice", + ] + elif custom_llm_provider == "sagemaker": + return ["stream", "temperature", "max_tokens", "top_p", "stop", "n"] + elif custom_llm_provider == "aleph_alpha": + return [ + "max_tokens", + "stream", + "top_p", + "temperature", + "presence_penalty", + "frequency_penalty", + "n", + "stop", + ] + elif custom_llm_provider == "cloudflare": + return ["max_tokens", "stream"] + elif custom_llm_provider == "ollama": + return [ + "max_tokens", + "stream", + "top_p", + "temperature", + "frequency_penalty", + "stop", + ] + elif custom_llm_provider == "nlp_cloud": + return [ + "max_tokens", + "stream", + "temperature", + "top_p", + "presence_penalty", + "frequency_penalty", + "n", + "stop", + ] + elif custom_llm_provider == "petals": + return ["max_tokens", 
"temperature", "top_p", "stream"] + elif custom_llm_provider == "deepinfra": + return [ + "temperature", + "top_p", + "n", + "stream", + "stop", + "max_tokens", + "presence_penalty", + "frequency_penalty", + "logit_bias", + "user", + ] + elif custom_llm_provider == "perplexity": + return [ + "temperature", + "top_p", + "stream", + "max_tokens", + "presence_penalty", + "frequency_penalty", + ] + elif custom_llm_provider == "anyscale": + return [ + "temperature", + "top_p", + "stream", + "max_tokens", + "stop", + "frequency_penalty", + "presence_penalty", + ] def get_llm_provider( @@ -6149,10 +6306,10 @@ def convert_to_streaming_response(response_object: Optional[dict] = None): def convert_to_model_response_object( response_object: Optional[dict] = None, model_response_object: Optional[ - Union[ModelResponse, EmbeddingResponse, ImageResponse] + Union[ModelResponse, EmbeddingResponse, ImageResponse, TranscriptionResponse] ] = None, response_type: Literal[ - "completion", "embedding", "image_generation" + "completion", "embedding", "image_generation", "audio_transcription" ] = "completion", stream=False, start_time=None, @@ -6263,6 +6420,19 @@ def convert_to_model_response_object( model_response_object.data = response_object["data"] return model_response_object + elif response_type == "audio_transcription" and ( + model_response_object is None + or isinstance(model_response_object, TranscriptionResponse) + ): + if response_object is None: + raise Exception("Error in response object format") + + if model_response_object is None: + model_response_object = TranscriptionResponse() + + if "text" in response_object: + model_response_object.text = response_object["text"] + return model_response_object except Exception as e: raise Exception(f"Invalid response object {traceback.format_exc()}") @@ -7796,7 +7966,9 @@ def exception_type( message=f"AzureException - {original_exception.message}", llm_provider="azure", model=model, - request=original_exception.request, + request=httpx.Request( + method="POST", url="https://openai.com/" + ), ) else: # if no status code then it is an APIConnectionError: https://github.com/openai/openai-python#handling-errors @@ -7804,7 +7976,11 @@ def exception_type( __cause__=original_exception.__cause__, llm_provider="azure", model=model, - request=original_exception.request, + request=getattr( + original_exception, + "request", + httpx.Request(method="POST", url="https://openai.com/"), + ), ) if ( "BadRequestError.__init__() missing 1 required positional argument: 'param'" @@ -8602,13 +8778,20 @@ class CustomStreamWrapper: text = chunk_data.get("completions")[0].get("data").get("text") is_finished = True finish_reason = "stop" - # anthropic mapping - elif "completion" in chunk_data: + ######## bedrock.anthropic mappings ############### + elif "completion" in chunk_data: # not claude-3 text = chunk_data["completion"] # bedrock.anthropic stop_reason = chunk_data.get("stop_reason", None) if stop_reason != None: is_finished = True finish_reason = stop_reason + elif "delta" in chunk_data: + if chunk_data["delta"].get("text", None) is not None: + text = chunk_data["delta"]["text"] + stop_reason = chunk_data["delta"].get("stop_reason", None) + if stop_reason != None: + is_finished = True + finish_reason = stop_reason ######## bedrock.cohere mappings ############### # meta mapping elif "generation" in chunk_data: diff --git a/model_prices_and_context_window.json b/model_prices_and_context_window.json index 111b9f8c3..18c4b0d9a 100644 --- a/model_prices_and_context_window.json +++ 
b/model_prices_and_context_window.json @@ -108,7 +108,7 @@ }, "gpt-3.5-turbo": { "max_tokens": 4097, - "max_input_tokens": 4097, + "max_input_tokens": 16385, "max_output_tokens": 4096, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002, @@ -293,6 +293,18 @@ "output_cost_per_pixel": 0.0, "litellm_provider": "openai" }, + "whisper-1": { + "mode": "audio_transcription", + "input_cost_per_second": 0, + "output_cost_per_second": 0.0001, + "litellm_provider": "openai" + }, + "azure/whisper-1": { + "mode": "audio_transcription", + "input_cost_per_second": 0, + "output_cost_per_second": 0.0001, + "litellm_provider": "azure" + }, "azure/gpt-4-0125-preview": { "max_tokens": 128000, "max_input_tokens": 128000, @@ -2259,4 +2271,4 @@ "mode": "embedding" } -} \ No newline at end of file +} diff --git a/pyproject.toml b/pyproject.toml index c7701be9c..1d7d4a21c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "litellm" -version = "1.30.1" +version = "1.30.6" description = "Library to easily interface with LLM API providers" authors = ["BerriAI"] license = "MIT" @@ -75,7 +75,7 @@ requires = ["poetry-core", "wheel"] build-backend = "poetry.core.masonry.api" [tool.commitizen] -version = "1.30.1" +version = "1.30.6" version_files = [ "pyproject.toml:^version" ] diff --git a/tests/gettysburg.wav b/tests/gettysburg.wav new file mode 100644 index 000000000..9690f521e Binary files /dev/null and b/tests/gettysburg.wav differ diff --git a/tests/test_whisper.py b/tests/test_whisper.py new file mode 100644 index 000000000..54ecfbf50 --- /dev/null +++ b/tests/test_whisper.py @@ -0,0 +1,116 @@ +# What is this? +## Tests `litellm.transcription` endpoint. Outside litellm module b/c of audio file used in testing (it's ~700kb). 
+ +import pytest +import asyncio, time +import aiohttp, traceback +from openai import AsyncOpenAI +import sys, os, dotenv +from typing import Optional +from dotenv import load_dotenv + +# Get the current directory of the file being run +pwd = os.path.dirname(os.path.realpath(__file__)) +print(pwd) + +file_path = os.path.join(pwd, "gettysburg.wav") + +audio_file = open(file_path, "rb") + +load_dotenv() + +sys.path.insert( + 0, os.path.abspath("../") +) # Adds the parent directory to the system path +import litellm +from litellm import Router + + +def test_transcription(): + transcript = litellm.transcription( + model="whisper-1", + file=audio_file, + ) + print(f"transcript: {transcript}") + + +# test_transcription() + + +def test_transcription_azure(): + litellm.set_verbose = True + transcript = litellm.transcription( + model="azure/azure-whisper", + file=audio_file, + api_base="https://my-endpoint-europe-berri-992.openai.azure.com/", + api_key=os.getenv("AZURE_EUROPE_API_KEY"), + api_version="2024-02-15-preview", + ) + + assert transcript.text is not None + assert isinstance(transcript.text, str) + + +# test_transcription_azure() + + +@pytest.mark.asyncio +async def test_transcription_async_azure(): + transcript = await litellm.atranscription( + model="azure/azure-whisper", + file=audio_file, + api_base="https://my-endpoint-europe-berri-992.openai.azure.com/", + api_key=os.getenv("AZURE_EUROPE_API_KEY"), + api_version="2024-02-15-preview", + ) + + assert transcript.text is not None + assert isinstance(transcript.text, str) + + +# asyncio.run(test_transcription_async_azure()) + + +@pytest.mark.asyncio +async def test_transcription_async_openai(): + transcript = await litellm.atranscription( + model="whisper-1", + file=audio_file, + ) + + assert transcript.text is not None + assert isinstance(transcript.text, str) + + +@pytest.mark.asyncio +async def test_transcription_on_router(): + litellm.set_verbose = True + print("\n Testing async transcription on router\n") + try: + model_list = [ + { + "model_name": "whisper", + "litellm_params": { + "model": "whisper-1", + }, + }, + { + "model_name": "whisper", + "litellm_params": { + "model": "azure/azure-whisper", + "api_base": "https://my-endpoint-europe-berri-992.openai.azure.com/", + "api_key": os.getenv("AZURE_EUROPE_API_KEY"), + "api_version": "2024-02-15-preview", + }, + }, + ] + + router = Router(model_list=model_list) + response = await router.atranscription( + model="whisper", + file=audio_file, + ) + print(response) + except Exception as e: + traceback.print_exc() + pytest.fail(f"Error occurred: {e}")
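
Example (not part of the diff above): a minimal sketch of how the transcription surface introduced in this change set might be exercised end-to-end, both through the SDK (`litellm.transcription` / `litellm.atranscription`, mirroring tests/test_whisper.py) and through the proxy's new `/v1/audio/transcriptions` route using the standard OpenAI client. The proxy URL `http://0.0.0.0:4000`, the virtual key `sk-1234`, the `whisper` model-group name, and the audio path `tests/gettysburg.wav` are assumptions about a local setup, not values defined by this diff; adjust them to your deployment.

# transcription_example.py
# Assumptions: litellm built with the changes above, OPENAI_API_KEY exported,
# and (for the proxy call) a proxy running on http://0.0.0.0:4000 that exposes
# a "whisper" model group and accepts the key sk-1234.
import asyncio

import litellm
from openai import OpenAI

AUDIO_PATH = "tests/gettysburg.wav"  # hypothetical local audio file


def sdk_transcription() -> None:
    # Direct SDK call against OpenAI's whisper-1, as in tests/test_whisper.py.
    with open(AUDIO_PATH, "rb") as audio_file:
        transcript = litellm.transcription(model="whisper-1", file=audio_file)
    print("sdk:", transcript.text)


async def sdk_async_transcription() -> None:
    # Async variant added in this change set.
    with open(AUDIO_PATH, "rb") as audio_file:
        transcript = await litellm.atranscription(model="whisper-1", file=audio_file)
    print("async sdk:", transcript.text)


def proxy_transcription() -> None:
    # Call the proxy's new /v1/audio/transcriptions route with the OpenAI client,
    # pointing base_url at the proxy (default port is now 4000 per this diff).
    client = OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")
    with open(AUDIO_PATH, "rb") as audio_file:
        transcript = client.audio.transcriptions.create(model="whisper", file=audio_file)
    print("proxy:", transcript.text)


if __name__ == "__main__":
    sdk_transcription()
    asyncio.run(sdk_async_transcription())
    proxy_transcription()

The proxy call relies only on the request shape the new endpoint accepts (multipart form with a file plus a model name); whether load balancing across the OpenAI and Azure Whisper deployments kicks in depends on how the `whisper` model group is configured in the proxy's model_list, as shown in the router docstring earlier in this diff.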