Merge branch 'main' into litellm_dynamo_use_arn

This commit is contained in:
Ishaan Jaff 2024-02-13 21:27:38 -08:00 committed by GitHub
commit 003feda33f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
249 changed files with 24392 additions and 2775 deletions


@ -147,12 +147,18 @@ jobs:
-e AZURE_API_KEY=$AZURE_API_KEY \
-e AZURE_FRANCE_API_KEY=$AZURE_FRANCE_API_KEY \
-e AZURE_EUROPE_API_KEY=$AZURE_EUROPE_API_KEY \
-e AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
-e AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
-e AWS_REGION_NAME=$AWS_REGION_NAME \
-e OPENAI_API_KEY=$OPENAI_API_KEY \
--name my-app \
-v $(pwd)/proxy_server_config.yaml:/app/config.yaml \
my-app:latest \
--config /app/config.yaml \
--port 4000 \
--num_workers 8
--num_workers 8 \
--detailed_debug \
--run_gunicorn \
- run:
name: Install curl and dockerize
command: |


@ -10,4 +10,5 @@ anthropic
boto3
orjson
pydantic
google-cloud-aiplatform
google-cloud-aiplatform
redisvl==0.0.7 # semantic caching


@ -0,0 +1,77 @@
name: Helm OCI Chart Releaser
description: Push Helm charts to OCI-based (Docker) registries
author: sergeyshaykhullin
branding:
color: yellow
icon: upload-cloud
inputs:
name:
required: true
description: Chart name
repository:
required: true
description: Chart repository name
tag:
required: true
description: Chart version
app_version:
required: true
description: App version
path:
required: false
description: Chart path (Default 'charts/{name}')
registry:
required: true
description: OCI registry
registry_username:
required: true
description: OCI registry username
registry_password:
required: true
description: OCI registry password
update_dependencies:
required: false
default: 'false'
description: Update chart dependencies before packaging (Default 'false')
outputs:
image:
value: ${{ steps.output.outputs.image }}
description: Chart image (Default '{registry}/{repository}/{image}:{tag}')
runs:
using: composite
steps:
- name: Helm | Login
shell: bash
run: echo ${{ inputs.registry_password }} | helm registry login -u ${{ inputs.registry_username }} --password-stdin ${{ inputs.registry }}
env:
HELM_EXPERIMENTAL_OCI: '1'
- name: Helm | Dependency
if: inputs.update_dependencies == 'true'
shell: bash
run: helm dependency update ${{ inputs.path == null && format('{0}/{1}', 'charts', inputs.name) || inputs.path }}
env:
HELM_EXPERIMENTAL_OCI: '1'
- name: Helm | Package
shell: bash
run: helm package ${{ inputs.path == null && format('{0}/{1}', 'charts', inputs.name) || inputs.path }} --version ${{ inputs.tag }} --app-version ${{ inputs.app_version }}
env:
HELM_EXPERIMENTAL_OCI: '1'
- name: Helm | Push
shell: bash
run: helm push ${{ inputs.name }}-${{ inputs.tag }}.tgz oci://${{ inputs.registry }}/${{ inputs.repository }}
env:
HELM_EXPERIMENTAL_OCI: '1'
- name: Helm | Logout
shell: bash
run: helm registry logout ${{ inputs.registry }}
env:
HELM_EXPERIMENTAL_OCI: '1'
- name: Helm | Output
id: output
shell: bash
run: echo "image=${{ inputs.registry }}/${{ inputs.repository }}/${{ inputs.name }}:${{ inputs.tag }}" >> $GITHUB_OUTPUT


@ -34,13 +34,6 @@ jobs:
with:
push: true
tags: litellm/litellm:${{ github.event.inputs.tag || 'latest' }}
-
name: Build and push litellm-ui image
uses: docker/build-push-action@v5
with:
push: true
file: ui/Dockerfile
tags: litellm/litellm-ui:${{ github.event.inputs.tag || 'latest' }}
-
name: Build and push litellm-database image
uses: docker/build-push-action@v5
@ -82,36 +75,8 @@ jobs:
push: true
tags: ${{ steps.meta.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta.outputs.tags }}-latest # if a tag is provided, use that, otherwise use the release tag, and if neither is available, use 'latest'
labels: ${{ steps.meta.outputs.labels }}
build-and-push-image-ui:
runs-on: ubuntu-latest
permissions:
contents: read
packages: write
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Log in to the Container registry
uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1
with:
registry: ${{ env.REGISTRY }}
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Extract metadata (tags, labels) for UI Dockerfile
id: meta-ui
uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7
with:
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}-ui
- name: Build and push UI Docker image
uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4
with:
context: ui/
file: ui/Dockerfile
push: true
tags: ${{ steps.meta-ui.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta-ui.outputs.tags }}-latest
labels: ${{ steps.meta-ui.outputs.labels }}
platform: local, linux/amd64,linux/arm64,linux/arm64/v8
build-and-push-image-database:
runs-on: ubuntu-latest
permissions:
@ -176,3 +141,14 @@ jobs:
} catch (error) {
core.setFailed(error.message);
}
- name: Github Releases To Discord
uses: SethCohen/github-releases-to-discord@v1.13.1
with:
webhook_url: ${{ secrets.WEBHOOK_URL }}
color: "2105893"
username: "Release Changelog"
avatar_url: "https://cdn.discordapp.com/avatars/487431320314576937/bd64361e4ba6313d561d54e78c9e7171.png"
content: "||@everyone||"
footer_title: "Changelog"
footer_icon_url: "https://cdn.discordapp.com/avatars/487431320314576937/bd64361e4ba6313d561d54e78c9e7171.png"
footer_timestamp: true

.github/workflows/ghcr_helm_deploy.yml vendored Normal file

@ -0,0 +1,64 @@
# this workflow is triggered by an API call when there is a new PyPI release of LiteLLM
name: Build, Publish LiteLLM Helm Chart. New Release
on:
workflow_dispatch:
inputs:
chartVersion:
description: "Update the helm chart's version to this"
# Defines two custom environment variables for the workflow. Used for the Container registry domain, and a name for the Docker image that this workflow builds.
env:
REGISTRY: ghcr.io
IMAGE_NAME: ${{ github.repository }}
REPO_OWNER: ${{github.repository_owner}}
# There is a single job in this workflow. It's configured to run on the latest available version of Ubuntu.
jobs:
build-and-push-helm-chart:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Log in to the Container registry
uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1
with:
registry: ${{ env.REGISTRY }}
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: lowercase github.repository_owner
run: |
echo "REPO_OWNER=`echo ${{github.repository_owner}} | tr '[:upper:]' '[:lower:]'`" >>${GITHUB_ENV}
- name: Get LiteLLM Latest Tag
id: current_app_tag
uses: WyriHaximus/github-action-get-previous-tag@v1.3.0
- name: Get last published chart version
id: current_version
shell: bash
run: helm show chart oci://${{ env.REGISTRY }}/${{ env.REPO_OWNER }}/litellm-helm | grep '^version:' | awk 'BEGIN{FS=":"}{print "current-version="$2}' | tr -d " " | tee -a $GITHUB_OUTPUT
env:
HELM_EXPERIMENTAL_OCI: '1'
# Automatically update the helm chart version one "patch" level
- name: Bump release version
id: bump_version
uses: christian-draeger/increment-semantic-version@1.1.0
with:
current-version: ${{ steps.current_version.outputs.current-version || '0.1.0' }}
version-fragment: 'bug'
- uses: ./.github/actions/helm-oci-chart-releaser
with:
name: litellm-helm
repository: ${{ env.REPO_OWNER }}
tag: ${{ github.event.inputs.chartVersion || steps.bump_version.outputs.next-version || '0.1.0' }}
app_version: ${{ steps.current_app_tag.outputs.tag || 'latest' }}
path: deploy/charts/litellm-helm
registry: ${{ env.REGISTRY }}
registry_username: ${{ github.actor }}
registry_password: ${{ secrets.GITHUB_TOKEN }}
update_dependencies: true
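The workflow above is fired by an external API call (per the comment at the top of the file). As a rough illustration, a `workflow_dispatch` event for it could be sent through the GitHub REST API like this; the token, branch ref, and `chartVersion` value are placeholders, not values taken from this change:
```python
import os

import requests

# Hypothetical trigger for ghcr_helm_deploy.yml via GitHub's workflow_dispatch endpoint.
# The token needs "actions: write" permission on the repository.
resp = requests.post(
    "https://api.github.com/repos/BerriAI/litellm/actions/workflows/ghcr_helm_deploy.yml/dispatches",
    headers={
        "Accept": "application/vnd.github+json",
        "Authorization": f"Bearer {os.environ['GITHUB_TOKEN']}",
    },
    json={
        "ref": "main",
        # Matches the workflow_dispatch input defined above; omit "inputs" entirely to
        # let the workflow bump the previously published chart version by one patch level.
        "inputs": {"chartVersion": "0.2.0"},
    },
    timeout=30,
)
resp.raise_for_status()  # GitHub returns 204 No Content on success
```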

.gitignore vendored

@ -39,4 +39,8 @@ ui/litellm-dashboard/.next
ui/litellm-dashboard/node_modules
ui/litellm-dashboard/next-env.d.ts
ui/litellm-dashboard/package.json
ui/litellm-dashboard/package-lock.json
ui/litellm-dashboard/package-lock.json
deploy/charts/litellm-helm/*.tgz
deploy/charts/litellm-helm/charts/*
deploy/charts/*.tgz
litellm/proxy/vertex_key.json


@ -10,6 +10,12 @@ repos:
exclude: ^litellm/tests/|^litellm/proxy/proxy_cli.py|^litellm/integrations/|^litellm/proxy/tests/
additional_dependencies: [flake8-print]
files: litellm/.*\.py
- repo: local
hooks:
- id: check-files-match
name: Check if files match
entry: python3 ci_cd/check_files_match.py
language: system
- repo: local
hooks:
- id: mypy


@ -32,6 +32,9 @@ RUN pip install dist/*.whl
# install dependencies as wheels
RUN pip wheel --no-cache-dir --wheel-dir=/wheels/ -r requirements.txt
# install semantic-cache [Experimental] - we need this here and not in requirements.txt because redisvl pins pydantic to 1.0
RUN pip install redisvl==0.0.7 --no-deps
# Runtime stage
FROM $LITELLM_RUNTIME_IMAGE as runtime
@ -52,4 +55,4 @@ RUN chmod +x entrypoint.sh
EXPOSE 4000/tcp
ENTRYPOINT ["litellm"]
CMD ["--port", "4000", "--config", "./proxy_server_config.yaml", "--detailed_debug"]
CMD ["--port", "4000", "--config", "./proxy_server_config.yaml", "--detailed_debug", "--run_gunicorn"]


@ -47,6 +47,9 @@ COPY --from=builder /wheels/ /wheels/
# Install the built wheel using pip; again using a wildcard if it's the only file
RUN pip install *.whl /wheels/* --no-index --find-links=/wheels/ && rm -f *.whl && rm -rf /wheels
# install semantic-cache [Experimental] - we need this here and not in requirements.txt because redisvl pins pydantic to 1.0
RUN pip install redisvl==0.0.7 --no-deps
# Generate prisma client
RUN prisma generate
RUN chmod +x entrypoint.sh
@ -56,4 +59,4 @@ EXPOSE 4000/tcp
# # Set your entrypoint and command
ENTRYPOINT ["litellm"]
CMD ["--port", "4000"]
CMD ["--port", "4000", "--run_gunicorn"]


@ -5,7 +5,7 @@
<p align="center">Call all LLM APIs using the OpenAI format [Bedrock, Huggingface, VertexAI, TogetherAI, Azure, OpenAI, etc.]
<br>
</p>
<h4 align="center"><a href="https://docs.litellm.ai/docs/simple_proxy" target="_blank">OpenAI Proxy Server</a></h4>
<h4 align="center"><a href="https://docs.litellm.ai/docs/simple_proxy" target="_blank">OpenAI Proxy Server</a> | <a href="https://docs.litellm.ai/docs/enterprise"target="_blank">Enterprise Support</a></h4>
<h4 align="center">
<a href="https://pypi.org/project/litellm/" target="_blank">
<img src="https://img.shields.io/pypi/v/litellm.svg" alt="PyPI Version">
@ -28,10 +28,14 @@ LiteLLM manages:
- Translate inputs to provider's `completion`, `embedding`, and `image_generation` endpoints
- [Consistent output](https://docs.litellm.ai/docs/completion/output), text responses will always be available at `['choices'][0]['message']['content']`
- Retry/fallback logic across multiple deployments (e.g. Azure/OpenAI) - [Router](https://docs.litellm.ai/docs/routing)
- Track spend & set budgets per project [OpenAI Proxy Server](https://docs.litellm.ai/docs/simple_proxy)
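To make the consistent-output point above concrete, the same access path works no matter which provider served the request; a minimal sketch (API keys and model names are placeholders):
```python
import os

from litellm import completion

os.environ["OPENAI_API_KEY"] = "sk-..."   # placeholder
os.environ["COHERE_API_KEY"] = "co-..."   # placeholder

messages = [{"role": "user", "content": "Hello, how are you?"}]

# OpenAI
openai_response = completion(model="gpt-3.5-turbo", messages=messages)
# Cohere
cohere_response = completion(model="command-nightly", messages=messages)

# Regardless of provider, the text lives at the same path
print(openai_response["choices"][0]["message"]["content"])
print(cohere_response["choices"][0]["message"]["content"])
```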
[**Jump to OpenAI Proxy Docs**](https://github.com/BerriAI/litellm?tab=readme-ov-file#openai-proxy---docs) <br>
[**Jump to Supported LLM Providers**](https://github.com/BerriAI/litellm?tab=readme-ov-file#supported-provider-docs)
Support for more providers is continually being added. Missing a provider or LLM platform? Raise a [feature request](https://github.com/BerriAI/litellm/issues/new?assignees=&labels=enhancement&projects=&template=feature_request.yml&title=%5BFeature%5D%3A+).
# Usage ([**Docs**](https://docs.litellm.ai/docs/))
> [!IMPORTANT]
> LiteLLM v1.0.0 now requires `openai>=1.0.0`. Migration guide [here](https://docs.litellm.ai/docs/migration)
@ -155,6 +159,9 @@ print(response)
```
## Proxy Key Management ([Docs](https://docs.litellm.ai/docs/proxy/virtual_keys))
UI on `/ui` on your proxy server
![ui_3](https://github.com/BerriAI/litellm/assets/29436595/47c97d5e-b9be-4839-b28c-43d7f4f10033)
Track Spend, Set budgets and create virtual keys for the proxy
`POST /key/generate`
@ -174,17 +181,6 @@ curl 'http://0.0.0.0:8000/key/generate' \
}
```
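The same key-generation call as the curl command above, sketched with `requests`; the master key, proxy URL, and request-body fields are illustrative rather than copied from this diff:
```python
import requests

resp = requests.post(
    "http://0.0.0.0:8000/key/generate",
    headers={
        "Authorization": "Bearer sk-1234",  # proxy master key (placeholder)
        "Content-Type": "application/json",
    },
    json={
        "models": ["gpt-3.5-turbo", "gpt-4"],  # models this virtual key may call
        "duration": "20m",                     # key lifetime
        "metadata": {"team": "core-infra"},    # free-form metadata
    },
    timeout=30,
)
print(resp.json())  # response includes the newly generated virtual key
```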
### [Beta] Proxy UI
A simple UI to add new models and let your users create keys.
Live here: https://dashboard.litellm.ai/
Code: https://github.com/BerriAI/litellm/tree/main/ui
<img width="1672" alt="Screenshot 2023-12-26 at 8 33 53 AM" src="https://github.com/BerriAI/litellm/assets/17561003/274254d8-c5fe-4645-9123-100045a7fb21">
## Supported Providers ([Docs](https://docs.litellm.ai/docs/providers))
| Provider | [Completion](https://docs.litellm.ai/docs/#basic-usage) | [Streaming](https://docs.litellm.ai/docs/completion/stream#streaming-responses) | [Async Completion](https://docs.litellm.ai/docs/completion/stream#async-completion) | [Async Streaming](https://docs.litellm.ai/docs/completion/stream#async-streaming) | [Async Embedding](https://docs.litellm.ai/docs/embedding/supported_embedding) | [Async Image Generation](https://docs.litellm.ai/docs/image_generation) |
| ------------- | ------------- | ------------- | ------------- | ------------- | ------------- | ------------- |


@ -0,0 +1,32 @@
import sys
import filecmp
import shutil
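# Pre-commit helper (wired up via ci_cd/check_files_match.py in .pre-commit-config.yaml above):
# keeps litellm/model_prices_and_context_window_backup.json in sync with the top-level
# model_prices_and_context_window.json, copying the latter over the former on a mismatch.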
def main(argv=None):
print(
"Comparing model_prices_and_context_window and litellm/model_prices_and_context_window_backup.json files... checking if they match."
)
file1 = "model_prices_and_context_window.json"
file2 = "litellm/model_prices_and_context_window_backup.json"
cmp_result = filecmp.cmp(file1, file2, shallow=False)
if cmp_result:
print(f"Passed! Files {file1} and {file2} match.")
return 0
else:
print(
f"Failed! Files {file1} and {file2} do not match. Copying content from {file1} to {file2}."
)
copy_content(file1, file2)
return 1
def copy_content(source, destination):
shutil.copy2(source, destination)
if __name__ == "__main__":
sys.exit(main())


@ -0,0 +1,76 @@
import sys, os
import traceback
from dotenv import load_dotenv
import copy
load_dotenv()
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
import asyncio
from litellm import Router, Timeout
import time
from litellm.caching import Cache
import litellm
import openai
### Test just calling AsyncAzureOpenAI
openai_client = openai.AsyncAzureOpenAI(
azure_endpoint=os.getenv("AZURE_API_BASE"),
api_key=os.getenv("AZURE_API_KEY"),
)
async def call_acompletion(semaphore, input_data):
async with semaphore:
try:
# Use asyncio.wait_for to set a timeout for the task
response = await openai_client.chat.completions.create(**input_data)
# Handle the response as needed
print(response)
return response
except Timeout:
print(f"Task timed out: {input_data}")
return None # You may choose to return something else or raise an exception
async def main():
# No Router here - this test calls the AsyncAzureOpenAI client directly
# Create a semaphore with a capacity of 100
semaphore = asyncio.Semaphore(100)
# List to hold all task references
tasks = []
start_time_all_tasks = time.time()
# Launch 500 tasks
for _ in range(500):
task = asyncio.create_task(
call_acompletion(
semaphore,
{
"model": "chatgpt-v-2",
"messages": [{"role": "user", "content": "Hey, how's it going?"}],
},
)
)
tasks.append(task)
# Wait for all tasks to complete
responses = await asyncio.gather(*tasks)
# Process responses as needed
# Record the end time for all tasks
end_time_all_tasks = time.time()
# Calculate the total time for all tasks
total_time_all_tasks = end_time_all_tasks - start_time_all_tasks
print(f"Total time for all tasks: {total_time_all_tasks} seconds")
# Calculate the average time per response
average_time_per_response = total_time_all_tasks / len(responses)
print(f"Average time per response: {average_time_per_response} seconds")
print(f"NUMBER OF COMPLETED TASKS: {len(responses)}")
# Run the main function
asyncio.run(main())


@ -0,0 +1,88 @@
import sys, os
import traceback
from dotenv import load_dotenv
import copy
load_dotenv()
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
import asyncio
from litellm import Router, Timeout
import time
### Test calling router async
async def call_acompletion(semaphore, router: Router, input_data):
async with semaphore:
try:
# Use asyncio.wait_for to set a timeout for the task
response = await router.acompletion(**input_data)
# Handle the response as needed
print(response)
return response
except Timeout:
print(f"Task timed out: {input_data}")
return None # You may choose to return something else or raise an exception
async def main():
# Initialize the Router
model_list = [
{
"model_name": "gpt-3.5-turbo",
"litellm_params": {
"model": "gpt-3.5-turbo",
"api_key": os.getenv("OPENAI_API_KEY"),
},
},
{
"model_name": "gpt-3.5-turbo",
"litellm_params": {
"model": "azure/chatgpt-v-2",
"api_key": os.getenv("AZURE_API_KEY"),
"api_base": os.getenv("AZURE_API_BASE"),
"api_version": os.getenv("AZURE_API_VERSION"),
},
},
]
router = Router(model_list=model_list, num_retries=3, timeout=10)
# Create a semaphore with a capacity of 100
semaphore = asyncio.Semaphore(100)
# List to hold all task references
tasks = []
start_time_all_tasks = time.time()
# Launch 500 tasks
for _ in range(500):
task = asyncio.create_task(
call_acompletion(
semaphore,
router,
{
"model": "gpt-3.5-turbo",
"messages": [{"role": "user", "content": "Hey, how's it going?"}],
},
)
)
tasks.append(task)
# Wait for all tasks to complete
responses = await asyncio.gather(*tasks)
# Process responses as needed
# Record the end time for all tasks
end_time_all_tasks = time.time()
# Calculate the total time for all tasks
total_time_all_tasks = end_time_all_tasks - start_time_all_tasks
print(f"Total time for all tasks: {total_time_all_tasks} seconds")
# Calculate the average time per response
average_time_per_response = total_time_all_tasks / len(responses)
print(f"Average time per response: {average_time_per_response} seconds")
print(f"NUMBER OF COMPLETED TASKS: {len(responses)}")
# Run the main function
asyncio.run(main())


@ -0,0 +1,94 @@
import sys, os
import traceback
from dotenv import load_dotenv
import copy
load_dotenv()
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
import asyncio
from litellm import Router, Timeout
import time
from litellm.caching import Cache
import litellm
litellm.cache = Cache(
type="s3", s3_bucket_name="cache-bucket-litellm", s3_region_name="us-west-2"
)
### Test calling router with s3 Cache
async def call_acompletion(semaphore, router: Router, input_data):
async with semaphore:
try:
# Use asyncio.wait_for to set a timeout for the task
response = await router.acompletion(**input_data)
# Handle the response as needed
print(response)
return response
except Timeout:
print(f"Task timed out: {input_data}")
return None # You may choose to return something else or raise an exception
async def main():
# Initialize the Router
model_list = [
{
"model_name": "gpt-3.5-turbo",
"litellm_params": {
"model": "gpt-3.5-turbo",
"api_key": os.getenv("OPENAI_API_KEY"),
},
},
{
"model_name": "gpt-3.5-turbo",
"litellm_params": {
"model": "azure/chatgpt-v-2",
"api_key": os.getenv("AZURE_API_KEY"),
"api_base": os.getenv("AZURE_API_BASE"),
"api_version": os.getenv("AZURE_API_VERSION"),
},
},
]
router = Router(model_list=model_list, num_retries=3, timeout=10)
# Create a semaphore with a capacity of 100
semaphore = asyncio.Semaphore(100)
# List to hold all task references
tasks = []
start_time_all_tasks = time.time()
# Launch 500 tasks
for _ in range(500):
task = asyncio.create_task(
call_acompletion(
semaphore,
router,
{
"model": "gpt-3.5-turbo",
"messages": [{"role": "user", "content": "Hey, how's it going?"}],
},
)
)
tasks.append(task)
# Wait for all tasks to complete
responses = await asyncio.gather(*tasks)
# Process responses as needed
# Record the end time for all tasks
end_time_all_tasks = time.time()
# Calculate the total time for all tasks
total_time_all_tasks = end_time_all_tasks - start_time_all_tasks
print(f"Total time for all tasks: {total_time_all_tasks} seconds")
# Calculate the average time per response
average_time_per_response = total_time_all_tasks / len(responses)
print(f"Average time per response: {average_time_per_response} seconds")
print(f"NUMBER OF COMPLETED TASKS: {len(responses)}")
# Run the main function
asyncio.run(main())


@ -0,0 +1,2 @@
python3 -m build
twine upload --verbose dist/litellm-1.18.13.dev4.tar.gz -u __token__ -


@ -0,0 +1,23 @@
# Patterns to ignore when building packages.
# This supports shell glob matching, relative path matching, and
# negation (prefixed with !). Only one pattern per line.
.DS_Store
# Common VCS dirs
.git/
.gitignore
.bzr/
.bzrignore
.hg/
.hgignore
.svn/
# Common backup files
*.swp
*.bak
*.tmp
*.orig
*~
# Various IDEs
.project
.idea/
*.tmproj
.vscode/


@ -0,0 +1,6 @@
dependencies:
- name: postgresql
repository: oci://registry-1.docker.io/bitnamicharts
version: 13.3.1
digest: sha256:f5c129150f0d38dd06752ab37f3c8e143d7c14d30379af058767bcd9f4ba83dd
generated: "2024-01-19T11:32:56.694808861+11:00"


@ -0,0 +1,34 @@
apiVersion: v2
# We can't call ourselves just "litellm" because then we couldn't publish to the
# same OCI repository as the "litellm" OCI image
name: litellm-helm
description: Call all LLM APIs using the OpenAI format
# A chart can be either an 'application' or a 'library' chart.
#
# Application charts are a collection of templates that can be packaged into versioned archives
# to be deployed.
#
# Library charts provide useful utilities or functions for the chart developer. They're included as
# a dependency of application charts to inject those utilities and functions into the rendering
# pipeline. Library charts do not define any templates and therefore cannot be deployed.
type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.1.0
# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes.
appVersion: v1.18.9
dependencies:
- name: "postgresql"
version: ">=13.3.0"
repository: oci://registry-1.docker.io/bitnamicharts
condition: db.deployStandalone


@ -0,0 +1,107 @@
# Helm Chart for LiteLLM
## Prerequisites
- Kubernetes 1.23+
- Helm 3.8.0+
If `db.deployStandalone` is used:
- PV provisioner support in the underlying infrastructure
If `db.useStackgresOperator` is used (not yet implemented):
- The Stackgres Operator must already be installed in the Kubernetes Cluster. This chart will **not** install the operator if it is missing.
## Parameters
### LiteLLM Proxy Deployment Settings
| Name | Description | Value |
| ---------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----- |
| `replicaCount` | The number of LiteLLM Proxy pods to be deployed | `1` |
| `masterkey` | The Master API Key for LiteLLM. If not specified, a random key is generated. | N/A |
| `environmentSecrets` | An optional array of Secret object names. The keys and values in these secrets will be presented to the LiteLLM proxy pod as environment variables. See below for an example Secret object. | `[]` |
| `image.repository` | LiteLLM Proxy image repository | `ghcr.io/berriai/litellm` |
| `image.pullPolicy` | LiteLLM Proxy image pull policy | `IfNotPresent` |
| `image.tag` | Overrides the image tag; the default is the latest version of LiteLLM at the time this chart was published. | `""` |
| `image.dbReadyImage` | On Pod startup, an initContainer is used to make sure the Postgres database is available before attempting to start LiteLLM. This field specifies the image to use as that initContainer. | `docker.io/bitnami/postgresql` |
| `image.dbReadyTag` | Tag for the above image. If not specified, "latest" is used. | `""` |
| `imagePullSecrets` | Registry credentials for the LiteLLM and initContainer images. | `[]` |
| `serviceAccount.create` | Whether or not to create a Kubernetes Service Account for this deployment. The default is `false` because LiteLLM has no need to access the Kubernetes API. | `false` |
| `service.type` | Kubernetes Service type (e.g. `LoadBalancer`, `ClusterIP`, etc.) | `ClusterIP` |
| `service.port` | TCP port that the Kubernetes Service will listen on. Also the TCP port within the Pod that the proxy will listen on. | `8000` |
| `ingress.*` | See [values.yaml](./values.yaml) for example settings | N/A |
| `proxy_config.*` | See [values.yaml](./values.yaml) for default settings. See [example_config_yaml](../../../litellm/proxy/example_config_yaml/) for configuration examples. | N/A |
#### Example `environmentSecrets` Secret
```
apiVersion: v1
kind: Secret
metadata:
name: litellm-envsecrets
data:
AZURE_OPENAI_API_KEY: TXlTZWN1cmVLM3k=
type: Opaque
```
### LiteLLM Admin UI Settings
| Name | Description | Value |
| ---------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----- |
| `ui.enabled` | Should the LiteLLM Admin UI be deployed | `true` |
| `ui.replicaCount` | The number of LiteLLM Admin UI pods to be deployed | `1` |
| `ui.image.repository` | LiteLLM Admin UI image repository | `ghcr.io/berriai/litellm` |
| `ui.image.pullPolicy` | LiteLLM Admin UI image pull policy | `IfNotPresent` |
| `ui.image.tag` | Overrides the image tag; the default is the latest version of LiteLLM at the time this chart was published. | `""` |
| `ui.imagePullSecrets` | Registry credentials for the above images. | `[]` |
| `ui.service.type` | Kubernetes Service type (e.g. `LoadBalancer`, `ClusterIP`, etc.) | `ClusterIP` |
| `ui.service.port` | TCP port that the Kubernetes Service will listen on. Also the TCP port within the Pod that the web server will listen on. | `8000` |
| `ui.ingress.*` | See [values.yaml](./values.yaml) for example settings | N/A |
### Database Settings
| Name | Description | Value |
| ---------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----- |
| `db.useExisting` | Use an existing Postgres database. A Kubernetes Secret object must exist that contains credentials for connecting to the database. An example secret object definition is provided below. | `false` |
| `db.endpoint` | If `db.useExisting` is `true`, this is the IP, Hostname or Service Name of the Postgres server to connect to. | `localhost` |
| `db.database` | If `db.useExisting` is `true`, the name of the existing database to connect to. | `litellm` |
| `db.secret.name` | If `db.useExisting` is `true`, the name of the Kubernetes Secret that contains credentials. | `postgres` |
| `db.secret.usernameKey` | If `db.useExisting` is `true`, the name of the key within the Kubernetes Secret that holds the username for authenticating with the Postgres instance. | `username` |
| `db.secret.passwordKey` | If `db.useExisting` is `true`, the name of the key within the Kubernetes Secret that holds the password associated with the above user. | `password` |
| `db.useStackgresOperator` | Not yet implemented. | `false` |
| `db.deployStandalone` | Deploy a standalone, single instance deployment of Postgres, using the Bitnami postgresql chart. This is useful for getting started but doesn't provide HA or (by default) data backups. | `true` |
| `postgresql.*` | If `db.deployStandalone` is `true`, configuration passed to the Bitnami postgresql chart. See the [Bitnami Documentation](https://github.com/bitnami/charts/tree/main/bitnami/postgresql) for full configuration details. See [values.yaml](./values.yaml) for the default configuration. | See [values.yaml](./values.yaml) |
| `postgresql.auth.*` | If `db.deployStandalone` is `true`, care should be taken to ensure the default `password` and `postgres-password` values are **NOT** used. | `NoTaGrEaTpAsSwOrD` |
#### Example Postgres `db.useExisting` Secret
```yaml
apiVersion: v1
kind: Secret
metadata:
name: postgres
data:
# Password for the "postgres" user
postgres-password: <some secure password, base64 encoded>
username: litellm
password: <some secure password, base64 encoded>
type: Opaque
```
## Accessing the Admin UI
When browsing to the URL published per the settings in `ui.ingress.*`, you will
be prompted for **Admin Configuration**. The **Proxy Endpoint** is the internal
(from the `litellm-ui` pod's perspective) URL published by the `litellm-proxy`
Kubernetes Service. If the deployment uses the default settings for this
service, the **Proxy Endpoint** should be set to `http://litellm-proxy:8000`.
The **Proxy Key** is the value specified for `masterkey` or, if no `masterkey`
was provided on the helm command line, a randomly generated string stored in
the `litellm-masterkey` Kubernetes Secret.
```bash
kubectl -n litellm get secret litellm-masterkey -o jsonpath="{.data.masterkey}"
```
## Admin UI Limitations
At the time of writing, the Admin UI is unable to add models. This is because
it would need to update the `config.yaml` file, which is exposed as a ConfigMap and
is therefore read-only. This is a limitation of this Helm chart, not of the Admin UI
itself.


@ -0,0 +1,22 @@
1. Get the application URL by running these commands:
{{- if .Values.ingress.enabled }}
{{- range $host := .Values.ingress.hosts }}
{{- range .paths }}
http{{ if $.Values.ingress.tls }}s{{ end }}://{{ $host.host }}{{ .path }}
{{- end }}
{{- end }}
{{- else if contains "NodePort" .Values.service.type }}
export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ include "litellm.fullname" . }})
export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}")
echo http://$NODE_IP:$NODE_PORT
{{- else if contains "LoadBalancer" .Values.service.type }}
NOTE: It may take a few minutes for the LoadBalancer IP to be available.
You can watch its status by running 'kubectl get --namespace {{ .Release.Namespace }} svc -w {{ include "litellm.fullname" . }}'
export SERVICE_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ include "litellm.fullname" . }} --template "{{"{{ range (index .status.loadBalancer.ingress 0) }}{{.}}{{ end }}"}}")
echo http://$SERVICE_IP:{{ .Values.service.port }}
{{- else if contains "ClusterIP" .Values.service.type }}
export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app.kubernetes.io/name={{ include "litellm.name" . }},app.kubernetes.io/instance={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}")
export CONTAINER_PORT=$(kubectl get pod --namespace {{ .Release.Namespace }} $POD_NAME -o jsonpath="{.spec.containers[0].ports[0].containerPort}")
echo "Visit http://127.0.0.1:8080 to use your application"
kubectl --namespace {{ .Release.Namespace }} port-forward $POD_NAME 8080:$CONTAINER_PORT
{{- end }}


@ -0,0 +1,74 @@
{{/*
Expand the name of the chart.
*/}}
{{- define "litellm.name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
{{- end }}
{{/*
Create a default fully qualified app name.
We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
If release name contains chart name it will be used as a full name.
*/}}
{{- define "litellm.fullname" -}}
{{- if .Values.fullnameOverride }}
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- $name := default .Chart.Name .Values.nameOverride }}
{{- if contains $name .Release.Name }}
{{- .Release.Name | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
{{- end }}
{{- end }}
{{- end }}
{{/*
Create chart name and version as used by the chart label.
*/}}
{{- define "litellm.chart" -}}
{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
{{- end }}
{{/*
Common labels
*/}}
{{- define "litellm.labels" -}}
helm.sh/chart: {{ include "litellm.chart" . }}
{{ include "litellm.selectorLabels" . }}
{{- if .Chart.AppVersion }}
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
{{- end }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
{{- end }}
{{- define "litellm.ui.labels" -}}
helm.sh/chart: {{ include "litellm.chart" . }}
{{ include "litellm.ui.selectorLabels" . }}
{{- if .Chart.AppVersion }}
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
{{- end }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
{{- end }}
{{/*
Selector labels
*/}}
{{- define "litellm.selectorLabels" -}}
app.kubernetes.io/name: {{ include "litellm.name" . }}
app.kubernetes.io/instance: {{ .Release.Name }}
{{- end }}
{{- define "litellm.ui.selectorLabels" -}}
app.kubernetes.io/name: {{ include "litellm.name" . }}-ui
app.kubernetes.io/instance: {{ .Release.Name }}
{{- end }}
{{/*
Create the name of the service account to use
*/}}
{{- define "litellm.serviceAccountName" -}}
{{- if .Values.serviceAccount.create }}
{{- default (include "litellm.fullname" .) .Values.serviceAccount.name }}
{{- else }}
{{- default "default" .Values.serviceAccount.name }}
{{- end }}
{{- end }}


@ -0,0 +1,7 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: {{ include "litellm.fullname" . }}-config
data:
config.yaml: |
{{ .Values.proxy_config | toYaml | indent 6 }}


@ -0,0 +1,230 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: {{ include "litellm.fullname" . }}-proxy
labels:
{{- include "litellm.labels" . | nindent 4 }}
spec:
{{- if not .Values.autoscaling.enabled }}
replicas: {{ .Values.replicaCount }}
{{- end }}
selector:
matchLabels:
{{- include "litellm.selectorLabels" . | nindent 6 }}
template:
metadata:
{{- with .Values.podAnnotations }}
annotations:
{{- toYaml . | nindent 8 }}
{{- end }}
labels:
{{- include "litellm.labels" . | nindent 8 }}
{{- with .Values.podLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
spec:
{{- with .Values.imagePullSecrets }}
imagePullSecrets:
{{- toYaml . | nindent 8 }}
{{- end }}
serviceAccountName: {{ include "litellm.serviceAccountName" . }}
securityContext:
{{- toYaml .Values.podSecurityContext | nindent 8 }}
initContainers:
- name: db-ready
securityContext:
{{- toYaml .Values.securityContext | nindent 12 }}
image: "docker.io/bitnami/postgresql:16.1.0-debian-11-r20"
imagePullPolicy: {{ .Values.image.pullPolicy }}
env:
{{- if .Values.db.deployStandalone }}
- name: DATABASE_USERNAME
valueFrom:
secretKeyRef:
name: {{ include "litellm.name" . }}-dbcredentials
key: username
- name: PGPASSWORD
valueFrom:
secretKeyRef:
name: {{ include "litellm.name" . }}-dbcredentials
key: password
- name: DATABASE_HOST
value: {{ .Release.Name }}-postgresql
- name: DATABASE_NAME
value: litellm
{{- else if .Values.db.useExisting }}
- name: DATABASE_USERNAME
valueFrom:
secretKeyRef:
name: {{ .Values.db.secret.name }}
key: {{ .Values.db.secret.usernameKey }}
- name: PGPASSWORD
valueFrom:
secretKeyRef:
name: {{ .Values.db.secret.name }}
key: {{ .Values.db.secret.passwordKey }}
- name: DATABASE_HOST
value: {{ .Values.db.endpoint }}
- name: DATABASE_NAME
value: litellm
{{- end }}
command:
- sh
- -c
- |
# Maximum wait time will be (limit * 2) seconds.
limit=60
current=0
ret=1
while [ $current -lt $limit ] && [ $ret -ne 0 ]; do
echo "Waiting for database to be ready $current"
psql -U $(DATABASE_USERNAME) -h $(DATABASE_HOST) -l
ret=$?
current=$(( $current + 1 ))
sleep 2
done
if [ $ret -eq 0 ]; then
echo "Database is ready"
else
echo "Database failed to become ready before we gave up waiting."
fi
{{ if .Values.securityContext.readOnlyRootFilesystem }}
volumeMounts:
- name: tmp
mountPath: /tmp
{{ end }}
containers:
- name: {{ include "litellm.name" . }}
securityContext:
{{- toYaml .Values.securityContext | nindent 12 }}
image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default (printf "main-%s" .Chart.AppVersion) }}"
imagePullPolicy: {{ .Values.image.pullPolicy }}
env:
- name: HOST
value: "0.0.0.0"
- name: PORT
value: {{ .Values.service.port | quote}}
{{- if .Values.db.deployStandalone }}
- name: DATABASE_USERNAME
valueFrom:
secretKeyRef:
name: {{ include "litellm.name" . }}-dbcredentials
key: username
- name: DATABASE_PASSWORD
valueFrom:
secretKeyRef:
name: {{ include "litellm.name" . }}-dbcredentials
key: password
- name: DATABASE_HOST
value: {{ .Release.Name }}-postgresql
- name: DATABASE_NAME
value: litellm
{{- else if .Values.db.useExisting }}
- name: DATABASE_USERNAME
valueFrom:
secretKeyRef:
name: {{ .Values.db.secret.name }}
key: {{ .Values.db.secret.usernameKey }}
- name: DATABASE_PASSWORD
valueFrom:
secretKeyRef:
name: {{ .Values.db.secret.name }}
key: {{ .Values.db.secret.passwordKey }}
- name: DATABASE_HOST
value: {{ .Values.db.endpoint }}
- name: DATABASE_NAME
value: {{ .Values.db.database }}
{{- end }}
- name: DATABASE_URL
value: "postgresql://$(DATABASE_USERNAME):$(DATABASE_PASSWORD)@$(DATABASE_HOST)/$(DATABASE_NAME)"
- name: PROXY_MASTER_KEY
valueFrom:
secretKeyRef:
name: {{ include "litellm.name" . }}-masterkey
key: masterkey
envFrom:
{{- range .Values.environmentSecrets }}
- secretRef:
name: {{ . }}
{{- end }}
args:
- --config
- /etc/litellm/config.yaml
# command:
# - bash
# - -c
# - |
# ls -la /etc/litellm/; cat /etc/litellm/config.yaml; export
# find / 2>/dev/null | grep -v -e '^/proc' -e '^/sys' -e '^/dev' >/tmp/before.list
# prisma generate
# find / 2>/dev/null | grep -v -e '^/proc' -e '^/sys' -e '^/dev' >/tmp/after.list
# diff -ruN /tmp/before.list /tmp/after.list
# sleep 3600
ports:
- name: http
containerPort: {{ .Values.service.port }}
protocol: TCP
livenessProbe:
httpGet:
path: /health/liveliness
port: http
readinessProbe:
httpGet:
path: /health/readiness
port: http
# Give the container time to start up. Up to 5 minutes (10 * 30 seconds)
startupProbe:
httpGet:
path: /health/readiness
port: http
failureThreshold: 30
periodSeconds: 10
resources:
{{- toYaml .Values.resources | nindent 12 }}
volumeMounts:
- name: litellm-config
mountPath: /etc/litellm/
{{ if .Values.securityContext.readOnlyRootFilesystem }}
- name: tmp
mountPath: /tmp
- name: cache
mountPath: /.cache
- name: npm
mountPath: /.npm
{{- end }}
{{- with .Values.volumeMounts }}
{{- toYaml . | nindent 12 }}
{{- end }}
volumes:
{{ if .Values.securityContext.readOnlyRootFilesystem }}
- name: tmp
emptyDir:
sizeLimit: 500Mi
- name: cache
emptyDir:
sizeLimit: 500Mi
- name: npm
emptyDir:
sizeLimit: 500Mi
{{- end }}
- name: litellm-config
configMap:
name: {{ include "litellm.fullname" . }}-config
items:
- key: "config.yaml"
path: "config.yaml"
{{- with .Values.volumes }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.nodeSelector }}
nodeSelector:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.affinity }}
affinity:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.tolerations }}
tolerations:
{{- toYaml . | nindent 8 }}
{{- end }}


@ -0,0 +1,89 @@
{{- if .Values.ui.enabled -}}
apiVersion: apps/v1
kind: Deployment
metadata:
name: {{ include "litellm.fullname" . }}-ui
labels:
{{- include "litellm.labels" . | nindent 4 }}
spec:
{{- if not .Values.ui.autoscaling.enabled }}
replicas: {{ .Values.ui.replicaCount }}
{{- end }}
selector:
matchLabels:
{{- include "litellm.ui.selectorLabels" . | nindent 6 }}
template:
metadata:
{{- with .Values.podAnnotations }}
annotations:
{{- toYaml . | nindent 8 }}
{{- end }}
labels:
{{- include "litellm.ui.labels" . | nindent 8 }}
{{- with .Values.ui.podLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
spec:
{{- with .Values.imagePullSecrets }}
imagePullSecrets:
{{- toYaml . | nindent 8 }}
{{- end }}
serviceAccountName: {{ include "litellm.serviceAccountName" . }}
securityContext:
{{- toYaml .Values.ui.podSecurityContext | nindent 8 }}
containers:
- name: {{ include "litellm.name" . }}-ui
securityContext:
{{- toYaml .Values.ui.securityContext | nindent 12 }}
image: "{{ .Values.ui.image.repository }}:{{ .Values.ui.image.tag | default (printf "main-%s" .Chart.AppVersion) }}"
imagePullPolicy: {{ .Values.ui.image.pullPolicy }}
env:
- name: BASE_URL
value: {{ (index .Values.ui.ingress.hosts 0).host | default "example.com" }}
ports:
- name: http
containerPort: {{ .Values.ui.service.port }}
protocol: TCP
livenessProbe:
httpGet:
path: /
port: http
readinessProbe:
httpGet:
path: /
port: http
# Give the container time to start up. Up to 5 minutes (10 * 30 seconds)
startupProbe:
httpGet:
path: /
port: http
failureThreshold: 30
periodSeconds: 10
resources:
{{- toYaml .Values.ui.resources | nindent 12 }}
volumeMounts:
- name: tmp
mountPath: /tmp
{{- with .Values.ui.volumeMounts }}
{{- toYaml . | nindent 12 }}
{{- end }}
volumes:
- name: tmp
emptyDir:
sizeLimit: 500Mi
{{- with .Values.ui.volumes }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.ui.nodeSelector }}
nodeSelector:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.ui.affinity }}
affinity:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.ui.tolerations }}
tolerations:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end -}}


@ -0,0 +1,32 @@
{{- if .Values.autoscaling.enabled }}
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
name: {{ include "litellm.fullname" . }}
labels:
{{- include "litellm.labels" . | nindent 4 }}
spec:
scaleTargetRef:
apiVersion: apps/v1
kind: Deployment
name: {{ include "litellm.fullname" . }}
minReplicas: {{ .Values.autoscaling.minReplicas }}
maxReplicas: {{ .Values.autoscaling.maxReplicas }}
metrics:
{{- if .Values.autoscaling.targetCPUUtilizationPercentage }}
- type: Resource
resource:
name: cpu
target:
type: Utilization
averageUtilization: {{ .Values.autoscaling.targetCPUUtilizationPercentage }}
{{- end }}
{{- if .Values.autoscaling.targetMemoryUtilizationPercentage }}
- type: Resource
resource:
name: memory
target:
type: Utilization
averageUtilization: {{ .Values.autoscaling.targetMemoryUtilizationPercentage }}
{{- end }}
{{- end }}


@ -0,0 +1,61 @@
{{- if .Values.ingress.enabled -}}
{{- $fullName := (printf "%s%s" (include "litellm.fullname" .) "-proxy") -}}
{{- $svcPort := .Values.service.port -}}
{{- if and .Values.ingress.className (not (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion)) }}
{{- if not (hasKey .Values.ingress.annotations "kubernetes.io/ingress.class") }}
{{- $_ := set .Values.ingress.annotations "kubernetes.io/ingress.class" .Values.ingress.className}}
{{- end }}
{{- end }}
{{- if semverCompare ">=1.19-0" .Capabilities.KubeVersion.GitVersion -}}
apiVersion: networking.k8s.io/v1
{{- else if semverCompare ">=1.14-0" .Capabilities.KubeVersion.GitVersion -}}
apiVersion: networking.k8s.io/v1beta1
{{- else -}}
apiVersion: extensions/v1beta1
{{- end }}
kind: Ingress
metadata:
name: {{ $fullName }}
labels:
{{- include "litellm.labels" . | nindent 4 }}
{{- with .Values.ingress.annotations }}
annotations:
{{- toYaml . | nindent 4 }}
{{- end }}
spec:
{{- if and .Values.ingress.className (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion) }}
ingressClassName: {{ .Values.ingress.className }}
{{- end }}
{{- if .Values.ingress.tls }}
tls:
{{- range .Values.ingress.tls }}
- hosts:
{{- range .hosts }}
- {{ . | quote }}
{{- end }}
secretName: {{ .secretName }}
{{- end }}
{{- end }}
rules:
{{- range .Values.ingress.hosts }}
- host: {{ .host | quote }}
http:
paths:
{{- range .paths }}
- path: {{ .path }}
{{- if and .pathType (semverCompare ">=1.18-0" $.Capabilities.KubeVersion.GitVersion) }}
pathType: {{ .pathType }}
{{- end }}
backend:
{{- if semverCompare ">=1.19-0" $.Capabilities.KubeVersion.GitVersion }}
service:
name: {{ $fullName }}
port:
number: {{ $svcPort }}
{{- else }}
serviceName: {{ $fullName }}
servicePort: {{ $svcPort }}
{{- end }}
{{- end }}
{{- end }}
{{- end }}


@ -0,0 +1,61 @@
{{- if .Values.ui.ingress.enabled -}}
{{- $fullName := (printf "%s%s" (include "litellm.fullname" .) "-ui") -}}
{{- $svcPort := .Values.ui.service.port -}}
{{- if and .Values.ui.ingress.className (not (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion)) }}
{{- if not (hasKey .Values.ui.ingress.annotations "kubernetes.io/ingress.class") }}
{{- $_ := set .Values.ui.ingress.annotations "kubernetes.io/ingress.class" .Values.ui.ingress.className}}
{{- end }}
{{- end }}
{{- if semverCompare ">=1.19-0" .Capabilities.KubeVersion.GitVersion -}}
apiVersion: networking.k8s.io/v1
{{- else if semverCompare ">=1.14-0" .Capabilities.KubeVersion.GitVersion -}}
apiVersion: networking.k8s.io/v1beta1
{{- else -}}
apiVersion: extensions/v1beta1
{{- end }}
kind: Ingress
metadata:
name: {{ $fullName }}
labels:
{{- include "litellm.ui.labels" . | nindent 4 }}
{{- with .Values.ui.ingress.annotations }}
annotations:
{{- toYaml . | nindent 4 }}
{{- end }}
spec:
{{- if and .Values.ui.ingress.className (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion) }}
ingressClassName: {{ .Values.ui.ingress.className }}
{{- end }}
{{- if .Values.ui.ingress.tls }}
tls:
{{- range .Values.ui.ingress.tls }}
- hosts:
{{- range .hosts }}
- {{ . | quote }}
{{- end }}
secretName: {{ .secretName }}
{{- end }}
{{- end }}
rules:
{{- range .Values.ui.ingress.hosts }}
- host: {{ .host | quote }}
http:
paths:
{{- range .paths }}
- path: {{ .path }}
{{- if and .pathType (semverCompare ">=1.18-0" $.Capabilities.KubeVersion.GitVersion) }}
pathType: {{ .pathType }}
{{- end }}
backend:
{{- if semverCompare ">=1.19-0" $.Capabilities.KubeVersion.GitVersion }}
service:
name: {{ $fullName }}
port:
number: {{ $svcPort }}
{{- else }}
serviceName: {{ $fullName }}
servicePort: {{ $svcPort }}
{{- end }}
{{- end }}
{{- end }}
{{- end }}


@ -0,0 +1,12 @@
{{- if .Values.db.deployStandalone -}}
apiVersion: v1
kind: Secret
metadata:
name: {{ include "litellm.name" . }}-dbcredentials
data:
# Password for the "postgres" user
postgres-password: {{ ( index .Values.postgresql.auth "postgres-password") | default "litellm" | b64enc }}
username: {{ .Values.postgresql.auth.username | default "litellm" | b64enc }}
password: {{ .Values.postgresql.auth.password | default "litellm" | b64enc }}
type: Opaque
{{- end -}}


@ -0,0 +1,8 @@
{{ $masterkey := (.Values.masterkey | default (randAlphaNum 17)) }}
apiVersion: v1
kind: Secret
metadata:
name: {{ include "litellm.name" . }}-masterkey
data:
masterkey: {{ $masterkey | b64enc }}
type: Opaque


@ -0,0 +1,15 @@
apiVersion: v1
kind: Service
metadata:
name: {{ include "litellm.fullname" . }}-proxy
labels:
{{- include "litellm.labels" . | nindent 4 }}
spec:
type: {{ .Values.service.type }}
ports:
- port: {{ .Values.service.port }}
targetPort: http
protocol: TCP
name: http
selector:
{{- include "litellm.selectorLabels" . | nindent 4 }}


@ -0,0 +1,17 @@
{{- if .Values.ui.enabled -}}
apiVersion: v1
kind: Service
metadata:
name: {{ include "litellm.fullname" . }}-ui
labels:
{{- include "litellm.labels" . | nindent 4 }}
spec:
type: {{ .Values.ui.service.type }}
ports:
- port: {{ .Values.ui.service.port }}
targetPort: http
protocol: TCP
name: http
selector:
{{- include "litellm.ui.selectorLabels" . | nindent 4 }}
{{ end -}}


@ -0,0 +1,13 @@
{{- if .Values.serviceAccount.create -}}
apiVersion: v1
kind: ServiceAccount
metadata:
name: {{ include "litellm.serviceAccountName" . }}
labels:
{{- include "litellm.labels" . | nindent 4 }}
{{- with .Values.serviceAccount.annotations }}
annotations:
{{- toYaml . | nindent 4 }}
{{- end }}
automountServiceAccountToken: {{ .Values.serviceAccount.automount }}
{{- end }}


@ -0,0 +1,15 @@
apiVersion: v1
kind: Pod
metadata:
name: "{{ include "litellm.fullname" . }}-test-connection"
labels:
{{- include "litellm.labels" . | nindent 4 }}
annotations:
"helm.sh/hook": test
spec:
containers:
- name: wget
image: busybox
command: ['wget']
args: ['{{ include "litellm.fullname" . }}:{{ .Values.service.port }}']
restartPolicy: Never


@ -0,0 +1,219 @@
# Default values for litellm.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.
replicaCount: 1
image:
repository: ghcr.io/berriai/litellm
pullPolicy: IfNotPresent
# Overrides the image tag whose default is the chart appVersion.
# tag: "main-latest"
tag: ""
# Image and tag used for the init container to check and wait for the
# readiness of the postgres database.
dbReadyImage: docker.io/bitnami/postgresql
dbReadyTag: ""
imagePullSecrets: []
nameOverride: "litellm"
fullnameOverride: ""
serviceAccount:
# Specifies whether a service account should be created
create: false
# Automatically mount a ServiceAccount's API credentials?
automount: true
# Annotations to add to the service account
annotations: {}
# The name of the service account to use.
# If not set and create is true, a name is generated using the fullname template
name: ""
podAnnotations: {}
podLabels: {}
# At the time of writing, the litellm docker image requires write access to the
# filesystem on startup so that prisma can install some dependencies.
podSecurityContext: {}
securityContext: {}
# capabilities:
# drop:
# - ALL
# readOnlyRootFilesystem: false
# runAsNonRoot: true
# runAsUser: 1000
# A list of Kubernetes Secret objects that will be exported to the LiteLLM proxy
# pod as environment variables. These secrets can then be referenced in the
# configuration file (or "litellm" ConfigMap) with `os.environ/<Env Var Name>`
environmentSecrets: []
# - litellm-envsecrets
service:
type: ClusterIP
port: 8000
ingress:
enabled: true
className: "nginx"
annotations: {}
# kubernetes.io/ingress.class: nginx
# kubernetes.io/tls-acme: "true"
hosts:
- host: api.example.local
paths:
- path: /
pathType: ImplementationSpecific
tls: []
# - secretName: chart-example-tls
# hosts:
# - chart-example.local
# The elements within proxy_config are rendered as config.yaml for the proxy
# Examples: https://github.com/BerriAI/litellm/tree/main/litellm/proxy/example_config_yaml
# Reference: https://docs.litellm.ai/docs/proxy/configs
proxy_config:
model_list:
# At least one model must exist for the proxy to start.
- model_name: gpt-3.5-turbo
litellm_params:
model: gpt-3.5-turbo
api_key: eXaMpLeOnLy
general_settings:
master_key: os.environ/PROXY_MASTER_KEY
resources: {}
# We usually recommend not to specify default resources and to leave this as a conscious
# choice for the user. This also increases chances charts run on environments with little
# resources, such as Minikube. If you do want to specify resources, uncomment the following
# lines, adjust them as necessary, and remove the curly braces after 'resources:'.
# limits:
# cpu: 100m
# memory: 128Mi
# requests:
# cpu: 100m
# memory: 128Mi
autoscaling:
enabled: false
minReplicas: 1
maxReplicas: 100
targetCPUUtilizationPercentage: 80
# targetMemoryUtilizationPercentage: 80
# Additional volumes on the output Deployment definition.
volumes: []
# - name: foo
# secret:
# secretName: mysecret
# optional: false
# Additional volumeMounts on the output Deployment definition.
volumeMounts: []
# - name: foo
# mountPath: "/etc/foo"
# readOnly: true
nodeSelector: {}
tolerations: []
affinity: {}
db:
# Use an existing postgres server/cluster
useExisting: false
# How to connect to the existing postgres server/cluster
endpoint: localhost
database: litellm
secret:
name: postgres
usernameKey: username
passwordKey: password
# Use the Stackgres Helm chart to deploy an instance of a Stackgres cluster.
# The Stackgres Operator must already be installed within the target
# Kubernetes cluster.
# TODO: Stackgres deployment currently unsupported
useStackgresOperator: false
# Use the Postgres Helm chart to create a single node, stand alone postgres
# instance. See the "postgresql" top level key for additional configuration.
deployStandalone: true
# Settings for Bitnami postgresql chart (if db.deployStandalone is true, ignored
# otherwise)
postgresql:
architecture: standalone
auth:
username: litellm
database: litellm
# You should override these on the helm command line with
# `--set postgresql.auth.postgres-password=<some good password>,postgresql.auth.password=<some good password>`
password: NoTaGrEaTpAsSwOrD
postgres-password: NoTaGrEaTpAsSwOrD
# A secret is created by this chart (litellm-helm) with the credentials that
# the new Postgres instance should use.
existingSecret: litellm-dbcredentials
secretKeys:
userPasswordKey: password
ui:
enabled: true
replicaCount: 1
autoscaling:
enabled: false
image:
repository: ghcr.io/berriai/litellm-ui
pullPolicy: IfNotPresent
# Overrides the image tag whose default is the chart appVersion.
# tag: "main-latest"
# TODO: Switch to BerriAI repo and tags if/when they provide a ui image
# https://github.com/BerriAI/litellm/pull/1505
tag: ""
service:
type: ClusterIP
port: 8501
ingress:
enabled: true
className: "nginx"
annotations: {}
hosts:
- host: ui.example.local
paths:
- path: /
pathType: ImplementationSpecific
tls: []
podAnnotations: {}
podLabels: {}
podSecurityContext:
fsGroup: 1000
securityContext:
capabilities:
drop:
- ALL
readOnlyRootFilesystem: true
runAsNonRoot: true
runAsUser: 1000
resources: {}
volumes: []
volumeMounts: []
nodeSelector: {}
tolerations: []
affinity: {}


@ -1,12 +0,0 @@
version: "3.9"
services:
litellm:
image: ghcr.io/berriai/litellm:main
ports:
- "8000:8000" # Map the container port to the host, change the host port if necessary
volumes:
- ./litellm-config.yaml:/app/config.yaml # Mount the local configuration file
# You can change the port or number of workers as per your requirements, or pass any other supported CLI argument. Make sure the port passed here matches the container port defined above under `ports`
command: [ "--config", "/app/config.yaml", "--port", "8000", "--num_workers", "8" ]
# ...rest of your docker-compose config if any

docker-compose.yml Normal file

@ -0,0 +1,15 @@
version: "3.9"
services:
litellm:
image: ghcr.io/berriai/litellm:main-latest
volumes:
- ./proxy_server_config.yaml:/app/proxy_server_config.yaml # mount your litellm config.yaml
ports:
- "4000:4000"
environment:
- AZURE_API_KEY=sk-123
litellm-ui:
image: ghcr.io/berriai/litellm-ui:main-latest
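Once this compose file is up, the `litellm` service listens on port 4000 and speaks the OpenAI API, so it can be exercised with the standard `openai` client. A minimal sketch, assuming a `gpt-3.5-turbo` entry exists in the mounted `proxy_server_config.yaml` and `sk-1234` is accepted as a key by the proxy:
```python
import openai

client = openai.OpenAI(
    base_url="http://localhost:4000",  # port published by the litellm service above
    api_key="sk-1234",                 # placeholder proxy key
)

response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hello from docker-compose!"}],
)
print(response.choices[0].message.content)
```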


@ -1,11 +1,17 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Caching - In-Memory, Redis, s3
# Caching - In-Memory, Redis, s3, Redis Semantic Cache
[**See Code**](https://github.com/BerriAI/litellm/blob/main/litellm/caching.py)
## Initialize Cache - In Memory, Redis, s3 Bucket
:::info
Need to use Caching on LiteLLM Proxy Server? Doc here: [Caching Proxy Server](https://docs.litellm.ai/docs/proxy/caching)
:::
## Initialize Cache - In Memory, Redis, s3 Bucket, Redis Semantic Cache
<Tabs>
@ -18,7 +24,7 @@ pip install redis
```
For the hosted version you can set up your own Redis DB here: https://app.redislabs.com/
### Quick Start
```python
import litellm
from litellm import completion
@ -55,7 +61,7 @@ Set AWS environment variables
AWS_ACCESS_KEY_ID = "AKI*******"
AWS_SECRET_ACCESS_KEY = "WOl*****"
```
### Quick Start
```python
import litellm
from litellm import completion
@ -80,6 +86,66 @@ response2 = completion(
</TabItem>
<TabItem value="redis-sem" label="redis-semantic cache">
Install redisvl
```shell
pip install redisvl==0.0.7
```
For the hosted version you can set up your own Redis DB here: https://app.redislabs.com/
```python
import os
import random
import litellm
from litellm import completion
from litellm.caching import Cache
random_number = random.randint(
1, 100000
) # add a random number to ensure it's always adding / reading from cache
print("testing semantic caching")
litellm.cache = Cache(
type="redis-semantic",
host=os.environ["REDIS_HOST"],
port=os.environ["REDIS_PORT"],
password=os.environ["REDIS_PASSWORD"],
similarity_threshold=0.8, # similarity threshold for cache hits, 0 == no similarity, 1 = exact matches, 0.5 == 50% similarity
redis_semantic_cache_embedding_model="text-embedding-ada-002", # this model is passed to litellm.embedding(), any litellm.embedding() model is supported here
)
response1 = completion(
model="gpt-3.5-turbo",
messages=[
{
"role": "user",
"content": f"write a one sentence poem about: {random_number}",
}
],
max_tokens=20,
)
print(f"response1: {response1}")
random_number = random.randint(1, 100000)
response2 = completion(
model="gpt-3.5-turbo",
messages=[
{
"role": "user",
"content": f"write a one sentence poem about: {random_number}",
}
],
max_tokens=20,
)
print(f"response2: {response1}")
assert response1.id == response2.id
# response1 == response2, response 1 is cached
```
</TabItem>
<TabItem value="in-mem" label="in memory cache">
### Quick Start


@ -150,5 +150,12 @@ litellm.register_model(model_cost=
"https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json")
```
**Don't pull hosted model_cost_map**
If you have firewalls and want to just use the local copy of the model cost map, you can do so like this:
```bash
export LITELLM_LOCAL_MODEL_COST_MAP="True"
```
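For example, a minimal sketch of using the local map (assuming the env var needs to be set before `litellm` is imported, and that `litellm.model_cost` exposes the loaded pricing):
```python
import os

# read pricing from the bundled local copy instead of the hosted map
os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"

import litellm

# pricing now comes from the local model cost map shipped with the package
print(litellm.model_cost["gpt-3.5-turbo"])
```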
Note: this means you will need to upgrade to get updated pricing and newer models.

View file

@ -13,8 +13,8 @@ response = embedding(model='text-embedding-ada-002', input=["good morning from l
- `model`: *string* - ID of the model to use. `model='text-embedding-ada-002'`
- `input`: *array* - Input text to embed, encoded as a string or array of tokens. To embed multiple inputs in a single request, pass an array of strings or array of token arrays. The input must not exceed the max input tokens for the model (8192 tokens for text-embedding-ada-002), cannot be an empty string, and any array must be 2048 dimensions or less.
```
- `input`: *string or array* - Input text to embed, encoded as a string or array of tokens. To embed multiple inputs in a single request, pass an array of strings or array of token arrays. The input must not exceed the max input tokens for the model (8192 tokens for text-embedding-ada-002), cannot be an empty string, and any array must be 2048 dimensions or less.
```python
input=["good morning from litellm"]
```
@ -22,7 +22,11 @@ input=["good morning from litellm"]
- `user`: *string (optional)* A unique identifier representing your end-user,
- `timeout`: *integer* - The maximum time, in seconds, to wait for the API to respond. Defaults to 600 seconds (10 minutes).
- `dimensions`: *integer (Optional)* The number of dimensions the resulting output embeddings should have. Only supported in OpenAI/Azure text-embedding-3 and later models.
- `encoding_format`: *string (Optional)* The format to return the embeddings in. Can be either `"float"` or `"base64"`. Defaults to `encoding_format="float"`
- `timeout`: *integer (Optional)* - The maximum time, in seconds, to wait for the API to respond. Defaults to 600 seconds (10 minutes).
- `api_base`: *string (optional)* - The api endpoint you want to call the model with
@ -66,11 +70,18 @@ input=["good morning from litellm"]
from litellm import embedding
import os
os.environ['OPENAI_API_KEY'] = ""
response = embedding('text-embedding-ada-002', input=["good morning from litellm"])
response = embedding(
model="text-embedding-3-small",
input=["good morning from litellm", "this is another item"],
metadata={"anything": "good day"},
dimensions=5 # Only supported in text-embedding-3 and later models.
)
```
| Model Name | Function Call | Required OS Variables |
|----------------------|---------------------------------------------|--------------------------------------|
| text-embedding-3-small | `embedding('text-embedding-3-small', input)` | `os.environ['OPENAI_API_KEY']` |
| text-embedding-3-large | `embedding('text-embedding-3-large', input)` | `os.environ['OPENAI_API_KEY']` |
| text-embedding-ada-002 | `embedding('text-embedding-ada-002', input)` | `os.environ['OPENAI_API_KEY']` |
## Azure OpenAI Embedding Models

View file

@ -0,0 +1,15 @@
# Enterprise
LiteLLM offers dedicated enterprise support.
This covers:
- **Feature Prioritization**
- **Custom Integrations**
- **Professional Support - Dedicated discord + slack**
- **Custom SLAs**
:::info
[Talk to founders](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat)
:::

View file

@ -131,3 +131,23 @@ response = image_generation(
prompt="cute baby otter"
)
```
## Bedrock - Stable Diffusion
Use this for Stable Diffusion on Bedrock
### Usage
```python
import os
from litellm import image_generation
os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""
response = image_generation(
prompt="A cute baby sea otter",
model="bedrock/stability.stable-diffusion-xl-v0",
)
print(f"response: {response}")
```

View file

@ -5,10 +5,14 @@ import TabItem from '@theme/TabItem';
https://github.com/BerriAI/litellm
import QuickStart from '../src/components/QuickStart.js'
## **Call 100+ LLMs using the same Input/Output Format**
- Translate inputs to provider's `completion`, `embedding`, and `image_generation` endpoints
- [Consistent output](https://docs.litellm.ai/docs/completion/output), text responses will always be available at `['choices'][0]['message']['content']`
- Retry/fallback logic across multiple deployments (e.g. Azure/OpenAI) - [Router](https://docs.litellm.ai/docs/routing)
- Track spend & set budgets per project [OpenAI Proxy Server](https://docs.litellm.ai/docs/simple_proxy)
## Basic usage
<a target="_blank" href="https://colab.research.google.com/github/BerriAI/litellm/blob/main/cookbook/liteLLM_Getting_Started.ipynb">
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
@ -157,9 +161,6 @@ response = completion(
messages=[{ "content": "Hello, how are you?","role": "user"}],
stream=True,
)
for chunk in response:
print(chunk)
```
</TabItem>
@ -177,9 +178,6 @@ response = completion(
messages=[{ "content": "Hello, how are you?","role": "user"}],
stream=True,
)
for chunk in response:
print(chunk)
```
</TabItem>
@ -199,9 +197,6 @@ response = completion(
messages=[{ "content": "Hello, how are you?","role": "user"}],
stream=True,
)
for chunk in response:
print(chunk)
```
</TabItem>
@ -222,9 +217,7 @@ response = completion(
stream=True,
)
for chunk in response:
print(chunk)
print(response)
```
</TabItem>
@ -246,9 +239,6 @@ response = completion(
messages = [{ "content": "Hello, how are you?","role": "user"}],
stream=True,
)
for chunk in response:
print(chunk)
```
</TabItem>
@ -265,9 +255,6 @@ response = completion(
api_base="http://localhost:11434",
stream=True,
)
for chunk in response:
print(chunk)
```
</TabItem>
<TabItem value="or" label="Openrouter">
@ -284,9 +271,6 @@ response = completion(
messages = [{ "content": "Hello, how are you?","role": "user"}],
stream=True,
)
for chunk in response:
print(chunk)
```
</TabItem>
@ -327,34 +311,8 @@ litellm.success_callback = ["langfuse", "llmonitor"] # log input/output to langf
response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}])
```
## Calculate Costs, Usage, Latency
Pass the completion response to `litellm.completion_cost(completion_response=response)` and get the cost
```python
from litellm import completion, completion_cost
import os
os.environ["OPENAI_API_KEY"] = "your-api-key"
response = completion(
model="gpt-3.5-turbo",
messages=[{ "content": "Hello, how are you?","role": "user"}]
)
cost = completion_cost(completion_response=response)
print("Cost for completion call with gpt-3.5-turbo: ", f"${float(cost):.10f}")
```
**Output**
```shell
Cost for completion call with gpt-3.5-turbo: $0.0000775000
```
### Track Costs, Usage, Latency for streaming
We use a custom callback function for this - more info on custom callbacks: https://docs.litellm.ai/docs/observability/custom_callback
- We define a callback function to calculate cost `def track_cost_callback()`
- In `def track_cost_callback()` we check if the stream is complete - `if "complete_streaming_response" in kwargs`
- Use `litellm.completion_cost()` to calculate cost, once the stream is complete
## Track Costs, Usage, Latency for streaming
Use a callback function for this - more info on custom callbacks: https://docs.litellm.ai/docs/observability/custom_callback
```python
import litellm
@ -366,18 +324,8 @@ def track_cost_callback(
start_time, end_time # start/end time
):
try:
# check if it has collected an entire stream response
if "complete_streaming_response" in kwargs:
# for tracking streaming cost we pass the "messages" and the output_text to litellm.completion_cost
completion_response=kwargs["complete_streaming_response"]
input_text = kwargs["messages"]
output_text = completion_response["choices"][0]["message"]["content"]
response_cost = litellm.completion_cost(
model = kwargs["model"],
messages = input_text,
completion=output_text
)
print("streaming response_cost", response_cost)
response_cost = kwargs.get("response_cost", 0)
print("streaming response_cost", response_cost)
except:
pass
# set callback
@ -400,6 +348,8 @@ response = completion(
Track spend across multiple projects/people
![ui_3](https://github.com/BerriAI/litellm/assets/29436595/47c97d5e-b9be-4839-b28c-43d7f4f10033)
The proxy provides:
1. [Hooks for auth](https://docs.litellm.ai/docs/proxy/virtual_keys#custom-auth)
2. [Hooks for logging](https://docs.litellm.ai/docs/proxy/logging#step-1---create-your-custom-litellm-callback-class)
@ -436,8 +386,7 @@ response = client.chat.completions.create(model="gpt-3.5-turbo", messages = [
print(response)
```
## More details
* [exception mapping](./exception_mapping.md)
* [retries + model fallbacks for completion()](./completion/reliable_completions.md)
* [tutorial for model fallbacks with completion()](./tutorials/fallbacks.md)
* [proxy virtual keys & spend management](./tutorials/fallbacks.md)

View file

@ -27,6 +27,7 @@ Use just 2 lines of code, to instantly log your responses **across all providers
Get your Langfuse API Keys from https://cloud.langfuse.com/
```python
litellm.success_callback = ["langfuse"]
litellm.failure_callback = ["langfuse"] # logs errors to langfuse
```
```python
# pip install langfuse
@ -93,7 +94,7 @@ print(response)
```
### Set Custom Trace ID, Trace User ID
### Set Custom Trace ID, Trace User ID and Tags
Pass `trace_id`, `trace_user_id` in `metadata`
@ -122,6 +123,8 @@ response = completion(
"generation_id": "gen-id22", # set langfuse Generation ID
"trace_id": "trace-id22", # set langfuse Trace ID
"trace_user_id": "user-id2", # set langfuse Trace User ID
"session_id": "session-1", # set langfuse Session ID
"tags": ["tag1", "tag2"] # set langfuse Tags
},
)

View file

@ -74,6 +74,8 @@ response = litellm.completion(
| gpt-4-32k | `completion('azure/<your deployment name>', messages)` |
| gpt-4-32k-0314 | `completion('azure/<your deployment name>', messages)` |
| gpt-4-32k-0613 | `completion('azure/<your deployment name>', messages)` |
| gpt-4-1106-preview | `completion('azure/<your deployment name>', messages)` |
| gpt-4-0125-preview | `completion('azure/<your deployment name>', messages)` |
| gpt-3.5-turbo | `completion('azure/<your deployment name>', messages)` |
| gpt-3.5-turbo-0301 | `completion('azure/<your deployment name>', messages)` |
| gpt-3.5-turbo-0613 | `completion('azure/<your deployment name>', messages)` |

View file

@ -197,7 +197,7 @@ response = completion(
### SSO Login (AWS Profile)
- Set `AWS_PROFILE` environment variable
- Make bedrock completion call
- Make bedrock completion call
```python
import os
from litellm import completion
@ -208,11 +208,24 @@ response = completion(
)
```
### STS based Auth
or pass `aws_profile_name`:
```python
import os
from litellm import completion
response = completion(
model="bedrock/anthropic.claude-instant-v1",
messages=[{ "content": "Hello, how are you?","role": "user"}],
aws_profile_name="dev-profile",
)
```
### STS based Auth
- Set `aws_role_name` and `aws_session_name` in completion() / embedding() function
Make the bedrock completion call
Make the bedrock completion call
```python
from litellm import completion
@ -315,3 +328,50 @@ print(response)
| Titan Embeddings - G1 | `embedding(model="bedrock/amazon.titan-embed-text-v1", input=input)` |
| Cohere Embeddings - English | `embedding(model="bedrock/cohere.embed-english-v3", input=input)` |
| Cohere Embeddings - Multilingual | `embedding(model="bedrock/cohere.embed-multilingual-v3", input=input)` |
## Image Generation
Use this for Stable Diffusion on Bedrock
### Usage
```python
import os
from litellm import image_generation
os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""
response = image_generation(
prompt="A cute baby sea otter",
model="bedrock/stability.stable-diffusion-xl-v0",
)
print(f"response: {response}")
```
**Set optional params**
```python
import os
from litellm import image_generation
os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""
response = image_generation(
prompt="A cute baby sea otter",
model="bedrock/stability.stable-diffusion-xl-v0",
### OPENAI-COMPATIBLE ###
size="128x512", # width=128, height=512
### PROVIDER-SPECIFIC ### see `AmazonStabilityConfig` in bedrock.py for all params
seed=30
)
print(f"response: {response}")
```
## Supported AWS Bedrock Image Generation Models
| Model Name | Function Call |
|----------------------|---------------------------------------------|
| Stable Diffusion - v0 | `image_generation(model="bedrock/stability.stable-diffusion-xl-v0", prompt=prompt)` |
| Stable Diffusion - v1 | `image_generation(model="bedrock/stability.stable-diffusion-xl-v1", prompt=prompt)` |

View file

@ -34,6 +34,7 @@ os.environ["OPENAI_API_BASE"] = "openai-api-base" # OPTIONAL
| Model Name | Function Call |
|-----------------------|-----------------------------------------------------------------|
| gpt-4-0125-preview | `response = completion(model="gpt-4-0125-preview", messages=messages)` |
| gpt-4-1106-preview | `response = completion(model="gpt-4-1106-preview", messages=messages)` |
| gpt-3.5-turbo-1106 | `response = completion(model="gpt-3.5-turbo-1106", messages=messages)` |
| gpt-3.5-turbo | `response = completion(model="gpt-3.5-turbo", messages=messages)` |
@ -173,6 +174,31 @@ response = completion(
messages=[{ "content": "Hello, how are you?","role": "user"}]
)
```
### Set `ssl_verify=False`
This is done by setting your own `httpx.Client`
- For `litellm.completion` set `litellm.client_session=httpx.Client(verify=False)`
- For `litellm.acompletion` set `litellm.aclient_session=httpx.AsyncClient(verify=False)`
```python
import litellm, httpx

# requires OPENAI_API_KEY in your env
messages = [{"role": "user", "content": "Hey, how's it going?"}]

# for completion
litellm.client_session = httpx.Client(verify=False)
response = litellm.completion(
model="gpt-3.5-turbo",
messages=messages,
)
# for acompletion - call this inside an async function and await it
litellm.aclient_session = httpx.AsyncClient(verify=False)
response = await litellm.acompletion(
model="gpt-3.5-turbo",
messages=messages,
)
```
### Using Helicone Proxy with LiteLLM
```python
import os

View file

@ -1,4 +1,4 @@
# VertexAI - Google [Gemini]
# VertexAI - Google [Gemini, Model Garden]
<a target="_blank" href="https://colab.research.google.com/github/BerriAI/litellm/blob/main/cookbook/liteLLM_VertextAI_Example.ipynb">
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
@ -20,6 +20,27 @@ litellm.vertex_location = "us-central1" # proj location
response = litellm.completion(model="gemini-pro", messages=[{"role": "user", "content": "write code for saying hi from LiteLLM"}])
```
## OpenAI Proxy Usage
1. Modify the config.yaml
```yaml
litellm_settings:
vertex_project: "hardy-device-38811" # Your Project ID
vertex_location: "us-central1" # proj location
model_list:
- model_name: team1-gemini-pro
litellm_params:
model: gemini-pro
```
2. Start the proxy
```bash
$ litellm --config /path/to/config.yaml
```
## Set Vertex Project & Vertex Location
All calls using Vertex AI require the following parameters:
* Your Project ID
@ -46,16 +67,39 @@ os.environ["VERTEXAI_LOCATION"] = "us-central1" # Your Location
# set directly on module
litellm.vertex_location = "us-central1" # Your Location
```
## Model Garden
| Model Name | Function Call |
|------------------|--------------------------------------|
| llama2 | `completion('vertex_ai/<endpoint_id>', messages)` |
#### Using Model Garden
```python
from litellm import completion
import os
## set ENV variables
os.environ["VERTEXAI_PROJECT"] = "hardy-device-38811"
os.environ["VERTEXAI_LOCATION"] = "us-central1"
response = completion(
model="vertex_ai/<your-endpoint-id>",
messages=[{ "content": "Hello, how are you?","role": "user"}]
)
```
## Gemini Pro
| Model Name | Function Call |
|------------------|--------------------------------------|
| gemini-pro | `completion('gemini-pro', messages)` |
| gemini-pro | `completion('gemini-pro', messages)`, `completion('vertex_ai/gemini-pro', messages)` |
## Gemini Pro Vision
| Model Name | Function Call |
|------------------|--------------------------------------|
| gemini-pro-vision | `completion('gemini-pro-vision', messages)` |
| gemini-pro-vision | `completion('gemini-pro-vision', messages)`, `completion('vertex_ai/gemini-pro-vision', messages)`|
#### Using Gemini Pro Vision
@ -93,6 +137,7 @@ response = litellm.completion(
print(response)
```
## Chat Models
| Model Name | Function Call |
|------------------|--------------------------------------|

View file

@ -1,6 +1,13 @@
# Slack Alerting
Get alerts for failed db read/writes, hanging api calls, failed api calls.
Get alerts for:
- hanging LLM api calls
- failed LLM api calls
- slow LLM api calls
- budget tracking per key/user:
- When a User/Key crosses their Budget
- When a User/Key is 15% away from crossing their Budget
- failed db read/writes
## Quick Start

View file

@ -7,16 +7,17 @@ Cache LLM Responses
LiteLLM supports:
- In Memory Cache
- Redis Cache
- Redis Semantic Cache
- s3 Bucket Cache
## Quick Start - Redis, s3 Cache
## Quick Start - Redis, s3 Cache, Semantic Cache
<Tabs>
<TabItem value="redis" label="redis cache">
Caching can be enabled by adding the `cache` key in the `config.yaml`
### Step 1: Add `cache` to the config.yaml
#### Step 1: Add `cache` to the config.yaml
```yaml
model_list:
- model_name: gpt-3.5-turbo
@ -31,7 +32,7 @@ litellm_settings:
cache: True # set cache responses to True, litellm defaults to using a redis cache
```
### Step 2: Add Redis Credentials to .env
#### Step 2: Add Redis Credentials to .env
Set either `REDIS_URL` or the `REDIS_HOST` in your os environment, to enable caching.
```shell
@ -49,7 +50,7 @@ REDIS_<redis-kwarg-name> = ""
```
[**See how it's read from the environment**](https://github.com/BerriAI/litellm/blob/4d7ff1b33b9991dcf38d821266290631d9bcd2dd/litellm/_redis.py#L40)
### Step 3: Run proxy with config
#### Step 3: Run proxy with config
```shell
$ litellm --config /path/to/config.yaml
```
@ -57,7 +58,7 @@ $ litellm --config /path/to/config.yaml
<TabItem value="s3" label="s3 cache">
### Step 1: Add `cache` to the config.yaml
#### Step 1: Add `cache` to the config.yaml
```yaml
model_list:
- model_name: gpt-3.5-turbo
@ -79,7 +80,57 @@ litellm_settings:
s3_endpoint_url: https://s3.amazonaws.com # [OPTIONAL] S3 endpoint URL, if you want to use Backblaze/cloudflare s3 buckets
```
### Step 2: Run proxy with config
#### Step 2: Run proxy with config
```shell
$ litellm --config /path/to/config.yaml
```
</TabItem>
<TabItem value="redis-sem" label="redis semantic cache">
Caching can be enabled by adding the `cache` key in the `config.yaml`
#### Step 1: Add `cache` to the config.yaml
```yaml
model_list:
- model_name: gpt-3.5-turbo
litellm_params:
model: gpt-3.5-turbo
- model_name: azure-embedding-model
litellm_params:
model: azure/azure-embedding-model
api_base: os.environ/AZURE_API_BASE
api_key: os.environ/AZURE_API_KEY
api_version: "2023-07-01-preview"
litellm_settings:
set_verbose: True
cache: True # set cache responses to True, litellm defaults to using a redis cache
cache_params:
type: "redis-semantic"
similarity_threshold: 0.8 # similarity threshold for semantic cache
redis_semantic_cache_embedding_model: azure-embedding-model # set this to a model_name set in model_list
```
#### Step 2: Add Redis Credentials to .env
Set either `REDIS_URL` or the `REDIS_HOST` in your os environment, to enable caching.
```shell
REDIS_URL = "" # REDIS_URL='redis://username:password@hostname:port/database'
## OR ##
REDIS_HOST = "" # REDIS_HOST='redis-18841.c274.us-east-1-3.ec2.cloud.redislabs.com'
REDIS_PORT = "" # REDIS_PORT='18841'
REDIS_PASSWORD = "" # REDIS_PASSWORD='liteLlmIsAmazing'
```
**Additional kwargs**
You can pass in any additional redis.Redis arg, by storing the variable + value in your os environment, like this:
```shell
REDIS_<redis-kwarg-name> = ""
```
#### Step 3: Run proxy with config
```shell
$ litellm --config /path/to/config.yaml
```
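To sanity-check the semantic cache, a sketch like the one below can be used (the proxy URL and key are placeholders; the second, similarly-worded request should be answered from the cache if it clears the `similarity_threshold`):
```python
import openai

# placeholders - point this at your running proxy
client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:8000")

# two similar prompts - with redis-semantic caching enabled, the second
# response should be served from the cache
for prompt in ["write a one sentence poem about litellm", "write a 1 sentence poem about litellm"]:
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
    )
    print(response.id)
```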
@ -160,9 +211,10 @@ litellm_settings:
The proxy supports the following cache-controls:
- `ttl`: Will cache the response for the user-defined amount of time (in seconds).
- `s-maxage`: Will only accept cached responses that are within user-defined range (in seconds).
- `no-cache`: Will not return a cached response, but instead call the actual endpoint.
- `ttl`: *Optional(int)* - Will cache the response for the user-defined amount of time (in seconds).
- `s-maxage`: *Optional(int)* Will only accept cached responses that are within user-defined range (in seconds).
- `no-cache`: *Optional(bool)* Will not return a cached response, but instead call the actual endpoint.
- `no-store`: *Optional(bool)* Will not cache the response.
[Let us know if you need more](https://github.com/BerriAI/litellm/issues/1218)

View file

@ -22,18 +22,22 @@ Set a model alias for your deployments.
In the `config.yaml` the model_name parameter is the user-facing name to use for your deployment.
In the config below requests with:
In the config below:
- `model_name`: the name to pass TO litellm from the external client
- `litellm_params.model`: the model string passed to the litellm.completion() function
E.g.:
- `model=vllm-models` will route to `openai/facebook/opt-125m`.
- `model=gpt-3.5-turbo` will load balance between `azure/gpt-turbo-small-eu` and `azure/gpt-turbo-small-ca`
```yaml
model_list:
- model_name: gpt-3.5-turbo # user-facing model alias
- model_name: gpt-3.5-turbo ### RECEIVED MODEL NAME ###
litellm_params: # all params accepted by litellm.completion() - https://docs.litellm.ai/docs/completion/input
model: azure/gpt-turbo-small-eu
model: azure/gpt-turbo-small-eu ### MODEL NAME sent to `litellm.completion()` ###
api_base: https://my-endpoint-europe-berri-992.openai.azure.com/
api_key: "os.environ/AZURE_API_KEY_EU" # does os.getenv("AZURE_API_KEY_EU")
rpm: 6 # Rate limit for this deployment: in requests per minute (rpm)
rpm: 6 # [OPTIONAL] Rate limit for this deployment: in requests per minute (rpm)
- model_name: bedrock-claude-v1
litellm_params:
model: bedrock/anthropic.claude-instant-v1
@ -43,6 +47,11 @@ model_list:
api_base: https://my-endpoint-canada-berri992.openai.azure.com/
api_key: "os.environ/AZURE_API_KEY_CA"
rpm: 6
- model_name: anthropic-claude
litellm_params:
model="bedrock/anthropic.claude-instant-v1"
### [OPTIONAL] SET AWS REGION ###
aws_region_name="us-east-1"
- model_name: vllm-models
litellm_params:
model: openai/facebook/opt-125m # the `openai/` prefix tells litellm it's openai compatible
@ -58,6 +67,11 @@ litellm_settings: # module level litellm settings - https://github.com/BerriAI/l
general_settings:
master_key: sk-1234 # [OPTIONAL] Only use this if you to require all calls to contain this key (Authorization: Bearer sk-1234)
```
:::info
For more provider-specific info, [go here](../providers/)
:::
#### Step 2: Start Proxy with config
@ -188,7 +202,7 @@ print(response)
</Tabs>
## Save Model-specific params (API Base, API Keys, Temperature, Max Tokens, Seed, Headers etc.)
## Save Model-specific params (API Base, API Keys, Temperature, Max Tokens, Seed, Organization, Headers etc.)
You can use the config to save model-specific information like api_base, api_key, temperature, max_tokens, etc.
[**All input params**](https://docs.litellm.ai/docs/completion/input#input-params-1)
@ -210,6 +224,12 @@ model_list:
api_key: sk-123
api_base: https://openai-gpt-4-test-v-2.openai.azure.com/
temperature: 0.2
- model_name: openai-gpt-3.5
litellm_params:
model: openai/gpt-3.5-turbo
api_key: sk-123
organization: org-ikDc4ex8NB
temperature: 0.2
- model_name: mistral-7b
litellm_params:
model: ollama/mistral
@ -226,6 +246,28 @@ model_list:
$ litellm --config /path/to/config.yaml
```
## Set Azure `base_model` for cost tracking
**Problem**: Azure returns `gpt-4` in the response when `azure/gpt-4-1106-preview` is used. This leads to inaccurate cost tracking.
**Solution** ✅ : Set `base_model` on your config so litellm uses the correct model for calculating Azure cost.
Example config with `base_model`
```yaml
model_list:
- model_name: azure-gpt-3.5
litellm_params:
model: azure/chatgpt-v-2
api_base: os.environ/AZURE_API_BASE
api_key: os.environ/AZURE_API_KEY
api_version: "2023-07-01-preview"
model_info:
base_model: azure/gpt-4-1106-preview
```
You can view your cost once you set up [Virtual keys](https://docs.litellm.ai/docs/proxy/virtual_keys) or [custom_callbacks](https://docs.litellm.ai/docs/proxy/logging)
## Load API Keys
### Load API Keys from Environment
@ -318,6 +360,26 @@ See supported Embedding Providers & Models [here](https://docs.litellm.ai/docs/e
#### Create Config.yaml
<Tabs>
<TabItem value="bedrock" label="Bedrock Completion/Chat">
```yaml
model_list:
- model_name: bedrock-cohere
litellm_params:
model: "bedrock/cohere.command-text-v14"
aws_region_name: "us-west-2"
- model_name: bedrock-cohere
litellm_params:
model: "bedrock/cohere.command-text-v14"
aws_region_name: "us-east-2"
- model_name: bedrock-cohere
litellm_params:
model: "bedrock/cohere.command-text-v14"
aws_region_name: "us-east-1"
```
</TabItem>
<TabItem value="sagemaker" label="Sagemaker, Bedrock Embeddings">
@ -430,20 +492,26 @@ model_list:
</Tabs>
#### Start Proxy
```shell
litellm --config config.yaml
```
#### Make Request
Sends Request to `deployed-codebert-base`
Sends Request to `bedrock-cohere`
```shell
curl --location 'http://0.0.0.0:8000/embeddings' \
curl --location 'http://0.0.0.0:8000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "deployed-codebert-base",
"input": ["write a litellm poem"]
}'
"model": "bedrock-cohere",
"messages": [
{
"role": "user",
"content": "gm"
}
]
}'
```
@ -483,3 +551,55 @@ general_settings:
max_parallel_requests: 100 # max parallel requests for a user = 100
```
## All settings
```python
{
"environment_variables": {},
"model_list": [
{
"model_name": "string",
"litellm_params": {},
"model_info": {
"id": "string",
"mode": "embedding",
"input_cost_per_token": 0,
"output_cost_per_token": 0,
"max_tokens": 2048,
"base_model": "gpt-4-1106-preview",
"additionalProp1": {}
}
}
],
"litellm_settings": {}, # ALL (https://github.com/BerriAI/litellm/blob/main/litellm/__init__.py)
"general_settings": {
"completion_model": "string",
"key_management_system": "google_kms", # either google_kms or azure_kms
"master_key": "string",
"database_url": "string",
"database_type": "dynamo_db",
"database_args": {
"billing_mode": "PROVISIONED_THROUGHPUT",
"read_capacity_units": 0,
"write_capacity_units": 0,
"ssl_verify": true,
"region_name": "string",
"user_table_name": "LiteLLM_UserTable",
"key_table_name": "LiteLLM_VerificationToken",
"config_table_name": "LiteLLM_Config",
"spend_table_name": "LiteLLM_SpendLogs"
},
"otel": true,
"custom_auth": "string",
"max_parallel_requests": 0,
"infer_model_from_keys": true,
"background_health_checks": true,
"health_check_interval": 300,
"alerting": [
"string"
],
"alerting_threshold": 0
}
}
```

View file

@ -10,6 +10,12 @@ There's 2 ways to track cost:
By default, the response cost is accessible in the logging object via `kwargs["response_cost"]` on success (sync + async). [**Learn More**](../observability/custom_callback.md)
:::info
LiteLLM already has pricing for any model in our [model cost map](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json).
:::
## Quick Start
Register custom pricing for sagemaker completion model.
@ -54,7 +60,7 @@ model_list:
- model_name: sagemaker-embedding-model
litellm_params:
model: sagemaker/berri-benchmarking-gpt-j-6b-fp16
input_cost_per_second: 0.000420
input_cost_per_second: 0.000420
```
**Step 2: Start proxy**
@ -67,25 +73,28 @@ litellm /path/to/config.yaml
<Image img={require('../../img/spend_logs_table.png')} />
## Cost Per Token
## Cost Per Token (e.g. Azure)
```python
# !pip install boto3
from litellm import completion, completion_cost
os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""
## set ENV variables
os.environ["AZURE_API_KEY"] = ""
os.environ["AZURE_API_BASE"] = ""
os.environ["AZURE_API_VERSION"] = ""
def test_completion_sagemaker():
def test_completion_azure_model():
try:
print("testing sagemaker")
print("testing azure custom pricing")
# azure call
response = completion(
model="sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
input_cost_per_token=0.005,
output_cost_per_token=1,
model = "azure/<your_deployment_name>",
messages = [{ "content": "Hello, how are you?","role": "user"}]
input_cost_per_token=0.005,
output_cost_per_token=1,
)
# Add any assertions here to check the response
print(response)
@ -94,15 +103,19 @@ def test_completion_sagemaker():
except Exception as e:
raise Exception(f"Error occurred: {e}")
test_completion_azure_model()
```
### Usage with OpenAI Proxy Server
```yaml
model_list:
- model_name: sagemaker-completion-model
- model_name: azure-model
litellm_params:
model: sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4
input_cost_per_token: 0.000420 # 👈 key change
output_cost_per_token: 0.000420 # 👈 key change
model: azure/<your_deployment_name>
api_key: os.environ/AZURE_API_KEY
api_base: os.environ/AZURE_API_BASE
api_version: os.environ/AZURE_API_VERSION
input_cost_per_token: 0.000421 # 👈 ONLY to track cost per token
output_cost_per_token: 0.000520 # 👈 ONLY to track cost per token
```

View file

@ -0,0 +1,34 @@
# Debugging
2 levels of debugging are supported.
- debug (prints info logs)
- detailed debug (prints debug logs)
## `debug`
**via cli**
```bash
$ litellm --debug
```
**via env**
```python
os.environ["LITELLM_LOG"] = "INFO"
```
## `detailed debug`
**via cli**
```bash
$ litellm --detailed_debug
```
**via env**
```python
os.environ["LITELLM_LOG"] = "DEBUG"
```
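For example, a minimal SDK sketch that combines the env-var approach above with a completion call (the key and model below are placeholders):
```python
import os

# enable detailed debug logs, equivalent to `--detailed_debug` on the cli
os.environ["LITELLM_LOG"] = "DEBUG"

import litellm

os.environ["OPENAI_API_KEY"] = "sk-..."  # placeholder
response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hi"}],
)
```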

View file

@ -116,6 +116,20 @@ Your OpenAI proxy server is now running on `http://0.0.0.0:4000`.
</TabItem>
</Tabs>
## Setting SSL Certification
Use this if you need to set SSL certificates for your on-prem litellm proxy.
Pass `ssl_keyfile_path` (path to the SSL keyfile) and `ssl_certfile_path` (path to the SSL certfile) when starting litellm proxy.
```shell
docker run ghcr.io/berriai/litellm:main-latest \
--ssl_keyfile_path ssl_test/keyfile.key \
--ssl_certfile_path ssl_test/certfile.crt
```
This starts the litellm proxy server with an SSL certificate.
## Platform-specific Guide

View file

@ -112,7 +112,8 @@ Example Response:
```json
{
"status": "healthy",
"db": "connected"
"db": "connected",
"litellm_version":"1.19.2",
}
```
@ -121,7 +122,8 @@ Example Response:
```json
{
"status": "healthy",
"db": "Not connected"
"db": "Not connected",
"litellm_version":"1.19.2",
}
```

View file

@ -435,6 +435,7 @@ print(response)
</TabItem>
</Tabs>
## Logging Proxy Input/Output - s3 Buckets
We will use the `--config` to set
@ -490,6 +491,34 @@ curl --location 'http://0.0.0.0:8000/chat/completions' \
Your logs should be available on the specified s3 Bucket
## Team-based Logging
Set success callbacks (e.g. langfuse), for a specific team-id.
```yaml
litellm_settings:
default_team_settings:
- team_id: my-secret-project
success_callback: ["langfuse"]
langfuse_public_key: os.environ/LANGFUSE_PUB_KEY_2
langfuse_secret: os.environ/LANGFUSE_PRIVATE_KEY_2
- team_id: ishaans-secret-project
success_callback: ["langfuse"]
langfuse_public_key: os.environ/LANGFUSE_PUB_KEY_3
langfuse_secret: os.environ/LANGFUSE_SECRET_3
```
Now, when you [generate keys](./virtual_keys.md) for this team-id
```bash
curl -X POST 'http://0.0.0.0:8000/key/generate' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-d '{"team_id": "ishaans-secret-project"}'
```
All requests made with these keys will log data to their team-specific logging.
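As a quick check, a request made with the team-scoped key (placeholder values below) should then show up in that team's Langfuse project:
```python
import openai

# placeholders - use the key returned by the /key/generate call above
client = openai.OpenAI(api_key="sk-<team-key>", base_url="http://0.0.0.0:8000")

response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hello from ishaans-secret-project"}],
)
print(response.choices[0].message.content)
```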
## Logging Proxy Input/Output - DynamoDB
We will use the `--config` to set

View file

@ -0,0 +1,30 @@
import Image from '@theme/IdealImage';
# PII Masking
LiteLLM supports [Microsoft Presidio](https://github.com/microsoft/presidio/) for PII masking.
## Step 1. Add env
```bash
export PRESIDIO_ANALYZER_API_BASE="http://localhost:5002"
export PRESIDIO_ANONYMIZER_API_BASE="http://localhost:5001"
```
## Step 2. Set it as a callback in config.yaml
```yaml
litellm_settings:
callbacks: ["presidio", ...] # e.g. ["presidio", custom_callbacks.proxy_handler_instance]
```
## Start proxy
```
litellm --config /path/to/config.yaml
```
This will mask the input going to the llm provider
<Image img={require('../../img/presidio_screenshot.png')} />
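For illustration, a request like the sketch below (proxy URL and key are placeholders) would have the email address masked by Presidio before the prompt reaches the provider:
```python
import openai

# placeholders - point this at your proxy running with the presidio callback
client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")

response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "My email is jane.doe@example.com - write me a haiku"}],
)
print(response)
```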

View file

@ -8,16 +8,8 @@ Quick start CLI, Config, Docker
LiteLLM Server manages:
* **Unified Interface**: Calling 100+ LLMs [Huggingface/Bedrock/TogetherAI/etc.](#other-supported-models) in the OpenAI `ChatCompletions` & `Completions` format
* **Cost tracking**: Authentication, Spend Tracking & Budgets [Virtual Keys](https://docs.litellm.ai/docs/proxy/virtual_keys)
* **Load Balancing**: between [Multiple Models](#multiple-models---quick-start) + [Deployments of the same model](#multiple-instances-of-1-model) - LiteLLM proxy can handle 1.5k+ requests/second during load tests.
* **Cost tracking**: Authentication & Spend Tracking [Virtual Keys](#managing-auth---virtual-keys)
[**See LiteLLM Proxy code**](https://github.com/BerriAI/litellm/tree/main/litellm/proxy)
#### 📖 Proxy Endpoints - [Swagger Docs](https://litellm-api.up.railway.app/)
View all the supported args for the Proxy CLI [here](https://docs.litellm.ai/docs/simple_proxy#proxy-cli-arguments)
```shell
$ pip install 'litellm[proxy]'
@ -40,115 +32,6 @@ litellm --test
This will now automatically route any requests for gpt-3.5-turbo to bigcode starcoder, hosted on huggingface inference endpoints.
### Using LiteLLM Proxy - Curl Request, OpenAI Package, Langchain
<Tabs>
<TabItem value="Curl" label="Curl Request">
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "gpt-3.5-turbo",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}
'
```
</TabItem>
<TabItem value="openai" label="OpenAI v1.0.0+">
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:8000"
)
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(model="gpt-3.5-turbo", messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
])
print(response)
```
</TabItem>
<TabItem value="langchain" label="Langchain">
```python
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
ChatPromptTemplate,
HumanMessagePromptTemplate,
SystemMessagePromptTemplate,
)
from langchain.schema import HumanMessage, SystemMessage
chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:8000", # set openai_api_base to the LiteLLM Proxy
model = "gpt-3.5-turbo",
temperature=0.1
)
messages = [
SystemMessage(
content="You are a helpful assistant that im using to make a test request to."
),
HumanMessage(
content="test from litellm. tell me why it's amazing in 1 sentence"
),
]
response = chat(messages)
print(response)
```
</TabItem>
<TabItem value="langchain-embedding" label="Langchain Embeddings">
```python
from langchain.embeddings import OpenAIEmbeddings
embeddings = OpenAIEmbeddings(model="sagemaker-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
text = "This is a test document."
query_result = embeddings.embed_query(text)
print(f"SAGEMAKER EMBEDDINGS")
print(query_result[:5])
embeddings = OpenAIEmbeddings(model="bedrock-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
text = "This is a test document."
query_result = embeddings.embed_query(text)
print(f"BEDROCK EMBEDDINGS")
print(query_result[:5])
embeddings = OpenAIEmbeddings(model="bedrock-titan-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
text = "This is a test document."
query_result = embeddings.embed_query(text)
print(f"TITAN EMBEDDINGS")
print(query_result[:5])
```
</TabItem>
</Tabs>
### Supported LLMs
All LiteLLM supported LLMs are supported on the Proxy. See all [supported LLMs](https://docs.litellm.ai/docs/providers)
<Tabs>
@ -330,9 +213,6 @@ $ litellm --model command-nightly
</Tabs>
## Quick Start - LiteLLM Proxy + Config.yaml
The config allows you to create a model list and set `api_base`, `max_tokens` (all litellm params). See more details about the config [here](https://docs.litellm.ai/docs/proxy/configs)
@ -363,6 +243,115 @@ model_list:
litellm --config your_config.yaml
```
## Using LiteLLM Proxy - Curl Request, OpenAI Package, Langchain
<Tabs>
<TabItem value="Curl" label="Curl Request">
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "gpt-3.5-turbo",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}
'
```
</TabItem>
<TabItem value="openai" label="OpenAI v1.0.0+">
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:8000"
)
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(model="gpt-3.5-turbo", messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
])
print(response)
```
</TabItem>
<TabItem value="langchain" label="Langchain">
```python
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
ChatPromptTemplate,
HumanMessagePromptTemplate,
SystemMessagePromptTemplate,
)
from langchain.schema import HumanMessage, SystemMessage
chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:8000", # set openai_api_base to the LiteLLM Proxy
model = "gpt-3.5-turbo",
temperature=0.1
)
messages = [
SystemMessage(
content="You are a helpful assistant that im using to make a test request to."
),
HumanMessage(
content="test from litellm. tell me why it's amazing in 1 sentence"
),
]
response = chat(messages)
print(response)
```
</TabItem>
<TabItem value="langchain-embedding" label="Langchain Embeddings">
```python
from langchain.embeddings import OpenAIEmbeddings
embeddings = OpenAIEmbeddings(model="sagemaker-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
text = "This is a test document."
query_result = embeddings.embed_query(text)
print(f"SAGEMAKER EMBEDDINGS")
print(query_result[:5])
embeddings = OpenAIEmbeddings(model="bedrock-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
text = "This is a test document."
query_result = embeddings.embed_query(text)
print(f"BEDROCK EMBEDDINGS")
print(query_result[:5])
embeddings = OpenAIEmbeddings(model="bedrock-titan-embeddings", openai_api_base="http://0.0.0.0:8000", openai_api_key="temp-key")
text = "This is a test document."
query_result = embeddings.embed_query(text)
print(f"TITAN EMBEDDINGS")
print(query_result[:5])
```
</TabItem>
</Tabs>
[**More Info**](./configs.md)

View file

@ -1,9 +1,11 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# [BETA] Admin UI
# 🔑 [BETA] Proxy UI
### **Create + delete keys through a UI**
- Track Spend Per API Key, User
- Allow your users to create their own keys through a UI
[Let users create their own keys](#setup-ssoauth-for-ui)
:::info
@ -11,61 +13,129 @@ This is in beta, so things may change. If you have feedback, [let us know](https
:::
<Image img={require('../../img/litellm_ui_create_key.png')} />
## Quick Start
Requirements:
- Requires proxy master key to be set
- Requires db connected
- Need an SMTP server connection to send emails (e.g. [Resend](https://resend.com/docs/send-with-smtp))
Follow [setup](./virtual_keys.md#setup)
[**See code**](https://github.com/BerriAI/litellm/blob/61cd800b9ffbb02c286481d2056b65c7fb5447bf/litellm/proxy/proxy_server.py#L1782)
### 1. Start the proxy
```bash
litellm --config /path/to/config.yaml
### Step 1. Save SMTP server credentials
```env
export SMTP_HOST="my-smtp-host"
export SMTP_USERNAME="my-smtp-password"
export SMTP_PASSWORD="my-smtp-password"
export SMTP_SENDER_EMAIL="krrish@berri.ai"
#INFO: Proxy running on http://0.0.0.0:8000
```
### Step 2. Enable user auth
### 2. Go to UI
```bash
http://0.0.0.0:8000/ui # <proxy_base_url>/ui
```
In your config.yaml,
## Get Admin UI Link on Swagger
Your Proxy Swagger is available on the root of the Proxy: e.g.: `http://localhost:4000/`
<Image img={require('../../img/ui_link.png')} />
## Change default username + password
Set the following in your .env on the Proxy
```shell
UI_USERNAME=ishaan-litellm
UI_PASSWORD=langchain
```
On accessing the LiteLLM UI, you will be prompted to enter your username, password
## Setup SSO/Auth for UI
### Step 1: Set upperbounds for keys
Control the upperbound that users can use for `max_budget`, `budget_duration` or any `key/generate` param per key.
```yaml
general_settings:
# other changes
allow_user_auth: true
litellm_settings:
upperbound_key_generate_params:
max_budget: 100 # upperbound of $100, for all /key/generate requests
duration: "30d" # upperbound of 30 days for all /key/generate requests
```
This will enable:
* Users to create keys via `/key/generate` (by default, only admin can create keys)
* The `/user/auth` endpoint to send user's emails with their login credentials (key + user id)
**Expected Behavior**
### Step 3. Connect to UI
- Send a `/key/generate` request with `max_budget=200`
- Key will be created with `max_budget=100` since 100 is the upper bound
You can use our hosted UI (https://dashboard.litellm.ai/) or [self-host your own](https://github.com/BerriAI/litellm/tree/main/ui).
### Step 2: Setup Oauth Client
<Tabs>
<TabItem value="google" label="Google SSO">
If you self-host, you need to save the UI url in your proxy environment as `LITELLM_HOSTED_UI`.
- Create a new Oauth 2.0 Client on https://console.cloud.google.com/
Connect your proxy to your UI, by entering:
1. The hosted proxy URL
2. Accepted email subdomains
3. [OPTIONAL] Allowed admin emails
**Required .env variables on your Proxy**
```shell
# for Google SSO Login
GOOGLE_CLIENT_ID=
GOOGLE_CLIENT_SECRET=
```
<Image img={require('../../img/admin_dashboard.png')} />
- Set Redirect URL on your Oauth 2.0 Client on https://console.cloud.google.com/
- Set a redirect url = `<your proxy base url>/sso/callback`
```shell
https://litellm-production-7002.up.railway.app/sso/callback
```
## What users will see?
</TabItem>
### Auth
<TabItem value="msft" label="Microsoft SSO">
<Image img={require('../../img/user_auth_screen.png')} />
- Create a new App Registration on https://portal.azure.com/
- Create a client Secret for your App Registration
### Create Keys
**Required .env variables on your Proxy**
```shell
MICROSOFT_CLIENT_ID="84583a4d-"
MICROSOFT_CLIENT_SECRET="nbk8Q~"
MICROSOFT_TENANT="5a39737
```
- Set Redirect URI on your App Registration on https://portal.azure.com/
- Set a redirect url = `<your proxy base url>/sso/callback`
```shell
http://localhost:4000/sso/callback
```
<Image img={require('../../img/user_create_key_screen.png')} />
</TabItem>
### Spend Per Key
</Tabs>
<Image img={require('../../img/spend_per_api_key.png')} />
### Step 3. Test flow
<Image img={require('../../img/litellm_ui_3.gif')} />
## Set Admin view w/ SSO
You just need to set Proxy Admin ID
### Step 1: Copy your ID from the UI
<Image img={require('../../img/litellm_ui_copy_id.png')} />
### Step 2: Set it in your .env as the PROXY_ADMIN_ID
```env
export PROXY_ADMIN_ID="116544810872468347480"
```
### Step 3: See all proxy keys
<Image img={require('../../img/litellm_ui_admin.png')} />
:::info
If you don't see all your keys, this could be due to a cached token. Just re-login and it should work.
:::

View file

@ -1,7 +1,7 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Use with Langchain, OpenAI SDK, Curl
# Use with Langchain, OpenAI SDK, LlamaIndex, Curl
:::info
@ -51,6 +51,42 @@ response = client.chat.completions.create(
print(response)
```
</TabItem>
<TabItem value="LlamaIndex" label="LlamaIndex">
```python
import os, dotenv
from llama_index.llms import AzureOpenAI
from llama_index.embeddings import AzureOpenAIEmbedding
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
llm = AzureOpenAI(
engine="azure-gpt-3.5", # model_name on litellm proxy
temperature=0.0,
azure_endpoint="http://0.0.0.0:4000", # litellm proxy endpoint
api_key="sk-1234", # litellm proxy API Key
api_version="2023-07-01-preview",
)
embed_model = AzureOpenAIEmbedding(
deployment_name="azure-embedding-model",
azure_endpoint="http://0.0.0.0:4000",
api_key="sk-1234",
api_version="2023-07-01-preview",
)
documents = SimpleDirectoryReader("llama_index_data").load_data()
service_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model)
index = VectorStoreIndex.from_documents(documents, service_context=service_context)
query_engine = index.as_query_engine()
response = query_engine.query("What did the author do growing up?")
print(response)
```
</TabItem>
<TabItem value="Curl" label="Curl Request">
Pass `metadata` as part of the request body

View file

@ -1,7 +1,7 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# 💰 Budgets, Rate Limits per user
# 💰 Budgets, Rate Limits
Requirements:
@ -10,22 +10,72 @@ Requirements:
## Set Budgets
You can set budgets at 3 levels:
- For the proxy
- For a user
- For a 'user' passed to `/chat/completions`, `/embeddings` etc
- For a key
Set the `max_budget` param (in USD $) in the `/user/new` or `/key/generate` request. By default `max_budget` is set to `null` and is not checked for keys
<Tabs>
<TabItem value="per-user" label="Per User">
<TabItem value="proxy" label="For Proxy">
LiteLLM exposes a `/user/new` endpoint to create budgets for users, that persist across multiple keys.
Apply a budget across all calls on the proxy
**Step 1. Modify config.yaml**
```yaml
general_settings:
master_key: sk-1234
litellm_settings:
# other litellm settings
max_budget: 0 # (float) sets max budget as $0 USD
budget_duration: 30d # (str) frequency of reset - You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
```
**Step 2. Start proxy**
```bash
litellm /path/to/config.yaml
```
**Step 3. Send test call**
```bash
curl --location 'http://0.0.0.0:8000/chat/completions' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{
"model": "gpt-3.5-turbo",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}'
```
</TabItem>
<TabItem value="per-user" label="For User">
Apply a budget across multiple keys.
LiteLLM exposes a `/user/new` endpoint to create budgets for this.
You can:
- Add budgets to users [**Jump**](#add-budgets-to-users)
- Add budget durations, to reset spend [**Jump**](#add-budget-duration-to-users)
By default the `max_budget` is set to `null` and is not checked for keys
### **Add budgets to users**
```shell
curl --location 'http://localhost:8000/user/new' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{"models": ["azure-models"], "max_budget": 0, "user_id": "krrish3@berri.ai"}'
```
The request is a normal `/key/generate` request body + a `max_budget` field.
[**See Swagger**](https://litellm-api.up.railway.app/#/user%20management/new_user_user_new_post)
@ -40,9 +90,93 @@ The request is a normal `/key/generate` request body + a `max_budget` field.
}
```
### **Add budget duration to users**
`budget_duration`: Budget is reset at the end of specified duration. If not set, budget is never reset. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
```
curl 'http://0.0.0.0:8000/user/new' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{
"team_id": "core-infra", # [OPTIONAL]
"max_budget": 10,
"budget_duration": 10s,
}'
```
### Create new keys for existing user
Now you can just call `/key/generate` with that user_id (i.e. krrish3@berri.ai) and:
- **Budget Check**: krrish3@berri.ai's budget (i.e. $10) will be checked for this key
- **Spend Tracking**: spend for this key will update krrish3@berri.ai's spend as well
```bash
curl --location 'http://0.0.0.0:8000/key/generate' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data '{"models": ["azure-models"], "user_id": "krrish3@berri.ai"}'
```
</TabItem>
<TabItem value="per-key" label="Per Key">
<TabItem value="per-user-chat" label="For 'user' passed to /chat/completions">
Use this to budget `user` passed to `/chat/completions`, **without needing to create a key for every user**
**Step 1. Modify config.yaml**
Define `litellm.max_user_budget`
```yaml
general_settings:
master_key: sk-1234
litellm_settings:
max_budget: 10 # global budget for proxy
max_user_budget: 0.0001 # budget for 'user' passed to /chat/completions
```
2. Make a /chat/completions call, pass 'user' - First call works
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer sk-zi5onDRdHGD24v0Zdn7VBA' \
--data ' {
"model": "azure-gpt-3.5",
"user": "ishaan3",
"messages": [
{
"role": "user",
"content": "what time is it"
}
]
}'
```
3. Make a /chat/completions call, pass 'user' - Call fails, since 'ishaan3' is over budget
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer sk-zi5onDRdHGD24v0Zdn7VBA' \
--data ' {
"model": "azure-gpt-3.5",
"user": "ishaan3",
"messages": [
{
"role": "user",
"content": "what time is it"
}
]
}'
```
Error
```shell
{"error":{"message":"Authentication Error, ExceededBudget: User ishaan3 has exceeded their budget. Current spend: 0.0008869999999999999; Max Budget: 0.0001","type":"auth_error","param":"None","code":401}}%
```
</TabItem>
<TabItem value="per-key" label="For Key">
Apply a budget on a key.
You can:
- Add budgets to keys [**Jump**](#add-budgets-to-keys)
@ -53,6 +187,8 @@ You can:
- After the key crosses its `max_budget`, requests fail
- If duration set, spend is reset at the end of the duration
By default the `max_budget` is set to `null` and is not checked for keys
### **Add budgets to keys**
```bash

View file

@ -1,4 +1,4 @@
# Virtual Keys
# Virtual Keys, Users
Track Spend, Set budgets and create virtual keys for the proxy
Grant others temporary access to your proxy, with keys that expire after a set duration.
@ -6,6 +6,7 @@ Grant other's temporary access to your proxy, with keys that expire after a set
:::info
- 🔑 [UI to Generate, Edit, Delete Keys (with SSO)](https://docs.litellm.ai/docs/proxy/ui)
- [Deploy LiteLLM Proxy with Key Management](https://docs.litellm.ai/docs/proxy/deploy#deploy-with-database)
- Dockerfile.database for LiteLLM Proxy + Key Management [here](https://github.com/BerriAI/litellm/blob/main/Dockerfile.database)
@ -16,8 +17,11 @@ Grant other's temporary access to your proxy, with keys that expire after a set
Requirements:
- Need to a postgres database (e.g. [Supabase](https://supabase.com/), [Neon](https://neon.tech/), etc)
- Need a postgres database (e.g. [Supabase](https://supabase.com/), [Neon](https://neon.tech/), etc)
- Set `DATABASE_URL=postgresql://<user>:<password>@<host>:<port>/<dbname>` in your env
- Set a `master key`, this is your Proxy Admin key - you can use this to create other keys
- **Set on config.yaml**: set your master key under `general_settings:master_key`, example below
- **Set env variable**: set `LITELLM_MASTER_KEY` (**Note:** either set this on the config.yaml or in your env, whichever is more convenient for you)
(the proxy Dockerfile checks if the `DATABASE_URL` is set and then initializes the DB connection)
@ -81,15 +85,17 @@ curl 'http://0.0.0.0:8000/key/generate' \
Request Params:
- `models`: *list or null (optional)* - Specify the models a token has access too. If null, then token has access to all models on server.
- `duration`: *Optional[str]* - Specify the length of time the token is valid for. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
- `key_alias`: *Optional[str]* - User defined key alias
- `team_id`: *Optional[str]* - The team id of the user
- `models`: *Optional[list]* - Model_name's a user is allowed to call. (if empty, key is allowed to call all models)
- `aliases`: *Optional[dict]* - Any alias mappings, on top of anything in the config.yaml model list. - https://docs.litellm.ai/docs/proxy/virtual_keys#managing-auth---upgradedowngrade-models
- `config`: *Optional[dict]* - any key-specific configs, overrides config in config.yaml
- `spend`: *Optional[int]* - Amount spent by key. Default is 0. Will be updated by proxy whenever key is used. https://docs.litellm.ai/docs/proxy/virtual_keys#managing-auth---tracking-spend
- `max_budget`: *Optional[float]* - Specify max budget for a given key.
- `max_parallel_requests`: *Optional[int]* - Rate limit a user based on the number of parallel requests. Raises 429 error, if user's parallel requests > x.
- `metadata`: *Optional[dict]* - Metadata for key, store information for key. Example metadata = {"team": "core-infra", "app": "app2", "email": "ishaan@berri.ai" }
- `duration`: *str or null (optional)* Specify the length of time the token is valid for. If null, default is set to 1 hour. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
- `metadata`: *dict or null (optional)* Pass metadata for the created token. If null defaults to {}
- `team_id`: *str or null (optional)* Specify team_id for the associated key
- `max_budget`: *float or null (optional)* Specify max budget (in Dollars $) for a given key. If no value is set, the key has no budget
### Response
@ -97,20 +103,11 @@ Request Params:
{
"key": "sk-kdEXbIqZRwEeEiHwdg7sFA", # Bearer token
"expires": "2023-11-19T01:38:25.838000+00:00" # datetime object
"key_name": "sk-...7sFA" # abbreviated key string, ONLY stored in db if `allow_user_auth: true` set - [see](./ui.md)
...
}
```
### Keys that don't expire
Just set duration to None.
```bash
curl --location 'http://0.0.0.0:8000/key/generate' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data '{"models": ["azure-models"], "aliases": {"mistral-7b": "gpt-3.5-turbo"}, "duration": null}'
```
### Upgrade/Downgrade Models
If a user is expected to use a given model (i.e. gpt3-5), and you want to:
@ -285,7 +282,152 @@ Request Params:
}
```
## Set Budgets - Per Key
## /user/new
### Request
All [key/generate params supported](#keygenerate) for creating a user
```shell
curl 'http://0.0.0.0:4000/user/new' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data-raw '{
"user_id": "ishaan1",
"user_email": "ishaan@litellm.ai",
"user_role": "admin",
"team_id": "cto-team",
"max_budget": 20,
"budget_duration": "1h"
}'
```
Request Params:
- user_id: str (optional - defaults to uuid) - The unique identifier for the user.
- user_email: str (optional - defaults to "") - The email address associated with the user.
- user_role: str (optional - defaults to "app_user") - The role assigned to the user. Can be "admin", "app_owner", "app_user"
**Possible `user_role` values**
```
"admin" - Maintaining the proxy and owning the overall budget
"app_owner" - employees maintaining the apps, each owner may own more than one app
"app_user" - users who know nothing about the proxy. These users get created when you pass `user` to /chat/completions
```
- team_id: str (optional - defaults to "") - The identifier for the team to which the user belongs.
- max_budget: float (optional - defaults to `null`) - The maximum budget allocated for the user. No budget checks done if `max_budget==null`
- budget_duration: str (optional - defaults to `null`) - The duration for which the budget is valid, e.g., "1h", "1d"
### Response
A key will be generated for the new user created
```shell
{
"models": [],
"spend": 0.0,
"max_budget": null,
"user_id": "ishaan1",
"team_id": null,
"max_parallel_requests": null,
"metadata": {},
"tpm_limit": null,
"rpm_limit": null,
"budget_duration": null,
"allowed_cache_controls": [],
"key_alias": null,
"duration": null,
"aliases": {},
"config": {},
"key": "sk-JflB33ucTqc2NYvNAgiBCA",
"key_name": null,
"expires": null
}
```
Request Params:
- keys: List[str] - List of keys to delete
### Response
```json
{
"deleted_keys": ["sk-kdEXbIqZRwEeEiHwdg7sFA"]
}
```
## Advanced
### Upperbound /key/generate params
Use this if you need to control the upper bound that users can set for `max_budget`, `budget_duration` or any `key/generate` param per key.
Set `litellm_settings:upperbound_key_generate_params`:
```yaml
litellm_settings:
upperbound_key_generate_params:
max_budget: 100 # upperbound of $100, for all /key/generate requests
duration: "30d" # upperbound of 30 days for all /key/generate requests
```
**Expected Behavior**
- Send a `/key/generate` request with `max_budget=200`
- Key will be created with `max_budget=100` since 100 is the upper bound
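A minimal client-side sketch of this behavior (the proxy URL `http://0.0.0.0:4000`, the master key `sk-1234`, and the use of `requests` are assumptions for illustration):
```python
import requests

# Ask for a $200 budget - above the $100 upperbound configured above
resp = requests.post(
    "http://0.0.0.0:4000/key/generate",
    headers={"Authorization": "Bearer sk-1234", "Content-Type": "application/json"},
    json={"max_budget": 200},
)
key_info = resp.json()
# The proxy caps the key at the upperbound, so the stored budget should be 100
print(key_info.get("key"), key_info.get("max_budget"))
```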
### Default /key/generate params
Use this if you need to control the default `max_budget` or any other `/key/generate` param per key.
When a `/key/generate` request does not specify `max_budget`, it will use the `max_budget` specified in `default_key_generate_params`.
Set `litellm_settings:default_key_generate_params`:
```yaml
litellm_settings:
  default_key_generate_params:
    max_budget: 1.5000
    models: ["azure-gpt-3.5"]
    duration: # blank means `null`
    metadata: {"setting":"default"}
    team_id: "core-infra"
```
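A rough sketch, under the same local-proxy assumptions as the snippet above, of a request that omits these fields and so should pick up the configured defaults:
```python
import requests

# Empty body - max_budget, models, metadata and team_id are not specified,
# so the proxy fills them in from default_key_generate_params
resp = requests.post(
    "http://0.0.0.0:4000/key/generate",
    headers={"Authorization": "Bearer sk-1234", "Content-Type": "application/json"},
    json={},
)
print(resp.json())  # expect max_budget=1.5, models=["azure-gpt-3.5"], team_id="core-infra"
```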
### Restrict models by `team_id`
`litellm-dev` can only access `azure-gpt-3.5`
```yaml
litellm_settings:
  default_team_settings:
    - team_id: litellm-dev
      models: ["azure-gpt-3.5"]
```
#### Create key with team_id="litellm-dev"
```shell
curl --location 'http://localhost:4000/key/generate' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data-raw '{"team_id": "litellm-dev"}'
```
#### Use Key to call invalid model - Fails
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer sk-qo992IjKOC2CHKZGRoJIGA' \
--data '{
"model": "BEDROCK_GROUP",
"messages": [
{
"role": "user",
"content": "hi"
}
]
}'
```
```shell
{"error":{"message":"Invalid model for team litellm-dev: BEDROCK_GROUP. Valid models for team are: ['azure-gpt-3.5']\n\n\nTraceback (most recent call last):\n File \"/Users/ishaanjaffer/Github/litellm/litellm/proxy/proxy_server.py\", line 2298, in chat_completion\n _is_valid_team_configs(\n File \"/Users/ishaanjaffer/Github/litellm/litellm/proxy/utils.py\", line 1296, in _is_valid_team_configs\n raise Exception(\nException: Invalid model for team litellm-dev: BEDROCK_GROUP. Valid models for team are: ['azure-gpt-3.5']\n\n","type":"None","param":"None","code":500}}%
```
### Set Budgets - Per Key
Set the `max_budget` param (in USD) in the `/key/generate` request. By default `max_budget` is `null`, and no budget checks are done for the key.
@ -331,7 +473,7 @@ Expected Response from `/chat/completions` when key has crossed budget
```
## Set Budgets - Per User
### Set Budgets - Per User
LiteLLM exposes a `/user/new` endpoint to create budgets for users that persist across multiple keys.
@ -356,7 +498,7 @@ The request is a normal `/key/generate` request body + a `max_budget` field.
}
```
## Tracking Spend
### Tracking Spend
You can get spend for a key by using the `/key/info` endpoint.
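For example, a quick sketch of checking spend from Python (the proxy URL, master key, and example key string are assumptions; adjust to your deployment):
```python
import requests

generated_key = "sk-kdEXbIqZRwEeEiHwdg7sFA"  # a key returned by /key/generate
resp = requests.get(
    "http://0.0.0.0:4000/key/info",
    params={"key": generated_key},
    headers={"Authorization": "Bearer sk-1234"},
)
# the response is expected to include the key's accumulated spend (in USD)
print(resp.json())
```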
@ -391,13 +533,13 @@ This is automatically updated (in USD) when calls are made to /completions, /cha
```
## Custom Auth
### Custom Auth
You can now override the default api key auth.
Here's how:
### 1. Create a custom auth file.
#### 1. Create a custom auth file.
Make sure the response type follows the `UserAPIKeyAuth` pydantic object. This is used for logging usage specific to that user key.
@ -414,7 +556,7 @@ async def user_api_key_auth(request: Request, api_key: str) -> UserAPIKeyAuth:
raise Exception
```
### 2. Pass the filepath (relative to the config.yaml)
#### 2. Pass the filepath (relative to the config.yaml)
Pass the filepath to the config.yaml
@ -435,16 +577,16 @@ general_settings:
[**Implementation Code**](https://github.com/BerriAI/litellm/blob/caf2a6b279ddbe89ebd1d8f4499f65715d684851/litellm/proxy/utils.py#L122)
### 3. Start the proxy
#### 3. Start the proxy
```shell
$ litellm --config /path/to/config.yaml
```
## Custom /key/generate
### Custom /key/generate
If you need to add custom logic before generating a Proxy API Key (e.g. validating `team_id`)
### 1. Write a custom `custom_generate_key_fn`
#### 1. Write a custom `custom_generate_key_fn`
The input to the custom_generate_key_fn function is a single parameter: `data` [(Type: GenerateKeyRequest)](https://github.com/BerriAI/litellm/blob/main/litellm/proxy/_types.py#L125)
@ -510,7 +652,7 @@ async def custom_generate_key_fn(data: GenerateKeyRequest)-> dict:
```
### 2. Pass the filepath (relative to the config.yaml)
#### 2. Pass the filepath (relative to the config.yaml)
Pass the filepath to the config.yaml
@ -532,18 +674,18 @@ general_settings:
## [BETA] Dynamo DB
### [BETA] Dynamo DB
Only live in `v1.16.21.dev1`.
### Step 1. Save keys to env
#### Step 1. Save keys to env
```shell
AWS_ACCESS_KEY_ID = "your-aws-access-key-id"
AWS_SECRET_ACCESS_KEY = "your-aws-secret-access-key"
```
### Step 2. Add details to config
#### Step 2. Add details to config
```yaml
general_settings:
@ -560,7 +702,7 @@ general_settings:
}
```
### Step 3. Generate Key
#### Step 3. Generate Key
```bash
curl --location 'http://0.0.0.0:8000/key/generate' \

View file

@ -605,6 +605,49 @@ response = router.completion(model="gpt-3.5-turbo", messages=messages)
print(f"response: {response}")
```
## Custom Callbacks - Track API Key, API Endpoint, Model Used
If you need to track the `api_key`, API endpoint, model, or `custom_llm_provider` used for each completion call, you can set up a [custom callback](https://docs.litellm.ai/docs/observability/custom_callback)
### Usage
```python
import litellm
from litellm import Router
from litellm.integrations.custom_logger import CustomLogger

class MyCustomHandler(CustomLogger):
    def log_success_event(self, kwargs, response_obj, start_time, end_time):
        print("On Success")
        print("kwargs=", kwargs)
        litellm_params = kwargs.get("litellm_params")
        api_key = litellm_params.get("api_key")
        api_base = litellm_params.get("api_base")
        custom_llm_provider = litellm_params.get("custom_llm_provider")
        response_cost = kwargs.get("response_cost")
        # print the values
        print("api_key=", api_key)
        print("api_base=", api_base)
        print("custom_llm_provider=", custom_llm_provider)
        print("response_cost=", response_cost)

    def log_failure_event(self, kwargs, response_obj, start_time, end_time):
        print("On Failure")
        print("kwargs=", kwargs)

customHandler = MyCustomHandler()
litellm.callbacks = [customHandler]

# Init Router (model_list is the deployment list defined earlier on this page)
router = Router(model_list=model_list, routing_strategy="simple-shuffle")

# router completion call
response = router.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hi who are you"}]
)
```
## Deploy Router

View file

@ -99,6 +99,12 @@ const config = {
position: 'left',
label: 'Docs',
},
{
sidebarId: 'tutorialSidebar',
position: 'left',
label: 'Enterprise',
to: "docs/enterprise"
},
{
href: 'https://github.com/BerriAI/litellm',
label: 'GitHub',

Binary files not shown: 12 new image assets added (13 KiB to 9.9 MiB).

View file

@ -98,7 +98,7 @@ const sidebars = {
link: {
type: 'generated-index',
title: '💥 OpenAI Proxy Server',
description: `Proxy Server to call 100+ LLMs in a unified interface, load balance deployments, track costs per user`,
description: `Proxy Server to call 100+ LLMs in a unified interface & track spend, set budgets per virtual key/user`,
slug: '/simple_proxy',
},
items: [
@ -115,6 +115,8 @@ const sidebars = {
"proxy/ui",
"proxy/model_management",
"proxy/health",
"proxy/debugging",
"proxy/pii_masking",
{
"type": "category",
"label": "🔥 Load Balancing",
@ -123,6 +125,7 @@ const sidebars = {
"proxy/reliability",
]
},
"proxy/caching",
{
"type": "category",
"label": "Logging, Alerting, Caching",
@ -130,7 +133,6 @@ const sidebars = {
"proxy/logging",
"proxy/alerting",
"proxy/streaming_logging",
"proxy/caching",
]
},
{

View file

@ -8,6 +8,11 @@ https://github.com/BerriAI/litellm
## **Call 100+ LLMs using the same Input/Output Format**
- Translate inputs to provider's `completion`, `embedding`, and `image_generation` endpoints
- [Consistent output](https://docs.litellm.ai/docs/completion/output), text responses will always be available at `['choices'][0]['message']['content']`
- Retry/fallback logic across multiple deployments (e.g. Azure/OpenAI) - [Router](https://docs.litellm.ai/docs/routing)
- Track spend & set budgets per project with the [OpenAI Proxy Server](https://docs.litellm.ai/docs/simple_proxy)
## Basic usage
<a target="_blank" href="https://colab.research.google.com/github/BerriAI/litellm/blob/main/cookbook/liteLLM_Getting_Started.ipynb">
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
@ -306,30 +311,7 @@ litellm.success_callback = ["langfuse", "llmonitor"] # log input/output to langf
response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}])
```
## Calculate Costs, Usage, Latency
Pass the completion response to `litellm.completion_cost(completion_response=response)` and get the cost
```python
from litellm import completion, completion_cost
import os
os.environ["OPENAI_API_KEY"] = "your-api-key"
response = completion(
model="gpt-3.5-turbo",
messages=[{ "content": "Hello, how are you?","role": "user"}]
)
cost = completion_cost(completion_response=response)
print("Cost for completion call with gpt-3.5-turbo: ", f"${float(cost):.10f}")
```
**Output**
```shell
Cost for completion call with gpt-3.5-turbo: $0.0000775000
```
### Track Costs, Usage, Latency for streaming
## Track Costs, Usage, Latency for streaming
Use a callback function for this - more info on custom callbacks: https://docs.litellm.ai/docs/observability/custom_callback
```python
@ -342,18 +324,8 @@ def track_cost_callback(
start_time, end_time # start/end time
):
try:
# check if it has collected an entire stream response
if "complete_streaming_response" in kwargs:
# for tracking streaming cost we pass the "messages" and the output_text to litellm.completion_cost
completion_response=kwargs["complete_streaming_response"]
input_text = kwargs["messages"]
output_text = completion_response["choices"][0]["message"]["content"]
response_cost = litellm.completion_cost(
model = kwargs["model"],
messages = input_text,
completion=output_text
)
print("streaming response_cost", response_cost)
response_cost = kwargs.get("response_cost", 0)
print("streaming response_cost", response_cost)
except:
pass
# set callback
@ -372,13 +344,12 @@ response = completion(
)
```
Need a dedicated key? Email us @ krrish@berri.ai
## OpenAI Proxy
Track spend across multiple projects/people
![ui_3](https://github.com/BerriAI/litellm/assets/29436595/47c97d5e-b9be-4839-b28c-43d7f4f10033)
The proxy provides:
1. [Hooks for auth](https://docs.litellm.ai/docs/proxy/virtual_keys#custom-auth)
2. [Hooks for logging](https://docs.litellm.ai/docs/proxy/logging#step-1---create-your-custom-litellm-callback-class)
@ -418,4 +389,4 @@ print(response)
## More details
* [exception mapping](./exception_mapping.md)
* [retries + model fallbacks for completion()](./completion/reliable_completions.md)
* [tutorial for model fallbacks with completion()](./tutorials/fallbacks.md)
* [proxy virtual keys & spend management](./proxy/virtual_keys.md)

View file

@ -1,11 +1,13 @@
### INIT VARIABLES ###
import threading, requests
import threading, requests, os
from typing import Callable, List, Optional, Dict, Union, Any
from litellm.caching import Cache
from litellm._logging import set_verbose, _turn_on_debug
from litellm._logging import set_verbose, _turn_on_debug, verbose_logger
from litellm.proxy._types import KeyManagementSystem
import httpx
import dotenv
dotenv.load_dotenv()
#############################################
if set_verbose == True:
_turn_on_debug()
@ -62,6 +64,9 @@ cache: Optional[
model_alias_map: Dict[str, str] = {}
model_group_alias_map: Dict[str, str] = {}
max_budget: float = 0.0 # set the max budget across all providers
budget_duration: Optional[
str
] = None # proxy only - resets budget after fixed duration. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
_openai_completion_params = [
"functions",
"function_call",
@ -140,6 +145,10 @@ model_cost_map_url: str = "https://raw.githubusercontent.com/BerriAI/litellm/mai
suppress_debug_info = False
dynamodb_table_name: Optional[str] = None
s3_callback_params: Optional[Dict] = None
default_key_generate_params: Optional[Dict] = None
upperbound_key_generate_params: Optional[Dict] = None
default_team_settings: Optional[List] = None
max_user_budget: Optional[float] = None
#### RELIABILITY ####
request_timeout: Optional[float] = 6000
num_retries: Optional[int] = None # per model endpoint
@ -159,6 +168,19 @@ _key_management_system: Optional[KeyManagementSystem] = None
def get_model_cost_map(url: str):
if (
os.getenv("LITELLM_LOCAL_MODEL_COST_MAP", False) == True
or os.getenv("LITELLM_LOCAL_MODEL_COST_MAP", False) == "True"
):
import importlib.resources
import json
with importlib.resources.open_text(
"litellm", "model_prices_and_context_window_backup.json"
) as f:
content = json.load(f)
return content
try:
with requests.get(
url, timeout=5
@ -214,6 +236,7 @@ vertex_chat_models: List = []
vertex_code_chat_models: List = []
vertex_text_models: List = []
vertex_code_text_models: List = []
vertex_embedding_models: List = []
ai21_models: List = []
nlp_cloud_models: List = []
aleph_alpha_models: List = []
@ -243,6 +266,8 @@ for key, value in model_cost.items():
vertex_chat_models.append(key)
elif value.get("litellm_provider") == "vertex_ai-code-chat-models":
vertex_code_chat_models.append(key)
elif value.get("litellm_provider") == "vertex_ai-embedding-models":
vertex_embedding_models.append(key)
elif value.get("litellm_provider") == "ai21":
ai21_models.append(key)
elif value.get("litellm_provider") == "nlp_cloud":
@ -262,6 +287,7 @@ openai_compatible_endpoints: List = [
"api.endpoints.anyscale.com/v1",
"api.deepinfra.com/v1/openai",
"api.mistral.ai/v1",
"api.together.xyz/v1",
]
# this is maintained for Exception Mapping
@ -271,6 +297,7 @@ openai_compatible_providers: List = [
"deepinfra",
"perplexity",
"xinference",
"together_ai",
]
@ -479,7 +506,10 @@ bedrock_embedding_models: List = [
]
all_embedding_models = (
open_ai_embedding_models + cohere_embedding_models + bedrock_embedding_models
open_ai_embedding_models
+ cohere_embedding_models
+ bedrock_embedding_models
+ vertex_embedding_models
)
####### IMAGE GENERATION MODELS ###################
@ -534,6 +564,7 @@ from .llms.bedrock import (
AmazonAnthropicConfig,
AmazonCohereConfig,
AmazonLlamaConfig,
AmazonStabilityConfig,
)
from .llms.openai import OpenAIConfig, OpenAITextCompletionConfig
from .llms.azure import AzureOpenAIConfig, AzureOpenAIError

View file

@ -7,8 +7,11 @@ handler = logging.StreamHandler()
handler.setLevel(logging.DEBUG)
# Create a formatter and set it for the handler
formatter = logging.Formatter(
"\033[92m%(asctime)s - %(name)s:%(levelname)s\033[0m: %(message)s",
datefmt="%H:%M:%S",
)
formatter = logging.Formatter("\033[92m%(name)s - %(levelname)s\033[0m: %(message)s")
handler.setFormatter(formatter)

View file

@ -11,6 +11,7 @@
import os
import inspect
import redis, litellm
import redis.asyncio as async_redis
from typing import List, Optional
@ -67,7 +68,10 @@ def get_redis_url_from_environment():
)
def get_redis_client(**env_overrides):
def _get_redis_client_logic(**env_overrides):
"""
Common functionality across sync + async redis client implementations
"""
### check if "os.environ/<key-name>" passed in
for k, v in env_overrides.items():
if isinstance(v, str) and v.startswith("os.environ/"):
@ -85,9 +89,33 @@ def get_redis_client(**env_overrides):
redis_kwargs.pop("port", None)
redis_kwargs.pop("db", None)
redis_kwargs.pop("password", None)
return redis.Redis.from_url(**redis_kwargs)
elif "host" not in redis_kwargs or redis_kwargs["host"] is None:
raise ValueError("Either 'host' or 'url' must be specified for redis.")
litellm.print_verbose(f"redis_kwargs: {redis_kwargs}")
return redis_kwargs
def get_redis_client(**env_overrides):
redis_kwargs = _get_redis_client_logic(**env_overrides)
if "url" in redis_kwargs and redis_kwargs["url"] is not None:
return redis.Redis.from_url(**redis_kwargs)
return redis.Redis(**redis_kwargs)
def get_redis_async_client(**env_overrides):
redis_kwargs = _get_redis_client_logic(**env_overrides)
if "url" in redis_kwargs and redis_kwargs["url"] is not None:
return async_redis.Redis.from_url(**redis_kwargs)
return async_redis.Redis(
socket_timeout=5,
**redis_kwargs,
)
def get_redis_connection_pool(**env_overrides):
redis_kwargs = _get_redis_client_logic(**env_overrides)
if "url" in redis_kwargs and redis_kwargs["url"] is not None:
return async_redis.BlockingConnectionPool.from_url(
timeout=5, url=redis_kwargs["url"]
)
return async_redis.BlockingConnectionPool(timeout=5, **redis_kwargs)
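# Minimal usage sketch (illustrative only; host/port/password are placeholder
# values - this mirrors how RedisCache in caching.py wires these helpers together):
#
#   redis_kwargs = {"host": "localhost", "port": 6379, "password": "hunter2"}
#   sync_client = get_redis_client(**redis_kwargs)              # redis.Redis
#   pool = get_redis_connection_pool(**redis_kwargs)            # async blocking pool
#   async_client = get_redis_async_client(connection_pool=pool, **redis_kwargs)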

View file

@ -1,3 +1,12 @@
# +-----------------------------------------------+
# | |
# | NOT PROXY BUDGET MANAGER |
# | proxy budget manager is in proxy_server.py |
# | |
# +-----------------------------------------------+
#
# Thank you users! We ❤️ you! - Krrish & Ishaan
import os, json, time
import litellm
from litellm.utils import ModelResponse
@ -16,7 +25,7 @@ class BudgetManager:
self.client_type = client_type
self.project_name = project_name
self.api_base = api_base or "https://api.litellm.ai"
self.headers = headers or {'Content-Type': 'application/json'}
self.headers = headers or {"Content-Type": "application/json"}
## load the data or init the initial dictionaries
self.load_data()

View file

@ -8,7 +8,7 @@
# Thank you users! We ❤️ you! - Krrish & Ishaan
import litellm
import time, logging
import time, logging, asyncio
import json, traceback, ast, hashlib
from typing import Optional, Literal, List, Union, Any
from openai._models import BaseModel as OpenAIObject
@ -28,9 +28,18 @@ class BaseCache:
def set_cache(self, key, value, **kwargs):
raise NotImplementedError
async def async_set_cache(self, key, value, **kwargs):
raise NotImplementedError
def get_cache(self, key, **kwargs):
raise NotImplementedError
async def async_get_cache(self, key, **kwargs):
raise NotImplementedError
async def disconnect(self):
raise NotImplementedError
class InMemoryCache(BaseCache):
def __init__(self):
@ -43,6 +52,16 @@ class InMemoryCache(BaseCache):
if "ttl" in kwargs:
self.ttl_dict[key] = time.time() + kwargs["ttl"]
async def async_set_cache(self, key, value, **kwargs):
self.set_cache(key=key, value=value, **kwargs)
async def async_set_cache_pipeline(self, cache_list, ttl=None):
for cache_key, cache_value in cache_list:
if ttl is not None:
self.set_cache(key=cache_key, value=cache_value, ttl=ttl)
else:
self.set_cache(key=cache_key, value=cache_value)
def get_cache(self, key, **kwargs):
if key in self.cache_dict:
if key in self.ttl_dict:
@ -57,17 +76,26 @@ class InMemoryCache(BaseCache):
return cached_response
return None
async def async_get_cache(self, key, **kwargs):
return self.get_cache(key=key, **kwargs)
def flush_cache(self):
self.cache_dict.clear()
self.ttl_dict.clear()
async def disconnect(self):
pass
def delete_cache(self, key):
self.cache_dict.pop(key, None)
self.ttl_dict.pop(key, None)
class RedisCache(BaseCache):
def __init__(self, host=None, port=None, password=None, **kwargs):
import redis
# if users don't provide one, use the default litellm cache
# if users don't provide one, use the default litellm cache
from ._redis import get_redis_client
def __init__(self, host=None, port=None, password=None, **kwargs):
from ._redis import get_redis_client, get_redis_connection_pool
redis_kwargs = {}
if host is not None:
@ -78,18 +106,84 @@ class RedisCache(BaseCache):
redis_kwargs["password"] = password
redis_kwargs.update(kwargs)
self.redis_client = get_redis_client(**redis_kwargs)
self.redis_kwargs = redis_kwargs
self.async_redis_conn_pool = get_redis_connection_pool()
def init_async_client(self):
from ._redis import get_redis_async_client
return get_redis_async_client(
connection_pool=self.async_redis_conn_pool, **self.redis_kwargs
)
def set_cache(self, key, value, **kwargs):
ttl = kwargs.get("ttl", None)
print_verbose(f"Set Redis Cache: key: {key}\nValue {value}")
print_verbose(f"Set Redis Cache: key: {key}\nValue {value}\nttl={ttl}")
try:
self.redis_client.set(name=key, value=str(value), ex=ttl)
except Exception as e:
# NON blocking - notify users Redis is throwing an exception
logging.debug("LiteLLM Caching: set() - Got exception from REDIS : ", e)
async def async_set_cache(self, key, value, **kwargs):
_redis_client = self.init_async_client()
async with _redis_client as redis_client:
ttl = kwargs.get("ttl", None)
print_verbose(
f"Set ASYNC Redis Cache: key: {key}\nValue {value}\nttl={ttl}"
)
try:
await redis_client.set(name=key, value=json.dumps(value), ex=ttl)
except Exception as e:
# NON blocking - notify users Redis is throwing an exception
logging.debug("LiteLLM Caching: set() - Got exception from REDIS : ", e)
async def async_set_cache_pipeline(self, cache_list, ttl=None):
"""
Use Redis Pipelines for bulk write operations
"""
_redis_client = self.init_async_client()
try:
async with _redis_client as redis_client:
async with redis_client.pipeline(transaction=True) as pipe:
# Iterate through each key-value pair in the cache_list and set them in the pipeline.
for cache_key, cache_value in cache_list:
print_verbose(
f"Set ASYNC Redis Cache PIPELINE: key: {cache_key}\nValue {cache_value}\nttl={ttl}"
)
# Set the value with a TTL if it's provided.
if ttl is not None:
pipe.setex(cache_key, ttl, json.dumps(cache_value))
else:
pipe.set(cache_key, json.dumps(cache_value))
# Execute the pipeline and return the results.
results = await pipe.execute()
print_verbose(f"pipeline results: {results}")
# Optionally, you could process 'results' to make sure that all set operations were successful.
return results
except Exception as e:
print_verbose(f"Error occurred in pipeline write - {str(e)}")
# NON blocking - notify users Redis is throwing an exception
logging.debug("LiteLLM Caching: set() - Got exception from REDIS : ", e)
def _get_cache_logic(self, cached_response: Any):
"""
Common 'get_cache_logic' across sync + async redis client implementations
"""
if cached_response is None:
return cached_response
# cached_response is bytes, e.g. b'{...}' - decode and convert it to a dict / ModelResponse
cached_response = cached_response.decode("utf-8") # Convert bytes to string
try:
cached_response = json.loads(
cached_response
) # Convert string to dictionary
except:
cached_response = ast.literal_eval(cached_response)
return cached_response
def get_cache(self, key, **kwargs):
try:
print_verbose(f"Get Redis Cache: key: {key}")
@ -97,26 +191,361 @@ class RedisCache(BaseCache):
print_verbose(
f"Got Redis Cache: key: {key}, cached_response {cached_response}"
)
if cached_response != None:
# cached_response is in `b{} convert it to ModelResponse
cached_response = cached_response.decode(
"utf-8"
) # Convert bytes to string
try:
cached_response = json.loads(
cached_response
) # Convert string to dictionary
except:
cached_response = ast.literal_eval(cached_response)
return cached_response
return self._get_cache_logic(cached_response=cached_response)
except Exception as e:
# NON blocking - notify users Redis is throwing an exception
traceback.print_exc()
logging.debug("LiteLLM Caching: get() - Got exception from REDIS: ", e)
async def async_get_cache(self, key, **kwargs):
_redis_client = self.init_async_client()
async with _redis_client as redis_client:
try:
print_verbose(f"Get Redis Cache: key: {key}")
cached_response = await redis_client.get(key)
print_verbose(
f"Got Async Redis Cache: key: {key}, cached_response {cached_response}"
)
response = self._get_cache_logic(cached_response=cached_response)
return response
except Exception as e:
# NON blocking - notify users Redis is throwing an exception
traceback.print_exc()
logging.debug("LiteLLM Caching: get() - Got exception from REDIS: ", e)
def flush_cache(self):
self.redis_client.flushall()
async def disconnect(self):
pass
def delete_cache(self, key):
self.redis_client.delete(key)
class RedisSemanticCache(BaseCache):
def __init__(
self,
host=None,
port=None,
password=None,
redis_url=None,
similarity_threshold=None,
use_async=False,
embedding_model="text-embedding-ada-002",
**kwargs,
):
from redisvl.index import SearchIndex
from redisvl.query import VectorQuery
print_verbose(
"redis semantic-cache initializing INDEX - litellm_semantic_cache_index"
)
if similarity_threshold is None:
raise Exception("similarity_threshold must be provided, passed None")
self.similarity_threshold = similarity_threshold
self.embedding_model = embedding_model
schema = {
"index": {
"name": "litellm_semantic_cache_index",
"prefix": "litellm",
"storage_type": "hash",
},
"fields": {
"text": [{"name": "response"}],
"text": [{"name": "prompt"}],
"vector": [
{
"name": "litellm_embedding",
"dims": 1536,
"distance_metric": "cosine",
"algorithm": "flat",
"datatype": "float32",
}
],
},
}
if redis_url is None:
# if no url passed, check if host, port and password are passed, if not raise an Exception
if host is None or port is None or password is None:
# try checking env for host, port and password
import os
host = os.getenv("REDIS_HOST")
port = os.getenv("REDIS_PORT")
password = os.getenv("REDIS_PASSWORD")
if host is None or port is None or password is None:
raise Exception("Redis host, port, and password must be provided")
redis_url = "redis://:" + password + "@" + host + ":" + port
print_verbose(f"redis semantic-cache redis_url: {redis_url}")
if use_async == False:
self.index = SearchIndex.from_dict(schema)
self.index.connect(redis_url=redis_url)
try:
self.index.create(overwrite=False) # don't overwrite existing index
except Exception as e:
print_verbose(f"Got exception creating semantic cache index: {str(e)}")
elif use_async == True:
schema["index"]["name"] = "litellm_semantic_cache_index_async"
self.index = SearchIndex.from_dict(schema)
self.index.connect(redis_url=redis_url, use_async=True)
#
def _get_cache_logic(self, cached_response: Any):
"""
Common 'get_cache_logic' across sync + async redis client implementations
"""
if cached_response is None:
return cached_response
# check if cached_response is bytes
if isinstance(cached_response, bytes):
cached_response = cached_response.decode("utf-8")
try:
cached_response = json.loads(
cached_response
) # Convert string to dictionary
except:
cached_response = ast.literal_eval(cached_response)
return cached_response
def set_cache(self, key, value, **kwargs):
import numpy as np
print_verbose(f"redis semantic-cache set_cache, kwargs: {kwargs}")
# get the prompt
messages = kwargs["messages"]
prompt = ""
for message in messages:
prompt += message["content"]
# create an embedding for prompt
embedding_response = litellm.embedding(
model=self.embedding_model,
input=prompt,
cache={"no-store": True, "no-cache": True},
)
# get the embedding
embedding = embedding_response["data"][0]["embedding"]
# make the embedding a numpy array, convert to bytes
embedding_bytes = np.array(embedding, dtype=np.float32).tobytes()
value = str(value)
assert isinstance(value, str)
new_data = [
{"response": value, "prompt": prompt, "litellm_embedding": embedding_bytes}
]
# Add more data
keys = self.index.load(new_data)
return
def get_cache(self, key, **kwargs):
print_verbose(f"sync redis semantic-cache get_cache, kwargs: {kwargs}")
from redisvl.query import VectorQuery
import numpy as np
# query
# get the messages
messages = kwargs["messages"]
prompt = ""
for message in messages:
prompt += message["content"]
# convert to embedding
embedding_response = litellm.embedding(
model=self.embedding_model,
input=prompt,
cache={"no-store": True, "no-cache": True},
)
# get the embedding
embedding = embedding_response["data"][0]["embedding"]
query = VectorQuery(
vector=embedding,
vector_field_name="litellm_embedding",
return_fields=["response", "prompt", "vector_distance"],
num_results=1,
)
results = self.index.query(query)
if results == None:
return None
if isinstance(results, list):
if len(results) == 0:
return None
vector_distance = results[0]["vector_distance"]
vector_distance = float(vector_distance)
similarity = 1 - vector_distance
cached_prompt = results[0]["prompt"]
# check similarity, if more than self.similarity_threshold, return results
print_verbose(
f"semantic cache: similarity threshold: {self.similarity_threshold}, similarity: {similarity}, prompt: {prompt}, closest_cached_prompt: {cached_prompt}"
)
if similarity > self.similarity_threshold:
# cache hit !
cached_value = results[0]["response"]
print_verbose(
f"got a cache hit, similarity: {similarity}, Current prompt: {prompt}, cached_prompt: {cached_prompt}"
)
return self._get_cache_logic(cached_response=cached_value)
else:
# cache miss !
return None
pass
async def async_set_cache(self, key, value, **kwargs):
import numpy as np
from litellm.proxy.proxy_server import llm_router, llm_model_list
try:
await self.index.acreate(overwrite=False) # don't overwrite existing index
except Exception as e:
print_verbose(f"Got exception creating semantic cache index: {str(e)}")
print_verbose(f"async redis semantic-cache set_cache, kwargs: {kwargs}")
# get the prompt
messages = kwargs["messages"]
prompt = ""
for message in messages:
prompt += message["content"]
# create an embedding for prompt
router_model_names = (
[m["model_name"] for m in llm_model_list]
if llm_model_list is not None
else []
)
if llm_router is not None and self.embedding_model in router_model_names:
user_api_key = kwargs.get("metadata", {}).get("user_api_key", "")
embedding_response = await llm_router.aembedding(
model=self.embedding_model,
input=prompt,
cache={"no-store": True, "no-cache": True},
metadata={
"user_api_key": user_api_key,
"semantic-cache-embedding": True,
"trace_id": kwargs.get("metadata", {}).get("trace_id", None),
},
)
else:
# convert to embedding
embedding_response = await litellm.aembedding(
model=self.embedding_model,
input=prompt,
cache={"no-store": True, "no-cache": True},
)
# get the embedding
embedding = embedding_response["data"][0]["embedding"]
# make the embedding a numpy array, convert to bytes
embedding_bytes = np.array(embedding, dtype=np.float32).tobytes()
value = str(value)
assert isinstance(value, str)
new_data = [
{"response": value, "prompt": prompt, "litellm_embedding": embedding_bytes}
]
# Add more data
keys = await self.index.aload(new_data)
return
async def async_get_cache(self, key, **kwargs):
print_verbose(f"async redis semantic-cache get_cache, kwargs: {kwargs}")
from redisvl.query import VectorQuery
import numpy as np
from litellm.proxy.proxy_server import llm_router, llm_model_list
# query
# get the messages
messages = kwargs["messages"]
prompt = ""
for message in messages:
prompt += message["content"]
router_model_names = (
[m["model_name"] for m in llm_model_list]
if llm_model_list is not None
else []
)
if llm_router is not None and self.embedding_model in router_model_names:
user_api_key = kwargs.get("metadata", {}).get("user_api_key", "")
embedding_response = await llm_router.aembedding(
model=self.embedding_model,
input=prompt,
cache={"no-store": True, "no-cache": True},
metadata={
"user_api_key": user_api_key,
"semantic-cache-embedding": True,
"trace_id": kwargs.get("metadata", {}).get("trace_id", None),
},
)
else:
# convert to embedding
embedding_response = await litellm.aembedding(
model=self.embedding_model,
input=prompt,
cache={"no-store": True, "no-cache": True},
)
# get the embedding
embedding = embedding_response["data"][0]["embedding"]
query = VectorQuery(
vector=embedding,
vector_field_name="litellm_embedding",
return_fields=["response", "prompt", "vector_distance"],
)
results = await self.index.aquery(query)
if results == None:
kwargs.setdefault("metadata", {})["semantic-similarity"] = 0.0
return None
if isinstance(results, list):
if len(results) == 0:
kwargs.setdefault("metadata", {})["semantic-similarity"] = 0.0
return None
vector_distance = results[0]["vector_distance"]
vector_distance = float(vector_distance)
similarity = 1 - vector_distance
cached_prompt = results[0]["prompt"]
# check similarity, if more than self.similarity_threshold, return results
print_verbose(
f"semantic cache: similarity threshold: {self.similarity_threshold}, similarity: {similarity}, prompt: {prompt}, closest_cached_prompt: {cached_prompt}"
)
# update kwargs["metadata"] with similarity, don't rewrite the original metadata
kwargs.setdefault("metadata", {})["semantic-similarity"] = similarity
if similarity > self.similarity_threshold:
# cache hit !
cached_value = results[0]["response"]
print_verbose(
f"got a cache hit, similarity: {similarity}, Current prompt: {prompt}, cached_prompt: {cached_prompt}"
)
return self._get_cache_logic(cached_response=cached_value)
else:
# cache miss !
return None
pass
async def _index_info(self):
return await self.index.ainfo()
class S3Cache(BaseCache):
def __init__(
@ -195,6 +624,9 @@ class S3Cache(BaseCache):
# NON blocking - notify users S3 is throwing an exception
print_verbose(f"S3 Caching: set_cache() - Got exception from S3: {e}")
async def async_set_cache(self, key, value, **kwargs):
self.set_cache(key=key, value=value, **kwargs)
def get_cache(self, key, **kwargs):
import boto3, botocore
@ -237,9 +669,15 @@ class S3Cache(BaseCache):
traceback.print_exc()
print_verbose(f"S3 Caching: get_cache() - Got exception from S3: {e}")
async def async_get_cache(self, key, **kwargs):
return self.get_cache(key=key, **kwargs)
def flush_cache(self):
pass
async def disconnect(self):
pass
class DualCache(BaseCache):
"""
@ -304,15 +742,22 @@ class DualCache(BaseCache):
if self.redis_cache is not None:
self.redis_cache.flush_cache()
def delete_cache(self, key):
if self.in_memory_cache is not None:
self.in_memory_cache.delete_cache(key)
if self.redis_cache is not None:
self.redis_cache.delete_cache(key)
#### LiteLLM.Completion / Embedding Cache ####
class Cache:
def __init__(
self,
type: Optional[Literal["local", "redis", "s3"]] = "local",
type: Optional[Literal["local", "redis", "redis-semantic", "s3"]] = "local",
host: Optional[str] = None,
port: Optional[str] = None,
password: Optional[str] = None,
similarity_threshold: Optional[float] = None,
supported_call_types: Optional[
List[Literal["completion", "acompletion", "embedding", "aembedding"]]
] = ["completion", "acompletion", "embedding", "aembedding"],
@ -327,16 +772,20 @@ class Cache:
s3_aws_secret_access_key: Optional[str] = None,
s3_aws_session_token: Optional[str] = None,
s3_config: Optional[Any] = None,
redis_semantic_cache_use_async=False,
redis_semantic_cache_embedding_model="text-embedding-ada-002",
**kwargs,
):
"""
Initializes the cache based on the given type.
Args:
type (str, optional): The type of cache to initialize. Can be "local" or "redis". Defaults to "local".
type (str, optional): The type of cache to initialize. Can be "local", "redis", "redis-semantic", or "s3". Defaults to "local".
host (str, optional): The host address for the Redis cache. Required if type is "redis".
port (int, optional): The port number for the Redis cache. Required if type is "redis".
password (str, optional): The password for the Redis cache. Required if type is "redis".
similarity_threshold (float, optional): The similarity threshold for semantic-caching, Required if type is "redis-semantic"
supported_call_types (list, optional): List of call types to cache for. Defaults to cache == on for all call types.
**kwargs: Additional keyword arguments for redis.Redis() cache
@ -348,9 +797,19 @@ class Cache:
"""
if type == "redis":
self.cache: BaseCache = RedisCache(host, port, password, **kwargs)
if type == "local":
elif type == "redis-semantic":
self.cache = RedisSemanticCache(
host,
port,
password,
similarity_threshold=similarity_threshold,
use_async=redis_semantic_cache_use_async,
embedding_model=redis_semantic_cache_embedding_model,
**kwargs,
)
elif type == "local":
self.cache = InMemoryCache()
if type == "s3":
elif type == "s3":
self.cache = S3Cache(
s3_bucket_name=s3_bucket_name,
s3_region_name=s3_region_name,
@ -476,6 +935,45 @@ class Cache:
}
time.sleep(0.02)
def _get_cache_logic(
self,
cached_result: Optional[Any],
max_age: Optional[float],
):
"""
Common get cache logic across sync + async implementations
"""
# Check if a timestamp was stored with the cached response
if (
cached_result is not None
and isinstance(cached_result, dict)
and "timestamp" in cached_result
):
timestamp = cached_result["timestamp"]
current_time = time.time()
# Calculate age of the cached response
response_age = current_time - timestamp
# Check if the cached response is older than the max-age
if max_age is not None and response_age > max_age:
return None # Cached response is too old
# If the response is fresh, or there's no max-age requirement, return the cached response
# cached_response is in `b{} convert it to ModelResponse
cached_response = cached_result.get("response")
try:
if isinstance(cached_response, dict):
pass
else:
cached_response = json.loads(
cached_response # type: ignore
) # Convert string to dictionary
except:
cached_response = ast.literal_eval(cached_response) # type: ignore
return cached_response
return cached_result
def get_cache(self, *args, **kwargs):
"""
Retrieves the cached result for the given arguments.
@ -488,6 +986,7 @@ class Cache:
The cached result if it exists, otherwise None.
"""
try: # never block execution
messages = kwargs.get("messages", [])
if "cache_key" in kwargs:
cache_key = kwargs["cache_key"]
else:
@ -497,55 +996,44 @@ class Cache:
max_age = cache_control_args.get(
"s-max-age", cache_control_args.get("s-maxage", float("inf"))
)
cached_result = self.cache.get_cache(cache_key)
# Check if a timestamp was stored with the cached response
if (
cached_result is not None
and isinstance(cached_result, dict)
and "timestamp" in cached_result
and max_age is not None
):
timestamp = cached_result["timestamp"]
current_time = time.time()
# Calculate age of the cached response
response_age = current_time - timestamp
# Check if the cached response is older than the max-age
if response_age > max_age:
print_verbose(
f"Cached response for key {cache_key} is too old. Max-age: {max_age}s, Age: {response_age}s"
)
return None # Cached response is too old
# If the response is fresh, or there's no max-age requirement, return the cached response
# cached_response is in `b{} convert it to ModelResponse
cached_response = cached_result.get("response")
try:
if isinstance(cached_response, dict):
pass
else:
cached_response = json.loads(
cached_response
) # Convert string to dictionary
except:
cached_response = ast.literal_eval(cached_response)
return cached_response
return cached_result
cached_result = self.cache.get_cache(cache_key, messages=messages)
return self._get_cache_logic(
cached_result=cached_result, max_age=max_age
)
except Exception as e:
print_verbose(f"An exception occurred: {traceback.format_exc()}")
return None
def add_cache(self, result, *args, **kwargs):
async def async_get_cache(self, *args, **kwargs):
"""
Adds a result to the cache.
Async get cache implementation.
Args:
*args: args to litellm.completion() or embedding()
**kwargs: kwargs to litellm.completion() or embedding()
Used for embedding calls in async wrapper
"""
try: # never block execution
messages = kwargs.get("messages", [])
if "cache_key" in kwargs:
cache_key = kwargs["cache_key"]
else:
cache_key = self.get_cache_key(*args, **kwargs)
if cache_key is not None:
cache_control_args = kwargs.get("cache", {})
max_age = cache_control_args.get(
"s-max-age", cache_control_args.get("s-maxage", float("inf"))
)
cached_result = await self.cache.async_get_cache(
cache_key, *args, **kwargs
)
return self._get_cache_logic(
cached_result=cached_result, max_age=max_age
)
except Exception as e:
print_verbose(f"An exception occurred: {traceback.format_exc()}")
return None
Returns:
None
def _add_cache_logic(self, result, *args, **kwargs):
"""
Common implementation across sync + async add_cache functions
"""
try:
if "cache_key" in kwargs:
@ -564,14 +1052,82 @@ class Cache:
if k == "ttl":
kwargs["ttl"] = v
cached_data = {"timestamp": time.time(), "response": result}
self.cache.set_cache(cache_key, cached_data, **kwargs)
return cache_key, cached_data, kwargs
else:
raise Exception("cache key is None")
except Exception as e:
raise e
def add_cache(self, result, *args, **kwargs):
"""
Adds a result to the cache.
Args:
*args: args to litellm.completion() or embedding()
**kwargs: kwargs to litellm.completion() or embedding()
Returns:
None
"""
try:
cache_key, cached_data, kwargs = self._add_cache_logic(
result=result, *args, **kwargs
)
self.cache.set_cache(cache_key, cached_data, **kwargs)
except Exception as e:
print_verbose(f"LiteLLM Cache: Excepton add_cache: {str(e)}")
traceback.print_exc()
pass
async def _async_add_cache(self, result, *args, **kwargs):
self.add_cache(result, *args, **kwargs)
async def async_add_cache(self, result, *args, **kwargs):
"""
Async implementation of add_cache
"""
try:
cache_key, cached_data, kwargs = self._add_cache_logic(
result=result, *args, **kwargs
)
await self.cache.async_set_cache(cache_key, cached_data, **kwargs)
except Exception as e:
print_verbose(f"LiteLLM Cache: Excepton add_cache: {str(e)}")
traceback.print_exc()
async def async_add_cache_pipeline(self, result, *args, **kwargs):
"""
Async implementation of add_cache for Embedding calls
Does a bulk write, to prevent using too many clients
"""
try:
cache_list = []
for idx, i in enumerate(kwargs["input"]):
preset_cache_key = litellm.cache.get_cache_key(
*args, **{**kwargs, "input": i}
)
kwargs["cache_key"] = preset_cache_key
embedding_response = result.data[idx]
cache_key, cached_data, kwargs = self._add_cache_logic(
result=embedding_response,
*args,
**kwargs,
)
cache_list.append((cache_key, cached_data))
if hasattr(self.cache, "async_set_cache_pipeline"):
await self.cache.async_set_cache_pipeline(cache_list=cache_list)
else:
tasks = []
for val in cache_list:
tasks.append(
self.cache.async_set_cache(cache_key, cached_data, **kwargs)
)
await asyncio.gather(*tasks)
except Exception as e:
print_verbose(f"LiteLLM Cache: Excepton add_cache: {str(e)}")
traceback.print_exc()
async def disconnect(self):
if hasattr(self.cache, "disconnect"):
await self.cache.disconnect()
def enable_cache(

View file

@ -63,6 +63,22 @@ class CustomLogger: # https://docs.litellm.ai/docs/observability/custom_callbac
):
pass
async def async_post_call_streaming_hook(
self, original_exception: Exception, user_api_key_dict: UserAPIKeyAuth
):
"""
Returns the streaming chunk before it's returned to the user
"""
pass
async def async_post_call_success_hook(
self, original_exception: Exception, user_api_key_dict: UserAPIKeyAuth
):
"""
Returns the LLM response before it's returned to the user
"""
pass
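# Illustrative sketch (assumption, not part of this class): a proxy-side handler
# can subclass CustomLogger and override the hooks above, e.g.
#
#   class MyProxyHooks(CustomLogger):
#       async def async_post_call_success_hook(self, original_exception, user_api_key_dict):
#           # inspect/log the response before it is returned to the user
#           pass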
#### SINGLE-USE #### - https://docs.litellm.ai/docs/observability/custom_callback#using-your-custom-callback-function
def log_input_event(self, model, messages, kwargs, print_verbose, callback_func):

View file

@ -2,6 +2,7 @@
# On success, logs events to Helicone
import dotenv, os
import requests
import litellm
dotenv.load_dotenv() # Loading env variables using dotenv
import traceback
@ -56,6 +57,10 @@ class HeliconeLogger:
else "gpt-3.5-turbo"
)
provider_request = {"model": model, "messages": messages}
if isinstance(response_obj, litellm.EmbeddingResponse) or isinstance(
response_obj, litellm.ModelResponse
):
response_obj = response_obj.json()
if "claude" in model:
provider_request, response_obj = self.claude_mapping(

View file

@ -9,11 +9,12 @@ dotenv.load_dotenv() # Loading env variables using dotenv
import traceback
from packaging.version import Version
from litellm._logging import verbose_logger
import litellm
class LangFuseLogger:
# Class variables or attributes
def __init__(self):
def __init__(self, langfuse_public_key=None, langfuse_secret=None):
try:
from langfuse import Langfuse
except Exception as e:
@ -21,8 +22,8 @@ class LangFuseLogger:
f"\033[91mLangfuse not installed, try running 'pip install langfuse' to fix this error: {e}\033[0m"
)
# Instance variables
self.secret_key = os.getenv("LANGFUSE_SECRET_KEY")
self.public_key = os.getenv("LANGFUSE_PUBLIC_KEY")
self.secret_key = langfuse_secret or os.getenv("LANGFUSE_SECRET_KEY")
self.public_key = langfuse_public_key or os.getenv("LANGFUSE_PUBLIC_KEY")
self.langfuse_host = os.getenv("LANGFUSE_HOST", "https://cloud.langfuse.com")
self.langfuse_release = os.getenv("LANGFUSE_RELEASE")
self.langfuse_debug = os.getenv("LANGFUSE_DEBUG")
@ -34,8 +35,41 @@ class LangFuseLogger:
debug=self.langfuse_debug,
)
if os.getenv("UPSTREAM_LANGFUSE_SECRET_KEY") is not None:
self.upstream_langfuse_secret_key = os.getenv(
"UPSTREAM_LANGFUSE_SECRET_KEY"
)
self.upstream_langfuse_public_key = os.getenv(
"UPSTREAM_LANGFUSE_PUBLIC_KEY"
)
self.upstream_langfuse_host = os.getenv("UPSTREAM_LANGFUSE_HOST")
self.upstream_langfuse_release = os.getenv("UPSTREAM_LANGFUSE_RELEASE")
self.upstream_langfuse_debug = os.getenv("UPSTREAM_LANGFUSE_DEBUG")
self.upstream_langfuse = Langfuse(
public_key=self.upstream_langfuse_public_key,
secret_key=self.upstream_langfuse_secret_key,
host=self.upstream_langfuse_host,
release=self.upstream_langfuse_release,
debug=self.upstream_langfuse_debug,
)
else:
self.upstream_langfuse = None
# def log_error(kwargs, response_obj, start_time, end_time):
# generation = trace.generation(
# level ="ERROR" # can be any of DEBUG, DEFAULT, WARNING or ERROR
# status_message='error' # can be any string (e.g. stringified stack trace or error body)
# )
def log_event(
self, kwargs, response_obj, start_time, end_time, user_id, print_verbose
self,
kwargs,
response_obj,
start_time,
end_time,
user_id,
print_verbose,
level="DEFAULT",
status_message=None,
):
# Method definition
@ -63,32 +97,49 @@ class LangFuseLogger:
pass
# end of processing langfuse ########################
input = prompt
output = response_obj["choices"][0]["message"].json()
print_verbose(
f"OUTPUT IN LANGFUSE: {output}; original: {response_obj['choices'][0]['message']}"
)
self._log_langfuse_v2(
user_id,
metadata,
output,
start_time,
end_time,
kwargs,
optional_params,
input,
response_obj,
) if self._is_langfuse_v2() else self._log_langfuse_v1(
user_id,
metadata,
output,
start_time,
end_time,
kwargs,
optional_params,
input,
response_obj,
)
if (
level == "ERROR"
and status_message is not None
and isinstance(status_message, str)
):
input = prompt
output = status_message
elif response_obj is not None and (
kwargs.get("call_type", None) == "embedding"
or isinstance(response_obj, litellm.EmbeddingResponse)
):
input = prompt
output = response_obj["data"]
elif response_obj is not None:
input = prompt
output = response_obj["choices"][0]["message"].json()
print_verbose(f"OUTPUT IN LANGFUSE: {output}; original: {response_obj}")
if self._is_langfuse_v2():
self._log_langfuse_v2(
user_id,
metadata,
output,
start_time,
end_time,
kwargs,
optional_params,
input,
response_obj,
level,
print_verbose,
)
elif response_obj is not None:
self._log_langfuse_v1(
user_id,
metadata,
output,
start_time,
end_time,
kwargs,
optional_params,
input,
response_obj,
)
self.Langfuse.flush()
print_verbose(
@ -97,15 +148,15 @@ class LangFuseLogger:
verbose_logger.info(f"Langfuse Layer Logging - logging success")
except:
traceback.print_exc()
print_verbose(f"Langfuse Layer Error - {traceback.format_exc()}")
print(f"Langfuse Layer Error - {traceback.format_exc()}")
pass
async def _async_log_event(
self, kwargs, response_obj, start_time, end_time, user_id, print_verbose
):
self.log_event(
kwargs, response_obj, start_time, end_time, user_id, print_verbose
)
"""
TODO: support async calls when langfuse is truly async
"""
def _is_langfuse_v2(self):
import langfuse
@ -167,40 +218,84 @@ class LangFuseLogger:
optional_params,
input,
response_obj,
level,
print_verbose,
):
import langfuse
tags = []
supports_tags = Version(langfuse.version.__version__) >= Version("2.6.3")
try:
tags = []
supports_tags = Version(langfuse.version.__version__) >= Version("2.6.3")
supports_costs = Version(langfuse.version.__version__) >= Version("2.7.3")
trace_params = {
"name": metadata.get("generation_name", "litellm-completion"),
"input": input,
"output": output,
"user_id": metadata.get("trace_user_id", user_id),
"id": metadata.get("trace_id", None),
}
if supports_tags:
for key, value in metadata.items():
tags.append(f"{key}:{value}")
if "cache_hit" in kwargs:
tags.append(f"cache_hit:{kwargs['cache_hit']}")
trace_params.update({"tags": tags})
print_verbose(f"Langfuse Layer Logging - logging to langfuse v2 ")
trace = self.Langfuse.trace(**trace_params)
if supports_tags:
metadata_tags = metadata.get("tags", [])
tags = metadata_tags
trace.generation(
name=metadata.get("generation_name", "litellm-completion"),
id=metadata.get("generation_id", None),
startTime=start_time,
endTime=end_time,
model=kwargs["model"],
modelParameters=optional_params,
input=input,
output=output,
usage={
"prompt_tokens": response_obj["usage"]["prompt_tokens"],
"completion_tokens": response_obj["usage"]["completion_tokens"],
},
metadata=metadata,
)
generation_name = metadata.get("generation_name", None)
if generation_name is None:
# just log `litellm-{call_type}` as the generation name
generation_name = f"litellm-{kwargs.get('call_type', 'completion')}"
trace_params = {
"name": generation_name,
"input": input,
"user_id": metadata.get("trace_user_id", user_id),
"id": metadata.get("trace_id", None),
"session_id": metadata.get("session_id", None),
}
if level == "ERROR":
trace_params["status_message"] = output
else:
trace_params["output"] = output
cost = kwargs.get("response_cost", None)
print_verbose(f"trace: {cost}")
if supports_tags:
for key, value in metadata.items():
if key in [
"user_api_key",
"user_api_key_user_id",
"semantic-similarity",
]:
tags.append(f"{key}:{value}")
if "cache_hit" in kwargs:
if kwargs["cache_hit"] is None:
kwargs["cache_hit"] = False
tags.append(f"cache_hit:{kwargs['cache_hit']}")
trace_params.update({"tags": tags})
trace = self.Langfuse.trace(**trace_params)
generation_id = None
usage = None
if response_obj is not None and response_obj.get("id", None) is not None:
generation_id = litellm.utils.get_logging_id(start_time, response_obj)
usage = {
"prompt_tokens": response_obj["usage"]["prompt_tokens"],
"completion_tokens": response_obj["usage"]["completion_tokens"],
"total_cost": cost if supports_costs else None,
}
generation_params = {
"name": generation_name,
"id": metadata.get("generation_id", generation_id),
"startTime": start_time,
"endTime": end_time,
"model": kwargs["model"],
"modelParameters": optional_params,
"input": input,
"output": output,
"usage": usage,
"metadata": metadata,
"level": level,
}
if output is not None and isinstance(output, str) and level == "ERROR":
generation_params["statusMessage"] = output
trace.generation(**generation_params)
except Exception as e:
print(f"Langfuse Layer Error - {traceback.format_exc()}")

View file

@ -8,7 +8,7 @@ dotenv.load_dotenv() # Loading env variables using dotenv
import traceback
import datetime, subprocess, sys
import litellm, uuid
from litellm._logging import print_verbose
from litellm._logging import print_verbose, verbose_logger
class S3Logger:
@ -31,7 +31,9 @@ class S3Logger:
import boto3
try:
print_verbose("in init s3 logger")
verbose_logger.debug(
f"in init s3 logger - s3_callback_params {litellm.s3_callback_params}"
)
if litellm.s3_callback_params is not None:
# read in .env variables - example os.environ/AWS_BUCKET_NAME
@ -42,7 +44,7 @@ class S3Logger:
s3_bucket_name = litellm.s3_callback_params.get("s3_bucket_name")
s3_region_name = litellm.s3_callback_params.get("s3_region_name")
s3_api_version = litellm.s3_callback_params.get("s3_api_version")
s3_use_ssl = litellm.s3_callback_params.get("s3_use_ssl")
s3_use_ssl = litellm.s3_callback_params.get("s3_use_ssl", True)
s3_verify = litellm.s3_callback_params.get("s3_verify")
s3_endpoint_url = litellm.s3_callback_params.get("s3_endpoint_url")
s3_aws_access_key_id = litellm.s3_callback_params.get(
@ -59,6 +61,7 @@ class S3Logger:
self.bucket_name = s3_bucket_name
self.s3_path = s3_path
verbose_logger.debug(f"s3 logger using endpoint url {s3_endpoint_url}")
# Create an S3 client with custom endpoint URL
self.s3_client = boto3.client(
"s3",
@ -84,7 +87,9 @@ class S3Logger:
def log_event(self, kwargs, response_obj, start_time, end_time, print_verbose):
try:
print_verbose(f"s3 Logging - Enters logging function for model {kwargs}")
verbose_logger.debug(
f"s3 Logging - Enters logging function for model {kwargs}"
)
# construct payload to send to s3
# follows the same params as langfuse.py
@ -123,12 +128,22 @@ class S3Logger:
# non blocking if it can't cast to a str
pass
s3_file_name = litellm.utils.get_logging_id(start_time, payload) or ""
s3_object_key = (
(self.s3_path.rstrip("/") + "/" if self.s3_path else "")
+ payload["id"]
+ "-time="
+ str(start_time)
+ start_time.strftime("%Y-%m-%d")
+ "/"
+ s3_file_name
) # we need the s3 key to include the time, so we log cache hits too
s3_object_key += ".json"
s3_object_download_filename = (
"time-"
+ start_time.strftime("%Y-%m-%dT%H-%M-%S-%f")
+ "_"
+ payload["id"]
+ ".json"
)
import json
@ -142,7 +157,8 @@ class S3Logger:
Body=payload,
ContentType="application/json",
ContentLanguage="en",
ContentDisposition=f'inline; filename="{key}.json"',
ContentDisposition=f'inline; filename="{s3_object_download_filename}"',
CacheControl="private, immutable, max-age=31536000, s-maxage=0",
)
print_verbose(f"Response from s3:{str(response)}")
@ -151,5 +167,5 @@ class S3Logger:
return response
except Exception as e:
traceback.print_exc()
print_verbose(f"s3 Layer Error - {str(e)}\n{traceback.format_exc()}")
verbose_logger.debug(f"s3 Layer Error - {str(e)}\n{traceback.format_exc()}")
pass

View file

@ -2,9 +2,9 @@ import json, copy, types
import os
from enum import Enum
import time
from typing import Callable, Optional, Any, Union
from typing import Callable, Optional, Any, Union, List
import litellm
from litellm.utils import ModelResponse, get_secret, Usage
from litellm.utils import ModelResponse, get_secret, Usage, ImageResponse
from .prompt_templates.factory import prompt_factory, custom_prompt
import httpx
@ -282,6 +282,73 @@ class AmazonLlamaConfig:
}
class AmazonStabilityConfig:
"""
Reference: https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=stability.stable-diffusion-xl-v0
Supported Params for the Amazon / Stable Diffusion models:
- `cfg_scale` (integer): Default `7`. Between [ 0 .. 35 ]. How strictly the diffusion process adheres to the prompt text (higher values keep your image closer to your prompt)
- `seed` (float): Default: `0`. Between [ 0 .. 4294967295 ]. Random noise seed (omit this option or use 0 for a random seed)
- `steps` (array of strings): Default `30`. Between [ 10 .. 50 ]. Number of diffusion steps to run.
- `width` (integer): Default: `512`. multiple of 64 >= 128. Width of the image to generate, in pixels, in an increment divisible by 64.
Engine-specific dimension validation:
- SDXL Beta: must be between 128x128 and 512x896 (or 896x512); only one dimension can be greater than 512.
- SDXL v0.9: must be one of 1024x1024, 1152x896, 1216x832, 1344x768, 1536x640, 640x1536, 768x1344, 832x1216, or 896x1152
- SDXL v1.0: same as SDXL v0.9
- SD v1.6: must be between 320x320 and 1536x1536
- `height` (integer): Default: `512`. multiple of 64 >= 128. Height of the image to generate, in pixels, in an increment divisible by 64.
Engine-specific dimension validation:
- SDXL Beta: must be between 128x128 and 512x896 (or 896x512); only one dimension can be greater than 512.
- SDXL v0.9: must be one of 1024x1024, 1152x896, 1216x832, 1344x768, 1536x640, 640x1536, 768x1344, 832x1216, or 896x1152
- SDXL v1.0: same as SDXL v0.9
- SD v1.6: must be between 320x320 and 1536x1536
"""
cfg_scale: Optional[int] = None
seed: Optional[float] = None
steps: Optional[List[str]] = None
width: Optional[int] = None
height: Optional[int] = None
def __init__(
self,
cfg_scale: Optional[int] = None,
seed: Optional[float] = None,
steps: Optional[List[str]] = None,
width: Optional[int] = None,
height: Optional[int] = None,
) -> None:
locals_ = locals()
for key, value in locals_.items():
if key != "self" and value is not None:
setattr(self.__class__, key, value)
@classmethod
def get_config(cls):
return {
k: v
for k, v in cls.__dict__.items()
if not k.startswith("__")
and not isinstance(
v,
(
types.FunctionType,
types.BuiltinFunctionType,
classmethod,
staticmethod,
),
)
and v is not None
}
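# Illustrative sketch (assumption, not part of this module): values passed to the
# constructor are set as class-level defaults and surface via get_config(), e.g.
#
#   AmazonStabilityConfig(cfg_scale=10, width=512, height=512)
#   AmazonStabilityConfig.get_config()  # -> {"cfg_scale": 10, "width": 512, "height": 512}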
def init_bedrock_client(
region_name=None,
aws_access_key_id: Optional[str] = None,
@ -289,7 +356,9 @@ def init_bedrock_client(
aws_region_name: Optional[str] = None,
aws_bedrock_runtime_endpoint: Optional[str] = None,
aws_session_name: Optional[str] = None,
aws_profile_name: Optional[str] = None,
aws_role_name: Optional[str] = None,
timeout: Optional[int] = None,
):
# check for custom AWS_REGION_NAME and use it if not passed to init_bedrock_client
litellm_aws_region_name = get_secret("AWS_REGION_NAME", None)
@ -303,6 +372,7 @@ def init_bedrock_client(
aws_region_name,
aws_bedrock_runtime_endpoint,
aws_session_name,
aws_profile_name,
aws_role_name,
]
@ -317,6 +387,7 @@ def init_bedrock_client(
aws_region_name,
aws_bedrock_runtime_endpoint,
aws_session_name,
aws_profile_name,
aws_role_name,
) = params_to_check
@ -346,6 +417,8 @@ def init_bedrock_client(
import boto3
config = boto3.session.Config(connect_timeout=timeout, read_timeout=timeout)
### CHECK STS ###
if aws_role_name is not None and aws_session_name is not None:
# use sts if role name passed in
@ -366,6 +439,7 @@ def init_bedrock_client(
aws_session_token=sts_response["Credentials"]["SessionToken"],
region_name=region_name,
endpoint_url=endpoint_url,
config=config,
)
elif aws_access_key_id is not None:
# uses auth params passed to completion
@ -377,6 +451,16 @@ def init_bedrock_client(
aws_secret_access_key=aws_secret_access_key,
region_name=region_name,
endpoint_url=endpoint_url,
config=config,
)
elif aws_profile_name is not None:
# uses auth values from AWS profile usually stored in ~/.aws/credentials
client = boto3.Session(profile_name=aws_profile_name).client(
service_name="bedrock-runtime",
region_name=region_name,
endpoint_url=endpoint_url,
config=config,
)
else:
# aws_access_key_id is None, assume user is trying to auth using env variables
@ -386,6 +470,7 @@ def init_bedrock_client(
service_name="bedrock-runtime",
region_name=region_name,
endpoint_url=endpoint_url,
config=config,
)
return client
@ -441,6 +526,7 @@ def completion(
optional_params=None,
litellm_params=None,
logger_fn=None,
timeout=None,
):
exception_mapping_worked = False
try:
@ -450,6 +536,7 @@ def completion(
aws_region_name = optional_params.pop("aws_region_name", None)
aws_role_name = optional_params.pop("aws_role_name", None)
aws_session_name = optional_params.pop("aws_session_name", None)
aws_profile_name = optional_params.pop("aws_profile_name", None)
aws_bedrock_runtime_endpoint = optional_params.pop(
"aws_bedrock_runtime_endpoint", None
)
@ -466,6 +553,8 @@ def completion(
aws_bedrock_runtime_endpoint=aws_bedrock_runtime_endpoint,
aws_role_name=aws_role_name,
aws_session_name=aws_session_name,
aws_profile_name=aws_profile_name,
timeout=timeout,
)
model = model
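Editor's illustrative sketch (not part of this diff): the new aws_profile_name and timeout kwargs are popped from optional_params above, so a caller would pass them straight to litellm.completion. Model string, profile name, and region below are placeholders.
import litellm

response = litellm.completion(
    model="bedrock/anthropic.claude-v2",   # placeholder Bedrock model
    messages=[{"role": "user", "content": "Hello"}],
    aws_profile_name="my-profile",         # profile stored in ~/.aws/credentials
    aws_region_name="us-west-2",
    timeout=60,
)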
@ -652,6 +741,8 @@ def completion(
try:
if len(outputText) > 0:
model_response["choices"][0]["message"]["content"] = outputText
else:
raise Exception()
except:
raise BedrockError(
message=json.dumps(outputText),
@ -659,9 +750,16 @@ def completion(
)
## CALCULATING USAGE - baseten charges on time, not tokens - have some mapping of cost here.
prompt_tokens = len(encoding.encode(prompt))
completion_tokens = len(
encoding.encode(model_response["choices"][0]["message"].get("content", ""))
prompt_tokens = response_metadata.get(
"x-amzn-bedrock-input-token-count", len(encoding.encode(prompt))
)
completion_tokens = response_metadata.get(
"x-amzn-bedrock-output-token-count",
len(
encoding.encode(
model_response["choices"][0]["message"].get("content", "")
)
),
)
model_response["created"] = int(time.time())
@ -672,6 +770,8 @@ def completion(
total_tokens=prompt_tokens + completion_tokens,
)
model_response.usage = usage
model_response._hidden_params["region_name"] = client.meta.region_name
print_verbose(f"model_response._hidden_params: {model_response._hidden_params}")
return model_response
except BedrockError as e:
exception_mapping_worked = True
@ -693,6 +793,11 @@ def _embedding_func_single(
encoding=None,
logging_obj=None,
):
if isinstance(input, str) is False:
raise BedrockError(
message="Bedrock Embedding API input must be type str | List[str]",
status_code=400,
)
# logic for parsing in - calling - parsing out model embedding calls
## FORMAT EMBEDDING INPUT ##
provider = model.split(".")[0]
@ -786,7 +891,8 @@ def embedding(
aws_role_name=aws_role_name,
aws_session_name=aws_session_name,
)
if type(input) == str:
if isinstance(input, str):
## Embedding Call
embeddings = [
_embedding_func_single(
model,
@ -796,8 +902,8 @@ def embedding(
logging_obj=logging_obj,
)
]
else:
## Embedding Call
elif isinstance(input, list):
## Embedding Call - assuming this is a List[str]
embeddings = [
_embedding_func_single(
model,
@ -808,6 +914,12 @@ def embedding(
)
for i in input
] # [TODO]: make these parallel calls
else:
# enters this branch if input = int, ex. input=2
raise BedrockError(
message="Bedrock Embedding API input must be type str | List[str]",
status_code=400,
)
## Populate OpenAI compliant dictionary
embedding_response = []
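Editor's illustrative sketch (not part of this diff): the input validation above accepts str or List[str] and rejects anything else with a 400. The model string is a placeholder.
import litellm

# single string -> one embedding
litellm.embedding(model="bedrock/amazon.titan-embed-text-v1", input="hello world")

# list of strings -> one embedding per item
litellm.embedding(model="bedrock/amazon.titan-embed-text-v1", input=["hello", "world"])

# anything else (e.g. input=2) raises BedrockError(status_code=400)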
@ -834,3 +946,112 @@ def embedding(
model_response.usage = usage
return model_response
def image_generation(
model: str,
prompt: str,
timeout=None,
logging_obj=None,
model_response=None,
optional_params=None,
aimg_generation=False,
):
"""
Bedrock Image Gen endpoint support
"""
### BOTO3 INIT ###
# pop aws_secret_access_key, aws_access_key_id, aws_region_name from kwargs, since completion calls fail with them
aws_secret_access_key = optional_params.pop("aws_secret_access_key", None)
aws_access_key_id = optional_params.pop("aws_access_key_id", None)
aws_region_name = optional_params.pop("aws_region_name", None)
aws_role_name = optional_params.pop("aws_role_name", None)
aws_session_name = optional_params.pop("aws_session_name", None)
aws_bedrock_runtime_endpoint = optional_params.pop(
"aws_bedrock_runtime_endpoint", None
)
# use passed in BedrockRuntime.Client if provided, otherwise create a new one
client = init_bedrock_client(
aws_access_key_id=aws_access_key_id,
aws_secret_access_key=aws_secret_access_key,
aws_region_name=aws_region_name,
aws_bedrock_runtime_endpoint=aws_bedrock_runtime_endpoint,
aws_role_name=aws_role_name,
aws_session_name=aws_session_name,
timeout=timeout,
)
### FORMAT IMAGE GENERATION INPUT ###
modelId = model
provider = model.split(".")[0]
inference_params = copy.deepcopy(optional_params)
inference_params.pop(
"user", None
) # make sure user is not passed in for bedrock call
data = {}
if provider == "stability":
prompt = prompt.replace(os.linesep, " ")
## LOAD CONFIG
config = litellm.AmazonStabilityConfig.get_config()
for k, v in config.items():
if (
k not in inference_params
): # completion(top_k=3) > anthropic_config(top_k=3) <- allows for dynamic variables to be passed in
inference_params[k] = v
data = {"text_prompts": [{"text": prompt, "weight": 1}], **inference_params}
else:
raise BedrockError(
status_code=422, message=f"Unsupported model={model}, passed in"
)
body = json.dumps(data).encode("utf-8")
## LOGGING
request_str = f"""
response = client.invoke_model(
body={body},
modelId={modelId},
accept="application/json",
contentType="application/json",
)""" # type: ignore
logging_obj.pre_call(
input=prompt,
api_key="", # boto3 is used for init.
additional_args={
"complete_input_dict": {"model": modelId, "texts": prompt},
"request_str": request_str,
},
)
try:
response = client.invoke_model(
body=body,
modelId=modelId,
accept="application/json",
contentType="application/json",
)
response_body = json.loads(response.get("body").read())
## LOGGING
logging_obj.post_call(
input=prompt,
api_key="",
additional_args={"complete_input_dict": data},
original_response=json.dumps(response_body),
)
except Exception as e:
raise BedrockError(
message=f"Embedding Error with model {model}: {e}", status_code=500
)
### FORMAT RESPONSE TO OPENAI FORMAT ###
if response_body is None:
raise Exception("Error in response object format")
if model_response is None:
model_response = ImageResponse()
image_list: List = []
for artifact in response_body["artifacts"]:
image_dict = {"url": artifact["base64"]}
model_response.data = image_dict
return model_response

View file

@ -145,8 +145,17 @@ def get_ollama_response(
): # completion(top_k=3) > cohere_config(top_k=3) <- allows for dynamic variables to be passed in
optional_params[k] = v
optional_params["stream"] = optional_params.get("stream", False)
data = {"model": model, "prompt": prompt, **optional_params}
stream = optional_params.pop("stream", False)
format = optional_params.pop("format", None)
data = {
"model": model,
"prompt": prompt,
"options": optional_params,
"stream": stream,
}
if format is not None:
data["format"] = format
## LOGGING
logging_obj.pre_call(
input=None,
@ -159,7 +168,7 @@ def get_ollama_response(
},
)
if acompletion is True:
if optional_params.get("stream", False) == True:
if stream == True:
response = ollama_async_streaming(
url=url,
data=data,
@ -176,10 +185,12 @@ def get_ollama_response(
logging_obj=logging_obj,
)
return response
elif optional_params.get("stream", False) == True:
elif stream == True:
return ollama_completion_stream(url=url, data=data, logging_obj=logging_obj)
response = requests.post(url=f"{url}", json=data, timeout=litellm.request_timeout)
response = requests.post(
url=f"{url}", json={**data, "stream": stream}, timeout=litellm.request_timeout
)
if response.status_code != 200:
raise OllamaError(status_code=response.status_code, message=response.text)
@ -254,7 +265,7 @@ async def ollama_async_streaming(url, data, model_response, encoding, logging_ob
) as response:
if response.status_code != 200:
raise OllamaError(
status_code=response.status_code, message=response.text
status_code=response.status_code, message=await response.aread()
)
streamwrapper = litellm.CustomStreamWrapper(
@ -267,6 +278,7 @@ async def ollama_async_streaming(url, data, model_response, encoding, logging_ob
yield transformed_chunk
except Exception as e:
traceback.print_exc()
raise e
async def ollama_acompletion(url, data, model_response, encoding, logging_obj):

View file

@ -145,8 +145,16 @@ def get_ollama_response(
): # completion(top_k=3) > cohere_config(top_k=3) <- allows for dynamic variables to be passed in
optional_params[k] = v
optional_params["stream"] = optional_params.get("stream", False)
data = {"model": model, "messages": messages, **optional_params}
stream = optional_params.pop("stream", False)
format = optional_params.pop("format", None)
data = {
"model": model,
"messages": messages,
"options": optional_params,
"stream": stream,
}
if format is not None:
data["format"] = format
## LOGGING
logging_obj.pre_call(
input=None,
@ -159,7 +167,7 @@ def get_ollama_response(
},
)
if acompletion is True:
if optional_params.get("stream", False) == True:
if stream == True:
response = ollama_async_streaming(
url=url,
data=data,
@ -176,7 +184,7 @@ def get_ollama_response(
logging_obj=logging_obj,
)
return response
elif optional_params.get("stream", False) == True:
elif stream == True:
return ollama_completion_stream(url=url, data=data, logging_obj=logging_obj)
response = requests.post(
@ -220,8 +228,10 @@ def get_ollama_response(
model_response["choices"][0]["message"] = response_json["message"]
model_response["created"] = int(time.time())
model_response["model"] = "ollama/" + model
prompt_tokens = response_json.get("prompt_eval_count", len(encoding.encode(prompt))) # type: ignore
completion_tokens = response_json["eval_count"]
prompt_tokens = response_json.get("prompt_eval_count", litellm.token_counter(messages=messages)) # type: ignore
completion_tokens = response_json.get(
"eval_count", litellm.token_counter(text=response_json["message"]["content"])
)
model_response["usage"] = litellm.Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
@ -318,10 +328,16 @@ async def ollama_acompletion(url, data, model_response, encoding, logging_obj):
model_response["choices"][0]["message"] = message
else:
model_response["choices"][0]["message"] = response_json["message"]
model_response["created"] = int(time.time())
model_response["model"] = "ollama/" + data["model"]
prompt_tokens = response_json.get("prompt_eval_count", len(encoding.encode(prompt))) # type: ignore
completion_tokens = response_json["eval_count"]
model_response["model"] = "ollama_chat/" + data["model"]
prompt_tokens = response_json.get("prompt_eval_count", litellm.token_counter(messages=data["messages"])) # type: ignore
completion_tokens = response_json.get(
"eval_count",
litellm.token_counter(
text=response_json["message"]["content"], count_response_tokens=True
),
)
model_response["usage"] = litellm.Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,

View file

@ -221,6 +221,8 @@ class OpenAIChatCompletion(BaseLLM):
headers: Optional[dict] = None,
custom_prompt_dict: dict = {},
client=None,
organization: Optional[str] = None,
custom_llm_provider: Optional[str] = None,
):
super().completion()
exception_mapping_worked = False
@ -235,6 +237,14 @@ class OpenAIChatCompletion(BaseLLM):
status_code=422, message=f"Timeout needs to be a float"
)
if custom_llm_provider == "mistral":
# check if message content passed in as list, and not string
messages = prompt_factory(
model=model,
messages=messages,
custom_llm_provider=custom_llm_provider,
)
for _ in range(
2
): # if call fails due to alternating messages, retry with reformatted message
@ -254,6 +264,7 @@ class OpenAIChatCompletion(BaseLLM):
timeout=timeout,
client=client,
max_retries=max_retries,
organization=organization,
)
else:
return self.acompletion(
@ -266,6 +277,7 @@ class OpenAIChatCompletion(BaseLLM):
timeout=timeout,
client=client,
max_retries=max_retries,
organization=organization,
)
elif optional_params.get("stream", False):
return self.streaming(
@ -278,6 +290,7 @@ class OpenAIChatCompletion(BaseLLM):
timeout=timeout,
client=client,
max_retries=max_retries,
organization=organization,
)
else:
if not isinstance(max_retries, int):
@ -291,6 +304,7 @@ class OpenAIChatCompletion(BaseLLM):
http_client=litellm.client_session,
timeout=timeout,
max_retries=max_retries,
organization=organization,
)
else:
openai_client = client
@ -320,12 +334,13 @@ class OpenAIChatCompletion(BaseLLM):
model_response_object=model_response,
)
except Exception as e:
if "Conversation roles must alternate user/assistant" in str(
e
) or "user and assistant roles should be alternating" in str(e):
if (
"Conversation roles must alternate user/assistant" in str(e)
or "user and assistant roles should be alternating" in str(e)
) and messages is not None:
# reformat messages to ensure user/assistant are alternating; if there are 2 consecutive 'user' messages or 2 consecutive 'assistant' messages, add a blank 'user' or 'assistant' message to ensure compatibility
new_messages = []
for i in range(len(messages) - 1):
for i in range(len(messages) - 1): # type: ignore
new_messages.append(messages[i])
if messages[i]["role"] == messages[i + 1]["role"]:
if messages[i]["role"] == "user":
@ -336,7 +351,9 @@ class OpenAIChatCompletion(BaseLLM):
new_messages.append({"role": "user", "content": ""})
new_messages.append(messages[-1])
messages = new_messages
elif "Last message must have role `user`" in str(e):
elif (
"Last message must have role `user`" in str(e)
) and messages is not None:
new_messages = messages
new_messages.append({"role": "user", "content": ""})
messages = new_messages
@ -358,6 +375,7 @@ class OpenAIChatCompletion(BaseLLM):
timeout: float,
api_key: Optional[str] = None,
api_base: Optional[str] = None,
organization: Optional[str] = None,
client=None,
max_retries=None,
logging_obj=None,
@ -372,6 +390,7 @@ class OpenAIChatCompletion(BaseLLM):
http_client=litellm.aclient_session,
timeout=timeout,
max_retries=max_retries,
organization=organization,
)
else:
openai_aclient = client
@ -412,6 +431,7 @@ class OpenAIChatCompletion(BaseLLM):
model: str,
api_key: Optional[str] = None,
api_base: Optional[str] = None,
organization: Optional[str] = None,
client=None,
max_retries=None,
headers=None,
@ -423,6 +443,7 @@ class OpenAIChatCompletion(BaseLLM):
http_client=litellm.client_session,
timeout=timeout,
max_retries=max_retries,
organization=organization,
)
else:
openai_client = client
@ -431,8 +452,8 @@ class OpenAIChatCompletion(BaseLLM):
input=data["messages"],
api_key=api_key,
additional_args={
"headers": headers,
"api_base": api_base,
"headers": {"Authorization": f"Bearer {openai_client.api_key}"},
"api_base": openai_client._base_url._uri_reference,
"acompletion": False,
"complete_input_dict": data,
},
@ -454,6 +475,7 @@ class OpenAIChatCompletion(BaseLLM):
model: str,
api_key: Optional[str] = None,
api_base: Optional[str] = None,
organization: Optional[str] = None,
client=None,
max_retries=None,
headers=None,
@ -467,6 +489,7 @@ class OpenAIChatCompletion(BaseLLM):
http_client=litellm.aclient_session,
timeout=timeout,
max_retries=max_retries,
organization=organization,
)
else:
openai_aclient = client
@ -718,8 +741,22 @@ class OpenAIChatCompletion(BaseLLM):
return convert_to_model_response_object(response_object=response, model_response_object=model_response, response_type="image_generation") # type: ignore
except OpenAIError as e:
exception_mapping_worked = True
## LOGGING
logging_obj.post_call(
input=prompt,
api_key=api_key,
additional_args={"complete_input_dict": data},
original_response=str(e),
)
raise e
except Exception as e:
## LOGGING
logging_obj.post_call(
input=prompt,
api_key=api_key,
additional_args={"complete_input_dict": data},
original_response=str(e),
)
if hasattr(e, "status_code"):
raise OpenAIError(status_code=e.status_code, message=str(e))
else:
@ -734,8 +771,11 @@ class OpenAIChatCompletion(BaseLLM):
messages: Optional[list] = None,
input: Optional[list] = None,
prompt: Optional[str] = None,
organization: Optional[str] = None,
):
client = AsyncOpenAI(api_key=api_key, timeout=timeout)
client = AsyncOpenAI(
api_key=api_key, timeout=timeout, organization=organization
)
if model is None and mode != "image_generation":
raise Exception("model is not set")

View file

@ -99,12 +99,16 @@ def ollama_pt(
def mistral_instruct_pt(messages):
# Following the Mistral example's https://huggingface.co/docs/transformers/main/chat_templating
prompt = custom_prompt(
initial_prompt_value="<s>",
role_dict={
"system": {"pre_message": "[INST]", "post_message": "[/INST]"},
"user": {"pre_message": "[INST]", "post_message": "[/INST]"},
"assistant": {"pre_message": "[INST]", "post_message": "[/INST]"},
"system": {
"pre_message": "[INST] \n",
"post_message": " [/INST]\n",
},
"user": {"pre_message": "[INST] ", "post_message": " [/INST]\n"},
"assistant": {"pre_message": " ", "post_message": " "},
},
final_prompt_value="</s>",
messages=messages,
@ -112,6 +116,28 @@ def mistral_instruct_pt(messages):
return prompt
def mistral_api_pt(messages):
"""
- handles scenario where content is list and not string
- content list is just text, and no images
- if image passed in, then just return as is (user-intended)
Motivation: mistral api doesn't support content as a list
"""
new_messages = []
for m in messages:
texts = ""
if isinstance(m["content"], list):
for c in m["content"]:
if c["type"] == "image_url":
return messages
elif c["type"] == "text" and isinstance(c["text"], str):
texts += c["text"]
new_m = {"role": m["role"], "content": texts}
new_messages.append(new_m)
return new_messages
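Editor's illustrative sketch (not part of this diff) of the transformation mistral_api_pt performs:
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "What is "},
            {"type": "text", "text": "2 + 2?"},
        ],
    }
]
mistral_api_pt(messages)
# -> [{"role": "user", "content": "What is 2 + 2?"}]
# If any content item has type "image_url", the original messages are returned unchanged.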
# Falcon prompt template - from https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py#L110
def falcon_instruct_pt(messages):
prompt = ""
@ -372,6 +398,7 @@ def anthropic_pt(
You can "put words in Claude's mouth" by ending with an assistant message.
See: https://docs.anthropic.com/claude/docs/put-words-in-claudes-mouth
"""
class AnthropicConstants(Enum):
HUMAN_PROMPT = "\n\nHuman: "
AI_PROMPT = "\n\nAssistant: "
@ -394,32 +421,35 @@ def anthropic_pt(
prompt += f"{AnthropicConstants.AI_PROMPT.value}"
return prompt
def _load_image_from_url(image_url):
try:
from PIL import Image
except:
raise Exception("gemini image conversion failed please run `pip install Pillow`")
raise Exception(
"gemini image conversion failed please run `pip install Pillow`"
)
from io import BytesIO
try:
# Send a GET request to the image URL
response = requests.get(image_url)
response.raise_for_status() # Raise an exception for HTTP errors
# Check the response's content type to ensure it is an image
content_type = response.headers.get('content-type')
if not content_type or 'image' not in content_type:
raise ValueError(f"URL does not point to a valid image (content-type: {content_type})")
content_type = response.headers.get("content-type")
if not content_type or "image" not in content_type:
raise ValueError(
f"URL does not point to a valid image (content-type: {content_type})"
)
# Load the image from the response content
return Image.open(BytesIO(response.content))
except requests.RequestException as e:
print(f"Request failed: {e}")
except UnidentifiedImageError:
print("Cannot identify image file (it may not be a supported image format or might be corrupted).")
except ValueError as e:
print(e)
raise Exception(f"Request failed: {e}")
except Exception as e:
raise e
def _gemini_vision_convert_messages(messages: list):
@ -437,10 +467,11 @@ def _gemini_vision_convert_messages(messages: list):
try:
from PIL import Image
except:
raise Exception("gemini image conversion failed please run `pip install Pillow`")
raise Exception(
"gemini image conversion failed please run `pip install Pillow`"
)
try:
# given messages for gpt-4 vision, convert them for gemini
# https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/getting-started/intro_gemini_python.ipynb
prompt = ""
@ -589,7 +620,7 @@ def prompt_factory(
if custom_llm_provider == "ollama":
return ollama_pt(model=model, messages=messages)
elif custom_llm_provider == "anthropic":
if any(_ in model for _ in ["claude-2.1","claude-v2:1"]):
if any(_ in model for _ in ["claude-2.1", "claude-v2:1"]):
return claude_2_1_pt(messages=messages)
else:
return anthropic_pt(messages=messages)
@ -603,6 +634,8 @@ def prompt_factory(
return _gemini_vision_convert_messages(messages=messages)
else:
return gemini_text_image_pt(messages=messages)
elif custom_llm_provider == "mistral":
return mistral_api_pt(messages=messages)
try:
if "meta-llama/llama-2" in model and "chat" in model:
return llama_2_chat_pt(messages=messages)

View file

@ -34,22 +34,35 @@ class TokenIterator:
self.byte_iterator = iter(stream)
self.buffer = io.BytesIO()
self.read_pos = 0
self.end_of_data = False
def __iter__(self):
return self
def __next__(self):
while True:
self.buffer.seek(self.read_pos)
line = self.buffer.readline()
if line and line[-1] == ord("\n"):
self.read_pos += len(line) + 1
full_line = line[:-1].decode("utf-8")
line_data = json.loads(full_line.lstrip("data:").rstrip("/n"))
return line_data["token"]["text"]
chunk = next(self.byte_iterator)
self.buffer.seek(0, io.SEEK_END)
self.buffer.write(chunk["PayloadPart"]["Bytes"])
try:
while True:
self.buffer.seek(self.read_pos)
line = self.buffer.readline()
if line and line[-1] == ord("\n"):
response_obj = {"text": "", "is_finished": False}
self.read_pos += len(line) + 1
full_line = line[:-1].decode("utf-8")
line_data = json.loads(full_line.lstrip("data:").rstrip("/n"))
if line_data.get("generated_text", None) is not None:
self.end_of_data = True
response_obj["is_finished"] = True
response_obj["text"] = line_data["token"]["text"]
return response_obj
chunk = next(self.byte_iterator)
self.buffer.seek(0, io.SEEK_END)
self.buffer.write(chunk["PayloadPart"]["Bytes"])
except StopIteration as e:
if self.end_of_data == True:
raise e # Re-raise StopIteration
else:
self.end_of_data = True
return "data: [DONE]"
class SagemakerConfig:
@ -353,7 +366,7 @@ def embedding(
aws_access_key_id = optional_params.pop("aws_access_key_id", None)
aws_region_name = optional_params.pop("aws_region_name", None)
if aws_access_key_id != None:
if aws_access_key_id is not None:
# uses auth params passed to completion
# aws_access_key_id is not None, assume user is trying to auth using litellm.completion
client = boto3.client(

View file

@ -1,3 +1,7 @@
"""
Deprecated. We now do together ai calls via the openai client.
Reference: https://docs.together.ai/docs/openai-api-compatibility
"""
import os, types
import json
from enum import Enum

View file

@ -3,7 +3,7 @@ import json
from enum import Enum
import requests
import time
from typing import Callable, Optional
from typing import Callable, Optional, Union
from litellm.utils import ModelResponse, Usage, CustomStreamWrapper
import litellm, uuid
import httpx
@ -75,6 +75,41 @@ class VertexAIConfig:
}
import asyncio
class TextStreamer:
"""
Fake streaming iterator for Vertex AI Model Garden calls
"""
def __init__(self, text):
self.text = text.split() # let's assume words as a streaming unit
self.index = 0
def __iter__(self):
return self
def __next__(self):
if self.index < len(self.text):
result = self.text[self.index]
self.index += 1
return result
else:
raise StopIteration
def __aiter__(self):
return self
async def __anext__(self):
if self.index < len(self.text):
result = self.text[self.index]
self.index += 1
return result
else:
raise StopAsyncIteration # once we run out of data to stream, we raise this error
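Editor's illustrative sketch (not part of this diff): TextStreamer splits a completed Model Garden response into whitespace-separated chunks so it can be consumed either synchronously or asynchronously as a fake stream.
streamer = TextStreamer("hello from the model garden")
print(list(streamer))  # ['hello', 'from', 'the', 'model', 'garden']

import asyncio

async def consume():
    return [chunk async for chunk in TextStreamer("hello again")]

print(asyncio.run(consume()))  # ['hello', 'again']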
def _get_image_bytes_from_url(image_url: str) -> bytes:
try:
response = requests.get(image_url)
@ -236,9 +271,17 @@ def completion(
Part,
GenerationConfig,
)
from google.cloud import aiplatform
from google.protobuf import json_format # type: ignore
from google.protobuf.struct_pb2 import Value # type: ignore
from google.cloud.aiplatform_v1beta1.types import content as gapic_content_types
import google.auth
vertexai.init(project=vertex_project, location=vertex_location)
## Load credentials with the correct quota project ref: https://github.com/googleapis/python-aiplatform/issues/2557#issuecomment-1709284744
creds, _ = google.auth.default(quota_project_id=vertex_project)
vertexai.init(
project=vertex_project, location=vertex_location, credentials=creds
)
## Load Config
config = litellm.VertexAIConfig.get_config()
@ -272,6 +315,11 @@ def completion(
request_str = ""
response_obj = None
async_client = None
instances = None
client_options = {
"api_endpoint": f"{vertex_location}-aiplatform.googleapis.com"
}
if (
model in litellm.vertex_language_models
or model in litellm.vertex_vision_models
@ -291,39 +339,51 @@ def completion(
llm_model = CodeGenerationModel.from_pretrained(model)
mode = "text"
request_str += f"llm_model = CodeGenerationModel.from_pretrained({model})\n"
else: # vertex_code_llm_models
elif model in litellm.vertex_code_chat_models: # vertex_code_llm_models
llm_model = CodeChatModel.from_pretrained(model)
mode = "chat"
request_str += f"llm_model = CodeChatModel.from_pretrained({model})\n"
else: # assume vertex model garden
client = aiplatform.gapic.PredictionServiceClient(
client_options=client_options
)
if acompletion == True: # [TODO] expand support to vertex ai chat + text models
instances = [optional_params]
instances[0]["prompt"] = prompt
instances = [
json_format.ParseDict(instance_dict, Value())
for instance_dict in instances
]
llm_model = client.endpoint_path(
project=vertex_project, location=vertex_location, endpoint=model
)
mode = "custom"
request_str += f"llm_model = client.endpoint_path(project={vertex_project}, location={vertex_location}, endpoint={model})\n"
if acompletion == True:
data = {
"llm_model": llm_model,
"mode": mode,
"prompt": prompt,
"logging_obj": logging_obj,
"request_str": request_str,
"model": model,
"model_response": model_response,
"encoding": encoding,
"messages": messages,
"print_verbose": print_verbose,
"client_options": client_options,
"instances": instances,
"vertex_location": vertex_location,
"vertex_project": vertex_project,
**optional_params,
}
if optional_params.get("stream", False) is True:
# async streaming
return async_streaming(
llm_model=llm_model,
mode=mode,
prompt=prompt,
logging_obj=logging_obj,
request_str=request_str,
model=model,
model_response=model_response,
messages=messages,
print_verbose=print_verbose,
**optional_params,
)
return async_completion(
llm_model=llm_model,
mode=mode,
prompt=prompt,
logging_obj=logging_obj,
request_str=request_str,
model=model,
model_response=model_response,
encoding=encoding,
messages=messages,
print_verbose=print_verbose,
**optional_params,
)
return async_streaming(**data)
return async_completion(**data)
if mode == "vision":
print_verbose("\nMaking VertexAI Gemini Pro Vision Call")
@ -468,7 +528,36 @@ def completion(
},
)
completion_response = llm_model.predict(prompt, **optional_params).text
elif mode == "custom":
"""
Vertex AI Model Garden
"""
request_str += (
f"client.predict(endpoint={llm_model}, instances={instances})\n"
)
## LOGGING
logging_obj.pre_call(
input=prompt,
api_key=None,
additional_args={
"complete_input_dict": optional_params,
"request_str": request_str,
},
)
response = client.predict(
endpoint=llm_model,
instances=instances,
).predictions
completion_response = response[0]
if (
isinstance(completion_response, str)
and "\nOutput:\n" in completion_response
):
completion_response = completion_response.split("\nOutput:\n", 1)[1]
if "stream" in optional_params and optional_params["stream"] == True:
response = TextStreamer(completion_response)
return response
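Editor's illustrative sketch (not part of this diff): with the Model Garden branch above, the model string is treated as an endpoint ID, and project/location can be supplied per call via the vertex_ai_project / vertex_ai_location handling added in main.py later in this diff. All identifiers below are placeholders.
import litellm

response = litellm.completion(
    model="vertex_ai/1234567890123456789",   # placeholder Model Garden endpoint ID
    messages=[{"role": "user", "content": "Hello"}],
    vertex_ai_project="my-gcp-project",
    vertex_ai_location="us-central1",
)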
## LOGGING
logging_obj.post_call(
input=prompt, api_key=None, original_response=completion_response
@ -536,6 +625,10 @@ async def async_completion(
encoding=None,
messages=None,
print_verbose=None,
client_options=None,
instances=None,
vertex_project=None,
vertex_location=None,
**optional_params,
):
"""
@ -624,7 +717,43 @@ async def async_completion(
)
response_obj = await llm_model.predict_async(prompt, **optional_params)
completion_response = response_obj.text
elif mode == "custom":
"""
Vertex AI Model Garden
"""
from google.cloud import aiplatform
async_client = aiplatform.gapic.PredictionServiceAsyncClient(
client_options=client_options
)
llm_model = async_client.endpoint_path(
project=vertex_project, location=vertex_location, endpoint=model
)
request_str += (
f"client.predict(endpoint={llm_model}, instances={instances})\n"
)
## LOGGING
logging_obj.pre_call(
input=prompt,
api_key=None,
additional_args={
"complete_input_dict": optional_params,
"request_str": request_str,
},
)
response_obj = await async_client.predict(
endpoint=llm_model,
instances=instances,
)
response = response_obj.predictions
completion_response = response[0]
if (
isinstance(completion_response, str)
and "\nOutput:\n" in completion_response
):
completion_response = completion_response.split("\nOutput:\n", 1)[1]
## LOGGING
logging_obj.post_call(
input=prompt, api_key=None, original_response=completion_response
@ -654,14 +783,12 @@ async def async_completion(
# init prompt tokens
# this block attempts to get usage from response_obj if it exists, if not it uses the litellm token counter
prompt_tokens, completion_tokens, total_tokens = 0, 0, 0
if response_obj is not None:
if hasattr(response_obj, "usage_metadata") and hasattr(
response_obj.usage_metadata, "prompt_token_count"
):
prompt_tokens = response_obj.usage_metadata.prompt_token_count
completion_tokens = (
response_obj.usage_metadata.candidates_token_count
)
if response_obj is not None and (
hasattr(response_obj, "usage_metadata")
and hasattr(response_obj.usage_metadata, "prompt_token_count")
):
prompt_tokens = response_obj.usage_metadata.prompt_token_count
completion_tokens = response_obj.usage_metadata.candidates_token_count
else:
prompt_tokens = len(encoding.encode(prompt))
completion_tokens = len(
@ -690,8 +817,13 @@ async def async_streaming(
model_response: ModelResponse,
logging_obj=None,
request_str=None,
encoding=None,
messages=None,
print_verbose=None,
client_options=None,
instances=None,
vertex_project=None,
vertex_location=None,
**optional_params,
):
"""
@ -760,17 +892,198 @@ async def async_streaming(
},
)
response = llm_model.predict_streaming_async(prompt, **optional_params)
elif mode == "custom":
from google.cloud import aiplatform
async_client = aiplatform.gapic.PredictionServiceAsyncClient(
client_options=client_options
)
llm_model = async_client.endpoint_path(
project=vertex_project, location=vertex_location, endpoint=model
)
request_str += f"client.predict(endpoint={llm_model}, instances={instances})\n"
## LOGGING
logging_obj.pre_call(
input=prompt,
api_key=None,
additional_args={
"complete_input_dict": optional_params,
"request_str": request_str,
},
)
response_obj = await async_client.predict(
endpoint=llm_model,
instances=instances,
)
response = response_obj.predictions
completion_response = response[0]
if (
isinstance(completion_response, str)
and "\nOutput:\n" in completion_response
):
completion_response = completion_response.split("\nOutput:\n", 1)[1]
if "stream" in optional_params and optional_params["stream"] == True:
response = TextStreamer(completion_response)
streamwrapper = CustomStreamWrapper(
completion_stream=response,
model=model,
custom_llm_provider="vertex_ai",
logging_obj=logging_obj,
)
async for transformed_chunk in streamwrapper:
yield transformed_chunk
return streamwrapper
def embedding():
def embedding(
model: str,
input: Union[list, str],
api_key: Optional[str] = None,
logging_obj=None,
model_response=None,
optional_params=None,
encoding=None,
vertex_project=None,
vertex_location=None,
aembedding=False,
):
# logic for parsing in - calling - parsing out model embedding calls
pass
try:
import vertexai
except:
raise VertexAIError(
status_code=400,
message="vertexai import failed please run `pip install google-cloud-aiplatform`",
)
from vertexai.language_models import TextEmbeddingModel
import google.auth
## Load credentials with the correct quota project ref: https://github.com/googleapis/python-aiplatform/issues/2557#issuecomment-1709284744
try:
creds, _ = google.auth.default(quota_project_id=vertex_project)
vertexai.init(
project=vertex_project, location=vertex_location, credentials=creds
)
except Exception as e:
raise VertexAIError(status_code=401, message=str(e))
if isinstance(input, str):
input = [input]
try:
llm_model = TextEmbeddingModel.from_pretrained(model)
except Exception as e:
raise VertexAIError(status_code=422, message=str(e))
if aembedding == True:
return async_embedding(
model=model,
client=llm_model,
input=input,
logging_obj=logging_obj,
model_response=model_response,
optional_params=optional_params,
encoding=encoding,
)
request_str = f"""embeddings = llm_model.get_embeddings({input})"""
## LOGGING PRE-CALL
logging_obj.pre_call(
input=input,
api_key=None,
additional_args={
"complete_input_dict": optional_params,
"request_str": request_str,
},
)
try:
embeddings = llm_model.get_embeddings(input)
except Exception as e:
raise VertexAIError(status_code=500, message=str(e))
## LOGGING POST-CALL
logging_obj.post_call(input=input, api_key=None, original_response=embeddings)
## Populate OpenAI compliant dictionary
embedding_response = []
for idx, embedding in enumerate(embeddings):
embedding_response.append(
{
"object": "embedding",
"index": idx,
"embedding": embedding.values,
}
)
model_response["object"] = "list"
model_response["data"] = embedding_response
model_response["model"] = model
input_tokens = 0
input_str = "".join(input)
input_tokens += len(encoding.encode(input_str))
usage = Usage(
prompt_tokens=input_tokens, completion_tokens=0, total_tokens=input_tokens
)
model_response.usage = usage
return model_response
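Editor's illustrative sketch (not part of this diff): calling the new Vertex AI embedding path through litellm; the model name, project, and location are placeholders.
import litellm

response = litellm.embedding(
    model="vertex_ai/textembedding-gecko",   # placeholder Vertex text embedding model
    input=["hello world"],
    vertex_ai_project="my-gcp-project",
    vertex_ai_location="us-central1",
)
print(response.usage)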
async def async_embedding(
model: str,
input: Union[list, str],
logging_obj=None,
model_response=None,
optional_params=None,
encoding=None,
client=None,
):
"""
Async embedding implementation
"""
request_str = f"""embeddings = llm_model.get_embeddings({input})"""
## LOGGING PRE-CALL
logging_obj.pre_call(
input=input,
api_key=None,
additional_args={
"complete_input_dict": optional_params,
"request_str": request_str,
},
)
try:
embeddings = await client.get_embeddings_async(input)
except Exception as e:
raise VertexAIError(status_code=500, message=str(e))
## LOGGING POST-CALL
logging_obj.post_call(input=input, api_key=None, original_response=embeddings)
## Populate OpenAI compliant dictionary
embedding_response = []
for idx, embedding in enumerate(embeddings):
embedding_response.append(
{
"object": "embedding",
"index": idx,
"embedding": embedding.values,
}
)
model_response["object"] = "list"
model_response["data"] = embedding_response
model_response["model"] = model
input_tokens = 0
input_str = "".join(input)
input_tokens += len(encoding.encode(input_str))
usage = Usage(
prompt_tokens=input_tokens, completion_tokens=0, total_tokens=input_tokens
)
model_response.usage = usage
return model_response

View file

@ -15,7 +15,7 @@ import dotenv, traceback, random, asyncio, time, contextvars
from copy import deepcopy
import httpx
import litellm
from ._logging import verbose_logger
from litellm import ( # type: ignore
client,
exception_type,
@ -31,6 +31,7 @@ from litellm.utils import (
get_llm_provider,
get_api_key,
mock_completion_streaming_obj,
async_mock_completion_streaming_obj,
convert_to_model_response_object,
token_counter,
Usage,
@ -235,6 +236,9 @@ async def acompletion(
"model_list": model_list,
"acompletion": True, # assuming this is a required parameter
}
_, custom_llm_provider, _, _ = get_llm_provider(
model=model, api_base=completion_kwargs.get("base_url", None)
)
try:
# Use a partial function to pass your keyword arguments
func = partial(completion, **completion_kwargs, **kwargs)
@ -246,7 +250,6 @@ async def acompletion(
_, custom_llm_provider, _, _ = get_llm_provider(
model=model, api_base=kwargs.get("api_base", None)
)
if (
custom_llm_provider == "openai"
or custom_llm_provider == "azure"
@ -261,6 +264,7 @@ async def acompletion(
or custom_llm_provider == "ollama"
or custom_llm_provider == "ollama_chat"
or custom_llm_provider == "vertex_ai"
or custom_llm_provider in litellm.openai_compatible_providers
): # currently implemented aiohttp calls for just azure, openai, hf, ollama, vertex ai soon all.
init_response = await loop.run_in_executor(None, func_with_context)
if isinstance(init_response, dict) or isinstance(
@ -274,14 +278,10 @@ async def acompletion(
else:
# Call the synchronous function using run_in_executor
response = await loop.run_in_executor(None, func_with_context) # type: ignore
# if kwargs.get("stream", False): # return an async generator
# return _async_streaming(
# response=response,
# model=model,
# custom_llm_provider=custom_llm_provider,
# args=args,
# )
# else:
if isinstance(response, CustomStreamWrapper):
response.set_logging_event_loop(
loop=loop
) # sets the logging event loop if the user does sync streaming (e.g. on proxy for sagemaker calls)
return response
except Exception as e:
custom_llm_provider = custom_llm_provider or "openai"
@ -308,6 +308,7 @@ def mock_completion(
messages: List,
stream: Optional[bool] = False,
mock_response: str = "This is a mock request",
logging=None,
**kwargs,
):
"""
@ -336,6 +337,15 @@ def mock_completion(
model_response = ModelResponse(stream=stream)
if stream is True:
# don't try to access stream object,
if kwargs.get("acompletion", False) == True:
return CustomStreamWrapper(
completion_stream=async_mock_completion_streaming_obj(
model_response, mock_response=mock_response, model=model
),
model=model,
custom_llm_provider="openai",
logging_obj=logging,
)
response = mock_completion_streaming_obj(
model_response, mock_response=mock_response, model=model
)
@ -455,6 +465,7 @@ def completion(
num_retries = kwargs.get("num_retries", None) ## deprecated
max_retries = kwargs.get("max_retries", None)
context_window_fallback_dict = kwargs.get("context_window_fallback_dict", None)
organization = kwargs.get("organization", None)
### CUSTOM MODEL COST ###
input_cost_per_token = kwargs.get("input_cost_per_token", None)
output_cost_per_token = kwargs.get("output_cost_per_token", None)
@ -590,28 +601,43 @@ def completion(
)
if model_response is not None and hasattr(model_response, "_hidden_params"):
model_response._hidden_params["custom_llm_provider"] = custom_llm_provider
model_response._hidden_params["region_name"] = kwargs.get(
"aws_region_name", None
) # support region-based pricing for bedrock
### REGISTER CUSTOM MODEL PRICING -- IF GIVEN ###
if input_cost_per_token is not None and output_cost_per_token is not None:
print_verbose(f"Registering model={model} in model cost map")
litellm.register_model(
{
f"{custom_llm_provider}/{model}": {
"input_cost_per_token": input_cost_per_token,
"output_cost_per_token": output_cost_per_token,
"litellm_provider": custom_llm_provider,
},
model: {
"input_cost_per_token": input_cost_per_token,
"output_cost_per_token": output_cost_per_token,
"litellm_provider": custom_llm_provider,
}
},
}
)
if (
elif (
input_cost_per_second is not None
): # time based pricing just needs cost in place
output_cost_per_second = output_cost_per_second or 0.0
litellm.register_model(
{
f"{custom_llm_provider}/{model}": {
"input_cost_per_second": input_cost_per_second,
"output_cost_per_second": output_cost_per_second,
"litellm_provider": custom_llm_provider,
},
model: {
"input_cost_per_second": input_cost_per_second,
"output_cost_per_second": output_cost_per_second,
"litellm_provider": custom_llm_provider,
}
},
}
)
### BUILD CUSTOM PROMPT TEMPLATE -- IF GIVEN ###
@ -702,7 +728,12 @@ def completion(
)
if mock_response:
return mock_completion(
model, messages, stream=stream, mock_response=mock_response
model,
messages,
stream=stream,
mock_response=mock_response,
logging=logging,
acompletion=acompletion,
)
if custom_llm_provider == "azure":
# azure configs
@ -777,6 +808,7 @@ def completion(
or custom_llm_provider == "anyscale"
or custom_llm_provider == "mistral"
or custom_llm_provider == "openai"
or custom_llm_provider == "together_ai"
or "ft:gpt-3.5-turbo" in model # finetune gpt-3.5-turbo
): # allow user to make an openai call with a custom base
# note: if a user sets a custom base - we should ensure this works
@ -788,7 +820,8 @@ def completion(
or "https://api.openai.com/v1"
)
openai.organization = (
litellm.organization
organization
or litellm.organization
or get_secret("OPENAI_ORGANIZATION")
or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
)
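Editor's illustrative sketch (not part of this diff): the organization kwarg introduced here takes precedence over litellm.organization and the OPENAI_ORGANIZATION secret; the org ID below is a placeholder.
import litellm

response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hello"}],
    organization="org-XXXXXXXXXXXX",   # placeholder OpenAI organization ID
)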
@ -828,6 +861,7 @@ def completion(
timeout=timeout,
custom_prompt_dict=custom_prompt_dict,
client=client, # pass AsyncOpenAI, OpenAI client
organization=organization,
)
except Exception as e:
## LOGGING - log the original exception returned
@ -1314,6 +1348,9 @@ def completion(
or ("togethercomputer" in model)
or (model in litellm.together_ai_models)
):
"""
Deprecated. We now do together ai calls via the openai client - https://docs.together.ai/docs/openai-api-compatibility
"""
custom_llm_provider = "together_ai"
together_ai_key = (
api_key
@ -1421,9 +1458,15 @@ def completion(
return response
response = model_response
elif custom_llm_provider == "vertex_ai":
vertex_ai_project = litellm.vertex_project or get_secret("VERTEXAI_PROJECT")
vertex_ai_location = litellm.vertex_location or get_secret(
"VERTEXAI_LOCATION"
vertex_ai_project = (
optional_params.pop("vertex_ai_project", None)
or litellm.vertex_project
or get_secret("VERTEXAI_PROJECT")
)
vertex_ai_location = (
optional_params.pop("vertex_ai_location", None)
or litellm.vertex_location
or get_secret("VERTEXAI_LOCATION")
)
model_response = vertex_ai.completion(
@ -1514,11 +1557,6 @@ def completion(
if (
"stream" in optional_params and optional_params["stream"] == True
): ## [BETA]
# sagemaker does not support streaming as of now so we're faking streaming:
# https://discuss.huggingface.co/t/streaming-output-text-when-deploying-on-sagemaker/39611
# "SageMaker is currently not supporting streaming responses."
# fake streaming for sagemaker
print_verbose(f"ENTERS SAGEMAKER CUSTOMSTREAMWRAPPER")
from .llms.sagemaker import TokenIterator
@ -1529,6 +1567,12 @@ def completion(
custom_llm_provider="sagemaker",
logging_obj=logging,
)
## LOGGING
logging.post_call(
input=messages,
api_key=None,
original_response=response,
)
return response
## RESPONSE OBJECT
@ -1547,6 +1591,7 @@ def completion(
logger_fn=logger_fn,
encoding=encoding,
logging_obj=logging,
timeout=timeout,
)
if "stream" in optional_params and optional_params["stream"] == True:
@ -2191,6 +2236,7 @@ async def aembedding(*args, **kwargs):
or custom_llm_provider == "deepinfra"
or custom_llm_provider == "perplexity"
or custom_llm_provider == "ollama"
or custom_llm_provider == "vertex_ai"
): # currently implemented aiohttp calls for just azure and openai, soon all.
# Await normally
init_response = await loop.run_in_executor(None, func_with_context)
@ -2221,6 +2267,7 @@ def embedding(
model,
input=[],
# Optional params
dimensions: Optional[int] = None,
timeout=600, # default to 10 minutes
# set api_base, api_version, api_key
api_base: Optional[str] = None,
@ -2241,6 +2288,7 @@ def embedding(
Parameters:
- model: The embedding model to use.
- input: The input for which embeddings are to be generated.
- dimensions: The number of dimensions the resulting output embeddings should have. Only supported in text-embedding-3 and later models.
- timeout: The timeout value for the API call, default 10 mins
- litellm_call_id: The call ID for litellm logging.
- litellm_logging_obj: The litellm logging object.
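Editor's illustrative sketch (not part of this diff): the new dimensions parameter is only honored by text-embedding-3 and later models, per the docstring above; the model name is an assumption.
import litellm

response = litellm.embedding(
    model="text-embedding-3-small",   # assumed text-embedding-3 family model
    input=["hello world"],
    dimensions=256,                   # requested output dimensionality
)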
@ -2274,6 +2322,7 @@ def embedding(
output_cost_per_second = kwargs.get("output_cost_per_second", None)
openai_params = [
"user",
"dimensions",
"request_timeout",
"api_base",
"api_version",
@ -2342,7 +2391,9 @@ def embedding(
api_key=api_key,
)
optional_params = get_optional_params_embeddings(
model=model,
user=user,
dimensions=dimensions,
encoding_format=encoding_format,
custom_llm_provider=custom_llm_provider,
**non_default_params,
@ -2461,7 +2512,7 @@ def embedding(
client=client,
aembedding=aembedding,
)
elif model in litellm.cohere_embedding_models:
elif custom_llm_provider == "cohere":
cohere_key = (
api_key
or litellm.cohere_key
@ -2503,6 +2554,29 @@ def embedding(
optional_params=optional_params,
model_response=EmbeddingResponse(),
)
elif custom_llm_provider == "vertex_ai":
vertex_ai_project = (
optional_params.pop("vertex_ai_project", None)
or litellm.vertex_project
or get_secret("VERTEXAI_PROJECT")
)
vertex_ai_location = (
optional_params.pop("vertex_ai_location", None)
or litellm.vertex_location
or get_secret("VERTEXAI_LOCATION")
)
response = vertex_ai.embedding(
model=model,
input=input,
encoding=encoding,
logging_obj=logging,
optional_params=optional_params,
model_response=EmbeddingResponse(),
vertex_project=vertex_ai_project,
vertex_location=vertex_ai_location,
aembedding=aembedding,
)
elif custom_llm_provider == "oobabooga":
response = oobabooga.embedding(
model=model,
@ -3064,7 +3138,7 @@ def image_generation(
custom_llm_provider=custom_llm_provider,
**non_default_params,
)
logging = litellm_logging_obj
logging: Logging = litellm_logging_obj
logging.update_environment_variables(
model=model,
user=user,
@ -3128,7 +3202,18 @@ def image_generation(
model_response=model_response,
aimg_generation=aimg_generation,
)
elif custom_llm_provider == "bedrock":
if model is None:
raise Exception("Model needs to be set for bedrock")
model_response = bedrock.image_generation(
model=model,
prompt=prompt,
timeout=timeout,
logging_obj=litellm_logging_obj,
optional_params=optional_params,
model_response=model_response,
aimg_generation=aimg_generation,
)
return model_response
except Exception as e:
## Map to OpenAI Exception
@ -3164,6 +3249,9 @@ async def ahealth_check(
if model is None:
raise Exception("model not set")
if model in litellm.model_cost and mode is None:
mode = litellm.model_cost[model]["mode"]
model, custom_llm_provider, _, _ = get_llm_provider(model=model)
mode = mode or "chat" # default to chat completion calls
@ -3210,6 +3298,7 @@ async def ahealth_check(
or custom_llm_provider == "text-completion-openai"
):
api_key = model_params.get("api_key") or get_secret("OPENAI_API_KEY")
organization = model_params.get("organization")
timeout = (
model_params.get("timeout")
@ -3227,8 +3316,12 @@ async def ahealth_check(
mode=mode,
prompt=prompt,
input=input,
organization=organization,
)
else:
model_params["cache"] = {
"no-cache": True
} # don't used cached responses for making health check calls
if mode == "embedding":
model_params.pop("messages", None)
model_params["input"] = input
@ -3244,6 +3337,10 @@ async def ahealth_check(
response = {} # args like remaining ratelimit etc.
return response
except Exception as e:
if model not in litellm.model_cost and mode is None:
raise Exception(
"Missing `mode`. Set the `mode` for the model - https://docs.litellm.ai/docs/proxy/health#embedding-models"
)
return {"error": str(e)}
@ -3251,6 +3348,7 @@ async def ahealth_check(
## Set verbose to true -> ```litellm.set_verbose = True```
def print_verbose(print_statement):
try:
verbose_logger.debug(print_statement)
if litellm.set_verbose:
print(print_statement) # noqa
except:
@ -3342,6 +3440,16 @@ def stream_chunk_builder(
chunks: list, messages: Optional[list] = None, start_time=None, end_time=None
):
model_response = litellm.ModelResponse()
### SORT CHUNKS BASED ON CREATED ORDER ##
print_verbose("Goes into checking if chunk has hiddden created at param")
if chunks[0]._hidden_params.get("created_at", None):
print_verbose("Chunks have a created at hidden param")
# Sort chunks based on created_at in ascending order
chunks = sorted(
chunks, key=lambda x: x._hidden_params.get("created_at", float("inf"))
)
print_verbose("Chunks sorted")
# set hidden params from chunk to model_response
if model_response is not None and hasattr(model_response, "_hidden_params"):
model_response._hidden_params = chunks[0].get("_hidden_params", {})

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1 @@
self.__BUILD_MANIFEST={__rewrites:{afterFiles:[],beforeFiles:[],fallback:[]},"/_error":["static/chunks/pages/_error-d6107f1aac0c574c.js"],sortedPages:["/_app","/_error"]},self.__BUILD_MANIFEST_CB&&self.__BUILD_MANIFEST_CB();

View file

@ -0,0 +1 @@
self.__SSG_MANIFEST=new Set([]);self.__SSG_MANIFEST_CB&&self.__SSG_MANIFEST_CB()

Some files were not shown because too many files have changed in this diff