add unit tests for vertex pass through

use get_litellm_virtual_key
docs add usage example for js
2024-11-22 16:49:35 -08:00 · 2024-11-22 16:44:35 -08:00 · 2024-11-22 16:40:40 -08:00 · 2024-11-22 16:31:58 -08:00 · 2024-11-22 16:15:37 -08:00 · 2024-11-22 16:07:45 -08:00
208 changed files with 3415 additions and 8542 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@ -625,48 +625,6 @@ jobs:
          paths:
            - llm_translation_coverage.xml
            - llm_translation_coverage
-  pass_through_unit_testing:
-    docker:
-      - image: cimg/python:3.11
-        auth:
-          username: ${DOCKERHUB_USERNAME}
-          password: ${DOCKERHUB_PASSWORD}
-    working_directory: ~/project
-
-    steps:
-      - checkout
-      - run:
-          name: Install Dependencies
-          command: |
-            python -m pip install --upgrade pip
-            python -m pip install -r requirements.txt
-            pip install "pytest==7.3.1"
-            pip install "pytest-retry==1.6.3"
-            pip install "pytest-cov==5.0.0"
-            pip install "pytest-asyncio==0.21.1"
-            pip install "respx==0.21.1"
-      # Run pytest and generate JUnit XML report
-      - run:
-          name: Run tests
-          command: |
-            pwd
-            ls
-            python -m pytest -vv tests/pass_through_unit_tests --cov=litellm --cov-report=xml -x -s -v --junitxml=test-results/junit.xml --durations=5
-          no_output_timeout: 120m
-      - run:
-          name: Rename the coverage files
-          command: |
-            mv coverage.xml pass_through_unit_tests_coverage.xml
-            mv .coverage pass_through_unit_tests_coverage
-        
-      # Store test results
-      - store_test_results:
-          path: test-results
-      - persist_to_workspace:
-          root: .
-          paths:
-            - pass_through_unit_tests_coverage.xml
-            - pass_through_unit_tests_coverage
  image_gen_testing:
    docker:
      - image: cimg/python:3.11
@ -807,12 +765,11 @@ jobs:
            curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
      - run: python -c "from litellm import *" || (echo '🚨 import failed, this means you introduced unprotected imports! 🚨'; exit 1)
      - run: ruff check ./litellm
-      # - run: python ./tests/documentation_tests/test_general_setting_keys.py
+      - run: python ./tests/documentation_tests/test_general_setting_keys.py
      - run: python ./tests/code_coverage_tests/router_code_coverage.py
      - run: python ./tests/code_coverage_tests/test_router_strategy_async.py
      - run: python ./tests/code_coverage_tests/litellm_logging_code_coverage.py
      - run: python ./tests/documentation_tests/test_env_keys.py
-      - run: python ./tests/documentation_tests/test_router_settings.py
      - run: python ./tests/documentation_tests/test_api_docs.py
      - run: python ./tests/code_coverage_tests/ensure_async_clients_test.py
      - run: helm lint ./deploy/charts/litellm-helm
@ -966,7 +923,7 @@ jobs:
          command: |
            pwd
            ls
-            python -m pytest -s -vv tests/*.py -x --junitxml=test-results/junit.xml --durations=5 --ignore=tests/otel_tests --ignore=tests/pass_through_tests --ignore=tests/proxy_admin_ui_tests --ignore=tests/load_tests --ignore=tests/llm_translation --ignore=tests/image_gen_tests --ignore=tests/pass_through_unit_tests
+            python -m pytest -s -vv tests/*.py -x --junitxml=test-results/junit.xml --durations=5 --ignore=tests/otel_tests --ignore=tests/pass_through_tests --ignore=tests/proxy_admin_ui_tests --ignore=tests/load_tests --ignore=tests/llm_translation --ignore=tests/image_gen_tests
          no_output_timeout: 120m

      # Store test results
@ -1180,7 +1137,15 @@ jobs:
            pip install "PyGithub==1.59.1"
            pip install "google-cloud-aiplatform==1.59.0"
            pip install anthropic
+            python -m pip install -r requirements.txt
      # Run pytest and generate JUnit XML report
+      - run:
+          name: Run tests
+          command: |
+            pwd
+            ls
+            python -m pytest -vv tests/pass_through_unit_tests --cov=litellm --cov-report=xml -x -s -v --junitxml=test-results/junit.xml --durations=5
+          no_output_timeout: 120m
      - run:
          name: Build Docker image
          command: docker build -t my-app:latest -f ./docker/Dockerfile.database .
@ -1192,7 +1157,6 @@ jobs:
              -e DATABASE_URL=$PROXY_DATABASE_URL \
              -e LITELLM_MASTER_KEY="sk-1234" \
              -e OPENAI_API_KEY=$OPENAI_API_KEY \
-              -e GEMINI_API_KEY=$GEMINI_API_KEY \
              -e ANTHROPIC_API_KEY=$ANTHROPIC_API_KEY \
              -e LITELLM_LICENSE=$LITELLM_LICENSE \
              --name my-app \
@ -1230,13 +1194,12 @@ jobs:
          name: Install Node.js dependencies
          command: |
            npm install @google-cloud/vertexai
-            npm install @google/generative-ai
            npm install --save-dev jest

      - run:
-          name: Run Vertex AI, Google AI Studio Node.js tests
+          name: Run Vertex AI tests
          command: |
-            npx jest tests/pass_through_tests --verbose
+            npx jest tests/pass_through_tests/test_vertex.test.js --verbose
          no_output_timeout: 30m
      - run:
          name: Run tests
@ -1270,7 +1233,7 @@ jobs:
            python -m venv venv
            . venv/bin/activate
            pip install coverage
-            coverage combine llm_translation_coverage logging_coverage litellm_router_coverage local_testing_coverage litellm_assistants_api_coverage auth_ui_unit_tests_coverage langfuse_coverage caching_coverage litellm_proxy_unit_tests_coverage image_gen_coverage pass_through_unit_tests_coverage
+            coverage combine llm_translation_coverage logging_coverage litellm_router_coverage local_testing_coverage litellm_assistants_api_coverage auth_ui_unit_tests_coverage langfuse_coverage caching_coverage litellm_proxy_unit_tests_coverage image_gen_coverage
            coverage xml
      - codecov/upload:
          file: ./coverage.xml
@ -1376,7 +1339,6 @@ jobs:
          name: Install Dependencies
          command: |
            npm install -D @playwright/test
-            npm install @google-cloud/vertexai
            pip install "pytest==7.3.1"
            pip install "pytest-retry==1.6.3"
            pip install "pytest-asyncio==0.21.1"
@ -1408,7 +1370,7 @@ jobs:
          command: |
            docker run -d \
              -p 4000:4000 \
-              -e DATABASE_URL=$PROXY_DATABASE_URL_2 \
+              -e DATABASE_URL=$PROXY_DATABASE_URL \
              -e LITELLM_MASTER_KEY="sk-1234" \
              -e OPENAI_API_KEY=$OPENAI_API_KEY \
              -e UI_USERNAME="admin" \
@ -1438,7 +1400,7 @@ jobs:
      - run:
          name: Run Playwright Tests
          command: |
-            npx playwright test e2e_ui_tests/ --reporter=html --output=test-results
+            npx playwright test --reporter=html --output=test-results
          no_output_timeout: 120m
      - store_test_results:
          path: test-results
@ -1560,12 +1522,6 @@ workflows:
              only:
                - main
                - /litellm_.*/
-      - pass_through_unit_testing:
-          filters:
-            branches:
-              only:
-                - main
-                - /litellm_.*/
      - image_gen_testing:
          filters:
            branches:
@ -1581,7 +1537,6 @@ workflows:
      - upload-coverage:
          requires:
            - llm_translation_testing
-            - pass_through_unit_testing
            - image_gen_testing
            - logging_testing
            - litellm_router_testing
@ -1622,7 +1577,6 @@ workflows:
            - load_testing
            - test_bad_database_url
            - llm_translation_testing
-            - pass_through_unit_testing
            - image_gen_testing
            - logging_testing
            - litellm_router_testing
--- a/docs/my-website/docs/completion/input.md
+++ b/docs/my-website/docs/completion/input.md
@ -41,7 +41,7 @@ Use `litellm.get_supported_openai_params()` for an updated list of params for ea

 | Provider | temperature | max_completion_tokens | max_tokens | top_p | stream | stream_options | stop | n | presence_penalty | frequency_penalty | functions | function_call | logit_bias | user | response_format | seed | tools | tool_choice | logprobs | top_logprobs | extra_headers |
 |---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
-|Anthropic| ✅ | ✅ | ✅ |✅ | ✅ | ✅ | ✅ |  |  |   |  |  |  |✅ | ✅ | | ✅ | ✅ |  |  | ✅ |
+|Anthropic| ✅ | ✅ | ✅ |✅ | ✅ | ✅ | ✅ |  |  |   |  |  |  |✅ | ✅ | ✅ | ✅ | ✅ |  |  | ✅ |
 |OpenAI| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |✅ | ✅ | ✅ | ✅ |✅ | ✅ | ✅ | ✅ | ✅ |
 |Azure OpenAI| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |✅ | ✅ | ✅ | ✅ |✅ | ✅ |  |  | ✅ |
 |Replicate | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | |  |   |  |   |
--- a/docs/my-website/docs/guides/finetuned_models.md
+++ b/docs/my-website/docs/guides/finetuned_models.md
@ -1,74 +0,0 @@
-import Tabs from '@theme/Tabs';
-import TabItem from '@theme/TabItem';
-
-
-# Calling Finetuned Models
-
-## OpenAI
-
-
-| Model Name                | Function Call                                                          |
-|---------------------------|-----------------------------------------------------------------|
-| fine tuned `gpt-4-0613`    | `response = completion(model="ft:gpt-4-0613", messages=messages)`     |
-| fine tuned `gpt-4o-2024-05-13` | `response = completion(model="ft:gpt-4o-2024-05-13", messages=messages)` |
-| fine tuned `gpt-3.5-turbo-0125` | `response = completion(model="ft:gpt-3.5-turbo-0125", messages=messages)` |
-| fine tuned `gpt-3.5-turbo-1106` | `response = completion(model="ft:gpt-3.5-turbo-1106", messages=messages)` |
-| fine tuned `gpt-3.5-turbo-0613` | `response = completion(model="ft:gpt-3.5-turbo-0613", messages=messages)` |
-
-
-## Vertex AI
-
-Fine tuned models on vertex have a numerical model/endpoint id. 
-
-<Tabs>
-<TabItem value="sdk" label="SDK">
-
-```python
-from litellm import completion
-import os
-
-## set ENV variables
-os.environ["VERTEXAI_PROJECT"] = "hardy-device-38811"
-os.environ["VERTEXAI_LOCATION"] = "us-central1"
-
-response = completion(
-  model="vertex_ai/<your-finetuned-model>",  # e.g. vertex_ai/4965075652664360960
-  messages=[{ "content": "Hello, how are you?","role": "user"}],
-  base_model="vertex_ai/gemini-1.5-pro" # the base model - used for routing
-)
-```
-
-</TabItem>
-<TabItem value="proxy" label="PROXY">
-
-1. Add Vertex Credentials to your env 
-
-```bash
-!gcloud auth application-default login
-```
-
-2. Setup config.yaml 
-
-```yaml
- model_name: finetuned-gemini
-  litellm_params:
-    model: vertex_ai/<ENDPOINT_ID>
-    vertex_project: <PROJECT_ID>
-    vertex_location: <LOCATION>
-  model_info:
-    base_model: vertex_ai/gemini-1.5-pro # IMPORTANT
-```
-
-3. Test it! 
-
-```bash
-curl --location 'https://0.0.0.0:4000/v1/chat/completions' \
--header 'Content-Type: application/json' \
--header 'Authorization: <LITELLM_KEY>' \
--data '{"model": "finetuned-gemini" ,"messages":[{"role": "user", "content":[{"type": "text", "text": "hi"}]}]}'
-```
-
-</TabItem>
-</Tabs>
-
-
--- a/docs/my-website/docs/moderation.md
+++ b/docs/my-website/docs/moderation.md
@ -1,135 +0,0 @@
-import Tabs from '@theme/Tabs';
-import TabItem from '@theme/TabItem';
-
-# Moderation
-
-
-### Usage
-<Tabs>
-<TabItem value="python" label="LiteLLM Python SDK">
-
-```python
-from litellm import moderation
-
-response = moderation(
-    input="hello from litellm",
-    model="text-moderation-stable"
-)
-```
-
-</TabItem>
-<TabItem value="proxy" label="LiteLLM Proxy Server">
-
-For `/moderations` endpoint, there is **no need to specify `model` in the request or on the litellm config.yaml**
-
-Start litellm proxy server 
-
-```
-litellm
-```
-
-
-<Tabs>
-<TabItem value="python" label="OpenAI Python SDK">
-
-```python
-from openai import OpenAI
-
-# set base_url to your proxy server
-# set api_key to send to proxy server
-client = OpenAI(api_key="<proxy-api-key>", base_url="http://0.0.0.0:4000")
-
-response = client.moderations.create(
-    input="hello from litellm",
-    model="text-moderation-stable" # optional, defaults to `omni-moderation-latest`
-)
-
-print(response)
-```
-</TabItem>
-
-<TabItem value="curl" label="Curl Request">
-
-```shell
-curl --location 'http://0.0.0.0:4000/moderations' \
-    --header 'Content-Type: application/json' \
-    --header 'Authorization: Bearer sk-1234' \
-    --data '{"input": "Sample text goes here", "model": "text-moderation-stable"}'
-```
-</TabItem>
-</Tabs>
-
-</TabItem>
-</Tabs>
-
-## Input Params
-LiteLLM accepts and translates the [OpenAI Moderation params](https://platform.openai.com/docs/api-reference/moderations) across all supported providers.
-
-### Required Fields
-
- `input`: *string or array* - Input (or inputs) to classify. Can be a single string, an array of strings, or an array of multi-modal input objects similar to other models.
-  - If string: A string of text to classify for moderation
-  - If array of strings: An array of strings to classify for moderation
-  - If array of objects: An array of multi-modal inputs to the moderation model, where each object can be:
-    - An object describing an image to classify with:
-      - `type`: *string, required* - Always `image_url`
-      - `image_url`: *object, required* - Contains either an image URL or a data URL for a base64 encoded image
-    - An object describing text to classify with:
-      - `type`: *string, required* - Always `text`
-      - `text`: *string, required* - A string of text to classify
-
-### Optional Fields
-
- `model`: *string (optional)* - The moderation model to use. Defaults to `omni-moderation-latest`.
-
-## Output Format
-Here's the exact json output and type you can expect from all moderation calls:
-
-[**LiteLLM follows OpenAI's output format**](https://platform.openai.com/docs/api-reference/moderations/object)
-
-
-```python
-{
-  "id": "modr-AB8CjOTu2jiq12hp1AQPfeqFWaORR",
-  "model": "text-moderation-007",
-  "results": [
-    {
-      "flagged": true,
-      "categories": {
-        "sexual": false,
-        "hate": false,
-        "harassment": true,
-        "self-harm": false,
-        "sexual/minors": false,
-        "hate/threatening": false,
-        "violence/graphic": false,
-        "self-harm/intent": false,
-        "self-harm/instructions": false,
-        "harassment/threatening": true,
-        "violence": true
-      },
-      "category_scores": {
-        "sexual": 0.000011726012417057063,
-        "hate": 0.22706663608551025,
-        "harassment": 0.5215635299682617,
-        "self-harm": 2.227119921371923e-6,
-        "sexual/minors": 7.107352217872176e-8,
-        "hate/threatening": 0.023547329008579254,
-        "violence/graphic": 0.00003391829886822961,
-        "self-harm/intent": 1.646940972932498e-6,
-        "self-harm/instructions": 1.1198755256458526e-9,
-        "harassment/threatening": 0.5694745779037476,
-        "violence": 0.9971134662628174
-      }
-    }
-  ]
-}
-
-```
-
-
-## **Supported Providers**
-
-| Provider    |
-|-------------|
-| OpenAI      |  
--- a/docs/my-website/docs/observability/argilla.md
+++ b/docs/my-website/docs/observability/argilla.md
@ -4,63 +4,24 @@ import TabItem from '@theme/TabItem';

 # Argilla 

-Argilla is a collaborative annotation tool for AI engineers and domain experts who need to build high-quality datasets for their projects.
+Argilla is a tool for annotating datasets. 


-## Getting Started

-To log the data to Argilla, first you need to deploy the Argilla server. If you have not deployed the Argilla server, please follow the instructions [here](https://docs.argilla.io/latest/getting_started/quickstart/).
-
-Next, you will need to configure and create the Argilla dataset.
-
-```python
-import argilla as rg
-
-client = rg.Argilla(api_url="<api_url>", api_key="<api_key>")
-
-settings = rg.Settings(
-    guidelines="These are some guidelines.",
-    fields=[
-        rg.ChatField(
-            name="user_input",
-        ),
-        rg.TextField(
-            name="llm_output",
-        ),
-    ],
-    questions=[
-        rg.RatingQuestion(
-            name="rating",
-            values=[1, 2, 3, 4, 5, 6, 7],
-        ),
-    ],
-)
-
-dataset = rg.Dataset(
-    name="my_first_dataset",
-    settings=settings,
-)
-
-dataset.create()
-```
-
-For further configuration, please refer to the [Argilla documentation](https://docs.argilla.io/latest/how_to_guides/dataset/).
-
-
-## Usage
+## Usage 

 <Tabs>
 <Tab value="sdk" label="SDK">

 ```python
-import os
-import litellm
 from litellm import completion
+import litellm
+import os 

 # add env vars
 os.environ["ARGILLA_API_KEY"]="argilla.apikey"
 os.environ["ARGILLA_BASE_URL"]="http://localhost:6900"
-os.environ["ARGILLA_DATASET_NAME"]="my_first_dataset"   
+os.environ["ARGILLA_DATASET_NAME"]="my_second_dataset"   
 os.environ["OPENAI_API_KEY"]="sk-proj-..."

 litellm.callbacks = ["argilla"]
--- a/docs/my-website/docs/pass_through/google_ai_studio.md
+++ b/docs/my-website/docs/pass_through/google_ai_studio.md
@ -1,21 +1,12 @@
-import Image from '@theme/IdealImage';
-import Tabs from '@theme/Tabs';
-import TabItem from '@theme/TabItem';
-
-
 # Google AI Studio SDK

 Pass-through endpoints for Google AI Studio - call provider-specific endpoint, in native format (no translation).

-Just replace `https://generativelanguage.googleapis.com` with `LITELLM_PROXY_BASE_URL/gemini`
+Just replace `https://generativelanguage.googleapis.com` with `LITELLM_PROXY_BASE_URL/gemini` 🚀

 #### **Example Usage**
-
-<Tabs>
-<TabItem value="curl" label="curl">
-
 ```bash
-curl 'http://0.0.0.0:4000/gemini/v1beta/models/gemini-1.5-flash:countTokens?key=sk-anything' \
+curl 'http://0.0.0.0:4000/gemini/v1beta/models/gemini-1.5-flash:countTokens?key=sk-anything' \
 -H 'Content-Type: application/json' \
 -d '{
    "contents": [{
@ -26,53 +17,6 @@ curl 'http://0.0.0.0:4000/gemini/v1beta/models/gemini-1.5-flash:countTokens?key=
 }'
 ```

-</TabItem>
-<TabItem value="js" label="Google AI Node.js SDK">
-
-```javascript
-const { GoogleGenerativeAI } = require("@google/generative-ai");
-
-const modelParams = {
-    model: 'gemini-pro',
-};
-  
-const requestOptions = {
-    baseUrl: 'http://localhost:4000/gemini', // http://<proxy-base-url>/gemini
-};
-  
-const genAI = new GoogleGenerativeAI("sk-1234"); // litellm proxy API key
-const model = genAI.getGenerativeModel(modelParams, requestOptions);
-
-async function main() {
-    try {
-        const result = await model.generateContent("Explain how AI works");
-        console.log(result.response.text());
-    } catch (error) {
-        console.error('Error:', error);
-    }
-}
-
-// For streaming responses
-async function main_streaming() {
-    try {
-        const streamingResult = await model.generateContentStream("Explain how AI works");
-        for await (const chunk of streamingResult.stream) {
-            console.log('Stream chunk:', JSON.stringify(chunk));
-        }
-        const aggregatedResponse = await streamingResult.response;
-        console.log('Aggregated response:', JSON.stringify(aggregatedResponse));
-    } catch (error) {
-        console.error('Error:', error);
-    }
-}
-
-main();
-// main_streaming();
-```
-
-</TabItem>
-</Tabs>
-
 Supports **ALL** Google AI Studio Endpoints (including streaming).

 [**See All Google AI Studio Endpoints**](https://ai.google.dev/api)
@ -222,14 +166,14 @@ curl -X POST "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5
 ```


-## Advanced 
+## Advanced - Use with Virtual Keys 

 Pre-requisites
 - [Setup proxy with DB](../proxy/virtual_keys.md#setup)

 Use this to avoid giving developers the raw Google AI Studio key while still letting them use Google AI Studio endpoints.

-### Use with Virtual Keys
+### Usage

 1. Setup environment

@ -276,66 +220,4 @@ http://0.0.0.0:4000/gemini/v1beta/models/gemini-1.5-flash:countTokens?key=sk-123
          }]
        }]
 }'
-```
-
-
-### Send `tags` in request headers
-
-Use this if you want `tags` to be tracked in the LiteLLM DB and on logging callbacks.
-
-Pass tags in request headers as a comma separated list. In the example below the following tags will be tracked
-
-```
-tags: ["gemini-js-sdk", "pass-through-endpoint"]
-```
-
-<Tabs>
-<TabItem value="curl" label="curl">
-
-```bash
-curl 'http://0.0.0.0:4000/gemini/v1beta/models/gemini-1.5-flash:generateContent?key=sk-anything' \
-H 'Content-Type: application/json' \
-H 'tags: gemini-js-sdk,pass-through-endpoint' \
-d '{
-    "contents": [{
-        "parts":[{
-          "text": "The quick brown fox jumps over the lazy dog."
-          }]
-        }]
-}'
-```
-
-</TabItem>
-<TabItem value="js" label="Google AI Node.js SDK">
-
-```javascript
-const { GoogleGenerativeAI } = require("@google/generative-ai");
-
-const modelParams = {
-    model: 'gemini-pro',
-};
-  
-const requestOptions = {
-    baseUrl: 'http://localhost:4000/gemini', // http://<proxy-base-url>/gemini
-    customHeaders: {
-        "tags": "gemini-js-sdk,pass-through-endpoint"
-    }
-};
-  
-const genAI = new GoogleGenerativeAI("sk-1234");
-const model = genAI.getGenerativeModel(modelParams, requestOptions);
-
-async function main() {
-    try {
-        const result = await model.generateContent("Explain how AI works");
-        console.log(result.response.text());
-    } catch (error) {
-        console.error('Error:', error);
-    }
-}
-
-main();
-```
-
-</TabItem>
-</Tabs>
+```
--- a/docs/my-website/docs/pass_through/vertex_ai.md
+++ b/docs/my-website/docs/pass_through/vertex_ai.md
--- a/docs/my-website/docs/proxy/config_management.md
+++ b/docs/my-website/docs/proxy/config_management.md
@ -1,59 +0,0 @@
-# File Management
-
-## `include` external YAML files in a config.yaml 
-
-You can use `include` to include external YAML files in a config.yaml. 
-
-**Quick Start Usage:**
-
-To include a config file, use `include` with either a single file or a list of files. 
-
-Contents of `parent_config.yaml`:
-```yaml
-include:
-  - model_config.yaml # 👈 Key change, will include the contents of model_config.yaml
-
-litellm_settings:
-  callbacks: ["prometheus"] 
-```
-
-
-Contents of `model_config.yaml`:
-```yaml
-model_list:
-  - model_name: gpt-4o
-    litellm_params:
-      model: openai/gpt-4o
-      api_base: https://exampleopenaiendpoint-production.up.railway.app/
-  - model_name: fake-anthropic-endpoint
-    litellm_params:
-      model: anthropic/fake
-      api_base: https://exampleanthropicendpoint-production.up.railway.app/
-
-```
-
-Start proxy server 
-
-This will start the proxy server with config `parent_config.yaml`. Since the `include` directive is used, the server will also include the contents of `model_config.yaml`.
-```
-litellm --config parent_config.yaml --detailed_debug
-```
-
-
-
-
-
-## Examples using `include`
-
-Include a single file:
-```yaml
-include:
-  - model_config.yaml
-```
-
-Include multiple files:
-```yaml
-include:
-  - model_config.yaml
-  - another_config.yaml
-```
--- a/docs/my-website/docs/proxy/config_settings.md
+++ b/docs/my-website/docs/proxy/config_settings.md
@ -1,507 +0,0 @@
-# All settings
-
-
-```yaml
-environment_variables: {}
-
-model_list:
-  - model_name: string
-    litellm_params: {}
-    model_info:
-      id: string
-      mode: embedding
-      input_cost_per_token: 0
-      output_cost_per_token: 0
-      max_tokens: 2048
-      base_model: gpt-4-1106-preview
-      additionalProp1: {}
-
-litellm_settings:
-  # Logging/Callback settings
-  success_callback: ["langfuse"]  # list of success callbacks
-  failure_callback: ["sentry"]  # list of failure callbacks
-  callbacks: ["otel"]  # list of callbacks - runs on success and failure
-  service_callbacks: ["datadog", "prometheus"]  # logs redis, postgres failures on datadog, prometheus
-  turn_off_message_logging: boolean  # prevent the messages and responses from being logged to your callbacks, but request metadata will still be logged.
-  redact_user_api_key_info: boolean  # Redact information about the user api key (hashed token, user_id, team id, etc.), from logs. Currently supported for Langfuse, OpenTelemetry, Logfire, ArizeAI logging.
-  langfuse_default_tags: ["cache_hit", "cache_key", "proxy_base_url", "user_api_key_alias", "user_api_key_user_id", "user_api_key_user_email", "user_api_key_team_alias", "semantic-similarity"] # default tags for Langfuse Logging
-  
-  # Networking settings
-  request_timeout: 10 # (int) LLM request timeout in seconds. Raise Timeout error if call takes longer than 10s. Sets litellm.request_timeout 
-  force_ipv4: boolean # If true, litellm will force ipv4 for all LLM requests. Some users have seen httpx ConnectionError when using ipv6 + Anthropic API
-  
-  set_verbose: boolean # sets litellm.set_verbose=True to view verbose debug logs. DO NOT LEAVE THIS ON IN PRODUCTION
-  json_logs: boolean # if true, logs will be in json format
-
-  # Fallbacks, reliability
-  default_fallbacks: ["claude-opus"] # set default_fallbacks, in case a specific model group is misconfigured / bad.
-  content_policy_fallbacks: [{"gpt-3.5-turbo-small": ["claude-opus"]}] # fallbacks for ContentPolicyErrors
-  context_window_fallbacks: [{"gpt-3.5-turbo-small": ["gpt-3.5-turbo-large", "claude-opus"]}] # fallbacks for ContextWindowExceededErrors
-
-
-
-  # Caching settings
-  cache: true 
-  cache_params:        # set cache params for redis
-    type: redis        # type of cache to initialize
-
-    # Optional - Redis Settings
-    host: "localhost"  # The host address for the Redis cache. Required if type is "redis".
-    port: 6379  # The port number for the Redis cache. Required if type is "redis".
-    password: "your_password"  # The password for the Redis cache. Required if type is "redis".
-    namespace: "litellm.caching.caching" # namespace for redis cache
-  
-    # Optional - Redis Cluster Settings
-    redis_startup_nodes: [{"host": "127.0.0.1", "port": "7001"}] 
-
-    # Optional - Redis Sentinel Settings
-    service_name: "mymaster"
-    sentinel_nodes: [["localhost", 26379]]
-
-    # Optional - Qdrant Semantic Cache Settings
-    qdrant_semantic_cache_embedding_model: openai-embedding # the model should be defined on the model_list
-    qdrant_collection_name: test_collection
-    qdrant_quantization_config: binary
-    similarity_threshold: 0.8   # similarity threshold for semantic cache
-
-    # Optional - S3 Cache Settings
-    s3_bucket_name: cache-bucket-litellm   # AWS Bucket Name for S3
-    s3_region_name: us-west-2              # AWS Region Name for S3
-    s3_aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID  # use os.environ/<variable name> to pass environment variables. This is AWS Access Key ID for S3
-    s3_aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY  # AWS Secret Access Key for S3
-    s3_endpoint_url: https://s3.amazonaws.com  # [OPTIONAL] S3 endpoint URL, if you want to use Backblaze/cloudflare s3 bucket
-
-    # Common Cache settings
-    # Optional - Supported call types for caching
-    supported_call_types: ["acompletion", "atext_completion", "aembedding", "atranscription"]
-                          # /chat/completions, /completions, /embeddings, /audio/transcriptions
-    mode: default_off # if default_off, you need to opt in to caching on a per call basis
-    ttl: 600 # ttl for caching
-
-
-callback_settings:
-  otel:
-    message_logging: boolean  # OTEL logging callback specific settings
-
-general_settings:
-  completion_model: string
-  disable_spend_logs: boolean  # turn off writing each transaction to the db
-  disable_master_key_return: boolean  # turn off returning master key on UI (checked on '/user/info' endpoint)
-  disable_retry_on_max_parallel_request_limit_error: boolean  # turn off retries when max parallel request limit is reached
-  disable_reset_budget: boolean  # turn off reset budget scheduled task
-  disable_adding_master_key_hash_to_db: boolean  # turn off storing master key hash in db, for spend tracking
-  enable_jwt_auth: boolean  # allow proxy admin to auth in via jwt tokens with 'litellm_proxy_admin' in claims
-  enforce_user_param: boolean  # requires all openai endpoint requests to have a 'user' param
-  allowed_routes: ["route1", "route2"]  # list of allowed proxy API routes - a user can access. (currently JWT-Auth only)
-  key_management_system: google_kms  # either google_kms or azure_kms
-  master_key: string
-
-  # Database Settings
-  database_url: string
-  database_connection_pool_limit: 0  # default 100
-  database_connection_timeout: 0  # default 60s
-  allow_requests_on_db_unavailable: boolean  # if true, requests are still allowed when the proxy cannot connect to the DB to verify the Virtual Key 
-
-  custom_auth: string
-  max_parallel_requests: 0  # the max parallel requests allowed per deployment 
-  global_max_parallel_requests: 0  # the max parallel requests allowed on the proxy all up 
-  infer_model_from_keys: true
-  background_health_checks: true
-  health_check_interval: 300
-  alerting: ["slack", "email"]
-  alerting_threshold: 0
-  use_client_credentials_pass_through_routes: boolean  # use client credentials for all pass through routes like "/vertex-ai", /bedrock/. When this is True Virtual Key auth will not be applied on these endpoints
-```
-
-### litellm_settings - Reference
-
-| Name | Type | Description |
-|------|------|-------------|
-| success_callback | array of strings | List of success callbacks. [Doc Proxy logging callbacks](logging), [Doc Metrics](prometheus) |
-| failure_callback | array of strings | List of failure callbacks [Doc Proxy logging callbacks](logging), [Doc Metrics](prometheus) |
-| callbacks | array of strings | List of callbacks - runs on success and failure [Doc Proxy logging callbacks](logging), [Doc Metrics](prometheus) |
-| service_callbacks | array of strings | System health monitoring - Logs redis, postgres failures on specified services (e.g. datadog, prometheus) [Doc Metrics](prometheus) |
-| turn_off_message_logging | boolean | If true, prevents messages and responses from being logged to callbacks, but request metadata will still be logged [Proxy Logging](logging) |
-| modify_params | boolean | If true, allows modifying the parameters of the request before it is sent to the LLM provider |
-| enable_preview_features | boolean | If true, enables preview features - e.g. Azure O1 Models with streaming support.|
-| redact_user_api_key_info | boolean | If true, redacts information about the user api key from logs [Proxy Logging](logging#redacting-userapikeyinfo) |
-| langfuse_default_tags | array of strings | Default tags for Langfuse Logging. Use this if you want to control which LiteLLM-specific fields are logged as tags by the LiteLLM proxy. By default LiteLLM Proxy logs no LiteLLM-specific fields as tags. [Further docs](./logging#litellm-specific-tags-on-langfuse---cache_hit-cache_key) |
-| set_verbose | boolean | If true, sets litellm.set_verbose=True to view verbose debug logs. DO NOT LEAVE THIS ON IN PRODUCTION |
-| json_logs | boolean | If true, logs will be in json format. If you need to store the logs as JSON, just set the `litellm.json_logs = True`. We currently just log the raw POST request from litellm as a JSON [Further docs](./debugging) |
-| default_fallbacks | array of strings | List of fallback models to use if a specific model group is misconfigured / bad. [Further docs](./reliability#default-fallbacks) |
-| request_timeout | integer | The timeout for requests in seconds. If not set, the default value is `6000 seconds`. [For reference OpenAI Python SDK defaults to `600 seconds`.](https://github.com/openai/openai-python/blob/main/src/openai/_constants.py) |
-| force_ipv4 | boolean | If true, litellm will force ipv4 for all LLM requests. Some users have seen httpx ConnectionError when using ipv6 + Anthropic API |
-| content_policy_fallbacks | array of objects | Fallbacks to use when a ContentPolicyViolationError is encountered. [Further docs](./reliability#content-policy-fallbacks) |
-| context_window_fallbacks | array of objects | Fallbacks to use when a ContextWindowExceededError is encountered. [Further docs](./reliability#context-window-fallbacks) |
-| cache | boolean | If true, enables caching. [Further docs](./caching) |
-| cache_params | object | Parameters for the cache. [Further docs](./caching) |
-| cache_params.type | string | The type of cache to initialize. Can be one of ["local", "redis", "redis-semantic", "s3", "disk", "qdrant-semantic"]. Defaults to "redis". [Further docs](./caching) |
-| cache_params.host | string | The host address for the Redis cache. Required if type is "redis". |
-| cache_params.port | integer | The port number for the Redis cache. Required if type is "redis". |
-| cache_params.password | string | The password for the Redis cache. Required if type is "redis". |
-| cache_params.namespace | string | The namespace for the Redis cache. |
-| cache_params.redis_startup_nodes | array of objects | Redis Cluster Settings. [Further docs](./caching) |
-| cache_params.service_name | string | Redis Sentinel Settings. [Further docs](./caching) |
-| cache_params.sentinel_nodes | array of arrays | Redis Sentinel Settings. [Further docs](./caching) |
-| cache_params.ttl | integer | The time (in seconds) to store entries in cache. |
-| cache_params.qdrant_semantic_cache_embedding_model | string | The embedding model to use for qdrant semantic cache. |
-| cache_params.qdrant_collection_name | string | The name of the collection to use for qdrant semantic cache. |
-| cache_params.qdrant_quantization_config | string | The quantization configuration for the qdrant semantic cache. |
-| cache_params.similarity_threshold | float | The similarity threshold for the semantic cache. |
-| cache_params.s3_bucket_name | string | The name of the S3 bucket to use for the S3 cache. |
-| cache_params.s3_region_name | string | The region name for the S3 bucket. |
-| cache_params.s3_aws_access_key_id | string | The AWS access key ID for the S3 bucket. |
-| cache_params.s3_aws_secret_access_key | string | The AWS secret access key for the S3 bucket. |
-| cache_params.s3_endpoint_url | string | Optional - The endpoint URL for the S3 bucket. |
-| cache_params.supported_call_types | array of strings | The types of calls to cache. [Further docs](./caching) |
-| cache_params.mode | string | The mode of the cache. [Further docs](./caching) |
-| disable_end_user_cost_tracking | boolean | If true, turns off end user cost tracking on prometheus metrics + litellm spend logs table on proxy. |
-| key_generation_settings | object | Restricts who can generate keys. [Further docs](./virtual_keys.md#restricting-key-generation) |
-
-### general_settings - Reference
-
-| Name | Type | Description |
-|------|------|-------------|
-| completion_model | string | The default model to use for completions when `model` is not specified in the request |
-| disable_spend_logs | boolean | If true, turns off writing each transaction to the database |
-| disable_master_key_return | boolean | If true, turns off returning master key on UI. (checked on '/user/info' endpoint) |
-| disable_retry_on_max_parallel_request_limit_error | boolean | If true, turns off retries when max parallel request limit is reached |
-| disable_reset_budget | boolean | If true, turns off reset budget scheduled task |
-| disable_adding_master_key_hash_to_db | boolean | If true, turns off storing master key hash in db |
-| enable_jwt_auth | boolean | allow proxy admin to auth in via jwt tokens with 'litellm_proxy_admin' in claims. [Doc on JWT Tokens](token_auth) |
-| enforce_user_param | boolean | If true, requires all OpenAI endpoint requests to have a 'user' param. [Doc on call hooks](call_hooks)|
-| allowed_routes | array of strings | List of allowed proxy API routes a user can access [Doc on controlling allowed routes](enterprise#control-available-public-private-routes)|
-| key_management_system | string | Specifies the key management system. [Doc Secret Managers](../secret) |
-| master_key | string | The master key for the proxy [Set up Virtual Keys](virtual_keys) |
-| database_url | string | The URL for the database connection [Set up Virtual Keys](virtual_keys) |
-| database_connection_pool_limit | integer | The limit for database connection pool [Setting DB Connection Pool limit](#configure-db-pool-limits--connection-timeouts) |
-| database_connection_timeout | integer | The timeout for database connections in seconds [Setting DB Connection Pool limit, timeout](#configure-db-pool-limits--connection-timeouts) |
-| allow_requests_on_db_unavailable | boolean | If true, allows requests to succeed even if DB is unreachable. **Only use this if running LiteLLM in your VPC** This will allow requests to work even when LiteLLM cannot connect to the DB to verify a Virtual Key |
-| custom_auth | string | Write your own custom authentication logic [Doc Custom Auth](virtual_keys#custom-auth) |
-| max_parallel_requests | integer | The max parallel requests allowed per deployment |
-| global_max_parallel_requests | integer | The max parallel requests allowed on the proxy overall |
-| infer_model_from_keys | boolean | If true, infers the model from the provided keys |
-| background_health_checks | boolean | If true, enables background health checks. [Doc on health checks](health) |
-| health_check_interval | integer | The interval for health checks in seconds [Doc on health checks](health) |
-| alerting | array of strings | List of alerting methods [Doc on Slack Alerting](alerting) |
-| alerting_threshold | integer | The threshold for triggering alerts [Doc on Slack Alerting](alerting) |
-| use_client_credentials_pass_through_routes | boolean | If true, uses client credentials for all pass-through routes. [Doc on pass through routes](pass_through) |
-| health_check_details | boolean | If false, hides health check details (e.g. remaining rate limit). [Doc on health checks](health) |
-| public_routes | List[str] | (Enterprise Feature) Control list of public routes |
-| alert_types | List[str] | Control list of alert types to send to slack [Doc on alert types](./alerting.md) |
-| enforced_params | List[str] | (Enterprise Feature) List of params that must be included in all requests to the proxy |
-| enable_oauth2_auth | boolean | (Enterprise Feature) If true, enables oauth2.0 authentication |
-| use_x_forwarded_for | str | If true, uses the X-Forwarded-For header to get the client IP address |
-| service_account_settings | List[Dict[str, Any]] | Set `service_account_settings` if you want to create settings that only apply to service account keys [Doc on service accounts](./service_accounts.md) | 
-| image_generation_model | str | The default model to use for image generation - ignores model set in request |
-| store_model_in_db | boolean | If true, allows `/model/new` endpoint to store model information in db. Endpoint disabled by default. [Doc on `/model/new` endpoint](./model_management.md#create-a-new-model) |
-| max_request_size_mb | int | The maximum size for requests in MB. Requests above this size will be rejected. |
-| max_response_size_mb | int | The maximum size for responses in MB. LLM Responses above this size will not be sent. |
-| proxy_budget_rescheduler_min_time | int | The minimum time (in seconds) to wait before checking db for budget resets. **Default is 597 seconds** |
-| proxy_budget_rescheduler_max_time | int | The maximum time (in seconds) to wait before checking db for budget resets. **Default is 605 seconds** |
-| proxy_batch_write_at | int | Time (in seconds) to wait before batch writing spend logs to the db. **Default is 10 seconds** |
-| alerting_args | dict | Args for Slack Alerting [Doc on Slack Alerting](./alerting.md) |
-| custom_key_generate | str | Custom function for key generation [Doc on custom key generation](./virtual_keys.md#custom--key-generate) |
-| allowed_ips | List[str] | List of IPs allowed to access the proxy. If not set, all IPs are allowed. |
-| embedding_model | str | The default model to use for embeddings - ignores model set in request |
-| default_team_disabled | boolean | If true, users cannot create 'personal' keys (keys with no team_id). |
-| alert_to_webhook_url | Dict[str] | [Specify a webhook url for each alert type.](./alerting.md#set-specific-slack-channels-per-alert-type) |
-| key_management_settings | List[Dict[str, Any]] | Settings for key management system (e.g. AWS KMS, Azure Key Vault) [Doc on key management](../secret.md) |
-| allow_user_auth | boolean | (Deprecated) old approach for user authentication. |
-| user_api_key_cache_ttl | int | The time (in seconds) to cache user api keys in memory. |
-| disable_prisma_schema_update | boolean | If true, turns off automatic schema updates to DB |
-| litellm_key_header_name | str | If set, allows passing LiteLLM keys as a custom header. [Doc on custom headers](./virtual_keys.md#custom-headers) |
-| moderation_model | str | The default model to use for moderation. |
-| custom_sso | str | Path to a python file that implements custom SSO logic. [Doc on custom SSO](./custom_sso.md) |
-| allow_client_side_credentials | boolean | If true, allows passing client side credentials to the proxy. (Useful when testing finetuning models) [Doc on client side credentials](./virtual_keys.md#client-side-credentials) |
-| admin_only_routes | List[str] | (Enterprise Feature) List of routes that are only accessible to admin users. [Doc on admin only routes](./enterprise#control-available-public-private-routes) |
-| use_azure_key_vault | boolean | If true, load keys from azure key vault | 
-| use_google_kms | boolean | If true, load keys from google kms |
-| spend_report_frequency | str | Specify how often you want a Spend Report to be sent (e.g. "1d", "2d", "30d") [More on this](./alerting.md#spend-report-frequency) |
-| ui_access_mode | Literal["admin_only"] | If set, restricts access to the UI to admin users only. [Docs](./ui.md#restrict-ui-access) |
-| litellm_jwtauth | Dict[str, Any] | Settings for JWT authentication. [Docs](./token_auth.md) |
-| litellm_license | str | The license key for the proxy. [Docs](../enterprise.md#how-does-deployment-with-enterprise-license-work) |
-| oauth2_config_mappings | Dict[str, str] | Define the OAuth2 config mappings | 
-| pass_through_endpoints | List[Dict[str, Any]] | Define the pass through endpoints. [Docs](./pass_through) |
-| enable_oauth2_proxy_auth | boolean | (Enterprise Feature) If true, enables oauth2.0 authentication |
-| forward_openai_org_id | boolean | If true, forwards the OpenAI Organization ID to the backend LLM call (if it's OpenAI). |
-| forward_client_headers_to_llm_api | boolean | If true, forwards the client headers (any `x-` headers) to the backend LLM call |
-
-### router_settings - Reference
-
-:::info
-
-Most values can also be set via `litellm_settings`. If you see overlapping values, settings on `router_settings` will override those on `litellm_settings`.
-:::
-
-```yaml
-router_settings:
-  routing_strategy: usage-based-routing-v2 # Literal["simple-shuffle", "least-busy", "usage-based-routing","latency-based-routing"], default="simple-shuffle"
-  redis_host: <your-redis-host>           # string
-  redis_password: <your-redis-password>   # string
-  redis_port: <your-redis-port>           # string
-  enable_pre_call_check: true             # bool - Before call is made check if a call is within model context window 
-  allowed_fails: 3 # cooldown model if it fails > 3 calls in a minute. 
-  cooldown_time: 30 # (in seconds) how long to cooldown model if fails/min > allowed_fails
-  disable_cooldowns: True                  # bool - Disable cooldowns for all models 
-  enable_tag_filtering: True                # bool - Use tag based routing for requests
-  retry_policy: {                          # Dict[str, int]: retry policy for different types of exceptions
-    "AuthenticationErrorRetries": 3,
-    "TimeoutErrorRetries": 3,
-    "RateLimitErrorRetries": 3,
-    "ContentPolicyViolationErrorRetries": 4,
-    "InternalServerErrorRetries": 4
-  }
-  allowed_fails_policy: {
-    "BadRequestErrorAllowedFails": 1000, # Allow 1000 BadRequestErrors before cooling down a deployment
-    "AuthenticationErrorAllowedFails": 10, # int 
-    "TimeoutErrorAllowedFails": 12, # int 
-    "RateLimitErrorAllowedFails": 10000, # int 
-    "ContentPolicyViolationErrorAllowedFails": 15, # int 
-    "InternalServerErrorAllowedFails": 20, # int 
-  }
-  content_policy_fallbacks: [{"claude-2": ["my-fallback-model"]}] # List[Dict[str, List[str]]]: Fallback model for content policy violations
-  fallbacks: [{"claude-2": ["my-fallback-model"]}] # List[Dict[str, List[str]]]: Fallback model for all errors
-```
-
-| Name | Type | Description |
-|------|------|-------------|
-| routing_strategy | string | The strategy used for routing requests. Options: "simple-shuffle", "least-busy", "usage-based-routing", "latency-based-routing". Default is "simple-shuffle". [More information here](../routing) |
-| redis_host | string | The host address for the Redis server. **Only set this if you have multiple instances of LiteLLM Proxy and want current tpm/rpm tracking to be shared across them** |
-| redis_password | string | The password for the Redis server. **Only set this if you have multiple instances of LiteLLM Proxy and want current tpm/rpm tracking to be shared across them** |
-| redis_port | string | The port number for the Redis server. **Only set this if you have multiple instances of LiteLLM Proxy and want current tpm/rpm tracking to be shared across them**|
-| enable_pre_call_check | boolean | If true, checks if a call is within the model's context window before making the call. [More information here](reliability) |
-| content_policy_fallbacks | array of objects | Specifies fallback models for content policy violations. [More information here](reliability) |
-| fallbacks | array of objects | Specifies fallback models for all types of errors. [More information here](reliability) |
-| enable_tag_filtering | boolean | If true, uses tag based routing for requests [Tag Based Routing](tag_routing) |
-| cooldown_time | integer | The duration (in seconds) to cooldown a model if it exceeds the allowed failures. |
-| disable_cooldowns | boolean | If true, disables cooldowns for all models. [More information here](reliability) |
-| retry_policy | object | Specifies the number of retries for different types of exceptions. [More information here](reliability) |
-| allowed_fails | integer | The number of failures allowed before cooling down a model. [More information here](reliability) |
-| allowed_fails_policy | object | Specifies the number of allowed failures for different error types before cooling down a deployment. [More information here](reliability) |
-| default_max_parallel_requests | Optional[int] | The default maximum number of parallel requests for a deployment. |
-| default_priority | (Optional[int]) | The default priority for a request. Only for '.scheduler_acompletion()'. Default is None. | 
-| polling_interval | (Optional[float]) | frequency of polling queue. Only for '.scheduler_acompletion()'. Default is 3ms. |
-| max_fallbacks | Optional[int] | The maximum number of fallbacks to try before exiting the call. Defaults to 5. |
-| default_litellm_params | Optional[dict] | The default litellm parameters to add to all requests (e.g. `temperature`, `max_tokens`). |
-| timeout | Optional[float] | The default timeout for a request. |
-| debug_level | Literal["DEBUG", "INFO"] | The debug level for the logging library in the router. Defaults to "INFO". |
-| client_ttl | int | Time-to-live for cached clients in seconds. Defaults to 3600. |
-| cache_kwargs | dict | Additional keyword arguments for the cache initialization. |
-| routing_strategy_args | dict | Additional keyword arguments for the routing strategy - e.g. lowest latency routing default ttl |
-| model_group_alias | dict | Model group alias mapping. E.g. `{"claude-3-haiku": "claude-3-haiku-20240229"}` |
-| num_retries | int | Number of retries for a request. Defaults to 3. |
-| default_fallbacks | Optional[List[str]] | Fallbacks to try if no model group-specific fallbacks are defined. |
-| caching_groups | Optional[List[tuple]] | List of model groups for caching across model groups. Defaults to None. - e.g. caching_groups=[("openai-gpt-3.5-turbo", "azure-gpt-3.5-turbo")]|
-| alerting_config | AlertingConfig | [SDK-only arg] Slack alerting configuration. Defaults to None. [Further Docs](../routing.md#alerting-) |
-| assistants_config | AssistantsConfig | Set on proxy via `assistant_settings`. [Further docs](../assistants.md) |
-| set_verbose | boolean | [DEPRECATED PARAM - see debug docs](./debugging.md) If true, sets the logging level to verbose. |
-| retry_after | int | Time to wait before retrying a request in seconds. Defaults to 0. If `x-retry-after` is received from LLM API, this value is overridden. |
-| provider_budget_config | ProviderBudgetConfig | Provider budget configuration. Use this to set llm_provider budget limits. example $100/day to OpenAI, $100/day to Azure, etc. Defaults to None. [Further Docs](./provider_budget_routing.md) |
-| enable_pre_call_checks | boolean | If true, checks if a call is within the model's context window before making the call. [More information here](reliability) |
-| model_group_retry_policy | Dict[str, RetryPolicy] | [SDK-only arg] Set retry policy for model groups. |
-| context_window_fallbacks | List[Dict[str, List[str]]] | Fallback models for context window violations. |
-| redis_url | str | URL for Redis server. **Known performance issue with Redis URL.** |
-| cache_responses | boolean | Flag to enable caching LLM Responses, if cache set under `router_settings`. If true, caches responses. Defaults to False. |
-| router_general_settings | RouterGeneralSettings | [SDK-Only] Router general settings - contains optimizations like 'async_only_mode'. [Docs](../routing.md#router-general-settings) |
-
-### environment variables - Reference
-
-| Name | Description |
-|------|-------------|
-| ACTIONS_ID_TOKEN_REQUEST_TOKEN | Token for requesting ID in GitHub Actions
-| ACTIONS_ID_TOKEN_REQUEST_URL | URL for requesting ID token in GitHub Actions
-| AISPEND_ACCOUNT_ID | Account ID for AI Spend
-| AISPEND_API_KEY | API Key for AI Spend
-| ALLOWED_EMAIL_DOMAINS | List of email domains allowed for access
-| ARIZE_API_KEY | API key for Arize platform integration
-| ARIZE_SPACE_KEY | Space key for Arize platform
-| ARGILLA_BATCH_SIZE | Batch size for Argilla logging
-| ARGILLA_API_KEY | API key for Argilla platform
-| ARGILLA_SAMPLING_RATE | Sampling rate for Argilla logging
-| ARGILLA_DATASET_NAME | Dataset name for Argilla logging
-| ARGILLA_BASE_URL | Base URL for Argilla service
-| ATHINA_API_KEY | API key for Athina service
-| AUTH_STRATEGY | Strategy used for authentication (e.g., OAuth, API key)
-| AWS_ACCESS_KEY_ID | Access Key ID for AWS services
-| AWS_PROFILE_NAME | AWS CLI profile name to be used
-| AWS_REGION_NAME | Default AWS region for service interactions
-| AWS_ROLE_NAME | Role name for AWS IAM usage
-| AWS_SECRET_ACCESS_KEY | Secret Access Key for AWS services
-| AWS_SESSION_NAME | Name for AWS session
-| AWS_WEB_IDENTITY_TOKEN | Web identity token for AWS
-| AZURE_API_VERSION | Version of the Azure API being used
-| AZURE_AUTHORITY_HOST | Azure authority host URL
-| AZURE_CLIENT_ID | Client ID for Azure services
-| AZURE_CLIENT_SECRET | Client secret for Azure services
-| AZURE_FEDERATED_TOKEN_FILE | File path to Azure federated token
-| AZURE_KEY_VAULT_URI | URI for Azure Key Vault
-| AZURE_TENANT_ID | Tenant ID for Azure Active Directory
-| BERRISPEND_ACCOUNT_ID | Account ID for BerriSpend service
-| BRAINTRUST_API_KEY | API key for Braintrust integration
-| CIRCLE_OIDC_TOKEN | OpenID Connect token for CircleCI
-| CIRCLE_OIDC_TOKEN_V2 | Version 2 of the OpenID Connect token for CircleCI
-| CONFIG_FILE_PATH | File path for configuration file
-| CUSTOM_TIKTOKEN_CACHE_DIR | Custom directory for Tiktoken cache
-| DATABASE_HOST | Hostname for the database server
-| DATABASE_NAME | Name of the database
-| DATABASE_PASSWORD | Password for the database user
-| DATABASE_PORT | Port number for database connection
-| DATABASE_SCHEMA | Schema name used in the database
-| DATABASE_URL | Connection URL for the database
-| DATABASE_USER | Username for database connection
-| DATABASE_USERNAME | Alias for database user
-| DATABRICKS_API_BASE | Base URL for Databricks API
-| DD_BASE_URL | Base URL for Datadog integration
-| DATADOG_BASE_URL | (Alternative to DD_BASE_URL) Base URL for Datadog integration
-| _DATADOG_BASE_URL | (Alternative to DD_BASE_URL) Base URL for Datadog integration
-| DD_API_KEY | API key for Datadog integration
-| DD_SITE | Site URL for Datadog (e.g., datadoghq.com)
-| DD_SOURCE | Source identifier for Datadog logs
-| DD_ENV | Environment identifier for Datadog logs. Only supported for `datadog_llm_observability` callback
-| DD_SERVICE | Service identifier for Datadog logs. Defaults to "litellm-server"
-| DD_VERSION | Version identifier for Datadog logs. Defaults to "unknown"
-| DEBUG_OTEL | Enable debug mode for OpenTelemetry
-| DIRECT_URL | Direct URL for service endpoint
-| DISABLE_ADMIN_UI | Toggle to disable the admin UI
-| DISABLE_SCHEMA_UPDATE | Toggle to disable schema updates
-| DOCS_DESCRIPTION | Description text for documentation pages
-| DOCS_FILTERED | Flag indicating filtered documentation
-| DOCS_TITLE | Title of the documentation pages
-| DOCS_URL | The path to the Swagger API documentation. **By default this is "/"**
-| EMAIL_SUPPORT_CONTACT | Support contact email address
-| GCS_BUCKET_NAME | Name of the Google Cloud Storage bucket
-| GCS_PATH_SERVICE_ACCOUNT | Path to the Google Cloud service account JSON file
-| GCS_FLUSH_INTERVAL | Flush interval for GCS logging (in seconds). Specify how often you want a log to be sent to GCS. **Default is 20 seconds**
-| GCS_BATCH_SIZE | Batch size for GCS logging. Specify after how many logs you want to flush to GCS. If `BATCH_SIZE` is set to 10, logs are flushed every 10 logs. **Default is 2048**
-| GENERIC_AUTHORIZATION_ENDPOINT | Authorization endpoint for generic OAuth providers
-| GENERIC_CLIENT_ID | Client ID for generic OAuth providers
-| GENERIC_CLIENT_SECRET | Client secret for generic OAuth providers
-| GENERIC_CLIENT_STATE | State parameter for generic client authentication
-| GENERIC_INCLUDE_CLIENT_ID | Include client ID in requests for OAuth
-| GENERIC_SCOPE | Scope settings for generic OAuth providers
-| GENERIC_TOKEN_ENDPOINT | Token endpoint for generic OAuth providers
-| GENERIC_USER_DISPLAY_NAME_ATTRIBUTE | Attribute for user's display name in generic auth
-| GENERIC_USER_EMAIL_ATTRIBUTE | Attribute for user's email in generic auth
-| GENERIC_USER_FIRST_NAME_ATTRIBUTE | Attribute for user's first name in generic auth
-| GENERIC_USER_ID_ATTRIBUTE | Attribute for user ID in generic auth
-| GENERIC_USER_LAST_NAME_ATTRIBUTE | Attribute for user's last name in generic auth
-| GENERIC_USER_PROVIDER_ATTRIBUTE | Attribute specifying the user's provider
-| GENERIC_USER_ROLE_ATTRIBUTE | Attribute specifying the user's role
-| GENERIC_USERINFO_ENDPOINT | Endpoint to fetch user information in generic OAuth
-| GALILEO_BASE_URL | Base URL for Galileo platform
-| GALILEO_PASSWORD | Password for Galileo authentication
-| GALILEO_PROJECT_ID | Project ID for Galileo usage
-| GALILEO_USERNAME | Username for Galileo authentication
-| GREENSCALE_API_KEY | API key for Greenscale service
-| GREENSCALE_ENDPOINT | Endpoint URL for Greenscale service
-| GOOGLE_APPLICATION_CREDENTIALS | Path to Google Cloud credentials JSON file
-| GOOGLE_CLIENT_ID | Client ID for Google OAuth
-| GOOGLE_CLIENT_SECRET | Client secret for Google OAuth
-| GOOGLE_KMS_RESOURCE_NAME | Name of the resource in Google KMS
-| HF_API_BASE | Base URL for Hugging Face API
-| HELICONE_API_KEY | API key for Helicone service
-| HUGGINGFACE_API_BASE | Base URL for Hugging Face API
-| IAM_TOKEN_DB_AUTH | IAM token for database authentication
-| JSON_LOGS | Enable JSON formatted logging
-| JWT_AUDIENCE | Expected audience for JWT tokens
-| JWT_PUBLIC_KEY_URL | URL to fetch public key for JWT verification
-| LAGO_API_BASE | Base URL for Lago API
-| LAGO_API_CHARGE_BY | Parameter to determine charge basis in Lago
-| LAGO_API_EVENT_CODE | Event code for Lago API events
-| LAGO_API_KEY | API key for accessing Lago services
-| LANGFUSE_DEBUG | Toggle debug mode for Langfuse
-| LANGFUSE_FLUSH_INTERVAL | Interval for flushing Langfuse logs
-| LANGFUSE_HOST | Host URL for Langfuse service
-| LANGFUSE_PUBLIC_KEY | Public key for Langfuse authentication
-| LANGFUSE_RELEASE | Release version of Langfuse integration
-| LANGFUSE_SECRET_KEY | Secret key for Langfuse authentication
-| LANGSMITH_API_KEY | API key for Langsmith platform
-| LANGSMITH_BASE_URL | Base URL for Langsmith service
-| LANGSMITH_BATCH_SIZE | Batch size for operations in Langsmith
-| LANGSMITH_DEFAULT_RUN_NAME | Default name for Langsmith run
-| LANGSMITH_PROJECT | Project name for Langsmith integration
-| LANGSMITH_SAMPLING_RATE | Sampling rate for Langsmith logging
-| LANGTRACE_API_KEY | API key for Langtrace service
-| LITERAL_API_KEY | API key for Literal integration
-| LITERAL_API_URL | API URL for Literal service
-| LITERAL_BATCH_SIZE | Batch size for Literal operations
-| LITELLM_DONT_SHOW_FEEDBACK_BOX | Flag to hide feedback box in LiteLLM UI
-| LITELLM_DROP_PARAMS | Parameters to drop in LiteLLM requests
-| LITELLM_EMAIL | Email associated with LiteLLM account
-| LITELLM_GLOBAL_MAX_PARALLEL_REQUEST_RETRIES | Maximum retries for parallel requests in LiteLLM
-| LITELLM_GLOBAL_MAX_PARALLEL_REQUEST_RETRY_TIMEOUT | Timeout for retries of parallel requests in LiteLLM
-| LITELLM_HOSTED_UI | URL of the hosted UI for LiteLLM
-| LITELLM_LICENSE | License key for LiteLLM usage
-| LITELLM_LOCAL_MODEL_COST_MAP | Local configuration for model cost mapping in LiteLLM
-| LITELLM_LOG | Enable detailed logging for LiteLLM
-| LITELLM_MODE | Operating mode for LiteLLM (e.g., production, development)
-| LITELLM_SALT_KEY | Salt key for encryption in LiteLLM
-| LITELLM_SECRET_AWS_KMS_LITELLM_LICENSE | AWS KMS encrypted license for LiteLLM
-| LITELLM_TOKEN | Access token for LiteLLM integration
-| LOGFIRE_TOKEN | Token for Logfire logging service
-| MICROSOFT_CLIENT_ID | Client ID for Microsoft services
-| MICROSOFT_CLIENT_SECRET | Client secret for Microsoft services
-| MICROSOFT_TENANT | Tenant ID for Microsoft Azure
-| NO_DOCS | Flag to disable documentation generation
-| NO_PROXY | List of addresses to bypass proxy
-| OAUTH_TOKEN_INFO_ENDPOINT | Endpoint for OAuth token info retrieval
-| OPENAI_API_BASE | Base URL for OpenAI API
-| OPENAI_API_KEY | API key for OpenAI services
-| OPENAI_ORGANIZATION | Organization identifier for OpenAI
-| OPENID_BASE_URL | Base URL for OpenID Connect services
-| OPENID_CLIENT_ID | Client ID for OpenID Connect authentication
-| OPENID_CLIENT_SECRET | Client secret for OpenID Connect authentication
-| OPENMETER_API_ENDPOINT | API endpoint for OpenMeter integration
-| OPENMETER_API_KEY | API key for OpenMeter services
-| OPENMETER_EVENT_TYPE | Type of events sent to OpenMeter
-| OTEL_ENDPOINT | OpenTelemetry endpoint for traces
-| OTEL_ENVIRONMENT_NAME | Environment name for OpenTelemetry
-| OTEL_EXPORTER | Exporter type for OpenTelemetry
-| OTEL_HEADERS | Headers for OpenTelemetry requests
-| OTEL_SERVICE_NAME | Service name identifier for OpenTelemetry
-| OTEL_TRACER_NAME | Tracer name for OpenTelemetry tracing
-| PREDIBASE_API_BASE | Base URL for Predibase API
-| PRESIDIO_ANALYZER_API_BASE | Base URL for Presidio Analyzer service
-| PRESIDIO_ANONYMIZER_API_BASE | Base URL for Presidio Anonymizer service
-| PROMETHEUS_URL | URL for Prometheus service
-| PROMPTLAYER_API_KEY | API key for PromptLayer integration
-| PROXY_ADMIN_ID | Admin identifier for proxy server
-| PROXY_BASE_URL | Base URL for proxy service
-| PROXY_LOGOUT_URL | URL for logging out of the proxy service
-| PROXY_MASTER_KEY | Master key for proxy authentication
-| QDRANT_API_BASE | Base URL for Qdrant API
-| QDRANT_API_KEY | API key for Qdrant service
-| QDRANT_URL | Connection URL for Qdrant database
-| REDIS_HOST | Hostname for Redis server
-| REDIS_PASSWORD | Password for Redis service
-| REDIS_PORT | Port number for Redis server
-| REDOC_URL | The path to the Redoc Fast API documentation. **By default this is "/redoc"**
-| SERVER_ROOT_PATH | Root path for the server application
-| SET_VERBOSE | Flag to enable verbose logging
-| SLACK_DAILY_REPORT_FREQUENCY | Frequency of daily Slack reports (e.g., daily, weekly)
-| SLACK_WEBHOOK_URL | Webhook URL for Slack integration
-| SMTP_HOST | Hostname for the SMTP server
-| SMTP_PASSWORD | Password for SMTP authentication
-| SMTP_PORT | Port number for SMTP server
-| SMTP_SENDER_EMAIL | Email address used as the sender in SMTP transactions
-| SMTP_SENDER_LOGO | Logo used in emails sent via SMTP
-| SMTP_TLS | Flag to enable or disable TLS for SMTP connections
-| SMTP_USERNAME | Username for SMTP authentication
-| SPEND_LOGS_URL | URL for retrieving spend logs
-| SSL_CERTIFICATE | Path to the SSL certificate file
-| SSL_VERIFY | Flag to enable or disable SSL certificate verification
-| SUPABASE_KEY | API key for Supabase service
-| SUPABASE_URL | Base URL for Supabase instance
-| TEST_EMAIL_ADDRESS | Email address used for testing purposes
-| UI_LOGO_PATH | Path to the logo image used in the UI
-| UI_PASSWORD | Password for accessing the UI
-| UI_USERNAME | Username for accessing the UI
-| UPSTREAM_LANGFUSE_DEBUG | Flag to enable debugging for upstream Langfuse
-| UPSTREAM_LANGFUSE_HOST | Host URL for upstream Langfuse service
-| UPSTREAM_LANGFUSE_PUBLIC_KEY | Public key for upstream Langfuse authentication
-| UPSTREAM_LANGFUSE_RELEASE | Release version identifier for upstream Langfuse
-| UPSTREAM_LANGFUSE_SECRET_KEY | Secret key for upstream Langfuse authentication
-| USE_AWS_KMS | Flag to enable AWS Key Management Service for encryption
-| WEBHOOK_URL | URL for receiving webhooks from external services
-
--- a/docs/my-website/docs/proxy/configs.md
+++ b/docs/my-website/docs/proxy/configs.md
@ -2,7 +2,7 @@ import Image from '@theme/IdealImage';
 import Tabs from '@theme/Tabs';
 import TabItem from '@theme/TabItem';

-# Overview
+# Proxy Config.yaml
 Set model list, `api_base`, `api_key`, `temperature` & proxy server settings (`master-key`) on the config.yaml. 

 | Param Name           | Description                                                   |
@ -357,6 +357,77 @@ curl --location 'http://0.0.0.0:4000/v1/model/info' \
 --data ''
 ```

+ 
+### Provider specific wildcard routing 
+**Proxy all models from a provider**
+
+Use this if you want to **proxy all models from a specific provider without defining them on the config.yaml**
+
+**Step 1** - define provider specific routing on config.yaml
+```yaml
+model_list:
+  # provider specific wildcard routing
+  - model_name: "anthropic/*"
+    litellm_params:
+      model: "anthropic/*"
+      api_key: os.environ/ANTHROPIC_API_KEY
+  - model_name: "groq/*"
+    litellm_params:
+      model: "groq/*"
+      api_key: os.environ/GROQ_API_KEY
+  - model_name: "fo::*:static::*" # all requests matching this pattern will be routed to this deployment, example: model="fo::hi::static::hi" will be routed to deployment: "openai/fo::*:static::*"
+    litellm_params:
+      model: "openai/fo::*:static::*"
+      api_key: os.environ/OPENAI_API_KEY
+```
+
+**Step 2** - Run litellm proxy
+
+```shell
+$ litellm --config /path/to/config.yaml
+```
+
+**Step 3** - Test it
+
+Test with `anthropic/` - all models with `anthropic/` prefix will get routed to `anthropic/*`
+```shell
+curl http://localhost:4000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer sk-1234" \
+  -d '{
+    "model": "anthropic/claude-3-sonnet-20240229",
+    "messages": [
+      {"role": "user", "content": "Hello, Claude!"}
+    ]
+  }'
+```
+
+Test with `groq/` - all models with `groq/` prefix will get routed to `groq/*`
+```shell
+curl http://localhost:4000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer sk-1234" \
+  -d '{
+    "model": "groq/llama3-8b-8192",
+    "messages": [
+      {"role": "user", "content": "Hello, Claude!"}
+    ]
+  }'
+```
+
+Test with `fo::*:static::*` - all requests matching this pattern will be routed to `openai/fo::*:static::*`
+```shell
+curl http://localhost:4000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer sk-1234" \
+  -d '{
+    "model": "fo::hi::static::hi",
+    "messages": [
+      {"role": "user", "content": "Hello, Claude!"}
+    ]
+  }'
+```
+
 ### Load Balancing 

 :::info
@ -526,6 +597,479 @@ general_settings:
  database_connection_timeout: 60 # sets a 60s timeout for any connection call to the db 
 ```

+## **All settings**
+
+
+```yaml
+environment_variables: {}
+
+model_list:
+  - model_name: string
+    litellm_params: {}
+    model_info:
+      id: string
+      mode: embedding
+      input_cost_per_token: 0
+      output_cost_per_token: 0
+      max_tokens: 2048
+      base_model: gpt-4-1106-preview
+      additionalProp1: {}
+
+litellm_settings:
+  # Logging/Callback settings
+  success_callback: ["langfuse"]  # list of success callbacks
+  failure_callback: ["sentry"]  # list of failure callbacks
+  callbacks: ["otel"]  # list of callbacks - runs on success and failure
+  service_callbacks: ["datadog", "prometheus"]  # logs redis, postgres failures on datadog, prometheus
+  turn_off_message_logging: boolean  # prevent the messages and responses from being logged to your callbacks, but request metadata will still be logged.
+  redact_user_api_key_info: boolean  # Redact information about the user api key (hashed token, user_id, team id, etc.), from logs. Currently supported for Langfuse, OpenTelemetry, Logfire, ArizeAI logging.
+  langfuse_default_tags: ["cache_hit", "cache_key", "proxy_base_url", "user_api_key_alias", "user_api_key_user_id", "user_api_key_user_email", "user_api_key_team_alias", "semantic-similarity", "proxy_base_url"] # default tags for Langfuse Logging
+  
+  # Networking settings
+  request_timeout: 10 # (int) LLM request timeout in seconds. Raise Timeout error if call takes longer than 10s. Sets litellm.request_timeout
+  force_ipv4: boolean # If true, litellm will force ipv4 for all LLM requests. Some users have seen httpx ConnectionError when using ipv6 + Anthropic API
+  
+  set_verbose: boolean # sets litellm.set_verbose=True to view verbose debug logs. DO NOT LEAVE THIS ON IN PRODUCTION
+  json_logs: boolean # if true, logs will be in json format
+
+  # Fallbacks, reliability
+  default_fallbacks: ["claude-opus"] # set default_fallbacks, in case a specific model group is misconfigured / bad.
+  content_policy_fallbacks: [{"gpt-3.5-turbo-small": ["claude-opus"]}] # fallbacks for ContentPolicyErrors
+  context_window_fallbacks: [{"gpt-3.5-turbo-small": ["gpt-3.5-turbo-large", "claude-opus"]}] # fallbacks for ContextWindowExceededErrors
+
+
+
+  # Caching settings
+  cache: true 
+  cache_params:        # set cache params for redis
+    type: redis        # type of cache to initialize
+
+    # Optional - Redis Settings
+    host: "localhost"  # The host address for the Redis cache. Required if type is "redis".
+    port: 6379  # The port number for the Redis cache. Required if type is "redis".
+    password: "your_password"  # The password for the Redis cache. Required if type is "redis".
+    namespace: "litellm.caching.caching" # namespace for redis cache
+  
+    # Optional - Redis Cluster Settings
+    redis_startup_nodes: [{"host": "127.0.0.1", "port": "7001"}] 
+
+    # Optional - Redis Sentinel Settings
+    service_name: "mymaster"
+    sentinel_nodes: [["localhost", 26379]]
+
+    # Optional - Qdrant Semantic Cache Settings
+    qdrant_semantic_cache_embedding_model: openai-embedding # the model should be defined on the model_list
+    qdrant_collection_name: test_collection
+    qdrant_quantization_config: binary
+    similarity_threshold: 0.8   # similarity threshold for semantic cache
+
+    # Optional - S3 Cache Settings
+    s3_bucket_name: cache-bucket-litellm   # AWS Bucket Name for S3
+    s3_region_name: us-west-2              # AWS Region Name for S3
+    s3_aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID  # use os.environ/<variable name> to pass environment variables. This is AWS Access Key ID for S3
+    s3_aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY  # AWS Secret Access Key for S3
+    s3_endpoint_url: https://s3.amazonaws.com  # [OPTIONAL] S3 endpoint URL, if you want to use Backblaze/cloudflare s3 bucket
+
+    # Common Cache settings
+    # Optional - Supported call types for caching
+    supported_call_types: ["acompletion", "atext_completion", "aembedding", "atranscription"]
+                          # /chat/completions, /completions, /embeddings, /audio/transcriptions
+    mode: default_off # if default_off, you need to opt in to caching on a per call basis
+    ttl: 600 # ttl for caching
+
+
+callback_settings:
+  otel:
+    message_logging: boolean  # OTEL logging callback specific settings
+
+general_settings:
+  completion_model: string
+  disable_spend_logs: boolean  # turn off writing each transaction to the db
+  disable_master_key_return: boolean  # turn off returning master key on UI (checked on '/user/info' endpoint)
+  disable_retry_on_max_parallel_request_limit_error: boolean  # turn off retries when max parallel request limit is reached
+  disable_reset_budget: boolean  # turn off reset budget scheduled task
+  disable_adding_master_key_hash_to_db: boolean  # turn off storing master key hash in db, for spend tracking
+  enable_jwt_auth: boolean  # allow proxy admin to auth in via jwt tokens with 'litellm_proxy_admin' in claims
+  enforce_user_param: boolean  # requires all openai endpoint requests to have a 'user' param
+  allowed_routes: ["route1", "route2"]  # list of allowed proxy API routes - a user can access. (currently JWT-Auth only)
+  key_management_system: google_kms  # either google_kms or azure_kms
+  master_key: string
+
+  # Database Settings
+  database_url: string
+  database_connection_pool_limit: 0  # default 100
+  database_connection_timeout: 0  # default 60s
+  allow_requests_on_db_unavailable: boolean  # if true, requests will still succeed when the proxy cannot connect to the DB to verify the Virtual Key
+
+  custom_auth: string
+  max_parallel_requests: 0  # the max parallel requests allowed per deployment 
+  global_max_parallel_requests: 0  # the max parallel requests allowed on the proxy all up 
+  infer_model_from_keys: true
+  background_health_checks: true
+  health_check_interval: 300
+  alerting: ["slack", "email"]
+  alerting_threshold: 0
+  use_client_credentials_pass_through_routes: boolean  # use client credentials for all pass through routes like "/vertex-ai", "/bedrock". When this is True, Virtual Key auth will not be applied on these endpoints
+```
+
+### litellm_settings - Reference
+
+| Name | Type | Description |
+|------|------|-------------|
+| success_callback | array of strings | List of success callbacks. [Doc Proxy logging callbacks](logging), [Doc Metrics](prometheus) |
+| failure_callback | array of strings | List of failure callbacks [Doc Proxy logging callbacks](logging), [Doc Metrics](prometheus) |
+| callbacks | array of strings | List of callbacks - runs on success and failure [Doc Proxy logging callbacks](logging), [Doc Metrics](prometheus) |
+| service_callbacks | array of strings | System health monitoring - Logs redis, postgres failures on specified services (e.g. datadog, prometheus) [Doc Metrics](prometheus) |
+| turn_off_message_logging | boolean | If true, prevents messages and responses from being logged to callbacks, but request metadata will still be logged [Proxy Logging](logging) |
+| modify_params | boolean | If true, allows modifying the parameters of the request before it is sent to the LLM provider |
+| enable_preview_features | boolean | If true, enables preview features - e.g. Azure O1 Models with streaming support.|
+| redact_user_api_key_info | boolean | If true, redacts information about the user api key from logs [Proxy Logging](logging#redacting-userapikeyinfo) |
+| langfuse_default_tags | array of strings | Default tags for Langfuse Logging. Use this if you want to control which LiteLLM-specific fields are logged as tags by the LiteLLM proxy. By default LiteLLM Proxy logs no LiteLLM-specific fields as tags. [Further docs](./logging#litellm-specific-tags-on-langfuse---cache_hit-cache_key) |
+| set_verbose | boolean | If true, sets litellm.set_verbose=True to view verbose debug logs. DO NOT LEAVE THIS ON IN PRODUCTION |
+| json_logs | boolean | If true, logs will be in json format. If you need to store the logs as JSON, just set the `litellm.json_logs = True`. We currently just log the raw POST request from litellm as a JSON [Further docs](./debugging) |
+| default_fallbacks | array of strings | List of fallback models to use if a specific model group is misconfigured / bad. [Further docs](./reliability#default-fallbacks) |
+| request_timeout | integer | The timeout for requests in seconds. If not set, the default value is `6000 seconds`. [For reference OpenAI Python SDK defaults to `600 seconds`.](https://github.com/openai/openai-python/blob/main/src/openai/_constants.py) |
+| force_ipv4 | boolean | If true, litellm will force ipv4 for all LLM requests. Some users have seen httpx ConnectionError when using ipv6 + Anthropic API |
+| content_policy_fallbacks | array of objects | Fallbacks to use when a ContentPolicyViolationError is encountered. [Further docs](./reliability#content-policy-fallbacks) |
+| context_window_fallbacks | array of objects | Fallbacks to use when a ContextWindowExceededError is encountered. [Further docs](./reliability#context-window-fallbacks) |
+| cache | boolean | If true, enables caching. [Further docs](./caching) |
+| cache_params | object | Parameters for the cache. [Further docs](./caching) |
+| cache_params.type | string | The type of cache to initialize. Can be one of ["local", "redis", "redis-semantic", "s3", "disk", "qdrant-semantic"]. Defaults to "redis". [Further docs](./caching) |
+| cache_params.host | string | The host address for the Redis cache. Required if type is "redis". |
+| cache_params.port | integer | The port number for the Redis cache. Required if type is "redis". |
+| cache_params.password | string | The password for the Redis cache. Required if type is "redis". |
+| cache_params.namespace | string | The namespace for the Redis cache. |
+| cache_params.redis_startup_nodes | array of objects | Redis Cluster Settings. [Further docs](./caching) |
+| cache_params.service_name | string | Redis Sentinel Settings. [Further docs](./caching) |
+| cache_params.sentinel_nodes | array of arrays | Redis Sentinel Settings. [Further docs](./caching) |
+| cache_params.ttl | integer | The time (in seconds) to store entries in cache. |
+| cache_params.qdrant_semantic_cache_embedding_model | string | The embedding model to use for qdrant semantic cache. |
+| cache_params.qdrant_collection_name | string | The name of the collection to use for qdrant semantic cache. |
+| cache_params.qdrant_quantization_config | string | The quantization configuration for the qdrant semantic cache. |
+| cache_params.similarity_threshold | float | The similarity threshold for the semantic cache. |
+| cache_params.s3_bucket_name | string | The name of the S3 bucket to use for the cache. |
+| cache_params.s3_region_name | string | The region name for the S3 bucket. |
+| cache_params.s3_aws_access_key_id | string | The AWS access key ID for the S3 bucket. |
+| cache_params.s3_aws_secret_access_key | string | The AWS secret access key for the S3 bucket. |
+| cache_params.s3_endpoint_url | string | Optional - The endpoint URL for the S3 bucket. |
+| cache_params.supported_call_types | array of strings | The types of calls to cache. [Further docs](./caching) |
+| cache_params.mode | string | The mode of the cache. [Further docs](./caching) |
+
+### general_settings - Reference
+
+| Name | Type | Description |
+|------|------|-------------|
+| completion_model | string | The default model to use for completions when `model` is not specified in the request |
+| disable_spend_logs | boolean | If true, turns off writing each transaction to the database |
+| disable_master_key_return | boolean | If true, turns off returning master key on UI. (checked on '/user/info' endpoint) |
+| disable_retry_on_max_parallel_request_limit_error | boolean | If true, turns off retries when max parallel request limit is reached |
+| disable_reset_budget | boolean | If true, turns off reset budget scheduled task |
+| disable_adding_master_key_hash_to_db | boolean | If true, turns off storing master key hash in db |
+| enable_jwt_auth | boolean | If true, allows the proxy admin to authenticate via JWT tokens with 'litellm_proxy_admin' in claims. [Doc on JWT Tokens](token_auth) |
+| enforce_user_param | boolean | If true, requires all OpenAI endpoint requests to have a 'user' param. [Doc on call hooks](call_hooks)|
+| allowed_routes | array of strings | List of allowed proxy API routes a user can access [Doc on controlling allowed routes](enterprise#control-available-public-private-routes)|
+| key_management_system | string | Specifies the key management system. [Doc Secret Managers](../secret) |
+| master_key | string | The master key for the proxy [Set up Virtual Keys](virtual_keys) |
+| database_url | string | The URL for the database connection [Set up Virtual Keys](virtual_keys) |
+| database_connection_pool_limit | integer | The limit for database connection pool [Setting DB Connection Pool limit](#configure-db-pool-limits--connection-timeouts) |
+| database_connection_timeout | integer | The timeout for database connections in seconds [Setting DB Connection Pool limit, timeout](#configure-db-pool-limits--connection-timeouts) |
+| allow_requests_on_db_unavailable | boolean | If true, allows requests to succeed even if DB is unreachable. **Only use this if running LiteLLM in your VPC** This will allow requests to work even when LiteLLM cannot connect to the DB to verify a Virtual Key |
+| custom_auth | string | Write your own custom authentication logic [Doc Custom Auth](virtual_keys#custom-auth) |
+| max_parallel_requests | integer | The max parallel requests allowed per deployment |
+| global_max_parallel_requests | integer | The max parallel requests allowed on the proxy overall |
+| infer_model_from_keys | boolean | If true, infers the model from the provided keys |
+| background_health_checks | boolean | If true, enables background health checks. [Doc on health checks](health) |
+| health_check_interval | integer | The interval for health checks in seconds [Doc on health checks](health) |
+| alerting | array of strings | List of alerting methods [Doc on Slack Alerting](alerting) |
+| alerting_threshold | integer | The threshold for triggering alerts [Doc on Slack Alerting](alerting) |
+| use_client_credentials_pass_through_routes | boolean | If true, uses client credentials for all pass-through routes. [Doc on pass through routes](pass_through) |
+| health_check_details | boolean | If false, hides health check details (e.g. remaining rate limit). [Doc on health checks](health) |
+| public_routes | List[str] | (Enterprise Feature) Control list of public routes |
+| alert_types | List[str] | Control list of alert types to send to slack [Doc on alert types](./alerting.md) |
+| enforced_params | List[str] | (Enterprise Feature) List of params that must be included in all requests to the proxy |
+| enable_oauth2_auth | boolean | (Enterprise Feature) If true, enables oauth2.0 authentication |
+| use_x_forwarded_for | str | If true, uses the X-Forwarded-For header to get the client IP address |
+| service_account_settings | List[Dict[str, Any]] | Set `service_account_settings` if you want to create settings that only apply to service account keys [Doc on service accounts](./service_accounts.md) |
+| image_generation_model | str | The default model to use for image generation - ignores model set in request |
+| store_model_in_db | boolean | If true, allows `/model/new` endpoint to store model information in db. Endpoint disabled by default. [Doc on `/model/new` endpoint](./model_management.md#create-a-new-model) |
+| max_request_size_mb | int | The maximum size for requests in MB. Requests above this size will be rejected. |
+| max_response_size_mb | int | The maximum size for responses in MB. LLM Responses above this size will not be sent. |
+| proxy_budget_rescheduler_min_time | int | The minimum time (in seconds) to wait before checking db for budget resets. **Default is 597 seconds** |
+| proxy_budget_rescheduler_max_time | int | The maximum time (in seconds) to wait before checking db for budget resets. **Default is 605 seconds** |
+| proxy_batch_write_at | int | Time (in seconds) to wait before batch writing spend logs to the db. **Default is 10 seconds** |
+| alerting_args | dict | Args for Slack Alerting [Doc on Slack Alerting](./alerting.md) |
+| custom_key_generate | str | Custom function for key generation [Doc on custom key generation](./virtual_keys.md#custom--key-generate) |
+| allowed_ips | List[str] | List of IPs allowed to access the proxy. If not set, all IPs are allowed. |
+| embedding_model | str | The default model to use for embeddings - ignores model set in request |
+| default_team_disabled | boolean | If true, users cannot create 'personal' keys (keys with no team_id). |
+| alert_to_webhook_url | Dict[str] | [Specify a webhook url for each alert type.](./alerting.md#set-specific-slack-channels-per-alert-type) |
+| key_management_settings | List[Dict[str, Any]] | Settings for key management system (e.g. AWS KMS, Azure Key Vault) [Doc on key management](../secret.md) |
+| allow_user_auth | boolean | (Deprecated) old approach for user authentication. |
+| user_api_key_cache_ttl | int | The time (in seconds) to cache user api keys in memory. |
+| disable_prisma_schema_update | boolean | If true, turns off automatic schema updates to DB |
+| litellm_key_header_name | str | If set, allows passing LiteLLM keys as a custom header. [Doc on custom headers](./virtual_keys.md#custom-headers) |
+| moderation_model | str | The default model to use for moderation. |
+| custom_sso | str | Path to a python file that implements custom SSO logic. [Doc on custom SSO](./custom_sso.md) |
+| allow_client_side_credentials | boolean | If true, allows passing client side credentials to the proxy. (Useful when testing finetuning models) [Doc on client side credentials](./virtual_keys.md#client-side-credentials) |
+| admin_only_routes | List[str] | (Enterprise Feature) List of routes that are only accessible to admin users. [Doc on admin only routes](./enterprise#control-available-public-private-routes) |
+| use_azure_key_vault | boolean | If true, load keys from azure key vault | 
+| use_google_kms | boolean | If true, load keys from google kms |
+| spend_report_frequency | str | Specify how often you want a Spend Report to be sent (e.g. "1d", "2d", "30d") [More on this](./alerting.md#spend-report-frequency) |
+| ui_access_mode | Literal["admin_only"] | If set, restricts access to the UI to admin users only. [Docs](./ui.md#restrict-ui-access) |
+| litellm_jwtauth | Dict[str, Any] | Settings for JWT authentication. [Docs](./token_auth.md) |
+| litellm_license | str | The license key for the proxy. [Docs](../enterprise.md#how-does-deployment-with-enterprise-license-work) |
+| oauth2_config_mappings | Dict[str, str] | Define the OAuth2 config mappings | 
+| pass_through_endpoints | List[Dict[str, Any]] | Define the pass through endpoints. [Docs](./pass_through) |
+| enable_oauth2_proxy_auth | boolean | (Enterprise Feature) If true, enables oauth2.0 authentication |
+| forward_openai_org_id | boolean | If true, forwards the OpenAI Organization ID to the backend LLM call (if it's OpenAI). |
+| forward_client_headers_to_llm_api | boolean | If true, forwards the client headers (any `x-` headers) to the backend LLM call |
+
+### router_settings - Reference
+
+```yaml
+router_settings:
+  routing_strategy: usage-based-routing-v2 # Literal["simple-shuffle", "least-busy", "usage-based-routing","latency-based-routing"], default="simple-shuffle"
+  redis_host: <your-redis-host>           # string
+  redis_password: <your-redis-password>   # string
+  redis_port: <your-redis-port>           # string
+  enable_pre_call_check: true             # bool - Before call is made check if a call is within model context window 
+  allowed_fails: 3 # cooldown model if it fails > 3 calls in a minute.
+  cooldown_time: 30 # (in seconds) how long to cooldown model if fails/min > allowed_fails
+  disable_cooldowns: True                  # bool - Disable cooldowns for all models 
+  enable_tag_filtering: True                # bool - Use tag based routing for requests
+  retry_policy: {                          # Dict[str, int]: retry policy for different types of exceptions
+    "AuthenticationErrorRetries": 3,
+    "TimeoutErrorRetries": 3,
+    "RateLimitErrorRetries": 3,
+    "ContentPolicyViolationErrorRetries": 4,
+    "InternalServerErrorRetries": 4
+  }
+  allowed_fails_policy: {
+    "BadRequestErrorAllowedFails": 1000, # Allow 1000 BadRequestErrors before cooling down a deployment
+    "AuthenticationErrorAllowedFails": 10, # int 
+    "TimeoutErrorAllowedFails": 12, # int 
+    "RateLimitErrorAllowedFails": 10000, # int 
+    "ContentPolicyViolationErrorAllowedFails": 15, # int 
+    "InternalServerErrorAllowedFails": 20, # int 
+  }
+  content_policy_fallbacks: [{"claude-2": ["my-fallback-model"]}] # List[Dict[str, List[str]]]: Fallback model for content policy violations
+  fallbacks: [{"claude-2": ["my-fallback-model"]}] # List[Dict[str, List[str]]]: Fallback model for all errors
+```
+
+| Name | Type | Description |
+|------|------|-------------|
+| routing_strategy | string | The strategy used for routing requests. Options: "simple-shuffle", "least-busy", "usage-based-routing", "latency-based-routing". Default is "simple-shuffle". [More information here](../routing) |
+| redis_host | string | The host address for the Redis server. **Only set this if you have multiple instances of LiteLLM Proxy and want current tpm/rpm tracking to be shared across them** |
+| redis_password | string | The password for the Redis server. **Only set this if you have multiple instances of LiteLLM Proxy and want current tpm/rpm tracking to be shared across them** |
+| redis_port | string | The port number for the Redis server. **Only set this if you have multiple instances of LiteLLM Proxy and want current tpm/rpm tracking to be shared across them**|
+| enable_pre_call_check | boolean | If true, checks if a call is within the model's context window before making the call. [More information here](reliability) |
+| content_policy_fallbacks | array of objects | Specifies fallback models for content policy violations. [More information here](reliability) |
+| fallbacks | array of objects | Specifies fallback models for all types of errors. [More information here](reliability) |
+| enable_tag_filtering | boolean | If true, uses tag based routing for requests [Tag Based Routing](tag_routing) |
+| cooldown_time | integer | The duration (in seconds) to cooldown a model if it exceeds the allowed failures. |
+| disable_cooldowns | boolean | If true, disables cooldowns for all models. [More information here](reliability) |
+| retry_policy | object | Specifies the number of retries for different types of exceptions. [More information here](reliability) |
+| allowed_fails | integer | The number of failures allowed before cooling down a model. [More information here](reliability) |
+| allowed_fails_policy | object | Specifies the number of allowed failures for different error types before cooling down a deployment. [More information here](reliability) |
+
+
+### environment variables - Reference
+
+| Name | Description |
+|------|-------------|
+| ACTIONS_ID_TOKEN_REQUEST_TOKEN | Token used to request an ID token in GitHub Actions
+| ACTIONS_ID_TOKEN_REQUEST_URL | URL for requesting ID token in GitHub Actions
+| AISPEND_ACCOUNT_ID | Account ID for AI Spend
+| AISPEND_API_KEY | API Key for AI Spend
+| ALLOWED_EMAIL_DOMAINS | List of email domains allowed for access
+| ARIZE_API_KEY | API key for Arize platform integration
+| ARIZE_SPACE_KEY | Space key for Arize platform
+| ARGILLA_BATCH_SIZE | Batch size for Argilla logging
+| ARGILLA_API_KEY | API key for Argilla platform
+| ARGILLA_SAMPLING_RATE | Sampling rate for Argilla logging
+| ARGILLA_DATASET_NAME | Dataset name for Argilla logging
+| ARGILLA_BASE_URL | Base URL for Argilla service
+| ATHINA_API_KEY | API key for Athina service
+| AUTH_STRATEGY | Strategy used for authentication (e.g., OAuth, API key)
+| AWS_ACCESS_KEY_ID | Access Key ID for AWS services
+| AWS_PROFILE_NAME | AWS CLI profile name to be used
+| AWS_REGION_NAME | Default AWS region for service interactions
+| AWS_ROLE_NAME | Role name for AWS IAM usage
+| AWS_SECRET_ACCESS_KEY | Secret Access Key for AWS services
+| AWS_SESSION_NAME | Name for AWS session
+| AWS_WEB_IDENTITY_TOKEN | Web identity token for AWS
+| AZURE_API_VERSION | Version of the Azure API being used
+| AZURE_AUTHORITY_HOST | Azure authority host URL
+| AZURE_CLIENT_ID | Client ID for Azure services
+| AZURE_CLIENT_SECRET | Client secret for Azure services
+| AZURE_FEDERATED_TOKEN_FILE | File path to Azure federated token
+| AZURE_KEY_VAULT_URI | URI for Azure Key Vault
+| AZURE_TENANT_ID | Tenant ID for Azure Active Directory
+| BERRISPEND_ACCOUNT_ID | Account ID for BerriSpend service
+| BRAINTRUST_API_KEY | API key for Braintrust integration
+| CIRCLE_OIDC_TOKEN | OpenID Connect token for CircleCI
+| CIRCLE_OIDC_TOKEN_V2 | Version 2 of the OpenID Connect token for CircleCI
+| CONFIG_FILE_PATH | File path for configuration file
+| CUSTOM_TIKTOKEN_CACHE_DIR | Custom directory for Tiktoken cache
+| DATABASE_HOST | Hostname for the database server
+| DATABASE_NAME | Name of the database
+| DATABASE_PASSWORD | Password for the database user
+| DATABASE_PORT | Port number for database connection
+| DATABASE_SCHEMA | Schema name used in the database
+| DATABASE_URL | Connection URL for the database
+| DATABASE_USER | Username for database connection
+| DATABASE_USERNAME | Alias for database user
+| DATABRICKS_API_BASE | Base URL for Databricks API
+| DD_BASE_URL | Base URL for Datadog integration
+| DATADOG_BASE_URL | (Alternative to DD_BASE_URL) Base URL for Datadog integration
+| _DATADOG_BASE_URL | (Alternative to DD_BASE_URL) Base URL for Datadog integration
+| DD_API_KEY | API key for Datadog integration
+| DD_SITE | Site URL for Datadog (e.g., datadoghq.com)
+| DD_SOURCE | Source identifier for Datadog logs
+| DD_ENV | Environment identifier for Datadog logs. Only supported for `datadog_llm_observability` callback
+| DEBUG_OTEL | Enable debug mode for OpenTelemetry
+| DIRECT_URL | Direct URL for service endpoint
+| DISABLE_ADMIN_UI | Toggle to disable the admin UI
+| DISABLE_SCHEMA_UPDATE | Toggle to disable schema updates
+| DOCS_DESCRIPTION | Description text for documentation pages
+| DOCS_FILTERED | Flag indicating filtered documentation
+| DOCS_TITLE | Title of the documentation pages
+| DOCS_URL | The path to the Swagger API documentation. **By default this is "/"**
+| EMAIL_SUPPORT_CONTACT | Support contact email address
+| GCS_BUCKET_NAME | Name of the Google Cloud Storage bucket
+| GCS_PATH_SERVICE_ACCOUNT | Path to the Google Cloud service account JSON file
+| GCS_FLUSH_INTERVAL | Flush interval for GCS logging (in seconds). Specify how often you want a log to be sent to GCS. **Default is 20 seconds**
+| GCS_BATCH_SIZE | Batch size for GCS logging. Specify after how many logs you want to flush to GCS. If `GCS_BATCH_SIZE` is set to 10, logs are flushed every 10 logs. **Default is 2048**
+| GENERIC_AUTHORIZATION_ENDPOINT | Authorization endpoint for generic OAuth providers
+| GENERIC_CLIENT_ID | Client ID for generic OAuth providers
+| GENERIC_CLIENT_SECRET | Client secret for generic OAuth providers
+| GENERIC_CLIENT_STATE | State parameter for generic client authentication
+| GENERIC_INCLUDE_CLIENT_ID | Include client ID in requests for OAuth
+| GENERIC_SCOPE | Scope settings for generic OAuth providers
+| GENERIC_TOKEN_ENDPOINT | Token endpoint for generic OAuth providers
+| GENERIC_USER_DISPLAY_NAME_ATTRIBUTE | Attribute for user's display name in generic auth
+| GENERIC_USER_EMAIL_ATTRIBUTE | Attribute for user's email in generic auth
+| GENERIC_USER_FIRST_NAME_ATTRIBUTE | Attribute for user's first name in generic auth
+| GENERIC_USER_ID_ATTRIBUTE | Attribute for user ID in generic auth
+| GENERIC_USER_LAST_NAME_ATTRIBUTE | Attribute for user's last name in generic auth
+| GENERIC_USER_PROVIDER_ATTRIBUTE | Attribute specifying the user's provider
+| GENERIC_USER_ROLE_ATTRIBUTE | Attribute specifying the user's role
+| GENERIC_USERINFO_ENDPOINT | Endpoint to fetch user information in generic OAuth
+| GALILEO_BASE_URL | Base URL for Galileo platform
+| GALILEO_PASSWORD | Password for Galileo authentication
+| GALILEO_PROJECT_ID | Project ID for Galileo usage
+| GALILEO_USERNAME | Username for Galileo authentication
+| GREENSCALE_API_KEY | API key for Greenscale service
+| GREENSCALE_ENDPOINT | Endpoint URL for Greenscale service
+| GOOGLE_APPLICATION_CREDENTIALS | Path to Google Cloud credentials JSON file
+| GOOGLE_CLIENT_ID | Client ID for Google OAuth
+| GOOGLE_CLIENT_SECRET | Client secret for Google OAuth
+| GOOGLE_KMS_RESOURCE_NAME | Name of the resource in Google KMS
+| HF_API_BASE | Base URL for Hugging Face API
+| HELICONE_API_KEY | API key for Helicone service
+| HUGGINGFACE_API_BASE | Base URL for Hugging Face API
+| IAM_TOKEN_DB_AUTH | IAM token for database authentication
+| JSON_LOGS | Enable JSON formatted logging
+| JWT_AUDIENCE | Expected audience for JWT tokens
+| JWT_PUBLIC_KEY_URL | URL to fetch public key for JWT verification
+| LAGO_API_BASE | Base URL for Lago API
+| LAGO_API_CHARGE_BY | Parameter to determine charge basis in Lago
+| LAGO_API_EVENT_CODE | Event code for Lago API events
+| LAGO_API_KEY | API key for accessing Lago services
+| LANGFUSE_DEBUG | Toggle debug mode for Langfuse
+| LANGFUSE_FLUSH_INTERVAL | Interval for flushing Langfuse logs
+| LANGFUSE_HOST | Host URL for Langfuse service
+| LANGFUSE_PUBLIC_KEY | Public key for Langfuse authentication
+| LANGFUSE_RELEASE | Release version of Langfuse integration
+| LANGFUSE_SECRET_KEY | Secret key for Langfuse authentication
+| LANGSMITH_API_KEY | API key for Langsmith platform
+| LANGSMITH_BASE_URL | Base URL for Langsmith service
+| LANGSMITH_BATCH_SIZE | Batch size for operations in Langsmith
+| LANGSMITH_DEFAULT_RUN_NAME | Default name for Langsmith run
+| LANGSMITH_PROJECT | Project name for Langsmith integration
+| LANGSMITH_SAMPLING_RATE | Sampling rate for Langsmith logging
+| LANGTRACE_API_KEY | API key for Langtrace service
+| LITERAL_API_KEY | API key for Literal integration
+| LITERAL_API_URL | API URL for Literal service
+| LITERAL_BATCH_SIZE | Batch size for Literal operations
+| LITELLM_DONT_SHOW_FEEDBACK_BOX | Flag to hide feedback box in LiteLLM UI
+| LITELLM_DROP_PARAMS | Parameters to drop in LiteLLM requests
+| LITELLM_EMAIL | Email associated with LiteLLM account
+| LITELLM_GLOBAL_MAX_PARALLEL_REQUEST_RETRIES | Maximum retries for parallel requests in LiteLLM
+| LITELLM_GLOBAL_MAX_PARALLEL_REQUEST_RETRY_TIMEOUT | Timeout for retries of parallel requests in LiteLLM
+| LITELLM_HOSTED_UI | URL of the hosted UI for LiteLLM
+| LITELLM_LICENSE | License key for LiteLLM usage
+| LITELLM_LOCAL_MODEL_COST_MAP | Local configuration for model cost mapping in LiteLLM
+| LITELLM_LOG | Enable detailed logging for LiteLLM
+| LITELLM_MODE | Operating mode for LiteLLM (e.g., production, development)
+| LITELLM_SALT_KEY | Salt key for encryption in LiteLLM
+| LITELLM_SECRET_AWS_KMS_LITELLM_LICENSE | AWS KMS encrypted license for LiteLLM
+| LITELLM_TOKEN | Access token for LiteLLM integration
+| LOGFIRE_TOKEN | Token for Logfire logging service
+| MICROSOFT_CLIENT_ID | Client ID for Microsoft services
+| MICROSOFT_CLIENT_SECRET | Client secret for Microsoft services
+| MICROSOFT_TENANT | Tenant ID for Microsoft Azure
+| NO_DOCS | Flag to disable documentation generation
+| NO_PROXY | List of addresses to bypass proxy
+| OAUTH_TOKEN_INFO_ENDPOINT | Endpoint for OAuth token info retrieval
+| OPENAI_API_BASE | Base URL for OpenAI API
+| OPENAI_API_KEY | API key for OpenAI services
+| OPENAI_ORGANIZATION | Organization identifier for OpenAI
+| OPENID_BASE_URL | Base URL for OpenID Connect services
+| OPENID_CLIENT_ID | Client ID for OpenID Connect authentication
+| OPENID_CLIENT_SECRET | Client secret for OpenID Connect authentication
+| OPENMETER_API_ENDPOINT | API endpoint for OpenMeter integration
+| OPENMETER_API_KEY | API key for OpenMeter services
+| OPENMETER_EVENT_TYPE | Type of events sent to OpenMeter
+| OTEL_ENDPOINT | OpenTelemetry endpoint for traces
+| OTEL_ENVIRONMENT_NAME | Environment name for OpenTelemetry
+| OTEL_EXPORTER | Exporter type for OpenTelemetry
+| OTEL_HEADERS | Headers for OpenTelemetry requests
+| OTEL_SERVICE_NAME | Service name identifier for OpenTelemetry
+| OTEL_TRACER_NAME | Tracer name for OpenTelemetry tracing
+| PREDIBASE_API_BASE | Base URL for Predibase API
+| PRESIDIO_ANALYZER_API_BASE | Base URL for Presidio Analyzer service
+| PRESIDIO_ANONYMIZER_API_BASE | Base URL for Presidio Anonymizer service
+| PROMETHEUS_URL | URL for Prometheus service
+| PROMPTLAYER_API_KEY | API key for PromptLayer integration
+| PROXY_ADMIN_ID | Admin identifier for proxy server
+| PROXY_BASE_URL | Base URL for proxy service
+| PROXY_LOGOUT_URL | URL for logging out of the proxy service
+| PROXY_MASTER_KEY | Master key for proxy authentication
+| QDRANT_API_BASE | Base URL for Qdrant API
+| QDRANT_API_KEY | API key for Qdrant service
+| QDRANT_URL | Connection URL for Qdrant database
+| REDIS_HOST | Hostname for Redis server
+| REDIS_PASSWORD | Password for Redis service
+| REDIS_PORT | Port number for Redis server
+| REDOC_URL | The path to the Redoc Fast API documentation. **By default this is "/redoc"**
+| SERVER_ROOT_PATH | Root path for the server application
+| SET_VERBOSE | Flag to enable verbose logging
+| SLACK_DAILY_REPORT_FREQUENCY | Frequency of daily Slack reports (e.g., daily, weekly)
+| SLACK_WEBHOOK_URL | Webhook URL for Slack integration
+| SMTP_HOST | Hostname for the SMTP server
+| SMTP_PASSWORD | Password for SMTP authentication
+| SMTP_PORT | Port number for SMTP server
+| SMTP_SENDER_EMAIL | Email address used as the sender in SMTP transactions
+| SMTP_SENDER_LOGO | Logo used in emails sent via SMTP
+| SMTP_TLS | Flag to enable or disable TLS for SMTP connections
+| SMTP_USERNAME | Username for SMTP authentication
+| SPEND_LOGS_URL | URL for retrieving spend logs
+| SSL_CERTIFICATE | Path to the SSL certificate file
+| SSL_VERIFY | Flag to enable or disable SSL certificate verification
+| SUPABASE_KEY | API key for Supabase service
+| SUPABASE_URL | Base URL for Supabase instance
+| TEST_EMAIL_ADDRESS | Email address used for testing purposes
+| UI_LOGO_PATH | Path to the logo image used in the UI
+| UI_PASSWORD | Password for accessing the UI
+| UI_USERNAME | Username for accessing the UI
+| UPSTREAM_LANGFUSE_DEBUG | Flag to enable debugging for upstream Langfuse
+| UPSTREAM_LANGFUSE_HOST | Host URL for upstream Langfuse service
+| UPSTREAM_LANGFUSE_PUBLIC_KEY | Public key for upstream Langfuse authentication
+| UPSTREAM_LANGFUSE_RELEASE | Release version identifier for upstream Langfuse
+| UPSTREAM_LANGFUSE_SECRET_KEY | Secret key for upstream Langfuse authentication
+| USE_AWS_KMS | Flag to enable AWS Key Management Service for encryption
+| WEBHOOK_URL | URL for receiving webhooks from external services
 ## Extras


--- a/docs/my-website/docs/proxy/db_info.md
+++ b/docs/my-website/docs/proxy/db_info.md
@ -50,22 +50,18 @@ You can see the full DB Schema [here](https://github.com/BerriAI/litellm/blob/ma
 | LiteLLM_ErrorLogs | Captures failed requests and errors. Stores exception details and request information. Helps with debugging and monitoring. | **Medium - on errors only** |
 | LiteLLM_AuditLog | Tracks changes to system configuration. Records who made changes and what was modified. Maintains history of updates to teams, users, and models. | **Off by default**, **High - when enabled** |

-## Disable `LiteLLM_SpendLogs` & `LiteLLM_ErrorLogs`
+## How to Disable `LiteLLM_SpendLogs`

-You can disable spend_logs and error_logs by setting `disable_spend_logs` and `disable_error_logs` to `True` on the `general_settings` section of your proxy_config.yaml file.
+You can disable spend_logs by setting `disable_spend_logs` to `True` in the `general_settings` section of your proxy_config.yaml file.

 ```yaml
 general_settings:
-  disable_spend_logs: True   # Disable writing spend logs to DB
-  disable_error_logs: True   # Disable writing error logs to DB
+  disable_spend_logs: True
 ```

-### What is the impact of disabling these logs?

-When disabling spend logs (`disable_spend_logs: True`):
+### What is the impact of disabling `LiteLLM_SpendLogs`?
+
 - You **will not** be able to view Usage on the LiteLLM UI
 - You **will** continue seeing cost metrics on s3, Prometheus, Langfuse (any other Logging integration you are using)

-When disabling error logs (`disable_error_logs: True`):
- You **will not** be able to view Errors on the LiteLLM UI
- You **will** continue seeing error logs in your application logs and any other logging integrations you are using
--- a/docs/my-website/docs/proxy/load_balancing.md
+++ b/docs/my-website/docs/proxy/load_balancing.md
@ -1,4 +1,4 @@
-# Proxy - Load Balancing
+# Multiple Instances
 Load balance multiple instances of the same model

 The proxy will handle routing requests (using LiteLLM's Router). **Set `rpm` in the config if you want maximize throughput**
--- a/docs/my-website/docs/proxy/prod.md
+++ b/docs/my-website/docs/proxy/prod.md
@ -23,7 +23,6 @@ general_settings:

  # OPTIONAL Best Practices
  disable_spend_logs: True # turn off writing each transaction to the db. We recommend doing this is you don't need to see Usage on the LiteLLM UI and are tracking metrics via Prometheus
-  disable_error_logs: True # turn off writing LLM Exceptions to DB
  allow_requests_on_db_unavailable: True # Only USE when running LiteLLM on your VPC. Allow requests to still be processed even if the DB is unavailable. We recommend doing this if you're running LiteLLM on VPC that cannot be accessed from the public internet.

 litellm_settings:
@ -103,22 +102,17 @@ general_settings:
  allow_requests_on_db_unavailable: True
 ```

-## 6. Disable spend_logs & error_logs if not using the LiteLLM UI
+## 6. Disable spend_logs if you're not using the LiteLLM UI

-By default, LiteLLM writes several types of logs to the database:
- Every LLM API request to the `LiteLLM_SpendLogs` table
- LLM Exceptions to the `LiteLLM_LogsErrors` table
+By default, LiteLLM writes every request to the `LiteLLM_SpendLogs` table. This table is used for viewing Usage on the LiteLLM UI.

-If you're not viewing these logs on the LiteLLM UI (most users use Prometheus for monitoring), you can disable them by setting the following flags to `True`:
+If you're not viewing Usage on the LiteLLM UI (most users use Prometheus when this is disabled), you can disable spend_logs by setting `disable_spend_logs` to `True`.

 ```yaml
 general_settings:
-  disable_spend_logs: True    # Disable writing spend logs to DB
-  disable_error_logs: True    # Disable writing error logs to DB
+  disable_spend_logs: True
 ```

-[More information about what the Database is used for here](db_info)
-
 ## 7. Use Helm PreSync Hook for Database Migrations [BETA]

 To ensure only one service manages database migrations, use our [Helm PreSync hook for Database Migrations](https://github.com/BerriAI/litellm/blob/main/deploy/charts/litellm-helm/templates/migrations-job.yaml). This ensures migrations are handled during `helm upgrade` or `helm install`, while LiteLLM pods explicitly disable migrations.
--- a/docs/my-website/docs/proxy/prometheus.md
+++ b/docs/my-website/docs/proxy/prometheus.md
@ -192,13 +192,3 @@ Here is a screenshot of the metrics you can monitor with the LiteLLM Grafana Das
 |----------------------|--------------------------------------|
 | `litellm_llm_api_failed_requests_metric`             | **deprecated** use `litellm_proxy_failed_requests_metric` |
 | `litellm_requests_metric`             | **deprecated** use `litellm_proxy_total_requests_metric` |
-
-
-## FAQ 
-
-### What are `_created` vs. `_total` metrics?
-
- `_created` metrics are metrics that are created when the proxy starts
- `_total` metrics are metrics that are incremented for each request
-
-You should consume the `_total` metrics for your counting purposes
--- a/docs/my-website/docs/proxy/provider_budget_routing.md
+++ b/docs/my-website/docs/proxy/provider_budget_routing.md
@ -16,27 +16,25 @@ model_list:
        api_key: os.environ/OPENAI_API_KEY

 router_settings:
+  redis_host: <your-redis-host>
+  redis_password: <your-redis-password>
+  redis_port: <your-redis-port>
  provider_budget_config: 
-    openai: 
-      budget_limit: 0.000000000001 # float of $ value budget for time period
-      time_period: 1d # can be 1d, 2d, 30d, 1mo, 2mo
-    azure:
-      budget_limit: 100
-      time_period: 1d
-    anthropic:
-      budget_limit: 100
-      time_period: 10d
-    vertex_ai:
-      budget_limit: 100
-      time_period: 12d
-    gemini:
-      budget_limit: 100
-      time_period: 12d
-  
-  # OPTIONAL: Set Redis Host, Port, and Password if using multiple instance of LiteLLM
-  redis_host: os.environ/REDIS_HOST
-  redis_port: os.environ/REDIS_PORT
-  redis_password: os.environ/REDIS_PASSWORD
+    openai:
+      budget_limit: 0.000000000001 # float of $ value budget for time period
+      time_period: 1d # can be 1d, 2d, 30d
+    azure:
+      budget_limit: 100
+      time_period: 1d
+    anthropic:
+      budget_limit: 100
+      time_period: 10d
+    vertex_ai:
+      budget_limit: 100
+      time_period: 12d
+    gemini:
+      budget_limit: 100
+      time_period: 12d

 general_settings:
  master_key: sk-1234
@ -114,11 +112,8 @@ Expected response on failure
   - If all providers exceed budget, raises an error

 3. **Supported Time Periods**:
-   - Seconds: "Xs" (e.g., "30s")
-   - Minutes: "Xm" (e.g., "10m")
-   - Hours: "Xh" (e.g., "24h")
-   - Days: "Xd" (e.g., "1d", "30d")
-   - Months: "Xmo" (e.g., "1mo", "2mo")
+   - Format: "Xd" where X is number of days
+   - Examples: "1d" (1 day), "30d" (30 days)

 4. **Requirements**:
   - Redis required for tracking spend across instances
@ -134,31 +129,6 @@ This metric indicates the remaining budget for a provider in dollars (USD)
 litellm_provider_remaining_budget_metric{api_provider="openai"} 10
 ```

-## Multi-instance setup
-
-If you are using a multi-instance setup, you will need to set the Redis host, port, and password in the `proxy_config.yaml` file. Redis is used to sync the spend across LiteLLM instances.
-
-```yaml
-model_list:
-    - model_name: gpt-3.5-turbo
-      litellm_params:
-        model: openai/gpt-3.5-turbo
-        api_key: os.environ/OPENAI_API_KEY
-
-router_settings:
-  provider_budget_config: 
-    openai: 
-      budget_limit: 0.000000000001 # float of $ value budget for time period
-      time_period: 1d # can be 1d, 2d, 30d, 1mo, 2mo
-  
-  # 👇 Add this: Set Redis Host, Port, and Password if using multiple instance of LiteLLM
-  redis_host: os.environ/REDIS_HOST
-  redis_port: os.environ/REDIS_PORT
-  redis_password: os.environ/REDIS_PASSWORD
-
-general_settings:
-  master_key: sk-1234
-```

 ## Spec for provider_budget_config

@ -166,12 +136,7 @@ The `provider_budget_config` is a dictionary where:
 - **Key**: Provider name (string) - Must be a valid [LiteLLM provider name](https://docs.litellm.ai/docs/providers)
 - **Value**: Budget configuration object with the following parameters:
  - `budget_limit`: Float value representing the budget in USD
-  - `time_period`: Duration string in one of the following formats:
-    - Seconds: `"Xs"` (e.g., "30s")
-    - Minutes: `"Xm"` (e.g., "10m")
-    - Hours: `"Xh"` (e.g., "24h")
-    - Days: `"Xd"` (e.g., "1d", "30d")
-    - Months: `"Xmo"` (e.g., "1mo", "2mo")
+  - `time_period`: String in the format "Xd" where X is the number of days (e.g., "1d", "30d")

 Example structure:
 ```yaml
@ -182,10 +147,4 @@ provider_budget_config:
  azure:
    budget_limit: 500.0    # $500 USD
    time_period: "30d"     # 30 day period
-  anthropic:
-    budget_limit: 200.0    # $200 USD
-    time_period: "1mo"     # 1 month period
-  gemini:
-    budget_limit: 50.0     # $50 USD
-    time_period: "24h"     # 24 hour period
 ```
--- a/docs/my-website/docs/proxy/reliability.md
+++ b/docs/my-website/docs/proxy/reliability.md
@ -2,7 +2,7 @@ import Image from '@theme/IdealImage';
 import Tabs from '@theme/Tabs';
 import TabItem from '@theme/TabItem';

-# Proxy - Fallbacks, Retries
+# Fallbacks, Load Balancing, Retries

 - Quick Start [load balancing](#test---load-balancing)
 - Quick Start [client side fallbacks](#test---client-side-fallbacks)
--- a/docs/my-website/docs/proxy/self_serve.md
+++ b/docs/my-website/docs/proxy/self_serve.md
@ -217,10 +217,4 @@ litellm_settings:
    max_parallel_requests: 1000 # (Optional[int], optional): Max number of requests that can be made in parallel. Defaults to None.
    tpm_limit: 1000 #(Optional[int], optional): Tpm limit. Defaults to None.
    rpm_limit: 1000 #(Optional[int], optional): Rpm limit. Defaults to None.
-
-  key_generation_settings: # Restricts who can generate keys. [Further docs](./virtual_keys.md#restricting-key-generation)
-    team_key_generation:
-      allowed_team_member_roles: ["admin"]
-    personal_key_generation: # maps to 'Default Team' on UI 
-      allowed_user_roles: ["proxy_admin"]
-```
+```
--- a/docs/my-website/docs/proxy/team_based_routing.md
+++ b/docs/my-website/docs/proxy/team_based_routing.md
@ -1,4 +1,4 @@
-# Team-based Routing
+# 👥 Team-based Routing

 ## Routing
 Route calls to different model groups based on the team-id
--- a/docs/my-website/docs/proxy/virtual_keys.md
+++ b/docs/my-website/docs/proxy/virtual_keys.md
@ -811,78 +811,6 @@ litellm_settings:
    team_id: "core-infra"
 ```

-### Restricting Key Generation
-
-Use this to control who can generate keys. Useful when letting others create keys on the UI. 
-
-```yaml
-litellm_settings:
-  key_generation_settings:
-    team_key_generation:
-      allowed_team_member_roles: ["admin"]
-      required_params: ["tags"] # require team admins to set tags for cost-tracking when generating a team key
-    personal_key_generation: # maps to 'Default Team' on UI 
-      allowed_user_roles: ["proxy_admin"]
-```
-
-#### Spec 
-
-```python
-class TeamUIKeyGenerationConfig(TypedDict):
-    allowed_team_member_roles: List[str]
-    required_params: List[str] # require params on `/key/generate` to be set if a team key (team_id in request) is being generated
-
-
-class PersonalUIKeyGenerationConfig(TypedDict):
-    allowed_user_roles: List[LitellmUserRoles] 
-    required_params: List[str] # require params on `/key/generate` to be set if a personal key (no team_id in request) is being generated
-
-
-class StandardKeyGenerationConfig(TypedDict, total=False):
-    team_key_generation: TeamUIKeyGenerationConfig
-    personal_key_generation: PersonalUIKeyGenerationConfig
-
-
-class LitellmUserRoles(str, enum.Enum):
-    """
-    Admin Roles:
-    PROXY_ADMIN: admin over the platform
-    PROXY_ADMIN_VIEW_ONLY: can login, view all own keys, view all spend
-    ORG_ADMIN: admin over a specific organization, can create teams, users only within their organization
-
-    Internal User Roles:
-    INTERNAL_USER: can login, view/create/delete their own keys, view their spend
-    INTERNAL_USER_VIEW_ONLY: can login, view their own keys, view their own spend
-
-
-    Team Roles:
-    TEAM: used for JWT auth
-
-
-    Customer Roles:
-    CUSTOMER: External users -> these are customers
-
-    """
-
-    # Admin Roles
-    PROXY_ADMIN = "proxy_admin"
-    PROXY_ADMIN_VIEW_ONLY = "proxy_admin_viewer"
-
-    # Organization admins
-    ORG_ADMIN = "org_admin"
-
-    # Internal User Roles
-    INTERNAL_USER = "internal_user"
-    INTERNAL_USER_VIEW_ONLY = "internal_user_viewer"
-
-    # Team Roles
-    TEAM = "team"
-
-    # Customer Roles - External users of proxy
-    CUSTOMER = "customer"
-```
-
-
 ## **Next Steps - Set Budgets, Rate Limits per Virtual Key**

 [Follow this doc to set budgets, rate limiters per virtual key with LiteLLM](users)
--- a/docs/my-website/docs/router_architecture.md
+++ b/docs/my-website/docs/router_architecture.md
@ -1,24 +0,0 @@
-import Image from '@theme/IdealImage';
-import Tabs from '@theme/Tabs';
-import TabItem from '@theme/TabItem';
-
-# Router Architecture (Fallbacks / Retries)
-
-## High Level architecture
-
-<Image img={require('../img/router_architecture.png')} style={{ width: '100%', maxWidth: '4000px' }} />
-
-### Request Flow 
-
-1. **User Sends Request**: The process begins when a user sends a request to the LiteLLM Router endpoint. All unified endpoints (`.completion`, `.embeddings`, etc) are supported by LiteLLM Router.
-
-2. **function_with_fallbacks**: The initial request is sent to the `function_with_fallbacks` function. This function wraps the initial request in a try-except block, to handle any exceptions - doing fallbacks if needed. This request is then sent to the `function_with_retries` function.
-
-
-3. **function_with_retries**: The `function_with_retries` function wraps the request in a try-except block and passes the initial request to a base litellm unified function (`litellm.completion`, `litellm.embeddings`, etc) to handle LLM API calling. `function_with_retries` handles any exceptions - doing retries on the model group if needed (i.e. if the request fails, it will retry on an available model within the model group). 
-
-4. **litellm.completion**: The `litellm.completion` function is a base function that handles the LLM API calling. It is used by `function_with_retries` to make the actual request to the LLM API.
-
-## Legend 
-
-**model_group**: A group of LLM API deployments that share the same `model_name`, are part of the same `model_group`, and can be load balanced across.
--- a/docs/my-website/docs/routing.md
+++ b/docs/my-website/docs/routing.md
@ -1891,22 +1891,3 @@ router = Router(
    debug_level="DEBUG"  # defaults to INFO
 )
 ```
-
-## Router General Settings
-
-### Usage 
-
-```python
-router = Router(model_list=..., router_general_settings=RouterGeneralSettings(async_only_mode=True))
-```
-
-### Spec 
-```python
-class RouterGeneralSettings(BaseModel):
-    async_only_mode: bool = Field(
-        default=False
-    )  # this will only initialize async clients. Good for memory utils
-    pass_through_all_models: bool = Field(
-        default=False
-    )  # if passed a model not llm_router model list, pass through the request to litellm.acompletion/embedding
-```
--- a/docs/my-website/docs/text_completion.md
+++ b/docs/my-website/docs/text_completion.md
@ -1,174 +0,0 @@
-import Tabs from '@theme/Tabs';
-import TabItem from '@theme/TabItem';
-
-# Text Completion
-
-### Usage
-<Tabs>
-<TabItem value="python" label="LiteLLM Python SDK">
-
-```python
-from litellm import text_completion
-
-response = text_completion(
-    model="gpt-3.5-turbo-instruct",
-    prompt="Say this is a test",
-    max_tokens=7
-)
-```
-
-</TabItem>
-<TabItem value="proxy" label="LiteLLM Proxy Server">
-
-1. Define models on config.yaml
-
-```yaml
-model_list:
-  - model_name: gpt-3.5-turbo-instruct
-    litellm_params:
-      model: text-completion-openai/gpt-3.5-turbo-instruct # The `text-completion-openai/` prefix will call openai.completions.create
-      api_key: os.environ/OPENAI_API_KEY
-  - model_name: text-davinci-003
-    litellm_params:
-      model: text-completion-openai/text-davinci-003
-      api_key: os.environ/OPENAI_API_KEY
-```
-
-2. Start litellm proxy server 
-
-```
-litellm --config config.yaml
-```
-
-<Tabs>
-<TabItem value="python" label="OpenAI Python SDK">
-
-```python
-from openai import OpenAI
-
-# set base_url to your proxy server
-# set api_key to send to proxy server
-client = OpenAI(api_key="<proxy-api-key>", base_url="http://0.0.0.0:4000")
-
-response = client.completions.create(
-    model="gpt-3.5-turbo-instruct",
-    prompt="Say this is a test",
-    max_tokens=7
-)
-
-print(response)
-```
-</TabItem>
-
-<TabItem value="curl" label="Curl Request">
-
-```shell
-curl --location 'http://0.0.0.0:4000/completions' \
-    --header 'Content-Type: application/json' \
-    --header 'Authorization: Bearer sk-1234' \
-    --data '{
-        "model": "gpt-3.5-turbo-instruct",
-        "prompt": "Say this is a test",
-        "max_tokens": 7
-    }'
-```
-</TabItem>
-</Tabs>
-
-</TabItem>
-</Tabs>
-
-## Input Params
-
-LiteLLM accepts and translates the [OpenAI Text Completion params](https://platform.openai.com/docs/api-reference/completions) across all supported providers.
-
-### Required Fields
-
- `model`: *string* - ID of the model to use
- `prompt`: *string or array* - The prompt(s) to generate completions for
-
-### Optional Fields
-
- `best_of`: *integer* - Generates best_of completions server-side and returns the "best" one
- `echo`: *boolean* - Echo back the prompt in addition to the completion.
- `frequency_penalty`: *number* - Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency.
- `logit_bias`: *map* - Modify the likelihood of specified tokens appearing in the completion
- `logprobs`: *integer* - Include the log probabilities on the logprobs most likely tokens. Max value of 5
- `max_tokens`: *integer* - The maximum number of tokens to generate.
- `n`: *integer* - How many completions to generate for each prompt.
- `presence_penalty`: *number* - Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far.
- `seed`: *integer* - If specified, system will attempt to make deterministic samples
- `stop`: *string or array* - Up to 4 sequences where the API will stop generating tokens
- `stream`: *boolean* - Whether to stream back partial progress. Defaults to false
- `suffix`: *string* - The suffix that comes after a completion of inserted text
- `temperature`: *number* - What sampling temperature to use, between 0 and 2. 
- `top_p`: *number* - An alternative to sampling with temperature, called nucleus sampling. 
- `user`: *string* - A unique identifier representing your end-user
-
-## Output Format
-Here's the exact JSON output format you can expect from completion calls:
-
-
-[**Follows OpenAI's output format**](https://platform.openai.com/docs/api-reference/completions/object)
-
-<Tabs>
-
-<TabItem value="non-streaming" label="Non-Streaming Response">
-
-```python
-{
-  "id": "cmpl-uqkvlQyYK7bGYrRHQ0eXlWi7",
-  "object": "text_completion",
-  "created": 1589478378,
-  "model": "gpt-3.5-turbo-instruct",
-  "system_fingerprint": "fp_44709d6fcb",
-  "choices": [
-    {
-      "text": "\n\nThis is indeed a test",
-      "index": 0,
-      "logprobs": null,
-      "finish_reason": "length"
-    }
-  ],
-  "usage": {
-    "prompt_tokens": 5,
-    "completion_tokens": 7,
-    "total_tokens": 12
-  }
-}
-
-```
-</TabItem>
-<TabItem value="streaming" label="Streaming Response">
-
-```python
-{
-  "id": "cmpl-7iA7iJjj8V2zOkCGvWF2hAkDWBQZe",
-  "object": "text_completion",
-  "created": 1690759702,
-  "choices": [
-    {
-      "text": "This",
-      "index": 0,
-      "logprobs": null,
-      "finish_reason": null
-    }
-  ],
-  "model": "gpt-3.5-turbo-instruct"
-  "system_fingerprint": "fp_44709d6fcb",
-}
-
-```
-
-</TabItem>
-</Tabs>
-
-
-## **Supported Providers**
-
-| Provider    | Link to Usage      |
-|-------------|--------------------|
-| OpenAI      |   [Usage](../docs/providers/text_completion_openai)                 | 
-| Azure OpenAI|   [Usage](../docs/providers/azure)                 |  
-
-
--- a/docs/my-website/docs/wildcard_routing.md
+++ b/docs/my-website/docs/wildcard_routing.md
@ -1,140 +0,0 @@
-import Tabs from '@theme/Tabs';
-import TabItem from '@theme/TabItem';
-
-# Provider specific Wildcard routing 
-
-**Proxy all models from a provider**
-
-Use this if you want to **proxy all models from a specific provider without defining them on the config.yaml**
-
-## Step 1. Define provider specific routing 
-
-<Tabs>
-<TabItem value="sdk" label="SDK">
-
-```python
-from litellm import Router
-
-router = Router(
-    model_list=[
-        {
-            "model_name": "anthropic/*",
-            "litellm_params": {
-                "model": "anthropic/*",
-                "api_key": os.environ["ANTHROPIC_API_KEY"]
-            }
-        }, 
-        {
-            "model_name": "groq/*",
-            "litellm_params": {
-                "model": "groq/*",
-                "api_key": os.environ["GROQ_API_KEY"]
-            }
-        }, 
-        {
-            "model_name": "fo::*:static::*", # all requests matching this pattern will be routed to this deployment, example: model="fo::hi::static::hi" will be routed to deployment: "openai/fo::*:static::*"
-            "litellm_params": {
-                "model": "openai/fo::*:static::*",
-                "api_key": os.environ["OPENAI_API_KEY"]
-            }
-        }
-    ]
-)
-```
-
-</TabItem>
-<TabItem value="proxy" label="PROXY">
-
-**Step 1** - define provider specific routing on config.yaml
-```yaml
-model_list:
-  # provider specific wildcard routing
-  - model_name: "anthropic/*"
-    litellm_params:
-      model: "anthropic/*"
-      api_key: os.environ/ANTHROPIC_API_KEY
-  - model_name: "groq/*"
-    litellm_params:
-      model: "groq/*"
-      api_key: os.environ/GROQ_API_KEY
-  - model_name: "fo::*:static::*" # all requests matching this pattern will be routed to this deployment, example: model="fo::hi::static::hi" will be routed to deployment: "openai/fo::*:static::*"
-    litellm_params:
-      model: "openai/fo::*:static::*"
-      api_key: os.environ/OPENAI_API_KEY
-```
-</TabItem>
-</Tabs>
-
-## [PROXY-Only] Step 2 - Run litellm proxy 
-
-```shell
-$ litellm --config /path/to/config.yaml
-```
-
-## Step 3 - Test it 
-
-<Tabs>  
-<TabItem value="sdk" label="SDK">
-
-```python
-from litellm import Router
-
-router = Router(model_list=...)
-
-# Test with `anthropic/` - all models with `anthropic/` prefix will get routed to `anthropic/*`
-resp = completion(model="anthropic/claude-3-sonnet-20240229", messages=[{"role": "user", "content": "Hello, Claude!"}])
-print(resp)
-
-# Test with `groq/` - all models with `groq/` prefix will get routed to `groq/*`
-resp = completion(model="groq/llama3-8b-8192", messages=[{"role": "user", "content": "Hello, Groq!"}])
-print(resp)
-
-# Test with `fo::*::static::*` - all requests matching this pattern will be routed to `openai/fo::*:static::*`
-resp = completion(model="fo::hi::static::hi", messages=[{"role": "user", "content": "Hello, Claude!"}])
-print(resp)
-```
-
-</TabItem>
-<TabItem value="proxy" label="PROXY">
-
-Test with `anthropic/` - all models with `anthropic/` prefix will get routed to `anthropic/*`
-```bash
-curl http://localhost:4000/v1/chat/completions \
-  -H "Content-Type: application/json" \
-  -H "Authorization: Bearer sk-1234" \
-  -d '{
-    "model": "anthropic/claude-3-sonnet-20240229",
-    "messages": [
-      {"role": "user", "content": "Hello, Claude!"}
-    ]
-  }'
-```
-
-Test with `groq/` - all models with `groq/` prefix will get routed to `groq/*`
-```shell
-curl http://localhost:4000/v1/chat/completions \
-  -H "Content-Type: application/json" \
-  -H "Authorization: Bearer sk-1234" \
-  -d '{
-    "model": "groq/llama3-8b-8192",
-    "messages": [
-      {"role": "user", "content": "Hello, Claude!"}
-    ]
-  }'
-```
-
-Test with `fo::*:static::*` - all requests matching this pattern will be routed to `openai/fo::*:static::*`
-```shell
-curl http://localhost:4000/v1/chat/completions \
-  -H "Content-Type: application/json" \
-  -H "Authorization: Bearer sk-1234" \
-  -d '{
-    "model": "fo::hi::static::hi",
-    "messages": [
-      {"role": "user", "content": "Hello, Claude!"}
-    ]
-  }'
-```
-
-</TabItem>
-</Tabs>
--- a/docs/my-website/img/router_architecture.png
+++ b/docs/my-website/img/router_architecture.png
--- a/docs/my-website/sidebars.js
+++ b/docs/my-website/sidebars.js
@ -29,17 +29,13 @@ const sidebars = {
      },
      items: [
        "proxy/docker_quick_start", 
-        {
-          "type": "category", 
-          "label": "Config.yaml",
-          "items": ["proxy/configs", "proxy/config_management", "proxy/config_settings"]
-        },
        {
          type: "category",
          label: "Setup & Deployment",
          items: [
            "proxy/deploy", 
            "proxy/prod", 
+            "proxy/configs", 
            "proxy/cli",
            "proxy/model_management",
            "proxy/health",
@ -51,7 +47,7 @@ const sidebars = {
        {
          type: "category",
          label: "Architecture",
-          items: ["proxy/architecture", "proxy/db_info", "router_architecture"],
+          items: ["proxy/architecture", "proxy/db_info"],
        }, 
        {
          type: "link",
@ -100,10 +96,11 @@ const sidebars = {
          label: "Spend Tracking + Budgets",
          items: ["proxy/cost_tracking", "proxy/users", "proxy/custom_pricing", "proxy/team_budgets", "proxy/billing", "proxy/customers"],
        },
+        "proxy/reliability",
        {
-          type: "link",
-          label: "Load Balancing, Routing, Fallbacks",
-          href: "https://docs.litellm.ai/docs/routing-load-balancing",
+          type: "category",
+          label: "Routing",
+          items: ["proxy/load_balancing", "proxy/tag_routing", "proxy/provider_budget_routing", "proxy/team_based_routing", "proxy/customer_routing",],
        },
        {
          type: "category",
@ -202,31 +199,6 @@ const sidebars = {
        
      ],
    },
-    {
-      type: "category",
-      label: "Guides",
-      items: [
-        "exception_mapping",
-        "completion/provider_specific_params",
-        "guides/finetuned_models",
-        "completion/audio",
-        "completion/vision",
-        "completion/json_mode",
-        "completion/prompt_caching",
-        "completion/predict_outputs",
-        "completion/prefix",
-        "completion/drop_params",
-        "completion/prompt_formatting",
-        "completion/stream",
-        "completion/message_trimming",
-        "completion/function_call",
-        "completion/model_alias",
-        "completion/batching",
-        "completion/mock_requests",
-        "completion/reliable_completions",
-        
-      ]
-    },
    {
      type: "category",
      label: "Supported Endpoints",
@ -242,11 +214,27 @@ const sidebars = {
          },
          items: [
            "completion/input",
+            "completion/provider_specific_params",
+            "completion/json_mode",
+            "completion/prompt_caching",
+            "completion/audio",
+            "completion/vision",
+            "completion/predict_outputs",
+            "completion/prefix",
+            "completion/drop_params",
+            "completion/prompt_formatting",
            "completion/output",
            "completion/usage",
+            "exception_mapping",
+            "completion/stream",
+            "completion/message_trimming",
+            "completion/function_call",
+            "completion/model_alias",
+            "completion/batching",
+            "completion/mock_requests",
+            "completion/reliable_completions",
          ],
        },
-        "text_completion",
        "embedding/supported_embedding",
        "image_generation",
        {
@ -262,7 +250,6 @@ const sidebars = {
        "batches",
        "realtime",
        "fine_tuning",
-        "moderation",
        {
          type: "link",
          label: "Use LiteLLM Proxy with Vertex, Bedrock SDK",
@ -272,14 +259,8 @@ const sidebars = {
    },
    {
      type: "category",
-      label: "Routing, Loadbalancing & Fallbacks",
-      link: {
-        type: "generated-index",
-        title: "Routing, Loadbalancing & Fallbacks",
-        description: "Learn how to load balance, route, and set fallbacks for your LLM requests",
-        slug: "/routing-load-balancing",
-      },
-      items: ["routing", "scheduler", "proxy/load_balancing", "proxy/reliability", "proxy/tag_routing", "proxy/provider_budget_routing", "proxy/team_based_routing", "proxy/customer_routing", "wildcard_routing"],
+      label: "Load Balancing",
+      items: ["routing", "scheduler"],
    },
    {
      type: "category",
--- a/enterprise/utils.py
+++ b/enterprise/utils.py
@ -2,9 +2,7 @@
 from typing import Optional, List
 from litellm._logging import verbose_logger
 from litellm.proxy.proxy_server import PrismaClient, HTTPException
-from litellm.llms.custom_httpx.http_handler import HTTPHandler
 import collections
-import httpx
 from datetime import datetime


@ -116,6 +114,7 @@ async def ui_get_spend_by_tags(


 def _forecast_daily_cost(data: list):
+    import requests  # type: ignore
    from datetime import datetime, timedelta

    if len(data) == 0:
@ -137,17 +136,17 @@ def _forecast_daily_cost(data: list):

    print("last entry date", last_entry_date)

+    # Assuming today_date is a datetime object
+    today_date = datetime.now()
+
    # Calculate the last day of the month
    last_day_of_todays_month = datetime(
        today_date.year, today_date.month % 12 + 1, 1
    ) - timedelta(days=1)

-    print("last day of todays month", last_day_of_todays_month)
    # Calculate the remaining days in the month
    remaining_days = (last_day_of_todays_month - last_entry_date).days

-    print("remaining days", remaining_days)
-
    current_spend_this_month = 0
    series = {}
    for entry in data:
@ -177,19 +176,13 @@ def _forecast_daily_cost(data: list):
        "Content-Type": "application/json",
    }

-    client = HTTPHandler()
-
-    try:
-        response = client.post(
-            url="https://trend-api-production.up.railway.app/forecast",
-            json=payload,
-            headers=headers,
-        )
-    except httpx.HTTPStatusError as e:
-        raise HTTPException(
-            status_code=500,
-            detail={"error": f"Error getting forecast: {e.response.text}"},
-        )
+    response = requests.post(
+        url="https://trend-api-production.up.railway.app/forecast",
+        json=payload,
+        headers=headers,
+    )
+    # check the status code
+    response.raise_for_status()

    json_response = response.json()
    forecast_data = json_response["forecast"]
@ -213,3 +206,13 @@ def _forecast_daily_cost(data: list):
        f"Predicted Spend for { today_month } 2024, ${total_predicted_spend}"
    )
    return {"response": response_data, "predicted_spend": predicted_spend}
+
+    # print(f"Date: {entry['date']}, Spend: {entry['spend']}, Response: {response.text}")
+
+
+# _forecast_daily_cost(
+#     [
+#         {"date": "2022-01-01", "spend": 100},
+
+#     ]
+# )
--- a/litellm/init.py
+++ b/litellm/init.py
@ -24,7 +24,6 @@ from litellm.proxy._types import (
    KeyManagementSettings,
    LiteLLM_UpperboundKeyGenerateParams,
 )
-from litellm.types.utils import StandardKeyGenerationConfig
 import httpx
 import dotenv
 from enum import Enum
@ -68,7 +67,6 @@ callbacks: List[Union[Callable, _custom_logger_compatible_callbacks_literal]] =
 langfuse_default_tags: Optional[List[str]] = None
 langsmith_batch_size: Optional[int] = None
 argilla_batch_size: Optional[int] = None
-datadog_use_v1: Optional[bool] = False  # if you want to use v1 datadog logged payload
 argilla_transformation_object: Optional[Dict[str, Any]] = None
 _async_input_callback: List[Callable] = (
    []
@ -275,7 +273,6 @@ s3_callback_params: Optional[Dict] = None
 generic_logger_headers: Optional[Dict] = None
 default_key_generate_params: Optional[Dict] = None
 upperbound_key_generate_params: Optional[LiteLLM_UpperboundKeyGenerateParams] = None
-key_generation_settings: Optional[StandardKeyGenerationConfig] = None
 default_internal_user_params: Optional[Dict] = None
 default_team_settings: Optional[List] = None
 max_user_budget: Optional[float] = None
@ -283,7 +280,6 @@ default_max_internal_user_budget: Optional[float] = None
 max_internal_user_budget: Optional[float] = None
 internal_user_budget_duration: Optional[str] = None
 max_end_user_budget: Optional[float] = None
-disable_end_user_cost_tracking: Optional[bool] = None
 #### REQUEST PRIORITIZATION ####
 priority_reservation: Optional[Dict[str, float]] = None
 #### RELIABILITY ####
--- a/litellm/_redis.py
+++ b/litellm/_redis.py
@ -313,13 +313,12 @@ def get_redis_async_client(**env_overrides) -> async_redis.Redis:

 def get_redis_connection_pool(**env_overrides):
    redis_kwargs = _get_redis_client_logic(**env_overrides)
-    verbose_logger.debug("get_redis_connection_pool: redis_kwargs", redis_kwargs)
    if "url" in redis_kwargs and redis_kwargs["url"] is not None:
        return async_redis.BlockingConnectionPool.from_url(
            timeout=5, url=redis_kwargs["url"]
        )
    connection_class = async_redis.Connection
-    if "ssl" in redis_kwargs:
+    if "ssl" in redis_kwargs and redis_kwargs["ssl"] is not None:
        connection_class = async_redis.SSLConnection
        redis_kwargs.pop("ssl", None)
        redis_kwargs["connection_class"] = connection_class
--- a/litellm/caching/redis_cache.py
+++ b/litellm/caching/redis_cache.py
@ -20,7 +20,6 @@ from typing import TYPE_CHECKING, Any, List, Optional, Tuple
 import litellm
 from litellm._logging import print_verbose, verbose_logger
 from litellm.litellm_core_utils.core_helpers import _get_parent_otel_span_from_kwargs
-from litellm.types.caching import RedisPipelineIncrementOperation
 from litellm.types.services import ServiceLoggerPayload, ServiceTypes
 from litellm.types.utils import all_litellm_params

@ -891,92 +890,3 @@ class RedisCache(BaseCache):

    def delete_cache(self, key):
        self.redis_client.delete(key)
-
-    async def _pipeline_increment_helper(
-        self,
-        pipe: pipeline,
-        increment_list: List[RedisPipelineIncrementOperation],
-    ) -> Optional[List[float]]:
-        """Helper function for pipeline increment operations"""
-        # Iterate through each increment operation and add commands to pipeline
-        for increment_op in increment_list:
-            cache_key = self.check_and_fix_namespace(key=increment_op["key"])
-            print_verbose(
-                f"Increment ASYNC Redis Cache PIPELINE: key: {cache_key}\nValue {increment_op['increment_value']}\nttl={increment_op['ttl']}"
-            )
-            pipe.incrbyfloat(cache_key, increment_op["increment_value"])
-            if increment_op["ttl"] is not None:
-                _td = timedelta(seconds=increment_op["ttl"])
-                pipe.expire(cache_key, _td)
-        # Execute the pipeline and return results
-        results = await pipe.execute()
-        print_verbose(f"Increment ASYNC Redis Cache PIPELINE: results: {results}")
-        return results
-
-    async def async_increment_pipeline(
-        self, increment_list: List[RedisPipelineIncrementOperation], **kwargs
-    ) -> Optional[List[float]]:
-        """
-        Use Redis Pipelines for bulk increment operations
-        Args:
-            increment_list: List of RedisPipelineIncrementOperation dicts containing:
-                - key: str
-                - increment_value: float
-                - ttl: int or None - expiry in seconds; no expiry is set when None
-        """
-        # don't waste a network request if there's nothing to increment
-        if len(increment_list) == 0:
-            return None
-
-        from redis.asyncio import Redis
-
-        _redis_client: Redis = self.init_async_client()  # type: ignore
-        start_time = time.time()
-
-        print_verbose(
-            f"Increment Async Redis Cache Pipeline: increment list: {increment_list}"
-        )
-
-        try:
-            async with _redis_client as redis_client:
-                async with redis_client.pipeline(transaction=True) as pipe:
-                    results = await self._pipeline_increment_helper(
-                        pipe, increment_list
-                    )
-
-            print_verbose(f"pipeline increment results: {results}")
-
-            ## LOGGING ##
-            end_time = time.time()
-            _duration = end_time - start_time
-            asyncio.create_task(
-                self.service_logger_obj.async_service_success_hook(
-                    service=ServiceTypes.REDIS,
-                    duration=_duration,
-                    call_type="async_increment_pipeline",
-                    start_time=start_time,
-                    end_time=end_time,
-                    parent_otel_span=_get_parent_otel_span_from_kwargs(kwargs),
-                )
-            )
-            return results
-        except Exception as e:
-            ## LOGGING ##
-            end_time = time.time()
-            _duration = end_time - start_time
-            asyncio.create_task(
-                self.service_logger_obj.async_service_failure_hook(
-                    service=ServiceTypes.REDIS,
-                    duration=_duration,
-                    error=e,
-                    call_type="async_increment_pipeline",
-                    start_time=start_time,
-                    end_time=end_time,
-                    parent_otel_span=_get_parent_otel_span_from_kwargs(kwargs),
-                )
-            )
-            verbose_logger.error(
-                "LiteLLM Redis Caching: async increment_pipeline() - Got exception from REDIS %s",
-                str(e),
-            )
-            raise e
--- a/litellm/integrations/datadog/datadog.py
+++ b/litellm/integrations/datadog/datadog.py
@ -32,11 +32,9 @@ from litellm.llms.custom_httpx.http_handler import (
    get_async_httpx_client,
    httpxSpecialProvider,
 )
-from litellm.proxy._types import UserAPIKeyAuth
-from litellm.types.integrations.datadog import *
 from litellm.types.services import ServiceLoggerPayload
-from litellm.types.utils import StandardLoggingPayload

+from .types import DD_ERRORS, DatadogPayload, DataDogStatus
 from .utils import make_json_serializable

 DD_MAX_BATCH_SIZE = 1000  # max number of logs DD API can accept
@ -108,20 +106,20 @@ class DataDogLogger(CustomBatchLogger):
            verbose_logger.debug(
                "Datadog: Logging - Enters logging function for model %s", kwargs
            )
-            await self._log_async_event(kwargs, response_obj, start_time, end_time)
-
-        except Exception as e:
-            verbose_logger.exception(
-                f"Datadog Layer Error - {str(e)}\n{traceback.format_exc()}"
+            dd_payload = self.create_datadog_logging_payload(
+                kwargs=kwargs,
+                response_obj=response_obj,
+                start_time=start_time,
+                end_time=end_time,
            )
-            pass

-    async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
-        try:
+            self.log_queue.append(dd_payload)
            verbose_logger.debug(
-                "Datadog: Logging - Enters logging function for model %s", kwargs
+                f"Datadog, event added to queue. Will flush in {self.flush_interval} seconds..."
            )
-            await self._log_async_event(kwargs, response_obj, start_time, end_time)
+
+            if len(self.log_queue) >= self.batch_size:
+                await self.async_send_batch()

        except Exception as e:
            verbose_logger.exception(
@ -183,20 +181,12 @@ class DataDogLogger(CustomBatchLogger):
            verbose_logger.debug(
                "Datadog: Logging - Enters logging function for model %s", kwargs
            )
-            if litellm.datadog_use_v1 is True:
-                dd_payload = self._create_v0_logging_payload(
-                    kwargs=kwargs,
-                    response_obj=response_obj,
-                    start_time=start_time,
-                    end_time=end_time,
-                )
-            else:
-                dd_payload = self.create_datadog_logging_payload(
-                    kwargs=kwargs,
-                    response_obj=response_obj,
-                    start_time=start_time,
-                    end_time=end_time,
-                )
+            dd_payload = self.create_datadog_logging_payload(
+                kwargs=kwargs,
+                response_obj=response_obj,
+                start_time=start_time,
+                end_time=end_time,
+            )

            response = self.sync_client.post(
                url=self.intake_url,
@ -225,22 +215,6 @@ class DataDogLogger(CustomBatchLogger):
            pass
        pass

-    async def _log_async_event(self, kwargs, response_obj, start_time, end_time):
-        dd_payload = self.create_datadog_logging_payload(
-            kwargs=kwargs,
-            response_obj=response_obj,
-            start_time=start_time,
-            end_time=end_time,
-        )
-
-        self.log_queue.append(dd_payload)
-        verbose_logger.debug(
-            f"Datadog, event added to queue. Will flush in {self.flush_interval} seconds..."
-        )
-
-        if len(self.log_queue) >= self.batch_size:
-            await self.async_send_batch()
-
    def create_datadog_logging_payload(
        self,
        kwargs: Union[dict, Any],
@ -262,29 +236,73 @@ class DataDogLogger(CustomBatchLogger):
        """
        import json

-        standard_logging_object: Optional[StandardLoggingPayload] = kwargs.get(
-            "standard_logging_object", None
-        )
-        if standard_logging_object is None:
-            raise ValueError("standard_logging_object not found in kwargs")
+        litellm_params = kwargs.get("litellm_params", {})
+        metadata = (
+            litellm_params.get("metadata", {}) or {}
+        )  # if litellm_params['metadata'] == None
+        messages = kwargs.get("messages")
+        optional_params = kwargs.get("optional_params", {})
+        call_type = kwargs.get("call_type", "litellm.completion")
+        cache_hit = kwargs.get("cache_hit", False)
+        usage = response_obj["usage"]
+        id = response_obj.get("id", str(uuid.uuid4()))
+        usage = dict(usage)
+        try:
+            response_time = (end_time - start_time).total_seconds() * 1000
+        except Exception:
+            response_time = None

-        status = DataDogStatus.INFO
-        if standard_logging_object.get("status") == "failure":
-            status = DataDogStatus.ERROR
+        try:
+            response_obj = dict(response_obj)
+        except Exception:
+            response_obj = response_obj
+
+        # Clean Metadata before logging - never log raw metadata
+        # the raw metadata can contain circular references which leads to infinite recursion
+        # we clean out all extra litellm metadata params before logging
+        clean_metadata = {}
+        if isinstance(metadata, dict):
+            for key, value in metadata.items():
+                # clean litellm metadata before logging
+                if key in [
+                    "endpoint",
+                    "caching_groups",
+                    "previous_models",
+                ]:
+                    continue
+                else:
+                    clean_metadata[key] = value

        # Build the initial payload
-        make_json_serializable(standard_logging_object)
-        json_payload = json.dumps(standard_logging_object)
+        payload = {
+            "id": id,
+            "call_type": call_type,
+            "cache_hit": cache_hit,
+            "start_time": start_time,
+            "end_time": end_time,
+            "response_time": response_time,
+            "model": kwargs.get("model", ""),
+            "user": kwargs.get("user", ""),
+            "model_parameters": optional_params,
+            "spend": kwargs.get("response_cost", 0),
+            "messages": messages,
+            "response": response_obj,
+            "usage": usage,
+            "metadata": clean_metadata,
+        }
+
+        make_json_serializable(payload)
+        json_payload = json.dumps(payload)

        verbose_logger.debug("Datadog: Logger - Logging payload = %s", json_payload)

        dd_payload = DatadogPayload(
-            ddsource=self._get_datadog_source(),
-            ddtags=self._get_datadog_tags(),
-            hostname=self._get_datadog_hostname(),
+            ddsource=os.getenv("DD_SOURCE", "litellm"),
+            ddtags="",
+            hostname="",
            message=json_payload,
-            service=self._get_datadog_service(),
-            status=status,
+            service="litellm-server",
+            status=DataDogStatus.INFO,
        )
        return dd_payload

@ -364,140 +382,3 @@ class DataDogLogger(CustomBatchLogger):
        No user has asked for this so far, and it might be spammy on Datadog. If the need arises, we can implement this.
        """
        return
-
-    async def async_post_call_failure_hook(
-        self,
-        request_data: dict,
-        original_exception: Exception,
-        user_api_key_dict: UserAPIKeyAuth,
-    ):
-        """
-        Handles Proxy Errors (not-related to LLM API), ex: Authentication Errors
-        """
-        import json
-
-        _exception_payload = DatadogProxyFailureHookJsonMessage(
-            exception=str(original_exception),
-            error_class=str(original_exception.__class__.__name__),
-            status_code=getattr(original_exception, "status_code", None),
-            traceback=traceback.format_exc(),
-            user_api_key_dict=user_api_key_dict.model_dump(),
-        )
-
-        json_payload = json.dumps(_exception_payload)
-        verbose_logger.debug("Datadog: Logger - Logging payload = %s", json_payload)
-        dd_payload = DatadogPayload(
-            ddsource=self._get_datadog_source(),
-            ddtags=self._get_datadog_tags(),
-            hostname=self._get_datadog_hostname(),
-            message=json_payload,
-            service=self._get_datadog_service(),
-            status=DataDogStatus.ERROR,
-        )
-
-        self.log_queue.append(dd_payload)
-
-    def _create_v0_logging_payload(
-        self,
-        kwargs: Union[dict, Any],
-        response_obj: Any,
-        start_time: datetime.datetime,
-        end_time: datetime.datetime,
-    ) -> DatadogPayload:
-        """
-        Note: This is our V1 Version of DataDog Logging Payload
-
-
-        (Not Recommended) If you want this to get logged set `litellm.datadog_use_v1 = True`
-        """
-        import json
-
-        litellm_params = kwargs.get("litellm_params", {})
-        metadata = (
-            litellm_params.get("metadata", {}) or {}
-        )  # if litellm_params['metadata'] == None
-        messages = kwargs.get("messages")
-        optional_params = kwargs.get("optional_params", {})
-        call_type = kwargs.get("call_type", "litellm.completion")
-        cache_hit = kwargs.get("cache_hit", False)
-        usage = response_obj["usage"]
-        id = response_obj.get("id", str(uuid.uuid4()))
-        usage = dict(usage)
-        try:
-            response_time = (end_time - start_time).total_seconds() * 1000
-        except Exception:
-            response_time = None
-
-        try:
-            response_obj = dict(response_obj)
-        except Exception:
-            response_obj = response_obj
-
-        # Clean Metadata before logging - never log raw metadata
-        # the raw metadata can contain circular references which leads to infinite recursion
-        # we clean out all extra litellm metadata params before logging
-        clean_metadata = {}
-        if isinstance(metadata, dict):
-            for key, value in metadata.items():
-                # clean litellm metadata before logging
-                if key in [
-                    "endpoint",
-                    "caching_groups",
-                    "previous_models",
-                ]:
-                    continue
-                else:
-                    clean_metadata[key] = value
-
-        # Build the initial payload
-        payload = {
-            "id": id,
-            "call_type": call_type,
-            "cache_hit": cache_hit,
-            "start_time": start_time,
-            "end_time": end_time,
-            "response_time": response_time,
-            "model": kwargs.get("model", ""),
-            "user": kwargs.get("user", ""),
-            "model_parameters": optional_params,
-            "spend": kwargs.get("response_cost", 0),
-            "messages": messages,
-            "response": response_obj,
-            "usage": usage,
-            "metadata": clean_metadata,
-        }
-
-        make_json_serializable(payload)
-        json_payload = json.dumps(payload)
-
-        verbose_logger.debug("Datadog: Logger - Logging payload = %s", json_payload)
-
-        dd_payload = DatadogPayload(
-            ddsource=self._get_datadog_source(),
-            ddtags=self._get_datadog_tags(),
-            hostname=self._get_datadog_hostname(),
-            message=json_payload,
-            service=self._get_datadog_service(),
-            status=DataDogStatus.INFO,
-        )
-        return dd_payload
-
-    @staticmethod
-    def _get_datadog_tags():
-        return f"env:{os.getenv('DD_ENV', 'unknown')},service:{os.getenv('DD_SERVICE', 'litellm')},version:{os.getenv('DD_VERSION', 'unknown')}"
-
-    @staticmethod
-    def _get_datadog_source():
-        return os.getenv("DD_SOURCE", "litellm")
-
-    @staticmethod
-    def _get_datadog_service():
-        return os.getenv("DD_SERVICE", "litellm-server")
-
-    @staticmethod
-    def _get_datadog_hostname():
-        return ""
-
-    @staticmethod
-    def _get_datadog_env():
-        return os.getenv("DD_ENV", "unknown")
--- a/litellm/integrations/datadog/types.py
+++ b/litellm/integrations/datadog/types.py
@ -1,5 +1,5 @@
 from enum import Enum
-from typing import Optional, TypedDict
+from typing import TypedDict


 class DataDogStatus(str, Enum):
@ -19,11 +19,3 @@ class DatadogPayload(TypedDict, total=False):

 class DD_ERRORS(Enum):
    DATADOG_413_ERROR = "Datadog API Error - Payload too large (batch is above 5MB uncompressed). If you want this logged either disable request/response logging or set `DD_BATCH_SIZE=50`"
-
-
-class DatadogProxyFailureHookJsonMessage(TypedDict, total=False):
-    exception: str
-    error_class: str
-    status_code: Optional[int]
-    traceback: str
-    user_api_key_dict: dict
--- a/litellm/integrations/prometheus.py
+++ b/litellm/integrations/prometheus.py
@ -18,7 +18,6 @@ from litellm.integrations.custom_logger import CustomLogger
 from litellm.proxy._types import UserAPIKeyAuth
 from litellm.types.integrations.prometheus import *
 from litellm.types.utils import StandardLoggingPayload
-from litellm.utils import get_end_user_id_for_cost_tracking


 class PrometheusLogger(CustomLogger):
@ -365,7 +364,8 @@ class PrometheusLogger(CustomLogger):
        model = kwargs.get("model", "")
        litellm_params = kwargs.get("litellm_params", {}) or {}
        _metadata = litellm_params.get("metadata", {})
-        end_user_id = get_end_user_id_for_cost_tracking(litellm_params)
+        proxy_server_request = litellm_params.get("proxy_server_request") or {}
+        end_user_id = proxy_server_request.get("body", {}).get("user", None)
        user_id = standard_logging_payload["metadata"]["user_api_key_user_id"]
        user_api_key = standard_logging_payload["metadata"]["user_api_key_hash"]
        user_api_key_alias = standard_logging_payload["metadata"]["user_api_key_alias"]
@ -664,11 +664,13 @@ class PrometheusLogger(CustomLogger):

        # unpack kwargs
        model = kwargs.get("model", "")
+        litellm_params = kwargs.get("litellm_params", {}) or {}
        standard_logging_payload: StandardLoggingPayload = kwargs.get(
            "standard_logging_object", {}
        )
-        litellm_params = kwargs.get("litellm_params", {}) or {}
-        end_user_id = get_end_user_id_for_cost_tracking(litellm_params)
+        proxy_server_request = litellm_params.get("proxy_server_request") or {}
+
+        end_user_id = proxy_server_request.get("body", {}).get("user", None)
        user_id = standard_logging_payload["metadata"]["user_api_key_user_id"]
        user_api_key = standard_logging_payload["metadata"]["user_api_key_hash"]
        user_api_key_alias = standard_logging_payload["metadata"]["user_api_key_alias"]
--- a/litellm/litellm_core_utils/README.md
+++ b/litellm/litellm_core_utils/README.md
@ -8,5 +8,4 @@ Core files:
 - `exception_mapping_utils.py`: utils for mapping exceptions to openai-compatible error types. 
 - `default_encoding.py`: code for loading the default encoding (tiktoken)
 - `get_llm_provider_logic.py`: code for inferring the LLM provider from a given model name. 
- `duration_parser.py`: code for parsing durations - e.g. "1d", "1mo", "10s"

--- a/litellm/litellm_core_utils/duration_parser.py
+++ b/litellm/litellm_core_utils/duration_parser.py
@ -1,92 +0,0 @@
-"""
-Helper utilities for parsing durations - 1s, 1d, 10d, 30d, 1mo, 2mo
-
-duration_in_seconds is used in diff parts of the code base, example 
- Router - Provider budget routing
- Proxy - Key, Team Generation
-"""
-
-import re
-import time
-from datetime import datetime, timedelta
-from typing import Tuple
-
-
-def _extract_from_regex(duration: str) -> Tuple[int, str]:
-    match = re.match(r"(\d+)(mo|[smhd]?)", duration)
-
-    if not match:
-        raise ValueError("Invalid duration format")
-
-    value, unit = match.groups()
-    value = int(value)
-
-    return value, unit
-
-
-def get_last_day_of_month(year, month):
-    # Handle December case
-    if month == 12:
-        return 31
-    # Next month is January, so subtract a day from March 1st
-    next_month = datetime(year=year, month=month + 1, day=1)
-    last_day_of_month = (next_month - timedelta(days=1)).day
-    return last_day_of_month
-
-
-def duration_in_seconds(duration: str) -> int:
-    """
-    Parameters:
-    - duration:
-        - "<number>s" - seconds
-        - "<number>m" - minutes
-        - "<number>h" - hours
-        - "<number>d" - days
-        - "<number>mo" - months
-
-    Returns time in seconds till when budget needs to be reset
-    """
-    value, unit = _extract_from_regex(duration=duration)
-
-    if unit == "s":
-        return value
-    elif unit == "m":
-        return value * 60
-    elif unit == "h":
-        return value * 3600
-    elif unit == "d":
-        return value * 86400
-    elif unit == "mo":
-        now = time.time()
-        current_time = datetime.fromtimestamp(now)
-
-        if current_time.month == 12:
-            target_year = current_time.year + 1
-            target_month = 1
-        else:
-            target_year = current_time.year
-            target_month = current_time.month + value
-
-        # Determine the day to set for next month
-        target_day = current_time.day
-        last_day_of_target_month = get_last_day_of_month(target_year, target_month)
-
-        if target_day > last_day_of_target_month:
-            target_day = last_day_of_target_month
-
-        next_month = datetime(
-            year=target_year,
-            month=target_month,
-            day=target_day,
-            hour=current_time.hour,
-            minute=current_time.minute,
-            second=current_time.second,
-            microsecond=current_time.microsecond,
-        )
-
-        # Calculate the duration until the first day of the next month
-        duration_until_next_month = next_month - current_time
-        return int(duration_until_next_month.total_seconds())
-
-    else:
-        raise ValueError(f"Unsupported duration unit, passed duration: {duration}")
--- a/litellm/litellm_core_utils/litellm_logging.py
+++ b/litellm/litellm_core_utils/litellm_logging.py
@ -934,10 +934,19 @@ class Logging:
                        status="success",
                    )
                )
-            callbacks = get_combined_callback_list(
-                dynamic_success_callbacks=self.dynamic_success_callbacks,
-                global_callbacks=litellm.success_callback,
-            )
+            if self.dynamic_success_callbacks is not None and isinstance(
+                self.dynamic_success_callbacks, list
+            ):
+                callbacks = self.dynamic_success_callbacks
+                ## keep the internal functions ##
+                for callback in litellm.success_callback:
+                    if (
+                        isinstance(callback, CustomLogger)
+                        and "_PROXY_" in callback.__class__.__name__
+                    ):
+                        callbacks.append(callback)
+            else:
+                callbacks = litellm.success_callback

            ## REDACT MESSAGES ##
            result = redact_message_input_output_from_logging(
@ -1359,11 +1368,8 @@ class Logging:
                        and customLogger is not None
                    ):  # custom logger functions
                        print_verbose(
-                            "success callbacks: Running Custom Callback Function - {}".format(
-                                callback
-                            )
+                            "success callbacks: Running Custom Callback Function"
                        )
-
                        customLogger.log_event(
                            kwargs=self.model_call_details,
                            response_obj=result,
@ -1460,10 +1466,21 @@ class Logging:
                    status="success",
                )
            )
-        callbacks = get_combined_callback_list(
-            dynamic_success_callbacks=self.dynamic_async_success_callbacks,
-            global_callbacks=litellm._async_success_callback,
-        )
+        if self.dynamic_async_success_callbacks is not None and isinstance(
+            self.dynamic_async_success_callbacks, list
+        ):
+            callbacks = self.dynamic_async_success_callbacks
+            ## keep the internal functions ##
+            for callback in litellm._async_success_callback:
+                callback_name = ""
+                if isinstance(callback, CustomLogger):
+                    callback_name = callback.__class__.__name__
+                if callable(callback):
+                    callback_name = callback.__name__
+                if "_PROXY_" in callback_name:
+                    callbacks.append(callback)
+        else:
+            callbacks = litellm._async_success_callback

        result = redact_message_input_output_from_logging(
            model_call_details=(
@ -1730,10 +1747,21 @@ class Logging:
                start_time=start_time,
                end_time=end_time,
            )
-            callbacks = get_combined_callback_list(
-                dynamic_success_callbacks=self.dynamic_failure_callbacks,
-                global_callbacks=litellm.failure_callback,
-            )
+            callbacks = []  # init this to empty incase it's not created
+
+            if self.dynamic_failure_callbacks is not None and isinstance(
+                self.dynamic_failure_callbacks, list
+            ):
+                callbacks = self.dynamic_failure_callbacks
+                ## keep the internal functions ##
+                for callback in litellm.failure_callback:
+                    if (
+                        isinstance(callback, CustomLogger)
+                        and "_PROXY_" in callback.__class__.__name__
+                    ):
+                        callbacks.append(callback)
+            else:
+                callbacks = litellm.failure_callback

            result = None  # result sent to all loggers, init this to None incase it's not created

@ -1916,10 +1944,21 @@ class Logging:
            end_time=end_time,
        )

-        callbacks = get_combined_callback_list(
-            dynamic_success_callbacks=self.dynamic_async_failure_callbacks,
-            global_callbacks=litellm._async_failure_callback,
-        )
+        callbacks = []  # init this to empty incase it's not created
+
+        if self.dynamic_async_failure_callbacks is not None and isinstance(
+            self.dynamic_async_failure_callbacks, list
+        ):
+            callbacks = self.dynamic_async_failure_callbacks
+            ## keep the internal functions ##
+            for callback in litellm._async_failure_callback:
+                if (
+                    isinstance(callback, CustomLogger)
+                    and "_PROXY_" in callback.__class__.__name__
+                ):
+                    callbacks.append(callback)
+        else:
+            callbacks = litellm._async_failure_callback

        result = None  # result sent to all loggers, init this to None incase it's not created
        for callback in callbacks:
@ -2320,7 +2359,6 @@ def _init_custom_logger_compatible_class(  # noqa: PLR0915
        _in_memory_loggers.append(_mlflow_logger)
        return _mlflow_logger  # type: ignore

-
 def get_custom_logger_compatible_class(
    logging_integration: litellm._custom_logger_compatible_callbacks_literal,
 ) -> Optional[CustomLogger]:
@ -2911,11 +2949,3 @@ def modify_integration(integration_name, integration_params):
    if integration_name == "supabase":
        if "table_name" in integration_params:
            Supabase.supabase_table_name = integration_params["table_name"]
-
-
-def get_combined_callback_list(
-    dynamic_success_callbacks: Optional[List], global_callbacks: List
-) -> List:
-    if dynamic_success_callbacks is None:
-        return global_callbacks
-    return list(set(dynamic_success_callbacks + global_callbacks))
--- a/litellm/llms/AzureOpenAI/azure.py
+++ b/litellm/llms/AzureOpenAI/azure.py
@ -1528,8 +1528,7 @@ class AzureChatCompletion(BaseLLM):
        prompt: Optional[str] = None,
    ) -> dict:
        client_session = (
-            litellm.aclient_session
-            or get_async_httpx_client(llm_provider=litellm.LlmProviders.AZURE).client
+            litellm.aclient_session or httpx.AsyncClient()
        )  # handle dall-e-2 calls

        if "gateway.ai.cloudflare.com" in api_base:
--- a/litellm/llms/azure_ai/rerank/handler.py
+++ b/litellm/llms/azure_ai/rerank/handler.py
@ -4,7 +4,6 @@ import httpx

 from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
 from litellm.llms.cohere.rerank import CohereRerank
-from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
 from litellm.types.rerank import RerankResponse


@ -74,7 +73,6 @@ class AzureAIRerank(CohereRerank):
        return_documents: Optional[bool] = True,
        max_chunks_per_doc: Optional[int] = None,
        _is_async: Optional[bool] = False,
-        client: Optional[Union[HTTPHandler, AsyncHTTPHandler]] = None,
    ) -> RerankResponse:

        if headers is None:
--- a/litellm/llms/bedrock/chat/converse_transformation.py
+++ b/litellm/llms/bedrock/chat/converse_transformation.py
@ -458,7 +458,7 @@ class AmazonConverseConfig:
        """
        Abbreviations of regions AWS Bedrock supports for cross region inference
        """
-        return ["us", "eu", "apac"]
+        return ["us", "eu"]

    def _get_base_model(self, model: str) -> str:
        """
--- a/litellm/llms/cohere/embed/handler.py
+++ b/litellm/llms/cohere/embed/handler.py
@ -74,7 +74,6 @@ async def async_embedding(
        },
    )
    ## COMPLETION CALL
-
    if client is None:
        client = get_async_httpx_client(
            llm_provider=litellm.LlmProviders.COHERE,
@ -152,11 +151,6 @@ def embedding(
            api_key=api_key,
            headers=headers,
            encoding=encoding,
-            client=(
-                client
-                if client is not None and isinstance(client, AsyncHTTPHandler)
-                else None
-            ),
        )

    ## LOGGING
--- a/litellm/llms/cohere/rerank.py
+++ b/litellm/llms/cohere/rerank.py
@ -6,14 +6,10 @@ LiteLLM supports the re rank API format, no paramter transformation occurs

 from typing import Any, Dict, List, Optional, Union

-import httpx
-
 import litellm
 from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
 from litellm.llms.base import BaseLLM
 from litellm.llms.custom_httpx.http_handler import (
-    AsyncHTTPHandler,
-    HTTPHandler,
    _get_httpx_client,
    get_async_httpx_client,
 )
@ -38,23 +34,6 @@ class CohereRerank(BaseLLM):
        # Merge other headers, overriding any default ones except Authorization
        return {**default_headers, **headers}

-    def ensure_rerank_endpoint(self, api_base: str) -> str:
-        """
-        Ensures the `/v1/rerank` endpoint is appended to the given `api_base`.
-        If `/v1/rerank` is already present, the original URL is returned.
-
-        :param api_base: The base API URL.
-        :return: A URL with `/v1/rerank` appended if missing.
-        """
-        # Parse the base URL to ensure proper structure
-        url = httpx.URL(api_base)
-
-        # Check if the URL already ends with `/v1/rerank`
-        if not url.path.endswith("/v1/rerank"):
-            url = url.copy_with(path=f"{url.path.rstrip('/')}/v1/rerank")
-
-        return str(url)
-
    def rerank(
        self,
        model: str,
@ -69,10 +48,9 @@ class CohereRerank(BaseLLM):
        return_documents: Optional[bool] = True,
        max_chunks_per_doc: Optional[int] = None,
        _is_async: Optional[bool] = False,  # New parameter
-        client: Optional[Union[HTTPHandler, AsyncHTTPHandler]] = None,
    ) -> RerankResponse:
        headers = self.validate_environment(api_key=api_key, headers=headers)
-        api_base = self.ensure_rerank_endpoint(api_base)
+
        request_data = RerankRequest(
            model=model,
            query=query,
@ -98,13 +76,9 @@ class CohereRerank(BaseLLM):
        if _is_async:
            return self.async_rerank(request_data=request_data, api_key=api_key, api_base=api_base, headers=headers)  # type: ignore # Call async method

-        if client is not None and isinstance(client, HTTPHandler):
-            client = client
-        else:
-            client = _get_httpx_client()
-
+        client = _get_httpx_client()
        response = client.post(
-            url=api_base,
+            api_base,
            headers=headers,
            json=request_data_dict,
        )
@ -126,13 +100,10 @@ class CohereRerank(BaseLLM):
        api_key: str,
        api_base: str,
        headers: dict,
-        client: Optional[AsyncHTTPHandler] = None,
    ) -> RerankResponse:
        request_data_dict = request_data.dict(exclude_none=True)

-        client = client or get_async_httpx_client(
-            llm_provider=litellm.LlmProviders.COHERE
-        )
+        client = get_async_httpx_client(llm_provider=litellm.LlmProviders.COHERE)

        response = await client.post(
            api_base,
--- a/litellm/llms/custom_httpx/http_handler.py
+++ b/litellm/llms/custom_httpx/http_handler.py
@ -8,7 +8,8 @@ from httpx import USE_CLIENT_DEFAULT, AsyncHTTPTransport, HTTPTransport

 import litellm
 from litellm.caching import InMemoryCache
-from litellm.types.llms.custom_http import *
+
+from .types import httpxSpecialProvider

 if TYPE_CHECKING:
    from litellm import LlmProviders
@ -28,62 +29,6 @@ headers = {
 _DEFAULT_TIMEOUT = httpx.Timeout(timeout=5.0, connect=5.0)
 _DEFAULT_TTL_FOR_HTTPX_CLIENTS = 3600  # 1 hour, re-use the same httpx client for 1 hour

-import re
-
-
-def mask_sensitive_info(error_message):
-    # Find the start of the key parameter
-    if isinstance(error_message, str):
-        key_index = error_message.find("key=")
-    else:
-        return error_message
-
-    # If key is found
-    if key_index != -1:
-        # Find the end of the key parameter (next & or end of string)
-        next_param = error_message.find("&", key_index)
-
-        if next_param == -1:
-            # If no more parameters, mask until the end of the string
-            masked_message = error_message[: key_index + 4] + "[REDACTED_API_KEY]"
-        else:
-            # Replace the key with redacted value, keeping other parameters
-            masked_message = (
-                error_message[: key_index + 4]
-                + "[REDACTED_API_KEY]"
-                + error_message[next_param:]
-            )
-
-        return masked_message
-
-    return error_message
-
-
-class MaskedHTTPStatusError(httpx.HTTPStatusError):
-    def __init__(
-        self, original_error, message: Optional[str] = None, text: Optional[str] = None
-    ):
-        # Create a new error with the masked URL
-        masked_url = mask_sensitive_info(str(original_error.request.url))
-        # Create a new error that looks like the original, but with a masked URL
-
-        super().__init__(
-            message=original_error.message,
-            request=httpx.Request(
-                method=original_error.request.method,
-                url=masked_url,
-                headers=original_error.request.headers,
-                content=original_error.request.content,
-            ),
-            response=httpx.Response(
-                status_code=original_error.response.status_code,
-                content=original_error.response.content,
-                headers=original_error.response.headers,
-            ),
-        )
-        self.message = message
-        self.text = text
-

 class AsyncHTTPHandler:
    def __init__(
@ -211,16 +156,13 @@ class AsyncHTTPHandler:
                headers=headers,
            )
        except httpx.HTTPStatusError as e:
-
+            setattr(e, "status_code", e.response.status_code)
            if stream is True:
                setattr(e, "message", await e.response.aread())
                setattr(e, "text", await e.response.aread())
            else:
-                setattr(e, "message", mask_sensitive_info(e.response.text))
-                setattr(e, "text", mask_sensitive_info(e.response.text))
-
-            setattr(e, "status_code", e.response.status_code)
-
+                setattr(e, "message", e.response.text)
+                setattr(e, "text", e.response.text)
            raise e
        except Exception as e:
            raise e
@ -458,17 +400,11 @@ class HTTPHandler:
                llm_provider="litellm-httpx-handler",
            )
        except httpx.HTTPStatusError as e:
-
-            if stream is True:
-                setattr(e, "message", mask_sensitive_info(e.response.read()))
-                setattr(e, "text", mask_sensitive_info(e.response.read()))
-            else:
-                error_text = mask_sensitive_info(e.response.text)
-                setattr(e, "message", error_text)
-                setattr(e, "text", error_text)
-
            setattr(e, "status_code", e.response.status_code)
-
+            if stream is True:
+                setattr(e, "message", e.response.read())
+            else:
+                setattr(e, "message", e.response.text)
            raise e
        except Exception as e:
            raise e
--- a/litellm/llms/custom_httpx/types.py
+++ b/litellm/llms/custom_httpx/types.py
@ -0,0 +1,11 @@
+from enum import Enum
+
+import litellm
+
+
+class httpxSpecialProvider(str, Enum):
+    LoggingCallback = "logging_callback"
+    GuardrailCallback = "guardrail_callback"
+    Caching = "caching"
+    Oauth2Check = "oauth2_check"
+    SecretManager = "secret_manager"
--- a/litellm/llms/ollama.py
+++ b/litellm/llms/ollama.py
@ -14,7 +14,6 @@ import requests  # type: ignore

 import litellm
 from litellm import verbose_logger
-from litellm.llms.custom_httpx.http_handler import get_async_httpx_client
 from litellm.secret_managers.main import get_secret_str
 from litellm.types.utils import ModelInfo, ProviderField, StreamingChoices

@ -457,10 +456,7 @@ def ollama_completion_stream(url, data, logging_obj):

 async def ollama_async_streaming(url, data, model_response, encoding, logging_obj):
    try:
-        _async_http_client = get_async_httpx_client(
-            llm_provider=litellm.LlmProviders.OLLAMA
-        )
-        client = _async_http_client.client
+        client = httpx.AsyncClient()
        async with client.stream(
            url=f"{url}", json=data, method="POST", timeout=litellm.request_timeout
        ) as response:
--- a/litellm/llms/ollama_chat.py
+++ b/litellm/llms/ollama_chat.py
@ -13,7 +13,6 @@ from pydantic import BaseModel

 import litellm
 from litellm import verbose_logger
-from litellm.llms.custom_httpx.http_handler import get_async_httpx_client
 from litellm.types.llms.ollama import OllamaToolCall, OllamaToolCallFunction
 from litellm.types.llms.openai import ChatCompletionAssistantToolCall
 from litellm.types.utils import StreamingChoices
@ -446,10 +445,7 @@ async def ollama_async_streaming(
    url, api_key, data, model_response, encoding, logging_obj
 ):
    try:
-        _async_http_client = get_async_httpx_client(
-            llm_provider=litellm.LlmProviders.OLLAMA
-        )
-        client = _async_http_client.client
+        client = httpx.AsyncClient()
        _request = {
            "url": f"{url}",
            "json": data,
--- a/litellm/llms/prompt_templates/factory.py
+++ b/litellm/llms/prompt_templates/factory.py
@ -33,7 +33,6 @@ from litellm.types.llms.openai import (
    ChatCompletionAssistantToolCall,
    ChatCompletionFunctionMessage,
    ChatCompletionImageObject,
-    ChatCompletionImageUrlObject,
    ChatCompletionTextObject,
    ChatCompletionToolCallFunctionChunk,
    ChatCompletionToolMessage,
@ -682,27 +681,6 @@ def construct_tool_use_system_prompt(
    return tool_use_system_prompt


-def convert_generic_image_chunk_to_openai_image_obj(
-    image_chunk: GenericImageParsingChunk,
-) -> str:
-    """
-    Convert a generic image chunk to an OpenAI image object.
-
-    Input:
-    GenericImageParsingChunk(
-        type="base64",
-        media_type="image/jpeg",
-        data="...",
-    )
-
-    Return:
-    "data:image/jpeg;base64,{base64_image}"
-    """
-    return "data:{};{},{}".format(
-        image_chunk["media_type"], image_chunk["type"], image_chunk["data"]
-    )
-
-
 def convert_to_anthropic_image_obj(openai_image_url: str) -> GenericImageParsingChunk:
    """
    Input:
@ -728,7 +706,6 @@ def convert_to_anthropic_image_obj(openai_image_url: str) -> GenericImageParsing
            data=base64_data,
        )
    except Exception as e:
-        traceback.print_exc()
        if "Error: Unable to fetch image from URL" in str(e):
            raise e
        raise Exception(
@ -1159,44 +1136,15 @@ def convert_to_anthropic_tool_result(
        ]
    }
    """
-    anthropic_content: Union[
-        str,
-        List[Union[AnthropicMessagesToolResultContent, AnthropicMessagesImageParam]],
-    ] = ""
+    content_str: str = ""
    if isinstance(message["content"], str):
-        anthropic_content = message["content"]
+        content_str = message["content"]
    elif isinstance(message["content"], List):
        content_list = message["content"]
-        anthropic_content_list: List[
-            Union[AnthropicMessagesToolResultContent, AnthropicMessagesImageParam]
-        ] = []
        for content in content_list:
            if content["type"] == "text":
-                anthropic_content_list.append(
-                    AnthropicMessagesToolResultContent(
-                        type="text",
-                        text=content["text"],
-                    )
-                )
-            elif content["type"] == "image_url":
-                if isinstance(content["image_url"], str):
-                    image_chunk = convert_to_anthropic_image_obj(content["image_url"])
-                else:
-                    image_chunk = convert_to_anthropic_image_obj(
-                        content["image_url"]["url"]
-                    )
-                anthropic_content_list.append(
-                    AnthropicMessagesImageParam(
-                        type="image",
-                        source=AnthropicContentParamSource(
-                            type="base64",
-                            media_type=image_chunk["media_type"],
-                            data=image_chunk["data"],
-                        ),
-                    )
-                )
+                content_str += content["text"]

-        anthropic_content = anthropic_content_list
    anthropic_tool_result: Optional[AnthropicMessagesToolResultParam] = None
    ## PROMPT CACHING CHECK ##
    cache_control = message.get("cache_control", None)
@ -1207,14 +1155,14 @@ def convert_to_anthropic_tool_result(
        # We can't determine from openai message format whether it's a successful or
        # error call result so default to the successful result template
        anthropic_tool_result = AnthropicMessagesToolResultParam(
-            type="tool_result", tool_use_id=tool_call_id, content=anthropic_content
+            type="tool_result", tool_use_id=tool_call_id, content=content_str
        )

    if message["role"] == "function":
        function_message: ChatCompletionFunctionMessage = message
        tool_call_id = function_message.get("tool_call_id") or str(uuid.uuid4())
        anthropic_tool_result = AnthropicMessagesToolResultParam(
-            type="tool_result", tool_use_id=tool_call_id, content=anthropic_content
+            type="tool_result", tool_use_id=tool_call_id, content=content_str
        )

    if anthropic_tool_result is None:
--- a/litellm/llms/vertex_ai_and_google_ai_studio/gemini/transformation.py
+++ b/litellm/llms/vertex_ai_and_google_ai_studio/gemini/transformation.py
@ -107,10 +107,6 @@ def _get_image_mime_type_from_url(url: str) -> Optional[str]:
        return "image/png"
    elif url.endswith(".webp"):
        return "image/webp"
-    elif url.endswith(".mp4"):
-        return "video/mp4"
-    elif url.endswith(".pdf"):
-        return "application/pdf"
    return None


@ -298,12 +294,7 @@ def _transform_request_body(
    optional_params = {k: v for k, v in optional_params.items() if k not in remove_keys}

    try:
-        if custom_llm_provider == "gemini":
-            content = litellm.GoogleAIStudioGeminiConfig._transform_messages(
-                messages=messages
-            )
-        else:
-            content = litellm.VertexGeminiConfig._transform_messages(messages=messages)
+        content = _gemini_convert_messages_with_history(messages=messages)
        tools: Optional[Tools] = optional_params.pop("tools", None)
        tool_choice: Optional[ToolConfig] = optional_params.pop("tool_choice", None)
        safety_settings: Optional[List[SafetSettingsConfig]] = optional_params.pop(
--- a/litellm/llms/vertex_ai_and_google_ai_studio/gemini/vertex_and_google_ai_studio_gemini.py
+++ b/litellm/llms/vertex_ai_and_google_ai_studio/gemini/vertex_and_google_ai_studio_gemini.py
@ -35,12 +35,7 @@ from litellm.llms.custom_httpx.http_handler import (
    HTTPHandler,
    get_async_httpx_client,
 )
-from litellm.llms.prompt_templates.factory import (
-    convert_generic_image_chunk_to_openai_image_obj,
-    convert_to_anthropic_image_obj,
-)
 from litellm.types.llms.openai import (
-    AllMessageValues,
    ChatCompletionResponseMessage,
    ChatCompletionToolCallChunk,
    ChatCompletionToolCallFunctionChunk,
@ -83,8 +78,6 @@ from ..common_utils import (
 )
 from ..vertex_llm_base import VertexBase
 from .transformation import (
-    _gemini_convert_messages_with_history,
-    _process_gemini_image,
    async_transform_request_body,
    set_headers,
    sync_transform_request_body,
@ -919,10 +912,6 @@ class VertexGeminiConfig:

        return model_response

-    @staticmethod
-    def _transform_messages(messages: List[AllMessageValues]) -> List[ContentType]:
-        return _gemini_convert_messages_with_history(messages=messages)
-

 class GoogleAIStudioGeminiConfig(
    VertexGeminiConfig
@ -1026,32 +1015,6 @@ class GoogleAIStudioGeminiConfig(
            model, non_default_params, optional_params, drop_params
        )

-    @staticmethod
-    def _transform_messages(messages: List[AllMessageValues]) -> List[ContentType]:
-        """
-        Google AI Studio Gemini does not support image urls in messages.
-        """
-        for message in messages:
-            _message_content = message.get("content")
-            if _message_content is not None and isinstance(_message_content, list):
-                _parts: List[PartType] = []
-                for element in _message_content:
-                    if element.get("type") == "image_url":
-                        img_element = element
-                        _image_url: Optional[str] = None
-                        if isinstance(img_element.get("image_url"), dict):
-                            _image_url = img_element["image_url"].get("url")  # type: ignore
-                        else:
-                            _image_url = img_element.get("image_url")  # type: ignore
-                        if _image_url and "https://" in _image_url:
-                            image_obj = convert_to_anthropic_image_obj(_image_url)
-                            img_element["image_url"] = (  # type: ignore
-                                convert_generic_image_chunk_to_openai_image_obj(
-                                    image_obj
-                                )
-                            )
-        return _gemini_convert_messages_with_history(messages=messages)
-

 async def make_call(
    client: Optional[AsyncHTTPHandler],
--- a/litellm/main.py
+++ b/litellm/main.py
@ -3440,10 +3440,6 @@ def embedding(  # noqa: PLR0915
                or litellm.openai_key
                or get_secret_str("OPENAI_API_KEY")
            )
-
-            if extra_headers is not None:
-                optional_params["extra_headers"] = extra_headers
-
            api_type = "openai"
            api_version = None

--- a/litellm/model_prices_and_context_window_backup.json
+++ b/litellm/model_prices_and_context_window_backup.json
@ -2032,6 +2032,7 @@
        "tool_use_system_prompt_tokens": 264,
        "supports_assistant_prefill": true,
        "supports_prompt_caching": true,
+        "supports_pdf_input": true,
        "supports_response_schema": true
    },
    "claude-3-opus-20240229": {
@ -2097,7 +2098,6 @@
        "supports_vision": true,
        "tool_use_system_prompt_tokens": 159,
        "supports_assistant_prefill": true,
-        "supports_pdf_input": true,
        "supports_prompt_caching": true,
        "supports_response_schema": true
    },
@ -3383,8 +3383,6 @@
        "supports_vision": true,
        "supports_response_schema": true,
        "supports_prompt_caching": true,
-        "tpm": 4000000,
-        "rpm": 2000,
        "source": "https://ai.google.dev/pricing"
    },
    "gemini/gemini-1.5-flash-001": {
@ -3408,8 +3406,6 @@
        "supports_vision": true,
        "supports_response_schema": true,
        "supports_prompt_caching": true,
-        "tpm": 4000000,
-        "rpm": 2000,
        "source": "https://ai.google.dev/pricing"
    },
    "gemini/gemini-1.5-flash": {
@ -3432,8 +3428,6 @@
        "supports_function_calling": true,
        "supports_vision": true,
        "supports_response_schema": true, 
-        "tpm": 4000000,
-        "rpm": 2000,
        "source": "https://ai.google.dev/pricing"
    },
    "gemini/gemini-1.5-flash-latest": {
@ -3456,32 +3450,6 @@
        "supports_function_calling": true,
        "supports_vision": true,
        "supports_response_schema": true,
-        "tpm": 4000000,
-        "rpm": 2000,
-        "source": "https://ai.google.dev/pricing"
-    },
-    "gemini/gemini-1.5-flash-8b": {
-        "max_tokens": 8192,
-        "max_input_tokens": 1048576,
-        "max_output_tokens": 8192,
-        "max_images_per_prompt": 3000,
-        "max_videos_per_prompt": 10,
-        "max_video_length": 1,
-        "max_audio_length_hours": 8.4,
-        "max_audio_per_prompt": 1,
-        "max_pdf_size_mb": 30, 
-        "input_cost_per_token": 0,
-        "input_cost_per_token_above_128k_tokens": 0,
-        "output_cost_per_token": 0,
-        "output_cost_per_token_above_128k_tokens": 0,
-        "litellm_provider": "gemini",
-        "mode": "chat",
-        "supports_system_messages": true,
-        "supports_function_calling": true,
-        "supports_vision": true,
-        "supports_response_schema": true,
-        "tpm": 4000000,
-        "rpm": 4000,
        "source": "https://ai.google.dev/pricing"
    },
    "gemini/gemini-1.5-flash-8b-exp-0924": {
@ -3504,8 +3472,6 @@
        "supports_function_calling": true,
        "supports_vision": true,
        "supports_response_schema": true,
-        "tpm": 4000000,
-        "rpm": 4000,
        "source": "https://ai.google.dev/pricing"
    },
    "gemini/gemini-exp-1114": {
@ -3528,12 +3494,7 @@
        "supports_function_calling": true,
        "supports_vision": true,
        "supports_response_schema": true,
-        "tpm": 4000000,
-        "rpm": 1000,
-        "source": "https://ai.google.dev/pricing",
-        "metadata": {
-            "notes": "Rate limits not documented for gemini-exp-1114. Assuming same as gemini-1.5-pro."
-        }
+        "source": "https://ai.google.dev/pricing"
    },
    "gemini/gemini-1.5-flash-exp-0827": {
        "max_tokens": 8192,
@ -3555,8 +3516,6 @@
        "supports_function_calling": true,
        "supports_vision": true,
        "supports_response_schema": true,
-        "tpm": 4000000,
-        "rpm": 2000,
        "source": "https://ai.google.dev/pricing"
    },
    "gemini/gemini-1.5-flash-8b-exp-0827": {
@ -3578,9 +3537,6 @@
        "supports_system_messages": true,
        "supports_function_calling": true,
        "supports_vision": true,
-        "supports_response_schema": true,
-        "tpm": 4000000,
-        "rpm": 4000,
        "source": "https://ai.google.dev/pricing"
    },
    "gemini/gemini-pro": {
@ -3594,10 +3550,7 @@
        "litellm_provider": "gemini",
        "mode": "chat",
        "supports_function_calling": true,
-        "rpd": 30000,
-        "tpm": 120000,
-        "rpm": 360,
-        "source": "https://ai.google.dev/gemini-api/docs/models/gemini"
+        "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
    },
    "gemini/gemini-1.5-pro": {
        "max_tokens": 8192,
@ -3614,8 +3567,6 @@
        "supports_vision": true,
        "supports_tool_choice": true, 
        "supports_response_schema": true, 
-        "tpm": 4000000,
-        "rpm": 1000,
        "source": "https://ai.google.dev/pricing"
    },
    "gemini/gemini-1.5-pro-002": {
@ -3634,8 +3585,6 @@
        "supports_tool_choice": true, 
        "supports_response_schema": true, 
        "supports_prompt_caching": true,
-        "tpm": 4000000,
-        "rpm": 1000,
        "source": "https://ai.google.dev/pricing"
    },
    "gemini/gemini-1.5-pro-001": {
@ -3654,8 +3603,6 @@
        "supports_tool_choice": true, 
        "supports_response_schema": true, 
        "supports_prompt_caching": true,
-        "tpm": 4000000,
-        "rpm": 1000,
        "source": "https://ai.google.dev/pricing"
    },
    "gemini/gemini-1.5-pro-exp-0801": {
@ -3673,8 +3620,6 @@
        "supports_vision": true,
        "supports_tool_choice": true,
        "supports_response_schema": true,
-        "tpm": 4000000,
-        "rpm": 1000,
        "source": "https://ai.google.dev/pricing"
    },
    "gemini/gemini-1.5-pro-exp-0827": {
@ -3692,8 +3637,6 @@
        "supports_vision": true,
        "supports_tool_choice": true,
        "supports_response_schema": true,
-        "tpm": 4000000,
-        "rpm": 1000,
        "source": "https://ai.google.dev/pricing"
    },
    "gemini/gemini-1.5-pro-latest": {
@ -3711,8 +3654,6 @@
        "supports_vision": true,
        "supports_tool_choice": true, 
        "supports_response_schema": true, 
-        "tpm": 4000000,
-        "rpm": 1000,
        "source": "https://ai.google.dev/pricing"
    },
    "gemini/gemini-pro-vision": {
@ -3727,9 +3668,6 @@
        "mode": "chat",
        "supports_function_calling": true,
        "supports_vision": true,
-        "rpd": 30000,
-        "tpm": 120000,
-        "rpm": 360,
        "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
    },
    "gemini/gemini-gemma-2-27b-it": {
--- a/litellm/proxy/_experimental/out/404.html
+++ b/litellm/proxy/_experimental/out/404.html
--- a/litellm/proxy/_experimental/out/_next/static/4u3imMIH2UVoP8L-yPCjs/_buildManifest.js
+++ b/litellm/proxy/_experimental/out/_next/static/4u3imMIH2UVoP8L-yPCjs/_buildManifest.js
--- a/litellm/proxy/_experimental/out/_next/static/4u3imMIH2UVoP8L-yPCjs/_ssgManifest.js
+++ b/litellm/proxy/_experimental/out/_next/static/4u3imMIH2UVoP8L-yPCjs/_ssgManifest.js
--- a/litellm/proxy/_experimental/out/_next/static/chunks/777-80eb84a5285bfa2d.js
+++ b/litellm/proxy/_experimental/out/_next/static/chunks/777-80eb84a5285bfa2d.js
--- a/litellm/proxy/_experimental/out/_next/static/chunks/777-9d9df0b75010dbf9.js
+++ b/litellm/proxy/_experimental/out/_next/static/chunks/777-9d9df0b75010dbf9.js
--- a/litellm/proxy/_experimental/out/_next/static/chunks/902-58bf23027703b2e8.js
+++ b/litellm/proxy/_experimental/out/_next/static/chunks/902-58bf23027703b2e8.js
--- a/litellm/proxy/_experimental/out/_next/static/chunks/app/layout-05e5448bd170dbcb.js
+++ b/litellm/proxy/_experimental/out/_next/static/chunks/app/layout-05e5448bd170dbcb.js
@ -1 +0,0 @@
-(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[185],{11837:function(n,e,t){Promise.resolve().then(t.t.bind(t,99646,23)),Promise.resolve().then(t.t.bind(t,63385,23))},63385:function(){},99646:function(n){n.exports={style:{fontFamily:"'__Inter_12bbc4', '__Inter_Fallback_12bbc4'",fontStyle:"normal"},className:"__className_12bbc4"}}},function(n){n.O(0,[971,69,744],function(){return n(n.s=11837)}),_N_E=n.O()}]);
--- a/litellm/proxy/_experimental/out/_next/static/chunks/app/layout-77825730d130b292.js
+++ b/litellm/proxy/_experimental/out/_next/static/chunks/app/layout-77825730d130b292.js
@ -0,0 +1 @@
+(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[185],{93553:function(n,e,t){Promise.resolve().then(t.t.bind(t,63385,23)),Promise.resolve().then(t.t.bind(t,99646,23))},63385:function(){},99646:function(n){n.exports={style:{fontFamily:"'__Inter_12bbc4', '__Inter_Fallback_12bbc4'",fontStyle:"normal"},className:"__className_12bbc4"}}},function(n){n.O(0,[971,69,744],function(){return n(n.s=93553)}),_N_E=n.O()}]);
--- a/litellm/proxy/_experimental/out/_next/static/chunks/app/page-413af091866cb902.js
+++ b/litellm/proxy/_experimental/out/_next/static/chunks/app/page-413af091866cb902.js
--- a/litellm/proxy/_experimental/out/_next/static/chunks/app/page-a952da77e0730c7c.js
+++ b/litellm/proxy/_experimental/out/_next/static/chunks/app/page-a952da77e0730c7c.js
--- a/litellm/proxy/_experimental/out/_next/static/chunks/webpack-a13477d480030cb3.js
+++ b/litellm/proxy/_experimental/out/_next/static/chunks/webpack-a13477d480030cb3.js
@ -1 +1 @@
-!function(){"use strict";var e,t,n,r,o,u,i,c,f,a={},l={};function d(e){var t=l[e];if(void 0!==t)return t.exports;var n=l[e]={id:e,loaded:!1,exports:{}},r=!0;try{a[e](n,n.exports,d),r=!1}finally{r&&delete l[e]}return n.loaded=!0,n.exports}d.m=a,e=[],d.O=function(t,n,r,o){if(n){o=o||0;for(var u=e.length;u>0&&e[u-1][2]>o;u--)e[u]=e[u-1];e[u]=[n,r,o];return}for(var i=1/0,u=0;u<e.length;u++){for(var n=e[u][0],r=e[u][1],o=e[u][2],c=!0,f=0;f<n.length;f++)i>=o&&Object.keys(d.O).every(function(e){return d.O[e](n[f])})?n.splice(f--,1):(c=!1,o<i&&(i=o));if(c){e.splice(u--,1);var a=r();void 0!==a&&(t=a)}}return t},d.n=function(e){var t=e&&e.__esModule?function(){return e.default}:function(){return e};return d.d(t,{a:t}),t},n=Object.getPrototypeOf?function(e){return Object.getPrototypeOf(e)}:function(e){return e.__proto__},d.t=function(e,r){if(1&r&&(e=this(e)),8&r||"object"==typeof e&&e&&(4&r&&e.__esModule||16&r&&"function"==typeof e.then))return e;var o=Object.create(null);d.r(o);var u={};t=t||[null,n({}),n([]),n(n)];for(var i=2&r&&e;"object"==typeof i&&!~t.indexOf(i);i=n(i))Object.getOwnPropertyNames(i).forEach(function(t){u[t]=function(){return e[t]}});return u.default=function(){return e},d.d(o,u),o},d.d=function(e,t){for(var n in t)d.o(t,n)&&!d.o(e,n)&&Object.defineProperty(e,n,{enumerable:!0,get:t[n]})},d.f={},d.e=function(e){return Promise.all(Object.keys(d.f).reduce(function(t,n){return d.f[n](e,t),t},[]))},d.u=function(e){},d.miniCssF=function(e){return"static/css/ea3759ed931c00b2.css"},d.g=function(){if("object"==typeof globalThis)return globalThis;try{return this||Function("return this")()}catch(e){if("object"==typeof window)return window}}(),d.o=function(e,t){return Object.prototype.hasOwnProperty.call(e,t)},r={},o="_N_E:",d.l=function(e,t,n,u){if(r[e]){r[e].push(t);return}if(void 0!==n)for(var i,c,f=document.getElementsByTagName("script"),a=0;a<f.length;a++){var 
l=f[a];if(l.getAttribute("src")==e||l.getAttribute("data-webpack")==o+n){i=l;break}}i||(c=!0,(i=document.createElement("script")).charset="utf-8",i.timeout=120,d.nc&&i.setAttribute("nonce",d.nc),i.setAttribute("data-webpack",o+n),i.src=d.tu(e)),r[e]=[t];var s=function(t,n){i.onerror=i.onload=null,clearTimeout(p);var o=r[e];if(delete r[e],i.parentNode&&i.parentNode.removeChild(i),o&&o.forEach(function(e){return e(n)}),t)return t(n)},p=setTimeout(s.bind(null,void 0,{type:"timeout",target:i}),12e4);i.onerror=s.bind(null,i.onerror),i.onload=s.bind(null,i.onload),c&&document.head.appendChild(i)},d.r=function(e){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},d.nmd=function(e){return e.paths=[],e.children||(e.children=[]),e},d.tt=function(){return void 0===u&&(u={createScriptURL:function(e){return e}},"undefined"!=typeof trustedTypes&&trustedTypes.createPolicy&&(u=trustedTypes.createPolicy("nextjs#bundler",u))),u},d.tu=function(e){return d.tt().createScriptURL(e)},d.p="/ui/_next/",i={272:0},d.f.j=function(e,t){var n=d.o(i,e)?i[e]:void 0;if(0!==n){if(n)t.push(n[2]);else if(272!=e){var r=new Promise(function(t,r){n=i[e]=[t,r]});t.push(n[2]=r);var o=d.p+d.u(e),u=Error();d.l(o,function(t){if(d.o(i,e)&&(0!==(n=i[e])&&(i[e]=void 0),n)){var r=t&&("load"===t.type?"missing":t.type),o=t&&t.target&&t.target.src;u.message="Loading chunk "+e+" failed.\n("+r+": "+o+")",u.name="ChunkLoadError",u.type=r,u.request=o,n[1](u)}},"chunk-"+e,e)}else i[e]=0}},d.O.j=function(e){return 0===i[e]},c=function(e,t){var n,r,o=t[0],u=t[1],c=t[2],f=0;if(o.some(function(e){return 0!==i[e]})){for(n in u)d.o(u,n)&&(d.m[n]=u[n]);if(c)var a=c(d)}for(e&&e(t);f<o.length;f++)r=o[f],d.o(i,r)&&i[r]&&i[r][0](),i[r]=0;return d.O(a)},(f=self.webpackChunk_N_E=self.webpackChunk_N_E||[]).forEach(c.bind(null,0)),f.push=c.bind(null,f.push.bind(f))}();
+!function(){"use strict";var e,t,n,r,o,u,i,c,f,a={},l={};function d(e){var t=l[e];if(void 0!==t)return t.exports;var n=l[e]={id:e,loaded:!1,exports:{}},r=!0;try{a[e](n,n.exports,d),r=!1}finally{r&&delete l[e]}return n.loaded=!0,n.exports}d.m=a,e=[],d.O=function(t,n,r,o){if(n){o=o||0;for(var u=e.length;u>0&&e[u-1][2]>o;u--)e[u]=e[u-1];e[u]=[n,r,o];return}for(var i=1/0,u=0;u<e.length;u++){for(var n=e[u][0],r=e[u][1],o=e[u][2],c=!0,f=0;f<n.length;f++)i>=o&&Object.keys(d.O).every(function(e){return d.O[e](n[f])})?n.splice(f--,1):(c=!1,o<i&&(i=o));if(c){e.splice(u--,1);var a=r();void 0!==a&&(t=a)}}return t},d.n=function(e){var t=e&&e.__esModule?function(){return e.default}:function(){return e};return d.d(t,{a:t}),t},n=Object.getPrototypeOf?function(e){return Object.getPrototypeOf(e)}:function(e){return e.__proto__},d.t=function(e,r){if(1&r&&(e=this(e)),8&r||"object"==typeof e&&e&&(4&r&&e.__esModule||16&r&&"function"==typeof e.then))return e;var o=Object.create(null);d.r(o);var u={};t=t||[null,n({}),n([]),n(n)];for(var i=2&r&&e;"object"==typeof i&&!~t.indexOf(i);i=n(i))Object.getOwnPropertyNames(i).forEach(function(t){u[t]=function(){return e[t]}});return u.default=function(){return e},d.d(o,u),o},d.d=function(e,t){for(var n in t)d.o(t,n)&&!d.o(e,n)&&Object.defineProperty(e,n,{enumerable:!0,get:t[n]})},d.f={},d.e=function(e){return Promise.all(Object.keys(d.f).reduce(function(t,n){return d.f[n](e,t),t},[]))},d.u=function(e){},d.miniCssF=function(e){return"static/css/8fbba1b67a4788fc.css"},d.g=function(){if("object"==typeof globalThis)return globalThis;try{return this||Function("return this")()}catch(e){if("object"==typeof window)return window}}(),d.o=function(e,t){return Object.prototype.hasOwnProperty.call(e,t)},r={},o="_N_E:",d.l=function(e,t,n,u){if(r[e]){r[e].push(t);return}if(void 0!==n)for(var i,c,f=document.getElementsByTagName("script"),a=0;a<f.length;a++){var 
l=f[a];if(l.getAttribute("src")==e||l.getAttribute("data-webpack")==o+n){i=l;break}}i||(c=!0,(i=document.createElement("script")).charset="utf-8",i.timeout=120,d.nc&&i.setAttribute("nonce",d.nc),i.setAttribute("data-webpack",o+n),i.src=d.tu(e)),r[e]=[t];var s=function(t,n){i.onerror=i.onload=null,clearTimeout(p);var o=r[e];if(delete r[e],i.parentNode&&i.parentNode.removeChild(i),o&&o.forEach(function(e){return e(n)}),t)return t(n)},p=setTimeout(s.bind(null,void 0,{type:"timeout",target:i}),12e4);i.onerror=s.bind(null,i.onerror),i.onload=s.bind(null,i.onload),c&&document.head.appendChild(i)},d.r=function(e){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},d.nmd=function(e){return e.paths=[],e.children||(e.children=[]),e},d.tt=function(){return void 0===u&&(u={createScriptURL:function(e){return e}},"undefined"!=typeof trustedTypes&&trustedTypes.createPolicy&&(u=trustedTypes.createPolicy("nextjs#bundler",u))),u},d.tu=function(e){return d.tt().createScriptURL(e)},d.p="/ui/_next/",i={272:0},d.f.j=function(e,t){var n=d.o(i,e)?i[e]:void 0;if(0!==n){if(n)t.push(n[2]);else if(272!=e){var r=new Promise(function(t,r){n=i[e]=[t,r]});t.push(n[2]=r);var o=d.p+d.u(e),u=Error();d.l(o,function(t){if(d.o(i,e)&&(0!==(n=i[e])&&(i[e]=void 0),n)){var r=t&&("load"===t.type?"missing":t.type),o=t&&t.target&&t.target.src;u.message="Loading chunk "+e+" failed.\n("+r+": "+o+")",u.name="ChunkLoadError",u.type=r,u.request=o,n[1](u)}},"chunk-"+e,e)}else i[e]=0}},d.O.j=function(e){return 0===i[e]},c=function(e,t){var n,r,o=t[0],u=t[1],c=t[2],f=0;if(o.some(function(e){return 0!==i[e]})){for(n in u)d.o(u,n)&&(d.m[n]=u[n]);if(c)var a=c(d)}for(e&&e(t);f<o.length;f++)r=o[f],d.o(i,r)&&i[r]&&i[r][0](),i[r]=0;return d.O(a)},(f=self.webpackChunk_N_E=self.webpackChunk_N_E||[]).forEach(c.bind(null,0)),f.push=c.bind(null,f.push.bind(f))}();
--- a/litellm/proxy/_experimental/out/_next/static/css/8fbba1b67a4788fc.css
+++ b/litellm/proxy/_experimental/out/_next/static/css/8fbba1b67a4788fc.css
--- a/litellm/proxy/_experimental/out/index.html
+++ b/litellm/proxy/_experimental/out/index.html
@ -1 +1 @@
-<!DOCTYPE html><html id="__next_error__"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width, initial-scale=1"/><link rel="preload" as="script" fetchPriority="low" href="/ui/_next/static/chunks/webpack-b9c71b6f9761a436.js" crossorigin=""/><script src="/ui/_next/static/chunks/fd9d1056-f593049e31b05aeb.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/69-8316d07d1f41e39f.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/main-app-096338c8e1915716.js" async="" crossorigin=""></script><title>LiteLLM Dashboard</title><meta name="description" content="LiteLLM Proxy Admin UI"/><link rel="icon" href="/ui/favicon.ico" type="image/x-icon" sizes="16x16"/><meta name="next-size-adjust"/><script src="/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js" crossorigin="" noModule=""></script></head><body><script src="/ui/_next/static/chunks/webpack-b9c71b6f9761a436.js" crossorigin="" async=""></script><script>(self.__next_f=self.__next_f||[]).push([0]);self.__next_f.push([2,null])</script><script>self.__next_f.push([1,"1:HL[\"/ui/_next/static/media/c9a5bc6a7c948fb0-s.p.woff2\",\"font\",{\"crossOrigin\":\"\",\"type\":\"font/woff2\"}]\n2:HL[\"/ui/_next/static/css/ea3759ed931c00b2.css\",\"style\",{\"crossOrigin\":\"\"}]\n0:\"$L3\"\n"])</script><script>self.__next_f.push([1,"4:I[47690,[],\"\"]\n6:I[77831,[],\"\"]\n7:I[82989,[\"665\",\"static/chunks/3014691f-b24e8254c7593934.js\",\"936\",\"static/chunks/2f6dbc85-cac2949a76539886.js\",\"902\",\"static/chunks/902-292bb6a83427dbc7.js\",\"131\",\"static/chunks/131-4ee1d633e8928742.js\",\"684\",\"static/chunks/684-16b194c83a169f6d.js\",\"626\",\"static/chunks/626-0c564a21577c9c53.js\",\"777\",\"static/chunks/777-9d9df0b75010dbf9.js\",\"931\",\"static/chunks/app/page-a952da77e0730c7c.js\"],\"\"]\n8:I[5613,[],\"\"]\n9:I[31778,[],\"\"]\nb:I[48955,[],\"\"]\nc:[]\n"])</script><script>self.__next_f.push([1,"3:[[[\"$\",\"link\",\"0\",{\"rel\":\"stylesheet\",\"href\":\"/ui/_n
ext/static/css/ea3759ed931c00b2.css\",\"precedence\":\"next\",\"crossOrigin\":\"\"}]],[\"$\",\"$L4\",null,{\"buildId\":\"pDx3dChtj-paUmJExuV6u\",\"assetPrefix\":\"/ui\",\"initialCanonicalUrl\":\"/\",\"initialTree\":[\"\",{\"children\":[\"__PAGE__\",{}]},\"$undefined\",\"$undefined\",true],\"initialSeedData\":[\"\",{\"children\":[\"__PAGE__\",{},[\"$L5\",[\"$\",\"$L6\",null,{\"propsForComponent\":{\"params\":{}},\"Component\":\"$7\",\"isStaticGeneration\":true}],null]]},[null,[\"$\",\"html\",null,{\"lang\":\"en\",\"children\":[\"$\",\"body\",null,{\"className\":\"__className_12bbc4\",\"children\":[\"$\",\"$L8\",null,{\"parallelRouterKey\":\"children\",\"segmentPath\":[\"children\"],\"loading\":\"$undefined\",\"loadingStyles\":\"$undefined\",\"loadingScripts\":\"$undefined\",\"hasLoading\":false,\"error\":\"$undefined\",\"errorStyles\":\"$undefined\",\"errorScripts\":\"$undefined\",\"template\":[\"$\",\"$L9\",null,{}],\"templateStyles\":\"$undefined\",\"templateScripts\":\"$undefined\",\"notFound\":[[\"$\",\"title\",null,{\"children\":\"404: This page could not be found.\"}],[\"$\",\"div\",null,{\"style\":{\"fontFamily\":\"system-ui,\\\"Segoe UI\\\",Roboto,Helvetica,Arial,sans-serif,\\\"Apple Color Emoji\\\",\\\"Segoe UI Emoji\\\"\",\"height\":\"100vh\",\"textAlign\":\"center\",\"display\":\"flex\",\"flexDirection\":\"column\",\"alignItems\":\"center\",\"justifyContent\":\"center\"},\"children\":[\"$\",\"div\",null,{\"children\":[[\"$\",\"style\",null,{\"dangerouslySetInnerHTML\":{\"__html\":\"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}\"}}],[\"$\",\"h1\",null,{\"className\":\"next-error-h1\",\"style\":{\"display\":\"inline-block\",\"margin\":\"0 20px 0 0\",\"padding\":\"0 23px 0 
0\",\"fontSize\":24,\"fontWeight\":500,\"verticalAlign\":\"top\",\"lineHeight\":\"49px\"},\"children\":\"404\"}],[\"$\",\"div\",null,{\"style\":{\"display\":\"inline-block\"},\"children\":[\"$\",\"h2\",null,{\"style\":{\"fontSize\":14,\"fontWeight\":400,\"lineHeight\":\"49px\",\"margin\":0},\"children\":\"This page could not be found.\"}]}]]}]}]],\"notFoundStyles\":[],\"styles\":null}]}]}],null]],\"initialHead\":[false,\"$La\"],\"globalErrorComponent\":\"$b\",\"missingSlots\":\"$Wc\"}]]\n"])</script><script>self.__next_f.push([1,"a:[[\"$\",\"meta\",\"0\",{\"name\":\"viewport\",\"content\":\"width=device-width, initial-scale=1\"}],[\"$\",\"meta\",\"1\",{\"charSet\":\"utf-8\"}],[\"$\",\"title\",\"2\",{\"children\":\"LiteLLM Dashboard\"}],[\"$\",\"meta\",\"3\",{\"name\":\"description\",\"content\":\"LiteLLM Proxy Admin UI\"}],[\"$\",\"link\",\"4\",{\"rel\":\"icon\",\"href\":\"/ui/favicon.ico\",\"type\":\"image/x-icon\",\"sizes\":\"16x16\"}],[\"$\",\"meta\",\"5\",{\"name\":\"next-size-adjust\"}]]\n5:null\n"])</script><script>self.__next_f.push([1,""])</script></body></html>
+<!DOCTYPE html><html id="__next_error__"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width, initial-scale=1"/><link rel="preload" as="script" fetchPriority="low" href="/ui/_next/static/chunks/webpack-a13477d480030cb3.js" crossorigin=""/><script src="/ui/_next/static/chunks/fd9d1056-f593049e31b05aeb.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/69-8316d07d1f41e39f.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/main-app-096338c8e1915716.js" async="" crossorigin=""></script><title>LiteLLM Dashboard</title><meta name="description" content="LiteLLM Proxy Admin UI"/><link rel="icon" href="/ui/favicon.ico" type="image/x-icon" sizes="16x16"/><meta name="next-size-adjust"/><script src="/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js" crossorigin="" noModule=""></script></head><body><script src="/ui/_next/static/chunks/webpack-a13477d480030cb3.js" crossorigin="" async=""></script><script>(self.__next_f=self.__next_f||[]).push([0]);self.__next_f.push([2,null])</script><script>self.__next_f.push([1,"1:HL[\"/ui/_next/static/media/c9a5bc6a7c948fb0-s.p.woff2\",\"font\",{\"crossOrigin\":\"\",\"type\":\"font/woff2\"}]\n2:HL[\"/ui/_next/static/css/8fbba1b67a4788fc.css\",\"style\",{\"crossOrigin\":\"\"}]\n0:\"$L3\"\n"])</script><script>self.__next_f.push([1,"4:I[47690,[],\"\"]\n6:I[77831,[],\"\"]\n7:I[82989,[\"665\",\"static/chunks/3014691f-b24e8254c7593934.js\",\"936\",\"static/chunks/2f6dbc85-cac2949a76539886.js\",\"902\",\"static/chunks/902-58bf23027703b2e8.js\",\"131\",\"static/chunks/131-4ee1d633e8928742.js\",\"684\",\"static/chunks/684-16b194c83a169f6d.js\",\"626\",\"static/chunks/626-0c564a21577c9c53.js\",\"777\",\"static/chunks/777-80eb84a5285bfa2d.js\",\"931\",\"static/chunks/app/page-413af091866cb902.js\"],\"\"]\n8:I[5613,[],\"\"]\n9:I[31778,[],\"\"]\nb:I[48955,[],\"\"]\nc:[]\n"])</script><script>self.__next_f.push([1,"3:[[[\"$\",\"link\",\"0\",{\"rel\":\"stylesheet\",\"href\":\"/ui/_n
ext/static/css/8fbba1b67a4788fc.css\",\"precedence\":\"next\",\"crossOrigin\":\"\"}]],[\"$\",\"$L4\",null,{\"buildId\":\"4u3imMIH2UVoP8L-yPCjs\",\"assetPrefix\":\"/ui\",\"initialCanonicalUrl\":\"/\",\"initialTree\":[\"\",{\"children\":[\"__PAGE__\",{}]},\"$undefined\",\"$undefined\",true],\"initialSeedData\":[\"\",{\"children\":[\"__PAGE__\",{},[\"$L5\",[\"$\",\"$L6\",null,{\"propsForComponent\":{\"params\":{}},\"Component\":\"$7\",\"isStaticGeneration\":true}],null]]},[null,[\"$\",\"html\",null,{\"lang\":\"en\",\"children\":[\"$\",\"body\",null,{\"className\":\"__className_12bbc4\",\"children\":[\"$\",\"$L8\",null,{\"parallelRouterKey\":\"children\",\"segmentPath\":[\"children\"],\"loading\":\"$undefined\",\"loadingStyles\":\"$undefined\",\"loadingScripts\":\"$undefined\",\"hasLoading\":false,\"error\":\"$undefined\",\"errorStyles\":\"$undefined\",\"errorScripts\":\"$undefined\",\"template\":[\"$\",\"$L9\",null,{}],\"templateStyles\":\"$undefined\",\"templateScripts\":\"$undefined\",\"notFound\":[[\"$\",\"title\",null,{\"children\":\"404: This page could not be found.\"}],[\"$\",\"div\",null,{\"style\":{\"fontFamily\":\"system-ui,\\\"Segoe UI\\\",Roboto,Helvetica,Arial,sans-serif,\\\"Apple Color Emoji\\\",\\\"Segoe UI Emoji\\\"\",\"height\":\"100vh\",\"textAlign\":\"center\",\"display\":\"flex\",\"flexDirection\":\"column\",\"alignItems\":\"center\",\"justifyContent\":\"center\"},\"children\":[\"$\",\"div\",null,{\"children\":[[\"$\",\"style\",null,{\"dangerouslySetInnerHTML\":{\"__html\":\"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}\"}}],[\"$\",\"h1\",null,{\"className\":\"next-error-h1\",\"style\":{\"display\":\"inline-block\",\"margin\":\"0 20px 0 0\",\"padding\":\"0 23px 0 
0\",\"fontSize\":24,\"fontWeight\":500,\"verticalAlign\":\"top\",\"lineHeight\":\"49px\"},\"children\":\"404\"}],[\"$\",\"div\",null,{\"style\":{\"display\":\"inline-block\"},\"children\":[\"$\",\"h2\",null,{\"style\":{\"fontSize\":14,\"fontWeight\":400,\"lineHeight\":\"49px\",\"margin\":0},\"children\":\"This page could not be found.\"}]}]]}]}]],\"notFoundStyles\":[],\"styles\":null}]}]}],null]],\"initialHead\":[false,\"$La\"],\"globalErrorComponent\":\"$b\",\"missingSlots\":\"$Wc\"}]]\n"])</script><script>self.__next_f.push([1,"a:[[\"$\",\"meta\",\"0\",{\"name\":\"viewport\",\"content\":\"width=device-width, initial-scale=1\"}],[\"$\",\"meta\",\"1\",{\"charSet\":\"utf-8\"}],[\"$\",\"title\",\"2\",{\"children\":\"LiteLLM Dashboard\"}],[\"$\",\"meta\",\"3\",{\"name\":\"description\",\"content\":\"LiteLLM Proxy Admin UI\"}],[\"$\",\"link\",\"4\",{\"rel\":\"icon\",\"href\":\"/ui/favicon.ico\",\"type\":\"image/x-icon\",\"sizes\":\"16x16\"}],[\"$\",\"meta\",\"5\",{\"name\":\"next-size-adjust\"}]]\n5:null\n"])</script><script>self.__next_f.push([1,""])</script></body></html>
--- a/litellm/proxy/_experimental/out/index.txt
+++ b/litellm/proxy/_experimental/out/index.txt
@ -1,7 +1,7 @@
 2:I[77831,[],""]
-3:I[82989,["665","static/chunks/3014691f-b24e8254c7593934.js","936","static/chunks/2f6dbc85-cac2949a76539886.js","902","static/chunks/902-292bb6a83427dbc7.js","131","static/chunks/131-4ee1d633e8928742.js","684","static/chunks/684-16b194c83a169f6d.js","626","static/chunks/626-0c564a21577c9c53.js","777","static/chunks/777-9d9df0b75010dbf9.js","931","static/chunks/app/page-a952da77e0730c7c.js"],""]
+3:I[82989,["665","static/chunks/3014691f-b24e8254c7593934.js","936","static/chunks/2f6dbc85-cac2949a76539886.js","902","static/chunks/902-58bf23027703b2e8.js","131","static/chunks/131-4ee1d633e8928742.js","684","static/chunks/684-16b194c83a169f6d.js","626","static/chunks/626-0c564a21577c9c53.js","777","static/chunks/777-80eb84a5285bfa2d.js","931","static/chunks/app/page-413af091866cb902.js"],""]
 4:I[5613,[],""]
 5:I[31778,[],""]
-0:["pDx3dChtj-paUmJExuV6u",[[["",{"children":["__PAGE__",{}]},"$undefined","$undefined",true],["",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_12bbc4","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/ea3759ed931c00b2.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
+0:["4u3imMIH2UVoP8L-yPCjs",[[["",{"children":["__PAGE__",{}]},"$undefined","$undefined",true],["",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_12bbc4","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/8fbba1b67a4788fc.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
 6:[["$","meta","0",{"name":"viewport","content":"width=device-width, initial-scale=1"}],["$","meta","1",{"charSet":"utf-8"}],["$","title","2",{"children":"LiteLLM Dashboard"}],["$","meta","3",{"name":"description","content":"LiteLLM Proxy Admin UI"}],["$","link","4",{"rel":"icon","href":"/ui/favicon.ico","type":"image/x-icon","sizes":"16x16"}],["$","meta","5",{"name":"next-size-adjust"}]]
 1:null
--- a/litellm/proxy/_experimental/out/model_hub.html
+++ b/litellm/proxy/_experimental/out/model_hub.html
--- a/litellm/proxy/_experimental/out/model_hub.txt
+++ b/litellm/proxy/_experimental/out/model_hub.txt
@ -1,7 +1,7 @@
 2:I[77831,[],""]
-3:I[87494,["902","static/chunks/902-292bb6a83427dbc7.js","131","static/chunks/131-4ee1d633e8928742.js","777","static/chunks/777-9d9df0b75010dbf9.js","418","static/chunks/app/model_hub/page-748a83a8e772a56b.js"],""]
+3:I[87494,["902","static/chunks/902-58bf23027703b2e8.js","131","static/chunks/131-4ee1d633e8928742.js","777","static/chunks/777-80eb84a5285bfa2d.js","418","static/chunks/app/model_hub/page-748a83a8e772a56b.js"],""]
 4:I[5613,[],""]
 5:I[31778,[],""]
-0:["pDx3dChtj-paUmJExuV6u",[[["",{"children":["model_hub",{"children":["__PAGE__",{}]}]},"$undefined","$undefined",true],["",{"children":["model_hub",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children","model_hub","children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":"$undefined","notFoundStyles":"$undefined","styles":null}]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_12bbc4","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 
0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/ea3759ed931c00b2.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
+0:["4u3imMIH2UVoP8L-yPCjs",[[["",{"children":["model_hub",{"children":["__PAGE__",{}]}]},"$undefined","$undefined",true],["",{"children":["model_hub",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children","model_hub","children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":"$undefined","notFoundStyles":"$undefined","styles":null}]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_12bbc4","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 
0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/8fbba1b67a4788fc.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
 6:[["$","meta","0",{"name":"viewport","content":"width=device-width, initial-scale=1"}],["$","meta","1",{"charSet":"utf-8"}],["$","title","2",{"children":"LiteLLM Dashboard"}],["$","meta","3",{"name":"description","content":"LiteLLM Proxy Admin UI"}],["$","link","4",{"rel":"icon","href":"/ui/favicon.ico","type":"image/x-icon","sizes":"16x16"}],["$","meta","5",{"name":"next-size-adjust"}]]
 1:null
--- a/litellm/proxy/_experimental/out/onboarding.html
+++ b/litellm/proxy/_experimental/out/onboarding.html
--- a/litellm/proxy/_experimental/out/onboarding.txt
+++ b/litellm/proxy/_experimental/out/onboarding.txt
@ -1,7 +1,7 @@
 2:I[77831,[],""]
-3:I[667,["665","static/chunks/3014691f-b24e8254c7593934.js","902","static/chunks/902-292bb6a83427dbc7.js","684","static/chunks/684-16b194c83a169f6d.js","777","static/chunks/777-9d9df0b75010dbf9.js","461","static/chunks/app/onboarding/page-884a15d08f8be397.js"],""]
+3:I[667,["665","static/chunks/3014691f-b24e8254c7593934.js","902","static/chunks/902-58bf23027703b2e8.js","684","static/chunks/684-16b194c83a169f6d.js","777","static/chunks/777-80eb84a5285bfa2d.js","461","static/chunks/app/onboarding/page-884a15d08f8be397.js"],""]
 4:I[5613,[],""]
 5:I[31778,[],""]
-0:["pDx3dChtj-paUmJExuV6u",[[["",{"children":["onboarding",{"children":["__PAGE__",{}]}]},"$undefined","$undefined",true],["",{"children":["onboarding",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children","onboarding","children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":"$undefined","notFoundStyles":"$undefined","styles":null}]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_12bbc4","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 
0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/ea3759ed931c00b2.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
+0:["4u3imMIH2UVoP8L-yPCjs",[[["",{"children":["onboarding",{"children":["__PAGE__",{}]}]},"$undefined","$undefined",true],["",{"children":["onboarding",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children","onboarding","children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":"$undefined","notFoundStyles":"$undefined","styles":null}]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_12bbc4","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 
0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/8fbba1b67a4788fc.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
 6:[["$","meta","0",{"name":"viewport","content":"width=device-width, initial-scale=1"}],["$","meta","1",{"charSet":"utf-8"}],["$","title","2",{"children":"LiteLLM Dashboard"}],["$","meta","3",{"name":"description","content":"LiteLLM Proxy Admin UI"}],["$","link","4",{"rel":"icon","href":"/ui/favicon.ico","type":"image/x-icon","sizes":"16x16"}],["$","meta","5",{"name":"next-size-adjust"}]]
 1:null
--- a/litellm/proxy/_new_secret_config.yaml
+++ b/litellm/proxy/_new_secret_config.yaml
@ -11,44 +11,28 @@ model_list:
      model: vertex_ai/claude-3-5-sonnet-v2
      vertex_ai_project: "adroit-crow-413218"
      vertex_ai_location: "us-east5"  
-  - model_name: openai-gpt-4o-realtime-audio
+  - model_name: fake-openai-endpoint
    litellm_params:
-      model: openai/gpt-4o-realtime-preview-2024-10-01
-      api_key: os.environ/OPENAI_API_KEY
-  - model_name: openai/*
-    litellm_params:
-      model: openai/*
-      api_key: os.environ/OPENAI_API_KEY
-  - model_name: openai/*
-    litellm_params:
-      model: openai/*
-      api_key: os.environ/OPENAI_API_KEY
-    model_info:
-      access_groups: ["public-openai-models"] 
-  - model_name: openai/gpt-4o
-    litellm_params:
-      model: openai/gpt-4o
-      api_key: os.environ/OPENAI_API_KEY
-    model_info:
-      access_groups: ["private-openai-models"] 
-      
+      model: openai/fake
+      api_key: fake-key
+      api_base: https://exampleopenaiendpoint-production.up.railway.app/
+
 router_settings:
-  routing_strategy: usage-based-routing-v2
-  #redis_url: "os.environ/REDIS_URL"
-  redis_host: "os.environ/REDIS_HOST"
-  redis_port: "os.environ/REDIS_PORT"
+  model_group_alias:
+    "gpt-4-turbo": # Aliased model name
+      model: "gpt-4"  # Actual model name in 'model_list'
+      hidden: true

 litellm_settings:
-  cache: true
-  cache_params:
-    type: redis
-    host: "os.environ/REDIS_HOST"
-    port: "os.environ/REDIS_PORT"
-    namespace: "litellm.caching"
-    ttl: 600
-#   key_generation_settings:
-#     team_key_generation:
-#       allowed_team_member_roles: ["admin"]
-#       required_params: ["tags"] # require team admins to set tags for cost-tracking when generating a team key
-#     personal_key_generation: # maps to 'Default Team' on UI 
-#       allowed_user_roles: ["proxy_admin"]
+  default_team_settings: 
+    - team_id: team-1
+      success_callback: ["langfuse"]
+      failure_callback: ["langfuse"]
+      langfuse_public_key: os.environ/LANGFUSE_PROJECT1_PUBLIC # Project 1
+      langfuse_secret: os.environ/LANGFUSE_PROJECT1_SECRET # Project 1
+    - team_id: team-2
+      success_callback: ["langfuse"]
+      failure_callback: ["langfuse"]
+      langfuse_public_key: os.environ/LANGFUSE_PROJECT2_PUBLIC # Project 2
+      langfuse_secret: os.environ/LANGFUSE_PROJECT2_SECRET # Project 2
+      langfuse_host: https://us.cloud.langfuse.com
--- a/litellm/proxy/_types.py
+++ b/litellm/proxy/_types.py
@ -2,7 +2,6 @@ import enum
 import json
 import os
 import sys
-import traceback
 import uuid
 from dataclasses import fields
 from datetime import datetime
@ -13,15 +12,7 @@ from typing_extensions import Annotated, TypedDict

 from litellm.types.integrations.slack_alerting import AlertType
 from litellm.types.router import RouterErrors, UpdateRouterConfig
-from litellm.types.utils import (
-    EmbeddingResponse,
-    ImageResponse,
-    ModelResponse,
-    ProviderField,
-    StandardCallbackDynamicParams,
-    StandardPassThroughResponseObject,
-    TextCompletionResponse,
-)
+from litellm.types.utils import ProviderField, StandardCallbackDynamicParams

 if TYPE_CHECKING:
    from opentelemetry.trace import Span as _Span
@ -891,7 +882,15 @@ class DeleteCustomerRequest(LiteLLMBase):
    user_ids: List[str]


-class MemberBase(LiteLLMBase):
+class Member(LiteLLMBase):
+    role: Literal[
+        LitellmUserRoles.ORG_ADMIN,
+        LitellmUserRoles.INTERNAL_USER,
+        LitellmUserRoles.INTERNAL_USER_VIEW_ONLY,
+        # older Member roles
+        "admin",
+        "user",
+    ]
    user_id: Optional[str] = None
    user_email: Optional[str] = None

@ -905,21 +904,6 @@ class MemberBase(LiteLLMBase):
        return values


-class Member(MemberBase):
-    role: Literal[
-        "admin",
-        "user",
-    ]
-
-
-class OrgMember(MemberBase):
-    role: Literal[
-        LitellmUserRoles.ORG_ADMIN,
-        LitellmUserRoles.INTERNAL_USER,
-        LitellmUserRoles.INTERNAL_USER_VIEW_ONLY,
-    ]
-
-
 class TeamBase(LiteLLMBase):
    team_alias: Optional[str] = None
    team_id: Optional[str] = None
@ -1985,25 +1969,6 @@ class MemberAddRequest(LiteLLMBase):
        super().__init__(**data)


-class OrgMemberAddRequest(LiteLLMBase):
-    member: Union[List[OrgMember], OrgMember]
-
-    def __init__(self, **data):
-        member_data = data.get("member")
-        if isinstance(member_data, list):
-            # If member is a list of dictionaries, convert each dictionary to a Member object
-            members = [OrgMember(**item) for item in member_data]
-            # Replace member_data with the list of Member objects
-            data["member"] = members
-        elif isinstance(member_data, dict):
-            # If member is a dictionary, convert it to a single Member object
-            member = OrgMember(**member_data)
-            # Replace member_data with the single Member object
-            data["member"] = member
-        # Call the superclass __init__ method to initialize the object
-        super().__init__(**data)
-
-
 class TeamAddMemberResponse(LiteLLM_TeamTable):
    updated_users: List[LiteLLM_UserTable]
    updated_team_memberships: List[LiteLLM_TeamMembership]
@ -2052,7 +2017,7 @@ class TeamMemberUpdateResponse(MemberUpdateResponse):


 # Organization Member Requests
-class OrganizationMemberAddRequest(OrgMemberAddRequest):
+class OrganizationMemberAddRequest(MemberAddRequest):
    organization_id: str
    max_budget_in_organization: Optional[float] = (
        None  # Users max budget within the organization
@ -2110,7 +2075,6 @@ class SpecialHeaders(enum.Enum):
    openai_authorization = "Authorization"
    azure_authorization = "API-Key"
    anthropic_authorization = "x-api-key"
-    google_ai_studio_authorization = "x-goog-api-key"


 class LitellmDataForBackendLLMCall(TypedDict, total=False):
@ -2169,25 +2133,3 @@ class UserManagementEndpointParamDocStringEnums(str, enum.Enum):
    spend_doc_str = """Optional[float] - Amount spent by user. Default is 0. Will be updated by proxy whenever user is used."""
    team_id_doc_str = """Optional[str] - [DEPRECATED PARAM] The team id of the user. Default is None."""
    duration_doc_str = """Optional[str] - Duration for the key auto-created on `/user/new`. Default is None."""
-
-
-PassThroughEndpointLoggingResultValues = Union[
-    ModelResponse,
-    TextCompletionResponse,
-    ImageResponse,
-    EmbeddingResponse,
-    StandardPassThroughResponseObject,
-]
-
-
-class PassThroughEndpointLoggingTypedDict(TypedDict):
-    result: Optional[PassThroughEndpointLoggingResultValues]
-    kwargs: dict
-
-
-LiteLLM_ManagementEndpoint_MetadataFields = [
-    "model_rpm_limit",
-    "model_tpm_limit",
-    "guardrails",
-    "tags",
-]
--- a/litellm/proxy/auth/auth_checks.py
+++ b/litellm/proxy/auth/auth_checks.py
@ -60,7 +60,6 @@ def common_checks(  # noqa: PLR0915
    global_proxy_spend: Optional[float],
    general_settings: dict,
    route: str,
-    llm_router: Optional[litellm.Router],
 ) -> bool:
    """
    Common checks across jwt + key-based auth.
@ -98,12 +97,7 @@ def common_checks(  # noqa: PLR0915
            # this means the team has access to all models on the proxy
            pass
        # check if the team model is an access_group
-        elif (
-            model_in_access_group(
-                model=_model, team_models=team_object.models, llm_router=llm_router
-            )
-            is True
-        ):
+        elif model_in_access_group(_model, team_object.models) is True:
            pass
        elif _model and "*" in _model:
            pass
@ -379,33 +373,36 @@ async def get_end_user_object(
        return None


-def model_in_access_group(
-    model: str, team_models: Optional[List[str]], llm_router: Optional[litellm.Router]
-) -> bool:
+def model_in_access_group(model: str, team_models: Optional[List[str]]) -> bool:
    from collections import defaultdict

+    from litellm.proxy.proxy_server import llm_router
+
    if team_models is None:
        return True
    if model in team_models:
        return True

-    access_groups: dict[str, list[str]] = defaultdict(list)
+    access_groups = defaultdict(list)
    if llm_router:
-        access_groups = llm_router.get_model_access_groups(model_name=model)
+        access_groups = llm_router.get_model_access_groups()

+    models_in_current_access_groups = []
    if len(access_groups) > 0:  # check if token contains any model access groups
        for idx, m in enumerate(
            team_models
        ):  # loop token models, if any of them are an access group add the access group
            if m in access_groups:
-                return True
+                # if it is an access group we need to remove it from valid_token.models
+                models_in_group = access_groups[m]
+                models_in_current_access_groups.extend(models_in_group)

    # Filter out models that are access_groups
    filtered_models = [m for m in team_models if m not in access_groups]
+    filtered_models += models_in_current_access_groups

    if model in filtered_models:
        return True
-
    return False


@ -589,63 +586,26 @@ async def _get_team_db_check(team_id: str, prisma_client: PrismaClient):
    )


-async def _get_team_object_from_db(team_id: str, prisma_client: PrismaClient):
-    return await prisma_client.db.litellm_teamtable.find_unique(
-        where={"team_id": team_id}
-    )
-
-
-async def _get_team_object_from_user_api_key_cache(
+async def get_team_object(
    team_id: str,
-    prisma_client: PrismaClient,
+    prisma_client: Optional[PrismaClient],
    user_api_key_cache: DualCache,
-    last_db_access_time: LimitedSizeOrderedDict,
-    db_cache_expiry: int,
-    proxy_logging_obj: Optional[ProxyLogging],
-    key: str,
+    parent_otel_span: Optional[Span] = None,
+    proxy_logging_obj: Optional[ProxyLogging] = None,
+    check_cache_only: Optional[bool] = None,
 ) -> LiteLLM_TeamTableCachedObj:
-    db_access_time_key = key
-    should_check_db = _should_check_db(
-        key=db_access_time_key,
-        last_db_access_time=last_db_access_time,
-        db_cache_expiry=db_cache_expiry,
-    )
-    if should_check_db:
-        response = await _get_team_db_check(
-            team_id=team_id, prisma_client=prisma_client
+    """
+    - Check if team id in proxy Team Table
+    - if valid, return LiteLLM_TeamTable object with defined limits
+    - if not, then raise an error
+    """
+    if prisma_client is None:
+        raise Exception(
+            "No DB Connected. See - https://docs.litellm.ai/docs/proxy/virtual_keys"
        )
-    else:
-        response = None

-    if response is None:
-        raise Exception
-
-    _response = LiteLLM_TeamTableCachedObj(**response.dict())
-    # save the team object to cache
-    await _cache_team_object(
-        team_id=team_id,
-        team_table=_response,
-        user_api_key_cache=user_api_key_cache,
-        proxy_logging_obj=proxy_logging_obj,
-    )
-
-    # save to db access time
-    # save to db access time
-    _update_last_db_access_time(
-        key=db_access_time_key,
-        value=_response,
-        last_db_access_time=last_db_access_time,
-    )
-
-    return _response
-
-
-async def _get_team_object_from_cache(
-    key: str,
-    proxy_logging_obj: Optional[ProxyLogging],
-    user_api_key_cache: DualCache,
-    parent_otel_span: Optional[Span],
-) -> Optional[LiteLLM_TeamTableCachedObj]:
+    # check if in cache
+    key = "team_id:{}".format(team_id)
    cached_team_obj: Optional[LiteLLM_TeamTableCachedObj] = None

    ## CHECK REDIS CACHE ##
@ -653,7 +613,6 @@ async def _get_team_object_from_cache(
        proxy_logging_obj is not None
        and proxy_logging_obj.internal_usage_cache.dual_cache
    ):
-
        cached_team_obj = (
            await proxy_logging_obj.internal_usage_cache.dual_cache.async_get_cache(
                key=key, parent_otel_span=parent_otel_span
@ -669,58 +628,47 @@ async def _get_team_object_from_cache(
        elif isinstance(cached_team_obj, LiteLLM_TeamTableCachedObj):
            return cached_team_obj

-    return None
-
-
-async def get_team_object(
-    team_id: str,
-    prisma_client: Optional[PrismaClient],
-    user_api_key_cache: DualCache,
-    parent_otel_span: Optional[Span] = None,
-    proxy_logging_obj: Optional[ProxyLogging] = None,
-    check_cache_only: Optional[bool] = None,
-    check_db_only: Optional[bool] = None,
-) -> LiteLLM_TeamTableCachedObj:
-    """
-    - Check if team id in proxy Team Table
-    - if valid, return LiteLLM_TeamTable object with defined limits
-    - if not, then raise an error
-    """
-    if prisma_client is None:
+    if check_cache_only:
        raise Exception(
-            "No DB Connected. See - https://docs.litellm.ai/docs/proxy/virtual_keys"
+            f"Team doesn't exist in cache + check_cache_only=True. Team={team_id}."
        )

-    # check if in cache
-    key = "team_id:{}".format(team_id)
-
-    if not check_db_only:
-        cached_team_obj = await _get_team_object_from_cache(
-            key=key,
-            proxy_logging_obj=proxy_logging_obj,
-            user_api_key_cache=user_api_key_cache,
-            parent_otel_span=parent_otel_span,
-        )
-
-        if cached_team_obj is not None:
-            return cached_team_obj
-
-        if check_cache_only:
-            raise Exception(
-                f"Team doesn't exist in cache + check_cache_only=True. Team={team_id}."
-            )
-
    # else, check db
    try:
-        return await _get_team_object_from_user_api_key_cache(
-            team_id=team_id,
-            prisma_client=prisma_client,
-            user_api_key_cache=user_api_key_cache,
-            proxy_logging_obj=proxy_logging_obj,
+        db_access_time_key = "team_id:{}".format(team_id)
+        should_check_db = _should_check_db(
+            key=db_access_time_key,
            last_db_access_time=last_db_access_time,
            db_cache_expiry=db_cache_expiry,
-            key=key,
        )
+        if should_check_db:
+            response = await _get_team_db_check(
+                team_id=team_id, prisma_client=prisma_client
+            )
+        else:
+            response = None
+
+        if response is None:
+            raise Exception
+
+        _response = LiteLLM_TeamTableCachedObj(**response.dict())
+        # save the team object to cache
+        await _cache_team_object(
+            team_id=team_id,
+            team_table=_response,
+            user_api_key_cache=user_api_key_cache,
+            proxy_logging_obj=proxy_logging_obj,
+        )
+
+        # save to db access time
+        # save to db access time
+        _update_last_db_access_time(
+            key=db_access_time_key,
+            value=_response,
+            last_db_access_time=last_db_access_time,
+        )
+
+        return _response
    except Exception:
        raise Exception(
            f"Team doesn't exist in db. Team={team_id}. Create team via `/team/new` call."
@ -877,10 +825,7 @@ async def get_org_object(


 async def can_key_call_model(
-    model: str,
-    llm_model_list: Optional[list],
-    valid_token: UserAPIKeyAuth,
-    llm_router: Optional[litellm.Router],
+    model: str, llm_model_list: Optional[list], valid_token: UserAPIKeyAuth
 ) -> Literal[True]:
    """
    Checks if token can call a given model
@ -900,29 +845,35 @@ async def can_key_call_model(
    )
    from collections import defaultdict

+    from litellm.proxy.proxy_server import llm_router
+
    access_groups = defaultdict(list)
    if llm_router:
-        access_groups = llm_router.get_model_access_groups(model_name=model)
+        access_groups = llm_router.get_model_access_groups()

-    if (
-        len(access_groups) > 0 and llm_router is not None
-    ):  # check if token contains any model access groups
+    models_in_current_access_groups = []
+    if len(access_groups) > 0:  # check if token contains any model access groups
        for idx, m in enumerate(
            valid_token.models
        ):  # loop token models, if any of them are an access group add the access group
            if m in access_groups:
-                return True
+                # if it is an access group we need to remove it from valid_token.models
+                models_in_group = access_groups[m]
+                models_in_current_access_groups.extend(models_in_group)

    # Filter out models that are access_groups
    filtered_models = [m for m in valid_token.models if m not in access_groups]

+    filtered_models += models_in_current_access_groups
    verbose_proxy_logger.debug(f"model: {model}; allowed_models: {filtered_models}")

    all_model_access: bool = False

    if (
-        len(filtered_models) == 0 and len(valid_token.models) == 0
-    ) or "*" in filtered_models:
+        len(filtered_models) == 0
+        or "*" in filtered_models
+        or "openai/*" in filtered_models
+    ):
        all_model_access = True

    if model is not None and model not in filtered_models and all_model_access is False:
--- a/litellm/proxy/auth/user_api_key_auth.py
+++ b/litellm/proxy/auth/user_api_key_auth.py
@ -28,8 +28,6 @@ from fastapi import (
    Request,
    Response,
    UploadFile,
-    WebSocket,
-    WebSocketDisconnect,
    status,
 )
 from fastapi.middleware.cors import CORSMiddleware
@ -97,11 +95,6 @@ anthropic_api_key_header = APIKeyHeader(
    auto_error=False,
    description="If anthropic client used.",
 )
-google_ai_studio_api_key_header = APIKeyHeader(
-    name=SpecialHeaders.google_ai_studio_authorization.value,
-    auto_error=False,
-    description="If google ai studio client used.",
-)


 def _get_bearer_token(
@ -197,52 +190,6 @@ def _is_allowed_route(
        )


-async def user_api_key_auth_websocket(websocket: WebSocket):
-    # Accept the WebSocket connection
-
-    request = Request(scope={"type": "http"})
-    request._url = websocket.url
-
-    query_params = websocket.query_params
-
-    model = query_params.get("model")
-
-    async def return_body():
-        return_string = f'{{"model": "{model}"}}'
-        # return string as bytes
-        return return_string.encode()
-
-    request.body = return_body  # type: ignore
-
-    # Extract the Authorization header
-    authorization = websocket.headers.get("authorization")
-
-    # If no Authorization header, try the api-key header
-    if not authorization:
-        api_key = websocket.headers.get("api-key")
-        if not api_key:
-            await websocket.close(code=status.WS_1008_POLICY_VIOLATION)
-            raise HTTPException(status_code=403, detail="No API key provided")
-    else:
-        # Extract the API key from the Bearer token
-        if not authorization.startswith("Bearer "):
-            await websocket.close(code=status.WS_1008_POLICY_VIOLATION)
-            raise HTTPException(
-                status_code=403, detail="Invalid Authorization header format"
-            )
-
-        api_key = authorization[len("Bearer ") :].strip()
-
-    # Call user_api_key_auth with the extracted API key
-    # Note: You'll need to modify this to work with WebSocket context if needed
-    try:
-        return await user_api_key_auth(request=request, api_key=f"Bearer {api_key}")
-    except Exception as e:
-        verbose_proxy_logger.exception(e)
-        await websocket.close(code=status.WS_1008_POLICY_VIOLATION)
-        raise HTTPException(status_code=403, detail=str(e))
-
-
 async def user_api_key_auth(  # noqa: PLR0915
    request: Request,
    api_key: str = fastapi.Security(api_key_header),
@ -250,16 +197,12 @@ async def user_api_key_auth(  # noqa: PLR0915
    anthropic_api_key_header: Optional[str] = fastapi.Security(
        anthropic_api_key_header
    ),
-    google_ai_studio_api_key_header: Optional[str] = fastapi.Security(
-        google_ai_studio_api_key_header
-    ),
 ) -> UserAPIKeyAuth:
    from litellm.proxy.proxy_server import (
        general_settings,
        jwt_handler,
        litellm_proxy_admin_name,
        llm_model_list,
-        llm_router,
        master_key,
        open_telemetry_logger,
        prisma_client,
@ -290,8 +233,6 @@ async def user_api_key_auth(  # noqa: PLR0915
            api_key = azure_api_key_header
        elif isinstance(anthropic_api_key_header, str):
            api_key = anthropic_api_key_header
-        elif isinstance(google_ai_studio_api_key_header, str):
-            api_key = google_ai_studio_api_key_header
        elif pass_through_endpoints is not None:
            for endpoint in pass_through_endpoints:
                if endpoint.get("path", "") == route:
@ -543,7 +484,6 @@ async def user_api_key_auth(  # noqa: PLR0915
                    general_settings=general_settings,
                    global_proxy_spend=global_proxy_spend,
                    route=route,
-                    llm_router=llm_router,
                )

                # return UserAPIKeyAuth object
@ -907,7 +847,6 @@ async def user_api_key_auth(  # noqa: PLR0915
                        model=model,
                        llm_model_list=llm_model_list,
                        valid_token=valid_token,
-                        llm_router=llm_router,
                    )

                if fallback_models is not None:
@ -916,7 +855,6 @@ async def user_api_key_auth(  # noqa: PLR0915
                            model=m,
                            llm_model_list=llm_model_list,
                            valid_token=valid_token,
-                            llm_router=llm_router,
                        )

            # Check 2. If user_id for this token is in budget - done in common_checks()
@ -1177,7 +1115,6 @@ async def user_api_key_auth(  # noqa: PLR0915
                general_settings=general_settings,
                global_proxy_spend=global_proxy_spend,
                route=route,
-                llm_router=llm_router,
            )
            # Token passed all checks
            if valid_token is None:
@ -1250,15 +1187,13 @@ async def user_api_key_auth(  # noqa: PLR0915
            extra={"requester_ip": requester_ip},
        )

-        # Log this exception to OTEL, Datadog etc
-        asyncio.create_task(
-            proxy_logging_obj.async_log_proxy_authentication_errors(
+        # Log this exception to OTEL
+        if open_telemetry_logger is not None:
+            await open_telemetry_logger.async_post_call_failure_hook(  # type: ignore
                original_exception=e,
-                request=request,
-                parent_otel_span=parent_otel_span,
-                api_key=api_key,
+                request_data={},
+                user_api_key_dict=UserAPIKeyAuth(parent_otel_span=parent_otel_span),
            )
-        )

        if isinstance(e, litellm.BudgetExceededError):
            raise ProxyException(
--- a/litellm/proxy/common_utils/http_parsing_utils.py
+++ b/litellm/proxy/common_utils/http_parsing_utils.py
@ -1,6 +1,6 @@
 import ast
 import json
-from typing import Dict, List, Optional
+from typing import List, Optional

 from fastapi import Request, UploadFile, status

@ -8,43 +8,31 @@ from litellm._logging import verbose_proxy_logger
 from litellm.types.router import Deployment


-async def _read_request_body(request: Optional[Request]) -> Dict:
+async def _read_request_body(request: Optional[Request]) -> dict:
    """
-    Safely read the request body and parse it as JSON.
+    Asynchronous function to read the request body and parse it as JSON or literal data.

    Parameters:
    - request: The request object to read the body from

    Returns:
-    - dict: Parsed request data as a dictionary or an empty dictionary if parsing fails
+    - dict: Parsed request data as a dictionary
    """
    try:
+        request_data: dict = {}
        if request is None:
-            return {}
-
-        # Read the request body
+            return request_data
        body = await request.body()

-        # Return empty dict if body is empty or None
-        if not body:
-            return {}
-
-        # Decode the body to a string
+        if body == b"" or body is None:
+            return request_data
        body_str = body.decode()
-
-        # Attempt JSON parsing (safe for untrusted input)
-        return json.loads(body_str)
-
-    except json.JSONDecodeError:
-        # Log detailed information for debugging
-        verbose_proxy_logger.exception("Invalid JSON payload received.")
-        return {}
-
-    except Exception as e:
-        # Catch unexpected errors to avoid crashes
-        verbose_proxy_logger.exception(
-            "Unexpected error reading request body - {}".format(e)
-        )
+        try:
+            request_data = ast.literal_eval(body_str)
+        except Exception:
+            request_data = json.loads(body_str)
+        return request_data
+    except Exception:
        return {}


--- a/litellm/proxy/guardrails/guardrail_hooks/bedrock_guardrails.py
+++ b/litellm/proxy/guardrails/guardrail_hooks/bedrock_guardrails.py
@ -214,10 +214,10 @@ class BedrockGuardrail(CustomGuardrail, BaseAWSLLM):
            prepared_request.url,
            prepared_request.headers,
        )
-
+        _json_data = json.dumps(request_data)  # type: ignore
        response = await self.async_handler.post(
            url=prepared_request.url,
-            data=prepared_request.body,  # type: ignore
+            json=request_data,  # type: ignore
            headers=prepared_request.headers,  # type: ignore
        )
        verbose_proxy_logger.debug("Bedrock AI response: %s", response.text)
--- a/litellm/proxy/hooks/proxy_failure_handler.py
+++ b/litellm/proxy/hooks/proxy_failure_handler.py
@ -1,87 +0,0 @@
-"""
-Runs when LLM Exceptions occur on LiteLLM Proxy
-"""
-
-import copy
-import json
-import uuid
-
-import litellm
-from litellm.proxy._types import LiteLLM_ErrorLogs
-
-
-async def _PROXY_failure_handler(
-    kwargs,  # kwargs to completion
-    completion_response: litellm.ModelResponse,  # response from completion
-    start_time=None,
-    end_time=None,  # start/end time for completion
-):
-    """
-    Async Failure Handler - runs when LLM Exceptions occur on LiteLLM Proxy.
-    This function logs the errors to the Prisma DB
-
-    Can be disabled by setting the following on proxy_config.yaml:
-    ```yaml
-    general_settings:
-      disable_error_logs: True
-    ```
-
-    """
-    from litellm._logging import verbose_proxy_logger
-    from litellm.proxy.proxy_server import general_settings, prisma_client
-
-    if general_settings.get("disable_error_logs") is True:
-        return
-
-    if prisma_client is not None:
-        verbose_proxy_logger.debug(
-            "inside _PROXY_failure_handler kwargs=", extra=kwargs
-        )
-
-        _exception = kwargs.get("exception")
-        _exception_type = _exception.__class__.__name__
-        _model = kwargs.get("model", None)
-
-        _optional_params = kwargs.get("optional_params", {})
-        _optional_params = copy.deepcopy(_optional_params)
-
-        for k, v in _optional_params.items():
-            v = str(v)
-            v = v[:100]
-
-        _status_code = "500"
-        try:
-            _status_code = str(_exception.status_code)
-        except Exception:
-            # Don't let this fail logging the exception to the dB
-            pass
-
-        _litellm_params = kwargs.get("litellm_params", {}) or {}
-        _metadata = _litellm_params.get("metadata", {}) or {}
-        _model_id = _metadata.get("model_info", {}).get("id", "")
-        _model_group = _metadata.get("model_group", "")
-        api_base = litellm.get_api_base(model=_model, optional_params=_litellm_params)
-        _exception_string = str(_exception)
-
-        error_log = LiteLLM_ErrorLogs(
-            request_id=str(uuid.uuid4()),
-            model_group=_model_group,
-            model_id=_model_id,
-            litellm_model_name=kwargs.get("model"),
-            request_kwargs=_optional_params,
-            api_base=api_base,
-            exception_type=_exception_type,
-            status_code=_status_code,
-            exception_string=_exception_string,
-            startTime=kwargs.get("start_time"),
-            endTime=kwargs.get("end_time"),
-        )
-
-        error_log_dict = error_log.model_dump()
-        error_log_dict["request_kwargs"] = json.dumps(error_log_dict["request_kwargs"])
-
-        await prisma_client.db.litellm_errorlogs.create(
-            data=error_log_dict  # type: ignore
-        )
-
-    pass
--- a/litellm/proxy/litellm_pre_call_utils.py
+++ b/litellm/proxy/litellm_pre_call_utils.py
@ -288,12 +288,12 @@ class LiteLLMProxyRequestSetup:

        ## KEY-LEVEL SPEND LOGS / TAGS
        if "tags" in key_metadata and key_metadata["tags"] is not None:
-            data[_metadata_variable_name]["tags"] = (
-                LiteLLMProxyRequestSetup._merge_tags(
-                    request_tags=data[_metadata_variable_name].get("tags"),
-                    tags_to_add=key_metadata["tags"],
-                )
-            )
+            if "tags" in data[_metadata_variable_name] and isinstance(
+                data[_metadata_variable_name]["tags"], list
+            ):
+                data[_metadata_variable_name]["tags"].extend(key_metadata["tags"])
+            else:
+                data[_metadata_variable_name]["tags"] = key_metadata["tags"]
        if "spend_logs_metadata" in key_metadata and isinstance(
            key_metadata["spend_logs_metadata"], dict
        ):
@ -319,30 +319,6 @@ class LiteLLMProxyRequestSetup:
            data["disable_fallbacks"] = key_metadata["disable_fallbacks"]
        return data

-    @staticmethod
-    def _merge_tags(request_tags: Optional[list], tags_to_add: Optional[list]) -> list:
-        """
-        Helper function to merge two lists of tags, ensuring no duplicates.
-
-        Args:
-            request_tags (Optional[list]): List of tags from the original request
-            tags_to_add (Optional[list]): List of tags to add
-
-        Returns:
-            list: Combined list of unique tags
-        """
-        final_tags = []
-
-        if request_tags and isinstance(request_tags, list):
-            final_tags.extend(request_tags)
-
-        if tags_to_add and isinstance(tags_to_add, list):
-            for tag in tags_to_add:
-                if tag not in final_tags:
-                    final_tags.append(tag)
-
-        return final_tags
-

 async def add_litellm_data_to_request(  # noqa: PLR0915
    data: dict,
@ -466,10 +442,12 @@ async def add_litellm_data_to_request(  # noqa: PLR0915
    ## TEAM-LEVEL SPEND LOGS/TAGS
    team_metadata = user_api_key_dict.team_metadata or {}
    if "tags" in team_metadata and team_metadata["tags"] is not None:
-        data[_metadata_variable_name]["tags"] = LiteLLMProxyRequestSetup._merge_tags(
-            request_tags=data[_metadata_variable_name].get("tags"),
-            tags_to_add=team_metadata["tags"],
-        )
+        if "tags" in data[_metadata_variable_name] and isinstance(
+            data[_metadata_variable_name]["tags"], list
+        ):
+            data[_metadata_variable_name]["tags"].extend(team_metadata["tags"])
+        else:
+            data[_metadata_variable_name]["tags"] = team_metadata["tags"]
    if "spend_logs_metadata" in team_metadata and isinstance(
        team_metadata["spend_logs_metadata"], dict
    ):
--- a/litellm/proxy/management_endpoints/internal_user_endpoints.py
+++ b/litellm/proxy/management_endpoints/internal_user_endpoints.py
@ -30,9 +30,8 @@ from litellm._logging import verbose_proxy_logger
 from litellm.proxy._types import *
 from litellm.proxy.auth.user_api_key_auth import user_api_key_auth
 from litellm.proxy.management_endpoints.key_management_endpoints import (
-    duration_in_seconds,
+    _duration_in_seconds,
    generate_key_helper_fn,
-    prepare_metadata_fields,
 )
 from litellm.proxy.management_helpers.utils import (
    add_new_member,
@ -43,7 +42,7 @@ from litellm.proxy.utils import handle_exception_on_proxy
 router = APIRouter()


-def _update_internal_new_user_params(data_json: dict, data: NewUserRequest) -> dict:
+def _update_internal_user_params(data_json: dict, data: NewUserRequest) -> dict:
    if "user_id" in data_json and data_json["user_id"] is None:
        data_json["user_id"] = str(uuid.uuid4())
    auto_create_key = data_json.pop("auto_create_key", True)
@ -146,7 +145,7 @@ async def new_user(
    from litellm.proxy.proxy_server import general_settings, proxy_logging_obj

    data_json = data.json()  # type: ignore
-    data_json = _update_internal_new_user_params(data_json, data)
+    data_json = _update_internal_user_params(data_json, data)
    response = await generate_key_helper_fn(request_type="user", **data_json)

    # Admin UI Logic
@ -439,52 +438,6 @@ async def user_info(  # noqa: PLR0915
        raise handle_exception_on_proxy(e)


-def _update_internal_user_params(data_json: dict, data: UpdateUserRequest) -> dict:
-    non_default_values = {}
-    for k, v in data_json.items():
-        if (
-            v is not None
-            and v
-            not in (
-                [],
-                {},
-                0,
-            )
-            and k not in LiteLLM_ManagementEndpoint_MetadataFields
-        ):  # models default to [], spend defaults to 0, we should not reset these values
-            non_default_values[k] = v
-
-    is_internal_user = False
-    if data.user_role == LitellmUserRoles.INTERNAL_USER:
-        is_internal_user = True
-
-    if "budget_duration" in non_default_values:
-        duration_s = duration_in_seconds(duration=non_default_values["budget_duration"])
-        user_reset_at = datetime.now(timezone.utc) + timedelta(seconds=duration_s)
-        non_default_values["budget_reset_at"] = user_reset_at
-
-    if "max_budget" not in non_default_values:
-        if (
-            is_internal_user and litellm.max_internal_user_budget is not None
-        ):  # applies internal user limits, if user role updated
-            non_default_values["max_budget"] = litellm.max_internal_user_budget
-
-    if (
-        "budget_duration" not in non_default_values
-    ):  # applies internal user limits, if user role updated
-        if is_internal_user and litellm.internal_user_budget_duration is not None:
-            non_default_values["budget_duration"] = (
-                litellm.internal_user_budget_duration
-            )
-            duration_s = duration_in_seconds(
-                duration=non_default_values["budget_duration"]
-            )
-            user_reset_at = datetime.now(timezone.utc) + timedelta(seconds=duration_s)
-            non_default_values["budget_reset_at"] = user_reset_at
-
-    return non_default_values
-
-
@router.post(
    "/user/update",
    tags=["Internal User management"],
@ -506,8 +459,7 @@ async def user_update(
        "user_id": "test-litellm-user-4",
        "user_role": "proxy_admin_viewer"
    }'
-    ```
-    
+
    Parameters:
        - user_id: Optional[str] - Specify a user id. If not set, a unique id will be generated.
        - user_email: Optional[str] - Specify a user email.
@ -539,7 +491,7 @@ async def user_update(
        - duration: Optional[str] - [NOT IMPLEMENTED].
        - key_alias: Optional[str] - [NOT IMPLEMENTED].
            
-    
+    ```
    """
    from litellm.proxy.proxy_server import prisma_client

@ -550,21 +502,46 @@ async def user_update(
            raise Exception("Not connected to DB!")

        # get non default values for key
-        non_default_values = _update_internal_user_params(
-            data_json=data_json, data=data
-        )
+        non_default_values = {}
+        for k, v in data_json.items():
+            if v is not None and v not in (
+                [],
+                {},
+                0,
+            ):  # models default to [], spend defaults to 0, we should not reset these values
+                non_default_values[k] = v

-        existing_user_row = await prisma_client.get_data(
-            user_id=data.user_id, table_name="user", query_type="find_unique"
-        )
+        is_internal_user = False
+        if data.user_role == LitellmUserRoles.INTERNAL_USER:
+            is_internal_user = True

-        existing_metadata = existing_user_row.metadata if existing_user_row else {}
+        if "budget_duration" in non_default_values:
+            duration_s = _duration_in_seconds(
+                duration=non_default_values["budget_duration"]
+            )
+            user_reset_at = datetime.now(timezone.utc) + timedelta(seconds=duration_s)
+            non_default_values["budget_reset_at"] = user_reset_at

-        non_default_values = prepare_metadata_fields(
-            data=data,
-            non_default_values=non_default_values,
-            existing_metadata=existing_metadata or {},
-        )
+        if "max_budget" not in non_default_values:
+            if (
+                is_internal_user and litellm.max_internal_user_budget is not None
+            ):  # applies internal user limits, if user role updated
+                non_default_values["max_budget"] = litellm.max_internal_user_budget
+
+        if (
+            "budget_duration" not in non_default_values
+        ):  # applies internal user limits, if user role updated
+            if is_internal_user and litellm.internal_user_budget_duration is not None:
+                non_default_values["budget_duration"] = (
+                    litellm.internal_user_budget_duration
+                )
+                duration_s = _duration_in_seconds(
+                    duration=non_default_values["budget_duration"]
+                )
+                user_reset_at = datetime.now(timezone.utc) + timedelta(
+                    seconds=duration_s
+                )
+                non_default_values["budget_reset_at"] = user_reset_at

        ## ADD USER, IF NEW ##
        verbose_proxy_logger.debug("/user/update: Received data = %s", data)
@ -748,8 +725,8 @@ async def delete_user(
    - user_ids: List[str] - The list of user id's to be deleted.
    """
    from litellm.proxy.proxy_server import (
+        _duration_in_seconds,
        create_audit_log_for_update,
-        duration_in_seconds,
        litellm_proxy_admin_name,
        prisma_client,
        user_api_key_cache,
--- a/litellm/proxy/management_endpoints/key_management_endpoints.py
+++ b/litellm/proxy/management_endpoints/key_management_endpoints.py
@ -17,7 +17,7 @@ import secrets
 import traceback
 import uuid
 from datetime import datetime, timedelta, timezone
-from typing import List, Optional, Tuple, cast
+from typing import List, Optional, Tuple

 import fastapi
 from fastapi import APIRouter, Depends, Header, HTTPException, Query, Request, status
@ -29,182 +29,16 @@ from litellm.proxy.auth.auth_checks import (
    _cache_key_object,
    _delete_cache_key_object,
    get_key_object,
-    get_team_object,
 )
 from litellm.proxy.auth.user_api_key_auth import user_api_key_auth
 from litellm.proxy.hooks.key_management_event_hooks import KeyManagementEventHooks
 from litellm.proxy.management_helpers.utils import management_endpoint_wrapper
 from litellm.proxy.utils import (
+    _duration_in_seconds,
    _hash_token_if_needed,
-    duration_in_seconds,
    handle_exception_on_proxy,
 )
 from litellm.secret_managers.main import get_secret
-from litellm.types.utils import PersonalUIKeyGenerationConfig, TeamUIKeyGenerationConfig
-
-
-def _is_team_key(data: GenerateKeyRequest):
-    return data.team_id is not None
-
-
-def _get_user_in_team(
-    team_table: LiteLLM_TeamTableCachedObj, user_id: Optional[str]
-) -> Optional[Member]:
-    if user_id is None:
-        return None
-    for member in team_table.members_with_roles:
-        if member.user_id is not None and member.user_id == user_id:
-            return member
-    return None
-
-
-def _team_key_generation_team_member_check(
-    team_table: LiteLLM_TeamTableCachedObj,
-    user_api_key_dict: UserAPIKeyAuth,
-    team_key_generation: Optional[TeamUIKeyGenerationConfig],
-):
-    if (
-        team_key_generation is None
-        or "allowed_team_member_roles" not in team_key_generation
-    ):
-        return True
-
-    user_in_team = _get_user_in_team(
-        team_table=team_table, user_id=user_api_key_dict.user_id
-    )
-    if user_in_team is None:
-        raise HTTPException(
-            status_code=400,
-            detail=f"User={user_api_key_dict.user_id} not assigned to team={team_table.team_id}",
-        )
-
-    if user_in_team.role not in team_key_generation["allowed_team_member_roles"]:
-        raise HTTPException(
-            status_code=400,
-            detail=f"Team member role {user_in_team.role} not in allowed_team_member_roles={team_key_generation['allowed_team_member_roles']}",
-        )
-    return True
-
-
-def _key_generation_required_param_check(
-    data: GenerateKeyRequest, required_params: Optional[List[str]]
-):
-    if required_params is None:
-        return True
-
-    data_dict = data.model_dump(exclude_unset=True)
-    for param in required_params:
-        if param not in data_dict:
-            raise HTTPException(
-                status_code=400,
-                detail=f"Required param {param} not in data",
-            )
-    return True
-
-
-def _team_key_generation_check(
-    team_table: LiteLLM_TeamTableCachedObj,
-    user_api_key_dict: UserAPIKeyAuth,
-    data: GenerateKeyRequest,
-):
-    if (
-        litellm.key_generation_settings is None
-        or litellm.key_generation_settings.get("team_key_generation") is None
-    ):
-        return True
-
-    _team_key_generation = litellm.key_generation_settings["team_key_generation"]  # type: ignore
-
-    _team_key_generation_team_member_check(
-        team_table=team_table,
-        user_api_key_dict=user_api_key_dict,
-        team_key_generation=_team_key_generation,
-    )
-    _key_generation_required_param_check(
-        data,
-        _team_key_generation.get("required_params"),
-    )
-
-    return True
-
-
-def _personal_key_membership_check(
-    user_api_key_dict: UserAPIKeyAuth,
-    personal_key_generation: Optional[PersonalUIKeyGenerationConfig],
-):
-    if (
-        personal_key_generation is None
-        or "allowed_user_roles" not in personal_key_generation
-    ):
-        return True
-
-    if user_api_key_dict.user_role not in personal_key_generation["allowed_user_roles"]:
-        raise HTTPException(
-            status_code=400,
-            detail=f"Personal key creation has been restricted by admin. Allowed roles={litellm.key_generation_settings['personal_key_generation']['allowed_user_roles']}. Your role={user_api_key_dict.user_role}",  # type: ignore
-        )
-
-    return True
-
-
-def _personal_key_generation_check(
-    user_api_key_dict: UserAPIKeyAuth, data: GenerateKeyRequest
-):
-
-    if (
-        litellm.key_generation_settings is None
-        or litellm.key_generation_settings.get("personal_key_generation") is None
-    ):
-        return True
-
-    _personal_key_generation = litellm.key_generation_settings["personal_key_generation"]  # type: ignore
-
-    _personal_key_membership_check(
-        user_api_key_dict,
-        personal_key_generation=_personal_key_generation,
-    )
-
-    _key_generation_required_param_check(
-        data,
-        _personal_key_generation.get("required_params"),
-    )
-
-    return True
-
-
-def key_generation_check(
-    team_table: Optional[LiteLLM_TeamTableCachedObj],
-    user_api_key_dict: UserAPIKeyAuth,
-    data: GenerateKeyRequest,
-) -> bool:
-    """
-    Check if admin has restricted key creation to certain roles for teams or individuals
-    """
-    if (
-        litellm.key_generation_settings is None
-        or user_api_key_dict.user_role == LitellmUserRoles.PROXY_ADMIN.value
-    ):
-        return True
-
-    ## check if key is for team or individual
-    is_team_key = _is_team_key(data=data)
-
-    if is_team_key:
-        if team_table is None:
-            raise HTTPException(
-                status_code=400,
-                detail=f"Unable to find team object in database. Team ID: {data.team_id}",
-            )
-        return _team_key_generation_check(
-            team_table=team_table,
-            user_api_key_dict=user_api_key_dict,
-            data=data,
-        )
-    else:
-        return _personal_key_generation_check(
-            user_api_key_dict=user_api_key_dict, data=data
-        )
-

 router = APIRouter()

@ -281,7 +115,6 @@ async def generate_key_fn(  # noqa: PLR0915
            litellm_proxy_admin_name,
            prisma_client,
            proxy_logging_obj,
-            user_api_key_cache,
            user_custom_key_generate,
        )

@ -298,21 +131,6 @@ async def generate_key_fn(  # noqa: PLR0915
                raise HTTPException(
                    status_code=status.HTTP_403_FORBIDDEN, detail=message
                )
-        elif litellm.key_generation_settings is not None:
-            if data.team_id is None:
-                team_table: Optional[LiteLLM_TeamTableCachedObj] = None
-            else:
-                team_table = await get_team_object(
-                    team_id=data.team_id,
-                    prisma_client=prisma_client,
-                    user_api_key_cache=user_api_key_cache,
-                    parent_otel_span=user_api_key_dict.parent_otel_span,
-                )
-            key_generation_check(
-                team_table=team_table,
-                user_api_key_dict=user_api_key_dict,
-                data=data,
-            )
        # check if user set default key/generate params on config.yaml
        if litellm.default_key_generate_params is not None:
            for elem in data:
@ -362,10 +180,10 @@ async def generate_key_fn(  # noqa: PLR0915
                                )
                        # Compare durations
                        elif key in ["budget_duration", "duration"]:
-                            upperbound_duration = duration_in_seconds(
+                            upperbound_duration = _duration_in_seconds(
                                duration=upperbound_value
                            )
-                            user_duration = duration_in_seconds(duration=value)
+                            user_duration = _duration_in_seconds(duration=value)
                            if user_duration > upperbound_duration:
                                raise HTTPException(
                                    status_code=400,
@ -394,8 +212,7 @@ async def generate_key_fn(  # noqa: PLR0915
                }
            )
            _budget_id = getattr(_budget, "budget_id", None)
-        data_json = data.model_dump(exclude_unset=True, exclude_none=True)  # type: ignore
-
+        data_json = data.json()  # type: ignore
        # if we get max_budget passed to /key/generate, then use it as key_max_budget. Since generate_key_helper_fn is used to make new users
        if "max_budget" in data_json:
            data_json["key_max_budget"] = data_json.pop("max_budget", None)
@ -421,11 +238,6 @@ async def generate_key_fn(  # noqa: PLR0915

            data_json.pop("tags")

-        await _enforce_unique_key_alias(
-            key_alias=data_json.get("key_alias", None),
-            prisma_client=prisma_client,
-        )
-
        response = await generate_key_helper_fn(
            request_type="key", **data_json, table_name="key"
        )
@ -453,52 +265,12 @@ async def generate_key_fn(  # noqa: PLR0915
        raise handle_exception_on_proxy(e)


-def prepare_metadata_fields(
-    data: BaseModel, non_default_values: dict, existing_metadata: dict
-) -> dict:
-    """
-    Check LiteLLM_ManagementEndpoint_MetadataFields (proxy/_types.py) for fields that are allowed to be updated
-    """
-
-    if "metadata" not in non_default_values:  # allow user to set metadata to none
-        non_default_values["metadata"] = existing_metadata.copy()
-
-    casted_metadata = cast(dict, non_default_values["metadata"])
-
-    data_json = data.model_dump(exclude_unset=True, exclude_none=True)
-
-    try:
-        for k, v in data_json.items():
-            if k == "model_tpm_limit" or k == "model_rpm_limit":
-                if k not in casted_metadata or casted_metadata[k] is None:
-                    casted_metadata[k] = {}
-                casted_metadata[k].update(v)
-
-            if k == "tags" or k == "guardrails":
-                if k not in casted_metadata or casted_metadata[k] is None:
-                    casted_metadata[k] = []
-                seen = set(casted_metadata[k])
-                casted_metadata[k].extend(
-                    x for x in v if x not in seen and not seen.add(x)  # type: ignore
-                )  # prevent duplicates from being added + maintain initial order
-
-    except Exception as e:
-        verbose_proxy_logger.exception(
-            "litellm.proxy.proxy_server.prepare_metadata_fields(): Exception occured - {}".format(
-                str(e)
-            )
-        )
-
-    non_default_values["metadata"] = casted_metadata
-    return non_default_values
-
-
 def prepare_key_update_data(
    data: Union[UpdateKeyRequest, RegenerateKeyRequest], existing_key_row
 ):
    data_json: dict = data.model_dump(exclude_unset=True)
    data_json.pop("key", None)
-    _metadata_fields = ["model_rpm_limit", "model_tpm_limit", "guardrails", "tags"]
+    _metadata_fields = ["model_rpm_limit", "model_tpm_limit", "guardrails"]
    non_default_values = {}
    for k, v in data_json.items():
        if k in _metadata_fields:
@ -508,7 +280,7 @@ def prepare_key_update_data(
    if "duration" in non_default_values:
        duration = non_default_values.pop("duration")
        if duration and (isinstance(duration, str)) and len(duration) > 0:
-            duration_s = duration_in_seconds(duration=duration)
+            duration_s = _duration_in_seconds(duration=duration)
            expires = datetime.now(timezone.utc) + timedelta(seconds=duration_s)
            non_default_values["expires"] = expires

@ -519,16 +291,27 @@ def prepare_key_update_data(
            and (isinstance(budget_duration, str))
            and len(budget_duration) > 0
        ):
-            duration_s = duration_in_seconds(duration=budget_duration)
+            duration_s = _duration_in_seconds(duration=budget_duration)
            key_reset_at = datetime.now(timezone.utc) + timedelta(seconds=duration_s)
            non_default_values["budget_reset_at"] = key_reset_at
-            non_default_values["budget_duration"] = budget_duration

    _metadata = existing_key_row.metadata or {}

-    non_default_values = prepare_metadata_fields(
-        data=data, non_default_values=non_default_values, existing_metadata=_metadata
-    )
+    if data.model_tpm_limit:
+        if "model_tpm_limit" not in _metadata:
+            _metadata["model_tpm_limit"] = {}
+        _metadata["model_tpm_limit"].update(data.model_tpm_limit)
+        non_default_values["metadata"] = _metadata
+
+    if data.model_rpm_limit:
+        if "model_rpm_limit" not in _metadata:
+            _metadata["model_rpm_limit"] = {}
+        _metadata["model_rpm_limit"].update(data.model_rpm_limit)
+        non_default_values["metadata"] = _metadata
+
+    if data.guardrails:
+        _metadata["guardrails"] = data.guardrails
+        non_default_values["metadata"] = _metadata

    return non_default_values

@ -620,12 +403,6 @@ async def update_key_fn(
            data=data, existing_key_row=existing_key_row
        )

-        await _enforce_unique_key_alias(
-            key_alias=non_default_values.get("key_alias", None),
-            prisma_client=prisma_client,
-            existing_key_token=existing_key_row.token,
-        )
-
        response = await prisma_client.update_data(
            token=key, data={**non_default_values, "token": key}
        )
@ -953,11 +730,11 @@ async def generate_key_helper_fn(  # noqa: PLR0915
    request_type: Literal[
        "user", "key"
    ],  # identifies if this request is from /user/new or /key/generate
-    duration: Optional[str] = None,
-    models: list = [],
-    aliases: dict = {},
-    config: dict = {},
-    spend: float = 0.0,
+    duration: Optional[str],
+    models: list,
+    aliases: dict,
+    config: dict,
+    spend: float,
    key_max_budget: Optional[float] = None,  # key_max_budget is used to Budget Per key
    key_budget_duration: Optional[str] = None,
    budget_id: Optional[float] = None,  # budget id <-> LiteLLM_BudgetTable
@ -986,8 +763,8 @@ async def generate_key_helper_fn(  # noqa: PLR0915
    allowed_cache_controls: Optional[list] = [],
    permissions: Optional[dict] = {},
    model_max_budget: Optional[dict] = {},
-    model_rpm_limit: Optional[dict] = None,
-    model_tpm_limit: Optional[dict] = None,
+    model_rpm_limit: Optional[dict] = {},
+    model_tpm_limit: Optional[dict] = {},
    guardrails: Optional[list] = None,
    teams: Optional[list] = None,
    organization_id: Optional[str] = None,
@ -1014,19 +791,19 @@ async def generate_key_helper_fn(  # noqa: PLR0915
    if duration is None:  # allow tokens that never expire
        expires = None
    else:
-        duration_s = duration_in_seconds(duration=duration)
+        duration_s = _duration_in_seconds(duration=duration)
        expires = datetime.now(timezone.utc) + timedelta(seconds=duration_s)

    if key_budget_duration is None:  # one-time budget
        key_reset_at = None
    else:
-        duration_s = duration_in_seconds(duration=key_budget_duration)
+        duration_s = _duration_in_seconds(duration=key_budget_duration)
        key_reset_at = datetime.now(timezone.utc) + timedelta(seconds=duration_s)

    if budget_duration is None:  # one-time budget
        reset_at = None
    else:
-        duration_s = duration_in_seconds(duration=budget_duration)
+        duration_s = _duration_in_seconds(duration=budget_duration)
        reset_at = datetime.now(timezone.utc) + timedelta(seconds=duration_s)

    aliases_json = json.dumps(aliases)
@ -1924,38 +1701,3 @@ async def test_key_logging(
            status="healthy",
            details=f"No logger exceptions triggered, system is healthy. Manually check if logs were sent to {logging_callbacks} ",
        )
-
-
-async def _enforce_unique_key_alias(
-    key_alias: Optional[str],
-    prisma_client: Any,
-    existing_key_token: Optional[str] = None,
-) -> None:
-    """
-    Helper to enforce unique key aliases across all keys.
-
-    Args:
-        key_alias (Optional[str]): The key alias to check
-        prisma_client (Any): Prisma client instance
-        existing_key_token (Optional[str]): ID of existing key being updated, to exclude from uniqueness check
-            (The Admin UI passes key_alias, in all Edit key requests. So we need to be sure that if we find a key with the same alias, it's not the same key we're updating)
-
-    Raises:
-        ProxyException: If key alias already exists on a different key
-    """
-    if key_alias is not None and prisma_client is not None:
-        where_clause: dict[str, Any] = {"key_alias": key_alias}
-        if existing_key_token:
-            # Exclude the current key from the uniqueness check
-            where_clause["NOT"] = {"token": existing_key_token}
-
-        existing_key = await prisma_client.db.litellm_verificationtoken.find_first(
-            where=where_clause
-        )
-        if existing_key is not None:
-            raise ProxyException(
-                message=f"Key with alias '{key_alias}' already exists. Unique key aliases across all keys are required.",
-                type=ProxyErrorTypes.bad_request_error,
-                param="key_alias",
-                code=status.HTTP_400_BAD_REQUEST,
-            )
--- a/litellm/proxy/management_endpoints/organization_endpoints.py
+++ b/litellm/proxy/management_endpoints/organization_endpoints.py
@ -352,7 +352,7 @@ async def organization_member_add(
                },
            )

-        members: List[OrgMember]
+        members: List[Member]
        if isinstance(data.member, List):
            members = data.member
        else:
@ -397,7 +397,7 @@ async def organization_member_add(


 async def add_member_to_organization(
-    member: OrgMember,
+    member: Member,
    organization_id: str,
    prisma_client: PrismaClient,
 ) -> Tuple[LiteLLM_UserTable, LiteLLM_OrganizationMembershipTable]:
--- a/litellm/proxy/management_endpoints/team_callback_endpoints.py
+++ b/litellm/proxy/management_endpoints/team_callback_endpoints.py
@ -90,8 +90,8 @@ async def add_team_callbacks(
    """
    try:
        from litellm.proxy.proxy_server import (
+            _duration_in_seconds,
            create_audit_log_for_update,
-            duration_in_seconds,
            litellm_proxy_admin_name,
            prisma_client,
        )
--- a/litellm/proxy/management_endpoints/team_endpoints.py
+++ b/litellm/proxy/management_endpoints/team_endpoints.py
@ -169,8 +169,8 @@ async def new_team(  # noqa: PLR0915
    ```
    """
    from litellm.proxy.proxy_server import (
+        _duration_in_seconds,
        create_audit_log_for_update,
-        duration_in_seconds,
        litellm_proxy_admin_name,
        prisma_client,
    )
@ -289,7 +289,7 @@ async def new_team(  # noqa: PLR0915

    # If budget_duration is set, set `budget_reset_at`
    if complete_team_data.budget_duration is not None:
-        duration_s = duration_in_seconds(duration=complete_team_data.budget_duration)
+        duration_s = _duration_in_seconds(duration=complete_team_data.budget_duration)
        reset_at = datetime.now(timezone.utc) + timedelta(seconds=duration_s)
        complete_team_data.budget_reset_at = reset_at

@ -396,8 +396,8 @@ async def update_team(
    """
    from litellm.proxy.auth.auth_checks import _cache_team_object
    from litellm.proxy.proxy_server import (
+        _duration_in_seconds,
        create_audit_log_for_update,
-        duration_in_seconds,
        litellm_proxy_admin_name,
        prisma_client,
        proxy_logging_obj,
@ -425,7 +425,7 @@ async def update_team(

    # Check budget_duration and budget_reset_at
    if data.budget_duration is not None:
-        duration_s = duration_in_seconds(duration=data.budget_duration)
+        duration_s = _duration_in_seconds(duration=data.budget_duration)
        reset_at = datetime.now(timezone.utc) + timedelta(seconds=duration_s)

        # set the budget_reset_at in DB
@ -547,7 +547,6 @@ async def team_member_add(
        parent_otel_span=None,
        proxy_logging_obj=proxy_logging_obj,
        check_cache_only=False,
-        check_db_only=True,
    )
    if existing_team_row is None:
        raise HTTPException(
@ -710,8 +709,8 @@ async def team_member_delete(
    ```
    """
    from litellm.proxy.proxy_server import (
+        _duration_in_seconds,
        create_audit_log_for_update,
-        duration_in_seconds,
        litellm_proxy_admin_name,
        prisma_client,
    )
@ -830,8 +829,8 @@ async def team_member_update(
    Update team member budgets
    """
    from litellm.proxy.proxy_server import (
+        _duration_in_seconds,
        create_audit_log_for_update,
-        duration_in_seconds,
        litellm_proxy_admin_name,
        prisma_client,
    )
@ -966,8 +965,8 @@ async def delete_team(
    ```
    """
    from litellm.proxy.proxy_server import (
+        _duration_in_seconds,
        create_audit_log_for_update,
-        duration_in_seconds,
        litellm_proxy_admin_name,
        prisma_client,
    )
@ -1055,8 +1054,8 @@ async def team_info(
    ```
    """
    from litellm.proxy.proxy_server import (
+        _duration_in_seconds,
        create_audit_log_for_update,
-        duration_in_seconds,
        litellm_proxy_admin_name,
        prisma_client,
    )
@ -1204,8 +1203,8 @@ async def block_team(

    """
    from litellm.proxy.proxy_server import (
+        _duration_in_seconds,
        create_audit_log_for_update,
-        duration_in_seconds,
        litellm_proxy_admin_name,
        prisma_client,
    )
@ -1252,8 +1251,8 @@ async def unblock_team(
    ```
    """
    from litellm.proxy.proxy_server import (
+        _duration_in_seconds,
        create_audit_log_for_update,
-        duration_in_seconds,
        litellm_proxy_admin_name,
        prisma_client,
    )
@ -1295,8 +1294,8 @@ async def list_team(
    - user_id: str - Optional. If passed will only return teams that the user_id is a member of.
    """
    from litellm.proxy.proxy_server import (
+        _duration_in_seconds,
        create_audit_log_for_update,
-        duration_in_seconds,
        litellm_proxy_admin_name,
        prisma_client,
    )
@ -1367,7 +1366,6 @@ async def list_team(
            """.format(
                team.team_id, team.model_dump(), str(e)
            )
-            verbose_proxy_logger.exception(team_exception)
-            continue
+            raise HTTPException(status_code=400, detail={"error": team_exception})

    return returned_responses
--- a/litellm/proxy/model_config.yaml
+++ b/litellm/proxy/model_config.yaml
@ -1,10 +0,0 @@
-model_list:
-  - model_name: gpt-4o
-    litellm_params:
-      model: openai/gpt-4o
-      api_base: https://exampleopenaiendpoint-production.up.railway.app/
-  - model_name: fake-anthropic-endpoint
-    litellm_params:
-      model: anthropic/fake
-      api_base: https://exampleanthropicendpoint-production.up.railway.app/
-
--- a/litellm/proxy/pass_through_endpoints/llm_passthrough_endpoints.py
+++ b/litellm/proxy/pass_through_endpoints/llm_passthrough_endpoints.py
@ -54,26 +54,17 @@ def create_request_copy(request: Request):
    }


-@router.api_route(
-    "/gemini/{endpoint:path}",
-    methods=["GET", "POST", "PUT", "DELETE", "PATCH"],
-    tags=["Google AI Studio Pass-through", "pass-through"],
-)
+@router.api_route("/gemini/{endpoint:path}", methods=["GET", "POST", "PUT", "DELETE"])
 async def gemini_proxy_route(
    endpoint: str,
    request: Request,
    fastapi_response: Response,
 ):
-    """
-    [Docs](https://docs.litellm.ai/docs/pass_through/google_ai_studio)
-    """
    ## CHECK FOR LITELLM API KEY IN THE QUERY PARAMS - ?..key=LITELLM_API_KEY
-    google_ai_studio_api_key = request.query_params.get("key") or request.headers.get(
-        "x-goog-api-key"
-    )
+    api_key = request.query_params.get("key")

    user_api_key_dict = await user_api_key_auth(
-        request=request, api_key=f"Bearer {google_ai_studio_api_key}"
+        request=request, api_key="Bearer {}".format(api_key)
    )

    base_target_url = "https://generativelanguage.googleapis.com"
@ -120,20 +111,13 @@ async def gemini_proxy_route(
    return received_value


-@router.api_route(
-    "/cohere/{endpoint:path}",
-    methods=["GET", "POST", "PUT", "DELETE", "PATCH"],
-    tags=["Cohere Pass-through", "pass-through"],
-)
+@router.api_route("/cohere/{endpoint:path}", methods=["GET", "POST", "PUT", "DELETE"])
 async def cohere_proxy_route(
    endpoint: str,
    request: Request,
    fastapi_response: Response,
    user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
 ):
-    """
-    [Docs](https://docs.litellm.ai/docs/pass_through/cohere)
-    """
    base_target_url = "https://api.cohere.com"
    encoded_endpoint = httpx.URL(endpoint).path

@ -170,9 +154,7 @@ async def cohere_proxy_route(


@router.api_route(
-    "/anthropic/{endpoint:path}",
-    methods=["GET", "POST", "PUT", "DELETE", "PATCH"],
-    tags=["Anthropic Pass-through", "pass-through"],
+    "/anthropic/{endpoint:path}", methods=["GET", "POST", "PUT", "DELETE"]
 )
 async def anthropic_proxy_route(
    endpoint: str,
@ -180,9 +162,6 @@ async def anthropic_proxy_route(
    fastapi_response: Response,
    user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
 ):
-    """
-    [Docs](https://docs.litellm.ai/docs/anthropic_completion)
-    """
    base_target_url = "https://api.anthropic.com"
    encoded_endpoint = httpx.URL(endpoint).path

@ -222,20 +201,13 @@ async def anthropic_proxy_route(
    return received_value


-@router.api_route(
-    "/bedrock/{endpoint:path}",
-    methods=["GET", "POST", "PUT", "DELETE", "PATCH"],
-    tags=["Bedrock Pass-through", "pass-through"],
-)
+@router.api_route("/bedrock/{endpoint:path}", methods=["GET", "POST", "PUT", "DELETE"])
 async def bedrock_proxy_route(
    endpoint: str,
    request: Request,
    fastapi_response: Response,
    user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
 ):
-    """
-    [Docs](https://docs.litellm.ai/docs/pass_through/bedrock)
-    """
    create_request_copy(request)

    try:
@ -303,22 +275,13 @@ async def bedrock_proxy_route(
    return received_value


-@router.api_route(
-    "/azure/{endpoint:path}",
-    methods=["GET", "POST", "PUT", "DELETE", "PATCH"],
-    tags=["Azure Pass-through", "pass-through"],
-)
+@router.api_route("/azure/{endpoint:path}", methods=["GET", "POST", "PUT", "DELETE"])
 async def azure_proxy_route(
    endpoint: str,
    request: Request,
    fastapi_response: Response,
    user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
 ):
-    """
-    Call any azure endpoint using the proxy.
-
-    Just use `{PROXY_BASE_URL}/azure/{endpoint:path}`
-    """
    base_target_url = get_secret_str(secret_name="AZURE_API_BASE")
    if base_target_url is None:
        raise Exception(
--- a/litellm/proxy/pass_through_endpoints/llm_provider_handlers/anthropic_passthrough_logging_handler.py
+++ b/litellm/proxy/pass_through_endpoints/llm_provider_handlers/anthropic_passthrough_logging_handler.py
@ -14,7 +14,6 @@ from litellm.llms.anthropic.chat.handler import (
    ModelResponseIterator as AnthropicModelResponseIterator,
 )
 from litellm.llms.anthropic.chat.transformation import AnthropicConfig
-from litellm.proxy._types import PassThroughEndpointLoggingTypedDict

 if TYPE_CHECKING:
    from ..success_handler import PassThroughEndpointLogging
@ -27,7 +26,7 @@ else:
 class AnthropicPassthroughLoggingHandler:

    @staticmethod
-    def anthropic_passthrough_handler(
+    async def anthropic_passthrough_handler(
        httpx_response: httpx.Response,
        response_body: dict,
        logging_obj: LiteLLMLoggingObj,
@ -37,7 +36,7 @@ class AnthropicPassthroughLoggingHandler:
        end_time: datetime,
        cache_hit: bool,
        **kwargs,
-    ) -> PassThroughEndpointLoggingTypedDict:
+    ):
        """
        Transforms Anthropic response to OpenAI response, generates a standard logging object so downstream logging can be handled
        """
@ -68,10 +67,15 @@ class AnthropicPassthroughLoggingHandler:
            logging_obj=logging_obj,
        )

-        return {
-            "result": litellm_model_response,
-            "kwargs": kwargs,
-        }
+        await logging_obj.async_success_handler(
+            result=litellm_model_response,
+            start_time=start_time,
+            end_time=end_time,
+            cache_hit=cache_hit,
+            **kwargs,
+        )
+
+        pass

    @staticmethod
    def _create_anthropic_response_logging_payload(
@ -119,7 +123,7 @@ class AnthropicPassthroughLoggingHandler:
        return kwargs

    @staticmethod
-    def _handle_logging_anthropic_collected_chunks(
+    async def _handle_logging_anthropic_collected_chunks(
        litellm_logging_obj: LiteLLMLoggingObj,
        passthrough_success_handler_obj: PassThroughEndpointLogging,
        url_route: str,
@ -128,7 +132,7 @@ class AnthropicPassthroughLoggingHandler:
        start_time: datetime,
        all_chunks: List[str],
        end_time: datetime,
-    ) -> PassThroughEndpointLoggingTypedDict:
+    ):
        """
        Takes raw chunks from Anthropic passthrough endpoint and logs them in litellm callbacks

@ -148,10 +152,7 @@ class AnthropicPassthroughLoggingHandler:
            verbose_proxy_logger.error(
                "Unable to build complete streaming response for Anthropic passthrough endpoint, not logging..."
            )
-            return {
-                "result": None,
-                "kwargs": {},
-            }
+            return
        kwargs = AnthropicPassthroughLoggingHandler._create_anthropic_response_logging_payload(
            litellm_model_response=complete_streaming_response,
            model=model,
@ -160,11 +161,13 @@ class AnthropicPassthroughLoggingHandler:
            end_time=end_time,
            logging_obj=litellm_logging_obj,
        )
-
-        return {
-            "result": complete_streaming_response,
-            "kwargs": kwargs,
-        }
+        await litellm_logging_obj.async_success_handler(
+            result=complete_streaming_response,
+            start_time=start_time,
+            end_time=end_time,
+            cache_hit=False,
+            **kwargs,
+        )

    @staticmethod
    def _build_complete_streaming_response(
--- a/litellm/proxy/pass_through_endpoints/llm_provider_handlers/vertex_passthrough_logging_handler.py
+++ b/litellm/proxy/pass_through_endpoints/llm_provider_handlers/vertex_passthrough_logging_handler.py
@ -14,7 +14,6 @@ from litellm.litellm_core_utils.litellm_logging import (
 from litellm.llms.vertex_ai_and_google_ai_studio.gemini.vertex_and_google_ai_studio_gemini import (
    ModelResponseIterator as VertexModelResponseIterator,
 )
-from litellm.proxy._types import PassThroughEndpointLoggingTypedDict

 if TYPE_CHECKING:
    from ..success_handler import PassThroughEndpointLogging
@ -26,7 +25,7 @@ else:

 class VertexPassthroughLoggingHandler:
    @staticmethod
-    def vertex_passthrough_handler(
+    async def vertex_passthrough_handler(
        httpx_response: httpx.Response,
        logging_obj: LiteLLMLoggingObj,
        url_route: str,
@ -35,7 +34,7 @@ class VertexPassthroughLoggingHandler:
        end_time: datetime,
        cache_hit: bool,
        **kwargs,
-    ) -> PassThroughEndpointLoggingTypedDict:
+    ):
        if "generateContent" in url_route:
            model = VertexPassthroughLoggingHandler.extract_model_from_url(url_route)

@ -66,11 +65,13 @@ class VertexPassthroughLoggingHandler:
                logging_obj=logging_obj,
            )

-            return {
-                "result": litellm_model_response,
-                "kwargs": kwargs,
-            }
-
+            await logging_obj.async_success_handler(
+                result=litellm_model_response,
+                start_time=start_time,
+                end_time=end_time,
+                cache_hit=cache_hit,
+                **kwargs,
+            )
        elif "predict" in url_route:
            from litellm.llms.vertex_ai_and_google_ai_studio.image_generation.image_generation_handler import (
                VertexImageGeneration,
@ -111,18 +112,16 @@ class VertexPassthroughLoggingHandler:
            logging_obj.model = model
            logging_obj.model_call_details["model"] = logging_obj.model

-            return {
-                "result": litellm_prediction_response,
-                "kwargs": kwargs,
-            }
-        else:
-            return {
-                "result": None,
-                "kwargs": kwargs,
-            }
+            await logging_obj.async_success_handler(
+                result=litellm_prediction_response,
+                start_time=start_time,
+                end_time=end_time,
+                cache_hit=cache_hit,
+                **kwargs,
+            )

    @staticmethod
-    def _handle_logging_vertex_collected_chunks(
+    async def _handle_logging_vertex_collected_chunks(
        litellm_logging_obj: LiteLLMLoggingObj,
        passthrough_success_handler_obj: PassThroughEndpointLogging,
        url_route: str,
@ -131,7 +130,7 @@ class VertexPassthroughLoggingHandler:
        start_time: datetime,
        all_chunks: List[str],
        end_time: datetime,
-    ) -> PassThroughEndpointLoggingTypedDict:
+    ):
        """
        Takes raw chunks from Vertex passthrough endpoint and logs them in litellm callbacks

@ -153,11 +152,7 @@ class VertexPassthroughLoggingHandler:
            verbose_proxy_logger.error(
                "Unable to build complete streaming response for Vertex passthrough endpoint, not logging..."
            )
-            return {
-                "result": None,
-                "kwargs": kwargs,
-            }
-
+            return
        kwargs = VertexPassthroughLoggingHandler._create_vertex_response_logging_payload_for_generate_content(
            litellm_model_response=complete_streaming_response,
            model=model,
@ -166,11 +161,13 @@ class VertexPassthroughLoggingHandler:
            end_time=end_time,
            logging_obj=litellm_logging_obj,
        )
-
-        return {
-            "result": complete_streaming_response,
-            "kwargs": kwargs,
-        }
+        await litellm_logging_obj.async_success_handler(
+            result=complete_streaming_response,
+            start_time=start_time,
+            end_time=end_time,
+            cache_hit=False,
+            **kwargs,
+        )

    @staticmethod
    def _build_complete_streaming_response(
--- a/litellm/proxy/pass_through_endpoints/pass_through_endpoints.py
+++ b/litellm/proxy/pass_through_endpoints/pass_through_endpoints.py
@ -22,7 +22,6 @@ import litellm
 from litellm._logging import verbose_proxy_logger
 from litellm.integrations.custom_logger import CustomLogger
 from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
-from litellm.llms.custom_httpx.http_handler import get_async_httpx_client
 from litellm.llms.vertex_ai_and_google_ai_studio.gemini.vertex_and_google_ai_studio_gemini import (
    ModelResponseIterator,
 )
@ -36,7 +35,6 @@ from litellm.proxy._types import (
 )
 from litellm.proxy.auth.user_api_key_auth import user_api_key_auth
 from litellm.secret_managers.main import get_secret_str
-from litellm.types.llms.custom_http import httpxSpecialProvider

 from .streaming_handler import PassThroughStreamingHandler
 from .success_handler import PassThroughEndpointLogging
@ -365,11 +363,8 @@ async def pass_through_request(  # noqa: PLR0915
            data=_parsed_body,
            call_type="pass_through_endpoint",
        )
-        async_client_obj = get_async_httpx_client(
-            llm_provider=httpxSpecialProvider.PassThroughEndpoint,
-            params={"timeout": 600},
-        )
-        async_client = async_client_obj.client
+
+        async_client = httpx.AsyncClient(timeout=600)

        litellm_call_id = str(uuid.uuid4())

@ -393,7 +388,6 @@ async def pass_through_request(  # noqa: PLR0915
            _parsed_body=_parsed_body,
            passthrough_logging_payload=passthrough_logging_payload,
            litellm_call_id=litellm_call_id,
-            request=request,
        )
        # done for supporting 'parallel_request_limiter.py' with pass-through endpoints
        logging_obj.update_environment_variables(
@ -529,18 +523,16 @@ async def pass_through_request(  # noqa: PLR0915
        response_body: Optional[dict] = get_response_body(response)
        passthrough_logging_payload["response_body"] = response_body
        end_time = datetime.now()
-        asyncio.create_task(
-            pass_through_endpoint_logging.pass_through_async_success_handler(
-                httpx_response=response,
-                response_body=response_body,
-                url_route=str(url),
-                result="",
-                start_time=start_time,
-                end_time=end_time,
-                logging_obj=logging_obj,
-                cache_hit=False,
-                **kwargs,
-            )
+        await pass_through_endpoint_logging.pass_through_async_success_handler(
+            httpx_response=response,
+            response_body=response_body,
+            url_route=str(url),
+            result="",
+            start_time=start_time,
+            end_time=end_time,
+            logging_obj=logging_obj,
+            cache_hit=False,
+            **kwargs,
        )

        return Response(
@ -575,7 +567,6 @@ async def pass_through_request(  # noqa: PLR0915


 def _init_kwargs_for_pass_through_endpoint(
-    request: Request,
    user_api_key_dict: UserAPIKeyAuth,
    passthrough_logging_payload: PassthroughStandardLoggingPayload,
    _parsed_body: Optional[dict] = None,
@ -591,12 +582,6 @@ def _init_kwargs_for_pass_through_endpoint(
    }
    if _litellm_metadata:
        _metadata.update(_litellm_metadata)
-
-    _metadata = _update_metadata_with_tags_in_header(
-        request=request,
-        metadata=_metadata,
-    )
-
    kwargs = {
        "litellm_params": {
            "metadata": _metadata,
@ -608,18 +593,6 @@ def _init_kwargs_for_pass_through_endpoint(
    return kwargs


-def _update_metadata_with_tags_in_header(request: Request, metadata: dict) -> dict:
-    """
-    If tags are in the request headers, add them to the metadata
-
-    Used for google and vertex JS SDKs
-    """
-    _tags = request.headers.get("tags")
-    if _tags:
-        metadata["tags"] = _tags.split(",")
-    return metadata
-
-
 def create_pass_through_route(
    endpoint,
    target: str,
--- a/litellm/proxy/pass_through_endpoints/streaming_handler.py
+++ b/litellm/proxy/pass_through_endpoints/streaming_handler.py
@ -1,6 +1,5 @@
 import asyncio
 import json
-import threading
 from datetime import datetime
 from enum import Enum
 from typing import AsyncIterable, Dict, List, Optional, Union
@ -16,12 +15,7 @@ from litellm.llms.anthropic.chat.handler import (
 from litellm.llms.vertex_ai_and_google_ai_studio.gemini.vertex_and_google_ai_studio_gemini import (
    ModelResponseIterator as VertexAIIterator,
 )
-from litellm.proxy._types import PassThroughEndpointLoggingResultValues
-from litellm.types.utils import (
-    GenericStreamingChunk,
-    ModelResponse,
-    StandardPassThroughResponseObject,
-)
+from litellm.types.utils import GenericStreamingChunk

 from .llm_provider_handlers.anthropic_passthrough_logging_handler import (
    AnthropicPassthroughLoggingHandler,
@ -58,17 +52,15 @@ class PassThroughStreamingHandler:
            # After all chunks are processed, handle post-processing
            end_time = datetime.now()

-            asyncio.create_task(
-                PassThroughStreamingHandler._route_streaming_logging_to_handler(
-                    litellm_logging_obj=litellm_logging_obj,
-                    passthrough_success_handler_obj=passthrough_success_handler_obj,
-                    url_route=url_route,
-                    request_body=request_body or {},
-                    endpoint_type=endpoint_type,
-                    start_time=start_time,
-                    raw_bytes=raw_bytes,
-                    end_time=end_time,
-                )
+            await PassThroughStreamingHandler._route_streaming_logging_to_handler(
+                litellm_logging_obj=litellm_logging_obj,
+                passthrough_success_handler_obj=passthrough_success_handler_obj,
+                url_route=url_route,
+                request_body=request_body or {},
+                endpoint_type=endpoint_type,
+                start_time=start_time,
+                raw_bytes=raw_bytes,
+                end_time=end_time,
            )
        except Exception as e:
            verbose_proxy_logger.error(f"Error in chunk_processor: {str(e)}")
@ -95,12 +87,8 @@ class PassThroughStreamingHandler:
        all_chunks = PassThroughStreamingHandler._convert_raw_bytes_to_str_lines(
            raw_bytes
        )
-        standard_logging_response_object: Optional[
-            PassThroughEndpointLoggingResultValues
-        ] = None
-        kwargs: dict = {}
        if endpoint_type == EndpointType.ANTHROPIC:
-            anthropic_passthrough_logging_handler_result = AnthropicPassthroughLoggingHandler._handle_logging_anthropic_collected_chunks(
+            await AnthropicPassthroughLoggingHandler._handle_logging_anthropic_collected_chunks(
                litellm_logging_obj=litellm_logging_obj,
                passthrough_success_handler_obj=passthrough_success_handler_obj,
                url_route=url_route,
@ -110,48 +98,20 @@ class PassThroughStreamingHandler:
                all_chunks=all_chunks,
                end_time=end_time,
            )
-            standard_logging_response_object = (
-                anthropic_passthrough_logging_handler_result["result"]
-            )
-            kwargs = anthropic_passthrough_logging_handler_result["kwargs"]
        elif endpoint_type == EndpointType.VERTEX_AI:
-            vertex_passthrough_logging_handler_result = (
-                VertexPassthroughLoggingHandler._handle_logging_vertex_collected_chunks(
-                    litellm_logging_obj=litellm_logging_obj,
-                    passthrough_success_handler_obj=passthrough_success_handler_obj,
-                    url_route=url_route,
-                    request_body=request_body,
-                    endpoint_type=endpoint_type,
-                    start_time=start_time,
-                    all_chunks=all_chunks,
-                    end_time=end_time,
-                )
+            await VertexPassthroughLoggingHandler._handle_logging_vertex_collected_chunks(
+                litellm_logging_obj=litellm_logging_obj,
+                passthrough_success_handler_obj=passthrough_success_handler_obj,
+                url_route=url_route,
+                request_body=request_body,
+                endpoint_type=endpoint_type,
+                start_time=start_time,
+                all_chunks=all_chunks,
+                end_time=end_time,
            )
-            standard_logging_response_object = (
-                vertex_passthrough_logging_handler_result["result"]
-            )
-            kwargs = vertex_passthrough_logging_handler_result["kwargs"]
-
-        if standard_logging_response_object is None:
-            standard_logging_response_object = StandardPassThroughResponseObject(
-                response=f"cannot parse chunks to standard response object. Chunks={all_chunks}"
-            )
-        threading.Thread(
-            target=litellm_logging_obj.success_handler,
-            args=(
-                standard_logging_response_object,
-                start_time,
-                end_time,
-                False,
-            ),
-        ).start()
-        await litellm_logging_obj.async_success_handler(
-            result=standard_logging_response_object,
-            start_time=start_time,
-            end_time=end_time,
-            cache_hit=False,
-            **kwargs,
-        )
+        elif endpoint_type == EndpointType.GENERIC:
+            # No logging is supported for generic streaming endpoints
+            pass

    @staticmethod
    def _convert_raw_bytes_to_str_lines(raw_bytes: List[bytes]) -> List[str]:
--- a/litellm/proxy/pass_through_endpoints/success_handler.py
+++ b/litellm/proxy/pass_through_endpoints/success_handler.py
@ -15,10 +15,8 @@ from litellm.litellm_core_utils.litellm_logging import (
 from litellm.llms.vertex_ai_and_google_ai_studio.gemini.vertex_and_google_ai_studio_gemini import (
    VertexLLM,
 )
-from litellm.proxy._types import PassThroughEndpointLoggingResultValues
 from litellm.proxy.auth.user_api_key_auth import user_api_key_auth
 from litellm.types.utils import StandardPassThroughResponseObject
-from litellm.utils import executor as thread_pool_executor

 from .llm_provider_handlers.anthropic_passthrough_logging_handler import (
    AnthropicPassthroughLoggingHandler,
@ -51,70 +49,53 @@ class PassThroughEndpointLogging:
        cache_hit: bool,
        **kwargs,
    ):
-        standard_logging_response_object: Optional[
-            PassThroughEndpointLoggingResultValues
-        ] = None
        if self.is_vertex_route(url_route):
-            vertex_passthrough_logging_handler_result = (
-                VertexPassthroughLoggingHandler.vertex_passthrough_handler(
-                    httpx_response=httpx_response,
-                    logging_obj=logging_obj,
-                    url_route=url_route,
-                    result=result,
-                    start_time=start_time,
-                    end_time=end_time,
-                    cache_hit=cache_hit,
-                    **kwargs,
-                )
+            await VertexPassthroughLoggingHandler.vertex_passthrough_handler(
+                httpx_response=httpx_response,
+                logging_obj=logging_obj,
+                url_route=url_route,
+                result=result,
+                start_time=start_time,
+                end_time=end_time,
+                cache_hit=cache_hit,
+                **kwargs,
            )
-            standard_logging_response_object = (
-                vertex_passthrough_logging_handler_result["result"]
-            )
-            kwargs = vertex_passthrough_logging_handler_result["kwargs"]
        elif self.is_anthropic_route(url_route):
-            anthropic_passthrough_logging_handler_result = (
-                AnthropicPassthroughLoggingHandler.anthropic_passthrough_handler(
-                    httpx_response=httpx_response,
-                    response_body=response_body or {},
-                    logging_obj=logging_obj,
-                    url_route=url_route,
-                    result=result,
-                    start_time=start_time,
-                    end_time=end_time,
-                    cache_hit=cache_hit,
-                    **kwargs,
-                )
+            await AnthropicPassthroughLoggingHandler.anthropic_passthrough_handler(
+                httpx_response=httpx_response,
+                response_body=response_body or {},
+                logging_obj=logging_obj,
+                url_route=url_route,
+                result=result,
+                start_time=start_time,
+                end_time=end_time,
+                cache_hit=cache_hit,
+                **kwargs,
            )
-
-            standard_logging_response_object = (
-                anthropic_passthrough_logging_handler_result["result"]
-            )
-            kwargs = anthropic_passthrough_logging_handler_result["kwargs"]
-        if standard_logging_response_object is None:
+        else:
            standard_logging_response_object = StandardPassThroughResponseObject(
                response=httpx_response.text
            )
-        thread_pool_executor.submit(
-            logging_obj.success_handler,
-            args=(
-                standard_logging_response_object,
-                start_time,
-                end_time,
-                cache_hit,
-            ),
-        )
-
-        await logging_obj.async_success_handler(
-            result=(
-                json.dumps(result)
-                if isinstance(result, dict)
-                else standard_logging_response_object
-            ),
-            start_time=start_time,
-            end_time=end_time,
-            cache_hit=False,
-            **kwargs,
-        )
+            threading.Thread(
+                target=logging_obj.success_handler,
+                args=(
+                    standard_logging_response_object,
+                    start_time,
+                    end_time,
+                    cache_hit,
+                ),
+            ).start()
+            await logging_obj.async_success_handler(
+                result=(
+                    json.dumps(result)
+                    if isinstance(result, dict)
+                    else standard_logging_response_object
+                ),
+                start_time=start_time,
+                end_time=end_time,
+                cache_hit=False,
+                **kwargs,
+            )

    def is_vertex_route(self, url_route: str):
        for route in self.TRACKED_VERTEX_ROUTES:
--- a/litellm/proxy/proxy_config.yaml
+++ b/litellm/proxy/proxy_config.yaml
@ -1,5 +1,9 @@
-include:
-  - model_config.yaml
+model_list:
+  - model_name: gpt-4o
+    litellm_params:
+      model: openai/gpt-4o
+      api_key: os.environ/OPENAI_API_KEY

-litellm_settings:
-  callbacks: ["datadog"] 
+# NOTE(review): `vertex_project` below is a hard-coded GCP project id in the
+# shipped proxy config - presumably a test project; confirm it is meant to be
+# committed rather than supplied per deployment.
+default_vertex_config:
+  vertex_project: "adroit-crow-413218"
+  vertex_location: "us-central1"
--- a/litellm/proxy/proxy_server.py
+++ b/litellm/proxy/proxy_server.py
@ -134,10 +134,7 @@ from litellm.proxy.auth.model_checks import (
    get_key_models,
    get_team_models,
 )
-from litellm.proxy.auth.user_api_key_auth import (
-    user_api_key_auth,
-    user_api_key_auth_websocket,
-)
+from litellm.proxy.auth.user_api_key_auth import user_api_key_auth

 ## Import All Misc routes here ##
 from litellm.proxy.caching_routes import router as caching_router
@ -176,7 +173,6 @@ from litellm.proxy.health_endpoints._health_endpoints import router as health_ro
 from litellm.proxy.hooks.prompt_injection_detection import (
    _OPTIONAL_PromptInjectionDetection,
 )
-from litellm.proxy.hooks.proxy_failure_handler import _PROXY_failure_handler
 from litellm.proxy.litellm_pre_call_utils import add_litellm_data_to_request
 from litellm.proxy.management_endpoints.customer_endpoints import (
    router as customer_router,
@ -186,8 +182,8 @@ from litellm.proxy.management_endpoints.internal_user_endpoints import (
 )
 from litellm.proxy.management_endpoints.internal_user_endpoints import user_update
 from litellm.proxy.management_endpoints.key_management_endpoints import (
+    _duration_in_seconds,
    delete_verification_token,
-    duration_in_seconds,
    generate_key_helper_fn,
 )
 from litellm.proxy.management_endpoints.key_management_endpoints import (
@ -272,7 +268,6 @@ from litellm.types.llms.anthropic import (
 from litellm.types.llms.openai import HttpxBinaryResponseContent
 from litellm.types.router import RouterGeneralSettings
 from litellm.types.utils import StandardLoggingPayload
-from litellm.utils import get_end_user_id_for_cost_tracking

 try:
    from litellm._version import version
@ -530,6 +525,14 @@ db_writer_client: Optional[HTTPHandler] = None
 ### logger ###


+def _get_pydantic_json_dict(pydantic_obj: BaseModel) -> dict:
+    """
+    Convert a pydantic model instance to a plain dict.
+
+    Tries `model_dump()` first and falls back to `dict()` if that call
+    raises for any reason (e.g. when running on pydantic v1).
+    """
+    try:
+        as_dict = pydantic_obj.model_dump()  # type: ignore
+    except Exception:
+        # if using pydantic v1
+        as_dict = pydantic_obj.dict()
+    return as_dict
+
+
 def get_custom_headers(
    *,
    user_api_key_dict: UserAPIKeyAuth,
@ -683,6 +686,68 @@ def cost_tracking():
                litellm._async_success_callback.append(_PROXY_track_cost_callback)  # type: ignore


+async def _PROXY_failure_handler(
+    kwargs,  # kwargs to completion
+    completion_response: litellm.ModelResponse,  # response from completion
+    start_time=None,
+    end_time=None,  # start/end time for completion
+):
+    """
+    Failure callback: persist one error-log row for a failed call.
+
+    Does nothing when no `prisma_client` is configured. Otherwise builds a
+    `LiteLLM_ErrorLogs` record from `kwargs` and writes it through
+    `prisma_client.db.litellm_errorlogs.create`.
+
+    Args:
+        kwargs: kwargs of the failed call. Keys read here: "exception",
+            "model", "optional_params", "litellm_params", "start_time",
+            "end_time".
+        completion_response: not used by this handler.
+        start_time: not used; the logged start time comes from `kwargs`.
+        end_time: not used; the logged end time comes from `kwargs`.
+    """
+    global prisma_client
+    if prisma_client is None:
+        # No database configured - nothing to persist
+        return
+
+    verbose_proxy_logger.debug(
+        "inside _PROXY_failure_handler kwargs=", extra=kwargs
+    )
+
+    _exception = kwargs.get("exception")
+    _exception_type = _exception.__class__.__name__
+    _model = kwargs.get("model", None)
+
+    # Stringify every optional param and cap it at 100 chars. The previous
+    # loop only rebound its loop variable, so nothing was actually truncated.
+    # Building a new dict leaves the caller's `optional_params` untouched and
+    # keeps the values serializable for the `json.dumps` call below.
+    _optional_params = {
+        k: str(v)[:100] for k, v in (kwargs.get("optional_params") or {}).items()
+    }
+
+    _status_code = "500"
+    try:
+        _status_code = str(_exception.status_code)
+    except Exception:
+        # Don't let this fail logging the exception to the dB
+        pass
+
+    _litellm_params = kwargs.get("litellm_params", {}) or {}
+    _metadata = _litellm_params.get("metadata", {}) or {}
+    # `model_info` may be present but None - treat that like "missing"
+    _model_id = (_metadata.get("model_info") or {}).get("id", "")
+    _model_group = _metadata.get("model_group", "")
+    api_base = litellm.get_api_base(model=_model, optional_params=_litellm_params)
+    _exception_string = str(_exception)
+
+    error_log = LiteLLM_ErrorLogs(
+        request_id=str(uuid.uuid4()),
+        model_group=_model_group,
+        model_id=_model_id,
+        litellm_model_name=kwargs.get("model"),
+        request_kwargs=_optional_params,
+        api_base=api_base,
+        exception_type=_exception_type,
+        status_code=_status_code,
+        exception_string=_exception_string,
+        startTime=kwargs.get("start_time"),
+        endTime=kwargs.get("end_time"),
+    )
+
+    # helper function to convert to dict on pydantic v2 & v1
+    error_log_dict = _get_pydantic_json_dict(error_log)
+    # `request_kwargs` is written as a JSON string rather than a nested dict
+    error_log_dict["request_kwargs"] = json.dumps(error_log_dict["request_kwargs"])
+
+    await prisma_client.db.litellm_errorlogs.create(
+        data=error_log_dict  # type: ignore
+    )
+
+
@log_db_metrics
 async def _PROXY_track_cost_callback(
    kwargs,  # kwargs to completion
@ -698,7 +763,8 @@ async def _PROXY_track_cost_callback(
        )
        parent_otel_span = _get_parent_otel_span_from_kwargs(kwargs=kwargs)
        litellm_params = kwargs.get("litellm_params", {}) or {}
-        end_user_id = get_end_user_id_for_cost_tracking(litellm_params)
+        proxy_server_request = litellm_params.get("proxy_server_request") or {}
+        end_user_id = proxy_server_request.get("body", {}).get("user", None)
        metadata = get_litellm_metadata_from_kwargs(kwargs=kwargs)
        user_id = metadata.get("user_api_key_user_id", None)
        team_id = metadata.get("user_api_key_team_id", None)
@ -1311,16 +1377,6 @@ class ProxyConfig:
        _, file_extension = os.path.splitext(config_file_path)
        return file_extension.lower() == ".yaml" or file_extension.lower() == ".yml"

-    def _load_yaml_file(self, file_path: str) -> dict:
-        """
-        Load and parse a YAML file
-        """
-        try:
-            with open(file_path, "r") as file:
-                return yaml.safe_load(file) or {}
-        except Exception as e:
-            raise Exception(f"Error loading yaml file {file_path}: {str(e)}")
-
    async def _get_config_from_file(
        self, config_file_path: Optional[str] = None
    ) -> dict:
@ -1351,51 +1407,6 @@ class ProxyConfig:
                "litellm_settings": {},
            }

-        # Process includes
-        config = self._process_includes(
-            config=config, base_dir=os.path.dirname(os.path.abspath(file_path or ""))
-        )
-
-        verbose_proxy_logger.debug(f"loaded config={json.dumps(config, indent=4)}")
-        return config
-
-    def _process_includes(self, config: dict, base_dir: str) -> dict:
-        """
-        Process includes by appending their contents to the main config
-
-        Handles nested config.yamls with `include` section
-
-        Example config: This will get the contents from files in `include` and append it
-        ```yaml
-        include:
-            - model_config.yaml
-
-        litellm_settings:
-            callbacks: ["prometheus"]
-        ```
-        """
-        if "include" not in config:
-            return config
-
-        if not isinstance(config["include"], list):
-            raise ValueError("'include' must be a list of file paths")
-
-        # Load and append all included files
-        for include_file in config["include"]:
-            file_path = os.path.join(base_dir, include_file)
-            if not os.path.exists(file_path):
-                raise FileNotFoundError(f"Included file not found: {file_path}")
-
-            included_config = self._load_yaml_file(file_path)
-            # Simply update/extend the main config with included config
-            for key, value in included_config.items():
-                if isinstance(value, list) and key in config:
-                    config[key].extend(value)
-                else:
-                    config[key] = value
-
-        # Remove the include directive
-        del config["include"]
        return config

    async def save_config(self, new_config: dict):
@ -4328,11 +4339,7 @@ from litellm import _arealtime


@app.websocket("/v1/realtime")
-async def websocket_endpoint(
-    websocket: WebSocket,
-    model: str,
-    user_api_key_dict=Depends(user_api_key_auth_websocket),
-):
+async def websocket_endpoint(websocket: WebSocket, model: str):
    import websockets

    await websocket.accept()
@ -5656,11 +5663,11 @@ async def anthropic_response(  # noqa: PLR0915
    user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
 ):
    """
-    🚨 DEPRECATED ENDPOINT🚨
+    This is a BETA endpoint that calls 100+ LLMs in the anthropic format.

-    Use `{PROXY_BASE_URL}/anthropic/v1/messages` instead - [Docs](https://docs.litellm.ai/docs/anthropic_completion).
+    To do a simple pass-through for anthropic, do `{PROXY_BASE_URL}/anthropic/v1/messages`

-    This was a BETA endpoint that calls 100+ LLMs in the anthropic format.
+    Docs - https://docs.litellm.ai/docs/anthropic_completion
    """
    from litellm import adapter_completion
    from litellm.adapters.anthropic_adapter import anthropic_adapter
--- a/litellm/proxy/route_llm_request.py
+++ b/litellm/proxy/route_llm_request.py
@ -86,6 +86,7 @@ async def route_request(
        else:
            models = [model.strip() for model in data.pop("model").split(",")]
            return llm_router.abatch_completion(models=models, **data)
+
    elif llm_router is not None:
        if (
            data["model"] in router_model_names
@ -112,9 +113,6 @@ async def route_request(
                or len(llm_router.pattern_router.patterns) > 0
            ):
                return getattr(llm_router, f"{route_type}")(**data)
-            elif route_type == "amoderation":
-                # moderation endpoint does not require `model` parameter
-                return getattr(llm_router, f"{route_type}")(**data)

    elif user_model is not None:
        return getattr(litellm, f"{route_type}")(**data)
--- a/litellm/proxy/utils.py
+++ b/litellm/proxy/utils.py
@ -26,11 +26,6 @@ from typing import (
    overload,
 )

-from litellm.litellm_core_utils.duration_parser import (
-    _extract_from_regex,
-    duration_in_seconds,
-    get_last_day_of_month,
-)
 from litellm.proxy._types import ProxyErrorTypes, ProxyException

 try:
@ -342,14 +337,14 @@ class ProxyLogging:
                alert_to_webhook_url=self.alert_to_webhook_url,
            )

-            if self.alerting is not None and "slack" in self.alerting:
+            if (
+                self.alerting is not None
+                and "slack" in self.alerting
+                and "daily_reports" in self.alert_types
+            ):
                # NOTE: ENSURE we only add callbacks when alerting is on
                # We should NOT add callbacks when alerting is off
-                if "daily_reports" in self.alert_types:
-                    litellm.callbacks.append(self.slack_alerting_instance)  # type: ignore
-                litellm.success_callback.append(
-                    self.slack_alerting_instance.response_taking_too_long_callback
-                )
+                litellm.callbacks.append(self.slack_alerting_instance)  # type: ignore

        if redis_cache is not None:
            self.internal_usage_cache.dual_cache.redis_cache = redis_cache
@ -359,6 +354,9 @@ class ProxyLogging:
        litellm.callbacks.append(self.max_budget_limiter)  # type: ignore
        litellm.callbacks.append(self.cache_control_check)  # type: ignore
        litellm.callbacks.append(self.service_logging_obj)  # type: ignore
+        litellm.success_callback.append(
+            self.slack_alerting_instance.response_taking_too_long_callback
+        )
        for callback in litellm.callbacks:
            if isinstance(callback, str):
                callback = litellm.litellm_core_utils.litellm_logging._init_custom_logger_compatible_class(  # type: ignore
@ -854,20 +852,6 @@ class ProxyLogging:
                    ),
                ).start()

-        await self._run_post_call_failure_hook_custom_loggers(
-            original_exception=original_exception,
-            request_data=request_data,
-            user_api_key_dict=user_api_key_dict,
-        )
-
-        return
-
-    async def _run_post_call_failure_hook_custom_loggers(
-        self,
-        original_exception: Exception,
-        request_data: dict,
-        user_api_key_dict: UserAPIKeyAuth,
-    ):
        for callback in litellm.callbacks:
            try:
                _callback: Optional[CustomLogger] = None
@ -886,38 +870,7 @@ class ProxyLogging:
            except Exception as e:
                raise e

-    async def async_log_proxy_authentication_errors(
-        self,
-        original_exception: Exception,
-        request: Request,
-        parent_otel_span: Optional[Any],
-        api_key: Optional[str],
-    ):
-        """
-        Handler for Logging Authentication Errors on LiteLLM Proxy
-        Why not use post_call_failure_hook?
-        - `post_call_failure_hook` calls `litellm_logging_obj.async_failure_handler`. This led to the Exception being logged twice
-
-        What does this handler do?
-        - Logs Authentication Errors (like invalid API Key passed) to CustomLogger compatible classes (OTEL, Datadog etc)
-            - calls CustomLogger.async_post_call_failure_hook
-        """
-
-        user_api_key_dict = UserAPIKeyAuth(
-            parent_otel_span=parent_otel_span,
-            token=_hash_token_if_needed(token=api_key or ""),
-        )
-        try:
-            request_data = await request.json()
-        except json.JSONDecodeError:
-            # For GET requests or requests without a JSON body
-            request_data = {}
-        await self._run_post_call_failure_hook_custom_loggers(
-            original_exception=original_exception,
-            request_data=request_data,
-            user_api_key_dict=user_api_key_dict,
-        )
-        pass
+        return

    async def post_call_success_hook(
        self,
@ -2479,6 +2432,86 @@ def _hash_token_if_needed(token: str) -> str:
        return token


+def _extract_from_regex(duration: str) -> Tuple[int, str]:
+    """
+    Split a duration string such as "30d" or "1mo" into (amount, unit).
+
+    The unit is one of "s", "m", "h", "d", "mo", or "" when the digits are
+    not followed by one of those. Matching is anchored at the start of the
+    string only; anything after the unit is ignored.
+
+    Raises:
+        ValueError: if `duration` does not start with at least one digit.
+    """
+    parsed = re.match(r"(\d+)(mo|[smhd]?)", duration)
+    if parsed is None:
+        raise ValueError("Invalid duration format")
+
+    amount, unit = parsed.group(1), parsed.group(2)
+    return int(amount), unit
+
+
+def get_last_day_of_month(year, month):
+    """
+    Return the day number of the last day of `month` in `year`.
+    """
+    if month == 12:
+        # December is handled directly: `month + 1` below would be 13,
+        # which is not a valid month for `datetime`.
+        return 31
+    # Step back one day from the 1st of the following month
+    first_of_following_month = datetime(year=year, month=month + 1, day=1)
+    return (first_of_following_month - timedelta(days=1)).day
+
+
+def _duration_in_seconds(duration: str) -> int:
+    """
+    Parameters:
+    - duration:
+        - "<number>s" - seconds
+        - "<number>m" - minutes
+        - "<number>h" - hours
+        - "<number>d" - days
+        - "<number>mo" - months
+
+    Returns time in seconds till when budget needs to be reset
+
+    For "mo" the result is the number of seconds from now until the same
+    day-of-month and time-of-day `<number>` calendar months ahead; the day is
+    clamped to the last day of the target month (e.g. Jan 31 + 1mo -> Feb 28/29).
+
+    Raises:
+        ValueError: if the duration cannot be parsed or has no supported unit.
+    """
+    value, unit = _extract_from_regex(duration=duration)
+
+    if unit == "s":
+        return value
+    elif unit == "m":
+        return value * 60
+    elif unit == "h":
+        return value * 3600
+    elif unit == "d":
+        return value * 86400
+    elif unit == "mo":
+        current_time = datetime.fromtimestamp(time.time())
+
+        # Advance by `value` calendar months, carrying any overflow into the
+        # year. The previous code always jumped to January when run in
+        # December (ignoring `value`) and otherwise used `month + value`
+        # unchanged, which is an invalid month once it exceeds 12.
+        months_since_year_start = current_time.month - 1 + value
+        target_year = current_time.year + months_since_year_start // 12
+        target_month = months_since_year_start % 12 + 1
+
+        # Clamp the day so that e.g. the 31st maps onto a 30-day month
+        target_day = min(
+            current_time.day, get_last_day_of_month(target_year, target_month)
+        )
+
+        # Same time-of-day, `value` months ahead
+        target_time = current_time.replace(
+            year=target_year, month=target_month, day=target_day
+        )
+
+        # Seconds from now until that moment
+        return int((target_time - current_time).total_seconds())
+
+    else:
+        raise ValueError("Unsupported duration unit")
+
+
 async def reset_budget(prisma_client: PrismaClient):
    """
    Gets all the non-expired keys for a db, which need spend to be reset
@ -2497,7 +2530,7 @@ async def reset_budget(prisma_client: PrismaClient):
        if keys_to_reset is not None and len(keys_to_reset) > 0:
            for key in keys_to_reset:
                key.spend = 0.0
-                duration_s = duration_in_seconds(duration=key.budget_duration)
+                duration_s = _duration_in_seconds(duration=key.budget_duration)
                key.budget_reset_at = now + timedelta(seconds=duration_s)

            await prisma_client.update_data(
@ -2513,7 +2546,7 @@ async def reset_budget(prisma_client: PrismaClient):
        if users_to_reset is not None and len(users_to_reset) > 0:
            for user in users_to_reset:
                user.spend = 0.0
-                duration_s = duration_in_seconds(duration=user.budget_duration)
+                duration_s = _duration_in_seconds(duration=user.budget_duration)
                user.budget_reset_at = now + timedelta(seconds=duration_s)

            await prisma_client.update_data(
@ -2531,7 +2564,7 @@ async def reset_budget(prisma_client: PrismaClient):
        if teams_to_reset is not None and len(teams_to_reset) > 0:
            team_reset_requests = []
            for team in teams_to_reset:
-                duration_s = duration_in_seconds(duration=team.budget_duration)
+                duration_s = _duration_in_seconds(duration=team.budget_duration)
                reset_team_budget_request = ResetTeamBudgetRequest(
                    team_id=team.team_id,
                    spend=0.0,
--- a/litellm/proxy/vertex_ai_endpoints/langfuse_endpoints.py
+++ b/litellm/proxy/vertex_ai_endpoints/langfuse_endpoints.py
@ -58,21 +58,12 @@ def create_request_copy(request: Request):
    }


-@router.api_route(
-    "/langfuse/{endpoint:path}",
-    methods=["GET", "POST", "PUT", "DELETE", "PATCH"],
-    tags=["Langfuse Pass-through", "pass-through"],
-)
+@router.api_route("/langfuse/{endpoint:path}", methods=["GET", "POST", "PUT", "DELETE"])
 async def langfuse_proxy_route(
    endpoint: str,
    request: Request,
    fastapi_response: Response,
 ):
-    """
-    Call Langfuse via LiteLLM proxy. Works with Langfuse SDK.
-
-    [Docs](https://docs.litellm.ai/docs/pass_through/langfuse)
-    """
    ## CHECK FOR LITELLM API KEY IN THE QUERY PARAMS - ?..key=LITELLM_API_KEY
    api_key = request.headers.get("Authorization") or ""

--- a/litellm/proxy/vertex_ai_endpoints/vertex_endpoints.py
+++ b/litellm/proxy/vertex_ai_endpoints/vertex_endpoints.py
@ -28,54 +28,25 @@ from litellm.proxy.auth.user_api_key_auth import user_api_key_auth
 from litellm.proxy.pass_through_endpoints.pass_through_endpoints import (
    create_pass_through_route,
 )
-from litellm.secret_managers.main import get_secret_str
-from litellm.types.passthrough_endpoints.vertex_ai import *

 router = APIRouter()
-
-default_vertex_config: VertexPassThroughCredentials = VertexPassThroughCredentials()
+default_vertex_config = None


-def _get_vertex_env_vars() -> VertexPassThroughCredentials:
-    """
-    Helper to get vertex pass through config from environment variables
-
-    The following environment variables are used:
-    - DEFAULT_VERTEXAI_PROJECT (project id)
-    - DEFAULT_VERTEXAI_LOCATION (location)
-    - DEFAULT_GOOGLE_APPLICATION_CREDENTIALS (path to credentials file)
-    """
-    return VertexPassThroughCredentials(
-        vertex_project=get_secret_str("DEFAULT_VERTEXAI_PROJECT"),
-        vertex_location=get_secret_str("DEFAULT_VERTEXAI_LOCATION"),
-        vertex_credentials=get_secret_str("DEFAULT_GOOGLE_APPLICATION_CREDENTIALS"),
-    )
-
-
-def set_default_vertex_config(config: Optional[dict] = None):
-    """Sets vertex configuration from provided config and/or environment variables
-
-    Args:
-        config (Optional[dict]): Configuration dictionary
-        Example: {
-            "vertex_project": "my-project-123",
-            "vertex_location": "us-central1",
-            "vertex_credentials": "os.environ/GOOGLE_CREDS"
-        }
-    """
+def set_default_vertex_config(config):
    global default_vertex_config
-
-    # Initialize config dictionary if None
    if config is None:
-        default_vertex_config = _get_vertex_env_vars()
        return

+    if not isinstance(config, dict):
+        raise ValueError("invalid config, vertex default config must be a dictionary")
+
    if isinstance(config, dict):
        for key, value in config.items():
            if isinstance(value, str) and value.startswith("os.environ/"):
                config[key] = litellm.get_secret(value)

-    default_vertex_config = VertexPassThroughCredentials(**config)
+    default_vertex_config = config


 def exception_handler(e: Exception):
@ -142,26 +113,13 @@ def construct_target_url(


@router.api_route(
-    "/vertex-ai/{endpoint:path}",
-    methods=["GET", "POST", "PUT", "DELETE", "PATCH"],
-    tags=["Vertex AI Pass-through", "pass-through"],
-    include_in_schema=False,
-)
-@router.api_route(
-    "/vertex_ai/{endpoint:path}",
-    methods=["GET", "POST", "PUT", "DELETE", "PATCH"],
-    tags=["Vertex AI Pass-through", "pass-through"],
+    "/vertex-ai/{endpoint:path}", methods=["GET", "POST", "PUT", "DELETE"]
 )
 async def vertex_proxy_route(
    endpoint: str,
    request: Request,
    fastapi_response: Response,
 ):
-    """
-    Call LiteLLM proxy via Vertex AI SDK.
-
-    [Docs](https://docs.litellm.ai/docs/pass_through/vertex_ai)
-    """
    encoded_endpoint = httpx.URL(endpoint).path

    import re
@ -177,7 +135,7 @@ async def vertex_proxy_route(
    vertex_project = None
    vertex_location = None
    # Use headers from the incoming request if default_vertex_config is not set
-    if default_vertex_config.vertex_project is None:
+    if default_vertex_config is None:
        headers = dict(request.headers) or {}
        verbose_proxy_logger.debug(
            "default_vertex_config  not set, incoming request headers %s", headers
@ -190,9 +148,9 @@ async def vertex_proxy_route(
        headers.pop("content-length", None)
        headers.pop("host", None)
    else:
-        vertex_project = default_vertex_config.vertex_project
-        vertex_location = default_vertex_config.vertex_location
-        vertex_credentials = default_vertex_config.vertex_credentials
+        vertex_project = default_vertex_config.get("vertex_project")
+        vertex_location = default_vertex_config.get("vertex_location")
+        vertex_credentials = default_vertex_config.get("vertex_credentials")

        base_target_url = f"https://{vertex_location}-aiplatform.googleapis.com/"

--- a/litellm/rerank_api/main.py
+++ b/litellm/rerank_api/main.py
@ -91,7 +91,6 @@ def rerank(
    model_info = kwargs.get("model_info", None)
    metadata = kwargs.get("metadata", {})
    user = kwargs.get("user", None)
-    client = kwargs.get("client", None)
    try:
        _is_async = kwargs.pop("arerank", False) is True
        optional_params = GenericLiteLLMParams(**kwargs)
@ -151,7 +150,7 @@ def rerank(
                or optional_params.api_base
                or litellm.api_base
                or get_secret("COHERE_API_BASE")  # type: ignore
-                or "https://api.cohere.com"
+                or "https://api.cohere.com/v1/rerank"
            )

            if api_base is None:
@ -174,7 +173,6 @@ def rerank(
                _is_async=_is_async,
                headers=headers,
                litellm_logging_obj=litellm_logging_obj,
-                client=client,
            )
        elif _custom_llm_provider == "azure_ai":
            api_base = (
--- a/litellm/router.py
+++ b/litellm/router.py
@ -41,7 +41,6 @@ from typing import (
 import httpx
 import openai
 from openai import AsyncOpenAI
-from pydantic import BaseModel
 from typing_extensions import overload

 import litellm
@ -123,7 +122,6 @@ from litellm.types.router import (
    ModelInfo,
    ProviderBudgetConfigType,
    RetryPolicy,
-    RouterCacheEnum,
    RouterErrors,
    RouterGeneralSettings,
    RouterModelGroupAliasItem,
@ -241,6 +239,7 @@ class Router:
        ] = "simple-shuffle",
        routing_strategy_args: dict = {},  # just for latency-based
        provider_budget_config: Optional[ProviderBudgetConfigType] = None,
+        semaphore: Optional[asyncio.Semaphore] = None,
        alerting_config: Optional[AlertingConfig] = None,
        router_general_settings: Optional[
            RouterGeneralSettings
@ -316,6 +315,8 @@ class Router:

        from litellm._service_logger import ServiceLogging

+        if semaphore:
+            self.semaphore = semaphore
        self.set_verbose = set_verbose
        self.debug_level = debug_level
        self.enable_pre_call_checks = enable_pre_call_checks
@ -505,14 +506,6 @@ class Router:
            litellm.success_callback.append(self.sync_deployment_callback_on_success)
        else:
            litellm.success_callback = [self.sync_deployment_callback_on_success]
-        if isinstance(litellm._async_failure_callback, list):
-            litellm._async_failure_callback.append(
-                self.async_deployment_callback_on_failure
-            )
-        else:
-            litellm._async_failure_callback = [
-                self.async_deployment_callback_on_failure
-            ]
        ## COOLDOWNS ##
        if isinstance(litellm.failure_callback, list):
            litellm.failure_callback.append(self.deployment_callback_on_failure)
@ -2563,7 +2556,10 @@ class Router:
        original_function: Callable,
        **kwargs,
    ):
-        if kwargs.get("model") and self.get_model_list(model_name=kwargs["model"]):
+        if (
+            "model" in kwargs
+            and self.get_model_list(model_name=kwargs["model"]) is not None
+        ):
            deployment = await self.async_get_available_deployment(
                model=kwargs["model"]
            )
@ -3295,14 +3291,13 @@ class Router:
    ):
        """
        Track remaining tpm/rpm quota for model in model_list
+
+        Currently, only updates TPM usage.
        """
        try:
            if kwargs["litellm_params"].get("metadata") is None:
                pass
            else:
-                deployment_name = kwargs["litellm_params"]["metadata"].get(
-                    "deployment", None
-                )  # stable name - works for wildcard routes as well
                model_group = kwargs["litellm_params"]["metadata"].get(
                    "model_group", None
                )
@ -3313,8 +3308,6 @@ class Router:
                elif isinstance(id, int):
                    id = str(id)

-                parent_otel_span = _get_parent_otel_span_from_kwargs(kwargs)
-
                _usage_obj = completion_response.get("usage")
                total_tokens = _usage_obj.get("total_tokens", 0) if _usage_obj else 0

@ -3326,14 +3319,13 @@ class Router:
                    "%H-%M"
                )  # use the same timezone regardless of system clock

-                tpm_key = RouterCacheEnum.TPM.value.format(
-                    id=id, current_minute=current_minute, model=deployment_name
-                )
+                tpm_key = f"global_router:{id}:tpm:{current_minute}"
                # ------------
                # Update usage
                # ------------
                # update cache

+                parent_otel_span = _get_parent_otel_span_from_kwargs(kwargs)
                ## TPM
                await self.cache.async_increment_cache(
                    key=tpm_key,
@ -3342,17 +3334,6 @@ class Router:
                    ttl=RoutingArgs.ttl.value,
                )

-                ## RPM
-                rpm_key = RouterCacheEnum.RPM.value.format(
-                    id=id, current_minute=current_minute, model=deployment_name
-                )
-                await self.cache.async_increment_cache(
-                    key=rpm_key,
-                    value=1,
-                    parent_otel_span=parent_otel_span,
-                    ttl=RoutingArgs.ttl.value,
-                )
-
                increment_deployment_successes_for_current_minute(
                    litellm_router_instance=self,
                    deployment_id=id,
@ -3465,40 +3446,6 @@ class Router:
        except Exception as e:
            raise e

-    async def async_deployment_callback_on_failure(
-        self, kwargs, completion_response: Optional[Any], start_time, end_time
-    ):
-        """
-        Update RPM usage for a deployment
-        """
-        deployment_name = kwargs["litellm_params"]["metadata"].get(
-            "deployment", None
-        )  # handles wildcard routes - by giving the original name sent to `litellm.completion`
-        model_group = kwargs["litellm_params"]["metadata"].get("model_group", None)
-        model_info = kwargs["litellm_params"].get("model_info", {}) or {}
-        id = model_info.get("id", None)
-        if model_group is None or id is None:
-            return
-        elif isinstance(id, int):
-            id = str(id)
-        parent_otel_span = _get_parent_otel_span_from_kwargs(kwargs)
-
-        dt = get_utc_datetime()
-        current_minute = dt.strftime(
-            "%H-%M"
-        )  # use the same timezone regardless of system clock
-
-        ## RPM
-        rpm_key = RouterCacheEnum.RPM.value.format(
-            id=id, current_minute=current_minute, model=deployment_name
-        )
-        await self.cache.async_increment_cache(
-            key=rpm_key,
-            value=1,
-            parent_otel_span=parent_otel_span,
-            ttl=RoutingArgs.ttl.value,
-        )
-
    def log_retry(self, kwargs: dict, e: Exception) -> dict:
        """
        When a retry or fallback happens, log the details of the just failed model call - similar to Sentry breadcrumbing
@ -4176,24 +4123,7 @@ class Router:
                    raise Exception("Model Name invalid - {}".format(type(model)))
        return None

-    @overload
-    def get_router_model_info(
-        self, deployment: dict, received_model_name: str, id: None = None
-    ) -> ModelMapInfo:
-        pass
-
-    @overload
-    def get_router_model_info(
-        self, deployment: None, received_model_name: str, id: str
-    ) -> ModelMapInfo:
-        pass
-
-    def get_router_model_info(
-        self,
-        deployment: Optional[dict],
-        received_model_name: str,
-        id: Optional[str] = None,
-    ) -> ModelMapInfo:
+    def get_router_model_info(self, deployment: dict) -> ModelMapInfo:
        """
        For a given model id, return the model info (max tokens, input cost, output cost, etc.).

@ -4207,14 +4137,6 @@ class Router:
        Raises:
        - ValueError -> If model is not mapped yet
        """
-        if id is not None:
-            _deployment = self.get_deployment(model_id=id)
-            if _deployment is not None:
-                deployment = _deployment.model_dump(exclude_none=True)
-
-        if deployment is None:
-            raise ValueError("Deployment not found")
-
        ## GET BASE MODEL
        base_model = deployment.get("model_info", {}).get("base_model", None)
        if base_model is None:
@ -4236,27 +4158,10 @@ class Router:
        elif custom_llm_provider != "azure":
            model = _model

-            potential_models = self.pattern_router.route(received_model_name)
-            if "*" in model and potential_models is not None:  # if wildcard route
-                for potential_model in potential_models:
-                    try:
-                        if potential_model.get("model_info", {}).get(
-                            "id"
-                        ) == deployment.get("model_info", {}).get("id"):
-                            model = potential_model.get("litellm_params", {}).get(
-                                "model"
-                            )
-                            break
-                    except Exception:
-                        pass
-
        ## GET LITELLM MODEL INFO - raises exception, if model is not mapped
-        if not model.startswith(custom_llm_provider):
-            model_info_name = "{}/{}".format(custom_llm_provider, model)
-        else:
-            model_info_name = model
-
-        model_info = litellm.get_model_info(model=model_info_name)
+        model_info = litellm.get_model_info(
+            model="{}/{}".format(custom_llm_provider, model)
+        )

        ## CHECK USER SET MODEL INFO
        user_model_info = deployment.get("model_info", {})
@ -4306,10 +4211,8 @@ class Router:
        total_tpm: Optional[int] = None
        total_rpm: Optional[int] = None
        configurable_clientside_auth_params: CONFIGURABLE_CLIENTSIDE_AUTH_PARAMS = None
-        model_list = self.get_model_list(model_name=model_group)
-        if model_list is None:
-            return None
-        for model in model_list:
+
+        for model in self.model_list:
            is_match = False
            if (
                "model_name" in model and model["model_name"] == model_group
@ -4324,7 +4227,7 @@ class Router:
            if not is_match:
                continue
            # model in model group found #
-            litellm_params = LiteLLM_Params(**model["litellm_params"])  # type: ignore
+            litellm_params = LiteLLM_Params(**model["litellm_params"])
            # get configurable clientside auth params
            configurable_clientside_auth_params = (
                litellm_params.configurable_clientside_auth_params
@ -4332,30 +4235,38 @@ class Router:
            # get model tpm
            _deployment_tpm: Optional[int] = None
            if _deployment_tpm is None:
-                _deployment_tpm = model.get("tpm", None)  # type: ignore
+                _deployment_tpm = model.get("tpm", None)
            if _deployment_tpm is None:
-                _deployment_tpm = model.get("litellm_params", {}).get("tpm", None)  # type: ignore
+                _deployment_tpm = model.get("litellm_params", {}).get("tpm", None)
            if _deployment_tpm is None:
-                _deployment_tpm = model.get("model_info", {}).get("tpm", None)  # type: ignore
+                _deployment_tpm = model.get("model_info", {}).get("tpm", None)

+            if _deployment_tpm is not None:
+                if total_tpm is None:
+                    total_tpm = 0
+                total_tpm += _deployment_tpm  # type: ignore
            # get model rpm
            _deployment_rpm: Optional[int] = None
            if _deployment_rpm is None:
-                _deployment_rpm = model.get("rpm", None)  # type: ignore
+                _deployment_rpm = model.get("rpm", None)
            if _deployment_rpm is None:
-                _deployment_rpm = model.get("litellm_params", {}).get("rpm", None)  # type: ignore
+                _deployment_rpm = model.get("litellm_params", {}).get("rpm", None)
            if _deployment_rpm is None:
-                _deployment_rpm = model.get("model_info", {}).get("rpm", None)  # type: ignore
+                _deployment_rpm = model.get("model_info", {}).get("rpm", None)

+            if _deployment_rpm is not None:
+                if total_rpm is None:
+                    total_rpm = 0
+                total_rpm += _deployment_rpm  # type: ignore
            # get model info
            try:
                model_info = litellm.get_model_info(model=litellm_params.model)
            except Exception:
                model_info = None
            # get llm provider
-            litellm_model, llm_provider = "", ""
+            model, llm_provider = "", ""
            try:
-                litellm_model, llm_provider, _, _ = litellm.get_llm_provider(
+                model, llm_provider, _, _ = litellm.get_llm_provider(
                    model=litellm_params.model,
                    custom_llm_provider=litellm_params.custom_llm_provider,
                )
@ -4366,7 +4277,7 @@ class Router:

            if model_info is None:
                supported_openai_params = litellm.get_supported_openai_params(
-                    model=litellm_model, custom_llm_provider=llm_provider
+                    model=model, custom_llm_provider=llm_provider
                )
                if supported_openai_params is None:
                    supported_openai_params = []
@ -4456,20 +4367,7 @@ class Router:
                    model_group_info.supported_openai_params = model_info[
                        "supported_openai_params"
                    ]
-                if model_info.get("tpm", None) is not None and _deployment_tpm is None:
-                    _deployment_tpm = model_info.get("tpm")
-                if model_info.get("rpm", None) is not None and _deployment_rpm is None:
-                    _deployment_rpm = model_info.get("rpm")

-            if _deployment_tpm is not None:
-                if total_tpm is None:
-                    total_tpm = 0
-                total_tpm += _deployment_tpm  # type: ignore
-
-            if _deployment_rpm is not None:
-                if total_rpm is None:
-                    total_rpm = 0
-                total_rpm += _deployment_rpm  # type: ignore
        if model_group_info is not None:
            ## UPDATE WITH TOTAL TPM/RPM FOR MODEL GROUP
            if total_tpm is not None:
@ -4521,10 +4419,7 @@ class Router:
        self, model_group: str
    ) -> Tuple[Optional[int], Optional[int]]:
        """
-        Returns current tpm/rpm usage for model group
-
-        Parameters:
-        - model_group: str - the received model name from the user (can be a wildcard route).
+        Returns current tpm/rpm usage for model group

        Returns:
        - usage: Tuple[tpm, rpm]
@ -4535,37 +4430,20 @@ class Router:
        )  # use the same timezone regardless of system clock
        tpm_keys: List[str] = []
        rpm_keys: List[str] = []
-
-        model_list = self.get_model_list(model_name=model_group)
-        if model_list is None:  # no matching deployments
-            return None, None
-
-        for model in model_list:
-            id: Optional[str] = model.get("model_info", {}).get("id")  # type: ignore
-            litellm_model: Optional[str] = model["litellm_params"].get(
-                "model"
-            )  # USE THE MODEL SENT TO litellm.completion() - consistent with how global_router cache is written.
-            if id is None or litellm_model is None:
-                continue
-            tpm_keys.append(
-                RouterCacheEnum.TPM.value.format(
-                    id=id,
-                    model=litellm_model,
-                    current_minute=current_minute,
+        for model in self.model_list:
+            if "model_name" in model and model["model_name"] == model_group:
+                tpm_keys.append(
+                    f"global_router:{model['model_info']['id']}:tpm:{current_minute}"
                )
-            )
-            rpm_keys.append(
-                RouterCacheEnum.RPM.value.format(
-                    id=id,
-                    model=litellm_model,
-                    current_minute=current_minute,
+                rpm_keys.append(
+                    f"global_router:{model['model_info']['id']}:rpm:{current_minute}"
                )
-            )
        combined_tpm_rpm_keys = tpm_keys + rpm_keys

        combined_tpm_rpm_values = await self.cache.async_batch_get_cache(
            keys=combined_tpm_rpm_keys
        )
+
        if combined_tpm_rpm_values is None:
            return None, None

@ -4590,32 +4468,6 @@ class Router:
                    rpm_usage += t
        return tpm_usage, rpm_usage

-    async def get_remaining_model_group_usage(self, model_group: str) -> Dict[str, int]:
-
-        current_tpm, current_rpm = await self.get_model_group_usage(model_group)
-
-        model_group_info = self.get_model_group_info(model_group)
-
-        if model_group_info is not None and model_group_info.tpm is not None:
-            tpm_limit = model_group_info.tpm
-        else:
-            tpm_limit = None
-
-        if model_group_info is not None and model_group_info.rpm is not None:
-            rpm_limit = model_group_info.rpm
-        else:
-            rpm_limit = None
-
-        returned_dict = {}
-        if tpm_limit is not None and current_tpm is not None:
-            returned_dict["x-ratelimit-remaining-tokens"] = tpm_limit - current_tpm
-            returned_dict["x-ratelimit-limit-tokens"] = tpm_limit
-        if rpm_limit is not None and current_rpm is not None:
-            returned_dict["x-ratelimit-remaining-requests"] = rpm_limit - current_rpm
-            returned_dict["x-ratelimit-limit-requests"] = rpm_limit
-
-        return returned_dict
-
    async def set_response_headers(
        self, response: Any, model_group: Optional[str] = None
    ) -> Any:
@ -4626,30 +4478,6 @@ class Router:
        # - if healthy_deployments > 1, return model group rate limit headers
        # - else return the model's rate limit headers
        """
-        if (
-            isinstance(response, BaseModel)
-            and hasattr(response, "_hidden_params")
-            and isinstance(response._hidden_params, dict)  # type: ignore
-        ):
-            response._hidden_params.setdefault("additional_headers", {})  # type: ignore
-            response._hidden_params["additional_headers"][  # type: ignore
-                "x-litellm-model-group"
-            ] = model_group
-
-            additional_headers = response._hidden_params["additional_headers"]  # type: ignore
-
-            if (
-                "x-ratelimit-remaining-tokens" not in additional_headers
-                and "x-ratelimit-remaining-requests" not in additional_headers
-                and model_group is not None
-            ):
-                remaining_usage = await self.get_remaining_model_group_usage(
-                    model_group
-                )
-
-                for header, value in remaining_usage.items():
-                    if value is not None:
-                        additional_headers[header] = value
        return response

    def get_model_ids(self, model_name: Optional[str] = None) -> List[str]:
@ -4712,9 +4540,6 @@ class Router:
        if hasattr(self, "model_list"):
            returned_models: List[DeploymentTypedDict] = []

-            if model_name is not None:
-                returned_models.extend(self._get_all_deployments(model_name=model_name))
-
            if hasattr(self, "model_group_alias"):
                for model_alias, model_value in self.model_group_alias.items():

@ -4735,32 +4560,21 @@ class Router:
                        )
                    )

-            if len(returned_models) == 0:  # check if wildcard route
-                potential_wildcard_models = self.pattern_router.route(model_name)
-                if potential_wildcard_models is not None:
-                    returned_models.extend(
-                        [DeploymentTypedDict(**m) for m in potential_wildcard_models]  # type: ignore
-                    )
-
            if model_name is None:
                returned_models += self.model_list

                return returned_models
-
+            returned_models.extend(self._get_all_deployments(model_name=model_name))
            return returned_models
        return None

-    def get_model_access_groups(self, model_name: Optional[str] = None):
-        """
-        If model_name is provided, only return access groups for that model.
-        """
+    def get_model_access_groups(self):
        from collections import defaultdict

        access_groups = defaultdict(list)

-        model_list = self.get_model_list(model_name=model_name)
-        if model_list:
-            for m in model_list:
+        if self.model_list:
+            for m in self.model_list:
                for group in m.get("model_info", {}).get("access_groups", []):
                    model_name = m["model_name"]
                    access_groups[group].append(model_name)
@ -4996,12 +4810,10 @@ class Router:
                    base_model = deployment.get("litellm_params", {}).get(
                        "base_model", None
                    )
-                model_info = self.get_router_model_info(
-                    deployment=deployment, received_model_name=model
-                )
                model = base_model or deployment.get("litellm_params", {}).get(
                    "model", None
                )
+                model_info = self.get_router_model_info(deployment=deployment)

                if (
                    isinstance(model_info, dict)
--- a/litellm/router_strategy/provider_budgets.py
+++ b/litellm/router_strategy/provider_budgets.py
@ -18,17 +18,13 @@ anthropic:
 ```
 """

-import asyncio
-from datetime import datetime, timezone
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, TypedDict, Union

 import litellm
 from litellm._logging import verbose_router_logger
 from litellm.caching.caching import DualCache
-from litellm.caching.redis_cache import RedisPipelineIncrementOperation
 from litellm.integrations.custom_logger import CustomLogger
 from litellm.litellm_core_utils.core_helpers import _get_parent_otel_span_from_kwargs
-from litellm.litellm_core_utils.duration_parser import duration_in_seconds
 from litellm.router_utils.cooldown_callbacks import (
    _get_prometheus_logger_from_callbacks,
 )
@ -47,14 +43,10 @@ if TYPE_CHECKING:
 else:
    Span = Any

-DEFAULT_REDIS_SYNC_INTERVAL = 1
-

 class ProviderBudgetLimiting(CustomLogger):
    def __init__(self, router_cache: DualCache, provider_budget_config: dict):
        self.router_cache = router_cache
-        self.redis_increment_operation_queue: List[RedisPipelineIncrementOperation] = []
-        asyncio.create_task(self.periodic_sync_in_memory_spend_with_redis())

        # cast elements of provider_budget_config to ProviderBudgetInfo
        for provider, config in provider_budget_config.items():
@ -180,76 +172,19 @@ class ProviderBudgetLimiting(CustomLogger):

        return potential_deployments

-    async def _get_or_set_budget_start_time(
-        self, start_time_key: str, current_time: float, ttl_seconds: int
-    ) -> float:
-        """
-        Checks if the key = `provider_budget_start_time:{provider}` exists in cache.
-
-        If it does, return the value.
-        If it does not, set the key to `current_time` and return the value.
-        """
-        budget_start = await self.router_cache.async_get_cache(start_time_key)
-        if budget_start is None:
-            await self.router_cache.async_set_cache(
-                key=start_time_key, value=current_time, ttl=ttl_seconds
-            )
-            return current_time
-        return float(budget_start)
-
-    async def _handle_new_budget_window(
-        self,
-        spend_key: str,
-        start_time_key: str,
-        current_time: float,
-        response_cost: float,
-        ttl_seconds: int,
-    ) -> float:
-        """
-        Handle start of new budget window by resetting spend and start time
-
-        Enters this when:
-        - The budget does not exist in cache, so we need to set it
-        - The budget window has expired, so we need to reset everything
-
-        Does 2 things:
-        - stores key: `provider_spend:{provider}:1d`, value: response_cost
-        - stores key: `provider_budget_start_time:{provider}`, value: current_time.
-            This stores the start time of the new budget window
-        """
-        await self.router_cache.async_set_cache(
-            key=spend_key, value=response_cost, ttl=ttl_seconds
-        )
-        await self.router_cache.async_set_cache(
-            key=start_time_key, value=current_time, ttl=ttl_seconds
-        )
-        return current_time
-
-    async def _increment_spend_in_current_window(
-        self, spend_key: str, response_cost: float, ttl: int
-    ):
-        """
-        Increment spend within existing budget window
-
-        Runs once the budget start time exists in Redis Cache (on the 2nd and subsequent requests to the same provider)
-
-        - Increments the spend in memory cache (so spend instantly updated in memory)
-        - Queues the increment operation to Redis Pipeline (using batched pipeline to optimize performance. Using Redis for multi instance environment of LiteLLM)
-        """
-        await self.router_cache.in_memory_cache.async_increment(
-            key=spend_key,
-            value=response_cost,
-            ttl=ttl,
-        )
-        increment_op = RedisPipelineIncrementOperation(
-            key=spend_key,
-            increment_value=response_cost,
-            ttl=ttl,
-        )
-        self.redis_increment_operation_queue.append(increment_op)
-
    async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
-        """Original method now uses helper functions"""
+        """
+        Increment provider spend in DualCache (InMemory + Redis)
+
+        Handles saving current provider spend to Redis.
+
+        Spend is stored as:
+            provider_spend:{provider}:{time_period}
+            ex. provider_spend:openai:1d
+            ex. provider_spend:anthropic:7d
+
+        The time period is tracked for time_periods set in the provider budget config.
+        """
        verbose_router_logger.debug("in ProviderBudgetLimiting.async_log_success_event")
        standard_logging_payload: Optional[StandardLoggingPayload] = kwargs.get(
            "standard_logging_object", None
@ -272,145 +207,19 @@ class ProviderBudgetLimiting(CustomLogger):
            )

        spend_key = f"provider_spend:{custom_llm_provider}:{budget_config.time_period}"
-        start_time_key = f"provider_budget_start_time:{custom_llm_provider}"
-
-        current_time = datetime.now(timezone.utc).timestamp()
-        ttl_seconds = duration_in_seconds(budget_config.time_period)
-
-        budget_start = await self._get_or_set_budget_start_time(
-            start_time_key=start_time_key,
-            current_time=current_time,
-            ttl_seconds=ttl_seconds,
-        )
-
-        if budget_start is None:
-            # First spend for this provider
-            budget_start = await self._handle_new_budget_window(
-                spend_key=spend_key,
-                start_time_key=start_time_key,
-                current_time=current_time,
-                response_cost=response_cost,
-                ttl_seconds=ttl_seconds,
-            )
-        elif (current_time - budget_start) > ttl_seconds:
-            # Budget window expired - reset everything
-            verbose_router_logger.debug("Budget window expired - resetting everything")
-            budget_start = await self._handle_new_budget_window(
-                spend_key=spend_key,
-                start_time_key=start_time_key,
-                current_time=current_time,
-                response_cost=response_cost,
-                ttl_seconds=ttl_seconds,
-            )
-        else:
-            # Within existing window - increment spend
-            remaining_time = ttl_seconds - (current_time - budget_start)
-            ttl_for_increment = int(remaining_time)
-
-            await self._increment_spend_in_current_window(
-                spend_key=spend_key, response_cost=response_cost, ttl=ttl_for_increment
-            )
-
+        ttl_seconds = self.get_ttl_seconds(budget_config.time_period)
        verbose_router_logger.debug(
-            f"Incremented spend for {spend_key} by {response_cost}"
+            f"Incrementing spend for {spend_key} by {response_cost}, ttl: {ttl_seconds}"
+        )
+        # Increment the spend in Redis and set TTL
+        await self.router_cache.async_increment_cache(
+            key=spend_key,
+            value=response_cost,
+            ttl=ttl_seconds,
+        )
+        verbose_router_logger.debug(
+            f"Incremented spend for {spend_key} by {response_cost}, ttl: {ttl_seconds}"
        )
-
-    async def periodic_sync_in_memory_spend_with_redis(self):
-        """
-        Handler that triggers sync_in_memory_spend_with_redis every DEFAULT_REDIS_SYNC_INTERVAL seconds
-
-        Required for multi-instance environment usage of provider budgets
-        """
-        while True:
-            try:
-                await self._sync_in_memory_spend_with_redis()
-                await asyncio.sleep(
-                    DEFAULT_REDIS_SYNC_INTERVAL
-                )  # Wait for DEFAULT_REDIS_SYNC_INTERVAL seconds before next sync
-            except Exception as e:
-                verbose_router_logger.error(f"Error in periodic sync task: {str(e)}")
-                await asyncio.sleep(
-                    DEFAULT_REDIS_SYNC_INTERVAL
-                )  # Still wait DEFAULT_REDIS_SYNC_INTERVAL seconds on error before retrying
-
-    async def _push_in_memory_increments_to_redis(self):
-        """
-        How this works:
-        - async_log_success_event collects all provider spend increments in `redis_increment_operation_queue`
-        - This function pushes all increments to Redis in a batched pipeline to optimize performance
-
-        Only runs if Redis is initialized
-        """
-        try:
-            if not self.router_cache.redis_cache:
-                return  # Redis is not initialized
-
-            verbose_router_logger.debug(
-                "Pushing Redis Increment Pipeline for queue: %s",
-                self.redis_increment_operation_queue,
-            )
-            if len(self.redis_increment_operation_queue) > 0:
-                asyncio.create_task(
-                    self.router_cache.redis_cache.async_increment_pipeline(
-                        increment_list=self.redis_increment_operation_queue,
-                    )
-                )
-
-            self.redis_increment_operation_queue = []
-
-        except Exception as e:
-            verbose_router_logger.error(
-                f"Error syncing in-memory cache with Redis: {str(e)}"
-            )
-
-    async def _sync_in_memory_spend_with_redis(self):
-        """
-        Ensures in-memory cache is updated with latest Redis values for all provider spends.
-
-        Why Do we need this?
-        - Optimization to hit sub 100ms latency. Performance was impacted when redis was used for read/write per request
-        - Use provider budgets in multi-instance environment, we use Redis to sync spend across all instances
-
-        What this does:
-        1. Push all provider spend increments to Redis
-        2. Fetch all current provider spend from Redis to update in-memory cache
-        """
-
-        try:
-            # No need to sync if Redis cache is not initialized
-            if self.router_cache.redis_cache is None:
-                return
-
-            # 1. Push all provider spend increments to Redis
-            await self._push_in_memory_increments_to_redis()
-
-            # 2. Fetch all current provider spend from Redis to update in-memory cache
-            cache_keys = []
-            for provider, config in self.provider_budget_config.items():
-                if config is None:
-                    continue
-                cache_keys.append(f"provider_spend:{provider}:{config.time_period}")
-
-            # Batch fetch current spend values from Redis
-            redis_values = await self.router_cache.redis_cache.async_batch_get_cache(
-                key_list=cache_keys
-            )
-
-            # Update in-memory cache with Redis values
-            if isinstance(redis_values, dict):  # Check if redis_values is a dictionary
-                for key, value in redis_values.items():
-                    if value is not None:
-                        await self.router_cache.in_memory_cache.async_set_cache(
-                            key=key, value=float(value)
-                        )
-                        verbose_router_logger.debug(
-                            f"Updated in-memory cache for {key}: {value}"
-                        )
-
-        except Exception as e:
-            verbose_router_logger.error(
-                f"Error syncing in-memory cache with Redis: {str(e)}"
-            )

    def _get_budget_config_for_provider(
        self, provider: str
@ -433,6 +242,15 @@ class ProviderBudgetLimiting(CustomLogger):
            return None
        return custom_llm_provider

+    def get_ttl_seconds(self, time_period: str) -> int:
+        """
+        Convert time period (e.g., '30s', '15m', '2h', '1d', '30d') to seconds for Redis TTL
+
+        Args:
+            time_period: "<integer><unit>", where unit is one of s, m, h, d.
+
+        Returns:
+            Length of the period in seconds.
+
+        Raises:
+            ValueError: if the unit is not supported or the count is not an integer.
+        """
+        # 'd' keeps its original meaning; s/m/h are additive, so every input
+        # that was valid before still produces the same TTL.
+        seconds_per_unit = {"s": 1, "m": 60, "h": 60 * 60, "d": 24 * 60 * 60}
+        unit = time_period[-1:]  # slice (not index) so "" falls through to the error
+        if unit in seconds_per_unit:
+            return int(time_period[:-1]) * seconds_per_unit[unit]
+        raise ValueError(f"Unsupported time period format: {time_period}")
+
    def _track_provider_remaining_budget_prometheus(
        self, provider: str, spend: float, budget_limit: float
    ):
--- a/litellm/router_utils/pattern_match_deployments.py
+++ b/litellm/router_utils/pattern_match_deployments.py
@ -79,9 +79,7 @@ class PatternMatchRouter:

        return new_deployments

-    def route(
-        self, request: Optional[str], filtered_model_names: Optional[List[str]] = None
-    ) -> Optional[List[Dict]]:
+    def route(self, request: Optional[str]) -> Optional[List[Dict]]:
        """
        Route a requested model to the corresponding llm deployments based on the regex pattern

@ -91,26 +89,14 @@ class PatternMatchRouter:

        Args:
            request: Optional[str]
-            filtered_model_names: Optional[List[str]] - if provided, only return deployments that match the filtered_model_names
+
        Returns:
            Optional[List[Deployment]]: llm deployments
        """
        try:
            if request is None:
                return None
-
-            regex_filtered_model_names = (
-                [self._pattern_to_regex(m) for m in filtered_model_names]
-                if filtered_model_names is not None
-                else []
-            )
-
            for pattern, llm_deployments in self.patterns.items():
-                if (
-                    filtered_model_names is not None
-                    and pattern not in regex_filtered_model_names
-                ):
-                    continue
                pattern_match = re.match(pattern, request)
                if pattern_match:
                    return self._return_pattern_matched_deployments(
--- a/litellm/router_utils/response_headers.py
+++ b/litellm/router_utils/response_headers.py
--- a/litellm/secret_managers/aws_secret_manager_v2.py
+++ b/litellm/secret_managers/aws_secret_manager_v2.py
@ -31,8 +31,8 @@ from litellm.llms.custom_httpx.http_handler import (
    _get_httpx_client,
    get_async_httpx_client,
 )
+from litellm.llms.custom_httpx.types import httpxSpecialProvider
 from litellm.proxy._types import KeyManagementSystem
-from litellm.types.llms.custom_http import httpxSpecialProvider


 class AWSSecretsManagerV2(BaseAWSLLM):
--- a/litellm/tests/test_mlflow.py
+++ b/litellm/tests/test_mlflow.py
@ -0,0 +1,29 @@
+import pytest
+
+import litellm
+
+
+def test_mlflow_logging():
+    litellm.success_callback = ["mlflow"]
+    litellm.failure_callback = ["mlflow"]
+
+    litellm.completion(
+        model="gpt-4o-mini",
+        messages=[{"role": "user", "content": "what llm are u"}],
+        max_tokens=10,
+        temperature=0.2,
+        user="test-user",
+    )
+
+@pytest.mark.asyncio()
+async def test_async_mlflow_logging():
+    litellm.success_callback = ["mlflow"]
+    litellm.failure_callback = ["mlflow"]
+
+    await litellm.acompletion(
+        model="gpt-4o-mini",
+        messages=[{"role": "user", "content": "hi test from local arize"}],
+        mock_response="hello",
+        temperature=0.1,
+        user="OTEL_USER",
+    )
--- a/Show more
+++ b/Show more
Author	SHA1	Message	Date
Ishaan Jaff	0afdba0822	add unit tests for vertex pass through	2024-11-22 16:49:35 -08:00
Ishaan Jaff	8aa18b3977	use get_litellm_virtual_key	2024-11-22 16:44:35 -08:00
Ishaan Jaff	7674217e6c	docs add usage example for js	2024-11-22 16:40:40 -08:00
Ishaan Jaff	77fe5af5b3	simplify local	2024-11-22 16:31:58 -08:00
Ishaan Jaff	35040f12be	run unit tests 1st	2024-11-22 16:15:37 -08:00
Ishaan Jaff	06da8a5fbc	test_convert_raw_bytes_to_str_lines	2024-11-22 16:07:45 -08:00
Ishaan Jaff	413092ec1c	unit tests for streaming	2024-11-22 16:04:45 -08:00
Ishaan Jaff	88dbb706c1	test_chunk_processor_yields_raw_bytes	2024-11-22 16:04:30 -08:00
Ishaan Jaff	4b576571a1	test vertex	2024-11-22 15:47:03 -08:00
Ishaan Jaff	4b607e0cc2	use good name for test	2024-11-22 15:44:57 -08:00
Ishaan Jaff	c7c586c8a6	move basic pass through test	2024-11-22 15:44:28 -08:00
Ishaan Jaff	d3f23e0528	add working vertex jest tests	2024-11-22 15:40:56 -08:00
Ishaan Jaff	53e82b7f14	test vertex js	2024-11-22 15:17:59 -08:00
Ishaan Jaff	4972415372	use common _create_vertex_response_logging_payload_for_generate_content	2024-11-22 14:35:02 -08:00
Ishaan Jaff	7422af75fd	fix PassThroughStreamingHandler	2024-11-22 14:20:21 -08:00
Ishaan Jaff	04c9284da4	use PassThroughStreamingHandler	2024-11-22 14:19:28 -08:00
Ishaan Jaff	4273837add	fix vertex_proxy_route	2024-11-22 13:19:01 -08:00
Ishaan Jaff	bbb2e029b5	test vertex JS sdk	2024-11-22 13:18:23 -08:00
Ishaan Jaff	e829b228b2	handle vertex pass through separately	2024-11-22 13:18:08 -08:00
Ishaan Jaff	dcab2d0c6f	add vertex js sdk example	2024-11-22 12:59:34 -08:00
Ishaan Jaff	f83708ed4e	stash gemini JS test	2024-11-22 12:59:01 -08:00
				`@ -1 +0,0 @@`
				`(self.webpackChunk_N_E=self.webpackChunk_N_E\|\|[]).push([[185],{11837:function(n,e,t){Promise.resolve().then(t.t.bind(t,99646,23)),Promise.resolve().then(t.t.bind(t,63385,23))},63385:function(){},99646:function(n){n.exports={style:{fontFamily:"'__Inter_12bbc4', '__Inter_Fallback_12bbc4'",fontStyle:"normal"},className:"__className_12bbc4"}}},function(n){n.O(0,[971,69,744],function(){return n(n.s=11837)}),_N_E=n.O()}]);`
				`@ -0,0 +1 @@`
				`(self.webpackChunk_N_E=self.webpackChunk_N_E\|\|[]).push([[185],{93553:function(n,e,t){Promise.resolve().then(t.t.bind(t,63385,23)),Promise.resolve().then(t.t.bind(t,99646,23))},63385:function(){},99646:function(n){n.exports={style:{fontFamily:"'__Inter_12bbc4', '__Inter_Fallback_12bbc4'",fontStyle:"normal"},className:"__className_12bbc4"}}},function(n){n.O(0,[971,69,744],function(){return n(n.s=93553)}),_N_E=n.O()}]);`